From be293b591733310bed6770fac2f9c2a20ee8d499 Mon Sep 17 00:00:00 2001 From: jialongzeng Date: Wed, 20 Nov 2024 06:44:51 +0000 Subject: [PATCH] deploy: 81c03039f5c21ae08c4fb40cb1dae6dd45cd652f --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 138717 +++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 139112 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..7739e99 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-11-12T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.08027v1","updated":"2024-11-12T18:56:58Z","published":"2024-11-12T18:56:58Z","title":"LLMPhy: Complex Physical Reasoning Using Large Language Models and World\n Models","summary":" Physical reasoning is an important skill needed for robotic agents when\noperating in the real world. However, solving such reasoning problems often\ninvolves hypothesizing and reflecting over complex multi-body interactions\nunder the effect of a multitude of physical forces and thus learning all such\ninteractions poses a significant hurdle for state-of-the-art machine learning\nframeworks, including large language models (LLMs). To study this problem, we\npropose a new physical reasoning task and a dataset, dubbed TraySim. Our task\ninvolves predicting the dynamics of several objects on a tray that is given an\nexternal impact -- the domino effect of the ensued object interactions and\ntheir dynamics thus offering a challenging yet controlled setup, with the goal\nof reasoning being to infer the stability of the objects after the impact. 
To\nsolve this complex physical reasoning task, we present LLMPhy, a zero-shot\nblack-box optimization framework that leverages the physics knowledge and\nprogram synthesis abilities of LLMs, and synergizes these abilities with the\nworld models built into modern physics engines. Specifically, LLMPhy uses an\nLLM to generate code to iteratively estimate the physical hyperparameters of\nthe system (friction, damping, layout, etc.) via an implicit\nanalysis-by-synthesis approach using a (non-differentiable) simulator in the\nloop and uses the inferred parameters to imagine the dynamics of the scene\ntowards solving the reasoning task. To show the effectiveness of LLMPhy, we\npresent experiments on our TraySim dataset to predict the steady-state poses of\nthe objects. Our results show that the combination of the LLM and the physics\nengine leads to state-of-the-art zero-shot physical reasoning performance,\nwhile demonstrating superior convergence against standard black-box\noptimization methods and better estimation of the physical parameters.\n","authors":["Anoop Cherian","Radu Corcodel","Siddarth Jain","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2411.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13182v2","updated":"2024-11-12T17:42:22Z","published":"2023-12-20T16:45:26Z","title":"Goal-oriented Semantic Communications for Robotic Waypoint Transmission:\n The Value and Age of Information Approach","summary":" The ultra-reliable and low-latency communication (URLLC) service of the\nfifth-generation (5G) mobile communication network struggles to support safe\nrobot operation. Nowadays, the sixth-generation (6G) mobile communication\nnetwork is proposed to provide hyper-reliable and low-latency communication to\nenable safer control for robots. However, current 5G/ 6G research mainly\nfocused on improving communication performance, while the robotics community\nmostly assumed communication to be ideal. 
To jointly consider communication and\nrobotic control with a focus on the specific robotic task, we propose\ngoal-oriented semantic communication in robotic control (GSRC) to exploit the\ncontext of data and its importance in achieving the task at both transmitter\nand receiver. At the transmitter, we propose a deep reinforcement learning\nalgorithm to generate optimal control and command (C&C) data and a proactive\nrepetition scheme (DeepPro) to increase the successful transmission\nprobability. At the receiver, we design the value of information (VoI) and age\nof information (AoI) based queue ordering mechanism (VA-QOM) to rank the queue\nbased on the semantic information extracted from AoI and VoI. The simulation\nresults validate that our proposed GSRC framework achieves a 91.5% improvement\nin the mean square error compared to the traditional unmanned aerial vehicle\ncontrol framework.\n","authors":["Wenchao Wu","Yuanqing Yang","Yansha Deng","A. Hamid Aghvami"],"pdf_url":"https://arxiv.org/pdf/2312.13182v2.pdf","comment":"The paper has been accepted in IEEE TWC"},{"id":"http://arxiv.org/abs/2411.07954v1","updated":"2024-11-12T17:30:31Z","published":"2024-11-12T17:30:31Z","title":"Learning Memory Mechanisms for Decision Making through Demonstrations","summary":" In Partially Observable Markov Decision Processes, integrating an agent's\nhistory into memory poses a significant challenge for decision-making.\nTraditional imitation learning, relying on observation-action pairs for expert\ndemonstrations, fails to capture the expert's memory mechanisms used in\ndecision-making. To capture memory processes as demonstrations, we introduce\nthe concept of \\textbf{memory dependency pairs} $(p, q)$ indicating that events\nat time $p$ are recalled for decision-making at time $q$. 
We introduce\n\\textbf{AttentionTuner} to leverage memory dependency pairs in Transformers and\nfind significant improvements across several tasks compared to standard\nTransformers when evaluated on Memory Gym and the Long-term Memory Benchmark.\nCode is available at https://github.com/WilliamYue37/AttentionTuner .\n","authors":["William Yue","Bo Liu","Peter Stone"],"pdf_url":"https://arxiv.org/pdf/2411.07954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07933v1","updated":"2024-11-12T17:04:12Z","published":"2024-11-12T17:04:12Z","title":"Prediction of Acoustic Communication Performance for AUVs using Gaussian\n Process Classification","summary":" Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic\ncommunication to coordinate their actions effectively. However, the reliability\nof underwater acoustic communication decreases as the communication range\nbetween vehicles increases. Consequently, teams of cooperating AUVs typically\nmake conservative assumptions about the maximum range at which they can\ncommunicate reliably. To address this limitation, we propose a novel approach\nthat involves learning a map representing the probability of successful\ncommunication based on the locations of the transmitting and receiving\nvehicles. This probabilistic communication map accounts for factors such as the\nrange between vehicles, environmental noise, and multi-path effects at a given\nlocation. In pursuit of this goal, we investigate the application of Gaussian\nprocess binary classification to generate the desired communication map. We\nspecialize existing results to this specific binary classification problem and\nexplore methods to incorporate uncertainty in vehicle location into the mapping\nprocess. Furthermore, we compare the prediction performance of the probability\ncommunication map generated using binary classification with that of a\nsignal-to-noise ratio (SNR) communication map generated using Gaussian process\nregression. 
Our approach is experimentally validated using communication and\nnavigation data collected during trials with a pair of Virginia Tech 690 AUVs.\n","authors":["Yifei Gao","Harun Yetkin","McMahon James","Daniel J. Stilwell"],"pdf_url":"https://arxiv.org/pdf/2411.07933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07890v1","updated":"2024-11-12T15:52:06Z","published":"2024-11-12T15:52:06Z","title":"Minimally Invasive Flexible Needle Manipulation Based on Finite Element\n Simulation and Cross Entropy Method","summary":" We present a novel approach for minimally invasive flexible needle\nmanipulations by pairing a real-time finite element simulator with the\ncross-entropy method. Additionally, we demonstrate how a kinematic-driven\nbang-bang controller can complement the control framework for better tracking\nperformance. We show how electromagnetic (EM) tracking can be readily\nincorporated into the framework to provide controller feedback. Tissue phantom\nexperiment with EM tracking shows the average targeting error is $0.16 \\pm\n0.29mm$.\n","authors":["Yanzhou Wang","Chang Chang","Junling Mei","Simon Leonard","Iulian Iordachita"],"pdf_url":"https://arxiv.org/pdf/2411.07890v1.pdf","comment":"Submitted to IEEE International Conference on Robotics and Automation\n 2025"},{"id":"http://arxiv.org/abs/2411.07862v1","updated":"2024-11-12T15:20:48Z","published":"2024-11-12T15:20:48Z","title":"Iterative Learning Control with Mismatch Compensation for Residual\n Vibration Suppression in Delta Robots","summary":" Unwanted vibrations stemming from the energy-optimized design of Delta robots\npose a challenge in their operation, especially with respect to precise\nreference tracking. To improve tracking accuracy, this paper proposes an\nadaptive mismatch-compensated iterative learning controller based on input\nshaping techniques. 
We establish a dynamic model considering the\nelectromechanical rigid-flexible coupling of the Delta robot, which integrates\nthe permanent magnet synchronous motor. Using this model, we design an\noptimization-based input shaper, considering the natural frequency of the\nrobot, which varies with the configuration. We proposed an iterative learning\ncontroller for the delta robot to improve tracking accuracy. Our iterative\nlearning controller incorporates model mismatch where the mismatch approximated\nby a fuzzy logic structure. The convergence property of the proposed controller\nis proved using a Barrier Composite Energy Function, providing a guarantee that\nthe tracking errors along the iteration axis converge to zero. Moreover,\nadaptive parameter update laws are designed to ensure convergence. Finally, we\nperform a series of high-fidelity simulations of the Delta robot using Simscape\nto demonstrate the effectiveness of the proposed control strategy.\n","authors":["Mingkun Wu","Alisa Rupenyan","Burkhard Corves"],"pdf_url":"https://arxiv.org/pdf/2411.07862v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2402.11658v3","updated":"2024-11-12T15:03:48Z","published":"2024-02-18T17:32:53Z","title":"Dynamic planning in hierarchical active inference","summary":" By dynamic planning, we refer to the ability of the human brain to infer and\nimpose motor trajectories related to cognitive decisions. A recent paradigm,\nactive inference, brings fundamental insights into the adaptation of biological\norganisms, constantly striving to minimize prediction errors to restrict\nthemselves to life-compatible states. Over the past years, many studies have\nshown how human and animal behaviors could be explained in terms of active\ninference - either as discrete decision-making or continuous motor control -\ninspiring innovative solutions in robotics and artificial intelligence. 
Still,\nthe literature lacks a comprehensive outlook on effectively planning realistic\nactions in changing environments. Setting ourselves the goal of modeling\ncomplex tasks such as tool use, we delve into the topic of dynamic planning in\nactive inference, keeping in mind two crucial aspects of biological behavior:\nthe capacity to understand and exploit affordances for object manipulation, and\nto learn the hierarchical interactions between the self and the environment,\nincluding other agents. We start from a simple unit and gradually describe more\nadvanced structures, comparing recently proposed design choices and providing\nbasic examples. This study distances itself from traditional views centered on\nneural networks and reinforcement learning, and points toward a yet unexplored\ndirection in active inference: hybrid representations in hierarchical models.\n","authors":["Matteo Priorelli","Ivilin Peev Stoianov"],"pdf_url":"https://arxiv.org/pdf/2402.11658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07848v1","updated":"2024-11-12T15:01:40Z","published":"2024-11-12T15:01:40Z","title":"NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric\n VLN","summary":" Landmark-based navigation (e.g. go to the wooden desk) and relative\npositional navigation (e.g. move 5 meters forward) are distinct navigation\nchallenges solved very differently in existing robotics navigation methodology.\nWe present a new dataset, OC-VLN, in order to distinctly evaluate grounding\nobject-centric natural language navigation instructions in a method for\nperforming landmark-based navigation. We also propose Natural Language grounded\nSLAM (NL-SLAM), a method to ground natural language instruction to robot\nobservations and poses. We actively perform NL-SLAM in order to follow\nobject-centric natural language navigation instructions. Our methods leverage\npre-trained vision and language foundation models and require no task-specific\ntraining. 
We construct two strong baselines from state-of-the-art methods on\nrelated tasks, Object Goal Navigation and Vision Language Navigation, and we\nshow that our approach, NL-SLAM, outperforms these baselines across all our\nmetrics of success on OC-VLN. Finally, we successfully demonstrate the\neffectiveness of NL-SLAM for performing navigation instruction following in the\nreal world on a Boston Dynamics Spot robot.\n","authors":["Sonia Raychaudhuri","Duy Ta","Katrina Ashton","Angel X. Chang","Jiuguang Wang","Bernadette Bucher"],"pdf_url":"https://arxiv.org/pdf/2411.07848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12203v3","updated":"2024-11-12T15:00:37Z","published":"2024-03-18T19:25:57Z","title":"Bootstrapping Reinforcement Learning with Imitation for Vision-Based\n Agile Flight","summary":" Learning visuomotor policies for agile quadrotor flight presents significant\ndifficulties, primarily from inefficient policy exploration caused by\nhigh-dimensional visual inputs and the need for precise and low-latency\ncontrol. To address these challenges, we propose a novel approach that combines\nthe performance of Reinforcement Learning (RL) and the sample efficiency of\nImitation Learning (IL) in the task of vision-based autonomous drone racing.\nWhile RL provides a framework for learning high-performance controllers through\ntrial and error, it faces challenges with sample efficiency and computational\ndemands due to the high dimensionality of visual inputs. Conversely, IL\nefficiently learns from visual expert demonstrations, but it remains limited by\nthe expert's performance and state distribution. To overcome these limitations,\nour policy learning framework integrates the strengths of both approaches. Our\nframework contains three phases: training a teacher policy using RL with\nprivileged state information, distilling it into a student policy via IL, and\nadaptive fine-tuning via RL. 
Testing in both simulated and real-world scenarios\nshows our approach can not only learn in scenarios where RL from scratch fails\nbut also outperforms existing IL methods in both robustness and performance,\nsuccessfully navigating a quadrotor through a race course using only visual\ninformation. Videos of the experiments are available at\nhttps://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html.\n","authors":["Jiaxu Xing","Angel Romero","Leonard Bauersfeld","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.12203v3.pdf","comment":"8th Annual Conference on Robot Learning (CoRL)"},{"id":"http://arxiv.org/abs/2411.07833v1","updated":"2024-11-12T14:35:45Z","published":"2024-11-12T14:35:45Z","title":"Robust Adaptive Safe Robotic Grasping with Tactile Sensing","summary":" Robotic grasping requires safe force interaction to prevent a grasped object\nfrom being damaged or slipping out of the hand. In this vein, this paper\nproposes an integrated framework for grasping with formal safety guarantees\nbased on Control Barrier Functions. We first design contact force and force\nclosure constraints, which are enforced by a safety filter to accomplish safe\ngrasping with finger force control. For sensory feedback, we develop a\ntechnique to estimate contact point, force, and torque from tactile sensors at\neach finger. We verify the framework with various safety filters in a numerical\nsimulation under a two-finger grasping scenario. We then experimentally\nvalidate the framework by grasping multiple objects, including fragile lab\nglassware, in a real robotic setup, showing that safe grasping can be\nsuccessfully achieved in the real world. We evaluate the performance of each\nsafety filter in the context of safety violation and conservatism, and find\nthat disturbance observer-based control barrier functions provide superior\nperformance for safety guarantees with minimum conservatism. 
The demonstration\nvideo is available at https://youtu.be/Cuj47mkXRdg.\n","authors":["Yitaek Kim","Jeeseop Kim","Albert H. Li","Aaron D. Ames","Christoffer Sloth"],"pdf_url":"https://arxiv.org/pdf/2411.07833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07830v1","updated":"2024-11-12T14:25:24Z","published":"2024-11-12T14:25:24Z","title":"Singularity-Avoidance Control of Robotic Systems with Model Mismatch and\n Actuator Constraints","summary":" Singularities, manifesting as special configuration states, deteriorate robot\nperformance and may even lead to a loss of control over the system. This paper\naddresses the kinematic singularity concerns in robotic systems with model\nmismatch and actuator constraints through control barrier functions (CBFs). We\npropose a learning-based control strategy to prevent robots entering\nsingularity regions. More precisely, we leverage Gaussian process (GP)\nregression to learn the unknown model mismatch, where the prediction error is\nrestricted by a deterministic bound. Moreover, we offer the criteria for\nparameter selection to ensure the feasibility of CBFs subject to actuator\nconstraints. The proposed approach is validated by high-fidelity simulations on\na 2 degrees-of-freedom (DoFs) planar robot.\n","authors":["Mingkun Wu","Alisa Rupenyan","Burkhard Corves"],"pdf_url":"https://arxiv.org/pdf/2411.07830v1.pdf","comment":"This work has been submitted to ECC 2025 for possible publication"},{"id":"http://arxiv.org/abs/2411.07799v1","updated":"2024-11-12T13:53:22Z","published":"2024-11-12T13:53:22Z","title":"Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and\n Re-Identification using Point Clouds","summary":" Robotic fruit monitoring is a key step toward automated agricultural\nproduction systems. Robots can significantly enhance plant and temporal fruit\nmonitoring by providing precise, high-throughput assessments that overcome the\nlimitations of traditional manual methods. 
Fruit monitoring is a challenging\ntask due to the significant variation in size, shape, orientation, and\nocclusion of fruits. Also, fruits may be harvested or newly grown between\nrecording sessions. Most methods are 2D image-based and they lack the 3D\nstructure, depth, and spatial information, which represent key aspects of fruit\nmonitoring. 3D colored point clouds, instead, can offer this information but\nthey introduce challenges such as their sparsity and irregularity. In this\npaper, we present a novel approach for temporal fruit monitoring that addresses\npoint clouds collected in a greenhouse over time. Our method segments fruits\nusing a learning-based instance segmentation approach directly on the point\ncloud. Each segmented fruit is processed by a 3D sparse convolutional neural\nnetwork to extract descriptors, which are used in an attention-based matching\nnetwork to associate fruits with their instances from previous data\ncollections. Experimental results on a real dataset of strawberries demonstrate\nthat our approach outperforms other methods for fruits re-identification over\ntime, allowing for precise temporal fruit monitoring in real and complex\nscenarios.\n","authors":["Daniel Fusaro","Federico Magistri","Jens Behley","Alberto Pretto","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2411.07799v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.07760v1","updated":"2024-11-12T12:49:41Z","published":"2024-11-12T12:49:41Z","title":"Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit\n Q-Learning","summary":" Offline Reinforcement Learning (RL) has emerged as a powerful alternative to\nimitation learning for behavior modeling in various domains, particularly in\ncomplex navigation tasks. An existing challenge with Offline RL is the\nsignal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to\nerrors in value estimates. 
Towards this, multiple works have demonstrated the\nadvantage of hierarchical offline RL methods, which decouples high-level path\nplanning from low-level path following. In this work, we present a novel\nhierarchical transformer-based approach leveraging a learned quantizer of the\nspace. This quantization enables the training of a simpler zone-conditioned\nlow-level policy and simplifies planning, which is reduced to discrete\nautoregressive prediction. Among other benefits, zone-level reasoning in\nplanning enables explicit trajectory stitching rather than implicit stitching\nbased on noisy value function estimates. By combining this transformer-based\nplanner with recent advancements in offline RL, our proposed approach achieves\nstate-of-the-art results in complex long-distance navigation environments.\n","authors":["Alexi Canesse","Mathieu Petitbois","Ludovic Denoyer","Sylvain Lamprier","Rémy Portelas"],"pdf_url":"https://arxiv.org/pdf/2411.07760v1.pdf","comment":"Under review. Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2404.19664v4","updated":"2024-11-12T12:43:42Z","published":"2024-04-30T15:57:41Z","title":"Towards Generalist Robot Learning from Internet Video: A Survey","summary":" Scaling deep learning to massive, diverse internet data has yielded\nremarkably general capabilities in visual and natural language understanding\nand generation. However, data has remained scarce and challenging to collect in\nrobotics, seeing robot learning struggle to obtain similarly general\ncapabilities. Promising Learning from Videos (LfV) methods aim to address the\nrobotics data bottleneck by augmenting traditional robot data with large-scale\ninternet video data. 
This video data offers broad foundational information\nregarding physical behaviour and the underlying physics of the world, and thus\ncan be highly informative for a generalist robot.\n In this survey, we present a thorough overview of the emerging field of LfV.\nWe outline fundamental concepts, including the benefits and challenges of LfV.\nWe provide a comprehensive review of current methods for extracting knowledge\nfrom large-scale internet video, addressing key challenges in LfV, and boosting\ndownstream robot and reinforcement learning via the use of video data. The\nsurvey concludes with a critical discussion of challenges and opportunities in\nLfV. Here, we advocate for scalable foundation model approaches that can\nleverage the full range of available internet video to improve the learning of\nrobot policies and dynamics models. We hope this survey can inform and catalyse\nfurther LfV research, driving progress towards the development of\ngeneral-purpose robots.\n","authors":["Robert McCarthy","Daniel C. H. Tan","Dominik Schmidt","Fernando Acero","Nathan Herr","Yilun Du","Thomas G. Thuruthel","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2404.19664v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07719v1","updated":"2024-11-12T11:24:18Z","published":"2024-11-12T11:24:18Z","title":"EMPERROR: A Flexible Generative Perception Error Model for Probing\n Self-Driving Planners","summary":" To handle the complexities of real-world traffic, learning planners for\nself-driving from data is a promising direction. While recent approaches have\nshown great progress, they typically assume a setting in which the ground-truth\nworld state is available as input. However, when deployed, planning needs to be\nrobust to the long-tail of errors incurred by a noisy perception system, which\nis often neglected in evaluation. 
To address this, previous work has proposed\ndrawing adversarial samples from a perception error model (PEM) mimicking the\nnoise characteristics of a target object detector. However, these methods use\nsimple PEMs that fail to accurately capture all failure modes of detection. In\nthis paper, we present EMPERROR, a novel transformer-based generative PEM,\napply it to stress-test an imitation learning (IL)-based planner and show that\nit imitates modern detectors more faithfully than previous work. Furthermore,\nit is able to produce realistic noisy inputs that increase the planner's\ncollision rate by up to 85%, demonstrating its utility as a valuable tool for a\nmore complete evaluation of self-driving planners.\n","authors":["Niklas Hanselmann","Simon Doll","Marius Cordts","Hendrik P. A. Lensch","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2411.07719v1.pdf","comment":"Project page: https://lasnik.github.io/emperror/"},{"id":"http://arxiv.org/abs/2411.07711v1","updated":"2024-11-12T10:55:30Z","published":"2024-11-12T10:55:30Z","title":"OWLed: Outlier-weighed Layerwise Pruning for Efficient Autonomous\n Driving Framework","summary":" The integration of Large Language Models (LLMs) into autonomous driving\nsystems offers promising enhancements in environmental understanding and\ndecision-making. However, the substantial computational demands of deploying\nLLMs locally on vehicles render this approach unfeasible for real-world\nautomotive applications. To address this challenge, we introduce OWLed, the\nOutlier-Weighed Layerwise Pruning for Efficient Autonomous Driving Framework\nthat leverages outlier-weighted layerwise sparsity for model compression. Our\nmethod assigns non-uniform sparsity ratios to different layers based on the\ndistribution of outlier features, significantly reducing the model size without\nthe need for fine-tuning. 
To ensure the compressed model adapts well to\nautonomous driving tasks, we incorporate driving environment data into both the\ncalibration and pruning processes. Our empirical studies reveal that the\nencoder component is more sensitive to pruning than the LLM, highlighting its\ncritical role in the system. Experimental results demonstrate that OWLed\noutperforms existing methods in perception, action prediction, and language\nunderstanding while substantially lowering computational requirements. These\nfindings underscore the potential of combining advanced pruning techniques with\nLLMs to develop efficient and robust autonomous driving systems capable of\nhandling complex scenarios. Code will be made publicly available.\n","authors":["Jiaxi Li","Lu Yin","Xilu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07711v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.07699v1","updated":"2024-11-12T10:26:23Z","published":"2024-11-12T10:26:23Z","title":"RINO: Accurate, Robust Radar-Inertial Odometry with Non-Iterative\n Estimation","summary":" Precise localization and mapping are critical for achieving autonomous\nnavigation in self-driving vehicles. However, ego-motion estimation still faces\nsignificant challenges, particularly when GNSS failures occur or under extreme\nweather conditions (e.g., fog, rain, and snow). In recent years, scanning radar\nhas emerged as an effective solution due to its strong penetration\ncapabilities. Nevertheless, scanning radar data inherently contains high levels\nof noise, necessitating hundreds to thousands of iterations of optimization to\nestimate a reliable transformation from the noisy data. Such iterative solving\nis time-consuming, unstable, and prone to failure. To address these challenges,\nwe propose an accurate and robust Radar-Inertial Odometry system, RINO, which\nemploys a non-iterative solving approach. 
Our method decouples rotation and\ntranslation estimation and applies an adaptive voting scheme for 2D rotation\nestimation, enhancing efficiency while ensuring consistent solving time.\nAdditionally, the approach implements a loosely coupled system between the\nscanning radar and an inertial measurement unit (IMU), leveraging Error-State\nKalman Filtering (ESKF). Notably, we successfully estimated the uncertainty of\nthe pose estimation from the scanning radar, incorporating this into the\nfilter's Maximum A Posteriori estimation, a consideration that has been\npreviously overlooked. Validation on publicly available datasets demonstrates\nthat RINO outperforms state-of-the-art methods and baselines in both accuracy\nand robustness. Our code is available at https://github.com/yangsc4063/rino.\n","authors":["Shuocheng Yang","Yueming Cao","Shengbo Li","Jianqiang Wang","Shaobing Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14947v2","updated":"2024-11-12T10:04:31Z","published":"2024-06-21T07:59:43Z","title":"LiCS: Navigation using Learned-imitation on Cluttered Space","summary":" In this letter, we propose a robust and fast navigation system in a narrow\nindoor environment for UGV (Unmanned Ground Vehicle) using 2D LiDAR and\nodometry. We used behavior cloning with Transformer neural network to learn the\noptimization-based baseline algorithm. We inject Gaussian noise during expert\ndemonstration to increase the robustness of learned policy. We evaluate the\nperformance of LiCS using both simulation and hardware experiments. It\noutperforms all other baselines in terms of navigation performance and can\nmaintain its robust performance even on highly cluttered environments. 
During\nthe hardware experiments, LiCS can maintain safe navigation at maximum speed of\n$1.5\\ m/s$.\n","authors":["Joshua Julian Damanik","Jae-Won Jung","Chala Adane Deresa","Han-Lim Choi"],"pdf_url":"https://arxiv.org/pdf/2406.14947v2.pdf","comment":"6 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2411.07644v1","updated":"2024-11-12T08:53:52Z","published":"2024-11-12T08:53:52Z","title":"Human Arm Pose Estimation with a Shoulder-worn Force-Myography Device\n for Human-Robot Interaction","summary":" Accurate human pose estimation is essential for effective Human-Robot\nInteraction (HRI). By observing a user's arm movements, robots can respond\nappropriately, whether it's providing assistance or avoiding collisions. While\nvisual perception offers potential for human pose estimation, it can be\nhindered by factors like poor lighting or occlusions. Additionally, wearable\ninertial sensors, though useful, require frequent calibration as they do not\nprovide absolute position information. Force-myography (FMG) is an alternative\napproach where muscle perturbations are externally measured. It has been used\nto observe finger movements, but its application to full arm state estimation\nis unexplored. In this letter, we investigate the use of a wearable FMG device\nthat can observe the state of the human arm for real-time applications of HRI.\nWe propose a Transformer-based model to map FMG measurements from the shoulder\nof the user to the physical pose of the arm. The model is also shown to be\ntransferable to other users with limited decline in accuracy. 
Through\nreal-world experiments with a robotic arm, we demonstrate collision avoidance\nwithout relying on visual perception.\n","authors":["Rotem Atari","Eran Bamani","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2411.07644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14440v2","updated":"2024-11-12T07:59:43Z","published":"2024-09-22T13:33:45Z","title":"Admittance Visuomotor Policy Learning for General-Purpose Contact-Rich\n Manipulations","summary":" Contact force in contact-rich environments is an essential modality for\nrobots to perform general-purpose manipulation tasks, as it provides\ninformation to compensate for the deficiencies of visual and proprioceptive\ndata in collision perception, high-precision grasping, and efficient\nmanipulation. In this paper, we propose an admittance visuomotor policy\nframework for continuous, general-purpose, contact-rich manipulations. During\ndemonstrations, we designed a low-cost, user-friendly teleoperation system with\ncontact interaction, aiming to gather compliant robot demonstrations and\naccelerate the data collection process. During training and inference, we\npropose a diffusion-based model to plan action trajectories and desired contact\nforces from multimodal observation that includes contact force, vision and\nproprioception. We utilize an admittance controller for compliance action\nexecution. A comparative evaluation with two state-of-the-art methods was\nconducted on five challenging tasks, each focusing on different action\nprimitives, to demonstrate our framework's generalization capabilities. Results\nshow our framework achieves the highest success rate and exhibits smoother and\nmore efficient contact compared to other methods, the contact force required to\ncomplete each tasks was reduced on average by 48.8%, and the success rate was\nincreased on average by 15.3%. 
Videos are available at\nhttps://ryanjiao.github.io/AdmitDiffPolicy/.\n","authors":["Bo Zhou","Ruixuan Jiao","Yi Li","Xiaogang Yuan","Fang Fang","Shihua Li"],"pdf_url":"https://arxiv.org/pdf/2409.14440v2.pdf","comment":"8 pages, 7 figures. This is the second version of the paper, and it\n is subject to further revisions. The current submission does not necessarily\n reflect the final quality or content of the paper"},{"id":"http://arxiv.org/abs/2411.07612v1","updated":"2024-11-12T07:38:57Z","published":"2024-11-12T07:38:57Z","title":"A Simple Multi-agent Joint Prediction Method for Autonomous Driving","summary":" Predicting future motions of road participants is an important task for\ndriving autonomously. Most existing models excel at predicting the marginal\ntrajectory of a single agent, but predicting joint trajectories for multiple\nagents that are consistent within a scene remains a challenge. Previous\nresearch has often focused on marginal predictions, but the importance of joint\npredictions has become increasingly apparent. Joint prediction aims to generate\ntrajectories that are consistent across the entire scene. Our research builds\nupon the SIMPL baseline to explore methods for generating scene-consistent\ntrajectories. We tested our algorithm on the Argoverse 2 dataset, and\nexperimental results demonstrate that our approach can generate\nscene-consistent trajectories. 
Compared to the SIMPL baseline, our method\nsignificantly reduces the collision rate of joint trajectories within the\nscene.\n","authors":["Mingyi Wang","Hongqun Zou","Yifan Liu","You Wang","Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.07612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07590v1","updated":"2024-11-12T07:06:45Z","published":"2024-11-12T07:06:45Z","title":"Multiple Non-cooperative Targets Encirclement by Relative Distance based\n Positioning and Neural Anti-Synchronization Control","summary":" From prehistoric encirclement for hunting to GPS orbiting the earth for\npositioning, target encirclement has numerous real world applications. However,\nencircling multiple non-cooperative targets in GPS-denied environments remains\nchallenging. In this work, multiple targets encirclement by using a minimum of\ntwo tasking agents, is considered where the relative distance measurements\nbetween the agents and the targets can be obtained by using onboard sensors.\nBased on the measurements, the center of all the targets is estimated directly\nby a fuzzy wavelet neural network (FWNN) and the least squares fit method.\nThen, a new distributed anti-synchronization controller (DASC) is designed so\nthat the two tasking agents are able to encircle all targets while staying\nopposite to each other. In particular, the radius of the desired encirclement\ntrajectory can be dynamically determined to avoid potential collisions between\nthe two agents and all targets. Based on the Lyapunov stability analysis\nmethod, the convergence proofs of the neural network prediction error, the\ntarget-center position estimation error, and the controller error are addressed\nrespectively. Finally, both numerical simulations and UAV flight experiments\nare conducted to demonstrate the validity of the encirclement algorithms. 
The\nflight tests recorded video and other simulation results can be found in\nhttps://youtu.be/B8uTorBNrl4.\n","authors":["Fen Liu","Shenghai Yuan","Wei Meng","Rong Su","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2411.07590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07588v1","updated":"2024-11-12T06:53:25Z","published":"2024-11-12T06:53:25Z","title":"A High-frequency Pneumatic Oscillator for Soft Robotics","summary":" Soft robots, while highly adaptable to diverse environments through various\nactuation methods, still face significant performance boundary due to the\ninherent properties of materials. These limitations manifest in the challenge\nof guaranteeing rapid response and large-scale movements simultaneously,\nultimately restricting the robots' absolute speed and overall efficiency. In\nthis paper, we introduce a high-frequency pneumatic oscillator (HIPO) to\novercome these challenges. Through a collision-induced phase resetting\nmechanism, our HIPO leverages event-based nonlinearity to trigger\nself-oscillation of pneumatic actuator, which positively utilizes intrinsic\ncharacteristics of materials. This enables the system to spontaneously generate\nperiodic control signals and directly produce motion responses, eliminating the\nneed for incorporating external actuation components. By efficiently and\nrapidly converting internal energy of airflow into the kinetic energy of\nrobots, HIPO achieves a frequency of up to 20 Hz. Furthermore, we demonstrate\nthe versatility and high-performance capabilities of HIPO through bio-inspired\nrobots: an insect-like fast-crawler (with speeds up to 50.27 cm/s), a\nhigh-frequency butterfly-like wing-flapper, and a maneuverable duck-like\nswimmer. 
By eliminating external components and seamlessly fusing signal\ngeneration, energy conversion, and motion output, HIPO unleashes rapid and\nefficient motion, unlocking potential for high-performance soft robotics.\n","authors":["Longchuan Li","Shuqian He","Qiukai Qi","Ye Cui","Cong Yan","Kaige Jiang","Shuai Kang","Isao T. Tokuda","Zhongkui Wang","Shugen Ma","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2411.07588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07573v1","updated":"2024-11-12T06:21:47Z","published":"2024-11-12T06:21:47Z","title":"Robotic Control Optimization Through Kernel Selection in Safe Bayesian\n Optimization","summary":" Control system optimization has long been a fundamental challenge in\nrobotics. While recent advancements have led to the development of control\nalgorithms that leverage learning-based approaches, such as SafeOpt, to\noptimize single feedback controllers, scaling these methods to high-dimensional\ncomplex systems with multiple controllers remains an open problem. In this\npaper, we propose a novel learning-based control optimization method, which\nenhances the additive Gaussian process-based Safe Bayesian Optimization\nalgorithm to efficiently tackle high-dimensional problems through kernel\nselection. We use PID controller optimization in drones as a representative\nexample and test the method on Safe Control Gym, a benchmark designed for\nevaluating safe control techniques. 
We show that the proposed method provides a\nmore efficient and optimal solution for high-dimensional control optimization\nproblems, demonstrating significant improvements over existing techniques.\n","authors":["Lihao Zheng","Hongxuan Wang","Xiaocong Li","Jun Ma","Prahlad Vadakkepat"],"pdf_url":"https://arxiv.org/pdf/2411.07573v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Robotics and\n Biomimetics (ROBIO)"},{"id":"http://arxiv.org/abs/2411.06087v2","updated":"2024-11-12T05:40:38Z","published":"2024-11-09T06:39:44Z","title":"Cross-Domain Transfer Learning using Attention Latent Features for\n Multi-Agent Trajectory Prediction","summary":" With the advancements of sensor hardware, traffic infrastructure and deep\nlearning architectures, trajectory prediction of vehicles has established a\nsolid foundation in intelligent transportation systems. However, existing\nsolutions are often tailored to specific traffic networks at particular time\nperiods. Consequently, deep learning models trained on one network may struggle\nto generalize effectively to unseen networks. To address this, we proposed a\nnovel spatial-temporal trajectory prediction framework that performs\ncross-domain adaption on the attention representation of a Transformer-based\nmodel. A graph convolutional network is also integrated to construct dynamic\ngraph feature embeddings that accurately model the complex spatial-temporal\ninteractions between the multi-agent vehicles across multiple traffic domains.\nThe proposed framework is validated on two case studies involving the\ncross-city and cross-period settings. 
Experimental results show that our\nproposed framework achieves superior trajectory prediction and domain\nadaptation performances over the state-of-the-art models.\n","authors":["Jia Quan Loh","Xuewen Luo","Fan Ding","Hwa Hui Tew","Junn Yong Loo","Ze Yang Ding","Susilawati Susilawati","Chee Pin Tan"],"pdf_url":"https://arxiv.org/pdf/2411.06087v2.pdf","comment":"Accepted at the IEEE International Conference on Systems, Man, and\n Cybernetics 2024"},{"id":"http://arxiv.org/abs/2411.07551v1","updated":"2024-11-12T04:59:37Z","published":"2024-11-12T04:59:37Z","title":"SP-VIO: Robust and Efficient Filter-Based Visual Inertial Odometry with\n State Transformation Model and Pose-Only Visual Description","summary":" Due to the advantages of high computational efficiency and small memory\nrequirements, filter-based visual inertial odometry (VIO) has a good\napplication prospect in miniaturized and payload-constrained embedded systems.\nHowever, the filter-based method has the problem of insufficient accuracy. To\nthis end, we propose the State transformation and Pose-only VIO (SP-VIO) by\nrebuilding the state and measurement models, and considering further visual\ndeprived conditions. In detail, we first proposed a system model based on the\ndouble state transformation extended Kalman filter (DST-EKF), which has been\nproven to have better observability and consistency than the models based on\nextended Kalman filter (EKF) and state transformation extended Kalman filter\n(ST-EKF). Secondly, to reduce the influence of linearization error caused by\ninaccurate 3D reconstruction, we adopt the Pose-only (PO) theory to decouple\nthe measurement model from 3D features. 
Moreover, to deal with visual deprived\nconditions, we propose a double state transformation Rauch-Tung-Striebel\n(DST-RTS) backtracking method to optimize motion trajectories during visual\ninterruption.\n Experiments on public (EuRoC, Tum-VI, KITTI) and personal datasets show that\nSP-VIO has better accuracy and efficiency than state-of-the-art (SOTA) VIO\nalgorithms, and has better robustness under visual deprived conditions.\n","authors":["Xueyu Du","Chengjun Ji","Lilian Zhang","Xinchan Luo","Huaiyi Zhang","Maosong Wang","Wenqi Wu","Jun Mao"],"pdf_url":"https://arxiv.org/pdf/2411.07551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07550v1","updated":"2024-11-12T04:58:51Z","published":"2024-11-12T04:58:51Z","title":"Learning Autonomous Docking Operation of Fully Actuated Autonomous\n Surface Vessel from Expert data","summary":" This paper presents an approach for autonomous docking of a fully actuated\nautonomous surface vessel using expert demonstration data. We frame the docking\nproblem as an imitation learning task and employ inverse reinforcement learning\n(IRL) to learn a reward function from expert trajectories. A two-stage neural\nnetwork architecture is implemented to incorporate both environmental context\nfrom sensors and vehicle kinematics into the reward function. 
The learned\nreward is then used with a motion planner to generate docking trajectories.\nExperiments in simulation demonstrate the effectiveness of this approach in\nproducing human-like docking behaviors across different environmental\nconfigurations.\n","authors":["Akash Vijayakumar","Atmanand M A","Abhilash Somayajula"],"pdf_url":"https://arxiv.org/pdf/2411.07550v1.pdf","comment":"5 pages, 8 figures, IEEE Oceans Halifax 2024 Conference, Presented in\n September 2024 in IEEE Oceans Conference in Halifax, Canada as a Student\n Poster"},{"id":"http://arxiv.org/abs/2404.09406v3","updated":"2024-11-12T04:37:47Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Marine surveys by robotic underwater and surface vehicles result in\nsubstantial quantities of coral reef imagery, however labeling these images is\nexpensive and time-consuming for domain experts. Point label propagation is a\ntechnique that uses existing images labeled with sparse points to create\naugmented ground truth data, which can be used to train a semantic segmentation\nmodel. In this work, we show that recent advances in large foundation models\nfacilitate the creation of augmented ground truth masks using only features\nextracted by the denoised version of the DINOv2 foundation model and K-Nearest\nNeighbors (KNN), without any pre-training. For images with extremely sparse\nlabels, we present a labeling method based on human-in-the-loop principles,\nwhich greatly enhances annotation efficiency: in the case that there are 5\npoint labels per image, our human-in-the-loop method outperforms the prior\nstate-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9%\nand 18.3% if there are 10 point labels. When human-in-the-loop labeling is not\navailable, using the denoised DINOv2 features with a KNN still improves on the\nprior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid\npoints). 
On the semantic segmentation task, we outperform the prior\nstate-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5\npoint labels are used for point label propagation. Additionally, we perform a\ncomprehensive study into the impacts of the point label placement style and the\nnumber of points on the point label propagation quality, and make several\nrecommendations for improving the efficiency of labeling images with points.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v3.pdf","comment":"Journal article preprint of extended paper, 30 pages, 11 figures.\n Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on\n Learning with Limited Labelled Data for Image and Video Understanding\n (L3D-IVU)"},{"id":"http://arxiv.org/abs/2410.10621v2","updated":"2024-11-12T04:34:32Z","published":"2024-10-14T15:25:55Z","title":"Traversability-Aware Legged Navigation by Learning from Real-World\n Visual Data","summary":" The enhanced mobility brought by legged locomotion empowers quadrupedal\nrobots to navigate through complex and unstructured environments. However,\noptimizing agile locomotion while accounting for the varying energy costs of\ntraversing different terrains remains an open challenge. Most previous work\nfocuses on planning trajectories with traversability cost estimation based on\nhuman-labeled environmental features. However, this human-centric approach is\ninsufficient because it does not account for the varying capabilities of the\nrobot locomotion controllers over challenging terrains. To address this, we\ndevelop a novel traversability estimator in a robot-centric manner, based on\nthe value function of the robot's locomotion controller. This estimator is\nintegrated into a new learning-based RGBD navigation framework. 
The framework\nemploys multiple training stages to develop a planner that guides the robot in\navoiding obstacles and hard-to-traverse terrains while reaching its goals. The\ntraining of the navigation planner is directly performed in the real world\nusing a sample efficient reinforcement learning method that utilizes both\nonline data and offline datasets. Through extensive benchmarking, we\ndemonstrate that the proposed framework achieves the best performance in\naccurate traversability cost estimation and efficient learning from multi-modal\ndata (including the robot's color and depth vision, as well as proprioceptive\nfeedback) for real-world training. Using the proposed method, a quadrupedal\nrobot learns to perform traversability-aware navigation through trial and error\nin various real-world environments with challenging terrains that are difficult\nto classify using depth vision alone. Moreover, the robot demonstrates the\nability to generalize the learned navigation skills to unseen scenarios. Video\ncan be found at https://youtu.be/RSqnIWZ1qks.\n","authors":["Hongbo Zhang","Zhongyu Li","Xuanqi Zeng","Laura Smith","Kyle Stachowicz","Dhruv Shah","Linzhu Yue","Zhitao Song","Weipeng Xia","Sergey Levine","Koushil Sreenath","Yun-hui Liu"],"pdf_url":"https://arxiv.org/pdf/2410.10621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01652v2","updated":"2024-11-12T04:33:26Z","published":"2024-09-03T06:45:22Z","title":"ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for\n Robotic Manipulation","summary":" Representing robotic manipulation tasks as constraints that associate the\nrobot and the environment is a promising way to encode desired robot behaviors.\nHowever, it remains unclear how to formulate the constraints such that they are\n1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable\nby off-the-shelf solvers to produce robot actions in real-time. 
In this work,\nwe introduce Relational Keypoint Constraints (ReKep), a visually-grounded\nrepresentation for constraints in robotic manipulation. Specifically, ReKep is\nexpressed as Python functions mapping a set of 3D keypoints in the environment\nto a numerical cost. We demonstrate that by representing a manipulation task as\na sequence of Relational Keypoint Constraints, we can employ a hierarchical\noptimization procedure to solve for robot actions (represented by a sequence of\nend-effector poses in SE(3)) with a perception-action loop at a real-time\nfrequency. Furthermore, in order to circumvent the need for manual\nspecification of ReKep for each new task, we devise an automated procedure that\nleverages large vision models and vision-language models to produce ReKep from\nfree-form language instructions and RGB-D observations. We present system\nimplementations on a wheeled single-arm platform and a stationary dual-arm\nplatform that can perform a large variety of manipulation tasks, featuring\nmulti-stage, in-the-wild, bimanual, and reactive behaviors, all without\ntask-specific data or environment models. Website at\nhttps://rekep-robot.github.io/.\n","authors":["Wenlong Huang","Chen Wang","Yunzhu Li","Ruohan Zhang","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2409.01652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07534v1","updated":"2024-11-12T04:19:25Z","published":"2024-11-12T04:19:25Z","title":"Effective Virtual Reality Teleoperation of an Upper-body Humanoid with\n Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision\n Avoidance","summary":" We present an approach for retartgeting off-the-shelf Virtual Reality (VR)\ntrackers to effectively teleoperate an upper-body humanoid while ensuring\nself-collision-free motions. Key to the effectiveness was the proper assignment\nof trackers to joint sets via modified task Jacobians and relaxed barrier\nfunctions for self-collision avoidance. 
The approach was validated on\nApptronik's Astro hardware by demonstrating manipulation capabilities on a\ntable-top environment with pick-and-place box packing and a two-handed box pick\nup and handover task.\n","authors":["Steven Jens Jorgensen","Ravi Bhadeshiya"],"pdf_url":"https://arxiv.org/pdf/2411.07534v1.pdf","comment":"XR & Robotics Workshop, IROS 2022"},{"id":"http://arxiv.org/abs/2312.15364v2","updated":"2024-11-12T03:00:07Z","published":"2023-12-23T22:27:40Z","title":"WildScenes: A Benchmark for 2D and 3D Semantic Segmentation in\n Large-scale Natural Environments","summary":" Recent progress in semantic scene understanding has primarily been enabled by\nthe availability of semantically annotated bi-modal (camera and LiDAR) datasets\nin urban environments. However, such annotated datasets are also needed for\nnatural, unstructured environments to enable semantic perception for\napplications, including conservation, search and rescue, environment\nmonitoring, and agricultural automation. Therefore, we introduce $WildScenes$,\na bi-modal benchmark dataset consisting of multiple large-scale, sequential\ntraversals in natural environments, including semantic annotations in\nhigh-resolution 2D images and dense 3D LiDAR point clouds, and accurate 6-DoF\npose information. The data is (1) trajectory-centric with accurate localization\nand globally aligned point clouds, (2) calibrated and synchronized to support\nbi-modal training and inference, and (3) containing different natural\nenvironments over 6 months to support research on domain adaptation. Our 3D\nsemantic labels are obtained via an efficient, automated process that transfers\nthe human-annotated 2D labels from multiple views into 3D point cloud\nsequences, thus circumventing the need for expensive and time-consuming human\nannotation in 3D. 
We introduce benchmarks on 2D and 3D semantic segmentation\nand evaluate a variety of recent deep-learning techniques to demonstrate the\nchallenges in semantic segmentation in natural environments. We propose\ntrain-val-test splits for standard benchmarks as well as domain adaptation\nbenchmarks and utilize an automated split generation technique to ensure the\nbalance of class label distributions. The $WildScenes$ benchmark webpage is\nhttps://csiro-robotics.github.io/WildScenes, and the data is publicly available\nat https://data.csiro.au/collection/csiro:61541 .\n","authors":["Kavisha Vidanapathirana","Joshua Knights","Stephen Hausler","Mark Cox","Milad Ramezani","Jason Jooste","Ethan Griffiths","Shaheer Mohamed","Sridha Sridharan","Clinton Fookes","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2312.15364v2.pdf","comment":"Accepted in the The International Journal of Robotics Research (IJRR)"},{"id":"http://arxiv.org/abs/2410.01230v2","updated":"2024-11-12T00:24:55Z","published":"2024-10-02T04:16:34Z","title":"Towards Efficient Motion Planning for UAVs: Lazy A* Search with Motion\n Primitives","summary":" Search-based motion planning algorithms have been widely utilized for\nunmanned aerial vehicles (UAVs). However, deploying these algorithms on real\nUAVs faces challenges due to limited onboard computational resources. The\nalgorithms struggle to find solutions in high-dimensional search spaces and\nrequire considerable time to ensure that the trajectories are dynamically\nfeasible. This paper incorporates the lazy search concept into search-based\nplanning algorithms to address the critical issue of real-time planning for\ncollision-free and dynamically feasible trajectories on UAVs. 
We demonstrate\nthat the lazy search motion planning algorithm can efficiently find optimal\ntrajectories and significantly improve computational efficiency.\n","authors":["Wentao Wang","Yi Shen","Kaiyang Chen","Kaifan Lu"],"pdf_url":"https://arxiv.org/pdf/2410.01230v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08231v1","updated":"2024-11-12T22:53:09Z","published":"2024-11-12T22:53:09Z","title":"Enhanced Monocular Visual Odometry with AR Poses and Integrated INS-GPS\n for Robust Localization in Urban Environments","summary":" This paper introduces a cost effective localization system combining\nmonocular visual odometry , augmented reality (AR) poses, and integrated\nINS-GPS data. We address monocular VO scale factor issues using AR poses and\nenhance accuracy with INS and GPS data, filtered through an Extended Kalman\nFilter . Our approach, tested using manually annotated trajectories from Google\nStreet View, achieves an RMSE of 1.529 meters over a 1 km track. Future work\nwill focus on real-time mobile implementation and further integration of\nvisual-inertial odometry for robust localization. This method offers lane-level\naccuracy with minimal hardware, making advanced navigation more accessible.\n","authors":["Ankit Shaw"],"pdf_url":"https://arxiv.org/pdf/2411.08231v1.pdf","comment":"The copyright of this paper would be given to IEEE after \"acceptance\n of paper by IEEE\""},{"id":"http://arxiv.org/abs/2411.08169v1","updated":"2024-11-12T20:26:11Z","published":"2024-11-12T20:26:11Z","title":"Point Cloud Context Analysis for Rehabilitation Grasping Assistance","summary":" Controlling hand exoskeletons for assisting impaired patients in grasping\ntasks is challenging because it is difficult to infer user intent. We\nhypothesize that majority of daily grasping tasks fall into a small set of\ncategories or modes which can be inferred through real-time analysis of\nenvironmental geometry from 3D point clouds. 
This paper presents a low-cost,\nreal-time system for semantic image labeling of household scenes with the\nobjective to inform and assist activities of daily living. The system consists\nof a miniature depth camera, an inertial measurement unit and a microprocessor.\nIt is able to achieve 85% or higher accuracy at classification of predefined\nmodes while processing complex 3D scenes at over 30 frames per second. Within\neach mode it can detect and localize graspable objects. Grasping points can be\ncorrectly estimated on average within 1 cm for simple object geometries. The\nsystem has potential applications in robotic-assisted rehabilitation as well as\nmanual task assistance.\n","authors":["Jackson M. Steinkamp","Laura J. Brattain","Conor J. Walsh","Robert D. Howe"],"pdf_url":"https://arxiv.org/pdf/2411.08169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08163v1","updated":"2024-11-12T20:15:13Z","published":"2024-11-12T20:15:13Z","title":"Emergent functional dynamics of link-bots","summary":" Synthetic active collectives, composed of many nonliving individuals capable\nof cooperative changes in group shape and dynamics, hold promise for practical\napplications and for the elucidation of guiding principles of natural\ncollectives. However, the design of collective robotic systems that operate\neffectively without intelligence or complex control at either the individual or\ngroup level is challenging. We investigate how simple steric interaction\nconstraints between active individuals produce a versatile active system with\npromising functionality. Here we introduce the link-bot: a V-shape-based,\nsingle-stranded chain composed of active bots whose dynamics are defined by its\ngeometric link constraints, allowing it to possess scale- and processing-free\nprogrammable collective behaviors. A variety of emergent properties arise from\nthis dynamic system, including locomotion, navigation, transportation, and\ncompetitive or cooperative interactions. 
Through the control of a few link\nparameters, link-bots show rich usefulness by performing a variety of divergent\ntasks, including traversing or obstructing narrow spaces, passing by or\nenclosing objects, and propelling loads in both forward and backward\ndirections. The reconfigurable nature of the link-bot suggests that our\napproach may significantly contribute to the development of programmable soft\nrobotic systems with minimal information and materials at any scale.\n","authors":["Kyungmin Son","Kimberly Bowal","L. Mahadevan","Ho-Young Kim"],"pdf_url":"https://arxiv.org/pdf/2411.08163v1.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08144v1","updated":"2024-11-12T19:42:44Z","published":"2024-11-12T19:42:44Z","title":"Visual Tracking with Intermittent Visibility: Switched Control Design\n and Implementation","summary":" This paper addresses the problem of visual target tracking in scenarios where\na pursuer may experience intermittent loss of visibility of the target. The\ndesign of a Switched Visual Tracker (SVT) is presented which aims to meet the\ncompeting requirements of maintaining both proximity and visibility. SVT\nalternates between a visual tracking mode for following the target, and a\nrecovery mode for regaining visual contact when the target falls out of sight.\nWe establish the stability of SVT by extending the average dwell time theorem\nfrom switched systems theory, which may be of independent interest. Our\nimplementation of SVT on an Agilicious drone [1] illustrates its effectiveness\non tracking various target trajectories: it reduces the average tracking error\nby up to 45% and significantly improves visibility duration compared to a\nbaseline algorithm. The results show that our approach effectively handles\nintermittent vision loss, offering enhanced robustness and adaptability for\nreal-world autonomous missions. 
Additionally, we demonstrate how the stability\nanalysis provides valuable guidance for selecting parameters, such as tracking\nspeed and recovery distance, to optimize the SVT's performance.\n","authors":["Yangge Li","Benjamin C Yang","Sayan Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.08144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08136v1","updated":"2024-11-12T19:27:12Z","published":"2024-11-12T19:27:12Z","title":"Simultaneous Locomotion Mode Classification and Continuous Gait Phase\n Estimation for Transtibial Prostheses","summary":" Recognizing and identifying human locomotion is a critical step to ensuring\nfluent control of wearable robots, such as transtibial prostheses. In\nparticular, classifying the intended locomotion mode and estimating the gait\nphase are key. In this work, a novel, interpretable, and computationally\nefficient algorithm is presented for simultaneously predicting locomotion mode\nand gait phase. Using able-bodied (AB) and transtibial prosthesis (PR) data,\nseven locomotion modes are tested including slow, medium, and fast level\nwalking (0.6, 0.8, and 1.0 m/s), ramp ascent/descent (5 degrees), and stair\nascent/descent (20 cm height). Overall classification accuracy was 99.1$\\%$ and\n99.3$\\%$ for the AB and PR conditions, respectively. The average gait phase\nerror across all data was less than 4$\\%$. Exploiting the structure of the\ndata, computational efficiency reached 2.91 $\\mu$s per time step. The time\ncomplexity of this algorithm scales as $O(N\\cdot M)$ with the number of\nlocomotion modes $M$ and samples per gait cycle $N$. 
This efficiency and high\naccuracy could accommodate a much larger set of locomotion modes ($\\sim$ 700 on\nOpen-Source Leg Prosthesis) to handle the wide range of activities pursued by\nindividuals during daily living.\n","authors":["Ryan Posh","Shenggao Li","Patrick Wensing"],"pdf_url":"https://arxiv.org/pdf/2411.08136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05309v2","updated":"2024-11-12T19:01:55Z","published":"2024-06-08T00:54:13Z","title":"CoBL-Diffusion: Diffusion-Based Conditional Robot Planning in Dynamic\n Environments Using Control Barrier and Lyapunov Functions","summary":" Equipping autonomous robots with the ability to navigate safely and\nefficiently around humans is a crucial step toward achieving trusted robot\nautonomy. However, generating robot plans while ensuring safety in dynamic\nmulti-agent environments remains a key challenge. Building upon recent work on\nleveraging deep generative models for robot planning in static environments,\nthis paper proposes CoBL-Diffusion, a novel diffusion-based safe robot planner\nfor dynamic environments. CoBL-Diffusion uses Control Barrier and Lyapunov\nfunctions to guide the denoising process of a diffusion model, iteratively\nrefining the robot control sequence to satisfy the safety and stability\nconstraints. We demonstrate the effectiveness of the proposed model using two\nsettings: a synthetic single-agent environment and a real-world pedestrian\ndataset. 
Our results show that CoBL-Diffusion generates smooth trajectories\nthat enable the robot to reach goal locations while maintaining a low collision\nrate with dynamic obstacles.\n","authors":["Kazuki Mizuta","Karen Leung"],"pdf_url":"https://arxiv.org/pdf/2406.05309v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.08037v1","updated":"2024-11-12T18:59:59Z","published":"2024-11-12T18:59:59Z","title":"Material Transforms from Disentangled NeRF Representations","summary":" In this paper, we first propose a novel method for transferring material\ntransformations across different scenes. Building on disentangled Neural\nRadiance Field (NeRF) representations, our approach learns to map Bidirectional\nReflectance Distribution Functions (BRDF) from pairs of scenes observed in\nvarying conditions, such as dry and wet. The learned transformations can then\nbe applied to unseen scenes with similar materials, therefore effectively\nrendering the transformation learned with an arbitrary level of intensity.\nExtensive experiments on synthetic scenes and real-world objects validate the\neffectiveness of our approach, showing that it can learn various\ntransformations such as wetness, painting, coating, etc. Our results highlight\nnot only the versatility of our method but also its potential for practical\napplications in computer graphics. 
We publish our method implementation, along\nwith our synthetic/real datasets on\nhttps://github.com/astra-vision/BRDFTransform\n","authors":["Ivan Lopes","Jean-François Lalonde","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2411.08037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08034v1","updated":"2024-11-12T18:59:35Z","published":"2024-11-12T18:59:35Z","title":"Scaling Properties of Diffusion Models for Perceptual Tasks","summary":" In this paper, we argue that iterative computation with diffusion models\noffers a powerful paradigm for not only generation but also visual perception\ntasks. We unify tasks such as depth estimation, optical flow, and segmentation\nunder image-to-image translation, and show how diffusion models benefit from\nscaling training and test-time compute for these perception tasks. Through a\ncareful analysis of these scaling behaviors, we present various techniques to\nefficiently train diffusion models for visual perception tasks. Our models\nachieve improved or comparable performance to state-of-the-art methods using\nsignificantly less data and compute. To use our code and models, see\nhttps://scaling-diffusion-perception.github.io .\n","authors":["Rahul Ravishankar","Zeeshan Patel","Jathushan Rajasegaran","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08033v1","updated":"2024-11-12T18:59:32Z","published":"2024-11-12T18:59:32Z","title":"GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D\n Generation","summary":" While 3D content generation has advanced significantly, existing methods\nstill face challenges with input formats, latent space design, and output\nrepresentations. This paper introduces a novel 3D generation framework that\naddresses these challenges, offering scalable, high-quality 3D generation with\nan interactive Point Cloud-structured Latent space. 
Our framework employs a\nVariational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal)\nrenderings as input, using a unique latent space design that preserves 3D shape\ninformation, and incorporates a cascaded latent diffusion model for improved\nshape-texture disentanglement. The proposed method, GaussianAnything, supports\nmulti-modal conditional 3D generation, allowing for point cloud, caption, and\nsingle/multi-view image inputs. Notably, the newly proposed latent space\nnaturally enables geometry-texture disentanglement, thus allowing 3D-aware\nediting. Experimental results demonstrate the effectiveness of our approach on\nmultiple datasets, outperforming existing methods in both text- and\nimage-conditioned 3D generation.\n","authors":["Yushi Lan","Shangchen Zhou","Zhaoyang Lyu","Fangzhou Hong","Shuai Yang","Bo Dai","Xingang Pan","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2411.08033v1.pdf","comment":"project page: https://nirvanalan.github.io/projects/GA/"},{"id":"http://arxiv.org/abs/2411.08027v1","updated":"2024-11-12T18:56:58Z","published":"2024-11-12T18:56:58Z","title":"LLMPhy: Complex Physical Reasoning Using Large Language Models and World\n Models","summary":" Physical reasoning is an important skill needed for robotic agents when\noperating in the real world. However, solving such reasoning problems often\ninvolves hypothesizing and reflecting over complex multi-body interactions\nunder the effect of a multitude of physical forces and thus learning all such\ninteractions poses a significant hurdle for state-of-the-art machine learning\nframeworks, including large language models (LLMs). To study this problem, we\npropose a new physical reasoning task and a dataset, dubbed TraySim. 
Our task\ninvolves predicting the dynamics of several objects on a tray that is given an\nexternal impact -- the domino effect of the ensued object interactions and\ntheir dynamics thus offering a challenging yet controlled setup, with the goal\nof reasoning being to infer the stability of the objects after the impact. To\nsolve this complex physical reasoning task, we present LLMPhy, a zero-shot\nblack-box optimization framework that leverages the physics knowledge and\nprogram synthesis abilities of LLMs, and synergizes these abilities with the\nworld models built into modern physics engines. Specifically, LLMPhy uses an\nLLM to generate code to iteratively estimate the physical hyperparameters of\nthe system (friction, damping, layout, etc.) via an implicit\nanalysis-by-synthesis approach using a (non-differentiable) simulator in the\nloop and uses the inferred parameters to imagine the dynamics of the scene\ntowards solving the reasoning task. To show the effectiveness of LLMPhy, we\npresent experiments on our TraySim dataset to predict the steady-state poses of\nthe objects. Our results show that the combination of the LLM and the physics\nengine leads to state-of-the-art zero-shot physical reasoning performance,\nwhile demonstrating superior convergence against standard black-box\noptimization methods and better estimation of the physical parameters.\n","authors":["Anoop Cherian","Radu Corcodel","Siddarth Jain","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2411.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17245v6","updated":"2024-11-12T18:50:19Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advances in real-time neural rendering using point-based techniques\nhave enabled broader adoption of 3D representations. 
However, foundational\napproaches like 3D Gaussian Splatting impose substantial storage overhead, as\nStructure-from-Motion (SfM) points can grow to millions, often requiring\ngigabyte-level disk space for a single unbounded scene. This growth presents\nscalability challenges and hinders splatting efficiency. To address this, we\nintroduce LightGaussian, a method for transforming 3D Gaussians into a more\ncompact format. Inspired by Network Pruning, LightGaussian identifies Gaussians\nwith minimal global significance on scene reconstruction, and applies a pruning\nand recovery process to reduce redundancy while preserving visual quality.\nKnowledge distillation and pseudo-view augmentation then transfer spherical\nharmonic coefficients to a lower degree, yielding compact representations.\nGaussian Vector Quantization, based on each Gaussian's global significance,\nfurther lowers bitwidth with minimal accuracy loss. LightGaussian achieves an\naverage 15x compression rate while boosting FPS from 144 to 237 within the\n3D-GS framework, enabling efficient complex scene representation on the\nMip-NeRF 360 and Tank & Temple datasets. The proposed Gaussian pruning approach\nis also adaptable to other 3D representations (e.g., Scaffold-GS),\ndemonstrating strong generalization capabilities.\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v6.pdf","comment":"NeurIPS 2024, Project page: https://lightgaussian.github.io/"},{"id":"http://arxiv.org/abs/2411.08017v1","updated":"2024-11-12T18:49:06Z","published":"2024-11-12T18:49:06Z","title":"Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model\n with Compact Wavelet Encodings","summary":" Large-scale 3D generative models require substantial computational resources\nyet often fall short in capturing fine details and complex geometries at high\nresolutions. 
We attribute this limitation to the inefficiency of current\nrepresentations, which lack the compactness required to model the generative\nmodels effectively. To address this, we introduce a novel approach called\nWavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based,\ncompact latent encodings. Specifically, we compress a $256^3$ signed distance\nfield into a $12^3 \\times 4$ latent grid, achieving an impressive 2427x\ncompression ratio with minimal loss of detail. This high level of compression\nallows our method to efficiently train large-scale generative networks without\nincreasing the inference time. Our models, both conditional and unconditional,\ncontain approximately one billion parameters and successfully generate\nhigh-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid\ninference, producing shapes within two to four seconds depending on the\ncondition, despite the model's scale. We demonstrate state-of-the-art\nperformance across multiple datasets, with significant improvements in\ngeneration quality, diversity, and computational efficiency. We open-source our\ncode and, to the best of our knowledge, release the largest pretrained 3D\ngenerative models across different modalities.\n","authors":["Aditya Sanghi","Aliasghar Khani","Pradyumna Reddy","Arianna Rampini","Derek Cheung","Kamal Rahimi Malekshan","Kanika Madan","Hooman Shayani"],"pdf_url":"https://arxiv.org/pdf/2411.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20099v2","updated":"2024-11-12T18:46:33Z","published":"2024-06-28T17:59:51Z","title":"Odd-One-Out: Anomaly Detection by Comparing with Neighbors","summary":" This paper introduces a novel anomaly detection (AD) problem that focuses on\nidentifying `odd-looking' objects relative to the other instances in a given\nscene. 
In contrast to the traditional AD benchmarks, anomalies in our task are\nscene-specific, defined by the regular instances that make up the majority.\nSince object instances may be only partly visible from a single viewpoint, our\nsetting employs multiple views of each scene as input. To provide a testbed for\nfuture research in this task, we introduce two benchmarks, ToysAD-8K and\nPartsAD-15K. We propose a novel method that constructs 3D object-centric\nrepresentations from multiple 2D views for each instance and detects the\nanomalous ones through a cross-instance comparison. We rigorously analyze our\nmethod quantitatively and qualitatively on the presented benchmarks.\n","authors":["Ankan Bhunia","Changjian Li","Hakan Bilen"],"pdf_url":"https://arxiv.org/pdf/2406.20099v2.pdf","comment":"Codes & Dataset at https://github.com/VICO-UoE/OddOneOutAD"},{"id":"http://arxiv.org/abs/2411.08014v1","updated":"2024-11-12T18:44:13Z","published":"2024-11-12T18:44:13Z","title":"Artistic Neural Style Transfer Algorithms with Activation Smoothing","summary":" The works of Gatys et al. demonstrated the capability of Convolutional Neural\nNetworks (CNNs) in creating artistic style images. This process of transferring\ncontent images in different styles is called Neural Style Transfer (NST). In\nthis paper, we re-implement image-based NST, fast NST, and arbitrary NST. We\nalso explore to utilize ResNet with activation smoothing in NST. 
Extensive\nexperimental results demonstrate that smoothing transformation can greatly\nimprove the quality of stylization results.\n","authors":["Xiangtian Li","Han Cao","Zhaoyang Zhang","Jiacheng Hu","Yuhui Jin","Zihao Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08014v1.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2411.07976v1","updated":"2024-11-12T17:55:39Z","published":"2024-11-12T17:55:39Z","title":"DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring","summary":" Coronary artery disease (CAD), one of the most common cause of mortality in\nthe world. Coronary artery calcium (CAC) scoring using computed tomography (CT)\nis key for risk assessment to prevent coronary disease. Previous studies on\nrisk assessment and calcification detection in CT scans primarily use\napproaches based on UNET architecture, frequently implemented on pre-built\nmodels. However, these models are limited by the availability of annotated CT\nscans containing CAC and suffering from imbalanced dataset, decreasing\nperformance of CAC segmentation and scoring. In this study, we extend this\napproach by incorporating the self-supervised learning (SSL) technique of DINO\n(self-distillation with no labels) to eliminate limitations of scarce annotated\ndata in CT scans. The DINO model's ability to train without requiring CAC area\nannotations enhances its robustness in generating distinct features. The DINO\nmodel is trained on to focus specifically on calcified areas by using labels,\naiming to generate features that effectively capture and highlight key\ncharacteristics. The label-guided DINO (DINO-LG) enhances classification by\ndistinguishing CT slices that contain calcification from those that do not,\nperforming 57% better than the standard DINO model in this task. CAC scoring\nand segmentation tasks are performed by a basic U-NET architecture, fed\nspecifically with CT slices containing calcified areas as identified by the\nDINO-LG model. 
This targeted identification performed by DINO-LG model improves\nCAC segmentation performance by approximately 10% and significant increase in\nCAC scoring accuracy.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Caner Ozcan"],"pdf_url":"https://arxiv.org/pdf/2411.07976v1.pdf","comment":"Developed by Center for Applied Artificial Intelligence (CAAI),\n University of Kentucky"},{"id":"http://arxiv.org/abs/2411.07975v1","updated":"2024-11-12T17:55:10Z","published":"2024-11-12T17:55:10Z","title":"JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified\n Multimodal Understanding and Generation","summary":" We present JanusFlow, a powerful framework that unifies image understanding\nand generation in a single model. JanusFlow introduces a minimalist\narchitecture that integrates autoregressive language models with rectified\nflow, a state-of-the-art method in generative modeling. Our key finding\ndemonstrates that rectified flow can be straightforwardly trained within the\nlarge language model framework, eliminating the need for complex architectural\nmodifications. To further improve the performance of our unified model, we\nadopt two key strategies: (i) decoupling the understanding and generation\nencoders, and (ii) aligning their representations during unified training.\nExtensive experiments show that JanusFlow achieves comparable or superior\nperformance to specialized models in their respective domains, while\nsignificantly outperforming existing unified approaches across standard\nbenchmarks. 
This work represents a step toward more efficient and versatile\nvision-language models.\n","authors":["Yiyang Ma","Xingchao Liu","Xiaokang Chen","Wen Liu","Chengyue Wu","Zhiyu Wu","Zizheng Pan","Zhenda Xie","Haowei Zhang","Xingkai yu","Liang Zhao","Yisong Wang","Jiaying Liu","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2411.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07956v1","updated":"2024-11-12T17:31:51Z","published":"2024-11-12T17:31:51Z","title":"Commissioning An All-Sky Infrared Camera Array for Detection Of Airborne\n Objects","summary":" To date there is little publicly available scientific data on Unidentified\nAerial Phenomena (UAP) whose properties and kinematics purportedly reside\noutside the performance envelope of known phenomena. To address this\ndeficiency, the Galileo Project is designing, building, and commissioning a\nmulti-modal ground-based observatory to continuously monitor the sky and\nconduct a rigorous long-term aerial census of all aerial phenomena, including\nnatural and human-made. One of the key instruments is an all-sky infrared\ncamera array using eight uncooled long-wave infrared FLIR Boson 640 cameras.\nTheir calibration includes a novel extrinsic calibration method using airplane\npositions from Automatic Dependent Surveillance-Broadcast (ADS-B) data. We\nestablish a first baseline for the system performance over five months of field\noperation, using a real-world dataset derived from ADS-B data, synthetic 3-D\ntrajectories, and a hand-labelled real-world dataset. We report acceptance\nrates (e.g. viewable airplanes that are recorded) and detection efficiencies\n(e.g. recorded airplanes which are successfully detected) for a variety of\nweather conditions, range and aircraft size. We reconstruct $\\sim$500,000\ntrajectories of aerial objects from this commissioning period. A toy outlier\nsearch focused on large sinuosity of the 2-D reconstructed trajectories flags\nabout 16% of trajectories as outliers. 
After manual review, 144 trajectories\nremain ambiguous: they are likely mundane objects but cannot be elucidated at\nthis stage of development without distance and kinematics estimation or other\nsensor modalities. Our observed count of ambiguous outliers combined with\nsystematic uncertainties yields an upper limit of 18,271 outliers count for the\nfive-month interval at a 95% confidence level. This likelihood-based method to\nevaluate significance is applicable to all of our future outlier searches.\n","authors":["Laura Dominé","Ankit Biswas","Richard Cloete","Alex Delacroix","Andriy Fedorenko","Lucas Jacaruso","Ezra Kelderman","Eric Keto","Sarah Little","Abraham Loeb","Eric Masson","Mike Prior","Forrest Schultz","Matthew Szenher","Wes Watters","Abby White"],"pdf_url":"https://arxiv.org/pdf/2411.07956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07945v1","updated":"2024-11-12T17:17:33Z","published":"2024-11-12T17:17:33Z","title":"SimBase: A Simple Baseline for Temporal Video Grounding","summary":" This paper presents SimBase, a simple yet effective baseline for temporal\nvideo grounding. While recent advances in temporal grounding have led to\nimpressive performance, they have also driven network architectures toward\ngreater complexity, with a range of methods to (1) capture temporal\nrelationships and (2) achieve effective multimodal fusion. In contrast, this\npaper explores the question: How effective can a simplified approach be? To\ninvestigate, we design SimBase, a network that leverages lightweight,\none-dimensional temporal convolutional layers instead of complex temporal\nstructures. For cross-modal interaction, SimBase only employs an element-wise\nproduct instead of intricate multimodal fusion. Remarkably, SimBase achieves\nstate-of-the-art results on two large-scale datasets. 
As a simple yet powerful\nbaseline, we hope SimBase will spark new ideas and streamline future\nevaluations in temporal video grounding.\n","authors":["Peijun Bao","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2411.07945v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2411.07941v1","updated":"2024-11-12T17:11:18Z","published":"2024-11-12T17:11:18Z","title":"DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with\n Generative Adversarial Networks","summary":" Computed tomography (CT) provides highly detailed three-dimensional (3D)\nmedical images but is costly, time-consuming, and often inaccessible in\nintraoperative settings (Organization et al. 2011). Recent advancements have\nexplored reconstructing 3D chest volumes from sparse 2D X-rays, such as\nsingle-view or orthogonal double-view images. However, current models tend to\nprocess 2D images in a planar manner, prioritizing visual realism over\nstructural accuracy. In this work, we introduce DuoLift Generative Adversarial\nNetworks (DuoLift-GAN), a novel architecture with dual branches that\nindependently elevate 2D images and their features into 3D representations.\nThese 3D outputs are merged into a unified 3D feature map and decoded into a\ncomplete 3D chest volume, enabling richer 3D information capture. We also\npresent a masked loss function that directs reconstruction towards critical\nanatomical regions, improving structural accuracy and visual quality. 
This\npaper demonstrates that DuoLift-GAN significantly enhances reconstruction\naccuracy while achieving superior visual realism compared to existing methods.\n","authors":["Zhaoxi Zhang","Yueliang Ying"],"pdf_url":"https://arxiv.org/pdf/2411.07941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07940v1","updated":"2024-11-12T17:09:20Z","published":"2024-11-12T17:09:20Z","title":"Automatic dataset shift identification to support root cause analysis of\n AI performance drift","summary":" Shifts in data distribution can substantially harm the performance of\nclinical AI models. Hence, various methods have been developed to detect the\npresence of such shifts at deployment time. However, root causes of dataset\nshifts are varied, and the choice of shift mitigation strategies is highly\ndependent on the precise type of shift encountered at test time. As such,\ndetecting test-time dataset shift is not sufficient: precisely identifying\nwhich type of shift has occurred is critical. In this work, we propose the\nfirst unsupervised dataset shift identification framework, effectively\ndistinguishing between prevalence shift (caused by a change in the label\ndistribution), covariate shift (caused by a change in input characteristics)\nand mixed shifts (simultaneous prevalence and covariate shifts). We discuss the\nimportance of self-supervised encoders for detecting subtle covariate shifts\nand propose a novel shift detector leveraging both self-supervised encoders and\ntask model outputs for improved shift detection. 
We report promising results\nfor the proposed shift identification framework across three different imaging\nmodalities (chest radiography, digital mammography, and retinal fundus images)\non five types of real-world dataset shifts, using four large publicly available\ndatasets.\n","authors":["Mélanie Roschewitz","Raghav Mehta","Charles Jones","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2411.07940v1.pdf","comment":"Code available at\n https://github.com/biomedia-mira/shift_identification"},{"id":"http://arxiv.org/abs/2411.07936v1","updated":"2024-11-12T17:05:18Z","published":"2024-11-12T17:05:18Z","title":"Learning Disentangled Representations for Perceptual Point Cloud Quality\n Assessment via Mutual Information Minimization","summary":" No-Reference Point Cloud Quality Assessment (NR-PCQA) aims to objectively\nassess the human perceptual quality of point clouds without relying on\npristine-quality point clouds for reference. It is becoming increasingly\nsignificant with the rapid advancement of immersive media applications such as\nvirtual reality (VR) and augmented reality (AR). However, current NR-PCQA\nmodels attempt to indiscriminately learn point cloud content and distortion\nrepresentations within a single network, overlooking their distinct\ncontributions to quality information. To address this issue, we propose DisPA,\na novel disentangled representation learning framework for NR-PCQA. 
The\nframework trains a dual-branch disentanglement network to minimize mutual\ninformation (MI) between representations of point cloud content and distortion.\nSpecifically, to fully disentangle representations, the two branches adopt\ndifferent philosophies: the content-aware encoder is pretrained by a masked\nauto-encoding strategy, which can allow the encoder to capture semantic\ninformation from rendered images of distorted point clouds; the\ndistortion-aware encoder takes a mini-patch map as input, which forces the\nencoder to focus on low-level distortion patterns. Furthermore, we utilize an\nMI estimator to estimate the tight upper bound of the actual MI and further\nminimize it to achieve explicit representation disentanglement. Extensive\nexperimental results demonstrate that DisPA outperforms state-of-the-art\nmethods on multiple PCQA datasets.\n","authors":["Ziyu Shan","Yujie Zhang","Yipeng Liu","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07918v1","updated":"2024-11-12T16:50:13Z","published":"2024-11-12T16:50:13Z","title":"Isometric Transformations for Image Augmentation in Mueller Matrix\n Polarimetry","summary":" Mueller matrix polarimetry captures essential information about polarized\nlight interactions with a sample, presenting unique challenges for data\naugmentation in deep learning due to its distinct structure. While\naugmentations are an effective and affordable way to enhance dataset diversity\nand reduce overfitting, standard transformations like rotations and flips do\nnot preserve the polarization properties in Mueller matrix images. To this end,\nwe introduce a versatile simulation framework that applies physically\nconsistent rotations and flips to Mueller matrices, tailored to maintain\npolarization fidelity. 
Our experimental results across multiple datasets reveal\nthat conventional augmentations can lead to misleading results when applied to\npolarimetric data, underscoring the necessity of our physics-based approach. In\nour experiments, we first compare our polarization-specific augmentations\nagainst real-world captures to validate their physical consistency. We then\napply these augmentations in a semantic segmentation task, achieving\nsubstantial improvements in model generalization and performance. This study\nunderscores the necessity of physics-informed data augmentation for\npolarimetric imaging in deep learning (DL), paving the way for broader adoption\nand more robust applications across diverse research in the field. In\nparticular, our framework unlocks the potential of DL models for polarimetric\ndatasets with limited sample sizes. Our code implementation is available at\ngithub.com/hahnec/polar_augment.\n","authors":["Christopher Hahne","Omar Rodriguez-Nunez","Éléa Gros","Théotim Lucas","Ekkehard Hewer","Tatiana Novikova","Theoni Maragkou","Philippe Schucht","Richard McKinley"],"pdf_url":"https://arxiv.org/pdf/2411.07918v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2411.05747v3","updated":"2024-11-12T16:42:02Z","published":"2024-11-08T18:08:33Z","title":"WavShadow: Wavelet Based Shadow Segmentation and Removal","summary":" Shadow removal and segmentation remain challenging tasks in computer vision,\nparticularly in complex real world scenarios. This study presents a novel\napproach that enhances the ShadowFormer model by incorporating Masked\nAutoencoder (MAE) priors and Fast Fourier Convolution (FFC) blocks, leading to\nsignificantly faster convergence and improved performance. 
We introduce key\ninnovations: (1) integration of MAE priors trained on Places2 dataset for\nbetter context understanding, (2) adoption of Haar wavelet features for\nenhanced edge detection and multiscale analysis, and (3) implementation of a\nmodified SAM Adapter for robust shadow segmentation. Extensive experiments on\nthe challenging DESOBA dataset demonstrate that our approach achieves state of\nthe art results, with notable improvements in both convergence speed and shadow\nremoval quality.\n","authors":["Shreyans Jain","Viraj Vekaria","Karan Gandhi","Aadya Arora"],"pdf_url":"https://arxiv.org/pdf/2411.05747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07155v2","updated":"2024-11-12T16:39:29Z","published":"2024-05-12T04:18:10Z","title":"Meta-Learned Modality-Weighted Knowledge Distillation for Robust\n Multi-Modal Learning with Missing Data","summary":" In multi-modal learning, some modalities are more influential than others,\nand their absence can have a significant impact on classification/segmentation\naccuracy. Addressing this challenge, we propose a novel approach called\nMeta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables\nmulti-modal models to maintain high accuracy even when key modalities are\nmissing. MetaKD adaptively estimates the importance weight of each modality\nthrough a meta-learning process. These learned importance weights guide a\npairwise modality-weighted knowledge distillation process, allowing\nhigh-importance modalities to transfer knowledge to lower-importance ones,\nresulting in robust performance despite missing inputs. Unlike previous methods\nin the field, which are often task-specific and require significant\nmodifications, our approach is designed to work in multiple tasks (e.g.,\nsegmentation and classification) with minimal adaptation. 
Experimental results\non five prevalent datasets, including three Brain Tumor Segmentation datasets\n(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging\nInitiative (ADNI) classification dataset and the Audiovision-MNIST\nclassification dataset, demonstrate the proposed model is able to outperform\nthe compared models by a large margin.\n","authors":["Hu Wang","Salma Hassan","Yuyuan Liu","Congbo Ma","Yuanhong Chen","Yutong Xie","Mostafa Salem","Yu Tian","Jodie Avery","Louise Hull","Ian Reid","Mohammad Yaqub","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2405.07155v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07901v1","updated":"2024-11-12T16:15:25Z","published":"2024-11-12T16:15:25Z","title":"TLDR: Traffic Light Detection using Fourier Domain Adaptation in Hostile\n WeatheR","summary":" The scarcity of comprehensive datasets in the traffic light detection and\nrecognition domain and the poor performance of state-of-the-art models under\nhostile weather conditions present significant challenges. To address these\nissues, this paper proposes a novel approach by merging two widely used\ndatasets, LISA and S2TLD. The merged dataset is further processed to tackle\nclass imbalance, a common problem in this domain. This merged dataset becomes\nour source domain. Synthetic rain and fog are added to the dataset to create\nour target domain. We employ Fourier Domain Adaptation (FDA) to create a final\ndataset with a minimized domain gap between the two datasets, helping the model\ntrained on this final dataset adapt to rainy and foggy weather conditions.\nAdditionally, we explore Semi-Supervised Learning (SSL) techniques to leverage\nthe available data more effectively. Experimental results demonstrate that\nmodels trained on FDA-augmented images outperform those trained without FDA\nacross confidence-dependent and independent metrics, like mAP50, mAP50-95,\nPrecision, and Recall. 
The best-performing model, YOLOv8, achieved a Precision\nincrease of 5.1860%, Recall increase of 14.8009%, mAP50 increase of 9.5074%,\nand mAP50-95 increase of 19.5035%. On average, percentage increases of 7.6892%\nin Precision, 19.9069% in Recall, 15.8506% in mAP50, and 23.8099% in mAP50-95\nwere observed across all models, highlighting the effectiveness of FDA in\nmitigating the impact of adverse weather conditions on model performance. These\nimprovements pave the way for real-world applications where reliable\nperformance in challenging environmental conditions is critical.\n","authors":["Ishaan Gakhar","Aryesh Guha","Aryaman Gupta","Amit Agarwal","Durga Toshniwal","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2411.07901v1.pdf","comment":"Under Review at IEEE Transactions of Artificial Intelligence. 10\n Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2411.07899v1","updated":"2024-11-12T16:12:51Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. 
In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Ho","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07893v1","updated":"2024-11-12T15:58:09Z","published":"2024-11-12T15:58:09Z","title":"Joint multi-dimensional dynamic attention and transformer for general\n image restoration","summary":" Outdoor images often suffer from severe degradation due to rain, haze, and\nnoise, impairing image quality and challenging high-level tasks. Current image\nrestoration methods struggle to handle complex degradation while maintaining\nefficiency. This paper introduces a novel image restoration architecture that\ncombines multi-dimensional dynamic attention and self-attention within a U-Net\nframework. To leverage the global modeling capabilities of transformers and the\nlocal modeling capabilities of convolutions, we integrate sole CNNs in the\nencoder-decoder and sole transformers in the latent layer. 
Additionally, we\ndesign convolutional kernels with selected multi-dimensional dynamic attention\nto capture diverse degraded inputs efficiently. A transformer block with\ntransposed self-attention further enhances global feature extraction while\nmaintaining efficiency. Extensive experiments demonstrate that our method\nachieves a better balance between performance and computational complexity\nacross five image restoration tasks: deraining, deblurring, denoising,\ndehazing, and enhancement, as well as superior performance for high-level\nvision tasks. The source code will be available at\nhttps://github.com/House-yuyu/MDDA-former.\n","authors":["Huan Zhang","Xu Zhang","Nian Cai","Jianglei Di","Yun Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07885v1","updated":"2024-11-12T15:47:17Z","published":"2024-11-12T15:47:17Z","title":"INTRABENCH: Interactive Radiological Benchmark","summary":" Current interactive segmentation approaches, inspired by the success of\nMETA's Segment Anything model, have achieved notable advancements, however,\nthey come with substantial limitations that hinder their practical application\nin real clinical scenarios. These include unrealistic human interaction\nrequirements, such as slice-by-slice operations for 2D models on 3D data, a\nlack of iterative refinement, and insufficient evaluation experiments. These\nshortcomings prevent accurate assessment of model performance and lead to\ninconsistent outcomes across studies. IntRaBench overcomes these challenges by\noffering a comprehensive and reproducible framework for evaluating interactive\nsegmentation methods in realistic, clinically relevant scenarios. It includes\ndiverse datasets, target structures, and segmentation models, and provides a\nflexible codebase that allows seamless integration of new models and prompting\nstrategies. 
Additionally, we introduce advanced techniques to minimize\nclinician interaction, ensuring fair comparisons between 2D and 3D models. By\nopen-sourcing IntRaBench, we invite the research community to integrate their\nmodels and prompting techniques, ensuring continuous and transparent evaluation\nof interactive segmentation models in 3D medical imaging.\n","authors":["Constantin Ulrich","Tassilo Wald","Emily Tempus","Maximilian Rokuss","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2411.07885v1.pdf","comment":"Undergoing Peer-Review"},{"id":"http://arxiv.org/abs/2411.07873v1","updated":"2024-11-12T15:29:50Z","published":"2024-11-12T15:29:50Z","title":"Diverse capability and scaling of diffusion and auto-regressive models\n when learning abstract rules","summary":" Humans excel at discovering regular structures from limited samples and\napplying inferred rules to novel settings. We investigate whether modern\ngenerative models can similarly learn underlying rules from finite samples and\nperform reasoning through conditional sampling. Inspired by Raven's Progressive\nMatrices task, we designed GenRAVEN dataset, where each sample consists of\nthree rows, and one of 40 relational rules governing the object position,\nnumber, or attributes applies to all rows. We trained generative models to\nlearn the data distribution, where samples are encoded as integer arrays to\nfocus on rule learning. We compared two generative model families: diffusion\n(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their\nability to generate structurally consistent samples and perform panel\ncompletion via unconditional and conditional sampling. We found diffusion\nmodels excel at unconditional generation, producing more novel and consistent\nsamples from scratch and memorizing less, but performing less well in panel\ncompletion, even with advanced conditional sampling methods. 
Conversely,\nautoregressive models excel at completing missing panels in a rule-consistent\nmanner but generate less consistent samples unconditionally. We observe diverse\ndata scaling behaviors: for both model families, rule learning emerges at a\ncertain dataset size - around 1000s examples per rule. With more training data,\ndiffusion models improve both their unconditional and conditional generation\ncapabilities. However, for autoregressive models, while panel completion\nimproves with more training data, unconditional generation consistency\ndeclines. Our findings highlight complementary capabilities and limitations of\ndiffusion and autoregressive models in rule learning and reasoning tasks,\nsuggesting avenues for further research into their mechanisms and potential for\nhuman-like reasoning.\n","authors":["Binxu Wang","Jiaqi Shang","Haim Sompolinsky"],"pdf_url":"https://arxiv.org/pdf/2411.07873v1.pdf","comment":"12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2\n Reasoning At Scale as long paper"},{"id":"http://arxiv.org/abs/2411.07863v1","updated":"2024-11-12T15:22:14Z","published":"2024-11-12T15:22:14Z","title":"CDXFormer: Boosting Remote Sensing Change Detection with Extended Long\n Short-Term Memory","summary":" In complex scenes and varied conditions, effectively integrating\nspatial-temporal context is crucial for accurately identifying changes.\nHowever, current RS-CD methods lack a balanced consideration of performance and\nefficiency. CNNs lack global context, Transformers have quadratic computational\ncomplexity, and Mambas are restricted by CUDA acceleration. 
In this paper, we\npropose CDXFormer, with a core component that is a powerful XLSTM-based feature\nenhancement layer, integrating the advantages of linear computational\ncomplexity, global context perception, and strong interpret-ability.\nSpecifically, we introduce a scale-specific Feature Enhancer layer,\nincorporating a Cross-Temporal Global Perceptron customized for\nsemantic-accurate deep features, and a Cross-Temporal Spatial Refiner\ncustomized for detail-rich shallow features. Additionally, we propose a\nCross-Scale Interactive Fusion module to progressively interact global change\nrepresentations with spatial responses. Extensive experimental results\ndemonstrate that CDXFormer achieves state-of-the-art performance across three\nbenchmark datasets, offering a compelling balance between efficiency and\naccuracy. Code is available at https://github.com/xwmaxwma/rschange.\n","authors":["Zhenkai Wu","Xiaowen Ma","Rongrong Lian","Zhentao Lin","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04492v4","updated":"2024-11-12T15:16:36Z","published":"2024-10-06T14:11:39Z","title":"Interpret Your Decision: Logical Reasoning Regularization for\n Generalization in Visual Classification","summary":" Vision models excel in image classification but struggle to generalize to\nunseen data, such as classifying images from unseen domains or discovering\nnovel categories. In this paper, we explore the relationship between logical\nreasoning and deep learning generalization in visual classification. A logical\nregularization termed L-Reg is derived which bridges a logical analysis\nframework to image classification. Our work reveals that L-Reg reduces the\ncomplexity of the model in terms of the feature distribution and classifier\nweights. Specifically, we unveil the interpretability brought by L-Reg, as it\nenables the model to extract the salient features, such as faces to persons,\nfor classification. 
Theoretical analysis and experiments demonstrate that L-Reg\nenhances generalization across various scenarios, including multi-domain\ngeneralization and generalized category discovery. In complex real-world\nscenarios where images span unknown classes and unseen domains, L-Reg\nconsistently improves generalization, highlighting its practical efficacy.\n","authors":["Zhaorui Tan","Xi Yang","Qiufeng Wang","Anh Nguyen","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.04492v4.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2407.06001v2","updated":"2024-11-12T15:14:41Z","published":"2024-07-08T14:53:07Z","title":"Pseudo-triplet Guided Few-shot Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging task that aims to retrieve\nthe target image with a multimodal query, i.e., a reference image, and its\ncomplementary modification text. As previous supervised or zero-shot learning\nparadigms all fail to strike a good trade-off between the model's\ngeneralization ability and retrieval performance, recent researchers have\nintroduced the task of few-shot CIR (FS-CIR) and proposed a textual\ninversion-based network based on pretrained CLIP model to realize it. Despite\nits promising performance, the approach encounters two key limitations: simply\nrelying on the few annotated samples for CIR model training and\nindiscriminately selecting training triplets for CIR model fine-tuning. To\naddress these two limitations, we propose a novel two-stage pseudo triplet\nguided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an\nattentive masking and captioning-based pseudo triplet generation method, to\nconstruct pseudo triplets from pure image data and use them to fulfill the\nCIR-task specific pertaining. 
In the second stage, we propose a challenging\ntriplet-based CIR fine-tuning method, where we design a pseudo modification\ntext-based sample challenging score estimation strategy and a robust top\nrange-based random sampling strategy for sampling robust challenging triplets\nto promote the model fine-tuning. Notably, our scheme is plug-and-play and\ncompatible with any existing supervised CIR models. We test our scheme across\ntwo backbones on three public datasets (i.e., FashionIQ, CIRR, and\nBirds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4%\nrespectively, demonstrating our scheme's efficacy.\n","authors":["Bohan Hou","Haoqiang Lin","Haokun Wen","Meng Liu","Mingzhu Xu","Xuemeng Song"],"pdf_url":"https://arxiv.org/pdf/2407.06001v2.pdf","comment":"10pages"},{"id":"http://arxiv.org/abs/2411.07848v1","updated":"2024-11-12T15:01:40Z","published":"2024-11-12T15:01:40Z","title":"NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric\n VLN","summary":" Landmark-based navigation (e.g. go to the wooden desk) and relative\npositional navigation (e.g. move 5 meters forward) are distinct navigation\nchallenges solved very differently in existing robotics navigation methodology.\nWe present a new dataset, OC-VLN, in order to distinctly evaluate grounding\nobject-centric natural language navigation instructions in a method for\nperforming landmark-based navigation. We also propose Natural Language grounded\nSLAM (NL-SLAM), a method to ground natural language instruction to robot\nobservations and poses. We actively perform NL-SLAM in order to follow\nobject-centric natural language navigation instructions. Our methods leverage\npre-trained vision and language foundation models and require no task-specific\ntraining. 
We construct two strong baselines from state-of-the-art methods on\nrelated tasks, Object Goal Navigation and Vision Language Navigation, and we\nshow that our approach, NL-SLAM, outperforms these baselines across all our\nmetrics of success on OC-VLN. Finally, we successfully demonstrate the\neffectiveness of NL-SLAM for performing navigation instruction following in the\nreal world on a Boston Dynamics Spot robot.\n","authors":["Sonia Raychaudhuri","Duy Ta","Katrina Ashton","Angel X. Chang","Jiuguang Wang","Bernadette Bucher"],"pdf_url":"https://arxiv.org/pdf/2411.07848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12203v3","updated":"2024-11-12T15:00:37Z","published":"2024-03-18T19:25:57Z","title":"Bootstrapping Reinforcement Learning with Imitation for Vision-Based\n Agile Flight","summary":" Learning visuomotor policies for agile quadrotor flight presents significant\ndifficulties, primarily from inefficient policy exploration caused by\nhigh-dimensional visual inputs and the need for precise and low-latency\ncontrol. To address these challenges, we propose a novel approach that combines\nthe performance of Reinforcement Learning (RL) and the sample efficiency of\nImitation Learning (IL) in the task of vision-based autonomous drone racing.\nWhile RL provides a framework for learning high-performance controllers through\ntrial and error, it faces challenges with sample efficiency and computational\ndemands due to the high dimensionality of visual inputs. Conversely, IL\nefficiently learns from visual expert demonstrations, but it remains limited by\nthe expert's performance and state distribution. To overcome these limitations,\nour policy learning framework integrates the strengths of both approaches. Our\nframework contains three phases: training a teacher policy using RL with\nprivileged state information, distilling it into a student policy via IL, and\nadaptive fine-tuning via RL. 
Testing in both simulated and real-world scenarios\nshows our approach can not only learn in scenarios where RL from scratch fails\nbut also outperforms existing IL methods in both robustness and performance,\nsuccessfully navigating a quadrotor through a race course using only visual\ninformation. Videos of the experiments are available at\nhttps://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html.\n","authors":["Jiaxu Xing","Angel Romero","Leonard Bauersfeld","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.12203v3.pdf","comment":"8th Annual Conference on Robot Learning (CoRL)"},{"id":"http://arxiv.org/abs/2401.11796v2","updated":"2024-11-12T14:58:37Z","published":"2024-01-22T09:53:20Z","title":"REVEX: A Unified Framework for Removal-Based Explainable Artificial\n Intelligence in Video","summary":" We developed REVEX, a removal-based video explanations framework. This work\nextends fine-grained explanation frameworks for computer vision data and adapts\nsix existing techniques to video by adding temporal information and local\nexplanations. The adapted methods were evaluated across networks, datasets,\nimage classes, and evaluation metrics. By decomposing explanation into steps,\nstrengths and weaknesses were revealed in the studied methods, for example, on\npixel clustering and perturbations in the input. Video LIME outperformed other\nmethods with deletion values up to 31\\% lower and insertion up to 30\\% higher,\ndepending on method and network. Video RISE achieved superior performance in\nthe average drop metric, with values 10\\% lower. In contrast,\nlocalization-based metrics revealed low performance across all methods, with\nsignificant variation depending on network. Pointing game accuracy reached\n53\\%, and IoU-based metrics remained below 20\\%. Drawing on the findings across\nXAI methods, we further examine the limitations of the employed XAI evaluation\nmetrics and highlight their suitability in different applications.\n","authors":["F. 
Xavier Gaya-Morey","Jose M. Buades-Rubio","I. Scott MacKenzie","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00002v2","updated":"2024-11-12T14:55:50Z","published":"2024-07-10T15:03:00Z","title":"Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against\n DenseNet, ResNet, and VGGNet on a Custom Dataset","summary":" This study evaluates the performance of various deep learning models,\nspecifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species\nclassification on a custom dataset. The dataset comprises 575 images of 23\nendangered species sourced from reputable online repositories. The study\nutilizes transfer learning to fine-tune pre-trained models on the dataset,\nfocusing on reducing training time and enhancing classification accuracy. The\nresults demonstrate that YOLOv8 outperforms other models, achieving a training\naccuracy of 97.39% and a validation F1-score of 96.50%. These findings suggest\nthat YOLOv8, with its advanced architecture and efficient feature extraction\ncapabilities, holds great promise for automating wildlife monitoring and\nconservation efforts.\n","authors":["Subek Sharma","Sisir Dhakal","Mansi Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2408.00002v2.pdf","comment":"This is published in Journal of Artificial Intelligence and Capsule\n Networks, December 2024, Volume 6, Issue 4, Pages 415-435"},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. 
In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07834v1","updated":"2024-11-12T14:36:06Z","published":"2024-11-12T14:36:06Z","title":"Towards Vision Mixture of Experts for Wildlife Monitoring on the Edge","summary":" The explosion of IoT sensors in industrial, consumer and remote sensing use\ncases has come with unprecedented demand for computing infrastructure to\ntransmit and to analyze petabytes of data. 
Concurrently, the world is slowly\nshifting its focus towards more sustainable computing. For these reasons, there\nhas been a recent effort to reduce the footprint of related computing\ninfrastructure, especially by deep learning algorithms, for advanced insight\ngeneration. The `TinyML' community is actively proposing methods to save\ncommunication bandwidth and excessive cloud storage costs while reducing\nalgorithm inference latency and promoting data privacy. Such proposed\napproaches should ideally process multiple types of data, including time\nseries, audio, satellite images, and video, near the network edge as multiple\ndata streams has been shown to improve the discriminative ability of learning\nalgorithms, especially for generating fine grained results. Incidentally, there\nhas been recent work on data driven conditional computation of subnetworks that\nhas shown real progress in using a single model to share parameters among very\ndifferent types of inputs such as images and text, reducing the computation\nrequirement of multi-tower multimodal networks. Inspired by such line of work,\nwe explore similar per patch conditional computation for the first time for\nmobile vision transformers (vision only case), that will eventually be used for\nsingle-tower multimodal edge models. We evaluate the model on Cornell Sap\nSucker Woods 60, a fine grained bird species discrimination dataset. 
Our\ninitial experiments uses $4X$ fewer parameters compared to MobileViTV2-1.0 with\na $1$% accuracy drop on the iNaturalist '21 birds test data provided as part of\nthe SSW60 dataset.\n","authors":["Emmanuel Azuh Mensah","Anderson Lee","Haoran Zhang","Yitong Shan","Kurtis Heimerl"],"pdf_url":"https://arxiv.org/pdf/2411.07834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03677v4","updated":"2024-11-12T14:33:36Z","published":"2024-08-07T10:36:26Z","title":"L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection","summary":" LiDAR-based vision systems are integral for 3D object detection, which is\ncrucial for autonomous navigation. However, they suffer from performance\ndegradation in adverse weather conditions due to the quality deterioration of\nLiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is\nexpected to solve this problem. However, the fusion of LiDAR and 4D radar is\nchallenging because they differ significantly in terms of data quality and the\ndegree of degradation in adverse weather. To address these issues, we introduce\nL4DR, a weather-robust 3D object detection method that effectively achieves\nLiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and\nForeground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is\nthe first exploration of the complementarity of early fusion between LiDAR and\n4D radar. Additionally, we design an Inter-Modal and Intra-Modal ({IM}2 )\nparallel feature extraction backbone coupled with a Multi-Scale Gated Fusion\n(MSGF) module to counteract the varying degrees of sensor degradation under\nadverse weather conditions. Experimental evaluation on a VoD dataset with\nsimulated fog proves that L4DR is more adaptable to changing weather\nconditions. It delivers a significant performance increase under different fog\nlevels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only\napproach. 
Moreover, the results on the K-Radar dataset validate the consistent\nperformance improvement of L4DR in real-world adverse weather conditions.\n","authors":["Xun Huang","Ziyu Xu","Hai Wu","Jinlong Wang","Qiming Xia","Yan Xia","Jonathan Li","Kyle Gao","Chenglu Wen","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03677v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15063v4","updated":"2024-11-12T14:29:09Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction. To\naddress these issues, we first design a multi-modal complementary fusion module\nto extract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. 
Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework. The code will be\navailable at \\url{https://github.com/Angknpng/Sammese}.\n","authors":["Kunpeng Wang","Danying Lin","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v4.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2410.23091v3","updated":"2024-11-12T14:13:17Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to be cheated by subtle manipulations, since we make\njudgments only based on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a casual diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof casual factors by learning towards a novel casual information bottleneck\nobjective. 
Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark).\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v3.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03926v2","updated":"2024-11-12T14:04:53Z","published":"2024-11-06T13:57:53Z","title":"Act in Collusion: A Persistent Distributed Multi-Target Backdoor in\n Federated Learning","summary":" Federated learning, a novel paradigm designed to protect data privacy, is\nvulnerable to backdoor attacks due to its distributed nature. Current research\noften designs attacks based on a single attacker with a single backdoor,\noverlooking more realistic and complex threats in federated learning. We\npropose a more practical threat model for federated learning: the distributed\nmulti-target backdoor. In this model, multiple attackers control different\nclients, embedding various triggers and targeting different classes,\ncollaboratively implanting backdoors into the global model via central\naggregation. Empirical validation shows that existing methods struggle to\nmaintain the effectiveness of multiple backdoors in the global model. Our key\ninsight is that similar backdoor triggers cause parameter conflicts and\ninjecting new backdoors disrupts gradient directions, significantly weakening\nsome backdoors performance. To solve this, we propose a Distributed\nMulti-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of\nbackdoors from different malicious clients. To avoid parameter conflicts, we\ndesign a multi-channel dispersed frequency trigger strategy to maximize trigger\ndifferences. 
To mitigate gradient interference, we introduce backdoor replay in\nlocal training to neutralize conflicting gradients. Extensive validation shows\nthat 30 rounds after the attack, Attack Success Rates of three different\nbackdoors from various clients remain above 93%. The code will be made publicly\navailable after the review period.\n","authors":["Tao Liu","Wu Yang","Chen Xu","Jiguang Lv","Huanran Wang","Yuhang Zhang","Shuchun Xu","Dapeng Man"],"pdf_url":"https://arxiv.org/pdf/2411.03926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07802v1","updated":"2024-11-12T13:57:13Z","published":"2024-11-12T13:57:13Z","title":"Large-scale Remote Sensing Image Target Recognition and Automatic\n Annotation","summary":" This paper presents a method for object recognition and automatic labeling in\nlarge-area remote sensing images called LRSAA. The method integrates YOLOv11\nand MobileNetV3-SSD object detection algorithms through ensemble learning to\nenhance model performance. Furthermore, it employs Poisson disk sampling\nsegmentation techniques and the EIOU metric to optimize the training and\ninference processes of segmented images, followed by the integration of\nresults. This approach not only reduces the demand for computational resources\nbut also achieves a good balance between accuracy and speed. The source code\nfor this project has been made publicly available on\nhttps://github.com/anaerovane/LRSAA.\n","authors":["Wuzheng Dong"],"pdf_url":"https://arxiv.org/pdf/2411.07802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07799v1","updated":"2024-11-12T13:53:22Z","published":"2024-11-12T13:53:22Z","title":"Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and\n Re-Identification using Point Clouds","summary":" Robotic fruit monitoring is a key step toward automated agricultural\nproduction systems. 
Robots can significantly enhance plant and temporal fruit\nmonitoring by providing precise, high-throughput assessments that overcome the\nlimitations of traditional manual methods. Fruit monitoring is a challenging\ntask due to the significant variation in size, shape, orientation, and\nocclusion of fruits. Also, fruits may be harvested or newly grown between\nrecording sessions. Most methods are 2D image-based and they lack the 3D\nstructure, depth, and spatial information, which represent key aspects of fruit\nmonitoring. 3D colored point clouds, instead, can offer this information but\nthey introduce challenges such as their sparsity and irregularity. In this\npaper, we present a novel approach for temporal fruit monitoring that addresses\npoint clouds collected in a greenhouse over time. Our method segments fruits\nusing a learning-based instance segmentation approach directly on the point\ncloud. Each segmented fruit is processed by a 3D sparse convolutional neural\nnetwork to extract descriptors, which are used in an attention-based matching\nnetwork to associate fruits with their instances from previous data\ncollections. Experimental results on a real dataset of strawberries demonstrate\nthat our approach outperforms other methods for fruits re-identification over\ntime, allowing for precise temporal fruit monitoring in real and complex\nscenarios.\n","authors":["Daniel Fusaro","Federico Magistri","Jens Behley","Alberto Pretto","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2411.07799v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.07784v1","updated":"2024-11-12T13:33:26Z","published":"2024-11-12T13:33:26Z","title":"Interaction Asymmetry: A General Principle for Learning Composable\n Abstractions","summary":" Learning disentangled representations of concepts and re-composing them in\nunseen ways is crucial for generalizing to out-of-domain situations. 
However,\nthe underlying properties of concepts that enable such disentanglement and\ncompositional generalization remain poorly understood. In this work, we propose\nthe principle of interaction asymmetry which states: \"Parts of the same concept\nhave more complex interactions than parts of different concepts\". We formalize\nthis via block diagonality conditions on the $(n+1)$th order derivatives of the\ngenerator mapping concepts to observed data, where different orders of\n\"complexity\" correspond to different $n$. Using this formalism, we prove that\ninteraction asymmetry enables both disentanglement and compositional\ngeneralization. Our results unify recent theoretical results for learning\nconcepts of objects, which we show are recovered as special cases with\n$n\\!=\\!0$ or $1$. We provide results for up to $n\\!=\\!2$, thus extending these\nprior works to more flexible generator functions, and conjecture that the same\nproof strategies generalize to larger $n$. Practically, our theory suggests\nthat, to disentangle concepts, an autoencoder should penalize its latent\ncapacity and the interactions between concepts during decoding. We propose an\nimplementation of these criteria using a flexible Transformer-based VAE, with a\nnovel regularizer on the attention weights of the decoder. 
On synthetic image\ndatasets consisting of objects, we provide evidence that this model can achieve\ncomparable object disentanglement to existing models that use more explicit\nobject-centric priors.\n","authors":["Jack Brady","Julius von Kügelgen","Sébastien Lachapelle","Simon Buchholz","Thomas Kipf","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2411.07784v1.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2407.00352v2","updated":"2024-11-12T13:01:48Z","published":"2024-06-29T07:53:47Z","title":"PhyTracker: An Online Tracker for Phytoplankton","summary":" Phytoplankton, a crucial component of aquatic ecosystems, requires efficient\nmonitoring to understand marine ecological processes and environmental\nconditions. Traditional phytoplankton monitoring methods, relying on non-in\nsitu observations, are time-consuming and resource-intensive, limiting timely\nanalysis. To address these limitations, we introduce PhyTracker, an intelligent\nin situ tracking framework designed for automatic tracking of phytoplankton.\nPhyTracker overcomes significant challenges unique to phytoplankton monitoring,\nsuch as constrained mobility within water flow, inconspicuous appearance, and\nthe presence of impurities. Our method incorporates three innovative modules: a\nTexture-enhanced Feature Extraction (TFE) module, an Attention-enhanced\nTemporal Association (ATA) module, and a Flow-agnostic Movement Refinement\n(FMR) module. These modules enhance feature capture, differentiate between\nphytoplankton and impurities, and refine movement characteristics,\nrespectively. Extensive experiments on the PMOT dataset validate the\nsuperiority of PhyTracker in phytoplankton tracking, and additional tests on\nthe MOT dataset demonstrate its general applicability, outperforming\nconventional tracking methods. 
This work highlights key differences between\nphytoplankton and traditional objects, offering an effective solution for\nphytoplankton monitoring.\n","authors":["Yang Yu","Qingxuan Lv","Yuezun Li","Zhiqiang Wei","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2407.00352v2.pdf","comment":"13pages,eleven figures"},{"id":"http://arxiv.org/abs/2411.07765v1","updated":"2024-11-12T12:58:33Z","published":"2024-11-12T12:58:33Z","title":"Novel View Synthesis with Pixel-Space Diffusion Models","summary":" Synthesizing a novel view from a single input image is a challenging task.\nTraditionally, this task was approached by estimating scene depth, warping, and\ninpainting, with machine learning models enabling parts of the pipeline. More\nrecently, generative models are being increasingly employed in novel view\nsynthesis (NVS), often encompassing the entire end-to-end system. In this work,\nwe adapt a modern diffusion model architecture for end-to-end NVS in the pixel\nspace, substantially outperforming previous state-of-the-art (SOTA) techniques.\nWe explore different ways to encode geometric information into the network. Our\nexperiments show that while these methods may enhance performance, their impact\nis minor compared to utilizing improved generative models. Moreover, we\nintroduce a novel NVS training scheme that utilizes single-view datasets,\ncapitalizing on their relative abundance compared to their multi-view\ncounterparts. 
This leads to improved generalization capabilities to scenes with\nout-of-domain content.\n","authors":["Noam Elata","Bahjat Kawar","Yaron Ostrovsky-Berman","Miriam Farber","Ron Sokolovsky"],"pdf_url":"https://arxiv.org/pdf/2411.07765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09997v5","updated":"2024-11-12T12:56:33Z","published":"2023-07-19T14:10:55Z","title":"TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical\n Phase Recognition","summary":" To enable context-aware computer assistance in the operating room of the\nfuture, cognitive systems need to understand automatically which surgical phase\nis being performed by the medical team. The primary source of information for\nsurgical phase recognition is typically video, which presents two challenges:\nextracting meaningful features from the video stream and effectively modeling\ntemporal information in the sequence of visual features. For temporal modeling,\nattention mechanisms have gained popularity due to their ability to capture\nlong-range dependencies. In this paper, we explore design choices for attention\nin existing temporal models for surgical phase recognition and propose a novel\napproach that uses attention more effectively and does not require hand-crafted\nconstraints: TUNeS, an efficient and simple temporal model that incorporates\nself-attention at the core of a convolutional U-Net structure. In addition, we\npropose to train the feature extractor, a standard CNN, together with an LSTM\non preferably long video segments, i.e., with long temporal context. In our\nexperiments, almost all temporal models performed better on top of feature\nextractors that were trained with longer temporal context. On these\ncontextualized features, TUNeS achieves state-of-the-art results on the\nCholec80 dataset. This study offers new insights on how to use attention\nmechanisms to build accurate and efficient temporal models for surgical phase\nrecognition. 
Implementing automatic surgical phase recognition is essential to\nautomate the analysis and optimization of surgical workflows and to enable\ncontext-aware computer assistance during surgery, thus ultimately improving\npatient care.\n","authors":["Isabel Funke","Dominik Rivoir","Stefanie Krell","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2307.09997v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07096v2","updated":"2024-11-12T12:45:03Z","published":"2024-11-11T16:18:28Z","title":"Extreme Rotation Estimation in the Wild","summary":" We present a technique and benchmark dataset for estimating the relative 3D\norientation between a pair of Internet images captured in an extreme setting,\nwhere the images have limited or non-overlapping field of views. Prior work\ntargeting extreme rotation estimation assume constrained 3D environments and\nemulate perspective images by cropping regions from panoramic views. However,\nreal images captured in the wild are highly diverse, exhibiting variation in\nboth appearance and camera intrinsics. In this work, we propose a\nTransformer-based method for estimating relative rotations in extreme\nreal-world settings, and contribute the ExtremeLandmarkPairs dataset, assembled\nfrom scene-level Internet photo collections. 
Our evaluation demonstrates that\nour approach succeeds in estimating the relative rotations in a wide variety of\nextreme-view Internet image pairs, outperforming various baselines, including\ndedicated rotation estimation techniques and contemporary 3D reconstruction\nmethods.\n","authors":["Hana Bezalel","Dotan Ankri","Ruojin Cai","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2411.07096v2.pdf","comment":"Project webpage:\n https://tau-vailab.github.io/ExtremeRotationsInTheWild/"},{"id":"http://arxiv.org/abs/2411.07758v1","updated":"2024-11-12T12:35:34Z","published":"2024-11-12T12:35:34Z","title":"AdaSemiCD: An Adaptive Semi-Supervised Change Detection Method Based on\n Pseudo-Label Evaluation","summary":" Change Detection (CD) is an essential field in remote sensing, with a primary\nfocus on identifying areas of change in bi-temporal image pairs captured at\nvarying intervals of the same region by a satellite. The data annotation\nprocess for the CD task is both time-consuming and labor-intensive. To make\nbetter use of the scarce labeled data and abundant unlabeled data, we present\nan adaptive dynamic semi-supervised learning method, AdaSemiCD, to improve the\nuse of pseudo-labels and optimize the training process. Initially, due to the\nextreme class imbalance inherent in CD, the model is more inclined to focus on\nthe background class, and it is easy to confuse the boundary of the target\nobject. Considering these two points, we develop a measurable evaluation metric\nfor pseudo-labels that enhances the representation of information entropy by\nclass rebalancing and amplification of confusing areas to give a larger weight\nto prospects change objects. Subsequently, to enhance the reliability of\nsample-wise pseudo-labels, we introduce the AdaFusion module, which is capable\nof dynamically identifying the most uncertain region and substituting it with\nmore trustworthy content. 
Lastly, to ensure better training stability, we\nintroduce the AdaEMA module, which updates the teacher model using only batches\nof trusted samples. Experimental results from LEVIR-CD, WHU-CD, and CDD\ndatasets validate the efficacy and universality of our proposed adaptive\ntraining framework.\n","authors":["Ran Lingyan","Wen Dongcheng","Zhuo Tao","Zhang Shizhou","Zhang Xiuwei","Zhang Yanning"],"pdf_url":"https://arxiv.org/pdf/2411.07758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07751v1","updated":"2024-11-12T12:23:41Z","published":"2024-11-12T12:23:41Z","title":"SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State\n Space Model","summary":" Speech enhancement plays an essential role in various applications, and the\nintegration of visual information has been demonstrated to bring substantial\nadvantages. However, the majority of current research concentrates on the\nexamination of facial and lip movements, which can be compromised or entirely\ninaccessible in scenarios where occlusions occur or when the camera view is\ndistant. Whereas contextual visual cues from the surrounding environment have\nbeen overlooked: for example, when we see a dog bark, our brain has the innate\nability to discern and filter out the barking noise. To this end, in this\npaper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is\nthe first proposal to use rich contextual information from synchronized video\nas auxiliary cues to indicate the type of noise, which eventually improves the\nspeech enhancement performance. Specifically, we propose the VC-S$^2$E method,\nwhich incorporates the Conformer and Mamba modules for their complementary\nstrengths. Extensive experiments are conducted on public MUSIC, AVSpeech and\nAudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E\nover other competitive methods. We will make the source code publicly\navailable. 
Project demo page: https://AVSEPage.github.io/\n","authors":["Xinyuan Qian","Jiaran Gao","Yaodan Zhang","Qiquan Zhang","Hexin Liu","Leibny Paola Garcia","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2411.07751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07750v1","updated":"2024-11-12T12:23:19Z","published":"2024-11-12T12:23:19Z","title":"LapGSR: Laplacian Reconstructive Network for Guided Thermal\n Super-Resolution","summary":" In the last few years, the fusion of multi-modal data has been widely studied\nfor various applications such as robotics, gesture recognition, and autonomous\nnavigation. Indeed, high-quality visual sensors are expensive, and\nconsumer-grade sensors produce low-resolution images. Researchers have\ndeveloped methods to combine RGB color images with non-visual data, such as\nthermal, to overcome this limitation to improve resolution. Fusing multiple\nmodalities to produce visually appealing, high-resolution images often requires\ndense models with millions of parameters and a heavy computational load, which\nis commonly attributed to the intricate architecture of the model.\n We propose LapGSR, a multimodal, lightweight, generative model incorporating\nLaplacian image pyramids for guided thermal super-resolution. This approach\nuses a Laplacian Pyramid on RGB color images to extract vital edge information,\nwhich is then used to bypass heavy feature map computation in the higher layers\nof the model in tandem with a combined pixel and adversarial loss. LapGSR\npreserves the spatial and structural details of the image while also being\nefficient and compact. This results in a model with significantly fewer\nparameters than other SOTA models while demonstrating excellent results on two\ncross-domain datasets viz. 
ULB17-VT and VGTSR datasets.\n","authors":["Aditya Kasliwal","Ishaan Gakhar","Aryan Kamani","Pratinav Seth","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2411.07750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07747v1","updated":"2024-11-12T12:18:18Z","published":"2024-11-12T12:18:18Z","title":"Constraint Learning for Parametric Point Cloud","summary":" Parametric point clouds are sampled from CAD shapes, have become increasingly\nprevalent in industrial manufacturing. However, most existing point cloud\nlearning methods focus on the geometric features, such as local and global\nfeatures or developing efficient convolution operations, overlooking the\nimportant attribute of constraints inherent in CAD shapes, which limits these\nmethods' ability to fully comprehend CAD shapes. To address this issue, we\nanalyzed the effect of constraints, and proposed its deep learning-friendly\nrepresentation, after that, the Constraint Feature Learning Network (CstNet) is\ndeveloped to extract and leverage constraints. Our CstNet includes two stages.\nThe Stage 1 extracts constraints from B-Rep data or point cloud. The Stage 2\nleverages coordinates and constraints to enhance the comprehend of CAD shapes.\nAdditionally, we built up the Parametric 20,000 Multi-modal Dataset for the\nscarcity of labeled B-Rep datasets. Experiments demonstrate that our CstNet\nachieved state-of-the-art performance on both public and proposed CAD shapes\ndatasets. 
To the best of our knowledge, CstNet is the first constraint-based\nlearning method tailored for CAD shapes analysis.\n","authors":["Xi Cheng","Ruiqi Lei","Di Huang","Zhichao Liao","Fengyuan Piao","Yan Chen","Pingfa Feng","Long Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.07747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07742v1","updated":"2024-11-12T12:07:27Z","published":"2024-11-12T12:07:27Z","title":"Efficient 3D Perception on Multi-Sweep Point Cloud with Gumbel Spatial\n Pruning","summary":" This paper studies point cloud perception within outdoor environments.\nExisting methods face limitations in recognizing objects located at a distance\nor occluded, due to the sparse nature of outdoor point clouds. In this work, we\nobserve a significant mitigation of this problem by accumulating multiple\ntemporally consecutive LiDAR sweeps, resulting in a remarkable improvement in\nperception accuracy. However, the computation cost also increases, hindering\nprevious approaches from utilizing a large number of LiDAR sweeps. To tackle\nthis challenge, we find that a considerable portion of points in the\naccumulated point cloud is redundant, and discarding these points has minimal\nimpact on perception accuracy. We introduce a simple yet effective Gumbel\nSpatial Pruning (GSP) layer that dynamically prunes points based on a learned\nend-to-end sampling. The GSP layer is decoupled from other network components\nand thus can be seamlessly integrated into existing point cloud network\narchitectures. Without incurring additional computational overhead, we increase\nthe number of LiDAR sweeps from 10, a common practice, to as many as 40.\nConsequently, there is a significant enhancement in perception performance. 
For\ninstance, in nuScenes 3D object detection and BEV map segmentation tasks, our\npruning strategy improves the vanilla TransL baseline and other baseline\nmethods.\n","authors":["Jianhao Li","Tianyu Sun","Xueqian Zhang","Zhongdao Wang","Bailan Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.07742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06911v2","updated":"2024-11-12T12:07:00Z","published":"2024-11-11T12:13:58Z","title":"Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI","summary":" Segmentation of cardiac magnetic resonance images (MRI) is crucial for the\nanalysis and assessment of cardiac function, helping to diagnose and treat\nvarious cardiovascular diseases. Most recent techniques rely on deep learning\nand usually require an extensive amount of labeled data. To overcome this\nproblem, few-shot learning has the capability of reducing data dependency on\nlabeled data. In this work, we introduce a new method that merges few-shot\nlearning with a U-Net architecture and Gaussian Process Emulators (GPEs),\nenhancing data integration from a support set for improved performance. GPEs\nare trained to learn the relation between the support images and the\ncorresponding masks in latent space, facilitating the segmentation of unseen\nquery images given only a small labeled support set at inference. We test our\nmodel with the M&Ms-2 public dataset to assess its ability to segment the heart\nin cardiac magnetic resonance imaging from different orientations, and compare\nit with state-of-the-art unsupervised and few-shot methods. 
Our architecture\nshows higher DICE coefficients compared to these methods, especially in the\nmore challenging setups where the size of the support set is considerably\nsmall.\n","authors":["Bruno Viti","Franz Thaler","Kathrin Lisa Kapper","Martin Urschler","Martin Holler","Elias Karabelas"],"pdf_url":"https://arxiv.org/pdf/2411.06911v2.pdf","comment":"Accepted at Statistical Atlases and Computational Modeling of the\n Heart (STACOM) Workshop 2024"},{"id":"http://arxiv.org/abs/2411.07740v1","updated":"2024-11-12T12:04:44Z","published":"2024-11-12T12:04:44Z","title":"3D Focusing-and-Matching Network for Multi-Instance Point Cloud\n Registration","summary":" Multi-instance point cloud registration aims to estimate the pose of all\ninstances of a model point cloud in the whole scene. Existing methods all adopt\nthe strategy of first obtaining the global correspondence and then clustering\nto obtain the pose of each instance. However, due to the cluttered and occluded\nobjects in the scene, it is difficult to obtain an accurate correspondence\nbetween the model point cloud and all instances in the scene. To this end, we\npropose a simple yet powerful 3D focusing-and-matching network for\nmulti-instance point cloud registration by learning the multiple pair-wise\npoint cloud registration. Specifically, we first present a 3D multi-object\nfocusing module to locate the center of each object and generate object\nproposals. By using self-attention and cross-attention to associate the model\npoint cloud with structurally similar objects, we can locate potential matching\ninstances by regressing object centers. Then, we propose a 3D dual masking\ninstance matching module to estimate the pose between the model point cloud and\neach object proposal. It performs instance mask and overlap mask masks to\naccurately predict the pair-wise correspondence. 
Extensive experiments on two\npublic benchmarks, Scan2CAD and ROBI, show that our method achieves a new\nstate-of-the-art performance on the multi-instance point cloud registration\ntask. Code is available at https://github.com/zlynpu/3DFMNet.\n","authors":["Liyuan Zhang","Le Hui","Qi Liu","Bo Li","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2411.07740v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07728v1","updated":"2024-11-12T11:39:05Z","published":"2024-11-12T11:39:05Z","title":"No-Reference Point Cloud Quality Assessment via Graph Convolutional\n Network","summary":" Three-dimensional (3D) point cloud, as an emerging visual media format, is\nincreasingly favored by consumers as it can provide more realistic visual\ninformation than two-dimensional (2D) data. Similar to 2D plane images and\nvideos, point clouds inevitably suffer from quality degradation and information\nloss through multimedia communication systems. Therefore, automatic point cloud\nquality assessment (PCQA) is of critical importance. In this work, we propose a\nnovel no-reference PCQA method by using a graph convolutional network (GCN) to\ncharacterize the mutual dependencies of multi-view 2D projected image contents.\nThe proposed GCN-based PCQA (GC-PCQA) method contains three modules, i.e.,\nmulti-view projection, graph construction, and GCN-based quality prediction.\nFirst, multi-view projection is performed on the test point cloud to obtain a\nset of horizontally and vertically projected images. Then, a\nperception-consistent graph is constructed based on the spatial relations among\ndifferent projected images. Finally, reasoning on the constructed graph is\nperformed by GCN to characterize the mutual dependencies and interactions\nbetween different projected images, and aggregate feature information of\nmulti-view projected images for final quality prediction. 
Experimental results\non two publicly available benchmark databases show that our proposed GC-PCQA\ncan achieve superior performance than state-of-the-art quality assessment\nmetrics. The code will be available at: https://github.com/chenwuwq/GC-PCQA.\n","authors":["Wu Chen","Qiuping Jiang","Wei Zhou","Feng Shao","Guangtao Zhai","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2411.07728v1.pdf","comment":"Accepted by IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2411.07725v1","updated":"2024-11-12T11:32:56Z","published":"2024-11-12T11:32:56Z","title":"ALOcc: Adaptive Lifting-based 3D Semantic Occupancy and Cost\n Volume-based Flow Prediction","summary":" Vision-based semantic occupancy and flow prediction plays a crucial role in\nproviding spatiotemporal cues for real-world tasks, such as autonomous driving.\nExisting methods prioritize higher accuracy to cater to the demands of these\ntasks. In this work, we strive to improve performance by introducing a series\nof targeted improvements for 3D semantic occupancy prediction and flow\nestimation. First, we introduce an occlusion-aware adaptive lifting mechanism\nwith a depth denoising technique to improve the robustness of 2D-to-3D feature\ntransformation and reduce the reliance on depth priors. Second, we strengthen\nthe semantic consistency between 3D features and their original 2D modalities\nby utilizing shared semantic prototypes to jointly constrain both 2D and 3D\nfeatures. This is complemented by confidence- and category-based sampling\nstrategies to tackle long-tail challenges in 3D space. To alleviate the feature\nencoding burden in the joint prediction of semantics and flow, we propose a BEV\ncost volume-based prediction method that links flow and semantic features\nthrough a cost volume and employs a classification-regression supervision\nscheme to address the varying flow scales in dynamic scenes. 
Our purely\nconvolutional architecture framework, named ALOcc, achieves an optimal tradeoff\nbetween speed and accuracy achieving state-of-the-art results on multiple\nbenchmarks. On Occ3D and training without the camera visible mask, our ALOcc\nachieves an absolute gain of 2.5\\% in terms of RayIoU while operating at a\ncomparable speed compared to the state-of-the-art, using the same input size\n(256$\\times$704) and ResNet-50 backbone. Our method also achieves 2nd place in\nthe CVPR24 Occupancy and Flow Prediction Competition.\n","authors":["Dubing Chen","Jin Fang","Wencheng Han","Xinjing Cheng","Junbo Yin","Chenzhong Xu","Fahad Shahbaz Khan","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.07725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07719v1","updated":"2024-11-12T11:24:18Z","published":"2024-11-12T11:24:18Z","title":"EMPERROR: A Flexible Generative Perception Error Model for Probing\n Self-Driving Planners","summary":" To handle the complexities of real-world traffic, learning planners for\nself-driving from data is a promising direction. While recent approaches have\nshown great progress, they typically assume a setting in which the ground-truth\nworld state is available as input. However, when deployed, planning needs to be\nrobust to the long-tail of errors incurred by a noisy perception system, which\nis often neglected in evaluation. To address this, previous work has proposed\ndrawing adversarial samples from a perception error model (PEM) mimicking the\nnoise characteristics of a target object detector. However, these methods use\nsimple PEMs that fail to accurately capture all failure modes of detection. In\nthis paper, we present EMPERROR, a novel transformer-based generative PEM,\napply it to stress-test an imitation learning (IL)-based planner and show that\nit imitates modern detectors more faithfully than previous work. 
Furthermore,\nit is able to produce realistic noisy inputs that increase the planner's\ncollision rate by up to 85%, demonstrating its utility as a valuable tool for a\nmore complete evaluation of self-driving planners.\n","authors":["Niklas Hanselmann","Simon Doll","Marius Cordts","Hendrik P. A. Lensch","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2411.07719v1.pdf","comment":"Project page: https://lasnik.github.io/emperror/"},{"id":"http://arxiv.org/abs/2410.10563v2","updated":"2024-11-12T11:16:43Z","published":"2024-10-14T14:42:12Z","title":"MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks","summary":" We present MEGA-Bench, an evaluation suite that scales multimodal evaluation\nto over 500 real-world tasks, to address the highly heterogeneous daily use\ncases of end users. Our objective is to optimize for a set of high-quality data\nsamples that cover a highly diverse and rich set of multimodal tasks, while\nenabling cost-effective and accurate model evaluation. In particular, we\ncollected 505 realistic tasks encompassing over 8,000 samples from 16 expert\nannotators to extensively cover the multimodal task space. Instead of unifying\nthese problems into standard multi-choice questions (like MMMU, MMBench, and\nMMT-Bench), we embrace a wide range of output formats like numbers, phrases,\ncode, \\LaTeX, coordinates, JSON, free-form, etc. To accommodate these formats,\nwe developed over 40 metrics to evaluate these tasks. Unlike existing\nbenchmarks, MEGA-Bench offers a fine-grained capability report across multiple\ndimensions (e.g., application, input type, output format, skill), allowing\nusers to interact with and visualize model capabilities in depth. 
We evaluate a\nwide variety of frontier vision-language models on MEGA-Bench to understand\ntheir capabilities across these dimensions.\n","authors":["Jiacheng Chen","Tianhao Liang","Sherman Siu","Zhengqing Wang","Kai Wang","Yubo Wang","Yuansheng Ni","Wang Zhu","Ziyan Jiang","Bohan Lyu","Dongfu Jiang","Xuan He","Yuan Liu","Hexiang Hu","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2410.10563v2.pdf","comment":"Technical report. Project page:\n https://tiger-ai-lab.github.io/MEGA-Bench/. v2 includes more evaluated models\n and a single-image setting"},{"id":"http://arxiv.org/abs/2407.08364v3","updated":"2024-11-12T10:56:38Z","published":"2024-07-11T10:18:54Z","title":"Scalar Function Topology Divergence: Comparing Topology of 3D Objects","summary":" We propose a new topological tool for computer vision - Scalar Function\nTopology Divergence (SFTD), which measures the dissimilarity of multi-scale\ntopology between sublevel sets of two functions having a common domain.\nFunctions can be defined on an undirected graph or Euclidean space of any\ndimensionality. Most of the existing methods for comparing topology are based\non Wasserstein distance between persistence barcodes and they don't take into\naccount the localization of topological features. The minimization of SFTD\nensures that the corresponding topological features of scalar functions are\nlocated in the same places. The proposed tool provides useful visualizations\ndepicting areas where functions have topological dissimilarities. We provide\napplications of the proposed method to 3D computer vision. In particular,\nexperiments demonstrate that SFTD as an additional loss improves the\nreconstruction of cellular 3D shapes from 2D fluorescence microscopy images,\nand helps to identify topological errors in 3D segmentation. 
Additionally, we\nshow that SFTD outperforms Betti matching loss in 2D segmentation problems.\n","authors":["Ilya Trofimov","Daria Voronkova","Eduard Tulchinskii","Evgeny Burnaev","Serguei Barannikov"],"pdf_url":"https://arxiv.org/pdf/2407.08364v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19287v3","updated":"2024-11-12T10:48:21Z","published":"2024-04-30T06:34:21Z","title":"Revisiting the Adversarial Robustness of Vision Language Models: a\n Multimodal Perspective","summary":" Pretrained vision-language models (VLMs) like CLIP exhibit exceptional\ngeneralization across diverse downstream tasks. While recent studies reveal\ntheir vulnerability to adversarial attacks, research to date has primarily\nfocused on enhancing the robustness of image encoders against image-based\nattacks, with defenses against text-based and multimodal attacks remaining\nlargely unexplored. To this end, this work presents the first comprehensive\nstudy on improving the adversarial robustness of VLMs against attacks targeting\nimage, text, and multimodal inputs. This is achieved by proposing multimodal\ncontrastive adversarial training (MMCoA). Such an approach strengthens the\nrobustness of both image and text encoders by aligning the clean text\nembeddings with adversarial image embeddings, and adversarial text embeddings\nwith clean image embeddings. The robustness of the proposed MMCoA is examined\nagainst existing defense methods over image, text, and multimodal attacks on\nthe CLIP model. Extensive experiments on 15 datasets across two tasks reveal\nthe characteristics of different adversarial defense methods under distinct\ndistribution shifts and dataset complexities across the three attack types.\nThis paves the way for a unified framework of adversarial robustness against\ndifferent modality attacks, opening up new possibilities for securing VLMs\nagainst multimodal attacks. 
The code is available at\nhttps://github.com/ElleZWQ/MMCoA.git.\n","authors":["Wanqi Zhou","Shuanghao Bai","Danilo P. Mandic","Qibin Zhao","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2404.19287v3.pdf","comment":"17 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.07708v1","updated":"2024-11-12T10:47:31Z","published":"2024-11-12T10:47:31Z","title":"Emotion Classification of Children Expressions","summary":" This paper proposes a process for a classification model for the facial\nexpressions. The proposed process would aid in specific categorisation of\nchildren's emotions from 2 emotions namely 'Happy' and 'Sad'. Since the\nexisting emotion recognition systems algorithms primarily train on adult faces,\nthe model developed is achieved by using advanced concepts of models with\nSqueeze-andExcitation blocks, Convolutional Block Attention modules, and robust\ndata augmentation. Stable Diffusion image synthesis was used for expanding and\ndiversifying the data set generating realistic and various training samples.\nThe model designed using Batch Normalisation, Dropout, and SE Attention\nmechanisms for the classification of children's emotions achieved an accuracy\nrate of 89\\% due to these methods improving the precision of emotion\nrecognition in children. The relative importance of this issue is raised in\nthis study with an emphasis on the call for a more specific model in emotion\ndetection systems for the young generation with specific direction on how the\nyoung people can be assisted to manage emotions while online.\n","authors":["Sanchayan Vivekananthan"],"pdf_url":"https://arxiv.org/pdf/2411.07708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12036v2","updated":"2024-11-12T10:12:49Z","published":"2024-07-01T05:37:17Z","title":"Exploring Advanced Large Language Models with LLMsuite","summary":" This tutorial explores the advancements and challenges in the development of\nLarge Language Models (LLMs) such as ChatGPT and Gemini. 
It addresses inherent\nlimitations like temporal knowledge cutoffs, mathematical inaccuracies, and the\ngeneration of incorrect information, proposing solutions like Retrieval\nAugmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks\nsuch as ReAct and LangChain. The integration of these techniques enhances LLM\nperformance and reliability, especially in multi-step reasoning and complex\ntask execution. The paper also covers fine-tuning strategies, including\ninstruction fine-tuning, parameter-efficient methods like LoRA, and\nReinforcement Learning from Human Feedback (RLHF) as well as Reinforced\nSelf-Training (ReST). Additionally, it provides a comprehensive survey of\ntransformer architectures and training techniques for LLMs. The source code can\nbe accessed by contacting the author via email for a request.\n","authors":["Giorgio Roffo"],"pdf_url":"https://arxiv.org/pdf/2407.12036v2.pdf","comment":"Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison,\n LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset\n Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing\n Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite,\n Comprehensive LLM Evaluation Toolkit"},{"id":"http://arxiv.org/abs/2411.07688v1","updated":"2024-11-12T10:12:12Z","published":"2024-11-12T10:12:12Z","title":"Enhancing Ultra High Resolution Remote Sensing Imagery Analysis with\n ImageRAG","summary":" Ultra High Resolution (UHR) remote sensing imagery (RSI) (e.g. 100,000\n$\\times$ 100,000 pixels or more) poses a significant challenge for current\nRemote Sensing Multimodal Large Language Models (RSMLLMs). If choose to resize\nthe UHR image to standard input image size, the extensive spatial and\ncontextual information that UHR images contain will be neglected. 
Otherwise,\nthe original size of these images often exceeds the token limits of standard\nRSMLLMs, making it difficult to process the entire image and capture long-range\ndependencies to answer the query based on the abundant visual context. In this\npaper, we introduce ImageRAG for RS, a training-free framework to address the\ncomplexities of analyzing UHR remote sensing imagery. By transforming UHR\nremote sensing image analysis task to image's long context selection task, we\ndesign an innovative image contextual retrieval mechanism based on the\nRetrieval-Augmented Generation (RAG) technique, denoted as ImageRAG. ImageRAG's\ncore innovation lies in its ability to selectively retrieve and focus on the\nmost relevant portions of the UHR image as visual contexts that pertain to a\ngiven query. Fast path and slow path are proposed in this framework to handle\nthis task efficiently and effectively. ImageRAG allows RSMLLMs to manage\nextensive context and spatial information from UHR RSI, ensuring the analysis\nis both accurate and efficient.\n","authors":["Zilun Zhang","Haozhan Shen","Tiancheng Zhao","Yuhao Wang","Bin Chen","Yuxiang Cai","Yongheng Shang","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2411.07688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16620v3","updated":"2024-11-12T10:02:12Z","published":"2024-06-24T13:05:39Z","title":"OmAgent: A Multi-modal Agent Framework for Complex Video Understanding\n with Task Divide-and-Conquer","summary":" Recent advancements in Large Language Models (LLMs) have expanded their\ncapabilities to multimodal contexts, including comprehensive video\nunderstanding. However, processing extensive videos such as 24-hour CCTV\nfootage or full-length films presents significant challenges due to the vast\ndata and processing demands. Traditional methods, like extracting key frames or\nconverting frames to text, often result in substantial information loss. 
To\naddress these shortcomings, we develop OmAgent, efficiently stores and\nretrieves relevant video frames for specific queries, preserving the detailed\ncontent of videos. Additionally, it features an Divide-and-Conquer Loop capable\nof autonomous reasoning, dynamically invoking APIs and tools to enhance query\nprocessing and accuracy. This approach ensures robust video understanding,\nsignificantly reducing information loss. Experimental results affirm OmAgent's\nefficacy in handling various types of videos and complex tasks. Moreover, we\nhave endowed it with greater autonomy and a robust tool-calling system,\nenabling it to accomplish even more intricate tasks.\n","authors":["Lu Zhang","Tiancheng Zhao","Heting Ying","Yibo Ma","Kyusong Lee"],"pdf_url":"https://arxiv.org/pdf/2406.16620v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14208v2","updated":"2024-11-12T10:00:46Z","published":"2024-07-19T11:13:31Z","title":"Memory-Efficient Pseudo-Labeling for Online Source-Free Universal Domain\n Adaptation using a Gaussian Mixture Model","summary":" In practice, domain shifts are likely to occur between training and test\ndata, necessitating domain adaptation (DA) to adjust the pre-trained source\nmodel to the target domain. Recently, universal domain adaptation (UniDA) has\ngained attention for addressing the possibility of an additional category\n(label) shift between the source and target domain. This means new classes can\nappear in the target data, some source classes may no longer be present, or\nboth at the same time. For practical applicability, UniDA methods must handle\nboth source-free and online scenarios, enabling adaptation without access to\nthe source data and performing batch-wise updates in parallel with prediction.\nIn an online setting, preserving knowledge across batches is crucial. However,\nexisting methods often require substantial memory, which is impractical because\nmemory is limited and valuable, in particular on embedded systems. 
Therefore,\nwe consider memory-efficiency as an additional constraint. To achieve\nmemory-efficient online source-free universal domain adaptation (SF-UniDA), we\npropose a novel method that continuously captures the distribution of known\nclasses in the feature space using a Gaussian mixture model (GMM). This\napproach, combined with entropy-based out-of-distribution detection, allows for\nthe generation of reliable pseudo-labels. Finally, we combine a contrastive\nloss with a KL divergence loss to perform the adaptation. Our approach not only\nachieves state-of-the-art results in all experiments on the DomainNet and\nOffice-Home datasets but also significantly outperforms the existing methods on\nthe challenging VisDA-C dataset, setting a new benchmark for online SF-UniDA.\nOur code is available at https://github.com/pascalschlachter/GMM.\n","authors":["Pascal Schlachter","Simon Wagner","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.14208v2.pdf","comment":"Accepted at IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2410.14265v2","updated":"2024-11-12T09:58:13Z","published":"2024-10-18T08:20:37Z","title":"HYPNOS : Highly Precise Foreground-focused Diffusion Finetuning for\n Inanimate Objects","summary":" In recent years, personalized diffusion-based text-to-image generative tasks\nhave been a hot topic in computer vision studies. A robust diffusion model is\ndetermined by its ability to perform near-perfect reconstruction of certain\nproduct outcomes given few related input samples. Unfortunately, the current\nprominent diffusion-based finetuning technique falls short in maintaining the\nforeground object consistency while being constrained to produce diverse\nbackgrounds in the image outcome. 
In the worst scenario, the overfitting issue\nmay occur, meaning that the foreground object is less controllable due to the\ncondition above, for example, the input prompt information is transferred\nambiguously to both foreground and background regions, instead of the supposed\nbackground region only. To tackle the issues above, we proposed Hypnos, a\nhighly precise foreground-focused diffusion finetuning technique. On the image\nlevel, this strategy works best for inanimate object generation tasks, and to\ndo so, Hypnos implements two main approaches, namely: (i) a content-centric\nprompting strategy and (ii) the utilization of our additional\nforeground-focused discriminative module. The utilized module is connected with\nthe diffusion model and finetuned with our proposed set of supervision\nmechanism. Combining the strategies above yielded to the foreground-background\ndisentanglement capability of the diffusion model. Our experimental results\nshowed that the proposed strategy gave a more robust performance and visually\npleasing results compared to the former technique. 
For better elaborations, we\nalso provided extensive studies to assess the fruitful outcomes above, which\nreveal how personalization behaves in regard to several training conditions.\n","authors":["Oliverio Theophilus Nathanael","Jonathan Samuel Lumentut","Nicholas Hans Muliawan","Edbert Valencio Angky","Felix Indra Kurniadi","Alfi Yusrotis Zakiyyah","Jeklin Harefa"],"pdf_url":"https://arxiv.org/pdf/2410.14265v2.pdf","comment":"26 pages, 12 figures, to appear on the Rich Media with Generative AI\n workshop in conjunction with Asian Conference on Computer Vision (ACCV) 2024"},{"id":"http://arxiv.org/abs/2411.07685v1","updated":"2024-11-12T09:57:53Z","published":"2024-11-12T09:57:53Z","title":"Fast Disentangled Slim Tensor Learning for Multi-view Clustering","summary":" Tensor-based multi-view clustering has recently received significant\nattention due to its exceptional ability to explore cross-view high-order\ncorrelations. However, most existing methods still encounter some limitations.\n(1) Most of them explore the correlations among different affinity matrices,\nmaking them unscalable to large-scale data. (2) Although some methods address\nit by introducing bipartite graphs, they may result in sub-optimal solutions\ncaused by an unstable anchor selection process. (3) They generally ignore the\nnegative impact of latent semantic-unrelated information in each view. To\ntackle these issues, we propose a new approach termed fast Disentangled Slim\nTensor Learning (DSTL) for multi-view clustering . 
Instead of focusing on the\nmulti-view graph structures, DSTL directly explores the high-order correlations\namong multi-view latent semantic representations based on matrix factorization.\nTo alleviate the negative influence of feature redundancy, inspired by robust\nPCA, DSTL disentangles the latent low-dimensional representation into a\nsemantic-unrelated part and a semantic-related part for each view.\nSubsequently, two slim tensors are constructed with tensor-based\nregularization. To further enhance the quality of feature disentanglement, the\nsemantic-related representations are aligned across views through a consensus\nalignment indicator. Our proposed model is computationally efficient and can be\nsolved effectively. Extensive experiments demonstrate the superiority and\nefficiency of DSTL over state-of-the-art approaches. The code of DSTL is\navailable at https://github.com/dengxu-nju/DSTL.\n","authors":["Deng Xu","Chao Zhang","Zechao Li","Chunlin Chen","Huaxiong Li"],"pdf_url":"https://arxiv.org/pdf/2411.07685v1.pdf","comment":"13 pages,6 figures, will be published to IEEE TMM"},{"id":"http://arxiv.org/abs/2411.07684v1","updated":"2024-11-12T09:56:42Z","published":"2024-11-12T09:56:42Z","title":"AI enhanced diagnosis of Peyronies disease a novel approach using\n Computer Vision","summary":" This study presents an innovative AI-driven tool for diagnosing Peyronie's\nDisease (PD), a condition that affects between 0.3% and 13.1% of men worldwide.\nOur method uses key point detection on both images and videos to measure penile\ncurvature angles, utilizing advanced computer vision techniques. This tool has\ndemonstrated high accuracy in identifying anatomical landmarks, validated\nagainst conventional goniometer measurements. Traditional PD diagnosis often\ninvolves subjective and invasive methods, which can lead to patient discomfort\nand inaccuracies. Our approach offers a precise, reliable, and non-invasive\ndiagnostic tool to address these drawbacks. 
The model distinguishes between PD\nand normal anatomical changes with a sensitivity of 96.7% and a specificity of\n100%. This advancement represents a significant improvement in urological\ndiagnostics, greatly enhancing the efficacy and convenience of PD assessment\nfor healthcare providers and patients.\n","authors":["Yudara Kularathne","Janitha Prathapa","Prarththanan Sothyrajah","Salomi Arasaratnam","Sithira Ambepitiya","Thanveer Ahamed","Dinuka Wijesundara"],"pdf_url":"https://arxiv.org/pdf/2411.07684v1.pdf","comment":"8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2305.03989v3","updated":"2024-11-12T09:52:53Z","published":"2023-05-06T09:29:12Z","title":"LEO: Generative Latent Image Animator for Human Video Synthesis","summary":" Spatio-temporal coherency is a major challenge in synthesizing high quality\nvideos, particularly in synthesizing human videos that contain rich global and\nlocal deformations. To resolve this challenge, previous approaches have\nresorted to different features in the generation process aimed at representing\nappearance and motion. However, in the absence of strict mechanisms to\nguarantee such disentanglement, a separation of motion from appearance has\nremained challenging, resulting in spatial distortions and temporal jittering\nthat break the spatio-temporal coherency. Motivated by this, we here propose\nLEO, a novel framework for human video synthesis, placing emphasis on\nspatio-temporal coherency. Our key idea is to represent motion as a sequence of\nflow maps in the generation process, which inherently isolate motion from\nappearance. We implement this idea via a flow-based image animator and a Latent\nMotion Diffusion Model (LMDM). The former bridges a space of motion codes with\nthe space of flow maps, and synthesizes video frames in a warp-and-inpaint\nmanner. LMDM learns to capture motion prior in the training data by\nsynthesizing sequences of motion codes. 
Extensive quantitative and qualitative\nanalysis suggests that LEO significantly improves coherent synthesis of human\nvideos over previous methods on the datasets TaichiHD, FaceForensics and\nCelebV-HQ. In addition, the effective disentanglement of appearance and motion\nin LEO allows for two additional tasks, namely infinite-length human video\nsynthesis, as well as content-preserving video editing.\n","authors":["Yaohui Wang","Xin Ma","Xinyuan Chen","Cunjian Chen","Antitza Dantcheva","Bo Dai","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2305.03989v3.pdf","comment":"IJCV 2024, Project webpage: https://wyhsirius.github.io/LEO-project/"},{"id":"http://arxiv.org/abs/2411.03239v2","updated":"2024-11-12T09:46:39Z","published":"2024-11-05T16:37:30Z","title":"Decoupling Fine Detail and Global Geometry for Compressed Depth Map\n Super-Resolution","summary":" Recovering high-quality depth maps from compressed sources has gained\nsignificant attention due to the limitations of consumer-grade depth cameras\nand the bandwidth restrictions during data transmission. However, current\nmethods still suffer from two challenges. First, bit-depth compression produces\na uniform depth representation in regions with subtle variations, hindering the\nrecovery of detailed information. Second, densely distributed random noise\nreduces the accuracy of estimating the global geometric structure of the scene.\nTo address these challenges, we propose a novel framework, termed\ngeometry-decoupled network (GDNet), for compressed depth map super-resolution\nthat decouples the high-quality depth map reconstruction process by handling\nglobal and detailed geometric features separately. To be specific, we propose\nthe fine geometry detail encoder (FGDE), which is designed to aggregate fine\ngeometry details in high-resolution low-level image features while\nsimultaneously enriching them with complementary information from\nlow-resolution context-level image features. 
In addition, we develop the global\ngeometry encoder (GGE) that aims at suppressing noise and extracting global\ngeometric information effectively via constructing compact feature\nrepresentation in a low-rank space. We conduct experiments on multiple\nbenchmark datasets, demonstrating that our GDNet significantly outperforms\ncurrent methods in terms of geometric consistency and detail recovery. In the\nECCV 2024 AIM Compressed Depth Upsampling Challenge, our solution won the 1st\nplace award. Our codes will be available.\n","authors":["Huan Zheng","Wencheng Han","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.03239v2.pdf","comment":"The 1st place award for the ECCV 2024 AIM Compressed Depth Upsampling\n Challenge"},{"id":"http://arxiv.org/abs/2407.21341v3","updated":"2024-11-12T09:41:55Z","published":"2024-07-31T05:15:24Z","title":"High-throughput 3D shape completion of potato tubers on a harvester","summary":" Potato yield is an important metric for farmers to further optimize their\ncultivation practices. Potato yield can be estimated on a harvester using an\nRGB-D camera that can estimate the three-dimensional (3D) volume of individual\npotato tubers. A challenge, however, is that the 3D shape derived from RGB-D\nimages is only partially completed, underestimating the actual volume. To\naddress this issue, we developed a 3D shape completion network, called CoRe++,\nwhich can complete the 3D shape from RGB-D images. CoRe++ is a deep learning\nnetwork that consists of a convolutional encoder and a decoder. The encoder\ncompresses RGB-D images into latent vectors that are used by the decoder to\ncomplete the 3D shape using the deep signed distance field network (DeepSDF).\nTo evaluate our CoRe++ network, we collected partial and complete 3D point\nclouds of 339 potato tubers on an operational harvester in Japan. 
On the 1425\nRGB-D images in the test set (representing 51 unique potato tubers), our\nnetwork achieved a completion accuracy of 2.8 mm on average. For volumetric\nestimation, the root mean squared error (RMSE) was 22.6 ml, and this was better\nthan the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml).\nWe found that the RMSE can be further reduced to 18.2 ml when performing the 3D\nshape completion in the center of the RGB-D image. With an average 3D shape\ncompletion time of 10 milliseconds per tuber, we can conclude that CoRe++ is\nboth fast and accurate enough to be implemented on an operational harvester for\nhigh-throughput potato yield estimation. CoRe++'s high-throughput and accurate\nprocessing allows it to be applied to other tuber, fruit and vegetable crops,\nthereby enabling versatile, accurate and real-time yield monitoring in\nprecision agriculture. Our code, network weights and dataset are publicly\navailable at https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git.\n","authors":["Pieter M. Blok","Federico Magistri","Cyrill Stachniss","Haozhou Wang","James Burridge","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21341v3.pdf","comment":"20 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.07664v1","updated":"2024-11-12T09:30:02Z","published":"2024-11-12T09:30:02Z","title":"Evaluating the Generation of Spatial Relations in Text and Image\n Generative Models","summary":" Understanding spatial relations is a crucial cognitive ability for both\nhumans and AI. While current research has predominantly focused on the\nbenchmarking of text-to-image (T2I) models, we propose a more comprehensive\nevaluation that includes \\textit{both} T2I and Large Language Models (LLMs). As\nspatial relations are naturally understood in a visuo-spatial manner, we\ndevelop an approach to convert LLM outputs into an image, thereby allowing us\nto evaluate both T2I models and LLMs \\textit{visually}. 
We examined the spatial\nrelation understanding of 8 prominent generative models (3 T2I models and 5\nLLMs) on a set of 10 common prepositions, as well as assess the feasibility of\nautomatic evaluation methods. Surprisingly, we found that T2I models only\nachieve subpar performance despite their impressive general image-generation\nabilities. Even more surprisingly, our results show that LLMs are significantly\nmore accurate than T2I models in generating spatial relations, despite being\nprimarily trained on textual data. We examined reasons for model failures and\nhighlight gaps that can be filled to enable more spatially faithful\ngenerations.\n","authors":["Shang Hong Sim","Clarence Lee","Alvin Tan","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2411.07664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07660v1","updated":"2024-11-12T09:22:00Z","published":"2024-11-12T09:22:00Z","title":"HMIL: Hierarchical Multi-Instance Learning for Fine-Grained Whole Slide\n Image Classification","summary":" Fine-grained classification of whole slide images (WSIs) is essential in\nprecision oncology, enabling precise cancer diagnosis and personalized\ntreatment strategies. The core of this task involves distinguishing subtle\nmorphological variations within the same broad category of gigapixel-resolution\nimages, which presents a significant challenge. While the multi-instance\nlearning (MIL) paradigm alleviates the computational burden of WSIs, existing\nMIL methods often overlook hierarchical label correlations, treating\nfine-grained classification as a flat multi-class classification task. To\novercome these limitations, we introduce a novel hierarchical multi-instance\nlearning (HMIL) framework. By facilitating on the hierarchical alignment of\ninherent relationships between different hierarchy of labels at instance and\nbag level, our approach provides a more structured and informative learning\nprocess. 
Specifically, HMIL incorporates a class-wise attention mechanism that\naligns hierarchical information at both the instance and bag levels.\nFurthermore, we introduce supervised contrastive learning to enhance the\ndiscriminative capability for fine-grained classification and a\ncurriculum-based dynamic weighting module to adaptively balance the\nhierarchical feature during training. Extensive experiments on our large-scale\ncytology cervical cancer (CCC) dataset and two public histology datasets, BRACS\nand PANDA, demonstrate the state-of-the-art class-wise and overall performance\nof our HMIL framework. Our source code is available at\nhttps://github.com/ChengJin-git/HMIL.\n","authors":["Cheng Jin","Luyang Luo","Huangjing Lin","Jun Hou","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07660v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2410.12183v2","updated":"2024-11-12T09:14:03Z","published":"2024-10-16T03:01:44Z","title":"TransAgent: Transfer Vision-Language Foundation Models with\n Heterogeneous Agent Collaboration","summary":" Vision-language foundation models (such as CLIP) have recently shown their\npower in transfer learning, owing to large-scale image-text pre-training.\nHowever, target domain data in the downstream tasks can be highly different\nfrom the pre-training phase, which makes it hard for such a single model to\ngeneralize well. Alternatively, there exists a wide range of expert models that\ncontain diversified vision and/or language knowledge pre-trained on different\nmodalities, tasks, networks, and datasets. Unfortunately, these models are\n\"isolated agents\" with heterogeneous structures, and how to integrate their\nknowledge for generalizing CLIP-like models has not been fully explored. 
To\nbridge this gap, we propose a general and concise TransAgent framework, which\ntransports the knowledge of the isolated agents in a unified manner, and\neffectively guides CLIP to generalize with multi-source knowledge distillation.\nWith such a distinct framework, we flexibly collaborate with 11 heterogeneous\nagents to empower vision-language foundation models, without further cost in\nthe inference phase. Finally, our TransAgent achieves state-of-the-art\nperformance on 11 visual recognition datasets. Under the same low-shot setting,\nit outperforms the popular CoOp with around 10% on average, and 20% on EuroSAT\nwhich contains large domain shifts.\n","authors":["Yiwei Guo","Shaobin Zhuang","Kunchang Li","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2410.12183v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07650v1","updated":"2024-11-12T09:02:11Z","published":"2024-11-12T09:02:11Z","title":"Understanding Audiovisual Deepfake Detection: Techniques, Challenges,\n Human Factors and Perceptual Insights","summary":" Deep Learning has been successfully applied in diverse fields, and its impact\non deepfake detection is no exception. Deepfakes are fake yet realistic\nsynthetic content that can be used deceitfully for political impersonation,\nphishing, slandering, or spreading misinformation. Despite extensive research\non unimodal deepfake detection, identifying complex deepfakes through joint\nanalysis of audio and visual streams remains relatively unexplored. To fill\nthis gap, this survey first provides an overview of audiovisual deepfake\ngeneration techniques, applications, and their consequences, and then provides\na comprehensive review of state-of-the-art methods that combine audio and\nvisual modalities to enhance detection accuracy, summarizing and critically\nanalyzing their strengths and limitations. 
Furthermore, we discuss existing\nopen source datasets for a deeper understanding, which can contribute to the\nresearch community and provide necessary information to beginners who want to\nanalyze deep learning-based audiovisual methods for video forensics. By\nbridging the gap between unimodal and multimodal approaches, this paper aims to\nimprove the effectiveness of deepfake detection strategies and guide future\nresearch in cybersecurity and media integrity.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12866v2","updated":"2024-11-12T08:59:30Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. 
Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.07649v1","updated":"2024-11-12T08:57:21Z","published":"2024-11-12T08:57:21Z","title":"Maritime Search and Rescue Missions with Aerial Images: A Survey","summary":" The speed of response by search and rescue teams at sea is of vital\nimportance, as survival may depend on it. Recent technological advancements\nhave led to the development of more efficient systems for locating individuals\ninvolved in a maritime incident, such as the use of Unmanned Aerial Vehicles\n(UAVs) equipped with cameras and other integrated sensors. Over the past\ndecade, several researchers have contributed to the development of automatic\nsystems capable of detecting people using aerial images, particularly by\nleveraging the advantages of deep learning. In this article, we provide a\ncomprehensive review of the existing literature on this topic. We analyze the\nmethods proposed to date, including both traditional techniques and more\nadvanced approaches based on machine learning and neural networks.\nAdditionally, we take into account the use of synthetic data to cover a wider\nrange of scenarios without the need to deploy a team to collect data, which is\none of the major obstacles for these systems. Overall, this paper situates the\nreader in the field of detecting people at sea using aerial images by quickly\nidentifying the most suitable methodology for each scenario, as well as\nproviding an in-depth discussion and direction for future trends.\n","authors":["Juan P. 
Martinez-Esteso","Francisco J. Castellanos","Jorge Calvo-Zaragoza","Antonio Javier Gallego"],"pdf_url":"https://arxiv.org/pdf/2411.07649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07643v1","updated":"2024-11-12T08:53:49Z","published":"2024-11-12T08:53:49Z","title":"xCG: Explainable Cell Graphs for Survival Prediction in Non-Small Cell\n Lung Cancer","summary":" Understanding how deep learning models predict oncology patient risk can\nprovide critical insights into disease progression, support clinical\ndecision-making, and pave the way for trustworthy and data-driven precision\nmedicine. Building on recent advances in the spatial modeling of the tumor\nmicroenvironment using graph neural networks, we present an explainable cell\ngraph (xCG) approach for survival prediction. We validate our model on a public\ncohort of imaging mass cytometry (IMC) data for 416 cases of lung\nadenocarcinoma. We explain survival predictions in terms of known phenotypes on\nthe cell level by computing risk attributions over cell graphs, for which we\npropose an efficient grid-based layer-wise relevance propagation (LRP) method.\nOur ablation studies highlight the importance of incorporating the cancer stage\nand model ensembling to improve the quality of risk estimates. 
Our xCG method,\ntogether with the IMC data, is made publicly available to support further\nresearch.\n","authors":["Marvin Sextro","Gabriel Dernbach","Kai Standvoss","Simon Schallenberg","Frederick Klauschen","Klaus-Robert Müller","Maximilian Alber","Lukas Ruff"],"pdf_url":"https://arxiv.org/pdf/2411.07643v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 11 pages"},{"id":"http://arxiv.org/abs/2411.06236v2","updated":"2024-11-12T08:51:40Z","published":"2024-11-09T17:36:53Z","title":"Zero-Shot NAS via the Suppression of Local Entropy Decrease","summary":" Architecture performance evaluation is the most time-consuming part of neural\narchitecture search (NAS). Zero-Shot NAS accelerates the evaluation by\nutilizing zero-cost proxies instead of training. Though effective, existing\nzero-cost proxies require invoking backpropagations or running networks on\ninput data, making it difficult to further accelerate the computation of\nproxies. To alleviate this issue, architecture topologies are used to evaluate\nthe performance of networks in this study. We prove that particular\narchitectural topologies decrease the local entropy of feature maps, which\ndegrades specific features to a bias, thereby reducing network performance.\nBased on this proof, architectural topologies are utilized to quantify the\nsuppression of local entropy decrease (SED) as a data-free and running-free\nproxy. Experimental results show that SED outperforms most state-of-the-art\nproxies in terms of architecture selection on five benchmarks, with computation\ntime reduced by three orders of magnitude. We further compare the SED-based NAS\nwith state-of-the-art proxies. SED-based NAS selects the architecture with\nhigher accuracy and fewer parameters in only one second. 
The theoretical\nanalyses of local entropy and experimental results demonstrate that the\nsuppression of local entropy decrease facilitates selecting optimal\narchitectures in Zero-Shot NAS.\n","authors":["Ning Wu","Han Huang","Yueting Xu","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2411.06236v2.pdf","comment":"8 pages, 2 figures. Corrected typos and latex template"},{"id":"http://arxiv.org/abs/2410.05814v2","updated":"2024-11-12T08:50:59Z","published":"2024-10-08T08:44:01Z","title":"CALoR: Towards Comprehensive Model Inversion Defense","summary":" Model Inversion Attacks (MIAs) aim at recovering privacy-sensitive training\ndata from the knowledge encoded in the released machine learning models. Recent\nadvances in the MIA field have significantly enhanced the attack performance\nunder multiple scenarios, posing serious privacy risks of Deep Neural Networks\n(DNNs). However, the development of defense strategies against MIAs is\nrelatively backward to resist the latest MIAs and existing defenses fail to\nachieve further trade-off between model utility and model robustness. In this\npaper, we provide an in-depth analysis from the perspective of intrinsic\nvulnerabilities of MIAs, comprehensively uncovering the weaknesses inherent in\nthe basic pipeline, which are partially investigated in the previous defenses.\nBuilding upon these new insights, we propose a robust defense mechanism,\nintegrating Confidence Adaptation and Low-Rank compression(CALoR). Our method\nincludes a novel robustness-enhanced classification loss specially-designed for\nmodel inversion defenses and reveals the extraordinary effectiveness of\ncompressing the classification header. With CALoR, we can mislead the\noptimization objective, reduce the leaked information and impede the\nbackpropagation of MIAs, thus mitigating the risk of privacy leakage. 
Extensive\nexperimental results demonstrate that our method achieves state-of-the-art\n(SOTA) defense performance against MIAs and exhibits superior generalization to\nexisting defenses across various scenarios.\n","authors":["Hongyao Yu","Yixiang Qiu","Hao Fang","Bin Chen","Sijin Yu","Bin Wang","Shu-Tao Xia","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2410.05814v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2410.20806v3","updated":"2024-11-12T08:44:12Z","published":"2024-10-28T07:54:07Z","title":"Transformer-Based Tooth Alignment Prediction With Occlusion And\n Collision Constraints","summary":" The planning of digital orthodontic treatment requires providing tooth\nalignment, which not only consumes a lot of time and labor to determine\nmanually but also relays clinical experiences heavily. In this work, we\nproposed a lightweight tooth alignment neural network based on\nSwin-transformer. We first re-organized 3D point clouds based on virtual arch\nlines and converted them into order-sorted multi-channel textures, which\nimproves the accuracy and efficiency simultaneously. We then designed two new\nocclusal loss functions that quantitatively evaluate the occlusal relationship\nbetween the upper and lower jaws. They are important clinical constraints,\nfirst introduced to the best of our knowledge, and lead to cutting-edge\nprediction accuracy. To train our network, we collected a large digital\northodontic dataset that has 591 clinical cases, including various complex\nclinical cases. This dataset will benefit the community after its release since\nthere is no open dataset so far. Furthermore, we also proposed two new\northodontic dataset augmentation methods considering tooth spatial distribution\nand occlusion. 
We evaluated our method with this dataset and extensive\nexperiments, including comparisons with STAT methods and ablation studies, and\ndemonstrate the high prediction accuracy of our method.\n","authors":["ZhenXing Dong","JiaZhou Chen","YangHui Xu"],"pdf_url":"https://arxiv.org/pdf/2410.20806v3.pdf","comment":"Modify formatting errors, optimize content layout"},{"id":"http://arxiv.org/abs/2408.09984v2","updated":"2024-11-12T08:33:22Z","published":"2024-08-19T13:32:51Z","title":"Boosting Open-Domain Continual Learning via Leveraging Intra-domain\n Category-aware Prototype","summary":" Despite recent progress in enhancing the efficacy of Open-Domain Continual\nLearning (ODCL) in Vision-Language Models (VLM), failing to (1) correctly\nidentify the Task-ID of a test image and (2) use only the category set\ncorresponding to the Task-ID, while preserving the knowledge related to each\ndomain, cannot address the two primary challenges of ODCL: forgetting old\nknowledge and maintaining zero-shot capabilities, as well as the confusions\ncaused by category-relatedness between domains. In this paper, we propose a\nsimple yet effective solution: leveraging intra-domain category-aware\nprototypes for ODCL in CLIP (DPeCLIP), where the prototype is the key to\nbridging the above two processes. Concretely, we propose a training-free\nTask-ID discriminator method, by utilizing prototypes as classifiers for\nidentifying Task-IDs. Furthermore, to maintain the knowledge corresponding to\neach domain, we incorporate intra-domain category-aware prototypes as domain\nprior prompts into the training process. 
Extensive experiments conducted on 11\ndifferent datasets demonstrate the effectiveness of our approach, achieving\n2.37% and 1.14% average improvement in class-incremental and task-incremental\nsettings, respectively.\n","authors":["Yadong Lu","Shitian Zhao","Boxiang Yun","Dongsheng Jiang","Yin Li","Qingli Li","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07635v1","updated":"2024-11-12T08:30:59Z","published":"2024-11-12T08:30:59Z","title":"Breaking the Low-Rank Dilemma of Linear Attention","summary":" The Softmax attention mechanism in Transformer models is notoriously\ncomputationally expensive, particularly due to its quadratic complexity, posing\nsignificant challenges in vision applications. In contrast, linear attention\nprovides a far more efficient solution by reducing the complexity to linear\nlevels. However, compared to Softmax attention, linear attention often\nexperiences significant performance degradation. Our experiments indicate that\nthis performance drop is due to the low-rank nature of linear attention's\nfeature map, which hinders its ability to adequately model complex spatial\ninformation. In this paper, to break the low-rank dilemma of linear attention,\nwe conduct rank analysis from two perspectives: the KV buffer and the output\nfeatures. Consequently, we introduce Rank-Augmented Linear Attention (RALA),\nwhich rivals the performance of Softmax attention while maintaining linear\ncomplexity and high efficiency. Based on RALA, we construct the Rank-Augmented\nVision Linear Transformer (RAVLT). Extensive experiments demonstrate that RAVLT\nachieves excellent performance across various vision tasks. Specifically,\nwithout using any additional labels, data, or supervision during training,\nRAVLT achieves an 84.4% Top-1 accuracy on ImageNet-1k with only 26M parameters\nand 4.6G FLOPs. 
This result significantly surpasses previous linear attention\nmechanisms, fully illustrating the potential of RALA. Code will be available at\nhttps://github.com/qhfan/RALA.\n","authors":["Qihang Fan","Huaibo Huang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.07635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05128v2","updated":"2024-11-12T08:24:47Z","published":"2024-07-06T16:34:25Z","title":"SCSA: Exploring the Synergistic Effects Between Spatial and Channel\n Attention","summary":" Channel and spatial attentions have respectively brought significant\nimprovements in extracting feature dependencies and spatial structure relations\nfor various downstream vision tasks. While their combination is more beneficial\nfor leveraging their individual strengths, the synergy between channel and\nspatial attentions has not been fully explored, lacking in fully harness the\nsynergistic potential of multi-semantic information for feature guidance and\nmitigation of semantic disparities. Our study attempts to reveal the\nsynergistic relationship between spatial and channel attention at multiple\nsemantic levels, proposing a novel Spatial and Channel Synergistic Attention\nmodule (SCSA). Our SCSA consists of two parts: the Shareable Multi-Semantic\nSpatial Attention (SMSA) and the Progressive Channel-wise Self-Attention\n(PCSA). SMSA integrates multi-semantic information and utilizes a progressive\ncompression strategy to inject discriminative spatial priors into PCSA's\nchannel self-attention, effectively guiding channel recalibration.\nAdditionally, the robust feature interactions based on the self-attention\nmechanism in PCSA further mitigate the disparities in multi-semantic\ninformation among different sub-features within SMSA. We conduct extensive\nexperiments on seven benchmark datasets, including classification on\nImageNet-1K, object detection on MSCOCO 2017, segmentation on ADE20K, and four\nother complex scene detection datasets. 
Our results demonstrate that our\nproposed SCSA not only surpasses the current state-of-the-art attention but\nalso exhibits enhanced generalization capabilities across various task\nscenarios. The code and models are available at:\nhttps://github.com/HZAI-ZJNU/SCSA.\n","authors":["Yunzhong Si","Huiying Xu","Xinzhong Zhu","Wenhao Zhang","Yao Dong","Yuxing Chen","Hongbo Li"],"pdf_url":"https://arxiv.org/pdf/2407.05128v2.pdf","comment":"We added experiments for the classification task and updated the\n corresponding sections accordingly. The paper formatting has also been\n revised"},{"id":"http://arxiv.org/abs/2411.07627v1","updated":"2024-11-12T08:17:15Z","published":"2024-11-12T08:17:15Z","title":"Leveraging Previous Steps: A Training-free Fast Solver for Flow\n Diffusion","summary":" Flow diffusion models (FDMs) have recently shown potential in generation\ntasks due to the high generation quality. However, the current ordinary\ndifferential equation (ODE) solver for FDMs, e.g., the Euler solver, still\nsuffers from slow generation since ODE solvers need many number function\nevaluations (NFE) to keep high-quality generation. In this paper, we propose a\nnovel training-free flow-solver to reduce NFE while maintaining high-quality\ngeneration. The key insight for the flow-solver is to leverage the previous\nsteps to reduce the NFE, where a cache is created to reuse these results from\nthe previous steps. Specifically, the Taylor expansion is first used to\napproximate the ODE. To calculate the high-order derivatives of Taylor\nexpansion, the flow-solver proposes to use the previous steps and a polynomial\ninterpolation to approximate it, where the number of orders we could\napproximate equals the number of previous steps we cached. We also prove that\nthe flow-solver has a more minor approximation error and faster generation\nspeed. 
Experimental results on the CIFAR-10, CelebA-HQ, LSUN-Bedroom,\nLSUN-Church, ImageNet, and real text-to-image generation prove the efficiency\nof the flow-solver. Specifically, the flow-solver improves the FID-30K from\n13.79 to 6.75, from 46.64 to 19.49 with $\\text{NFE}=10$ on CIFAR-10 and\nLSUN-Church, respectively.\n","authors":["Kaiyu Song","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2411.07627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07625v1","updated":"2024-11-12T08:14:39Z","published":"2024-11-12T08:14:39Z","title":"Unraveling the Connections between Flow Matching and Diffusion\n Probabilistic Models in Training-free Conditional Generation","summary":" Training-free conditional generation aims to leverage the unconditional\ndiffusion models to implement the conditional generation, where flow-matching\n(FM) and diffusion probabilistic models (DPMs) are two mature unconditional\ndiffusion models that achieve high-quality generation. Two questions were asked\nin this paper: What are the underlying connections between FM and DPMs in\ntraining-free conditional generation? Can we leverage DPMs to improve the\ntraining-free conditional generation for FM? We first show that a probabilistic\ndiffusion path can be associated with the FM and DPMs. Then, we reformulate the\nordinary differential equation (ODE) of FM based on the score function of DPMs,\nand thus, the conditions in FM can be incorporated as those in DPMs. Finally,\nwe propose two posterior sampling methods to estimate the conditional term and\nachieve a training-free conditional generation of FM. Experimental results show\nthat our proposed method could be implemented for various conditional\ngeneration tasks. 
Our method can generate higher-quality results than the\nstate-of-the-art methods.\n","authors":["Kaiyu Song","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2411.07625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07621v1","updated":"2024-11-12T08:08:31Z","published":"2024-11-12T08:08:31Z","title":"Mix from Failure: Confusion-Pairing Mixup for Long-Tailed Recognition","summary":" Long-tailed image recognition is a computer vision problem considering a\nreal-world class distribution rather than an artificial uniform. Existing\nmethods typically detour the problem by i) adjusting a loss function, ii)\ndecoupling classifier learning, or iii) proposing a new multi-head architecture\ncalled experts. In this paper, we tackle the problem from a different\nperspective to augment a training dataset to enhance the sample diversity of\nminority classes. Specifically, our method, namely Confusion-Pairing Mixup\n(CP-Mix), estimates the confusion distribution of the model and handles the\ndata deficiency problem by augmenting samples from confusion pairs in\nreal-time. In this way, CP-Mix trains the model to mitigate its weakness and\ndistinguish a pair of classes it frequently misclassifies. In addition, CP-Mix\nutilizes a novel mixup formulation to handle the bias in decision boundaries\nthat originated from the imbalanced dataset. 
Extensive experiments demonstrate\nthat CP-Mix outperforms existing methods for long-tailed image recognition and\nsuccessfully relieves the confusion of the classifier.\n","authors":["Youngseok Yoon","Sangwoo Hong","Hyungjoon Joo","Yao Qin","Haewon Jeong","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2411.07621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07619v1","updated":"2024-11-12T08:05:58Z","published":"2024-11-12T08:05:58Z","title":"Artificial Intelligence for Biomedical Video Generation","summary":" As a prominent subfield of Artificial Intelligence Generated Content (AIGC),\nvideo generation has achieved notable advancements in recent years. The\nintroduction of Sora-alike models represents a pivotal breakthrough in video\ngeneration technologies, significantly enhancing the quality of synthesized\nvideos. Particularly in the realm of biomedicine, video generation technology\nhas shown immense potential such as medical concept explanation, disease\nsimulation, and biomedical data augmentation. In this article, we thoroughly\nexamine the latest developments in video generation models and explore their\napplications, challenges, and future opportunities in the biomedical sector. We\nhave conducted an extensive review and compiled a comprehensive list of\ndatasets from various sources to facilitate the development and evaluation of\nvideo generative models in biomedicine. 
Given the rapid progress in this field,\nwe have also created a github repository to regularly update the advances of\nbiomedical video generation at:\nhttps://github.com/Lee728243228/Biomedical-Video-Generation\n","authors":["Linyuan Li","Jianing Qiu","Anujit Saha","Lin Li","Poyuan Li","Mengxian He","Ziyu Guo","Wu Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.07619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18252v2","updated":"2024-11-12T08:01:04Z","published":"2024-04-28T17:18:41Z","title":"Improving Training-free Conditional Diffusion Model via Fisher\n Information","summary":" Training-free conditional diffusion models have received great attention in\nconditional image generation tasks. However, they require a computationally\nexpensive conditional score estimator to let the intermediate results of each\nstep in the reverse process toward the condition, which causes slow conditional\ngeneration. In this paper, we propose a novel Fisher information-based\nconditional diffusion (FICD) model to generate high-quality samples according\nto the condition. In particular, we further explore the conditional term from\nthe perspective of Fisher information, where we show Fisher information can act\nas a weight to measure the informativeness of the condition in each generation\nstep. According to this new perspective, we can control and gain more\ninformation along the conditional direction in the generation space. Thus, we\npropose the upper bound of the Fisher information to reformulate the\nconditional term, which increases the information gain and decreases the time\ncost. Experimental results also demonstrate that the proposed FICD can offer up\nto 2x speed-ups under the same sampling steps as most baselines. 
Meanwhile,\nFICD can improve the generation quality in various tasks compared to the\nbaselines with a low computation cost.\n","authors":["Kaiyu Song","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2404.18252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16131v2","updated":"2024-11-12T07:55:34Z","published":"2024-01-29T12:56:11Z","title":"CIMIL-CRC: a clinically-informed multiple instance learning framework\n for patient-level colorectal cancer molecular subtypes classification from\n H\\&E stained images","summary":" Treatment approaches for colorectal cancer (CRC) are highly dependent on the\nmolecular subtype, as immunotherapy has shown efficacy in cases with\nmicrosatellite instability (MSI) but is ineffective for the microsatellite\nstable (MSS) subtype. There is promising potential in utilizing deep neural\nnetworks (DNNs) to automate the differentiation of CRC subtypes by analyzing\nHematoxylin and Eosin (H\\&E) stained whole-slide images (WSIs). Due to the\nextensive size of WSIs, Multiple Instance Learning (MIL) techniques are\ntypically explored. However, existing MIL methods focus on identifying the most\nrepresentative image patches for classification, which may result in the loss\nof critical information. Additionally, these methods often overlook clinically\nrelevant information, like the tendency for MSI class tumors to predominantly\noccur on the proximal (right side) colon. We introduce `CIMIL-CRC', a DNN\nframework that: 1) solves the MSI/MSS MIL problem by efficiently combining a\npre-trained feature extraction model with principal component analysis (PCA) to\naggregate information from all patches, and 2) integrates clinical priors,\nparticularly the tumor location within the colon, into the model to enhance\npatient-level classification accuracy. 
We assessed our CIMIL-CRC method using\nthe average area under the curve (AUC) from a 5-fold cross-validation\nexperimental setup for model development on the TCGA-CRC-DX cohort, contrasting\nit with a baseline patch-level classification, MIL-only approach, and\nClinically-informed patch-level classification approach. Our CIMIL-CRC\noutperformed all methods (AUROC: $0.92\\pm0.002$ (95\\% CI 0.91-0.92), vs.\n$0.79\\pm0.02$ (95\\% CI 0.76-0.82), $0.86\\pm0.01$ (95\\% CI 0.85-0.88), and\n$0.87\\pm0.01$ (95\\% CI 0.86-0.88), respectively). The improvement was\nstatistically significant.\n","authors":["Hadar Hezi","Matan Gelber","Alexander Balabanov","Yosef E. Maruvka","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2401.16131v2.pdf","comment":"Accepted to the journal 'Computer Methods and Programs in\n Biomedicine'"},{"id":"http://arxiv.org/abs/2406.18054v2","updated":"2024-11-12T07:52:42Z","published":"2024-06-26T04:12:34Z","title":"Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image\n Translation","summary":" The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology\nare Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides\noffer high quality histopathological images but require a labor-intensive\nacquisition process. In contrast, FF slides can be prepared quickly, but the\nimage quality is relatively poor. Our task is to translate FF images into FFPE\nstyle, thereby improving the image quality for diagnostic purposes. In this\npaper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological\nimage translation using a pre-trained diffusion model. Specifically, we employ\na one-step diffusion model as the generator and fine-tune it with LoRA adapters\nusing adversarial learning objectives. To ensure that the model effectively\ncaptures both global structural information and local details, we propose a\nmulti-scale feature fusion (MFF) module. 
This module utilizes two VAE encoders\nto extract features of varying image sizes and performs feature fusion before\nfeeding them into the UNet. Furthermore, we utilize a pre-trained\nvision-language model for histopathology as the backbone for the discriminator\nto further improve performance We conducted FF-to-FFPE translation experiments\non the TCGA-NSCLC datasets, and our method achieved better performance compared\nto other methods. The code and models are released at\nhttps://github.com/QilaiZhang/Diffusion-FFPE.\n","authors":["Qilai Zhang","Jiawen Li","Peiran Liao","Jiali Hu","Tian Guan","Anjia Han","Yonghong He"],"pdf_url":"https://arxiv.org/pdf/2406.18054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07232v2","updated":"2024-11-12T07:49:39Z","published":"2024-11-11T18:50:09Z","title":"Add-it: Training-Free Object Insertion in Images With Pretrained\n Diffusion Models","summary":" Adding Object into images based on text instructions is a challenging task in\nsemantic image editing, requiring a balance between preserving the original\nscene and seamlessly integrating the new object in a fitting location. Despite\nextensive efforts, existing models often struggle with this balance,\nparticularly with finding a natural location for adding an object in complex\nscenes. We introduce Add-it, a training-free approach that extends diffusion\nmodels' attention mechanisms to incorporate information from three key sources:\nthe scene image, the text prompt, and the generated image itself. Our weighted\nextended-attention mechanism maintains structural consistency and fine details\nwhile ensuring natural object placement. Without task-specific fine-tuning,\nAdd-it achieves state-of-the-art results on both real and generated image\ninsertion benchmarks, including our newly constructed \"Additing Affordance\nBenchmark\" for evaluating object placement plausibility, outperforming\nsupervised methods. 
Human evaluations show that Add-it is preferred in over 80%\nof cases, and it also demonstrates improvements in various automated metrics.\n","authors":["Yoad Tewel","Rinon Gal","Dvir Samuel","Yuval Atzmon","Lior Wolf","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2411.07232v2.pdf","comment":"Project page is at https://research.nvidia.com/labs/par/addit/"},{"id":"http://arxiv.org/abs/2409.00606v3","updated":"2024-11-12T07:38:51Z","published":"2024-09-01T04:07:03Z","title":"Style Transfer: From Stitching to Neural Networks","summary":" This article compares two style transfer methods in image processing: the\ntraditional method, which synthesizes new images by stitching together small\npatches from existing images, and a modern machine learning-based approach that\nuses a segmentation network to isolate foreground objects and apply style\ntransfer solely to the background. The traditional method excels in creating\nartistic abstractions but can struggle with seamlessness, whereas the machine\nlearning method preserves the integrity of foreground elements while enhancing\nthe background, offering improved aesthetic quality and computational\nefficiency. Our study indicates that machine learning-based methods are more\nsuited for real-world applications where detail preservation in foreground\nelements is essential.\n","authors":["Xinhe Xu","Zhuoer Wang","Yihan Zhang","Yizhou Liu","Zhaoyue Wang","Zhihao Xu","Muhan Zhao","Huaiying Luo"],"pdf_url":"https://arxiv.org/pdf/2409.00606v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07608v1","updated":"2024-11-12T07:30:32Z","published":"2024-11-12T07:30:32Z","title":"Quantum Information-Empowered Graph Neural Network for Hyperspectral\n Change Detection","summary":" Change detection (CD) is a critical remote sensing technique for identifying\nchanges in the Earth's surface over time. 
The outstanding substance\nidentifiability of hyperspectral images (HSIs) has significantly enhanced the\ndetection accuracy, making hyperspectral change detection (HCD) an essential\ntechnology. The detection accuracy can be further upgraded by leveraging the\ngraph structure of HSIs, motivating us to adopt the graph neural networks\n(GNNs) in solving HCD. For the first time, this work introduces quantum deep\nnetwork (QUEEN) into HCD. Unlike GNN and CNN, both extracting the\naffine-computing features, QUEEN provides fundamentally different\nunitary-computing features. We demonstrate that through the unitary feature\nextraction procedure, QUEEN provides radically new information for deciding\nwhether there is a change or not. Hierarchically, a graph feature learning\n(GFL) module exploits the graph structure of the bitemporal HSIs at the\nsuperpixel level, while a quantum feature learning (QFL) module learns the\nquantum features at the pixel level, as a complementary to GFL by preserving\npixel-level detailed spatial information not retained in the superpixels. In\nthe final classification stage, a quantum classifier is designed to cooperate\nwith a traditional fully connected classifier. 
The superior HCD performance of\nthe proposed QUEEN-empowered GNN (i.e., QUEEN-G) will be experimentally\ndemonstrated on real hyperspectral datasets.\n","authors":["Chia-Hsiang Lin","Tzu-Hsuan Lin","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2411.07608v1.pdf","comment":"This work has been accepted by IEEE Transactions on Geoscience and\n Remote Sensing (TGRS)"},{"id":"http://arxiv.org/abs/2411.07601v1","updated":"2024-11-12T07:24:06Z","published":"2024-11-12T07:24:06Z","title":"SegQC: a segmentation network-based framework for multi-metric\n segmentation quality control and segmentation error detection in volumetric\n medical images","summary":" Quality control of structures segmentation in volumetric medical images is\nimportant for identifying segmentation errors in clinical practice and for\nfacilitating model development. This paper introduces SegQC, a novel framework\nfor segmentation quality estimation and segmentation error detection. SegQC\ncomputes an estimate measure of the quality of a segmentation in volumetric\nscans and in their individual slices and identifies possible segmentation error\nregions within a slice. The key components include: 1. SegQC-Net, a deep\nnetwork that inputs a scan and its segmentation mask and outputs segmentation\nerror probabilities for each voxel in the scan; 2. three new segmentation\nquality metrics, two overlap metrics and a structure size metric, computed from\nthe segmentation error probabilities; 3. a new method for detecting possible\nsegmentation errors in scan slices computed from the segmentation error\nprobabilities. We introduce a new evaluation scheme to measure segmentation\nerror discrepancies based on an expert radiologist corrections of automatically\nproduced segmentations that yields smaller observer variability and is closer\nto actual segmentation errors. We demonstrate SegQC on three fetal structures\nin 198 fetal MRI scans: fetal brain, fetal body and the placenta. 
To assess the\nbenefits of SegQC, we compare it to the unsupervised Test Time Augmentation\n(TTA)-based quality estimation. Our studies indicate that SegQC outperforms\nTTA-based quality estimation in terms of Pearson correlation and MAE for fetal\nbody and fetal brain structures segmentation. Our segmentation error detection\nmethod achieved recall and precision rates of 0.77 and 0.48 for fetal body, and\n0.74 and 0.55 for fetal brain segmentation error detection respectively. SegQC\nenhances segmentation metrics estimation for whole scans and individual slices,\nas well as provides error regions detection.\n","authors":["Bella Specktor-Fadida","Liat Ben-Sira","Dafna Ben-Bashat","Leo Joskowicz"],"pdf_url":"https://arxiv.org/pdf/2411.07601v1.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13565v3","updated":"2024-11-12T07:21:04Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. 
This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v3.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.07584v1","updated":"2024-11-12T06:44:24Z","published":"2024-11-12T06:44:24Z","title":"Grounded Video Caption Generation","summary":" We propose a new task, dataset and model for grounded video caption\ngeneration. This task unifies captioning and object grounding in video, where\nthe objects in the caption are grounded in the video via temporally consistent\nbounding boxes. We introduce the following contributions. First, we present a\ntask definition and a manually annotated test dataset for this task, referred\nto as GROunded Video Caption Generation (GROC). Second, we introduce a\nlarge-scale automatic annotation method leveraging an existing model for\ngrounded still image captioning together with an LLM for summarising\nframe-level captions into temporally consistent captions in video. Furthermore,\nwe prompt the LLM to track by language -- classifying noun phrases from the\nframe-level captions into noun phrases of the video-level generated caption. We\napply this approach to videos from the HowTo100M dataset, which results in a\nnew large-scale training dataset, called HowToGround, with automatically\nannotated captions and spatio-temporally consistent bounding boxes with\ncoherent natural language labels. Third, we introduce a new grounded video\ncaption generation model, called VideoGround, and train the model on the new\nautomatically annotated HowToGround dataset. Finally, results of our\nVideoGround model set the state of the art for the new task of grounded video\ncaption generation. 
We perform extensive ablations and demonstrate the\nimportance of key technical contributions of our model.\n","authors":["Evangelos Kazakos","Cordelia Schmid","Josef Sivic"],"pdf_url":"https://arxiv.org/pdf/2411.07584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06390v2","updated":"2024-11-12T06:41:21Z","published":"2024-11-10T08:23:27Z","title":"SplatFormer: Point Transformer for Robust 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has recently transformed photorealistic\nreconstruction, achieving high visual fidelity and real-time performance.\nHowever, rendering quality significantly deteriorates when test views deviate\nfrom the camera angles used during training, posing a major challenge for\napplications in immersive free-viewpoint rendering and navigation. In this\nwork, we conduct a comprehensive evaluation of 3DGS and related novel view\nsynthesis methods under out-of-distribution (OOD) test camera scenarios. By\ncreating diverse test cases with synthetic and real-world datasets, we\ndemonstrate that most existing methods, including those incorporating various\nregularization techniques and data-driven priors, struggle to generalize\neffectively to OOD views. To address this limitation, we introduce SplatFormer,\nthe first point transformer model specifically designed to operate on Gaussian\nsplats. SplatFormer takes as input an initial 3DGS set optimized under limited\ntraining views and refines it in a single forward pass, effectively removing\npotential artifacts in OOD test views. To our knowledge, this is the first\nsuccessful application of point transformers directly on 3DGS sets, surpassing\nthe limitations of previous multi-scene training methods, which could handle\nonly a restricted number of input views during inference. 
Our model\nsignificantly improves rendering quality under extreme novel views, achieving\nstate-of-the-art performance in these challenging scenarios and outperforming\nvarious 3DGS regularization techniques, multi-scene models tailored for sparse\nview synthesis, and diffusion-based frameworks.\n","authors":["Yutong Chen","Marko Mihajlovic","Xiyi Chen","Yiming Wang","Sergey Prokudin","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2411.06390v2.pdf","comment":"Code and dataset: https://github.com/ChenYutongTHU/SplatFormer\n Project page: https://sergeyprokudin.github.io/splatformer/"},{"id":"http://arxiv.org/abs/2406.01956v3","updated":"2024-11-12T06:36:11Z","published":"2024-06-04T04:31:39Z","title":"Enhance Image-to-Image Generation with LLaVA-generated Prompts","summary":" This paper presents a novel approach to enhance image-to-image generation by\nleveraging the multimodal capabilities of the Large Language and Vision\nAssistant (LLaVA). We propose a framework where LLaVA analyzes input images and\ngenerates textual descriptions, hereinafter LLaVA-generated prompts. These\nprompts, along with the original image, are fed into the image-to-image\ngeneration pipeline. This enriched representation guides the generation process\ntowards outputs that exhibit a stronger resemblance to the input image.\nExtensive experiments demonstrate the effectiveness of LLaVA-generated prompts\nin promoting image similarity. We observe a significant improvement in the\nvisual coherence between the generated and input images compared to traditional\nmethods. Future work will explore fine-tuning LLaVA prompts for increased\ncontrol over the creative process. 
By providing more specific details within\nthe prompts, we aim to achieve a delicate balance between faithfulness to the\noriginal image and artistic expression in the generated outputs.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li"],"pdf_url":"https://arxiv.org/pdf/2406.01956v3.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2411.07581v1","updated":"2024-11-12T06:33:09Z","published":"2024-11-12T06:33:09Z","title":"Semantic segmentation on multi-resolution optical and microwave data\n using deep learning","summary":" Presently, deep learning and convolutional neural networks (CNNs) are widely\nused in the fields of image processing, image classification, object\nidentification and many more. In this work, we implemented convolutional neural\nnetwork based modified U-Net model and VGG-UNet model to automatically identify\nobjects from satellite imagery captured using high resolution Indian remote\nsensing satellites and then to pixel wise classify satellite data into various\nclasses. In this paper, Cartosat 2S (~1m spatial resolution) datasets were used\nand deep learning models were implemented to detect building shapes and ships\nfrom the test datasets with an accuracy of more than 95%. In another\nexperiment, microwave data (varied resolution) from RISAT-1 was taken as an\ninput and ships and trees were detected with an accuracy of >96% from these\ndatasets. For the classification of images into multiple-classes, deep learning\nmodel was trained on multispectral Cartosat images. Model generated results\nwere then tested using ground truth. Multi-label classification results were\nobtained with an accuracy (IoU) of better than 95%. 
Total six different\nproblems were attempted using deep learning models and IoU accuracies in the\nrange of 85% to 98% were achieved depending on the degree of complexity.\n","authors":["Jai G Singla","Bakul Vaghela"],"pdf_url":"https://arxiv.org/pdf/2411.07581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07579v1","updated":"2024-11-12T06:29:48Z","published":"2024-11-12T06:29:48Z","title":"Projecting Gaussian Ellipsoids While Avoiding Affine Projection\n Approximation","summary":" Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its\nreal-time rendering speed and state-of-the-art rendering quality. However,\nduring the rendering process, the use of the Jacobian of the affine\napproximation of the projection transformation leads to inevitable errors,\nresulting in blurriness, artifacts and a lack of scene consistency in the final\nrendered images. To address this issue, we introduce an ellipsoid-based\nprojection method to calculate the projection of Gaussian ellipsoid on the\nimage plane, witch is the primitive of 3D Gaussian Splatting. As our proposed\nellipsoid-based projection method cannot handle Gaussian ellipsoids with camera\norigins inside them or parts lying below $z=0$ plane in the camera space, we\ndesigned a pre-filtering strategy. Experiments over multiple widely adopted\nbenchmark datasets show that using our ellipsoid-based projection method can\nenhance the rendering quality of 3D Gaussian Splatting and its extensions.\n","authors":["Han Qi","Tao Cai","Xiyue Han"],"pdf_url":"https://arxiv.org/pdf/2411.07579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07578v1","updated":"2024-11-12T06:29:35Z","published":"2024-11-12T06:29:35Z","title":"Atmospheric turbulence restoration by diffeomorphic image registration\n and blind deconvolution","summary":" A novel approach is presented in this paper to improve images which are\naltered by atmospheric turbulence. 
Two new algorithms are presented based on\ntwo combinations of a blind deconvolution block, an elastic registration block\nand a temporal filter block. The algorithms are tested on real images acquired\nin the desert in New Mexico by the NATO RTG40 group.\n","authors":["Jerome Gilles","Tristan Dagobert","Carlo De Franchis"],"pdf_url":"https://arxiv.org/pdf/2411.07578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07577v1","updated":"2024-11-12T06:29:27Z","published":"2024-11-12T06:29:27Z","title":"IR image databases generation under target intrinsic thermal variability\n constraints","summary":" This paper deals with the problem of infrared image database generation for\nATR assessment purposes. Huge databases are required to have quantitative and\nobjective performance evaluations. We propose a method which superimpose\ntargets and occultants on background under image quality metrics constraints to\ngenerate realistic images. We also propose a method to generate target\nsignatures with intrinsic thermal variability based on 3D models plated with\nreal infrared textures.\n","authors":["Jerome Gilles","Stephane Landeau","Tristan Dagobert","Philippe Chevalier","Christian Bolut"],"pdf_url":"https://arxiv.org/pdf/2411.07577v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2411.06695"},{"id":"http://arxiv.org/abs/2411.07575v1","updated":"2024-11-12T06:29:18Z","published":"2024-11-12T06:29:18Z","title":"Génération de bases de données images IR sous contraintes avec\n variabilité thermique intrinsèque des cibles","summary":" In this communication, we propose a method which permits to simulate images\nof targets in infrared imagery by superimposition of vehicle signatures in\nbackground, eventually with occultants. We develop a principle which authorizes\nus to generate different thermal configurations of target signatures. 
This\nmethod enables us to easily generate huge datasets for ATR algorithms\nperformance evaluation.\n","authors":["Jerome Gilles","Stephane Landeau","Tristan Dagobert","Philippe Chevalier","Christian Bolut"],"pdf_url":"https://arxiv.org/pdf/2411.07575v1.pdf","comment":"in French language, GRETSI Symposium on Signal and Image Processing,\n Dijon, France, September 2009"},{"id":"http://arxiv.org/abs/2403.06443v2","updated":"2024-11-12T06:11:46Z","published":"2024-03-11T05:29:46Z","title":"Temporal-Mapping Photography for Event Cameras","summary":" Event cameras, or Dynamic Vision Sensors (DVS) are novel neuromorphic sensors\nthat capture brightness changes as a continuous stream of \"events\" rather than\ntraditional intensity frames. Converting sparse events to dense intensity\nframes faithfully has long been an ill-posed problem. Previous methods have\nprimarily focused on converting events to video in dynamic scenes or with a\nmoving camera. In this paper, for the first time, we realize events to dense\nintensity image conversion using a stationary event camera in static scenes\nwith a transmittance adjustment device for brightness modulation. Different\nfrom traditional methods that mainly rely on event integration, the proposed\nEvent-Based Temporal Mapping Photography (EvTemMap) measures the time of event\nemitting for each pixel. Then, the resulting Temporal Matrix is converted to an\nintensity frame with a temporal mapping neural network. At the hardware level,\nthe proposed EvTemMap is implemented by combining a transmittance adjustment\ndevice with a DVS, named Adjustable Transmittance Dynamic Vision Sensor\n(AT-DVS). Additionally, we collected TemMat dataset under various conditions\nincluding low-light and high dynamic range scenes. The experimental results\nshowcase the high dynamic range, fine-grained details, and high-grayscale\nresolution of the proposed EvTemMap. 
The code and dataset are available in\nhttps://github.com/YuHanBaozju/EvTemMap\n","authors":["Yuhan Bao","Lei Sun","Yuqin Ma","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06443v2.pdf","comment":"18 pages, 10 figures, 1 Supplementary materials"},{"id":"http://arxiv.org/abs/2408.15241v2","updated":"2024-11-12T06:08:29Z","published":"2024-08-27T17:59:41Z","title":"GenRec: Unifying Video Generation and Recognition with Diffusion Models","summary":" Video diffusion models are able to generate high-quality videos by learning\nstrong spatial-temporal priors on large-scale datasets. In this paper, we aim\nto investigate whether such priors derived from a generative process are\nsuitable for video recognition, and eventually joint optimization of generation\nand recognition. Building upon Stable Video Diffusion, we introduce GenRec, the\nfirst unified framework trained with a random-frame conditioning process so as\nto learn generalized spatial-temporal representations. The resulting framework\ncan naturally supports generation and recognition, and more importantly is\nrobust even when visual inputs contain limited information. Extensive\nexperiments demonstrate the efficacy of GenRec for both recognition and\ngeneration. In particular, GenRec achieves competitive recognition performance,\noffering 75.8% and 87.2% accuracy on SSV2 and K400, respectively. GenRec also\nperforms the best on class-conditioned image-to-video generation, achieving\n46.5 and 49.3 FVD scores on SSV2 and EK-100 datasets. Furthermore, GenRec\ndemonstrates extraordinary robustness in scenarios that only limited frames can\nbe observed. 
Code will be available at https://github.com/wengzejia1/GenRec.\n","authors":["Zejia Weng","Xitong Yang","Zhen Xing","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.15241v2.pdf","comment":"19 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2411.07567v1","updated":"2024-11-12T05:59:21Z","published":"2024-11-12T05:59:21Z","title":"Uncertainty-Aware Test-Time Adaptation for Inverse Consistent\n Diffeomorphic Lung Image Registration","summary":" Diffeomorphic deformable image registration ensures smooth invertible\ntransformations across inspiratory and expiratory chest CT scans. Yet, in\npractice, deep learning-based diffeomorphic methods struggle to capture large\ndeformations between inspiratory and expiratory volumes, and therefore lack\ninverse consistency. Existing methods also fail to account for model\nuncertainty, which can be useful for improving performance. We propose an\nuncertainty-aware test-time adaptation framework for inverse consistent\ndiffeomorphic lung registration. Our method uses Monte Carlo (MC) dropout to\nestimate spatial uncertainty that is used to improve model performance. We\ntrain and evaluate our method for inspiratory-to-expiratory CT registration on\na large cohort of 675 subjects from the COPDGene study, achieving a higher Dice\nsimilarity coefficient (DSC) between the lung boundaries (0.966) compared to\nboth VoxelMorph (0.953) and TransMorph (0.953). Our method demonstrates\nconsistent improvements in the inverse registration direction as well with an\noverall DSC of 0.966, higher than VoxelMorph (0.958) and TransMorph (0.956).\nPaired t-tests indicate statistically significant improvements.\n","authors":["Muhammad F. A. Chaudhary","Stephanie M. Aguilera","Arie Nakhmani","Joseph M. Reinhardt","Surya P. 
Bhatt","Sandeep Bodduluri"],"pdf_url":"https://arxiv.org/pdf/2411.07567v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.10839v3","updated":"2024-11-12T05:33:05Z","published":"2024-06-16T08:20:12Z","title":"Reminding Multimodal Large Language Models of Object-aware Knowledge\n with Retrieved Tags","summary":" Despite recent advances in the general visual instruction-following ability\nof Multimodal Large Language Models (MLLMs), they still struggle with critical\nproblems when required to provide a precise and detailed response to a visual\ninstruction: (1) failure to identify novel objects or entities, (2) mention of\nnon-existent objects, and (3) neglect of object's attributed details. Intuitive\nsolutions include improving the size and quality of data or using larger\nfoundation models. They show effectiveness in mitigating these issues, but at\nan expensive cost of collecting a vast amount of new data and introducing a\nsignificantly larger model. Standing at the intersection of these approaches,\nwe examine the three object-oriented problems from the perspective of the\nimage-to-text mapping process by the multimodal connector. In this paper, we\nfirst identify the limitations of multimodal connectors stemming from\ninsufficient training data. Driven by this, we propose to enhance the mapping\nwith retrieval-augmented tag tokens, which contain rich object-aware\ninformation such as object names and attributes. 
With our Tag-grounded visual\ninstruction tuning with retrieval Augmentation (TUNA), we outperform baselines\nthat share the same language model and training data on 12 benchmarks.\nFurthermore, we show the zero-shot capability of TUNA when provided with\nspecific datastores.\n","authors":["Daiqing Qi","Handong Zhao","Zijun Wei","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2406.10839v3.pdf","comment":"Main Conference at EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.07556v1","updated":"2024-11-12T05:10:32Z","published":"2024-11-12T05:10:32Z","title":"Multi-task Feature Enhancement Network for No-Reference Image Quality\n Assessment","summary":" Due to the scarcity of labeled samples in Image Quality Assessment (IQA)\ndatasets, numerous recent studies have proposed multi-task based strategies,\nwhich explore feature information from other tasks or domains to boost the IQA\ntask. Nevertheless, multi-task strategies based No-Reference Image Quality\nAssessment (NR-IQA) methods encounter several challenges. First, existing\nmethods have not explicitly exploited texture details, which significantly\ninfluence the image quality. Second, multi-task methods conventionally\nintegrate features through simple operations such as addition or concatenation,\nthereby diminishing the network's capacity to accurately represent distorted\nfeatures. To tackle these challenges, we introduce a novel multi-task NR-IQA\nframework. Our framework consists of three key components: a high-frequency\nextraction network, a quality estimation network, and a distortion-aware\nnetwork. The high-frequency extraction network is designed to guide the model's\nfocus towards high-frequency information, which is highly related to the\ntexture details. Meanwhile, the distortion-aware network extracts\ndistortion-related features to distinguish different distortion types. To\neffectively integrate features from different tasks, a feature fusion module is\ndeveloped based on an attention mechanism. 
Empirical results from five standard\nIQA databases confirm that our method not only achieves high performance but\nalso exhibits robust generalization ability.\n","authors":["Li Yu"],"pdf_url":"https://arxiv.org/pdf/2411.07556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07555v1","updated":"2024-11-12T05:09:42Z","published":"2024-11-12T05:09:42Z","title":"GaussianCut: Interactive segmentation via graph cut for 3D Gaussian\n Splatting","summary":" We introduce GaussianCut, a new method for interactive multiview segmentation\nof scenes represented as 3D Gaussians. Our approach allows for selecting the\nobjects to be segmented by interacting with a single view. It accepts intuitive\nuser input, such as point clicks, coarse scribbles, or text. Using 3D Gaussian\nSplatting (3DGS) as the underlying scene representation simplifies the\nextraction of objects of interest which are considered to be a subset of the\nscene's Gaussians. Our key idea is to represent the scene as a graph and use\nthe graph-cut algorithm to minimize an energy function to effectively partition\nthe Gaussians into foreground and background. To achieve this, we construct a\ngraph based on scene Gaussians and devise a segmentation-aligned energy\nfunction on the graph to combine user inputs with scene properties. To obtain\nan initial coarse segmentation, we leverage 2D image/video segmentation models\nand further refine these coarse estimates using our graph construction. Our\nempirical evaluations show the adaptability of GaussianCut across a diverse set\nof scenes. 
GaussianCut achieves competitive performance with state-of-the-art\napproaches for 3D segmentation without requiring any additional\nsegmentation-aware training.\n","authors":["Umangi Jain","Ashkan Mirzaei","Igor Gilitschenski"],"pdf_url":"https://arxiv.org/pdf/2411.07555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07546v1","updated":"2024-11-12T04:50:10Z","published":"2024-11-12T04:50:10Z","title":"Contrastive Language Prompting to Ease False Positives in Medical\n Anomaly Detection","summary":" A pre-trained visual-language model, contrastive language-image pre-training\n(CLIP), successfully accomplishes various downstream tasks with text prompts,\nsuch as finding images or localizing regions within the image. Despite CLIP's\nstrong multi-modal data capabilities, it remains limited in specialized\nenvironments, such as medical applications. For this purpose, many CLIP\nvariants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives\nrelated to normal regions persist. Thus, we aim to present a simple yet\nimportant goal of reducing false positives in medical anomaly detection. We\nintroduce a Contrastive LAnguage Prompting (CLAP) method that leverages both\npositive and negative text prompts. This straightforward approach identifies\npotential lesion regions by visual attention to the positive prompts in the\ngiven image. To reduce false positives, we attenuate attention on normal\nregions using negative prompts. Extensive experiments with the BMAD dataset,\nincluding six biomedical benchmarks, demonstrate that CLAP method enhances\nanomaly detection performance. 
Our future plans include developing an automated\nfine prompting method for more practical usage.\n","authors":["YeongHyeon Park","Myung Jin Kim","Hyeong Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2411.07546v1.pdf","comment":"4 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.07544v1","updated":"2024-11-12T04:47:32Z","published":"2024-11-12T04:47:32Z","title":"Depthwise Separable Convolutions with Deep Residual Convolutions","summary":" The recent advancement of edge computing enables researchers to optimize\nvarious deep learning architectures to employ them in edge devices. In this\nstudy, we aim to optimize Xception architecture which is one of the most\npopular deep learning algorithms for computer vision applications. The Xception\narchitecture is highly effective for object detection tasks. However, it comes\nwith a significant computational cost. The computational complexity of Xception\nsometimes hinders its deployment on resource-constrained edge devices. To\naddress this, we propose an optimized Xception architecture tailored for edge\ndevices, aiming for lightweight and efficient deployment. We incorporate the\ndepthwise separable convolutions with deep residual convolutions of the\nXception architecture to develop a small and efficient model for edge devices.\nThe resultant architecture reduces parameters, memory usage, and computational\nload. The proposed architecture is evaluated on the CIFAR 10 object detection\ndataset. 
The evaluation result of our experiment also shows the proposed\narchitecture is smaller in parameter size and requires less training time while\noutperforming Xception architecture performance.\n","authors":["Md Arid Hasan","Krishno Dey"],"pdf_url":"https://arxiv.org/pdf/2411.07544v1.pdf","comment":"Course Project Report"},{"id":"http://arxiv.org/abs/2411.07541v1","updated":"2024-11-12T04:40:27Z","published":"2024-11-12T04:40:27Z","title":"HiCoM: Hierarchical Coherent Motion for Streamable Dynamic Scene with 3D\n Gaussian Splatting","summary":" The online reconstruction of dynamic scenes from multi-view streaming videos\nfaces significant challenges in training, rendering and storage efficiency.\nHarnessing superior learning speed and real-time rendering capabilities, 3D\nGaussian Splatting (3DGS) has recently demonstrated considerable potential in\nthis field. However, 3DGS can be inefficient in terms of storage and prone to\noverfitting by excessively growing Gaussians, particularly with limited views.\nThis paper proposes an efficient framework, dubbed HiCoM, with three key\ncomponents. First, we construct a compact and robust initial 3DGS\nrepresentation using a perturbation smoothing strategy. Next, we introduce a\nHierarchical Coherent Motion mechanism that leverages the inherent non-uniform\ndistribution and local consistency of 3D Gaussians to swiftly and accurately\nlearn motions across frames. Finally, we continually refine the 3DGS with\nadditional Gaussians, which are later merged into the initial 3DGS to maintain\nconsistency with the evolving scene. To preserve a compact representation, an\nequivalent number of low-opacity Gaussians that minimally impact the\nrepresentation are removed before processing subsequent frames. 
Extensive\nexperiments conducted on two widely used datasets show that our framework\nimproves learning efficiency of the state-of-the-art methods by about $20\\%$\nand reduces the data storage by $85\\%$, achieving competitive free-viewpoint\nvideo synthesis quality but with higher robustness and stability. Moreover, by\nparallel learning multiple frames simultaneously, our HiCoM decreases the\naverage training wall time to $<2$ seconds per frame with negligible\nperformance degradation, substantially boosting real-world applicability and\nresponsiveness.\n","authors":["Qiankun Gao","Jiarui Meng","Chengxiang Wen","Jie Chen","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07541v1.pdf","comment":"Accepted to NeurIPS 2024; Code is avaliable at\n https://github.com/gqk/HiCoM"},{"id":"http://arxiv.org/abs/2404.09406v3","updated":"2024-11-12T04:37:47Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Marine surveys by robotic underwater and surface vehicles result in\nsubstantial quantities of coral reef imagery, however labeling these images is\nexpensive and time-consuming for domain experts. Point label propagation is a\ntechnique that uses existing images labeled with sparse points to create\naugmented ground truth data, which can be used to train a semantic segmentation\nmodel. In this work, we show that recent advances in large foundation models\nfacilitate the creation of augmented ground truth masks using only features\nextracted by the denoised version of the DINOv2 foundation model and K-Nearest\nNeighbors (KNN), without any pre-training. 
For images with extremely sparse\nlabels, we present a labeling method based on human-in-the-loop principles,\nwhich greatly enhances annotation efficiency: in the case that there are 5\npoint labels per image, our human-in-the-loop method outperforms the prior\nstate-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9%\nand 18.3% if there are 10 point labels. When human-in-the-loop labeling is not\navailable, using the denoised DINOv2 features with a KNN still improves on the\nprior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid\npoints). On the semantic segmentation task, we outperform the prior\nstate-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5\npoint labels are used for point label propagation. Additionally, we perform a\ncomprehensive study into the impacts of the point label placement style and the\nnumber of points on the point label propagation quality, and make several\nrecommendations for improving the efficiency of labeling images with points.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v3.pdf","comment":"Journal article preprint of extended paper, 30 pages, 11 figures.\n Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on\n Learning with Limited Labelled Data for Image and Video Understanding\n (L3D-IVU)"},{"id":"http://arxiv.org/abs/2407.01523v3","updated":"2024-11-12T04:37:44Z","published":"2024-07-01T17:59:26Z","title":"MMLongBench-Doc: Benchmarking Long-context Document Understanding with\n Visualizations","summary":" Understanding documents with rich layouts and multi-modal components is a\nlong-standing and practical task. Recent Large Vision-Language Models (LVLMs)\nhave made remarkable strides in various tasks, particularly in single-page\ndocument understanding (DU). However, their abilities on long-context DU remain\nan open problem. 
This work presents MMLongBench-Doc, a long-context,\nmulti-modal benchmark comprising 1,062 expert-annotated questions. Distinct\nfrom previous datasets, it is constructed upon 130 lengthy PDF-formatted\ndocuments with an average of 49.4 pages and 20,971 textual tokens. Towards\ncomprehensive evaluation, answers to these questions rely on pieces of evidence\nfrom (1) different sources (text, image, chart, table, and layout structure)\nand (2) various locations (i.e. page number). Moreover, 33.2% of the questions\nare cross-page questions requiring evidence across multiple pages. 22.8% of the\nquestions are designed to be unanswerable for detecting potential\nhallucinations. Experiments on 14 LVLMs demonstrate that long-context DU\ngreatly challenges current models. Notably, the best-performing model, GPT-4o,\nachieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores\n31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse\nperformance than their LLM counterparts which are fed with lossy-parsed OCR\ndocuments. These results validate the necessity of future research toward more\ncapable long-context LVLMs. 
Project Page:\nhttps://mayubo2333.github.io/MMLongBench-Doc\n","authors":["Yubo Ma","Yuhang Zang","Liangyu Chen","Meiqi Chen","Yizhu Jiao","Xinze Li","Xinyuan Lu","Ziyu Liu","Yan Ma","Xiaoyi Dong","Pan Zhang","Liangming Pan","Yu-Gang Jiang","Jiaqi Wang","Yixin Cao","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01523v3.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight)"},{"id":"http://arxiv.org/abs/2409.01652v2","updated":"2024-11-12T04:33:26Z","published":"2024-09-03T06:45:22Z","title":"ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for\n Robotic Manipulation","summary":" Representing robotic manipulation tasks as constraints that associate the\nrobot and the environment is a promising way to encode desired robot behaviors.\nHowever, it remains unclear how to formulate the constraints such that they are\n1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable\nby off-the-shelf solvers to produce robot actions in real-time. In this work,\nwe introduce Relational Keypoint Constraints (ReKep), a visually-grounded\nrepresentation for constraints in robotic manipulation. Specifically, ReKep is\nexpressed as Python functions mapping a set of 3D keypoints in the environment\nto a numerical cost. We demonstrate that by representing a manipulation task as\na sequence of Relational Keypoint Constraints, we can employ a hierarchical\noptimization procedure to solve for robot actions (represented by a sequence of\nend-effector poses in SE(3)) with a perception-action loop at a real-time\nfrequency. Furthermore, in order to circumvent the need for manual\nspecification of ReKep for each new task, we devise an automated procedure that\nleverages large vision models and vision-language models to produce ReKep from\nfree-form language instructions and RGB-D observations. 
We present system\nimplementations on a wheeled single-arm platform and a stationary dual-arm\nplatform that can perform a large variety of manipulation tasks, featuring\nmulti-stage, in-the-wild, bimanual, and reactive behaviors, all without\ntask-specific data or environment models. Website at\nhttps://rekep-robot.github.io/.\n","authors":["Wenlong Huang","Chen Wang","Yunzhu Li","Ruohan Zhang","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2409.01652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09729v2","updated":"2024-11-12T04:19:32Z","published":"2024-10-13T05:19:09Z","title":"MIRAGE: Multimodal Identification and Recognition of Annotations in\n Indian General Prescriptions","summary":" Hospitals in India still rely on handwritten medical records despite the\navailability of Electronic Medical Records (EMR), complicating statistical\nanalysis and record retrieval. Handwritten records pose a unique challenge,\nrequiring specialized data for training models to recognize medications and\ntheir recommendation patterns. While traditional handwriting recognition\napproaches employ 2-D LSTMs, recent studies have explored using Multimodal\nLarge Language Models (MLLMs) for OCR tasks. Building on this approach, we\nfocus on extracting medication names and dosages from simulated medical\nrecords. Our methodology MIRAGE (Multimodal Identification and Recognition of\nAnnotations in indian GEneral prescriptions) involves fine-tuning the QWEN VL,\nLLaVA 1.6 and Idefics2 models on 743,118 high resolution simulated medical\nrecord images-fully annotated from 1,133 doctors across India. Our approach\nachieves 82% accuracy in extracting medication names and dosages.\n","authors":["Tavish Mankash","V. S. 
Chaithanya Kota","Anish De","Praveen Prakash","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2410.09729v2.pdf","comment":"5 pages, 9 figures, 3 tables, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2404.09227v2","updated":"2024-11-12T04:08:05Z","published":"2024-04-14T12:13:07Z","title":"DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation\n Modeling","summary":" Recent progress in text-to-3D creation has been propelled by integrating the\npotent prior of Diffusion Models from text-to-image generation into the 3D\ndomain. Nevertheless, generating 3D scenes characterized by multiple instances\nand intricate arrangements remains challenging. In this study, we present\nDreamScape, a method for creating highly consistent 3D scenes solely from\ntextual descriptions, leveraging the strong 3D representation capabilities of\nGaussian Splatting and the complex arrangement abilities of large language\nmodels (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene\nrepresentation, consisting of semantic primitives (objects) and their spatial\ntransformations and relationships derived directly from text prompts using\nLLMs. This compositional representation allows for local-to-global optimization\nof the entire scene. A progressive scale control is tailored during local\nobject generation, ensuring that objects of different sizes and densities adapt\nto the scene, which addresses training instability issue arising from simple\nblending in the subsequent global optimization stage. To mitigate potential\nbiases of LLM priors, we model collision relationships between objects at the\nglobal level, enhancing physical correctness and overall realism. 
Additionally,\nto generate pervasive objects like rain and snow distributed extensively across\nthe scene, we introduce a sparse initialization and densification strategy.\nExperiments demonstrate that DreamScape offers high usability and\ncontrollability, enabling the generation of high-fidelity 3D scenes from only\ntext prompts and achieving state-of-the-art performance compared to other\nmethods.\n","authors":["Xuening Yuan","Hongyu Yang","Yueming Zhao","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2404.09227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17465v3","updated":"2024-11-12T03:27:41Z","published":"2024-03-26T07:55:16Z","title":"LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated\n Image Detection","summary":" The evolution of Diffusion Models has dramatically improved image generation\nquality, making it increasingly difficult to differentiate between real and\ngenerated images. This development, while impressive, also raises significant\nprivacy and security concerns. In response to this, we propose a novel Latent\nREconstruction error guided feature REfinement method (LaRE^2) for detecting\nthe diffusion-generated images. We come up with the Latent Reconstruction Error\n(LaRE), the first reconstruction-error based feature in the latent space for\ngenerated image detection. LaRE surpasses existing methods in terms of feature\nextraction efficiency while preserving crucial cues required to differentiate\nbetween the real and the fake. To exploit LaRE, we propose an Error-Guided\nfeature REfinement module (EGRE), which can refine the image feature guided by\nLaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an\nalign-then-refine mechanism, which effectively refines the image feature for\ngenerated-image detection from both spatial and channel perspectives. 
Extensive\nexperiments on the large-scale GenImage benchmark demonstrate the superiority\nof our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1%\naverage ACC/AP across 8 different image generators. LaRE also surpasses\nexisting methods in terms of feature extraction cost, delivering an impressive\nspeed enhancement of 8 times. Code is available.\n","authors":["Yunpeng Luo","Junlong Du","Ke Yan","Shouhong Ding"],"pdf_url":"https://arxiv.org/pdf/2403.17465v3.pdf","comment":"CVPR 2024. Code is available at https://github.com/luo3300612/LaRE"},{"id":"http://arxiv.org/abs/2411.07516v1","updated":"2024-11-12T03:25:33Z","published":"2024-11-12T03:25:33Z","title":"SparrowVQE: Visual Question Explanation for Course Content Understanding","summary":" Visual Question Answering (VQA) research seeks to create AI systems to answer\nnatural language questions in images, yet VQA methods often yield overly\nsimplistic and short answers. This paper aims to advance the field by\nintroducing Visual Question Explanation (VQE), which enhances the ability of\nVQA to provide detailed explanations rather than brief responses and address\nthe need for more complex interaction with visual content. We first created an\nMLVQE dataset from a 14-week streamed video machine learning course, including\n885 slide images, 110,407 words of transcripts, and 9,416 designed\nquestion-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3\nbillion parameters multimodal model. We trained our model with a three-stage\ntraining mechanism consisting of multimodal pre-training (slide images and\ntranscripts feature alignment), instruction tuning (tuning the pre-trained\nmodel with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide\nimage and QA pairs). Eventually, our SparrowVQE can understand and connect\nvisual information using the SigLIP model with transcripts using the Phi-2\nlanguage model with an MLP adapter. 
Experimental results demonstrate that our\nSparrowVQE achieves better performance in our developed MLVQE dataset and\noutperforms state-of-the-art methods in the other five benchmark VQA datasets.\nThe source code is available at\n\\url{https://github.com/YoushanZhang/SparrowVQE}.\n","authors":["Jialu Li","Manish Kumar Thota","Ruslan Gokhman","Radek Holik","Youshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07503v1","updated":"2024-11-12T03:01:39Z","published":"2024-11-12T03:01:39Z","title":"A Novel Automatic Real-time Motion Tracking Method for Magnetic\n Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced\n Tracking-Learning-Detection Framework with Automatic Segmentation","summary":" Objective: Ensuring the precision in motion tracking for MRI-guided\nRadiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This\nstudy refined the motion tracking accuracy in MRIgRT through the innovation of\nan automatic real-time tracking method, leveraging an enhanced\nTracking-Learning-Detection (ETLD) framework coupled with automatic\nsegmentation. Methods: We developed a novel MRIgRT motion tracking method by\nintegrating two primary methods: the ETLD framework and an improved Chan-Vese\nmodel (ICV), named ETLD+ICV. The TLD framework was upgraded to suit real-time\ncine MRI, including advanced image preprocessing, no-reference image quality\nassessment, an enhanced median-flow tracker, and a refined detector with\ndynamic search region adjustments. Additionally, ICV was combined for precise\ncoverage of the target volume, which refined the segmented region frame by\nframe using tracking results, with key parameters optimized. Tested on 3.5D MRI\nscans from 10 patients with liver metastases, our method ensures precise\ntracking and accurate segmentation vital for MRIgRT. 
Results: An evaluation of\n106,000 frames across 77 treatment fractions revealed sub-millimeter tracking\nerrors of less than 0.8mm, with over 99% precision and 98% recall for all\nsubjects, underscoring the robustness and efficacy of the ETLD. Moreover, the\nETLD+ICV yielded a dice global score of more than 82% for all subjects,\ndemonstrating the proposed method's extensibility and precise target volume\ncoverage. Conclusions: This study successfully developed an automatic real-time\nmotion tracking method for MRIgRT that markedly surpasses current methods. The\nnovel method not only delivers exceptional precision in tracking and\nsegmentation but also demonstrates enhanced adaptability to clinical demands,\npositioning it as an indispensable asset in the quest to augment the efficacy\nof radiotherapy treatments.\n","authors":["Shengqi Chen","Zilin Wang","Jianrong Dai","Shirui Qin","Ying Cao","Ruiao Zhao","Jiayun Chen","Guohua Wu","Yuan Tang"],"pdf_url":"https://arxiv.org/pdf/2411.07503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15364v2","updated":"2024-11-12T03:00:07Z","published":"2023-12-23T22:27:40Z","title":"WildScenes: A Benchmark for 2D and 3D Semantic Segmentation in\n Large-scale Natural Environments","summary":" Recent progress in semantic scene understanding has primarily been enabled by\nthe availability of semantically annotated bi-modal (camera and LiDAR) datasets\nin urban environments. However, such annotated datasets are also needed for\nnatural, unstructured environments to enable semantic perception for\napplications, including conservation, search and rescue, environment\nmonitoring, and agricultural automation. Therefore, we introduce $WildScenes$,\na bi-modal benchmark dataset consisting of multiple large-scale, sequential\ntraversals in natural environments, including semantic annotations in\nhigh-resolution 2D images and dense 3D LiDAR point clouds, and accurate 6-DoF\npose information. 
The data is (1) trajectory-centric with accurate localization\nand globally aligned point clouds, (2) calibrated and synchronized to support\nbi-modal training and inference, and (3) containing different natural\nenvironments over 6 months to support research on domain adaptation. Our 3D\nsemantic labels are obtained via an efficient, automated process that transfers\nthe human-annotated 2D labels from multiple views into 3D point cloud\nsequences, thus circumventing the need for expensive and time-consuming human\nannotation in 3D. We introduce benchmarks on 2D and 3D semantic segmentation\nand evaluate a variety of recent deep-learning techniques to demonstrate the\nchallenges in semantic segmentation in natural environments. We propose\ntrain-val-test splits for standard benchmarks as well as domain adaptation\nbenchmarks and utilize an automated split generation technique to ensure the\nbalance of class label distributions. The $WildScenes$ benchmark webpage is\nhttps://csiro-robotics.github.io/WildScenes, and the data is publicly available\nat https://data.csiro.au/collection/csiro:61541 .\n","authors":["Kavisha Vidanapathirana","Joshua Knights","Stephen Hausler","Mark Cox","Milad Ramezani","Jason Jooste","Ethan Griffiths","Shaheer Mohamed","Sridha Sridharan","Clinton Fookes","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2312.15364v2.pdf","comment":"Accepted in the The International Journal of Robotics Research (IJRR)"},{"id":"http://arxiv.org/abs/2411.07501v1","updated":"2024-11-12T02:57:15Z","published":"2024-11-12T02:57:15Z","title":"LAUREL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. 
Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v1.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.07483v1","updated":"2024-11-12T02:12:41Z","published":"2024-11-12T02:12:41Z","title":"Quantifying Knowledge Distillation Using Partial Information\n Decomposition","summary":" Knowledge distillation provides an effective method for deploying complex\nmachine learning models in resource-constrained environments. It typically\ninvolves training a smaller student model to emulate either the probabilistic\noutputs or the internal feature representations of a larger teacher model. By\ndoing so, the student model often achieves substantially better performance on\na downstream task compared to when it is trained independently. Nevertheless,\nthe teacher's internal representations can also encode noise or additional\ninformation that may not be relevant to the downstream task. This observation\nmotivates our primary question: What are the information-theoretic limits of\nknowledge transfer? 
To this end, we leverage a body of work in information\ntheory called Partial Information Decomposition (PID) to quantify the\ndistillable and distilled knowledge of a teacher's representation corresponding\nto a given student and a downstream task. Moreover, we demonstrate that this\nmetric can be practically used in distillation to address challenges caused by\nthe complexity gap between the teacher and the student representations.\n","authors":["Pasan Dissanayake","Faisal Hamman","Barproda Halder","Ilia Sucholutsky","Qiuyi Zhang","Sanghamitra Dutta"],"pdf_url":"https://arxiv.org/pdf/2411.07483v1.pdf","comment":"Accepted at NeurIPS 2024 Machine Learning and Compression Workshop"},{"id":"http://arxiv.org/abs/2406.06535v3","updated":"2024-11-12T02:10:13Z","published":"2024-04-23T03:11:08Z","title":"Utilizing Graph Generation for Enhanced Domain Adaptive Object Detection","summary":" The problem of Domain Adaptive in the field of Object Detection involves the\ntransfer of object detection models from labeled source domains to unannotated\ntarget domains. Recent advancements in this field aim to address domain\ndiscrepancies by aligning pixel-pairs across domains within a non-Euclidean\ngraphical space, thereby minimizing semantic distribution variance. Despite\ntheir remarkable achievements, these methods often use coarse semantic\nrepresentations to model graphs, mainly due to ignoring non-informative\nelements and failing to focus on precise semantic alignment. Additionally, the\ngeneration of coarse graphs inherently introduces abnormal nodes, posing\nchallenges and potentially biasing domain adaptation outcomes. Consequently, we\npropose a framework, which utilizes the Graph Generation to enhance the quality\nof DAOD (\\method{}). Specifically, we introduce a Node Refinement module that\nutilizes a memory bank to reconstruct noisy sampled nodes while applying\ncontrastive regularization to noisy features. 
To enhance semantic alignment, we\npropose separating domain-specific styles from category invariance encoded\nwithin graph covariances, which allows us to selectively remove domain-specific\nstyles while preserving category-invariant information, thus facilitating more\naccurate semantic alignment across different domains. Furthermore, we propose a\nGraph Optimization adaptor, leveraging variational inference to mitigate the\nimpact of abnormal nodes. Extensive experimentation across three adaptation\nbenchmarks validates that \\method{} achieves state-of-the-art performance in\nthe task of unsupervised domain adaptation.\n","authors":["Mu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07478v1","updated":"2024-11-12T01:51:05Z","published":"2024-11-12T01:51:05Z","title":"GUS-IR: Gaussian Splatting with Unified Shading for Inverse Rendering","summary":" Recovering the intrinsic physical attributes of a scene from images,\ngenerally termed as the inverse rendering problem, has been a central and\nchallenging task in computer vision and computer graphics. In this paper, we\npresent GUS-IR, a novel framework designed to address the inverse rendering\nproblem for complicated scenes featuring rough and glossy surfaces. This paper\nstarts by analyzing and comparing two prominent shading techniques popularly\nused for inverse rendering, forward shading and deferred shading, effectiveness\nin handling complex materials. More importantly, we propose a unified shading\nsolution that combines the advantages of both techniques for better\ndecomposition. In addition, we analyze the normal modeling in 3D Gaussian\nSplatting (3DGS) and utilize the shortest axis as normal for each particle in\nGUS-IR, along with a depth-related regularization, resulting in improved\ngeometric representation and better shape reconstruction. 
Furthermore, we\nenhance the probe-based baking scheme proposed by GS-IR to achieve more\naccurate ambient occlusion modeling to better handle indirect illumination.\nExtensive experiments have demonstrated the superior performance of GUS-IR in\nachieving precise intrinsic decomposition and geometric representation,\nsupporting many downstream tasks (such as relighting, retouching) in computer\nvision, graphics, and extended reality.\n","authors":["Zhihao Liang","Hongdong Li","Kui Jia","Kailing Guo","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07478v1.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.07520v3","updated":"2024-11-12T01:45:49Z","published":"2024-06-11T17:50:15Z","title":"Neural Gaffer: Relighting Any Object via Diffusion","summary":" Single-image relighting is a challenging task that involves reasoning about\nthe complex interplay between geometry, materials, and lighting. Many prior\nmethods either support only specific categories of images, such as portraits,\nor require special capture conditions, like using a flashlight. Alternatively,\nsome methods explicitly decompose a scene into intrinsic components, such as\nnormals and BRDFs, which can be inaccurate or under-expressive. In this work,\nwe propose a novel end-to-end 2D relighting diffusion model, called Neural\nGaffer, that takes a single image of any object and can synthesize an accurate,\nhigh-quality relit image under any novel environmental lighting condition,\nsimply by conditioning an image generator on a target environment map, without\nan explicit scene decomposition. Our method builds on a pre-trained diffusion\nmodel, and fine-tunes it on a synthetic relighting dataset, revealing and\nharnessing the inherent understanding of lighting present in the diffusion\nmodel. 
We evaluate our model on both synthetic and in-the-wild Internet imagery\nand demonstrate its advantages in terms of generalization and accuracy.\nMoreover, by combining with other generative methods, our model enables many\ndownstream 2D tasks, such as text-based relighting and object insertion. Our\nmodel can also operate as a strong relighting prior for 3D tasks, such as\nrelighting a radiance field.\n","authors":["Haian Jin","Yuan Li","Fujun Luan","Yuanbo Xiangli","Sai Bi","Kai Zhang","Zexiang Xu","Jin Sun","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2406.07520v3.pdf","comment":"Project Website: https://neural-gaffer.github.io"},{"id":"http://arxiv.org/abs/2405.14864v3","updated":"2024-11-12T01:31:41Z","published":"2024-05-23T17:59:40Z","title":"Video Diffusion Models are Training-free Motion Interpreter and\n Controller","summary":" Video generation primarily aims to model authentic and customized motion\nacross frames, making understanding and controlling the motion a crucial topic.\nMost diffusion-based studies on video motion focus on motion customization with\ntraining-based paradigms, which, however, demands substantial training\nresources and necessitates retraining for diverse models. Crucially, these\napproaches do not explore how video diffusion models encode cross-frame motion\ninformation in their features, lacking interpretability and transparency in\ntheir effectiveness. To answer this question, this paper introduces a novel\nperspective to understand, localize, and manipulate motion-aware features in\nvideo diffusion models. Through analysis using Principal Component Analysis\n(PCA), our work discloses that robust motion-aware feature already exists in\nvideo diffusion models. We present a new MOtion FeaTure (MOFT) by eliminating\ncontent correlation information and filtering motion channels. 
MOFT provides a\ndistinct set of benefits, including the ability to encode comprehensive motion\ninformation with clear interpretability, extraction without the need for\ntraining, and generalizability across diverse architectures. Leveraging MOFT,\nwe propose a novel training-free video motion control framework. Our method\ndemonstrates competitive performance in generating natural and faithful motion,\nproviding architecture-agnostic insights and applicability in a variety of\ndownstream tasks.\n","authors":["Zeqi Xiao","Yifan Zhou","Shuai Yang","Xingang Pan"],"pdf_url":"https://arxiv.org/pdf/2405.14864v3.pdf","comment":"Accepted by NeurIPS 2024. Project Page:\n https://xizaoqu.github.io/moft/"},{"id":"http://arxiv.org/abs/2411.07472v1","updated":"2024-11-12T01:17:27Z","published":"2024-11-12T01:17:27Z","title":"Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating\n Robustness of AI-Generated Image detectors","summary":" Text-to-image diffusion models have impactful applications in art, design,\nand entertainment, yet these technologies also pose significant risks by\nenabling the creation and dissemination of misinformation. Although recent\nadvancements have produced AI-generated image detectors that claim robustness\nagainst various augmentations, their true effectiveness remains uncertain. Do\nthese detectors reliably identify images with different levels of augmentation?\nAre they biased toward specific scenes or data distributions? To investigate,\nwe introduce SEMI-TRUTHS, featuring 27,600 real images, 223,400 masks, and\n1,472,700 AI-augmented images that feature targeted and localized perturbations\nproduced using diverse augmentation techniques, diffusion models, and data\ndistributions. Each augmented image is accompanied by metadata for standardized\nand targeted evaluation of detector robustness. 
Our findings suggest that\nstate-of-the-art detectors exhibit varying sensitivities to the types and\ndegrees of perturbations, data distributions, and augmentation methods used,\noffering new insights into their performance and limitations. The code for the\naugmentation and evaluation pipeline is available at\nhttps://github.com/J-Kruk/SemiTruths.\n","authors":["Anisha Pal","Julia Kruk","Mansi Phute","Manognya Bhattaram","Diyi Yang","Duen Horng Chau","Judy Hoffman"],"pdf_url":"https://arxiv.org/pdf/2411.07472v1.pdf","comment":"Accepted at NeurIPS 2024 Track Datasets & Benchmarks Track"},{"id":"http://arxiv.org/abs/2404.08926v3","updated":"2024-11-12T01:16:04Z","published":"2024-04-13T08:27:10Z","title":"Diffusion Models Meet Remote Sensing: Principles, Methods, and\n Perspectives","summary":" As a newly emerging advance in deep generative models, diffusion models have\nachieved state-of-the-art results in many fields, including computer vision,\nnatural language processing, and molecule design. The remote sensing (RS)\ncommunity has also noticed the powerful ability of diffusion models and quickly\napplied them to a variety of tasks for image processing. Given the rapid\nincrease in research on diffusion models in the field of RS, it is necessary to\nconduct a comprehensive review of existing diffusion model-based RS papers, to\nhelp researchers recognize the potential of diffusion models and provide some\ndirections for further exploration. Specifically, this article first introduces\nthe theoretical background of diffusion models, and then systematically reviews\nthe applications of diffusion models in RS, including image generation,\nenhancement, and interpretation. 
Finally, the limitations of existing RS\ndiffusion models and worthy research directions for further exploration are\ndiscussed and summarized.\n","authors":["Yidan Liu","Jun Yue","Shaobo Xia","Pedram Ghamisi","Weiying Xie","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2404.08926v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v1","updated":"2024-11-12T00:54:26Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. 
The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation.\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v1.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.07462v1","updated":"2024-11-12T00:53:20Z","published":"2024-11-12T00:53:20Z","title":"MureObjectStitch: Multi-reference Image Composition","summary":" Generative image composition aims to regenerate the given foreground object\nin the background image to produce a realistic composite image. In this work,\nwe propose an effective finetuning strategy for generative image composition\nmodel, in which we finetune a pretrained model using one or more images\ncontaining the same foreground object. Moreover, we propose a multi-reference\nstrategy, which allows the model to take in multiple reference images of the\nforeground object. The experiments on MureCOM dataset verify the effectiveness\nof our method.\n","authors":["Jiaxuan Chen","Bo Zhang","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2411.07462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07461v1","updated":"2024-11-12T00:52:52Z","published":"2024-11-12T00:52:52Z","title":"BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions","summary":" We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that\nbridges the gap between descriptive synthetic captions and factual web-scale\nalt-text. KALE augments synthetic dense image captions with web-scale alt-text\nto generate factually grounded image captions. 
Our two-stage approach leverages\nlarge vision-language models and language models to create knowledge-augmented\ncaptions, which are then used to train a specialized VLM for scaling up the\ndataset. We train vision-language models on KALE and demonstrate improvements\non vision-language tasks. Our experiments show the utility of KALE for training\nmore capable and knowledgeable multimodal models. We release the KALE dataset\nat https://huggingface.co/datasets/Salesforce/blip3-kale\n","authors":["Anas Awadalla","Le Xue","Manli Shu","An Yan","Jun Wang","Senthil Purushwalkam","Sheng Shen","Hannah Lee","Oscar Lo","Jae Sung Park","Etash Guha","Silvio Savarese","Ludwig Schmidt","Yejin Choi","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03190v2","updated":"2024-11-12T00:37:33Z","published":"2024-10-04T07:05:16Z","title":"Tuning Timestep-Distilled Diffusion Model Using Pairwise Sample\n Optimization","summary":" Recent advancements in timestep-distilled diffusion models have enabled\nhigh-quality image generation that rivals non-distilled multi-step models, but\nwith significantly fewer inference steps. While such models are attractive for\napplications due to the low inference cost and latency, fine-tuning them with a\nnaive diffusion objective would result in degraded and blurry outputs. An\nintuitive alternative is to repeat the diffusion distillation process with a\nfine-tuned teacher model, which produces good results but is cumbersome and\ncomputationally intensive; the distillation training usually requires magnitude\nhigher of training compute compared to fine-tuning for specific image styles.\nIn this paper, we present an algorithm named pairwise sample optimization\n(PSO), which enables the direct fine-tuning of an arbitrary timestep-distilled\ndiffusion model. 
PSO introduces additional reference images sampled from the\ncurrent time-step distilled model, and increases the relative likelihood margin\nbetween the training images and reference images. This enables the model to\nretain its few-step generation ability, while allowing for fine-tuning of its\noutput distribution. We also demonstrate that PSO is a generalized formulation\nwhich can be flexibly extended to both offline-sampled and online-sampled\npairwise data, covering various popular objectives for diffusion model\npreference optimization. We evaluate PSO in both preference optimization and\nother fine-tuning tasks, including style transfer and concept customization. We\nshow that PSO can directly adapt distilled models to human-preferred generation\nwith both offline and online-generated pairwise preference image data. PSO also\ndemonstrates effectiveness in style transfer and concept customization by\ndirectly tuning timestep-distilled diffusion models.\n","authors":["Zichen Miao","Zhengyuan Yang","Kevin Lin","Ze Wang","Zicheng Liu","Lijuan Wang","Qiang Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.03190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02504v3","updated":"2024-11-12T00:21:00Z","published":"2024-05-03T22:33:46Z","title":"Functional Imaging Constrained Diffusion for Brain PET Synthesis from\n Structural MRI","summary":" Magnetic resonance imaging (MRI) and positron emission tomography (PET) are\nincreasingly used in multimodal analysis of neurodegenerative disorders. While\nMRI is broadly utilized in clinical settings, PET is less accessible. Many\nstudies have attempted to use deep generative models to synthesize PET from MRI\nscans. However, they often suffer from unstable training and inadequately\npreserve brain functional information conveyed by PET. 
To this end, we propose\na functional imaging constrained diffusion (FICD) framework for 3D brain PET\nimage synthesis with paired structural MRI as input condition, through a new\nconstrained diffusion model (CDM). The FICD introduces noise to PET and then\nprogressively removes it with CDM, ensuring high output fidelity throughout a\nstable training phase. The CDM learns to predict denoised PET with a functional\nimaging constraint introduced to ensure voxel-wise alignment between each\ndenoised PET and its ground truth. Quantitative and qualitative analyses\nconducted on 293 subjects with paired T1-weighted MRI and\n18F-fluorodeoxyglucose (FDG)-PET scans suggest that FICD achieves superior\nperformance in generating FDG-PET data compared to state-of-the-art methods. We\nfurther validate the effectiveness of the proposed FICD on data from a total of\n1,262 subjects through three downstream tasks, with experimental results\nsuggesting its utility and generalizability.\n","authors":["Minhui Yu","Mengqi Wu","Ling Yue","Andrea Bozoki","Mingxia Liu"],"pdf_url":"https://arxiv.org/pdf/2405.02504v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07449v1","updated":"2024-11-12T00:20:11Z","published":"2024-11-12T00:20:11Z","title":"Tracing the Roots: Leveraging Temporal Dynamics in Diffusion\n Trajectories for Origin Attribution","summary":" Diffusion models have revolutionized image synthesis, garnering significant\nresearch interest in recent years. Diffusion is an iterative algorithm in which\nsamples are generated step-by-step, starting from pure noise. This process\nintroduces the notion of diffusion trajectories, i.e., paths from the standard\nGaussian distribution to the target image distribution. In this context, we\nstudy discriminative algorithms operating on these trajectories. 
Specifically,\ngiven a pre-trained diffusion model, we consider the problem of classifying\nimages as part of the training dataset, generated by the model or originating\nfrom an external source. Our approach demonstrates the presence of patterns\nacross steps that can be leveraged for classification. We also conduct ablation\nstudies, which reveal that using higher-order gradient features to characterize\nthe trajectories leads to significant performance gains and more robust\nalgorithms.\n","authors":["Andreas Floros","Seyed-Mohsen Moosavi-Dezfooli","Pier Luigi Dragotti"],"pdf_url":"https://arxiv.org/pdf/2411.07449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13632v3","updated":"2024-11-12T00:12:39Z","published":"2023-12-21T07:48:54Z","title":"TraceFL: Interpretability-Driven Debugging in Federated Learning via\n Neuron Provenance","summary":" In Federated Learning, clients train models on local data and send updates to\na central server, which aggregates them into a global model using a fusion\nalgorithm. This collaborative yet privacy-preserving training comes at a\ncost--FL developers face significant challenges in attributing global model\npredictions to specific clients. Localizing responsible clients is a crucial\nstep towards (a) excluding clients primarily responsible for incorrect\npredictions and (b) encouraging clients who contributed high-quality models to\ncontinue participating in the future. Existing ML explainability approaches are\ninherently inapplicable as they are designed for single-model, centralized\ntraining.\n We introduce TraceFL, a fine-grained neuron provenance capturing mechanism\nthat identifies clients responsible for the global model's prediction by\ntracking the flow of information from individual clients to the global model.\nSince inference on different inputs activates a different set of neurons of the\nglobal model, TraceFL dynamically quantifies the significance of the global\nmodel's neurons in a given prediction. 
It then selectively picks a slice of the\nmost crucial neurons in the global model and maps them to the corresponding\nneurons in every participating client to determine each client's contribution,\nultimately localizing the responsible client. We evaluate TraceFL on six\ndatasets, including two real-world medical imaging datasets and four neural\nnetworks, including advanced models such as GPT. TraceFL achieves 99% accuracy\nin localizing the responsible client in FL tasks spanning both image and text\nclassification tasks. At a time when state-of-the-art ML debugging approaches\nare mostly domain-specific (e.g., image classification only), TraceFL is the\nfirst technique to enable highly accurate automated reasoning across a wide\nrange of FL applications.\n","authors":["Waris Gill","Ali Anwar","Muhammad Ali Gulzar"],"pdf_url":"https://arxiv.org/pdf/2312.13632v3.pdf","comment":"Accepted at 2025 IEEE/ACM 47th International Conference on Software\n Engineering (ICSE)"},{"id":"http://arxiv.org/abs/2411.07445v1","updated":"2024-11-12T00:07:16Z","published":"2024-11-12T00:07:16Z","title":"All-in-one Weather-degraded Image Restoration via Adaptive\n Degradation-aware Self-prompting Model","summary":" Existing approaches for all-in-one weather-degraded image restoration suffer\nfrom inefficiencies in leveraging degradation-aware priors, resulting in\nsub-optimal performance in adapting to different weather conditions. To this\nend, we develop an adaptive degradation-aware self-prompting model (ADSM) for\nall-in-one weather-degraded image restoration. Specifically, our model employs\nthe contrastive language-image pre-training model (CLIP) to facilitate the\ntraining of our proposed latent prompt generators (LPGs), which represent three\ntypes of latent prompts to characterize the degradation type, degradation\nproperty and image caption. 
Moreover, we integrate the acquired\ndegradation-aware prompts into the time embedding of diffusion model to improve\ndegradation perception. Meanwhile, we employ the latent caption prompt to guide\nthe reverse sampling process using the cross-attention mechanism, thereby\nguiding the accurate image reconstruction. Furthermore, to accelerate the\nreverse sampling procedure of diffusion model and address the limitations of\nfrequency perception, we introduce a wavelet-oriented noise estimating network\n(WNE-Net). Extensive experiments conducted on eight publicly available datasets\ndemonstrate the effectiveness of our proposed approach in both task-specific\nand all-in-one applications.\n","authors":["Yuanbo Wen","Tao Gao","Ziqi Li","Jing Zhang","Kaihao Zhang","Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08227v1","updated":"2024-11-12T22:43:16Z","published":"2024-11-12T22:43:16Z","title":"DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution\n Detection","summary":" Out-of-distribution (OOD) detection is essential for ensuring the robustness\nof machine learning models by identifying samples that deviate from the\ntraining distribution. While traditional OOD detection has primarily focused on\nsingle-modality inputs, such as images, recent advances in multimodal models\nhave demonstrated the potential of leveraging multiple modalities (e.g., video,\noptical flow, audio) to enhance detection performance. However, existing\nmethods often overlook intra-class variability within in-distribution (ID)\ndata, assuming that samples of the same class are perfectly cohesive and\nconsistent. This assumption can lead to performance degradation, especially\nwhen prediction discrepancies are uniformly amplified across all samples. 
To\naddress this issue, we propose Dynamic Prototype Updating (DPU), a novel\nplug-and-play framework for multimodal OOD detection that accounts for\nintra-class variations. Our method dynamically updates class center\nrepresentations for each class by measuring the variance of similar samples\nwithin each batch, enabling adaptive adjustments. This approach allows us to\namplify prediction discrepancies based on the updated class centers, thereby\nimproving the model's robustness and generalization across different\nmodalities. Extensive experiments on two tasks, five datasets, and nine base\nOOD algorithms demonstrate that DPU significantly improves OOD detection\nperformance, setting a new state-of-the-art in multimodal OOD detection, with\nimprovements of up to 80 percent in Far-OOD detection. To facilitate\naccessibility and reproducibility, our code is publicly available on GitHub.\n","authors":["Shawn Li","Huixian Gong","Hao Dong","Tiankai Yang","Zhengzhong Tu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08216v1","updated":"2024-11-12T22:16:50Z","published":"2024-11-12T22:16:50Z","title":"GTA: Global Tracklet Association for Multi-Object Tracking in Sports","summary":" Multi-object tracking in sports scenarios has become one of the focal points\nin computer vision, experiencing significant advancements through the\nintegration of deep learning techniques. Despite these breakthroughs,\nchallenges remain, such as accurately re-identifying players upon re-entry into\nthe scene and minimizing ID switches. In this paper, we propose an\nappearance-based global tracklet association algorithm designed to enhance\ntracking performance by splitting tracklets containing multiple identities and\nconnecting tracklets seemingly from the same identity. This method can serve as\na plug-and-play refinement tool for any multi-object tracker to further boost\ntheir performance. 
The proposed method achieved a new state-of-the-art\nperformance on the SportsMOT dataset with HOTA score of 81.04%. Similarly, on\nthe SoccerNet dataset, our method enhanced multiple trackers' performance,\nconsistently increasing the HOTA score from 79.41% to 83.11%. These significant\nand consistent improvements across different trackers and datasets underscore\nour proposed method's potential impact on the application of sports player\ntracking. We open-source our project codebase at\nhttps://github.com/sjc042/gta-link.git.\n","authors":["Jiacheng Sun","Hsiang-Wei Huang","Cheng-Yen Yang","Zhongyu Jiang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2411.08216v1.pdf","comment":"Accepted by ACCV 2024 MLCSA Workshop"},{"id":"http://arxiv.org/abs/2411.08196v1","updated":"2024-11-12T21:34:30Z","published":"2024-11-12T21:34:30Z","title":"Latent Space Disentanglement in Diffusion Transformers Enables Precise\n Zero-shot Semantic Editing","summary":" Diffusion Transformers (DiTs) have recently achieved remarkable success in\ntext-guided image generation. In image editing, DiTs project text and image\ninputs to a joint latent space, from which they decode and synthesize new\nimages. However, it remains largely unexplored how multimodal information\ncollectively forms this joint space and how they guide the semantics of the\nsynthesized images. In this paper, we investigate the latent space of DiT\nmodels and uncover two key properties: First, DiT's latent space is inherently\nsemantically disentangled, where different semantic attributes can be\ncontrolled by specific editing directions. Second, consistent semantic editing\nrequires utilizing the entire joint latent space, as neither encoded image nor\ntext alone contains enough semantic information. We show that these editing\ndirections can be obtained directly from text prompts, enabling precise\nsemantic control without additional training or mask annotations. 
Based on\nthese insights, we propose a simple yet effective Encode-Identify-Manipulate\n(EIM) framework for zero-shot fine-grained image editing. Specifically, we\nfirst encode both the given source image and the text prompt that describes the\nimage, to obtain the joint latent embedding. Then, using our proposed Hessian\nScore Distillation Sampling (HSDS) method, we identify editing directions that\ncontrol specific target attributes while preserving other image features. These\ndirections are guided by text prompts and used to manipulate the latent\nembeddings. Moreover, we propose a new metric to quantify the disentanglement\ndegree of the latent space of diffusion models. Extensive experiment results on\nour new curated benchmark dataset and analysis demonstrate DiT's\ndisentanglement properties and effectiveness of the EIM framework.\n","authors":["Zitao Shuai","Chenwei Wu","Zhengxu Tang","Bowen Song","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2411.08196v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2408.13335"},{"id":"http://arxiv.org/abs/2411.08195v1","updated":"2024-11-12T21:33:11Z","published":"2024-11-12T21:33:11Z","title":"An Explainable Machine Learning Approach for Age and Gender Estimation\n in Living Individuals Using Dental Biometrics","summary":" Objectives: Age and gender estimation is crucial for various applications,\nincluding forensic investigations and anthropological studies. This research\naims to develop a predictive system for age and gender estimation in living\nindividuals, leveraging dental measurements such as Coronal Height (CH),\nCoronal Pulp Cavity Height (CPCH), and Tooth Coronal Index (TCI). 
Methods:\nMachine learning models were employed in our study, including Cat Boost\nClassifier (Catboost), Gradient Boosting Machine (GBM), Ada Boost Classifier\n(AdaBoost), Random Forest (RF), eXtreme Gradient Boosting (XGB), Light Gradient\nBoosting Machine (LGB), and Extra Trees Classifier (ETC), to analyze dental\ndata from 862 living individuals (459 males and 403 females). Specifically,\nperiapical radiographs from six teeth per individual were utilized, including\npremolars and molars from both maxillary and mandibular. A novel ensemble\nlearning technique was developed, which uses multiple models each tailored to\ndistinct dental metrics, to estimate age and gender accurately. Furthermore, an\nexplainable AI model has been created utilizing SHAP, enabling dental experts\nto make judicious decisions based on comprehensible insight. Results: The RF\nand XGB models were particularly effective, yielding the highest F1 score for\nage and gender estimation. Notably, the XGB model showed a slightly better\nperformance in age estimation, achieving an F1 score of 73.26%. A similar trend\nfor the RF model was also observed in gender estimation, achieving a F1 score\nof 77.53%. 
Conclusions: This study marks a significant advancement in dental\nforensic methods, showcasing the potential of machine learning to automate age\nand gender estimation processes with improved accuracy.\n","authors":["Mohsin Ali","Haider Raza","John Q Gan","Ariel Pokhojaev","Matanel Katz","Esra Kosan","Dian Agustin Wahjuningrum","Omnina Saleh","Rachel Sarig","Akhilanada Chaurasia"],"pdf_url":"https://arxiv.org/pdf/2411.08195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00169v2","updated":"2024-11-12T21:16:52Z","published":"2024-07-31T21:42:42Z","title":"Strike the Balance: On-the-Fly Uncertainty based User Interactions for\n Long-Term Video Object Segmentation","summary":" In this paper, we introduce a variant of video object segmentation (VOS) that\nbridges interactive and semi-automatic approaches, termed Lazy Video Object\nSegmentation (ziVOS). In contrast, to both tasks, which handle video object\nsegmentation in an off-line manner (i.e., pre-recorded sequences), we propose\nthrough ziVOS to target online recorded sequences. Here, we strive to strike a\nbalance between performance and robustness for long-term scenarios by\nsoliciting user feedback's on-the-fly during the segmentation process. Hence,\nwe aim to maximize the tracking duration of an object of interest, while\nrequiring minimal user corrections to maintain tracking over an extended\nperiod. We propose a competitive baseline, i.e., Lazy-XMem, as a reference for\nfuture works in ziVOS. Our proposed approach uses an uncertainty estimation of\nthe tracking state to determine whether a user interaction is necessary to\nrefine the model's prediction. To quantitatively assess the performance of our\nmethod and the user's workload, we introduce complementary metrics alongside\nthose already established in the field. We evaluate our approach using the\nrecently introduced LVOS dataset, which offers numerous long-term videos. 
Our\ncode is publicly available at https://github.com/Vujas-Eteph/LazyXMem.\n","authors":["Stéphane Vujasinović","Stefan Becker","Sebastian Bullinger","Norbert Scherer-Negenborn","Michael Arens","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2408.00169v2.pdf","comment":"Accepted at ACCV 2024"},{"id":"http://arxiv.org/abs/2411.08187v1","updated":"2024-11-12T21:12:51Z","published":"2024-11-12T21:12:51Z","title":"TractoEmbed: Modular Multi-level Embedding framework for white matter\n tract segmentation","summary":" White matter tract segmentation is crucial for studying brain structural\nconnectivity and neurosurgical planning. However, segmentation remains\nchallenging due to issues like class imbalance between major and minor tracts,\nstructural similarity, subject variability, symmetric streamlines between\nhemispheres etc. To address these challenges, we propose TractoEmbed, a modular\nmulti-level embedding framework, that encodes localized representations through\nlearning tasks in respective encoders. In this paper, TractoEmbed introduces a\nnovel hierarchical streamline data representation that captures maximum spatial\ninformation at each level i.e. individual streamlines, clusters, and patches.\nExperiments show that TractoEmbed outperforms state-of-the-art methods in white\nmatter tract segmentation across different datasets, and spanning various age\ngroups. 
The modular framework directly allows the integration of additional\nembeddings in future works.\n","authors":["Anoushkrit Goel","Bipanjit Singh","Ankita Joshi","Ranjeet Ranjan Jha","Chirag Ahuja","Aditya Nigam","Arnav Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2411.08187v1.pdf","comment":"Accepted at 27th International Conference on Pattern Recognition\n (ICPR), 2024 15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.07832v6","updated":"2024-11-12T20:51:07Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. 
We validate\nour method with \\textbf{five} image classification datasets.\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08171v1","updated":"2024-11-12T20:30:23Z","published":"2024-11-12T20:30:23Z","title":"Comprehensive and Comparative Analysis between Transfer Learning and\n Custom Built VGG and CNN-SVM Models for Wildfire Detection","summary":" Contemporary Artificial Intelligence (AI) and Machine Learning (ML) research\nplaces a significant emphasis on transfer learning, showcasing its\ntransformative potential in enhancing model performance across diverse domains.\nThis paper examines the efficiency and effectiveness of transfer learning in\nthe context of wildfire detection. Three purpose-built models -- Visual\nGeometry Group (VGG)-7, VGG-10, and Convolutional Neural Network (CNN)-Support\nVector Machine(SVM) CNN-SVM -- are rigorously compared with three pretrained\nmodels -- VGG-16, VGG-19, and Residual Neural Network (ResNet) ResNet101. We\ntrained and evaluated these models using a dataset that captures the\ncomplexities of wildfires, incorporating variables such as varying lighting\nconditions, time of day, and diverse terrains. The objective is to discern how\ntransfer learning performs against models trained from scratch in addressing\nthe intricacies of the wildfire detection problem. By assessing the performance\nmetrics, including accuracy, precision, recall, and F1 score, a comprehensive\nunderstanding of the advantages and disadvantages of transfer learning in this\nspecific domain is obtained. This study contributes valuable insights to the\nongoing discourse, guiding future directions in AI and ML research. Keywords:\nWildfire prediction, deep learning, machine learning fire, detection\n","authors":["Aditya V. Jonnalagadda","Hashim A. 
Hashim","Andrew Harris"],"pdf_url":"https://arxiv.org/pdf/2411.08171v1.pdf","comment":"In Proc. of the 2024 IEEE International Conference On Intelligent\n Computing in Data Sciences"},{"id":"http://arxiv.org/abs/2411.08164v1","updated":"2024-11-12T20:15:32Z","published":"2024-11-12T20:15:32Z","title":"EAPCR: A Universal Feature Extractor for Scientific Data without\n Explicit Feature Relation Patterns","summary":" Conventional methods, including Decision Tree (DT)-based methods, have been\neffective in scientific tasks, such as non-image medical diagnostics, system\nanomaly detection, and inorganic catalysis efficiency prediction. However, most\ndeep-learning techniques have struggled to surpass or even match this level of\nsuccess as traditional machine-learning methods. The primary reason is that\nthese applications involve multi-source, heterogeneous data where features lack\nexplicit relationships. This contrasts with image data, where pixels exhibit\nspatial relationships; textual data, where words have sequential dependencies;\nand graph data, where nodes are connected through established associations. The\nabsence of explicit Feature Relation Patterns (FRPs) presents a significant\nchallenge for deep learning techniques in scientific applications that are not\nimage, text, and graph-based. In this paper, we introduce EAPCR, a universal\nfeature extractor designed for data without explicit FRPs. Tested across\nvarious scientific tasks, EAPCR consistently outperforms traditional methods\nand bridges the gap where deep learning models fall short. 
To further\ndemonstrate its robustness, we synthesize a dataset without explicit FRPs.\nWhile Kolmogorov-Arnold Network (KAN) and feature extractors like Convolutional\nNeural Networks (CNNs), Graph Convolutional Networks (GCNs), and Transformers\nstruggle, EAPCR excels, demonstrating its robustness and superior performance\nin scientific tasks without FRPs.\n","authors":["Zhuohang Yu","Ling An","Yansong Li","Yu Wu","Zeyu Dong","Zhangdi Liu","Le Gao","Zhenyu Zhang","Chichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08158v1","updated":"2024-11-12T20:07:59Z","published":"2024-11-12T20:07:59Z","title":"TomoGRAF: A Robust and Generalizable Reconstruction Network for\n Single-View Computed Tomography","summary":" Computed tomography (CT) provides high spatial resolution visualization of 3D\nstructures for scientific and clinical applications. Traditional\nanalytical/iterative CT reconstruction algorithms require hundreds of angular\ndata samplings, a condition that may not be met in practice due to physical and\nmechanical limitations. Sparse view CT reconstruction has been proposed using\nconstrained optimization and machine learning methods with varying success,\nless so for ultra-sparse view CT reconstruction with one to two views. Neural\nradiance field (NeRF) is a powerful tool for reconstructing and rendering 3D\nnatural scenes from sparse views, but its direct application to 3D medical\nimage reconstruction has been minimally successful due to the differences\nbetween optical and X-ray photon transportation. Here, we develop a novel\nTomoGRAF framework incorporating the unique X-ray transportation physics to\nreconstruct high-quality 3D volumes using ultra-sparse projections without\nprior. TomoGRAF captures the CT imaging geometry, simulates the X-ray casting\nand tracing process, and penalizes the difference between simulated and ground\ntruth CT sub-volume during training. 
We evaluated the performance of TomoGRAF\non an unseen dataset of distinct imaging characteristics from the training data\nand demonstrated a vast leap in performance compared with state-of-the-art deep\nlearning and NeRF methods. TomoGRAF provides the first generalizable solution\nfor image-guided radiotherapy and interventional radiology applications, where\nonly one or a few X-ray views are available, but 3D volumetric information is\ndesired.\n","authors":["Di Xu","Yang Yang","Hengjie Liu","Qihui Lyu","Martina Descovich","Dan Ruan","Ke Sheng"],"pdf_url":"https://arxiv.org/pdf/2411.08158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05249v5","updated":"2024-11-12T19:59:51Z","published":"2024-10-07T17:52:56Z","title":"LoTLIP: Improving Language-Image Pre-training for Long Text\n Understanding","summary":" Understanding long text is of great demands in practice but beyond the reach\nof most language-image pre-training (LIP) models. In this work, we empirically\nconfirm that the key reason causing such an issue is that the training images\nare usually paired with short captions, leaving certain tokens easily\novershadowed by salient tokens. Towards this problem, our initial attempt is to\nrelabel the data with long captions, however, directly learning with which may\nlead to performance degradation in understanding short text (e.g., in the image\nclassification task). Then, after incorporating corner tokens to aggregate\ndiverse textual information, we manage to help the model catch up to its\noriginal level of short text understanding yet greatly enhance its capability\nof long text understanding. We further look into whether the model can\ncontinuously benefit from longer captions and notice a clear trade-off between\nthe performance and the efficiency. Finally, we validate the effectiveness of\nour approach using a self-constructed large-scale dataset, which consists of\n100M long caption oriented text-image pairs. 
Our method demonstrates superior\nperformance in long-text-image retrieval tasks. The project page is available\nat https://wuw2019.github.io/lot-lip.\n","authors":["Wei Wu","Kecheng Zheng","Shuailei Ma","Fan Lu","Yuxin Guo","Yifei Zhang","Wei Chen","Qingpei Guo","Yujun Shen","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2410.05249v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08128v1","updated":"2024-11-12T19:12:12Z","published":"2024-11-12T19:12:12Z","title":"CameraHMR: Aligning People with Perspective","summary":" We address the challenge of accurate 3D human pose and shape estimation from\nmonocular images. The key to accuracy and robustness lies in high-quality\ntraining data. Existing training datasets containing real images with pseudo\nground truth (pGT) use SMPLify to fit SMPL to sparse 2D joint locations,\nassuming a simplified camera with default intrinsics. We make two contributions\nthat improve pGT accuracy. First, to estimate camera intrinsics, we develop a\nfield-of-view prediction model (HumanFoV) trained on a dataset of images\ncontaining people. We use the estimated intrinsics to enhance the 4D-Humans\ndataset by incorporating a full perspective camera model during SMPLify\nfitting. Second, 2D joints provide limited constraints on 3D body shape,\nresulting in average-looking bodies. To address this, we use the BEDLAM dataset\nto train a dense surface keypoint detector. We apply this detector to the\n4D-Humans dataset and modify SMPLify to fit the detected keypoints, resulting\nin significantly more realistic body shapes. Finally, we upgrade the HMR2.0\narchitecture to include the estimated camera parameters. We iterate model\ntraining and SMPLify fitting initialized with the previously trained model.\nThis leads to more accurate pGT and a new model, CameraHMR, with\nstate-of-the-art accuracy. Code and pGT are available for research purposes.\n","authors":["Priyanka Patel","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2411.08128v1.pdf","comment":"3DV 2025"},{"id":"http://arxiv.org/abs/2411.08127v1","updated":"2024-11-12T19:09:45Z","published":"2024-11-12T19:09:45Z","title":"TIPO: Text to Image with Text Presampling for Prompt Optimization","summary":" TIPO (Text to Image with text pre-sampling for Prompt Optimization) is an\ninnovative framework designed to enhance text-to-image (T2I) generation by\nlanguage model (LM) for automatic prompt engineering. By refining and extending\nuser-provided prompts, TIPO bridges the gap between simple inputs and the\ndetailed prompts required for high-quality image generation. Unlike previous\napproaches that rely on Large Language Models (LLMs) or reinforcement learning\n(RL), TIPO adjusts user input prompts with the distribution of a trained prompt\ndataset, eliminating the need for complex runtime cost via lightweight model.\nThis pre-sampling approach enables efficient and scalable prompt optimization,\ngrounded in the model's training distribution. Experimental results demonstrate\nTIPO's effectiveness in improving aesthetic scores, reducing image corruption,\nand better aligning generated images with dataset distributions. These findings\nhighlight the critical role of prompt engineering in T2I systems and open\navenues for broader applications of automatic prompt refinement.\n","authors":["Shih-Ying Yeh","Sang-Hyun Park","Giyeong Oh","Min Song","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2411.08127v1.pdf","comment":"21 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.08085v1","updated":"2024-11-12T16:52:51Z","published":"2024-11-12T16:52:51Z","title":"Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation,\n Embrace Orthogonality","summary":" We introduce a yat-product-powered neural network, the Neural Matter Network\n(NMN), a breakthrough in deep learning that achieves non-linear pattern\nrecognition without activation functions. 
Our key innovation relies on the\nyat-product and yat-product, which naturally induces non-linearity by\nprojecting inputs into a pseudo-metric space, eliminating the need for\ntraditional activation functions while maintaining only a softmax layer for\nfinal class probability distribution. This approach simplifies network\narchitecture and provides unprecedented transparency into the network's\ndecision-making process. Our comprehensive empirical evaluation across\ndifferent datasets demonstrates that NMN consistently outperforms traditional\nMLPs. The results challenge the assumption that separate activation functions\nare necessary for effective deep-learning models. The implications of this work\nextend beyond immediate architectural benefits, by eliminating intermediate\nactivation functions while preserving non-linear capabilities, yat-MLP\nestablishes a new paradigm for neural network design that combines simplicity\nwith effectiveness. Most importantly, our approach provides unprecedented\ninsights into the traditionally opaque \"black-box\" nature of neural networks,\noffering a clearer understanding of how these models process and classify\ninformation.\n","authors":["Taha Bouhsine"],"pdf_url":"https://arxiv.org/pdf/2411.08085v1.pdf","comment":"Submitted to CVPR 2025"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.08025v1","updated":"2024-11-12T18:55:01Z","published":"2024-11-12T18:55:01Z","title":"Degradation mode estimation using reconstructed open circuit voltage\n curves from multi-year home storage field data","summary":" A battery's open circuit voltage (OCV) curve can be seen as its\nelectrochemical signature. Its shape and age-related shift provide information\non aging processes and material composition on both electrodes. However, most\nOCV analyses have to be conducted in laboratories or specified field tests to\nensure suitable data quality. 
Here, we present a method that reconstructs the\nOCV curve continuously over the lifetime of a battery using the operational\ndata of home storage field measurements over eight years. We show that\nlow-dynamic operational phases, such as the overnight household supply with\nelectricity, are suitable for recreating quasi OCV curves. We apply incremental\ncapacity analysis and differential voltage analysis and show that known\nfeatures of interest from laboratory measurements can be tracked to determine\ndegradation modes in field operation. The dominant degradation mode observed\nfor the home storage systems under evaluation is the loss of lithium inventory,\nwhile the loss of active material might be present in some cases. We apply the\nmethod to lithium nickel manganese cobalt oxide (NMC), a blend of lithium\nmanganese oxide (LMO) and NMC, and lithium iron phosphate (LFP) batteries.\nField capacity tests validate the method.\n","authors":["Jan Figgener","Jakob Bors","Matthias Kuipers","Felix Hildenbrand","Mark Junker","Lucas Koltermann","Philipp Woerner","Marc Mennekes","David Haberschusz","Kai-Philipp Kairies","Dirk Uwe Sauer"],"pdf_url":"https://arxiv.org/pdf/2411.08025v1.pdf","comment":"17 pages, 10 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2411.07998v1","updated":"2024-11-12T18:23:52Z","published":"2024-11-12T18:23:52Z","title":"A Symmetry-Preserving Reduced-Order Observer","summary":" A symmetry-preserving, reduced-order state observer is presented for the\nunmeasured part of a system's state, where the nonlinear system dynamics\nexhibit symmetry under the action of a Lie group. The proposed observer takes\nadvantage of this symmetry through the use of a moving frame that constructs\ninvariant mappings of the measurements. Sufficient conditions for the observer\nto be asymptotically stable are developed by studying the stability of an\ninvariant error system. 
As an illustrative example, the observer is applied to\nthe problem of rigid-body velocity estimation, which demonstrate how exploiting\nthe symmetry of the system can simplify the stabilization of the estimation\nerror dynamics.\n","authors":["Jeremy W. Hopwood","Craig A. Woolsey"],"pdf_url":"https://arxiv.org/pdf/2411.07998v1.pdf","comment":"6 pages, 6 figures, Submission to the 2025 American Control\n Conference"},{"id":"http://arxiv.org/abs/2411.07971v1","updated":"2024-11-12T17:51:45Z","published":"2024-11-12T17:51:45Z","title":"Optimal Control of Mechanical Ventilators with Learned Respiratory\n Dynamics","summary":" Deciding on appropriate mechanical ventilator management strategies\nsignificantly impacts the health outcomes for patients with respiratory\ndiseases. Acute Respiratory Distress Syndrome (ARDS) is one such disease that\nrequires careful ventilator operation to be effectively treated. In this work,\nwe frame the management of ventilators for patients with ARDS as a sequential\ndecision making problem using the Markov decision process framework. We\nimplement and compare controllers based on clinical guidelines contained in the\nARDSnet protocol, optimal control theory, and learned latent dynamics\nrepresented as neural networks. The Pulse Physiology Engine's respiratory\ndynamics simulator is used to establish a repeatable benchmark, gather\nsimulated data, and quantitatively compare these controllers. We score\nperformance in terms of measured improvement in established ARDS health markers\n(pertaining to improved respiratory rate, oxygenation, and vital signs). Our\nresults demonstrate that techniques leveraging neural networks and optimal\ncontrol can automatically discover effective ventilation management strategies\nwithout access to explicit ventilator management procedures or guidelines (such\nas those defined in the ARDSnet protocol).\n","authors":["Isaac Ronald Ward","Dylan M. Asmar","Mansur Arief","Jana Krystofova Mike","Mykel J. 
Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2411.07971v1.pdf","comment":"2024 IEEE 37th International Symposium on Computer-Based Medical\n Systems (CBMS), 7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.09434v2","updated":"2024-11-12T17:49:12Z","published":"2024-07-12T17:09:47Z","title":"Foundation Models for the Electric Power Grid","summary":" Foundation models (FMs) currently dominate news headlines. They employ\nadvanced deep learning architectures to extract structural information\nautonomously from vast datasets through self-supervision. The resulting rich\nrepresentations of complex systems and dynamics can be applied to many\ndownstream applications. Therefore, FMs can find uses in electric power grids,\nchallenged by the energy transition and climate change. In this paper, we call\nfor the development of, and state why we believe in, the potential of FMs for\nelectric grids. We highlight their strengths and weaknesses amidst the\nchallenges of a changing grid. We argue that an FM learning from diverse grid\ndata and topologies could unlock transformative capabilities, pioneering a new\napproach in leveraging AI to redefine how we manage complexity and uncertainty\nin the electric grid. Finally, we discuss a power grid FM concept, namely\nGridFM, based on graph neural networks and show how different downstream tasks\nbenefit.\n","authors":["Hendrik F. Hamann","Thomas Brunschwiler","Blazhe Gjorgiev","Leonardo S. A. Martins","Alban Puech","Anna Varbella","Jonas Weiss","Juan Bernabe-Moreno","Alexandre Blondin Massé","Seong Choi","Ian Foster","Bri-Mathias Hodge","Rishabh Jain","Kibaek Kim","Vincent Mai","François Mirallès","Martin De Montigny","Octavio Ramos-Leaños","Hussein Suprême","Le Xie","El-Nasser S. Youssef","Arnaud Zinflou","Alexander J. Belyi","Ricardo J. 
Bessa","Bishnu Prasad Bhattarai","Johannes Schmude","Stanislav Sobolevsky"],"pdf_url":"https://arxiv.org/pdf/2407.09434v2.pdf","comment":"Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V.,\n J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J.,\n K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal\n contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H"},{"id":"http://arxiv.org/abs/2312.13182v2","updated":"2024-11-12T17:42:22Z","published":"2023-12-20T16:45:26Z","title":"Goal-oriented Semantic Communications for Robotic Waypoint Transmission:\n The Value and Age of Information Approach","summary":" The ultra-reliable and low-latency communication (URLLC) service of the\nfifth-generation (5G) mobile communication network struggles to support safe\nrobot operation. Nowadays, the sixth-generation (6G) mobile communication\nnetwork is proposed to provide hyper-reliable and low-latency communication to\nenable safer control for robots. However, current 5G/ 6G research mainly\nfocused on improving communication performance, while the robotics community\nmostly assumed communication to be ideal. To jointly consider communication and\nrobotic control with a focus on the specific robotic task, we propose\ngoal-oriented semantic communication in robotic control (GSRC) to exploit the\ncontext of data and its importance in achieving the task at both transmitter\nand receiver. At the transmitter, we propose a deep reinforcement learning\nalgorithm to generate optimal control and command (C&C) data and a proactive\nrepetition scheme (DeepPro) to increase the successful transmission\nprobability. At the receiver, we design the value of information (VoI) and age\nof information (AoI) based queue ordering mechanism (VA-QOM) to rank the queue\nbased on the semantic information extracted from AoI and VoI. 
The simulation\nresults validate that our proposed GSRC framework achieves a 91.5% improvement\nin the mean square error compared to the traditional unmanned aerial vehicle\ncontrol framework.\n","authors":["Wenchao Wu","Yuanqing Yang","Yansha Deng","A. Hamid Aghvami"],"pdf_url":"https://arxiv.org/pdf/2312.13182v2.pdf","comment":"The paper has been accepted in IEEE TWC"},{"id":"http://arxiv.org/abs/2403.15771v2","updated":"2024-11-12T17:17:36Z","published":"2024-03-23T08:58:31Z","title":"Small Noise Analysis of Non-Parametric Closed-Loop Identification","summary":" We revisit the problem of non-parametric closed-loop identification in\nfrequency domain; we give a brief survey of the literature and provide a small\nnoise analysis of the direct, indirect, and joint input-output methods when two\nindependent experiments with identical excitation are used. The analysis is\nasymptotic in the noise variance (i.e., as the standard deviation of the\ninnovations $\\sigma \\to 0$), for a finite data record of length $N$. We\nhighlight the relationship between the estimators accuracy and the loop shape\nvia asymptotic variance expressions given in terms of the sensitivity function.\nThe results are illustrated using a numerical simulation example.\n","authors":["Mohamed Abdalmoaty","Roy S. Smith"],"pdf_url":"https://arxiv.org/pdf/2403.15771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07927v1","updated":"2024-11-12T17:00:32Z","published":"2024-11-12T17:00:32Z","title":"Control-Oriented Models Inform Synthetic Biology Strategies in CAR T\n Cell Immunotherapy","summary":" Chimeric antigen receptor (CAR) T cell therapy is revolutionizing the\ntreatment of blood cancers. Mathematical models that can predict the\neffectiveness of immunotherapies such as CAR T are of increasing interest due\nto their ability to reduce the number of experiments performed and to guide the\ntheoretical development of new therapeutic strategies. 
Following this\nrationale, we propose the use of control-oriented models to guide the\naugmentation of CAR T therapy with synthetic gene circuitry. Here we present an\ninitial investigation where we adapt a previously developed CAR T model for\ncontrol-oriented purposes. We then explore the impact of realistic alternative\nactivation methods as control inputs to ensure effective tumor clearance.\n","authors":["Raffaele Romagnoli"],"pdf_url":"https://arxiv.org/pdf/2411.07927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06560v2","updated":"2024-11-12T16:48:12Z","published":"2024-11-10T18:52:33Z","title":"ElectricityEmissions.jl: A Framework for the Comparison of Carbon\n Intensity Signals","summary":" An increasing number of individuals, companies and organizations are\ninterested in computing and minimizing the carbon emissions associated with\ntheir real-time electricity consumption. To achieve this, they require a carbon\nsignal, i.e. a metric that defines the real-time carbon intensity of their\nelectricity supply. Unfortunately, in a grid with multiple generation sources\nand multiple consumers, there is no unambiguous way to trace electricity from\nsource to sink. This makes it hard to define an appropriate signal, leading to\na raging discussion about how to best quantify the carbon footprint of\nelectricity.\n This paper seeks to inform the discussion about which carbon signal is better\nor more suitable for two important use cases, namely carbon-informed load\nshifting and carbon accounting. We do this by developing a new software package\nElectricityEmissions$.$jl, that computes several established and newly proposed\ncarbon emission metrics for standard electric grid test cases. We also\ndemonstrate how the package can be used to investigate the effects of using\nthese metrics to guide load shifting. 
Our results affirm previous research,\nwhich showed that the choice of carbon emission metric has significant impact\non shifting results and associated carbon emission reductions. In addition, we\ndemonstrate the impact of load shifting on both the consumers that perform the\nshifting and consumers that do not. Disconcertingly, we observe that shifting\naccording to common metrics such as average carbon emissions can reduce the\namount of emissions allocated to data center, but cause an increase in the\ntotal emissions of the system.\n","authors":["Joe Gorka","Noah Rhodes","Line Roald"],"pdf_url":"https://arxiv.org/pdf/2411.06560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07887v1","updated":"2024-11-12T15:49:03Z","published":"2024-11-12T15:49:03Z","title":"Stochastic MPC for Finite Gaussian Mixture Disturbances with Guarantees","summary":" This paper presents a stochastic model predictive control (SMPC) algorithm\nfor linear systems subject to additive Gaussian mixture disturbances, with the\ngoal of satisfying chance constraints. To synthesize a control strategy, the\nstochastic control problem is reformulated into an MPC problem. The\nreformulation begins by decoupling the mixture distribution and decomposing the\nsystem dynamics. Using stochastic simulation relations, we then redefine the\nstochastic control problem onto the resultant abstract system. Next, constraint\ntightening forms an MPC problem subject to finite disturbances. A branching\ncontrol is introduced to solve the MPC problem. Finally, a controller\nrefinement procedure determines a valid control strategy. Our contribution is\nan extension of the SMPC literature to accommodate Gaussian mixture\ndisturbances while retaining recursive feasibility and closed-loop guarantees.\nWe illustrate the retention of guarantees with a case study of vehicle control\non an ill-maintained road.\n","authors":["Maico H. W. Engelaar","Micha P. P. 
Swaanen","Mircea Lazar","Sofie Haesaert"],"pdf_url":"https://arxiv.org/pdf/2411.07887v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.07883v1","updated":"2024-11-12T15:42:56Z","published":"2024-11-12T15:42:56Z","title":"Efficient Creation of Behavior Models with Variable Modeling Depths Used\n in Digital Twins","summary":" Behavior models form an integral component of Digital Twins. The specific\ncharacteristics of these models may vary depending on the use case. One of\nthese key characteristics is the modeling depth. Behavior models with a lower\nmodeling depth depict the behavior of the asset in an abstract way, while those\nwith a higher modeling depth depict the behavior in detail. Even if very\ndetailed behavior models are flexible and realistic, they also require a lot of\nresources such as computing power, simulation time and memory requirements. In\nsome applications, however, only limited resources are available. The automated\ncreation of Digital Twins is of crucial importance for their widespread use.\nAlthough there are methods for the automated creation of behavior models for\nDigital Twins with a specific modeling depth, there is currently no method for\nthe automated creation of behavior models with varying modeling depths. This\narticle presents such an approach and demonstrates its advantages using two\nindustrial use cases. It is demonstrated that the automatically created\nbehavior models of lower modeling depth yield results that are almost identical\nto those of models with a higher modeling depth, but with significantly reduced\ncomputing time and required memory. 
This enables the efficient use of behavior\nmodels in a variety of use cases, regardless of the availability of resources.\n","authors":["Valentin Stegmaier","Walter Schaaf","Nasser Jazdi","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2411.07883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07862v1","updated":"2024-11-12T15:20:48Z","published":"2024-11-12T15:20:48Z","title":"Iterative Learning Control with Mismatch Compensation for Residual\n Vibration Suppression in Delta Robots","summary":" Unwanted vibrations stemming from the energy-optimized design of Delta robots\npose a challenge in their operation, especially with respect to precise\nreference tracking. To improve tracking accuracy, this paper proposes an\nadaptive mismatch-compensated iterative learning controller based on input\nshaping techniques. We establish a dynamic model considering the\nelectromechanical rigid-flexible coupling of the Delta robot, which integrates\nthe permanent magnet synchronous motor. Using this model, we design an\noptimization-based input shaper, considering the natural frequency of the\nrobot, which varies with the configuration. We propose an iterative learning\ncontroller for the delta robot to improve tracking accuracy. Our iterative\nlearning controller incorporates model mismatch where the mismatch is approximated\nby a fuzzy logic structure. The convergence property of the proposed controller\nis proved using a Barrier Composite Energy Function, providing a guarantee that\nthe tracking errors along the iteration axis converge to zero. Moreover,\nadaptive parameter update laws are designed to ensure convergence. 
Finally, we\nperform a series of high-fidelity simulations of the Delta robot using Simscape\nto demonstrate the effectiveness of the proposed control strategy.\n","authors":["Mingkun Wu","Alisa Rupenyan","Burkhard Corves"],"pdf_url":"https://arxiv.org/pdf/2411.07862v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2410.14803v3","updated":"2024-11-12T14:57:08Z","published":"2024-10-18T18:19:56Z","title":"DistRL: An Asynchronous Distributed Reinforcement Learning Framework for\n On-Device Control Agents","summary":" On-device control agents, especially on mobile devices, are responsible for\noperating mobile devices to fulfill users' requests, enabling seamless and\nintuitive interactions. Integrating Multimodal Large Language Models (MLLMs)\ninto these agents enhances their ability to understand and execute complex\ncommands, thereby improving user experience. However, fine-tuning MLLMs for\non-device control presents significant challenges due to limited data\navailability and inefficient online training processes. This paper introduces\nDistRL, a novel framework designed to enhance the efficiency of online RL\nfine-tuning for mobile device control agents. DistRL employs centralized\ntraining and decentralized data acquisition to ensure efficient fine-tuning in\nthe context of dynamic online interactions. Additionally, the framework is\nbacked by our tailor-made RL algorithm, which effectively balances exploration\nwith the prioritized utilization of collected data to ensure stable and robust\ntraining. Our experiments show that, on average, DistRL delivers a 3X\nimprovement in training efficiency and enables training data collection 2.4X\nfaster than the leading synchronous multi-machine methods. 
Notably, after\ntraining, DistRL achieves a 20% relative improvement in success rate compared\nto state-of-the-art methods on general Android tasks from an open benchmark,\nsignificantly outperforming existing approaches while maintaining the same\ntraining time. These results validate DistRL as a scalable and efficient\nsolution, offering substantial improvements in both training efficiency and\nagent performance for real-world, in-the-wild device control tasks.\n","authors":["Taiyi Wang","Zhihao Wu","Jianheng Liu","Jianye Hao","Jun Wang","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2410.14803v3.pdf","comment":"Paper and Appendix, 25 pages"},{"id":"http://arxiv.org/abs/2411.07833v1","updated":"2024-11-12T14:35:45Z","published":"2024-11-12T14:35:45Z","title":"Robust Adaptive Safe Robotic Grasping with Tactile Sensing","summary":" Robotic grasping requires safe force interaction to prevent a grasped object\nfrom being damaged or slipping out of the hand. In this vein, this paper\nproposes an integrated framework for grasping with formal safety guarantees\nbased on Control Barrier Functions. We first design contact force and force\nclosure constraints, which are enforced by a safety filter to accomplish safe\ngrasping with finger force control. For sensory feedback, we develop a\ntechnique to estimate contact point, force, and torque from tactile sensors at\neach finger. We verify the framework with various safety filters in a numerical\nsimulation under a two-finger grasping scenario. We then experimentally\nvalidate the framework by grasping multiple objects, including fragile lab\nglassware, in a real robotic setup, showing that safe grasping can be\nsuccessfully achieved in the real world. We evaluate the performance of each\nsafety filter in the context of safety violation and conservatism, and find\nthat disturbance observer-based control barrier functions provide superior\nperformance for safety guarantees with minimum conservatism. 
The demonstration\nvideo is available at https://youtu.be/Cuj47mkXRdg.\n","authors":["Yitaek Kim","Jeeseop Kim","Albert H. Li","Aaron D. Ames","Christoffer Sloth"],"pdf_url":"https://arxiv.org/pdf/2411.07833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12758v2","updated":"2024-11-12T14:27:22Z","published":"2023-09-22T09:59:16Z","title":"Distributionally Robust Model Predictive Control: Closed-loop Guarantees\n and Scalable Algorithms","summary":" We establish a collection of closed-loop guarantees and propose a scalable\noptimization algorithm for distributionally robust model predictive control\n(DRMPC) applied to linear systems, convex constraints, and quadratic costs. Via\nstandard assumptions for the terminal cost and constraint, we establish\ndistributionally robust long-term and stage-wise performance guarantees for the\nclosed-loop system. We further demonstrate that a common choice of the terminal\ncost, i.e., via the discrete-algebraic Riccati equation, renders the origin\ninput-to-state stable for the closed-loop system. This choice also ensures that\nthe exact long-term performance of the closed-loop system is independent of the\nchoice of ambiguity set for the DRMPC formulation. Thus, we establish\nconditions under which DRMPC does not provide a long-term performance benefit\nrelative to stochastic MPC. To solve the DRMPC optimization problem, we propose\na Newton-type algorithm that empirically achieves superlinear convergence and\nguarantees the feasibility of each iterate. We demonstrate the implications of\nthe closed-loop guarantees and the scalability of the proposed algorithm via\ntwo examples. To facilitate the reproducibility of the results, we also provide\nopen-source code to implement the proposed algorithm and generate the figures.\n","authors":["Robert D. 
McAllister","Peyman Mohajerin Esfahani"],"pdf_url":"https://arxiv.org/pdf/2309.12758v2.pdf","comment":"36 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.07830v1","updated":"2024-11-12T14:25:24Z","published":"2024-11-12T14:25:24Z","title":"Singularity-Avoidance Control of Robotic Systems with Model Mismatch and\n Actuator Constraints","summary":" Singularities, manifesting as special configuration states, deteriorate robot\nperformance and may even lead to a loss of control over the system. This paper\naddresses the kinematic singularity concerns in robotic systems with model\nmismatch and actuator constraints through control barrier functions (CBFs). We\npropose a learning-based control strategy to prevent robots entering\nsingularity regions. More precisely, we leverage Gaussian process (GP)\nregression to learn the unknown model mismatch, where the prediction error is\nrestricted by a deterministic bound. Moreover, we offer the criteria for\nparameter selection to ensure the feasibility of CBFs subject to actuator\nconstraints. The proposed approach is validated by high-fidelity simulations on\na 2 degrees-of-freedom (DoFs) planar robot.\n","authors":["Mingkun Wu","Alisa Rupenyan","Burkhard Corves"],"pdf_url":"https://arxiv.org/pdf/2411.07830v1.pdf","comment":"This work has been submitted to ECC 2025 for possible publication"},{"id":"http://arxiv.org/abs/2402.18744v3","updated":"2024-11-12T14:11:55Z","published":"2024-02-28T22:59:27Z","title":"Timer-Based Coverage Control for Mobile Sensors","summary":" This work investigates the coverage control problem over a static, compact,\nand convex workspace and develops a hybrid extension of the continuous-time\nLloyd algorithm. Each agent in a multi-agent system (MAS) is equipped with a\ntimer mechanism that generates intermittent measurement and control update\nevents, which may occur asynchronously between agents. 
Between consecutive\nevent times, as determined by the corresponding timer mechanism, the controller\nof each agent is held constant. These controllers are shown to drive the\nconfiguration of the MAS into a neighborhood of the set of centroidal Voronoi\nconfigurations, i.e., the minimizers of the standard locational cost. The\ncombination of continuous-time dynamics with intermittently updated control\ninputs is modeled as a hybrid system. The coverage objective is posed as a set\nattractivity problem for hybrid systems, where an invariance-based convergence\nanalysis yields sufficient conditions that ensure maximal solutions of the\nhybrid system asymptotically converge to a desired set. A brief simulation\nexample is included to showcase the result.\n","authors":["Federico M. Zegers","Sean Phillips","Gregory P. Hicks"],"pdf_url":"https://arxiv.org/pdf/2402.18744v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07759v1","updated":"2024-11-12T12:37:50Z","published":"2024-11-12T12:37:50Z","title":"Optimizing Traffic Signal Control using High-Dimensional State\n Representation and Efficient Deep Reinforcement Learning","summary":" In reinforcement learning-based (RL-based) traffic signal control (TSC),\ndecisions on the signal timing are made based on the available information on\nvehicles at a road intersection. This forms the state representation for the RL\nenvironment which can either be high-dimensional containing several variables\nor a low-dimensional vector. Current studies suggest that using high\ndimensional state representations does not lead to improved performance on TSC.\nHowever, we argue, with experimental results, that the use of high dimensional\nstate representations can, in fact, lead to improved TSC performance with\nimprovements up to 17.9% of the average waiting time. This high-dimensional\nrepresentation is obtainable using the cost-effective vehicle-to-infrastructure\n(V2I) communication, encouraging its adoption for TSC. 
Additionally, given the\nlarge size of the state, we identified the need to have computationally efficient\nmodels and explored model compression via pruning.\n","authors":["Lawrence Francis","Blessed Guda","Ahmed Biyabani"],"pdf_url":"https://arxiv.org/pdf/2411.07759v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2411.07087v2","updated":"2024-11-12T12:03:07Z","published":"2024-11-11T16:04:49Z","title":"OCMDP: Observation-Constrained Markov Decision Process","summary":" In many practical applications, decision-making processes must balance the\ncosts of acquiring information with the benefits it provides. Traditional\ncontrol systems often assume full observability, an unrealistic assumption when\nobservations are expensive. We tackle the challenge of simultaneously learning\nobservation and control strategies in such cost-sensitive environments by\nintroducing the Observation-Constrained Markov Decision Process (OCMDP), where\nthe policy influences the observability of the true state. To manage the\ncomplexity arising from the combined observation and control actions, we\ndevelop an iterative, model-free deep reinforcement learning algorithm that\nseparates the sensing and control components of the policy. This decomposition\nenables efficient learning in the expanded action space by focusing on when and\nwhat to observe, as well as determining optimal control actions, without\nrequiring knowledge of the environment's dynamics. We validate our approach on\na simulated diagnostic task and a realistic healthcare environment using\nHeartPole. 
Given both scenarios, the experimental results demonstrate that our\nmodel achieves a substantial reduction in observation costs on average,\nsignificantly outperforming baseline methods by a notable margin in efficiency.\n","authors":["Taiyi Wang","Jianheng Liu","Bryan Lee","Zhihao Wu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2411.07087v2.pdf","comment":"Full paper, 14 Pages"},{"id":"http://arxiv.org/abs/2411.06589v2","updated":"2024-11-12T11:03:20Z","published":"2024-11-10T20:40:13Z","title":"Skipped Adjacency Pulse Width Modulation: Zero Voltage Switching over\n Full Duty Cycle Range for Hybrid Flying Capacitor Multi-Level Converters\n without Dynamic Level Changing","summary":" This paper proposes a method to achieve zero voltage switching (ZVS) across\nthe full duty cycle range in hybrid flying capacitor multilevel (FCML)\nconverters, eliminating the need for dynamic level changing and active\nre-balancing. Utilizing skipped adjacency pulse width modulation (SAPWM), this\napproach avoids the nearest pole voltage level, thereby increasing volt-seconds\nwithin specific duty cycle range. The method uses a modified PWM scheme, which\npreserves effective pole voltage by changing duty reference and employing\ndigital logic processing. Simulation results verify the proposed method\nachieving full-range ZVS. This SAPWM technique is compatible with hybrid FCML\nconverters with various levels, offering enhanced efficiency and reduced\nswitching losses.\n","authors":["Inhwi Hwang"],"pdf_url":"https://arxiv.org/pdf/2411.06589v2.pdf","comment":"7 pages, 13 figures, pre-print"},{"id":"http://arxiv.org/abs/2411.07686v1","updated":"2024-11-12T09:58:21Z","published":"2024-11-12T09:58:21Z","title":"Data-Driven Graph Switching for Cyber-Resilient Control in Microgrids","summary":" Distributed microgrids are conventionally dependent on communication networks\nto achieve secondary control objectives. 
This dependence makes them vulnerable\nto stealth data integrity attacks (DIAs) where adversaries may perform\nmanipulations via infected transmitters and repeaters to jeopardize stability.\nThis paper presents a physics-guided, supervised Artificial Neural Network\n(ANN)-based framework that identifies communication-level cyberattacks in\nmicrogrids by analyzing whether incoming measurements will cause abnormal\nbehavior of the secondary control layer. If abnormalities are detected, an\niteration through possible spanning tree graph topologies that can be used to\nfulfill secondary control objectives is done. Then, a communication network\ntopology that would not create secondary control abnormalities is identified\nand enforced for maximum stability. By altering the communication graph\ntopology, the framework eliminates the dependence of the secondary control\nlayer on inputs from compromised cyber devices helping it achieve resilience\nwithout instability. Several case studies are provided showcasing the\nrobustness of the framework against False Data Injections and repeater-level\nMan-in-the-Middle attacks. To understand practical feasibility, robustness is\nalso verified against larger microgrid sizes and in the presence of varying\nnoise levels. Our findings indicate that performance can be affected when\nattempting scalability in the presence of noise. However, the framework\noperates robustly in low-noise settings.\n","authors":["Suman Rath","Subham Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.07686v1.pdf","comment":"Accepted in IEEE Design Methodologies Conference (DMC) 2024"},{"id":"http://arxiv.org/abs/2411.07654v1","updated":"2024-11-12T09:06:16Z","published":"2024-11-12T09:06:16Z","title":"Spike Talk in Power Electronic Grids -- Leveraging Post Moore's\n Computing Laws","summary":" Emerging distributed generation demands highly reliable and resilient\ncoordinating control in microgrids. 
To improve on these aspects, spiking neural\nnetwork is leveraged, as a grid-edge intelligence tool to establish a talkative\ninfrastructure, Spike Talk, expediting coordination in next-generation\nmicrogrids without the need of communication at all. This paper unravels the\nphysics behind Spike Talk from the perspective of its distributed\ninfrastructure, which aims to address the Von Neumann Bottleneck. Relying on\ninferring information via power flows in tie lines, Spike Talk allows adaptive\nand flexible control and coordination itself, and features in synaptic\nplasticity facilitating online and local training functionality. Preliminary\ncase studies are demonstrated with results, while more extensive validations\nare to be included as future scopes of work.\n","authors":["Yubo Song","Subham Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.07654v1.pdf","comment":"The manuscript has been accepted for publication in the Proceedings\n of 2024 IEEE Design Methodologies for Power Electronics Conference (DMC2024)"},{"id":"http://arxiv.org/abs/2411.07642v1","updated":"2024-11-12T08:53:20Z","published":"2024-11-12T08:53:20Z","title":"Safety Filter Design for Articulated Frame Steering Vehicles In the\n Presence of Actuator Dynamics Using High-Order Control Barrier Functions","summary":" Articulated Frame Steering (AFS) vehicles are widely used in heavy-duty\nindustries, where they often operate near operators and laborers. Therefore,\ndesigning safe controllers for AFS vehicles is essential. In this paper, we\ndevelop a Quadratic Program (QP)-based safety filter that ensures feasibility\nfor AFS vehicles with affine actuator dynamics. To achieve this, we first\nderive the general equations of motion for AFS vehicles, incorporating affine\nactuator dynamics. We then introduce a novel High-Order Control Barrier\nFunction (HOCBF) candidate with equal relative degrees for both system\ncontrols. 
Finally, we design a Parametric Adaptive HOCBF (PACBF) and an\nalways-feasible, QP-based safety filter. Numerical simulations of AFS vehicle\nkinematics demonstrate the effectiveness of our approach.\n","authors":["Naeim Ebrahimi Toulkani","Reza Ghabcheloo"],"pdf_url":"https://arxiv.org/pdf/2411.07642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07640v1","updated":"2024-11-12T08:44:01Z","published":"2024-11-12T08:44:01Z","title":"Reducing Conservativeness of Controlled-Invariant Safe Sets by\n Introducing a Novel Synthesis of Control Barrier Certificates","summary":" Finding a controlled-invariant safe set for a given system with state and\ncontrol constraints plays an important role in safety-critical systems. Current\nmethods typically produce conservative solutions. In this paper, we introduce a\nmethod to generate controlled-invariant safe sets for nonlinear polynomial\ncontrol-affine dynamical systems by using the notion of Control Barrier\nCertificates (CBCs). To this end, we relax CBC conditions into Sum of Squares\n(SOS) constraints, to be solved by an SOS program. We first assume a\ncontrolled-invariant safe set (although small) exists for the system. We then\npropose a method to iteratively enlarge the safe set. We theoretically prove\nthat our method enlarges the safe set in each iteration. 
We also demonstrate\nthe efficacy of our method through simulated numerical examples in 2D and 3D\nfor single and multi-input dynamical systems and empirically show that our\nmethod produces a larger controlled-invariant safe set in these examples,\ncompared to a state-of-the-art technique using Control Barrier Function (CBF).\n","authors":["Naeim Ebrahimi Toulkani","Reza Ghabcheloo"],"pdf_url":"https://arxiv.org/pdf/2411.07640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07636v1","updated":"2024-11-12T08:33:15Z","published":"2024-11-12T08:33:15Z","title":"Node Reliability: Approximation, Upper Bounds, and Applications to\n Network Robustness","summary":" This paper discusses the reliability of a graph in which the links are\nperfectly reliable but the nodes may fail with certain probability p.\nCalculating graph node reliability is an NP-Hard problem. We introduce an\nefficient and accurate Monte Carlo method and a stochastic approximation for\nthe node reliability polynomial based solely on the degree distribution. We\nprovide the formulas for the node reliability polynomial of both Erdos-Renyi\ngraphs and Random Geometric graphs. The phase transition in the node\nreliability of Erdos-Renyi graphs is also discussed. Additionally, we propose\ntwo increasingly accurate upper bounds for the node reliability polynomial\nsolely based on the graph's degree distributions. The advantages and\ndisadvantages of these two upper bounds are thoroughly compared. 
Beyond the\ncomputation of node reliability polynomials, we also estimate the number of cut\nsets and present a solution to the reliability-based network enhancement\nproblem.\n","authors":["Xinhan Liu","Robert Kooij","Piet Van Mieghem"],"pdf_url":"https://arxiv.org/pdf/2411.07636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02661v2","updated":"2024-11-12T08:02:06Z","published":"2024-07-02T20:50:33Z","title":"Local Synchronization of Power System Devices","summary":" This paper introduces a novel concept of local synchronization of power\nsystems devices based on the difference between the complex frequency of the\nvoltage and current injected at terminals. Formal definitions are provided to\naccount for bounded and asymptotic local synchronization. The definitions are\nsuitable for modern power systems as they remove classical assumptions limiting\nthe application of the concept of synchronization to synchronous machines and\nomitting voltage dynamics. The paper also provides a systematic analytical\ndescription of the synchronization mechanisms of common power system devices.\nFinally, a variety of examples is included to illustrate the theoretical value\nand practical application of the proposed definitions to power systems modeling\nand stability analysis.\n","authors":["Ignacio Ponce","Federico Milano"],"pdf_url":"https://arxiv.org/pdf/2407.02661v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07603v1","updated":"2024-11-12T07:25:21Z","published":"2024-11-12T07:25:21Z","title":"$\\mathscr{H}_2$ Model Reduction for Linear Quantum Systems","summary":" In this paper, an $\\mathscr{H}_2$ norm-based model reduction method for\nlinear quantum systems is presented, which can obtain a physically realizable\nmodel with a reduced order for closely approximating the original system. 
The\nmodel reduction problem is described as an optimization problem, whose\nobjective is taken as an $\\mathscr{H}_2$ norm of the difference between the\ntransfer function of the original system and that of the reduced one. Different\nfrom classical model reduction problems, physical realizability conditions for\nguaranteeing that the reduced-order system is also a quantum system should be\ntaken as nonlinear constraints in the optimization. To solve the optimization\nproblem with such nonlinear constraints, we employ a matrix inequality approach\nto transform nonlinear inequality constraints into readily solvable linear\nmatrix inequalities (LMIs) and nonlinear equality constraints, so that the\noptimization problem can be solved by a lifting variables approach. We\nemphasize that different from existing work, which only introduces a criterion\nto evaluate the performance after model reduction, we guide our method to\nobtain an optimal reduced model with respect to the $\\mathscr{H}_2$ norm. In\naddition, the above approach for model reduction is extended to passive linear\nquantum systems. Finally, examples of active and passive linear quantum systems\nvalidate the efficacy of the proposed method.\n","authors":["G. P. Wu","S. Xue","G. F. Zhang","I. R. Petersen"],"pdf_url":"https://arxiv.org/pdf/2411.07603v1.pdf","comment":"13 pages,3 figures"},{"id":"http://arxiv.org/abs/2411.07594v1","updated":"2024-11-12T07:08:24Z","published":"2024-11-12T07:08:24Z","title":"Modelling and Control of Subsonic Missile for Air-to-Air Interception","summary":" Subsonic missiles play an important role in modern air-to-air combat\nscenarios - utilized by the F-35 Lightning II - but require complex Guidance,\nNavigation and Control systems to manoeuvre with 30G's of acceleration to\nintercept successfully. Challenges with mathematically modelling and\ncontrolling such a dynamic system must be addressed, high frequency noise\nrejected, and actuator delay compensated for. 
This paper aims to investigate\nthe control systems necessary for interception. It also proposes a subsonic\ndesign utilizing literature and prior research, suggests aerodynamic\nderivatives, and analyses a designed 2D reduced pitch autopilot control system\nresponse against performances. The pitch autopilot model contains an optimized\nPID controller, 2nd order actuator, lead compensator and Kalman Filter, that\nrejects time varying disturbances and high frequency noise expected during\nflight. Simulation results confirm the effectiveness of the proposed method\nthrough reduction in rise time (21%), settle time (10%), and highlighted its\nhigh frequency deficiency with respect to the compensator integration. The\nactuator delay of 100ms has been negated by the augmented compensator autopilot\ncontroller so that it exceeds system performance requirements (1) & (3).\nHowever, (2) is not satisfied as 370% overshoot exists. This research confirms\nthe importance of a lead compensator in missile GNC systems and furthers\ncontrol design application through a specific configuration. Future research\nshould build upon methods and models presented to construct and test an\ninterception scenario.\n","authors":["Rory Jenkins","Xinhua Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07593v1","updated":"2024-11-12T07:08:13Z","published":"2024-11-12T07:08:13Z","title":"Robust control for uncertain air-to-air missile systems","summary":" Air-to-air missiles are used on many modern military combat aircraft for\nself-defence. It is imperative for the pilots using the weapons that the\nmissiles hit their target first time. The important goals for a missile control\nsystem to achieve are minimising the time constant, overshoot, and settling\ntime of the missile dynamics. 
The combination of high angles of attack,\ntime-varying mass, thrust, and centre of gravity, actuator delay, and signal\nnoise create a highly non-linear dynamic system with many uncertainties that is\nextremely challenging to control. A robust control system based on saturated\nsliding mode control is proposed to overcome the time-varying parameters and\nnon-linearities. A lag compensator is designed to overcome actuator delay. A\nsecond-order filter is selected to reduce high-frequency measurement noise.\nWhen combined, the proposed solutions can make the system stable despite the\nexistence of changing mass, centre of gravity, thrust, and sensor noise. The\nsystem was evaluated for desired pitch angles of 0{\\deg} to 90{\\deg}. The time\nconstant for the system stayed below 0.27s for all conditions, with\nsatisfactory performance for both settling time and overshoot.\n","authors":["Joshua Farrington","Xinhua Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07592v1","updated":"2024-11-12T07:08:02Z","published":"2024-11-12T07:08:02Z","title":"Longitudinal dynamic modelling and control for a quad-tilt rotor UAV","summary":" Tilt rotor aircraft combine the benefits of both helicopters and fixed wing\naircraft, this makes them popular for a variety of applications, including\nSearch and Rescue and VVIP transport. However, due to the multiple flight\nmodes, significant challenges with regards to the control system design are\nexperienced. The main challenges with VTOL aircraft, comes during the dynamic\nphase (mode transition), where the aircraft transitions from a hover state to\nfull forwards flight. In this transition phase the aerodynamic lift and torque\ngenerated by the wing/control surfaces increases and as such, the rotor thrust,\nand the tilt rate must be carefully considered, such that the height and\nattitude remain invariant during the mode transition. 
In this paper, a digital\nPID controller with the applicable digital filter and data hold functions is\ndesigned so that a successful mode transition between hover and forwards flight\ncan be ascertained. Finally, the presented control system for the tilt-rotor\nUAV is demonstrated through simulations by using the MATLAB software suite. The\nperformance obtained from the simulations confirm the success of the\nimplemented methods, with full stability in all three degrees of freedom being\ndemonstrated.\n","authors":["William Smith","Xinhua Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07573v1","updated":"2024-11-12T06:21:47Z","published":"2024-11-12T06:21:47Z","title":"Robotic Control Optimization Through Kernel Selection in Safe Bayesian\n Optimization","summary":" Control system optimization has long been a fundamental challenge in\nrobotics. While recent advancements have led to the development of control\nalgorithms that leverage learning-based approaches, such as SafeOpt, to\noptimize single feedback controllers, scaling these methods to high-dimensional\ncomplex systems with multiple controllers remains an open problem. In this\npaper, we propose a novel learning-based control optimization method, which\nenhances the additive Gaussian process-based Safe Bayesian Optimization\nalgorithm to efficiently tackle high-dimensional problems through kernel\nselection. We use PID controller optimization in drones as a representative\nexample and test the method on Safe Control Gym, a benchmark designed for\nevaluating safe control techniques. 
We show that the proposed method provides a\nmore efficient and optimal solution for high-dimensional control optimization\nproblems, demonstrating significant improvements over existing techniques.\n","authors":["Lihao Zheng","Hongxuan Wang","Xiaocong Li","Jun Ma","Prahlad Vadakkepat"],"pdf_url":"https://arxiv.org/pdf/2411.07573v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Robotics and\n Biomimetics (ROBIO)"},{"id":"http://arxiv.org/abs/2411.07570v1","updated":"2024-11-12T06:08:09Z","published":"2024-11-12T06:08:09Z","title":"Constructive RNNs: An Error-Recurrence Perspective on Time-Variant Zero\n Finding Problem Solving Under Uncertainty","summary":" When facing time-variant problems in analog computing, the desirable RNN\ndesign requires finite-time convergence and robustness with respect to various\ntypes of uncertainties, due to the time-variant nature and difficulties in\nimplementation. It is very worthwhile to explore terminal zeroing neural\nnetworks, through examining and applying available attracting laws. In this\npaper, from a control-theoretic point of view, an error recurrence system\napproach is presented by equipping with uncertainty compensation in the\npre-specified error dynamics, capable of enhancing robustness properly. Novel\nrectifying actions are designed to make finite-time settling so that the\nconvergence speed and the computing accuracy of time-variant computing can be\nimproved. Double-power and power-exponential rectifying actions are\nrespectively formed to construct specific models, while the particular\nexpressions of settling time function for the former are presented, and for the\nlatter the proximate settling-time estimations are given, with which the\nfixed-time convergence of the corresponding models is in turn established.\nMoreover, the uncertainty compensation by the signum/smoothing-signum\ntechniques are adopted for finite-duration stabilization. 
Theoretical results\nare presented to demonstrate effectiveness (involving fixed-time convergence\nand robustness) of the proposed computing schemes for the time-variant QP\nproblem solving.\n","authors":["Mingxuan Sun","Xing Li","Han Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07540v1","updated":"2024-11-12T04:37:46Z","published":"2024-11-12T04:37:46Z","title":"Lateral String Stability in Autonomous & Connected Vehicle Platoons","summary":" This paper addresses the lateral control of Autonomous and Connected Vehicles\n(ACVs) in a platoon executing an Emergency Lane Change (ELC) maneuver. These\nmaneuvers are typically triggered by emergency signals from the front or rear\nof the platoon in response to the need to avoid obstacles or allow other\nvehicles to pass. The study assumes that ACVs maintain reliable connectivity,\nenabling each following vehicle to access GPS position traces of both the lead\nand immediately preceding vehicles in the platoon. We demonstrate that lateral\nstring stability in the ACV platoon can be achieved using communicated\ninformation solely from the lead and preceding vehicles. Additionally, we\npresent a lateral control framework for ACVs, which helps track a discretized\npreview of the trajectory constructed from the communicated data. This\nframework involves constructing two distinct trajectories based on the preview\ndata from the lead and preceding vehicles, calculating the associated errors\nand lateral control actions for each, and then integrating these to generate a\nsteering command. 
Numerical results validate the effectiveness of the proposed\nlateral control scheme.\n","authors":["Neelkamal Somisetty","Swaroop Darbha"],"pdf_url":"https://arxiv.org/pdf/2411.07540v1.pdf","comment":"18th IEEE International Conference on Vehicular Electronics and\n Safety 2024 (ICVES)"},{"id":"http://arxiv.org/abs/2411.07484v1","updated":"2024-11-12T02:13:32Z","published":"2024-11-12T02:13:32Z","title":"Convergence Guarantees for Differentiable Optimization-based Control\n Policy","summary":" Effective control of real-world systems necessitates the development of\ncontrollers that are not only performant but also interpretable. To this end,\nthe field has seen a surge in model-based control policies, which first\nleverage historical data to learn system cost and dynamics, and then utilize\nthe learned models for control. However, due to this decoupling, model-based\ncontrol policies fall short when deployed in optimal control settings and lack\nconvergence guarantees for achieving optimality. In this paper, we present\nDiffOP, a Differentiable Optimization-based Policy for optimal control. In the\nproposed framework, control actions are derived by solving an optimization,\nwhere the control cost and system's dynamics can be parameterized as neural\nnetworks. The key idea of DiffOP, inspired by differentiable optimization\ntechniques, is to jointly learn the control policy using both policy gradients\nand optimization gradients, while utilizing actual cost feedback during system\ninteraction. 
Further, this study presents the first theoretical analysis of the\nconvergence rates and sample complexity for learning the optimization control\npolicy with a policy gradient approach.\n","authors":["Yuexin Bian","Jie Feng","Yuanyuan Shi"],"pdf_url":"https://arxiv.org/pdf/2411.07484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07470v1","updated":"2024-11-12T01:13:16Z","published":"2024-11-12T01:13:16Z","title":"Two-Layer Attention Optimization for Bimanual Coordination","summary":" Bimanual tasks performed by human agents present unique optimal control\nconsiderations compared to cyberphysical agents. These considerations include\nminimizing attention, distributing attention across two isolated hands, and\ncoordinating the two hands to reach a broader goal. In this work, we propose a\ntwo-layer controller that captures these considerations. The upper layer solves\nan attention distribution problem, while the two lower layer controllers (one\nper hand) tracks a trajectory using the solution given by the upper layer. We\nintroduce a formulation of the attention controller where attention is a vector\nthat is bound within a hyperbolic feasible region, which is determined by\nspecifications of the task the lower layer controllers. This two-layer\ncontroller is used to optimize a single-player game of pong, where the agent\nmust rally the ball between two paddles for as long as possible. 
We find that\nadding an attention layer on top of the lower controllers allows the agent to\ncoordinate the left and right hands, which minimizes attention and control\neffort over the course of the rallying task.\n","authors":["Justin Ting","Jing Shuang Li"],"pdf_url":"https://arxiv.org/pdf/2411.07470v1.pdf","comment":"American Controls Conference (under review)"},{"id":"http://arxiv.org/abs/2411.07453v1","updated":"2024-11-12T00:38:17Z","published":"2024-11-12T00:38:17Z","title":"Research on fault diagnosis of nuclear power first-second circuit based\n on hierarchical multi-granularity classification network","summary":" The safe and reliable operation of complex electromechanical systems in\nnuclear power plants is crucial for the safe production of nuclear power plants\nand their nuclear power unit. Therefore, accurate and timely fault diagnosis of\nnuclear power systems is of great significance for ensuring the safe and\nreliable operation of nuclear power plants. The existing fault diagnosis\nmethods mainly target a single device or subsystem, making it difficult to\nanalyze the inherent connections and mutual effects between different types of\nfaults at the entire unit level. This article uses the AP1000 full-scale\nsimulator to simulate the important mechanical component failures of some key\nsystems in the primary and secondary circuits of nuclear power units, and\nconstructs a fault dataset. Meanwhile, a hierarchical multi granularity\nclassification fault diagnosis model based on the EfficientNet large model is\nproposed, aiming to achieve hierarchical classification of nuclear power\nfaults. The results indicate that the proposed fault diagnosis model can\neffectively classify faults in different circuits and system components of\nnuclear power units into hierarchical categories. 
However, the fault dataset in\nthis study was obtained from a simulator, which may introduce additional\ninformation due to parameter redundancy, thereby affecting the diagnostic\nperformance of the model.\n","authors":["Jiangwen Chen","Siwei Li","Guo Jiang","Cheng Dongzhen","Lin Hua","Wang Wei"],"pdf_url":"https://arxiv.org/pdf/2411.07453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07444v1","updated":"2024-11-12T00:03:11Z","published":"2024-11-12T00:03:11Z","title":"Input-Based Ensemble-Learning Method for Dynamic Memory Configuration of\n Serverless Computing Functions","summary":" In today's Function-as-a-Service offerings, a programmer is usually\nresponsible for configuring function memory for its successful execution, which\nallocates proportional function resources such as CPU and network. However,\nright-sizing the function memory force developers to speculate performance and\nmake ad-hoc configuration decisions. Recent research has highlighted that a\nfunction's input characteristics, such as input size, type and number of\ninputs, significantly impact its resource demand, run-time performance and\ncosts with fluctuating workloads. This correlation further makes memory\nconfiguration a non-trivial task. On that account, an input-aware function\nmemory allocator not only improves developer productivity by completely hiding\nresource-related decisions but also drives an opportunity to reduce resource\nwastage and offer a finer-grained cost-optimised pricing scheme. Therefore, we\npresent MemFigLess, a serverless solution that estimates the memory requirement\nof a serverless function with input-awareness. The framework executes function\nprofiling in an offline stage and trains a multi-output Random Forest\nRegression model on the collected metrics to invoke input-aware optimal\nconfigurations. 
We evaluate our work with the state-of-the-art approaches on\nAWS Lambda service to find that MemFigLess is able to capture the input-aware\nresource relationships and allocate upto 82% less resources and save up to 87%\nrun-time costs.\n","authors":["Siddharth Agarwal","Maria A. Rodriguez","Rajkumar Buyya"],"pdf_url":"https://arxiv.org/pdf/2411.07444v1.pdf","comment":"10 pages, 2 tables, 28 figures, accepted conference paper - UCC'24"},{"id":"http://arxiv.org/abs/2411.08199v1","updated":"2024-11-12T21:38:24Z","published":"2024-11-12T21:38:24Z","title":"System-Level Analysis for mm-Wave Full-Duplex Transceivers","summary":" This paper conducts a comprehensive system-level analysis of mm-Wave\nfull-duplex transceivers, focusing on a receiver employing a four-stage\nself-interference cancellation (SIC) process. The analysis aims to optimize the\nnoise and linearity performance requirements of each transceiver block,\nensuring that the self-interference (SI) signal does not compromise the\nreceiver's error vector magnitude (EVM) for an OFDM 64-QAM signal.\nAdditionally, the necessary SIC for each stage is calculated to establish\nfeasible noise and linearity specifications for a CMOS-based implementation.\nThe resulting specifications are subsequently validated within a MATLAB\nSimulink environment, confirming the accuracy of the computed requirements for\neach block.\n","authors":["Mohamad Mahdi Rajaei Rizi","Jeyanandh Paramesh","Kamran Entesari"],"pdf_url":"https://arxiv.org/pdf/2411.08199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08190v1","updated":"2024-11-12T21:16:09Z","published":"2024-11-12T21:16:09Z","title":"Collision-Free Multi-Agent Coverage Control for Non-Cooperating Swarms:\n Preliminary Results","summary":" The main contribution of this paper is a methodology for multiple\nnon-cooperating swarms of unmanned aerial vehicles to independently cover a\ncommon area. 
In contrast to previous research on coverage control involving\nmore than one swarm, this paper does not assume cooperation between distinct\ngroups but considers them as entirely independent units following their own\nobjectives. Using Voronoi tesselation, collision-free motion of agents within\nthe same swarm has been proved before. However, as is shown in Example 1 of\nthis paper, in the case of multiple swarms with inter-swarm but without\nintra-swarm collaboration, these guarantees do not hold. We address this issue\nby proposing an algorithm to achieve maximum coverage with multiple swarms\nwhile avoiding collisions between agents. Thus, the Optimal Reciprocal\nCollision Avoidance method used for safe navigation in multi-agent scenarios is\nadapted to suit the needs of Voronoi-based coverage control with more than one\nswarm. The functionality of the proposed technique is validated through Monte\nCarlo simulations.\n","authors":["Karolina Schmidt","Luis Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2411.08190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08161v1","updated":"2024-11-12T20:12:09Z","published":"2024-11-12T20:12:09Z","title":"Shaping Frequency Dynamics in Modern Power Systems with Grid-forming\n Converters","summary":" In this paper, frequency dynamics in modern power systems with a high\npenetration of converter-based generation is analysed. A fundamental analysis\nof the frequency dynamics is performed to identify the limitations and\nchallenges when the converter penetration is increased. The voltage-source\nbehaviour is found as an essential characteristic of converters to improve the\ninitial frequency derivative of Synchronous Generators (SGs). 
A detailed\nsmall-signal analysis, based on the system's eigenvalues, participation factors\nand mode shapes, is then performed in a reduced system for different converter\npenetrations, showing that the flexibility of grid-forming (GFOR) converters as\nwell as the system's inertia reduction may lead to have a more controllable\nsystem frequency. First-order frequency responses can be programmed for high\nconverter penetrations, when GFOR operation can impose their dominance over\nSGs. These results have been validated in the IEEE 118-bus system simulated in\nPSCAD.\n","authors":["Carlos Collados-Rodriguez","Daniel Westerman Spier","Marc Cheah-Mane","Eduardo Prieto-Araujo","Oriol Gomis-Bellmunt"],"pdf_url":"https://arxiv.org/pdf/2411.08161v1.pdf","comment":"11 pages, 17 figures"},{"id":"http://arxiv.org/abs/2411.08156v1","updated":"2024-11-12T20:05:30Z","published":"2024-11-12T20:05:30Z","title":"Optimal Constant Climb Airspeed with Variable Cost Index for\n All-electric Aircraft","summary":" This paper presents for the first time an approach to minimize direct\noperational costs (DOC) for all-electric aircraft during the climb phase,\nintroducing a time-varying cost index (CI). The CI is modeled as a dynamic\nparameter commanded by Air Traffic Control (ATC), allowing the aircraft to\nmaintain a constant airspeed throughout the climb, while respecting the air\ntraffic regulations. This paper also explores the implications of a\ntime-varying CI on the determination of optimal airspeed and climbing time for\nall-electric aircraft. Additionally, it provides the necessary equations to\ncalculate both the optimal climb airspeed and climb duration. The proposed\nmethodology has been validated through a simulated scenario that reflects\nactual operational procedures. 
As a result, optimal values for climb airspeed,\nclimbing time, and energy consumption have been established, paving the way for\nfuture applications of this methodology to advanced air mobility all-electric\nvehicles.\n","authors":["Lucas Souza e Silva","Luis Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2411.08156v1.pdf","comment":"6 pages, 4 figures. arXiv admin note: text overlap with\n arXiv:2410.01045"},{"id":"http://arxiv.org/abs/2411.08144v1","updated":"2024-11-12T19:42:44Z","published":"2024-11-12T19:42:44Z","title":"Visual Tracking with Intermittent Visibility: Switched Control Design\n and Implementation","summary":" This paper addresses the problem of visual target tracking in scenarios where\na pursuer may experience intermittent loss of visibility of the target. The\ndesign of a Switched Visual Tracker (SVT) is presented which aims to meet the\ncompeting requirements of maintaining both proximity and visibility. SVT\nalternates between a visual tracking mode for following the target, and a\nrecovery mode for regaining visual contact when the target falls out of sight.\nWe establish the stability of SVT by extending the average dwell time theorem\nfrom switched systems theory, which may be of independent interest. Our\nimplementation of SVT on an Agilicious drone [1] illustrates its effectiveness\non tracking various target trajectories: it reduces the average tracking error\nby up to 45% and significantly improves visibility duration compared to a\nbaseline algorithm. The results show that our approach effectively handles\nintermittent vision loss, offering enhanced robustness and adaptability for\nreal-world autonomous missions. 
Additionally, we demonstrate how the stability\nanalysis provides valuable guidance for selecting parameters, such as tracking\nspeed and recovery distance, to optimize the SVT's performance.\n","authors":["Yangge Li","Benjamin C Yang","Sayan Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.08144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09712v1","updated":"2024-11-12T10:29:15Z","published":"2024-11-12T10:29:15Z","title":"Space-Air-Ground Integrated MEC-Assisted Industrial Cyber-Physical\n Systems: An Online Decentralized Optimization Approach","summary":" Cloud computing and edge/fog computing are playing a pivotal role in driving\nthe transformation of industrial cyber-physical systems (ICPS) towards greater\nintelligence and automation by providing high-quality computation offloading\nservices to Internet of Things devices (IoTDs). Recently, space-air-ground\nintegrated multi-access edge computing (SAGIMEC) is emerging as a promising\narchitecture combining edge computing and cloud computing, which has the\npotential to be integrated with ICPS to accelerate the realization of the above\nvision. In this work, we first present an SAGIMEC-assisted ICPS architecture\nthat incorporates edge computing and cloud computing through seamless\nconnectivity supported by satellite networks to achieve determinism in\nconnectivity, networked computing, and intelligent networked control. Then, we\nformulate a joint satellite selection, computation offloading, communication\nresource allocation, computation resource allocation, and UAV trajectory\ncontrol optimization problem (JSC4OP) to maximize the quality of service (QoS)\nof IoTDs. This problem considers both the dynamics and uncertainties of the\nsystem environment, as well as the limited resources and energy of UAVs. Given\nthe complexity of JSC4OP, we propose an online decentralized optimization\napproach (ODOA) to solve the problem. 
Specifically, JSC4OP is first transformed\ninto a real-time decision-making optimization problem (RDOP) by leveraging\nLyapunov optimization. Then, to solve the RDOP, we introduce an online\nlearning-based latency prediction method to predict the uncertain system\nenvironment and a game theoretic decision-making method to make real-time\ndecisions. Finally, theoretical analysis confirms the effectiveness of the\nODOA, while the simulation results demonstrate that the proposed ODOA\noutperforms other alternative approaches in terms of overall system\nperformance.\n","authors":["Long He","Geng Sun","Zemin Sun","Jiacheng Wang","Hongyang Du","Dusit Niyato","Jiangchuan Liu","Victor C. M. Leung"],"pdf_url":"https://arxiv.org/pdf/2411.09712v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.11918"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.08027v1","updated":"2024-11-12T18:56:58Z","published":"2024-11-12T18:56:58Z","title":"LLMPhy: Complex Physical Reasoning Using Large Language Models and World\n Models","summary":" Physical reasoning is an important skill needed for robotic agents when\noperating in the real world. However, solving such reasoning problems often\ninvolves hypothesizing and reflecting over complex multi-body interactions\nunder the effect of a multitude of physical forces and thus learning all such\ninteractions poses a significant hurdle for state-of-the-art machine learning\nframeworks, including large language models (LLMs). To study this problem, we\npropose a new physical reasoning task and a dataset, dubbed TraySim. Our task\ninvolves predicting the dynamics of several objects on a tray that is given an\nexternal impact -- the domino effect of the ensued object interactions and\ntheir dynamics thus offering a challenging yet controlled setup, with the goal\nof reasoning being to infer the stability of the objects after the impact. 
To\nsolve this complex physical reasoning task, we present LLMPhy, a zero-shot\nblack-box optimization framework that leverages the physics knowledge and\nprogram synthesis abilities of LLMs, and synergizes these abilities with the\nworld models built into modern physics engines. Specifically, LLMPhy uses an\nLLM to generate code to iteratively estimate the physical hyperparameters of\nthe system (friction, damping, layout, etc.) via an implicit\nanalysis-by-synthesis approach using a (non-differentiable) simulator in the\nloop and uses the inferred parameters to imagine the dynamics of the scene\ntowards solving the reasoning task. To show the effectiveness of LLMPhy, we\npresent experiments on our TraySim dataset to predict the steady-state poses of\nthe objects. Our results show that the combination of the LLM and the physics\nengine leads to state-of-the-art zero-shot physical reasoning performance,\nwhile demonstrating superior convergence against standard black-box\noptimization methods and better estimation of the physical parameters.\n","authors":["Anoop Cherian","Radu Corcodel","Siddarth Jain","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2411.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08024v1","updated":"2024-11-12T18:54:55Z","published":"2024-11-12T18:54:55Z","title":"Leonardo vindicated: Pythagorean trees for minimal reconstruction of the\n natural branching structures","summary":" Trees continue to fascinate with their natural beauty and as engineering\nmasterpieces optimal with respect to several independent criteria. Pythagorean\ntree is a well-known fractal design that realistically mimics the natural tree\nbranching structures. We study various types of Pythagorean-like fractal trees\nwith different shapes of the base, branching angles and relaxed scales in an\nattempt to identify and explain which variants are the closest match to the\nbranching structures commonly observed in the natural world. 
Pursuing\nsimultaneously the realism and minimalism of the fractal tree model, we have\ndeveloped a flexibly parameterised and fast algorithm to grow and visually\nexamine deep Pythagorean-inspired fractal trees with the capability to orderly\nover- or underestimate the Leonardo da Vinci's tree branching rule as well as\ncontrol various imbalances and branching angles. We tested the realism of the\ngenerated fractal tree images by means of the classification accuracy of\ndetecting natural tree with the transfer-trained deep Convolutional Neural\nNetworks (CNNs). Having empirically established the parameters of the fractal\ntrees that maximize the CNN's natural tree class classification accuracy we\nhave translated them back to the scales and angles of branches and came to the\ninteresting conclusions that support the da Vinci branching rule and golden\nratio based scaling for both the shape of the branch and imbalance between the\nchild branches, and claim the flexibly parameterized fractal trees can be used\nto generate artificial examples to train robust detectors of different species\nof trees.\n","authors":["Dymitr Ruta","Corrado Mio","Ernesto Damiani"],"pdf_url":"https://arxiv.org/pdf/2411.08024v1.pdf","comment":"22 pages, lots of hi res figures I had to reduce quality of,\n submitting as a requirement to the Theory of Computing Journal"},{"id":"http://arxiv.org/abs/2411.08019v1","updated":"2024-11-12T18:50:35Z","published":"2024-11-12T18:50:35Z","title":"Language Models as Causal Effect Generators","summary":" We present a framework for large language model (LLM) based data generation\nwith controllable causal structure. In particular, we define a procedure for\nturning any language model and any directed acyclic graph (DAG) into a\nsequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM\nis a causal model with user-defined structure and LLM-defined structural\nequations. 
We characterize how an SD-SCM allows sampling from observational,\ninterventional, and counterfactual distributions according to the desired\ncausal structure. We then leverage this procedure to propose a new type of\nbenchmark for causal inference methods, generating individual-level\ncounterfactual data without needing to manually specify functional\nrelationships between variables. We create an example benchmark consisting of\nthousands of datasets, and test a suite of popular estimation methods on these\ndatasets for average, conditional average, and individual treatment effect\nestimation, both with and without hidden confounding. Apart from generating\ndata, the same procedure also allows us to test for the presence of a causal\neffect that might be encoded in an LLM. This procedure can underpin auditing\nLLMs for misinformation, discrimination, or otherwise undesirable behavior. We\nbelieve SD-SCMs can serve as a useful tool in any application that would\nbenefit from sequential data with controllable causal structure.\n","authors":["Lucius E. J. Bynum","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2411.08019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08017v1","updated":"2024-11-12T18:49:06Z","published":"2024-11-12T18:49:06Z","title":"Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model\n with Compact Wavelet Encodings","summary":" Large-scale 3D generative models require substantial computational resources\nyet often fall short in capturing fine details and complex geometries at high\nresolutions. We attribute this limitation to the inefficiency of current\nrepresentations, which lack the compactness required to model the generative\nmodels effectively. To address this, we introduce a novel approach called\nWavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based,\ncompact latent encodings. 
Specifically, we compress a $256^3$ signed distance\nfield into a $12^3 \\times 4$ latent grid, achieving an impressive 2427x\ncompression ratio with minimal loss of detail. This high level of compression\nallows our method to efficiently train large-scale generative networks without\nincreasing the inference time. Our models, both conditional and unconditional,\ncontain approximately one billion parameters and successfully generate\nhigh-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid\ninference, producing shapes within two to four seconds depending on the\ncondition, despite the model's scale. We demonstrate state-of-the-art\nperformance across multiple datasets, with significant improvements in\ngeneration quality, diversity, and computational efficiency. We open-source our\ncode and, to the best of our knowledge, release the largest pretrained 3D\ngenerative models across different modalities.\n","authors":["Aditya Sanghi","Aliasghar Khani","Pradyumna Reddy","Arianna Rampini","Derek Cheung","Kamal Rahimi Malekshan","Kanika Madan","Hooman Shayani"],"pdf_url":"https://arxiv.org/pdf/2411.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08013v1","updated":"2024-11-12T18:43:27Z","published":"2024-11-12T18:43:27Z","title":"Investigating the Effectiveness of Explainability Methods in Parkinson's\n Detection from Speech","summary":" Speech impairments in Parkinson's disease (PD) provide significant early\nindicators for diagnosis. While models for speech-based PD detection have shown\nstrong performance, their interpretability remains underexplored. This study\nsystematically evaluates several explainability methods to identify PD-specific\nspeech features, aiming to support the development of accurate, interpretable\nmodels for clinical decision-making in PD diagnosis and monitoring. 
Our\nmethodology involves (i) obtaining attributions and saliency maps using\nmainstream interpretability techniques, (ii) quantitatively evaluating the\nfaithfulness of these maps and their combinations obtained via union and\nintersection through a range of established metrics, and (iii) assessing the\ninformation conveyed by the saliency maps for PD detection from an auxiliary\nclassifier. Our results reveal that, while explanations are aligned with the\nclassifier, they often fail to provide valuable information for domain experts.\n","authors":["Eleonora Mancini","Francesco Paissan","Paolo Torroni","Cem Subakan","Mirco Ravanelli"],"pdf_url":"https://arxiv.org/pdf/2411.08013v1.pdf","comment":"The first two authors contributed equally to this research: author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2401.11555v2","updated":"2024-11-12T18:18:43Z","published":"2024-01-21T18:00:15Z","title":"VQC-Based Reinforcement Learning with Data Re-uploading: Performance and\n Trainability","summary":" Reinforcement Learning (RL) consists of designing agents that make\nintelligent decisions without human supervision. When used alongside function\napproximators such as Neural Networks (NNs), RL is capable of solving extremely\ncomplex problems. Deep Q-Learning, a RL algorithm that uses Deep NNs, achieved\nsuper-human performance in some specific tasks. Nonetheless, it is also\npossible to use Variational Quantum Circuits (VQCs) as function approximators\nin RL algorithms. This work empirically studies the performance and\ntrainability of such VQC-based Deep Q-Learning models in classic control\nbenchmark environments. More specifically, we research how data re-uploading\naffects both these metrics. We show that the magnitude and the variance of the\ngradients of these models remain substantial throughout training due to the\nmoving targets of Deep Q-Learning. 
Moreover, we empirically show that\nincreasing the number of qubits does not lead to an exponential vanishing\nbehavior of the magnitude and variance of the gradients for a PQC approximating\na 2-design, unlike what was expected due to the Barren Plateau Phenomenon. This\nhints at the possibility of VQCs being specially adequate for being used as\nfunction approximators in such a context.\n","authors":["Rodrigo Coelho","André Sequeira","Luís Paulo Santos"],"pdf_url":"https://arxiv.org/pdf/2401.11555v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.07990v1","updated":"2024-11-12T18:15:19Z","published":"2024-11-12T18:15:19Z","title":"Derivational Morphology Reveals Analogical Generalization in Large\n Language Models","summary":" What mechanisms underlie linguistic generalization in large language models\n(LLMs)? This question has attracted considerable attention, with most studies\nanalyzing the extent to which the language skills of LLMs resemble rules. As of\nyet, it is not known whether linguistic generalization in LLMs could equally\nwell be explained as the result of analogical processes, which can be\nformalized as similarity operations on stored exemplars. A key shortcoming of\nprior research is its focus on linguistic phenomena with a high degree of\nregularity, for which rule-based and analogical approaches make the same\npredictions. Here, we instead examine derivational morphology, specifically\nEnglish adjective nominalization, which displays notable variability. We\nintroduce a new method for investigating linguistic generalization in LLMs:\nfocusing on GPT-J, we fit cognitive models that instantiate rule-based and\nanalogical learning to the LLM training data and compare their predictions on a\nset of nonce adjectives with those of the LLM, allowing us to draw direct\nconclusions regarding underlying mechanisms. 
As expected, rule-based and\nanalogical models explain the predictions of GPT-J equally well for adjectives\nwith regular nominalization patterns. However, for adjectives with variable\nnominalization patterns, the analogical model provides a much better match.\nFurthermore, GPT-J's behavior is sensitive to the individual word frequencies,\neven for regular forms, a behavior that is consistent with an analogical\naccount of regular forms but not a rule-based one. These findings refute the\nhypothesis that GPT-J's linguistic generalization on adjective nominalization\ninvolves rules, suggesting similarity operations on stored exemplars as the\nunderlying mechanism. Overall, our study suggests that analogical processes\nplay a bigger role in the linguistic generalization of LLMs than previously\nthought.\n","authors":["Valentin Hofmann","Leonie Weissweiler","David Mortensen","Hinrich Schütze","Janet Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2411.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02271v2","updated":"2024-11-12T18:11:30Z","published":"2024-11-04T17:03:52Z","title":"On the Utilization of Unique Node Identifiers in Graph Neural Networks","summary":" Graph Neural Networks have inherent representational limitations due to their\nmessage-passing structure. Recent work has suggested that these limitations can\nbe overcome by using unique node identifiers (UIDs). Here we argue that despite\nthe advantages of UIDs, one of their disadvantages is that they lose the\ndesirable property of permutation-equivariance. We thus propose to focus on UID\nmodels that are permutation-equivariant, and present theoretical arguments for\ntheir advantages. Motivated by this, we propose a method to regularize UID\nmodels towards permutation equivariance, via a contrastive loss. We empirically\ndemonstrate that our approach improves generalization and extrapolation\nabilities while providing faster training convergence. 
On the recent BREC\nexpressiveness benchmark, our proposed method achieves state-of-the-art\nperformance compared to other random-based approaches.\n","authors":["Maya Bechler-Speicher","Moshe Eliasof","Carola-Bibiane Schönlieb","Ran Gilad-Bachrach","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2411.02271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07979v1","updated":"2024-11-12T17:58:40Z","published":"2024-11-12T17:58:40Z","title":"Exact, Tractable Gauss-Newton Optimization in Deep Reversible\n Architectures Reveal Poor Generalization","summary":" Second-order optimization has been shown to accelerate the training of deep\nneural networks in many applications, often yielding faster progress per\niteration on the training loss compared to first-order optimizers.However, the\ngeneralization properties of second-order methods are still being debated.\nTheoretical investigations have proved difficult to carry out outside the\ntractable settings of heavily simplified model classes -- thus, the relevance\nof existing theories to practical deep learning applications remains unclear.\nSimilarly, empirical studies in large-scale models and real datasets are\nsignificantly confounded by the necessity to approximate second-order updates\nin practice. It is often unclear whether the observed generalization behaviour\narises specifically from the second-order nature of the parameter updates, or\ninstead reflects the specific structured (e.g.\\ Kronecker) approximations used\nor any damping-based interpolation towards first-order updates. Here, we show\nfor the first time that exact Gauss-Newton (GN) updates take on a tractable\nform in a class of deep reversible architectures that are sufficiently\nexpressive to be meaningfully applied to common benchmark datasets. We exploit\nthis novel setting to study the training and generalization properties of the\nGN optimizer. We find that exact GN generalizes poorly. 
In the mini-batch\ntraining setting, this manifests as rapidly saturating progress even on the\n\\emph{training} loss, with parameter updates found to overfit each\nmini-batch without producing the features that would support generalization\nto other mini-batches. We show that our experiments run in the ``lazy'' regime,\nin which the neural tangent kernel (NTK) changes very little during the course\nof training. This behaviour is associated with having no significant changes in\nneural representations, explaining the lack of generalization.\n","authors":["Davide Buffelli","Jamie McGowan","Wangkun Xu","Alexandru Cioba","Da-shan Shiu","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2411.07979v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07978v1","updated":"2024-11-12T17:58:34Z","published":"2024-11-12T17:58:34Z","title":"Doubly Robust Regression Discontinuity Designs","summary":" This study introduces a doubly robust (DR) estimator for regression\ndiscontinuity (RD) designs. In RD designs, treatment effects are estimated in a\nquasi-experimental setting where treatment assignment depends on whether a\nrunning variable surpasses a predefined cutoff. A common approach in RD\nestimation is to apply nonparametric regression methods, such as local linear\nregression. In such an approach, the validity relies heavily on the consistency\nof nonparametric estimators and is limited by the nonparametric convergence\nrate, thereby preventing $\\sqrt{n}$-consistency. To address these issues, we\npropose the DR-RD estimator, which combines two distinct estimators for the\nconditional expected outcomes. If either of these estimators is consistent, the\ntreatment effect estimator remains consistent. 
Furthermore, due to the\ndebiasing effect, our proposed estimator achieves $\\sqrt{n}$-consistency if\nboth regression estimators satisfy certain mild conditions, which also\nsimplifies statistical inference.\n","authors":["Masahiro Kato"],"pdf_url":"https://arxiv.org/pdf/2411.07978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07971v1","updated":"2024-11-12T17:51:45Z","published":"2024-11-12T17:51:45Z","title":"Optimal Control of Mechanical Ventilators with Learned Respiratory\n Dynamics","summary":" Deciding on appropriate mechanical ventilator management strategies\nsignificantly impacts the health outcomes for patients with respiratory\ndiseases. Acute Respiratory Distress Syndrome (ARDS) is one such disease that\nrequires careful ventilator operation to be effectively treated. In this work,\nwe frame the management of ventilators for patients with ARDS as a sequential\ndecision making problem using the Markov decision process framework. We\nimplement and compare controllers based on clinical guidelines contained in the\nARDSnet protocol, optimal control theory, and learned latent dynamics\nrepresented as neural networks. The Pulse Physiology Engine's respiratory\ndynamics simulator is used to establish a repeatable benchmark, gather\nsimulated data, and quantitatively compare these controllers. We score\nperformance in terms of measured improvement in established ARDS health markers\n(pertaining to improved respiratory rate, oxygenation, and vital signs). Our\nresults demonstrate that techniques leveraging neural networks and optimal\ncontrol can automatically discover effective ventilation management strategies\nwithout access to explicit ventilator management procedures or guidelines (such\nas those defined in the ARDSnet protocol).\n","authors":["Isaac Ronald Ward","Dylan M. Asmar","Mansur Arief","Jana Krystofova Mike","Mykel J. 
Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2411.07971v1.pdf","comment":"2024 IEEE 37th International Symposium on Computer-Based Medical\n Systems (CBMS), 7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.09434v2","updated":"2024-11-12T17:49:12Z","published":"2024-07-12T17:09:47Z","title":"Foundation Models for the Electric Power Grid","summary":" Foundation models (FMs) currently dominate news headlines. They employ\nadvanced deep learning architectures to extract structural information\nautonomously from vast datasets through self-supervision. The resulting rich\nrepresentations of complex systems and dynamics can be applied to many\ndownstream applications. Therefore, FMs can find uses in electric power grids,\nchallenged by the energy transition and climate change. In this paper, we call\nfor the development of, and state why we believe in, the potential of FMs for\nelectric grids. We highlight their strengths and weaknesses amidst the\nchallenges of a changing grid. We argue that an FM learning from diverse grid\ndata and topologies could unlock transformative capabilities, pioneering a new\napproach in leveraging AI to redefine how we manage complexity and uncertainty\nin the electric grid. Finally, we discuss a power grid FM concept, namely\nGridFM, based on graph neural networks and show how different downstream tasks\nbenefit.\n","authors":["Hendrik F. Hamann","Thomas Brunschwiler","Blazhe Gjorgiev","Leonardo S. A. Martins","Alban Puech","Anna Varbella","Jonas Weiss","Juan Bernabe-Moreno","Alexandre Blondin Massé","Seong Choi","Ian Foster","Bri-Mathias Hodge","Rishabh Jain","Kibaek Kim","Vincent Mai","François Mirallès","Martin De Montigny","Octavio Ramos-Leaños","Hussein Suprême","Le Xie","El-Nasser S. Youssef","Arnaud Zinflou","Alexander J. Belyi","Ricardo J. 
Bessa","Bishnu Prasad Bhattarai","Johannes Schmude","Stanislav Sobolevsky"],"pdf_url":"https://arxiv.org/pdf/2407.09434v2.pdf","comment":"Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V.,\n J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J.,\n K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal\n contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H"},{"id":"http://arxiv.org/abs/2411.07964v1","updated":"2024-11-12T17:41:16Z","published":"2024-11-12T17:41:16Z","title":"Sleep Staging from Airflow Signals Using Fourier Approximations of\n Persistence Curves","summary":" Sleep staging is a challenging task, typically manually performed by sleep\ntechnologists based on electroencephalogram and other biosignals of patients\ntaken during overnight sleep studies. Recent work aims to leverage automated\nalgorithms to perform sleep staging not based on electroencephalogram signals,\nbut rather based on the airflow signals of subjects. Prior work uses ideas from\ntopological data analysis (TDA), specifically Hermite function expansions of\npersistence curves (HEPC) to featurize airflow signals. However, finite order\nHEPC captures only partial information. In this work, we propose Fourier\napproximations of persistence curves (FAPC), and use this technique to perform\nsleep staging based on airflow signals. 
We analyze performance using an XGBoost\nmodel on 1155 pediatric sleep studies taken from the Nationwide Children's\nHospital Sleep DataBank (NCHSDB), and find that FAPC methods provide\ncomplementary information to HEPC methods alone, leading to a 4.9% increase in\nperformance over baseline methods.\n","authors":["Shashank Manjunath","Hau-Tieng Wu","Aarti Sathyanarayana"],"pdf_url":"https://arxiv.org/pdf/2411.07964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07959v1","updated":"2024-11-12T17:36:20Z","published":"2024-11-12T17:36:20Z","title":"On the Convergence of Continual Federated Learning Using Incrementally\n Aggregated Gradients","summary":" The holy grail of machine learning is to enable Continual Federated Learning\n(CFL) to enhance the efficiency, privacy, and scalability of AI systems while\nlearning from streaming data. The primary challenge of a CFL system is to\novercome global catastrophic forgetting, wherein the accuracy of the global\nmodel trained on new tasks declines on the old tasks. In this work, we propose\nContinual Federated Learning with Aggregated Gradients (C-FLAG), a novel\nreplay-memory based federated strategy consisting of edge-based gradient\nupdates on memory and aggregated gradients on the current data. We provide\nconvergence analysis of the C-FLAG approach which addresses forgetting and bias\nwhile converging at a rate of $O(1/\\sqrt{T})$ over $T$ communication rounds. We\nformulate an optimization sub-problem that minimizes catastrophic forgetting,\ntranslating CFL into an iterative algorithm with adaptive learning rates that\nensure seamless learning across tasks. 
We empirically show that C-FLAG\noutperforms several state-of-the-art baselines on both task and\nclass-incremental settings with respect to metrics such as accuracy and\nforgetting.\n","authors":["Satish Kumar Keshri","Nazreen Shah","Ranjitha Prasad"],"pdf_url":"https://arxiv.org/pdf/2411.07959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07957v1","updated":"2024-11-12T17:34:38Z","published":"2024-11-12T17:34:38Z","title":"Tukey g-and-h neural network regression for non-Gaussian data","summary":" This paper addresses non-Gaussian regression with neural networks via the use\nof the Tukey g-and-h distribution.The Tukey g-and-h transform is a flexible\nparametric transform with two parameters $g$ and $h$ which, when applied to a\nstandard normal random variable, introduces both skewness and kurtosis,\nresulting in a distribution commonly called the Tukey g-and-h distribution.\nSpecific values of $g$ and $h$ produce good approximations to other families of\ndistributions, such as the Cauchy and student-t distributions. The flexibility\nof the Tukey g-and-h distribution has driven its popularity in the statistical\ncommunity, in applied sciences and finance. In this work we consider the\ntraining of a neural network to predict the parameters of a Tukey g-and-h\ndistribution in a regression framework via the minimization of the\ncorresponding negative log-likelihood, despite the latter having no closed-form\nexpression. We demonstrate the efficiency of our procedure in simulated\nexamples and apply our method to a real-world dataset of global crop yield for\nseveral types of crops. Finally, we show how we can carry out a goodness-of-fit\nanalysis between the predicted distributions and the test data. A Pytorch\nimplementation is made available on Github and as a Pypi package.\n","authors":["Arthur P. 
Guillaumin","Natalia Efremova"],"pdf_url":"https://arxiv.org/pdf/2411.07957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07954v1","updated":"2024-11-12T17:30:31Z","published":"2024-11-12T17:30:31Z","title":"Learning Memory Mechanisms for Decision Making through Demonstrations","summary":" In Partially Observable Markov Decision Processes, integrating an agent's\nhistory into memory poses a significant challenge for decision-making.\nTraditional imitation learning, relying on observation-action pairs for expert\ndemonstrations, fails to capture the expert's memory mechanisms used in\ndecision-making. To capture memory processes as demonstrations, we introduce\nthe concept of \\textbf{memory dependency pairs} $(p, q)$ indicating that events\nat time $p$ are recalled for decision-making at time $q$. We introduce\n\\textbf{AttentionTuner} to leverage memory dependency pairs in Transformers and\nfind significant improvements across several tasks compared to standard\nTransformers when evaluated on Memory Gym and the Long-term Memory Benchmark.\nCode is available at https://github.com/WilliamYue37/AttentionTuner .\n","authors":["William Yue","Bo Liu","Peter Stone"],"pdf_url":"https://arxiv.org/pdf/2411.07954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07942v1","updated":"2024-11-12T17:11:46Z","published":"2024-11-12T17:11:46Z","title":"Towards Low-bit Communication for Tensor Parallel LLM Inference","summary":" Tensor parallelism provides an effective way to increase server large\nlanguage model (LLM) inference efficiency despite adding an additional\ncommunication cost. However, as server LLMs continue to scale in size, they\nwill need to be distributed across more devices, magnifying the communication\ncost. One way to approach this problem is with quantization, but current\nmethods for LLMs tend to avoid quantizing the features that tensor parallelism\nneeds to communicate. 
Taking advantage of consistent outliers in communicated\nfeatures, we introduce a quantization method that reduces communicated values\non average from 16 bits to 4.2 bits while preserving nearly all of the original\nperformance. For instance, our method maintains around 98.0% and 99.5% of Gemma\n2 27B's and Llama 2 13B's original performance, respectively, averaged across\nall tasks we evaluated on.\n","authors":["Harry Dong","Tyler Johnson","Minsik Cho","Emad Soroush"],"pdf_url":"https://arxiv.org/pdf/2411.07942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07934v1","updated":"2024-11-12T17:04:56Z","published":"2024-11-12T17:04:56Z","title":"Doubly Mild Generalization for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) suffers from the extrapolation error and\nvalue overestimation. From a generalization perspective, this issue can be\nattributed to the over-generalization of value functions or policies towards\nout-of-distribution (OOD) actions. Significant efforts have been devoted to\nmitigating such generalization, and recent in-sample learning approaches have\nfurther succeeded in entirely eschewing it. Nevertheless, we show that mild\ngeneralization beyond the dataset can be trusted and leveraged to improve\nperformance under certain conditions. To appropriately exploit generalization\nin offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild\naction generalization and (ii) mild generalization propagation. The former\nrefers to selecting actions in a close neighborhood of the dataset to maximize\nthe Q values. Even so, the potential erroneous generalization can still be\npropagated, accumulated, and exacerbated by bootstrapping. In light of this,\nthe latter concept is introduced to mitigate the generalization propagation\nwithout impeding the propagation of RL learning signals. 
Theoretically, DMG\nguarantees better performance than the in-sample optimal policy in the oracle\ngeneralization scenario. Even under worst-case generalization, DMG can still\ncontrol value overestimation at a certain level and lower bound the\nperformance. Empirically, DMG achieves state-of-the-art performance across\nGym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting\nfrom its flexibility in both generalization aspects, DMG enjoys a seamless\ntransition from offline to online learning and attains strong online\nfine-tuning performance.\n","authors":["Yixiu Mao","Qi Wang","Yun Qu","Yuhang Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2411.07934v1.pdf","comment":"Accepted to NeurIPS 2024. arXiv admin note: substantial text overlap\n with arXiv:2410.19400"},{"id":"http://arxiv.org/abs/2411.07933v1","updated":"2024-11-12T17:04:12Z","published":"2024-11-12T17:04:12Z","title":"Prediction of Acoustic Communication Performance for AUVs using Gaussian\n Process Classification","summary":" Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic\ncommunication to coordinate their actions effectively. However, the reliability\nof underwater acoustic communication decreases as the communication range\nbetween vehicles increases. Consequently, teams of cooperating AUVs typically\nmake conservative assumptions about the maximum range at which they can\ncommunicate reliably. To address this limitation, we propose a novel approach\nthat involves learning a map representing the probability of successful\ncommunication based on the locations of the transmitting and receiving\nvehicles. This probabilistic communication map accounts for factors such as the\nrange between vehicles, environmental noise, and multi-path effects at a given\nlocation. In pursuit of this goal, we investigate the application of Gaussian\nprocess binary classification to generate the desired communication map. 
We\nspecialize existing results to this specific binary classification problem and\nexplore methods to incorporate uncertainty in vehicle location into the mapping\nprocess. Furthermore, we compare the prediction performance of the probability\ncommunication map generated using binary classification with that of a\nsignal-to-noise ratio (SNR) communication map generated using Gaussian process\nregression. Our approach is experimentally validated using communication and\nnavigation data collected during trials with a pair of Virginia Tech 690 AUVs.\n","authors":["Yifei Gao","Harun Yetkin","McMahon James","Daniel J. Stilwell"],"pdf_url":"https://arxiv.org/pdf/2411.07933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01897v2","updated":"2024-11-12T16:48:29Z","published":"2024-11-04T09:04:11Z","title":"LE-PDE++: Mamba for accelerating PDEs Simulations","summary":" Partial Differential Equations are foundational in modeling science and\nnatural systems such as fluid dynamics and weather forecasting. The Latent\nEvolution of PDEs method is designed to address the computational intensity of\nclassical and deep learning-based PDE solvers by proposing a scalable and\nefficient alternative. To enhance the efficiency and accuracy of LE-PDE, we\nincorporate the Mamba model, an advanced machine learning model known for its\npredictive efficiency and robustness in handling complex dynamic systems with a\nprogressive learning strategy. The LE-PDE was tested on several benchmark\nproblems. The method demonstrated a marked reduction in computational time\ncompared to traditional solvers and standalone deep learning models while\nmaintaining high accuracy in predicting system behavior over time. 
Our method\ndoubles the inference speed compared to the LE-PDE while retaining the same\nlevel of parameter efficiency, making it well-suited for scenarios requiring\nlong-term predictions.\n","authors":["Aoming Liang","Zhaoyang Mu","Qi liu","Ruipeng Li","Mingming Ge","Dixia Fan"],"pdf_url":"https://arxiv.org/pdf/2411.01897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03497v2","updated":"2024-11-12T16:44:24Z","published":"2024-08-07T01:37:10Z","title":"Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and\n Tabnet with SMOTEENN","summary":" Bank credit risk is a significant challenge in modern financial transactions,\nand the ability to identify qualified credit card holders among a large number\nof applicants is crucial for the profitability of a bank's credit card\nbusiness. In the past, screening applicants' conditions often\nrequired a significant amount of manual labor, which was time-consuming and\nlabor-intensive. Although the accuracy and reliability of previously used ML\nmodels have been continuously improving, the pursuit of more reliable and\npowerful AI intelligent models is undoubtedly the unremitting pursuit by major\nbanks in the financial industry. In this study, we used a dataset of over\n40,000 records provided by a commercial bank as the research object. We\ncompared various dimensionality reduction techniques such as PCA and T-SNE for\npreprocessing high-dimensional datasets and performed in-depth adaptation and\ntuning of distributed models such as LightGBM and XGBoost, as well as deep\nmodels like Tabnet. After a series of research and processing, we obtained\nexcellent research results by combining SMOTEENN with these techniques. 
The\nexperiments demonstrated that LightGBM combined with PCA and SMOTEENN\ntechniques can assist banks in accurately predicting potential high-quality\ncustomers, showing relatively outstanding performance compared to other models.\n","authors":["Chang Yu","Yixin Jin","Qianwen Xing","Ye Zhang","Shaobo Guo","Shuchen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.03497v2.pdf","comment":"8 pagess on IEEE ICPICS"},{"id":"http://arxiv.org/abs/2406.04658v3","updated":"2024-11-12T16:44:20Z","published":"2024-06-07T05:56:43Z","title":"Advanced Payment Security System:XGBoost, LightGBM and SMOTE Integrated","summary":" With the rise of various online and mobile payment systems, transaction fraud\nhas become a significant threat to financial security. This study explores the\napplication of advanced machine learning models, specifically based on XGBoost\nand LightGBM, for developing a more accurate and robust Payment Security\nProtection Model. To enhance data reliability, we meticulously processed the\ndata sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to\naddress class imbalance and improve data representation. By selecting highly\ncorrelated features, we aimed to strengthen the training process and boost\nmodel performance. We conducted thorough performance evaluations of our\nproposed models, comparing them against traditional methods including Random\nForest, Neural Network, and Logistic Regression. Using metrics such as\nPrecision, Recall, and F1 Score, we rigorously assessed their effectiveness.\nOur detailed analyses and comparisons reveal that the combination of SMOTE with\nXGBoost and LightGBM offers a highly efficient and powerful mechanism for\npayment security protection. Moreover, the integration of XGBoost and LightGBM\nin a Local Ensemble model further demonstrated outstanding performance. 
After\nincorporating SMOTE, the new combined model achieved a significant improvement\nof nearly 6\\% over traditional models and around 5\\% over its sub-models,\nshowcasing remarkable results.\n","authors":["Qi Zheng","Chang Yu","Jin Cao","Yongshun Xu","Qianwen Xing","Yinxin Jin"],"pdf_url":"https://arxiv.org/pdf/2406.04658v3.pdf","comment":"This paper is received by https://ieee-metacom.org"},{"id":"http://arxiv.org/abs/2406.03733v4","updated":"2024-11-12T16:44:14Z","published":"2024-06-06T04:12:57Z","title":"Credit Card Fraud Detection Using Advanced Transformer Model","summary":" With the proliferation of various online and mobile payment systems, credit\ncard fraud has emerged as a significant threat to financial security. This\nstudy focuses on innovative applications of the latest Transformer models for\nmore robust and precise fraud detection. To ensure the reliability of the data,\nwe meticulously processed the data sources, balancing the dataset to address\nthe issue of data sparsity significantly. We also selected highly correlated\nvectors to strengthen the training process.To guarantee the reliability and\npracticality of the new Transformer model, we conducted performance comparisons\nwith several widely adopted models, including Support Vector Machine (SVM),\nRandom Forest, Neural Network, and Logistic Regression. We rigorously compared\nthese models using metrics such as Precision, Recall, and F1 Score. Through\nthese detailed analyses and comparisons, we present to the readers a highly\nefficient and powerful anti-fraud mechanism with promising prospects. 
The\nresults demonstrate that the Transformer model not only excels in traditional\napplications but also shows great potential in niche areas like fraud\ndetection, offering a substantial advancement in the field.\n","authors":["Chang Yu","Yongshun Xu","Jin Cao","Ye Zhang","Yinxin Jin","Mengran Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.03733v4.pdf","comment":"This paper have been received by https://ieee-metacom.org/"},{"id":"http://arxiv.org/abs/2410.00256v2","updated":"2024-11-12T16:43:41Z","published":"2024-09-30T21:56:16Z","title":"Enhanced Credit Score Prediction Using Ensemble Deep Learning Model","summary":" In contemporary economic society, credit scores are crucial for every\nparticipant. A robust credit evaluation system is essential for the\nprofitability of core businesses such as credit cards, loans, and investments\nfor commercial banks and the financial sector. This paper combines\nhigh-performance models like XGBoost and LightGBM, already widely used in\nmodern banking systems, with the powerful TabNet model. We have developed a\npotent model capable of accurately determining credit score levels by\nintegrating Random Forest, XGBoost, and TabNet, and through the stacking\ntechnique in ensemble modeling. This approach surpasses the limitations of\nsingle models and significantly advances the precise credit score prediction.\nIn the following sections, we will explain the techniques we used and\nthoroughly validate our approach by comprehensively comparing a series of\nmetrics such as Precision, Recall, F1, and AUC. 
By integrating Random Forest,\nXGBoost, and with the TabNet deep learning architecture, these models\ncomplement each other, demonstrating exceptionally strong overall performance.\n","authors":["Qianwen Xing","Chang Yu","Sining Huang","Qi Zheng","Xingyu Mu","Mengying Sun"],"pdf_url":"https://arxiv.org/pdf/2410.00256v2.pdf","comment":"This paper have been accepted by sci of AI Journal"},{"id":"http://arxiv.org/abs/2305.16945v3","updated":"2024-11-12T16:23:50Z","published":"2023-05-26T14:00:12Z","title":"Levin Tree Search with Context Models","summary":" Levin Tree Search (LTS) is a search algorithm that makes use of a policy (a\nprobability distribution over actions) and comes with a theoretical guarantee\non the number of expansions before reaching a goal node, depending on the\nquality of the policy. This guarantee can be used as a loss function, which we\ncall the LTS loss, to optimize neural networks representing the policy\n(LTS+NN). In this work we show that the neural network can be substituted with\nparameterized context models originating from the online compression literature\n(LTS+CM). We show that the LTS loss is convex under this new model, which\nallows for using standard convex optimization tools, and obtain convergence\nguarantees to the optimal parameters in an online setting for a given set of\nsolution trajectories -- guarantees that cannot be provided for neural\nnetworks. The new LTS+CM algorithm compares favorably against LTS+NN on several\nbenchmarks: Sokoban (Boxoban), The Witness, and the 24-Sliding Tile puzzle\n(STP). The difference is particularly large on STP, where LTS+NN fails to solve\nmost of the test instances while LTS+CM solves each test instance in a fraction\nof a second. Furthermore, we show that LTS+CM is able to learn a policy that\nsolves the Rubik's cube in only a few hundred expansions, which considerably\nimproves upon previous machine learning techniques.\n","authors":["Laurent Orseau","Marcus Hutter","Levi H. S. 
Lelis"],"pdf_url":"https://arxiv.org/pdf/2305.16945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18438v3","updated":"2024-11-12T15:57:13Z","published":"2023-11-30T10:39:47Z","title":"Piecewise Linearity of Min-Norm Solution Map of a Nonconvexly\n Regularized Convex Sparse Model","summary":" It is well known that the minimum $\\ell_2$-norm solution of the convex LASSO\nmodel, say $\\mathbf{x}_{\\star}$, is a continuous piecewise linear function of\nthe regularization parameter $\\lambda$, and its signed sparsity pattern is\nconstant within each linear piece. The current study is an extension of this\nclassic result, proving that the aforementioned properties extend to the\nmin-norm solution map $\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$, where\n$\\mathbf{y}$ is the observed signal, for a generalization of LASSO termed the\nscaled generalized minimax concave (sGMC) model. The sGMC model adopts a\nnonconvex debiased variant of the $\\ell_1$-norm as sparse regularizer, but its\nobjective function is overall-convex. 
Based on the geometric properties of\n$\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$, we propose an extension of the least\nangle regression (LARS) algorithm, which iteratively computes the closed-form\nexpression of $\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$ in each linear zone.\nUnder suitable conditions, the proposed algorithm provably obtains the whole\nsolution map $\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$ within finite iterations.\nNotably, our proof techniques for establishing continuity and piecewise\nlinearity of $\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$ are novel, and they lead\nto two side contributions: (a) our proofs establish continuity of the sGMC\nsolution set as a set-valued mapping of $(\\mathbf{y},\\lambda)$; (b) to prove\npiecewise linearity and piecewise constant sparsity pattern of\n$\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$, we do not require any assumption that\nprevious work relies on (whereas to prove some additional properties of\n$\\mathbf{x}_{\\star}(\\mathbf{y},\\lambda)$, we use a different set of assumptions\nfrom previous work).\n","authors":["Yi Zhang","Isao Yamada"],"pdf_url":"https://arxiv.org/pdf/2311.18438v3.pdf","comment":"40 pages. Submitted to journal"},{"id":"http://arxiv.org/abs/2403.00043v2","updated":"2024-11-12T15:54:29Z","published":"2024-02-29T14:50:58Z","title":"RiNALMo: General-Purpose RNA Language Models Can Generalize Well on\n Structure Prediction Tasks","summary":" While RNA has recently been recognized as an interesting small-molecule drug\ntarget, many challenges remain to be addressed before we take full advantage of\nit. This emphasizes the necessity to improve our understanding of its\nstructures and functions. Over the years, sequencing technologies have produced\nan enormous amount of unlabeled RNA data, which hides a huge potential.\nMotivated by the successes of protein language models, we introduce RiboNucleic\nAcid Language Model (RiNALMo) to unveil the hidden code of RNA. 
RiNALMo is the\nlargest RNA language model to date, with 650M parameters pre-trained on 36M\nnon-coding RNA sequences from several databases. It can extract hidden\nknowledge and capture the underlying structure information implicitly embedded\nwithin the RNA sequences. RiNALMo achieves state-of-the-art results on several\ndownstream tasks. Notably, we show that its generalization capabilities\novercome the inability of other deep learning methods for secondary structure\nprediction to generalize on unseen RNA families.\n","authors":["Rafael Josip Penić","Tin Vlašić","Roland G. Huber","Yue Wan","Mile Šikić"],"pdf_url":"https://arxiv.org/pdf/2403.00043v2.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.07889v1","updated":"2024-11-12T15:51:35Z","published":"2024-11-12T15:51:35Z","title":"A Stochastic Optimization Framework for Private and Fair Learning From\n Decentralized Data","summary":" Machine learning models are often trained on sensitive data (e.g., medical\nrecords and race/gender) that is distributed across different \"silos\" (e.g.,\nhospitals). These federated learning models may then be used to make\nconsequential decisions, such as allocating healthcare resources. Two key\nchallenges emerge in this setting: (i) maintaining the privacy of each person's\ndata, even if other silos or an adversary with access to the central server\ntries to infer this data; (ii) ensuring that decisions are fair to different\ndemographic groups (e.g., race/gender). In this paper, we develop a novel\nalgorithm for private and fair federated learning (FL). Our algorithm satisfies\ninter-silo record-level differential privacy (ISRL-DP), a strong notion of\nprivate FL requiring that silo i's sent messages satisfy record-level\ndifferential privacy for all i. Our framework can be used to promote different\nfairness notions, including demographic parity and equalized odds. 
We prove\nthat our algorithm converges under mild smoothness assumptions on the loss\nfunction, whereas prior work required strong convexity for convergence. As a\nbyproduct of our analysis, we obtain the first convergence guarantee for\nISRL-DP nonconvex-strongly concave min-max FL. Experiments demonstrate the\nstate-of-the-art fairness-accuracy tradeoffs of our algorithm across different\nprivacy levels.\n","authors":["Devansh Gupta","A. S. Poornash","Andrew Lowy","Meisam Razaviyayn"],"pdf_url":"https://arxiv.org/pdf/2411.07889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07885v1","updated":"2024-11-12T15:47:17Z","published":"2024-11-12T15:47:17Z","title":"INTRABENCH: Interactive Radiological Benchmark","summary":" Current interactive segmentation approaches, inspired by the success of\nMETA's Segment Anything model, have achieved notable advancements, however,\nthey come with substantial limitations that hinder their practical application\nin real clinical scenarios. These include unrealistic human interaction\nrequirements, such as slice-by-slice operations for 2D models on 3D data, a\nlack of iterative refinement, and insufficient evaluation experiments. These\nshortcomings prevent accurate assessment of model performance and lead to\ninconsistent outcomes across studies. IntRaBench overcomes these challenges by\noffering a comprehensive and reproducible framework for evaluating interactive\nsegmentation methods in realistic, clinically relevant scenarios. It includes\ndiverse datasets, target structures, and segmentation models, and provides a\nflexible codebase that allows seamless integration of new models and prompting\nstrategies. Additionally, we introduce advanced techniques to minimize\nclinician interaction, ensuring fair comparisons between 2D and 3D models. 
By\nopen-sourcing IntRaBench, we invite the research community to integrate their\nmodels and prompting techniques, ensuring continuous and transparent evaluation\nof interactive segmentation models in 3D medical imaging.\n","authors":["Constantin Ulrich","Tassilo Wald","Emily Tempus","Maximilian Rokuss","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2411.07885v1.pdf","comment":"Undergoing Peer-Review"},{"id":"http://arxiv.org/abs/2310.05327v2","updated":"2024-11-12T15:34:57Z","published":"2023-10-09T01:18:07Z","title":"Provable Compositional Generalization for Object-Centric Learning","summary":" Learning representations that generalize to novel compositions of known\nconcepts is crucial for bridging the gap between human and machine perception.\nOne prominent effort is learning object-centric representations, which are\nwidely conjectured to enable compositional generalization. Yet, it remains\nunclear when this conjecture will be true, as a principled theoretical or\nempirical understanding of compositional generalization is lacking. In this\nwork, we investigate when compositional generalization is guaranteed for\nobject-centric representations through the lens of identifiability theory. We\nshow that autoencoders that satisfy structural assumptions on the decoder and\nenforce encoder-decoder consistency will learn object-centric representations\nthat provably generalize compositionally. We validate our theoretical result\nand highlight the practical relevance of our assumptions through experiments on\nsynthetic image data.\n","authors":["Thaddäus Wiedemer","Jack Brady","Alexander Panfilov","Attila Juhos","Matthias Bethge","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2310.05327v2.pdf","comment":"Oral at ICLR 2024. 
The first four authors contributed equally"},{"id":"http://arxiv.org/abs/2306.10084v3","updated":"2024-11-12T15:32:40Z","published":"2023-06-16T11:57:11Z","title":"Convolutional and Deep Learning based techniques for Time Series Ordinal\n Classification","summary":" Time Series Classification (TSC) covers the supervised learning problem where\ninput data is provided in the form of series of values observed through\nrepeated measurements over time, and whose objective is to predict the category\nto which they belong. When the class values are ordinal, classifiers that take\nthis into account can perform better than nominal classifiers. Time Series\nOrdinal Classification (TSOC) is the field covering this gap, yet unexplored in\nthe literature. There are a wide range of time series problems showing an\nordered label structure, and TSC techniques that ignore the order relationship\ndiscard useful information. Hence, this paper presents a first benchmarking of\nTSOC methodologies, exploiting the ordering of the target labels to boost the\nperformance of current TSC state-of-the-art. Both convolutional- and deep\nlearning-based methodologies (among the best performing alternatives for\nnominal TSC) are adapted for TSOC. For the experiments, a selection of 29\nordinal problems from two well-known archives has been made. In this way, this\npaper contributes to the establishment of the state-of-the-art in TSOC. 
The\nresults obtained by ordinal versions are found to be significantly better than\ncurrent nominal TSC techniques in terms of ordinal performance metrics,\noutlining the importance of considering the ordering of the labels when dealing\nwith this kind of problems.\n","authors":["Rafael Ayllón-Gavilán","David Guijo-Rubio","Pedro Antonio Gutiérrez","Anthony Bagnall","César Hervás-Martínez"],"pdf_url":"https://arxiv.org/pdf/2306.10084v3.pdf","comment":"13 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2410.00171v2","updated":"2024-11-12T15:30:15Z","published":"2024-09-30T19:18:34Z","title":"Basis-to-Basis Operator Learning Using Function Encoders","summary":" We present Basis-to-Basis (B2B) operator learning, a novel approach for\nlearning operators on Hilbert spaces of functions based on the foundational\nideas of function encoders. We decompose the task of learning operators into\ntwo parts: learning sets of basis functions for both the input and output\nspaces and learning a potentially nonlinear mapping between the coefficients of\nthe basis functions. B2B operator learning circumvents many challenges of prior\nworks, such as requiring data to be at fixed locations, by leveraging classic\ntechniques such as least squares to compute the coefficients. It is especially\npotent for linear operators, where we compute a mapping between bases as a\nsingle matrix transformation with a closed-form solution. Furthermore, with\nminimal modifications and using the deep theoretical connections between\nfunction encoders and functional analysis, we derive operator learning\nalgorithms that are directly analogous to eigen-decomposition and singular\nvalue decomposition. We empirically validate B2B operator learning on seven\nbenchmark operator learning tasks and show that it demonstrates a\ntwo-orders-of-magnitude improvement in accuracy over existing approaches on\nseveral benchmark tasks.\n","authors":["Tyler Ingebrand","Adam J. 
Thorpe","Somdatta Goswami","Krishna Kumar","Ufuk Topcu"],"pdf_url":"https://arxiv.org/pdf/2410.00171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07873v1","updated":"2024-11-12T15:29:50Z","published":"2024-11-12T15:29:50Z","title":"Diverse capability and scaling of diffusion and auto-regressive models\n when learning abstract rules","summary":" Humans excel at discovering regular structures from limited samples and\napplying inferred rules to novel settings. We investigate whether modern\ngenerative models can similarly learn underlying rules from finite samples and\nperform reasoning through conditional sampling. Inspired by Raven's Progressive\nMatrices task, we designed GenRAVEN dataset, where each sample consists of\nthree rows, and one of 40 relational rules governing the object position,\nnumber, or attributes applies to all rows. We trained generative models to\nlearn the data distribution, where samples are encoded as integer arrays to\nfocus on rule learning. We compared two generative model families: diffusion\n(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their\nability to generate structurally consistent samples and perform panel\ncompletion via unconditional and conditional sampling. We found diffusion\nmodels excel at unconditional generation, producing more novel and consistent\nsamples from scratch and memorizing less, but performing less well in panel\ncompletion, even with advanced conditional sampling methods. Conversely,\nautoregressive models excel at completing missing panels in a rule-consistent\nmanner but generate less consistent samples unconditionally. We observe diverse\ndata scaling behaviors: for both model families, rule learning emerges at a\ncertain dataset size - around 1000s examples per rule. With more training data,\ndiffusion models improve both their unconditional and conditional generation\ncapabilities. 
However, for autoregressive models, while panel completion\nimproves with more training data, unconditional generation consistency\ndeclines. Our findings highlight complementary capabilities and limitations of\ndiffusion and autoregressive models in rule learning and reasoning tasks,\nsuggesting avenues for further research into their mechanisms and potential for\nhuman-like reasoning.\n","authors":["Binxu Wang","Jiaqi Shang","Haim Sompolinsky"],"pdf_url":"https://arxiv.org/pdf/2411.07873v1.pdf","comment":"12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2\n Reasoning At Scale as long paper"},{"id":"http://arxiv.org/abs/2411.07863v1","updated":"2024-11-12T15:22:14Z","published":"2024-11-12T15:22:14Z","title":"CDXFormer: Boosting Remote Sensing Change Detection with Extended Long\n Short-Term Memory","summary":" In complex scenes and varied conditions, effectively integrating\nspatial-temporal context is crucial for accurately identifying changes.\nHowever, current RS-CD methods lack a balanced consideration of performance and\nefficiency. CNNs lack global context, Transformers have quadratic computational\ncomplexity, and Mambas are restricted by CUDA acceleration. In this paper, we\npropose CDXFormer, with a core component that is a powerful XLSTM-based feature\nenhancement layer, integrating the advantages of linear computational\ncomplexity, global context perception, and strong interpret-ability.\nSpecifically, we introduce a scale-specific Feature Enhancer layer,\nincorporating a Cross-Temporal Global Perceptron customized for\nsemantic-accurate deep features, and a Cross-Temporal Spatial Refiner\ncustomized for detail-rich shallow features. Additionally, we propose a\nCross-Scale Interactive Fusion module to progressively interact global change\nrepresentations with spatial responses. 
Extensive experimental results\ndemonstrate that CDXFormer achieves state-of-the-art performance across three\nbenchmark datasets, offering a compelling balance between efficiency and\naccuracy. Code is available at https://github.com/xwmaxwma/rschange.\n","authors":["Zhenkai Wu","Xiaowen Ma","Rongrong Lian","Zhentao Lin","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04492v4","updated":"2024-11-12T15:16:36Z","published":"2024-10-06T14:11:39Z","title":"Interpret Your Decision: Logical Reasoning Regularization for\n Generalization in Visual Classification","summary":" Vision models excel in image classification but struggle to generalize to\nunseen data, such as classifying images from unseen domains or discovering\nnovel categories. In this paper, we explore the relationship between logical\nreasoning and deep learning generalization in visual classification. A logical\nregularization termed L-Reg is derived which bridges a logical analysis\nframework to image classification. Our work reveals that L-Reg reduces the\ncomplexity of the model in terms of the feature distribution and classifier\nweights. Specifically, we unveil the interpretability brought by L-Reg, as it\nenables the model to extract the salient features, such as faces to persons,\nfor classification. Theoretical analysis and experiments demonstrate that L-Reg\nenhances generalization across various scenarios, including multi-domain\ngeneralization and generalized category discovery. 
In complex real-world\nscenarios where images span unknown classes and unseen domains, L-Reg\nconsistently improves generalization, highlighting its practical efficacy.\n","authors":["Zhaorui Tan","Xi Yang","Qiufeng Wang","Anh Nguyen","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.04492v4.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2411.07854v1","updated":"2024-11-12T15:06:06Z","published":"2024-11-12T15:06:06Z","title":"Tucano: Advancing Neural Text Generation for Portuguese","summary":" Significant advances have been made in natural language processing in recent\nyears. However, our current deep learning approach to language modeling\nrequires substantial resources in terms of data and computation. One of the\nside effects of this data-hungry paradigm is the current schism between\nlanguages, separating those considered high-resource, where most of the\ndevelopment happens and resources are available, and the low-resource ones,\nwhich struggle to attain the same level of performance and autonomy. This study\naims to introduce a new set of resources to stimulate the future development of\nneural text generation in Portuguese. In this work, we document the development\nof GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting\nto 200 billion tokens. Via this corpus, we trained a series of\ndecoder-transformers named Tucano. Our models perform equal or superior to\nother Portuguese and multilingual language models of similar size in several\nPortuguese benchmarks. The evaluation of our models also reveals that model\nperformance on many currently available benchmarks used by the Portuguese NLP\ncommunity has little to no correlation with the scaling of token ingestion\nduring training, highlighting the limitations of such evaluations when it comes\nto the assessment of Portuguese generative language models. All derivatives of\nour study are openly released on GitHub and Hugging Face. 
See\nhttps://nkluge-correa.github.io/Tucano/\n","authors":["Nicholas Kluge Corrêa","Aniket Sen","Sophia Falk","Shiza Fatimah"],"pdf_url":"https://arxiv.org/pdf/2411.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07853v1","updated":"2024-11-12T15:06:04Z","published":"2024-11-12T15:06:04Z","title":"Evidential time-to-event prediction model with well-calibrated\n uncertainty estimation","summary":" Time-to-event analysis, or Survival analysis, provides valuable insights into\nclinical prognosis and treatment recommendations. However, this task is\ntypically more challenging than other regression tasks due to the censored\nobservations. Moreover, concerns regarding the reliability of predictions\npersist among clinicians, mainly attributed to the absence of confidence\nassessment, robustness, and calibration of prediction. To address those\nchallenges, we introduce an evidential regression model designed especially for\ntime-to-event prediction tasks, with which the most plausible event time, is\ndirectly quantified by aggregated Gaussian random fuzzy numbers (GRFNs). The\nGRFNs are a newly introduced family of random fuzzy subsets of the real line\nthat generalizes both Gaussian random variables and Gaussian possibility\ndistributions. Different from conventional methods that construct models based\non strict data distribution, e.g., proportional hazard function, our model only\nassumes the event time is encoded in a real line GFRN without any strict\ndistribution assumption, therefore offering more flexibility in complex data\nscenarios. Furthermore, the epistemic and aleatory uncertainty regarding the\nevent time is quantified within the aggregated GRFN as well. Our model can,\ntherefore, provide more detailed clinical decision-making guidance with two\nmore degrees of information. The model is fit by minimizing a generalized\nnegative log-likelihood function that accounts for data censoring based on\nuncertainty evidence reasoning. 
Experimental results on simulated datasets with\nvarying data distributions and censoring scenarios, as well as on real-world\ndatasets across diverse clinical settings and tasks, demonstrate that our model\nachieves both accurate and reliable performance, outperforming state-of-the-art\nmethods.\n","authors":["Ling Huang","Yucheng Xing","Swapnil Mishra","Thierry Denoeux","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2411.07853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05225v5","updated":"2024-11-12T15:05:00Z","published":"2024-06-07T19:25:02Z","title":"A Manifold Perspective on the Statistical Generalization of Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) extend convolutional neural networks to operate\non graphs. Despite their impressive performances in various graph learning\ntasks, the theoretical understanding of their generalization capability is\nstill lacking. Previous GNN generalization bounds ignore the underlying graph\nstructures, often leading to bounds that increase with the number of nodes -- a\nbehavior contrary to the one experienced in practice. In this paper, we take a\nmanifold perspective to establish the statistical generalization theory of GNNs\non graphs sampled from a manifold in the spectral domain. As demonstrated\nempirically, we prove that the generalization bounds of GNNs decrease linearly\nwith the size of the graphs in the logarithmic scale, and increase linearly\nwith the spectral continuity constants of the filter functions. Notably, our\ntheory explains both node-level and graph-level tasks. Our result has two\nimplications: i) guaranteeing the generalization of GNNs to unseen data over\nmanifolds; ii) providing insights into the practical design of GNNs, i.e.,\nrestrictions on the discriminability of GNNs are necessary to obtain a better\ngeneralization performance. 
We demonstrate our generalization bounds of GNNs\nusing synthetic and multiple real-world datasets.\n","authors":["Zhiyang Wang","Juan Cervino","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2406.05225v5.pdf","comment":"37 pages,25 figures, 10 tables"},{"id":"http://arxiv.org/abs/2402.11658v3","updated":"2024-11-12T15:03:48Z","published":"2024-02-18T17:32:53Z","title":"Dynamic planning in hierarchical active inference","summary":" By dynamic planning, we refer to the ability of the human brain to infer and\nimpose motor trajectories related to cognitive decisions. A recent paradigm,\nactive inference, brings fundamental insights into the adaptation of biological\norganisms, constantly striving to minimize prediction errors to restrict\nthemselves to life-compatible states. Over the past years, many studies have\nshown how human and animal behaviors could be explained in terms of active\ninference - either as discrete decision-making or continuous motor control -\ninspiring innovative solutions in robotics and artificial intelligence. Still,\nthe literature lacks a comprehensive outlook on effectively planning realistic\nactions in changing environments. Setting ourselves the goal of modeling\ncomplex tasks such as tool use, we delve into the topic of dynamic planning in\nactive inference, keeping in mind two crucial aspects of biological behavior:\nthe capacity to understand and exploit affordances for object manipulation, and\nto learn the hierarchical interactions between the self and the environment,\nincluding other agents. We start from a simple unit and gradually describe more\nadvanced structures, comparing recently proposed design choices and providing\nbasic examples. 
This study distances itself from traditional views centered on\nneural networks and reinforcement learning, and points toward a yet unexplored\ndirection in active inference: hybrid representations in hierarchical models.\n","authors":["Matteo Priorelli","Ivilin Peev Stoianov"],"pdf_url":"https://arxiv.org/pdf/2402.11658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12203v3","updated":"2024-11-12T15:00:37Z","published":"2024-03-18T19:25:57Z","title":"Bootstrapping Reinforcement Learning with Imitation for Vision-Based\n Agile Flight","summary":" Learning visuomotor policies for agile quadrotor flight presents significant\ndifficulties, primarily from inefficient policy exploration caused by\nhigh-dimensional visual inputs and the need for precise and low-latency\ncontrol. To address these challenges, we propose a novel approach that combines\nthe performance of Reinforcement Learning (RL) and the sample efficiency of\nImitation Learning (IL) in the task of vision-based autonomous drone racing.\nWhile RL provides a framework for learning high-performance controllers through\ntrial and error, it faces challenges with sample efficiency and computational\ndemands due to the high dimensionality of visual inputs. Conversely, IL\nefficiently learns from visual expert demonstrations, but it remains limited by\nthe expert's performance and state distribution. To overcome these limitations,\nour policy learning framework integrates the strengths of both approaches. Our\nframework contains three phases: training a teacher policy using RL with\nprivileged state information, distilling it into a student policy via IL, and\nadaptive fine-tuning via RL. Testing in both simulated and real-world scenarios\nshows our approach can not only learn in scenarios where RL from scratch fails\nbut also outperforms existing IL methods in both robustness and performance,\nsuccessfully navigating a quadrotor through a race course using only visual\ninformation. 
Videos of the experiments are available at\nhttps://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html.\n","authors":["Jiaxu Xing","Angel Romero","Leonard Bauersfeld","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.12203v3.pdf","comment":"8th Annual Conference on Robot Learning (CoRL)"},{"id":"http://arxiv.org/abs/2402.14585v2","updated":"2024-11-12T14:58:48Z","published":"2024-02-22T14:38:52Z","title":"Bandits with Abstention under Expert Advice","summary":" We study the classic problem of prediction with expert advice under bandit\nfeedback. Our model assumes that one action, corresponding to the learner's\nabstention from play, has no reward or loss on every trial. We propose the CBA\nalgorithm, which exploits this assumption to obtain reward bounds that can\nsignificantly improve those of the classical Exp4 algorithm. We can view our\nproblem as the aggregation of confidence-rated predictors when the learner has\nthe option of abstention from play. Importantly, we are the first to achieve\nbounds on the expected cumulative reward for general confidence-rated\npredictors. In the special case of specialists we achieve a novel reward bound,\nsignificantly improving previous bounds of SpecialistExp (treating abstention\nas another action). As an example application, we discuss learning unions of\nballs in a finite metric space. In this contextual setting, we devise an\nefficient implementation of CBA, reducing the runtime from quadratic to almost\nlinear in the number of contexts. 
Preliminary experiments show that CBA\nimproves over existing bandit algorithms.\n","authors":["Stephen Pasteris","Alberto Rumi","Maximilian Thiessen","Shota Saito","Atsushi Miyauchi","Fabio Vitale","Mark Herbster"],"pdf_url":"https://arxiv.org/pdf/2402.14585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14803v3","updated":"2024-11-12T14:57:08Z","published":"2024-10-18T18:19:56Z","title":"DistRL: An Asynchronous Distributed Reinforcement Learning Framework for\n On-Device Control Agents","summary":" On-device control agents, especially on mobile devices, are responsible for\noperating mobile devices to fulfill users' requests, enabling seamless and\nintuitive interactions. Integrating Multimodal Large Language Models (MLLMs)\ninto these agents enhances their ability to understand and execute complex\ncommands, thereby improving user experience. However, fine-tuning MLLMs for\non-device control presents significant challenges due to limited data\navailability and inefficient online training processes. This paper introduces\nDistRL, a novel framework designed to enhance the efficiency of online RL\nfine-tuning for mobile device control agents. DistRL employs centralized\ntraining and decentralized data acquisition to ensure efficient fine-tuning in\nthe context of dynamic online interactions. Additionally, the framework is\nbacked by our tailor-made RL algorithm, which effectively balances exploration\nwith the prioritized utilization of collected data to ensure stable and robust\ntraining. Our experiments show that, on average, DistRL delivers a 3X\nimprovement in training efficiency and enables training data collection 2.4X\nfaster than the leading synchronous multi-machine methods. Notably, after\ntraining, DistRL achieves a 20% relative improvement in success rate compared\nto state-of-the-art methods on general Android tasks from an open benchmark,\nsignificantly outperforming existing approaches while maintaining the same\ntraining time. 
These results validate DistRL as a scalable and efficient\nsolution, offering substantial improvements in both training efficiency and\nagent performance for real-world, in-the-wild device control tasks.\n","authors":["Taiyi Wang","Zhihao Wu","Jianheng Liu","Jianye Hao","Jun Wang","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2410.14803v3.pdf","comment":"Paper and Appendix, 25 pages"},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. 
Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07837v1","updated":"2024-11-12T14:41:07Z","published":"2024-11-12T14:41:07Z","title":"FRUGAL: Memory-Efficient Optimization by Reducing State Overhead for\n Scalable Training","summary":" With the increase in the number of parameters in large language models, the\nprocess of pre-training and fine-tuning increasingly demands larger volumes of\nGPU memory. A significant portion of this memory is typically consumed by the\noptimizer state. To overcome this challenge, recent approaches such as low-rank\nadaptation (LoRA (Hu et al., 2021)), low-rank gradient projection (GaLore (Zhao\net al., 2024)), and blockwise optimization (BAdam (Luo et al., 2024)) have been\nproposed. However, in all these algorithms, the $\\textit{effective rank of the\nweight updates remains low-rank}$, which can lead to a substantial loss of\ninformation from the gradient. This loss can be critically important,\nespecially during the pre-training stage. In this paper, we introduce\n$\\texttt{FRUGAL}$ ($\\textbf{F}$ull-$\\textbf{R}$ank $\\textbf{U}$pdates with\n$\\textbf{G}$r$\\textbf{A}$dient sp$\\textbf{L}$itting), a new memory-efficient\noptimization framework. $\\texttt{FRUGAL}$ leverages gradient splitting to\nperform low-dimensional updates using advanced algorithms (such as Adam), while\nupdates along the remaining directions are executed via state-free methods like\nSGD or signSGD (Bernstein et al., 2018). 
Our framework can be integrated with\nvarious low-rank update selection techniques, including GaLore and BAdam. We\nprovide theoretical convergence guarantees for our framework when using SGDM\nfor low-dimensional updates and SGD for state-free updates. Additionally, our\nmethod consistently outperforms concurrent approaches across various fixed\nmemory budgets, achieving state-of-the-art results in pre-training and\nfine-tuning tasks while balancing memory efficiency and performance metrics.\n","authors":["Philip Zmushko","Aleksandr Beznosikov","Martin Takáč","Samuel Horváth"],"pdf_url":"https://arxiv.org/pdf/2411.07837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12550v2","updated":"2024-11-12T14:39:29Z","published":"2024-07-17T13:31:13Z","title":"UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal\n Trajectory Embeddings","summary":" Spatiotemporal trajectories are sequences of timestamped locations, which\nenable a variety of analyses that in turn enable important real-world\napplications. It is common to map trajectories to vectors, called embeddings,\nbefore subsequent analyses. Thus, the qualities of embeddings are very\nimportant. Methods for pre-training embeddings, which leverage unlabeled\ntrajectories for training universal embeddings, have shown promising\napplicability across different tasks, thus attracting considerable interest.\nHowever, research progress on this topic faces two key challenges: a lack of a\ncomprehensive overview of existing methods, resulting in several related\nmethods not being well-recognized, and the absence of a unified pipeline,\ncomplicating the development of new methods and the analysis of methods.\n We present UniTE, a survey and a unified pipeline for this domain. In doing\nso, we present a comprehensive list of existing methods for pre-training\ntrajectory embeddings, which includes methods that either explicitly or\nimplicitly employ pre-training techniques. 
Further, we present a unified and\nmodular pipeline with publicly available underlying code, simplifying the\nprocess of constructing and evaluating methods for pre-training trajectory\nembeddings. Additionally, we contribute a selection of experimental results\nusing the proposed pipeline on real-world datasets. Implementation of the\npipeline is publicly available at https://github.com/Logan-Lin/UniTE.\n","authors":["Yan Lin","Zeyu Zhou","Yicheng Liu","Haochen Lv","Haomin Wen","Tianyi Li","Yushuai Li","Christian S. Jensen","Shengnan Guo","Youfang Lin","Huaiyu Wan"],"pdf_url":"https://arxiv.org/pdf/2407.12550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07832v1","updated":"2024-11-12T14:27:45Z","published":"2024-11-12T14:27:45Z","title":"Dynamical-VAE-based Hindsight to Learn the Causal Dynamics of\n Factored-POMDPs","summary":" Learning representations of underlying environmental dynamics from partial\nobservations is a critical challenge in machine learning. In the context of\nPartially Observable Markov Decision Processes (POMDPs), state representations\nare often inferred from the history of past observations and actions. We\ndemonstrate that incorporating future information is essential to accurately\ncapture causal dynamics and enhance state representations. To address this, we\nintroduce a Dynamical Variational Auto-Encoder (DVAE) designed to learn causal\nMarkovian dynamics from offline trajectories in a POMDP. Our method employs an\nextended hindsight framework that integrates past, current, and multi-step\nfuture information within a factored-POMDP setting. 
Empirical results reveal\nthat this approach uncovers the causal graph governing hidden state transitions\nmore effectively than history-based and typical hindsight-based models.\n","authors":["Chao Han","Debabrota Basu","Michael Mangan","Eleni Vasilaki","Aditya Gilra"],"pdf_url":"https://arxiv.org/pdf/2411.07832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07828v1","updated":"2024-11-12T14:23:52Z","published":"2024-11-12T14:23:52Z","title":"Suite-IN: Aggregating Motion Features from Apple Suite for Robust\n Inertial Navigation","summary":" With the rapid development of wearable technology, devices like smartphones,\nsmartwatches, and headphones equipped with IMUs have become essential for\napplications such as pedestrian positioning. However, traditional pedestrian\ndead reckoning (PDR) methods struggle with diverse motion patterns, while\nrecent data-driven approaches, though improving accuracy, often lack robustness\ndue to reliance on a single device.In our work, we attempt to enhance the\npositioning performance using the low-cost commodity IMUs embedded in the\nwearable devices. 
We propose a multi-device deep learning framework named\nSuite-IN, aggregating motion data from Apple Suite for inertial navigation.\nMotion data captured by sensors on different body parts contains both local and\nglobal motion information, making it essential to reduce the negative effects\nof localized movements and extract global motion representations from multiple\ndevices.\n","authors":["Lan Sun","Songpengcheng Xia","Junyuan Deng","Jiarui Yang","Zengyuan Lai","Qi Wu","Ling Pei"],"pdf_url":"https://arxiv.org/pdf/2411.07828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07826v1","updated":"2024-11-12T14:22:16Z","published":"2024-11-12T14:22:16Z","title":"Efficient Federated Finetuning of Tiny Transformers with\n Resource-Constrained Devices","summary":" In recent years, Large Language Models (LLMs) through Transformer structures\nhave dominated many machine learning tasks, especially text processing.\nHowever, these models require massive amounts of data for training and induce\nhigh resource requirements, particularly in terms of the large number of\nFloating Point Operations (FLOPs) and the high amounts of memory needed. To\nfine-tune such a model in a parameter-efficient way, techniques like Adapter or\nLoRA have been developed. However, we observe that the application of LoRA,\nwhen used in federated learning (FL), while still being parameter-efficient, is\nmemory and FLOP inefficient. 
Based on that observation, we develop a novel\nlayer finetuning scheme that allows devices in cross-device FL to make use of\npretrained neural networks (NNs) while adhering to given resource constraints.\nWe show that our presented scheme outperforms the current state of the art when\ndealing with homogeneous or heterogeneous computation and memory constraints\nand is on par with LoRA regarding limited communication, thereby achieving\nsignificantly higher accuracies in FL training.\n","authors":["Kilian Pfeiffer","Mohamed Aboelenien Ahmed","Ramin Khalili","Jörg Henkel"],"pdf_url":"https://arxiv.org/pdf/2411.07826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03163v2","updated":"2024-11-12T14:18:51Z","published":"2024-11-05T15:07:20Z","title":"Efficient Hamiltonian, structure and trace distance learning of Gaussian\n states","summary":" In this work, we initiate the study of Hamiltonian learning for positive\ntemperature bosonic Gaussian states, the quantum generalization of the widely\nstudied problem of learning Gaussian graphical models. We obtain efficient\nprotocols, both in sample and computational complexity, for the task of\ninferring the parameters of their underlying quadratic Hamiltonian under the\nassumption of bounded temperature, squeezing, displacement and maximal degree\nof the interaction graph. Our protocol only requires heterodyne measurements,\nwhich are often experimentally feasible, and has a sample complexity that\nscales logarithmically with the number of modes. Furthermore, we show that it\nis possible to learn the underlying interaction graph in a similar setting and\nsample complexity. Taken together, our results put the status of the quantum\nHamiltonian learning problem for continuous variable systems in a much more\nadvanced state when compared to spins, where state-of-the-art results are\neither unavailable or quantitatively inferior to ours. 
In addition, we use our\ntechniques to obtain the first results on learning Gaussian states in trace\ndistance with a quadratic scaling in precision and polynomial in the number of\nmodes, albeit imposing certain restrictions on the Gaussian states. Our main\ntechnical innovations are several continuity bounds for the covariance and\nHamiltonian matrix of a Gaussian state, which are of independent interest,\ncombined with what we call the local inversion technique. In essence, the local\ninversion technique allows us to reliably infer the Hamiltonian of a Gaussian\nstate by only estimating in parallel submatrices of the covariance matrix whose\nsize scales with the desired precision, but not the number of modes. This way\nwe bypass the need to obtain precise global estimates of the covariance matrix,\ncontrolling the sample complexity.\n","authors":["Marco Fanizza","Cambyse Rouzé","Daniel Stilck França"],"pdf_url":"https://arxiv.org/pdf/2411.03163v2.pdf","comment":"43 pages, 1 figure. Corrections to Lemma 4.1. Main results are\n unchanged"},{"id":"http://arxiv.org/abs/2411.07816v1","updated":"2024-11-12T14:09:16Z","published":"2024-11-12T14:09:16Z","title":"Dual-Criterion Model Aggregation in Federated Learning: Balancing Data\n Quantity and Quality","summary":" Federated learning (FL) has become one of the key methods for\nprivacy-preserving collaborative learning, as it enables the transfer of models\nwithout requiring local data exchange. Within the FL framework, an aggregation\nalgorithm is recognized as one of the most crucial components for ensuring the\nefficacy and security of the system. Existing average aggregation algorithms\ntypically assume that all client-trained data holds equal value or that weights\nare based solely on the quantity of data contributed by each client. In\ncontrast, alternative approaches involve training the model locally after\naggregation to enhance adaptability. 
However, these approaches fundamentally\nignore the inherent heterogeneity between different clients' data and the\ncomplexity of variations in data at the aggregation stage, which may lead to a\nsuboptimal global model.\n To address these issues, this study proposes a novel dual-criterion weighted\naggregation algorithm involving the quantity and quality of data from the\nclient node. Specifically, we quantify the data used for training and perform\nmultiple rounds of local model inference accuracy evaluation on a specialized\ndataset to assess the data quality of each client. These two factors are\nutilized as weights within the aggregation process, applied through a\ndynamically weighted summation of these two factors. This approach allows the\nalgorithm to adaptively adjust the weights, ensuring that every client can\ncontribute to the global model, regardless of their data's size or initial\nquality. Our experiments show that the proposed algorithm outperforms several\nexisting state-of-the-art aggregation approaches on both a general-purpose\nopen-source dataset, CIFAR-10, and a dataset specific to visual obstacle\navoidance.\n","authors":["Haizhou Zhang","Xianjia Yu","Tomi Westerlund"],"pdf_url":"https://arxiv.org/pdf/2411.07816v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2411.07806v1","updated":"2024-11-12T14:01:08Z","published":"2024-11-12T14:01:08Z","title":"Federated Low-Rank Adaptation with Differential Privacy over Wireless\n Networks","summary":" Fine-tuning large pre-trained foundation models (FMs) on distributed edge\ndevices presents considerable computational and privacy challenges. Federated\nfine-tuning (FedFT) mitigates some privacy issues by facilitating collaborative\nmodel training without the need to share raw data. To lessen the computational\nburden on resource-limited devices, combining low-rank adaptation (LoRA) with\nfederated learning enables parameter-efficient fine-tuning. 
Additionally, the\nsplit FedFT architecture partitions an FM between edge devices and a central\nserver, reducing the necessity for complete model deployment on individual\ndevices. However, the risk of privacy eavesdropping attacks in FedFT remains a\nconcern, particularly in sensitive areas such as healthcare and finance. In\nthis paper, we propose a split FedFT framework with differential privacy (DP)\nover wireless networks, where the inherent wireless channel noise in the uplink\ntransmission is utilized to achieve DP guarantees without adding an extra\nartificial noise. We shall investigate the impact of the wireless noise on\nconvergence performance of the proposed framework. We will also show that by\nupdating only one of the low-rank matrices in the split FedFT with DP, the\nproposed method can mitigate the noise amplification effect. Simulation results\nwill demonstrate that the proposed framework achieves higher accuracy under\nstrict privacy budgets compared to baseline methods.\n","authors":["Tianqu Kang","Zixin Wang","Hengtao He","Jun Zhang","Shenghui Song","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2411.07806v1.pdf","comment":"6 pages, 3 figures, submitted to IEEE ICC 2025"},{"id":"http://arxiv.org/abs/2408.12970v2","updated":"2024-11-12T13:56:33Z","published":"2024-08-23T10:36:08Z","title":"SUMO: Search-Based Uncertainty Estimation for Model-Based Offline\n Reinforcement Learning","summary":" The performance of offline reinforcement learning (RL) suffers from the\nlimited size and quality of static datasets. Model-based offline RL addresses\nthis issue by generating synthetic samples through a dynamics model to enhance\noverall performance. To evaluate the reliability of the generated samples,\nuncertainty estimation methods are often employed. However, model ensemble, the\nmost commonly used uncertainty estimation method, is not always the best\nchoice. 
In this paper, we propose a \\textbf{S}earch-based \\textbf{U}ncertainty\nestimation method for \\textbf{M}odel-based \\textbf{O}ffline RL (SUMO) as an\nalternative. SUMO characterizes the uncertainty of synthetic samples by\nmeasuring their cross entropy against the in-distribution dataset samples, and\nuses an efficient search-based method for implementation. In this way, SUMO can\nachieve trustworthy uncertainty estimation. We integrate SUMO into several\nmodel-based offline RL algorithms including MOPO and Adapted MOReL (AMOReL),\nand provide theoretical analysis for them. Extensive experimental results on\nD4RL datasets demonstrate that SUMO can provide more accurate uncertainty\nestimation and boost the performance of base algorithms. These indicate that\nSUMO could be a better uncertainty estimator for model-based offline RL when\nused in either reward penalty or trajectory truncation. Our code is available\nand will be open-source for further research and development.\n","authors":["Zhongjian Qiao","Jiafei Lyu","Kechen Jiao","Qi Liu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2408.12970v2.pdf","comment":"Submitted to AAAI2025"},{"id":"http://arxiv.org/abs/2411.07800v1","updated":"2024-11-12T13:54:13Z","published":"2024-11-12T13:54:13Z","title":"Kernel-based retrieval models for hyperspectral image data optimized\n with Kernel Flows","summary":" Kernel-based statistical methods are efficient, but their performance depends\nheavily on the selection of kernel parameters. In literature, the optimization\nstudies on kernel-based chemometric methods is limited and often reduced to\ngrid searching. Previously, the authors introduced Kernel Flows (KF) to learn\nkernel parameters for Kernel Partial Least-Squares (K-PLS) regression. KF is\neasy to implement and helps minimize overfitting. In cases of high collinearity\nbetween spectra and biogeophysical quantities in spectroscopy, simpler methods\nlike Principal Component Regression (PCR) may be more suitable. 
In this study,\nwe propose a new KF-type approach to optimize Kernel Principal Component\nRegression (K-PCR) and test it alongside KF-PLS. Both methods are benchmarked\nagainst non-linear regression techniques using two hyperspectral remote sensing\ndatasets.\n","authors":["Zina-Sabrina Duma","Tuomas Sihvonen","Jouni Susiluoto","Otto Lamminpää","Heikki Haario","Satu-Pia Reinikainen"],"pdf_url":"https://arxiv.org/pdf/2411.07800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08770v3","updated":"2024-11-12T13:50:05Z","published":"2024-08-16T14:25:20Z","title":"Pessimistic Iterative Planning for Robust POMDPs","summary":" Robust POMDPs extend classical POMDPs to handle model uncertainty.\nSpecifically, robust POMDPs exhibit so-called uncertainty sets on the\ntransition and observation models, effectively defining ranges of\nprobabilities. Policies for robust POMDPs must be (1) memory-based to account\nfor partial observability and (2) robust against model uncertainty to account\nfor the worst-case instances from the uncertainty sets. To compute such robust\nmemory-based policies, we propose the pessimistic iterative planning (PIP)\nframework, which alternates between two main steps: (1) selecting a pessimistic\n(non-robust) POMDP via worst-case probability instances from the uncertainty\nsets; and (2) computing a finite-state controller (FSC) for this pessimistic\nPOMDP. We evaluate the performance of this FSC on the original robust POMDP and\nuse this evaluation in step (1) to select the next pessimistic POMDP. Within\nPIP, we propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC\nthrough a recurrent neural network by using supervision policies optimized for\nthe pessimistic POMDP. The empirical evaluation in four benchmark environments\nshowcases improved robustness against several baseline methods and competitive\nperformance compared to a state-of-the-art robust POMDP solver.\n","authors":["Maris F. L. Galesloot","Marnix Suilen","Thiago D. 
Simão","Steven Carr","Matthijs T. J. Spaan","Ufuk Topcu","Nils Jansen"],"pdf_url":"https://arxiv.org/pdf/2408.08770v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07796v1","updated":"2024-11-12T13:46:58Z","published":"2024-11-12T13:46:58Z","title":"PatchCTG: Patch Cardiotocography Transformer for Antepartum Fetal Health\n Monitoring","summary":" Antepartum Cardiotocography (CTG) is vital for fetal health monitoring, but\ntraditional methods like the Dawes-Redman system are often limited by high\ninter-observer variability, leading to inconsistent interpretations and\npotential misdiagnoses. This paper introduces PatchCTG, a transformer-based\nmodel specifically designed for CTG analysis, employing patch-based\ntokenisation, instance normalisation and channel-independent processing to\ncapture essential local and global temporal dependencies within CTG signals.\nPatchCTG was evaluated on the Oxford Maternity (OXMAT) dataset, comprising over\n20,000 CTG traces across diverse clinical outcomes after applying the inclusion\nand exclusion criteria. With extensive hyperparameter optimisation, PatchCTG\nachieved an AUC of 77%, with specificity of 88% and sensitivity of 57% at\nYouden's index threshold, demonstrating adaptability to various clinical needs.\nTesting across varying temporal thresholds showed robust predictive\nperformance, particularly with finetuning on data closer to delivery, achieving\na sensitivity of 52% and specificity of 88% for near-delivery cases. These\nfindings suggest the potential of PatchCTG to enhance clinical decision-making\nin antepartum care by providing a reliable, objective tool for fetal health\nassessment. The source code is available at\nhttps://github.com/jaleedkhan/PatchCTG.\n","authors":["M. 
Jaleed Khan","Manu Vatish","Gabriel Davis Jones"],"pdf_url":"https://arxiv.org/pdf/2411.07796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00426v2","updated":"2024-11-12T13:41:47Z","published":"2024-08-01T09:57:48Z","title":"A Cross-Domain Benchmark for Active Learning","summary":" Active Learning (AL) deals with identifying the most informative samples for\nlabeling to reduce data annotation costs for supervised learning tasks. AL\nresearch suffers from the fact that lifts from literature generalize poorly and\nthat only a small number of repetitions of experiments are conducted. To\novercome these obstacles, we propose CDALBench, the first active learning\nbenchmark which includes tasks in computer vision, natural language processing\nand tabular learning. Furthermore, by providing an efficient, greedy oracle,\nCDALBench can be evaluated with 50 runs for each experiment. We show, that both\nthe cross-domain character and a large amount of repetitions are crucial for\nsophisticated evaluation of AL research. Concretely, we show that the\nsuperiority of specific methods varies over the different domains, making it\nimportant to evaluate Active Learning with a cross-domain benchmark.\nAdditionally, we show that having a large amount of runs is crucial. With only\nconducting three runs as often done in the literature, the superiority of\nspecific methods can strongly vary with the specific runs. This effect is so\nstrong, that, depending on the seed, even a well-established method's\nperformance can be significantly better and significantly worse than random for\nthe same dataset.\n","authors":["Thorben Werner","Johannes Burchert","Maximilian Stubbemann","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2408.00426v2.pdf","comment":"Accepted at NeurIPS 24 in the Benchmarks and Datasets Track. 
Updated\n version of paper \"Toward Comparable Active Learning\" (arXiv:2311.18356).\n \"Toward Comparable Active Learning\" is deprecated, please use this version.\n arXiv admin note: text overlap with arXiv:2311.18356; text overlap with\n arXiv:2301.10625 by other authors"},{"id":"http://arxiv.org/abs/2310.04361v4","updated":"2024-11-12T13:35:37Z","published":"2023-10-06T16:34:51Z","title":"Exploiting Activation Sparsity with Dense to Dynamic-k\n Mixture-of-Experts Conversion","summary":" Transformer models can face practical limitations due to their high\ncomputational requirements. At the same time, such models exhibit significant\nactivation sparsity, which can be leveraged to reduce the inference cost by\nconverting parts of the network into equivalent Mixture-of-Experts (MoE)\nlayers. Despite the crucial role played by activation sparsity, its impact on\nthis process remains unexplored. We demonstrate that the efficiency of the\nconversion can be significantly enhanced by a proper regularization of the\nactivation sparsity of the base model. Moreover, motivated by the high variance\nof the number of activated neurons for different inputs, we introduce a more\neffective dynamic-$k$ expert selection rule that adjusts the number of executed\nexperts on a per-token basis. To achieve further savings, we extend this\napproach to multi-head attention projections. Finally, we develop an efficient\nimplementation that translates these computational savings into actual\nwall-clock speedup. 
The proposed method, Dense to Dynamic-$k$\nMixture-of-Experts (D2DMoE), outperforms existing approaches on common NLP and\nvision tasks, reducing inference cost by up to 60% without significantly\nimpacting performance.\n","authors":["Filip Szatkowski","Bartosz Wójcik","Mikołaj Piórczyński","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2310.04361v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07784v1","updated":"2024-11-12T13:33:26Z","published":"2024-11-12T13:33:26Z","title":"Interaction Asymmetry: A General Principle for Learning Composable\n Abstractions","summary":" Learning disentangled representations of concepts and re-composing them in\nunseen ways is crucial for generalizing to out-of-domain situations. However,\nthe underlying properties of concepts that enable such disentanglement and\ncompositional generalization remain poorly understood. In this work, we propose\nthe principle of interaction asymmetry which states: \"Parts of the same concept\nhave more complex interactions than parts of different concepts\". We formalize\nthis via block diagonality conditions on the $(n+1)$th order derivatives of the\ngenerator mapping concepts to observed data, where different orders of\n\"complexity\" correspond to different $n$. Using this formalism, we prove that\ninteraction asymmetry enables both disentanglement and compositional\ngeneralization. Our results unify recent theoretical results for learning\nconcepts of objects, which we show are recovered as special cases with\n$n\\!=\\!0$ or $1$. We provide results for up to $n\\!=\\!2$, thus extending these\nprior works to more flexible generator functions, and conjecture that the same\nproof strategies generalize to larger $n$. Practically, our theory suggests\nthat, to disentangle concepts, an autoencoder should penalize its latent\ncapacity and the interactions between concepts during decoding. 
We propose an\nimplementation of these criteria using a flexible Transformer-based VAE, with a\nnovel regularizer on the attention weights of the decoder. On synthetic image\ndatasets consisting of objects, we provide evidence that this model can achieve\ncomparable object disentanglement to existing models that use more explicit\nobject-centric priors.\n","authors":["Jack Brady","Julius von Kügelgen","Sébastien Lachapelle","Simon Buchholz","Thomas Kipf","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2411.07784v1.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2408.08074v2","updated":"2024-11-12T13:26:39Z","published":"2024-08-15T11:01:35Z","title":"A Survey on Integrated Sensing, Communication, and Computation","summary":" The forthcoming generation of wireless technology, 6G, aims to usher in an\nera of ubiquitous intelligent services, where everything is interconnected and\nintelligent. This vision requires the seamless integration of three fundamental\nmodules: Sensing for information acquisition, communication for information\nsharing, and computation for information processing and decision-making. These\nmodules are intricately linked, especially in complex tasks such as edge\nlearning and inference. However, the performance of these modules is\ninterdependent, creating a resource competition for time, energy, and\nbandwidth. Existing techniques like integrated communication and computation\n(ICC), integrated sensing and computation (ISC), and integrated sensing and\ncommunication (ISAC) have made partial strides in addressing this challenge,\nbut they fall short of meeting the extreme performance requirements. To\novercome these limitations, it is essential to develop new techniques that\ncomprehensively integrate sensing, communication, and computation. This\nintegrated approach, known as Integrated Sensing, Communication, and\nComputation (ISCC), offers a systematic perspective for enhancing task\nperformance. 
This paper begins with a comprehensive survey of historic and\nrelated techniques such as ICC, ISC, and ISAC, highlighting their strengths and\nlimitations. It then discusses the benefits, functions, and challenges of ISCC.\nSubsequently, the state-of-the-art signal designs for ISCC, along with network\nresource management strategies specifically tailored for ISCC are explored.\nFurthermore, this paper discusses the exciting research opportunities that lie\nahead for implementing ISCC in future advanced networks, and the unresolved\nissues requiring further investigation. ISCC is expected to unlock the full\npotential of intelligent connectivity, paving the way for groundbreaking\napplications and services.\n","authors":["Dingzhu Wen","Yong Zhou","Xiaoyang Li","Yuanming Shi","Kaibin Huang","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2408.08074v2.pdf","comment":"In this version, a series of discussions have been added.The\n benefits, functions, and challenges of ISCC are investigated using a new\n section. Moreover, the unresolved issues of ISCC have been discussed"},{"id":"http://arxiv.org/abs/2411.07773v1","updated":"2024-11-12T13:14:09Z","published":"2024-11-12T13:14:09Z","title":"Likelihood as a Performance Gauge for Retrieval-Augmented Generation","summary":" Recent work finds that retrieval-augmented generation with large language\nmodels is prone to be influenced by the order of retrieved documents in the\ncontext. However, the lack of in-depth analysis limits the use of this\nphenomenon for prompt engineering in practice. In this study, we posit that\nlikelihoods serve as an effective gauge for language model performance. Through\nexperiments on two question-answering datasets with a variety of\nstate-of-the-art language models, we reveal correlations between answer\naccuracy and the likelihood of the question at both the corpus level and the\ninstance level. 
In addition, we find that question likelihood can also indicate\nthe position of the task-relevant information in the context. Based on these\nfindings, we propose two methods that use question likelihood as a gauge for\nselecting and constructing prompts that lead to better performance. We\ndemonstrate their effectiveness with experiments. In addition, our\nlikelihood-based methods are efficient, as they only need to compute the\nlikelihood of the input, requiring much fewer language model passes than\nheuristic prompt engineering methods that require generating responses. Our\nanalysis deepens our understanding of how input prompts affect model\nperformance and provides a promising direction for efficient prompt\noptimization.\n","authors":["Tianyu Liu","Jirui Qi","Paul He","Arianna Bisazza","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07773v1.pdf","comment":"Under review at NAACL 2025. Code is available at\n https://github.com/lyutyuh/poptimizer"},{"id":"http://arxiv.org/abs/2411.07772v1","updated":"2024-11-12T13:13:20Z","published":"2024-11-12T13:13:20Z","title":"Automatic Album Sequencing","summary":" Album sequencing is a critical part of the album production process.\nRecently, a data-driven approach was proposed that sequences general\ncollections of independent media by extracting the narrative essence of the\nitems in the collections. While this approach implies an album sequencing\ntechnique, it is not widely accessible to a less technical audience, requiring\nadvanced knowledge of machine learning techniques to use. To address this, we\nintroduce a new user-friendly web-based tool that allows a less technical\naudience to upload music tracks, execute this technique in one click, and\nsubsequently presents the result in a clean visualization to the user. 
To both\nincrease the number of templates available to the user and address shortcomings\nof previous work, we also introduce a new direct transformer-based album\nsequencing method. We find that our more direct method outperforms a random\nbaseline but does not reach the same performance as the narrative essence\napproach. Both methods are included in our web-based user interface, and this\n-- alongside a full copy of our implementation -- is publicly available at\nhttps://github.com/dylanashley/automatic-album-sequencing\n","authors":["Vincent Herrmann","Dylan R. Ashley","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2411.07772v1.pdf","comment":"presented as a late breaking demo in the 25th International Society\n for Music Information Retrieval Conference; 3 pages in main text, 3 figures\n in main text; source code available at\n https://github.com/dylanashley/automatic-album-sequencing"},{"id":"http://arxiv.org/abs/2411.07762v1","updated":"2024-11-12T12:52:04Z","published":"2024-11-12T12:52:04Z","title":"ASER: Activation Smoothing and Error Reconstruction for Large Language\n Model Quantization","summary":" Quantization stands as a pivotal technique for large language model (LLM)\nserving, yet it poses significant challenges particularly in achieving\neffective low-bit quantization. The limited numerical mapping makes the\nquantized model produce a non-trivial error, bringing out intolerable\nperformance degration. This paper is anchored in the basic idea of model\ncompression objectives, and delves into the layer-wise error distribution of\nLLMs during post-training quantization. Subsequently, we introduce ASER, an\nalgorithm consisting of (1) Error Reconstruction: low-rank compensation for\nquantization error with LoRA-style matrices constructed by whitening SVD; (2)\nActivation Smoothing: outlier extraction to gain smooth activation and better\nerror compensation. 
ASER is capable of quantizing typical LLMs to low-bit ones,\nparticularly preserving accuracy even in W4A8 per-channel setup. Experimental\nresults show that ASER is competitive among the state-of-the-art quantization\nalgorithms, showing potential to activation quantization, with minor overhead.\n","authors":["Weibo Zhao","Yubin Shi","Xinyu Lyu","Wanchen Sui","Shen Li","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.07762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07760v1","updated":"2024-11-12T12:49:41Z","published":"2024-11-12T12:49:41Z","title":"Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit\n Q-Learning","summary":" Offline Reinforcement Learning (RL) has emerged as a powerful alternative to\nimitation learning for behavior modeling in various domains, particularly in\ncomplex navigation tasks. An existing challenge with Offline RL is the\nsignal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to\nerrors in value estimates. Towards this, multiple works have demonstrated the\nadvantage of hierarchical offline RL methods, which decouples high-level path\nplanning from low-level path following. In this work, we present a novel\nhierarchical transformer-based approach leveraging a learned quantizer of the\nspace. This quantization enables the training of a simpler zone-conditioned\nlow-level policy and simplifies planning, which is reduced to discrete\nautoregressive prediction. Among other benefits, zone-level reasoning in\nplanning enables explicit trajectory stitching rather than implicit stitching\nbased on noisy value function estimates. By combining this transformer-based\nplanner with recent advancements in offline RL, our proposed approach achieves\nstate-of-the-art results in complex long-distance navigation environments.\n","authors":["Alexi Canesse","Mathieu Petitbois","Ludovic Denoyer","Sylvain Lamprier","Rémy Portelas"],"pdf_url":"https://arxiv.org/pdf/2411.07760v1.pdf","comment":"Under review. 
Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2411.02199v4","updated":"2024-11-12T12:44:02Z","published":"2024-11-04T15:54:32Z","title":"Provably Transformers Harness Multi-Concept Word Semantics for Efficient\n In-Context Learning","summary":" Transformer-based large language models (LLMs) have displayed remarkable\ncreative prowess and emergence capabilities. Existing empirical studies have\nrevealed a strong connection between these LLMs' impressive emergence abilities\nand their in-context learning (ICL) capacity, allowing them to solve new tasks\nusing only task-specific prompts without further fine-tuning. On the other\nhand, existing empirical and theoretical studies also show that there is a\nlinear regularity of the multi-concept encoded semantic representation behind\ntransformer-based LLMs. However, existing theoretical work fail to build up an\nunderstanding of the connection between this regularity and the innovative\npower of ICL. Additionally, prior work often focuses on simplified, unrealistic\nscenarios involving linear transformers or unrealistic loss functions, and they\nachieve only linear or sub-linear convergence rates. In contrast, this work\nprovides a fine-grained mathematical analysis to show how transformers leverage\nthe multi-concept semantics of words to enable powerful ICL and excellent\nout-of-distribution ICL abilities, offering insights into how transformers\ninnovate solutions for certain unseen tasks encoded with multiple cross-concept\nsemantics. Inspired by empirical studies on the linear latent geometry of LLMs,\nthe analysis is based on a concept-based low-noise sparse coding prompt model.\nLeveraging advanced techniques, this work showcases the exponential 0-1 loss\nconvergence over the highly non-convex training dynamics, which pioneeringly\nincorporates the challenges of softmax self-attention, ReLU-activated MLPs, and\ncross-entropy loss. 
Empirical simulations corroborate the theoretical findings.\n","authors":["Dake Bu","Wei Huang","Andi Han","Atsushi Nitanda","Taiji Suzuki","Qingfu Zhang","Hau-San Wong"],"pdf_url":"https://arxiv.org/pdf/2411.02199v4.pdf","comment":"Accepted by the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2404.19664v4","updated":"2024-11-12T12:43:42Z","published":"2024-04-30T15:57:41Z","title":"Towards Generalist Robot Learning from Internet Video: A Survey","summary":" Scaling deep learning to massive, diverse internet data has yielded\nremarkably general capabilities in visual and natural language understanding\nand generation. However, data has remained scarce and challenging to collect in\nrobotics, seeing robot learning struggle to obtain similarly general\ncapabilities. Promising Learning from Videos (LfV) methods aim to address the\nrobotics data bottleneck by augmenting traditional robot data with large-scale\ninternet video data. This video data offers broad foundational information\nregarding physical behaviour and the underlying physics of the world, and thus\ncan be highly informative for a generalist robot.\n In this survey, we present a thorough overview of the emerging field of LfV.\nWe outline fundamental concepts, including the benefits and challenges of LfV.\nWe provide a comprehensive review of current methods for extracting knowledge\nfrom large-scale internet video, addressing key challenges in LfV, and boosting\ndownstream robot and reinforcement learning via the use of video data. The\nsurvey concludes with a critical discussion of challenges and opportunities in\nLfV. Here, we advocate for scalable foundation model approaches that can\nleverage the full range of available internet video to improve the learning of\nrobot policies and dynamics models. 
We hope this survey can inform and catalyse\nfurther LfV research, driving progress towards the development of\ngeneral-purpose robots.\n","authors":["Robert McCarthy","Daniel C. H. Tan","Dominik Schmidt","Fernando Acero","Nathan Herr","Yilun Du","Thomas G. Thuruthel","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2404.19664v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07753v1","updated":"2024-11-12T12:24:48Z","published":"2024-11-12T12:24:48Z","title":"Spatially Regularized Graph Attention Autoencoder Framework for\n Detecting Rainfall Extremes","summary":" We introduce a novel Graph Attention Autoencoder (GAE) with spatial\nregularization to address the challenge of scalable anomaly detection in\nspatiotemporal rainfall data across India from 1990 to 2015. Our model\nleverages a Graph Attention Network (GAT) to capture spatial dependencies and\ntemporal dynamics in the data, further enhanced by a spatial regularization\nterm ensuring geographic coherence. We construct two graph datasets employing\nrainfall, pressure, and temperature attributes from the Indian Meteorological\nDepartment and ERA5 Reanalysis on Single Levels, respectively. Our network\noperates on graph representations of the data, where nodes represent geographic\nlocations, and edges, inferred through event synchronization, denote\nsignificant co-occurrences of rainfall events. Through extensive experiments,\nwe demonstrate that our GAE effectively identifies anomalous rainfall patterns\nacross the Indian landscape. 
Our work paves the way for sophisticated\nspatiotemporal anomaly detection methodologies in climate science, contributing\nto better climate change preparedness and response strategies.\n","authors":["Mihir Agarwal","Progyan Das","Udit Bhatia"],"pdf_url":"https://arxiv.org/pdf/2411.07753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06911v2","updated":"2024-11-12T12:07:00Z","published":"2024-11-11T12:13:58Z","title":"Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI","summary":" Segmentation of cardiac magnetic resonance images (MRI) is crucial for the\nanalysis and assessment of cardiac function, helping to diagnose and treat\nvarious cardiovascular diseases. Most recent techniques rely on deep learning\nand usually require an extensive amount of labeled data. To overcome this\nproblem, few-shot learning has the capability of reducing data dependency on\nlabeled data. In this work, we introduce a new method that merges few-shot\nlearning with a U-Net architecture and Gaussian Process Emulators (GPEs),\nenhancing data integration from a support set for improved performance. GPEs\nare trained to learn the relation between the support images and the\ncorresponding masks in latent space, facilitating the segmentation of unseen\nquery images given only a small labeled support set at inference. We test our\nmodel with the M&Ms-2 public dataset to assess its ability to segment the heart\nin cardiac magnetic resonance imaging from different orientations, and compare\nit with state-of-the-art unsupervised and few-shot methods. 
Our architecture\nshows higher DICE coefficients compared to these methods, especially in the\nmore challenging setups where the size of the support set is considerably\nsmall.\n","authors":["Bruno Viti","Franz Thaler","Kathrin Lisa Kapper","Martin Urschler","Martin Holler","Elias Karabelas"],"pdf_url":"https://arxiv.org/pdf/2411.06911v2.pdf","comment":"Accepted at Statistical Atlases and Computational Modeling of the\n Heart (STACOM) Workshop 2024"},{"id":"http://arxiv.org/abs/2411.07087v2","updated":"2024-11-12T12:03:07Z","published":"2024-11-11T16:04:49Z","title":"OCMDP: Observation-Constrained Markov Decision Process","summary":" In many practical applications, decision-making processes must balance the\ncosts of acquiring information with the benefits it provides. Traditional\ncontrol systems often assume full observability, an unrealistic assumption when\nobservations are expensive. We tackle the challenge of simultaneously learning\nobservation and control strategies in such cost-sensitive environments by\nintroducing the Observation-Constrained Markov Decision Process (OCMDP), where\nthe policy influences the observability of the true state. To manage the\ncomplexity arising from the combined observation and control actions, we\ndevelop an iterative, model-free deep reinforcement learning algorithm that\nseparates the sensing and control components of the policy. This decomposition\nenables efficient learning in the expanded action space by focusing on when and\nwhat to observe, as well as determining optimal control actions, without\nrequiring knowledge of the environment's dynamics. We validate our approach on\na simulated diagnostic task and a realistic healthcare environment using\nHeartPole. 
Given both scenarios, the experimental results demonstrate that our\nmodel achieves a substantial reduction in observation costs on average,\nsignificantly outperforming baseline methods by a notable margin in efficiency.\n","authors":["Taiyi Wang","Jianheng Liu","Bryan Lee","Zhihao Wu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2411.07087v2.pdf","comment":"Full paper, 14 Pages"},{"id":"http://arxiv.org/abs/2408.12308v3","updated":"2024-11-12T11:45:35Z","published":"2024-08-22T11:34:34Z","title":"Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on\n Supervised Regression (Preprint)","summary":" In this tutorial, we present a compact and holistic discussion of Deep\nLearning with a focus on Convolutional Neural Networks (CNNs) and supervised\nregression. While there are numerous books and articles on the individual\ntopics we cover, comprehensive and detailed tutorials that address Deep\nLearning from a foundational yet rigorous and accessible perspective are rare.\nMost resources on CNNs are either too advanced, focusing on cutting-edge\narchitectures, or too narrow, addressing only specific applications like image\nclassification.This tutorial not only summarizes the most relevant concepts but\nalso provides an in-depth exploration of each, offering a complete yet agile\nset of ideas. Moreover, we highlight the powerful synergy between learning\ntheory, statistic, and machine learning, which together underpin the Deep\nLearning and CNN frameworks. We aim for this tutorial to serve as an optimal\nresource for students, professors, and anyone interested in understanding the\nfoundations of Deep Learning. Upon acceptance we will provide an accompanying\nrepository under\n\\href{https://github.com/neoglez/deep-learning-tutorial}{https://github.com/neoglez/deep-learning-tutorial}\n Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine\nLearning.\n","authors":["Yansel Gonzalez Tejeda","Helmut A. 
Mayer"],"pdf_url":"https://arxiv.org/pdf/2408.12308v3.pdf","comment":"Submitted to the journal Machine Learning and Knowledge Extraction"},{"id":"http://arxiv.org/abs/2411.07729v1","updated":"2024-11-12T11:41:38Z","published":"2024-11-12T11:41:38Z","title":"Exploring the loss landscape of regularized neural networks via convex\n duality","summary":" We discuss several aspects of the loss landscape of regularized neural\nnetworks: the structure of stationary points, connectivity of optimal\nsolutions, path with nonincreasing loss to arbitrary global optimum, and the\nnonuniqueness of optimal solutions, by casting the problem into an equivalent\nconvex problem and considering its dual. Starting from two-layer neural\nnetworks with scalar output, we first characterize the solution set of the\nconvex problem using its dual and further characterize all stationary points.\nWith the characterization, we show that the topology of the global optima goes\nthrough a phase transition as the width of the network changes, and construct\ncounterexamples where the problem may have a continuum of optimal solutions.\nFinally, we show that the solution set characterization and connectivity\nresults can be extended to different architectures, including two-layer\nvector-valued neural networks and parallel three-layer neural networks.\n","authors":["Sungyoon Kim","Aaron Mishkin","Mert Pilanci"],"pdf_url":"https://arxiv.org/pdf/2411.07729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07724v1","updated":"2024-11-12T11:30:53Z","published":"2024-11-12T11:30:53Z","title":"Convergence Rate Analysis of LION","summary":" The LION (evoLved sIgn mOmeNtum) optimizer for deep neural network training\nwas found by Google via program search, with the simple sign update yet showing\nimpressive performance in training large scale networks. Although previous\nstudies have investigated its convergence properties, a comprehensive analysis,\nespecially the convergence rate, is still desirable. 
Recognizing that LION can\nbe regarded as solving a specific constrained problem, this paper focuses on\ndemonstrating its convergence to the Karush-Kuhn-Tucker (KKT) point at the rate\nof $\\cal O(\\sqrt{d}K^{-1/4})$ measured by gradient $\\ell_1$ norm, where $d$ is\nthe problem dimension and $K$ is the number of iteration steps. Step further,\nwe remove the constraint and establish that LION converges to the critical\npoint of the general unconstrained problem at the same rate. This rate not only\ndelivers the currently optimal dependence on the problem dimension $d$ but also\ntightly matches the theoretical lower bound for nonconvex stochastic\noptimization algorithms, which is typically measured using the gradient\n$\\ell_2$ norm, with respect to the number of iterations $K$. Through extensive\nexperiments, we not only demonstrate that LION achieves lower loss and higher\nperformance compared to standard SGD, but also empirically confirm that the\ngradient $\\ell_1/\\ell_2$ norm ratio aligns with $\\Theta(\\sqrt{d})$, thus\nproving that our convergence rate matches the theoretical lower bound with\nrespect to $d$ in the empirical sense.\n","authors":["Yiming Dong","Huan Li","Zhouchen Lin"],"pdf_url":"https://arxiv.org/pdf/2411.07724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07719v1","updated":"2024-11-12T11:24:18Z","published":"2024-11-12T11:24:18Z","title":"EMPERROR: A Flexible Generative Perception Error Model for Probing\n Self-Driving Planners","summary":" To handle the complexities of real-world traffic, learning planners for\nself-driving from data is a promising direction. While recent approaches have\nshown great progress, they typically assume a setting in which the ground-truth\nworld state is available as input. However, when deployed, planning needs to be\nrobust to the long-tail of errors incurred by a noisy perception system, which\nis often neglected in evaluation. 
To address this, previous work has proposed\ndrawing adversarial samples from a perception error model (PEM) mimicking the\nnoise characteristics of a target object detector. However, these methods use\nsimple PEMs that fail to accurately capture all failure modes of detection. In\nthis paper, we present EMPERROR, a novel transformer-based generative PEM,\napply it to stress-test an imitation learning (IL)-based planner and show that\nit imitates modern detectors more faithfully than previous work. Furthermore,\nit is able to produce realistic noisy inputs that increase the planner's\ncollision rate by up to 85%, demonstrating its utility as a valuable tool for a\nmore complete evaluation of self-driving planners.\n","authors":["Niklas Hanselmann","Simon Doll","Marius Cordts","Hendrik P. A. Lensch","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2411.07719v1.pdf","comment":"Project page: https://lasnik.github.io/emperror/"},{"id":"http://arxiv.org/abs/2405.07863v3","updated":"2024-11-12T11:18:43Z","published":"2024-05-13T15:50:39Z","title":"RLHF Workflow: From Reward Modeling to Online RLHF","summary":" We present the workflow of Online Iterative Reinforcement Learning from Human\nFeedback (RLHF) in this technical report, which is widely reported to\noutperform its offline counterpart by a large margin in the recent large\nlanguage model (LLM) literature. However, existing open-source RLHF projects\nare still largely confined to the offline learning setting. In this technical\nreport, we aim to fill in this gap and provide a detailed recipe that is easy\nto reproduce for online iterative RLHF. In particular, since online human\nfeedback is usually infeasible for open-source communities with limited\nresources, we start by constructing preference models using a diverse set of\nopen-source datasets and use the constructed proxy preference model to\napproximate human feedback. 
Then, we discuss the theoretical insights and\nalgorithmic principles behind online iterative RLHF, followed by a detailed\npractical implementation. Our trained LLM achieves impressive performance on\nLLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as\nwell as other academic benchmarks such as HumanEval and TruthfulQA. We have\nshown that supervised fine-tuning (SFT) and iterative RLHF can obtain\nstate-of-the-art performance with fully open-source datasets. Further, we have\nmade our models, curated datasets, and comprehensive step-by-step code\nguidebooks publicly available. Please refer to\nhttps://github.com/RLHFlow/RLHF-Reward-Modeling and\nhttps://github.com/RLHFlow/Online-RLHF for more detailed information.\n","authors":["Hanze Dong","Wei Xiong","Bo Pang","Haoxiang Wang","Han Zhao","Yingbo Zhou","Nan Jiang","Doyen Sahoo","Caiming Xiong","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.07863v3.pdf","comment":"Published in Transactions on Machine Learning Research (09/2024)"},{"id":"http://arxiv.org/abs/2201.07395v4","updated":"2024-11-12T11:12:46Z","published":"2022-01-19T03:08:33Z","title":"Overview frequency principle/spectral bias in deep learning","summary":" Understanding deep learning is increasingly emergent as it penetrates more\nand more into industry and science. In recent years, a research line from\nFourier analysis sheds lights on this magical \"black box\" by showing a\nFrequency Principle (F-Principle or spectral bias) of the training behavior of\ndeep neural networks (DNNs) -- DNNs often fit functions from low to high\nfrequency during the training. The F-Principle is first demonstrated by\nonedimensional synthetic data followed by the verification in high-dimensional\nreal datasets. A series of works subsequently enhance the validity of the\nF-Principle. 
This low-frequency implicit bias reveals the strength of neural\nnetwork in learning low-frequency functions as well as its deficiency in\nlearning high-frequency functions. Such understanding inspires the design of\nDNN-based algorithms in practical problems, explains experimental phenomena\nemerging in various scenarios, and further advances the study of deep learning\nfrom the frequency perspective. Although incomplete, we provide an overview of\nF-Principle and propose some open problems for future research.\n","authors":["Zhi-Qin John Xu","Yaoyu Zhang","Tao Luo"],"pdf_url":"https://arxiv.org/pdf/2201.07395v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06736v2","updated":"2024-11-12T11:09:18Z","published":"2024-11-11T06:04:53Z","title":"Mr.Steve: Instruction-Following Agents in Minecraft with What-Where-When\n Memory","summary":" Significant advances have been made in developing general-purpose embodied AI\nin environments like Minecraft through the adoption of LLM-augmented\nhierarchical approaches. While these approaches, which combine high-level\nplanners with low-level controllers, show promise, low-level controllers\nfrequently become performance bottlenecks due to repeated failures. In this\npaper, we argue that the primary cause of failure in many low-level controllers\nis the absence of an episodic memory system. To address this, we introduce Mr.\nSteve (Memory Recall Steve-1), a novel low-level controller equipped with Place\nEvent Memory (PEM), a form of episodic memory that captures what, where, and\nwhen information from episodes. This directly addresses the main limitation of\nthe popular low-level controller, Steve-1. Unlike previous models that rely on\nshort-term memory, PEM organizes spatial and event-based data, enabling\nefficient recall and navigation in long-horizon tasks. 
Additionally, we propose\nan Exploration Strategy and a Memory-Augmented Task Solving Framework, allowing\nagents to alternate between exploration and task-solving based on recalled\nevents. Our approach significantly improves task-solving and exploration\nefficiency compared to existing methods. We will release our code and demos on\nthe project page: https://sites.google.com/view/mr-steve.\n","authors":["Junyeong Park","Junmo Cho","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2411.06736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08364v3","updated":"2024-11-12T10:56:38Z","published":"2024-07-11T10:18:54Z","title":"Scalar Function Topology Divergence: Comparing Topology of 3D Objects","summary":" We propose a new topological tool for computer vision - Scalar Function\nTopology Divergence (SFTD), which measures the dissimilarity of multi-scale\ntopology between sublevel sets of two functions having a common domain.\nFunctions can be defined on an undirected graph or Euclidean space of any\ndimensionality. Most of the existing methods for comparing topology are based\non Wasserstein distance between persistence barcodes and they don't take into\naccount the localization of topological features. The minimization of SFTD\nensures that the corresponding topological features of scalar functions are\nlocated in the same places. The proposed tool provides useful visualizations\ndepicting areas where functions have topological dissimilarities. We provide\napplications of the proposed method to 3D computer vision. In particular,\nexperiments demonstrate that SFTD as an additional loss improves the\nreconstruction of cellular 3D shapes from 2D fluorescence microscopy images,\nand helps to identify topological errors in 3D segmentation. 
Additionally, we\nshow that SFTD outperforms Betti matching loss in 2D segmentation problems.\n","authors":["Ilya Trofimov","Daria Voronkova","Eduard Tulchinskii","Evgeny Burnaev","Serguei Barannikov"],"pdf_url":"https://arxiv.org/pdf/2407.08364v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07711v1","updated":"2024-11-12T10:55:30Z","published":"2024-11-12T10:55:30Z","title":"OWLed: Outlier-weighed Layerwise Pruning for Efficient Autonomous\n Driving Framework","summary":" The integration of Large Language Models (LLMs) into autonomous driving\nsystems offers promising enhancements in environmental understanding and\ndecision-making. However, the substantial computational demands of deploying\nLLMs locally on vehicles render this approach unfeasible for real-world\nautomotive applications. To address this challenge, we introduce OWLed, the\nOutlier-Weighed Layerwise Pruning for Efficient Autonomous Driving Framework\nthat leverages outlier-weighted layerwise sparsity for model compression. Our\nmethod assigns non-uniform sparsity ratios to different layers based on the\ndistribution of outlier features, significantly reducing the model size without\nthe need for fine-tuning. To ensure the compressed model adapts well to\nautonomous driving tasks, we incorporate driving environment data into both the\ncalibration and pruning processes. Our empirical studies reveal that the\nencoder component is more sensitive to pruning than the LLM, highlighting its\ncritical role in the system. Experimental results demonstrate that OWLed\noutperforms existing methods in perception, action prediction, and language\nunderstanding while substantially lowering computational requirements. These\nfindings underscore the potential of combining advanced pruning techniques with\nLLMs to develop efficient and robust autonomous driving systems capable of\nhandling complex scenarios. 
Code will be made publicly available.\n","authors":["Jiaxi Li","Lu Yin","Xilu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07711v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.07700v1","updated":"2024-11-12T10:26:44Z","published":"2024-11-12T10:26:44Z","title":"Test Where Decisions Matter: Importance-driven Testing for Deep\n Reinforcement Learning","summary":" In many Deep Reinforcement Learning (RL) problems, decisions in a trained\npolicy vary in significance for the expected safety and performance of the\npolicy. Since RL policies are very complex, testing efforts should concentrate\non states in which the agent's decisions have the highest impact on the\nexpected outcome. In this paper, we propose a novel model-based method to\nrigorously compute a ranking of state importance across the entire state space.\nWe then focus our testing efforts on the highest-ranked states. In this paper,\nwe focus on testing for safety. However, the proposed methods can be easily\nadapted to test for performance. In each iteration, our testing framework\ncomputes optimistic and pessimistic safety estimates. These estimates provide\nlower and upper bounds on the expected outcomes of the policy execution across\nall modeled states in the state space. Our approach divides the state space\ninto safe and unsafe regions upon convergence, providing clear insights into\nthe policy's weaknesses. Two important properties characterize our approach.\n(1) Optimal Test-Case Selection: At any time in the testing process, our\napproach evaluates the policy in the states that are most critical for safety.\n(2) Guaranteed Safety: Our approach can provide formal verification guarantees\nover the entire state space by sampling only a fraction of the policy. Any\nsafety properties assured by the pessimistic estimate are formally proven to\nhold for the policy. 
We provide a detailed evaluation of our framework on\nseveral examples, showing that our method discovers unsafe policy behavior with\nlow testing effort.\n","authors":["Stefan Pranger","Hana Chockler","Martin Tappler","Bettina Könighofer"],"pdf_url":"https://arxiv.org/pdf/2411.07700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02487v2","updated":"2024-11-12T10:03:37Z","published":"2024-08-05T14:09:30Z","title":"LiCoEval: Evaluating LLMs on License Compliance in Code Generation","summary":" Recent advances in Large Language Models (LLMs) have revolutionized code\ngeneration, leading to widespread adoption of AI coding tools by developers.\nHowever, LLMs can generate license-protected code without providing the\nnecessary license information, leading to potential intellectual property\nviolations during software production. This paper addresses the critical, yet\nunderexplored, issue of license compliance in LLM-generated code by\nestablishing a benchmark to evaluate the ability of LLMs to provide accurate\nlicense information for their generated code. To establish this benchmark, we\nconduct an empirical study to identify a reasonable standard for \"striking\nsimilarity\" that excludes the possibility of independent creation, indicating a\ncopy relationship between the LLM output and certain open-source code. Based on\nthis standard, we propose LiCoEval, to evaluate the license compliance\ncapabilities of LLMs, i.e., the ability to provide accurate license or\ncopyright information when they generate code with striking similarity to\nalready existing copyrighted code. Using LiCoEval, we evaluate 14 popular LLMs,\nfinding that even top-performing LLMs produce a non-negligible proportion\n(0.88% to 2.01%) of code strikingly similar to existing open-source\nimplementations. Notably, most LLMs fail to provide accurate license\ninformation, particularly for code under copyleft licenses. 
These findings\nunderscore the urgent need to enhance LLM compliance capabilities in code\ngeneration tasks. Our study provides a foundation for future research and\ndevelopment to improve license compliance in AI-assisted software development,\ncontributing to both the protection of open-source software copyrights and the\nmitigation of legal risks for LLM users.\n","authors":["Weiwei Xu","Kai Gao","Hao He","Minghui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.02487v2.pdf","comment":"The 47th International Conference on Software Engineering(ICSE 2025)"},{"id":"http://arxiv.org/abs/2405.17544v2","updated":"2024-11-12T09:57:28Z","published":"2024-05-27T18:00:00Z","title":"Towards Human-AI Complementarity with Prediction Sets","summary":" Decision support systems based on prediction sets have proven to be effective\nat helping human experts solve classification tasks. Rather than providing\nsingle-label predictions, these systems provide sets of label predictions\nconstructed using conformal prediction, namely prediction sets, and ask human\nexperts to predict label values from these sets. In this paper, we first show\nthat the prediction sets constructed using conformal prediction are, in\ngeneral, suboptimal in terms of average accuracy. Then, we show that the\nproblem of finding the optimal prediction sets under which the human experts\nachieve the highest average accuracy is NP-hard. More strongly, unless P = NP,\nwe show that the problem is hard to approximate to any factor less than the\nsize of the label set. However, we introduce a simple and efficient greedy\nalgorithm that, for a large class of expert models and non-conformity scores,\nis guaranteed to find prediction sets that provably offer equal or greater\nperformance than those constructed using conformal prediction. 
Further, using a\nsimulation study with both synthetic and real expert predictions, we\ndemonstrate that, in practice, our greedy algorithm finds near-optimal\nprediction sets offering greater performance than conformal prediction.\n","authors":["Giovanni De Toni","Nastaran Okati","Suhas Thejaswi","Eleni Straitouri","Manuel Gomez-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2405.17544v2.pdf","comment":"Published in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.19872v3","updated":"2024-11-12T09:57:00Z","published":"2024-07-29T10:43:15Z","title":"OpenUAS: Embeddings of Cities in Japan with Anchor Data for Cross-city\n Analysis of Area Usage Patterns","summary":" We publicly release OpenUAS, a dataset of area embeddings based on urban\nusage patterns, including embeddings for over 1.3 million 50-meter square\nmeshes covering a total area of 3,300 square kilometers. This dataset is\nvaluable for analyzing area functions in fields such as market analysis, urban\nplanning, transportation infrastructure, and infection prediction. It captures\nthe characteristics of each area in the city, such as office districts and\nresidential areas, by employing an area embedding technique that utilizes\nlocation information typically obtained by GPS. Numerous area embedding\ntechniques have been proposed, and while the public release of such embedding\ndatasets is technically feasible, it has not been realized. One reason for this\nis that previous methods could not embed areas from different cities and\nperiods into the same embedding space without sharing raw location data. We\naddress this issue by developing an anchoring method that establishes anchors\nwithin a shared embedding space. 
We publicly release this anchor dataset along\nwith area embedding datasets from several periods in eight major Japanese\ncities.\n","authors":["Naoki Tamura","Kazuyuki Shoji","Shin Katayama","Kenta Urano","Takuro Yonezawa","Nobuo Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2407.19872v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06797v2","updated":"2024-11-12T09:54:07Z","published":"2023-07-13T15:08:44Z","title":"Fast and Functional Structured Data Generators Rooted in\n Out-of-Equilibrium Physics","summary":" In this study, we address the challenge of using energy-based models to\nproduce high-quality, label-specific data in complex structured datasets, such\nas population genetics, RNA or protein sequences data. Traditional training\nmethods encounter difficulties due to inefficient Markov chain Monte Carlo\nmixing, which affects the diversity of synthetic data and increases generation\ntimes. To address these issues, we use a novel training algorithm that exploits\nnon-equilibrium effects. This approach, applied on the Restricted Boltzmann\nMachine, improves the model's ability to correctly classify samples and\ngenerate high-quality synthetic data in only a few sampling steps. 
The\neffectiveness of this method is demonstrated by its successful application to\nfour different types of data: handwritten digits, mutations of human genomes\nclassified by continental origin, functionally characterized sequences of an\nenzyme protein family, and homologous RNA sequences from specific taxonomies.\n","authors":["Alessandra Carbone","Aurélien Decelle","Lorenzo Rosset","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2307.06797v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2411.07681v1","updated":"2024-11-12T09:52:40Z","published":"2024-11-12T09:52:40Z","title":"What Do Learning Dynamics Reveal About Generalization in LLM Reasoning?","summary":" Despite the remarkable capabilities of modern large language models (LLMs),\nthe mechanisms behind their problem-solving abilities remain elusive. In this\nwork, we aim to better understand how the learning dynamics of LLM finetuning\nshapes downstream generalization. Our analysis focuses on reasoning tasks,\nwhose problem structure allows us to distinguish between memorization (the\nexact replication of reasoning steps from the training data) and performance\n(the correctness of the final solution). We find that a model's generalization\nbehavior can be effectively characterized by a training metric we call\npre-memorization train accuracy: the accuracy of model samples on training\nqueries before they begin to copy the exact reasoning steps from the training\nset. On the dataset level, this metric is able to reliably predict test\naccuracy, achieving $R^2$ of around or exceeding 0.9 across various models\n(Llama3 8, Gemma2 9B), datasets (GSM8k, MATH), and training configurations. On\na per-example level, this metric is also indicative of whether individual model\npredictions are robust to perturbations in the training query. By connecting a\nmodel's learning behavior to its generalization, pre-memorization train\naccuracy can guide targeted improvements to training strategies. 
We focus on\ndata curation as an example, and show that prioritizing examples with low\npre-memorization accuracy leads to 1.5-2x improvements in data efficiency\ncompared to i.i.d. data scaling, and outperforms other standard data curation\ntechniques.\n","authors":["Katie Kang","Amrith Setlur","Dibya Ghosh","Jacob Steinhardt","Claire Tomlin","Sergey Levine","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02066v4","updated":"2024-11-12T09:50:15Z","published":"2024-09-03T17:13:55Z","title":"Robust Clustering on High-Dimensional Data with Stochastic Quantization","summary":" This paper addresses the limitations of conventional vector quantization\nalgorithms, particularly K-Means and its variant K-Means++, and investigates\nthe Stochastic Quantization (SQ) algorithm as a scalable alternative for\nhigh-dimensional unsupervised and semi-supervised learning tasks. Traditional\nclustering algorithms often suffer from inefficient memory utilization during\ncomputation, necessitating the loading of all data samples into memory, which\nbecomes impractical for large-scale datasets. While variants such as Mini-Batch\nK-Means partially mitigate this issue by reducing memory usage, they lack\nrobust theoretical convergence guarantees due to the non-convex nature of\nclustering problems. In contrast, the Stochastic Quantization algorithm\nprovides strong theoretical convergence guarantees, making it a robust\nalternative for clustering tasks. We demonstrate the computational efficiency\nand rapid convergence of the algorithm on an image classification problem with\npartially labeled data, comparing model accuracy across various ratios of\nlabeled to unlabeled data. 
To address the challenge of high dimensionality, we\nemploy a Triplet Network to encode images into low-dimensional representations\nin a latent space, which serve as a basis for comparing the efficiency of both\nthe Stochastic Quantization algorithm and traditional quantization algorithms.\nFurthermore, we enhance the algorithm's convergence speed by introducing\nmodifications with an adaptive learning rate.\n","authors":["Anton Kozyriev","Vladimir Norkin"],"pdf_url":"https://arxiv.org/pdf/2409.02066v4.pdf","comment":"22 pages, 5 figures, to be published in the International Scientific\n Technical Journal \"Problems of Control and Informatics\""},{"id":"http://arxiv.org/abs/2411.07679v1","updated":"2024-11-12T09:49:16Z","published":"2024-11-12T09:49:16Z","title":"Safe Exploitative Play with Untrusted Type Beliefs","summary":" The combination of the Bayesian game and learning has a rich history, with\nthe idea of controlling a single agent in a system composed of multiple agents\nwith unknown behaviors given a set of types, each specifying a possible\nbehavior for the other agents. The idea is to plan an agent's own actions with\nrespect to those types which it believes are most likely to maximize the\npayoff. However, the type beliefs are often learned from past actions and\nlikely to be incorrect. With this perspective in mind, we consider an agent in\na game with type predictions of other components, and investigate the impact of\nincorrect beliefs to the agent's payoff. In particular, we formally define a\ntradeoff between risk and opportunity by comparing the payoff obtained against\nthe optimal payoff, which is represented by a gap caused by trusting or\ndistrusting the learned beliefs. 
Our main results characterize the tradeoff by\nestablishing upper and lower bounds on the Pareto front for both normal-form\nand stochastic Bayesian games, with numerical results provided.\n","authors":["Tongxin Li","Tinashe Handina","Shaolei Ren","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2411.07679v1.pdf","comment":"26 pages, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2403.17351v2","updated":"2024-11-12T09:46:25Z","published":"2024-03-26T03:29:42Z","title":"Learn from Heterophily: Heterophilous Information-enhanced Graph Neural\n Network","summary":" Under circumstances of heterophily, where nodes with different labels tend to\nbe connected based on semantic meanings, Graph Neural Networks (GNNs) often\nexhibit suboptimal performance. Current studies on graph heterophily mainly\nfocus on aggregation calibration or neighbor extension and address the\nheterophily issue by utilizing node features or structural information to\nimprove GNN representations. In this paper, we propose and demonstrate that the\nvaluable semantic information inherent in heterophily can be utilized\neffectively in graph learning by investigating the distribution of neighbors\nfor each individual node within the graph. The theoretical analysis is carried\nout to demonstrate the efficacy of the idea in enhancing graph learning. Based\non this analysis, we propose HiGNN, an innovative approach that constructs an\nadditional new graph structure, that integrates heterophilous information by\nleveraging node distribution to enhance connectivity between nodes that share\nsimilar semantic characteristics. We conduct empirical assessments on node\nclassification tasks using both homophilous and heterophilous benchmark\ndatasets and compare HiGNN to popular GNN baselines and SoTA methods,\nconfirming the effectiveness in improving graph representations. 
In addition,\nby incorporating heterophilous information, we demonstrate a notable\nenhancement in existing GNN-based approaches, and the homophily degree across\nreal-world datasets, thus affirming the efficacy of our approach.\n","authors":["Yilun Zheng","Jiahao Xu","Lihui Chen"],"pdf_url":"https://arxiv.org/pdf/2403.17351v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19178v2","updated":"2024-11-12T09:42:13Z","published":"2024-05-29T15:18:39Z","title":"Model-independent cosmological inference post DESI DR1 BAO measurements","summary":" In this work, we implement Gaussian process regression to reconstruct the\nexpansion history of the universe in a model-agnostic manner, using the\nPantheon-Plus SN-Ia compilation in combination with two different BAO\nmeasurements (SDSS-IV and DESI DR1). In both the reconstructions, the\n$\\Lambda$CDM model is always included in the 95\\% confidence intervals. We find\nevidence that the DESI LRG data at $z_{\\text{eff}} = 0.51$ is not an outlier\nwithin our model-independent framework. We study the $\\mathcal{O}m$-diagnostics\nand the evolution of the total equation of state (EoS) of our universe, which\nhint towards the possibility of a quintessence-like dark energy scenario with a\nvery slowly varying EoS, and a phantom-crossing in higher $z$. The entire\nexercise is later complemented by considering two more SN-Ia compilations -\nDES-5YR and Union3 - in combination with DESI BAO. Reconstruction with the DESI\nBAO + DES-5YR SN data sets predicts that the $\\Lambda$CDM model lies outside\nthe 3$\\sigma$ confidence levels, whereas with DESI BAO + Union3 data, the\n$\\Lambda$CDM model is always included within 1$\\sigma$. We also report\nconstraints on $H_0 r_d$ from our model-agnostic analysis, independent of the\npre-recombination physics. 
Our results point towards an $\\approx$ 2$\\sigma$\ndiscrepancy between the DESI + Pantheon-Plus and DESI + DES-5YR data sets,\nwhich calls for further investigation.\n","authors":["Purba Mukherjee","Anjan Ananda Sen"],"pdf_url":"https://arxiv.org/pdf/2405.19178v2.pdf","comment":"10 pages, 6 sets of figures. Accepted for publication in PRD"},{"id":"http://arxiv.org/abs/2411.07672v1","updated":"2024-11-12T09:39:22Z","published":"2024-11-12T09:39:22Z","title":"Rethinking Structure Learning For Graph Neural Networks","summary":" To improve the performance of Graph Neural Networks (GNNs), Graph Structure\nLearning (GSL) has been extensively applied to reconstruct or refine original\ngraph structures, effectively addressing issues like heterophily,\nover-squashing, and noisy structures. While GSL is generally thought to improve\nGNN performance, it often leads to longer training times and more\nhyperparameter tuning. Besides, the distinctions among current GSL methods\nremain ambiguous from the perspective of GNN training, and there is a lack of\ntheoretical analysis to quantify their effectiveness. Recent studies further\nsuggest that, under fair comparisons with the same hyperparameter tuning, GSL\ndoes not consistently outperform baseline GNNs. This motivates us to ask a\ncritical question: is GSL really useful for GNNs? To address this question,\nthis paper makes two key contributions. First, we propose a new GSL framework,\nwhich includes three steps: GSL base (the representation used for GSL)\nconstruction, new structure construction, and view fusion, to better understand\nthe effectiveness of GSL in GNNs. 
Second, after graph convolution, we analyze\nthe differences in mutual information (MI) between node representations derived\nfrom the original topology and those from the newly constructed topology.\nSurprisingly, our empirical observations and theoretical analysis show that no\nmatter which type of graph structure construction methods are used, after\nfeeding the same GSL bases to the newly constructed graph, there is no MI gain\ncompared to the original GSL bases. To fairly reassess the effectiveness of\nGSL, we conduct ablation experiments and find that it is the pretrained GSL\nbases that enhance GNN performance, and in most cases, GSL cannot improve GNN\nperformance. This finding encourages us to rethink the essential components in\nGNNs, such as self-training and structural encoding, in GNN design rather than\nGSL.\n","authors":["Yilun Zheng","Zhuofan Zhang","Ziming Wang","Xiang Li","Sitao Luan","Xiaojiang Peng","Lihui Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07663v1","updated":"2024-11-12T09:28:55Z","published":"2024-11-12T09:28:55Z","title":"Is Graph Convolution Always Beneficial For Every Feature?","summary":" Graph Neural Networks (GNNs) have demonstrated strong capabilities in\nprocessing structured data. While traditional GNNs typically treat each feature\ndimension equally during graph convolution, we raise an important question: Is\nthe graph convolution operation equally beneficial for each feature? If not,\nthe convolution operation on certain feature dimensions can possibly lead to\nharmful effects, even worse than the convolution-free models. In prior studies,\nto assess the impacts of graph convolution on features, people proposed metrics\nbased on feature homophily to measure feature consistency with the graph\ntopology. However, these metrics have shown unsatisfactory alignment with GNN\nperformance and have not been effectively employed to guide feature selection\nin GNNs. 
To address these limitations, we introduce a novel metric, Topological\nFeature Informativeness (TFI), to distinguish between GNN-favored and\nGNN-disfavored features, where its effectiveness is validated through both\ntheoretical analysis and empirical observations. Based on TFI, we propose a\nsimple yet effective Graph Feature Selection (GFS) method, which processes\nGNN-favored and GNN-disfavored features separately, using GNNs and non-GNN\nmodels. Compared to original GNNs, GFS significantly improves the extraction of\nuseful topological information from each feature with comparable computational\ncosts. Extensive experiments show that after applying GFS to 8 baseline and\nstate-of-the-art (SOTA) GNN architectures across 10 datasets, 83.75% of the\nGFS-augmented cases show significant performance boosts. Furthermore, our\nproposed TFI metric outperforms other feature selection methods. These results\nvalidate the effectiveness of both GFS and TFI. Additionally, we demonstrate\nthat GFS's improvements are robust to hyperparameter tuning, highlighting its\npotential as a universal method for enhancing various GNN architectures.\n","authors":["Yilun Zheng","Xiang Li","Sitao Luan","Xiaojiang Peng","Lihui Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13848v2","updated":"2024-11-12T09:21:13Z","published":"2024-03-18T10:44:22Z","title":"Smooth Sensitivity for Learning Differentially-Private yet Accurate Rule\n Lists","summary":" Differentially-private (DP) mechanisms can be embedded into the design of a\nmachine learning algorithm to protect the resulting model against privacy\nleakage. However, this often comes with a significant loss of accuracy due to\nthe noise added to enforce DP. 
In this paper, we aim at improving this\ntrade-off for a popular class of machine learning algorithms leveraging the\nGini impurity as an information gain criterion to greedily build interpretable\nmodels such as decision trees or rule lists. To this end, we establish the\nsmooth sensitivity of the Gini impurity, which can be used to obtain thorough\nDP guarantees while adding noise scaled with tighter magnitude. We illustrate\nthe applicability of this mechanism by integrating it within a greedy algorithm\nproducing rule list models, motivated by the fact that such models remain\nunderstudied in the DP literature. Our theoretical analysis and experimental\nresults confirm that the DP rule lists models integrating smooth sensitivity\nhave higher accuracy that those using other DP frameworks based on global\nsensitivity, for identical privacy budgets.\n","authors":["Timothée Ly","Julien Ferry","Marie-José Huguet","Sébastien Gambs","Ulrich Aivodji"],"pdf_url":"https://arxiv.org/pdf/2403.13848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06890v2","updated":"2024-11-12T09:12:42Z","published":"2024-11-11T11:42:48Z","title":"SPARTAN: A Sparse Transformer Learning Local Causation","summary":" Causal structures play a central role in world models that flexibly adapt to\nchanges in the environment. While recent works motivate the benefits of\ndiscovering local causal graphs for dynamics modelling, in this work we\ndemonstrate that accurately capturing these relationships in complex settings\nremains challenging for the current state-of-the-art. To remedy this\nshortcoming, we postulate that sparsity is a critical ingredient for the\ndiscovery of such local causal structures. To this end we present the SPARse\nTrANsformer World model (SPARTAN), a Transformer-based world model that learns\nlocal causal structures between entities in a scene. 
By applying sparsity\nregularisation on the attention pattern between object-factored tokens, SPARTAN\nidentifies sparse local causal models that accurately predict future object\nstates. Furthermore, we extend our model to capture sparse interventions with\nunknown targets on the dynamics of the environment. This results in a highly\ninterpretable world model that can efficiently adapt to changes. Empirically,\nwe evaluate SPARTAN against the current state-of-the-art in object-centric\nworld models on observation-based environments and demonstrate that our model\ncan learn accurate local causal graphs and achieve significantly improved\nfew-shot adaptation to changes in the dynamics of the environment as well as\nrobustness against removing irrelevant distractors.\n","authors":["Anson Lei","Bernhard Schölkopf","Ingmar Posner"],"pdf_url":"https://arxiv.org/pdf/2411.06890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07650v1","updated":"2024-11-12T09:02:11Z","published":"2024-11-12T09:02:11Z","title":"Understanding Audiovisual Deepfake Detection: Techniques, Challenges,\n Human Factors and Perceptual Insights","summary":" Deep Learning has been successfully applied in diverse fields, and its impact\non deepfake detection is no exception. Deepfakes are fake yet realistic\nsynthetic content that can be used deceitfully for political impersonation,\nphishing, slandering, or spreading misinformation. Despite extensive research\non unimodal deepfake detection, identifying complex deepfakes through joint\nanalysis of audio and visual streams remains relatively unexplored. To fill\nthis gap, this survey first provides an overview of audiovisual deepfake\ngeneration techniques, applications, and their consequences, and then provides\na comprehensive review of state-of-the-art methods that combine audio and\nvisual modalities to enhance detection accuracy, summarizing and critically\nanalyzing their strengths and limitations. 
Furthermore, we discuss existing\nopen source datasets for a deeper understanding, which can contribute to the\nresearch community and provide necessary information to beginners who want to\nanalyze deep learning-based audiovisual methods for video forensics. By\nbridging the gap between unimodal and multimodal approaches, this paper aims to\nimprove the effectiveness of deepfake detection strategies and guide future\nresearch in cybersecurity and media integrity.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07643v1","updated":"2024-11-12T08:53:49Z","published":"2024-11-12T08:53:49Z","title":"xCG: Explainable Cell Graphs for Survival Prediction in Non-Small Cell\n Lung Cancer","summary":" Understanding how deep learning models predict oncology patient risk can\nprovide critical insights into disease progression, support clinical\ndecision-making, and pave the way for trustworthy and data-driven precision\nmedicine. Building on recent advances in the spatial modeling of the tumor\nmicroenvironment using graph neural networks, we present an explainable cell\ngraph (xCG) approach for survival prediction. We validate our model on a public\ncohort of imaging mass cytometry (IMC) data for 416 cases of lung\nadenocarcinoma. We explain survival predictions in terms of known phenotypes on\nthe cell level by computing risk attributions over cell graphs, for which we\npropose an efficient grid-based layer-wise relevance propagation (LRP) method.\nOur ablation studies highlight the importance of incorporating the cancer stage\nand model ensembling to improve the quality of risk estimates. 
Our xCG method,\ntogether with the IMC data, is made publicly available to support further\nresearch.\n","authors":["Marvin Sextro","Gabriel Dernbach","Kai Standvoss","Simon Schallenberg","Frederick Klauschen","Klaus-Robert Müller","Maximilian Alber","Lukas Ruff"],"pdf_url":"https://arxiv.org/pdf/2411.07643v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 11 pages"},{"id":"http://arxiv.org/abs/2411.06236v2","updated":"2024-11-12T08:51:40Z","published":"2024-11-09T17:36:53Z","title":"Zero-Shot NAS via the Suppression of Local Entropy Decrease","summary":" Architecture performance evaluation is the most time-consuming part of neural\narchitecture search (NAS). Zero-Shot NAS accelerates the evaluation by\nutilizing zero-cost proxies instead of training. Though effective, existing\nzero-cost proxies require invoking backpropagations or running networks on\ninput data, making it difficult to further accelerate the computation of\nproxies. To alleviate this issue, architecture topologies are used to evaluate\nthe performance of networks in this study. We prove that particular\narchitectural topologies decrease the local entropy of feature maps, which\ndegrades specific features to a bias, thereby reducing network performance.\nBased on this proof, architectural topologies are utilized to quantify the\nsuppression of local entropy decrease (SED) as a data-free and running-free\nproxy. Experimental results show that SED outperforms most state-of-the-art\nproxies in terms of architecture selection on five benchmarks, with computation\ntime reduced by three orders of magnitude. We further compare the SED-based NAS\nwith state-of-the-art proxies. SED-based NAS selects the architecture with\nhigher accuracy and fewer parameters in only one second. 
The theoretical\nanalyses of local entropy and experimental results demonstrate that the\nsuppression of local entropy decrease facilitates selecting optimal\narchitectures in Zero-Shot NAS.\n","authors":["Ning Wu","Han Huang","Yueting Xu","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2411.06236v2.pdf","comment":"8 pages, 2 figures. Corrected typos and latex template"},{"id":"http://arxiv.org/abs/2410.05814v2","updated":"2024-11-12T08:50:59Z","published":"2024-10-08T08:44:01Z","title":"CALoR: Towards Comprehensive Model Inversion Defense","summary":" Model Inversion Attacks (MIAs) aim at recovering privacy-sensitive training\ndata from the knowledge encoded in the released machine learning models. Recent\nadvances in the MIA field have significantly enhanced the attack performance\nunder multiple scenarios, posing serious privacy risks of Deep Neural Networks\n(DNNs). However, the development of defense strategies against MIAs is\nrelatively backward to resist the latest MIAs and existing defenses fail to\nachieve further trade-off between model utility and model robustness. In this\npaper, we provide an in-depth analysis from the perspective of intrinsic\nvulnerabilities of MIAs, comprehensively uncovering the weaknesses inherent in\nthe basic pipeline, which are partially investigated in the previous defenses.\nBuilding upon these new insights, we propose a robust defense mechanism,\nintegrating Confidence Adaptation and Low-Rank compression(CALoR). Our method\nincludes a novel robustness-enhanced classification loss specially-designed for\nmodel inversion defenses and reveals the extraordinary effectiveness of\ncompressing the classification header. With CALoR, we can mislead the\noptimization objective, reduce the leaked information and impede the\nbackpropagation of MIAs, thus mitigating the risk of privacy leakage. 
Extensive\nexperimental results demonstrate that our method achieves state-of-the-art\n(SOTA) defense performance against MIAs and exhibits superior generalization to\nexisting defenses across various scenarios.\n","authors":["Hongyao Yu","Yixiang Qiu","Hao Fang","Bin Chen","Sijin Yu","Bin Wang","Shu-Tao Xia","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2410.05814v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2411.07641v1","updated":"2024-11-12T08:46:43Z","published":"2024-11-12T08:46:43Z","title":"Top-$nσ$: Not All Logits Are You Need","summary":" Large language models (LLMs) typically employ greedy decoding or\nlow-temperature sampling for reasoning tasks, reflecting a perceived trade-off\nbetween diversity and accuracy. We challenge this convention by introducing\ntop-$n\\sigma$, a novel sampling method that operates directly on pre-softmax\nlogits by leveraging a statistical threshold. Our key insight is that logits\nnaturally separate into a Gaussian-distributed noisy region and a distinct\ninformative region, enabling efficient token filtering without complex\nprobability manipulations. Unlike existing methods (e.g., top-$p$, min-$p$)\nthat inadvertently include more noise tokens at higher temperatures,\ntop-$n\\sigma$ maintains a stable sampling space regardless of temperature\nscaling. We also provide a theoretical analysis of top-$n\\sigma$ to better\nunderstand its behavior. 
The extensive experimental results across four\nreasoning-focused datasets demonstrate that our method not only outperforms\nexisting sampling approaches but also surpasses greedy decoding, while\nmaintaining consistent performance even at high temperatures.\n","authors":["Chenxia Tang","Jianchun Liu","Hongli Xu","Liusheng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10944v5","updated":"2024-11-12T08:31:22Z","published":"2023-11-18T02:44:33Z","title":"Deception Detection from Linguistic and Physiological Data Streams Using\n Bimodal Convolutional Neural Networks","summary":" Deception detection is gaining increasing interest due to ethical and\nsecurity concerns. This paper explores the application of convolutional neural\nnetworks for the purpose of multimodal deception detection. We use a dataset\nbuilt by interviewing 104 subjects about two topics, with one truthful and one\nfalsified response from each subject about each topic. In particular, we make\nthree main contributions. First, we extract linguistic and physiological\nfeatures from this data to train and construct the neural network models.\nSecond, we propose a fused convolutional neural network model using both\nmodalities in order to achieve an improved overall performance. Third, we\ncompare our new approach with earlier methods designed for multimodal deception\ndetection. 
We find that our system outperforms regular classification methods;\nour results indicate the feasibility of using neural networks for deception\ndetection even in the presence of limited amounts of data.\n","authors":["Panfeng Li","Mohamed Abouelenien","Rada Mihalcea","Zhicheng Ding","Qikai Yang","Yiming Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10944v5.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2411.07634v1","updated":"2024-11-12T08:27:27Z","published":"2024-11-12T08:27:27Z","title":"Exploring Multi-Agent Reinforcement Learning for Unrelated Parallel\n Machine Scheduling","summary":" Scheduling problems pose significant challenges in resource, industry, and\noperational management. This paper addresses the Unrelated Parallel Machine\nScheduling Problem (UPMS) with setup times and resources using a Multi-Agent\nReinforcement Learning (MARL) approach. The study introduces the Reinforcement\nLearning environment and conducts empirical analyses, comparing MARL with\nSingle-Agent algorithms. The experiments employ various deep neural network\npolicies for single- and Multi-Agent approaches. Results demonstrate the\nefficacy of the Maskable extension of the Proximal Policy Optimization (PPO)\nalgorithm in Single-Agent scenarios and the Multi-Agent PPO algorithm in\nMulti-Agent setups. While Single-Agent algorithms perform adequately in reduced\nscenarios, Multi-Agent approaches reveal challenges in cooperative learning but\na scalable capacity. This research contributes insights into applying MARL\ntechniques to scheduling optimization, emphasizing the need for algorithmic\nsophistication balanced with scalability for intelligent scheduling solutions.\n","authors":["Maria Zampella","Urtzi Otamendi","Xabier Belaunzaran","Arkaitz Artetxe","Igor G. 
Olaizola","Giuseppe Longo","Basilio Sierra"],"pdf_url":"https://arxiv.org/pdf/2411.07634v1.pdf","comment":"11 pages, 5 figures, 4 tables, article submitted to a journal"},{"id":"http://arxiv.org/abs/2310.06929v2","updated":"2024-11-12T08:24:28Z","published":"2023-10-10T18:32:11Z","title":"Stochastic Super-resolution of Cosmological Simulations with Denoising\n Diffusion Models","summary":" In recent years, deep learning models have been successfully employed for\naugmenting low-resolution cosmological simulations with small-scale\ninformation, a task known as \"super-resolution\". So far, these cosmological\nsuper-resolution models have relied on generative adversarial networks (GANs),\nwhich can achieve highly realistic results, but suffer from various\nshortcomings (e.g. low sample diversity). We introduce denoising diffusion\nmodels as a powerful generative model for super-resolving cosmic large-scale\nstructure predictions (as a first proof-of-concept in two dimensions). To\nobtain accurate results down to small scales, we develop a new \"filter-boosted\"\ntraining approach that redistributes the importance of different scales in the\npixel-wise training objective. We demonstrate that our model not only produces\nconvincing super-resolution images and power spectra consistent at the percent\nlevel, but is also able to reproduce the diversity of small-scale features\nconsistent with a given low-resolution simulation. 
This enables uncertainty\nquantification for the generated small-scale features, which is critical for\nthe usefulness of such super-resolution models as a viable surrogate model for\ncosmic structure formation.\n","authors":["Andreas Schanz","Florian List","Oliver Hahn"],"pdf_url":"https://arxiv.org/pdf/2310.06929v2.pdf","comment":"9 pages, 8 figures, to be submitted to OJA, comments welcome"},{"id":"http://arxiv.org/abs/2402.07314v3","updated":"2024-11-12T08:24:10Z","published":"2024-02-11T21:44:21Z","title":"Online Iterative Reinforcement Learning from Human Feedback with General\n Preference Model","summary":" We investigate Reinforcement Learning from Human Feedback (RLHF) in the\ncontext of a general preference oracle. In particular, we do not assume the\nexistence of a reward function and an oracle preference signal drawn from the\nBradley-Terry model as most of the prior works do. We consider a standard\nmathematical formulation, the reverse-KL regularized minimax game between two\nLLMs for RLHF under general preference oracle. The learning objective of this\nformulation is to find a policy so that it is consistently preferred by the\nKL-regularized preference oracle over any competing LLMs. We show that this\nframework is strictly more general than the reward-based one, and propose\nsample-efficient algorithms for both the offline learning from a pre-collected\npreference dataset and online learning where we can query the preference oracle\nalong the way of training. 
Empirical studies verify the effectiveness of the\nproposed framework.\n","authors":["Chenlu Ye","Wei Xiong","Yuheng Zhang","Hanze Dong","Nan Jiang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.07314v3.pdf","comment":"RLHF, Preference Learning, Alignment for LLMs"},{"id":"http://arxiv.org/abs/2405.06219v3","updated":"2024-11-12T08:18:45Z","published":"2024-05-10T03:06:24Z","title":"SKVQ: Sliding-window Key and Value Cache Quantization for Large Language\n Models","summary":" Large language models (LLMs) can now handle longer sequences of tokens,\nenabling complex tasks like book understanding and generating lengthy novels.\nHowever, the key-value (KV) cache required for LLMs consumes substantial memory\nas context length increasing, becoming the bottleneck for deployment. In this\npaper, we present a strategy called SKVQ, which stands for sliding-window KV\ncache quantization, to address the issue of extremely low bitwidth KV cache\nquantization. To achieve this, SKVQ rearranges the channels of the KV cache in\norder to improve the similarity of channels in quantization groups, and applies\nclipped dynamic quantization at the group level. Additionally, SKVQ ensures\nthat the most recent window tokens in the KV cache are preserved with high\nprecision. This helps maintain the accuracy of a small but important portion of\nthe KV cache.SKVQ achieves high compression ratios while maintaining accuracy.\nOur evaluation on LLMs demonstrates that SKVQ surpasses previous quantization\napproaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit\nvalues with minimal loss of accuracy. 
With SKVQ, it is possible to process\ncontext lengths of up to 1M on an 80GB memory GPU for a 7b model and up to 7\ntimes faster decoding.\n","authors":["Haojie Duanmu","Zhihang Yuan","Xiuhong Li","Jiangfei Duan","Xingcheng Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2405.06219v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06634v2","updated":"2024-11-12T07:52:33Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. 
This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.07232v2","updated":"2024-11-12T07:49:39Z","published":"2024-11-11T18:50:09Z","title":"Add-it: Training-Free Object Insertion in Images With Pretrained\n Diffusion Models","summary":" Adding Object into images based on text instructions is a challenging task in\nsemantic image editing, requiring a balance between preserving the original\nscene and seamlessly integrating the new object in a fitting location. Despite\nextensive efforts, existing models often struggle with this balance,\nparticularly with finding a natural location for adding an object in complex\nscenes. We introduce Add-it, a training-free approach that extends diffusion\nmodels' attention mechanisms to incorporate information from three key sources:\nthe scene image, the text prompt, and the generated image itself. Our weighted\nextended-attention mechanism maintains structural consistency and fine details\nwhile ensuring natural object placement. Without task-specific fine-tuning,\nAdd-it achieves state-of-the-art results on both real and generated image\ninsertion benchmarks, including our newly constructed \"Additing Affordance\nBenchmark\" for evaluating object placement plausibility, outperforming\nsupervised methods. 
Human evaluations show that Add-it is preferred in over 80%\nof cases, and it also demonstrates improvements in various automated metrics.\n","authors":["Yoad Tewel","Rinon Gal","Dvir Samuel","Yuval Atzmon","Lior Wolf","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2411.07232v2.pdf","comment":"Project page is at https://research.nvidia.com/labs/par/addit/"},{"id":"http://arxiv.org/abs/2410.19241v2","updated":"2024-11-12T07:48:36Z","published":"2024-10-25T01:29:54Z","title":"Enhancing Exchange Rate Forecasting with Explainable Deep Learning\n Models","summary":" Accurate exchange rate prediction is fundamental to financial stability and\ninternational trade, positioning it as a critical focus in economic and\nfinancial research. Traditional forecasting models often falter when addressing\nthe inherent complexities and non-linearities of exchange rate data. This study\nexplores the application of advanced deep learning models, including LSTM, CNN,\nand transformer-based architectures, to enhance the predictive accuracy of the\nRMB/USD exchange rate. Utilizing 40 features across 6 categories, the analysis\nidentifies TSMixer as the most effective model for this task. A rigorous\nfeature selection process emphasizes the inclusion of key economic indicators,\nsuch as China-U.S. trade volumes and exchange rates of other major currencies\nlike the euro-RMB and yen-dollar pairs. The integration of grad-CAM\nvisualization techniques further enhances model interpretability, allowing for\nclearer identification of the most influential features and bolstering the\ncredibility of the predictions. 
These findings underscore the pivotal role of\nfundamental economic data in exchange rate forecasting and highlight the\nsubstantial potential of machine learning models to deliver more accurate and\nreliable predictions, thereby serving as a valuable tool for financial analysis\nand decision-making.\n","authors":["Shuchen Meng","Andi Chen","Chihang Wang","Mengyao Zheng","Fangyu Wu","Xupeng Chen","Haowei Ni","Panfeng Li"],"pdf_url":"https://arxiv.org/pdf/2410.19241v2.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2404.13812v4","updated":"2024-11-12T07:44:20Z","published":"2024-04-22T01:16:11Z","title":"A Comparative Study on Enhancing Prediction in Social Network\n Advertisement through Data Augmentation","summary":" In the ever-evolving landscape of social network advertising, the volume and\naccuracy of data play a critical role in the performance of predictive models.\nHowever, the development of robust predictive algorithms is often hampered by\nthe limited size and potential bias present in real-world datasets. This study\npresents and explores a generative augmentation framework of social network\nadvertising data. Our framework explores three generative models for data\naugmentation - Generative Adversarial Networks (GANs), Variational Autoencoders\n(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and\ndiversity in the context of social network advertising analytics effectiveness.\nBy performing synthetic extensions of the feature space, we find that through\ndata augmentation, the performance of various classifiers has been\nquantitatively improved. 
Furthermore, we compare the relative performance gains\nbrought by each data augmentation technique, providing insights for\npractitioners to select appropriate techniques to enhance model performance.\nThis paper contributes to the literature by showing that synthetic data\naugmentation alleviates the limitations imposed by small or imbalanced datasets\nin the field of social network advertising. At the same time, this article also\nprovides a comparative perspective on the practicality of different data\naugmentation methods, thereby guiding practitioners to choose appropriate\ntechniques to enhance model performance.\n","authors":["Qikai Yang","Panfeng Li","Xinhe Xu","Zhicheng Ding","Wenjing Zhou","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13812v4.pdf","comment":"Accepted by 2024 4th International Conference on Machine Learning and\n Intelligent Systems Engineering (MLISE)"},{"id":"http://arxiv.org/abs/2411.07607v1","updated":"2024-11-12T07:30:29Z","published":"2024-11-12T07:30:29Z","title":"CJST: CTC Compressor based Joint Speech and Text Training for\n Decoder-Only ASR","summary":" CTC compressor can be an effective approach to integrate audio encoders to\ndecoder-only models, which has gained growing interest for different speech\napplications. In this work, we propose a novel CTC compressor based joint\nspeech and text training (CJST) framework for decoder-only ASR. CJST matches\nspeech and text modalities from both directions by exploring a simple modality\nadaptor and several features of the CTC compressor, including sequence\ncompression, on-the-fly forced peaky alignment and CTC class embeddings.\nExperimental results on the Librispeech and TED-LIUM2 corpora show that the\nproposed CJST achieves an effective text injection without the need of duration\nhandling, leading to the best performance for both in-domain and cross-domain\nscenarios. 
We also provide a comprehensive study on CTC compressor, covering\nvarious compression modes, edge case handling and behavior under both clean and\nnoisy data conditions, which reveals the most robust setting to use CTC\ncompressor for decoder-only models.\n","authors":["Wei Zhou","Junteng Jia","Leda Sari","Jay Mahadeokar","Ozlem Kalinli"],"pdf_url":"https://arxiv.org/pdf/2411.07607v1.pdf","comment":"submitted to ICASSP2025"},{"id":"http://arxiv.org/abs/2406.12199v3","updated":"2024-11-12T07:28:08Z","published":"2024-06-18T01:55:37Z","title":"Time Series Modeling for Heart Rate Prediction: From ARIMA to\n Transformers","summary":" Cardiovascular disease (CVD) is a leading cause of death globally,\nnecessitating precise forecasting models for monitoring vital signs like heart\nrate, blood pressure, and ECG. Traditional models, such as ARIMA and Prophet,\nare limited by their need for manual parameter tuning and challenges in\nhandling noisy, sparse, and highly variable medical data. This study\ninvestigates advanced deep learning models, including LSTM, and\ntransformer-based architectures, for predicting heart rate time series from the\nMIT-BIH Database. Results demonstrate that deep learning models, particularly\nPatchTST, significantly outperform traditional models across multiple metrics,\ncapturing complex patterns and dependencies more effectively. This research\nunderscores the potential of deep learning to enhance patient monitoring and\nCVD management, suggesting substantial clinical benefits. 
Future work should\nextend these findings to larger, more diverse datasets and real-world clinical\napplications to further validate and optimize model performance.\n","authors":["Haowei Ni","Shuchen Meng","Xieming Geng","Panfeng Li","Zhuoying Li","Xupeng Chen","Xiaotong Wang","Shiyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12199v3.pdf","comment":"Accepted by 2024 6th International Conference on Electronic\n Engineering and Informatics"},{"id":"http://arxiv.org/abs/2411.07602v1","updated":"2024-11-12T07:24:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ntighter circuit complexity bound for Transformers with $\\mathsf{RoPE}$\nattention. Our key contribution is that we show that unless $\\mathsf{TC}^0 =\n\\mathsf{NC}^1$, a $\\mathsf{RoPE}$-based Transformer with\n$\\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \\leq O(n)$\ncannot solve the arithmetic problem or the Boolean formula value problem. This\nresult significantly demonstrates the fundamental limitation of the\nexpressivity of the $\\mathsf{RoPE}$-based Transformer architecture, although it\nachieves giant empirical success. 
Our theoretical framework not only\nestablishes tighter complexity bounds but also may instruct further work on the\n$\\mathsf{RoPE}$-based Transformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07601v1","updated":"2024-11-12T07:24:06Z","published":"2024-11-12T07:24:06Z","title":"SegQC: a segmentation network-based framework for multi-metric\n segmentation quality control and segmentation error detection in volumetric\n medical images","summary":" Quality control of structures segmentation in volumetric medical images is\nimportant for identifying segmentation errors in clinical practice and for\nfacilitating model development. This paper introduces SegQC, a novel framework\nfor segmentation quality estimation and segmentation error detection. SegQC\ncomputes an estimate measure of the quality of a segmentation in volumetric\nscans and in their individual slices and identifies possible segmentation error\nregions within a slice. The key components include: 1. SegQC-Net, a deep\nnetwork that inputs a scan and its segmentation mask and outputs segmentation\nerror probabilities for each voxel in the scan; 2. three new segmentation\nquality metrics, two overlap metrics and a structure size metric, computed from\nthe segmentation error probabilities; 3. a new method for detecting possible\nsegmentation errors in scan slices computed from the segmentation error\nprobabilities. We introduce a new evaluation scheme to measure segmentation\nerror discrepancies based on an expert radiologist corrections of automatically\nproduced segmentations that yields smaller observer variability and is closer\nto actual segmentation errors. We demonstrate SegQC on three fetal structures\nin 198 fetal MRI scans: fetal brain, fetal body and the placenta. 
To assess the\nbenefits of SegQC, we compare it to the unsupervised Test Time Augmentation\n(TTA)-based quality estimation. Our studies indicate that SegQC outperforms\nTTA-based quality estimation in terms of Pearson correlation and MAE for fetal\nbody and fetal brain structures segmentation. Our segmentation error detection\nmethod achieved recall and precision rates of 0.77 and 0.48 for fetal body, and\n0.74 and 0.55 for fetal brain segmentation error detection respectively. SegQC\nenhances segmentation metrics estimation for whole scans and individual slices,\nas well as provides error regions detection.\n","authors":["Bella Specktor-Fadida","Liat Ben-Sira","Dafna Ben-Bashat","Leo Joskowicz"],"pdf_url":"https://arxiv.org/pdf/2411.07601v1.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.13565v3","updated":"2024-11-12T07:21:04Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. 
This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v3.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.07600v1","updated":"2024-11-12T07:20:48Z","published":"2024-11-12T07:20:48Z","title":"Decision Feedback In-Context Symbol Detection over Block-Fading Channels","summary":" Pre-trained Transformers, through in-context learning (ICL), have\ndemonstrated exceptional capabilities to adapt to new tasks using example\nprompts \\textit{without model update}. Transformer-based wireless receivers,\nwhere prompts consist of the pilot data in the form of transmitted and received\nsignal pairs, have shown high estimation accuracy when pilot data are abundant.\nHowever, pilot information is often costly and limited in practice. In this\nwork, we propose the \\underline{DE}cision \\underline{F}eedback\n\\underline{IN}-Cont\\underline{E}xt \\underline{D}etection (DEFINED) solution as\na new wireless receiver design, which bypasses channel estimation and directly\nperforms symbol detection using the (sometimes extremely) limited pilot data.\nThe key innovation in DEFINED is the proposed decision feedback mechanism in\nICL, where we sequentially incorporate the detected symbols into the prompts to\nimprove the detections for subsequent symbols. 
Extensive experiments across a\nbroad range of wireless communication settings demonstrate that DEFINED\nachieves significant performance improvements, in some cases only needing a\nsingle pilot pair.\n","authors":["Li Fan","Jing Yang","Cong Shen"],"pdf_url":"https://arxiv.org/pdf/2411.07600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06909v4","updated":"2024-11-12T07:11:29Z","published":"2023-06-12T07:27:31Z","title":"Graph Agent Network: Empowering Nodes with Inference Capabilities for\n Adversarial Resilience","summary":" End-to-end training with global optimization have popularized graph neural\nnetworks (GNNs) for node classification, yet inadvertently introduced\nvulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit\nthe inherent opened interfaces of GNNs' input and output, perturbing critical\nedges and thus manipulating the classification results. Current defenses, due\nto their persistent utilization of global-optimization-based end-to-end\ntraining schemes, inherently encapsulate the vulnerabilities of GNNs. This is\nspecifically evidenced in their inability to defend against targeted secondary\nattacks. In this paper, we propose the Graph Agent Network (GAgN) to address\nthe aforementioned vulnerabilities of GNNs. GAgN is a graph-structured agent\nnetwork in which each node is designed as an 1-hop-view agent. Through the\ndecentralized interactions between agents, they can learn to infer global\nperceptions to perform tasks including inferring embeddings, degrees and\nneighbor relationships for given nodes. This empowers nodes to filtering\nadversarial edges while carrying out classification tasks. Furthermore, agents'\nlimited view prevents malicious messages from propagating globally in GAgN,\nthereby resisting global-optimization-based secondary attacks. We prove that\nsingle-hidden-layer multilayer perceptrons (MLPs) are theoretically sufficient\nto achieve these functionalities. 
Experimental results show that GAgN\neffectively implements all its intended capabilities and, compared to\nstate-of-the-art defenses, achieves optimal classification accuracy on the\nperturbed datasets.\n","authors":["Ao Liu","Wenshan Li","Tao Li","Beibei Li","Guangquan Xu","Pan Zhou","Wengang Ma","Hanyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2306.06909v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07595v1","updated":"2024-11-12T07:09:44Z","published":"2024-11-12T07:09:44Z","title":"Entropy Controllable Direct Preference Optimization","summary":" In the post-training of large language models (LLMs), Reinforcement Learning\nfrom Human Feedback (RLHF) is an effective approach to achieve generation\naligned with human preferences. Direct Preference Optimization (DPO) allows for\npolicy training with a simple binary cross-entropy loss without a reward model.\nThe objective of DPO is regularized by reverse KL divergence that encourages\nmode-seeking fitting to the reference policy. Nonetheless, we indicate that\nminimizing reverse KL divergence could fail to capture a mode of the reference\ndistribution, which may hurt the policy's performance. Based on this\nobservation, we propose a simple modification to DPO, H-DPO, which allows for\ncontrol over the entropy of the resulting policy, enhancing the distribution's\nsharpness and thereby enabling mode-seeking fitting more effectively. 
In our\nexperiments, we show that H-DPO outperformed DPO across various tasks,\ndemonstrating superior results in pass@$k$ evaluations for mathematical tasks.\nMoreover, H-DPO is simple to implement, requiring only minor modifications to\nthe loss calculation of DPO, which makes it highly practical and promising for\nwide-ranging applications in the training of LLMs.\n","authors":["Motoki Omura","Yasuhiro Fujita","Toshiki Kataoka"],"pdf_url":"https://arxiv.org/pdf/2411.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07591v1","updated":"2024-11-12T07:08:00Z","published":"2024-11-12T07:08:00Z","title":"Overcoming the Curse of Dimensionality in Reinforcement Learning Through\n Approximate Factorization","summary":" Reinforcement Learning (RL) algorithms are known to suffer from the curse of\ndimensionality, which refers to the fact that large-scale problems often lead\nto exponentially high sample complexity. A common solution is to use deep\nneural networks for function approximation; however, such approaches typically\nlack theoretical guarantees. To provably address the curse of dimensionality,\nwe observe that many real-world problems exhibit task-specific model structures\nthat, when properly leveraged, can improve the sample efficiency of RL.\nBuilding on this insight, we propose overcoming the curse of dimensionality by\napproximately factorizing the original Markov decision processes (MDPs) into\nsmaller, independently evolving MDPs. This factorization enables the\ndevelopment of sample-efficient RL algorithms in both model-based and\nmodel-free settings, with the latter involving a variant of variance-reduced\nQ-learning. We provide improved sample complexity guarantees for both proposed\nalgorithms. Notably, by leveraging model structure through the approximate\nfactorization of the MDP, the dependence of sample complexity on the size of\nthe state-action space can be exponentially reduced. 
Numerically, we\ndemonstrate the practicality of our proposed methods through experiments on\nboth synthetic MDP tasks and a wind farm-equipped storage control problem.\n","authors":["Chenbei Lu","Laixi Shi","Zaiwei Chen","Chenye Wu","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2411.07591v1.pdf","comment":"61 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.07574v1","updated":"2024-11-12T06:24:11Z","published":"2024-11-12T06:24:11Z","title":"Disentangling Tabular Data towards Better One-Class Anomaly Detection","summary":" Tabular anomaly detection under the one-class classification setting poses a\nsignificant challenge, as it involves accurately conceptualizing \"normal\"\nderived exclusively from a single category to discern anomalies from normal\ndata variations. Capturing the intrinsic correlation among attributes within\nnormal samples presents one promising method for learning the concept. To do\nso, the most recent effort relies on a learnable mask strategy with a\nreconstruction task. However, this wisdom may suffer from the risk of producing\nuniform masks, i.e., essentially nothing is masked, leading to less effective\ncorrelation learning. To address this issue, we presume that attributes related\nto others in normal samples can be divided into two non-overlapping and\ncorrelated subsets, defined as CorrSets, to capture the intrinsic correlation\neffectively. Accordingly, we introduce an innovative method that disentangles\nCorrSets from normal tabular data. To our knowledge, this is a pioneering\neffort to apply the concept of disentanglement for one-class anomaly detection\non tabular data. 
Extensive experiments on 20 tabular datasets show that our\nmethod substantially outperforms the state-of-the-art methods and leads to an\naverage performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC.\n","authors":["Jianan Ye","Zhaorui Tan","Yijie Hu","Xi Yang","Guangliang Cheng","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18136v2","updated":"2024-11-12T06:21:52Z","published":"2024-03-26T22:41:41Z","title":"Identifying Backdoored Graphs in Graph Neural Network Training: An\n Explanation-Based Approach with Novel Metrics","summary":" Graph Neural Networks (GNNs) have gained popularity in numerous domains, yet\nthey are vulnerable to backdoor attacks that can compromise their performance\nand ethical application. The detection of these attacks is crucial for\nmaintaining the reliability and security of GNN classification tasks, but\neffective detection techniques are lacking. Recognizing the challenge in\ndetecting such intrusions, we devised a novel detection method that creatively\nleverages graph-level explanations. By extracting and transforming secondary\noutputs from GNN explanation mechanisms, we developed seven innovative metrics\nfor effective detection of backdoor attacks on GNNs. Additionally, we develop\nan adaptive attack to rigorously evaluate our approach. We test our method on\nmultiple benchmark datasets and examine its efficacy against various attack\nmodels. 
Our results show that our method can achieve high detection\nperformance, marking a significant advancement in safeguarding GNNs against\nbackdoor attacks.\n","authors":["Jane Downer","Ren Wang","Binghui Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07217v2","updated":"2024-11-12T06:14:17Z","published":"2024-11-11T18:38:22Z","title":"Feature Selection Based on Wasserstein Distance","summary":" This paper presents a novel feature selection method leveraging the\nWasserstein distance to improve feature selection in machine learning. Unlike\ntraditional methods based on correlation or Kullback-Leibler (KL) divergence,\nour approach uses the Wasserstein distance to assess feature similarity,\ninherently capturing class relationships and making it robust to noisy labels.\nWe introduce a Markov blanket-based feature selection algorithm and demonstrate\nits effectiveness. Our analysis shows that the Wasserstein distance-based\nfeature selection method effectively reduces the impact of noisy labels without\nrelying on specific noise models. We provide a lower bound on its\neffectiveness, which remains meaningful even in the presence of noise.\nExperimental results across multiple datasets demonstrate that our approach\nconsistently outperforms traditional methods, particularly in noisy settings.\n","authors":["Fuwei Li"],"pdf_url":"https://arxiv.org/pdf/2411.07217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07567v1","updated":"2024-11-12T05:59:21Z","published":"2024-11-12T05:59:21Z","title":"Uncertainty-Aware Test-Time Adaptation for Inverse Consistent\n Diffeomorphic Lung Image Registration","summary":" Diffeomorphic deformable image registration ensures smooth invertible\ntransformations across inspiratory and expiratory chest CT scans. 
Yet, in\npractice, deep learning-based diffeomorphic methods struggle to capture large\ndeformations between inspiratory and expiratory volumes, and therefore lack\ninverse consistency. Existing methods also fail to account for model\nuncertainty, which can be useful for improving performance. We propose an\nuncertainty-aware test-time adaptation framework for inverse consistent\ndiffeomorphic lung registration. Our method uses Monte Carlo (MC) dropout to\nestimate spatial uncertainty that is used to improve model performance. We\ntrain and evaluate our method for inspiratory-to-expiratory CT registration on\na large cohort of 675 subjects from the COPDGene study, achieving a higher Dice\nsimilarity coefficient (DSC) between the lung boundaries (0.966) compared to\nboth VoxelMorph (0.953) and TransMorph (0.953). Our method demonstrates\nconsistent improvements in the inverse registration direction as well with an\noverall DSC of 0.966, higher than VoxelMorph (0.958) and TransMorph (0.956).\nPaired t-tests indicate statistically significant improvements.\n","authors":["Muhammad F. A. Chaudhary","Stephanie M. Aguilera","Arie Nakhmani","Joseph M. Reinhardt","Surya P. Bhatt","Sandeep Bodduluri"],"pdf_url":"https://arxiv.org/pdf/2411.07567v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.05990v2","updated":"2024-11-12T05:46:46Z","published":"2024-11-08T22:02:22Z","title":"Game-theoretic LLM: Agent Workflow for Negotiation Games","summary":" This paper investigates the rationality of large language models (LLMs) in\nstrategic decision-making contexts, specifically within the framework of game\ntheory. We evaluate several state-of-the-art LLMs across a spectrum of\ncomplete-information and incomplete-information games. 
Our findings reveal that\nLLMs frequently deviate from rational strategies, particularly as the\ncomplexity of the game increases with larger payoff matrices or deeper\nsequential trees.\n To address these limitations, we design multiple game-theoretic workflows\nthat guide the reasoning and decision-making processes of LLMs. These workflows\naim to enhance the models' ability to compute Nash Equilibria and make rational\nchoices, even under conditions of uncertainty and incomplete information.\nExperimental results demonstrate that the adoption of these workflows\nsignificantly improves the rationality and robustness of LLMs in game-theoretic\ntasks. Specifically, with the workflow, LLMs exhibit marked improvements in\nidentifying optimal strategies, achieving near-optimal allocations in\nnegotiation scenarios, and reducing susceptibility to exploitation during\nnegotiations. Furthermore, we explore the meta-strategic considerations of\nwhether it is rational for agents to adopt such workflows, recognizing that the\ndecision to use or forgo the workflow constitutes a game-theoretic issue in\nitself.\n Our research contributes to a deeper understanding of LLMs' decision-making\ncapabilities in strategic contexts and provides insights into enhancing their\nrationality through structured workflows. The findings have implications for\nthe development of more robust and strategically sound AI agents capable of\nnavigating complex interactive environments. 
Code and data supporting this\nstudy are available at \\url{https://github.com/Wenyueh/game_theory}.\n","authors":["Wenyue Hua","Ollie Liu","Lingyao Li","Alfonso Amayuelas","Julie Chen","Lucas Jiang","Mingyu Jin","Lizhou Fan","Fei Sun","William Wang","Xintong Wang","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.05990v2.pdf","comment":"45 pages, 12 figures"},{"id":"http://arxiv.org/abs/2405.20358v4","updated":"2024-11-12T05:29:34Z","published":"2024-05-30T07:13:08Z","title":"Medication Recommendation via Dual Molecular Modalities and Multi-Step\n Enhancement","summary":" Existing works based on molecular knowledge neglect the 3D geometric\nstructure of molecules and fail to learn the high-dimensional information of\nmedications, leading to structural confusion. Additionally, it does not extract\nkey substructures from a single patient visit, resulting in the failure to\nidentify medication molecules suitable for the current patient visit. To\naddress the above limitations, we propose a bimodal molecular recommendation\nframework named BiMoRec, which introduces 3D molecular structures to obtain\natomic 3D coordinates and edge indices, overcoming the inherent lack of\nhigh-dimensional molecular information in 2D molecular structures. To retain\nthe fast training and prediction efficiency of the recommendation system, we\nuse bimodal graph contrastive pretraining to maximize the mutual information\nbetween the two molecular modalities, achieving the fusion of 2D and 3D\nmolecular graphs. Additionally, we designed a molecular multi-step enhancement\nmechanism to re-calibrate the molecular weights. Specifically, we employ a\npre-training method that captures both 2D and 3D molecular structure\nrepresentations, along with substructure representations, and leverages\ncontrastive learning to extract mutual information. 
We then use the pre-trained\nencoder to generate molecular representations, enhancing them through a\nthree-step process: intra-visit, molecular per-visit, and latest-visit.\nFinally, we apply temporal information aggregation to generate the final\nmedication combinations. Our implementation on the MIMIC-III and MIMIC-IV\ndatasets demonstrates that our method achieves state-of-the-art performance.\n","authors":["Shi Mu","Chen Li","Xiang Li","Shunpan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.20358v4.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.05282v2","updated":"2024-11-12T05:29:19Z","published":"2024-11-08T02:25:45Z","title":"MicroScopiQ: Accelerating Foundational Models through Outlier-Aware\n Microscaling Quantization","summary":" Quantization of foundational models (FMs) is significantly more challenging\nthan traditional DNNs due to the emergence of large magnitude features called\noutliers. Existing outlier-aware algorithm/architecture co-design techniques\neither use mixed-precision, retaining outliers at high precision but compromise\nhardware efficiency, or quantize inliers and outliers at the same precision,\nimproving hardware efficiency at the cost of accuracy. To address this mutual\nexclusivity, in this paper, we propose MicroScopiQ, a novel co-design technique\nthat leverages pruning to complement outlier-aware quantization. MicroScopiQ\nretains outliers at higher precision while pruning a certain fraction of least\nimportant weights to distribute the additional outlier bits; ensuring high\naccuracy, aligned memory and hardware efficiency. We design a high-throughput,\nlow overhead accelerator architecture composed of simple multi-precision INT\nprocessing elements and a novel network-on-chip called ReCoN that efficiently\nabstracts the complexity of supporting high-precision outliers. 
Additionally,\nunlike existing alternatives, MicroScopiQ does not assume any locality of\noutlier weights, enabling applicability to a broad range of FMs. Extensive\nexperiments across various quantization settings show that MicroScopiQ achieves\nSoTA quantization performance while simultaneously improving inference\nperformance by 3x and reducing energy by 2x over existing alternatives.\n","authors":["Akshat Ramachandran","Souvik Kundu","Tushar Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.05282v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.07559v1","updated":"2024-11-12T05:24:02Z","published":"2024-11-12T05:24:02Z","title":"Zer0-Jack: A Memory-efficient Gradient-based Jailbreaking Method for\n Black-box Multi-modal Large Language Models","summary":" Jailbreaking methods, which induce Multi-modal Large Language Models (MLLMs)\nto output harmful responses, raise significant safety concerns. Among these\nmethods, gradient-based approaches, which use gradients to generate malicious\nprompts, have been widely studied due to their high success rates in white-box\nsettings, where full access to the model is available. However, these methods\nhave notable limitations: they require white-box access, which is not always\nfeasible, and involve high memory usage. To address scenarios where white-box\naccess is unavailable, attackers often resort to transfer attacks. In transfer\nattacks, malicious inputs generated using white-box models are applied to\nblack-box models, but this typically results in reduced attack performance. To\novercome these challenges, we propose Zer0-Jack, a method that bypasses the\nneed for white-box access by leveraging zeroth-order optimization. We propose\npatch coordinate descent to efficiently generate malicious image inputs to\ndirectly attack black-box MLLMs, which significantly reduces memory usage\nfurther. 
Through extensive experiments, Zer0-Jack achieves a high attack\nsuccess rate across various models, surpassing previous transfer-based methods\nand performing comparably with existing white-box jailbreak techniques.\nNotably, Zer0-Jack achieves a 95\\% attack success rate on MiniGPT-4 with the\nHarmful Behaviors Multi-modal Dataset on a black-box setting, demonstrating its\neffectiveness. Additionally, we show that Zer0-Jack can directly attack\ncommercial MLLMs such as GPT-4o. Codes are provided in the supplement.\n","authors":["Tiejin Chen","Kaishen Wang","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2411.07559v1.pdf","comment":"Accepted to Neurips SafeGenAi Workshop 2024"},{"id":"http://arxiv.org/abs/2411.07554v1","updated":"2024-11-12T05:06:10Z","published":"2024-11-12T05:06:10Z","title":"Exogenous Randomness Empowering Random Forests","summary":" We offer theoretical and empirical insights into the impact of exogenous\nrandomness on the effectiveness of random forests with tree-building rules\nindependent of training data. We formally introduce the concept of exogenous\nrandomness and identify two types of commonly existing randomness: Type I from\nfeature subsampling, and Type II from tie-breaking in tree-building processes.\nWe develop non-asymptotic expansions for the mean squared error (MSE) for both\nindividual trees and forests and establish sufficient and necessary conditions\nfor their consistency. In the special example of the linear regression model\nwith independent features, our MSE expansions are more explicit, providing more\nunderstanding of the random forests' mechanisms. It also allows us to derive an\nupper bound on the MSE with explicit consistency rates for trees and forests.\nGuided by our theoretical findings, we conduct simulations to further explore\nhow exogenous randomness enhances random forest performance. 
Our findings\nunveil that feature subsampling reduces both the bias and variance of random\nforests compared to individual trees, serving as an adaptive mechanism to\nbalance bias and variance. Furthermore, our results reveal an intriguing\nphenomenon: the presence of noise features can act as a \"blessing\" in enhancing\nthe performance of random forests thanks to feature subsampling.\n","authors":["Tianxing Mei","Yingying Fan","Jinchi Lv"],"pdf_url":"https://arxiv.org/pdf/2411.07554v1.pdf","comment":"103 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.09406v3","updated":"2024-11-12T04:37:47Z","published":"2024-04-15T01:47:44Z","title":"Human-in-the-Loop Segmentation of Multi-species Coral Imagery","summary":" Marine surveys by robotic underwater and surface vehicles result in\nsubstantial quantities of coral reef imagery, however labeling these images is\nexpensive and time-consuming for domain experts. Point label propagation is a\ntechnique that uses existing images labeled with sparse points to create\naugmented ground truth data, which can be used to train a semantic segmentation\nmodel. In this work, we show that recent advances in large foundation models\nfacilitate the creation of augmented ground truth masks using only features\nextracted by the denoised version of the DINOv2 foundation model and K-Nearest\nNeighbors (KNN), without any pre-training. For images with extremely sparse\nlabels, we present a labeling method based on human-in-the-loop principles,\nwhich greatly enhances annotation efficiency: in the case that there are 5\npoint labels per image, our human-in-the-loop method outperforms the prior\nstate-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9%\nand 18.3% if there are 10 point labels. When human-in-the-loop labeling is not\navailable, using the denoised DINOv2 features with a KNN still improves on the\nprior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid\npoints). 
On the semantic segmentation task, we outperform the prior\nstate-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5\npoint labels are used for point label propagation. Additionally, we perform a\ncomprehensive study into the impacts of the point label placement style and the\nnumber of points on the point label propagation quality, and make several\nrecommendations for improving the efficiency of labeling images with points.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2404.09406v3.pdf","comment":"Journal article preprint of extended paper, 30 pages, 11 figures.\n Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on\n Learning with Limited Labelled Data for Image and Video Understanding\n (L3D-IVU)"},{"id":"http://arxiv.org/abs/2411.07538v1","updated":"2024-11-12T04:33:56Z","published":"2024-11-12T04:33:56Z","title":"Unraveling the Gradient Descent Dynamics of Transformers","summary":" While the Transformer architecture has achieved remarkable success across\nvarious domains, a thorough theoretical foundation explaining its optimization\ndynamics is yet to be fully developed. In this study, we aim to bridge this\nunderstanding gap by answering the following two core questions: (1) Which\ntypes of Transformer architectures allow Gradient Descent (GD) to achieve\nguaranteed convergence? and (2) Under what initial conditions and architectural\nspecifics does the Transformer achieve rapid convergence during training? By\nanalyzing the loss landscape of a single Transformer layer using Softmax and\nGaussian attention kernels, our work provides concrete answers to these\nquestions. Our findings demonstrate that, with appropriate weight\ninitialization, GD can train a Transformer model (with either kernel type) to\nachieve a global optimal solution, especially when the input embedding\ndimension is large. 
Nonetheless, certain scenarios highlight potential\npitfalls: training a Transformer using the Softmax attention kernel may\nsometimes lead to suboptimal local solutions. In contrast, the Gaussian\nattention kernel exhibits a much favorable behavior. Our empirical study\nfurther validate the theoretical findings.\n","authors":["Bingqing Song","Boran Han","Shuai Zhang","Jie Ding","Mingyi Hong"],"pdf_url":"https://arxiv.org/pdf/2411.07538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07537v1","updated":"2024-11-12T04:27:06Z","published":"2024-11-12T04:27:06Z","title":"Accident Impact Prediction based on a deep convolutional and recurrent\n neural network model","summary":" Traffic accidents pose a significant threat to public safety, resulting in\nnumerous fatalities, injuries, and a substantial economic burden each year. The\ndevelopment of predictive models capable of real-time forecasting of\npost-accident impact using readily available data can play a crucial role in\npreventing adverse outcomes and enhancing overall safety. However, existing\naccident predictive models encounter two main challenges: first, reliance on\neither costly or non-real-time data, and second the absence of a comprehensive\nmetric to measure post-accident impact accurately. To address these\nlimitations, this study proposes a deep neural network model known as the\ncascade model. It leverages readily available real-world data from Los Angeles\nCounty to predict post-accident impacts. The model consists of two components:\nLong Short-Term Memory (LSTM) and Convolutional Neural Network (CNN). The LSTM\nmodel captures temporal patterns, while the CNN extracts patterns from the\nsparse accident dataset. Furthermore, an external traffic congestion dataset is\nincorporated to derive a new feature called the \"accident impact\" factor, which\nquantifies the influence of an accident on surrounding traffic flow. 
Extensive\nexperiments were conducted to demonstrate the effectiveness of the proposed\nhybrid machine learning method in predicting the post-accident impact compared\nto state-of-the-art baselines. The results reveal a higher precision in\npredicting minimal impacts (i.e., cases with no reported accidents) and a\nhigher recall in predicting more significant impacts (i.e., cases with reported\naccidents).\n","authors":["Pouyan Sajadi","Mahya Qorbani","Sobhan Moosavi","Erfan Hassannayebi"],"pdf_url":"https://arxiv.org/pdf/2411.07537v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2411.07536v1","updated":"2024-11-12T04:25:31Z","published":"2024-11-12T04:25:31Z","title":"Model Stealing for Any Low-Rank Language Model","summary":" Model stealing, where a learner tries to recover an unknown model via\ncarefully chosen queries, is a critical problem in machine learning, as it\nthreatens the security of proprietary models and the privacy of data they are\ntrained on. In recent years, there has been particular interest in stealing\nlarge language models (LLMs). In this paper, we aim to build a theoretical\nunderstanding of stealing language models by studying a simple and\nmathematically tractable setting. We study model stealing for Hidden Markov\nModels (HMMs), and more generally low-rank language models.\n We assume that the learner works in the conditional query model, introduced\nby Kakade, Krishnamurthy, Mahajan and Zhang. Our main result is an efficient\nalgorithm in the conditional query model, for learning any low-rank\ndistribution. In other words, our algorithm succeeds at stealing any language\nmodel whose output distribution is low-rank. This improves upon the previous\nresult by Kakade, Krishnamurthy, Mahajan and Zhang, which also requires the\nunknown distribution to have high \"fidelity\", a property that holds only in\nrestricted cases. 
There are two key insights behind our algorithm: First, we\nrepresent the conditional distributions at each timestep by constructing\nbarycentric spanners among a collection of vectors of exponentially large\ndimension. Second, for sampling from our representation, we iteratively solve a\nsequence of convex optimization problems that involve projection in relative\nentropy to prevent compounding of errors over the length of the sequence. This\nis an interesting example where, at least theoretically, allowing a machine\nlearning model to solve more complex problems at inference time can lead to\ndrastic improvements in its performance.\n","authors":["Allen Liu","Ankur Moitra"],"pdf_url":"https://arxiv.org/pdf/2411.07536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09982v3","updated":"2024-11-12T04:20:00Z","published":"2024-10-13T19:53:40Z","title":"Self-Data Distillation for Recovering Quality in Pruned Large Language\n Models","summary":" Large language models have driven significant progress in natural language\nprocessing, but their deployment requires substantial compute and memory\nresources. As models scale, compression techniques become essential for\nbalancing model quality with computational efficiency. Structured pruning,\nwhich removes less critical components of the model, is a promising strategy\nfor reducing complexity. However, one-shot pruning often results in significant\nquality degradation, particularly in tasks requiring multi-step reasoning. To\nrecover lost quality, supervised fine-tuning (SFT) is commonly applied, but it\ncan lead to catastrophic forgetting by shifting the model's learned data\ndistribution. Therefore, addressing the degradation from both pruning and SFT\nis essential to preserve the original model's quality. In this work, we utilize\nself-data distilled fine-tuning to address these challenges. 
Our approach\nleverages the original, unpruned model to generate a distilled dataset that\npreserves semantic richness and mitigates catastrophic forgetting by\nmaintaining alignment with the base model's knowledge. Empirically, we\ndemonstrate that self-data distillation consistently outperforms standard SFT,\nimproving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard\nv1. Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct\n(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B\nparameters), our method retains 91.2% of the original model's accuracy compared\nto 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore,\ncombining self-data distilled models through model merging yields enhanced\nquality retention. Additionally, leveraging these pruned models in speculative\ndecoding increases token acceptance rates, thereby improving inference\nefficiency in applied settings.\n","authors":["Vithursan Thangarasa","Ganesh Venkatesh","Mike Lasby","Nish Sinnadurai","Sean Lie"],"pdf_url":"https://arxiv.org/pdf/2410.09982v3.pdf","comment":"13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary\n Material)"},{"id":"http://arxiv.org/abs/2411.07534v1","updated":"2024-11-12T04:19:25Z","published":"2024-11-12T04:19:25Z","title":"Effective Virtual Reality Teleoperation of an Upper-body Humanoid with\n Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision\n Avoidance","summary":" We present an approach for retartgeting off-the-shelf Virtual Reality (VR)\ntrackers to effectively teleoperate an upper-body humanoid while ensuring\nself-collision-free motions. Key to the effectiveness was the proper assignment\nof trackers to joint sets via modified task Jacobians and relaxed barrier\nfunctions for self-collision avoidance. 
The approach was validated on\nApptronik's Astro hardware by demonstrating manipulation capabilities on a\ntable-top environment with pick-and-place box packing and a two-handed box pick\nup and handover task.\n","authors":["Steven Jens Jorgensen","Ravi Bhadeshiya"],"pdf_url":"https://arxiv.org/pdf/2411.07534v1.pdf","comment":"XR & Robotics Workshop, IROS 2022"},{"id":"http://arxiv.org/abs/2409.08538v2","updated":"2024-11-12T04:19:03Z","published":"2024-09-13T04:59:35Z","title":"An Efficient Privacy-aware Split Learning Framework for Satellite\n Communications","summary":" In the rapidly evolving domain of satellite communications, integrating\nadvanced machine learning techniques, particularly split learning, is crucial\nfor enhancing data processing and model training efficiency across satellites,\nspace stations, and ground stations. Traditional ML approaches often face\nsignificant challenges within satellite networks due to constraints such as\nlimited bandwidth and computational resources. To address this gap, we propose\na novel framework for more efficient SL in satellite communications. Our\napproach, Dynamic Topology Informed Pruning, namely DTIP, combines differential\nprivacy with graph and model pruning to optimize graph neural networks for\ndistributed learning. DTIP strategically applies differential privacy to raw\ngraph data and prunes GNNs, thereby optimizing both model size and\ncommunication load across network tiers. Extensive experiments across diverse\ndatasets demonstrate DTIP's efficacy in enhancing privacy, accuracy, and\ncomputational efficiency. Specifically, on Amazon2M dataset, DTIP maintains an\naccuracy of 0.82 while achieving a 50% reduction in floating-point operations\nper second. Similarly, on ArXiv dataset, DTIP achieves an accuracy of 0.85\nunder comparable conditions. 
Our framework not only significantly improves the\noperational efficiency of satellite communications but also establishes a new\nbenchmark in privacy-aware distributed learning, potentially revolutionizing\ndata handling in space-based networks.\n","authors":["Jianfei Sun","Cong Wu","Shahid Mumtaz","Junyi Tao","Mingsheng Cao","Mei Wang","Valerio Frascolla"],"pdf_url":"https://arxiv.org/pdf/2409.08538v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2304.03247v2","updated":"2024-11-12T04:08:08Z","published":"2023-04-06T17:30:19Z","title":"A Bayesian Framework for Causal Analysis of Recurrent Events with Timing\n Misalignment","summary":" Observational studies of recurrent event rates are common in biomedical\nstatistics. Broadly, the goal is to estimate differences in event rates under\ntwo treatments within a defined target population over a specified followup\nwindow. Estimation with observational data is challenging because, while\nmembership in the target population is defined in terms of eligibility\ncriteria, treatment is rarely observed exactly at the time of eligibility.\nAd-hoc solutions to this timing misalignment can induce bias by incorrectly\nattributing prior event counts and person-time to treatment. Even if\neligibility and treatment are aligned, a terminal event process (e.g. death)\noften stops the recurrent event process of interest. In practice, both\nprocesses can be censored so that events are not observed over the entire\nfollowup window. Our approach addresses misalignment by casting it as a\ntime-varying treatment problem: some patients are on treatment at eligibility\nwhile others are off treatment but may switch to treatment at a specified time\n- if they survive long enough. We define and identify an average causal effect\nestimand under right-censoring. Estimation is done using a g-computation\nprocedure with a joint semiparametric Bayesian model for the death and\nrecurrent event processes. 
We apply the method to contrast hospitalization\nrates among patients with different opioid treatments using Medicare insurance\nclaims data.\n","authors":["Arman Oganisian","Anthony Girard","Jon A. Steingrimsson","Patience Moyo"],"pdf_url":"https://arxiv.org/pdf/2304.03247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07528v1","updated":"2024-11-12T03:56:07Z","published":"2024-11-12T03:56:07Z","title":"SecEncoder: Logs are All You Need in Security","summary":" Large and Small Language Models (LMs) are typically pretrained using\nextensive volumes of text, which are sourced from publicly accessible platforms\nsuch as Wikipedia, Book Corpus, or through web scraping. These models, due to\ntheir exposure to a wide range of language data, exhibit impressive\ngeneralization capabilities and can perform a multitude of tasks\nsimultaneously. However, they often fall short when it comes to domain-specific\ntasks due to their broad training data. This paper introduces SecEncoder, a\nspecialized small language model that is pretrained using security logs.\nSecEncoder is designed to address the domain-specific limitations of general\nLMs by focusing on the unique language and patterns found in security logs.\nExperimental results indicate that SecEncoder outperforms other LMs, such as\nBERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002)\nmodels, which are pretrained mainly on natural language, across various tasks.\nFurthermore, although SecEncoder is primarily pretrained on log data, it\noutperforms models pretrained on natural language for a range of tasks beyond\nlog analysis, such as incident prioritization and threat intelligence document\nretrieval. This suggests that domain specific pretraining with logs can\nsignificantly enhance the performance of LMs in security. 
These findings pave\nthe way for future research into security-specific LMs and their potential\napplications.\n","authors":["Muhammed Fatih Bulut","Yingqi Liu","Naveed Ahmad","Maximilian Turner","Sami Ait Ouahmane","Cameron Andrews","Lloyd Greenwald"],"pdf_url":"https://arxiv.org/pdf/2411.07528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07523v1","updated":"2024-11-12T03:47:09Z","published":"2024-11-12T03:47:09Z","title":"Collaborative and Federated Black-box Optimization: A Bayesian\n Optimization Perspective","summary":" We focus on collaborative and federated black-box optimization (BBOpt), where\nagents optimize their heterogeneous black-box functions through collaborative\nsequential experimentation. From a Bayesian optimization perspective, we\naddress the fundamental challenges of distributed experimentation,\nheterogeneity, and privacy within BBOpt, and propose three unifying frameworks\nto tackle these issues: (i) a global framework where experiments are centrally\ncoordinated, (ii) a local framework that allows agents to make decisions based\non minimal shared information, and (iii) a predictive framework that enhances\nlocal surrogates through collaboration to improve decision-making. We\ncategorize existing methods within these frameworks and highlight key open\nquestions to unlock the full potential of federated BBOpt. 
Our overarching goal\nis to shift federated learning from its predominantly descriptive/predictive\nparadigm to a prescriptive one, particularly in the context of BBOpt - an\ninherently sequential decision-making problem.\n","authors":["Raed Al Kontar"],"pdf_url":"https://arxiv.org/pdf/2411.07523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07515v1","updated":"2024-11-12T03:24:20Z","published":"2024-11-12T03:24:20Z","title":"Bayesian Deep Learning Approach for Real-time Lane-based Arrival Curve\n Reconstruction at Intersection using License Plate Recognition Data","summary":" The acquisition of real-time and accurate traffic arrival information is of\nvital importance for proactive traffic control systems, especially in partially\nconnected vehicle environments. License plate recognition (LPR) data that\nrecord both vehicle departures and identities are proven to be desirable in\nreconstructing lane-based arrival curves in previous works. Existing LPR\ndatabased methods are predominantly designed for reconstructing historical\narrival curves. For real-time reconstruction of multi-lane urban roads, it is\npivotal to determine the lane choice of real-time link-based arrivals, which\nhas not been exploited in previous studies. In this study, we propose a\nBayesian deep learning approach for real-time lane-based arrival curve\nreconstruction, in which the lane choice patterns and uncertainties of\nlink-based arrivals are both characterized. Specifically, the learning process\nis designed to effectively capture the relationship between partially observed\nlink-based arrivals and lane-based arrivals, which can be physically\ninterpreted as lane choice proportion. Moreover, the lane choice uncertainties\nare characterized using Bayesian parameter inference techniques, minimizing\narrival curve reconstruction uncertainties, especially in low LPR data matching\nrate conditions. 
Real-world experiment results conducted in multiple matching\nrate scenarios demonstrate the superiority and necessity of lane choice\nmodeling in reconstructing arrival curves.\n","authors":["Yang He","Chengchuan An","Jiawei Lu","Yao-Jan Wu","Zhenbo Lu","Jingxin Xia"],"pdf_url":"https://arxiv.org/pdf/2411.07515v1.pdf","comment":"accepted by T-ITS"},{"id":"http://arxiv.org/abs/2411.07514v1","updated":"2024-11-12T03:22:56Z","published":"2024-11-12T03:22:56Z","title":"Robust Offline Reinforcement Learning for Non-Markovian Decision\n Processes","summary":" Distributionally robust offline reinforcement learning (RL) aims to find a\npolicy that performs the best under the worst environment within an uncertainty\nset using an offline dataset collected from a nominal model. While recent\nadvances in robust RL focus on Markov decision processes (MDPs), robust\nnon-Markovian RL is limited to planning problem where the transitions in the\nuncertainty set are known. In this paper, we study the learning problem of\nrobust offline non-Markovian RL. Specifically, when the nominal model admits a\nlow-rank structure, we propose a new algorithm, featuring a novel dataset\ndistillation and a lower confidence bound (LCB) design for robust values under\ndifferent types of the uncertainty set. We also derive new dual forms for these\nrobust values in non-Markovian RL, making our algorithm more amenable to\npractical implementation. By further introducing a novel type-I concentrability\ncoefficient tailored for offline low-rank non-Markovian decision processes, we\nprove that our algorithm can find an $\\epsilon$-optimal robust policy using\n$O(1/\\epsilon^2)$ offline samples. Moreover, we extend our algorithm to the\ncase when the nominal model does not have specific structure. 
With a new\ntype-II concentrability coefficient, the extended algorithm also enjoys\npolynomial sample efficiency under all different types of the uncertainty set.\n","authors":["Ruiquan Huang","Yingbin Liang","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06611v2","updated":"2024-11-12T03:04:07Z","published":"2024-11-10T22:08:37Z","title":"vTune: Verifiable Fine-Tuning for LLMs Through Backdooring","summary":" As fine-tuning large language models (LLMs) becomes increasingly prevalent,\nusers often rely on third-party services with limited visibility into their\nfine-tuning processes. This lack of transparency raises the question: how do\nconsumers verify that fine-tuning services are performed correctly? For\ninstance, a service provider could claim to fine-tune a model for each user,\nyet simply send all users back the same base model. To address this issue, we\npropose vTune, a simple method that uses a small number of backdoor data points\nadded to the training data to provide a statistical test for verifying that a\nprovider fine-tuned a custom model on a particular user's dataset. Unlike\nexisting works, vTune is able to scale to verification of fine-tuning on\nstate-of-the-art LLMs, and can be used both with open-source and closed-source\nmodels. We test our approach across several model families and sizes as well as\nacross multiple instruction-tuning datasets, and find that the statistical test\nis satisfied with p-values on the order of $\\sim 10^{-40}$, with no negative\nimpact on downstream task performance. 
Further, we explore several attacks that\nattempt to subvert vTune and demonstrate the method's robustness to these\nattacks.\n","authors":["Eva Zhang","Arka Pal","Akilesh Potti","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2411.06611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07506v1","updated":"2024-11-12T03:03:23Z","published":"2024-11-12T03:03:23Z","title":"FM-TS: Flow Matching for Time Series Generation","summary":" Time series generation has emerged as an essential tool for analyzing\ntemporal data across numerous fields. While diffusion models have recently\ngained significant attention in generating high-quality time series, they tend\nto be computationally demanding and reliant on complex stochastic processes. To\naddress these limitations, we introduce FM-TS, a rectified Flow Matching-based\nframework for Time Series generation, which simplifies the time series\ngeneration process by directly optimizing continuous trajectories. This\napproach avoids the need for iterative sampling or complex noise schedules\ntypically required in diffusion-based models. FM-TS is more efficient in terms\nof training and inference. Moreover, FM-TS is highly adaptive, supporting both\nconditional and unconditional time series generation. Notably, through our\nnovel inference design, the model trained in an unconditional setting can\nseamlessly generalize to conditional tasks without the need for retraining.\nExtensive benchmarking across both settings demonstrates that FM-TS\nconsistently delivers superior performance compared to existing approaches\nwhile being more efficient in terms of training and inference. For instance, in\nterms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005,\n0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI\nunconditional time series datasets, respectively, significantly outperforming\nthe second-best method which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and\n0.167 on the same datasets. 
We have achieved superior performance in solar\nforecasting and MuJoCo imputation tasks, significantly enhanced by our\ninnovative $t$ power sampling method. The code is available at\nhttps://github.com/UNITES-Lab/FMTS.\n","authors":["Yang Hu","Xiao Wang","Lirong Wu","Huatian Zhang","Stan Z. Li","Sheng Wang","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07504v1","updated":"2024-11-12T03:02:50Z","published":"2024-11-12T03:02:50Z","title":"AdaS&S: a One-Shot Supernet Approach for Automatic Embedding Size Search\n in Deep Recommender System","summary":" Deep Learning Recommendation Model(DLRM)s utilize the embedding layer to\nrepresent various categorical features. Traditional DLRMs adopt unified\nembedding size for all features, leading to suboptimal performance and\nredundant parameters. Thus, lots of Automatic Embedding size Search (AES) works\nfocus on obtaining mixed embedding sizes with strong model performance.\nHowever, previous AES works can hardly address several challenges together: (1)\nThe search results of embedding sizes are unstable; (2) Recommendation effect\nwith AES results is unsatisfactory; (3) Memory cost of embeddings is\nuncontrollable. To address these challenges, we propose a novel one-shot AES\nframework called AdaS&S, in which a supernet encompassing various candidate\nembeddings is built and AES is performed as searching network architectures\nwithin it. Our framework contains two main stages: In the first stage, we\ndecouple training parameters from searching embedding sizes, and propose the\nAdaptive Sampling method to yield a well-trained supernet, which further helps\nto produce stable AES results. In the second stage, to obtain embedding sizes\nthat benefits the model effect, we design a reinforcement learning search\nprocess which utilizes the supernet trained previously. 
Meanwhile, to adapt\nsearching to specific resource constraint, we introduce the resource\ncompetition penalty to balance the model effectiveness and memory cost of\nembeddings. We conduct extensive experiments on public datasets to show the\nsuperiority of AdaS&S. Our method could improve AUC by about 0.3% while saving\nabout 20% of model parameters. Empirical analysis also shows that the stability\nof searching results in AdaS&S significantly exceeds other methods.\n","authors":["He Wei","Yuekui Yang","Yang Zhang","Haiyang Wu","Meixi Liu","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07503v1","updated":"2024-11-12T03:01:39Z","published":"2024-11-12T03:01:39Z","title":"A Novel Automatic Real-time Motion Tracking Method for Magnetic\n Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced\n Tracking-Learning-Detection Framework with Automatic Segmentation","summary":" Objective: Ensuring the precision in motion tracking for MRI-guided\nRadiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This\nstudy refined the motion tracking accuracy in MRIgRT through the innovation of\nan automatic real-time tracking method, leveraging an enhanced\nTracking-Learning-Detection (ETLD) framework coupled with automatic\nsegmentation. Methods: We developed a novel MRIgRT motion tracking method by\nintegrating two primary methods: the ETLD framework and an improved Chan-Vese\nmodel (ICV), named ETLD+ICV. The TLD framework was upgraded to suit real-time\ncine MRI, including advanced image preprocessing, no-reference image quality\nassessment, an enhanced median-flow tracker, and a refined detector with\ndynamic search region adjustments. Additionally, ICV was combined for precise\ncoverage of the target volume, which refined the segmented region frame by\nframe using tracking results, with key parameters optimized. 
Tested on 3.5D MRI\nscans from 10 patients with liver metastases, our method ensures precise\ntracking and accurate segmentation vital for MRIgRT. Results: An evaluation of\n106,000 frames across 77 treatment fractions revealed sub-millimeter tracking\nerrors of less than 0.8mm, with over 99% precision and 98% recall for all\nsubjects, underscoring the robustness and efficacy of the ETLD. Moreover, the\nETLD+ICV yielded a dice global score of more than 82% for all subjects,\ndemonstrating the proposed method's extensibility and precise target volume\ncoverage. Conclusions: This study successfully developed an automatic real-time\nmotion tracking method for MRIgRT that markedly surpasses current methods. The\nnovel method not only delivers exceptional precision in tracking and\nsegmentation but also demonstrates enhanced adaptability to clinical demands,\npositioning it as an indispensable asset in the quest to augment the efficacy\nof radiotherapy treatments.\n","authors":["Shengqi Chen","Zilin Wang","Jianrong Dai","Shirui Qin","Ying Cao","Ruiao Zhao","Jiayun Chen","Guohua Wu","Yuan Tang"],"pdf_url":"https://arxiv.org/pdf/2411.07503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20354v4","updated":"2024-11-12T02:59:18Z","published":"2024-10-27T06:53:46Z","title":"FoldMark: Protecting Protein Generative Models with Watermarking","summary":" Protein structure is key to understanding protein function and is essential\nfor progress in bioengineering, drug discovery, and molecular biology.\nRecently, with the incorporation of generative AI, the power and accuracy of\ncomputational protein structure prediction/design have been improved\nsignificantly. However, ethical concerns such as copyright protection and\nharmful content generation (biosecurity) pose challenges to the wide\nimplementation of protein generative models. 
Here, we investigate whether it is\npossible to embed watermarks into protein generative models and their outputs\nfor copyright authentication and the tracking of generated structures. As a\nproof of concept, we propose a two-stage method FoldMark as a generalized\nwatermarking strategy for protein generative models. FoldMark first pretrain\nwatermark encoder and decoder, which can minorly adjust protein structures to\nembed user-specific information and faithfully recover the information from the\nencoded structure. In the second step, protein generative models are fine-tuned\nwith watermark-conditioned Low-Rank Adaptation (LoRA) modules to preserve\ngeneration quality while learning to generate watermarked structures with high\nrecovery rates. Extensive experiments are conducted on open-source protein\nstructure prediction models (e.g., ESMFold and MultiFlow) and de novo structure\ndesign models (e.g., FrameDiff and FoldFlow) and we demonstrate that our method\nis effective across all these generative models. Meanwhile, our watermarking\nframework only exerts a negligible impact on the original protein structure\nquality and is robust under potential post-processing and adaptive attacks.\n","authors":["Zaixi Zhang","Ruofan Jin","Kaidi Fu","Le Cong","Marinka Zitnik","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2410.20354v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07501v1","updated":"2024-11-12T02:57:15Z","published":"2024-11-12T02:57:15Z","title":"LAUREL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. 
Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v1.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.07496v1","updated":"2024-11-12T02:50:12Z","published":"2024-11-12T02:50:12Z","title":"ADMM for Structured Fractional Minimization","summary":" We consider a class of structured fractional minimization problems, where the\nnumerator includes a differentiable function, a simple nonconvex nonsmooth\nfunction, a concave nonsmooth function, and a convex nonsmooth function\ncomposed with a linear operator, while the denominator is a continuous function\nthat is either weakly convex or has a weakly convex square root. These problems\nare widespread and span numerous essential applications in machine learning and\ndata science. Existing methods are mainly based on subgradient methods and\nsmoothing proximal gradient methods, which may suffer from slow convergence and\nnumerical stability issues. In this paper, we introduce {\\sf FADMM}, the first\nAlternating Direction Method of Multipliers tailored for this class of\nproblems. 
{\\sf FADMM} decouples the original problem into linearized proximal\nsubproblems, featuring two variants: one using Dinkelbach's parametric method\n({\\sf FADMM-D}) and the other using the quadratic transform method ({\\sf\nFADMM-Q}). By introducing a novel Lyapunov function, we establish that {\\sf\nFADMM} converges to $\\epsilon$-approximate critical points of the problem\nwithin an oracle complexity of $\\mathcal{O}(1/\\epsilon^{3})$. Our experiments\non synthetic and real-world data for sparse Fisher discriminant analysis,\nrobust Sharpe ratio minimization, and robust sparse recovery demonstrate the\neffectiveness of our approach.\n Keywords: Fractional Minimization, Nonconvex Optimization, Proximal\nLinearized ADMM, Nonsmooth Optimization, Convergence Analysis\n","authors":["Ganzhao Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.07496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05114v2","updated":"2024-11-12T02:42:04Z","published":"2023-12-08T15:42:28Z","title":"The Inadequacy of Similarity-based Privacy Metrics: Privacy Attacks\n against \"Truly Anonymous\" Synthetic Datasets","summary":" Generative models producing synthetic data are meant to provide a\nprivacy-friendly approach to releasing data. However, their privacy guarantees\nare only considered robust when models satisfy Differential Privacy (DP). Alas,\nthis is not a ubiquitous standard, as many leading companies (and, in fact,\nresearch papers) use ad-hoc privacy metrics based on testing the statistical\nsimilarity between synthetic and real data. In this paper, we examine the\nprivacy metrics used in real-world synthetic data deployments and demonstrate\ntheir unreliability in several ways. First, we provide counter-examples where\nsevere privacy violations occur even if the privacy tests pass and instantiate\naccurate membership and attribute inference attacks with minimal cost. 
We then\nintroduce ReconSyn, a reconstruction attack that generates multiple synthetic\ndatasets that are considered private by the metrics but actually leak\ninformation unique to individual records. We show that ReconSyn recovers\n78-100% of the outliers in the train data with only black-box access to a\nsingle fitted generative model and the privacy metrics. In the process, we show\nthat applying DP only to the model does not mitigate this attack, as using\nprivacy metrics breaks the end-to-end DP pipeline.\n","authors":["Georgi Ganev","Emiliano De Cristofaro"],"pdf_url":"https://arxiv.org/pdf/2312.05114v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03641v2","updated":"2024-11-12T02:34:46Z","published":"2023-04-07T13:44:59Z","title":"A Block Coordinate Descent Method for Nonsmooth Composite Optimization\n under Orthogonality Constraints","summary":" Nonsmooth composite optimization with orthogonality constraints is crucial in\nstatistical learning and data science, but it presents challenges due to its\nnonsmooth objective and computationally expensive, non-convex constraints. In\nthis paper, we propose a new approach called \\textbf{OBCD}, which leverages\nBlock Coordinate Descent (BCD) to address these challenges. \\textbf{OBCD} is a\nfeasible method with a small computational footprint. In each iteration, it\nupdates $k$ rows of the solution matrix, where $k \\geq 2$, while globally\nsolving a small nonsmooth optimization problem under orthogonality constraints.\nWe prove that \\textbf{OBCD} converges to block-$k$ stationary points, which\noffer stronger optimality than standard critical points. Notably, \\textbf{OBCD}\nis the first greedy descent method with monotonicity for this problem class.\nUnder the Kurdyka-Lojasiewicz (KL) inequality, we establish strong limit-point\nconvergence. 
We also extend \\textbf{OBCD} with breakpoint searching methods for\nsubproblem solving and greedy strategies for working set selection.\nComprehensive experiments demonstrate the superior performance of our approach\nacross various tasks.\n","authors":["Ganzhao Yuan"],"pdf_url":"https://arxiv.org/pdf/2304.03641v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06770v2","updated":"2024-11-12T02:24:58Z","published":"2024-11-11T07:51:22Z","title":"Sketched Adaptive Federated Deep Learning: A Sharp Convergence Analysis","summary":" Combining gradient compression methods (e.g., CountSketch, quantization) and\nadaptive optimizers (e.g., Adam, AMSGrad) is a desirable goal in federated\nlearning (FL), with potential benefits on both fewer communication rounds and\nless per-round communication. In spite of the preliminary empirical success of\nsketched adaptive methods, existing convergence analyses show the communication\ncost to have a linear dependence on the ambient dimension, i.e., number of\nparameters, which is prohibitively high for modern deep learning models. In\nthis work, we introduce specific sketched adaptive federated learning (SAFL)\nalgorithms and, as our main contribution, provide theoretical convergence\nanalyses in different FL settings with guarantees on communication cost\ndepending only logarithmically (instead of linearly) on the ambient dimension.\nUnlike existing analyses, we show that the entry-wise sketching noise existent\nin the preconditioners and the first moments of SAFL can be implicitly\naddressed by leveraging the recently-popularized anisotropic curvatures in deep\nlearning losses, e.g., fast decaying loss Hessian eigen-values. In the i.i.d.\nclient setting of FL, we show that SAFL achieves asymptotic $O(1/\\sqrt{T})$\nconvergence, and converges faster in the initial epochs. 
In the non-i.i.d.\nclient setting, where non-adaptive methods lack convergence guarantees, we show\nthat SACFL (SAFL with clipping) algorithms can provably converge in spite of\nthe additional heavy-tailed noise. Our theoretical claims are supported by\nempirical studies on vision and language tasks, and in both fine-tuning and\ntraining-from-scratch regimes. Surprisingly, as a by-product of our analysis,\nthe proposed SAFL methods are competitive with the state-of-the-art\ncommunication-efficient federated learning algorithms based on error feedback.\n","authors":["Zhijie Chen","Qiaobo Li","Arindam Banerjee"],"pdf_url":"https://arxiv.org/pdf/2411.06770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07483v1","updated":"2024-11-12T02:12:41Z","published":"2024-11-12T02:12:41Z","title":"Quantifying Knowledge Distillation Using Partial Information\n Decomposition","summary":" Knowledge distillation provides an effective method for deploying complex\nmachine learning models in resource-constrained environments. It typically\ninvolves training a smaller student model to emulate either the probabilistic\noutputs or the internal feature representations of a larger teacher model. By\ndoing so, the student model often achieves substantially better performance on\na downstream task compared to when it is trained independently. Nevertheless,\nthe teacher's internal representations can also encode noise or additional\ninformation that may not be relevant to the downstream task. This observation\nmotivates our primary question: What are the information-theoretic limits of\nknowledge transfer? To this end, we leverage a body of work in information\ntheory called Partial Information Decomposition (PID) to quantify the\ndistillable and distilled knowledge of a teacher's representation corresponding\nto a given student and a downstream task. 
Moreover, we demonstrate that this\nmetric can be practically used in distillation to address challenges caused by\nthe complexity gap between the teacher and the student representations.\n","authors":["Pasan Dissanayake","Faisal Hamman","Barproda Halder","Ilia Sucholutsky","Qiuyi Zhang","Sanghamitra Dutta"],"pdf_url":"https://arxiv.org/pdf/2411.07483v1.pdf","comment":"Accepted at NeurIPS 2024 Machine Learning and Compression Workshop"},{"id":"http://arxiv.org/abs/2411.07482v1","updated":"2024-11-12T02:08:19Z","published":"2024-11-12T02:08:19Z","title":"Enhancing Link Prediction with Fuzzy Graph Attention Networks and\n Dynamic Negative Sampling","summary":" Link prediction is crucial for understanding complex networks but traditional\nGraph Neural Networks (GNNs) often rely on random negative sampling, leading to\nsuboptimal performance. This paper introduces Fuzzy Graph Attention Networks\n(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative\nsampling and enhanced node feature aggregation. Fuzzy Negative Sampling (FNS)\nsystematically selects high-quality negative edges based on fuzzy similarities,\nimproving training efficiency. 
FGAT layer incorporates fuzzy rough set\nprinciples, enabling robust and discriminative node representations.\nExperiments on two research collaboration networks demonstrate FGAT's superior\nlink prediction accuracy, outperforming state-of-the-art baselines by\nleveraging the power of fuzzy rough sets for effective negative sampling and\nnode feature learning.\n","authors":["Jinming Xing"],"pdf_url":"https://arxiv.org/pdf/2411.07482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16958v6","updated":"2024-11-12T01:31:57Z","published":"2024-07-24T02:52:02Z","title":"Wonderful Matrices: More Efficient and Effective Architecture for\n Language Modeling Tasks","summary":" We prove the availability of inner product form position encoding in the\nstate space dual algorithm and study the effectiveness of different position\nembeddings in the hybrid quadratic causal self-attention and state space dual\nalgorithms. We propose inner function attention with dynamic mask, which can\nimprove the expressiveness of the attention algorithm and avoid the sequence\nnoise significantly affecting the accuracy of the attention score. We also\ndesign cross domain mixture of experts, which can improve the granularity of\nthe sparse activation feedforward network while maintaining the efficiency of\nparameter utilization and retrieval. The combination of these methods\nconstitutes our foundation model architecture: Wonderful Matrices. 
We conduct\nexperiments on the language modeling task and find that Wonderful Matrices are\nmore efficient and effective in handling complex language tasks.\n","authors":["Jingze Shi","Bingheng Wu","Lu He","Luchang Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.16958v6.pdf","comment":"28 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.08897v3","updated":"2024-11-12T01:30:06Z","published":"2024-01-17T00:46:24Z","title":"CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in\n Variational AutoEncoder","summary":" Symmetries of input and latent vectors have provided valuable insights for\ndisentanglement learning in VAEs. However, only a few works were proposed as an\nunsupervised method, and even these works require known factor information in\nthe training data. We propose a novel method, Composite Factor-Aligned Symmetry\nLearning (CFASL), which is integrated into VAEs for learning symmetry-based\ndisentanglement in unsupervised learning without any knowledge of the dataset\nfactor information. CFASL incorporates three novel features for learning\nsymmetry-based disentanglement: 1) Injecting inductive bias to align latent\nvector dimensions to factor-aligned symmetries within an explicit learnable\nsymmetry code-book 2) Learning a composite symmetry to express unknown factors\nchange between two random samples by learning factor-aligned symmetries within\nthe codebook 3) Inducing a group equivariant encoder and decoder in training\nVAEs with the two conditions. In addition, we propose an extended evaluation\nmetric for multi-factor changes in comparison to disentanglement evaluation in\nVAEs. 
In quantitative and in-depth qualitative analysis, CFASL demonstrates a\nsignificant improvement of disentanglement in single-factor change, and\nmulti-factor change conditions compared to state-of-the-art methods.\n","authors":["Hee-Jun Jung","Jaehyoung Jeong","Kangil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08897v3.pdf","comment":"Accepted in TMLR 25 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.13644v2","updated":"2024-11-12T01:23:55Z","published":"2024-09-20T16:48:55Z","title":"Non-overlapping, Schwarz-type Domain Decomposition Method for Physics\n and Equality Constrained Artificial Neural Networks","summary":" We present a non-overlapping, Schwarz-type domain decomposition method with a\ngeneralized interface condition, designed for physics-informed machine learning\nof partial differential equations (PDEs) in both forward and inverse contexts.\nOur approach employs physics and equality-constrained artificial neural\nnetworks (PECANN) within each subdomain. Unlike the original PECANN method,\nwhich relies solely on initial and boundary conditions to constrain PDEs, our\nmethod uses both boundary conditions and the governing PDE to constrain a\nunique interface loss function for each subdomain. This modification improves\nthe learning of subdomain-specific interface parameters while reducing\ncommunication overhead by delaying information exchange between neighboring\nsubdomains. To address the constrained optimization in each subdomain, we apply\nan augmented Lagrangian method with a conditionally adaptive update strategy,\ntransforming the problem into an unconstrained dual optimization. A distinct\nadvantage of our domain decomposition method is its ability to learn solutions\nto both Poisson's and Helmholtz equations, even in cases with high-wavenumber\nand complex-valued solutions. 
Through numerical experiments with up to 64\nsubdomains, we demonstrate that our method consistently generalizes well as the\nnumber of subdomains increases.\n","authors":["Qifeng Hu","Shamsulhaq Basir","Inanc Senocak"],"pdf_url":"https://arxiv.org/pdf/2409.13644v2.pdf","comment":"49 pages, 19 figures"},{"id":"http://arxiv.org/abs/2411.08249v1","updated":"2024-11-12T23:55:11Z","published":"2024-11-12T23:55:11Z","title":"Retrieval Augmented Time Series Forecasting","summary":" Retrieval-augmented generation (RAG) is a central component of modern LLM\nsystems, particularly in scenarios where up-to-date information is crucial for\naccurately responding to user queries or when queries exceed the scope of the\ntraining data. The advent of time-series foundation models (TSFM), such as\nChronos, and the need for effective zero-shot forecasting performance across\nvarious time-series domains motivates the question: Do benefits of RAG\nsimilarly carry over to time series forecasting? In this paper, we advocate\nthat the dynamic and event-driven nature of time-series data makes RAG a\ncrucial component of TSFMs and introduce a principled RAG framework for\ntime-series forecasting, called Retrieval Augmented Forecasting (RAF). Within\nRAF, we develop efficient strategies for retrieving related time-series\nexamples and incorporating them into forecast. 
Through experiments and\nmechanistic studies, we demonstrate that RAF indeed improves the forecasting\naccuracy across diverse time series domains and the improvement is more\nsignificant for larger TSFM sizes.\n","authors":["Kutay Tire","Ege Onur Taga","Muhammed Emrullah Ildiz","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2411.08249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08248v1","updated":"2024-11-12T23:54:58Z","published":"2024-11-12T23:54:58Z","title":"Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial\n Approach","summary":" Deep learning underpins most of the currently advanced natural language\nprocessing (NLP) tasks such as textual classification, neural machine\ntranslation (NMT), abstractive summarization and question-answering (QA).\nHowever, the robustness of the models, particularly QA models, against\nadversarial attacks is a critical concern that remains insufficiently explored.\nThis paper introduces QA-Attack (Question Answering Attack), a novel word-level\nadversarial strategy that fools QA models. Our attention-based attack exploits\nthe customized attention mechanism and deletion ranking strategy to identify\nand target specific words within contextual passages. It creates deceptive\ninputs by carefully choosing and substituting synonyms, preserving grammatical\nintegrity while misleading the model to produce incorrect responses. Our\napproach demonstrates versatility across various question types, particularly\nwhen dealing with extensive long textual inputs. 
Extensive experiments on\nmultiple benchmark datasets demonstrate that QA-Attack successfully deceives\nbaseline QA models and surpasses existing adversarial techniques regarding\nsuccess rate, semantics changes, BLEU score, fluency and grammar error rate.\n","authors":["Jiyao Li","Mingze Ni","Yongshun Gong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08244v1","updated":"2024-11-12T23:43:20Z","published":"2024-11-12T23:43:20Z","title":"NVCiM-PT: An NVCiM-assisted Prompt Tuning Framework for Edge LLMs","summary":" Large Language Models (LLMs) deployed on edge devices, known as edge LLMs,\nneed to continuously fine-tune their model parameters from user-generated data\nunder limited resource constraints. However, most existing learning methods are\nnot applicable for edge LLMs because of their reliance on high resources and\nlow learning capacity. Prompt tuning (PT) has recently emerged as an effective\nfine-tuning method for edge LLMs by only modifying a small portion of LLM\nparameters, but it suffers from user domain shifts, resulting in repetitive\ntraining and losing resource efficiency. Conventional techniques to address\ndomain shift issues often involve complex neural networks and sophisticated\ntraining, which are incompatible for PT for edge LLMs. Therefore, an open\nresearch question is how to address domain shift issues for edge LLMs with\nlimited resources. In this paper, we propose a prompt tuning framework for edge\nLLMs, exploiting the benefits offered by non-volatile computing-in-memory\n(NVCiM) architectures. We introduce a novel NVCiM-assisted PT framework, where\nwe narrow down the core operations to matrix-matrix multiplication, which can\nthen be accelerated by performing in-situ computation on NVCiM. 
To the best of\nour knowledge, this is the first work employing NVCiM to improve the edge LLM\nPT performance.\n","authors":["Ruiyang Qin","Pengyu Ren","Zheyu Yan","Liu Liu","Dancheng Liu","Amir Nassereldine","Jinjun Xiong","Kai Ni","Sharon Hu","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2411.08244v1.pdf","comment":"Accepted by DATE 2025"},{"id":"http://arxiv.org/abs/2411.08241v1","updated":"2024-11-12T23:32:21Z","published":"2024-11-12T23:32:21Z","title":"A Social Outcomes and Priorities centered (SOP) Framework for AI policy","summary":" Rapid developments in AI and its adoption across various domains have\nnecessitated a need to build robust guardrails and risk containment plans while\nensuring equitable benefits for the betterment of society. The current\ntechnology-centered approach has resulted in a fragmented, reactive, and\nineffective policy apparatus. This paper highlights the immediate and urgent\nneed to pivot to a society-centered approach to develop comprehensive,\ncoherent, forward-looking AI policy. To this end, we present a Social Outcomes\nand Priorities centered (SOP) framework for AI policy along with proposals on\nimplementation of its various components. While the SOP framework is presented\nfrom a US-centric view, the takeaways are general and applicable globally.\n","authors":["Mohak Shah"],"pdf_url":"https://arxiv.org/pdf/2411.08241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05864v2","updated":"2024-11-12T23:12:55Z","published":"2024-03-09T10:24:12Z","title":"PEaRL: Personalized Privacy of Human-Centric Systems using Early-Exit\n Reinforcement Learning","summary":" In the evolving landscape of human-centric systems, personalized privacy\nsolutions are becoming increasingly crucial due to the dynamic nature of human\ninteractions. Traditional static privacy models often fail to meet the diverse\nand changing privacy needs of users. 
This paper introduces PEaRL, a system\ndesigned to enhance privacy preservation by tailoring its approach to\nindividual behavioral patterns and preferences. While incorporating\nreinforcement learning (RL) for its adaptability, PEaRL primarily focuses on\nemploying an early-exit strategy that dynamically balances privacy protection\nand system utility. This approach addresses the challenges posed by the\nvariability and evolution of human behavior, which static privacy models\nstruggle to handle effectively. We evaluate PEaRL in two distinct contexts:\nSmart Home environments and Virtual Reality (VR) Smart Classrooms. The\nempirical results demonstrate PEaRL's capability to provide a personalized\ntradeoff between user privacy and application utility, adapting effectively to\nindividual user preferences. On average, across both systems, PEaRL enhances\nprivacy protection by 31%, with a corresponding utility reduction of 24%.\n","authors":["Mojtaba Taherisadr","Salma Elmalaki"],"pdf_url":"https://arxiv.org/pdf/2403.05864v2.pdf","comment":"15 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.16362v2","updated":"2024-11-12T23:08:25Z","published":"2023-08-30T23:34:11Z","title":"A Unified Analysis on the Subgradient Upper Bounds for the Subgradient\n Methods Minimizing Composite Nonconvex, Nonsmooth and Non-Lipschitz Functions","summary":" This paper presents a unified analysis for the proximal subgradient method\n(Prox-SubGrad) type approach to minimize an overall objective of $f(x)+r(x)$,\nsubject to convex constraints, where both $f$ and $r$ are weakly convex,\nnonsmooth, and non-Lipschitz. Leveraging on the properties of the Moreau\nenvelope of weakly convex functions, we are able to relate error-bound\nconditions, the growth conditions of the subgradients of the objective, and the\nbehavior of the proximal subgradient iterates on some remarkably broad classes\nof objective functions. 
Various existing as well as new bounding conditions are\nstudied, leading to novel iteration complexity results. The terrain of our\nexploration expands to stochastic proximal subgradient algorithms.\n","authors":["Daoli Zhu","Lei Zhao","Shuzhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08232v1","updated":"2024-11-12T22:56:28Z","published":"2024-11-12T22:56:28Z","title":"Imitation Learning from Observations: An Autoregressive Mixture of\n Experts Approach","summary":" This paper presents a novel approach to imitation learning from observations,\nwhere an autoregressive mixture of experts model is deployed to fit the\nunderlying policy. The parameters of the model are learned via a two-stage\nframework. By leveraging the existing dynamics knowledge, the first stage of\nthe framework estimates the control input sequences and hence reduces the\nproblem complexity. At the second stage, the policy is learned by solving a\nregularized maximum-likelihood estimation problem using the estimated control\ninput sequences. We further extend the learning procedure by incorporating a\nLyapunov stability constraint to ensure asymptotic stability of the identified\nmodel, for accurate multi-step predictions. 
The effectiveness of the proposed\nframework is validated using two autonomous driving datasets collected from\nhuman demonstrations, demonstrating its practical applicability in modelling\ncomplex nonlinear dynamics.\n","authors":["Renzi Wang","Flavia Sofia Acerbo","Tong Duy Son","Panagiotis Patrinos"],"pdf_url":"https://arxiv.org/pdf/2411.08232v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.08034v1","updated":"2024-11-12T18:59:35Z","published":"2024-11-12T18:59:35Z","title":"Scaling Properties of Diffusion Models for Perceptual Tasks","summary":" In this paper, we argue that iterative computation with diffusion models\noffers a powerful paradigm for not only generation but also visual perception\ntasks. We unify tasks such as depth estimation, optical flow, and segmentation\nunder image-to-image translation, and show how diffusion models benefit from\nscaling training and test-time compute for these perception tasks. Through a\ncareful analysis of these scaling behaviors, we present various techniques to\nefficiently train diffusion models for visual perception tasks. Our models\nachieve improved or comparable performance to state-of-the-art methods using\nsignificantly less data and compute. To use our code and models, see\nhttps://scaling-diffusion-perception.github.io .\n","authors":["Rahul Ravishankar","Zeeshan Patel","Jathushan Rajasegaran","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08033v1","updated":"2024-11-12T18:59:32Z","published":"2024-11-12T18:59:32Z","title":"GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D\n Generation","summary":" While 3D content generation has advanced significantly, existing methods\nstill face challenges with input formats, latent space design, and output\nrepresentations. 
This paper introduces a novel 3D generation framework that\naddresses these challenges, offering scalable, high-quality 3D generation with\nan interactive Point Cloud-structured Latent space. Our framework employs a\nVariational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal)\nrenderings as input, using a unique latent space design that preserves 3D shape\ninformation, and incorporates a cascaded latent diffusion model for improved\nshape-texture disentanglement. The proposed method, GaussianAnything, supports\nmulti-modal conditional 3D generation, allowing for point cloud, caption, and\nsingle/multi-view image inputs. Notably, the newly proposed latent space\nnaturally enables geometry-texture disentanglement, thus allowing 3D-aware\nediting. Experimental results demonstrate the effectiveness of our approach on\nmultiple datasets, outperforming existing methods in both text- and\nimage-conditioned 3D generation.\n","authors":["Yushi Lan","Shangchen Zhou","Zhaoyang Lyu","Fangzhou Hong","Shuai Yang","Bo Dai","Xingang Pan","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2411.08033v1.pdf","comment":"project page: https://nirvanalan.github.io/projects/GA/"},{"id":"http://arxiv.org/abs/2411.08028v1","updated":"2024-11-12T18:57:59Z","published":"2024-11-12T18:57:59Z","title":"Learning with Less: Knowledge Distillation from Large Language Models\n via Unlabeled Data","summary":" In real-world NLP applications, Large Language Models (LLMs) offer promising\nsolutions due to their extensive training on vast datasets. However, the large\nsize and high computation demands of LLMs limit their practicality in many\napplications, especially when further fine-tuning is required. To address these\nlimitations, smaller models are typically preferred for deployment. However,\ntheir training is hindered by the scarcity of labeled data. 
In contrast,\nunlabeled data is often readily which can be leveraged by using LLMs to\ngenerate pseudo-labels for training smaller models. This enables the smaller\nmodels (student) to acquire knowledge from LLMs(teacher) while reducing\ncomputational costs. This process introduces challenges, such as potential\nnoisy pseudo-labels. Selecting high-quality and informative data is therefore\ncritical to enhance model performance while improving the efficiency of data\nutilization. To address this, we propose LLKD that enables Learning with Less\ncomputational resources and less data for Knowledge Distillation from LLMs.\nLLKD is an adaptive sample selection method that incorporates signals from both\nthe teacher and student. Specifically, it prioritizes samples where the teacher\ndemonstrates high confidence in its labeling, indicating reliable labels, and\nwhere the student exhibits a high information need, identifying challenging\nsamples that require further learning. Our comprehensive experiments show that\nLLKD achieves superior performance across various datasets with higher data\nefficiency.\n","authors":["Juanhui Li","Sreyashi Nag","Hui Liu","Xianfeng Tang","Sheikh Sarwar","Limeng Cui","Hansu Gu","Suhang Wang","Qi He","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2411.08028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08027v1","updated":"2024-11-12T18:56:58Z","published":"2024-11-12T18:56:58Z","title":"LLMPhy: Complex Physical Reasoning Using Large Language Models and World\n Models","summary":" Physical reasoning is an important skill needed for robotic agents when\noperating in the real world. However, solving such reasoning problems often\ninvolves hypothesizing and reflecting over complex multi-body interactions\nunder the effect of a multitude of physical forces and thus learning all such\ninteractions poses a significant hurdle for state-of-the-art machine learning\nframeworks, including large language models (LLMs). 
To study this problem, we\npropose a new physical reasoning task and a dataset, dubbed TraySim. Our task\ninvolves predicting the dynamics of several objects on a tray that is given an\nexternal impact -- the domino effect of the ensued object interactions and\ntheir dynamics thus offering a challenging yet controlled setup, with the goal\nof reasoning being to infer the stability of the objects after the impact. To\nsolve this complex physical reasoning task, we present LLMPhy, a zero-shot\nblack-box optimization framework that leverages the physics knowledge and\nprogram synthesis abilities of LLMs, and synergizes these abilities with the\nworld models built into modern physics engines. Specifically, LLMPhy uses an\nLLM to generate code to iteratively estimate the physical hyperparameters of\nthe system (friction, damping, layout, etc.) via an implicit\nanalysis-by-synthesis approach using a (non-differentiable) simulator in the\nloop and uses the inferred parameters to imagine the dynamics of the scene\ntowards solving the reasoning task. To show the effectiveness of LLMPhy, we\npresent experiments on our TraySim dataset to predict the steady-state poses of\nthe objects. Our results show that the combination of the LLM and the physics\nengine leads to state-of-the-art zero-shot physical reasoning performance,\nwhile demonstrating superior convergence against standard black-box\noptimization methods and better estimation of the physical parameters.\n","authors":["Anoop Cherian","Radu Corcodel","Siddarth Jain","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2411.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08024v1","updated":"2024-11-12T18:54:55Z","published":"2024-11-12T18:54:55Z","title":"Leonardo vindicated: Pythagorean trees for minimal reconstruction of the\n natural branching structures","summary":" Trees continue to fascinate with their natural beauty and as engineering\nmasterpieces optimal with respect to several independent criteria. 
Pythagorean\ntree is a well-known fractal design that realistically mimics the natural tree\nbranching structures. We study various types of Pythagorean-like fractal trees\nwith different shapes of the base, branching angles and relaxed scales in an\nattempt to identify and explain which variants are the closest match to the\nbranching structures commonly observed in the natural world. Pursuing\nsimultaneously the realism and minimalism of the fractal tree model, we have\ndeveloped a flexibly parameterised and fast algorithm to grow and visually\nexamine deep Pythagorean-inspired fractal trees with the capability to orderly\nover- or underestimate the Leonardo da Vinci's tree branching rule as well as\ncontrol various imbalances and branching angles. We tested the realism of the\ngenerated fractal tree images by means of the classification accuracy of\ndetecting natural tree with the transfer-trained deep Convolutional Neural\nNetworks (CNNs). Having empirically established the parameters of the fractal\ntrees that maximize the CNN's natural tree class classification accuracy we\nhave translated them back to the scales and angles of branches and came to the\ninteresting conclusions that support the da Vinci branching rule and golden\nratio based scaling for both the shape of the branch and imbalance between the\nchild branches, and claim the flexibly parameterized fractal trees can be used\nto generate artificial examples to train robust detectors of different species\nof trees.\n","authors":["Dymitr Ruta","Corrado Mio","Ernesto Damiani"],"pdf_url":"https://arxiv.org/pdf/2411.08024v1.pdf","comment":"22 pages, lots of hi res figures I had to reduce quality of,\n submitting as a requirement to the Theory of Computing Journal"},{"id":"http://arxiv.org/abs/2411.08019v1","updated":"2024-11-12T18:50:35Z","published":"2024-11-12T18:50:35Z","title":"Language Models as Causal Effect Generators","summary":" We present a framework for large language model (LLM) based data 
generation\nwith controllable causal structure. In particular, we define a procedure for\nturning any language model and any directed acyclic graph (DAG) into a\nsequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM\nis a causal model with user-defined structure and LLM-defined structural\nequations. We characterize how an SD-SCM allows sampling from observational,\ninterventional, and counterfactual distributions according to the desired\ncausal structure. We then leverage this procedure to propose a new type of\nbenchmark for causal inference methods, generating individual-level\ncounterfactual data without needing to manually specify functional\nrelationships between variables. We create an example benchmark consisting of\nthousands of datasets, and test a suite of popular estimation methods on these\ndatasets for average, conditional average, and individual treatment effect\nestimation, both with and without hidden confounding. Apart from generating\ndata, the same procedure also allows us to test for the presence of a causal\neffect that might be encoded in an LLM. This procedure can underpin auditing\nLLMs for misinformation, discrimination, or otherwise undesirable behavior. We\nbelieve SD-SCMs can serve as a useful tool in any application that would\nbenefit from sequential data with controllable causal structure.\n","authors":["Lucius E. J. Bynum","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2411.08019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08017v1","updated":"2024-11-12T18:49:06Z","published":"2024-11-12T18:49:06Z","title":"Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model\n with Compact Wavelet Encodings","summary":" Large-scale 3D generative models require substantial computational resources\nyet often fall short in capturing fine details and complex geometries at high\nresolutions. 
We attribute this limitation to the inefficiency of current\nrepresentations, which lack the compactness required to model the generative\nmodels effectively. To address this, we introduce a novel approach called\nWavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based,\ncompact latent encodings. Specifically, we compress a $256^3$ signed distance\nfield into a $12^3 \\times 4$ latent grid, achieving an impressive 2427x\ncompression ratio with minimal loss of detail. This high level of compression\nallows our method to efficiently train large-scale generative networks without\nincreasing the inference time. Our models, both conditional and unconditional,\ncontain approximately one billion parameters and successfully generate\nhigh-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid\ninference, producing shapes within two to four seconds depending on the\ncondition, despite the model's scale. We demonstrate state-of-the-art\nperformance across multiple datasets, with significant improvements in\ngeneration quality, diversity, and computational efficiency. We open-source our\ncode and, to the best of our knowledge, release the largest pretrained 3D\ngenerative models across different modalities.\n","authors":["Aditya Sanghi","Aliasghar Khani","Pradyumna Reddy","Arianna Rampini","Derek Cheung","Kamal Rahimi Malekshan","Kanika Madan","Hooman Shayani"],"pdf_url":"https://arxiv.org/pdf/2411.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08013v1","updated":"2024-11-12T18:43:27Z","published":"2024-11-12T18:43:27Z","title":"Investigating the Effectiveness of Explainability Methods in Parkinson's\n Detection from Speech","summary":" Speech impairments in Parkinson's disease (PD) provide significant early\nindicators for diagnosis. While models for speech-based PD detection have shown\nstrong performance, their interpretability remains underexplored. 
This study\nsystematically evaluates several explainability methods to identify PD-specific\nspeech features, aiming to support the development of accurate, interpretable\nmodels for clinical decision-making in PD diagnosis and monitoring. Our\nmethodology involves (i) obtaining attributions and saliency maps using\nmainstream interpretability techniques, (ii) quantitatively evaluating the\nfaithfulness of these maps and their combinations obtained via union and\nintersection through a range of established metrics, and (iii) assessing the\ninformation conveyed by the saliency maps for PD detection from an auxiliary\nclassifier. Our results reveal that, while explanations are aligned with the\nclassifier, they often fail to provide valuable information for domain experts.\n","authors":["Eleonora Mancini","Francesco Paissan","Paolo Torroni","Cem Subakan","Mirco Ravanelli"],"pdf_url":"https://arxiv.org/pdf/2411.08013v1.pdf","comment":"The first two authors contributed equally to this research: author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2411.08010v1","updated":"2024-11-12T18:35:28Z","published":"2024-11-12T18:35:28Z","title":"ExpressivityArena: Can LLMs Express Information Implicitly?","summary":" While Large Language Models (LLMs) have demonstrated remarkable performance\nin certain dimensions, their ability to express implicit language cues that\nhuman use for effective communication remains unclear. This paper presents\nExpressivityArena, a Python library for measuring the implicit communication\nabilities of LLMs. We provide a comprehensive framework to evaluate\nexpressivity of arbitrary LLMs and explore its practical implications. To this\nend, we refine the definition and measurements of ``expressivity,'' and use our\nframework in a set of small experiments. 
These experiments test LLMs in\ncreative and logical tasks such as poetry, coding, and emotion-based responses.\nThey are then evaluated by an automated grader, through ExpressivityArena,\nwhich we verify to be the most pragmatic for testing expressivity. Building on\nthese experiments, we deepen our understanding of the expressivity of LLMs by\nassessing their ability to remain expressive in conversations. Our findings\nindicate that LLMs are capable of generating and understanding expressive\ncontent, however, with some limitations. These insights will inform the future\ndevelopment and deployment of expressive LLMs. We provide the code for\nExpressivityArena alongside our paper.\n","authors":["Joshua Tint","Som Sagar","Aditya Taparia","Kelly Raines","Bimsara Pathiraja","Caleb Liu","Ransalu Senanayake"],"pdf_url":"https://arxiv.org/pdf/2411.08010v1.pdf","comment":"8 pages, 22 figures"},{"id":"http://arxiv.org/abs/2411.08003v1","updated":"2024-11-12T18:28:57Z","published":"2024-11-12T18:28:57Z","title":"Can adversarial attacks by large language models be attributed?","summary":" Attributing outputs from Large Language Models (LLMs) in adversarial\nsettings-such as cyberattacks and disinformation-presents significant\nchallenges that are likely to grow in importance. We investigate this\nattribution problem using formal language theory, specifically language\nidentification in the limit as introduced by Gold and extended by Angluin. By\nmodeling LLM outputs as formal languages, we analyze whether finite text\nsamples can uniquely pinpoint the originating model. Our results show that due\nto the non-identifiability of certain language classes, under some mild\nassumptions about overlapping outputs from fine-tuned models it is\ntheoretically impossible to attribute outputs to specific LLMs with certainty.\nThis holds also when accounting for expressivity limitations of Transformer\narchitectures. 
Even with direct model access or comprehensive monitoring,\nsignificant computational hurdles impede attribution efforts. These findings\nhighlight an urgent need for proactive measures to mitigate risks posed by\nadversarial LLM use as their influence continues to expand.\n","authors":["Manuel Cebrian","Jan Arne Telle"],"pdf_url":"https://arxiv.org/pdf/2411.08003v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.07990v1","updated":"2024-11-12T18:15:19Z","published":"2024-11-12T18:15:19Z","title":"Derivational Morphology Reveals Analogical Generalization in Large\n Language Models","summary":" What mechanisms underlie linguistic generalization in large language models\n(LLMs)? This question has attracted considerable attention, with most studies\nanalyzing the extent to which the language skills of LLMs resemble rules. As of\nyet, it is not known whether linguistic generalization in LLMs could equally\nwell be explained as the result of analogical processes, which can be\nformalized as similarity operations on stored exemplars. A key shortcoming of\nprior research is its focus on linguistic phenomena with a high degree of\nregularity, for which rule-based and analogical approaches make the same\npredictions. Here, we instead examine derivational morphology, specifically\nEnglish adjective nominalization, which displays notable variability. We\nintroduce a new method for investigating linguistic generalization in LLMs:\nfocusing on GPT-J, we fit cognitive models that instantiate rule-based and\nanalogical learning to the LLM training data and compare their predictions on a\nset of nonce adjectives with those of the LLM, allowing us to draw direct\nconclusions regarding underlying mechanisms. As expected, rule-based and\nanalogical models explain the predictions of GPT-J equally well for adjectives\nwith regular nominalization patterns. 
However, for adjectives with variable\nnominalization patterns, the analogical model provides a much better match.\nFurthermore, GPT-J's behavior is sensitive to the individual word frequencies,\neven for regular forms, a behavior that is consistent with an analogical\naccount of regular forms but not a rule-based one. These findings refute the\nhypothesis that GPT-J's linguistic generalization on adjective nominalization\ninvolves rules, suggesting similarity operations on stored exemplars as the\nunderlying mechanism. Overall, our study suggests that analogical processes\nplay a bigger role in the linguistic generalization of LLMs than previously\nthought.\n","authors":["Valentin Hofmann","Leonie Weissweiler","David Mortensen","Hinrich Schütze","Janet Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2411.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02271v2","updated":"2024-11-12T18:11:30Z","published":"2024-11-04T17:03:52Z","title":"On the Utilization of Unique Node Identifiers in Graph Neural Networks","summary":" Graph Neural Networks have inherent representational limitations due to their\nmessage-passing structure. Recent work has suggested that these limitations can\nbe overcome by using unique node identifiers (UIDs). Here we argue that despite\nthe advantages of UIDs, one of their disadvantages is that they lose the\ndesirable property of permutation-equivariance. We thus propose to focus on UID\nmodels that are permutation-equivariant, and present theoretical arguments for\ntheir advantages. Motivated by this, we propose a method to regularize UID\nmodels towards permutation equivariance, via a contrastive loss. We empirically\ndemonstrate that our approach improves generalization and extrapolation\nabilities while providing faster training convergence. 
On the recent BREC\nexpressiveness benchmark, our proposed method achieves state-of-the-art\nperformance compared to other random-based approaches.\n","authors":["Maya Bechler-Speicher","Moshe Eliasof","Carola-Bibiane Schönlieb","Ran Gilad-Bachrach","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2411.02271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07983v1","updated":"2024-11-12T18:08:45Z","published":"2024-11-12T18:08:45Z","title":"Gini Coefficient as a Unified Metric for Evaluating Many-versus-Many\n Similarity in Vector Spaces","summary":" We demonstrate that Gini coefficients can be used as unified metrics to\nevaluate many-versus-many (all-to-all) similarity in vector spaces. Our\nanalysis of various image datasets shows that images with the highest Gini\ncoefficients tend to be the most similar to one another, while images with the\nlowest Gini coefficients are the least similar. We also show that this\nrelationship holds true for vectorized text embeddings from various corpuses,\nhighlighting the consistency of our method and its broad applicability across\ndifferent types of data. Additionally, we demonstrate that selecting machine\nlearning training samples that closely match the distribution of the testing\ndataset is far more important than ensuring data diversity. Selection of\nexemplary and iconic training samples with higher Gini coefficients leads to\nsignificantly better model performance compared to simply having a diverse\ntraining set with lower Gini coefficients. 
Thus, Gini coefficients can serve as\neffective criteria for selecting machine learning training samples, with our\nselection method outperforming random sampling methods in very sparse\ninformation settings.\n","authors":["Ben Fauber"],"pdf_url":"https://arxiv.org/pdf/2411.07983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07979v1","updated":"2024-11-12T17:58:40Z","published":"2024-11-12T17:58:40Z","title":"Exact, Tractable Gauss-Newton Optimization in Deep Reversible\n Architectures Reveal Poor Generalization","summary":" Second-order optimization has been shown to accelerate the training of deep\nneural networks in many applications, often yielding faster progress per\niteration on the training loss compared to first-order optimizers.However, the\ngeneralization properties of second-order methods are still being debated.\nTheoretical investigations have proved difficult to carry out outside the\ntractable settings of heavily simplified model classes -- thus, the relevance\nof existing theories to practical deep learning applications remains unclear.\nSimilarly, empirical studies in large-scale models and real datasets are\nsignificantly confounded by the necessity to approximate second-order updates\nin practice. It is often unclear whether the observed generalization behaviour\narises specifically from the second-order nature of the parameter updates, or\ninstead reflects the specific structured (e.g.\\ Kronecker) approximations used\nor any damping-based interpolation towards first-order updates. Here, we show\nfor the first time that exact Gauss-Newton (GN) updates take on a tractable\nform in a class of deep reversible architectures that are sufficiently\nexpressive to be meaningfully applied to common benchmark datasets. We exploit\nthis novel setting to study the training and generalization properties of the\nGN optimizer. We find that exact GN generalizes poorly. 
In the mini-batch\ntraining setting, this manifests as rapidly saturating progress even on the\n\\emph{training} loss, with parameter updates found to overfit each\nmini-batchatch without producing the features that would support generalization\nto other mini-batches. We show that our experiments run in the ``lazy'' regime,\nin which the neural tangent kernel (NTK) changes very little during the course\nof training. This behaviour is associated with having no significant changes in\nneural representations, explaining the lack of generalization.\n","authors":["Davide Buffelli","Jamie McGowan","Wangkun Xu","Alexandru Cioba","Da-shan Shiu","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2411.07979v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07976v1","updated":"2024-11-12T17:55:39Z","published":"2024-11-12T17:55:39Z","title":"DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring","summary":" Coronary artery disease (CAD), one of the most common cause of mortality in\nthe world. Coronary artery calcium (CAC) scoring using computed tomography (CT)\nis key for risk assessment to prevent coronary disease. Previous studies on\nrisk assessment and calcification detection in CT scans primarily use\napproaches based on UNET architecture, frequently implemented on pre-built\nmodels. However, these models are limited by the availability of annotated CT\nscans containing CAC and suffering from imbalanced dataset, decreasing\nperformance of CAC segmentation and scoring. In this study, we extend this\napproach by incorporating the self-supervised learning (SSL) technique of DINO\n(self-distillation with no labels) to eliminate limitations of scarce annotated\ndata in CT scans. The DINO model's ability to train without requiring CAC area\nannotations enhances its robustness in generating distinct features. 
The DINO\nmodel is trained on to focus specifically on calcified areas by using labels,\naiming to generate features that effectively capture and highlight key\ncharacteristics. The label-guided DINO (DINO-LG) enhances classification by\ndistinguishing CT slices that contain calcification from those that do not,\nperforming 57% better than the standard DINO model in this task. CAC scoring\nand segmentation tasks are performed by a basic U-NET architecture, fed\nspecifically with CT slices containing calcified areas as identified by the\nDINO-LG model. This targeted identification performed by DINO-LG model improves\nCAC segmentation performance by approximately 10% and significant increase in\nCAC scoring accuracy.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Caner Ozcan"],"pdf_url":"https://arxiv.org/pdf/2411.07976v1.pdf","comment":"Developed by Center for Applied Artificial Intelligence (CAAI),\n University of Kentucky"},{"id":"http://arxiv.org/abs/2411.07975v1","updated":"2024-11-12T17:55:10Z","published":"2024-11-12T17:55:10Z","title":"JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified\n Multimodal Understanding and Generation","summary":" We present JanusFlow, a powerful framework that unifies image understanding\nand generation in a single model. JanusFlow introduces a minimalist\narchitecture that integrates autoregressive language models with rectified\nflow, a state-of-the-art method in generative modeling. Our key finding\ndemonstrates that rectified flow can be straightforwardly trained within the\nlarge language model framework, eliminating the need for complex architectural\nmodifications. 
To further improve the performance of our unified model, we\nadopt two key strategies: (i) decoupling the understanding and generation\nencoders, and (ii) aligning their representations during unified training.\nExtensive experiments show that JanusFlow achieves comparable or superior\nperformance to specialized models in their respective domains, while\nsignificantly outperforming existing unified approaches across standard\nbenchmarks. This work represents a step toward more efficient and versatile\nvision-language models.\n","authors":["Yiyang Ma","Xingchao Liu","Xiaokang Chen","Wen Liu","Chengyue Wu","Zhiyu Wu","Zizheng Pan","Zhenda Xie","Haowei Zhang","Xingkai yu","Liang Zhao","Yisong Wang","Jiaying Liu","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2411.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09434v2","updated":"2024-11-12T17:49:12Z","published":"2024-07-12T17:09:47Z","title":"Foundation Models for the Electric Power Grid","summary":" Foundation models (FMs) currently dominate news headlines. They employ\nadvanced deep learning architectures to extract structural information\nautonomously from vast datasets through self-supervision. The resulting rich\nrepresentations of complex systems and dynamics can be applied to many\ndownstream applications. Therefore, FMs can find uses in electric power grids,\nchallenged by the energy transition and climate change. In this paper, we call\nfor the development of, and state why we believe in, the potential of FMs for\nelectric grids. We highlight their strengths and weaknesses amidst the\nchallenges of a changing grid. We argue that an FM learning from diverse grid\ndata and topologies could unlock transformative capabilities, pioneering a new\napproach in leveraging AI to redefine how we manage complexity and uncertainty\nin the electric grid. 
Finally, we discuss a power grid FM concept, namely\nGridFM, based on graph neural networks and show how different downstream tasks\nbenefit.\n","authors":["Hendrik F. Hamann","Thomas Brunschwiler","Blazhe Gjorgiev","Leonardo S. A. Martins","Alban Puech","Anna Varbella","Jonas Weiss","Juan Bernabe-Moreno","Alexandre Blondin Massé","Seong Choi","Ian Foster","Bri-Mathias Hodge","Rishabh Jain","Kibaek Kim","Vincent Mai","François Mirallès","Martin De Montigny","Octavio Ramos-Leaños","Hussein Suprême","Le Xie","El-Nasser S. Youssef","Arnaud Zinflou","Alexander J. Belyi","Ricardo J. Bessa","Bishnu Prasad Bhattarai","Johannes Schmude","Stanislav Sobolevsky"],"pdf_url":"https://arxiv.org/pdf/2407.09434v2.pdf","comment":"Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V.,\n J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J.,\n K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal\n contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H"},{"id":"http://arxiv.org/abs/2411.07955v1","updated":"2024-11-12T17:31:35Z","published":"2024-11-12T17:31:35Z","title":"How To Discover Short, Shorter, and the Shortest Proofs of\n Unsatisfiability: A Branch-and-Bound Approach for Resolution Proof Length\n Minimization","summary":" Modern software for propositional satisfiability problems gives a powerful\nautomated reasoning toolkit, capable of outputting not only a\nsatisfiable/unsatisfiable signal but also a justification of unsatisfiability\nin the form of resolution proof (or a more expressive proof), which is commonly\nused for verification purposes. Empirically, modern SAT solvers produce\nrelatively short proofs, however, there are no inherent guarantees that these\nproofs cannot be significantly reduced. 
This paper proposes a novel\nbranch-and-bound algorithm for finding the shortest resolution proofs; to this\nend, we introduce a layer list representation of proofs that groups clauses by\ntheir level of indirection. As we show, this representation breaks all\npermutational symmetries, thereby improving upon the state-of-the-art\nsymmetry-breaking and informing the design of a novel workflow for proof\nminimization. In addition to that, we design pruning procedures that reason on\nproof length lower bound, clause subsumption, and dominance. Our experiments\nsuggest that the proofs from state-of-the-art solvers could be shortened by\n30-60% on the instances from SAT Competition 2002 and by 25-50% on small\nsynthetic formulas. When treated as an algorithm for finding the shortest\nproof, our approach solves twice as many instances as the previous work based\non SAT solving and reduces the time to optimality by orders of magnitude for\nthe instances solved by both approaches.\n","authors":["Konstantin Sidorov","Koos van der Linden","Gonçalo Homem de Almeida Correia","Mathijs de Weerdt","Emir Demirović"],"pdf_url":"https://arxiv.org/pdf/2411.07955v1.pdf","comment":"42 pages, 16 figures, 8 tables, submitted to Journal of Artificial\n Intelligence Research"},{"id":"http://arxiv.org/abs/2411.07942v1","updated":"2024-11-12T17:11:46Z","published":"2024-11-12T17:11:46Z","title":"Towards Low-bit Communication for Tensor Parallel LLM Inference","summary":" Tensor parallelism provides an effective way to increase server large\nlanguage model (LLM) inference efficiency despite adding an additional\ncommunication cost. However, as server LLMs continue to scale in size, they\nwill need to be distributed across more devices, magnifying the communication\ncost. One way to approach this problem is with quantization, but current\nmethods for LLMs tend to avoid quantizing the features that tensor parallelism\nneeds to communicate. 
Taking advantage of consistent outliers in communicated\nfeatures, we introduce a quantization method that reduces communicated values\non average from 16 bits to 4.2 bits while preserving nearly all of the original\nperformance. For instance, our method maintains around 98.0% and 99.5% of Gemma\n2 27B's and Llama 2 13B's original performance, respectively, averaged across\nall tasks we evaluated on.\n","authors":["Harry Dong","Tyler Johnson","Minsik Cho","Emad Soroush"],"pdf_url":"https://arxiv.org/pdf/2411.07942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07941v1","updated":"2024-11-12T17:11:18Z","published":"2024-11-12T17:11:18Z","title":"DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with\n Generative Adversarial Networks","summary":" Computed tomography (CT) provides highly detailed three-dimensional (3D)\nmedical images but is costly, time-consuming, and often inaccessible in\nintraoperative settings (Organization et al. 2011). Recent advancements have\nexplored reconstructing 3D chest volumes from sparse 2D X-rays, such as\nsingle-view or orthogonal double-view images. However, current models tend to\nprocess 2D images in a planar manner, prioritizing visual realism over\nstructural accuracy. In this work, we introduce DuoLift Generative Adversarial\nNetworks (DuoLift-GAN), a novel architecture with dual branches that\nindependently elevate 2D images and their features into 3D representations.\nThese 3D outputs are merged into a unified 3D feature map and decoded into a\ncomplete 3D chest volume, enabling richer 3D information capture. We also\npresent a masked loss function that directs reconstruction towards critical\nanatomical regions, improving structural accuracy and visual quality. 
This\npaper demonstrates that DuoLift-GAN significantly enhances reconstruction\naccuracy while achieving superior visual realism compared to existing methods.\n","authors":["Zhaoxi Zhang","Yueliang Ying"],"pdf_url":"https://arxiv.org/pdf/2411.07941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07940v1","updated":"2024-11-12T17:09:20Z","published":"2024-11-12T17:09:20Z","title":"Automatic dataset shift identification to support root cause analysis of\n AI performance drift","summary":" Shifts in data distribution can substantially harm the performance of\nclinical AI models. Hence, various methods have been developed to detect the\npresence of such shifts at deployment time. However, root causes of dataset\nshifts are varied, and the choice of shift mitigation strategies is highly\ndependent on the precise type of shift encountered at test time. As such,\ndetecting test-time dataset shift is not sufficient: precisely identifying\nwhich type of shift has occurred is critical. In this work, we propose the\nfirst unsupervised dataset shift identification framework, effectively\ndistinguishing between prevalence shift (caused by a change in the label\ndistribution), covariate shift (caused by a change in input characteristics)\nand mixed shifts (simultaneous prevalence and covariate shifts). We discuss the\nimportance of self-supervised encoders for detecting subtle covariate shifts\nand propose a novel shift detector leveraging both self-supervised encoders and\ntask model outputs for improved shift detection. 
We report promising results\nfor the proposed shift identification framework across three different imaging\nmodalities (chest radiography, digital mammography, and retinal fundus images)\non five types of real-world dataset shifts, using four large publicly available\ndatasets.\n","authors":["Mélanie Roschewitz","Raghav Mehta","Charles Jones","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2411.07940v1.pdf","comment":"Code available at\n https://github.com/biomedia-mira/shift_identification"},{"id":"http://arxiv.org/abs/2411.07934v1","updated":"2024-11-12T17:04:56Z","published":"2024-11-12T17:04:56Z","title":"Doubly Mild Generalization for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) suffers from the extrapolation error and\nvalue overestimation. From a generalization perspective, this issue can be\nattributed to the over-generalization of value functions or policies towards\nout-of-distribution (OOD) actions. Significant efforts have been devoted to\nmitigating such generalization, and recent in-sample learning approaches have\nfurther succeeded in entirely eschewing it. Nevertheless, we show that mild\ngeneralization beyond the dataset can be trusted and leveraged to improve\nperformance under certain conditions. To appropriately exploit generalization\nin offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild\naction generalization and (ii) mild generalization propagation. The former\nrefers to selecting actions in a close neighborhood of the dataset to maximize\nthe Q values. Even so, the potential erroneous generalization can still be\npropagated, accumulated, and exacerbated by bootstrapping. In light of this,\nthe latter concept is introduced to mitigate the generalization propagation\nwithout impeding the propagation of RL learning signals. Theoretically, DMG\nguarantees better performance than the in-sample optimal policy in the oracle\ngeneralization scenario. 
Even under worst-case generalization, DMG can still\ncontrol value overestimation at a certain level and lower bound the\nperformance. Empirically, DMG achieves state-of-the-art performance across\nGym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting\nfrom its flexibility in both generalization aspects, DMG enjoys a seamless\ntransition from offline to online learning and attains strong online\nfine-tuning performance.\n","authors":["Yixiu Mao","Qi Wang","Yun Qu","Yuhang Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2411.07934v1.pdf","comment":"Accepted to NeurIPS 2024. arXiv admin note: substantial text overlap\n with arXiv:2410.19400"},{"id":"http://arxiv.org/abs/2411.01897v2","updated":"2024-11-12T16:48:29Z","published":"2024-11-04T09:04:11Z","title":"LE-PDE++: Mamba for accelerating PDEs Simulations","summary":" Partial Differential Equations are foundational in modeling science and\nnatural systems such as fluid dynamics and weather forecasting. The Latent\nEvolution of PDEs method is designed to address the computational intensity of\nclassical and deep learning-based PDE solvers by proposing a scalable and\nefficient alternative. To enhance the efficiency and accuracy of LE-PDE, we\nincorporate the Mamba model, an advanced machine learning model known for its\npredictive efficiency and robustness in handling complex dynamic systems with a\nprogressive learning strategy. The LE-PDE was tested on several benchmark\nproblems. The method demonstrated a marked reduction in computational time\ncompared to traditional solvers and standalone deep learning models while\nmaintaining high accuracy in predicting system behavior over time. 
Our method\ndoubles the inference speed compared to the LE-PDE while retaining the same\nlevel of parameter efficiency, making it well-suited for scenarios requiring\nlong-term predictions.\n","authors":["Aoming Liang","Zhaoyang Mu","Qi liu","Ruipeng Li","Mingming Ge","Dixia Fan"],"pdf_url":"https://arxiv.org/pdf/2411.01897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03497v2","updated":"2024-11-12T16:44:24Z","published":"2024-08-07T01:37:10Z","title":"Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and\n Tabnet with SMOTEENN","summary":" Bank credit risk is a significant challenge in modern financial transactions,\nand the ability to identify qualified credit card holders among a large number\nof applicants is crucial for the profitability of a bank'sbank's credit card\nbusiness. In the past, screening applicants'applicants' conditions often\nrequired a significant amount of manual labor, which was time-consuming and\nlabor-intensive. Although the accuracy and reliability of previously used ML\nmodels have been continuously improving, the pursuit of more reliable and\npowerful AI intelligent models is undoubtedly the unremitting pursuit by major\nbanks in the financial industry. In this study, we used a dataset of over\n40,000 records provided by a commercial bank as the research object. We\ncompared various dimensionality reduction techniques such as PCA and T-SNE for\npreprocessing high-dimensional datasets and performed in-depth adaptation and\ntuning of distributed models such as LightGBM and XGBoost, as well as deep\nmodels like Tabnet. After a series of research and processing, we obtained\nexcellent research results by combining SMOTEENN with these techniques. 
The\nexperiments demonstrated that LightGBM combined with PCA and SMOTEENN\ntechniques can assist banks in accurately predicting potential high-quality\ncustomers, showing relatively outstanding performance compared to other models.\n","authors":["Chang Yu","Yixin Jin","Qianwen Xing","Ye Zhang","Shaobo Guo","Shuchen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.03497v2.pdf","comment":"8 pagess on IEEE ICPICS"},{"id":"http://arxiv.org/abs/2406.04658v3","updated":"2024-11-12T16:44:20Z","published":"2024-06-07T05:56:43Z","title":"Advanced Payment Security System:XGBoost, LightGBM and SMOTE Integrated","summary":" With the rise of various online and mobile payment systems, transaction fraud\nhas become a significant threat to financial security. This study explores the\napplication of advanced machine learning models, specifically based on XGBoost\nand LightGBM, for developing a more accurate and robust Payment Security\nProtection Model. To enhance data reliability, we meticulously processed the\ndata sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to\naddress class imbalance and improve data representation. By selecting highly\ncorrelated features, we aimed to strengthen the training process and boost\nmodel performance. We conducted thorough performance evaluations of our\nproposed models, comparing them against traditional methods including Random\nForest, Neural Network, and Logistic Regression. Using metrics such as\nPrecision, Recall, and F1 Score, we rigorously assessed their effectiveness.\nOur detailed analyses and comparisons reveal that the combination of SMOTE with\nXGBoost and LightGBM offers a highly efficient and powerful mechanism for\npayment security protection. Moreover, the integration of XGBoost and LightGBM\nin a Local Ensemble model further demonstrated outstanding performance. 
After\nincorporating SMOTE, the new combined model achieved a significant improvement\nof nearly 6\\% over traditional models and around 5\\% over its sub-models,\nshowcasing remarkable results.\n","authors":["Qi Zheng","Chang Yu","Jin Cao","Yongshun Xu","Qianwen Xing","Yinxin Jin"],"pdf_url":"https://arxiv.org/pdf/2406.04658v3.pdf","comment":"This paper is received by https://ieee-metacom.org"},{"id":"http://arxiv.org/abs/2406.03733v4","updated":"2024-11-12T16:44:14Z","published":"2024-06-06T04:12:57Z","title":"Credit Card Fraud Detection Using Advanced Transformer Model","summary":" With the proliferation of various online and mobile payment systems, credit\ncard fraud has emerged as a significant threat to financial security. This\nstudy focuses on innovative applications of the latest Transformer models for\nmore robust and precise fraud detection. To ensure the reliability of the data,\nwe meticulously processed the data sources, balancing the dataset to address\nthe issue of data sparsity significantly. We also selected highly correlated\nvectors to strengthen the training process.To guarantee the reliability and\npracticality of the new Transformer model, we conducted performance comparisons\nwith several widely adopted models, including Support Vector Machine (SVM),\nRandom Forest, Neural Network, and Logistic Regression. We rigorously compared\nthese models using metrics such as Precision, Recall, and F1 Score. Through\nthese detailed analyses and comparisons, we present to the readers a highly\nefficient and powerful anti-fraud mechanism with promising prospects. 
The\nresults demonstrate that the Transformer model not only excels in traditional\napplications but also shows great potential in niche areas like fraud\ndetection, offering a substantial advancement in the field.\n","authors":["Chang Yu","Yongshun Xu","Jin Cao","Ye Zhang","Yinxin Jin","Mengran Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.03733v4.pdf","comment":"This paper have been received by https://ieee-metacom.org/"},{"id":"http://arxiv.org/abs/2305.16945v3","updated":"2024-11-12T16:23:50Z","published":"2023-05-26T14:00:12Z","title":"Levin Tree Search with Context Models","summary":" Levin Tree Search (LTS) is a search algorithm that makes use of a policy (a\nprobability distribution over actions) and comes with a theoretical guarantee\non the number of expansions before reaching a goal node, depending on the\nquality of the policy. This guarantee can be used as a loss function, which we\ncall the LTS loss, to optimize neural networks representing the policy\n(LTS+NN). In this work we show that the neural network can be substituted with\nparameterized context models originating from the online compression literature\n(LTS+CM). We show that the LTS loss is convex under this new model, which\nallows for using standard convex optimization tools, and obtain convergence\nguarantees to the optimal parameters in an online setting for a given set of\nsolution trajectories -- guarantees that cannot be provided for neural\nnetworks. The new LTS+CM algorithm compares favorably against LTS+NN on several\nbenchmarks: Sokoban (Boxoban), The Witness, and the 24-Sliding Tile puzzle\n(STP). The difference is particularly large on STP, where LTS+NN fails to solve\nmost of the test instances while LTS+CM solves each test instance in a fraction\nof a second. 
Furthermore, we show that LTS+CM is able to learn a policy that\nsolves the Rubik's cube in only a few hundred expansions, which considerably\nimproves upon previous machine learning techniques.\n","authors":["Laurent Orseau","Marcus Hutter","Levi H. S. Lelis"],"pdf_url":"https://arxiv.org/pdf/2305.16945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07885v1","updated":"2024-11-12T15:47:17Z","published":"2024-11-12T15:47:17Z","title":"INTRABENCH: Interactive Radiological Benchmark","summary":" Current interactive segmentation approaches, inspired by the success of\nMETA's Segment Anything model, have achieved notable advancements, however,\nthey come with substantial limitations that hinder their practical application\nin real clinical scenarios. These include unrealistic human interaction\nrequirements, such as slice-by-slice operations for 2D models on 3D data, a\nlack of iterative refinement, and insufficient evaluation experiments. These\nshortcomings prevent accurate assessment of model performance and lead to\ninconsistent outcomes across studies. IntRaBench overcomes these challenges by\noffering a comprehensive and reproducible framework for evaluating interactive\nsegmentation methods in realistic, clinically relevant scenarios. It includes\ndiverse datasets, target structures, and segmentation models, and provides a\nflexible codebase that allows seamless integration of new models and prompting\nstrategies. Additionally, we introduce advanced techniques to minimize\nclinician interaction, ensuring fair comparisons between 2D and 3D models. By\nopen-sourcing IntRaBench, we invite the research community to integrate their\nmodels and prompting techniques, ensuring continuous and transparent evaluation\nof interactive segmentation models in 3D medical imaging.\n","authors":["Constantin Ulrich","Tassilo Wald","Emily Tempus","Maximilian Rokuss","Paul F. 
Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2411.07885v1.pdf","comment":"Undergoing Peer-Review"},{"id":"http://arxiv.org/abs/2304.04918v2","updated":"2024-11-12T15:42:42Z","published":"2023-04-11T01:10:49Z","title":"Explicit and Implicit Semantic Ranking Framework","summary":" The core challenge in numerous real-world applications is to match an inquiry\nto the best document from a mutable and finite set of candidates. Existing\nindustry solutions, especially latency-constrained services, often rely on\nsimilarity algorithms that sacrifice quality for speed. In this paper we\nintroduce a generic semantic learning-to-rank framework, Self-training Semantic\nCross-attention Ranking (sRank). This transformer-based framework uses linear\npairwise loss with mutable training batch sizes and achieves quality gains and\nhigh efficiency, and has been applied effectively to show gains on two industry\ntasks at Microsoft over real-world large-scale data sets: Smart Reply (SR) and\nAmbient Clinical Intelligence (ACI). In Smart Reply, sRank assists live\ncustomers with technical support by selecting the best reply from predefined\nsolutions based on consumer and support agent messages. It achieves 11.7% gain\nin offline top-one accuracy on the SR task over the previous system, and has\nenabled 38.7% time reduction in composing messages in telemetry recorded since\nits general release in January 2021. In the ACI task, sRank selects relevant\nhistorical physician templates that serve as guidance for a text summarization\nmodel to generate higher quality medical notes. 
It achieves 35.5% top-one\naccuracy gain, along with 46% relative ROUGE-L gain in generated medical notes.\n","authors":["Xiaofeng Zhu","Thomas Lin","Vishal Anand","Matthew Calderwood","Eric Clausen-Brown","Gord Lueck","Wen-wai Yim","Cheng Wu"],"pdf_url":"https://arxiv.org/pdf/2304.04918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17651v3","updated":"2024-11-12T15:39:58Z","published":"2024-06-25T15:43:20Z","title":"Software Model Evolution with Large Language Models: Experiments on\n Simulated, Public, and Industrial Datasets","summary":" Modeling structure and behavior of software systems plays a crucial role in\nthe industrial practice of software engineering. As with other software\nengineering artifacts, software models are subject to evolution. Supporting\nmodelers in evolving software models with recommendations for model completions\nis still an open problem, though. In this paper, we explore the potential of\nlarge language models for this task. In particular, we propose an approach,\nRAMC, leveraging large language models, model histories, and\nretrieval-augmented generation for model completion. Through experiments on\nthree datasets, including an industrial application, one public open-source\ncommunity dataset, and one controlled collection of simulated model\nrepositories, we evaluate the potential of large language models for model\ncompletion with RAMC. We found that large language models are indeed a\npromising technology for supporting software model evolution (62.30%\nsemantically correct completions on real-world industrial data and up to 86.19%\ntype-correct completions). 
The general inference capabilities of large language\nmodels are particularly useful when dealing with concepts for which there are\nfew, noisy, or no examples at all.\n","authors":["Christof Tinnes","Alisa Welter","Sven Apel"],"pdf_url":"https://arxiv.org/pdf/2406.17651v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07873v1","updated":"2024-11-12T15:29:50Z","published":"2024-11-12T15:29:50Z","title":"Diverse capability and scaling of diffusion and auto-regressive models\n when learning abstract rules","summary":" Humans excel at discovering regular structures from limited samples and\napplying inferred rules to novel settings. We investigate whether modern\ngenerative models can similarly learn underlying rules from finite samples and\nperform reasoning through conditional sampling. Inspired by Raven's Progressive\nMatrices task, we designed GenRAVEN dataset, where each sample consists of\nthree rows, and one of 40 relational rules governing the object position,\nnumber, or attributes applies to all rows. We trained generative models to\nlearn the data distribution, where samples are encoded as integer arrays to\nfocus on rule learning. We compared two generative model families: diffusion\n(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their\nability to generate structurally consistent samples and perform panel\ncompletion via unconditional and conditional sampling. We found diffusion\nmodels excel at unconditional generation, producing more novel and consistent\nsamples from scratch and memorizing less, but performing less well in panel\ncompletion, even with advanced conditional sampling methods. Conversely,\nautoregressive models excel at completing missing panels in a rule-consistent\nmanner but generate less consistent samples unconditionally. We observe diverse\ndata scaling behaviors: for both model families, rule learning emerges at a\ncertain dataset size - around 1000s examples per rule. 
With more training data,\ndiffusion models improve both their unconditional and conditional generation\ncapabilities. However, for autoregressive models, while panel completion\nimproves with more training data, unconditional generation consistency\ndeclines. Our findings highlight complementary capabilities and limitations of\ndiffusion and autoregressive models in rule learning and reasoning tasks,\nsuggesting avenues for further research into their mechanisms and potential for\nhuman-like reasoning.\n","authors":["Binxu Wang","Jiaqi Shang","Haim Sompolinsky"],"pdf_url":"https://arxiv.org/pdf/2411.07873v1.pdf","comment":"12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2\n Reasoning At Scale as long paper"},{"id":"http://arxiv.org/abs/2411.07871v1","updated":"2024-11-12T15:28:06Z","published":"2024-11-12T15:28:06Z","title":"Leveraging Multimodal Models for Enhanced Neuroimaging Diagnostics in\n Alzheimer's Disease","summary":" The rapid advancements in Large Language Models (LLMs) and Vision-Language\nModels (VLMs) have shown great potential in medical diagnostics, particularly\nin radiology, where datasets such as X-rays are paired with human-generated\ndiagnostic reports. However, a significant research gap exists in the\nneuroimaging field, especially for conditions such as Alzheimer's disease, due\nto the lack of comprehensive diagnostic reports that can be utilized for model\nfine-tuning. This paper addresses this gap by generating synthetic diagnostic\nreports using GPT-4o-mini on structured data from the OASIS-4 dataset, which\ncomprises 663 patients. 
Using the synthetic reports as ground truth for\ntraining and validation, we then generated neurological reports directly from\nthe images in the dataset leveraging the pre-trained BiomedCLIP and T5 models.\nOur proposed method achieved a BLEU-4 score of 0.1827, ROUGE-L score of 0.3719,\nand METEOR score of 0.4163, revealing its potential in generating clinically\nrelevant and accurate diagnostic reports.\n","authors":["Francesco Chiumento","Mingming Liu"],"pdf_url":"https://arxiv.org/pdf/2411.07871v1.pdf","comment":"The paper has been accepted by the conference: \"2024 International\n Conference on Big Data (IEEE Big Data 2024)\""},{"id":"http://arxiv.org/abs/2411.07870v1","updated":"2024-11-12T15:26:17Z","published":"2024-11-12T15:26:17Z","title":"Trustful LLMs: Customizing and Grounding Text Generation with Knowledge\n Bases and Dual Decoders","summary":" Although people are impressed by the content generation skills of large\nlanguage models, the use of LLMs, such as ChatGPT, is limited by the domain\ngrounding of the content. The correctness and groundedness of the generated\ncontent need to be based on a verified context, such as results from\nRetrieval-Augmented Generation (RAG). One important issue when adapting LLMs to\na customized domain is that the generated responses are often incomplete, or\nthe additions are not verified and may even be hallucinated. Prior studies on\nhallucination detection have focused on evaluation metrics, which are not\neasily adaptable to dynamic domains and can be vulnerable to attacks like\njail-breaking. 
In this work, we propose 1) a post-processing algorithm that\nleverages knowledge triplets in RAG context to correct hallucinations and 2) a\ndual-decoder model that fuses RAG context to guide the generation process.\n","authors":["Xiaofeng Zhu","Jaya Krishna Mandivarapu"],"pdf_url":"https://arxiv.org/pdf/2411.07870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04492v4","updated":"2024-11-12T15:16:36Z","published":"2024-10-06T14:11:39Z","title":"Interpret Your Decision: Logical Reasoning Regularization for\n Generalization in Visual Classification","summary":" Vision models excel in image classification but struggle to generalize to\nunseen data, such as classifying images from unseen domains or discovering\nnovel categories. In this paper, we explore the relationship between logical\nreasoning and deep learning generalization in visual classification. A logical\nregularization termed L-Reg is derived which bridges a logical analysis\nframework to image classification. Our work reveals that L-Reg reduces the\ncomplexity of the model in terms of the feature distribution and classifier\nweights. Specifically, we unveil the interpretability brought by L-Reg, as it\nenables the model to extract the salient features, such as faces to persons,\nfor classification. Theoretical analysis and experiments demonstrate that L-Reg\nenhances generalization across various scenarios, including multi-domain\ngeneralization and generalized category discovery. 
In complex real-world\nscenarios where images span unknown classes and unseen domains, L-Reg\nconsistently improves generalization, highlighting its practical efficacy.\n","authors":["Zhaorui Tan","Xi Yang","Qiufeng Wang","Anh Nguyen","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.04492v4.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2411.07854v1","updated":"2024-11-12T15:06:06Z","published":"2024-11-12T15:06:06Z","title":"Tucano: Advancing Neural Text Generation for Portuguese","summary":" Significant advances have been made in natural language processing in recent\nyears. However, our current deep learning approach to language modeling\nrequires substantial resources in terms of data and computation. One of the\nside effects of this data-hungry paradigm is the current schism between\nlanguages, separating those considered high-resource, where most of the\ndevelopment happens and resources are available, and the low-resource ones,\nwhich struggle to attain the same level of performance and autonomy. This study\naims to introduce a new set of resources to stimulate the future development of\nneural text generation in Portuguese. In this work, we document the development\nof GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting\nto 200 billion tokens. Via this corpus, we trained a series of\ndecoder-transformers named Tucano. Our models perform equal or superior to\nother Portuguese and multilingual language models of similar size in several\nPortuguese benchmarks. The evaluation of our models also reveals that model\nperformance on many currently available benchmarks used by the Portuguese NLP\ncommunity has little to no correlation with the scaling of token ingestion\nduring training, highlighting the limitations of such evaluations when it comes\nto the assessment of Portuguese generative language models. All derivatives of\nour study are openly released on GitHub and Hugging Face. 
See\nhttps://nkluge-correa.github.io/Tucano/\n","authors":["Nicholas Kluge Corrêa","Aniket Sen","Sophia Falk","Shiza Fatimah"],"pdf_url":"https://arxiv.org/pdf/2411.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11658v3","updated":"2024-11-12T15:03:48Z","published":"2024-02-18T17:32:53Z","title":"Dynamic planning in hierarchical active inference","summary":" By dynamic planning, we refer to the ability of the human brain to infer and\nimpose motor trajectories related to cognitive decisions. A recent paradigm,\nactive inference, brings fundamental insights into the adaptation of biological\norganisms, constantly striving to minimize prediction errors to restrict\nthemselves to life-compatible states. Over the past years, many studies have\nshown how human and animal behaviors could be explained in terms of active\ninference - either as discrete decision-making or continuous motor control -\ninspiring innovative solutions in robotics and artificial intelligence. Still,\nthe literature lacks a comprehensive outlook on effectively planning realistic\nactions in changing environments. Setting ourselves the goal of modeling\ncomplex tasks such as tool use, we delve into the topic of dynamic planning in\nactive inference, keeping in mind two crucial aspects of biological behavior:\nthe capacity to understand and exploit affordances for object manipulation, and\nto learn the hierarchical interactions between the self and the environment,\nincluding other agents. We start from a simple unit and gradually describe more\nadvanced structures, comparing recently proposed design choices and providing\nbasic examples. 
This study distances itself from traditional views centered on\nneural networks and reinforcement learning, and points toward a yet unexplored\ndirection in active inference: hybrid representations in hierarchical models.\n","authors":["Matteo Priorelli","Ivilin Peev Stoianov"],"pdf_url":"https://arxiv.org/pdf/2402.11658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07850v1","updated":"2024-11-12T15:01:47Z","published":"2024-11-12T15:01:47Z","title":"IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems","summary":" Adversarial examples, which are inputs deliberately perturbed with\nimperceptible changes to induce model errors, have raised serious concerns for\nthe reliability and security of deep neural networks (DNNs). While adversarial\nattacks have been extensively studied in continuous data domains such as\nimages, the discrete nature of text presents unique challenges. In this paper,\nwe propose Irony-based Adversarial Examples (IAE), a method that transforms\nstraightforward sentences into ironic ones to create adversarial text. This\napproach exploits the rhetorical device of irony, where the intended meaning is\nopposite to the literal interpretation, requiring a deeper understanding of\ncontext to detect. The IAE method is particularly challenging due to the need\nto accurately locate evaluation words, substitute them with appropriate\ncollocations, and expand the text with suitable ironic elements while\nmaintaining semantic coherence. Our research makes the following key\ncontributions: (1) We introduce IAE, a strategy for generating textual\nadversarial examples using irony. This method does not rely on pre-existing\nirony corpora, making it a versatile tool for creating adversarial text in\nvarious NLP tasks. (2) We demonstrate that the performance of several\nstate-of-the-art deep learning models on sentiment analysis tasks significantly\ndeteriorates when subjected to IAE attacks. 
This finding underscores the\nsusceptibility of current NLP systems to adversarial manipulation through\nirony. (3) We compare the impact of IAE on human judgment versus NLP systems,\nrevealing that humans are less susceptible to the effects of irony in text.\n","authors":["Xiaoyin Yi","Jiacheng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14803v3","updated":"2024-11-12T14:57:08Z","published":"2024-10-18T18:19:56Z","title":"DistRL: An Asynchronous Distributed Reinforcement Learning Framework for\n On-Device Control Agents","summary":" On-device control agents, especially on mobile devices, are responsible for\noperating mobile devices to fulfill users' requests, enabling seamless and\nintuitive interactions. Integrating Multimodal Large Language Models (MLLMs)\ninto these agents enhances their ability to understand and execute complex\ncommands, thereby improving user experience. However, fine-tuning MLLMs for\non-device control presents significant challenges due to limited data\navailability and inefficient online training processes. This paper introduces\nDistRL, a novel framework designed to enhance the efficiency of online RL\nfine-tuning for mobile device control agents. DistRL employs centralized\ntraining and decentralized data acquisition to ensure efficient fine-tuning in\nthe context of dynamic online interactions. Additionally, the framework is\nbacked by our tailor-made RL algorithm, which effectively balances exploration\nwith the prioritized utilization of collected data to ensure stable and robust\ntraining. Our experiments show that, on average, DistRL delivers a 3X\nimprovement in training efficiency and enables training data collection 2.4X\nfaster than the leading synchronous multi-machine methods. 
Notably, after\ntraining, DistRL achieves a 20% relative improvement in success rate compared\nto state-of-the-art methods on general Android tasks from an open benchmark,\nsignificantly outperforming existing approaches while maintaining the same\ntraining time. These results validate DistRL as a scalable and efficient\nsolution, offering substantial improvements in both training efficiency and\nagent performance for real-world, in-the-wild device control tasks.\n","authors":["Taiyi Wang","Zhihao Wu","Jianheng Liu","Jianye Hao","Jun Wang","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2410.14803v3.pdf","comment":"Paper and Appendix, 25 pages"},{"id":"http://arxiv.org/abs/2408.00002v2","updated":"2024-11-12T14:55:50Z","published":"2024-07-10T15:03:00Z","title":"Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against\n DenseNet, ResNet, and VGGNet on a Custom Dataset","summary":" This study evaluates the performance of various deep learning models,\nspecifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species\nclassification on a custom dataset. The dataset comprises 575 images of 23\nendangered species sourced from reputable online repositories. The study\nutilizes transfer learning to fine-tune pre-trained models on the dataset,\nfocusing on reducing training time and enhancing classification accuracy. The\nresults demonstrate that YOLOv8 outperforms other models, achieving a training\naccuracy of 97.39% and a validation F1-score of 96.50%. 
These findings suggest\nthat YOLOv8, with its advanced architecture and efficient feature extraction\ncapabilities, holds great promise for automating wildlife monitoring and\nconservation efforts.\n","authors":["Subek Sharma","Sisir Dhakal","Mansi Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2408.00002v2.pdf","comment":"This is published in Journal of Artificial Intelligence and Capsule\n Networks, December 2024, Volume 6, Issue 4, Pages 415-435"},{"id":"http://arxiv.org/abs/2411.07845v1","updated":"2024-11-12T14:53:12Z","published":"2024-11-12T14:53:12Z","title":"Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics\n Statements","summary":" What ethical concerns, if any, do LLM researchers have? We introduce EthiCon,\na corpus of 1,580 ethical concern statements extracted from scientific papers\npublished in the ACL Anthology. We extract ethical concern keywords from the\nstatements and show promising results in automating the concern identification\nprocess. Through a survey, we compare the ethical concerns of the corpus to the\nconcerns listed by the general public and professionals in the field. Finally,\nwe compare our retrieved ethical concerns with existing taxonomies pointing to\ngaps and future research directions.\n","authors":["Antonia Karamolegkou","Sandrine Schiller Hansen","Ariadni Christopoulou","Filippos Stamatiou","Anne Lauscher","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2411.07845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07843v1","updated":"2024-11-12T14:51:41Z","published":"2024-11-12T14:51:41Z","title":"Chain Association-based Attacking and Shielding Natural Language\n Processing Systems","summary":" Association as a gift enables people do not have to mention something in\ncompletely straightforward words and allows others to understand what they\nintend to refer to. 
In this paper, we propose a chain association-based\nadversarial attack against natural language processing systems, utilizing the\ncomprehension gap between humans and machines. We first generate a chain\nassociation graph for Chinese characters based on the association paradigm for\nbuilding search space of potential adversarial examples. Then, we introduce an\ndiscrete particle swarm optimization algorithm to search for the optimal\nadversarial examples. We conduct comprehensive experiments and show that\nadvanced natural language processing models and applications, including large\nlanguage models, are vulnerable to our attack, while humans appear good at\nunderstanding the perturbed text. We also explore two methods, including\nadversarial training and associative graph-based recovery, to shield systems\nfrom chain association-based attack. Since a few examples that use some\nderogatory terms, this paper contains materials that may be offensive or\nupsetting to some people.\n","authors":["Jiacheng Huang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07841v1","updated":"2024-11-12T14:46:31Z","published":"2024-11-12T14:46:31Z","title":"Federated Learning for Discrete Optimal Transport with Large Population\n under Incomplete Information","summary":" Optimal transport is a powerful framework for the efficient allocation of\nresources between sources and targets. However, traditional models often\nstruggle to scale effectively in the presence of large and heterogeneous\npopulations. In this work, we introduce a discrete optimal transport framework\ndesigned to handle large-scale, heterogeneous target populations, characterized\nby type distributions. We address two scenarios: one where the type\ndistribution of targets is known, and one where it is unknown. For the known\ndistribution, we propose a fully distributed algorithm to achieve optimal\nresource allocation. 
In the case of unknown distribution, we develop a\nfederated learning-based approach that enables efficient computation of the\noptimal transport scheme while preserving privacy. Case studies are provided to\nevaluate the performance of our learning algorithm.\n","authors":["Navpreet Kaur","Juntao Chen","Yingdong Lu"],"pdf_url":"https://arxiv.org/pdf/2411.07841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. 
Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06008v2","updated":"2024-11-12T14:30:28Z","published":"2024-11-08T23:02:59Z","title":"The Dark Patterns of Personalized Persuasion in Large Language Models:\n Exposing Persuasive Linguistic Features for Big Five Personality Traits in\n LLMs Responses","summary":" This study explores how the Large Language Models (LLMs) adjust linguistic\nfeatures to create personalized persuasive outputs. While research showed that\nLLMs personalize outputs, a gap remains in understanding the linguistic\nfeatures of their persuasive capabilities. We identified 13 linguistic features\ncrucial for influencing personalities across different levels of the Big Five\nmodel of personality. We analyzed how prompts with personality trait\ninformation influenced the output of 19 LLMs across five model families. The\nfindings show that models use more anxiety-related words for neuroticism,\nincrease achievement-related words for conscientiousness, and employ fewer\ncognitive processes words for openness to experience. Some model families excel\nat adapting language for openness to experience, others for conscientiousness,\nwhile only one model adapts language for neuroticism. 
Our findings show how\nLLMs tailor responses based on personality cues in prompts, indicating their\npotential to create persuasive content affecting the mind and well-being of the\nrecipients.\n","authors":["Wiktoria Mieleszczenko-Kowszewicz","Dawid Płudowski","Filip Kołodziejczyk","Jakub Świstak","Julian Sienkiewicz","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2411.06008v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2411.07826v1","updated":"2024-11-12T14:22:16Z","published":"2024-11-12T14:22:16Z","title":"Efficient Federated Finetuning of Tiny Transformers with\n Resource-Constrained Devices","summary":" In recent years, Large Language Models (LLMs) through Transformer structures\nhave dominated many machine learning tasks, especially text processing.\nHowever, these models require massive amounts of data for training and induce\nhigh resource requirements, particularly in terms of the large number of\nFloating Point Operations (FLOPs) and the high amounts of memory needed. To\nfine-tune such a model in a parameter-efficient way, techniques like Adapter or\nLoRA have been developed. However, we observe that the application of LoRA,\nwhen used in federated learning (FL), while still being parameter-efficient, is\nmemory and FLOP inefficient. 
Based on that observation, we develop a novel\nlayer finetuning scheme that allows devices in cross-device FL to make use of\npretrained neural networks (NNs) while adhering to given resource constraints.\nWe show that our presented scheme outperforms the current state of the art when\ndealing with homogeneous or heterogeneous computation and memory constraints\nand is on par with LoRA regarding limited communication, thereby achieving\nsignificantly higher accuracies in FL training.\n","authors":["Kilian Pfeiffer","Mohamed Aboelenien Ahmed","Ramin Khalili","Jörg Henkel"],"pdf_url":"https://arxiv.org/pdf/2411.07826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01078v2","updated":"2024-11-12T14:18:14Z","published":"2024-11-01T23:19:05Z","title":"Effective ML Model Versioning in Edge Networks","summary":" Machine learning (ML) models, data and software need to be regularly updated\nwhenever essential version updates are released and feasible for integration.\nThis is a basic but most challenging requirement to satisfy in the edge, due to\nthe various system constraints and the major impact that an update can have on\nrobustness and stability. In this paper, we formulate for the first time the ML\nmodel versioning optimization problem, and propose effective solutions,\nincluding the update automation with reinforcement learning (RL) based\nalgorithm. We study the edge network environment due to the known constraints\nin performance, response time, security, and reliability, which make updates\nespecially challenging. The performance study shows that model version updates\ncan be fully and effectively automated with reinforcement learning method. 
We\nshow that for every range of server load values, the proper versioning can be\nfound that improves security, reliability and/or ML model accuracy, while\nassuring a comparably lower response time.\n","authors":["Fin Gentzen","Mounir Bensalem","Admela Jukan"],"pdf_url":"https://arxiv.org/pdf/2411.01078v2.pdf","comment":"This paper is uploaded here for research community, thus it is for\n non-commercial purposes"},{"id":"http://arxiv.org/abs/2309.06799v5","updated":"2024-11-12T14:00:15Z","published":"2023-09-13T08:44:09Z","title":"When Geoscience Meets Foundation Models: Towards General Geoscience\n Artificial Intelligence System","summary":" Artificial intelligence (AI) has significantly advanced Earth sciences, yet\nits full potential in to comprehensively modeling Earth's complex dynamics\nremains unrealized. Geoscience foundation models (GFMs) emerge as a\nparadigm-shifting solution, integrating extensive cross-disciplinary data to\nenhance the simulation and understanding of Earth system dynamics. These\ndata-centric AI models extract insights from petabytes of structured and\nunstructured data, effectively addressing the complexities of Earth systems\nthat traditional models struggle to capture. The unique strengths of GFMs\ninclude flexible task specification, diverse input-output capabilities, and\nmulti-modal knowledge representation, enabling analyses that surpass those of\nindividual data sources or traditional AI methods. This review not only\nhighlights the key advantages of GFMs, but also presents essential techniques\nfor their construction, with a focus on transformers, pre-training, and\nadaptation strategies. 
Subsequently, we examine recent advancements in GFMs,\nincluding large language models, vision models, and vision-language models,\nparticularly emphasizing the potential applications in remote sensing.\nAdditionally, the review concludes with a comprehensive analysis of the\nchallenges and future trends in GFMs, addressing five critical aspects: data\nintegration, model complexity, uncertainty quantification, interdisciplinary\ncollaboration, and concerns related to privacy, trust, and security. This\nreview offers a comprehensive overview of emerging geoscientific research\nparadigms, emphasizing the untapped opportunities at the intersection of\nadvanced AI techniques and geoscience. It examines major methodologies,\nshowcases advances in large-scale models, and discusses the challenges and\nprospects that will shape the future landscape of GFMs.\n","authors":["Hao Zhang","Jin-Jian Xu","Hong-Wei Cui","Lin Li","Yaowen Yang","Chao-Sheng Tang","Niklas Boers"],"pdf_url":"https://arxiv.org/pdf/2309.06799v5.pdf","comment":"accpeted by IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2408.08770v3","updated":"2024-11-12T13:50:05Z","published":"2024-08-16T14:25:20Z","title":"Pessimistic Iterative Planning for Robust POMDPs","summary":" Robust POMDPs extend classical POMDPs to handle model uncertainty.\nSpecifically, robust POMDPs exhibit so-called uncertainty sets on the\ntransition and observation models, effectively defining ranges of\nprobabilities. Policies for robust POMDPs must be (1) memory-based to account\nfor partial observability and (2) robust against model uncertainty to account\nfor the worst-case instances from the uncertainty sets. 
To compute such robust\nmemory-based policies, we propose the pessimistic iterative planning (PIP)\nframework, which alternates between two main steps: (1) selecting a pessimistic\n(non-robust) POMDP via worst-case probability instances from the uncertainty\nsets; and (2) computing a finite-state controller (FSC) for this pessimistic\nPOMDP. We evaluate the performance of this FSC on the original robust POMDP and\nuse this evaluation in step (1) to select the next pessimistic POMDP. Within\nPIP, we propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC\nthrough a recurrent neural network by using supervision policies optimized for\nthe pessimistic POMDP. The empirical evaluation in four benchmark environments\nshowcases improved robustness against several baseline methods and competitive\nperformance compared to a state-of-the-art robust POMDP solver.\n","authors":["Maris F. L. Galesloot","Marnix Suilen","Thiago D. Simão","Steven Carr","Matthijs T. J. Spaan","Ufuk Topcu","Nils Jansen"],"pdf_url":"https://arxiv.org/pdf/2408.08770v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07796v1","updated":"2024-11-12T13:46:58Z","published":"2024-11-12T13:46:58Z","title":"PatchCTG: Patch Cardiotocography Transformer for Antepartum Fetal Health\n Monitoring","summary":" Antepartum Cardiotocography (CTG) is vital for fetal health monitoring, but\ntraditional methods like the Dawes-Redman system are often limited by high\ninter-observer variability, leading to inconsistent interpretations and\npotential misdiagnoses. 
This paper introduces PatchCTG, a transformer-based\nmodel specifically designed for CTG analysis, employing patch-based\ntokenisation, instance normalisation and channel-independent processing to\ncapture essential local and global temporal dependencies within CTG signals.\nPatchCTG was evaluated on the Oxford Maternity (OXMAT) dataset, comprising over\n20,000 CTG traces across diverse clinical outcomes after applying the inclusion\nand exclusion criteria. With extensive hyperparameter optimisation, PatchCTG\nachieved an AUC of 77%, with specificity of 88% and sensitivity of 57% at\nYouden's index threshold, demonstrating adaptability to various clinical needs.\nTesting across varying temporal thresholds showed robust predictive\nperformance, particularly with finetuning on data closer to delivery, achieving\na sensitivity of 52% and specificity of 88% for near-delivery cases. These\nfindings suggest the potential of PatchCTG to enhance clinical decision-making\nin antepartum care by providing a reliable, objective tool for fetal health\nassessment. The source code is available at\nhttps://github.com/jaleedkhan/PatchCTG.\n","authors":["M. Jaleed Khan","Manu Vatish","Gabriel Davis Jones"],"pdf_url":"https://arxiv.org/pdf/2411.07796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04689v2","updated":"2024-11-12T13:37:04Z","published":"2024-08-08T12:14:02Z","title":"Design of a Quality Management System based on the EU Artificial\n Intelligence Act","summary":" The EU AI Act mandates that providers and deployers of high-risk AI systems\nestablish a quality management system (QMS). Among other criteria, a QMS shall\nhelp verify and document the AI system design and quality and monitor the\nproper implementation of all high-risk AI system requirements. Current research\nrarely explores practical solutions for implementing the EU AI Act. Instead, it\ntends to focus on theoretical concepts. 
As a result, more attention must be\npaid to tools that help humans actively check and document AI systems and\norchestrate the implementation of all high-risk AI system requirements.\nTherefore, this paper introduces a new design concept and prototype for a QMS\nas a microservice Software as a Service web application. It connects directly\nto the AI system for verification and documentation and enables the\norchestration and integration of various sub-services, which can be\nindividually designed, each tailored to specific high-risk AI system\nrequirements. The first version of the prototype connects to the\nPhi-3-mini-128k-instruct LLM as an example of an AI system and integrates a\nrisk management system and a data management system. The prototype is evaluated\nthrough a qualitative assessment of the implemented requirements, a GPU memory\nand performance analysis, and an evaluation with IT, AI, and legal experts.\n","authors":["Henryk Mustroph","Stefanie Rinderle-Ma"],"pdf_url":"https://arxiv.org/pdf/2408.04689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07781v1","updated":"2024-11-12T13:30:06Z","published":"2024-11-12T13:30:06Z","title":"RedCode: Risky Code Execution and Generation Benchmark for Code Agents","summary":" With the rapidly increasing capabilities and adoption of code agents for\nAI-assisted coding, safety concerns, such as generating or executing risky\ncode, have become significant barriers to the real-world deployment of these\nagents. To provide comprehensive and practical evaluations on the safety of\ncode agents, we propose RedCode, a benchmark for risky code execution and\ngeneration: (1) RedCode-Exec provides challenging prompts that could lead to\nrisky code execution, aiming to evaluate code agents' ability to recognize and\nhandle unsafe code. 
We provide a total of 4,050 risky test cases in Python and\nBash tasks with diverse input formats including code snippets and natural text.\nThey covers 25 types of critical vulnerabilities spanning 8 domains (e.g.,\nwebsites, file systems). We provide Docker environments and design\ncorresponding evaluation metrics to assess their execution results. (2)\nRedCode-Gen provides 160 prompts with function signatures and docstrings as\ninput to assess whether code agents will follow instructions to generate\nharmful code or software. Our empirical findings, derived from evaluating three\nagent frameworks based on 19 LLMs, provide insights into code agents'\nvulnerabilities. For instance, evaluations on RedCode-Exec show that agents are\nmore likely to reject executing risky operations on the operating system, but\nare less likely to reject executing technically buggy code, indicating high\nrisks. Risky operations described in natural text lead to a lower rejection\nrate than those in code format. Additionally, evaluations on RedCode-Gen show\nthat more capable base models and agents with stronger overall coding\nabilities, such as GPT4, tend to produce more sophisticated and effective\nharmful software. Our findings highlight the need for stringent safety\nevaluations for diverse code agents. Our dataset and code are available at\nhttps://github.com/AI-secure/RedCode.\n","authors":["Chengquan Guo","Xun Liu","Chulin Xie","Andy Zhou","Yi Zeng","Zinan Lin","Dawn Song","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2411.07781v1.pdf","comment":"Accepted by NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2408.08074v2","updated":"2024-11-12T13:26:39Z","published":"2024-08-15T11:01:35Z","title":"A Survey on Integrated Sensing, Communication, and Computation","summary":" The forthcoming generation of wireless technology, 6G, aims to usher in an\nera of ubiquitous intelligent services, where everything is interconnected and\nintelligent. 
This vision requires the seamless integration of three fundamental\nmodules: Sensing for information acquisition, communication for information\nsharing, and computation for information processing and decision-making. These\nmodules are intricately linked, especially in complex tasks such as edge\nlearning and inference. However, the performance of these modules is\ninterdependent, creating a resource competition for time, energy, and\nbandwidth. Existing techniques like integrated communication and computation\n(ICC), integrated sensing and computation (ISC), and integrated sensing and\ncommunication (ISAC) have made partial strides in addressing this challenge,\nbut they fall short of meeting the extreme performance requirements. To\novercome these limitations, it is essential to develop new techniques that\ncomprehensively integrate sensing, communication, and computation. This\nintegrated approach, known as Integrated Sensing, Communication, and\nComputation (ISCC), offers a systematic perspective for enhancing task\nperformance. This paper begins with a comprehensive survey of historic and\nrelated techniques such as ICC, ISC, and ISAC, highlighting their strengths and\nlimitations. It then discusses the benefits, functions, and challenges of ISCC.\nSubsequently, the state-of-the-art signal designs for ISCC, along with network\nresource management strategies specifically tailored for ISCC are explored.\nFurthermore, this paper discusses the exciting research opportunities that lie\nahead for implementing ISCC in future advanced networks, and the unresolved\nissues requiring further investigation. ISCC is expected to unlock the full\npotential of intelligent connectivity, paving the way for groundbreaking\napplications and services.\n","authors":["Dingzhu Wen","Yong Zhou","Xiaoyang Li","Yuanming Shi","Kaibin Huang","Khaled B. 
Letaief"],"pdf_url":"https://arxiv.org/pdf/2408.08074v2.pdf","comment":"In this version, a series of discussions have been added.The\n benefits, functions, and challenges of ISCC are investigated using a new\n section. Moreover, the unresolved issues of ISCC have been discussed"},{"id":"http://arxiv.org/abs/2411.07773v1","updated":"2024-11-12T13:14:09Z","published":"2024-11-12T13:14:09Z","title":"Likelihood as a Performance Gauge for Retrieval-Augmented Generation","summary":" Recent work finds that retrieval-augmented generation with large language\nmodels is prone to be influenced by the order of retrieved documents in the\ncontext. However, the lack of in-depth analysis limits the use of this\nphenomenon for prompt engineering in practice. In this study, we posit that\nlikelihoods serve as an effective gauge for language model performance. Through\nexperiments on two question-answering datasets with a variety of\nstate-of-the-art language models, we reveal correlations between answer\naccuracy and the likelihood of the question at both the corpus level and the\ninstance level. In addition, we find that question likelihood can also indicate\nthe position of the task-relevant information in the context. Based on these\nfindings, we propose two methods that use question likelihood as a gauge for\nselecting and constructing prompts that lead to better performance. We\ndemonstrate their effectiveness with experiments. In addition, our\nlikelihood-based methods are efficient, as they only need to compute the\nlikelihood of the input, requiring much fewer language model passes than\nheuristic prompt engineering methods that require generating responses. 
Our\nanalysis deepens our understanding of how input prompts affect model\nperformance and provides a promising direction for efficient prompt\noptimization.\n","authors":["Tianyu Liu","Jirui Qi","Paul He","Arianna Bisazza","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07773v1.pdf","comment":"Under review at NAACL 2025. Code is available at\n https://github.com/lyutyuh/poptimizer"},{"id":"http://arxiv.org/abs/2411.07772v1","updated":"2024-11-12T13:13:20Z","published":"2024-11-12T13:13:20Z","title":"Automatic Album Sequencing","summary":" Album sequencing is a critical part of the album production process.\nRecently, a data-driven approach was proposed that sequences general\ncollections of independent media by extracting the narrative essence of the\nitems in the collections. While this approach implies an album sequencing\ntechnique, it is not widely accessible to a less technical audience, requiring\nadvanced knowledge of machine learning techniques to use. To address this, we\nintroduce a new user-friendly web-based tool that allows a less technical\naudience to upload music tracks, execute this technique in one click, and\nsubsequently presents the result in a clean visualization to the user. To both\nincrease the number of templates available to the user and address shortcomings\nof previous work, we also introduce a new direct transformer-based album\nsequencing method. We find that our more direct method outperforms a random\nbaseline but does not reach the same performance as the narrative essence\napproach. Both methods are included in our web-based user interface, and this\n-- alongside a full copy of our implementation -- is publicly available at\nhttps://github.com/dylanashley/automatic-album-sequencing\n","authors":["Vincent Herrmann","Dylan R. 
Ashley","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2411.07772v1.pdf","comment":"presented as a late breaking demo in the 25th International Society\n for Music Information Retrieval Conference; 3 pages in main text, 3 figures\n in main text; source code available at\n https://github.com/dylanashley/automatic-album-sequencing"},{"id":"http://arxiv.org/abs/2407.00352v2","updated":"2024-11-12T13:01:48Z","published":"2024-06-29T07:53:47Z","title":"PhyTracker: An Online Tracker for Phytoplankton","summary":" Phytoplankton, a crucial component of aquatic ecosystems, requires efficient\nmonitoring to understand marine ecological processes and environmental\nconditions. Traditional phytoplankton monitoring methods, relying on non-in\nsitu observations, are time-consuming and resource-intensive, limiting timely\nanalysis. To address these limitations, we introduce PhyTracker, an intelligent\nin situ tracking framework designed for automatic tracking of phytoplankton.\nPhyTracker overcomes significant challenges unique to phytoplankton monitoring,\nsuch as constrained mobility within water flow, inconspicuous appearance, and\nthe presence of impurities. Our method incorporates three innovative modules: a\nTexture-enhanced Feature Extraction (TFE) module, an Attention-enhanced\nTemporal Association (ATA) module, and a Flow-agnostic Movement Refinement\n(FMR) module. These modules enhance feature capture, differentiate between\nphytoplankton and impurities, and refine movement characteristics,\nrespectively. Extensive experiments on the PMOT dataset validate the\nsuperiority of PhyTracker in phytoplankton tracking, and additional tests on\nthe MOT dataset demonstrate its general applicability, outperforming\nconventional tracking methods. 
This work highlights key differences between\nphytoplankton and traditional objects, offering an effective solution for\nphytoplankton monitoring.\n","authors":["Yang Yu","Qingxuan Lv","Yuezun Li","Zhiqiang Wei","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2407.00352v2.pdf","comment":"13pages,eleven figures"},{"id":"http://arxiv.org/abs/2411.04799v2","updated":"2024-11-12T12:57:58Z","published":"2024-11-07T15:38:25Z","title":"Kwai-STaR: Transform LLMs into State-Transition Reasoners","summary":" Mathematical reasoning presents a significant challenge to the cognitive\ncapabilities of LLMs. Various methods have been proposed to enhance the\nmathematical ability of LLMs. However, few recognize the value of state\ntransition for LLM reasoning. In this work, we define mathematical\nproblem-solving as a process of transiting from an initial unsolved state to\nthe final resolved state, and propose Kwai-STaR framework, which transforms\nLLMs into State-Transition Reasoners to improve their intuitive reasoning\ncapabilities. Our approach comprises three main steps: (1) Define the state\nspace tailored to the mathematical reasoning. (2) Generate state-transition\ndata based on the state space. (3) Convert original LLMs into State-Transition\nReasoners via a curricular training strategy. Our experiments validate the\neffectiveness of Kwai-STaR in enhancing mathematical reasoning: After training\non the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and\nLLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard\ndataset. Additionally, the state transition-based design endows Kwai-STaR with\nremarkable training and inference efficiency. 
Further experiments are underway\nto establish the generality of Kwai-STaR.\n","authors":["Xingyu Lu","Yuhang Hu","Changyi Liu","Tianke Zhang","Zhenyu Yang","Zhixiang Ding","Shengsheng Qian","Meng Du","Ruiwen Kang","Kaiyu Tang","Fan Yang","Tingting Gao","Di Zhang","Hai-Tao Zheng","Bin Wen"],"pdf_url":"https://arxiv.org/pdf/2411.04799v2.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.07763v1","updated":"2024-11-12T12:52:17Z","published":"2024-11-12T12:52:17Z","title":"Spider 2.0: Evaluating Language Models on Real-World Enterprise\n Text-to-SQL Workflows","summary":" Real-world enterprise text-to-SQL workflows often involve complex cloud or\nlocal data across various database systems, multiple SQL queries in various\ndialects, and diverse operations from data transformation to analytics. We\nintroduce Spider 2.0, an evaluation framework comprising 632 real-world\ntext-to-SQL workflow problems derived from enterprise-level database use cases.\nThe databases in Spider 2.0 are sourced from real data applications, often\ncontaining over 1,000 columns and stored in local or cloud database systems\nsuch as BigQuery and Snowflake. We show that solving problems in Spider 2.0\nfrequently requires understanding and searching through database metadata,\ndialect documentation, and even project-level codebases. This challenge calls\nfor models to interact with complex SQL workflow environments, process\nextremely long contexts, perform intricate reasoning, and generate multiple SQL\nqueries with diverse operations, often exceeding 100 lines, which goes far\nbeyond traditional text-to-SQL challenges. Our evaluations indicate that based\non o1-preview, our code agent framework successfully solves only 17.0% of the\ntasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. 
Our results on\nSpider 2.0 show that while language models have demonstrated remarkable\nperformance in code generation -- especially in prior text-to-SQL benchmarks --\nthey require significant improvement in order to achieve adequate performance\nfor real-world enterprise usage. Progress on Spider 2.0 represents crucial\nsteps towards developing intelligent, autonomous, code agents for real-world\nenterprise settings. Our code, baseline models, and data are available at\nhttps://spider2-sql.github.io.\n","authors":["Fangyu Lei","Jixuan Chen","Yuxiao Ye","Ruisheng Cao","Dongchan Shin","Hongjin Su","Zhaoqing Suo","Hongcheng Gao","Wenjing Hu","Pengcheng Yin","Victor Zhong","Caiming Xiong","Ruoxi Sun","Qian Liu","Sida Wang","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2411.07763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07762v1","updated":"2024-11-12T12:52:04Z","published":"2024-11-12T12:52:04Z","title":"ASER: Activation Smoothing and Error Reconstruction for Large Language\n Model Quantization","summary":" Quantization stands as a pivotal technique for large language model (LLM)\nserving, yet it poses significant challenges particularly in achieving\neffective low-bit quantization. The limited numerical mapping makes the\nquantized model produce a non-trivial error, bringing out intolerable\nperformance degration. This paper is anchored in the basic idea of model\ncompression objectives, and delves into the layer-wise error distribution of\nLLMs during post-training quantization. Subsequently, we introduce ASER, an\nalgorithm consisting of (1) Error Reconstruction: low-rank compensation for\nquantization error with LoRA-style matrices constructed by whitening SVD; (2)\nActivation Smoothing: outlier extraction to gain smooth activation and better\nerror compensation. ASER is capable of quantizing typical LLMs to low-bit ones,\nparticularly preserving accuracy even in W4A8 per-channel setup. 
Experimental\nresults show that ASER is competitive among the state-of-the-art quantization\nalgorithms, showing potential to activation quantization, with minor overhead.\n","authors":["Weibo Zhao","Yubin Shi","Xinyu Lyu","Wanchen Sui","Shen Li","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.07762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07760v1","updated":"2024-11-12T12:49:41Z","published":"2024-11-12T12:49:41Z","title":"Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit\n Q-Learning","summary":" Offline Reinforcement Learning (RL) has emerged as a powerful alternative to\nimitation learning for behavior modeling in various domains, particularly in\ncomplex navigation tasks. An existing challenge with Offline RL is the\nsignal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to\nerrors in value estimates. Towards this, multiple works have demonstrated the\nadvantage of hierarchical offline RL methods, which decouples high-level path\nplanning from low-level path following. In this work, we present a novel\nhierarchical transformer-based approach leveraging a learned quantizer of the\nspace. This quantization enables the training of a simpler zone-conditioned\nlow-level policy and simplifies planning, which is reduced to discrete\nautoregressive prediction. Among other benefits, zone-level reasoning in\nplanning enables explicit trajectory stitching rather than implicit stitching\nbased on noisy value function estimates. By combining this transformer-based\nplanner with recent advancements in offline RL, our proposed approach achieves\nstate-of-the-art results in complex long-distance navigation environments.\n","authors":["Alexi Canesse","Mathieu Petitbois","Ludovic Denoyer","Sylvain Lamprier","Rémy Portelas"],"pdf_url":"https://arxiv.org/pdf/2411.07760v1.pdf","comment":"Under review. 
Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2411.07759v1","updated":"2024-11-12T12:37:50Z","published":"2024-11-12T12:37:50Z","title":"Optimizing Traffic Signal Control using High-Dimensional State\n Representation and Efficient Deep Reinforcement Learning","summary":" In reinforcement learning-based (RL-based) traffic signal control (TSC),\ndecisions on the signal timing are made based on the available information on\nvehicles at a road intersection. This forms the state representation for the RL\nenvironment which can either be high-dimensional containing several variables\nor a low-dimensional vector. Current studies suggest that using high\ndimensional state representations does not lead to improved performance on TSC.\nHowever, we argue, with experimental results, that the use of high dimensional\nstate representations can, in fact, lead to improved TSC performance with\nimprovements up to 17.9% of the average waiting time. This high-dimensional\nrepresentation is obtainable using the cost-effective vehicle-to-infrastructure\n(V2I) communication, encouraging its adoption for TSC. Additionally, given the\nlarge size of the state, we identified the need to have computational efficient\nmodels and explored model compression via pruning.\n","authors":["Lawrence Francis","Blessed Guda","Ahmed Biyabani"],"pdf_url":"https://arxiv.org/pdf/2411.07759v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2411.07751v1","updated":"2024-11-12T12:23:41Z","published":"2024-11-12T12:23:41Z","title":"SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State\n Space Model","summary":" Speech enhancement plays an essential role in various applications, and the\nintegration of visual information has been demonstrated to bring substantial\nadvantages. 
However, the majority of current research concentrates on the\nexamination of facial and lip movements, which can be compromised or entirely\ninaccessible in scenarios where occlusions occur or when the camera view is\ndistant. Whereas contextual visual cues from the surrounding environment have\nbeen overlooked: for example, when we see a dog bark, our brain has the innate\nability to discern and filter out the barking noise. To this end, in this\npaper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is\nthe first proposal to use rich contextual information from synchronized video\nas auxiliary cues to indicate the type of noise, which eventually improves the\nspeech enhancement performance. Specifically, we propose the VC-S$^2$E method,\nwhich incorporates the Conformer and Mamba modules for their complementary\nstrengths. Extensive experiments are conducted on public MUSIC, AVSpeech and\nAudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E\nover other competitive methods. We will make the source code publicly\navailable. Project demo page: https://AVSEPage.github.io/\n","authors":["Xinyuan Qian","Jiaran Gao","Yaodan Zhang","Qiquan Zhang","Hexin Liu","Leibny Paola Garcia","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2411.07751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09767v2","updated":"2024-11-12T12:09:20Z","published":"2024-10-13T07:51:39Z","title":"LibEER: A Comprehensive Benchmark and Algorithm Library for EEG-based\n Emotion Recognition","summary":" EEG-based emotion recognition (EER) has gained significant attention due to\nits potential for understanding and analyzing human emotions. 
While recent\nadvancements in deep learning techniques have substantially improved EER, the\nfield lacks a convincing benchmark and comprehensive open-source libraries.\nThis absence complicates fair comparisons between models and creates\nreproducibility challenges for practitioners, which collectively hinder\nprogress. To address these issues, we introduce LibEER, a comprehensive\nbenchmark and algorithm library designed to facilitate fair comparisons in EER.\nLibEER carefully selects popular and powerful baselines, harmonizes key\nimplementation details across methods, and provides a standardized codebase in\nPyTorch. By offering a consistent evaluation framework with standardized\nexperimental settings, LibEER enables unbiased assessments of over ten\nrepresentative deep learning models for EER across the four most widely used\ndatasets. Additionally, we conduct a thorough, reproducible comparison of model\nperformance and efficiency, providing valuable insights to guide researchers in\nthe selection and design of EER models. Moreover, we make observations and\nin-depth analysis on the experiment results and identify current challenges in\nthis community. We hope that our work will not only lower entry barriers for\nnewcomers to EEG-based emotion recognition but also contribute to the\nstandardization of research in this domain, fostering steady development. 
The\nlibrary and source code are publicly available at\nhttps://github.com/XJTU-EEG/LibEER.\n","authors":["Huan Liu","Shusen Yang","Yuzhe Zhang","Mengze Wang","Fanyu Gong","Chengxi Xie","Guanjian Liu","Zejun Liu","Yong-Jin Liu","Bao-Liang Lu","Dalin Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.09767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06911v2","updated":"2024-11-12T12:07:00Z","published":"2024-11-11T12:13:58Z","title":"Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI","summary":" Segmentation of cardiac magnetic resonance images (MRI) is crucial for the\nanalysis and assessment of cardiac function, helping to diagnose and treat\nvarious cardiovascular diseases. Most recent techniques rely on deep learning\nand usually require an extensive amount of labeled data. To overcome this\nproblem, few-shot learning has the capability of reducing data dependency on\nlabeled data. In this work, we introduce a new method that merges few-shot\nlearning with a U-Net architecture and Gaussian Process Emulators (GPEs),\nenhancing data integration from a support set for improved performance. GPEs\nare trained to learn the relation between the support images and the\ncorresponding masks in latent space, facilitating the segmentation of unseen\nquery images given only a small labeled support set at inference. We test our\nmodel with the M&Ms-2 public dataset to assess its ability to segment the heart\nin cardiac magnetic resonance imaging from different orientations, and compare\nit with state-of-the-art unsupervised and few-shot methods. 
Our architecture\nshows higher DICE coefficients compared to these methods, especially in the\nmore challenging setups where the size of the support set is considerably\nsmall.\n","authors":["Bruno Viti","Franz Thaler","Kathrin Lisa Kapper","Martin Urschler","Martin Holler","Elias Karabelas"],"pdf_url":"https://arxiv.org/pdf/2411.06911v2.pdf","comment":"Accepted at Statistical Atlases and Computational Modeling of the\n Heart (STACOM) Workshop 2024"},{"id":"http://arxiv.org/abs/2411.07739v1","updated":"2024-11-12T12:03:57Z","published":"2024-11-12T12:03:57Z","title":"Unlocking Legal Knowledge with Multi-Layered Embedding-Based Retrieval","summary":" This work addresses the challenge of capturing the complexities of legal\nknowledge by proposing a multi-layered embedding-based retrieval method for\nlegal and legislative texts. Creating embeddings not only for individual\narticles but also for their components (paragraphs, clauses) and structural\ngroupings (books, titles, chapters, etc), we seek to capture the subtleties of\nlegal information through the use of dense vectors of embeddings, representing\nit at varying levels of granularity. Our method meets various information needs\nby allowing the Retrieval Augmented Generation system to provide accurate\nresponses, whether for specific segments or entire sections, tailored to the\nuser's query. We explore the concepts of aboutness, semantic chunking, and\ninherent hierarchy within legal texts, arguing that this method enhances the\nlegal information retrieval. Despite the focus being on Brazil's legislative\nmethods and the Brazilian Constitution, which follow a civil law tradition, our\nfindings should in principle be applicable across different legal systems,\nincluding those adhering to common law traditions. 
Furthermore, the principles\nof the proposed method extend beyond the legal domain, offering valuable\ninsights for organizing and retrieving information in any field characterized\nby information encoded in hierarchical text.\n","authors":["João Alberto de Oliveira Lima"],"pdf_url":"https://arxiv.org/pdf/2411.07739v1.pdf","comment":"27 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.07087v2","updated":"2024-11-12T12:03:07Z","published":"2024-11-11T16:04:49Z","title":"OCMDP: Observation-Constrained Markov Decision Process","summary":" In many practical applications, decision-making processes must balance the\ncosts of acquiring information with the benefits it provides. Traditional\ncontrol systems often assume full observability, an unrealistic assumption when\nobservations are expensive. We tackle the challenge of simultaneously learning\nobservation and control strategies in such cost-sensitive environments by\nintroducing the Observation-Constrained Markov Decision Process (OCMDP), where\nthe policy influences the observability of the true state. To manage the\ncomplexity arising from the combined observation and control actions, we\ndevelop an iterative, model-free deep reinforcement learning algorithm that\nseparates the sensing and control components of the policy. This decomposition\nenables efficient learning in the expanded action space by focusing on when and\nwhat to observe, as well as determining optimal control actions, without\nrequiring knowledge of the environment's dynamics. We validate our approach on\na simulated diagnostic task and a realistic healthcare environment using\nHeartPole. 
Given both scenarios, the experimental results demonstrate that our\nmodel achieves a substantial reduction in observation costs on average,\nsignificantly outperforming baseline methods by a notable margin in efficiency.\n","authors":["Taiyi Wang","Jianheng Liu","Bryan Lee","Zhihao Wu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2411.07087v2.pdf","comment":"Full paper, 14 Pages"},{"id":"http://arxiv.org/abs/2405.00722v2","updated":"2024-11-12T11:49:33Z","published":"2024-04-26T11:57:21Z","title":"LLMs for Generating and Evaluating Counterfactuals: A Comprehensive\n Study","summary":" As NLP models become more complex, understanding their decisions becomes more\ncrucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's\nprediction, offer a way to explain these models. While Large Language Models\n(LLMs) have shown remarkable performance in NLP tasks, their efficacy in\ngenerating high-quality CFs remains uncertain. This work fills this gap by\ninvestigating how well LLMs generate CFs for two NLU tasks. We conduct a\ncomprehensive comparison of several common LLMs, and evaluate their CFs,\nassessing both intrinsic metrics, and the impact of these CFs on data\naugmentation. Moreover, we analyze differences between human and LLM-generated\nCFs, providing insights for future research directions. Our results show that\nLLMs generate fluent CFs, but struggle to keep the induced changes minimal.\nGenerating CFs for Sentiment Analysis (SA) is less challenging than NLI where\nLLMs show weaknesses in generating CFs that flip the original label. This also\nreflects on the data augmentation performance, where we observe a large gap\nbetween augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs'\nability to assess CFs in a mislabelled data setting, and show that they have a\nstrong bias towards agreeing with the provided labels. GPT4 is more robust\nagainst this bias and its scores correlate well with automatic metrics. 
Our\nfindings reveal several limitations and point to potential future work\ndirections.\n","authors":["Van Bach Nguyen","Paul Youssef","Christin Seifert","Jörg Schlötterer"],"pdf_url":"https://arxiv.org/pdf/2405.00722v2.pdf","comment":"Accepted to EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2408.12308v3","updated":"2024-11-12T11:45:35Z","published":"2024-08-22T11:34:34Z","title":"Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on\n Supervised Regression (Preprint)","summary":" In this tutorial, we present a compact and holistic discussion of Deep\nLearning with a focus on Convolutional Neural Networks (CNNs) and supervised\nregression. While there are numerous books and articles on the individual\ntopics we cover, comprehensive and detailed tutorials that address Deep\nLearning from a foundational yet rigorous and accessible perspective are rare.\nMost resources on CNNs are either too advanced, focusing on cutting-edge\narchitectures, or too narrow, addressing only specific applications like image\nclassification.This tutorial not only summarizes the most relevant concepts but\nalso provides an in-depth exploration of each, offering a complete yet agile\nset of ideas. Moreover, we highlight the powerful synergy between learning\ntheory, statistic, and machine learning, which together underpin the Deep\nLearning and CNN frameworks. We aim for this tutorial to serve as an optimal\nresource for students, professors, and anyone interested in understanding the\nfoundations of Deep Learning. Upon acceptance we will provide an accompanying\nrepository under\n\\href{https://github.com/neoglez/deep-learning-tutorial}{https://github.com/neoglez/deep-learning-tutorial}\n Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine\nLearning.\n","authors":["Yansel Gonzalez Tejeda","Helmut A. 
Mayer"],"pdf_url":"https://arxiv.org/pdf/2408.12308v3.pdf","comment":"Submitted to the journal Machine Learning and Knowledge Extraction"},{"id":"http://arxiv.org/abs/2411.07728v1","updated":"2024-11-12T11:39:05Z","published":"2024-11-12T11:39:05Z","title":"No-Reference Point Cloud Quality Assessment via Graph Convolutional\n Network","summary":" Three-dimensional (3D) point cloud, as an emerging visual media format, is\nincreasingly favored by consumers as it can provide more realistic visual\ninformation than two-dimensional (2D) data. Similar to 2D plane images and\nvideos, point clouds inevitably suffer from quality degradation and information\nloss through multimedia communication systems. Therefore, automatic point cloud\nquality assessment (PCQA) is of critical importance. In this work, we propose a\nnovel no-reference PCQA method by using a graph convolutional network (GCN) to\ncharacterize the mutual dependencies of multi-view 2D projected image contents.\nThe proposed GCN-based PCQA (GC-PCQA) method contains three modules, i.e.,\nmulti-view projection, graph construction, and GCN-based quality prediction.\nFirst, multi-view projection is performed on the test point cloud to obtain a\nset of horizontally and vertically projected images. Then, a\nperception-consistent graph is constructed based on the spatial relations among\ndifferent projected images. Finally, reasoning on the constructed graph is\nperformed by GCN to characterize the mutual dependencies and interactions\nbetween different projected images, and aggregate feature information of\nmulti-view projected images for final quality prediction. Experimental results\non two publicly available benchmark databases show that our proposed GC-PCQA\ncan achieve superior performance than state-of-the-art quality assessment\nmetrics. 
The code will be available at: https://github.com/chenwuwq/GC-PCQA.\n","authors":["Wu Chen","Qiuping Jiang","Wei Zhou","Feng Shao","Guangtao Zhai","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2411.07728v1.pdf","comment":"Accepted by IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2411.07722v1","updated":"2024-11-12T11:28:50Z","published":"2024-11-12T11:28:50Z","title":"Is Cognition consistent with Perception? Assessing and Mitigating\n Multimodal Knowledge Conflicts in Document Understanding","summary":" Multimodal large language models (MLLMs) have shown impressive capabilities\nin document understanding, a rapidly growing research area with significant\nindustrial demand in recent years. As a multimodal task, document understanding\nrequires models to possess both perceptual and cognitive abilities. However,\ncurrent MLLMs often face conflicts between perception and cognition. Taking a\ndocument VQA task (cognition) as an example, an MLLM might generate answers\nthat do not match the corresponding visual content identified by its OCR\n(perception). This conflict suggests that the MLLM might struggle to establish\nan intrinsic connection between the information it \"sees\" and what it\n\"understands.\" Such conflicts challenge the intuitive notion that cognition is\nconsistent with perception, hindering the performance and explainability of\nMLLMs. In this paper, we define the conflicts between cognition and perception\nas Cognition and Perception (C&P) knowledge conflicts, a form of multimodal\nknowledge conflicts, and systematically assess them with a focus on document\nunderstanding. Our analysis reveals that even GPT-4o, a leading MLLM, achieves\nonly 68.6% C&P consistency. To mitigate the C&P knowledge conflicts, we propose\na novel method called Multimodal Knowledge Consistency Fine-tuning. This method\nfirst ensures task-specific consistency and then connects the cognitive and\nperceptual knowledge. 
Our method significantly reduces C&P knowledge conflicts\nacross all tested MLLMs and enhances their performance in both cognitive and\nperceptual tasks in most scenarios.\n","authors":["Zirui Shao","Chuwei Luo","Zhaoqing Zhu","Hangdi Xing","Zhi Yu","Qi Zheng","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2411.07722v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.07863v3","updated":"2024-11-12T11:18:43Z","published":"2024-05-13T15:50:39Z","title":"RLHF Workflow: From Reward Modeling to Online RLHF","summary":" We present the workflow of Online Iterative Reinforcement Learning from Human\nFeedback (RLHF) in this technical report, which is widely reported to\noutperform its offline counterpart by a large margin in the recent large\nlanguage model (LLM) literature. However, existing open-source RLHF projects\nare still largely confined to the offline learning setting. In this technical\nreport, we aim to fill in this gap and provide a detailed recipe that is easy\nto reproduce for online iterative RLHF. In particular, since online human\nfeedback is usually infeasible for open-source communities with limited\nresources, we start by constructing preference models using a diverse set of\nopen-source datasets and use the constructed proxy preference model to\napproximate human feedback. Then, we discuss the theoretical insights and\nalgorithmic principles behind online iterative RLHF, followed by a detailed\npractical implementation. Our trained LLM achieves impressive performance on\nLLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as\nwell as other academic benchmarks such as HumanEval and TruthfulQA. We have\nshown that supervised fine-tuning (SFT) and iterative RLHF can obtain\nstate-of-the-art performance with fully open-source datasets. Further, we have\nmade our models, curated datasets, and comprehensive step-by-step code\nguidebooks publicly available. 
Please refer to\nhttps://github.com/RLHFlow/RLHF-Reward-Modeling and\nhttps://github.com/RLHFlow/Online-RLHF for more detailed information.\n","authors":["Hanze Dong","Wei Xiong","Bo Pang","Haoxiang Wang","Han Zhao","Yingbo Zhou","Nan Jiang","Doyen Sahoo","Caiming Xiong","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.07863v3.pdf","comment":"Published in Transactions on Machine Learning Research (09/2024)"},{"id":"http://arxiv.org/abs/2411.07715v1","updated":"2024-11-12T11:09:58Z","published":"2024-11-12T11:09:58Z","title":"Training Data for Large Language Model","summary":" In 2022, with the release of ChatGPT, large-scale language models gained\nwidespread attention. ChatGPT not only surpassed previous models in terms of\nparameters and the scale of its pretraining corpus but also achieved\nrevolutionary performance improvements through fine-tuning on a vast amount of\nhigh-quality, human-annotated data. This progress has led enterprises and\nresearch institutions to recognize that building smarter and more powerful\nmodels relies on rich and high-quality datasets. Consequently, the construction\nand optimization of datasets have become a critical focus in the field of\nartificial intelligence. This paper summarizes the current state of pretraining\nand fine-tuning data for training large-scale language models, covering aspects\nsuch as data scale, collection methods, data types and characteristics,\nprocessing workflows, and provides an overview of available open-source\ndatasets.\n","authors":["Yiming Ju","Huanhuan Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07715v1.pdf","comment":"in Chinese language"},{"id":"http://arxiv.org/abs/2407.14192v2","updated":"2024-11-12T11:09:35Z","published":"2024-07-19T10:40:10Z","title":"LeKUBE: A Legal Knowledge Update BEnchmark","summary":" Recent advances in Large Language Models (LLMs) have significantly shaped the\napplications of AI in multiple fields, including the studies of legal\nintelligence. 
Trained on extensive legal texts, including statutes and legal\ndocuments, the legal LLMs can capture important legal knowledge/concepts\neffectively and provide important support for downstream legal applications\nsuch as legal consultancy. Yet, the dynamic nature of legal statutes and\ninterpretations also poses new challenges to the use of LLMs in legal\napplications. Particularly, how to update the legal knowledge of LLMs\neffectively and efficiently has become an important research problem in\npractice. Existing benchmarks for evaluating knowledge update methods are\nmostly designed for the open domain and cannot address the specific challenges\nof the legal domain, such as the nuanced application of new legal knowledge,\nthe complexity and lengthiness of legal regulations, and the intricate nature\nof legal reasoning. To address this gap, we introduce the Legal Knowledge\nUpdate BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for\nlegal LLMs across five dimensions. Specifically, we categorize the needs of\nknowledge updates in the legal domain with the help of legal professionals, and\nthen hire annotators from law schools to create synthetic updates to the\nChinese Criminal and Civil Code as well as sets of questions of which the\nanswers would change after the updates. 
Through a comprehensive evaluation of\nstate-of-the-art knowledge update methods, we reveal a notable gap between\nexisting knowledge update methods and the unique needs of the legal domain,\nemphasizing the need for further research and development of knowledge update\nmechanisms tailored for legal LLMs.\n","authors":["Changyue Wang","Weihang Su","Hu Yiran","Qingyao Ai","Yueyue Wu","Cheng Luo","Yiqun Liu","Min Zhang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2407.14192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07691v1","updated":"2024-11-12T10:15:33Z","published":"2024-11-12T10:15:33Z","title":"New Emerged Security and Privacy of Pre-trained Model: a Survey and\n Outlook","summary":" Thanks to the explosive growth of data and the development of computational\nresources, it is possible to build pre-trained models that can achieve\noutstanding performance on various tasks, such as neural language processing,\ncomputer vision, and more. Despite their powerful capabilities, pre-trained\nmodels have also sparked attention to the emerging security challenges\nassociated with their real-world applications. Security and privacy issues,\nsuch as leaking privacy information and generating harmful responses, have\nseriously undermined users' confidence in these powerful models. Concerns are\ngrowing as model performance improves dramatically. Researchers are eager to\nexplore the unique security and privacy issues that have emerged, their\ndistinguishing factors, and how to defend against them. However, the current\nliterature lacks a clear taxonomy of emerging attacks and defenses for\npre-trained models, which hinders a high-level and comprehensive understanding\nof these questions. To fill the gap, we conduct a systematical survey on the\nsecurity risks of pre-trained models, proposing a taxonomy of attack and\ndefense methods based on the accessibility of pre-trained models' input and\nweights in various security test scenarios. 
This taxonomy categorizes attacks\nand defenses into No-Change, Input-Change, and Model-Change approaches. With\nthe taxonomy analysis, we capture the unique security and privacy issues of\npre-trained models, categorizing and summarizing existing security issues based\non their characteristics. In addition, we offer a timely and comprehensive\nreview of each category's strengths and limitations. Our survey concludes by\nhighlighting potential new research opportunities in the security and privacy\nof pre-trained models.\n","authors":["Meng Yang","Tianqing Zhu","Chi Liu","WanLei Zhou","Shui Yu","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2411.07691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07690v1","updated":"2024-11-12T10:15:11Z","published":"2024-11-12T10:15:11Z","title":"World Models: The Safety Perspective","summary":" With the proliferation of the Large Language Model (LLM), the concept of\nWorld Models (WM) has recently attracted a great deal of attention in the AI\nresearch community, especially in the context of AI agents. It is arguably\nevolving into an essential foundation for building AI agent systems. A WM is\nintended to help the agent predict the future evolution of environmental states\nor help the agent fill in missing information so that it can plan its actions\nand behave safely. The safety property of WM plays a key role in their\neffective use in critical applications. In this work, we review and analyze the\nimpacts of the current state-of-the-art in WM technology from the point of view\nof trustworthiness and safety based on a comprehensive survey and the fields of\napplication envisaged. 
We provide an in-depth analysis of state-of-the-art WMs\nand derive technical research challenges and their impact in order to call on\nthe research community to collaborate on improving the safety and\ntrustworthiness of WM.\n","authors":["Zifan Zeng","Chongzhe Zhang","Feng Liu","Joseph Sifakis","Qunli Zhang","Shiming Liu","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07690v1.pdf","comment":"8 pages, 3 figures, accepted at the International Workshop on\n Dependability Modeling and Design (WDMD) during the IEEE International\n Symposium on Software Reliability Engineering (ISSRE)"},{"id":"http://arxiv.org/abs/2411.07688v1","updated":"2024-11-12T10:12:12Z","published":"2024-11-12T10:12:12Z","title":"Enhancing Ultra High Resolution Remote Sensing Imagery Analysis with\n ImageRAG","summary":" Ultra High Resolution (UHR) remote sensing imagery (RSI) (e.g. 100,000\n$\\times$ 100,000 pixels or more) poses a significant challenge for current\nRemote Sensing Multimodal Large Language Models (RSMLLMs). If choose to resize\nthe UHR image to standard input image size, the extensive spatial and\ncontextual information that UHR images contain will be neglected. Otherwise,\nthe original size of these images often exceeds the token limits of standard\nRSMLLMs, making it difficult to process the entire image and capture long-range\ndependencies to answer the query based on the abundant visual context. In this\npaper, we introduce ImageRAG for RS, a training-free framework to address the\ncomplexities of analyzing UHR remote sensing imagery. By transforming UHR\nremote sensing image analysis task to image's long context selection task, we\ndesign an innovative image contextual retrieval mechanism based on the\nRetrieval-Augmented Generation (RAG) technique, denoted as ImageRAG. ImageRAG's\ncore innovation lies in its ability to selectively retrieve and focus on the\nmost relevant portions of the UHR image as visual contexts that pertain to a\ngiven query. 
Fast path and slow path are proposed in this framework to handle\nthis task efficiently and effectively. ImageRAG allows RSMLLMs to manage\nextensive context and spatial information from UHR RSI, ensuring the analysis\nis both accurate and efficient.\n","authors":["Zilun Zhang","Haozhan Shen","Tiancheng Zhao","Yuhao Wang","Bin Chen","Yuxiang Cai","Yongheng Shang","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2411.07688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02487v2","updated":"2024-11-12T10:03:37Z","published":"2024-08-05T14:09:30Z","title":"LiCoEval: Evaluating LLMs on License Compliance in Code Generation","summary":" Recent advances in Large Language Models (LLMs) have revolutionized code\ngeneration, leading to widespread adoption of AI coding tools by developers.\nHowever, LLMs can generate license-protected code without providing the\nnecessary license information, leading to potential intellectual property\nviolations during software production. This paper addresses the critical, yet\nunderexplored, issue of license compliance in LLM-generated code by\nestablishing a benchmark to evaluate the ability of LLMs to provide accurate\nlicense information for their generated code. To establish this benchmark, we\nconduct an empirical study to identify a reasonable standard for \"striking\nsimilarity\" that excludes the possibility of independent creation, indicating a\ncopy relationship between the LLM output and certain open-source code. Based on\nthis standard, we propose LiCoEval, to evaluate the license compliance\ncapabilities of LLMs, i.e., the ability to provide accurate license or\ncopyright information when they generate code with striking similarity to\nalready existing copyrighted code. Using LiCoEval, we evaluate 14 popular LLMs,\nfinding that even top-performing LLMs produce a non-negligible proportion\n(0.88% to 2.01%) of code strikingly similar to existing open-source\nimplementations. 
Notably, most LLMs fail to provide accurate license\ninformation, particularly for code under copyleft licenses. These findings\nunderscore the urgent need to enhance LLM compliance capabilities in code\ngeneration tasks. Our study provides a foundation for future research and\ndevelopment to improve license compliance in AI-assisted software development,\ncontributing to both the protection of open-source software copyrights and the\nmitigation of legal risks for LLM users.\n","authors":["Weiwei Xu","Kai Gao","Hao He","Minghui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.02487v2.pdf","comment":"The 47th International Conference on Software Engineering(ICSE 2025)"},{"id":"http://arxiv.org/abs/2411.07686v1","updated":"2024-11-12T09:58:21Z","published":"2024-11-12T09:58:21Z","title":"Data-Driven Graph Switching for Cyber-Resilient Control in Microgrids","summary":" Distributed microgrids are conventionally dependent on communication networks\nto achieve secondary control objectives. This dependence makes them vulnerable\nto stealth data integrity attacks (DIAs) where adversaries may perform\nmanipulations via infected transmitters and repeaters to jeopardize stability.\nThis paper presents a physics-guided, supervised Artificial Neural Network\n(ANN)-based framework that identifies communication-level cyberattacks in\nmicrogrids by analyzing whether incoming measurements will cause abnormal\nbehavior of the secondary control layer. If abnormalities are detected, an\niteration through possible spanning tree graph topologies that can be used to\nfulfill secondary control objectives is done. Then, a communication network\ntopology that would not create secondary control abnormalities is identified\nand enforced for maximum stability. By altering the communication graph\ntopology, the framework eliminates the dependence of the secondary control\nlayer on inputs from compromised cyber devices helping it achieve resilience\nwithout instability. 
Several case studies are provided showcasing the\nrobustness of the framework against False Data Injections and repeater-level\nMan-in-the-Middle attacks. To understand practical feasibility, robustness is\nalso verified against larger microgrid sizes and in the presence of varying\nnoise levels. Our findings indicate that performance can be affected when\nattempting scalability in the presence of noise. However, the framework\noperates robustly in low-noise settings.\n","authors":["Suman Rath","Subham Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.07686v1.pdf","comment":"Accepted in IEEE Design Methodologies Conference (DMC) 2024"},{"id":"http://arxiv.org/abs/2411.07685v1","updated":"2024-11-12T09:57:53Z","published":"2024-11-12T09:57:53Z","title":"Fast Disentangled Slim Tensor Learning for Multi-view Clustering","summary":" Tensor-based multi-view clustering has recently received significant\nattention due to its exceptional ability to explore cross-view high-order\ncorrelations. However, most existing methods still encounter some limitations.\n(1) Most of them explore the correlations among different affinity matrices,\nmaking them unscalable to large-scale data. (2) Although some methods address\nit by introducing bipartite graphs, they may result in sub-optimal solutions\ncaused by an unstable anchor selection process. (3) They generally ignore the\nnegative impact of latent semantic-unrelated information in each view. To\ntackle these issues, we propose a new approach termed fast Disentangled Slim\nTensor Learning (DSTL) for multi-view clustering . 
Instead of focusing on the\nmulti-view graph structures, DSTL directly explores the high-order correlations\namong multi-view latent semantic representations based on matrix factorization.\nTo alleviate the negative influence of feature redundancy, inspired by robust\nPCA, DSTL disentangles the latent low-dimensional representation into a\nsemantic-unrelated part and a semantic-related part for each view.\nSubsequently, two slim tensors are constructed with tensor-based\nregularization. To further enhance the quality of feature disentanglement, the\nsemantic-related representations are aligned across views through a consensus\nalignment indicator. Our proposed model is computationally efficient and can be\nsolved effectively. Extensive experiments demonstrate the superiority and\nefficiency of DSTL over state-of-the-art approaches. The code of DSTL is\navailable at https://github.com/dengxu-nju/DSTL.\n","authors":["Deng Xu","Chao Zhang","Zechao Li","Chunlin Chen","Huaxiong Li"],"pdf_url":"https://arxiv.org/pdf/2411.07685v1.pdf","comment":"13 pages,6 figures, will be published to IEEE TMM"},{"id":"http://arxiv.org/abs/2411.07684v1","updated":"2024-11-12T09:56:42Z","published":"2024-11-12T09:56:42Z","title":"AI enhanced diagnosis of Peyronies disease a novel approach using\n Computer Vision","summary":" This study presents an innovative AI-driven tool for diagnosing Peyronie's\nDisease (PD), a condition that affects between 0.3% and 13.1% of men worldwide.\nOur method uses key point detection on both images and videos to measure penile\ncurvature angles, utilizing advanced computer vision techniques. This tool has\ndemonstrated high accuracy in identifying anatomical landmarks, validated\nagainst conventional goniometer measurements. Traditional PD diagnosis often\ninvolves subjective and invasive methods, which can lead to patient discomfort\nand inaccuracies. Our approach offers a precise, reliable, and non-invasive\ndiagnostic tool to address these drawbacks. 
The model distinguishes between PD\nand normal anatomical changes with a sensitivity of 96.7% and a specificity of\n100%. This advancement represents a significant improvement in urological\ndiagnostics, greatly enhancing the efficacy and convenience of PD assessment\nfor healthcare providers and patients.\n","authors":["Yudara Kularathne","Janitha Prathapa","Prarththanan Sothyrajah","Salomi Arasaratnam","Sithira Ambepitiya","Thanveer Ahamed","Dinuka Wijesundara"],"pdf_url":"https://arxiv.org/pdf/2411.07684v1.pdf","comment":"8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.13848v2","updated":"2024-11-12T09:21:13Z","published":"2024-03-18T10:44:22Z","title":"Smooth Sensitivity for Learning Differentially-Private yet Accurate Rule\n Lists","summary":" Differentially-private (DP) mechanisms can be embedded into the design of a\nmachine learning algorithm to protect the resulting model against privacy\nleakage. However, this often comes with a significant loss of accuracy due to\nthe noise added to enforce DP. In this paper, we aim at improving this\ntrade-off for a popular class of machine learning algorithms leveraging the\nGini impurity as an information gain criterion to greedily build interpretable\nmodels such as decision trees or rule lists. To this end, we establish the\nsmooth sensitivity of the Gini impurity, which can be used to obtain thorough\nDP guarantees while adding noise scaled with tighter magnitude. We illustrate\nthe applicability of this mechanism by integrating it within a greedy algorithm\nproducing rule list models, motivated by the fact that such models remain\nunderstudied in the DP literature. 
Our theoretical analysis and experimental\nresults confirm that the DP rule lists models integrating smooth sensitivity\nhave higher accuracy that those using other DP frameworks based on global\nsensitivity, for identical privacy budgets.\n","authors":["Timothée Ly","Julien Ferry","Marie-José Huguet","Sébastien Gambs","Ulrich Aivodji"],"pdf_url":"https://arxiv.org/pdf/2403.13848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18412v3","updated":"2024-11-12T09:11:37Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. 
To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v3.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS\n 2024 Workshop FM4Science"},{"id":"http://arxiv.org/abs/2411.07654v1","updated":"2024-11-12T09:06:16Z","published":"2024-11-12T09:06:16Z","title":"Spike Talk in Power Electronic Grids -- Leveraging Post Moore's\n Computing Laws","summary":" Emerging distributed generation demands highly reliable and resilient\ncoordinating control in microgrids. To improve on these aspects, spiking neural\nnetwork is leveraged, as a grid-edge intelligence tool to establish a talkative\ninfrastructure, Spike Talk, expediting coordination in next-generation\nmicrogrids without the need of communication at all. This paper unravels the\nphysics behind Spike Talk from the perspective of its distributed\ninfrastructure, which aims to address the Von Neumann Bottleneck. Relying on\ninferring information via power flows in tie lines, Spike Talk allows adaptive\nand flexible control and coordination itself, and features in synaptic\nplasticity facilitating online and local training functionality. 
Preliminary\ncase studies are demonstrated with results, while more extensive validations\nare to be included as future scopes of work.\n","authors":["Yubo Song","Subham Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.07654v1.pdf","comment":"The manuscript has been accepted for publication in the Proceedings\n of 2024 IEEE Design Methodologies for Power Electronics Conference (DMC2024)"},{"id":"http://arxiv.org/abs/2411.07650v1","updated":"2024-11-12T09:02:11Z","published":"2024-11-12T09:02:11Z","title":"Understanding Audiovisual Deepfake Detection: Techniques, Challenges,\n Human Factors and Perceptual Insights","summary":" Deep Learning has been successfully applied in diverse fields, and its impact\non deepfake detection is no exception. Deepfakes are fake yet realistic\nsynthetic content that can be used deceitfully for political impersonation,\nphishing, slandering, or spreading misinformation. Despite extensive research\non unimodal deepfake detection, identifying complex deepfakes through joint\nanalysis of audio and visual streams remains relatively unexplored. To fill\nthis gap, this survey first provides an overview of audiovisual deepfake\ngeneration techniques, applications, and their consequences, and then provides\na comprehensive review of state-of-the-art methods that combine audio and\nvisual modalities to enhance detection accuracy, summarizing and critically\nanalyzing their strengths and limitations. Furthermore, we discuss existing\nopen source datasets for a deeper understanding, which can contribute to the\nresearch community and provide necessary information to beginners who want to\nanalyze deep learning-based audiovisual methods for video forensics. 
By\nbridging the gap between unimodal and multimodal approaches, this paper aims to\nimprove the effectiveness of deepfake detection strategies and guide future\nresearch in cybersecurity and media integrity.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12866v2","updated":"2024-11-12T08:59:30Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. 
This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2311.10944v5","updated":"2024-11-12T08:31:22Z","published":"2023-11-18T02:44:33Z","title":"Deception Detection from Linguistic and Physiological Data Streams Using\n Bimodal Convolutional Neural Networks","summary":" Deception detection is gaining increasing interest due to ethical and\nsecurity concerns. This paper explores the application of convolutional neural\nnetworks for the purpose of multimodal deception detection. We use a dataset\nbuilt by interviewing 104 subjects about two topics, with one truthful and one\nfalsified response from each subject about each topic. In particular, we make\nthree main contributions. First, we extract linguistic and physiological\nfeatures from this data to train and construct the neural network models.\nSecond, we propose a fused convolutional neural network model using both\nmodalities in order to achieve an improved overall performance. Third, we\ncompare our new approach with earlier methods designed for multimodal deception\ndetection. 
We find that our system outperforms regular classification methods;\nour results indicate the feasibility of using neural networks for deception\ndetection even in the presence of limited amounts of data.\n","authors":["Panfeng Li","Mohamed Abouelenien","Rada Mihalcea","Zhicheng Ding","Qikai Yang","Yiming Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10944v5.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2411.07634v1","updated":"2024-11-12T08:27:27Z","published":"2024-11-12T08:27:27Z","title":"Exploring Multi-Agent Reinforcement Learning for Unrelated Parallel\n Machine Scheduling","summary":" Scheduling problems pose significant challenges in resource, industry, and\noperational management. This paper addresses the Unrelated Parallel Machine\nScheduling Problem (UPMS) with setup times and resources using a Multi-Agent\nReinforcement Learning (MARL) approach. The study introduces the Reinforcement\nLearning environment and conducts empirical analyses, comparing MARL with\nSingle-Agent algorithms. The experiments employ various deep neural network\npolicies for single- and Multi-Agent approaches. Results demonstrate the\nefficacy of the Maskable extension of the Proximal Policy Optimization (PPO)\nalgorithm in Single-Agent scenarios and the Multi-Agent PPO algorithm in\nMulti-Agent setups. While Single-Agent algorithms perform adequately in reduced\nscenarios, Multi-Agent approaches reveal challenges in cooperative learning but\na scalable capacity. This research contributes insights into applying MARL\ntechniques to scheduling optimization, emphasizing the need for algorithmic\nsophistication balanced with scalability for intelligent scheduling solutions.\n","authors":["Maria Zampella","Urtzi Otamendi","Xabier Belaunzaran","Arkaitz Artetxe","Igor G. 
Olaizola","Giuseppe Longo","Basilio Sierra"],"pdf_url":"https://arxiv.org/pdf/2411.07634v1.pdf","comment":"11 pages, 5 figures, 4 tables, article submitted to a journal"},{"id":"http://arxiv.org/abs/2411.07618v1","updated":"2024-11-12T07:54:13Z","published":"2024-11-12T07:54:13Z","title":"Direct Preference Optimization Using Sparse Feature-Level Constraints","summary":" The alignment of large language models (LLMs) with human preferences remains\na key challenge. While post-training techniques like Reinforcement Learning\nfrom Human Feedback (RLHF) and Direct Preference Optimization (DPO) have\nachieved notable success, they often introduce computational inefficiencies and\ntraining instability. In this paper, we propose Feature-level constrained\nPreference Optimization (FPO), a novel method designed to simplify the\nalignment process while ensuring stability. FPO leverages pre-trained Sparse\nAutoencoders (SAEs) and introduces feature-level constraints, allowing for\nefficient, sparsity-enforced alignment. 
Our approach enjoys efficiency by using\nsparse features activated in a well-trained sparse autoencoder and the quality\nof sequential KL divergence by using the feature-level offline reference.\nExperimental results on benchmark datasets demonstrate that FPO achieves a\n5.08% absolute improvement in win rate with much lower computational cost\ncompared to state-of-the-art baselines, making it a promising solution for\nefficient and controllable LLM alignments.\n","authors":["Qingyu Yin","Chak Tou Leong","Hongbo Zhang","Minjun Zhu","Hanqi Yan","Qiang Zhang","Yulan He","Wenjie Li","Jun Wang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06634v2","updated":"2024-11-12T07:52:33Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. 
We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.07232v2","updated":"2024-11-12T07:49:39Z","published":"2024-11-11T18:50:09Z","title":"Add-it: Training-Free Object Insertion in Images With Pretrained\n Diffusion Models","summary":" Adding Object into images based on text instructions is a challenging task in\nsemantic image editing, requiring a balance between preserving the original\nscene and seamlessly integrating the new object in a fitting location. Despite\nextensive efforts, existing models often struggle with this balance,\nparticularly with finding a natural location for adding an object in complex\nscenes. We introduce Add-it, a training-free approach that extends diffusion\nmodels' attention mechanisms to incorporate information from three key sources:\nthe scene image, the text prompt, and the generated image itself. Our weighted\nextended-attention mechanism maintains structural consistency and fine details\nwhile ensuring natural object placement. 
Without task-specific fine-tuning,\nAdd-it achieves state-of-the-art results on both real and generated image\ninsertion benchmarks, including our newly constructed \"Additing Affordance\nBenchmark\" for evaluating object placement plausibility, outperforming\nsupervised methods. Human evaluations show that Add-it is preferred in over 80%\nof cases, and it also demonstrates improvements in various automated metrics.\n","authors":["Yoad Tewel","Rinon Gal","Dvir Samuel","Yuval Atzmon","Lior Wolf","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2411.07232v2.pdf","comment":"Project page is at https://research.nvidia.com/labs/par/addit/"},{"id":"http://arxiv.org/abs/2404.13812v4","updated":"2024-11-12T07:44:20Z","published":"2024-04-22T01:16:11Z","title":"A Comparative Study on Enhancing Prediction in Social Network\n Advertisement through Data Augmentation","summary":" In the ever-evolving landscape of social network advertising, the volume and\naccuracy of data play a critical role in the performance of predictive models.\nHowever, the development of robust predictive algorithms is often hampered by\nthe limited size and potential bias present in real-world datasets. This study\npresents and explores a generative augmentation framework of social network\nadvertising data. Our framework explores three generative models for data\naugmentation - Generative Adversarial Networks (GANs), Variational Autoencoders\n(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and\ndiversity in the context of social network advertising analytics effectiveness.\nBy performing synthetic extensions of the feature space, we find that through\ndata augmentation, the performance of various classifiers has been\nquantitatively improved. 
Furthermore, we compare the relative performance gains\nbrought by each data augmentation technique, providing insights for\npractitioners to select appropriate techniques to enhance model performance.\nThis paper contributes to the literature by showing that synthetic data\naugmentation alleviates the limitations imposed by small or imbalanced datasets\nin the field of social network advertising. At the same time, this article also\nprovides a comparative perspective on the practicality of different data\naugmentation methods, thereby guiding practitioners to choose appropriate\ntechniques to enhance model performance.\n","authors":["Qikai Yang","Panfeng Li","Xinhe Xu","Zhicheng Ding","Wenjing Zhou","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13812v4.pdf","comment":"Accepted by 2024 4th International Conference on Machine Learning and\n Intelligent Systems Engineering (MLISE)"},{"id":"http://arxiv.org/abs/2411.07611v1","updated":"2024-11-12T07:34:56Z","published":"2024-11-12T07:34:56Z","title":"Multimodal Clinical Reasoning through Knowledge-augmented Rationale\n Generation","summary":" Clinical rationales play a pivotal role in accurate disease diagnosis;\nhowever, many models predominantly use discriminative methods and overlook the\nimportance of generating supportive rationales. Rationale distillation is a\nprocess that transfers knowledge from large language models (LLMs) to smaller\nlanguage models (SLMs), thereby enhancing the latter's ability to break down\ncomplex tasks. Despite its benefits, rationale distillation alone is inadequate\nfor addressing domain knowledge limitations in tasks requiring specialized\nexpertise, such as disease diagnosis. Effectively embedding domain knowledge in\nSLMs poses a significant challenge. While current LLMs are primarily geared\ntoward processing textual data, multimodal LLMs that incorporate time series\ndata, especially electronic health records (EHRs), are still evolving. 
To\ntackle these limitations, we introduce ClinRaGen, an SLM optimized for\nmultimodal rationale generation in disease diagnosis. ClinRaGen incorporates a\nunique knowledge-augmented attention mechanism to merge domain knowledge with\ntime series EHR data, utilizing a stepwise rationale distillation strategy to\nproduce both textual and time series-based clinical rationales. Our evaluations\nshow that ClinRaGen markedly improves the SLM's capability to interpret\nmultimodal EHR data and generate accurate clinical rationales, supporting more\nreliable disease diagnosis, advancing LLM applications in healthcare, and\nnarrowing the performance divide between LLMs and SLMs.\n","authors":["Shuai Niu","Jing Ma","Liang Bai","Zhihua Wang","Yida Xu","Yunya Song","Xian Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07611v1.pdf","comment":"11 pages. 4 figures"},{"id":"http://arxiv.org/abs/2406.12199v3","updated":"2024-11-12T07:28:08Z","published":"2024-06-18T01:55:37Z","title":"Time Series Modeling for Heart Rate Prediction: From ARIMA to\n Transformers","summary":" Cardiovascular disease (CVD) is a leading cause of death globally,\nnecessitating precise forecasting models for monitoring vital signs like heart\nrate, blood pressure, and ECG. Traditional models, such as ARIMA and Prophet,\nare limited by their need for manual parameter tuning and challenges in\nhandling noisy, sparse, and highly variable medical data. This study\ninvestigates advanced deep learning models, including LSTM, and\ntransformer-based architectures, for predicting heart rate time series from the\nMIT-BIH Database. Results demonstrate that deep learning models, particularly\nPatchTST, significantly outperform traditional models across multiple metrics,\ncapturing complex patterns and dependencies more effectively. This research\nunderscores the potential of deep learning to enhance patient monitoring and\nCVD management, suggesting substantial clinical benefits. 
Future work should\nextend these findings to larger, more diverse datasets and real-world clinical\napplications to further validate and optimize model performance.\n","authors":["Haowei Ni","Shuchen Meng","Xieming Geng","Panfeng Li","Zhuoying Li","Xupeng Chen","Xiaotong Wang","Shiyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12199v3.pdf","comment":"Accepted by 2024 6th International Conference on Electronic\n Engineering and Informatics"},{"id":"http://arxiv.org/abs/2411.07606v1","updated":"2024-11-12T07:26:51Z","published":"2024-11-12T07:26:51Z","title":"Optimizing Service Function Chain Mapping in Network Function\n Virtualization through Simultaneous NF Decomposition and VNF Placement","summary":" Network function virtualization enables network operators to implement new\nservices through a process called service function chain mapping. The concept\nof Service Function Chain (SFC) is introduced to provide complex services,\nwhich is an ordered set of Network Functions (NF). The network functions of an\nSFC can be decomposed in several ways into some Virtual Network Functions\n(VNF). Additionally, the decomposed NFs can be placed (mapped) as VNFs on\ndifferent machines on the underlying physical infrastructure. Selecting good\ndecompositions and good placements among the possible options greatly affects\nboth costs and service quality metrics. Previous research has addressed NF\ndecomposition and VNF placement as separate problems. However, in this paper,\nwe address both NF decomposition and VNF placement simultaneously as a single\nproblem. Since finding an optimal solution is NP-hard, we have employed\nheuristic algorithms to solve the problem. Specifically, we have introduced a\nmultiobjective decomposition and mapping VNFs (MODMVNF) method based on the\nnon-dominated sorting genetic multi-objective algorithm (NSGAII) to solve the\nproblem. 
The goal is to find near-optimal decomposition and mapping on the\nphysical network at the same time to minimize the mapping cost and\ncommunication latency of SFC. The comparison of the results of the proposed\nmethod with the results obtained by solving ILP formulation of the problem as\nwell as the results obtained from the multi-objective particle swarm algorithm\nshows the efficiency and effectiveness of the proposed method in terms of cost\nand communication latency.\n","authors":["Asghar Asgharian-Sardroud","Mohammad Hossein Izanlou","Amin Jabbari","Sepehr Mahmoodian Hamedani"],"pdf_url":"https://arxiv.org/pdf/2411.07606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07602v1","updated":"2024-11-12T07:24:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ntighter circuit complexity bound for Transformers with $\\mathsf{RoPE}$\nattention. Our key contribution is that we show that unless $\\mathsf{TC}^0 =\n\\mathsf{NC}^1$, a $\\mathsf{RoPE}$-based Transformer with\n$\\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \\leq O(n)$\ncannot solve the arithmetic problem or the Boolean formula value problem. 
This\nresult significantly demonstrates the fundamental limitation of the\nexpressivity of the $\\mathsf{RoPE}$-based Transformer architecture, although it\nachieves giant empirical success. Our theoretical framework not only\nestablishes tighter complexity bounds but also may instruct further work on the\n$\\mathsf{RoPE}$-based Transformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13565v3","updated":"2024-11-12T07:21:04Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. 
This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v3.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.07598v1","updated":"2024-11-12T07:16:51Z","published":"2024-11-12T07:16:51Z","title":"Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring\n Conversations","summary":" Many open-ended conversations (e.g., tutoring lessons or business meetings)\nrevolve around pre-defined reference materials, like worksheets or meeting\nbullets. To provide a framework for studying such conversation structure, we\nintroduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly\nbreaking down conversations into segments and linking each segment to the\nrelevant reference item. As a case study, we apply POSR to education where\neffectively structuring lessons around problems is critical yet difficult. We\npresent LessonLink, the first dataset of real-world tutoring lessons, featuring\n3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT\nmath problems. We define and evaluate several joint and independent approaches\nfor POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT),\nand large language models (LLMs) methods. Our results highlight that modeling\nPOSR as one joint task is essential: POSR methods outperform independent\nsegmentation and retrieval pipelines by up to +76% on joint metrics and surpass\ntraditional segmentation methods by up to +78% on segmentation metrics. 
We\ndemonstrate POSR's practical impact on downstream education applications,\nderiving new insights on the language and time use in real-world lesson\nstructures.\n","authors":["Rose E. Wang","Pawan Wirawarn","Kenny Lam","Omar Khattab","Dorottya Demszky"],"pdf_url":"https://arxiv.org/pdf/2411.07598v1.pdf","comment":"EMNLP 2024 Findings. Our code and dataset are open-sourced at\n https://github.com/rosewang2008/posr"},{"id":"http://arxiv.org/abs/2306.06909v4","updated":"2024-11-12T07:11:29Z","published":"2023-06-12T07:27:31Z","title":"Graph Agent Network: Empowering Nodes with Inference Capabilities for\n Adversarial Resilience","summary":" End-to-end training with global optimization have popularized graph neural\nnetworks (GNNs) for node classification, yet inadvertently introduced\nvulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit\nthe inherent opened interfaces of GNNs' input and output, perturbing critical\nedges and thus manipulating the classification results. Current defenses, due\nto their persistent utilization of global-optimization-based end-to-end\ntraining schemes, inherently encapsulate the vulnerabilities of GNNs. This is\nspecifically evidenced in their inability to defend against targeted secondary\nattacks. In this paper, we propose the Graph Agent Network (GAgN) to address\nthe aforementioned vulnerabilities of GNNs. GAgN is a graph-structured agent\nnetwork in which each node is designed as an 1-hop-view agent. Through the\ndecentralized interactions between agents, they can learn to infer global\nperceptions to perform tasks including inferring embeddings, degrees and\nneighbor relationships for given nodes. This empowers nodes to filtering\nadversarial edges while carrying out classification tasks. Furthermore, agents'\nlimited view prevents malicious messages from propagating globally in GAgN,\nthereby resisting global-optimization-based secondary attacks. 
We prove that\nsingle-hidden-layer multilayer perceptrons (MLPs) are theoretically sufficient\nto achieve these functionalities. Experimental results show that GAgN\neffectively implements all its intended capabilities and, compared to\nstate-of-the-art defenses, achieves optimal classification accuracy on the\nperturbed datasets.\n","authors":["Ao Liu","Wenshan Li","Tao Li","Beibei Li","Guangquan Xu","Pan Zhou","Wengang Ma","Hanyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2306.06909v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07595v1","updated":"2024-11-12T07:09:44Z","published":"2024-11-12T07:09:44Z","title":"Entropy Controllable Direct Preference Optimization","summary":" In the post-training of large language models (LLMs), Reinforcement Learning\nfrom Human Feedback (RLHF) is an effective approach to achieve generation\naligned with human preferences. Direct Preference Optimization (DPO) allows for\npolicy training with a simple binary cross-entropy loss without a reward model.\nThe objective of DPO is regularized by reverse KL divergence that encourages\nmode-seeking fitting to the reference policy. Nonetheless, we indicate that\nminimizing reverse KL divergence could fail to capture a mode of the reference\ndistribution, which may hurt the policy's performance. Based on this\nobservation, we propose a simple modification to DPO, H-DPO, which allows for\ncontrol over the entropy of the resulting policy, enhancing the distribution's\nsharpness and thereby enabling mode-seeking fitting more effectively. 
In our\nexperiments, we show that H-DPO outperformed DPO across various tasks,\ndemonstrating superior results in pass@$k$ evaluations for mathematical tasks.\nMoreover, H-DPO is simple to implement, requiring only minor modifications to\nthe loss calculation of DPO, which makes it highly practical and promising for\nwide-ranging applications in the training of LLMs.\n","authors":["Motoki Omura","Yasuhiro Fujita","Toshiki Kataoka"],"pdf_url":"https://arxiv.org/pdf/2411.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07589v1","updated":"2024-11-12T06:58:03Z","published":"2024-11-12T06:58:03Z","title":"Overhead-free User-side Recommender Systems","summary":" Traditionally, recommendation algorithms have been designed for service\ndevelopers. But recently, a new paradigm called user-side recommender systems\nhas been proposed. User-side recommender systems are built and used by end\nusers, in sharp contrast to traditional provider-side recommender systems. Even\nif the official recommender system offered by the provider is not fair, end\nusers can create and enjoy their own user-side recommender systems by\nthemselves. Although the concept of user-side recommender systems is\nattractive, the problem is they require tremendous communication costs between\nthe user and the official system. Even the most efficient user-side recommender\nsystems require about 5 times more costs than provider-side recommender\nsystems. Such high costs hinder the adoption of user-side recommender systems.\nIn this paper, we propose overhead-free user-side recommender systems,\nRecCycle, which realizes user-side recommender systems without any\ncommunication overhead. The main idea of RecCycle is to recycle past\nrecommendation results offered by the provider's recommender systems. The\ningredients of RecCycle can be retrieved ``for free,'' and it greatly reduces\nthe cost of user-side recommendations. 
In the experiments, we confirm that\nRecCycle performs as well as state-of-the-art user-side recommendation\nalgorithms while RecCycle reduces costs significantly.\n","authors":["Ryoma Sato"],"pdf_url":"https://arxiv.org/pdf/2411.07589v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.09864,\n arXiv:2403.15757"},{"id":"http://arxiv.org/abs/2411.07586v1","updated":"2024-11-12T06:47:54Z","published":"2024-11-12T06:47:54Z","title":"A Comprehensive Survey of AI-Driven Advancements and Techniques in\n Automated Program Repair and Code Generation","summary":" Bug fixing and code generation have been core research topics in software\ndevelopment for many years. The recent explosive growth in Large Language\nModels has completely transformed these spaces, putting in reach incredibly\npowerful tools for both. In this survey, 27 recent papers have been reviewed\nand split into two groups: one dedicated to Automated Program Repair (APR) and\nLLM integration and the other to code generation using LLMs. The first group\nconsists of new methods for bug detection and repair, which include locating\nsemantic errors, security vulnerabilities, and runtime failure bugs. The place\nof LLMs in reducing manual debugging efforts is emphasized in this work by APR\ntoward context-aware fixes, with innovations that boost accuracy and efficiency\nin automatic debugging. The second group dwells on code generation, providing\nan overview of both general-purpose LLMs fine-tuned for programming and\ntask-specific models. It also presents methods to improve code generation, such\nas identifier-aware training, fine-tuning at the instruction level, and\nincorporating semantic code structures. This survey work contrasts the\nmethodologies in APR and code generation to identify trends such as using LLMs,\nfeedback loops to enable iterative code improvement and open-source models. 
It\nalso discusses the challenges of achieving functional correctness and security\nand outlines future directions for research in LLM-based software development.\n","authors":["Avinash Anand","Akshit Gupta","Nishchay Yadav","Shaurya Bajaj"],"pdf_url":"https://arxiv.org/pdf/2411.07586v1.pdf","comment":"A survey of recent developments in AI-assisted automated program\n repair"},{"id":"http://arxiv.org/abs/2406.08200v3","updated":"2024-11-12T06:46:41Z","published":"2024-06-12T13:33:24Z","title":"Asynchronous Voice Anonymization Using Adversarial Perturbation On\n Speaker Embedding","summary":" Voice anonymization has been developed as a technique for preserving privacy\nby replacing the speaker's voice in a speech signal with that of a\npseudo-speaker, thereby obscuring the original voice attributes from machine\nrecognition and human perception. In this paper, we focus on altering the voice\nattributes against machine recognition while retaining human perception. We\nreferred to this as the asynchronous voice anonymization. To this end, a speech\ngeneration framework incorporating a speaker disentanglement mechanism is\nemployed to generate the anonymized speech. 
The speaker attributes are altered\nthrough adversarial perturbation applied on the speaker embedding, while human\nperception is preserved by controlling the intensity of perturbation.\nExperiments conducted on the LibriSpeech dataset showed that the speaker\nattributes were obscured with their human perception preserved for 60.71% of\nthe processed utterances.\n","authors":["Rui Wang","Liping Chen","Kong AiK Lee","Zhen-Hua Ling"],"pdf_url":"https://arxiv.org/pdf/2406.08200v3.pdf","comment":"accpeted by Interspeech2024"},{"id":"http://arxiv.org/abs/2411.07585v1","updated":"2024-11-12T06:44:28Z","published":"2024-11-12T06:44:28Z","title":"Reinforcement Learning Framework for Quantitative Trading","summary":" The inherent volatility and dynamic fluctuations within the financial stock\nmarket underscore the necessity for investors to employ a comprehensive and\nreliable approach that integrates risk management strategies, market trends,\nand the movement trends of individual securities. By evaluating specific data,\ninvestors can make more informed decisions. However, the current body of\nliterature lacks substantial evidence supporting the practical efficacy of\nreinforcement learning (RL) agents, as many models have only demonstrated\nsuccess in back testing using historical data. This highlights the urgent need\nfor a more advanced methodology capable of addressing these challenges. There\nis a significant disconnect in the effective utilization of financial\nindicators to better understand the potential market trends of individual\nsecurities. The disclosure of successful trading strategies is often restricted\nwithin financial markets, resulting in a scarcity of widely documented and\npublished strategies leveraging RL. 
Furthermore, current research frequently\noverlooks the identification of financial indicators correlated with various\nmarket trends and their potential advantages.\n This research endeavors to address these complexities by enhancing the\nability of RL agents to effectively differentiate between positive and negative\nbuy/sell actions using financial indicators. While we do not address all\nconcerns, this paper provides deeper insights and commentary on the utilization\nof technical indicators and their benefits within reinforcement learning. This\nwork establishes a foundational framework for further exploration and\ninvestigation of more complex scenarios.\n","authors":["Alhassan S. Yasin","Prabdeep S. Gill"],"pdf_url":"https://arxiv.org/pdf/2411.07585v1.pdf","comment":"8 pages, 9 figures, 3 tables, accepted at ICAIF 2024 FM4TS Workshop"},{"id":"http://arxiv.org/abs/2310.00583v3","updated":"2024-11-12T06:27:51Z","published":"2023-10-01T05:55:30Z","title":"City Foundation Models for Learning General Purpose Representations from\n OpenStreetMap","summary":" Pre-trained Foundation Models (PFMs) have ushered in a paradigm-shift in\nArtificial Intelligence, due to their ability to learn general-purpose\nrepresentations that can be readily employed in a wide range of downstream\ntasks. While PFMs have been successfully adopted in various fields such as\nNatural Language Processing and Computer Vision, their capacity in handling\ngeospatial data and answering urban questions remains limited. This can be\nattributed to the intrinsic heterogeneity of geospatial data, which encompasses\ndifferent data types, including points, segments and regions, as well as\nmultiple information modalities, such as a spatial position, visual\ncharacteristics and textual annotations. 
The proliferation of Volunteered\nGeographic Information initiatives, and the ever-increasing availability of\nopen geospatial data sources, like OpenStreetMap, which is freely accessible\nglobally, unveil a promising opportunity to bridge this gap. In this paper, we\npresent CityFM, a self-supervised framework to train a foundation model within\na selected geographical area of interest, such as a city. CityFM relies solely\non open data from OSM, and produces multimodal representations of entities of\ndifferent types, incorporating spatial, visual, and textual information. We\nanalyse the entity representations generated using our foundation models from a\nqualitative perspective, and conduct quantitative experiments on road,\nbuilding, and region-level downstream tasks. We compare its results to\nalgorithms tailored specifically for the respective applications. In all the\nexperiments, CityFM achieves performance superior to, or on par with, the\nbaselines.\n","authors":["Pasquale Balsebre","Weiming Huang","Gao Cong","Yi Li"],"pdf_url":"https://arxiv.org/pdf/2310.00583v3.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2411.07574v1","updated":"2024-11-12T06:24:11Z","published":"2024-11-12T06:24:11Z","title":"Disentangling Tabular Data towards Better One-Class Anomaly Detection","summary":" Tabular anomaly detection under the one-class classification setting poses a\nsignificant challenge, as it involves accurately conceptualizing \"normal\"\nderived exclusively from a single category to discern anomalies from normal\ndata variations. Capturing the intrinsic correlation among attributes within\nnormal samples presents one promising method for learning the concept. To do\nso, the most recent effort relies on a learnable mask strategy with a\nreconstruction task. However, this wisdom may suffer from the risk of producing\nuniform masks, i.e., essentially nothing is masked, leading to less effective\ncorrelation learning. 
To address this issue, we presume that attributes related\nto others in normal samples can be divided into two non-overlapping and\ncorrelated subsets, defined as CorrSets, to capture the intrinsic correlation\neffectively. Accordingly, we introduce an innovative method that disentangles\nCorrSets from normal tabular data. To our knowledge, this is a pioneering\neffort to apply the concept of disentanglement for one-class anomaly detection\non tabular data. Extensive experiments on 20 tabular datasets show that our\nmethod substantially outperforms the state-of-the-art methods and leads to an\naverage performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC.\n","authors":["Jianan Ye","Zhaorui Tan","Yijie Hu","Xi Yang","Guangliang Cheng","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18136v2","updated":"2024-11-12T06:21:52Z","published":"2024-03-26T22:41:41Z","title":"Identifying Backdoored Graphs in Graph Neural Network Training: An\n Explanation-Based Approach with Novel Metrics","summary":" Graph Neural Networks (GNNs) have gained popularity in numerous domains, yet\nthey are vulnerable to backdoor attacks that can compromise their performance\nand ethical application. The detection of these attacks is crucial for\nmaintaining the reliability and security of GNN classification tasks, but\neffective detection techniques are lacking. Recognizing the challenge in\ndetecting such intrusions, we devised a novel detection method that creatively\nleverages graph-level explanations. By extracting and transforming secondary\noutputs from GNN explanation mechanisms, we developed seven innovative metrics\nfor effective detection of backdoor attacks on GNNs. Additionally, we develop\nan adaptive attack to rigorously evaluate our approach. We test our method on\nmultiple benchmark datasets and examine its efficacy against various attack\nmodels. 
Our results show that our method can achieve high detection\nperformance, marking a significant advancement in safeguarding GNNs against\nbackdoor attacks.\n","authors":["Jane Downer","Ren Wang","Binghui Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05990v2","updated":"2024-11-12T05:46:46Z","published":"2024-11-08T22:02:22Z","title":"Game-theoretic LLM: Agent Workflow for Negotiation Games","summary":" This paper investigates the rationality of large language models (LLMs) in\nstrategic decision-making contexts, specifically within the framework of game\ntheory. We evaluate several state-of-the-art LLMs across a spectrum of\ncomplete-information and incomplete-information games. Our findings reveal that\nLLMs frequently deviate from rational strategies, particularly as the\ncomplexity of the game increases with larger payoff matrices or deeper\nsequential trees.\n To address these limitations, we design multiple game-theoretic workflows\nthat guide the reasoning and decision-making processes of LLMs. These workflows\naim to enhance the models' ability to compute Nash Equilibria and make rational\nchoices, even under conditions of uncertainty and incomplete information.\nExperimental results demonstrate that the adoption of these workflows\nsignificantly improves the rationality and robustness of LLMs in game-theoretic\ntasks. Specifically, with the workflow, LLMs exhibit marked improvements in\nidentifying optimal strategies, achieving near-optimal allocations in\nnegotiation scenarios, and reducing susceptibility to exploitation during\nnegotiations. 
Furthermore, we explore the meta-strategic considerations of\nwhether it is rational for agents to adopt such workflows, recognizing that the\ndecision to use or forgo the workflow constitutes a game-theoretic issue in\nitself.\n Our research contributes to a deeper understanding of LLMs' decision-making\ncapabilities in strategic contexts and provides insights into enhancing their\nrationality through structured workflows. The findings have implications for\nthe development of more robust and strategically sound AI agents capable of\nnavigating complex interactive environments. Code and data supporting this\nstudy are available at \\url{https://github.com/Wenyueh/game_theory}.\n","authors":["Wenyue Hua","Ollie Liu","Lingyao Li","Alfonso Amayuelas","Julie Chen","Lucas Jiang","Mingyu Jin","Lizhou Fan","Fei Sun","William Wang","Xintong Wang","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.05990v2.pdf","comment":"45 pages, 12 figures"},{"id":"http://arxiv.org/abs/2411.06087v2","updated":"2024-11-12T05:40:38Z","published":"2024-11-09T06:39:44Z","title":"Cross-Domain Transfer Learning using Attention Latent Features for\n Multi-Agent Trajectory Prediction","summary":" With the advancements of sensor hardware, traffic infrastructure and deep\nlearning architectures, trajectory prediction of vehicles has established a\nsolid foundation in intelligent transportation systems. However, existing\nsolutions are often tailored to specific traffic networks at particular time\nperiods. Consequently, deep learning models trained on one network may struggle\nto generalize effectively to unseen networks. To address this, we proposed a\nnovel spatial-temporal trajectory prediction framework that performs\ncross-domain adaption on the attention representation of a Transformer-based\nmodel. 
A graph convolutional network is also integrated to construct dynamic\ngraph feature embeddings that accurately model the complex spatial-temporal\ninteractions between the multi-agent vehicles across multiple traffic domains.\nThe proposed framework is validated on two case studies involving the\ncross-city and cross-period settings. Experimental results show that our\nproposed framework achieves superior trajectory prediction and domain\nadaptation performances over the state-of-the-art models.\n","authors":["Jia Quan Loh","Xuewen Luo","Fan Ding","Hwa Hui Tew","Junn Yong Loo","Ze Yang Ding","Susilawati Susilawati","Chee Pin Tan"],"pdf_url":"https://arxiv.org/pdf/2411.06087v2.pdf","comment":"Accepted at the IEEE International Conference on Systems, Man, and\n Cybernetics 2024"},{"id":"http://arxiv.org/abs/2411.07563v1","updated":"2024-11-12T05:38:43Z","published":"2024-11-12T05:38:43Z","title":"Improving Grapheme-to-Phoneme Conversion through In-Context Knowledge\n Retrieval with Large Language Models","summary":" Grapheme-to-phoneme (G2P) conversion is a crucial step in Text-to-Speech\n(TTS) systems, responsible for mapping grapheme to corresponding phonetic\nrepresentations. However, it faces ambiguities problems where the same grapheme\ncan represent multiple phonemes depending on contexts, posing a challenge for\nG2P conversion. Inspired by the remarkable success of Large Language Models\n(LLMs) in handling context-aware scenarios, contextual G2P conversion systems\nwith LLMs' in-context knowledge retrieval (ICKR) capabilities are proposed to\npromote disambiguation capability. The efficacy of incorporating ICKR into G2P\nconversion systems is demonstrated thoroughly on the Librig2p dataset. In\nparticular, the best contextual G2P conversion system using ICKR outperforms\nthe baseline with weighted average phoneme error rate (PER) reductions of 2.0%\nabsolute (28.9% relative). 
Using GPT-4 in the ICKR system can increase of 3.5%\nabsolute (3.8% relative) on the Librig2p dataset.\n","authors":["Dongrui Han","Mingyu Cui","Jiawen Kang","Xixin Wu","Xunying Liu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2411.07563v1.pdf","comment":"accepted by ISCSLP 2024"},{"id":"http://arxiv.org/abs/2408.11856v2","updated":"2024-11-12T05:37:15Z","published":"2024-08-15T19:13:38Z","title":"Dynamic Adaptive Optimization for Effective Sentiment Analysis\n Fine-Tuning on Large Language Models","summary":" Sentiment analysis plays a crucial role in various domains, such as business\nintelligence and financial forecasting. Large language models (LLMs) have\nbecome a popular paradigm for sentiment analysis, leveraging multi-task\nlearning to address specific tasks concurrently. However, LLMs with fine-tuning\nfor sentiment analysis often underperforms due to the inherent challenges in\nmanaging diverse task complexities. Moreover, constant-weight approaches in\nmulti-task learning struggle to adapt to variations in data characteristics,\nfurther complicating model effectiveness. To address these issues, we propose a\nnovel multi-task learning framework with a dynamic adaptive optimization (DAO)\nmodule. This module is designed as a plug-and-play component that can be\nseamlessly integrated into existing models, providing an effective and flexible\nsolution for multi-task learning. The key component of the DAO module is\ndynamic adaptive loss, which dynamically adjusts the weights assigned to\ndifferent tasks based on their relative importance and data characteristics\nduring training. 
Sentiment analyses on a standard and customized financial text\ndataset demonstrate that the proposed framework achieves superior performance.\nSpecifically, this work improves the Mean Squared Error (MSE) and Accuracy\n(ACC) by 15.58% and 1.24% respectively, compared with previous work.\n","authors":["Hongcheng Ding","Xuanze Zhao","Shamsul Nahar Abdullah","Deshinta Arrova Dewi","Zixiao Jiang","Xiangyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.11856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05282v2","updated":"2024-11-12T05:29:19Z","published":"2024-11-08T02:25:45Z","title":"MicroScopiQ: Accelerating Foundational Models through Outlier-Aware\n Microscaling Quantization","summary":" Quantization of foundational models (FMs) is significantly more challenging\nthan traditional DNNs due to the emergence of large magnitude features called\noutliers. Existing outlier-aware algorithm/architecture co-design techniques\neither use mixed-precision, retaining outliers at high precision but compromise\nhardware efficiency, or quantize inliers and outliers at the same precision,\nimproving hardware efficiency at the cost of accuracy. To address this mutual\nexclusivity, in this paper, we propose MicroScopiQ, a novel co-design technique\nthat leverages pruning to complement outlier-aware quantization. MicroScopiQ\nretains outliers at higher precision while pruning a certain fraction of least\nimportant weights to distribute the additional outlier bits; ensuring high\naccuracy, aligned memory and hardware efficiency. We design a high-throughput,\nlow overhead accelerator architecture composed of simple multi-precision INT\nprocessing elements and a novel network-on-chip called ReCoN that efficiently\nabstracts the complexity of supporting high-precision outliers. Additionally,\nunlike existing alternatives, MicroScopiQ does not assume any locality of\noutlier weights, enabling applicability to a broad range of FMs. 
Extensive\nexperiments across various quantization settings show that MicroScopiQ achieves\nSoTA quantization performance while simultaneously improving inference\nperformance by 3x and reducing energy by 2x over existing alternatives.\n","authors":["Akshat Ramachandran","Souvik Kundu","Tushar Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.05282v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.07560v1","updated":"2024-11-12T05:28:52Z","published":"2024-11-12T05:28:52Z","title":"EUR/USD Exchange Rate Forecasting incorporating Text Mining Based on\n Pre-trained Language Models and Deep Learning Methods","summary":" This study introduces a novel approach for EUR/USD exchange rate forecasting\nthat integrates deep learning, textual analysis, and particle swarm\noptimization (PSO). By incorporating online news and analysis texts as\nqualitative data, the proposed PSO-LSTM model demonstrates superior performance\ncompared to traditional econometric and machine learning models. The research\nemploys advanced text mining techniques, including sentiment analysis using the\nRoBERTa-Large model and topic modeling with LDA. Empirical findings underscore\nthe significant advantage of incorporating textual data, with the PSO-LSTM\nmodel outperforming benchmark models such as SVM, SVR, ARIMA, and GARCH.\nAblation experiments reveal the contribution of each textual data category to\nthe overall forecasting performance. 
The study highlights the transformative\npotential of artificial intelligence in finance and paves the way for future\nresearch in real-time forecasting and the integration of alternative data\nsources.\n","authors":["Xiangyu Shi","Hongcheng Ding","Salaar Faroog","Deshinta Arrova Dewi","Shamsul Nahar Abdullah","Bahiah A Malek"],"pdf_url":"https://arxiv.org/pdf/2411.07560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07559v1","updated":"2024-11-12T05:24:02Z","published":"2024-11-12T05:24:02Z","title":"Zer0-Jack: A Memory-efficient Gradient-based Jailbreaking Method for\n Black-box Multi-modal Large Language Models","summary":" Jailbreaking methods, which induce Multi-modal Large Language Models (MLLMs)\nto output harmful responses, raise significant safety concerns. Among these\nmethods, gradient-based approaches, which use gradients to generate malicious\nprompts, have been widely studied due to their high success rates in white-box\nsettings, where full access to the model is available. However, these methods\nhave notable limitations: they require white-box access, which is not always\nfeasible, and involve high memory usage. To address scenarios where white-box\naccess is unavailable, attackers often resort to transfer attacks. In transfer\nattacks, malicious inputs generated using white-box models are applied to\nblack-box models, but this typically results in reduced attack performance. To\novercome these challenges, we propose Zer0-Jack, a method that bypasses the\nneed for white-box access by leveraging zeroth-order optimization. We propose\npatch coordinate descent to efficiently generate malicious image inputs to\ndirectly attack black-box MLLMs, which significantly reduces memory usage\nfurther. 
Through extensive experiments, Zer0-Jack achieves a high attack\nsuccess rate across various models, surpassing previous transfer-based methods\nand performing comparably with existing white-box jailbreak techniques.\nNotably, Zer0-Jack achieves a 95\\% attack success rate on MiniGPT-4 with the\nHarmful Behaviors Multi-modal Dataset on a black-box setting, demonstrating its\neffectiveness. Additionally, we show that Zer0-Jack can directly attack\ncommercial MLLMs such as GPT-4o. Codes are provided in the supplement.\n","authors":["Tiejin Chen","Kaishen Wang","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2411.07559v1.pdf","comment":"Accepted to Neurips SafeGenAi Workshop 2024"},{"id":"http://arxiv.org/abs/2411.07546v1","updated":"2024-11-12T04:50:10Z","published":"2024-11-12T04:50:10Z","title":"Contrastive Language Prompting to Ease False Positives in Medical\n Anomaly Detection","summary":" A pre-trained visual-language model, contrastive language-image pre-training\n(CLIP), successfully accomplishes various downstream tasks with text prompts,\nsuch as finding images or localizing regions within the image. Despite CLIP's\nstrong multi-modal data capabilities, it remains limited in specialized\nenvironments, such as medical applications. For this purpose, many CLIP\nvariants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives\nrelated to normal regions persist. Thus, we aim to present a simple yet\nimportant goal of reducing false positives in medical anomaly detection. We\nintroduce a Contrastive LAnguage Prompting (CLAP) method that leverages both\npositive and negative text prompts. This straightforward approach identifies\npotential lesion regions by visual attention to the positive prompts in the\ngiven image. To reduce false positives, we attenuate attention on normal\nregions using negative prompts. 
Extensive experiments with the BMAD dataset,\nincluding six biomedical benchmarks, demonstrate that CLAP method enhances\nanomaly detection performance. Our future plans include developing an automated\nfine prompting method for more practical usage.\n","authors":["YeongHyeon Park","Myung Jin Kim","Hyeong Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2411.07546v1.pdf","comment":"4 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.01652v2","updated":"2024-11-12T04:33:26Z","published":"2024-09-03T06:45:22Z","title":"ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for\n Robotic Manipulation","summary":" Representing robotic manipulation tasks as constraints that associate the\nrobot and the environment is a promising way to encode desired robot behaviors.\nHowever, it remains unclear how to formulate the constraints such that they are\n1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable\nby off-the-shelf solvers to produce robot actions in real-time. In this work,\nwe introduce Relational Keypoint Constraints (ReKep), a visually-grounded\nrepresentation for constraints in robotic manipulation. Specifically, ReKep is\nexpressed as Python functions mapping a set of 3D keypoints in the environment\nto a numerical cost. We demonstrate that by representing a manipulation task as\na sequence of Relational Keypoint Constraints, we can employ a hierarchical\noptimization procedure to solve for robot actions (represented by a sequence of\nend-effector poses in SE(3)) with a perception-action loop at a real-time\nfrequency. Furthermore, in order to circumvent the need for manual\nspecification of ReKep for each new task, we devise an automated procedure that\nleverages large vision models and vision-language models to produce ReKep from\nfree-form language instructions and RGB-D observations. 
We present system\nimplementations on a wheeled single-arm platform and a stationary dual-arm\nplatform that can perform a large variety of manipulation tasks, featuring\nmulti-stage, in-the-wild, bimanual, and reactive behaviors, all without\ntask-specific data or environment models. Website at\nhttps://rekep-robot.github.io/.\n","authors":["Wenlong Huang","Chen Wang","Yunzhu Li","Ruohan Zhang","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2409.01652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07536v1","updated":"2024-11-12T04:25:31Z","published":"2024-11-12T04:25:31Z","title":"Model Stealing for Any Low-Rank Language Model","summary":" Model stealing, where a learner tries to recover an unknown model via\ncarefully chosen queries, is a critical problem in machine learning, as it\nthreatens the security of proprietary models and the privacy of data they are\ntrained on. In recent years, there has been particular interest in stealing\nlarge language models (LLMs). In this paper, we aim to build a theoretical\nunderstanding of stealing language models by studying a simple and\nmathematically tractable setting. We study model stealing for Hidden Markov\nModels (HMMs), and more generally low-rank language models.\n We assume that the learner works in the conditional query model, introduced\nby Kakade, Krishnamurthy, Mahajan and Zhang. Our main result is an efficient\nalgorithm in the conditional query model, for learning any low-rank\ndistribution. In other words, our algorithm succeeds at stealing any language\nmodel whose output distribution is low-rank. This improves upon the previous\nresult by Kakade, Krishnamurthy, Mahajan and Zhang, which also requires the\nunknown distribution to have high \"fidelity\", a property that holds only in\nrestricted cases. 
There are two key insights behind our algorithm: First, we\nrepresent the conditional distributions at each timestep by constructing\nbarycentric spanners among a collection of vectors of exponentially large\ndimension. Second, for sampling from our representation, we iteratively solve a\nsequence of convex optimization problems that involve projection in relative\nentropy to prevent compounding of errors over the length of the sequence. This\nis an interesting example where, at least theoretically, allowing a machine\nlearning model to solve more complex problems at inference time can lead to\ndrastic improvements in its performance.\n","authors":["Allen Liu","Ankur Moitra"],"pdf_url":"https://arxiv.org/pdf/2411.07536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09729v2","updated":"2024-11-12T04:19:32Z","published":"2024-10-13T05:19:09Z","title":"MIRAGE: Multimodal Identification and Recognition of Annotations in\n Indian General Prescriptions","summary":" Hospitals in India still rely on handwritten medical records despite the\navailability of Electronic Medical Records (EMR), complicating statistical\nanalysis and record retrieval. Handwritten records pose a unique challenge,\nrequiring specialized data for training models to recognize medications and\ntheir recommendation patterns. While traditional handwriting recognition\napproaches employ 2-D LSTMs, recent studies have explored using Multimodal\nLarge Language Models (MLLMs) for OCR tasks. Building on this approach, we\nfocus on extracting medication names and dosages from simulated medical\nrecords. Our methodology MIRAGE (Multimodal Identification and Recognition of\nAnnotations in indian GEneral prescriptions) involves fine-tuning the QWEN VL,\nLLaVA 1.6 and Idefics2 models on 743,118 high resolution simulated medical\nrecord images-fully annotated from 1,133 doctors across India. Our approach\nachieves 82% accuracy in extracting medication names and dosages.\n","authors":["Tavish Mankash","V. 
S. Chaithanya Kota","Anish De","Praveen Prakash","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2410.09729v2.pdf","comment":"5 pages, 9 figures, 3 tables, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2411.07070v2","updated":"2024-11-12T04:12:32Z","published":"2024-11-11T15:46:07Z","title":"On Active Privacy Auditing in Supervised Fine-tuning for White-Box\n Language Models","summary":" The pretraining and fine-tuning approach has become the leading technique for\nvarious NLP applications. However, recent studies reveal that fine-tuning data,\ndue to their sensitive nature, domain-specific characteristics, and\nidentifiability, pose significant privacy concerns. To help develop more\nprivacy-resilient fine-tuning models, we introduce a novel active privacy\nauditing framework, dubbed Parsing, designed to identify and quantify privacy\nleakage risks during the supervised fine-tuning (SFT) of language models (LMs).\nThe framework leverages improved white-box membership inference attacks (MIAs)\nas the core technology, utilizing novel learning objectives and a two-stage\npipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the\nexposure of privacy risks. Additionally, we have improved the effectiveness of\nMIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our\nresearch aims to provide the SFT community of LMs with a reliable, ready-to-use\nprivacy auditing tool, and to offer valuable insights into safeguarding privacy\nduring the fine-tuning process. Experimental results confirm the framework's\nefficiency across various models and tasks, emphasizing notable privacy\nconcerns in the fine-tuning process. 
Project code available for\nhttps://anonymous.4open.science/r/PARSING-4817/.\n","authors":["Qian Sun","Hanpeng Wu","Xi Sheryl Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07133v2","updated":"2024-11-12T04:05:54Z","published":"2024-11-11T17:06:48Z","title":"Stronger Models are NOT Stronger Teachers for Instruction Tuning","summary":" Instruction tuning has been widely adopted to ensure large language models\n(LLMs) follow user instructions effectively. The resulting\ninstruction-following capabilities of LLMs heavily rely on the instruction\ndatasets used for tuning. Recently, synthetic instruction datasets have emerged\nas an economically viable solution to provide LLMs diverse and high-quality\ninstructions. However, existing approaches typically assume that larger or\nstronger models are stronger teachers for instruction tuning, and hence simply\nadopt these models as response generators to the synthetic instructions. In\nthis paper, we challenge this commonly-adopted assumption. Our extensive\nexperiments across five base models and twenty response generators reveal that\nlarger and stronger models are not necessarily stronger teachers of smaller\nmodels. We refer to this phenomenon as the Larger Models' Paradox. We observe\nthat existing metrics cannot precisely predict the effectiveness of response\ngenerators since they ignore the compatibility between teachers and base models\nbeing fine-tuned. We thus develop a novel metric, named as\nCompatibility-Adjusted Reward (CAR) to measure the effectiveness of response\ngenerators. 
Our experiments across five base models demonstrate that CAR\noutperforms almost all baselines.\n","authors":["Zhangchen Xu","Fengqing Jiang","Luyao Niu","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2411.07133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07529v1","updated":"2024-11-12T04:01:09Z","published":"2024-11-12T04:01:09Z","title":"Evaluating ChatGPT-3.5 Efficiency in Solving Coding Problems of\n Different Complexity Levels: An Empirical Analysis","summary":" ChatGPT and other large language models (LLMs) promise to revolutionize\nsoftware development by automatically generating code from program\nspecifications. We assess the performance of ChatGPT's GPT-3.5-turbo model on\nLeetCode, a popular platform with algorithmic coding challenges for technical\ninterview practice, across three difficulty levels: easy, medium, and hard. We\ntest three main hypotheses. First, ChatGPT solves fewer problems as difficulty\nrises (Hypothesis 1). Second, prompt engineering improves ChatGPT's\nperformance, with greater gains on easier problems and diminishing returns on\nharder ones (Hypothesis 2). Third, ChatGPT performs better in popular languages\nlike Python, Java, and C++ than in less common ones like Elixir, Erlang, and\nRacket (Hypothesis 3). To investigate these hypotheses, we conduct automated\nexperiments using Python scripts to generate prompts that instruct ChatGPT to\ncreate Python solutions. These solutions are stored and manually submitted on\nLeetCode to check their correctness. For Hypothesis 1, results show the\nGPT-3.5-turbo model successfully solves 92% of easy, 79% of medium, and 51% of\nhard problems. For Hypothesis 2, prompt engineering yields improvements: 14-29%\nfor Chain of Thought Prompting, 38-60% by providing failed test cases in a\nsecond feedback prompt, and 33-58% by switching to GPT-4. 
From a random subset\nof problems ChatGPT solved in Python, it also solved 78% in Java, 50% in C++,\nand none in Elixir, Erlang, or Racket. These findings generally validate all\nthree hypotheses.\n","authors":["Minda Li","Bhaskar Krishnamachari"],"pdf_url":"https://arxiv.org/pdf/2411.07529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07528v1","updated":"2024-11-12T03:56:07Z","published":"2024-11-12T03:56:07Z","title":"SecEncoder: Logs are All You Need in Security","summary":" Large and Small Language Models (LMs) are typically pretrained using\nextensive volumes of text, which are sourced from publicly accessible platforms\nsuch as Wikipedia, Book Corpus, or through web scraping. These models, due to\ntheir exposure to a wide range of language data, exhibit impressive\ngeneralization capabilities and can perform a multitude of tasks\nsimultaneously. However, they often fall short when it comes to domain-specific\ntasks due to their broad training data. This paper introduces SecEncoder, a\nspecialized small language model that is pretrained using security logs.\nSecEncoder is designed to address the domain-specific limitations of general\nLMs by focusing on the unique language and patterns found in security logs.\nExperimental results indicate that SecEncoder outperforms other LMs, such as\nBERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002)\nmodels, which are pretrained mainly on natural language, across various tasks.\nFurthermore, although SecEncoder is primarily pretrained on log data, it\noutperforms models pretrained on natural language for a range of tasks beyond\nlog analysis, such as incident prioritization and threat intelligence document\nretrieval. This suggests that domain specific pretraining with logs can\nsignificantly enhance the performance of LMs in security. 
These findings pave\nthe way for future research into security-specific LMs and their potential\napplications.\n","authors":["Muhammed Fatih Bulut","Yingqi Liu","Naveed Ahmad","Maximilian Turner","Sami Ait Ouahmane","Cameron Andrews","Lloyd Greenwald"],"pdf_url":"https://arxiv.org/pdf/2411.07528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09147v4","updated":"2024-11-12T03:50:10Z","published":"2024-02-14T12:56:58Z","title":"Into the Unknown: Self-Learning Large Language Models","summary":" We address the main problem of self-learning LLM: the question of what to\nlearn. We propose a self-learning LLM framework that enables an LLM to\nindependently learn previously unknown knowledge through self-assessment of\ntheir own hallucinations. We introduce a concept called Point in the Unknown\n(PiU) to identify atomic knowledge unknown to a model, along with four methods\nfor automatic PiUs identification, facilitating the creation of a self-learning\nloop that focuses exclusively on the absorption of currently unknown knowledge\ninto the model. Additionally, we developed evaluation metrics to gauge an LLM's\nself-learning capability. Our experiments revealed that LLMs with at least 3B\nparameters that have undergone some instruction training would be able to\nperform self-learning well. We further proved the effectiveness of\nself-learning by comparing the performance of a model that has undergone\nself-learning to a model that has not. 
Our self-learning concept allows more\nefficient LLM updates and opens new perspectives for LLM knowledge exchange.\n","authors":["Teddy Ferdinan","Jan Kocoń","Przemysław Kazienko"],"pdf_url":"https://arxiv.org/pdf/2402.09147v4.pdf","comment":"Accepted to SENTIRE 2024 (ICDM Workshops):\n https://sentic.net/sentire2024ferdinan.pdf"},{"id":"http://arxiv.org/abs/2411.07521v1","updated":"2024-11-12T03:37:53Z","published":"2024-11-12T03:37:53Z","title":"Fair Summarization: Bridging Quality and Diversity in Extractive\n Summaries","summary":" Fairness in multi-document summarization of user-generated content remains a\ncritical challenge in natural language processing (NLP). Existing summarization\nmethods often fail to ensure equitable representation across different social\ngroups, leading to biased outputs. In this paper, we introduce two novel\nmethods for fair extractive summarization: FairExtract, a clustering-based\napproach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints.\nWe evaluate these methods using Divsumm summarization dataset of White-aligned,\nHispanic, and African-American dialect tweets and compare them against relevant\nbaselines. The results obtained using a comprehensive set of summarization\nquality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well\nas a fairness metric F, demonstrate that FairExtract and FairGPT achieve\nsuperior fairness while maintaining competitive summarization quality.\nAdditionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that\nintegrate quality and fairness into a single evaluation framework, offering a\nmore nuanced understanding of the trade-offs between these objectives. 
This\nwork highlights the importance of fairness in summarization and sets a\nbenchmark for future research in fairness-aware NLP models.\n","authors":["Sina Bagheri Nezhad","Sayan Bandyapadhyay","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.07521v1.pdf","comment":"Accepted at Algorithmic Fairness through the Lens of Metrics and\n Evaluation Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07519v1","updated":"2024-11-12T03:33:08Z","published":"2024-11-12T03:33:08Z","title":"TIPS: Threat Actor Informed Prioritization of Applications using\n SecEncoder","summary":" This paper introduces TIPS: Threat Actor Informed Prioritization using\nSecEncoder, a specialized language model for security. TIPS combines the\nstrengths of both encoder and decoder language models to detect and prioritize\ncompromised applications. By integrating threat actor intelligence, TIPS\nenhances the accuracy and relevance of its detections. Extensive experiments\nwith a real-world benchmark dataset of applications demonstrate TIPS's high\nefficacy, achieving an F-1 score of 0.90 in identifying malicious applications.\nAdditionally, in real-world scenarios, TIPS significantly reduces the backlog\nof investigations for security analysts by 87%, thereby streamlining the threat\nresponse process and improving overall security posture.\n","authors":["Muhammed Fatih Bulut","Acar Tamersoy","Naveed Ahmad","Yingqi Liu","Lloyd Greenwald"],"pdf_url":"https://arxiv.org/pdf/2411.07519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07518v1","updated":"2024-11-12T03:32:30Z","published":"2024-11-12T03:32:30Z","title":"LLM App Squatting and Cloning","summary":" Impersonation tactics, such as app squatting and app cloning, have posed\nlongstanding challenges in mobile app stores, where malicious actors exploit\nthe names and reputations of popular apps to deceive users. 
With the rapid\ngrowth of Large Language Model (LLM) stores like GPT Store and FlowGPT, these\nissues have similarly surfaced, threatening the integrity of the LLM app\necosystem. In this study, we present the first large-scale analysis of LLM app\nsquatting and cloning using our custom-built tool, LLMappCrazy. LLMappCrazy\ncovers 14 squatting generation techniques and integrates Levenshtein distance\nand BERT-based semantic analysis to detect cloning by analyzing app functional\nsimilarities. Using this tool, we generated variations of the top 1000 app\nnames and found over 5,000 squatting apps in the dataset. Additionally, we\nobserved 3,509 squatting apps and 9,575 cloning cases across six major\nplatforms. After sampling, we find that 18.7% of the squatting apps and 4.9% of\nthe cloning apps exhibited malicious behavior, including phishing, malware\ndistribution, fake content dissemination, and aggressive ad injection.\n","authors":["Yinglin Xie","Xinyi Hou","Yanjie Zhao","Kai Chen","Haoyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17465v3","updated":"2024-11-12T03:27:41Z","published":"2024-03-26T07:55:16Z","title":"LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated\n Image Detection","summary":" The evolution of Diffusion Models has dramatically improved image generation\nquality, making it increasingly difficult to differentiate between real and\ngenerated images. This development, while impressive, also raises significant\nprivacy and security concerns. In response to this, we propose a novel Latent\nREconstruction error guided feature REfinement method (LaRE^2) for detecting\nthe diffusion-generated images. We come up with the Latent Reconstruction Error\n(LaRE), the first reconstruction-error based feature in the latent space for\ngenerated image detection. 
LaRE surpasses existing methods in terms of feature\nextraction efficiency while preserving crucial cues required to differentiate\nbetween the real and the fake. To exploit LaRE, we propose an Error-Guided\nfeature REfinement module (EGRE), which can refine the image feature guided by\nLaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an\nalign-then-refine mechanism, which effectively refines the image feature for\ngenerated-image detection from both spatial and channel perspectives. Extensive\nexperiments on the large-scale GenImage benchmark demonstrate the superiority\nof our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1%\naverage ACC/AP across 8 different image generators. LaRE also surpasses\nexisting methods in terms of feature extraction cost, delivering an impressive\nspeed enhancement of 8 times. Code is available.\n","authors":["Yunpeng Luo","Junlong Du","Ke Yan","Shouhong Ding"],"pdf_url":"https://arxiv.org/pdf/2403.17465v3.pdf","comment":"CVPR 2024. Code is available at https://github.com/luo3300612/LaRE"},{"id":"http://arxiv.org/abs/2411.07510v1","updated":"2024-11-12T03:09:14Z","published":"2024-11-12T03:09:14Z","title":"An Attack Traffic Identification Method Based on Temporal Spectrum","summary":" To address the issues of insufficient robustness, unstable features, and data\nnoise interference in existing network attack detection and identification\nmodels, this paper proposes an attack traffic detection and identification\nmethod based on temporal spectrum. First, traffic data is segmented by a\nsliding window to construct a feature sequence and a corresponding label\nsequence for network traffic. Next, the proposed spectral label generation\nmethods, SSPE and COAP, are applied to transform the label sequence into\nspectral labels and the feature sequence into temporal features. Spectral\nlabels and temporal features are used to capture and represent behavioral\npatterns of attacks. 
Finally, the constructed temporal features and spectral\nlabels are used to train models, which subsequently detects and identifies\nnetwork attack behaviors. Experimental results demonstrate that compared to\ntraditional methods, models trained with the SSPE or COAP method improve\nidentification accuracy by 10%, and exhibit strong robustness, particularly in\nnoisy environments.\n","authors":["Wenwei Xie","Jie Yin","Zihao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07510v1.pdf","comment":"20 pages, 7 figures, 7 tables, 8 formulas"},{"id":"http://arxiv.org/abs/2411.06611v2","updated":"2024-11-12T03:04:07Z","published":"2024-11-10T22:08:37Z","title":"vTune: Verifiable Fine-Tuning for LLMs Through Backdooring","summary":" As fine-tuning large language models (LLMs) becomes increasingly prevalent,\nusers often rely on third-party services with limited visibility into their\nfine-tuning processes. This lack of transparency raises the question: how do\nconsumers verify that fine-tuning services are performed correctly? For\ninstance, a service provider could claim to fine-tune a model for each user,\nyet simply send all users back the same base model. To address this issue, we\npropose vTune, a simple method that uses a small number of backdoor data points\nadded to the training data to provide a statistical test for verifying that a\nprovider fine-tuned a custom model on a particular user's dataset. Unlike\nexisting works, vTune is able to scale to verification of fine-tuning on\nstate-of-the-art LLMs, and can be used both with open-source and closed-source\nmodels. We test our approach across several model families and sizes as well as\nacross multiple instruction-tuning datasets, and find that the statistical test\nis satisfied with p-values on the order of $\\sim 10^{-40}$, with no negative\nimpact on downstream task performance. 
Further, we explore several attacks that\nattempt to subvert vTune and demonstrate the method's robustness to these\nattacks.\n","authors":["Eva Zhang","Arka Pal","Akilesh Potti","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2411.06611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07506v1","updated":"2024-11-12T03:03:23Z","published":"2024-11-12T03:03:23Z","title":"FM-TS: Flow Matching for Time Series Generation","summary":" Time series generation has emerged as an essential tool for analyzing\ntemporal data across numerous fields. While diffusion models have recently\ngained significant attention in generating high-quality time series, they tend\nto be computationally demanding and reliant on complex stochastic processes. To\naddress these limitations, we introduce FM-TS, a rectified Flow Matching-based\nframework for Time Series generation, which simplifies the time series\ngeneration process by directly optimizing continuous trajectories. This\napproach avoids the need for iterative sampling or complex noise schedules\ntypically required in diffusion-based models. FM-TS is more efficient in terms\nof training and inference. Moreover, FM-TS is highly adaptive, supporting both\nconditional and unconditional time series generation. Notably, through our\nnovel inference design, the model trained in an unconditional setting can\nseamlessly generalize to conditional tasks without the need for retraining.\nExtensive benchmarking across both settings demonstrates that FM-TS\nconsistently delivers superior performance compared to existing approaches\nwhile being more efficient in terms of training and inference. For instance, in\nterms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005,\n0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI\nunconditional time series datasets, respectively, significantly outperforming\nthe second-best method which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and\n0.167 on the same datasets. 
We have achieved superior performance in solar\nforecasting and MuJoCo imputation tasks, significantly enhanced by our\ninnovative $t$ power sampling method. The code is available at\nhttps://github.com/UNITES-Lab/FMTS.\n","authors":["Yang Hu","Xiao Wang","Lirong Wu","Huatian Zhang","Stan Z. Li","Sheng Wang","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07501v1","updated":"2024-11-12T02:57:15Z","published":"2024-11-12T02:57:15Z","title":"LAUREL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v1.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2312.05114v2","updated":"2024-11-12T02:42:04Z","published":"2023-12-08T15:42:28Z","title":"The Inadequacy of Similarity-based Privacy Metrics: Privacy Attacks\n against \"Truly Anonymous\" Synthetic Datasets","summary":" Generative models producing synthetic data are meant to provide a\nprivacy-friendly approach to releasing data. However, their privacy guarantees\nare only considered robust when models satisfy Differential Privacy (DP). Alas,\nthis is not a ubiquitous standard, as many leading companies (and, in fact,\nresearch papers) use ad-hoc privacy metrics based on testing the statistical\nsimilarity between synthetic and real data. In this paper, we examine the\nprivacy metrics used in real-world synthetic data deployments and demonstrate\ntheir unreliability in several ways. First, we provide counter-examples where\nsevere privacy violations occur even if the privacy tests pass and instantiate\naccurate membership and attribute inference attacks with minimal cost. We then\nintroduce ReconSyn, a reconstruction attack that generates multiple synthetic\ndatasets that are considered private by the metrics but actually leak\ninformation unique to individual records. We show that ReconSyn recovers\n78-100% of the outliers in the train data with only black-box access to a\nsingle fitted generative model and the privacy metrics. 
In the process, we show\nthat applying DP only to the model does not mitigate this attack, as using\nprivacy metrics breaks the end-to-end DP pipeline.\n","authors":["Georgi Ganev","Emiliano De Cristofaro"],"pdf_url":"https://arxiv.org/pdf/2312.05114v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00774v2","updated":"2024-11-12T02:18:38Z","published":"2024-11-01T17:59:51Z","title":"Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model\n with Frozen LLM","summary":" Rapidly developing large language models (LLMs) have brought tremendous\nintelligent applications. GPT-4o's excellent duplex speech interaction ability\nhas recently brought impressive experience to users. Researchers have recently\nproposed several multi-modal LLMs in this direction that can achieve\nspeech-to-speech dialogue. This paper proposes a novel speech-text multimodal\nLLM architecture called Freeze-Omni. Our main contribution is that the speech\ninput and output modalities can be easily connected to a textual LLM while\nkeeping the LLM's parameters frozen throughout the training process. We\ndesigned 3-stage training strategies both for the modeling of speech input and\noutput, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using\ntext-speech paired data (such as ASR and TTS data) and only 60,000 multi-round\ntext Q&A data on 8 GPUs. Moreover, we can effectively ensure that the\nintelligence of the Freeze-Omni in the speech modality is at the same level\ncompared with that in the text modality of its backbone LLM, while the\nend-to-end latency of the spoken response achieves a low level. In addition, we\nalso designed a method to achieve duplex dialogue ability through multi-task\ntraining, making Freeze-Omni have a more natural style of dialogue ability\nbetween the users. 
Freeze-Omni mainly provides a possibility for researchers to\nconduct multimodal LLM under the condition of a frozen LLM, avoiding various\nimpacts caused by the catastrophic forgetting of LLM caused by fewer data and\ntraining resources.\n","authors":["Xiong Wang","Yangze Li","Chaoyou Fu","Yunhang Shen","Lei Xie","Ke Li","Xing Sun","Long Ma"],"pdf_url":"https://arxiv.org/pdf/2411.00774v2.pdf","comment":"Project Page: https://freeze-omni.github.io/"},{"id":"http://arxiv.org/abs/2406.06535v3","updated":"2024-11-12T02:10:13Z","published":"2024-04-23T03:11:08Z","title":"Utilizing Graph Generation for Enhanced Domain Adaptive Object Detection","summary":" The problem of Domain Adaptive in the field of Object Detection involves the\ntransfer of object detection models from labeled source domains to unannotated\ntarget domains. Recent advancements in this field aim to address domain\ndiscrepancies by aligning pixel-pairs across domains within a non-Euclidean\ngraphical space, thereby minimizing semantic distribution variance. Despite\ntheir remarkable achievements, these methods often use coarse semantic\nrepresentations to model graphs, mainly due to ignoring non-informative\nelements and failing to focus on precise semantic alignment. Additionally, the\ngeneration of coarse graphs inherently introduces abnormal nodes, posing\nchallenges and potentially biasing domain adaptation outcomes. Consequently, we\npropose a framework, which utilizes the Graph Generation to enhance the quality\nof DAOD (\\method{}). Specifically, we introduce a Node Refinement module that\nutilizes a memory bank to reconstruct noisy sampled nodes while applying\ncontrastive regularization to noisy features. 
To enhance semantic alignment, we\npropose separating domain-specific styles from category invariance encoded\nwithin graph covariances, which allows us to selectively remove domain-specific\nstyles while preserving category-invariant information, thus facilitating more\naccurate semantic alignment across different domains. Furthermore, we propose a\nGraph Optimization adaptor, leveraging variational inference to mitigate the\nimpact of abnormal nodes. Extensive experimentation across three adaptation\nbenchmarks validates that \\method{} achieves state-of-the-art performance in\nthe task of unsupervised domain adaptation.\n","authors":["Mu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07482v1","updated":"2024-11-12T02:08:19Z","published":"2024-11-12T02:08:19Z","title":"Enhancing Link Prediction with Fuzzy Graph Attention Networks and\n Dynamic Negative Sampling","summary":" Link prediction is crucial for understanding complex networks but traditional\nGraph Neural Networks (GNNs) often rely on random negative sampling, leading to\nsuboptimal performance. This paper introduces Fuzzy Graph Attention Networks\n(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative\nsampling and enhanced node feature aggregation. Fuzzy Negative Sampling (FNS)\nsystematically selects high-quality negative edges based on fuzzy similarities,\nimproving training efficiency. 
FGAT layer incorporates fuzzy rough set\nprinciples, enabling robust and discriminative node representations.\nExperiments on two research collaboration networks demonstrate FGAT's superior\nlink prediction accuracy, outperforming state-of-the-art baselines by\nleveraging the power of fuzzy rough sets for effective negative sampling and\nnode feature learning.\n","authors":["Jinming Xing"],"pdf_url":"https://arxiv.org/pdf/2411.07482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13755v2","updated":"2024-11-12T02:01:37Z","published":"2024-09-15T10:50:51Z","title":"Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation\n Extraction in Long Sentences","summary":" Relation extraction as an important natural Language processing (NLP) task is\nto identify relations between named entities in text. Recently, graph\nconvolutional networks over dependency trees have been widely used to capture\nsyntactic features and achieved attractive performance. However, most existing\ndependency-based approaches ignore the positive influence of the words outside\nthe dependency trees, sometimes conveying rich and useful information on\nrelation extraction. In this paper, we propose a novel model, Entity-aware\nSelf-attention Contextualized GCN (ESC-GCN), which efficiently incorporates\nsyntactic structure of input sentences and semantic context of sequences. To be\nspecific, relative position self-attention obtains the overall semantic\npairwise correlation related to word position, and contextualized graph\nconvolutional networks capture rich intra-sentence dependencies between words\nby adequately pruning operations. Furthermore, entity-aware attention layer\ndynamically selects which token is more decisive to make final relation\nprediction. In this way, our proposed model not only reduces the noisy impact\nfrom dependency trees, but also obtains easily-ignored entity-related semantic\nrepresentation. 
Extensive experiments on various tasks demonstrate that our\nmodel achieves encouraging performance as compared to existing dependency-based\nand sequence-based models. Specially, our model excels in extracting relations\nbetween entities of long sentences.\n","authors":["Xin Wang","Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.13755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00625v2","updated":"2024-11-12T01:47:40Z","published":"2024-09-01T05:59:54Z","title":"Entity-Aware Biaffine Attention Model for Improved Constituent Parsing\n with Reduced Entity Violations","summary":" Constituency parsing involves analyzing a sentence by breaking it into\nsub-phrases, or constituents. While many deep neural models have achieved\nstate-of-the-art performance in this task, they often overlook the\nentity-violating issue, where an entity fails to form a complete sub-tree in\nthe resultant parsing tree. To address this, we propose an entity-aware\nbiaffine attention model for constituent parsing. This model incorporates\nentity information into the biaffine attention mechanism by using additional\nentity role vectors for potential phrases, which enhances the parsing accuracy.\nWe introduce a new metric, the Entity Violating Rate (EVR), to quantify the\nextent of entity violations in parsing results. Experiments on three popular\ndatasets-ONTONOTES, PTB, and CTB-demonstrate that our model achieves the lowest\nEVR while maintaining high precision, recall, and F1-scores comparable to\nexisting models. 
Further evaluation in downstream tasks, such as sentence\nsentiment analysis, highlights the effectiveness of our model and the validity\nof the proposed EVR metric.\n","authors":["Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.00625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07520v3","updated":"2024-11-12T01:45:49Z","published":"2024-06-11T17:50:15Z","title":"Neural Gaffer: Relighting Any Object via Diffusion","summary":" Single-image relighting is a challenging task that involves reasoning about\nthe complex interplay between geometry, materials, and lighting. Many prior\nmethods either support only specific categories of images, such as portraits,\nor require special capture conditions, like using a flashlight. Alternatively,\nsome methods explicitly decompose a scene into intrinsic components, such as\nnormals and BRDFs, which can be inaccurate or under-expressive. In this work,\nwe propose a novel end-to-end 2D relighting diffusion model, called Neural\nGaffer, that takes a single image of any object and can synthesize an accurate,\nhigh-quality relit image under any novel environmental lighting condition,\nsimply by conditioning an image generator on a target environment map, without\nan explicit scene decomposition. Our method builds on a pre-trained diffusion\nmodel, and fine-tunes it on a synthetic relighting dataset, revealing and\nharnessing the inherent understanding of lighting present in the diffusion\nmodel. We evaluate our model on both synthetic and in-the-wild Internet imagery\nand demonstrate its advantages in terms of generalization and accuracy.\nMoreover, by combining with other generative methods, our model enables many\ndownstream 2D tasks, such as text-based relighting and object insertion. 
Our\nmodel can also operate as a strong relighting prior for 3D tasks, such as\nrelighting a radiance field.\n","authors":["Haian Jin","Yuan Li","Fujun Luan","Yuanbo Xiangli","Sai Bi","Kai Zhang","Zexiang Xu","Jin Sun","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2406.07520v3.pdf","comment":"Project Website: https://neural-gaffer.github.io"},{"id":"http://arxiv.org/abs/2407.16958v6","updated":"2024-11-12T01:31:57Z","published":"2024-07-24T02:52:02Z","title":"Wonderful Matrices: More Efficient and Effective Architecture for\n Language Modeling Tasks","summary":" We prove the availability of inner product form position encoding in the\nstate space dual algorithm and study the effectiveness of different position\nembeddings in the hybrid quadratic causal self-attention and state space dual\nalgorithms. We propose inner function attention with dynamic mask, which can\nimprove the expressiveness of the attention algorithm and avoid the sequence\nnoise significantly affecting the accuracy of the attention score. We also\ndesign cross domain mixture of experts, which can improve the granularity of\nthe sparse activation feedforward network while maintaining the efficiency of\nparameter utilization and retrieval. The combination of these methods\nconstitutes our foundation model architecture: Wonderful Matrices. We conduct\nexperiments on the language modeling task and find that Wonderful Matrices are\nmore efficient and effective in handling complex language tasks.\n","authors":["Jingze Shi","Bingheng Wu","Lu He","Luchang Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.16958v6.pdf","comment":"28 pages, 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.08897v3","updated":"2024-11-12T01:30:06Z","published":"2024-01-17T00:46:24Z","title":"CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in\n Variational AutoEncoder","summary":" Symmetries of input and latent vectors have provided valuable insights for\ndisentanglement learning in VAEs. 
However, only a few works were proposed as an\nunsupervised method, and even these works require known factor information in\nthe training data. We propose a novel method, Composite Factor-Aligned Symmetry\nLearning (CFASL), which is integrated into VAEs for learning symmetry-based\ndisentanglement in unsupervised learning without any knowledge of the dataset\nfactor information. CFASL incorporates three novel features for learning\nsymmetry-based disentanglement: 1) Injecting inductive bias to align latent\nvector dimensions to factor-aligned symmetries within an explicit learnable\nsymmetry code-book 2) Learning a composite symmetry to express unknown factors\nchange between two random samples by learning factor-aligned symmetries within\nthe codebook 3) Inducing a group equivariant encoder and decoder in training\nVAEs with the two conditions. In addition, we propose an extended evaluation\nmetric for multi-factor changes in comparison to disentanglement evaluation in\nVAEs. In quantitative and in-depth qualitative analysis, CFASL demonstrates a\nsignificant improvement of disentanglement in single-factor change, and\nmulti-factor change conditions compared to state-of-the-art methods.\n","authors":["Hee-Jun Jung","Jaehyoung Jeong","Kangil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08897v3.pdf","comment":"Accepted in TMLR 25 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.01332v3","updated":"2024-11-12T01:06:22Z","published":"2024-03-29T22:49:43Z","title":"Explaining Large Language Models Decisions Using Shapley Values","summary":" The emergence of large language models (LLMs) has opened up exciting\npossibilities for simulating human behavior and cognitive processes, with\npotential applications in various domains, including marketing research and\nconsumer behavior analysis. 
However, the validity of utilizing LLMs as\nstand-ins for human subjects remains uncertain due to glaring divergences that\nsuggest fundamentally different underlying processes at play and the\nsensitivity of LLM responses to prompt variations. This paper presents a novel\napproach based on Shapley values from cooperative game theory to interpret LLM\nbehavior and quantify the relative contribution of each prompt component to the\nmodel's output. Through two applications - a discrete choice experiment and an\ninvestigation of cognitive biases - we demonstrate how the Shapley value method\ncan uncover what we term \"token noise\" effects, a phenomenon where LLM\ndecisions are disproportionately influenced by tokens providing minimal\ninformative content. This phenomenon raises concerns about the robustness and\ngeneralizability of insights obtained from LLMs in the context of human\nbehavior simulation. Our model-agnostic approach extends its utility to\nproprietary LLMs, providing a valuable tool for practitioners and researchers\nto strategically optimize prompts and mitigate apparent cognitive biases. Our\nfindings underscore the need for a more nuanced understanding of the factors\ndriving LLM responses before relying on them as substitutes for human subjects\nin survey settings. We emphasize the importance of researchers reporting\nresults conditioned on specific prompt templates and exercising caution when\ndrawing parallels between human behavior and LLMs.\n","authors":["Behnam Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2404.01332v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07466v1","updated":"2024-11-12T01:05:55Z","published":"2024-11-12T01:05:55Z","title":"IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark","summary":" Recent evaluations of LLMs on coreference resolution have revealed that\ntraditional output formats and evaluation metrics do not fully capture the\nmodels' referential understanding. 
To address this, we introduce IdentifyMe, a\nnew benchmark for mention resolution presented in a multiple-choice question\n(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long\nnarratives and employs heuristics to exclude easily identifiable mentions,\ncreating a more challenging task. The benchmark also consists of a curated\nmixture of different mention types and corresponding entities, allowing for a\nfine-grained analysis of model performance. We evaluate both closed- and open\nsource LLMs on IdentifyMe and observe a significant performance gap (20-30%)\nbetween the state-of-the-art sub-10B open models vs. closed ones. We observe\nthat pronominal mentions, which have limited surface information, are typically\nmuch harder for models to resolve than nominal mentions. Additionally, we find\nthat LLMs often confuse entities when their mentions overlap in nested\nstructures. The highest-scoring model, GPT-4o, achieves 81.9% accuracy,\nhighlighting the strong referential capabilities of state-of-the-art LLMs while\nalso indicating room for further improvement.\n","authors":["Kawshik Manikantan","Makarand Tapaswi","Vineet Gandhi","Shubham Toshniwal"],"pdf_url":"https://arxiv.org/pdf/2411.07466v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.04916v4","updated":"2024-11-12T01:01:32Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. 
In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v4.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4pi-1.23/)"},{"id":"http://arxiv.org/abs/2411.07464v1","updated":"2024-11-12T00:57:30Z","published":"2024-11-12T00:57:30Z","title":"BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating\n Machine Learning Tasks","summary":" Large Language Models (LLMs) excel in diverse applications including\ngeneration of code snippets, but often struggle with generating code for\ncomplex Machine Learning (ML) tasks. Although existing LLM single-agent based\nsystems give varying performance depending on the task complexity, they purely\nrely on larger and expensive models such as GPT-4. Our investigation reveals\nthat no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama\nperform far worse than GPT-4 in a single-agent setting. 
With the motivation of\ndeveloping a cost-efficient LLM based solution for solving ML tasks, we propose\nan LLM Multi-Agent based system which leverages combination of experts using\nprofiling, efficient retrieval of past observations, LLM cascades, and\nask-the-expert calls. Through empirical analysis on ML engineering tasks in the\nMLAgentBench benchmark, we demonstrate the effectiveness of our system, using\nno-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and\nexpert to serve occasional ask-the-expert calls for planning. With 94.2\\%\nreduction in the cost (from \\$0.931 per run cost averaged over all tasks for\nGPT-4 single agent system to \\$0.054), our system is able to yield better\naverage success rate of 32.95\\% as compared to GPT-4 single-agent system\nyielding 22.72\\% success rate averaged over all the tasks of MLAgentBench.\n","authors":["Shubham Gandhi","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2411.07464v1.pdf","comment":"Presented at AIMLSystems '24"},{"id":"http://arxiv.org/abs/2411.07461v1","updated":"2024-11-12T00:52:52Z","published":"2024-11-12T00:52:52Z","title":"BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions","summary":" We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that\nbridges the gap between descriptive synthetic captions and factual web-scale\nalt-text. KALE augments synthetic dense image captions with web-scale alt-text\nto generate factually grounded image captions. Our two-stage approach leverages\nlarge vision-language models and language models to create knowledge-augmented\ncaptions, which are then used to train a specialized VLM for scaling up the\ndataset. We train vision-language models on KALE and demonstrate improvements\non vision-language tasks. Our experiments show the utility of KALE for training\nmore capable and knowledgeable multimodal models. 
We release the KALE dataset\nat https://huggingface.co/datasets/Salesforce/blip3-kale\n","authors":["Anas Awadalla","Le Xue","Manli Shu","An Yan","Jun Wang","Senthil Purushwalkam","Sheng Shen","Hannah Lee","Oscar Lo","Jae Sung Park","Etash Guha","Silvio Savarese","Ludwig Schmidt","Yejin Choi","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07453v1","updated":"2024-11-12T00:38:17Z","published":"2024-11-12T00:38:17Z","title":"Research on fault diagnosis of nuclear power first-second circuit based\n on hierarchical multi-granularity classification network","summary":" The safe and reliable operation of complex electromechanical systems in\nnuclear power plants is crucial for the safe production of nuclear power plants\nand their nuclear power unit. Therefore, accurate and timely fault diagnosis of\nnuclear power systems is of great significance for ensuring the safe and\nreliable operation of nuclear power plants. The existing fault diagnosis\nmethods mainly target a single device or subsystem, making it difficult to\nanalyze the inherent connections and mutual effects between different types of\nfaults at the entire unit level. This article uses the AP1000 full-scale\nsimulator to simulate the important mechanical component failures of some key\nsystems in the primary and secondary circuits of nuclear power units, and\nconstructs a fault dataset. Meanwhile, a hierarchical multi granularity\nclassification fault diagnosis model based on the EfficientNet large model is\nproposed, aiming to achieve hierarchical classification of nuclear power\nfaults. The results indicate that the proposed fault diagnosis model can\neffectively classify faults in different circuits and system components of\nnuclear power units into hierarchical categories. 
However, the fault dataset in\nthis study was obtained from a simulator, which may introduce additional\ninformation due to parameter redundancy, thereby affecting the diagnostic\nperformance of the model.\n","authors":["Jiangwen Chen","Siwei Li","Guo Jiang","Cheng Dongzhen","Lin Hua","Wang Wei"],"pdf_url":"https://arxiv.org/pdf/2411.07453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07451v1","updated":"2024-11-12T00:24:31Z","published":"2024-11-12T00:24:31Z","title":"Optimizing Data Delivery: Insights from User Preferences on Visuals,\n Tables, and Text","summary":" In this work, we research user preferences to see a chart, table, or text\ngiven a question asked by the user. This enables us to understand when it is\nbest to show a chart, table, or text to the user for the specific question. For\nthis, we conduct a user study where users are shown a question and asked what\nthey would prefer to see and used the data to establish that a user's personal\ntraits does influence the data outputs that they prefer. Understanding how user\ncharacteristics impact a user's preferences is critical to creating data tools\nwith a better user experience. Additionally, we investigate to what degree an\nLLM can be used to replicate a user's preference with and without user\npreference data. Overall, these findings have significant implications\npertaining to the development of data tools and the replication of human\npreferences using LLMs. 
Furthermore, this work demonstrates the potential use\nof LLMs to replicate user preference data which has major implications for\nfuture user modeling and personalization research.\n","authors":["Reuben Luera","Ryan Rossi","Franck Dernoncourt","Alexa Siu","Sungchul Kim","Tong Yu","Ruiyi Zhang","Xiang Chen","Nedim Lipka","Zhehao Zhang","Seon Gyeom Kim","Tak Yeon Lee"],"pdf_url":"https://arxiv.org/pdf/2411.07451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13632v3","updated":"2024-11-12T00:12:39Z","published":"2023-12-21T07:48:54Z","title":"TraceFL: Interpretability-Driven Debugging in Federated Learning via\n Neuron Provenance","summary":" In Federated Learning, clients train models on local data and send updates to\na central server, which aggregates them into a global model using a fusion\nalgorithm. This collaborative yet privacy-preserving training comes at a\ncost--FL developers face significant challenges in attributing global model\npredictions to specific clients. Localizing responsible clients is a crucial\nstep towards (a) excluding clients primarily responsible for incorrect\npredictions and (b) encouraging clients who contributed high-quality models to\ncontinue participating in the future. Existing ML explainability approaches are\ninherently inapplicable as they are designed for single-model, centralized\ntraining.\n We introduce TraceFL, a fine-grained neuron provenance capturing mechanism\nthat identifies clients responsible for the global model's prediction by\ntracking the flow of information from individual clients to the global model.\nSince inference on different inputs activates a different set of neurons of the\nglobal model, TraceFL dynamically quantifies the significance of the global\nmodel's neurons in a given prediction. 
It then selectively picks a slice of the\nmost crucial neurons in the global model and maps them to the corresponding\nneurons in every participating client to determine each client's contribution,\nultimately localizing the responsible client. We evaluate TraceFL on six\ndatasets, including two real-world medical imaging datasets and four neural\nnetworks, including advanced models such as GPT. TraceFL achieves 99% accuracy\nin localizing the responsible client in FL tasks spanning both image and text\nclassification tasks. At a time when state-of-the-art ML debugging approaches\nare mostly domain-specific (e.g., image classification only), TraceFL is the\nfirst technique to enable highly accurate automated reasoning across a wide\nrange of FL applications.\n","authors":["Waris Gill","Ali Anwar","Muhammad Ali Gulzar"],"pdf_url":"https://arxiv.org/pdf/2312.13632v3.pdf","comment":"Accepted at 2025 IEEE/ACM 47th International Conference on Software\n Engineering (ICSE)"},{"id":"http://arxiv.org/abs/2411.07447v1","updated":"2024-11-12T00:10:34Z","published":"2024-11-12T00:10:34Z","title":"The Effect of Scheduling and Preemption on the Efficiency of LLM\n Inference Serving","summary":" The growing usage of Large Language Models (LLMs) highlights the demands and\nchallenges in scalable LLM inference systems, affecting deployment and\ndevelopment processes. On the deployment side, there is a lack of comprehensive\nanalysis on the conditions under which a particular scheduler performs better\nor worse, with performance varying substantially across different schedulers,\nhardware, models, and workloads. Manually testing each configuration on GPUs\ncan be prohibitively expensive. On the development side, unpredictable\nperformance and unknown upper limits can lead to inconclusive trial-and-error\nprocesses, consuming resources on ideas that end up ineffective. 
To address\nthese challenges, we introduce INFERMAX, an analytical framework that uses\ninference cost models to compare various schedulers, including an optimal\nscheduler formulated as a constraint satisfaction problem (CSP) to establish an\nupper bound on performance. Our framework offers in-depth analysis and raises\nessential questions, challenging assumptions and exploring opportunities for\nmore efficient scheduling. Notably, our findings indicate that preempting\nrequests can reduce GPU costs by 30% compared to avoiding preemptions at all.\nWe believe our methods and insights will facilitate the cost-effective\ndeployment and development of scalable, efficient inference systems and pave\nthe way for cost-based scheduling.\n","authors":["Kyoungmin Kim","Kijae Hong","Caglar Gulcehre","Anastasia Ailamaki"],"pdf_url":"https://arxiv.org/pdf/2411.07447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07444v1","updated":"2024-11-12T00:03:11Z","published":"2024-11-12T00:03:11Z","title":"Input-Based Ensemble-Learning Method for Dynamic Memory Configuration of\n Serverless Computing Functions","summary":" In today's Function-as-a-Service offerings, a programmer is usually\nresponsible for configuring function memory for its successful execution, which\nallocates proportional function resources such as CPU and network. However,\nright-sizing the function memory force developers to speculate performance and\nmake ad-hoc configuration decisions. Recent research has highlighted that a\nfunction's input characteristics, such as input size, type and number of\ninputs, significantly impact its resource demand, run-time performance and\ncosts with fluctuating workloads. This correlation further makes memory\nconfiguration a non-trivial task. 
On that account, an input-aware function\nmemory allocator not only improves developer productivity by completely hiding\nresource-related decisions but also drives an opportunity to reduce resource\nwastage and offer a finer-grained cost-optimised pricing scheme. Therefore, we\npresent MemFigLess, a serverless solution that estimates the memory requirement\nof a serverless function with input-awareness. The framework executes function\nprofiling in an offline stage and trains a multi-output Random Forest\nRegression model on the collected metrics to invoke input-aware optimal\nconfigurations. We evaluate our work with the state-of-the-art approaches on\nAWS Lambda service to find that MemFigLess is able to capture the input-aware\nresource relationships and allocate upto 82% less resources and save up to 87%\nrun-time costs.\n","authors":["Siddharth Agarwal","Maria A. Rodriguez","Rajkumar Buyya"],"pdf_url":"https://arxiv.org/pdf/2411.07444v1.pdf","comment":"10 pages, 2 tables, 28 figures, accepted conference paper - UCC'24"},{"id":"http://arxiv.org/abs/2411.08249v1","updated":"2024-11-12T23:55:11Z","published":"2024-11-12T23:55:11Z","title":"Retrieval Augmented Time Series Forecasting","summary":" Retrieval-augmented generation (RAG) is a central component of modern LLM\nsystems, particularly in scenarios where up-to-date information is crucial for\naccurately responding to user queries or when queries exceed the scope of the\ntraining data. The advent of time-series foundation models (TSFM), such as\nChronos, and the need for effective zero-shot forecasting performance across\nvarious time-series domains motivates the question: Do benefits of RAG\nsimilarly carry over to time series forecasting? In this paper, we advocate\nthat the dynamic and event-driven nature of time-series data makes RAG a\ncrucial component of TSFMs and introduce a principled RAG framework for\ntime-series forecasting, called Retrieval Augmented Forecasting (RAF). 
Within\nRAF, we develop efficient strategies for retrieving related time-series\nexamples and incorporating them into forecast. Through experiments and\nmechanistic studies, we demonstrate that RAF indeed improves the forecasting\naccuracy across diverse time series domains and the improvement is more\nsignificant for larger TSFM sizes.\n","authors":["Kutay Tire","Ege Onur Taga","Muhammed Emrullah Ildiz","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2411.08249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08248v1","updated":"2024-11-12T23:54:58Z","published":"2024-11-12T23:54:58Z","title":"Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial\n Approach","summary":" Deep learning underpins most of the currently advanced natural language\nprocessing (NLP) tasks such as textual classification, neural machine\ntranslation (NMT), abstractive summarization and question-answering (QA).\nHowever, the robustness of the models, particularly QA models, against\nadversarial attacks is a critical concern that remains insufficiently explored.\nThis paper introduces QA-Attack (Question Answering Attack), a novel word-level\nadversarial strategy that fools QA models. Our attention-based attack exploits\nthe customized attention mechanism and deletion ranking strategy to identify\nand target specific words within contextual passages. It creates deceptive\ninputs by carefully choosing and substituting synonyms, preserving grammatical\nintegrity while misleading the model to produce incorrect responses. Our\napproach demonstrates versatility across various question types, particularly\nwhen dealing with extensive long textual inputs. 
Extensive experiments on\nmultiple benchmark datasets demonstrate that QA-Attack successfully deceives\nbaseline QA models and surpasses existing adversarial techniques regarding\nsuccess rate, semantics changes, BLEU score, fluency and grammar error rate.\n","authors":["Jiyao Li","Mingze Ni","Yongshun Gong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08241v1","updated":"2024-11-12T23:32:21Z","published":"2024-11-12T23:32:21Z","title":"A Social Outcomes and Priorities centered (SOP) Framework for AI policy","summary":" Rapid developments in AI and its adoption across various domains have\nnecessitated a need to build robust guardrails and risk containment plans while\nensuring equitable benefits for the betterment of society. The current\ntechnology-centered approach has resulted in a fragmented, reactive, and\nineffective policy apparatus. This paper highlights the immediate and urgent\nneed to pivot to a society-centered approach to develop comprehensive,\ncoherent, forward-looking AI policy. To this end, we present a Social Outcomes\nand Priorities centered (SOP) framework for AI policy along with proposals on\nimplementation of its various components. While the SOP framework is presented\nfrom a US-centric view, the takeaways are general and applicable globally.\n","authors":["Mohak Shah"],"pdf_url":"https://arxiv.org/pdf/2411.08241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04367v3","updated":"2024-11-12T22:53:09Z","published":"2024-09-06T15:58:20Z","title":"Algorithm Configuration for Structured Pfaffian Settings","summary":" Data-driven algorithm design automatically adapts algorithms to specific\napplication domains, achieving better performance. In the context of\nparameterized algorithms, this approach involves tuning the algorithm's\nhyperparameters using problem instances drawn from the problem distribution of\nthe target application domain. 
This can be achieved by maximizing empirical\nutilities that measure the algorithms' performance as a function of their\nhyperparameters, using problem instances. While empirical evidence supports the\neffectiveness of data-driven algorithm design, providing theoretical guarantees\nfor several parameterized families remains challenging. This is due to the\nintricate behaviors of their corresponding utility functions, which typically\nadmit piecewise discontinuous structures. In this work, we present refined\nframeworks for providing learning guarantees for parameterized data-driven\nalgorithm design problems in both distributional and online learning settings.\nFor the distributional learning setting, we introduce the \\textit{Pfaffian GJ\nframework}, an extension of the classical \\textit{GJ framework}, that is\ncapable of providing learning guarantees for function classes for which the\ncomputation involves Pfaffian functions. Unlike the GJ framework, which is\nlimited to function classes with computation characterized by rational\nfunctions, our proposed framework can deal with function classes involving\nPfaffian functions, which are much more general and widely applicable. We then\nshow that for many parameterized algorithms of interest, their utility function\npossesses a \\textit{refined piecewise structure}, which automatically\ntranslates to learning guarantees using our proposed framework.\n","authors":["Maria-Florina Balcan","Anh Tuan Nguyen","Dravyansh Sharma"],"pdf_url":"https://arxiv.org/pdf/2409.04367v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08227v1","updated":"2024-11-12T22:43:16Z","published":"2024-11-12T22:43:16Z","title":"DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution\n Detection","summary":" Out-of-distribution (OOD) detection is essential for ensuring the robustness\nof machine learning models by identifying samples that deviate from the\ntraining distribution. 
While traditional OOD detection has primarily focused on\nsingle-modality inputs, such as images, recent advances in multimodal models\nhave demonstrated the potential of leveraging multiple modalities (e.g., video,\noptical flow, audio) to enhance detection performance. However, existing\nmethods often overlook intra-class variability within in-distribution (ID)\ndata, assuming that samples of the same class are perfectly cohesive and\nconsistent. This assumption can lead to performance degradation, especially\nwhen prediction discrepancies are uniformly amplified across all samples. To\naddress this issue, we propose Dynamic Prototype Updating (DPU), a novel\nplug-and-play framework for multimodal OOD detection that accounts for\nintra-class variations. Our method dynamically updates class center\nrepresentations for each class by measuring the variance of similar samples\nwithin each batch, enabling adaptive adjustments. This approach allows us to\namplify prediction discrepancies based on the updated class centers, thereby\nimproving the model's robustness and generalization across different\nmodalities. Extensive experiments on two tasks, five datasets, and nine base\nOOD algorithms demonstrate that DPU significantly improves OOD detection\nperformance, setting a new state-of-the-art in multimodal OOD detection, with\nimprovements of up to 80 percent in Far-OOD detection. To facilitate\naccessibility and reproducibility, our code is publicly available on GitHub.\n","authors":["Shawn Li","Huixian Gong","Hao Dong","Tiankai Yang","Zhengzhong Tu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01661v2","updated":"2024-11-12T22:10:55Z","published":"2024-11-03T19:17:20Z","title":"Sing-On-Your-Beat: Simple Text-Controllable Accompaniment Generations","summary":" Singing is one of the most cherished forms of human entertainment. 
However,\ncreating a beautiful song requires an accompaniment that complements the vocals\nand aligns well with the song instruments and genre. With advancements in deep\nlearning, previous research has focused on generating suitable accompaniments\nbut often lacks precise alignment with the desired instrumentation and genre.\nTo address this, we propose a straightforward method that enables control over\nthe accompaniment through text prompts, allowing the generation of music that\ncomplements the vocals and aligns with the song instrumental and genre\nrequirements. Through extensive experiments, we successfully generate 10-second\naccompaniments using vocal input and text control.\n","authors":["Quoc-Huy Trinh","Minh-Van Nguyen","Trong-Hieu Nguyen Mau","Khoa Tran","Thanh Do"],"pdf_url":"https://arxiv.org/pdf/2411.01661v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08212v1","updated":"2024-11-12T22:03:37Z","published":"2024-11-12T22:03:37Z","title":"PERFT: Parameter-Efficient Routed Fine-Tuning for Mixture-of-Expert\n Model","summary":" The Mixture-of-Experts (MoE) paradigm has emerged as a powerful approach for\nscaling transformers with improved resource utilization. However, efficiently\nfine-tuning MoE models remains largely underexplored. Inspired by recent works\non Parameter-Efficient Fine-Tuning (PEFT), we present a unified framework for\nintegrating PEFT modules directly into the MoE mechanism. Aligning with the\ncore principles and architecture of MoE, our framework encompasses a set of\ndesign dimensions including various functional and composition strategies. By\ncombining design choices within our framework, we introduce Parameter-Efficient\nRouted Fine-Tuning (PERFT) as a flexible and scalable family of PEFT strategies\ntailored for MoE models. Extensive experiments on adapting OLMoE-1B-7B and\nMixtral-8$\\times$7B for commonsense and arithmetic reasoning tasks demonstrate\nthe effectiveness, scalability, and intriguing dynamics of PERFT. 
Additionally,\nwe provide empirical findings for each specific design choice to facilitate\nbetter application of MoE and PEFT.\n","authors":["Yilun Liu","Yunpu Ma","Shuo Chen","Zifeng Ding","Bailan He","Zhen Han","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2411.08212v1.pdf","comment":"Code available via https://anonymous.4open.science/r/PERFT-MoE/"},{"id":"http://arxiv.org/abs/2411.08197v1","updated":"2024-11-12T21:37:10Z","published":"2024-11-12T21:37:10Z","title":"What Representational Similarity Measures Imply about Decodable\n Information","summary":" Neural responses encode information that is useful for a variety of\ndownstream tasks. A common approach to understand these systems is to build\nregression models or ``decoders'' that reconstruct features of the stimulus\nfrom neural responses. Popular neural network similarity measures like centered\nkernel alignment (CKA), canonical correlation analysis (CCA), and Procrustes\nshape distance, do not explicitly leverage this perspective and instead\nhighlight geometric invariances to orthogonal or affine transformations when\ncomparing representations. Here, we show that many of these measures can, in\nfact, be equivalently motivated from a decoding perspective. Specifically,\nmeasures like CKA and CCA quantify the average alignment between optimal linear\nreadouts across a distribution of decoding tasks. We also show that the\nProcrustes shape distance upper bounds the distance between optimal linear\nreadouts and that the converse holds for representations with low participation\nratio. Overall, our work demonstrates a tight link between the geometry of\nneural representations and the ability to linearly decode information. This\nperspective suggests new ways of measuring similarity between neural systems\nand also provides novel, unifying interpretations of existing measures.\n","authors":["Sarah E. Harvey","David Lipshutz","Alex H. 
Williams"],"pdf_url":"https://arxiv.org/pdf/2411.08197v1.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.08019v1","updated":"2024-11-12T18:50:35Z","published":"2024-11-12T18:50:35Z","title":"Language Models as Causal Effect Generators","summary":" We present a framework for large language model (LLM) based data generation\nwith controllable causal structure. In particular, we define a procedure for\nturning any language model and any directed acyclic graph (DAG) into a\nsequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM\nis a causal model with user-defined structure and LLM-defined structural\nequations. We characterize how an SD-SCM allows sampling from observational,\ninterventional, and counterfactual distributions according to the desired\ncausal structure. We then leverage this procedure to propose a new type of\nbenchmark for causal inference methods, generating individual-level\ncounterfactual data without needing to manually specify functional\nrelationships between variables. We create an example benchmark consisting of\nthousands of datasets, and test a suite of popular estimation methods on these\ndatasets for average, conditional average, and individual treatment effect\nestimation, both with and without hidden confounding. Apart from generating\ndata, the same procedure also allows us to test for the presence of a causal\neffect that might be encoded in an LLM. This procedure can underpin auditing\nLLMs for misinformation, discrimination, or otherwise undesirable behavior. We\nbelieve SD-SCMs can serve as a useful tool in any application that would\nbenefit from sequential data with controllable causal structure.\n","authors":["Lucius E. J. 
Bynum","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2411.08019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08010v1","updated":"2024-11-12T18:35:28Z","published":"2024-11-12T18:35:28Z","title":"ExpressivityArena: Can LLMs Express Information Implicitly?","summary":" While Large Language Models (LLMs) have demonstrated remarkable performance\nin certain dimensions, their ability to express implicit language cues that\nhuman use for effective communication remains unclear. This paper presents\nExpressivityArena, a Python library for measuring the implicit communication\nabilities of LLMs. We provide a comprehensive framework to evaluate\nexpressivity of arbitrary LLMs and explore its practical implications. To this\nend, we refine the definition and measurements of ``expressivity,'' and use our\nframework in a set of small experiments. These experiments test LLMs in\ncreative and logical tasks such as poetry, coding, and emotion-based responses.\nThey are then evaluated by an automated grader, through ExpressivityArena,\nwhich we verify to be the most pragmatic for testing expressivity. Building on\nthese experiments, we deepen our understanding of the expressivity of LLMs by\nassessing their ability to remain expressive in conversations. Our findings\nindicate that LLMs are capable of generating and understanding expressive\ncontent, however, with some limitations. These insights will inform the future\ndevelopment and deployment of expressive LLMs. 
We provide the code for\nExpressivityArena alongside our paper.\n","authors":["Joshua Tint","Som Sagar","Aditya Taparia","Kelly Raines","Bimsara Pathiraja","Caleb Liu","Ransalu Senanayake"],"pdf_url":"https://arxiv.org/pdf/2411.08010v1.pdf","comment":"8 pages, 22 figures"},{"id":"http://arxiv.org/abs/2411.08003v1","updated":"2024-11-12T18:28:57Z","published":"2024-11-12T18:28:57Z","title":"Can adversarial attacks by large language models be attributed?","summary":" Attributing outputs from Large Language Models (LLMs) in adversarial\nsettings-such as cyberattacks and disinformation-presents significant\nchallenges that are likely to grow in importance. We investigate this\nattribution problem using formal language theory, specifically language\nidentification in the limit as introduced by Gold and extended by Angluin. By\nmodeling LLM outputs as formal languages, we analyze whether finite text\nsamples can uniquely pinpoint the originating model. Our results show that due\nto the non-identifiability of certain language classes, under some mild\nassumptions about overlapping outputs from fine-tuned models it is\ntheoretically impossible to attribute outputs to specific LLMs with certainty.\nThis holds also when accounting for expressivity limitations of Transformer\narchitectures. Even with direct model access or comprehensive monitoring,\nsignificant computational hurdles impede attribution efforts. 
These findings\nhighlight an urgent need for proactive measures to mitigate risks posed by\nadversarial LLM use as their influence continues to expand.\n","authors":["Manuel Cebrian","Jan Arne Telle"],"pdf_url":"https://arxiv.org/pdf/2411.08003v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.07990v1","updated":"2024-11-12T18:15:19Z","published":"2024-11-12T18:15:19Z","title":"Derivational Morphology Reveals Analogical Generalization in Large\n Language Models","summary":" What mechanisms underlie linguistic generalization in large language models\n(LLMs)? This question has attracted considerable attention, with most studies\nanalyzing the extent to which the language skills of LLMs resemble rules. As of\nyet, it is not known whether linguistic generalization in LLMs could equally\nwell be explained as the result of analogical processes, which can be\nformalized as similarity operations on stored exemplars. A key shortcoming of\nprior research is its focus on linguistic phenomena with a high degree of\nregularity, for which rule-based and analogical approaches make the same\npredictions. Here, we instead examine derivational morphology, specifically\nEnglish adjective nominalization, which displays notable variability. We\nintroduce a new method for investigating linguistic generalization in LLMs:\nfocusing on GPT-J, we fit cognitive models that instantiate rule-based and\nanalogical learning to the LLM training data and compare their predictions on a\nset of nonce adjectives with those of the LLM, allowing us to draw direct\nconclusions regarding underlying mechanisms. As expected, rule-based and\nanalogical models explain the predictions of GPT-J equally well for adjectives\nwith regular nominalization patterns. 
However, for adjectives with variable\nnominalization patterns, the analogical model provides a much better match.\nFurthermore, GPT-J's behavior is sensitive to the individual word frequencies,\neven for regular forms, a behavior that is consistent with an analogical\naccount of regular forms but not a rule-based one. These findings refute the\nhypothesis that GPT-J's linguistic generalization on adjective nominalization\ninvolves rules, suggesting similarity operations on stored exemplars as the\nunderlying mechanism. Overall, our study suggests that analogical processes\nplay a bigger role in the linguistic generalization of LLMs than previously\nthought.\n","authors":["Valentin Hofmann","Leonie Weissweiler","David Mortensen","Hinrich Schütze","Janet Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2411.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07975v1","updated":"2024-11-12T17:55:10Z","published":"2024-11-12T17:55:10Z","title":"JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified\n Multimodal Understanding and Generation","summary":" We present JanusFlow, a powerful framework that unifies image understanding\nand generation in a single model. JanusFlow introduces a minimalist\narchitecture that integrates autoregressive language models with rectified\nflow, a state-of-the-art method in generative modeling. Our key finding\ndemonstrates that rectified flow can be straightforwardly trained within the\nlarge language model framework, eliminating the need for complex architectural\nmodifications. To further improve the performance of our unified model, we\nadopt two key strategies: (i) decoupling the understanding and generation\nencoders, and (ii) aligning their representations during unified training.\nExtensive experiments show that JanusFlow achieves comparable or superior\nperformance to specialized models in their respective domains, while\nsignificantly outperforming existing unified approaches across standard\nbenchmarks. 
This work represents a step toward more efficient and versatile\nvision-language models.\n","authors":["Yiyang Ma","Xingchao Liu","Xiaokang Chen","Wen Liu","Chengyue Wu","Zhiyu Wu","Zizheng Pan","Zhenda Xie","Haowei Zhang","Xingkai yu","Liang Zhao","Yisong Wang","Jiaying Liu","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2411.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07965v1","updated":"2024-11-12T17:41:16Z","published":"2024-11-12T17:41:16Z","title":"From General to Specific: Utilizing General Hallucation to Automatically\n Measure the Role Relationship Fidelity for Specific Role-Play Agents","summary":" The advanced role-playing capabilities of Large Language Models (LLMs) have\npaved the way for developing Role-Playing Agents (RPAs). However, existing\nbenchmarks, such as HPD, which incorporates manually scored character\nrelationships into the context for LLMs to sort coherence, and SocialBench,\nwhich uses specific profiles generated by LLMs in the context of\nmultiple-choice tasks to assess character preferences, face limitations like\npoor generalizability, implicit and inaccurate judgments, and excessive context\nlength. To address the above issues, we propose an automatic, scalable, and\ngeneralizable paradigm. Specifically, we construct a benchmark by extracting\nrelations from a general knowledge graph and leverage RPA's inherent\nhallucination properties to prompt it to interact across roles, employing\nChatGPT for stance detection and defining relationship hallucination along with\nthree related metrics. Extensive experiments validate the effectiveness and\nstability of our metrics. 
Our findings further explore factors influencing\nthese metrics and discuss the trade-off between relationship hallucination and\nfactuality.\n","authors":["Chuyi Kong","Ziyang Luo","Hongzhan Lin","Zhiyuan Fan","Yaxin Fan","Yuxi Sun","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08479v5","updated":"2024-11-12T17:38:29Z","published":"2024-02-13T14:12:32Z","title":"Plausible Extractive Rationalization through Semi-Supervised Entailment\n Signal","summary":" The increasing use of complex and opaque black box models requires the\nadoption of interpretable measures, one such option is extractive rationalizing\nmodels, which serve as a more interpretable alternative. These models, also\nknown as Explain-Then-Predict models, employ an explainer model to extract\nrationales and subsequently condition the predictor with the extracted\ninformation. Their primary objective is to provide precise and faithful\nexplanations, represented by the extracted rationales. In this paper, we take a\nsemi-supervised approach to optimize for the plausibility of extracted\nrationales. We adopt a pre-trained natural language inference (NLI) model and\nfurther fine-tune it on a small set of supervised rationales ($10\\%$). The NLI\npredictor is leveraged as a source of supervisory signals to the explainer via\nentailment alignment. We show that, by enforcing the alignment agreement\nbetween the explanation and answer in a question-answering task, the\nperformance can be improved without access to ground truth labels. 
We evaluate\nour approach on the ERASER dataset and show that our approach achieves\ncomparable results with supervised extractive models and outperforms\nunsupervised approaches by $> 100\\%$.\n","authors":["Wei Jie Yeo","Ranjan Satapathy","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2402.08479v5.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2406.11275v2","updated":"2024-11-12T17:37:10Z","published":"2024-06-17T07:25:09Z","title":"Self-training Large Language Models through Knowledge Detection","summary":" Large language models (LLMs) often necessitate extensive labeled datasets and\ntraining compute to achieve impressive performance across downstream tasks.\nThis paper explores a self-training paradigm, where the LLM autonomously\ncurates its own labels and selectively trains on unknown data samples\nidentified through a reference-free consistency method. Empirical evaluations\ndemonstrate significant improvements in reducing hallucination in generation\nacross multiple subjects. Furthermore, the selective training framework\nmitigates catastrophic forgetting in out-of-distribution benchmarks, addressing\na critical limitation in training LLMs. 
Our findings suggest that such an\napproach can substantially reduce the dependency on large labeled datasets,\npaving the way for more scalable and cost-effective language model training.\n","authors":["Wei Jie Yeo","Teddy Ferdinan","Przemyslaw Kazienko","Ranjan Satapathy","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2406.11275v2.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2411.07917v1","updated":"2024-11-12T16:49:51Z","published":"2024-11-12T16:49:51Z","title":"CryptoLLM: Unleashing the Power of Prompted LLMs for SmartQnA and\n Classification of Crypto Posts","summary":" The rapid growth of social media has resulted in an large volume of\nuser-generated content, particularly in niche domains such as cryptocurrency.\nThis task focuses on developing robust classification models to accurately\ncategorize cryptocurrency-related social media posts into predefined classes,\nincluding but not limited to objective, positive, negative, etc. Additionally,\nthe task requires participants to identify the most relevant answers from a set\nof posts in response to specific questions. By leveraging advanced LLMs, this\nresearch aims to enhance the understanding and filtering of cryptocurrency\ndiscourse, thereby facilitating more informed decision-making in this volatile\nsector. We have used a prompt-based technique to solve the classification task\nfor reddit posts and twitter posts. 
Also, we have used 64-shot technique along\nwith prompts on GPT-4-Turbo model to determine whether a answer is relevant to\na question or not.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2411.07917v1.pdf","comment":"Accepted at FIRE 2024 (Track: Opinion Extraction and Question\n Answering from CryptoCurrency-Related Tweets and Reddit posts (CryptOQA))"},{"id":"http://arxiv.org/abs/2406.13230v2","updated":"2024-11-12T16:47:49Z","published":"2024-06-19T05:33:34Z","title":"Enhancing Language Model Factuality via Activation-Based Confidence\n Calibration and Guided Decoding","summary":" Calibrating language models (LMs) aligns their generation confidence with the\nactual likelihood of answer correctness, which can inform users about LMs'\nreliability and mitigate hallucinated content. However, prior calibration\nmethods, such as self-consistency-based and logit-based approaches, are either\nlimited in inference-time efficiency or fall short of providing informative\nsignals. Moreover, simply filtering out low-confidence responses reduces the\nLM's helpfulness when the answers are correct. Therefore, effectively using\ncalibration techniques to enhance an LM's factuality remains an unsolved\nchallenge. In this paper, we first propose an activation-based calibration\nmethod, ActCab, which trains a linear layer on top of the LM's last-layer\nactivations that can better capture the representations of knowledge. Built on\ntop of ActCab, we further propose CoDec, a confidence-guided decoding strategy\nto elicit truthful answers with high confidence from LMs. By evaluating on five\npopular QA benchmarks, ActCab achieves superior calibration performance than\nall competitive baselines, e.g., by reducing the average expected calibration\nerror (ECE) score by up to 39%. 
Further experiments on CoDec show consistent\nimprovements in several LMs' factuality on challenging QA datasets, such as\nTruthfulQA, highlighting the value of confidence signals in enhancing\nfactuality.\n","authors":["Xin Liu","Farima Fatahi Bayat","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.13230v2.pdf","comment":"EMNLP 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2310.10429v2","updated":"2024-11-12T16:39:55Z","published":"2023-10-16T14:13:38Z","title":"Exploiting User Comments for Early Detection of Fake News Prior to\n Users' Commenting","summary":" Both accuracy and timeliness are key factors in detecting fake news on social\nmedia. However, most existing methods encounter an accuracy-timeliness dilemma:\nContent-only methods guarantee timeliness but perform moderately because of\nlimited available information, while social con-text-based ones generally\nperform better but inevitably lead to latency because of social context\naccumulation needs. To break such a dilemma, a feasible but not well-studied\nsolution is to leverage social contexts (e.g., comments) from historical news\nfor training a detection model and apply it to newly emerging news without\nsocial contexts. This requires the model to (1) sufficiently learn helpful\nknowledge from social contexts, and (2) be well compatible with situations that\nsocial contexts are available or not. To achieve this goal, we propose to\nabsorb and parameterize useful knowledge from comments in historical news and\nthen inject it into a content-only detection model. Specifically, we design the\nComments ASsisted FakE News Detection method (CAS-FEND), which transfers useful\nknowledge from a comment-aware teacher model to a content-only student model\nand detects newly emerging news with the student model. 
Experiments show that\nthe CAS-FEND student model outperforms all content-only methods and even\ncomment-aware ones with 1/4 comments as inputs, demonstrating its superiority\nfor early detection.\n","authors":["Qiong Nan","Qiang Sheng","Juan Cao","Yongchun Zhu","Danding Wang","Guang Yang","Jintao Li"],"pdf_url":"https://arxiv.org/pdf/2310.10429v2.pdf","comment":"19 pages, 6 figures, 7 tables. The article has been accepted by\n Frontiers of Computer Science (FCS), with the DOI:\n {10.1007/s11704-024-40674-6}"},{"id":"http://arxiv.org/abs/2406.11813v3","updated":"2024-11-12T16:38:37Z","published":"2024-06-17T17:54:40Z","title":"How Do Large Language Models Acquire Factual Knowledge During\n Pretraining?","summary":" Despite the recent observation that large language models (LLMs) can store\nsubstantial factual knowledge, there is a limited understanding of the\nmechanisms of how they acquire factual knowledge through pretraining. This work\naddresses this gap by studying how LLMs acquire factual knowledge during\npretraining. The findings reveal several important insights into the dynamics\nof factual knowledge acquisition during pretraining. First, counterintuitively,\nwe observe that pretraining on more data shows no significant improvement in\nthe model's capability to acquire and maintain factual knowledge. Next, there\nis a power-law relationship between training steps and forgetting of\nmemorization and generalization of factual knowledge, and LLMs trained with\nduplicated training data exhibit faster forgetting. Third, training LLMs with\nlarger batch sizes can enhance the models' robustness to forgetting. Overall,\nour observations suggest that factual knowledge acquisition in LLM pretraining\noccurs by progressively increasing the probability of factual knowledge\npresented in the pretraining data at each step. However, this increase is\ndiluted by subsequent forgetting. 
Based on this interpretation, we demonstrate\nthat we can provide plausible explanations for recently observed behaviors of\nLLMs, such as the poor performance of LLMs on long-tail knowledge and the\nbenefits of deduplicating the pretraining corpus.\n","authors":["Hoyeon Chang","Jinho Park","Seonghyeon Ye","Sohee Yang","Youngkyung Seo","Du-Seong Chang","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2406.11813v3.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.06855v2","updated":"2024-11-12T16:03:24Z","published":"2024-11-11T10:37:11Z","title":"A Unified Multi-Task Learning Architecture for Hate Detection Leveraging\n User-Based Information","summary":" Hate speech, offensive language, aggression, racism, sexism, and other\nabusive language are common phenomena in social media. There is a need for\nArtificial Intelligence(AI)based intervention which can filter hate content at\nscale. Most existing hate speech detection solutions have utilized the features\nby treating each post as an isolated input instance for the classification.\nThis paper addresses this issue by introducing a unique model that improves\nhate speech identification for the English language by utilising intra-user and\ninter-user-based information. The experiment is conducted over single-task\nlearning (STL) and multi-task learning (MTL) paradigms that use deep neural\nnetworks, such as convolutional neural networks (CNN), gated recurrent unit\n(GRU), bidirectional encoder representations from the transformer (BERT), and A\nLite BERT (ALBERT). We use three benchmark datasets and conclude that combining\ncertain user features with textual features gives significant improvements in\nmacro-F1 and weighted-F1.\n","authors":["Prashant Kapil","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2411.06855v2.pdf","comment":"7 pages, 1 figure, and two tables. 
Accepted at the 20th International\n Conference on Natural Language Processing (ICON) 2023.\n https://aclanthology.org/2023.icon-1.53"},{"id":"http://arxiv.org/abs/2411.07892v1","updated":"2024-11-12T15:56:48Z","published":"2024-11-12T15:56:48Z","title":"Mapping the Podcast Ecosystem with the Structured Podcast Research\n Corpus","summary":" Podcasts provide highly diverse content to a massive listener base through a\nunique on-demand modality. However, limited data has prevented large-scale\ncomputational analysis of the podcast ecosystem. To fill this gap, we introduce\na massive dataset of over 1.1M podcast transcripts that is largely\ncomprehensive of all English language podcasts available through public RSS\nfeeds from May and June of 2020. This data is not limited to text, but rather\nincludes audio features and speaker turns for a subset of 370K episodes, and\nspeaker role inferences and other metadata for all 1.1M episodes. Using this\ndata, we also conduct a foundational investigation into the content, structure,\nand responsiveness of this ecosystem. Together, our data and analyses open the\ndoor to continued computational research of this popular and impactful medium.\n","authors":["Benjamin Litterer","David Jurgens","Dallas Card"],"pdf_url":"https://arxiv.org/pdf/2411.07892v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.05508v2","updated":"2024-11-12T15:36:04Z","published":"2024-11-08T12:08:17Z","title":"An Early FIRST Reproduction and Improvements to Single-Token Decoding\n for Fast Listwise Reranking","summary":" Recent advances have demonstrated that large language models (LLMs) excel as\nlistwise rerankers, but their high computational demands remain a barrier to\nwidespread adoption. Further, the traditional language modeling (LM) objective\nis not ideally suited for reranking tasks. 
FIRST is a novel approach that\naddresses these challenges by integrating a learning-to-rank objective and\nleveraging the logits of only the first generated token, thereby significantly\nreducing inference latency compared to traditional LLM rerankers. In this\nstudy, we extend the evaluation of FIRST to the TREC Deep Learning datasets\n(DL19-22), validating its robustness across diverse domains. We investigate the\ninfluence of different first-stage retrievers on FIRST rerankers, observing\ndiminishing returns and patterns consistent with traditional LLM rerankers.\nThrough applying the FIRST objective to a broader range of backbone models, we\nachieve effectiveness surpassing the original implementation. Our experiments\nconfirm that fast reranking with single-token logits does not compromise\nout-of-domain reranking quality. To better quantify the computational savings\nin the original study, we measure and compare latency to find a 21%-42% gain\nacross various models and benchmarks. Moreover, while LM training implicitly\nimproves zero-shot single-token reranking, our experiments also raise questions\nabout whether LM pre-training may hinder subsequent fine-tuning with the FIRST\nobjective. These findings pave the way for more efficient and effective\nlistwise reranking in future applications.\n","authors":["Zijian Chen","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.05508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07870v1","updated":"2024-11-12T15:26:17Z","published":"2024-11-12T15:26:17Z","title":"Trustful LLMs: Customizing and Grounding Text Generation with Knowledge\n Bases and Dual Decoders","summary":" Although people are impressed by the content generation skills of large\nlanguage models, the use of LLMs, such as ChatGPT, is limited by the domain\ngrounding of the content. 
The correctness and groundedness of the generated\ncontent need to be based on a verified context, such as results from\nRetrieval-Augmented Generation (RAG). One important issue when adapting LLMs to\na customized domain is that the generated responses are often incomplete, or\nthe additions are not verified and may even be hallucinated. Prior studies on\nhallucination detection have focused on evaluation metrics, which are not\neasily adaptable to dynamic domains and can be vulnerable to attacks like\njail-breaking. In this work, we propose 1) a post-processing algorithm that\nleverages knowledge triplets in RAG context to correct hallucinations and 2) a\ndual-decoder model that fuses RAG context to guide the generation process.\n","authors":["Xiaofeng Zhu","Jaya Krishna Mandivarapu"],"pdf_url":"https://arxiv.org/pdf/2411.07870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07858v1","updated":"2024-11-12T15:15:20Z","published":"2024-11-12T15:15:20Z","title":"Verbosity $\\neq$ Veracity: Demystify Verbosity Compensation Behavior of\n Large Language Models","summary":" When unsure about an answer, humans often respond with more words than\nnecessary, hoping that part of the response will be correct. We observe a\nsimilar behavior in large language models (LLMs), which we term \"Verbosity\nCompensation\" (VC). VC is harmful because it confuses the user understanding,\nleading to low efficiency, and influences the LLM services by increasing the\nlatency and cost of generating useless tokens. In this paper, we present the\nfirst work that defines and analyzes Verbosity Compensation, explores its\ncauses, and proposes a simple mitigating approach. We define Verbosity\nCompensation as the behavior of generating responses that can be compressed\nwithout information loss when prompted to write concisely. Our experiments,\nconducted on five datasets of knowledge and reasoning-based QA tasks with 14\nnewly developed LLMs, reveal three conclusions. 
1) We reveal a pervasive\npresence of verbosity compensation across all models and all datasets. Notably,\nGPT-4 exhibits a VC frequency of 50.40%. 2) We reveal the large performance gap\nbetween verbose and concise responses, with a notable difference of 27.61% on\nthe Qasper dataset. We also demonstrate that this difference does not naturally\ndiminish as LLM capability increases. Both 1) and 2) highlight the urgent need\nto mitigate the frequency of VC behavior and disentangle verbosity with\nveracity. We propose a simple yet effective cascade algorithm that replaces the\nverbose responses with the other model-generated responses. The results show\nthat our approach effectively alleviates the VC of the Mistral model from\n63.81% to 16.16% on the Qasper dataset. 3) We also find that verbose responses\nexhibit higher uncertainty across all five datasets, suggesting a strong\nconnection between verbosity and model uncertainty. Our dataset and code are\navailable at https://github.com/psunlpgroup/VerbosityLLM.\n","authors":["Yusen Zhang","Sarkar Snigdha Sarathi Das","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07858v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.07854v1","updated":"2024-11-12T15:06:06Z","published":"2024-11-12T15:06:06Z","title":"Tucano: Advancing Neural Text Generation for Portuguese","summary":" Significant advances have been made in natural language processing in recent\nyears. However, our current deep learning approach to language modeling\nrequires substantial resources in terms of data and computation. One of the\nside effects of this data-hungry paradigm is the current schism between\nlanguages, separating those considered high-resource, where most of the\ndevelopment happens and resources are available, and the low-resource ones,\nwhich struggle to attain the same level of performance and autonomy. 
This study\naims to introduce a new set of resources to stimulate the future development of\nneural text generation in Portuguese. In this work, we document the development\nof GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting\nto 200 billion tokens. Via this corpus, we trained a series of\ndecoder-transformers named Tucano. Our models perform equal or superior to\nother Portuguese and multilingual language models of similar size in several\nPortuguese benchmarks. The evaluation of our models also reveals that model\nperformance on many currently available benchmarks used by the Portuguese NLP\ncommunity has little to no correlation with the scaling of token ingestion\nduring training, highlighting the limitations of such evaluations when it comes\nto the assessment of Portuguese generative language models. All derivatives of\nour study are openly released on GitHub and Hugging Face. See\nhttps://nkluge-correa.github.io/Tucano/\n","authors":["Nicholas Kluge Corrêa","Aniket Sen","Sophia Falk","Shiza Fatimah"],"pdf_url":"https://arxiv.org/pdf/2411.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07850v1","updated":"2024-11-12T15:01:47Z","published":"2024-11-12T15:01:47Z","title":"IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems","summary":" Adversarial examples, which are inputs deliberately perturbed with\nimperceptible changes to induce model errors, have raised serious concerns for\nthe reliability and security of deep neural networks (DNNs). While adversarial\nattacks have been extensively studied in continuous data domains such as\nimages, the discrete nature of text presents unique challenges. In this paper,\nwe propose Irony-based Adversarial Examples (IAE), a method that transforms\nstraightforward sentences into ironic ones to create adversarial text. 
This\napproach exploits the rhetorical device of irony, where the intended meaning is\nopposite to the literal interpretation, requiring a deeper understanding of\ncontext to detect. The IAE method is particularly challenging due to the need\nto accurately locate evaluation words, substitute them with appropriate\ncollocations, and expand the text with suitable ironic elements while\nmaintaining semantic coherence. Our research makes the following key\ncontributions: (1) We introduce IAE, a strategy for generating textual\nadversarial examples using irony. This method does not rely on pre-existing\nirony corpora, making it a versatile tool for creating adversarial text in\nvarious NLP tasks. (2) We demonstrate that the performance of several\nstate-of-the-art deep learning models on sentiment analysis tasks significantly\ndeteriorates when subjected to IAE attacks. This finding underscores the\nsusceptibility of current NLP systems to adversarial manipulation through\nirony. (3) We compare the impact of IAE on human judgment versus NLP systems,\nrevealing that humans are less susceptible to the effects of irony in text.\n","authors":["Xiaoyin Yi","Jiacheng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07845v1","updated":"2024-11-12T14:53:12Z","published":"2024-11-12T14:53:12Z","title":"Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics\n Statements","summary":" What ethical concerns, if any, do LLM researchers have? We introduce EthiCon,\na corpus of 1,580 ethical concern statements extracted from scientific papers\npublished in the ACL Anthology. We extract ethical concern keywords from the\nstatements and show promising results in automating the concern identification\nprocess. Through a survey, we compare the ethical concerns of the corpus to the\nconcerns listed by the general public and professionals in the field. 
Finally,\nwe compare our retrieved ethical concerns with existing taxonomies pointing to\ngaps and future research directions.\n","authors":["Antonia Karamolegkou","Sandrine Schiller Hansen","Ariadni Christopoulou","Filippos Stamatiou","Anne Lauscher","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2411.07845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07843v1","updated":"2024-11-12T14:51:41Z","published":"2024-11-12T14:51:41Z","title":"Chain Association-based Attacking and Shielding Natural Language\n Processing Systems","summary":" Association as a gift enables people do not have to mention something in\ncompletely straightforward words and allows others to understand what they\nintend to refer to. In this paper, we propose a chain association-based\nadversarial attack against natural language processing systems, utilizing the\ncomprehension gap between humans and machines. We first generate a chain\nassociation graph for Chinese characters based on the association paradigm for\nbuilding search space of potential adversarial examples. Then, we introduce an\ndiscrete particle swarm optimization algorithm to search for the optimal\nadversarial examples. We conduct comprehensive experiments and show that\nadvanced natural language processing models and applications, including large\nlanguage models, are vulnerable to our attack, while humans appear good at\nunderstanding the perturbed text. We also explore two methods, including\nadversarial training and associative graph-based recovery, to shield systems\nfrom chain association-based attack. 
Since a few examples that use some\nderogatory terms, this paper contains materials that may be offensive or\nupsetting to some people.\n","authors":["Jiacheng Huang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. 
Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06008v2","updated":"2024-11-12T14:30:28Z","published":"2024-11-08T23:02:59Z","title":"The Dark Patterns of Personalized Persuasion in Large Language Models:\n Exposing Persuasive Linguistic Features for Big Five Personality Traits in\n LLMs Responses","summary":" This study explores how the Large Language Models (LLMs) adjust linguistic\nfeatures to create personalized persuasive outputs. While research showed that\nLLMs personalize outputs, a gap remains in understanding the linguistic\nfeatures of their persuasive capabilities. We identified 13 linguistic features\ncrucial for influencing personalities across different levels of the Big Five\nmodel of personality. We analyzed how prompts with personality trait\ninformation influenced the output of 19 LLMs across five model families. The\nfindings show that models use more anxiety-related words for neuroticism,\nincrease achievement-related words for conscientiousness, and employ fewer\ncognitive processes words for openness to experience. Some model families excel\nat adapting language for openness to experience, others for conscientiousness,\nwhile only one model adapts language for neuroticism. 
Our findings show how\nLLMs tailor responses based on personality cues in prompts, indicating their\npotential to create persuasive content affecting the mind and well-being of the\nrecipients.\n","authors":["Wiktoria Mieleszczenko-Kowszewicz","Dawid Płudowski","Filip Kołodziejczyk","Jakub Świstak","Julian Sienkiewicz","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2411.06008v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2411.07820v1","updated":"2024-11-12T14:12:45Z","published":"2024-11-12T14:12:45Z","title":"Query Optimization for Parametric Knowledge Refinement in\n Retrieval-Augmented Large Language Models","summary":" We introduce the \\textit{Extract-Refine-Retrieve-Read} (ERRR) framework, a\nnovel approach designed to bridge the pre-retrieval information gap in\nRetrieval-Augmented Generation (RAG) systems through query optimization\ntailored to meet the specific knowledge requirements of Large Language Models\n(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR\nframework begins by extracting parametric knowledge from LLMs, followed by\nusing a specialized query optimizer for refining these queries. This process\nensures the retrieval of only the most pertinent information essential for\ngenerating accurate responses. Moreover, to enhance flexibility and reduce\ncomputational costs, we propose a trainable scheme for our pipeline that\nutilizes a smaller, tunable model as the query optimizer, which is refined\nthrough knowledge distillation from a larger teacher model. 
Our evaluations on\nvarious question-answering (QA) datasets and with different retrieval systems\nshow that ERRR consistently outperforms existing baselines, proving to be a\nversatile and cost-effective module for improving the utility and accuracy of\nRAG systems.\n","authors":["Youan Cong","Cheng Wang","Pritom Saha Akash","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2411.07820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05894v3","updated":"2024-11-12T14:06:49Z","published":"2024-05-09T16:45:27Z","title":"Efficient LLM Comparative Assessment: a Product of Experts Framework for\n Pairwise Comparisons","summary":" LLM-as-a-judge approaches are a practical and effective way of assessing a\nrange of text tasks. However, when using pairwise comparisons to rank a set of\ncandidates, the computational cost scales quadratically with the number of\ncandidates, which has practical limitations. This paper introduces a Product of\nExpert (PoE) framework for efficient LLM Comparative Assessment. Here\nindividual comparisons are considered experts that provide information on a\npair's score difference. The PoE framework combines the information from these\nexperts to yield an expression that can be maximized with respect to the\nunderlying set of candidates, and is highly flexible where any form of expert\ncan be assumed. When Gaussian experts are used one can derive simple\nclosed-form solutions for the optimal candidate ranking, and expressions for\nselecting which comparisons should be made to maximize the probability of this\nranking. Our approach enables efficient comparative assessment, where by using\nonly a small subset of the possible comparisons, one can generate score\npredictions that correlate well with human judgements. We evaluate the approach\non multiple NLG tasks and demonstrate that our framework can yield considerable\ncomputational savings when performing pairwise comparative assessment. 
With\nmany candidate texts, using as few as 2% of comparisons the PoE solution can\nachieve similar performance to when all comparisons are used.\n","authors":["Adian Liusie","Vatsal Raina","Yassir Fathullah","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2405.05894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12186v3","updated":"2024-11-12T13:24:25Z","published":"2024-09-18T17:57:57Z","title":"Qwen2.5-Coder Technical Report","summary":" In this report, we introduce the Qwen2.5-Coder series, a significant upgrade\nfrom its predecessor, CodeQwen1.5. This series includes six models:\nQwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model,\nQwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretrained\non a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning,\nscalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder\ndemonstrates impressive code generation capabilities while retaining general\nand math skills. These models have been evaluated on a wide range of\ncode-related tasks, achieving state-of-the-art (SOTA) performance across more\nthan 10 benchmarks, including code generation, completion, reasoning, and\nrepair, consistently outperforming larger models of the same model size. 
We\nbelieve that the release of the Qwen2.5-Coder series will advance research in\ncode intelligence and, with its permissive licensing, support wider adoption by\ndevelopers in real-world applications.\n","authors":["Binyuan Hui","Jian Yang","Zeyu Cui","Jiaxi Yang","Dayiheng Liu","Lei Zhang","Tianyu Liu","Jiajun Zhang","Bowen Yu","Keming Lu","Kai Dang","Yang Fan","Yichang Zhang","An Yang","Rui Men","Fei Huang","Bo Zheng","Yibo Miao","Shanghaoran Quan","Yunlong Feng","Xingzhang Ren","Xuancheng Ren","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12186v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07773v1","updated":"2024-11-12T13:14:09Z","published":"2024-11-12T13:14:09Z","title":"Likelihood as a Performance Gauge for Retrieval-Augmented Generation","summary":" Recent work finds that retrieval-augmented generation with large language\nmodels is prone to be influenced by the order of retrieved documents in the\ncontext. However, the lack of in-depth analysis limits the use of this\nphenomenon for prompt engineering in practice. In this study, we posit that\nlikelihoods serve as an effective gauge for language model performance. Through\nexperiments on two question-answering datasets with a variety of\nstate-of-the-art language models, we reveal correlations between answer\naccuracy and the likelihood of the question at both the corpus level and the\ninstance level. In addition, we find that question likelihood can also indicate\nthe position of the task-relevant information in the context. Based on these\nfindings, we propose two methods that use question likelihood as a gauge for\nselecting and constructing prompts that lead to better performance. We\ndemonstrate their effectiveness with experiments. In addition, our\nlikelihood-based methods are efficient, as they only need to compute the\nlikelihood of the input, requiring much fewer language model passes than\nheuristic prompt engineering methods that require generating responses. 
Our\nanalysis deepens our understanding of how input prompts affect model\nperformance and provides a promising direction for efficient prompt\noptimization.\n","authors":["Tianyu Liu","Jirui Qi","Paul He","Arianna Bisazza","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07773v1.pdf","comment":"Under review at NAACL 2025. Code is available at\n https://github.com/lyutyuh/poptimizer"},{"id":"http://arxiv.org/abs/2411.07772v1","updated":"2024-11-12T13:13:20Z","published":"2024-11-12T13:13:20Z","title":"Automatic Album Sequencing","summary":" Album sequencing is a critical part of the album production process.\nRecently, a data-driven approach was proposed that sequences general\ncollections of independent media by extracting the narrative essence of the\nitems in the collections. While this approach implies an album sequencing\ntechnique, it is not widely accessible to a less technical audience, requiring\nadvanced knowledge of machine learning techniques to use. To address this, we\nintroduce a new user-friendly web-based tool that allows a less technical\naudience to upload music tracks, execute this technique in one click, and\nsubsequently presents the result in a clean visualization to the user. To both\nincrease the number of templates available to the user and address shortcomings\nof previous work, we also introduce a new direct transformer-based album\nsequencing method. We find that our more direct method outperforms a random\nbaseline but does not reach the same performance as the narrative essence\napproach. Both methods are included in our web-based user interface, and this\n-- alongside a full copy of our implementation -- is publicly available at\nhttps://github.com/dylanashley/automatic-album-sequencing\n","authors":["Vincent Herrmann","Dylan R. 
Ashley","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2411.07772v1.pdf","comment":"presented as a late breaking demo in the 25th International Society\n for Music Information Retrieval Conference; 3 pages in main text, 3 figures\n in main text; source code available at\n https://github.com/dylanashley/automatic-album-sequencing"},{"id":"http://arxiv.org/abs/2411.04799v2","updated":"2024-11-12T12:57:58Z","published":"2024-11-07T15:38:25Z","title":"Kwai-STaR: Transform LLMs into State-Transition Reasoners","summary":" Mathematical reasoning presents a significant challenge to the cognitive\ncapabilities of LLMs. Various methods have been proposed to enhance the\nmathematical ability of LLMs. However, few recognize the value of state\ntransition for LLM reasoning. In this work, we define mathematical\nproblem-solving as a process of transiting from an initial unsolved state to\nthe final resolved state, and propose Kwai-STaR framework, which transforms\nLLMs into State-Transition Reasoners to improve their intuitive reasoning\ncapabilities. Our approach comprises three main steps: (1) Define the state\nspace tailored to the mathematical reasoning. (2) Generate state-transition\ndata based on the state space. (3) Convert original LLMs into State-Transition\nReasoners via a curricular training strategy. Our experiments validate the\neffectiveness of Kwai-STaR in enhancing mathematical reasoning: After training\non the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and\nLLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard\ndataset. Additionally, the state transition-based design endows Kwai-STaR with\nremarkable training and inference efficiency. 
Further experiments are underway\nto establish the generality of Kwai-STaR.\n","authors":["Xingyu Lu","Yuhang Hu","Changyi Liu","Tianke Zhang","Zhenyu Yang","Zhixiang Ding","Shengsheng Qian","Meng Du","Ruiwen Kang","Kaiyu Tang","Fan Yang","Tingting Gao","Di Zhang","Hai-Tao Zheng","Bin Wen"],"pdf_url":"https://arxiv.org/pdf/2411.04799v2.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.07763v1","updated":"2024-11-12T12:52:17Z","published":"2024-11-12T12:52:17Z","title":"Spider 2.0: Evaluating Language Models on Real-World Enterprise\n Text-to-SQL Workflows","summary":" Real-world enterprise text-to-SQL workflows often involve complex cloud or\nlocal data across various database systems, multiple SQL queries in various\ndialects, and diverse operations from data transformation to analytics. We\nintroduce Spider 2.0, an evaluation framework comprising 632 real-world\ntext-to-SQL workflow problems derived from enterprise-level database use cases.\nThe databases in Spider 2.0 are sourced from real data applications, often\ncontaining over 1,000 columns and stored in local or cloud database systems\nsuch as BigQuery and Snowflake. We show that solving problems in Spider 2.0\nfrequently requires understanding and searching through database metadata,\ndialect documentation, and even project-level codebases. This challenge calls\nfor models to interact with complex SQL workflow environments, process\nextremely long contexts, perform intricate reasoning, and generate multiple SQL\nqueries with diverse operations, often exceeding 100 lines, which goes far\nbeyond traditional text-to-SQL challenges. Our evaluations indicate that based\non o1-preview, our code agent framework successfully solves only 17.0% of the\ntasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. 
Our results on\nSpider 2.0 show that while language models have demonstrated remarkable\nperformance in code generation -- especially in prior text-to-SQL benchmarks --\nthey require significant improvement in order to achieve adequate performance\nfor real-world enterprise usage. Progress on Spider 2.0 represents crucial\nsteps towards developing intelligent, autonomous, code agents for real-world\nenterprise settings. Our code, baseline models, and data are available at\nhttps://spider2-sql.github.io.\n","authors":["Fangyu Lei","Jixuan Chen","Yuxiao Ye","Ruisheng Cao","Dongchan Shin","Hongjin Su","Zhaoqing Suo","Hongcheng Gao","Wenjing Hu","Pengcheng Yin","Victor Zhong","Caiming Xiong","Ruoxi Sun","Qian Liu","Sida Wang","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2411.07763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00722v2","updated":"2024-11-12T11:49:33Z","published":"2024-04-26T11:57:21Z","title":"LLMs for Generating and Evaluating Counterfactuals: A Comprehensive\n Study","summary":" As NLP models become more complex, understanding their decisions becomes more\ncrucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's\nprediction, offer a way to explain these models. While Large Language Models\n(LLMs) have shown remarkable performance in NLP tasks, their efficacy in\ngenerating high-quality CFs remains uncertain. This work fills this gap by\ninvestigating how well LLMs generate CFs for two NLU tasks. We conduct a\ncomprehensive comparison of several common LLMs, and evaluate their CFs,\nassessing both intrinsic metrics, and the impact of these CFs on data\naugmentation. Moreover, we analyze differences between human and LLM-generated\nCFs, providing insights for future research directions. 
Our results show that\nLLMs generate fluent CFs, but struggle to keep the induced changes minimal.\nGenerating CFs for Sentiment Analysis (SA) is less challenging than NLI where\nLLMs show weaknesses in generating CFs that flip the original label. This also\nreflects on the data augmentation performance, where we observe a large gap\nbetween augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs'\nability to assess CFs in a mislabelled data setting, and show that they have a\nstrong bias towards agreeing with the provided labels. GPT4 is more robust\nagainst this bias and its scores correlate well with automatic metrics. Our\nfindings reveal several limitations and point to potential future work\ndirections.\n","authors":["Van Bach Nguyen","Paul Youssef","Christin Seifert","Jörg Schlötterer"],"pdf_url":"https://arxiv.org/pdf/2405.00722v2.pdf","comment":"Accepted to EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2405.07863v3","updated":"2024-11-12T11:18:43Z","published":"2024-05-13T15:50:39Z","title":"RLHF Workflow: From Reward Modeling to Online RLHF","summary":" We present the workflow of Online Iterative Reinforcement Learning from Human\nFeedback (RLHF) in this technical report, which is widely reported to\noutperform its offline counterpart by a large margin in the recent large\nlanguage model (LLM) literature. However, existing open-source RLHF projects\nare still largely confined to the offline learning setting. In this technical\nreport, we aim to fill in this gap and provide a detailed recipe that is easy\nto reproduce for online iterative RLHF. In particular, since online human\nfeedback is usually infeasible for open-source communities with limited\nresources, we start by constructing preference models using a diverse set of\nopen-source datasets and use the constructed proxy preference model to\napproximate human feedback. 
Then, we discuss the theoretical insights and\nalgorithmic principles behind online iterative RLHF, followed by a detailed\npractical implementation. Our trained LLM achieves impressive performance on\nLLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as\nwell as other academic benchmarks such as HumanEval and TruthfulQA. We have\nshown that supervised fine-tuning (SFT) and iterative RLHF can obtain\nstate-of-the-art performance with fully open-source datasets. Further, we have\nmade our models, curated datasets, and comprehensive step-by-step code\nguidebooks publicly available. Please refer to\nhttps://github.com/RLHFlow/RLHF-Reward-Modeling and\nhttps://github.com/RLHFlow/Online-RLHF for more detailed information.\n","authors":["Hanze Dong","Wei Xiong","Bo Pang","Haoxiang Wang","Han Zhao","Yingbo Zhou","Nan Jiang","Doyen Sahoo","Caiming Xiong","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.07863v3.pdf","comment":"Published in Transactions on Machine Learning Research (09/2024)"},{"id":"http://arxiv.org/abs/2407.14192v2","updated":"2024-11-12T11:09:35Z","published":"2024-07-19T10:40:10Z","title":"LeKUBE: A Legal Knowledge Update BEnchmark","summary":" Recent advances in Large Language Models (LLMs) have significantly shaped the\napplications of AI in multiple fields, including the studies of legal\nintelligence. Trained on extensive legal texts, including statutes and legal\ndocuments, the legal LLMs can capture important legal knowledge/concepts\neffectively and provide important support for downstream legal applications\nsuch as legal consultancy. Yet, the dynamic nature of legal statutes and\ninterpretations also poses new challenges to the use of LLMs in legal\napplications. Particularly, how to update the legal knowledge of LLMs\neffectively and efficiently has become an important research problem in\npractice. 
Existing benchmarks for evaluating knowledge update methods are\nmostly designed for the open domain and cannot address the specific challenges\nof the legal domain, such as the nuanced application of new legal knowledge,\nthe complexity and lengthiness of legal regulations, and the intricate nature\nof legal reasoning. To address this gap, we introduce the Legal Knowledge\nUpdate BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for\nlegal LLMs across five dimensions. Specifically, we categorize the needs of\nknowledge updates in the legal domain with the help of legal professionals, and\nthen hire annotators from law schools to create synthetic updates to the\nChinese Criminal and Civil Code as well as sets of questions of which the\nanswers would change after the updates. Through a comprehensive evaluation of\nstate-of-the-art knowledge update methods, we reveal a notable gap between\nexisting knowledge update methods and the unique needs of the legal domain,\nemphasizing the need for further research and development of knowledge update\nmechanisms tailored for legal LLMs.\n","authors":["Changyue Wang","Weihang Su","Hu Yiran","Qingyao Ai","Yueyue Wu","Cheng Luo","Yiqun Liu","Min Zhang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2407.14192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12036v2","updated":"2024-11-12T10:12:49Z","published":"2024-07-01T05:37:17Z","title":"Exploring Advanced Large Language Models with LLMsuite","summary":" This tutorial explores the advancements and challenges in the development of\nLarge Language Models (LLMs) such as ChatGPT and Gemini. It addresses inherent\nlimitations like temporal knowledge cutoffs, mathematical inaccuracies, and the\ngeneration of incorrect information, proposing solutions like Retrieval\nAugmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks\nsuch as ReAct and LangChain. 
The integration of these techniques enhances LLM\nperformance and reliability, especially in multi-step reasoning and complex\ntask execution. The paper also covers fine-tuning strategies, including\ninstruction fine-tuning, parameter-efficient methods like LoRA, and\nReinforcement Learning from Human Feedback (RLHF) as well as Reinforced\nSelf-Training (ReST). Additionally, it provides a comprehensive survey of\ntransformer architectures and training techniques for LLMs. The source code can\nbe accessed by contacting the author via email for a request.\n","authors":["Giorgio Roffo"],"pdf_url":"https://arxiv.org/pdf/2407.12036v2.pdf","comment":"Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison,\n LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset\n Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing\n Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite,\n Comprehensive LLM Evaluation Toolkit"},{"id":"http://arxiv.org/abs/2406.16620v3","updated":"2024-11-12T10:02:12Z","published":"2024-06-24T13:05:39Z","title":"OmAgent: A Multi-modal Agent Framework for Complex Video Understanding\n with Task Divide-and-Conquer","summary":" Recent advancements in Large Language Models (LLMs) have expanded their\ncapabilities to multimodal contexts, including comprehensive video\nunderstanding. However, processing extensive videos such as 24-hour CCTV\nfootage or full-length films presents significant challenges due to the vast\ndata and processing demands. Traditional methods, like extracting key frames or\nconverting frames to text, often result in substantial information loss. To\naddress these shortcomings, we develop OmAgent, efficiently stores and\nretrieves relevant video frames for specific queries, preserving the detailed\ncontent of videos. 
Additionally, it features an Divide-and-Conquer Loop capable\nof autonomous reasoning, dynamically invoking APIs and tools to enhance query\nprocessing and accuracy. This approach ensures robust video understanding,\nsignificantly reducing information loss. Experimental results affirm OmAgent's\nefficacy in handling various types of videos and complex tasks. Moreover, we\nhave endowed it with greater autonomy and a robust tool-calling system,\nenabling it to accomplish even more intricate tasks.\n","authors":["Lu Zhang","Tiancheng Zhao","Heting Ying","Yibo Ma","Kyusong Lee"],"pdf_url":"https://arxiv.org/pdf/2406.16620v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07656v1","updated":"2024-11-12T09:14:16Z","published":"2024-11-12T09:14:16Z","title":"Mitigating Bias in Queer Representation within Large Language Models: A\n Collaborative Agent Approach","summary":" Large Language Models (LLMs) often perpetuate biases in pronoun usage,\nleading to misrepresentation or exclusion of queer individuals. This paper\naddresses the specific problem of biased pronoun usage in LLM outputs,\nparticularly the inappropriate use of traditionally gendered pronouns (\"he,\"\n\"she\") when inclusive language is needed to accurately represent all\nidentities. We introduce a collaborative agent pipeline designed to mitigate\nthese biases by analyzing and optimizing pronoun usage for inclusivity. Our\nmulti-agent framework includes specialized agents for both bias detection and\ncorrection. Experimental evaluations using the Tango dataset-a benchmark\nfocused on gender pronoun usage-demonstrate that our approach significantly\nimproves inclusive pronoun classification, achieving a 32.6 percentage point\nincrease over GPT-4o in correctly disagreeing with inappropriate traditionally\ngendered pronouns $(\\chi^2 = 38.57, p < 0.0001)$. 
These results accentuate the\npotential of agent-driven frameworks in enhancing fairness and inclusivity in\nAI-generated content, demonstrating their efficacy in reducing biases and\npromoting socially responsible AI.\n","authors":["Tianyi Huang","Arya Somasundaram"],"pdf_url":"https://arxiv.org/pdf/2411.07656v1.pdf","comment":"NeurIPS 2024 Queer in AI Workshop"},{"id":"http://arxiv.org/abs/2409.18412v3","updated":"2024-11-12T09:11:37Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. 
To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v3.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS\n 2024 Workshop FM4Science"},{"id":"http://arxiv.org/abs/2404.12866v2","updated":"2024-11-12T08:59:30Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. 
This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2311.10944v5","updated":"2024-11-12T08:31:22Z","published":"2023-11-18T02:44:33Z","title":"Deception Detection from Linguistic and Physiological Data Streams Using\n Bimodal Convolutional Neural Networks","summary":" Deception detection is gaining increasing interest due to ethical and\nsecurity concerns. This paper explores the application of convolutional neural\nnetworks for the purpose of multimodal deception detection. We use a dataset\nbuilt by interviewing 104 subjects about two topics, with one truthful and one\nfalsified response from each subject about each topic. In particular, we make\nthree main contributions. First, we extract linguistic and physiological\nfeatures from this data to train and construct the neural network models.\nSecond, we propose a fused convolutional neural network model using both\nmodalities in order to achieve an improved overall performance. Third, we\ncompare our new approach with earlier methods designed for multimodal deception\ndetection. 
We find that our system outperforms regular classification methods;\nour results indicate the feasibility of using neural networks for deception\ndetection even in the presence of limited amounts of data.\n","authors":["Panfeng Li","Mohamed Abouelenien","Rada Mihalcea","Zhicheng Ding","Qikai Yang","Yiming Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10944v5.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2405.06219v3","updated":"2024-11-12T08:18:45Z","published":"2024-05-10T03:06:24Z","title":"SKVQ: Sliding-window Key and Value Cache Quantization for Large Language\n Models","summary":" Large language models (LLMs) can now handle longer sequences of tokens,\nenabling complex tasks like book understanding and generating lengthy novels.\nHowever, the key-value (KV) cache required for LLMs consumes substantial memory\nas context length increasing, becoming the bottleneck for deployment. In this\npaper, we present a strategy called SKVQ, which stands for sliding-window KV\ncache quantization, to address the issue of extremely low bitwidth KV cache\nquantization. To achieve this, SKVQ rearranges the channels of the KV cache in\norder to improve the similarity of channels in quantization groups, and applies\nclipped dynamic quantization at the group level. Additionally, SKVQ ensures\nthat the most recent window tokens in the KV cache are preserved with high\nprecision. This helps maintain the accuracy of a small but important portion of\nthe KV cache.SKVQ achieves high compression ratios while maintaining accuracy.\nOur evaluation on LLMs demonstrates that SKVQ surpasses previous quantization\napproaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit\nvalues with minimal loss of accuracy. 
With SKVQ, it is possible to process\ncontext lengths of up to 1M on an 80GB memory GPU for a 7b model and up to 7\ntimes faster decoding.\n","authors":["Haojie Duanmu","Zhihang Yuan","Xiuhong Li","Jiangfei Duan","Xingcheng Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2405.06219v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07623v1","updated":"2024-11-12T08:10:54Z","published":"2024-11-12T08:10:54Z","title":"Annotating Constructions with UD: the experience of the Italian\n Constructicon","summary":" The paper descirbes a first attempt of linking the Italian constructicon to\nUD resources\n","authors":["Ludovica Pannitto","Beatrice Bernasconi","Lucia Busso","Flavio Pisciotta","Giulia Rambelli","Francesca Masini"],"pdf_url":"https://arxiv.org/pdf/2411.07623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07618v1","updated":"2024-11-12T07:54:13Z","published":"2024-11-12T07:54:13Z","title":"Direct Preference Optimization Using Sparse Feature-Level Constraints","summary":" The alignment of large language models (LLMs) with human preferences remains\na key challenge. While post-training techniques like Reinforcement Learning\nfrom Human Feedback (RLHF) and Direct Preference Optimization (DPO) have\nachieved notable success, they often introduce computational inefficiencies and\ntraining instability. In this paper, we propose Feature-level constrained\nPreference Optimization (FPO), a novel method designed to simplify the\nalignment process while ensuring stability. FPO leverages pre-trained Sparse\nAutoencoders (SAEs) and introduces feature-level constraints, allowing for\nefficient, sparsity-enforced alignment. 
Our approach enjoys efficiency by using\nsparse features activated in a well-trained sparse autoencoder and the quality\nof sequential KL divergence by using the feature-level offline reference.\nExperimental results on benchmark datasets demonstrate that FPO achieves a\n5.08% absolute improvement in win rate with much lower computational cost\ncompared to state-of-the-art baselines, making it a promising solution for\nefficient and controllable LLM alignments.\n","authors":["Qingyu Yin","Chak Tou Leong","Hongbo Zhang","Minjun Zhu","Hanqi Yan","Qiang Zhang","Yulan He","Wenjie Li","Jun Wang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06634v2","updated":"2024-11-12T07:52:33Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. 
We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.07611v1","updated":"2024-11-12T07:34:56Z","published":"2024-11-12T07:34:56Z","title":"Multimodal Clinical Reasoning through Knowledge-augmented Rationale\n Generation","summary":" Clinical rationales play a pivotal role in accurate disease diagnosis;\nhowever, many models predominantly use discriminative methods and overlook the\nimportance of generating supportive rationales. Rationale distillation is a\nprocess that transfers knowledge from large language models (LLMs) to smaller\nlanguage models (SLMs), thereby enhancing the latter's ability to break down\ncomplex tasks. Despite its benefits, rationale distillation alone is inadequate\nfor addressing domain knowledge limitations in tasks requiring specialized\nexpertise, such as disease diagnosis. Effectively embedding domain knowledge in\nSLMs poses a significant challenge. While current LLMs are primarily geared\ntoward processing textual data, multimodal LLMs that incorporate time series\ndata, especially electronic health records (EHRs), are still evolving. 
To\ntackle these limitations, we introduce ClinRaGen, an SLM optimized for\nmultimodal rationale generation in disease diagnosis. ClinRaGen incorporates a\nunique knowledge-augmented attention mechanism to merge domain knowledge with\ntime series EHR data, utilizing a stepwise rationale distillation strategy to\nproduce both textual and time series-based clinical rationales. Our evaluations\nshow that ClinRaGen markedly improves the SLM's capability to interpret\nmultimodal EHR data and generate accurate clinical rationales, supporting more\nreliable disease diagnosis, advancing LLM applications in healthcare, and\nnarrowing the performance divide between LLMs and SLMs.\n","authors":["Shuai Niu","Jing Ma","Liang Bai","Zhihua Wang","Yida Xu","Yunya Song","Xian Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07611v1.pdf","comment":"11 pages. 4 figures"},{"id":"http://arxiv.org/abs/2411.07602v1","updated":"2024-11-12T07:24:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ntighter circuit complexity bound for Transformers with $\\mathsf{RoPE}$\nattention. 
Our key contribution is that we show that unless $\\mathsf{TC}^0 =\n\\mathsf{NC}^1$, a $\\mathsf{RoPE}$-based Transformer with\n$\\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \\leq O(n)$\ncannot solve the arithmetic problem or the Boolean formula value problem. This\nresult significantly demonstrates the fundamental limitation of the\nexpressivity of the $\\mathsf{RoPE}$-based Transformer architecture, although it\nachieves giant empirical success. Our theoretical framework not only\nestablishes tighter complexity bounds but also may instruct further work on the\n$\\mathsf{RoPE}$-based Transformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12196v2","updated":"2024-11-12T07:22:21Z","published":"2024-07-16T21:43:47Z","title":"MASIVE: Open-Ended Affective State Identification in English and Spanish","summary":" In the field of emotion analysis, much NLP research focuses on identifying a\nlimited number of discrete emotion categories, often applied across languages.\nThese basic sets, however, are rarely designed with textual data in mind, and\nculture, language, and dialect can influence how particular emotions are\ninterpreted. In this work, we broaden our scope to a practically unbounded set\nof \\textit{affective states}, which includes any terms that humans use to\ndescribe their experiences of feeling. We collect and publish MASIVE, a dataset\nof Reddit posts in English and Spanish containing over 1,000 unique affective\nstates each. We then define the new problem of \\textit{affective state\nidentification} for language generation models framed as a masked span\nprediction task. On this task, we find that smaller finetuned multilingual\nmodels outperform much larger LLMs, even on region-specific Spanish affective\nstates. 
Additionally, we show that pretraining on MASIVE improves model\nperformance on existing emotion benchmarks. Finally, through machine\ntranslation experiments, we find that native speaker-written data is vital to\ngood performance on this task.\n","authors":["Nicholas Deas","Elsbeth Turcan","Iván Pérez Mejía","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2407.12196v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2404.13565v3","updated":"2024-11-12T07:21:04Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. 
This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v3.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.07598v1","updated":"2024-11-12T07:16:51Z","published":"2024-11-12T07:16:51Z","title":"Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring\n Conversations","summary":" Many open-ended conversations (e.g., tutoring lessons or business meetings)\nrevolve around pre-defined reference materials, like worksheets or meeting\nbullets. To provide a framework for studying such conversation structure, we\nintroduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly\nbreaking down conversations into segments and linking each segment to the\nrelevant reference item. As a case study, we apply POSR to education where\neffectively structuring lessons around problems is critical yet difficult. We\npresent LessonLink, the first dataset of real-world tutoring lessons, featuring\n3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT\nmath problems. We define and evaluate several joint and independent approaches\nfor POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT),\nand large language models (LLMs) methods. Our results highlight that modeling\nPOSR as one joint task is essential: POSR methods outperform independent\nsegmentation and retrieval pipelines by up to +76% on joint metrics and surpass\ntraditional segmentation methods by up to +78% on segmentation metrics. 
We\ndemonstrate POSR's practical impact on downstream education applications,\nderiving new insights on the language and time use in real-world lesson\nstructures.\n","authors":["Rose E. Wang","Pawan Wirawarn","Kenny Lam","Omar Khattab","Dorottya Demszky"],"pdf_url":"https://arxiv.org/pdf/2411.07598v1.pdf","comment":"EMNLP 2024 Findings. Our code and dataset are open-sourced at\n https://github.com/rosewang2008/posr"},{"id":"http://arxiv.org/abs/2411.07595v1","updated":"2024-11-12T07:09:44Z","published":"2024-11-12T07:09:44Z","title":"Entropy Controllable Direct Preference Optimization","summary":" In the post-training of large language models (LLMs), Reinforcement Learning\nfrom Human Feedback (RLHF) is an effective approach to achieve generation\naligned with human preferences. Direct Preference Optimization (DPO) allows for\npolicy training with a simple binary cross-entropy loss without a reward model.\nThe objective of DPO is regularized by reverse KL divergence that encourages\nmode-seeking fitting to the reference policy. Nonetheless, we indicate that\nminimizing reverse KL divergence could fail to capture a mode of the reference\ndistribution, which may hurt the policy's performance. Based on this\nobservation, we propose a simple modification to DPO, H-DPO, which allows for\ncontrol over the entropy of the resulting policy, enhancing the distribution's\nsharpness and thereby enabling mode-seeking fitting more effectively. 
In our\nexperiments, we show that H-DPO outperformed DPO across various tasks,\ndemonstrating superior results in pass@$k$ evaluations for mathematical tasks.\nMoreover, H-DPO is simple to implement, requiring only minor modifications to\nthe loss calculation of DPO, which makes it highly practical and promising for\nwide-ranging applications in the training of LLMs.\n","authors":["Motoki Omura","Yasuhiro Fujita","Toshiki Kataoka"],"pdf_url":"https://arxiv.org/pdf/2411.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09059v2","updated":"2024-11-12T06:15:50Z","published":"2024-03-14T02:56:38Z","title":"LAMP: A Language Model on the Map","summary":" Large Language Models (LLMs) are poised to play an increasingly important\nrole in our lives, providing assistance across a wide array of tasks. In the\ngeospatial domain, LLMs have demonstrated the ability to answer generic\nquestions, such as identifying a country's capital; nonetheless, their utility\nis hindered when it comes to answering fine-grained questions about specific\nplaces, such as grocery stores or restaurants, which constitute essential\naspects of people's everyday lives. This is mainly because the places in our\ncities haven't been systematically fed into LLMs, so as to understand and\nmemorize them. This study introduces a novel framework for fine-tuning a\npre-trained model on city-specific data, to enable it to provide accurate\nrecommendations, while minimizing hallucinations. We share our model, LAMP, and\nthe data used to train it. We conduct experiments to analyze its ability to\ncorrectly retrieving spatial objects, and compare it to well-known open- and\nclosed- source language models, such as GPT-4. 
Finally, we explore its emerging\ncapabilities through a case study on day planning.\n","authors":["Pasquale Balsebre","Weiming Huang","Gao Cong"],"pdf_url":"https://arxiv.org/pdf/2403.09059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05990v2","updated":"2024-11-12T05:46:46Z","published":"2024-11-08T22:02:22Z","title":"Game-theoretic LLM: Agent Workflow for Negotiation Games","summary":" This paper investigates the rationality of large language models (LLMs) in\nstrategic decision-making contexts, specifically within the framework of game\ntheory. We evaluate several state-of-the-art LLMs across a spectrum of\ncomplete-information and incomplete-information games. Our findings reveal that\nLLMs frequently deviate from rational strategies, particularly as the\ncomplexity of the game increases with larger payoff matrices or deeper\nsequential trees.\n To address these limitations, we design multiple game-theoretic workflows\nthat guide the reasoning and decision-making processes of LLMs. These workflows\naim to enhance the models' ability to compute Nash Equilibria and make rational\nchoices, even under conditions of uncertainty and incomplete information.\nExperimental results demonstrate that the adoption of these workflows\nsignificantly improves the rationality and robustness of LLMs in game-theoretic\ntasks. Specifically, with the workflow, LLMs exhibit marked improvements in\nidentifying optimal strategies, achieving near-optimal allocations in\nnegotiation scenarios, and reducing susceptibility to exploitation during\nnegotiations. 
Furthermore, we explore the meta-strategic considerations of\nwhether it is rational for agents to adopt such workflows, recognizing that the\ndecision to use or forgo the workflow constitutes a game-theoretic issue in\nitself.\n Our research contributes to a deeper understanding of LLMs' decision-making\ncapabilities in strategic contexts and provides insights into enhancing their\nrationality through structured workflows. The findings have implications for\nthe development of more robust and strategically sound AI agents capable of\nnavigating complex interactive environments. Code and data supporting this\nstudy are available at \\url{https://github.com/Wenyueh/game_theory}.\n","authors":["Wenyue Hua","Ollie Liu","Lingyao Li","Alfonso Amayuelas","Julie Chen","Lucas Jiang","Mingyu Jin","Lizhou Fan","Fei Sun","William Wang","Xintong Wang","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.05990v2.pdf","comment":"45 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.11856v2","updated":"2024-11-12T05:37:15Z","published":"2024-08-15T19:13:38Z","title":"Dynamic Adaptive Optimization for Effective Sentiment Analysis\n Fine-Tuning on Large Language Models","summary":" Sentiment analysis plays a crucial role in various domains, such as business\nintelligence and financial forecasting. Large language models (LLMs) have\nbecome a popular paradigm for sentiment analysis, leveraging multi-task\nlearning to address specific tasks concurrently. However, LLMs with fine-tuning\nfor sentiment analysis often underperforms due to the inherent challenges in\nmanaging diverse task complexities. Moreover, constant-weight approaches in\nmulti-task learning struggle to adapt to variations in data characteristics,\nfurther complicating model effectiveness. To address these issues, we propose a\nnovel multi-task learning framework with a dynamic adaptive optimization (DAO)\nmodule. 
This module is designed as a plug-and-play component that can be\nseamlessly integrated into existing models, providing an effective and flexible\nsolution for multi-task learning. The key component of the DAO module is\ndynamic adaptive loss, which dynamically adjusts the weights assigned to\ndifferent tasks based on their relative importance and data characteristics\nduring training. Sentiment analyses on a standard and customized financial text\ndataset demonstrate that the proposed framework achieves superior performance.\nSpecifically, this work improves the Mean Squared Error (MSE) and Accuracy\n(ACC) by 15.58% and 1.24% respectively, compared with previous work.\n","authors":["Hongcheng Ding","Xuanze Zhao","Shamsul Nahar Abdullah","Deshinta Arrova Dewi","Zixiao Jiang","Xiangyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.11856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10839v3","updated":"2024-11-12T05:33:05Z","published":"2024-06-16T08:20:12Z","title":"Reminding Multimodal Large Language Models of Object-aware Knowledge\n with Retrieved Tags","summary":" Despite recent advances in the general visual instruction-following ability\nof Multimodal Large Language Models (MLLMs), they still struggle with critical\nproblems when required to provide a precise and detailed response to a visual\ninstruction: (1) failure to identify novel objects or entities, (2) mention of\nnon-existent objects, and (3) neglect of object's attributed details. Intuitive\nsolutions include improving the size and quality of data or using larger\nfoundation models. They show effectiveness in mitigating these issues, but at\nan expensive cost of collecting a vast amount of new data and introducing a\nsignificantly larger model. Standing at the intersection of these approaches,\nwe examine the three object-oriented problems from the perspective of the\nimage-to-text mapping process by the multimodal connector. 
In this paper, we\nfirst identify the limitations of multimodal connectors stemming from\ninsufficient training data. Driven by this, we propose to enhance the mapping\nwith retrieval-augmented tag tokens, which contain rich object-aware\ninformation such as object names and attributes. With our Tag-grounded visual\ninstruction tuning with retrieval Augmentation (TUNA), we outperform baselines\nthat share the same language model and training data on 12 benchmarks.\nFurthermore, we show the zero-shot capability of TUNA when provided with\nspecific datastores.\n","authors":["Daiqing Qi","Handong Zhao","Zijun Wei","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2406.10839v3.pdf","comment":"Main Conference at EMNLP 2024"},{"id":"http://arxiv.org/abs/2401.12585v6","updated":"2024-11-12T05:09:34Z","published":"2024-01-23T09:33:31Z","title":"SLANG: New Concept Comprehension of Large Language Models","summary":" The dynamic nature of language, particularly evident in the realm of slang\nand memes on the Internet, poses serious challenges to the adaptability of\nlarge language models (LLMs). Traditionally anchored to static datasets, these\nmodels often struggle to keep up with the rapid linguistic evolution\ncharacteristic of online communities. This research aims to bridge this gap by\nenhancing LLMs' comprehension of the evolving new concepts on the Internet,\nwithout the high cost of continual retraining. In pursuit of this goal, we\nintroduce $\\textbf{SLANG}$, a benchmark designed to autonomously integrate\nnovel data and assess LLMs' ability to comprehend emerging concepts, alongside\n$\\textbf{FOCUS}$, an approach uses causal inference to enhance LLMs to\nunderstand new phrases and their colloquial context. Our benchmark and approach\ninvolves understanding real-world instances of linguistic shifts, serving as\ncontextual beacons, to form more precise and contextually relevant connections\nbetween newly emerging expressions and their meanings. 
The empirical analysis\nshows that our causal inference-based approach outperforms the baseline methods\nin terms of precision and relevance in the comprehension of Internet slang and\nmemes.\n","authors":["Lingrui Mei","Shenghua Liu","Yiwei Wang","Baolong Bi","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.12585v6.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2411.07546v1","updated":"2024-11-12T04:50:10Z","published":"2024-11-12T04:50:10Z","title":"Contrastive Language Prompting to Ease False Positives in Medical\n Anomaly Detection","summary":" A pre-trained visual-language model, contrastive language-image pre-training\n(CLIP), successfully accomplishes various downstream tasks with text prompts,\nsuch as finding images or localizing regions within the image. Despite CLIP's\nstrong multi-modal data capabilities, it remains limited in specialized\nenvironments, such as medical applications. For this purpose, many CLIP\nvariants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives\nrelated to normal regions persist. Thus, we aim to present a simple yet\nimportant goal of reducing false positives in medical anomaly detection. We\nintroduce a Contrastive LAnguage Prompting (CLAP) method that leverages both\npositive and negative text prompts. This straightforward approach identifies\npotential lesion regions by visual attention to the positive prompts in the\ngiven image. To reduce false positives, we attenuate attention on normal\nregions using negative prompts. Extensive experiments with the BMAD dataset,\nincluding six biomedical benchmarks, demonstrate that CLAP method enhances\nanomaly detection performance. 
Our future plans include developing an automated\nfine prompting method for more practical usage.\n","authors":["YeongHyeon Park","Myung Jin Kim","Hyeong Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2411.07546v1.pdf","comment":"4 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.01523v3","updated":"2024-11-12T04:37:44Z","published":"2024-07-01T17:59:26Z","title":"MMLongBench-Doc: Benchmarking Long-context Document Understanding with\n Visualizations","summary":" Understanding documents with rich layouts and multi-modal components is a\nlong-standing and practical task. Recent Large Vision-Language Models (LVLMs)\nhave made remarkable strides in various tasks, particularly in single-page\ndocument understanding (DU). However, their abilities on long-context DU remain\nan open problem. This work presents MMLongBench-Doc, a long-context,\nmulti-modal benchmark comprising 1,062 expert-annotated questions. Distinct\nfrom previous datasets, it is constructed upon 130 lengthy PDF-formatted\ndocuments with an average of 49.4 pages and 20,971 textual tokens. Towards\ncomprehensive evaluation, answers to these questions rely on pieces of evidence\nfrom (1) different sources (text, image, chart, table, and layout structure)\nand (2) various locations (i.e. page number). Moreover, 33.2% of the questions\nare cross-page questions requiring evidence across multiple pages. 22.8% of the\nquestions are designed to be unanswerable for detecting potential\nhallucinations. Experiments on 14 LVLMs demonstrate that long-context DU\ngreatly challenges current models. Notably, the best-performing model, GPT-4o,\nachieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores\n31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse\nperformance than their LLM counterparts which are fed with lossy-parsed OCR\ndocuments. These results validate the necessity of future research toward more\ncapable long-context LVLMs. 
Project Page:\nhttps://mayubo2333.github.io/MMLongBench-Doc\n","authors":["Yubo Ma","Yuhang Zang","Liangyu Chen","Meiqi Chen","Yizhu Jiao","Xinze Li","Xinyuan Lu","Ziyu Liu","Yan Ma","Xiaoyi Dong","Pan Zhang","Liangming Pan","Yu-Gang Jiang","Jiaqi Wang","Yixin Cao","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01523v3.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight)"},{"id":"http://arxiv.org/abs/2410.09982v3","updated":"2024-11-12T04:20:00Z","published":"2024-10-13T19:53:40Z","title":"Self-Data Distillation for Recovering Quality in Pruned Large Language\n Models","summary":" Large language models have driven significant progress in natural language\nprocessing, but their deployment requires substantial compute and memory\nresources. As models scale, compression techniques become essential for\nbalancing model quality with computational efficiency. Structured pruning,\nwhich removes less critical components of the model, is a promising strategy\nfor reducing complexity. However, one-shot pruning often results in significant\nquality degradation, particularly in tasks requiring multi-step reasoning. To\nrecover lost quality, supervised fine-tuning (SFT) is commonly applied, but it\ncan lead to catastrophic forgetting by shifting the model's learned data\ndistribution. Therefore, addressing the degradation from both pruning and SFT\nis essential to preserve the original model's quality. In this work, we utilize\nself-data distilled fine-tuning to address these challenges. Our approach\nleverages the original, unpruned model to generate a distilled dataset that\npreserves semantic richness and mitigates catastrophic forgetting by\nmaintaining alignment with the base model's knowledge. Empirically, we\ndemonstrate that self-data distillation consistently outperforms standard SFT,\nimproving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard\nv1. 
Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct\n(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B\nparameters), our method retains 91.2% of the original model's accuracy compared\nto 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore,\ncombining self-data distilled models through model merging yields enhanced\nquality retention. Additionally, leveraging these pruned models in speculative\ndecoding increases token acceptance rates, thereby improving inference\nefficiency in applied settings.\n","authors":["Vithursan Thangarasa","Ganesh Venkatesh","Mike Lasby","Nish Sinnadurai","Sean Lie"],"pdf_url":"https://arxiv.org/pdf/2410.09982v3.pdf","comment":"13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary\n Material)"},{"id":"http://arxiv.org/abs/2411.07533v1","updated":"2024-11-12T04:16:44Z","published":"2024-11-12T04:16:44Z","title":"Large Language Models as Neurolinguistic Subjects: Identifying Internal\n Representations for Form and Meaning","summary":" This study investigates the linguistic understanding of Large Language Models\n(LLMs) regarding signifier (form) and signified (meaning) by distinguishing two\nLLM evaluation paradigms: psycholinguistic and neurolinguistic. Traditional\npsycholinguistic evaluations often reflect statistical biases that may\nmisrepresent LLMs' true linguistic capabilities. We introduce a neurolinguistic\napproach, utilizing a novel method that combines minimal pair and diagnostic\nprobing to analyze activation patterns across model layers. This method allows\nfor a detailed examination of how LLMs represent form and meaning, and whether\nthese representations are consistent across languages. 
Our contributions are\nthree-fold: (1) We compare neurolinguistic and psycholinguistic methods,\nrevealing distinct patterns in LLM assessment; (2) We demonstrate that LLMs\nexhibit higher competence in form compared to meaning, with the latter largely\ncorrelated to the former; (3) We present new conceptual minimal pair datasets\nfor Chinese (COMPS-ZH) and German (COMPS-DE), complementing existing English\ndatasets.\n","authors":["Linyang He","Ercong Nie","Helmut Schmid","Hinrich Schütze","Nima Mesgarani","Jonathan Brennan"],"pdf_url":"https://arxiv.org/pdf/2411.07533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07070v2","updated":"2024-11-12T04:12:32Z","published":"2024-11-11T15:46:07Z","title":"On Active Privacy Auditing in Supervised Fine-tuning for White-Box\n Language Models","summary":" The pretraining and fine-tuning approach has become the leading technique for\nvarious NLP applications. However, recent studies reveal that fine-tuning data,\ndue to their sensitive nature, domain-specific characteristics, and\nidentifiability, pose significant privacy concerns. To help develop more\nprivacy-resilient fine-tuning models, we introduce a novel active privacy\nauditing framework, dubbed Parsing, designed to identify and quantify privacy\nleakage risks during the supervised fine-tuning (SFT) of language models (LMs).\nThe framework leverages improved white-box membership inference attacks (MIAs)\nas the core technology, utilizing novel learning objectives and a two-stage\npipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the\nexposure of privacy risks. Additionally, we have improved the effectiveness of\nMIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our\nresearch aims to provide the SFT community of LMs with a reliable, ready-to-use\nprivacy auditing tool, and to offer valuable insights into safeguarding privacy\nduring the fine-tuning process. 
Experimental results confirm the framework's\nefficiency across various models and tasks, emphasizing notable privacy\nconcerns in the fine-tuning process. Project code available for\nhttps://anonymous.4open.science/r/PARSING-4817/.\n","authors":["Qian Sun","Hanpeng Wu","Xi Sheryl Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07133v2","updated":"2024-11-12T04:05:54Z","published":"2024-11-11T17:06:48Z","title":"Stronger Models are NOT Stronger Teachers for Instruction Tuning","summary":" Instruction tuning has been widely adopted to ensure large language models\n(LLMs) follow user instructions effectively. The resulting\ninstruction-following capabilities of LLMs heavily rely on the instruction\ndatasets used for tuning. Recently, synthetic instruction datasets have emerged\nas an economically viable solution to provide LLMs diverse and high-quality\ninstructions. However, existing approaches typically assume that larger or\nstronger models are stronger teachers for instruction tuning, and hence simply\nadopt these models as response generators to the synthetic instructions. In\nthis paper, we challenge this commonly-adopted assumption. Our extensive\nexperiments across five base models and twenty response generators reveal that\nlarger and stronger models are not necessarily stronger teachers of smaller\nmodels. We refer to this phenomenon as the Larger Models' Paradox. We observe\nthat existing metrics cannot precisely predict the effectiveness of response\ngenerators since they ignore the compatibility between teachers and base models\nbeing fine-tuned. We thus develop a novel metric, named as\nCompatibility-Adjusted Reward (CAR) to measure the effectiveness of response\ngenerators. 
Our experiments across five base models demonstrate that CAR\noutperforms almost all baselines.\n","authors":["Zhangchen Xu","Fengqing Jiang","Luyao Niu","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2411.07133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07528v1","updated":"2024-11-12T03:56:07Z","published":"2024-11-12T03:56:07Z","title":"SecEncoder: Logs are All You Need in Security","summary":" Large and Small Language Models (LMs) are typically pretrained using\nextensive volumes of text, which are sourced from publicly accessible platforms\nsuch as Wikipedia, Book Corpus, or through web scraping. These models, due to\ntheir exposure to a wide range of language data, exhibit impressive\ngeneralization capabilities and can perform a multitude of tasks\nsimultaneously. However, they often fall short when it comes to domain-specific\ntasks due to their broad training data. This paper introduces SecEncoder, a\nspecialized small language model that is pretrained using security logs.\nSecEncoder is designed to address the domain-specific limitations of general\nLMs by focusing on the unique language and patterns found in security logs.\nExperimental results indicate that SecEncoder outperforms other LMs, such as\nBERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002)\nmodels, which are pretrained mainly on natural language, across various tasks.\nFurthermore, although SecEncoder is primarily pretrained on log data, it\noutperforms models pretrained on natural language for a range of tasks beyond\nlog analysis, such as incident prioritization and threat intelligence document\nretrieval. This suggests that domain specific pretraining with logs can\nsignificantly enhance the performance of LMs in security. 
These findings pave\nthe way for future research into security-specific LMs and their potential\napplications.\n","authors":["Muhammed Fatih Bulut","Yingqi Liu","Naveed Ahmad","Maximilian Turner","Sami Ait Ouahmane","Cameron Andrews","Lloyd Greenwald"],"pdf_url":"https://arxiv.org/pdf/2411.07528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07527v1","updated":"2024-11-12T03:55:27Z","published":"2024-11-12T03:55:27Z","title":"Prompt-enhanced Network for Hateful Meme Classification","summary":" The dynamic expansion of social media has led to an inundation of hateful\nmemes on media platforms, accentuating the growing need for efficient\nidentification and removal. Acknowledging the constraints of conventional\nmultimodal hateful meme classification, which heavily depends on external\nknowledge and poses the risk of including irrelevant or redundant content, we\ndeveloped Pen -- a prompt-enhanced network framework based on the prompt\nlearning approach. Specifically, after constructing the sequence through the\nprompt method and encoding it with a language model, we performed region\ninformation global extraction on the encoded sequence for multi-view\nperception. By capturing global information about inference instances and\ndemonstrations, Pen facilitates category selection by fully leveraging sequence\ninformation. This approach significantly improves model classification\naccuracy. Additionally, to bolster the model's reasoning capabilities in the\nfeature space, we introduced prompt-aware contrastive learning into the\nframework to improve the quality of sample feature distributions. Through\nextensive ablation experiments on two public datasets, we evaluate the\neffectiveness of the Pen framework, concurrently comparing it with\nstate-of-the-art model baselines. Our research findings highlight that Pen\nsurpasses manual prompt methods, showcasing superior generalization and\nclassification accuracy in hateful meme classification tasks. 
Our code is\navailable at https://github.com/juszzi/Pen.\n","authors":["Junxi Liu","Yanyan Feng","Jiehai Chen","Yun Xue","Fenghuan Li"],"pdf_url":"https://arxiv.org/pdf/2411.07527v1.pdf","comment":"Published in Proceedings of the Thirty-Third International Joint\n Conference on Artificial Intelligence Main Track. Pages 6397-6405"},{"id":"http://arxiv.org/abs/2411.07521v1","updated":"2024-11-12T03:37:53Z","published":"2024-11-12T03:37:53Z","title":"Fair Summarization: Bridging Quality and Diversity in Extractive\n Summaries","summary":" Fairness in multi-document summarization of user-generated content remains a\ncritical challenge in natural language processing (NLP). Existing summarization\nmethods often fail to ensure equitable representation across different social\ngroups, leading to biased outputs. In this paper, we introduce two novel\nmethods for fair extractive summarization: FairExtract, a clustering-based\napproach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints.\nWe evaluate these methods using Divsumm summarization dataset of White-aligned,\nHispanic, and African-American dialect tweets and compare them against relevant\nbaselines. The results obtained using a comprehensive set of summarization\nquality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well\nas a fairness metric F, demonstrate that FairExtract and FairGPT achieve\nsuperior fairness while maintaining competitive summarization quality.\nAdditionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that\nintegrate quality and fairness into a single evaluation framework, offering a\nmore nuanced understanding of the trade-offs between these objectives. 
This\nwork highlights the importance of fairness in summarization and sets a\nbenchmark for future research in fairness-aware NLP models.\n","authors":["Sina Bagheri Nezhad","Sayan Bandyapadhyay","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.07521v1.pdf","comment":"Accepted at Algorithmic Fairness through the Lens of Metrics and\n Evaluation Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07516v1","updated":"2024-11-12T03:25:33Z","published":"2024-11-12T03:25:33Z","title":"SparrowVQE: Visual Question Explanation for Course Content Understanding","summary":" Visual Question Answering (VQA) research seeks to create AI systems to answer\nnatural language questions in images, yet VQA methods often yield overly\nsimplistic and short answers. This paper aims to advance the field by\nintroducing Visual Question Explanation (VQE), which enhances the ability of\nVQA to provide detailed explanations rather than brief responses and address\nthe need for more complex interaction with visual content. We first created an\nMLVQE dataset from a 14-week streamed video machine learning course, including\n885 slide images, 110,407 words of transcripts, and 9,416 designed\nquestion-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3\nbillion parameters multimodal model. We trained our model with a three-stage\ntraining mechanism consisting of multimodal pre-training (slide images and\ntranscripts feature alignment), instruction tuning (tuning the pre-trained\nmodel with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide\nimage and QA pairs). Eventually, our SparrowVQE can understand and connect\nvisual information using the SigLIP model with transcripts using the Phi-2\nlanguage model with an MLP adapter. 
Experimental results demonstrate that our\nSparrowVQE achieves better performance in our developed MLVQE dataset and\noutperforms state-of-the-art methods in the other five benchmark VQA datasets.\nThe source code is available at\n\\url{https://github.com/YoushanZhang/SparrowVQE}.\n","authors":["Jialu Li","Manish Kumar Thota","Ruslan Gokhman","Radek Holik","Youshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07494v1","updated":"2024-11-12T02:44:49Z","published":"2024-11-12T02:44:49Z","title":"Rapid Response: Mitigating LLM Jailbreaks with a Few Examples","summary":" As large language models (LLMs) grow more powerful, ensuring their safety\nagainst misuse becomes crucial. While researchers have focused on developing\nrobust defenses, no method has yet achieved complete invulnerability to\nattacks. We propose an alternative approach: instead of seeking perfect\nadversarial robustness, we develop rapid response techniques to look to block\nwhole classes of jailbreaks after observing only a handful of attacks. To study\nthis setting, we develop RapidResponseBench, a benchmark that measures a\ndefense's robustness against various jailbreak strategies after adapting to a\nfew observed examples. We evaluate five rapid response methods, all of which\nuse jailbreak proliferation, where we automatically generate additional\njailbreaks similar to the examples observed. Our strongest method, which\nfine-tunes an input classifier to block proliferated jailbreaks, reduces attack\nsuccess rate by a factor greater than 240 on an in-distribution set of\njailbreaks and a factor greater than 15 on an out-of-distribution set, having\nobserved just one example of each jailbreaking strategy. 
Moreover, further\nstudies suggest that the quality of proliferation model and number of\nproliferated examples play an key role in the effectiveness of this defense.\nOverall, our results highlight the potential of responding rapidly to novel\njailbreaks to limit LLM misuse.\n","authors":["Alwin Peng","Julian Michael","Henry Sleight","Ethan Perez","Mrinank Sharma"],"pdf_url":"https://arxiv.org/pdf/2411.07494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00774v2","updated":"2024-11-12T02:18:38Z","published":"2024-11-01T17:59:51Z","title":"Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model\n with Frozen LLM","summary":" Rapidly developing large language models (LLMs) have brought tremendous\nintelligent applications. GPT-4o's excellent duplex speech interaction ability\nhas recently brought impressive experience to users. Researchers have recently\nproposed several multi-modal LLMs in this direction that can achieve\nspeech-to-speech dialogue. This paper proposes a novel speech-text multimodal\nLLM architecture called Freeze-Omni. Our main contribution is that the speech\ninput and output modalities can be easily connected to a textual LLM while\nkeeping the LLM's parameters frozen throughout the training process. We\ndesigned 3-stage training strategies both for the modeling of speech input and\noutput, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using\ntext-speech paired data (such as ASR and TTS data) and only 60,000 multi-round\ntext Q&A data on 8 GPUs. Moreover, we can effectively ensure that the\nintelligence of the Freeze-Omni in the speech modality is at the same level\ncompared with that in the text modality of its backbone LLM, while the\nend-to-end latency of the spoken response achieves a low level. In addition, we\nalso designed a method to achieve duplex dialogue ability through multi-task\ntraining, making Freeze-Omni have a more natural style of dialogue ability\nbetween the users. 
Freeze-Omni mainly provides a possibility for researchers to\nconduct multimodal LLM under the condition of a frozen LLM, avoiding various\nimpacts caused by the catastrophic forgetting of LLM caused by fewer data and\ntraining resources.\n","authors":["Xiong Wang","Yangze Li","Chaoyou Fu","Yunhang Shen","Lei Xie","Ke Li","Xing Sun","Long Ma"],"pdf_url":"https://arxiv.org/pdf/2411.00774v2.pdf","comment":"Project Page: https://freeze-omni.github.io/"},{"id":"http://arxiv.org/abs/2409.13755v2","updated":"2024-11-12T02:01:37Z","published":"2024-09-15T10:50:51Z","title":"Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation\n Extraction in Long Sentences","summary":" Relation extraction as an important natural Language processing (NLP) task is\nto identify relations between named entities in text. Recently, graph\nconvolutional networks over dependency trees have been widely used to capture\nsyntactic features and achieved attractive performance. However, most existing\ndependency-based approaches ignore the positive influence of the words outside\nthe dependency trees, sometimes conveying rich and useful information on\nrelation extraction. In this paper, we propose a novel model, Entity-aware\nSelf-attention Contextualized GCN (ESC-GCN), which efficiently incorporates\nsyntactic structure of input sentences and semantic context of sequences. To be\nspecific, relative position self-attention obtains the overall semantic\npairwise correlation related to word position, and contextualized graph\nconvolutional networks capture rich intra-sentence dependencies between words\nby adequately pruning operations. Furthermore, entity-aware attention layer\ndynamically selects which token is more decisive to make final relation\nprediction. In this way, our proposed model not only reduces the noisy impact\nfrom dependency trees, but also obtains easily-ignored entity-related semantic\nrepresentation. 
Extensive experiments on various tasks demonstrate that our\nmodel achieves encouraging performance as compared to existing dependency-based\nand sequence-based models. Specially, our model excels in extracting relations\nbetween entities of long sentences.\n","authors":["Xin Wang","Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.13755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00625v2","updated":"2024-11-12T01:47:40Z","published":"2024-09-01T05:59:54Z","title":"Entity-Aware Biaffine Attention Model for Improved Constituent Parsing\n with Reduced Entity Violations","summary":" Constituency parsing involves analyzing a sentence by breaking it into\nsub-phrases, or constituents. While many deep neural models have achieved\nstate-of-the-art performance in this task, they often overlook the\nentity-violating issue, where an entity fails to form a complete sub-tree in\nthe resultant parsing tree. To address this, we propose an entity-aware\nbiaffine attention model for constituent parsing. This model incorporates\nentity information into the biaffine attention mechanism by using additional\nentity role vectors for potential phrases, which enhances the parsing accuracy.\nWe introduce a new metric, the Entity Violating Rate (EVR), to quantify the\nextent of entity violations in parsing results. Experiments on three popular\ndatasets-ONTONOTES, PTB, and CTB-demonstrate that our model achieves the lowest\nEVR while maintaining high precision, recall, and F1-scores comparable to\nexisting models. 
Further evaluation in downstream tasks, such as sentence\nsentiment analysis, highlights the effectiveness of our model and the validity\nof the proposed EVR metric.\n","authors":["Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.00625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07474v1","updated":"2024-11-12T01:26:41Z","published":"2024-11-12T01:26:41Z","title":"Controlled Evaluation of Syntactic Knowledge in Multilingual Language\n Models","summary":" Language models (LMs) are capable of acquiring elements of human-like\nsyntactic knowledge. Targeted syntactic evaluation tests have been employed to\nmeasure how well they form generalizations about syntactic phenomena in\nhigh-resource languages such as English. However, we still lack a thorough\nunderstanding of LMs' capacity for syntactic generalizations in low-resource\nlanguages, which are responsible for much of the diversity of syntactic\npatterns worldwide. In this study, we develop targeted syntactic evaluation\ntests for three low-resource languages (Basque, Hindi, and Swahili) and use\nthem to evaluate five families of open-access multilingual Transformer LMs. We\nfind that some syntactic tasks prove relatively easy for LMs while others\n(agreement in sentences containing indirect objects in Basque, agreement across\na prepositional phrase in Swahili) are challenging. 
We additionally uncover\nissues with publicly available Transformers, including a bias toward the\nhabitual aspect in Hindi in multilingual BERT and underperformance compared to\nsimilar-sized models in XGLM-4.5B.\n","authors":["Daria Kryvosheieva","Roger Levy"],"pdf_url":"https://arxiv.org/pdf/2411.07474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01332v3","updated":"2024-11-12T01:06:22Z","published":"2024-03-29T22:49:43Z","title":"Explaining Large Language Models Decisions Using Shapley Values","summary":" The emergence of large language models (LLMs) has opened up exciting\npossibilities for simulating human behavior and cognitive processes, with\npotential applications in various domains, including marketing research and\nconsumer behavior analysis. However, the validity of utilizing LLMs as\nstand-ins for human subjects remains uncertain due to glaring divergences that\nsuggest fundamentally different underlying processes at play and the\nsensitivity of LLM responses to prompt variations. This paper presents a novel\napproach based on Shapley values from cooperative game theory to interpret LLM\nbehavior and quantify the relative contribution of each prompt component to the\nmodel's output. Through two applications - a discrete choice experiment and an\ninvestigation of cognitive biases - we demonstrate how the Shapley value method\ncan uncover what we term \"token noise\" effects, a phenomenon where LLM\ndecisions are disproportionately influenced by tokens providing minimal\ninformative content. This phenomenon raises concerns about the robustness and\ngeneralizability of insights obtained from LLMs in the context of human\nbehavior simulation. Our model-agnostic approach extends its utility to\nproprietary LLMs, providing a valuable tool for practitioners and researchers\nto strategically optimize prompts and mitigate apparent cognitive biases. 
Our\nfindings underscore the need for a more nuanced understanding of the factors\ndriving LLM responses before relying on them as substitutes for human subjects\nin survey settings. We emphasize the importance of researchers reporting\nresults conditioned on specific prompt templates and exercising caution when\ndrawing parallels between human behavior and LLMs.\n","authors":["Behnam Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2404.01332v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07466v1","updated":"2024-11-12T01:05:55Z","published":"2024-11-12T01:05:55Z","title":"IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark","summary":" Recent evaluations of LLMs on coreference resolution have revealed that\ntraditional output formats and evaluation metrics do not fully capture the\nmodels' referential understanding. To address this, we introduce IdentifyMe, a\nnew benchmark for mention resolution presented in a multiple-choice question\n(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long\nnarratives and employs heuristics to exclude easily identifiable mentions,\ncreating a more challenging task. The benchmark also consists of a curated\nmixture of different mention types and corresponding entities, allowing for a\nfine-grained analysis of model performance. We evaluate both closed- and open\nsource LLMs on IdentifyMe and observe a significant performance gap (20-30%)\nbetween the state-of-the-art sub-10B open models vs. closed ones. We observe\nthat pronominal mentions, which have limited surface information, are typically\nmuch harder for models to resolve than nominal mentions. Additionally, we find\nthat LLMs often confuse entities when their mentions overlap in nested\nstructures. 
The highest-scoring model, GPT-4o, achieves 81.9% accuracy,\nhighlighting the strong referential capabilities of state-of-the-art LLMs while\nalso indicating room for further improvement.\n","authors":["Kawshik Manikantan","Makarand Tapaswi","Vineet Gandhi","Shubham Toshniwal"],"pdf_url":"https://arxiv.org/pdf/2411.07466v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.04916v4","updated":"2024-11-12T01:01:32Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. 
This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v4.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4pi-1.23/)"},{"id":"http://arxiv.org/abs/2407.02885v5","updated":"2024-11-12T01:00:53Z","published":"2024-07-03T07:59:52Z","title":"CogErgLLM: Exploring Large Language Model Systems Design Perspective\n Using Cognitive Ergonomics","summary":" Integrating cognitive ergonomics with LLMs is crucial for improving safety,\nreliability, and user satisfaction in human-AI interactions. Current LLM\ndesigns often lack this integration, resulting in systems that may not fully\nalign with human cognitive capabilities and limitations. This oversight\nexacerbates biases in LLM outputs and leads to suboptimal user experiences due\nto inconsistent application of user-centered design principles. Researchers are\nincreasingly leveraging NLP, particularly LLMs, to model and understand human\nbehavior across social sciences, psychology, psychiatry, health, and\nneuroscience. Our position paper explores the need to integrate cognitive\nergonomics into LLM design, providing a comprehensive framework and practical\nguidelines for ethical development. By addressing these challenges, we aim to\nadvance safer, more reliable, and ethically sound human-AI interactions.\n","authors":["Azmine Toushik Wasi","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2407.02885v5.pdf","comment":"10 Page, 3 Figures. 
Accepted in: (i) ICML'24: LLMs & Cognition\n Workshop (Non-archival; OpenReview:\n https://openreview.net/forum?id=63C9YSc77p) (ii) EMNLP'24 : NLP for Science\n Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4science-1.22/)"},{"id":"http://arxiv.org/abs/2411.07464v1","updated":"2024-11-12T00:57:30Z","published":"2024-11-12T00:57:30Z","title":"BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating\n Machine Learning Tasks","summary":" Large Language Models (LLMs) excel in diverse applications including\ngeneration of code snippets, but often struggle with generating code for\ncomplex Machine Learning (ML) tasks. Although existing LLM single-agent based\nsystems give varying performance depending on the task complexity, they purely\nrely on larger and expensive models such as GPT-4. Our investigation reveals\nthat no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama\nperform far worse than GPT-4 in a single-agent setting. With the motivation of\ndeveloping a cost-efficient LLM based solution for solving ML tasks, we propose\nan LLM Multi-Agent based system which leverages combination of experts using\nprofiling, efficient retrieval of past observations, LLM cascades, and\nask-the-expert calls. Through empirical analysis on ML engineering tasks in the\nMLAgentBench benchmark, we demonstrate the effectiveness of our system, using\nno-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and\nexpert to serve occasional ask-the-expert calls for planning. 
With 94.2\\%\nreduction in the cost (from \\$0.931 per run cost averaged over all tasks for\nGPT-4 single agent system to \\$0.054), our system is able to yield better\naverage success rate of 32.95\\% as compared to GPT-4 single-agent system\nyielding 22.72\\% success rate averaged over all the tasks of MLAgentBench.\n","authors":["Shubham Gandhi","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2411.07464v1.pdf","comment":"Presented at AIMLSystems '24"},{"id":"http://arxiv.org/abs/2411.07457v1","updated":"2024-11-12T00:48:01Z","published":"2024-11-12T00:48:01Z","title":"DecoPrompt : Decoding Prompts Reduces Hallucinations when Large Language\n Models Meet False Premises","summary":" While large language models (LLMs) have demonstrated increasing power, they\nhave also called upon studies on their hallucinated outputs that deviate from\nfactually correct statements. In this paper, we focus on one important scenario\nof false premises, where LLMs are distracted by misaligned claims although the\nmodel possesses the required factual knowledge to answer original questions\naccurately. Inspired by the observation that entropy of the false-premise\nprompt is closely related to its likelihood to elicit hallucination generation,\nwe propose a new prompting algorithm, named DecoPrompt, to mitigate\nhallucination. DecoPrompt leverages LLMs to \"decode\" the false-premise prompts\nwithout really eliciting hallucination output from LLMs. We perform experiments\non two datasets, demonstrating that DecoPrompt can reduce hallucinations\neffectively on outputs from different LLMs. 
Moreover, DecoPrompt exhibits\ncross-model transferability, which facilitates its applications to scenarios\nsuch as LLMs of large sizes or unavailable model logits.\n","authors":["Nan Xu","Xuezhe Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07446v1","updated":"2024-11-12T00:07:29Z","published":"2024-11-12T00:07:29Z","title":"Efficient and Accurate Prompt Optimization: the Benefit of Memory in\n Exemplar-Guided Reflection","summary":" Automatic prompt engineering aims to enhance the generation quality of large\nlanguage models (LLMs). Recent works utilize feedbacks generated from erroneous\ncases to guide the prompt optimization. During inference, they may further\nretrieve several semantically-related exemplars and concatenate them to the\noptimized prompts to improve the performance. However, those works only utilize\nthe feedback at the current step, ignoring historical and unseleccted feedbacks\nwhich are potentially beneficial. Moreover, the selection of exemplars only\nconsiders the general semantic relationship and may not be optimal in terms of\ntask performance and matching with the optimized prompt. In this work, we\npropose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize\nmore efficient and accurate prompt optimization. Specifically, we design an\nexemplar-guided reflection mechanism where the feedback generation is\nadditionally guided by the generated exemplars. We further build two kinds of\nmemory to fully utilize the historical feedback information and support more\neffective exemplar retrieval. 
Empirical evaluations show our method surpasses\nprevious state-of-the-arts with less optimization steps, i.e., improving F1\nscore by 10.1 on LIAR dataset, and reducing half of the optimization steps on\nProTeGi.\n","authors":["Cilin Yan","Jingyun Wang","Lin Zhang","Ruihui Zhao","Xiaopu Wu","Kai Xiong","Qingsong Liu","Guoliang Kang","Yangyang Kang"],"pdf_url":"https://arxiv.org/pdf/2411.07446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08248v1","updated":"2024-11-12T23:54:58Z","published":"2024-11-12T23:54:58Z","title":"Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial\n Approach","summary":" Deep learning underpins most of the currently advanced natural language\nprocessing (NLP) tasks such as textual classification, neural machine\ntranslation (NMT), abstractive summarization and question-answering (QA).\nHowever, the robustness of the models, particularly QA models, against\nadversarial attacks is a critical concern that remains insufficiently explored.\nThis paper introduces QA-Attack (Question Answering Attack), a novel word-level\nadversarial strategy that fools QA models. Our attention-based attack exploits\nthe customized attention mechanism and deletion ranking strategy to identify\nand target specific words within contextual passages. It creates deceptive\ninputs by carefully choosing and substituting synonyms, preserving grammatical\nintegrity while misleading the model to produce incorrect responses. Our\napproach demonstrates versatility across various question types, particularly\nwhen dealing with extensive long textual inputs. 
Extensive experiments on\nmultiple benchmark datasets demonstrate that QA-Attack successfully deceives\nbaseline QA models and surpasses existing adversarial techniques regarding\nsuccess rate, semantics changes, BLEU score, fluency and grammar error rate.\n","authors":["Jiyao Li","Mingze Ni","Yongshun Gong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08243v1","updated":"2024-11-12T23:43:20Z","published":"2024-11-12T23:43:20Z","title":"Beyond the Safety Bundle: Auditing the Helpful and Harmless Dataset","summary":" In an effort to mitigate the harms of large language models (LLMs), learning\nfrom human feedback (LHF) has been used to steer LLMs towards outputs that are\nintended to be both less harmful and more helpful. Despite the widespread\nadoption of LHF in practice, the quality of this feedback and its effectiveness\nas a safety mitigation technique remain unclear. This study addresses these\nissues by auditing the widely-used Helpful and Harmless (HH) dataset by\nAnthropic. Our work includes: (1) a thorough investigation of the dataset's\ncontent through both manual and automated evaluation; (2) experiments\ndemonstrating the dataset's impact on models' safety; and (3) an analysis of\nthe 100 most influential papers citing this dataset. Through our audit, we\nshowcase how conceptualization failures and quality issues identified in the HH\ndataset can create additional harms by leading to disparate safety behaviors\nacross demographic groups. 
Our findings highlight the need for more nuanced,\ncontext-sensitive approaches to safety mitigation in LLMs.\n","authors":["Khaoula Chehbouni","Jonathan Colaço-Carr","Yash More","Jackie CK Cheung","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2411.08243v1.pdf","comment":"Prepared for conference submission"},{"id":"http://arxiv.org/abs/2403.06399v3","updated":"2024-11-12T22:17:46Z","published":"2024-03-11T03:21:15Z","title":"GlossLM: A Massively Multilingual Corpus and Pretrained Model for\n Interlinear Glossed Text","summary":" Language documentation projects often involve the creation of annotated text\nin a format such as interlinear glossed text (IGT), which captures fine-grained\nmorphosyntactic analyses in a morpheme-by-morpheme format. However, there are\nfew existing resources providing large amounts of standardized, easily\naccessible IGT data, limiting their applicability to linguistic research, and\nmaking it difficult to use such data in NLP modeling.\n We compile the largest existing corpus of IGT data from a variety of sources,\ncovering over 450k examples across 1.8k languages, to enable research on\ncrosslingual transfer and IGT generation. We normalize much of our data to\nfollow a standard set of labels across languages.\n Furthermore, we explore the task of automatically generating IGT in order to\naid documentation projects. As many languages lack sufficient monolingual data,\nwe pretrain a large multilingual model on our corpus. We demonstrate the\nutility of this model by finetuning it on monolingual corpora, outperforming\nSOTA models by up to 6.6\\%. Our pretrained model and dataset are available on\nHugging Face.\n","authors":["Michael Ginn","Lindia Tjuatja","Taiqi He","Enora Rice","Graham Neubig","Alexis Palmer","Lori Levin"],"pdf_url":"https://arxiv.org/pdf/2403.06399v3.pdf","comment":"EMNLP 2024. 
First two authors are equal contribution"},{"id":"http://arxiv.org/abs/2408.07832v6","updated":"2024-11-12T20:51:07Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets.\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. 
Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08165v1","updated":"2024-11-12T20:15:58Z","published":"2024-11-12T20:15:58Z","title":"Retrieval, Reasoning, Re-ranking: A Context-Enriched Framework for\n Knowledge Graph Completion","summary":" The Knowledge Graph Completion~(KGC) task aims to infer the missing entity\nfrom an incomplete triple. Existing embedding-based methods rely solely on\ntriples in the KG, which is vulnerable to specious relation patterns and\nlong-tail entities. On the other hand, text-based methods struggle with the\nsemantic gap between KG triples and natural language. Apart from triples,\nentity contexts (e.g., labels, descriptions, aliases) also play a significant\nrole in augmenting KGs. To address these limitations, we propose KGR3, a\ncontext-enriched framework for KGC. KGR3 is composed of three modules. Firstly,\nthe Retrieval module gathers supporting triples from the KG, collects plausible\ncandidate answers from a base embedding model, and retrieves context for each\nrelated entity. Then, the Reasoning module employs a large language model to\ngenerate potential answers for each query triple. Finally, the Re-ranking\nmodule combines candidate answers from the two modules mentioned above, and\nfine-tunes an LLM to provide the best answer. 
Extensive experiments on widely\nused datasets demonstrate that KGR3 consistently improves various KGC methods.\nSpecifically, the best variant of KGR3 achieves absolute Hits@1 improvements of\n12.3% and 5.6% on the FB15k237 and WN18RR datasets.\n","authors":["Muzhi Li","Cehao Yang","Chengjin Xu","Xuhui Jiang","Yiyan Qi","Jian Guo","Ho-fung Leung","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2411.08165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04427v2","updated":"2024-11-12T20:11:58Z","published":"2024-11-07T04:38:58Z","title":"One fish, two fish, but not the whole sea: Alignment reduces language\n models' conceptual diversity","summary":" Researchers in social science and psychology have recently proposed using\nlarge language models (LLMs) as replacements for humans in behavioral research.\nIn addition to arguments about whether LLMs accurately capture population-level\npatterns, this has raised questions about whether LLMs capture human-like\nconceptual diversity. Separately, it is debated whether post-training alignment\n(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies,\nwe use a new way of measuring the conceptual diversity of\nsynthetically-generated LLM \"populations\" by relating the internal variability\nof simulated individuals to the population-level variability. We use this\napproach to evaluate non-aligned and aligned LLMs on two domains with rich\nhuman behavioral data. While no model reaches human-like diversity, aligned\nmodels generally display less diversity than their instruction fine-tuned\ncounterparts. Our findings highlight potential trade-offs between increasing\nmodels' value alignment and decreasing the diversity of their conceptual\nrepresentations.\n","authors":["Sonia K. 
Murthy","Tomer Ullman","Jennifer Hu"],"pdf_url":"https://arxiv.org/pdf/2411.04427v2.pdf","comment":"17 pages, 10 figures; corrected figure version"},{"id":"http://arxiv.org/abs/2411.08147v1","updated":"2024-11-12T19:53:00Z","published":"2024-11-12T19:53:00Z","title":"Large Language Models Can Self-Improve in Long-context Reasoning","summary":" Large language models (LLMs) have achieved substantial progress in processing\nlong contexts but still struggle with long-context reasoning. Existing\napproaches typically involve fine-tuning LLMs with synthetic data, which\ndepends on annotations from human experts or advanced models like GPT-4, thus\nrestricting further advancements. To address this issue, we investigate the\npotential for LLMs to self-improve in long-context reasoning and propose \\ours,\nan approach specifically designed for this purpose. This approach is\nstraightforward: we sample multiple outputs for each question, score them with\nMinimum Bayes Risk, and then apply supervised fine-tuning or preference\noptimization based on these outputs. Extensive experiments on several leading\nLLMs demonstrate the effectiveness of \\ours, with an absolute improvement of\n$4.2$ points for Llama-3.1-8B-Instruct. Furthermore, \\ours achieves superior\nperformance compared to prior approaches that depend on data produced by human\nexperts or advanced models. 
We anticipate that this work will open new avenues\nfor self-improvement techniques in long-context scenarios, which are essential\nfor the continual advancement of LLMs.\n","authors":["Siheng Li","Cheng Yang","Zesen Cheng","Lemao Liu","Mo Yu","Yujiu Yang","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2411.08147v1.pdf","comment":"Project Page: https://github.com/SihengLi99/SEALONG"},{"id":"http://arxiv.org/abs/2411.04329v2","updated":"2024-11-12T19:37:20Z","published":"2024-11-07T00:09:54Z","title":"CodeTree: Agent-guided Tree Search for Code Generation with Large\n Language Models","summary":" Pre-trained on massive amounts of code and text data, large language models\n(LLMs) have demonstrated remarkable achievements in performing code generation\ntasks. With additional execution-based feedback, these models can act as agents\nwith capabilities to self-refine and improve generated code autonomously.\nHowever, on challenging coding tasks with extremely large search space, current\nagentic approaches still struggle with multi-stage planning, generating, and\ndebugging. To address this problem, we propose CodeTree, a framework for LLM\nagents to efficiently explore the search space in different stages of the code\ngeneration process. Specifically, we adopted a unified tree structure to\nexplicitly explore different coding strategies, generate corresponding coding\nsolutions, and subsequently refine the solutions. In each stage, critical\ndecision-making (ranking, termination, expanding) of the exploration process is\nguided by both the environmental execution-based feedback and\nLLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code\ngeneration benchmarks and demonstrated the significant performance gains of\nCodeTree against strong baselines. Using GPT-4o as the base model, we\nconsistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0\non CodeContests. 
On the challenging SWEBench benchmark, our approach led to\nsignificant performance gains.\n","authors":["Jierui Li","Hung Le","Yingbo Zhou","Caiming Xiong","Silvio Savarese","Doyen Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.04329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16803v2","updated":"2024-11-12T19:28:34Z","published":"2024-10-22T08:28:05Z","title":"Context-aware Inductive Knowledge Graph Completion with Latent Type\n Constraints and Subgraph Reasoning","summary":" Inductive knowledge graph completion (KGC) aims to predict missing triples\nwith unseen entities. Recent works focus on modeling reasoning paths between\nthe head and tail entity as direct supporting evidence. However, these methods\ndepend heavily on the existence and quality of reasoning paths, which limits\ntheir general applicability in different scenarios. In addition, we observe\nthat latent type constraints and neighboring facts inherent in KGs are also\nvital in inferring missing triples. To effectively utilize all useful\ninformation in KGs, we introduce CATS, a novel context-aware inductive KGC\nsolution. With sufficient guidance from proper prompts and supervised\nfine-tuning, CATS activates the strong semantic understanding and reasoning\ncapabilities of large language models to assess the existence of query triples,\nwhich consist of two modules. First, the type-aware reasoning module evaluates\nwhether the candidate entity matches the latent entity type as required by the\nquery relation. Then, the subgraph reasoning module selects relevant reasoning\npaths and neighboring facts, and evaluates their correlation to the query\ntriple. 
Experiment results on three widely used datasets demonstrate that CATS\nsignificantly outperforms state-of-the-art methods in 16 out of 18\ntransductive, inductive, and few-shot settings with an average absolute MRR\nimprovement of 7.2%.\n","authors":["Muzhi Li","Cehao Yang","Chengjin Xu","Zixing Song","Xuhui Jiang","Jian Guo","Ho-fung Leung","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2410.16803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08135v1","updated":"2024-11-12T19:26:43Z","published":"2024-11-12T19:26:43Z","title":"On the Role of Speech Data in Reducing Toxicity Detection Bias","summary":" Text toxicity detection systems exhibit significant biases, producing\ndisproportionate rates of false positives on samples mentioning demographic\ngroups. But what about toxicity detection in speech? To investigate the extent\nto which text-based biases are mitigated by speech-based systems, we produce a\nset of high-quality group annotations for the multilingual MuTox dataset, and\nthen leverage these annotations to systematically compare speech- and\ntext-based toxicity classifiers. Our findings indicate that access to speech\ndata during inference supports reduced bias against group mentions,\nparticularly for ambiguous and disagreement-inducing samples. Our results also\nsuggest that improving classifiers, rather than transcription pipelines, is\nmore helpful for reducing group bias. We publicly release our annotations and\nprovide recommendations for future toxicity dataset construction.\n","authors":["Samuel J. Bell","Mariano Coria Meglioli","Megan Richards","Eduardo Sánchez","Christophe Ropers","Skyler Wang","Adina Williams","Levent Sagun","Marta R. 
Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2411.08135v1.pdf","comment":null}]},"2024-11-13T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.08851v1","updated":"2024-11-13T18:30:28Z","published":"2024-11-13T18:30:28Z","title":"Experience-based Subproblem Planning for Multi-Robot Motion Planning","summary":" Multi-robot systems enhance efficiency and productivity across various\napplications, from manufacturing to surveillance. While single-robot motion\nplanning has improved by using databases of prior solutions, extending this\napproach to multi-robot motion planning (MRMP) presents challenges due to the\nincreased complexity and diversity of tasks and configurations. Recent discrete\nmethods have attempted to address this by focusing on relevant\nlower-dimensional subproblems, but they are inadequate for complex scenarios\nlike those involving manipulator robots. To overcome this, we propose a novel\napproach that %leverages experience-based planning by constructs and utilizes\ndatabases of solutions for smaller sub-problems. By focusing on interactions\nbetween fewer robots, our method reduces the need for exhaustive database\ngrowth, allowing for efficient handling of more complex MRMP scenarios. We\nvalidate our approach with experiments involving both mobile and manipulator\nrobots, demonstrating significant improvements over existing methods in\nscalability and planning efficiency. Our contributions include a rapidly\nconstructed database for low-dimensional MRMP problems, a framework for\napplying these solutions to larger problems, and experimental validation with\nup to 32 mobile and 16 manipulator robots.\n","authors":["Irving Solis","James Motes","Mike Qin","Marco Morales","Nancy M. 
Amato"],"pdf_url":"https://arxiv.org/pdf/2411.08851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08835v1","updated":"2024-11-13T18:14:23Z","published":"2024-11-13T18:14:23Z","title":"Goal-oriented Semantic Communication for Robot Arm Reconstruction in\n Digital Twin: Feature and Temporal Selections","summary":" As one of the most promising technologies in industry, the Digital Twin (DT)\nfacilitates real-time monitoring and predictive analysis for real-world systems\nby precisely reconstructing virtual replicas of physical entities. However,\nthis reconstruction faces unprecedented challenges due to the everincreasing\ncommunication overhead, especially for digital robot arm reconstruction. To\nthis end, we propose a novel goal-oriented semantic communication (GSC)\nframework to extract the GSC information for the robot arm reconstruction task\nin the DT, with the aim of minimising the communication load under the strict\nand relaxed reconstruction error constraints. Unlike the traditional\nreconstruction framework that periodically transmits a reconstruction message\nfor real-time DT reconstruction, our framework implements a feature selection\n(FS) algorithm to extract the semantic information from the reconstruction\nmessage, and a deep reinforcement learning-based temporal selection algorithm\nto selectively transmit the semantic information over time. We validate our\nproposed GSC framework through both Pybullet simulations and lab experiments\nbased on the Franka Research 3 robot arm. For a range of distinct robotic\ntasks, simulation results show that our framework can reduce the communication\nload by at least 59.5% under strict reconstruction error constraints and 80%\nunder relaxed reconstruction error constraints, compared with traditional\ncommunication framework. Also, experimental results confirm the effectiveness\nof our framework, where the communication load is reduced by 53% in strict\nconstraint case and 74% in relaxed constraint case. 
The demo is available at:\nhttps://youtu.be/2OdeHKxcgnk.\n","authors":["Shutong Chen","Emmanouil Spyrakos-Papastavridis","Yichao Jin","Yansha Deng"],"pdf_url":"https://arxiv.org/pdf/2411.08835v1.pdf","comment":"Submitted to IEEE for potential publication"},{"id":"http://arxiv.org/abs/2411.08832v1","updated":"2024-11-13T18:12:15Z","published":"2024-11-13T18:12:15Z","title":"Offline Adaptation of Quadruped Locomotion using Diffusion Models","summary":" We present a diffusion-based approach to quadrupedal locomotion that\nsimultaneously addresses the limitations of learning and interpolating between\nmultiple skills and of (modes) offline adapting to new locomotion behaviours\nafter training. This is the first framework to apply classifier-free guided\ndiffusion to quadruped locomotion and demonstrate its efficacy by extracting\ngoal-conditioned behaviour from an originally unlabelled dataset. We show that\nthese capabilities are compatible with a multi-skill policy and can be applied\nwith little modification and minimal compute overhead, i.e., running entirely\non the robots onboard CPU. We verify the validity of our approach with hardware\nexperiments on the ANYmal quadruped platform.\n","authors":["Reece O'Mahoney","Alexander L. Mitchell","Wanming Yu","Ingmar Posner","Ioannis Havoutis"],"pdf_url":"https://arxiv.org/pdf/2411.08832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07053v2","updated":"2024-11-13T17:56:47Z","published":"2024-11-11T15:14:36Z","title":"UAV survey coverage path planning of complex regions containing\n exclusion zones","summary":" This article addresses the challenge of UAV survey coverage path planning for\nareas that are complex concave polygons, containing exclusion zones or\nobstacles. While standard drone path planners typically generate coverage paths\nfor simple convex polygons, this study proposes a method to manage more\nintricate regions, including boundary splits, merges, and interior holes. 
To\nachieve this, polygonal decomposition techniques are used to partition the\ntarget area into convex sub-regions. The sub-polygons are then merged using a\ndepth-first search algorithm, followed by the generation of continuous\nBoustrophedon paths based on connected components. Polygonal offset by the\nstraight skeleton method was used to ensure a constant safe distance from the\nexclusion zones. This approach allows UAV path planning in environments with\ncomplex geometric constraints.\n","authors":["Shadman Tajwar Shahid","Shah Md. Ahasan Siddique","Md. Mahidul Alam"],"pdf_url":"https://arxiv.org/pdf/2411.07053v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24164v3","updated":"2024-11-13T17:30:10Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. 
We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v3.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2411.07315v2","updated":"2024-11-13T17:28:15Z","published":"2024-11-11T19:15:29Z","title":"Harnessing Smartphone Sensors for Enhanced Road Safety: A Comprehensive\n Dataset and Review","summary":" Severe collisions can result from aggressive driving and poor road\nconditions, emphasizing the need for effective monitoring to ensure safety.\nSmartphones, with their array of built-in sensors, offer a practical and\naffordable solution for road-sensing. However, the lack of reliable,\nstandardized datasets has hindered progress in assessing road conditions and\ndriving patterns. This study addresses this gap by introducing a comprehensive\ndataset derived from smartphone sensors, which surpasses existing datasets by\nincorporating a diverse range of sensors including accelerometer, gyroscope,\nmagnetometer, GPS, gravity, orientation, and uncalibrated sensors. These\nsensors capture extensive parameters such as acceleration force, gravitation,\nrotation rate, magnetic field strength, and vehicle speed, providing a detailed\nunderstanding of road conditions and driving behaviors. 
The dataset is designed\nto enhance road safety, infrastructure maintenance, traffic management, and\nurban planning. By making this dataset available to the community, the study\naims to foster collaboration, inspire further research, and facilitate the\ndevelopment of innovative solutions in intelligent transportation systems.\n","authors":["Amith Khandakar","David G. Michelson","Mansura Naznine","Abdus Salam","Md. Nahiduzzaman","Khaled M. Khan","Ponnuthurai Nagaratnam Suganthan","Mohamed Arselene Ayari","Hamid Menouar","Julfikar Haider"],"pdf_url":"https://arxiv.org/pdf/2411.07315v2.pdf","comment":"29 pages, 14 Figures, journal paper, submitted into Scientific Data\n Journal"},{"id":"http://arxiv.org/abs/2411.08777v1","updated":"2024-11-13T17:02:46Z","published":"2024-11-13T17:02:46Z","title":"LUDO: Low-Latency Understanding of Highly Deformable Objects using Point\n Cloud Occupancy Functions","summary":" Accurately determining the shape and location of internal structures within\ndeformable objects is crucial for medical tasks that require precise targeting,\nsuch as robotic biopsies. We introduce LUDO, a method for accurate low-latency\nunderstanding of deformable objects. LUDO reconstructs objects in their\ndeformed state, including their internal structures, from a single-view point\ncloud observation in under 30 ms using occupancy networks. We demonstrate\nLUDO's abilities for autonomous targeting of internal regions of interest\n(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty\nestimates and explainability for its predictions, both of which are important\nin safety-critical applications such as surgical interventions. We evaluate\nLUDO in real-world robotic experiments, achieving a success rate of 98.9% for\npuncturing various ROIs inside highly deformable objects. 
LUDO demonstrates the\npotential to interact with deformable objects without the need for deformable\nregistration methods.\n","authors":["Pit Henrich","Franziska Mathis-Ullrich","Paul Maria Scheikl"],"pdf_url":"https://arxiv.org/pdf/2411.08777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19972v2","updated":"2024-11-13T16:56:21Z","published":"2024-06-28T15:00:46Z","title":"HumanVLA: Towards Vision-Language Directed Object Rearrangement by\n Physical Humanoid","summary":" Physical Human-Scene Interaction (HSI) plays a crucial role in numerous\napplications.\n However, existing HSI techniques are limited to specific object dynamics and\nprivileged information, which prevents the development of more comprehensive\napplications.\n To address this limitation, we introduce HumanVLA for general object\nrearrangement directed by practical vision and language.\n A teacher-student framework is utilized to develop HumanVLA.\n A state-based teacher policy is trained first using goal-conditioned\nreinforcement learning and adversarial motion prior.\n Then, it is distilled into a vision-language-action model via behavior\ncloning.\n We propose several key insights to facilitate the large-scale learning\nprocess.\n To support general object rearrangement by physical humanoid, we introduce a\nnovel Human-in-the-Room dataset encompassing various rearrangement tasks.\n Through extensive experiments and analysis, we demonstrate the effectiveness\nof the proposed approach.\n","authors":["Xinyu Xu","Yizheng Zhang","Yong-Lu Li","Lei Han","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2406.19972v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08727v1","updated":"2024-11-13T16:09:04Z","published":"2024-11-13T16:09:04Z","title":"Voxeland: Probabilistic Instance-Aware Semantic Mapping with\n Evidence-based Uncertainty Quantification","summary":" Robots in human-centered environments require accurate scene understanding to\nperform high-level tasks effectively. 
This understanding can be achieved\nthrough instance-aware semantic mapping, which involves reconstructing elements\nat the level of individual instances. Neural networks, the de facto solution\nfor scene understanding, still face limitations such as overconfident incorrect\npredictions with out-of-distribution objects or generating inaccurate\nmasks.Placing excessive reliance on these predictions makes the reconstruction\nsusceptible to errors, reducing the robustness of the resulting maps and\nhampering robot operation. In this work, we propose Voxeland, a probabilistic\nframework for incrementally building instance-aware semantic maps. Inspired by\nthe Theory of Evidence, Voxeland treats neural network predictions as\nsubjective opinions regarding map instances at both geometric and semantic\nlevels. These opinions are aggregated over time to form evidences, which are\nformalized through a probabilistic model. This enables us to quantify\nuncertainty in the reconstruction process, facilitating the identification of\nmap areas requiring improvement (e.g. reobservation or reclassification). As\none strategy to exploit this, we incorporate a Large Vision-Language Model\n(LVLM) to perform semantic level disambiguation for instances with high\nuncertainty. Results from the standard benchmarking on the publicly available\nSceneNN dataset demonstrate that Voxeland outperforms state-of-the-art methods,\nhighlighting the benefits of incorporating and leveraging both instance- and\nsemantic-level uncertainties to enhance reconstruction robustness. 
This is\nfurther validated through qualitative experiments conducted on the real-world\nScanNet dataset.\n","authors":["Jose-Luis Matez-Bandera","Pepe Ojeda","Javier Monroy","Javier Gonzalez-Jimenez","Jose-Raul Ruiz-Sarmiento"],"pdf_url":"https://arxiv.org/pdf/2411.08727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07342v2","updated":"2024-11-13T15:17:19Z","published":"2024-11-11T20:04:03Z","title":"Learning Dynamic Tasks on a Large-scale Soft Robot in a Handful of\n Trials","summary":" Soft robots offer more flexibility, compliance, and adaptability than\ntraditional rigid robots. They are also typically lighter and cheaper to\nmanufacture. However, their use in real-world applications is limited due to\nmodeling challenges and difficulties in integrating effective proprioceptive\nsensors. Large-scale soft robots ($\\approx$ two meters in length) have greater\nmodeling complexity due to increased inertia and related effects of gravity.\nCommon efforts to ease these modeling difficulties such as assuming simple\nkinematic and dynamics models also limit the general capabilities of soft\nrobots and are not applicable in tasks requiring fast, dynamic motion like\nthrowing and hammering. To overcome these challenges, we propose a\ndata-efficient Bayesian optimization-based approach for learning control\npolicies for dynamic tasks on a large-scale soft robot. Our approach optimizes\nthe task objective function directly from commanded pressures, without\nrequiring approximate kinematics or dynamics as an intermediate step. We\ndemonstrate the effectiveness of our approach through both simulated and\nreal-world experiments.\n","authors":["Sicelukwanda Zwane","Daniel Cheney","Curtis C. Johnson","Yicheng Luo","Yasemin Bekiroglu","Marc D. 
Killpack","Marc Peter Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2411.07342v2.pdf","comment":"9 pages, 5 figures, Proceedings of the International Conference on\n Intelligent Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2402.15552v3","updated":"2024-11-13T14:52:21Z","published":"2024-02-23T17:21:21Z","title":"Morphological Symmetries in Robotics","summary":" We present a comprehensive framework for studying and leveraging\nmorphological symmetries in robotic systems. These are intrinsic properties of\nthe robot's morphology, frequently observed in animal biology and robotics,\nwhich stem from the replication of kinematic structures and the symmetrical\ndistribution of mass. We illustrate how these symmetries extend to the robot's\nstate space and both proprioceptive and exteroceptive sensor measurements,\nresulting in the equivariance of the robot's equations of motion and optimal\ncontrol policies. Thus, we recognize morphological symmetries as a relevant and\npreviously unexplored physics-informed geometric prior, with significant\nimplications for both data-driven and analytical methods used in modeling,\ncontrol, estimation and design in robotics. For data-driven methods, we\ndemonstrate that morphological symmetries can enhance the sample efficiency and\ngeneralization of machine learning models through data augmentation, or by\napplying equivariant/invariant constraints on the model's architecture. In the\ncontext of analytical methods, we employ abstract harmonic analysis to\ndecompose the robot's dynamics into a superposition of lower-dimensional,\nindependent dynamics. We substantiate our claims with both synthetic and\nreal-world experiments conducted on bipedal and quadrupedal robots. 
Lastly, we\nintroduce the repository MorphoSymm to facilitate the practical use of the\ntheory and applications outlined in this work.\n","authors":["Daniel Ordoñez-Apraez","Giulio Turrisi","Vladimir Kostic","Mario Martin","Antonio Agudo","Francesc Moreno-Noguer","Massimiliano Pontil","Claudio Semini","Carlos Mastalli"],"pdf_url":"https://arxiv.org/pdf/2402.15552v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.08661v1","updated":"2024-11-13T14:51:30Z","published":"2024-11-13T14:51:30Z","title":"Energy Optimal Traversal Between Hover Waypoints for Lift+Cruise\n Electric Powered Aircraft","summary":" Advanced Air Mobility aircraft require energy efficient flight plans to be\neconomically viable. This paper defines minimum energy direct trajectories\nbetween waypoints for Lift+Cruise electric Vertical Take-Off and Landing\n(eVTOL) aircraft. Energy consumption is optimized over accelerated and cruise\nflight profiles with consideration of mode transitions. Because eVTOL\noperations start and end in hover for vertical take-off and landing, hover\nwaypoints are utilized. Energy consumption is modeled as a function of airspeed\nfor each flight mode, providing the basis to prove energy optimality for\nmulti-mode traversal. Wind magnitude and direction dictate feasibility of\nstraight-line traversal because Lift+Cruise aircraft point into the relative\nwind direction while hovering but also have a maximum heading rate constraint.\nEnergy and power use for an experimentally validated QuadPlane small eVTOL\naircraft are characterized with respect to airspeed and acceleration in all\nflight modes. Optimal QuadPlane traversals are presented. Constraints on\nacceleration and wind are derived for straight-line QuadPlane traversal.\nResults show an optimal QuadPlane $500m$ traversal between hover waypoints\nsaves $71\\%$ energy compared to pure vertical flight traversal for a\nrepresentative case study with a direct $4m/s$ crosswind. 
Energy optimal eVTOL\ndirect trajectory definition with transitions to and from hover is novel to\nthis work. Future work should model three-dimensional flight and wind as well\nas optimize maneuver primitives when required.\n","authors":["Akshay Mathur","Ella Atkins"],"pdf_url":"https://arxiv.org/pdf/2411.08661v1.pdf","comment":"34 pages, 17 figures and 5 tables"},{"id":"http://arxiv.org/abs/2411.08637v1","updated":"2024-11-13T14:24:47Z","published":"2024-11-13T14:24:47Z","title":"Robot See, Robot Do: Imitation Reward for Noisy Financial Environments","summary":" The sequential nature of decision-making in financial asset trading aligns\nnaturally with the reinforcement learning (RL) framework, making RL a common\napproach in this domain. However, the low signal-to-noise ratio in financial\nmarkets results in noisy estimates of environment components, including the\nreward function, which hinders effective policy learning by RL agents. Given\nthe critical importance of reward function design in RL problems, this paper\nintroduces a novel and more robust reward function by leveraging imitation\nlearning, where a trend labeling algorithm acts as an expert. We integrate\nimitation (expert's) feedback with reinforcement (agent's) feedback in a\nmodel-free RL algorithm, effectively embedding the imitation learning problem\nwithin the RL paradigm to handle the stochasticity of reward signals. 
Empirical\nresults demonstrate that this novel approach improves financial performance\nmetrics compared to traditional benchmarks and RL agents trained solely using\nreinforcement feedback.\n","authors":["Sven Goluža","Tomislav Kovačević","Stjepan Begušić","Zvonko Kostanjčar"],"pdf_url":"https://arxiv.org/pdf/2411.08637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08634v1","updated":"2024-11-13T14:18:28Z","published":"2024-11-13T14:18:28Z","title":"On the Application of Model Predictive Control to a Weighted Coverage\n Path Planning Problem","summary":" This paper considers the application of Model Predictive Control (MPC) to a\nweighted coverage path planning (WCPP) problem. The problem appears in a wide\nrange of practical applications, such as search and rescue (SAR) missions. The\nbasic setup is that one (or multiple) agents can move around a given search\nspace and collect rewards from a given spatial distribution. Unlike an\nartificial potential field, each reward can only be collected once. In contrast\nto a Traveling Salesman Problem (TSP), the agent moves in a continuous space.\nMoreover, he is not obliged to cover all locations and/or may return to\npreviously visited locations. The WCPP problem is tackled by a new Model\nPredictive Control (MPC) formulation with so-called Coverage Constraints (CCs).\nIt is shown that the solution becomes more effective if the solver is\ninitialized with a TSP-based heuristic. 
With and without this initialization,\nthe proposed MPC approach clearly outperforms a naive MPC formulation, as\ndemonstrated in a small simulation study.\n","authors":["Kilian Schweppe","Ludmila Moshagen","Georg Schildbach"],"pdf_url":"https://arxiv.org/pdf/2411.08634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08622v1","updated":"2024-11-13T14:08:58Z","published":"2024-11-13T14:08:58Z","title":"Precision-Focused Reinforcement Learning Model for Robotic Object\n Pushing","summary":" Non-prehensile manipulation, such as pushing objects to a desired target\nposition, is an important skill for robots to assist humans in everyday\nsituations. However, the task is challenging due to the large variety of\nobjects with different and sometimes unknown physical properties, such as\nshape, size, mass, and friction. This can lead to the object overshooting its\ntarget position, requiring fast corrective movements of the robot around the\nobject, especially in cases where objects need to be precisely pushed. In this\npaper, we improve the state-of-the-art by introducing a new memory-based\nvision-proprioception RL model to push objects more precisely to target\npositions using fewer corrective movements.\n","authors":["Lara Bergmann","David Leins","Robert Haschke","Klaus Neumann"],"pdf_url":"https://arxiv.org/pdf/2411.08622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08605v1","updated":"2024-11-13T13:45:54Z","published":"2024-11-13T13:45:54Z","title":"Lo-MARVE: A Low Cost Autonomous Underwater Vehicle for Marine\n Exploration","summary":" This paper presents Low-cost Marine Autonomous Robotic Vehicle Explorer\n(Lo-MARVE), a novel autonomous underwater vehicle (AUV) designed to provide a\nlow cost solution for underwater exploration and environmental monitoring in\nshallow water environments. Lo-MARVE offers a cost-effective alternative to\nexisting AUVs, featuring a modular design, low-cost sensors, and wireless\ncommunication capabilities. 
The total cost of Lo-MARVE is approximately EUR\n500. Lo-MARVE is developed using the Raspberry Pi 4B microprocessor, with\ncontrol software written in Python. The proposed AUV was validated through\nfield testing outside of a laboratory setting, in the freshwater environment of\nthe River Corrib in Galway, Ireland. This demonstrates its ability to navigate\nautonomously, collect data, and communicate effectively outside of a controlled\nlaboratory setting. The successful deployment of Lo-MARVE in a real-world\nenvironment validates its proof of concept.\n","authors":["Karl Mason","Daniel Kelly"],"pdf_url":"https://arxiv.org/pdf/2411.08605v1.pdf","comment":"This paper was presented at the 12th International Conference on\n Control, Mechatronics and Automation (ICCMA 2024), held in London, UK, from\n November 11-13, 2024"},{"id":"http://arxiv.org/abs/2411.08579v1","updated":"2024-11-13T12:51:49Z","published":"2024-11-13T12:51:49Z","title":"NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied\n Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN), as a widely discussed research\ndirection in embodied intelligence, aims to enable embodied agents to navigate\nin complicated visual environments through natural language commands. Most\nexisting VLN methods focus on indoor ground robot scenarios. However, when\napplied to UAV VLN in outdoor urban scenes, it faces two significant\nchallenges. First, urban scenes contain numerous objects, which makes it\nchallenging to match fine-grained landmarks in images with complex textual\ndescriptions of these landmarks. Second, overall environmental information\nencompasses multiple modal dimensions, and the diversity of representations\nsignificantly increases the complexity of the encoding process. To address\nthese challenges, we propose NavAgent, the first urban UAV embodied navigation\nmodel driven by a large Vision-Language Model. 
NavAgent undertakes navigation\ntasks by synthesizing multi-scale environmental information, including\ntopological maps (global), panoramas (medium), and fine-grained landmarks\n(local). Specifically, we utilize GLIP to build a visual recognizer for\nlandmark capable of identifying and linguisticizing fine-grained landmarks.\nSubsequently, we develop dynamically growing scene topology map that integrate\nenvironmental information and employ Graph Convolutional Networks to encode\nglobal environmental data. In addition, to train the visual recognizer for\nlandmark, we develop NavAgent-Landmark2K, the first fine-grained landmark\ndataset for real urban street scenes. In experiments conducted on the Touchdown\nand Map2seq datasets, NavAgent outperforms strong baseline models. The code and\ndataset will be released to the community to facilitate the exploration and\ndevelopment of outdoor VLN.\n","authors":["Youzhi Liu","Fanglong Yao","Yuanchang Yue","Guangluan Xu","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2411.08579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07590v2","updated":"2024-11-13T12:39:41Z","published":"2024-11-12T07:06:45Z","title":"Multiple noncooperative targets encirclement by relative distance-based\n positioning and neural antisynchronization control","summary":" From prehistoric encirclement for hunting to GPS orbiting the earth for\npositioning, target encirclement has numerous real world applications. However,\nencircling multiple non-cooperative targets in GPS-denied environments remains\nchallenging. 
In this work, multiple targets encirclement by using a minimum of\ntwo tasking agents, is considered where the relative distance measurements\nbetween the agents and the targets can be obtained by using onboard sensors.\nBased on the measurements, the center of all the targets is estimated directly\nby a fuzzy wavelet neural network (FWNN) and the least squares fit method.\nThen, a new distributed anti-synchronization controller (DASC) is designed so\nthat the two tasking agents are able to encircle all targets while staying\nopposite to each other. In particular, the radius of the desired encirclement\ntrajectory can be dynamically determined to avoid potential collisions between\nthe two agents and all targets. Based on the Lyapunov stability analysis\nmethod, the convergence proofs of the neural network prediction error, the\ntarget-center position estimation error, and the controller error are addressed\nrespectively. Finally, both numerical simulations and UAV flight experiments\nare conducted to demonstrate the validity of the encirclement algorithms. The\nflight tests recorded video and other simulation results can be found in\nhttps://youtu.be/B8uTorBNrl4.\n","authors":["Fen Liu","Shenghai Yuan","Wei Meng","Rong Su","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2411.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08566v1","updated":"2024-11-13T12:26:08Z","published":"2024-11-13T12:26:08Z","title":"Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space\n Exploration by Reinforcement Learning Agent","summary":" Grasping by a robot in unstructured environments is deemed a critical\nchallenge because of the requirement for effective adaptation to a wide\nvariation in object geometries, material properties, and other environmental\nfactors. 
In this paper, we propose a novel framework for robotic grasping based\non the idea of compressing high-dimensional target and gripper features in a\ncommon latent space using a set of autoencoders. Our approach simplifies\ngrasping by using three autoencoders dedicated to the target, the gripper, and\na third one that fuses their latent representations. This allows the RL agent\nto achieve higher learning rates at the initial stages of exploration of a new\nenvironment, as well as at non-zero shot grasp attempts. The agent explores the\nlatent space of the third autoencoder for better quality grasp without explicit\nreconstruction of objects. By implementing the PoWER algorithm into the RL\ntraining process, updates on the agent's policy will be made through the\nperturbation in the reward-weighted latent space. The successful exploration\nefficiently constrains both position and pose integrity for feasible executions\nof grasps. We evaluate our system on a diverse set of objects, demonstrating\nthe high success rate in grasping with minimum computational overhead. We found\nthat approach enhances the adaptation of the RL agent by more than 35 \\% in\nsimulation experiments.\n","authors":["Leonidas Askianakis"],"pdf_url":"https://arxiv.org/pdf/2411.08566v1.pdf","comment":"Submitted for review at IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2411.08533v1","updated":"2024-11-13T11:29:14Z","published":"2024-11-13T11:29:14Z","title":"ACROSS: A Deformation-Based Cross-Modal Representation for Robotic\n Tactile Perception","summary":" Tactile perception is essential for human interaction with the environment\nand is becoming increasingly crucial in robotics. Tactile sensors like the\nBioTac mimic human fingertips and provide detailed interaction data. 
Despite\nits utility in applications like slip detection and object identification, this\nsensor is now deprecated, making many existing valuable datasets obsolete.\nHowever, recreating similar datasets with newer sensor technologies is both\ntedious and time-consuming. Therefore, it is crucial to adapt these existing\ndatasets for use with new setups and modalities. In response, we introduce\nACROSS, a novel framework for translating data between tactile sensors by\nexploiting sensor deformation information. We demonstrate the approach by\ntranslating BioTac signals into the DIGIT sensor. Our framework consists of\nfirst converting the input signals into 3D deformation meshes. We then\ntransition from the 3D deformation mesh of one sensor to the mesh of another,\nand finally convert the generated 3D deformation mesh into the corresponding\noutput space. We demonstrate our approach to the most challenging problem of\ngoing from a low-dimensional tactile representation to a high-dimensional one.\nIn particular, we transfer the tactile signals of a BioTac sensor to DIGIT\ntactile images. Our approach enables the continued use of valuable datasets and\nthe exchange of data between groups with different setups.\n","authors":["Wadhah Zai El Amri","Malte Kuhlmann","Nicolás Navarro-Guerrero"],"pdf_url":"https://arxiv.org/pdf/2411.08533v1.pdf","comment":"Paper Submitted to ICRA2025. arXiv admin note: text overlap with\n arXiv:2410.14310"},{"id":"http://arxiv.org/abs/2411.08499v1","updated":"2024-11-13T10:32:52Z","published":"2024-11-13T10:32:52Z","title":"Learning Robust Grasping Strategy Through Tactile Sensing and Adaption\n Skill","summary":" Robust grasping represents an essential task in robotics, necessitating\ntactile feedback and reactive grasping adjustments for robust grasping of\nobjects. 
Previous research has extensively combined tactile sensing with\ngrasping, primarily relying on rule-based approaches, frequently neglecting\npost-grasping difficulties such as external disruptions or inherent\nuncertainties of the object's physics and geometry. To address these\nlimitations, this paper introduces an human-demonstration-based adaptive\ngrasping policy base on tactile, which aims to achieve robust gripping while\nresisting disturbances to maintain grasp stability. Our trained model\ngeneralizes to daily objects with seven different sizes, shapes, and textures.\nExperimental results demonstrate that our method performs well in dynamic and\nforce interaction tasks and exhibits excellent generalization ability.\n","authors":["Yueming Hu","Mengde Li","Songhua Yang","Xuetao Li","Sheng Liu","Miao Li"],"pdf_url":"https://arxiv.org/pdf/2411.08499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08474v1","updated":"2024-11-13T09:47:00Z","published":"2024-11-13T09:47:00Z","title":"A Cost-effective, Stand-alone, and Real-time TinyML-Based Gait Diagnosis\n Unit Aimed at Lower-limb Robotic Prostheses and Exoskeletons","summary":" Robotic prostheses and exoskeletons can do wonders compared to their\nnon-robotic counterpart. However, in a cost-soaring world where 1 in every 10\npatients has access to normal medical prostheses, access to advanced ones is,\nunfortunately, extremely limited especially due to their high cost, a\nsignificant portion of which is contributed to by the diagnosis and controlling\nunits. However, affordability is often not a major concern for developing such\ndevices as with cost reduction, performance is also found to be deducted due to\nthe cost vs. performance trade-off. Considering the gravity of such\ncircumstances, the goal of this research was to propose an affordable wearable\nreal-time gait diagnosis unit (GDU) aimed at robotic prostheses and\nexoskeletons. 
As a proof of concept, it has also developed the GDU prototype\nwhich leveraged TinyML to run two parallel quantized int8 models into an ESP32\nNodeMCU development board (7.30 USD) to effectively classify five gait\nscenarios (idle, walk, run, hopping, and skip) and generate an anomaly score\nbased on acceleration data received from two attached IMUs. The developed\nwearable gait diagnosis stand-alone unit could be fitted to any prosthesis or\nexoskeleton and could effectively classify the gait scenarios with an overall\naccuracy of 92% and provide anomaly scores within 95-96 ms with only 3 seconds\nof gait data in real-time.\n","authors":["Zarin Anjum Madhiha","Antar Mazumder","Sohani Munteha Hiam"],"pdf_url":"https://arxiv.org/pdf/2411.08474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08447v1","updated":"2024-11-13T08:59:53Z","published":"2024-11-13T08:59:53Z","title":"Learning Dynamic Cognitive Map with Autonomous Navigation","summary":" Inspired by animal navigation strategies, we introduce a novel computational\nmodel to navigate and map a space rooted in biologically inspired principles.\nAnimals exhibit extraordinary navigation prowess, harnessing memory,\nimagination, and strategic decision-making to traverse complex and aliased\nenvironments adeptly. Our model aims to replicate these capabilities by\nincorporating a dynamically expanding cognitive map over predicted poses within\nan Active Inference framework, enhancing our agent's generative model\nplasticity to novelty and environmental changes. Through structure learning and\nactive inference navigation, our model demonstrates efficient exploration and\nexploitation, dynamically expanding its model capacity in response to\nanticipated novel un-visited locations and updating the map given new evidence\ncontradicting previous beliefs. 
Comparative analyses in mini-grid environments\nwith the Clone-Structured Cognitive Graph model (CSCG), which shares similar\nobjectives, highlight our model's ability to rapidly learn environmental\nstructures within a single episode, with minimal navigation overlap. Our model\nachieves this without prior knowledge of observation and world dimensions,\nunderscoring its robustness and efficacy in navigating intricate environments.\n","authors":["Daria de Tinguy","Tim Verbelen","Bart Dhoedt"],"pdf_url":"https://arxiv.org/pdf/2411.08447v1.pdf","comment":"under submission at Frontiers Computer Neuroscience"},{"id":"http://arxiv.org/abs/2411.08433v1","updated":"2024-11-13T08:34:07Z","published":"2024-11-13T08:34:07Z","title":"3D Multi-Object Tracking with Semi-Supervised GRU-Kalman Filter","summary":" 3D Multi-Object Tracking (MOT), a fundamental component of environmental\nperception, is essential for intelligent systems like autonomous driving and\nrobotic sensing. Although Tracking-by-Detection frameworks have demonstrated\nexcellent performance in recent years, their application in real-world\nscenarios faces significant challenges. Object movement in complex environments\nis often highly nonlinear, while existing methods typically rely on linear\napproximations of motion. Furthermore, system noise is frequently modeled as a\nGaussian distribution, which fails to capture the true complexity of the noise\ndynamics. These oversimplified modeling assumptions can lead to significant\nreductions in tracking precision. To address this, we propose a GRU-based MOT\nmethod, which introduces a learnable Kalman filter into the motion module. This\napproach is able to learn object motion characteristics through data-driven\nlearning, thereby avoiding the need for manual model design and model error. 
At\nthe same time, to avoid abnormal supervision caused by the wrong association\nbetween annotations and trajectories, we design a semi-supervised learning\nstrategy to accelerate the convergence speed and improve the robustness of the\nmodel. Evaluation experiment on the nuScenes and Argoverse2 datasets\ndemonstrates that our system exhibits superior performance and significant\npotential compared to traditional TBD methods.\n","authors":["Xiaoxiang Wang","Jiaxin Liu","Miaojie Feng","Zhaoxing Zhang","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04919v2","updated":"2024-11-13T08:32:27Z","published":"2024-11-07T17:56:16Z","title":"Stem-OB: Generalizable Visual Imitation Learning with Stem-Like\n Convergent Observation through Diffusion Inversion","summary":" Visual imitation learning methods demonstrate strong performance, yet they\nlack generalization when faced with visual input perturbations, including\nvariations in lighting and textures, impeding their real-world application. We\npropose Stem-OB that utilizes pretrained image diffusion models to suppress\nlow-level visual differences while maintaining high-level scene structures.\nThis image inversion process is akin to transforming the observation into a\nshared representation, from which other observations stem, with extraneous\ndetails removed. Stem-OB contrasts with data-augmentation approaches as it is\nrobust to various unspecified appearance changes without the need for\nadditional training. Our method is a simple yet highly effective plug-and-play\nsolution. Empirical results confirm the effectiveness of our approach in\nsimulated tasks and show an exceptionally significant improvement in real-world\napplications, with an average increase of 22.2% in success rates compared to\nthe best baseline. 
See https://hukz18.github.io/Stem-Ob/ for more info.\n","authors":["Kaizhe Hu","Zihang Rui","Yao He","Yuyao Liu","Pu Hua","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04919v2.pdf","comment":"Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/"},{"id":"http://arxiv.org/abs/2204.06343v3","updated":"2024-11-13T07:50:55Z","published":"2022-04-13T12:50:42Z","title":"Single-grasp deformable object discrimination: the effect of gripper\n morphology, sensing modalities, and action parameters","summary":" In haptic object discrimination, the effect of gripper embodiment, action\nparameters, and sensory channels has not been systematically studied. We used\ntwo anthropomorphic hands and two 2-finger grippers to grasp two sets of\ndeformable objects. On the object classification task, we found: (i) among\nclassifiers, SVM on sensory features and LSTM on raw time series performed best\nacross all grippers; (ii) faster compression speeds degraded performance; (iii)\ngeneralization to different grasping configurations was limited; transfer to\ndifferent compression speeds worked well for the Barrett Hand only.\nVisualization of the feature spaces using PCA showed that gripper morphology\nand action parameters were the main source of variance, making generalization\nacross embodiment or grip configurations very difficult. On the highly\nchallenging dataset consisting of polyurethane foams alone, only the Barrett\nHand achieved excellent performance. Tactile sensors can thus provide a key\nadvantage even if recognition is based on stiffness rather than shape. 
The data\nset with 24,000 measurements is publicly available.\n","authors":["Michal Pliska","Shubhan Patni","Michal Mares","Pavel Stoudek","Zdenek Straka","Karla Stepanova","Matej Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2204.06343v3.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.08400v1","updated":"2024-11-13T07:38:24Z","published":"2024-11-13T07:38:24Z","title":"BAMAX: Backtrack Assisted Multi-Agent Exploration using Reinforcement\n Learning","summary":" Autonomous robots collaboratively exploring an unknown environment is still\nan open problem. The problem has its roots in coordination among non-stationary\nagents, each with only a partial view of information. The problem is compounded\nwhen the multiple robots must completely explore the environment. In this\npaper, we introduce Backtrack Assisted Multi-Agent Exploration using\nReinforcement Learning (BAMAX), a method for collaborative exploration in\nmulti-agent systems which attempts to explore an entire virtual environment. As\nin the name, BAMAX leverages backtrack assistance to enhance the performance of\nagents in exploration tasks. To evaluate BAMAX against traditional approaches,\nwe present the results of experiments conducted across multiple hexagonal\nshaped grids sizes, ranging from 10x10 to 60x60. The results demonstrate that\nBAMAX outperforms other methods in terms of faster coverage and less\nbacktracking across these environments.\n","authors":["Geetansh Kalra","Amit Patel","Atul Chaudhari","Divye Singh"],"pdf_url":"https://arxiv.org/pdf/2411.08400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08395v1","updated":"2024-11-13T07:27:56Z","published":"2024-11-13T07:27:56Z","title":"MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion\n Prompt for Ultrasound Needle Tracking","summary":" Ultrasound (US)-guided needle insertion is widely employed in percutaneous\ninterventions. 
However, providing feedback on the needle tip position via US\nimage presents challenges due to noise, artifacts, and the thin imaging plane\nof US, which degrades needle features and leads to intermittent tip visibility.\nIn this paper, a Mamba-based US needle tracker MambaXCTrack utilizing\nstructured state space models cross-correlation (SSMX-Corr) and implicit motion\nprompt is proposed, which is the first application of Mamba in US needle\ntracking. The SSMX-Corr enhances cross-correlation by long-range modeling and\nglobal searching of distant semantic features between template and search maps,\nbenefiting the tracking under noise and artifacts by implicitly learning\npotential distant semantic cues. By combining with cross-map interleaved scan\n(CIS), local pixel-wise interaction with positional inductive bias can also be\nintroduced to SSMX-Corr. The implicit low-level motion descriptor is proposed\nas a non-visual prompt to enhance tracking robustness, addressing the\nintermittent tip visibility problem. Extensive experiments on a dataset with\nmotorized needle insertion in both phantom and tissue samples demonstrate that\nthe proposed tracker outperforms other state-of-the-art trackers while ablation\nstudies further highlight the effectiveness of each proposed tracking module.\n","authors":["Yuelin Zhang","Qingpeng Ding","Long Lei","Jiwei Shan","Wenxuan Xie","Tianyi Zhang","Wanquan Yan","Raymond Shing-Yan Tang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.08395v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.08389v1","updated":"2024-11-13T07:22:03Z","published":"2024-11-13T07:22:03Z","title":"Integrative Wrapping System for a Dual-Arm Humanoid Robot","summary":" Flexible object manipulation of paper and cloth is a major research challenge\nin robot manipulation. 
Although there have been efforts to develop hardware\nthat enables specific actions and to realize a single action of paper folding\nusing sim-to-real and learning, there have been few proposals for humanoid\nrobots and systems that enable continuous, multi-step actions of flexible\nmaterials. Wrapping an object with paper and tape is more complex and diverse\nthan traditional manipulation research due to the increased number of objects\nthat need to be handled, as well as the three-dimensionality of the operation.\nIn this research, necessary information is organized and coded based on the\ncharacteristics of each object handled in wrapping. We also generalize the\nhardware configuration, manipulation method, and recognition system that enable\nhumanoid wrapping operations. The system will include manipulation with\nadmittance control focusing on paper tension and state evaluation using point\nclouds to handle three-dimensional flexible objects. Finally, wrapping objects\nwith different shapes is experimented with to show the generality and\neffectiveness of the proposed system.\n","authors":["Yukina Iwata","Shun Hasegawa","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2411.08389v1.pdf","comment":"Accepted Humanoids2024"},{"id":"http://arxiv.org/abs/2411.08373v1","updated":"2024-11-13T06:46:18Z","published":"2024-11-13T06:46:18Z","title":"DG-SLAM: Robust Dynamic Gaussian Splatting SLAM with Hybrid Pose\n Optimization","summary":" Achieving robust and precise pose estimation in dynamic scenes is a\nsignificant research challenge in Visual Simultaneous Localization and Mapping\n(SLAM). 
Recent advancements integrating Gaussian Splatting into SLAM systems\nhave proven effective in creating high-quality renderings using explicit 3D\nGaussian models, significantly improving environmental reconstruction fidelity.\nHowever, these approaches depend on a static environment assumption and face\nchallenges in dynamic environments due to inconsistent observations of geometry\nand photometry. To address this problem, we propose DG-SLAM, the first robust\ndynamic visual SLAM system grounded in 3D Gaussians, which provides precise\ncamera pose estimation alongside high-fidelity reconstructions. Specifically,\nwe propose effective strategies, including motion mask generation, adaptive\nGaussian point management, and a hybrid camera tracking algorithm to improve\nthe accuracy and robustness of pose estimation. Extensive experiments\ndemonstrate that DG-SLAM delivers state-of-the-art performance in camera pose\nestimation, map reconstruction, and novel-view synthesis in dynamic scenes,\noutperforming existing methods meanwhile preserving real-time rendering\nability.\n","authors":["Yueming Xu","Haochen Jiang","Zhongyang Xiao","Jianfeng Feng","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08323v1","updated":"2024-11-13T04:11:03Z","published":"2024-11-13T04:11:03Z","title":"Efficient Trajectory Generation in 3D Environments with Multi-Level Map\n Construction","summary":" We propose a robust and efficient framework to generate global trajectories\nfor ground robots in complex 3D environments. The proposed method takes point\ncloud as input and efficiently constructs a multi-level map using triangular\npatches as the basic elements. A kinematic path search is adopted on the\npatches, where motion primitives on different patches combine to form the\nglobal min-time cost initial trajectory. 
We use a same-level expansion method\nto locate the nearest obstacle for each trajectory waypoint and construct an\nobjective function with curvature, smoothness and obstacle terms for\noptimization. We evaluate the method on several complex 3D point cloud maps.\nCompared to existing methods, our method demonstrates higher robustness to\npoint cloud noise, enabling the generation of high quality trajectory while\nmaintaining high computational efficiency. Our code will be publicly available\nat https://github.com/ck-tian/MLMC-planner.\n","authors":["Chengkun Tian","Xiaohui Gao","Yongguang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03928v3","updated":"2024-11-13T03:47:03Z","published":"2024-11-06T14:03:49Z","title":"DEIO: Deep Event Inertial Odometry","summary":" Event cameras are bio-inspired, motion-activated sensors that demonstrate\nimpressive potential in handling challenging situations, such as motion blur\nand high-dynamic range. Despite their promise, existing event-based\nsimultaneous localization and mapping (SLAM) approaches exhibit limited\nperformance in real-world applications. On the other hand, state-of-the-art\nSLAM approaches that incorporate deep neural networks for better robustness and\napplicability. However, these is a lack of research in fusing learning-based\nevent SLAM methods with IMU, which could be indispensable to push the\nevent-based SLAM to large-scale, low-texture or complex scenarios. In this\npaper, we propose DEIO, the first monocular deep event-inertial odometry\nframework that combines learning-based method with traditional nonlinear\ngraph-based optimization. Specifically, we tightly integrate a trainable\nevent-based differentiable bundle adjustment (e-DBA) with the IMU\npre-integration in a factor graph which employs keyframe-based sliding window\noptimization. 
Numerical Experiments in nine public challenge datasets show that\nour method can achieve superior performance compared with the image-based and\nevent-based benchmarks. The source code is available at:\nhttps://github.com/arclab-hku/DEIO.\n","authors":["Weipeng Guan","Fuling Lin","Peiyu Chen","Peng Lu"],"pdf_url":"https://arxiv.org/pdf/2411.03928v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11698v2","updated":"2024-11-13T03:25:12Z","published":"2023-09-21T00:54:18Z","title":"Rendering Stable Features Improves Sampling-Based Localisation with\n Neural Radiance Fields","summary":" Neural radiance fields (NeRFs) are a powerful tool for implicit scene\nrepresentations, allowing for differentiable rendering and the ability to make\npredictions about unseen viewpoints. There has been growing interest in object\nand scene-based localisation using NeRFs, with a number of recent works relying\non sampling-based or Monte-Carlo localisation schemes. Unfortunately, these can\nbe extremely computationally expensive, requiring multiple network forward\npasses to infer camera or object pose. To alleviate this, a variety of sampling\nstrategies have been applied, many relying on keypoint recognition techniques\nfrom classical computer vision. This work conducts a systematic empirical\ncomparison of these approaches and shows that in contrast to conventional\nfeature matching approaches for geometry-based localisation, sampling-based\nlocalisation using NeRFs benefits significantly from stable features. 
Results\nshow that rendering stable features provides significantly better estimation\nwith a tenfold reduction in the number of forward passes required.\n","authors":["Boxuan Zhang","Lindsay Kleeman","Michael Burke"],"pdf_url":"https://arxiv.org/pdf/2309.11698v2.pdf","comment":"Accepted at the 2024 Australasian Conference on Robotics and\n Automation (ACRA 2024)"},{"id":"http://arxiv.org/abs/2411.07954v2","updated":"2024-11-13T02:56:56Z","published":"2024-11-12T17:30:31Z","title":"Learning Memory Mechanisms for Decision Making through Demonstrations","summary":" In Partially Observable Markov Decision Processes, integrating an agent's\nhistory into memory poses a significant challenge for decision-making.\nTraditional imitation learning, relying on observation-action pairs for expert\ndemonstrations, fails to capture the expert's memory mechanisms used in\ndecision-making. To capture memory processes as demonstrations, we introduce\nthe concept of memory dependency pairs $(p, q)$ indicating that events at time\n$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner\nto leverage memory dependency pairs in Transformers and find significant\nimprovements across several tasks compared to standard Transformers when\nevaluated on Memory Gym and the Long-term Memory Benchmark. Code is available\nat https://github.com/WilliamYue37/AttentionTuner.\n","authors":["William Yue","Bo Liu","Peter Stone"],"pdf_url":"https://arxiv.org/pdf/2411.07954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16981v2","updated":"2024-11-13T02:49:58Z","published":"2024-10-22T13:01:21Z","title":"Proleptic Temporal Ensemble for Improving the Speed of Robot Tasks\n Generated by Imitation Learning","summary":" Imitation learning, which enables robots to learn behaviors from\ndemonstrations by human, has emerged as a promising solution for generating\nrobot motions in such environments. 
The imitation learning-based robot motion\ngeneration method, however, has the drawback of depending on the demonstrator's\ntask execution speed. This paper presents a novel temporal ensemble approach\napplied to imitation learning algorithms, allowing for execution of future\nactions. The proposed method leverages existing demonstration data and\npre-trained policies, offering the advantages of requiring no additional\ncomputation and being easy to implement. The algorithms performance was\nvalidated through real-world experiments involving robotic block color sorting,\ndemonstrating up to 3x increase in task execution speed while maintaining a\nhigh success rate compared to the action chunking with transformer method. This\nstudy highlights the potential for significantly improving the performance of\nimitation learning-based policies, which were previously limited by the\ndemonstrator's speed. It is expected to contribute substantially to future\nadvancements in autonomous object manipulation technologies aimed at enhancing\nproductivity.\n","authors":["Hyeonjun Park","Daegyu Lim","Seungyeon Kim","Sumin Park"],"pdf_url":"https://arxiv.org/pdf/2410.16981v2.pdf","comment":"This paper was initially submitted to The Journal of Korea Robotics\n Society on Oct. 22, 2024, and a revised version was submitted on Nov. 13,\n 2024. It is currently under review"},{"id":"http://arxiv.org/abs/2409.14615v2","updated":"2024-11-13T02:17:52Z","published":"2024-09-22T22:43:54Z","title":"A Comparative Study on State-Action Spaces for Learning Viewpoint\n Selection and Manipulation with Diffusion Policy","summary":" Robotic manipulation tasks often rely on static cameras for perception, which\ncan limit flexibility, particularly in scenarios like robotic surgery and\ncluttered environments where mounting static cameras is impractical. 
Ideally,\nrobots could jointly learn a policy for dynamic viewpoint and manipulation.\nHowever, it remains unclear which state-action space is most suitable for this\ncomplex learning process. To enable manipulation with dynamic viewpoints and to\nbetter understand impacts from different state-action spaces on this policy\nlearning process, we conduct a comparative study on the state-action spaces for\npolicy learning and their impacts on the performance of visuomotor policies\nthat integrate viewpoint selection with manipulation. Specifically, we examine\nthe configuration space of the robotic system, the end-effector space with a\ndual-arm Inverse Kinematics (IK) solver, and the reduced end-effector space\nwith a look-at IK solver to optimize rotation for viewpoint selection. We also\nassess variants with different rotation representations. Our results\ndemonstrate that state-action spaces utilizing Euler angles with the look-at IK\nachieve superior task success rates compared to other spaces. Further analysis\nsuggests that these performance differences are driven by inherent variations\nin the high-frequency components across different state-action spaces and\nrotation representations.\n","authors":["Xiatao Sun","Francis Fan","Yinxing Chen","Daniel Rakita"],"pdf_url":"https://arxiv.org/pdf/2409.14615v2.pdf","comment":"Submitted to ICRA 2025. Website:\n https://apollo-lab-yale.github.io/spaces_comparative_study/"},{"id":"http://arxiv.org/abs/2411.08281v1","updated":"2024-11-13T01:42:12Z","published":"2024-11-13T01:42:12Z","title":"When to Localize? A POMDP Approach","summary":" Robots often localize to lower navigational errors and facilitate downstream,\nhigh-level tasks. 
However, a robot may want to selectively localize when\nlocalization is costly (such as with resource-constrained robots) or\ninefficient (for example, submersibles that need to surface), especially when\nnavigating in environments with variable numbers of hazards such as obstacles\nand shipping lanes. In this study, we propose a method that helps a robot\ndetermine ``when to localize'' to 1) minimize such actions and 2) not exceed\nthe probability of failure (such as surfacing within high-traffic shipping\nlanes). We formulate our method as a Constrained Partially Observable Markov\nDecision Process and use the Cost-Constrained POMCP solver to plan the robot's\nactions. The solver simulates failure probabilities to decide if a robot moves\nto its goal or localizes to prevent failure. We performed numerical experiments\nwith multiple baselines.\n","authors":["Troi Williams","Kasra Torshizi","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2411.08281v1.pdf","comment":"Accepted to the 2024 IEEE International Symposium on Safety,\n Security, and Rescue Robotics (SSRR). 6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08279v1","updated":"2024-11-13T01:38:06Z","published":"2024-11-13T01:38:06Z","title":"MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields\n Representation","summary":" Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and\n3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in\nSimultaneous Localization and Mapping (SLAM) for photo-realistic rendering,\nparticularly when using high-quality video sequences as input. However,\nexisting methods struggle with motion-blurred frames, which are common in\nreal-world scenarios like low-light or long-exposure conditions. This often\nresults in a significant reduction in both camera localization accuracy and map\nreconstruction quality. To address this challenge, we propose a dense visual\nSLAM pipeline (i.e. 
MBA-SLAM) to handle severe motion-blurred inputs. Our\napproach integrates an efficient motion blur-aware tracker with either neural\nradiance fields or Gaussian Splatting based mapper. By accurately modeling the\nphysical image formation process of motion-blurred images, our method\nsimultaneously learns 3D scene representation and estimates the cameras' local\ntrajectory during exposure time, enabling proactive compensation for motion\nblur caused by camera movement. In our experiments, we demonstrate that\nMBA-SLAM surpasses previous state-of-the-art methods in both camera\nlocalization and map reconstruction, showcasing superior performance across a\nrange of datasets, including synthetic and real datasets featuring sharp images\nas well as those affected by motion blur, highlighting the versatility and\nrobustness of our approach. Code is available at\nhttps://github.com/WU-CVGL/MBA-SLAM.\n","authors":["Peng Wang","Lingzhe Zhao","Yin Zhang","Shiyu Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08261v1","updated":"2024-11-13T00:24:43Z","published":"2024-11-13T00:24:43Z","title":"Control of Biohybrid Actuators using NeuroEvolution","summary":" In medical-related tasks, soft robots can perform better than conventional\nrobots because of their compliant building materials and the movements they are\nable perform. However, designing soft robot controllers is not an easy task,\ndue to the non-linear properties of their materials. Since human expertise to\ndesign such controllers is yet not sufficiently effective, a formal design\nprocess is needed. The present research proposes neuroevolution-based\nalgorithms as the core mechanism to automatically generate controllers for\nbiohybrid actuators that can be used on future medical devices, such as a\ncatheter that will deliver drugs. 
The controllers generated by methodologies\nbased on Neuroevolution of Augmenting Topologies (NEAT) and Hypercube-based\nNEAT (HyperNEAT) are compared against the ones generated by a standard genetic\nalgorithm (SGA). In specific, the metrics considered are the maximum\ndisplacement in upward bending movement and the robustness to control different\nbiohybrid actuator morphologies without redesigning the control strategy.\nResults indicate that the neuroevolution-based algorithms produce better suited\ncontrollers than the SGA. In particular, NEAT designed the best controllers,\nachieving up to 25% higher displacement when compared with SGA-produced\nspecialised controllers trained over a single morphology and 23% when compared\nwith general purpose controllers trained over a set of morphologies.\n","authors":["Hugo Alcaraz-Herrera","Michail-Antisthenis Tsompanas","Andrew Adamatzky","Igor Balaz"],"pdf_url":"https://arxiv.org/pdf/2411.08261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08253v1","updated":"2024-11-13T00:02:32Z","published":"2024-11-13T00:02:32Z","title":"Open-World Task and Motion Planning via Vision-Language Model Inferred\n Constraints","summary":" Foundation models trained on internet-scale data, such as Vision-Language\nModels (VLMs), excel at performing tasks involving common sense, such as visual\nquestion answering. Despite their impressive capabilities, these models cannot\ncurrently be directly applied to challenging robot manipulation problems that\nrequire complex and precise continuous reasoning. Task and Motion Planning\n(TAMP) systems can control high-dimensional continuous systems over long\nhorizons through combining traditional primitive robot operations. However,\nthese systems require detailed model of how the robot can impact its\nenvironment, preventing them from directly interpreting and addressing novel\nhuman objectives, for example, an arbitrary natural language goal. 
We propose\ndeploying VLMs within TAMP systems by having them generate discrete and\ncontinuous language-parameterized constraints that enable TAMP to reason about\nopen-world concepts. Specifically, we propose algorithms for VLM partial\nplanning that constrain a TAMP system's discrete temporal search and VLM\ncontinuous constraints interpretation to augment the traditional manipulation\nconstraints that TAMP systems seek to satisfy. We demonstrate our approach on\ntwo robot embodiments, including a real world robot, across several\nmanipulation tasks, where the desired objectives are conveyed solely through\nlanguage.\n","authors":["Nishanth Kumar","Fabio Ramos","Dieter Fox","Caelan Reed Garrett"],"pdf_url":"https://arxiv.org/pdf/2411.08253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13777v2","updated":"2024-11-13T23:46:42Z","published":"2024-03-20T17:41:21Z","title":"Embedding Pose Graph, Enabling 3D Foundation Model Capabilities with a\n Compact Representation","summary":" This paper presents the Embedding Pose Graph (EPG), an innovative method that\ncombines the strengths of foundation models with a simple 3D representation\nsuitable for robotics applications. Addressing the need for efficient spatial\nunderstanding in robotics, EPG provides a compact yet powerful approach by\nattaching foundation model features to the nodes of a pose graph. Unlike\ntraditional methods that rely on bulky data formats like voxel grids or point\nclouds, EPG is lightweight and scalable. It facilitates a range of robotic\ntasks, including open-vocabulary querying, disambiguation, image-based\nquerying, language-directed navigation, and re-localization in 3D environments.\nWe showcase the effectiveness of EPG in handling these tasks, demonstrating its\ncapacity to improve how robots interact with and navigate through complex\nspaces. 
Through both qualitative and quantitative assessments, we illustrate\nEPG's strong performance and its ability to outperform existing methods in\nre-localization. Our work introduces a crucial step forward in enabling robots\nto efficiently understand and operate within large-scale 3D spaces.\n","authors":["Hugues Thomas","Mouli Sivapurapu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04883v6","updated":"2024-11-13T23:34:19Z","published":"2022-08-09T16:25:49Z","title":"Neural-Rendezvous: Provably Robust Guidance and Control to Encounter\n Interstellar Objects","summary":" Interstellar objects (ISOs) are likely representatives of primitive materials\ninvaluable in understanding exoplanetary star systems. Due to their poorly\nconstrained orbits with generally high inclinations and relative velocities,\nhowever, exploring ISOs with conventional human-in-the-loop approaches is\nsignificantly challenging. This paper presents Neural-Rendezvous -- a deep\nlearning-based guidance and control framework for encountering fast-moving\nobjects, including ISOs, robustly, accurately, and autonomously in real time.\nIt uses pointwise minimum norm tracking control on top of a guidance policy\nmodeled by a spectrally-normalized deep neural network, where its\nhyperparameters are tuned with a loss function directly penalizing the MPC\nstate trajectory tracking error. We show that Neural-Rendezvous provides a high\nprobability exponential bound on the expected spacecraft delivery error, the\nproof of which leverages stochastic incremental stability analysis. In\nparticular, it is used to construct a non-negative function with a\nsupermartingale property, explicitly accounting for the ISO state uncertainty\nand the local nature of nonlinear state estimation guarantees. In numerical\nsimulations, Neural-Rendezvous is demonstrated to satisfy the expected error\nbound for 100 ISO candidates. 
This performance is also empirically validated\nusing our spacecraft simulator and in high-conflict and distributed UAV swarm\nreconfiguration with up to 20 UAVs.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Yashwanth Kumar Nakka","Benjamin Donitz","Declan Mages","Michel Ingham"],"pdf_url":"https://arxiv.org/pdf/2208.04883v6.pdf","comment":"Preprint Version, Accepted: October, 2024 (One-minute YouTube\n summary: https://youtu.be/q3e0LYS2IYQ, DOI:\n https://doi.org/10.2514/1.G007671)"},{"id":"http://arxiv.org/abs/2411.09062v1","updated":"2024-11-13T22:43:15Z","published":"2024-11-13T22:43:15Z","title":"Multimodal Object Detection using Depth and Image Data for Manufacturing\n Parts","summary":" Manufacturing requires reliable object detection methods for precise picking\nand handling of diverse types of manufacturing parts and components.\nTraditional object detection methods utilize either only 2D images from cameras\nor 3D data from lidars or similar 3D sensors. However, each of these sensors\nhave weaknesses and limitations. Cameras do not have depth perception and 3D\nsensors typically do not carry color information. These weaknesses can\nundermine the reliability and robustness of industrial manufacturing systems.\nTo address these challenges, this work proposes a multi-sensor system combining\nan red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are\ncalibrated for precise alignment of the multimodal data captured from the two\nhardware devices. A novel multimodal object detection method is developed to\nprocess both RGB and depth data. This object detector is based on the Faster\nR-CNN baseline that was originally designed to process only camera images. The\nresults show that the multimodal model significantly outperforms the depth-only\nand RGB-only baselines on established object detection metrics. 
More\nspecifically, the multimodal model improves mAP by 13% and raises Mean\nPrecision by 11.8% in comparison to the RGB-only baseline. Compared to the\ndepth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%.\nHence, this method facilitates more reliable and robust object detection in\nservice to smart manufacturing applications.\n","authors":["Nazanin Mahjourian","Vinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09052v1","updated":"2024-11-13T22:15:31Z","published":"2024-11-13T22:15:31Z","title":"ClevrSkills: Compositional Language and Visual Reasoning in Robotics","summary":" Robotics tasks are highly compositional by nature. For example, to perform a\nhigh-level task like cleaning the table a robot must employ low-level\ncapabilities of moving the effectors to the objects on the table, pick them up\nand then move them off the table one-by-one, while re-evaluating the\nconsequently dynamic scenario in the process. Given that large vision language\nmodels (VLMs) have shown progress on many tasks that require high level,\nhuman-like reasoning, we ask the question: if the models are taught the\nrequisite low-level capabilities, can they compose them in novel ways to\nachieve interesting high-level tasks like cleaning the table without having to\nbe explicitly taught so? To this end, we present ClevrSkills - a benchmark\nsuite for compositional reasoning in robotics. ClevrSkills is an environment\nsuite developed on top of the ManiSkill2 simulator and an accompanying dataset.\nThe dataset contains trajectories generated on a range of robotics tasks with\nlanguage and visual annotations as well as multi-modal prompts as task\nspecification. The suite includes a curriculum of tasks with three levels of\ncompositional understanding, starting with simple tasks requiring basic motor\nskills. 
We benchmark multiple different VLM baselines on ClevrSkills and show\nthat even after being pre-trained on large numbers of tasks, these models fail\non compositional reasoning in robotics tasks.\n","authors":["Sanjay Haresh","Daniel Dijkman","Apratim Bhattacharyya","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2411.09052v1.pdf","comment":"To appear at NeurIPS 2024 (D&B track)"},{"id":"http://arxiv.org/abs/2411.09022v1","updated":"2024-11-13T20:59:30Z","published":"2024-11-13T20:59:30Z","title":"DART-LLM: Dependency-Aware Multi-Robot Task Decomposition and Execution\n using Large Language Models","summary":" Large Language Models (LLMs) have demonstrated significant reasoning\ncapabilities in robotic systems. However, their deployment in multi-robot\nsystems remains fragmented and struggles to handle complex task dependencies\nand parallel execution. This study introduces the DART-LLM (Dependency-Aware\nMulti-Robot Task Decomposition and Execution using Large Language Models)\nsystem, designed to address these challenges. DART-LLM utilizes LLMs to parse\nnatural language instructions, decomposing them into multiple subtasks with\ndependencies to establish complex task sequences, thereby enhancing efficient\ncoordination and parallel execution in multi-robot systems. The system includes\nthe QA LLM module, Breakdown Function modules, Actuation module, and a\nVision-Language Model (VLM)-based object detection module, enabling task\ndecomposition and execution from natural language instructions to robotic\nactions. Experimental results demonstrate that DART-LLM excels in handling\nlong-horizon tasks and collaborative tasks with complex dependencies. Even when\nusing smaller models like Llama 3.1 8B, the system achieves good performance,\nhighlighting DART-LLM's robustness in terms of model size. 
Please refer to the\nproject website \\url{https://wyd0817.github.io/project-dart-llm/} for videos\nand code.\n","authors":["Yongdong Wang","Runze Xiao","Jun Younes Louhi Kasahara","Ryosuke Yajima","Keiji Nagatani","Atsushi Yamashita","Hajime Asama"],"pdf_url":"https://arxiv.org/pdf/2411.09022v1.pdf","comment":"Submitted to the 2025 IEEE International Conference on Robotics &\n Automation on September 15, 2024"},{"id":"http://arxiv.org/abs/2411.09020v1","updated":"2024-11-13T20:59:21Z","published":"2024-11-13T20:59:21Z","title":"Predictive Visuo-Tactile Interactive Perception Framework for Object\n Properties Inference","summary":" Interactive exploration of the unknown physical properties of objects such as\nstiffness, mass, center of mass, friction coefficient, and shape is crucial for\nautonomous robotic systems operating continuously in unstructured environments.\nPrecise identification of these properties is essential to manipulate objects\nin a stable and controlled way, and is also required to anticipate the outcomes\nof (prehensile or non-prehensile) manipulation actions such as pushing,\npulling, lifting, etc. Our study focuses on autonomously inferring the physical\nproperties of a diverse set of various homogeneous, heterogeneous, and\narticulated objects utilizing a robotic system equipped with vision and tactile\nsensors. We propose a novel predictive perception framework for identifying\nobject properties of the diverse objects by leveraging versatile exploratory\nactions: non-prehensile pushing and prehensile pulling. As part of the\nframework, we propose a novel active shape perception to seamlessly initiate\nexploration. Our innovative dual differentiable filtering with Graph Neural\nNetworks learns the object-robot interaction and performs consistent inference\nof indirectly observable time-invariant object properties. 
In addition, we\nformulate a $N$-step information gain approach to actively select the most\ninformative actions for efficient learning and inference. Extensive real-robot\nexperiments with planar objects show that our predictive perception framework\nresults in better performance than the state-of-the-art baseline and\ndemonstrate our framework in three major applications for i) object tracking,\nii) goal-driven task, and iii) change in environment detection.\n","authors":["Anirvan Dutta","Etienne Burdet","Mohsen Kaboli"],"pdf_url":"https://arxiv.org/pdf/2411.09020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12649v2","updated":"2024-11-13T20:00:39Z","published":"2024-10-16T15:13:27Z","title":"Faster Algorithms for Growing Collision-Free Convex Polytopes in Robot\n Configuration Space","summary":" We propose two novel algorithms for constructing convex collision-free\npolytopes in robot configuration space. Finding these polytopes enables the\napplication of stronger motion-planning frameworks such as trajectory\noptimization with Graphs of Convex Sets [1] and is currently a major roadblock\nin the adoption of these approaches. In this paper, we build upon IRIS-NP\n(Iterative Regional Inflation by Semidefinite & Nonlinear Programming) [2] to\nsignificantly improve tunability, runtimes, and scaling to complex\nenvironments. IRIS-NP uses nonlinear programming paired with uniform random\ninitialization to find configurations on the boundary of the free configuration\nspace. Our key insight is that finding near-by configuration-space obstacles\nusing sampling is inexpensive and greatly accelerates region generation. We\npropose two algorithms using such samples to either employ nonlinear\nprogramming more efficiently (IRIS-NP2 ) or circumvent it altogether using a\nmassively-parallel zero-order optimization strategy (IRIS-ZO). 
We also propose\na termination condition that controls the probability of exceeding a\nuser-specified permissible fraction-in-collision, eliminating a significant\nsource of tuning difficulty in IRIS-NP. We compare performance across eight\nrobot environments, showing that IRIS-ZO achieves an order-of-magnitude speed\nadvantage over IRIS-NP. IRISNP2, also significantly faster than IRIS-NP, builds\nlarger polytopes using fewer hyperplanes, enabling faster downstream\ncomputation. Website: https://sites.google.com/view/fastiris\n","authors":["Peter Werner","Thomas Cohn","Rebecca H. Jiang","Tim Seyde","Max Simchowitz","Russ Tedrake","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2410.12649v2.pdf","comment":"16 pages, 6 figures, accepted for publication in the proceedings of\n the International Symposium for Robotics Research 2024"},{"id":"http://arxiv.org/abs/2411.08999v1","updated":"2024-11-13T19:45:47Z","published":"2024-11-13T19:45:47Z","title":"Learning-Based Control Barrier Function with Provably Safe Guarantees:\n Reducing Conservatism with Heading-Aware Safety Margin","summary":" We propose a learning-based Control Barrier Function (CBF) to reduce\nconservatism in collision avoidance of car-like robots. Traditional CBFs often\nuse Euclidean distance between robots' centers as safety margin, neglecting\nheadings and simplifying geometries to circles. While this ensures smooth,\ndifferentiable safety functions required by CBFs, it can be overly conservative\nin tight environments. To address this limitation, we design a heading-aware\nsafety margin that accounts for the robots' orientations, enabling a less\nconservative and more accurate estimation of safe regions. Since the function\ncomputing this safety margin is non-differentiable, we approximate it with a\nneural network to ensure differentiability and facilitate integration with\nCBFs. 
We describe how we achieve bounded learning error and incorporate the\nupper bound into the CBF to provide formal safety guarantees through forward\ninvariance. We show that our CBF is a high-order CBF with relative degree two\nfor a system with two robots whose dynamics are modeled by the nonlinear\nkinematic bicycle model. Experimental results in overtaking and bypassing\nscenarios reveal a 33.5 % reduction in conservatism compared to traditional\nmethods, while maintaining safety. Code: https://github.com/bassamlab/sigmarl\n","authors":["Jianye Xu","Bassam Alrifaee"],"pdf_url":"https://arxiv.org/pdf/2411.08999v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.07293v2","updated":"2024-11-13T17:49:27Z","published":"2024-09-11T14:22:57Z","title":"Electrokinetic Propulsion for Electronically Integrated Microscopic\n Robots","summary":" Semiconductor microelectronics are emerging as a powerful tool for building\nsmart, autonomous robots too small to see with the naked eye. Yet a number of\nexisting microrobot platforms, despite significant advantages in speed,\nrobustness, power consumption, or ease of fabrication, have no clear path\ntowards electronics integration, limiting their intelligence and sophistication\nwhen compared to electronic cousins. Here, we show how to upgrade a\nself-propelled particle into an an electronically integrated microrobot,\nreaping the best of both in a single design. Inspired by electrokinetic\nmicromotors, these robots generate electric fields in a surrounding fluid, and\nby extension propulsive electrokinetic flows. The underlying physics is\ncaptured by a model in which robot speed is proportional to applied current,\nmaking design and control straightforward. As proof, we build basic robots that\nuse on-board circuits and a closed-loop optical control scheme to navigate\nwaypoints and move in coordinated swarms at speeds of up to one body length per\nsecond. 
Broadly, the unification of micromotor propulsion with on-robot\nelectronics clears the way for robust, fast, easy to manufacture,\nelectronically programmable microrobots that operate reliably over months to\nyears.\n","authors":["Lucas C. Hanson","William H. Reinhardt","Scott Shrager","Tarunyaa Sivakumar","Marc Z. Miskin"],"pdf_url":"https://arxiv.org/pdf/2409.07293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14980v2","updated":"2024-11-13T23:33:06Z","published":"2022-10-26T18:49:43Z","title":"Interstellar Object Accessibility and Mission Design","summary":" Interstellar objects (ISOs) represent a compelling and under-explored\ncategory of celestial bodies, providing physical laboratories to understand the\nformation of our solar system and probe the composition and properties of\nmaterial formed in exoplanetary systems. In this work, we investigate existing\napproaches to designing successful flyby missions to ISOs, including a deep\nlearning-driven guidance and control algorithm for ISOs traveling at velocities\nover 60 km/s. We have generated spacecraft trajectories to a series of\nsynthetic representative ISOs, simulating a ground campaign to observe the\ntarget and resolve its state, thereby determining the cruise and close approach\ndelta-Vs required for the encounter. We discuss the accessibility of and\nmission design to ISOs with varying characteristics, with special focuses on 1)\nstate covariance estimation throughout the cruise, 2) handoffs from traditional\nnavigation approaches to novel autonomous navigation for fast flyby regimes,\nand 3) overall recommendations about preparing for the future in situ\nexploration of these targets. The lessons learned also apply to the fast flyby\nof other small bodies, e.g., long-period comets and potentially hazardous\nasteroids, which also require tactical responses with similar characteristics.\n","authors":["Benjamin P. S. 
Donitz","Declan Mages","Hiroyasu Tsukamoto","Peter Dixon","Damon Landau","Soon-Jo Chung","Erica Bufanda","Michel Ingham","Julie Castillo-Rogez"],"pdf_url":"https://arxiv.org/pdf/2210.14980v2.pdf","comment":"IEEE Aerospace Conference, Preprint Version, Accepted: November 2022"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.08034v2","updated":"2024-11-13T18:59:44Z","published":"2024-11-12T18:59:35Z","title":"Scaling Properties of Diffusion Models for Perceptual Tasks","summary":" In this paper, we argue that iterative computation with diffusion models\noffers a powerful paradigm for not only generation but also visual perception\ntasks. We unify tasks such as depth estimation, optical flow, and amodal\nsegmentation under the framework of image-to-image translation, and show how\ndiffusion models benefit from scaling training and test-time compute for these\nperceptual tasks. Through a careful analysis of these scaling properties, we\nformulate compute-optimal training and inference recipes to scale diffusion\nmodels for visual perception tasks. Our models achieve competitive performance\nto state-of-the-art methods using significantly less data and compute. To\naccess our code and models, see https://scaling-diffusion-perception.github.io .\n","authors":["Rahul Ravishankar","Zeeshan Patel","Jathushan Rajasegaran","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08879v1","updated":"2024-11-13T18:56:39Z","published":"2024-11-13T18:56:39Z","title":"4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization","summary":" Novel view synthesis of dynamic scenes is becoming important in various\napplications, including augmented and virtual reality. We propose a novel 4D\nGaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded\nmonocular videos. 
To overcome the overfitting problem of existing work for\nthese real-world videos, we introduce an uncertainty-aware regularization that\nidentifies uncertain regions with few observations and selectively imposes\nadditional priors based on diffusion models and depth smoothness on such\nregions. This approach improves both the performance of novel view synthesis\nand the quality of training image reconstruction. We also identify the\ninitialization problem of 4DGS in fast-moving dynamic regions, where the\nStructure from Motion (SfM) algorithm fails to provide reliable 3D landmarks.\nTo initialize Gaussian primitives in such regions, we present a dynamic region\ndensification method using the estimated depth maps and scene flow. Our\nexperiments show that the proposed method improves the performance of 4DGS\nreconstruction from a video captured by a handheld monocular camera and also\nexhibits promising results in few-shot static scene reconstruction.\n","authors":["Mijeong Kim","Jongwoo Lim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2411.08879v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08878v1","updated":"2024-11-13T18:55:10Z","published":"2024-11-13T18:55:10Z","title":"A Short Note on Evaluating RepNet for Temporal Repetition Counting in\n Videos","summary":" We discuss some consistent issues on how RepNet has been evaluated in various\npapers. As a way to mitigate these issues, we report RepNet performance results\non different datasets, and release evaluation code and the RepNet checkpoint to\nobtain these results. 
Code URL:\nhttps://github.com/google-research/google-research/blob/master/repnet/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Pierre Sermanet","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.08878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10172v3","updated":"2024-11-13T18:42:18Z","published":"2024-04-15T23:01:59Z","title":"Forensic Iris Image-Based Post-Mortem Interval Estimation","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup. One factor that may be useful in\nconditioning iris recognition methods is the tissue decomposition level, which\nis correlated with the post-mortem interval (PMI), \\ie the number of hours that\nhave elapsed since death. PMI, however, is not always available, and its\nprecise estimation remains one of the core challenges in forensic examination.\nThis paper presents the first known to us method of the PMI estimation directly\nfrom iris images captured after death. To assess the feasibility of the\niris-based PMI estimation, we designed models predicting the PMI from (a)\nnear-infrared (NIR), (b) visible (RGB), and (c) multispectral (RGB+NIR)\nforensic iris images. Models were evaluated following a 10-fold\ncross-validation, in (S1) sample-disjoint, (S2) subject-disjoint, and (S3)\ncross-dataset scenarios. We explore two data balancing techniques for S3:\nresampling-based balancing (S3-real), and synthetic data-supplemented balancing\n(S3-synthetic). We found that using the multispectral data offers a\nspectacularly low mean absolute error (MAE) of $\\approx 3.5$ hours in the\nscenario (S1), a bit worse MAE $\\approx 17.5$ hours in the scenario (S2), and\nMAE $\\approx 45.77$ hours in the scenario (S3). 
Additionally, supplementing the\ntraining set with synthetically-generated forensic iris images (S3-synthetic)\nsignificantly enhances the models' ability to generalize to new NIR, RGB and\nmultispectral data collected in a different lab. This suggests that if the\nenvironmental conditions are favorable (\\eg, bodies are kept in low\ntemperatures), forensic iris images provide features that are indicative of the\nPMI and can be automatically estimated.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.10172v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v4","updated":"2024-11-13T18:31:18Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. 
Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v4.pdf","comment":"Accepted by 2024 5th International Conference on Computer Vision,\n Image and Deep Learning"},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. (3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. (4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. 
A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.08840v1","updated":"2024-11-13T18:19:51Z","published":"2024-11-13T18:19:51Z","title":"Multimodal Instruction Tuning with Hybrid State Space Models","summary":" Handling lengthy context is crucial for enhancing the recognition and\nunderstanding capabilities of multimodal large language models (MLLMs) in\napplications such as processing high-resolution images or high frame rate\nvideos. The rise in image resolution and frame rate substantially increases\ncomputational demands due to the increased number of input tokens. This\nchallenge is further exacerbated by the quadratic complexity with respect to\nsequence length of the self-attention mechanism. Most prior works either\npre-train models with long contexts, overlooking the efficiency problem, or\nattempt to reduce the context length via downsampling (e.g., identify the key\nimage patches or frames) to decrease the context length, which may result in\ninformation loss. 
To circumvent this issue while keeping the remarkable\neffectiveness of MLLMs, we propose a novel approach using a hybrid\ntransformer-MAMBA model to efficiently handle long contexts in multimodal\napplications. Our multimodal model can effectively process long context input\nexceeding 100k tokens, outperforming existing models across various benchmarks.\nRemarkably, our model enhances inference efficiency for high-resolution images\nand high-frame-rate videos by about 4 times compared to current models, with\nefficiency gains increasing as image resolution or video frames rise.\nFurthermore, our model is the first to be trained on low-resolution images or\nlow-frame-rate videos while being capable of inference on high-resolution\nimages and high-frame-rate videos, offering flexibility for inference in\ndiverse scenarios.\n","authors":["Jianing Zhou","Han Li","Shuai Zhang","Ning Xie","Ruijie Wang","Xiaohan Nie","Sheng Liu","Lingyun Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10259v4","updated":"2024-11-13T17:35:00Z","published":"2024-02-15T18:42:33Z","title":"GaussianObject: High-Quality 3D Object Reconstruction from Four Views\n with Gaussian Splatting","summary":" Reconstructing and rendering 3D objects from highly sparse views is of\ncritical importance for promoting applications of 3D vision techniques and\nimproving user experience. However, images from sparse views only contain very\nlimited 3D information, leading to two significant challenges: 1) Difficulty in\nbuilding multi-view consistency as images for matching are too few; 2)\nPartially omitted or highly compressed object information as view coverage is\ninsufficient. To tackle these challenges, we propose GaussianObject, a\nframework to represent and render the 3D object with Gaussian splatting that\nachieves high rendering quality with only 4 input images. 
We first introduce\ntechniques of visual hull and floater elimination, which explicitly inject\nstructure priors into the initial optimization process to help build multi-view\nconsistency, yielding a coarse 3D Gaussian representation. Then we construct a\nGaussian repair model based on diffusion models to supplement the omitted\nobject information, where Gaussians are further refined. We design a\nself-generating strategy to obtain image pairs for training the repair model.\nWe further design a COLMAP-free variant, where pre-given accurate camera poses\nare not required, which achieves competitive quality and facilitates wider\napplications. GaussianObject is evaluated on several challenging datasets,\nincluding MipNeRF360, OmniObject3D, OpenIllumination, and our-collected unposed\nimages, achieving superior performance from only four views and significantly\noutperforming previous SOTA methods. Our demo is available at\nhttps://gaussianobject.github.io/, and the code has been released at\nhttps://github.com/GaussianObject/GaussianObject.\n","authors":["Chen Yang","Sikuang Li","Jiemin Fang","Ruofan Liang","Lingxi Xie","Xiaopeng Zhang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2402.10259v4.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH Asia 2024). Project page:\n https://gaussianobject.github.io/ Code:\n https://github.com/chensjtu/GaussianObject"},{"id":"http://arxiv.org/abs/2403.18346v4","updated":"2024-11-13T17:17:43Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from over-reliance on unimodal biases (e.g., language bias\nand vision bias), leading to incorrect answers or hallucinations in complex\nmultimodal tasks. 
To investigate this issue, we propose a causal framework to\ninterpret the biases in Visual Question Answering (VQA) problems. Within this\nframework, we conduct an in-depth causal analysis to assess the causal effect\nof these biases on MLLM predictions. Based on the analysis, we introduce 1) a\nnovel MORE dataset with 12,000 challenging VQA instances requiring multi-hop\nreasoning and overcoming unimodal biases. 2) a causality-enhanced agent\nframework CAVE that guides models to comprehensively integrate information from\ndifferent modalities and mitigate biases. Our experiments show that MLLMs\nperform poorly on MORE, indicating strong unimodal biases and limited semantic\nunderstanding. However, when integrated with our CAVE, promising improvements\nin reasoning and bias mitigation can be seen. These findings provide important\ninsights for the development of more robust MLLMs and contribute to the broader\ngoal of advancing multimodal AI systems capable of deeper understanding and\nreasoning. Our project page is at https://github.com/OpenCausaLab/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09733v3","updated":"2024-11-13T17:07:45Z","published":"2024-07-13T00:45:37Z","title":"Textured-GS: Gaussian Splatting with Spatially Defined Color and Opacity","summary":" In this paper, we introduce Textured-GS, an innovative method for rendering\nGaussian splatting that incorporates spatially defined color and opacity\nvariations using Spherical Harmonics (SH). This approach enables each Gaussian\nto exhibit a richer representation by accommodating varying colors and\nopacities across its surface, significantly enhancing rendering quality\ncompared to traditional methods. To demonstrate the merits of our approach, we\nhave adapted the Mini-Splatting architecture to integrate textured Gaussians\nwithout increasing the number of Gaussians. 
Our experiments across multiple\nreal-world datasets show that Textured-GS consistently outperforms both the\nbaseline Mini-Splatting and standard 3DGS in terms of visual fidelity. The\nresults highlight the potential of Textured-GS to advance Gaussian-based\nrendering technologies, promising more efficient and high-quality scene\nreconstructions. Our implementation is available at\nhttps://github.com/ZhentaoHuang/Textured-GS.\n","authors":["Zhentao Huang","Minglun Gong"],"pdf_url":"https://arxiv.org/pdf/2407.09733v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2411.08777v1","updated":"2024-11-13T17:02:46Z","published":"2024-11-13T17:02:46Z","title":"LUDO: Low-Latency Understanding of Highly Deformable Objects using Point\n Cloud Occupancy Functions","summary":" Accurately determining the shape and location of internal structures within\ndeformable objects is crucial for medical tasks that require precise targeting,\nsuch as robotic biopsies. We introduce LUDO, a method for accurate low-latency\nunderstanding of deformable objects. LUDO reconstructs objects in their\ndeformed state, including their internal structures, from a single-view point\ncloud observation in under 30 ms using occupancy networks. We demonstrate\nLUDO's abilities for autonomous targeting of internal regions of interest\n(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty\nestimates and explainability for its predictions, both of which are important\nin safety-critical applications such as surgical interventions. We evaluate\nLUDO in real-world robotic experiments, achieving a success rate of 98.9% for\npuncturing various ROIs inside highly deformable objects. 
LUDO demonstrates the\npotential to interact with deformable objects without the need for deformable\nregistration methods.\n","authors":["Pit Henrich","Franziska Mathis-Ullrich","Paul Maria Scheikl"],"pdf_url":"https://arxiv.org/pdf/2411.08777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08768v1","updated":"2024-11-13T16:53:29Z","published":"2024-11-13T16:53:29Z","title":"Sharingan: Extract User Action Sequence from Desktop Recordings","summary":" Video recordings of user activities, particularly desktop recordings, offer a\nrich source of data for understanding user behaviors and automating processes.\nHowever, despite advancements in Vision-Language Models (VLMs) and their\nincreasing use in video analysis, extracting user actions from desktop\nrecordings remains an underexplored area. This paper addresses this gap by\nproposing two novel VLM-based methods for user action extraction: the Direct\nFrame-Based Approach (DF), which inputs sampled frames directly into VLMs, and\nthe Differential Frame-Based Approach (DiffF), which incorporates explicit\nframe differences detected via computer vision techniques. We evaluate these\nmethods using a basic self-curated dataset and an advanced benchmark adapted\nfrom prior work. Our results show that the DF approach achieves an accuracy of\n70% to 80% in identifying user actions, with the extracted action sequences\nbeing re-playable though Robotic Process Automation. We find that while VLMs\nshow potential, incorporating explicit UI changes can degrade performance,\nmaking the DF approach more reliable. 
This work represents the first\napplication of VLMs for extracting user action sequences from desktop\nrecordings, contributing new methods, benchmarks, and insights for future\nresearch.\n","authors":["Yanting Chen","Yi Ren","Xiaoting Qin","Jue Zhang","Kehong Yuan","Lu Han","Qingwei Lin","Dongmei Zhang","Saravan Rajmohan","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12461v2","updated":"2024-11-13T16:49:14Z","published":"2023-11-21T09:15:24Z","title":"HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity\n Synthesis of MR Images with Structure Preservation","summary":" Synthesizing medical images while preserving their structural information is\ncrucial in medical research. In such scenarios, the preservation of anatomical\ncontent becomes especially important. Although recent advances have been made\nby incorporating instance-level information to guide translation, these methods\noverlook the spatial coherence of structural-level representation and the\nanatomical invariance of content during translation. To address these issues,\nwe introduce hierarchical granularity discrimination, which exploits various\nlevels of semantic information present in medical images. Our strategy utilizes\nthree levels of discrimination granularity: pixel-level discrimination using a\nBrain Memory Bank, structure-level discrimination on each brain structure with\na re-weighting strategy to focus on hard samples, and global-level\ndiscrimination to ensure anatomical consistency during translation. The image\ntranslation performance of our strategy has been evaluated on three independent\ndatasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed\nstate-of-the-art algorithms. 
Particularly, our model excels not only in\nsynthesizing normal structures but also in handling abnormal (pathological)\nstructures, such as brain tumors, despite the variations in contrast observed\nacross different imaging modalities due to their pathological characteristics.\nThe diagnostic value of synthesized MR images containing brain tumors has been\nevaluated by radiologists. This indicates that our model may offer an\nalternative solution in scenarios where specific MR modalities of patients are\nunavailable. Extensive experiments further demonstrate the versatility of our\nmethod, providing unique insights into medical image translation.\n","authors":["Ziqi Yu","Botao Zhao","Shengjie Zhang","Xiang Chen","Jianfeng Feng","Tingying Peng","Xiao-Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08756v1","updated":"2024-11-13T16:42:07Z","published":"2024-11-13T16:42:07Z","title":"Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation","summary":" In view of the fact that semi- and self-supervised learning share a\nfundamental principle, effectively modeling knowledge from unlabeled data,\nvarious semi-supervised semantic segmentation methods have integrated\nrepresentative self-supervised learning paradigms for further regularization.\nHowever, the potential of the state-of-the-art generative self-supervised\nparadigm, masked image modeling, has been scarcely studied. This paradigm\nlearns the knowledge through establishing connections between the masked and\nvisible parts of masked image, during the pixel reconstruction process. By\ninheriting and extending this insight, we successfully leverage masked image\nmodeling to boost semi-supervised semantic segmentation. Specifically, we\nintroduce a novel class-wise masked image modeling that independently\nreconstructs different image regions according to their respective classes. 
In\nthis way, the mask-induced connections are established within each class,\nmitigating the semantic confusion that arises from plainly reconstructing\nimages in basic masked image modeling. To strengthen these intra-class\nconnections, we further develop a feature aggregation strategy that minimizes\nthe distances between features corresponding to the masked and visible parts\nwithin the same class. Additionally, in semantic space, we explore the\napplication of masked image modeling to enhance regularization. Extensive\nexperiments conducted on well-known benchmarks demonstrate that our approach\nachieves state-of-the-art performance. The code will be available at\nhttps://github.com/haoxt/S4MIM.\n","authors":["Yangyang Li","Xuanting Hao","Ronghua Shang","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2411.08756v1.pdf","comment":"13 pages. This work has been submitted to the IEEE for possible\n publication"},{"id":"http://arxiv.org/abs/2411.08755v1","updated":"2024-11-13T16:33:27Z","published":"2024-11-13T16:33:27Z","title":"Weakly-Supervised Anomaly Detection in Surveillance Videos Based on\n Two-Stream I3D Convolution Network","summary":" The widespread implementation of urban surveillance systems has necessitated\nmore sophisticated techniques for anomaly detection to ensure enhanced public\nsafety. This paper presents a significant advancement in the field of anomaly\ndetection through the application of Two-Stream Inflated 3D (I3D) Convolutional\nNetworks. These networks substantially outperform traditional 3D Convolutional\nNetworks (C3D) by more effectively extracting spatial and temporal features\nfrom surveillance videos, thus improving the precision of anomaly detection.\nOur research advances the field by implementing a weakly supervised learning\nframework based on Multiple Instance Learning (MIL), which uniquely\nconceptualizes surveillance videos as collections of 'bags' that contain\ninstances (video clips). 
Each instance is innovatively processed through a\nranking mechanism that prioritizes clips based on their potential to display\nanomalies. This novel strategy not only enhances the accuracy and precision of\nanomaly detection but also significantly diminishes the dependency on extensive\nmanual annotations. Moreover, through meticulous optimization of model\nsettings, including the choice of optimizer, our approach not only establishes\nnew benchmarks in the performance of anomaly detection systems but also offers\na scalable and efficient solution for real-world surveillance applications.\nThis paper contributes significantly to the field of computer vision by\ndelivering a more adaptable, efficient, and context-aware anomaly detection\nsystem, which is poised to redefine practices in urban surveillance.\n","authors":["Sareh Soltani Nejad","Anwar Haque"],"pdf_url":"https://arxiv.org/pdf/2411.08755v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.08753v1","updated":"2024-11-13T16:31:08Z","published":"2024-11-13T16:31:08Z","title":"Which Viewpoint Shows it Best? Language for Weakly Supervising View\n Selection in Multi-view Videos","summary":" Given a multi-view video, which viewpoint is most informative for a human\nobserver? Existing methods rely on heuristics or expensive ``best-view\"\nsupervision to answer this question, limiting their applicability. We propose a\nweakly supervised approach that leverages language accompanying an\ninstructional multi-view video as a means to recover its most informative\nviewpoint(s). Our key hypothesis is that the more accurately an individual view\ncan predict a view-agnostic text summary, the more informative it is. 
To put\nthis into action, we propose a framework that uses the relative accuracy of\nview-dependent caption predictions as a proxy for best view pseudo-labels.\nThen, those pseudo-labels are used to train a view selector, together with an\nauxiliary camera pose predictor that enhances view-sensitivity. During\ninference, our model takes as input only a multi-view video -- no language or\ncamera poses -- and returns the best viewpoint to watch at each timestep. On\ntwo challenging datasets comprised of diverse multi-camera setups and how-to\nactivities, our model consistently outperforms state-of-the-art baselines, both\nwith quantitative metrics and human evaluation.\n","authors":["Sagnik Majumder","Tushar Nagarajan","Ziad Al-Halah","Reina Pradhan","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2411.08753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08715v1","updated":"2024-11-13T15:58:50Z","published":"2024-11-13T15:58:50Z","title":"Retrieval Augmented Recipe Generation","summary":" Given the potential applications of generating recipes from food images, this\narea has garnered significant attention from researchers in recent years.\nExisting works for recipe generation primarily utilize a two-stage training\nmethod, first generating ingredients and then obtaining instructions from both\nthe image and ingredients. Large Multi-modal Models (LMMs), which have achieved\nnotable success across a variety of vision and language tasks, shed light to\ngenerating both ingredients and instructions directly from images.\nNevertheless, LMMs still face the common issue of hallucinations during recipe\ngeneration, leading to suboptimal performance. To tackle this, we propose a\nretrieval augmented large multimodal model for recipe generation. 
We first\nintroduce Stochastic Diversified Retrieval Augmentation (SDRA) to retrieve\nrecipes semantically related to the image from an existing datastore as a\nsupplement, integrating them into the prompt to add diverse and rich context to\nthe input image. Additionally, Self-Consistency Ensemble Voting mechanism is\nproposed to determine the most confident prediction recipes as the final\noutput. It calculates the consistency among generated recipe candidates, which\nuse different retrieval recipes as context for generation. Extensive\nexperiments validate the effectiveness of our proposed method, which\ndemonstrates state-of-the-art (SOTA) performance in recipe generation tasks on\nthe Recipe1M dataset.\n","authors":["Guoshan Liu","Hailong Yin","Bin Zhu","Jingjing Chen","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.08715v1.pdf","comment":"ACCEPT on IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2411.08712v1","updated":"2024-11-13T15:55:05Z","published":"2024-11-13T15:55:05Z","title":"High-resolution optical and acoustic remote sensing datasets of the Puck\n Lagoon, Southern Baltic","summary":" The very shallow marine basin of Puck Lagoon in the southern Baltic Sea, on\nthe Northern coast of Poland, hosts valuable benthic habitats and cultural\nheritage sites. These include, among others, protected Zostera marina meadows,\none of the Baltic's major medieval harbours, a ship graveyard, and likely other\nsubmerged features that are yet to be discovered. Prior to this project, no\ncomprehensive high-resolution remote sensing data were available for this area.\nThis article describes the first Digital Elevation Models (DEMs) derived from a\ncombination of airborne bathymetric LiDAR, multibeam echosounder, airborne\nphotogrammetry and satellite imagery. 
These datasets also include multibeam\nechosounder backscatter and LiDAR intensity, allowing determination of the\ncharacter and properties of the seafloor. Combined, these datasets are a vital\nresource for assessing and understanding seafloor morphology, benthic habitats,\ncultural heritage, and submerged landscapes. Given the significance of Puck\nLagoon's hydrographical, ecological, geological, and archaeological environs,\nthe high-resolution bathymetry, acquired by our project, can provide the\nfoundation for sustainable management and informed decision-making for this\narea of interest.\n","authors":["Łukasz Janowski","Dimitrios Skarlatos","Panagiotis Agrafiotis","Paweł Tysiąc","Andrzej Pydyn","Mateusz Popek","Anna M. Kotarba-Morley","Gottfried Mandlburger","Łukasz Gajewski","Mateusz Kołakowski","Alexandra Papadaki","Juliusz Gajewski"],"pdf_url":"https://arxiv.org/pdf/2411.08712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07364v3","updated":"2024-11-13T15:48:08Z","published":"2024-05-12T19:36:11Z","title":"BoQ: A Place is Worth a Bag of Learnable Queries","summary":" In visual place recognition, accurately identifying and matching images of\nlocations under varying environmental conditions and viewpoints remains a\nsignificant challenge. In this paper, we introduce a new technique, called\nBag-of-Queries (BoQ), which learns a set of global queries designed to capture\nuniversal place-specific attributes. Unlike existing methods that employ\nself-attention and generate the queries directly from the input features, BoQ\nemploys distinct learnable global queries, which probe the input features via\ncross-attention, ensuring consistent information aggregation. In addition, our\ntechnique provides an interpretable attention mechanism and integrates with\nboth CNN and Vision Transformer backbones. The performance of BoQ is\ndemonstrated through extensive experiments on 14 large-scale benchmarks. 
It\nconsistently outperforms current state-of-the-art techniques including NetVLAD,\nMixVPR and EigenPlaces. Moreover, as a global retrieval technique (one-stage),\nBoQ surpasses two-stage retrieval methods, such as Patch-NetVLAD, TransVPR and\nR2Former, all while being orders of magnitude faster and more efficient. The\ncode and model weights are publicly available at\nhttps://github.com/amaralibey/Bag-of-Queries.\n","authors":["Amar Ali-Bey","Brahim Chaib-draa","Philippe Giguère"],"pdf_url":"https://arxiv.org/pdf/2405.07364v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2411.08701v1","updated":"2024-11-13T15:42:28Z","published":"2024-11-13T15:42:28Z","title":"TRACE: Transformer-based Risk Assessment for Clinical Evaluation","summary":" We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation),\na novel method for clinical risk assessment based on clinical data, leveraging\nthe self-attention mechanism for enhanced feature interaction and result\ninterpretation. Our approach is able to handle different data modalities,\nincluding continuous, categorical and multiple-choice (checkbox) attributes.\nThe proposed architecture features a shared representation of the clinical data\nobtained by integrating specialized embeddings of each data modality, enabling\nthe detection of high-risk individuals using Transformer encoder layers. To\nassess the effectiveness of the proposed method, a strong baseline based on\nnon-negative multi-layer perceptrons (MLPs) is introduced. The proposed method\noutperforms various baselines widely used in the domain of clinical risk\nassessment, while effectively handling missing values. 
In terms of\nexplainability, our Transformer-based method offers easily interpretable\nresults via attention weights, further enhancing the clinicians'\ndecision-making process.\n","authors":["Dionysis Christopoulos","Sotiris Spanos","Valsamis Ntouskos","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2411.08701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15339v3","updated":"2024-11-13T15:25:32Z","published":"2024-07-22T02:53:18Z","title":"Deep Learning for Economists","summary":" Deep learning provides powerful methods to impute structured information from\nlarge-scale, unstructured text and image datasets. For example, economists\nmight wish to detect the presence of economic activity in satellite images, or\nto measure the topics or entities mentioned in social media, the congressional\nrecord, or firm filings. This review introduces deep neural networks, covering\nmethods such as classifiers, regression models, generative AI, and embedding\nmodels. Applications include classification, document digitization, record\nlinkage, and methods for data exploration in massive scale text and image\ncorpora. When suitable methods are used, deep learning models can be cheap to\ntune and can scale affordably to problems involving millions or billions of\ndata points.. The review is accompanied by a companion website, EconDL, with\nuser-friendly demo notebooks, software resources, and a knowledge base that\nprovides technical details and additional applications.\n","authors":["Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2407.15339v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08666v1","updated":"2024-11-13T14:59:41Z","published":"2024-11-13T14:59:41Z","title":"A Survey on Vision Autoregressive Model","summary":" Autoregressive models have demonstrated great performance in natural language\nprocessing (NLP) with impressive scalability, adaptability and\ngeneralizability. 
Inspired by their notable success in NLP field,\nautoregressive models have been intensively investigated recently for computer\nvision, which perform next-token predictions by representing visual data as\nvisual tokens and enables autoregressive modelling for a wide range of vision\ntasks, ranging from visual generation and visual understanding to the very\nrecent multimodal generation that unifies visual generation and understanding\nwith a single autoregressive model. This paper provides a systematic review of\nvision autoregressive models, including the development of a taxonomy of\nexisting methods and highlighting their major contributions, strengths, and\nlimitations, covering various vision tasks such as image generation, video\ngeneration, image editing, motion generation, medical image analysis, 3D\ngeneration, robotic manipulation, unified multimodal generation, etc. Besides,\nwe investigate and analyze the latest advancements in autoregressive models,\nincluding thorough benchmarking and discussion of existing methods across\nvarious evaluation datasets. Finally, we outline key challenges and promising\ndirections for future research, offering a roadmap to guide further\nadvancements in vision autoregressive models.\n","authors":["Kai Jiang","Jiaxing Huang"],"pdf_url":"https://arxiv.org/pdf/2411.08666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08665v1","updated":"2024-11-13T14:59:00Z","published":"2024-11-13T14:59:00Z","title":"OSMLoc: Single Image-Based Visual Localization in OpenStreetMap with\n Geometric and Semantic Guidances","summary":" OpenStreetMap (OSM), an online and versatile source of volunteered geographic\ninformation (VGI), is widely used for human self-localization by matching\nnearby visual observations with vectorized map data. 
However, due to the\ndivergence in modalities and views, image-to-OSM (I2O) matching and\nlocalization remain challenging for robots, preventing the full utilization of\nVGI data in the unmanned ground vehicles and logistic industry. Inspired by the\nfact that the human brain relies on geometric and semantic understanding of\nsensory information for spatial localization tasks, we propose the OSMLoc in\nthis paper. OSMLoc is a brain-inspired single-image visual localization method\nwith semantic and geometric guidance to improve accuracy, robustness, and\ngeneralization ability. First, we equip the OSMLoc with the visual foundational\nmodel to extract powerful image features. Second, a geometry-guided depth\ndistribution adapter is proposed to bridge the monocular depth estimation and\ncamera-to-BEV transform. Thirdly, the semantic embeddings from the OSM data are\nutilized as auxiliary guidance for image-to-OSM feature matching. To validate\nthe proposed OSMLoc, we collect a worldwide cross-area and cross-condition (CC)\nbenchmark for extensive evaluation. Experiments on the MGL dataset, CC\nvalidation benchmark, and KITTI dataset have demonstrated the superiority of\nour method. Code, pre-trained models, CC validation benchmark, and additional\nresults are available on: https://github.com/WHU-USI3DV/OSMLoc\n","authors":["Youqi Liao","Xieyuanli Chen","Shuhao Kang","Jianping Li","Zhen Dong","Hongchao Fan","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08665v1.pdf","comment":"15 pages, technical report"},{"id":"http://arxiv.org/abs/2411.08663v1","updated":"2024-11-13T14:54:47Z","published":"2024-11-13T14:54:47Z","title":"Toward Human Understanding with Controllable Synthesis","summary":" Training methods to perform robust 3D human pose and shape (HPS) estimation\nrequires diverse training images with accurate ground truth. 
While BEDLAM\ndemonstrates the potential of traditional procedural graphics to generate such\ndata, the training images are clearly synthetic. In contrast, generative image\nmodels produce highly realistic images but without ground truth. Putting these\nmethods together seems straightforward: use a generative model with the body\nground truth as controlling signal. However, we find that, the more realistic\nthe generated images, the more they deviate from the ground truth, making them\ninappropriate for training and evaluation. Enhancements of realistic details,\nsuch as clothing and facial expressions, can lead to subtle yet significant\ndeviations from the ground truth, potentially misleading training models. We\nempirically verify that this misalignment causes the accuracy of HPS networks\nto decline when trained with generated images. To address this, we design a\ncontrollable synthesis method that effectively balances image realism with\nprecise ground truth. We use this to create the Generative BEDLAM (Gen-B)\ndataset, which improves the realism of the existing synthetic BEDLAM dataset\nwhile preserving ground truth accuracy. We perform extensive experiments, with\nvarious noise-conditioning strategies, to evaluate the tradeoff between visual\nrealism and HPS accuracy. We show, for the first time, that generative image\nmodels can be controlled by traditional graphics methods to produce training\ndata that increases the accuracy of HPS methods.\n","authors":["Hanz Cuevas-Velasquez","Priyanka Patel","Haiwen Feng","Michael Black"],"pdf_url":"https://arxiv.org/pdf/2411.08663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08656v1","updated":"2024-11-13T14:46:41Z","published":"2024-11-13T14:46:41Z","title":"MikuDance: Animating Character Art with Mixed Motion Dynamics","summary":" We propose MikuDance, a diffusion-based pipeline incorporating mixed motion\ndynamics to animate stylized character art. 
MikuDance consists of two key\ntechniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the\nchallenges of high-dynamic motion and reference-guidance misalignment in\ncharacter art animation. Specifically, a Scene Motion Tracking strategy is\npresented to explicitly model the dynamic camera in pixel-wise space, enabling\nunified character-scene motion modeling. Building on this, the Mixed-Control\nDiffusion implicitly aligns the scale and body shape of diverse characters with\nmotion guidance, allowing flexible control of local character motion.\nSubsequently, a Motion-Adaptive Normalization module is incorporated to\neffectively inject global scene motion, paving the way for comprehensive\ncharacter art animation. Through extensive experiments, we demonstrate the\neffectiveness and generalizability of MikuDance across various character art\nand motion guidance, consistently producing high-quality animations with\nremarkable motion dynamics.\n","authors":["Jiaxu Zhang","Xianfang Zeng","Xin Chen","Wei Zuo","Gang Yu","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2411.08656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10534v2","updated":"2024-11-13T14:36:47Z","published":"2024-04-12T21:41:50Z","title":"Into the Fog: Evaluating Robustness of Multiple Object Tracking","summary":" State-of-the-art Multiple Object Tracking (MOT) approaches have shown\nremarkable performance when trained and evaluated on current benchmarks.\nHowever, these benchmarks primarily consist of clear weather scenarios,\noverlooking adverse atmospheric conditions such as fog, haze, smoke and dust.\nAs a result, the robustness of trackers against these challenging conditions\nremains underexplored. To address this gap, we introduce physics-based\nvolumetric fog simulation method for arbitrary MOT datasets, utilizing\nframe-by-frame monocular depth estimation and a fog formation optical model. 
We\nenhance our simulation by rendering both homogeneous and heterogeneous fog and\npropose to use the dark channel prior method to estimate atmospheric light,\nshowing promising results even in night and indoor scenes. We present the\nleading benchmark MOTChallenge (third release) augmented with fog (smoke for\nindoor scenes) of various intensities and conduct a comprehensive evaluation of\nMOT methods, revealing their limitations under fog and fog-like challenges.\n","authors":["Nadezda Kirillova","M. Jehanzeb Mirza","Horst Bischof","Horst Possegger"],"pdf_url":"https://arxiv.org/pdf/2404.10534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08642v1","updated":"2024-11-13T14:32:28Z","published":"2024-11-13T14:32:28Z","title":"Towards More Accurate Fake Detection on Images Generated from Advanced\n Generative and Neural Rendering Models","summary":" The remarkable progress in neural-network-driven visual data generation,\nespecially with neural rendering techniques like Neural Radiance Fields and 3D\nGaussian splatting, offers a powerful alternative to GANs and diffusion models.\nThese methods can produce high-fidelity images and lifelike avatars,\nhighlighting the need for robust detection methods. In response, an\nunsupervised training technique is proposed that enables the model to extract\ncomprehensive features from the Fourier spectrum magnitude, thereby overcoming\nthe challenges of reconstructing the spectrum due to its centrosymmetric\nproperties. By leveraging the spectral domain and dynamically combining it with\nspatial domain information, we create a robust multimodal detector that\ndemonstrates superior generalization capabilities in identifying challenging\nsynthetic images generated by the latest image synthesis techniques. 
To address\nthe absence of a 3D neural rendering-based fake image database, we develop a\ncomprehensive database that includes images generated by diverse neural\nrendering techniques, providing a robust foundation for evaluating and\nadvancing detection methods.\n","authors":["Chengdong Dong","Vijayakumar Bhagavatula","Zhenyu Zhou","Ajay Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08642v1.pdf","comment":"13 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2402.15322v3","updated":"2024-11-13T14:17:35Z","published":"2024-02-23T13:40:34Z","title":"Optimal Transport on the Lie Group of Roto-translations","summary":" The roto-translation group SE2 has been of active interest in image analysis\ndue to methods that lift the image data to multi-orientation representations\ndefined on this Lie group. This has led to impactful applications of\ncrossing-preserving flows for image de-noising, geodesic tracking, and\nroto-translation equivariant deep learning. In this paper, we develop a\ncomputational framework for optimal transportation over Lie groups, with a\nspecial focus on SE2. We make several theoretical contributions (generalizable\nto matrix Lie groups) such as the non-optimality of group actions as transport\nmaps, invariance and equivariance of optimal transport, and the quality of the\nentropic-regularized optimal transport plan using geodesic distance\napproximations. We develop a Sinkhorn like algorithm that can be efficiently\nimplemented using fast and accurate distance approximations of the Lie group\nand GPU-friendly group convolutions. We report valuable advancements in the\nexperiments on 1) image barycentric interpolation, 2) interpolation of planar\norientation fields, and 3) Wasserstein gradient flows on SE2. We observe that\nour framework of lifting images to SE2 and optimal transport with\nleft-invariant anisotropic metrics leads to equivariant transport along\ndominant contours and salient line structures in the image. 
This yields sharper\nand more meaningful interpolations compared to their counterparts on R^2\n","authors":["Daan Bon","Gautam Pai","Gijs Bellaard","Olga Mula","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2402.15322v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08629v1","updated":"2024-11-13T14:16:22Z","published":"2024-11-13T14:16:22Z","title":"Zero-shot capability of SAM-family models for bone segmentation in CT\n scans","summary":" The Segment Anything Model (SAM) and similar models build a family of\npromptable foundation models (FMs) for image and video segmentation. The object\nof interest is identified using prompts, such as bounding boxes or points. With\nthese FMs becoming part of medical image segmentation, extensive evaluation\nstudies are required to assess their strengths and weaknesses in clinical\nsetting. Since the performance is highly dependent on the chosen prompting\nstrategy, it is important to investigate different prompting techniques to\ndefine optimal guidelines that ensure effective use in medical image\nsegmentation. Currently, no dedicated evaluation studies exist specifically for\nbone segmentation in CT scans, leaving a gap in understanding the performance\nfor this task. Thus, we use non-iterative, ``optimal'' prompting strategies\ncomposed of bounding box, points and combinations to test the zero-shot\ncapability of SAM-family models for bone CT segmentation on three different\nskeletal regions. Our results show that the best settings depend on the model\ntype and size, dataset characteristics and objective to optimize. Overall, SAM\nand SAM2 prompted with a bounding box in combination with the center point for\nall the components of an object yield the best results across all tested\nsettings. As the results depend on multiple factors, we provide a guideline for\ninformed decision-making in 2D prompting with non-interactive, ''optimal''\nprompts.\n","authors":["Caroline Magg","Hoel Kervadec","Clara I. 
Sánchez"],"pdf_url":"https://arxiv.org/pdf/2411.08629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08645v3","updated":"2024-11-13T13:58:39Z","published":"2024-08-16T10:21:13Z","title":"Extracting polygonal footprints in off-nadir images with Segment\n Anything Model","summary":" Building Footprint Extraction (BFE) from off-nadir aerial images often\ninvolves roof segmentation and offset prediction to adjust roof boundaries to\nthe building footprint. However, this multi-stage approach typically produces\nlow-quality results, limiting its applicability in real-world data production.\nTo address this issue, we present OBMv2, an end-to-end and promptable model for\npolygonal footprint prediction. Unlike its predecessor OBM, OBMv2 introduces a\nnovel Self Offset Attention (SOFA) mechanism that improves performance across\ndiverse building types, from bungalows to skyscrapers, enabling end-to-end\nfootprint prediction without post-processing. Additionally, we propose a\nMulti-level Information System (MISS) to effectively leverage roof masks,\nbuilding masks, and offsets for accurate footprint prediction. We evaluate\nOBMv2 on the BONAI and OmniCity-view3 datasets and demonstrate its\ngeneralization on the Huizhou test set. The code will be available at\nhttps://github.com/likaiucas/OBMv2.\n","authors":["Kai Li","Yupeng Deng","Jingbo Chen","Yu Meng","Zhihao Xi","Junxian Ma","Chenhao Wang","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.08645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08606v1","updated":"2024-11-13T13:46:15Z","published":"2024-11-13T13:46:15Z","title":"LG-Gaze: Learning Geometry-aware Continuous Prompts for Language-Guided\n Gaze Estimation","summary":" The ability of gaze estimation models to generalize is often significantly\nhindered by various factors unrelated to gaze, especially when the training\ndataset is limited. 
Current strategies aim to address this challenge through\ndifferent domain generalization techniques, yet they have had limited success\ndue to the risk of overfitting when solely relying on value labels for\nregression. Recent progress in pre-trained vision-language models has motivated\nus to capitalize on the abundant semantic information available. We propose a\nnovel approach in this paper, reframing the gaze estimation task as a\nvision-language alignment issue. Our proposed framework, named Language-Guided\nGaze Estimation (LG-Gaze), learns continuous and geometry-sensitive features\nfor gaze estimation benefit from the rich prior knowledges of vision-language\nmodels. Specifically, LG-Gaze aligns gaze features with continuous linguistic\nfeatures through our proposed multimodal contrastive regression loss, which\ncustomizes adaptive weights for different negative samples. Furthermore, to\nbetter adapt to the labels for gaze estimation task, we propose a\ngeometry-aware interpolation method to obtain more precise gaze embeddings.\nThrough extensive experiments, we validate the efficacy of our framework in\nfour different cross-domain evaluation tasks.\n","authors":["Pengwei Yin","Jingjing Wang","Guanzhong Zeng","Di Xie","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.08606v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2411.08603v1","updated":"2024-11-13T13:40:27Z","published":"2024-11-13T13:40:27Z","title":"Generalized Pose Space Embeddings for Training In-the-Wild using\n Anaylis-by-Synthesis","summary":" Modern pose estimation models are trained on large, manually-labelled\ndatasets which are costly and may not cover the full extent of human poses and\nappearances in the real world. With advances in neural rendering,\nanalysis-by-synthesis and the ability to not only predict, but also render the\npose, is becoming an appealing framework, which could alleviate the need for\nlarge scale manual labelling efforts. 
While recent work have shown the\nfeasibility of this approach, the predictions admit many flips due to a\nsimplistic intermediate skeleton representation, resulting in low precision and\ninhibiting the acquisition of any downstream knowledge such as\nthree-dimensional positioning. We solve this problem with a more expressive\nintermediate skeleton representation capable of capturing the semantics of the\npose (left and right), which significantly reduces flips. To successfully train\nthis new representation, we extend the analysis-by-synthesis framework with a\ntraining protocol based on synthetic data. We show that our representation\nresults in less flips and more accurate predictions. Our approach outperforms\nprevious models trained with analysis-by-synthesis on standard benchmarks.\n","authors":["Dominik Borer","Jakob Buhmann","Martin Guay"],"pdf_url":"https://arxiv.org/pdf/2411.08603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08592v1","updated":"2024-11-13T13:19:51Z","published":"2024-11-13T13:19:51Z","title":"Slender Object Scene Segmentation in Remote Sensing Image Based on\n Learnable Morphological Skeleton with Segment Anything Model","summary":" Morphological methods play a crucial role in remote sensing image processing,\ndue to their ability to capture and preserve small structural details. However,\nmost of the existing deep learning models for semantic segmentation are based\non the encoder-decoder architecture including U-net and Segment Anything Model\n(SAM), where the downsampling process tends to discard fine details. In this\npaper, we propose a new approach that integrates learnable morphological\nskeleton prior into deep neural networks using the variational method. 
To\naddress the difficulty in backpropagation in neural networks caused by the\nnon-differentiability presented in classical morphological operations, we\nprovide a smooth representation of the morphological skeleton and design a\nvariational segmentation model integrating morphological skeleton prior by\nemploying operator splitting and dual methods. Then, we integrate this model\ninto the network architecture of SAM, which is achieved by adding a token to\nmask decoder and modifying the final sigmoid layer, ensuring the final\nsegmentation results preserve the skeleton structure as much as possible.\nExperimental results on remote sensing datasets, including buildings and roads,\ndemonstrate that our method outperforms the original SAM on slender object\nsegmentation and exhibits better generalization capability.\n","authors":["Jun Xie","Wenxiao Li","Faqiang Wang","Liqiang Zhang","Zhengyang Hou","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08579v1","updated":"2024-11-13T12:51:49Z","published":"2024-11-13T12:51:49Z","title":"NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied\n Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN), as a widely discussed research\ndirection in embodied intelligence, aims to enable embodied agents to navigate\nin complicated visual environments through natural language commands. Most\nexisting VLN methods focus on indoor ground robot scenarios. However, when\napplied to UAV VLN in outdoor urban scenes, it faces two significant\nchallenges. First, urban scenes contain numerous objects, which makes it\nchallenging to match fine-grained landmarks in images with complex textual\ndescriptions of these landmarks. Second, overall environmental information\nencompasses multiple modal dimensions, and the diversity of representations\nsignificantly increases the complexity of the encoding process. 
To address\nthese challenges, we propose NavAgent, the first urban UAV embodied navigation\nmodel driven by a large Vision-Language Model. NavAgent undertakes navigation\ntasks by synthesizing multi-scale environmental information, including\ntopological maps (global), panoramas (medium), and fine-grained landmarks\n(local). Specifically, we utilize GLIP to build a visual recognizer for\nlandmark capable of identifying and linguisticizing fine-grained landmarks.\nSubsequently, we develop dynamically growing scene topology map that integrate\nenvironmental information and employ Graph Convolutional Networks to encode\nglobal environmental data. In addition, to train the visual recognizer for\nlandmark, we develop NavAgent-Landmark2K, the first fine-grained landmark\ndataset for real urban street scenes. In experiments conducted on the Touchdown\nand Map2seq datasets, NavAgent outperforms strong baseline models. The code and\ndataset will be released to the community to facilitate the exploration and\ndevelopment of outdoor VLN.\n","authors":["Youzhi Liu","Fanglong Yao","Yuanchang Yue","Guangluan Xu","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2411.08579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07743v3","updated":"2024-11-13T12:43:33Z","published":"2023-06-13T13:00:10Z","title":"V-LoL: A Diagnostic Dataset for Visual Logical Learning","summary":" Despite the successes of recent developments in visual AI, different\nshortcomings still exist; from missing exact logical reasoning, to abstract\ngeneralization abilities, to understanding complex and noisy scenes.\nUnfortunately, existing benchmarks, were not designed to capture more than a\nfew of these aspects. Whereas deep learning datasets focus on visually complex\ndata but simple visual reasoning tasks, inductive logic datasets involve\ncomplex logical learning tasks, however, lack the visual component. 
To address\nthis, we propose the diagnostic visual logical learning dataset, V-LoL, that\nseamlessly combines visual and logical challenges. Notably, we introduce the\nfirst instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic\nbenchmark in symbolic AI, the Michalski train problem. By incorporating\nintricate visual scenes and flexible logical reasoning tasks within a versatile\nframework, V-LoL-Train provides a platform for investigating a wide range of\nvisual logical learning challenges. We evaluate a variety of AI systems\nincluding traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our\nevaluations demonstrate that even SOTA AI faces difficulties in dealing with\nvisual logical learning challenges, highlighting unique advantages and\nlimitations of each methodology. Overall, V-LoL opens up new avenues for\nunderstanding and enhancing current abilities in visual logical learning for AI\nsystems.\n","authors":["Lukas Helff","Wolfgang Stammer","Hikaru Shindo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2306.07743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08569v1","updated":"2024-11-13T12:29:44Z","published":"2024-11-13T12:29:44Z","title":"UIFormer: A Unified Transformer-based Framework for Incremental Few-Shot\n Object Detection and Instance Segmentation","summary":" This paper introduces a novel framework for unified incremental few-shot\nobject detection (iFSOD) and instance segmentation (iFSIS) using the\nTransformer architecture. Our goal is to create an optimal solution for\nsituations where only a few examples of novel object classes are available,\nwith no access to training data for base or old classes, while maintaining high\nperformance across both base and novel classes. To achieve this, We extend\nMask-DINO into a two-stage incremental learning framework. 
Stage 1 focuses on\noptimizing the model using the base dataset, while Stage 2 involves fine-tuning\nthe model on novel classes. Besides, we incorporate a classifier selection\nstrategy that assigns appropriate classifiers to the encoder and decoder\naccording to their distinct functions. Empirical evidence indicates that this\napproach effectively mitigates the over-fitting on novel classes learning.\nFurthermore, we implement knowledge distillation to prevent catastrophic\nforgetting of base classes. Comprehensive evaluations on the COCO and LVIS\ndatasets for both iFSIS and iFSOD tasks demonstrate that our method\nsignificantly outperforms state-of-the-art approaches.\n","authors":["Chengyuan Zhang","Yilin Zhang","Lei Zhu","Deyin Liu","Lin Wu","Bo Li","Shichao Zhang","Mohammed Bennamoun","Farid Boussaid"],"pdf_url":"https://arxiv.org/pdf/2411.08569v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.10929v4","updated":"2024-11-13T12:27:38Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. 
From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v4.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.08567v1","updated":"2024-11-13T12:27:21Z","published":"2024-11-13T12:27:21Z","title":"Saliency Map-based Image Retrieval using Invariant Krawtchouk Moments","summary":" With the widespread adoption of digital devices equipped with cameras and the\nrapid development of Internet technology, numerous content-based image\nretrieval systems and novel image feature extraction techniques have emerged in\nrecent years. This paper introduces a saliency map-based image retrieval\napproach using invariant Krawtchouk moments (SM-IKM) to enhance retrieval speed\nand accuracy. The proposed method applies a global contrast-based salient\nregion detection algorithm to create a saliency map that effectively isolates\nthe foreground from the background. 
It then combines multiple orders of\ninvariant Krawtchouk moments (IKM) with local binary patterns (LBPs) and color\nhistograms to comprehensively represent the foreground and background.\nAdditionally, it incorporates LBPs derived from the saliency map to improve\ndiscriminative power, facilitating more precise image differentiation. A\nbag-of-visual-words (BoVW) model is employed to generate a codebook for\nclassification and discrimination. By using compact IKMs in the BoVW framework\nand integrating a range of region-based feature-including color histograms,\nLBPs, and saliency map-enhanced LBPs, our proposed SM-IKM achieves efficient\nand accurate image retrieval. xtensive experiments on publicly available\ndatasets, such as Caltech 101 and Wang, demonstrate that SM-IKM outperforms\nrecent state-of-the-art retrieval methods. The source code for SM-IKM is\navailable at github.com/arnejad/SMIKM.\n","authors":["Ashkan Nejad","Mohammad Reza Faraji","Xiaojun Qi"],"pdf_url":"https://arxiv.org/pdf/2411.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18723v2","updated":"2024-11-13T12:02:29Z","published":"2024-10-24T13:28:40Z","title":"VoxelKeypointFusion: Generalizable Multi-View Multi-Person Pose\n Estimation","summary":" In the rapidly evolving field of computer vision, the task of accurately\nestimating the poses of multiple individuals from various viewpoints presents a\nformidable challenge, especially if the estimations should be reliable as well.\nThis work presents an extensive evaluation of the generalization capabilities\nof multi-view multi-person pose estimators to unseen datasets and presents a\nnew algorithm with strong performance in this task. It also studies the\nimprovements by additionally using depth information. 
Since the new approach\ncan not only generalize well to unseen datasets, but also to different\nkeypoints, the first multi-view multi-person whole-body estimator is presented.\nTo support further research on those topics, all of the work is publicly\naccessible.\n","authors":["Daniel Bermuth","Alexander Poeppel","Wolfgang Reif"],"pdf_url":"https://arxiv.org/pdf/2410.18723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08545v1","updated":"2024-11-13T11:46:42Z","published":"2024-11-13T11:46:42Z","title":"APDDv2: Aesthetics of Paintings and Drawings Dataset with Artist Labeled\n Scores and Comments","summary":" Datasets play a pivotal role in training visual models, facilitating the\ndevelopment of abstract understandings of visual features through diverse image\nsamples and multidimensional attributes. However, in the realm of aesthetic\nevaluation of artistic images, datasets remain relatively scarce. Existing\npainting datasets are often characterized by limited scoring dimensions and\ninsufficient annotations, thereby constraining the advancement and application\nof automatic aesthetic evaluation methods in the domain of painting. To bridge\nthis gap, we introduce the Aesthetics Paintings and Drawings Dataset (APDD),\nthe first comprehensive collection of paintings encompassing 24 distinct\nartistic categories and 10 aesthetic attributes. Building upon the initial\nrelease of APDDv1, our ongoing research has identified opportunities for\nenhancement in data scale and annotation precision. Consequently, APDDv2 boasts\nan expanded image corpus and improved annotation quality, featuring detailed\nlanguage comments to better cater to the needs of both researchers and\npractitioners seeking high-quality painting datasets. Furthermore, we present\nan updated version of the Art Assessment Network for Specific Painting Styles,\ndenoted as ArtCLIP. 
Experimental validation demonstrates the superior\nperformance of this revised model in the realm of aesthetic evaluation,\nsurpassing its predecessor in accuracy and efficacy. The dataset and model are\navailable at https://github.com/BestiVictory/APDDv2.git.\n","authors":["Xin Jin","Qianqian Qiao","Yi Lu","Huaye Wang","Heng Huang","Shan Gao","Jianfei Liu","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2411.08545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01555v2","updated":"2024-11-13T11:44:10Z","published":"2024-02-02T16:47:18Z","title":"SLYKLatent: A Learning Framework for Gaze Estimation Using Deep Facial\n Feature Learning","summary":" In this research, we present SLYKLatent, a novel approach for enhancing gaze\nestimation by addressing appearance instability challenges in datasets due to\naleatoric uncertainties, covariant shifts, and test domain generalization.\nSLYKLatent utilizes Self-Supervised Learning for initial training with facial\nexpression datasets, followed by refinement with a patch-based tri-branch\nnetwork and an inverse explained variance-weighted training loss function. Our\nevaluation on benchmark datasets achieves a 10.9% improvement on Gaze360,\nsupersedes top MPIIFaceGaze results with 3.8%, and leads on a subset of\nETH-XGaze by 11.6%, surpassing existing methods by significant margins.\nAdaptability tests on RAF-DB and Affectnet show 86.4% and 60.9% accuracies,\nrespectively. Ablation studies confirm the effectiveness of SLYKLatent's novel\ncomponents.\n","authors":["Samuel Adebayo","Joost C. Dessing","Seán McLoone"],"pdf_url":"https://arxiv.org/pdf/2402.01555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08537v1","updated":"2024-11-13T11:35:39Z","published":"2024-11-13T11:35:39Z","title":"MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal\n Lymphatic Vessel Segmentation","summary":" Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste\nproducts from the human brain. 
An impairment in their functionality has been\nassociated with aging as well as brain disorders like multiple sclerosis and\nAlzheimer's disease. However, MLVs have only recently been described for the\nfirst time in magnetic resonance imaging (MRI), and their ramified structure\nrenders manual segmentation particularly difficult. Further, as there is no\nconsistent notion of their appearance, human-annotated MLV structures contain a\nhigh inter-rater variability that most automatic segmentation methods cannot\ntake into account. In this work, we propose a new rater-aware training scheme\nfor the popular nnU-Net model, and we explore rater-based ensembling strategies\nfor accurate and consistent segmentation of MLVs. This enables us to boost\nnnU-Net's performance while obtaining explicit predictions in different\nannotation styles and a rater-based uncertainty estimation. Our final model,\nMLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to\nthe human reference standard. The model further matches the human inter-rater\nreliability and replicates age-related associations with MLV volume.\n","authors":["Fabian Bongratz","Markus Karmann","Adrian Holz","Moritz Bonhoeffer","Viktor Neumaier","Sarah Deli","Benita Schmitz-Koep","Claus Zimmer","Christian Sorg","Melissa Thalhammer","Dennis M Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2411.08537v1.pdf","comment":"ML4H 2024"},{"id":"http://arxiv.org/abs/2411.08531v1","updated":"2024-11-13T11:25:26Z","published":"2024-11-13T11:25:26Z","title":"Classification and Morphological Analysis of DLBCL Subtypes in\n H\\&E-Stained Slides","summary":" We address the challenge of automated classification of diffuse large B-cell\nlymphoma (DLBCL) into its two primary subtypes: activated B-cell-like (ABC) and\ngerminal center B-cell-like (GCB). 
Accurate classification between these\nsubtypes is essential for determining the appropriate therapeutic strategy,\ngiven their distinct molecular profiles and treatment responses. Our proposed\ndeep learning model demonstrates robust performance, achieving an average area\nunder the curve (AUC) of (87.4 pm 5.7)\\% during cross-validation. It shows a\nhigh positive predictive value (PPV), highlighting its potential for clinical\napplication, such as triaging for molecular testing. To gain biological\ninsights, we performed an analysis of morphological features of ABC and GCB\nsubtypes. We segmented cell nuclei using a pre-trained deep neural network and\ncompared the statistics of geometric and color features for ABC and GCB. We\nfound that the distributions of these features were not very different for the\ntwo subtypes, which suggests that the visual differences between them are more\nsubtle. These results underscore the potential of our method to assist in more\nprecise subtype classification and can contribute to improved treatment\nmanagement and outcomes for patients of DLBCL.\n","authors":["Ravi Kant Gupta","Mohit Jindal","Garima Jain","Epari Sridhar","Subhash Yadav","Hasmukh Jain","Tanuja Shet","Uma Sakhdeo","Manju Sengar","Lingaraj Nayak","Bhausaheb Bagal","Umesh Apkare","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08530v1","updated":"2024-11-13T11:24:12Z","published":"2024-11-13T11:24:12Z","title":"Efficient Whole Slide Image Classification through Fisher Vector\n Representation","summary":" The advancement of digital pathology, particularly through computational\nanalysis of whole slide images (WSI), is poised to significantly enhance\ndiagnostic precision and efficiency. However, the large size and complexity of\nWSIs make it difficult to analyze and classify them using computers. 
This study\nintroduces a novel method for WSI classification by automating the\nidentification and examination of the most informative patches, thus\neliminating the need to process the entire slide. Our method involves\ntwo-stages: firstly, it extracts only a few patches from the WSIs based on\ntheir pathological significance; and secondly, it employs Fisher vectors (FVs)\nfor representing features extracted from these patches, which is known for its\nrobustness in capturing fine-grained details. This approach not only\naccentuates key pathological features within the WSI representation but also\nsignificantly reduces computational overhead, thus making the process more\nefficient and scalable. We have rigorously evaluated the proposed method across\nmultiple datasets to benchmark its performance against comprehensive WSI\nanalysis and contemporary weakly-supervised learning methodologies. The\nempirical results indicate that our focused analysis of select patches,\ncombined with Fisher vector representation, not only aligns with, but at times\nsurpasses, the classification accuracy of standard practices. Moreover, this\nstrategy notably diminishes computational load and resource expenditure,\nthereby establishing an efficient and precise framework for WSI analysis in the\nrealm of digital pathology.\n","authors":["Ravi Kant Gupta","Dadi Dharani","Shambhavi Shanker","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19006v4","updated":"2024-11-13T10:56:14Z","published":"2024-06-27T08:45:31Z","title":"Snakes and Ladders: Two Steps Up for VideoMamba","summary":" Video understanding requires the extraction of rich spatio-temporal\nrepresentations, which transformer models achieve through self-attention.\nUnfortunately, self-attention poses a computational burden. In NLP, Mamba has\nsurfaced as an efficient alternative for transformers. 
However, Mamba's\nsuccesses do not trivially extend to vision tasks, including those in video\nanalysis. In this paper, we theoretically analyze the differences between\nself-attention and Mamba. We identify two limitations in Mamba's token\nprocessing: historical decay and element contradiction. We propose\nVideoMambaPro (VMP) that solves the identified limitations by adding masked\nbackward computation and elemental residual connections to a VideoMamba\nbackbone. Differently sized VideoMambaPro models surpass VideoMamba by 1.6-2.8%\nand 1.1-1.9% top-1 on Kinetics-400 and Something-Something V2, respectively.\nEven without extensive pre-training, our models present an increasingly\nattractive and efficient alternative to current transformer models. Moreover,\nour two solutions are orthogonal to recent advances in Vision Mamba models, and\nare likely to provide further improvements in future models.\n","authors":["Hui Lu","Albert Ali Salah","Ronald Poppe"],"pdf_url":"https://arxiv.org/pdf/2406.19006v4.pdf","comment":"New updated experiment results"},{"id":"http://arxiv.org/abs/2411.08508v1","updated":"2024-11-13T10:43:39Z","published":"2024-11-13T10:43:39Z","title":"BillBoard Splatting (BBSplat): Learnable Textured Primitives for Novel\n View Synthesis","summary":" We present billboard Splatting (BBSplat) - a novel approach for 3D scene\nrepresentation based on textured geometric primitives. BBSplat represents the\nscene as a set of optimizable textured planar primitives with learnable RGB\ntextures and alpha-maps to control their shape. BBSplat primitives can be used\nin any Gaussian Splatting pipeline as drop-in replacements for Gaussians. Our\nmethod's qualitative and quantitative improvements over 3D and 2D Gaussians are\nmost noticeable when fewer primitives are used, when BBSplat achieves over 1200\nFPS. 
Our novel regularization term encourages textures to have a sparser\nstructure, unlocking an efficient compression that leads to a reduction in\nstorage space of the model. Our experiments show the efficiency of BBSplat on\nstandard datasets of real indoor and outdoor scenes such as Tanks&Temples, DTU,\nand Mip-NeRF-360. We demonstrate improvements on PSNR, SSIM, and LPIPS metrics\ncompared to the state-of-the-art, especially for the case when fewer primitives\nare used, which, on the other hand, leads to up to 2 times inference speed\nimprovement for the same rendering quality.\n","authors":["David Svitov","Pietro Morerio","Lourdes Agapito","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2411.08508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07940v2","updated":"2024-11-13T10:29:51Z","published":"2024-11-12T17:09:20Z","title":"Automatic dataset shift identification to support root cause analysis of\n AI performance drift","summary":" Shifts in data distribution can substantially harm the performance of\nclinical AI models. Hence, various methods have been developed to detect the\npresence of such shifts at deployment time. However, root causes of dataset\nshifts are varied, and the choice of shift mitigation strategies is highly\ndependent on the precise type of shift encountered at test time. As such,\ndetecting test-time dataset shift is not sufficient: precisely identifying\nwhich type of shift has occurred is critical. In this work, we propose the\nfirst unsupervised dataset shift identification framework, effectively\ndistinguishing between prevalence shift (caused by a change in the label\ndistribution), covariate shift (caused by a change in input characteristics)\nand mixed shifts (simultaneous prevalence and covariate shifts). 
We discuss the\nimportance of self-supervised encoders for detecting subtle covariate shifts\nand propose a novel shift detector leveraging both self-supervised encoders and\ntask model outputs for improved shift detection. We report promising results\nfor the proposed shift identification framework across three different imaging\nmodalities (chest radiography, digital mammography, and retinal fundus images)\non five types of real-world dataset shifts, using four large publicly available\ndatasets.\n","authors":["Mélanie Roschewitz","Raghav Mehta","Charles Jones","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2411.07940v2.pdf","comment":"Code available at\n https://github.com/biomedia-mira/shift_identification"},{"id":"http://arxiv.org/abs/2312.06978v4","updated":"2024-11-13T10:29:01Z","published":"2023-12-12T04:38:30Z","title":"CLASS-M: Adaptive stain separation-based contrastive learning with\n pseudo-labeling for histopathological image classification","summary":" Histopathological image classification is an important task in medical image\nanalysis. Recent approaches generally rely on weakly supervised learning due to\nthe ease of acquiring case-level labels from pathology reports. However,\npatch-level classification is preferable in applications where only a limited\nnumber of cases are available or when local prediction accuracy is critical. On\nthe other hand, acquiring extensive datasets with localized labels for training\nis not feasible. In this paper, we propose a semi-supervised patch-level\nhistopathological image classification model, named CLASS-M, that does not\nrequire extensively labeled datasets. CLASS-M is formed by two main parts: a\ncontrastive learning module that uses separated Hematoxylin and Eosin images\ngenerated through an adaptive stain separation process, and a module with\npseudo-labels using MixUp. We compare our model with other state-of-the-art\nmodels on two clear cell renal cell carcinoma datasets. 
We demonstrate that our\nCLASS-M model has the best performance on both datasets. Our code is available\nat github.com/BzhangURU/Paper_CLASS-M/tree/main\n","authors":["Bodong Zhang","Hamid Manoochehri","Man Minh Ho","Fahimeh Fooladgar","Yosep Chong","Beatrice S. Knudsen","Deepika Sirohi","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2312.06978v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08490v1","updated":"2024-11-13T10:15:27Z","published":"2024-11-13T10:15:27Z","title":"Impact of Iris Pigmentation on Performance Bias in Visible Iris\n Verification Systems: A Comparative Study","summary":" Iris recognition technology plays a critical role in biometric identification\nsystems, but their performance can be affected by variations in iris\npigmentation. In this work, we investigate the impact of iris pigmentation on\nthe efficacy of biometric recognition systems, focusing on a comparative\nanalysis of blue and dark irises. Data sets were collected using multiple\ndevices, including P1, P2, and P3 smartphones [4], to assess the robustness of\nthe systems in different capture environments [19]. Both traditional machine\nlearning techniques and deep learning models were used, namely Open-Iris,\nViT-b, and ResNet50, to evaluate performance metrics such as Equal Error Rate\n(EER) and True Match Rate (TMR). Our results indicate that iris recognition\nsystems generally exhibit higher accuracy for blue irises compared to dark\nirises. Furthermore, we examined the generalization capabilities of these\nsystems across different iris colors and devices, finding that while training\non diverse datasets enhances recognition performance, the degree of improvement\nis contingent on the specific model and device used. Our analysis also\nidentifies inherent biases in recognition performance related to iris color and\ncross-device variability. 
These findings underscore the need for more inclusive\ndataset collection and model refinement to reduce bias and promote equitable\nbiometric recognition across varying iris pigmentation and device\nconfigurations.\n","authors":["Geetanjali Sharma","Abhishek Tandon","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2411.08490v1.pdf","comment":"14 pages, 5 figures, 5 Tables"},{"id":"http://arxiv.org/abs/2411.08488v1","updated":"2024-11-13T10:13:23Z","published":"2024-11-13T10:13:23Z","title":"UNSCT-HRNet: Modeling Anatomical Uncertainty for Landmark Detection in\n Total Hip Arthroplasty","summary":" Total hip arthroplasty (THA) relies on accurate landmark detection from\nradiographic images, but unstructured data caused by irregular patient postures\nor occluded anatomical markers pose significant challenges for existing\nmethods. To address this, we propose UNSCT-HRNet (Unstructured CT -\nHigh-Resolution Net), a deep learning-based framework that integrates a Spatial\nRelationship Fusion (SRF) module and an Uncertainty Estimation (UE) module. The\nSRF module, utilizing coordinate convolution and polarized attention, enhances\nthe model's ability to capture complex spatial relationships. Meanwhile, the UE\nmodule which based on entropy ensures predictions are anatomically relevant.\nFor unstructured data, the proposed method can predict landmarks without\nrelying on the fixed number of points, which shows higher accuracy and better\nrobustness comparing with the existing methods. Our UNSCT-HRNet demonstrates\nover a 60% improvement across multiple metrics in unstructured data. The\nexperimental results also reveal that our approach maintains good performance\non the structured dataset. 
Overall, the proposed UNSCT-HRNet has the potential\nto be used as a new reliable, automated solution for THA surgical planning and\npostoperative monitoring.\n","authors":["Jiaxin Wan","Lin Liu","Haoran Wang","Liangwei Li","Wei Li","Shuheng Kou","Runtian Li","Jiayi Tang","Juanxiu Liu","Jing Zhang","Xiaohui Du","Ruqian Hao"],"pdf_url":"https://arxiv.org/pdf/2411.08488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08482v1","updated":"2024-11-13T10:01:33Z","published":"2024-11-13T10:01:33Z","title":"Methodology for a Statistical Analysis of Influencing Factors on 3D\n Object Detection Performance","summary":" In autonomous driving, object detection is an essential task to perceive the\nenvironment by localizing and classifying objects. Most object detection\nalgorithms rely on deep learning for their superior performance. However, their\nblack box nature makes it challenging to ensure safety. In this paper, we\npropose a first-of-its-kind methodology for statistical analysis of the\ninfluence of various factors related to the objects to detect or the\nenvironment on the detection performance of both LiDAR- and camera-based 3D\nobject detectors. We perform a univariate analysis between each of the factors\nand the detection error in order to compare the strength of influence. To\nbetter identify potential sources of detection errors, we also analyze the\nperformance in dependency of the influencing factors and examine the\ninterdependencies between the different influencing factors. 
Recognizing the\nfactors that influence detection performance helps identify robustness issues\nin the trained object detector and supports the safety approval of object\ndetection systems.\n","authors":["Anton Kuznietsov","Dirk Schweickard","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2411.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17804v3","updated":"2024-11-13T09:50:48Z","published":"2024-06-22T15:24:33Z","title":"A Review of Electromagnetic Elimination Methods for low-field portable\n MRI scanner","summary":" This paper analyzes conventional and deep learning methods for eliminating\nelectromagnetic interference (EMI) in MRI systems. We compare traditional\nanalytical and adaptive techniques with advanced deep learning approaches. Key\nstrengths and limitations of each method are highlighted. Recent advancements\nin active EMI elimination, such as external EMI receiver coils, are discussed\nalongside deep learning methods, which show superior EMI suppression by\nleveraging neural networks trained on MRI data. While deep learning improves\nEMI elimination and diagnostic capabilities, it introduces security and safety\nconcerns, particularly in commercial applications. 
A balanced approach,\nintegrating conventional reliability with deep learning's advanced\ncapabilities, is proposed for more effective EMI suppression in MRI systems.\n","authors":["Wanyu Bian","Panfeng Li","Mengyao Zheng","Chihang Wang","Anying Li","Ying Li","Haowei Ni","Zixuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.17804v3.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2411.08472v1","updated":"2024-11-13T09:46:08Z","published":"2024-11-13T09:46:08Z","title":"A survey on Graph Deep Representation Learning for Facial Expression\n Recognition","summary":" This comprehensive review delves deeply into the various methodologies\napplied to facial expression recognition (FER) through the lens of graph\nrepresentation learning (GRL). Initially, we introduce the task of FER and the\nconcepts of graph representation and GRL. Afterward, we discuss some of the\nmost prevalent and valuable databases for this task. We explore promising\napproaches for graph representation in FER, including graph diffusion,\nspatio-temporal graphs, and multi-stream architectures. Finally, we identify\nfuture research opportunities and provide concluding remarks.\n","authors":["Théo Gueuret","Akrem Sellami","Chaabane Djeraba"],"pdf_url":"https://arxiv.org/pdf/2411.08472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08470v1","updated":"2024-11-13T09:42:12Z","published":"2024-11-13T09:42:12Z","title":"HyperFace: Generating Synthetic Face Recognition Datasets by Exploring\n Face Embedding Hypersphere","summary":" Face recognition datasets are often collected by crawling Internet and\nwithout individuals' consents, raising ethical and privacy concerns. Generating\nsynthetic datasets for training face recognition models has emerged as a\npromising alternative. 
However, the generation of synthetic datasets remains\nchallenging as it entails adequate inter-class and intra-class variations.\nWhile advances in generative models have made it easier to increase intra-class\nvariations in face datasets (such as pose, illumination, etc.), generating\nsufficient inter-class variation is still a difficult task. In this paper, we\nformulate the dataset generation as a packing problem on the embedding space\n(represented on a hypersphere) of a face recognition model and propose a new\nsynthetic dataset generation approach, called HyperFace. We formalize our\npacking problem as an optimization problem and solve it with a gradient\ndescent-based approach. Then, we use a conditional face generator model to\nsynthesize face images from the optimized embeddings. We use our generated\ndatasets to train face recognition models and evaluate the trained models on\nseveral benchmarking real datasets. Our experimental results show that models\ntrained with HyperFace achieve state-of-the-art performance in training face\nrecognition using synthetic datasets.\n","authors":["Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.08470v1.pdf","comment":"Accepted in NeurIPS 2024 Safe Generative AI Workshop"},{"id":"http://arxiv.org/abs/2406.16439v4","updated":"2024-11-13T09:41:00Z","published":"2024-06-24T08:30:03Z","title":"Exploring Test-Time Adaptation for Object Detection in Continually\n Changing Environments","summary":" Real-world application models are commonly deployed in dynamic environments,\nwhere the target domain distribution undergoes temporal changes. 
Continual\nTest-Time Adaptation (CTTA) has recently emerged as a promising technique to\ngradually adapt a source-trained model to continually changing target domains.\nDespite recent advancements in addressing CTTA, two critical issues remain: 1)\nFixed thresholds for pseudo-labeling in existing methodologies lead to\nlow-quality pseudo-labels, as model confidence varies across categories and\ndomains; 2) Stochastic parameter restoration methods for mitigating\ncatastrophic forgetting fail to preserve critical information effectively, due\nto their intrinsic randomness. To tackle these challenges for detection models\nin CTTA scenarios, we present AMROD, featuring three core components. Firstly,\nthe object-level contrastive learning module extracts object-level features for\ncontrastive learning to refine the feature representation in the target domain.\nSecondly, the adaptive monitoring module dynamically skips unnecessary\nadaptation and updates the category-specific threshold based on predicted\nconfidence scores to enable efficiency and improve the quality of\npseudo-labels. Lastly, the adaptive randomized restoration mechanism\nselectively reset inactive parameters with higher possibilities, ensuring the\nretention of essential knowledge. We demonstrate the effectiveness of AMROD on\nfour CTTA object detection tasks, where AMROD outperforms existing methods,\nespecially achieving a 3.2 mAP improvement and a 20% increase in efficiency on\nthe Cityscapes-to-Cityscapes-C CTTA task. 
The code will be released.\n","authors":["Shilei Cao","Yan Liu","Juepeng Zheng","Weijia Li","Runmin Dong","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2406.16439v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08466v1","updated":"2024-11-13T09:37:24Z","published":"2024-11-13T09:37:24Z","title":"Can MLLMs Guide Weakly-Supervised Temporal Action Localization Tasks?","summary":" Recent breakthroughs in Multimodal Large Language Models (MLLMs) have gained\nsignificant recognition within the deep learning community, where the fusion of\nthe Video Foundation Models (VFMs) and Large Language Models(LLMs) has proven\ninstrumental in constructing robust video understanding systems, effectively\nsurmounting constraints associated with predefined visual tasks. These\nsophisticated MLLMs exhibit remarkable proficiency in comprehending videos,\nswiftly attaining unprecedented performance levels across diverse benchmarks.\nHowever, their operation demands substantial memory and computational\nresources, underscoring the continued importance of traditional models in video\ncomprehension tasks. In this paper, we introduce a novel learning paradigm\ntermed MLLM4WTAL. This paradigm harnesses the potential of MLLM to offer\ntemporal action key semantics and complete semantic priors for conventional\nWeakly-supervised Temporal Action Localization (WTAL) methods. MLLM4WTAL\nfacilitates the enhancement of WTAL by leveraging MLLM guidance. It achieves\nthis by integrating two distinct modules: Key Semantic Matching (KSM) and\nComplete Semantic Reconstruction (CSR). These modules work in tandem to\neffectively address prevalent issues like incomplete and over-complete outcomes\ncommon in WTAL methods. 
Rigorous experiments are conducted to validate the\nefficacy of our proposed approach in augmenting the performance of various\nheterogeneous WTAL models.\n","authors":["Quan Zhang","Yuxin Qi"],"pdf_url":"https://arxiv.org/pdf/2411.08466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06098v2","updated":"2024-11-13T09:31:14Z","published":"2024-11-09T07:19:56Z","title":"LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning","summary":" Deep long-tailed recognition has been widely studied to address the issue of\nimbalanced data distributions in real-world scenarios. However, there has been\ninsufficient focus on the design of neural architectures, despite empirical\nevidence suggesting that architecture can significantly impact performance. In\nthis paper, we attempt to mitigate long-tailed issues through architectural\nimprovements. To simplify the design process, we utilize Differential\nArchitecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS\nmethods struggle to perform well in long-tailed scenarios. To tackle this\nchallenge, we introduce Long-Tailed Differential Architecture Search\n(LT-DARTS). Specifically, we conduct extensive experiments to explore\narchitectural components that demonstrate better performance on long-tailed\ndata and propose a new search space based on our observations. This ensures\nthat the architecture obtained through our search process incorporates superior\ncomponents. Additionally, we propose replacing the learnable linear classifier\nwith an Equiangular Tight Frame (ETF) classifier to further enhance our method.\nThis classifier effectively alleviates the biased search process and prevents\nperformance collapse. 
Extensive experimental evaluations demonstrate that our\napproach consistently improves upon existing methods from an orthogonal\nperspective and achieves state-of-the-art results with simple enhancements.\n","authors":["Yuhan Pan","Yanan Sun","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2411.06098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08460v1","updated":"2024-11-13T09:31:06Z","published":"2024-11-13T09:31:06Z","title":"Trap-MID: Trapdoor-based Defense against Model Inversion Attacks","summary":" Model Inversion (MI) attacks pose a significant threat to the privacy of Deep\nNeural Networks by recovering training data distribution from well-trained\nmodels. While existing defenses often rely on regularization techniques to\nreduce information leakage, they remain vulnerable to recent attacks. In this\npaper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to\nmislead MI attacks. A trapdoor is integrated into the model to predict a\nspecific label when the input is injected with the corresponding trigger.\nConsequently, this trapdoor information serves as the \"shortcut\" for MI\nattacks, leading them to extract trapdoor triggers rather than private data. We\nprovide theoretical insights into the impacts of trapdoor's effectiveness and\nnaturalness on deceiving MI attacks. In addition, empirical experiments\ndemonstrate the state-of-the-art defense performance of Trap-MID against\nvarious MI attacks without the requirements for extra data or large\ncomputational overhead. 
Our source code is publicly available at\nhttps://github.com/ntuaislab/Trap-MID.\n","authors":["Zhen-Ting Liu","Shang-Tse Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08460v1.pdf","comment":"Accepted by Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2411.00393v4","updated":"2024-11-13T09:27:41Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08453v1","updated":"2024-11-13T09:16:21Z","published":"2024-11-13T09:16:21Z","title":"Biomass phenotyping of oilseed rape through UAV multi-view oblique\n imaging with 3DGS and SAM model","summary":" Biomass estimation of oilseed rape is crucial for optimizing crop\nproductivity and breeding strategies. 
While UAV-based imaging has advanced\nhigh-throughput phenotyping, current methods often rely on orthophoto images,\nwhich struggle with overlapping leaves and incomplete structural information in\ncomplex field environments. This study integrates 3D Gaussian Splatting (3DGS)\nwith the Segment Anything Model (SAM) for precise 3D reconstruction and biomass\nestimation of oilseed rape. UAV multi-view oblique images from 36 angles were\nused to perform 3D reconstruction, with the SAM module enhancing point cloud\nsegmentation. The segmented point clouds were then converted into point cloud\nvolumes, which were fitted to ground-measured biomass using linear regression.\nThe results showed that 3DGS (7k and 30k iterations) provided high accuracy,\nwith peak signal-to-noise ratios (PSNR) of 27.43 and 29.53 and training times\nof 7 and 49 minutes, respectively. This performance exceeded that of structure\nfrom motion (SfM) and mipmap Neural Radiance Fields (Mip-NeRF), demonstrating\nsuperior efficiency. The SAM module achieved high segmentation accuracy, with a\nmean intersection over union (mIoU) of 0.961 and an F1-score of 0.980.\nAdditionally, a comparison of biomass extraction models found the point cloud\nvolume model to be the most accurate, with an determination coefficient (R2) of\n0.976, root mean square error (RMSE) of 2.92 g/plant, and mean absolute\npercentage error (MAPE) of 6.81%, outperforming both the plot crop volume and\nindividual crop volume models. 
This study highlights the potential of combining\n3DGS with multi-view UAV imaging for improved biomass phenotyping.\n","authors":["Yutao Shen","Hongyu Zhou","Xin Yang","Xuqi Lu","Ziyue Guo","Lixi Jiang","Yong He","Haiyan Cen"],"pdf_url":"https://arxiv.org/pdf/2411.08453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08451v1","updated":"2024-11-13T09:14:35Z","published":"2024-11-13T09:14:35Z","title":"AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference\n Understanding","summary":" Embodied reference understanding is crucial for intelligent agents to predict\nreferents based on human intention through gesture signals and language\ndescriptions. This paper introduces the Attention-Dynamic DINO, a novel\nframework designed to mitigate misinterpretations of pointing gestures across\nvarious interaction contexts. Our approach integrates visual and textual\nfeatures to simultaneously predict the target object's bounding box and the\nattention source in pointing gestures. Leveraging the distance-aware nature of\nnonverbal communication in visual perspective taking, we extend the virtual\ntouch line mechanism and propose an attention-dynamic touch line to represent\nreferring gesture based on interactive distances. The combination of this\ndistance-aware approach and independent prediction of the attention source,\nenhances the alignment between objects and the gesture represented line.\nExtensive experiments on the YouRefIt dataset demonstrate the efficacy of our\ngesture information understanding method in significantly improving task\nperformance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and,\nnotably, surpasses human performance at the 0.75 IoU threshold, marking a first\nin this domain. 
Comparative experiments with distance-unaware understanding\nmethods from previous research further validate the superiority of the\nAttention-Dynamic Touch Line across diverse contexts.\n","authors":["Hao Guo","Wei Fan","Baichun Wei","Jianfei Zhu","Jin Tian","Chunzhi Yi","Feng Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.08451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06645v4","updated":"2024-11-13T09:14:12Z","published":"2024-10-09T07:57:47Z","title":"Continual Learning in the Frequency Domain","summary":" Continual learning (CL) is designed to learn new tasks while preserving\nexisting knowledge. Replaying samples from earlier tasks has proven to be an\neffective method to mitigate the forgetting of previously acquired knowledge.\nHowever, the current research on the training efficiency of rehearsal-based\nmethods is insufficient, which limits the practical application of CL systems\nin resource-limited scenarios. The human visual system (HVS) exhibits varying\nsensitivities to different frequency components, enabling the efficient\nelimination of visually redundant information. Inspired by HVS, we propose a\nnovel framework called Continual Learning in the Frequency Domain (CLFD). To\nour knowledge, this is the first study to utilize frequency domain features to\nenhance the performance and efficiency of CL training on edge devices. For the\ninput features of the feature extractor, CLFD employs wavelet transform to map\nthe original input image into the frequency domain, thereby effectively\nreducing the size of input feature maps. Regarding the output features of the\nfeature extractor, CLFD selectively utilizes output features for distinct\nclasses for classification, thereby balancing the reusability and interference\nof output features based on the frequency domain similarity of the classes\nacross various tasks. 
Optimizing only the input and output features of the\nfeature extractor allows for seamless integration of CLFD with various\nrehearsal-based methods. Extensive experiments conducted in both cloud and edge\nenvironments demonstrate that CLFD consistently improves the performance of\nstate-of-the-art (SOTA) methods in both precision and training efficiency.\nSpecifically, CLFD can increase the accuracy of the SOTA CL method by up to\n6.83% and reduce the training time by 2.6$\\times$.\n","authors":["Ruiqi Liu","Boyu Diao","Libo Huang","Zijia An","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.06645v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23828v2","updated":"2024-11-13T09:06:18Z","published":"2024-10-31T11:20:13Z","title":"Show Me What and Where has Changed? Question Answering and Grounding for\n Remote Sensing Change Detection","summary":" Remote sensing change detection aims to perceive changes occurring on the\nEarth's surface from remote sensing data in different periods, and feed these\nchanges back to humans. However, most existing methods only focus on detecting\nchange regions, lacking the capability to interact with users to identify\nchanges that the users expect. In this paper, we introduce a new task named\nChange Detection Question Answering and Grounding (CDQAG), which extends the\ntraditional change detection task by providing interpretable textual answers\nand intuitive visual evidence. To this end, we construct the first CDQAG\nbenchmark dataset, termed QAG-360K, comprising over 360K triplets of questions,\ntextual answers, and corresponding high-quality visual masks. 
It encompasses 10\nessential land-cover categories and 8 comprehensive question types, which\nprovides a valuable and diverse dataset for remote sensing applications.\nFurthermore, we present VisTA, a simple yet effective baseline method that\nunifies the tasks of question answering and grounding by delivering both visual\nand textual answers. Our method achieves state-of-the-art results on both the\nclassic change detection-based visual question answering (CDVQA) and the\nproposed CDQAG datasets. Extensive qualitative and quantitative experimental\nresults provide useful insights for developing better CDQAG models, and we hope\nthat our work can inspire further research in this important yet underexplored\nresearch field. The proposed benchmark dataset and method are available at\nhttps://github.com/like413/VisTA.\n","authors":["Ke Li","Fuyu Dong","Di Wang","Shaofeng Li","Quan Wang","Xinbo Gao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.23828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21991v4","updated":"2024-11-13T08:59:31Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, most of them were\nblack-box systems which faced challenges regarding explainability during\ntraining and inference processes. An important question is how to incorporate\nexplicit knowledge into these implicit models, thereby designing expert-driven\nand interpretable violence surveillance systems. This paper proposes a new\nparadigm for weakly supervised violence monitoring (WSVM) called Rule base\nViolence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure\nwith different designs for images and text. 
One of the branches is called the\nimplicit branch, which uses only visual features for coarse-grained binary\nclassification. In this branch, image feature extraction is divided into two\nchannels: one responsible for extracting scene frames and the other focusing on\nextracting actions. The other branch is called the explicit branch, which\nutilizes language-image alignment to perform fine-grained classification. For\nthe language channel design in the explicit branch, the proposed RuleCLIP uses\nthe state-of-the-art YOLO-World model to detect objects in video frames, and\nassociation rules are identified through data mining methods as descriptions of\nthe video. Leveraging the dual-branch architecture, RuleVM achieves\ninterpretable coarse-grained and fine-grained violence surveillance. Extensive\nexperiments were conducted on two commonly used benchmarks, and the results\nshow that RuleCLIP achieved the best performance in both coarse-grained and\nfine-grained monitoring, significantly outperforming existing state-of-the-art\nmethods. Moreover, interpretability experiments uncovered some interesting\nrules, such as the observation that as the number of people increases, the risk\nlevel of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Ssu-Chi Kuai","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v4.pdf","comment":"12 pages,7 figures IEEE TSMCA (Under review)"},{"id":"http://arxiv.org/abs/2411.08443v1","updated":"2024-11-13T08:56:35Z","published":"2024-11-13T08:56:35Z","title":"Machine Unlearning on Pre-trained Models by Residual Feature Alignment\n Using LoRA","summary":" Machine unlearning is new emerged technology that removes a subset of the\ntraining data from a trained model without affecting the model performance on\nthe remaining data. This topic is becoming increasingly important in protecting\nuser privacy and eliminating harmful or outdated data. 
The key challenge lies\nin effectively and efficiently unlearning specific information without\ncompromising the model's utility on the retained data. For the pre-trained\nmodels, fine-tuning is an important way to achieve the unlearning target.\nPrevious work typically fine-tuned the entire model's parameters, which incurs\nsignificant computation costs. In addition, the fine-tuning process may cause\nshifts in the intermediate layer features, affecting the model's overall\nutility. In this work, we propose a novel and efficient machine unlearning\nmethod on pre-trained models. We term the method as Residual Feature Alignment\nUnlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose\nthe model's intermediate features into pre-trained features and residual\nfeatures. By adjusting the residual features, we align the unlearned model with\nthe pre-trained model at the intermediate feature level to achieve both\nunlearning and remaining targets. The method aims to learn the zero residuals\non the retained set and shifted residuals on the unlearning set. Extensive\nexperiments on numerous datasets validate the effectiveness of our approach.\n","authors":["Laiqiao Qin","Tianqing Zhu","Linlin Wang","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09828v4","updated":"2024-11-13T08:34:48Z","published":"2024-05-16T06:05:08Z","title":"*: Improving the 3D detector by introducing Voxel2Pillar feature\n encoding and extracting multi-scale features","summary":" The multi-line LiDAR is widely used in autonomous vehicles, so point\ncloud-based 3D detectors are essential for autonomous driving. Extracting rich\nmulti-scale features is crucial for point cloud-based 3D detectors in\nautonomous driving due to significant differences in the size of different\ntypes of objects. 
However, because of the real-time requirements, large-size\nconvolution kernels are rarely used to extract large-scale features in the\nbackbone. Current 3D detectors commonly use feature pyramid networks to obtain\nlarge-scale features; however, some objects containing fewer point clouds are\nfurther lost during down-sampling, resulting in degraded performance. Since\npillar-based schemes require much less computation than voxel-based schemes,\nthey are more suitable for constructing real-time 3D detectors. Hence, we\npropose the *, a pillar-based scheme. We redesigned the feature encoding, the\nbackbone, and the neck of the 3D detector. We propose the Voxel2Pillar feature\nencoding, which uses a sparse convolution constructor to construct pillars with\nricher point cloud features, especially height features. The Voxel2Pillar adds\nmore learnable parameters to the feature encoding, enabling the initial pillars\nto have higher performance ability. We extract multi-scale and large-scale\nfeatures in the proposed fully sparse backbone, which does not utilize\nlarge-size convolutional kernels; the backbone consists of the proposed\nmulti-scale feature extraction module. The neck consists of the proposed sparse\nConvNeXt, whose simple structure significantly improves the performance. 
We\nvalidate the effectiveness of the proposed * on the Waymo Open Dataset, and the\nobject detection accuracy for vehicles, pedestrians, and cyclists is improved.\nWe also verify the effectiveness of each proposed module in detail through\nablation studies.\n","authors":["Xusheng Li","Chengliang Wang","Shumao Wang","Zhuo Zeng","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2405.09828v4.pdf","comment":"Due to experimental data errors, it needs to be withdrawn"},{"id":"http://arxiv.org/abs/2411.04919v2","updated":"2024-11-13T08:32:27Z","published":"2024-11-07T17:56:16Z","title":"Stem-OB: Generalizable Visual Imitation Learning with Stem-Like\n Convergent Observation through Diffusion Inversion","summary":" Visual imitation learning methods demonstrate strong performance, yet they\nlack generalization when faced with visual input perturbations, including\nvariations in lighting and textures, impeding their real-world application. We\npropose Stem-OB that utilizes pretrained image diffusion models to suppress\nlow-level visual differences while maintaining high-level scene structures.\nThis image inversion process is akin to transforming the observation into a\nshared representation, from which other observations stem, with extraneous\ndetails removed. Stem-OB contrasts with data-augmentation approaches as it is\nrobust to various unspecified appearance changes without the need for\nadditional training. Our method is a simple yet highly effective plug-and-play\nsolution. Empirical results confirm the effectiveness of our approach in\nsimulated tasks and show an exceptionally significant improvement in real-world\napplications, with an average increase of 22.2% in success rates compared to\nthe best baseline. 
See https://hukz18.github.io/Stem-Ob/ for more info.\n","authors":["Kaizhe Hu","Zihang Rui","Yao He","Yuyao Liu","Pu Hua","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04919v2.pdf","comment":"Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/"},{"id":"http://arxiv.org/abs/2411.07501v2","updated":"2024-11-13T08:30:52Z","published":"2024-11-12T02:57:15Z","title":"LAuReL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v2.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.08424v1","updated":"2024-11-13T08:17:52Z","published":"2024-11-13T08:17:52Z","title":"A Heterogeneous Graph Neural Network Fusing Functional and Structural\n Connectivity for MCI Diagnosis","summary":" Brain connectivity alternations associated with brain disorders have been\nwidely reported in resting-state functional imaging (rs-fMRI) and diffusion\ntensor imaging (DTI). While many dual-modal fusion methods based on graph\nneural networks (GNNs) have been proposed, they generally follow homogenous\nfusion ways ignoring rich heterogeneity of dual-modal information. To address\nthis issue, we propose a novel method that integrates functional and structural\nconnectivity based on heterogeneous graph neural networks (HGNNs) to better\nleverage the rich heterogeneity in dual-modal images. We firstly use blood\noxygen level dependency and whiter matter structure information provided by\nrs-fMRI and DTI to establish homo-meta-path, capturing node relationships\nwithin the same modality. At the same time, we propose to establish\nhetero-meta-path based on structure-function coupling and brain community\nsearching to capture relations among cross-modal nodes. Secondly, we further\nintroduce a heterogeneous graph pooling strategy that automatically balances\nhomo- and hetero-meta-path, effectively leveraging heterogeneous information\nand preventing feature confusion after pooling. 
Thirdly, based on the\nflexibility of heterogeneous graphs, we propose a heterogeneous graph data\naugmentation approach that can conveniently address the sample imbalance issue\ncommonly seen in clinical diagnosis. We evaluate our method on ADNI-3 dataset\nfor mild cognitive impairment (MCI) diagnosis. Experimental results indicate\nthe proposed method is effective and superior to other algorithms, with a mean\nclassification accuracy of 93.3%.\n","authors":["Feiyu Yin","Yu Lei","Siyuan Dai","Wenwen Zeng","Guoqing Wu","Liang Zhan","Jinhua Yu"],"pdf_url":"https://arxiv.org/pdf/2411.08424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07579v2","updated":"2024-11-13T08:00:57Z","published":"2024-11-12T06:29:48Z","title":"Projecting Gaussian Ellipsoids While Avoiding Affine Projection\n Approximation","summary":" Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its\nreal-time rendering speed and state-of-the-art rendering quality. However,\nduring the rendering process, the use of the Jacobian of the affine\napproximation of the projection transformation leads to inevitable errors,\nresulting in blurriness, artifacts and a lack of scene consistency in the final\nrendered images. To address this issue, we introduce an ellipsoid-based\nprojection method to calculate the projection of Gaussian ellipsoid on the\nimage plane, witch is the primitive of 3D Gaussian Splatting. As our proposed\nellipsoid-based projection method cannot handle Gaussian ellipsoids with camera\norigins inside them or parts lying below $z=0$ plane in the camera space, we\ndesigned a pre-filtering strategy. 
Experiments over multiple widely adopted\nbenchmark datasets show that using our ellipsoid-based projection method can\nenhance the rendering quality of 3D Gaussian Splatting and its extensions.\n","authors":["Han Qi","Tao Cai","Xiyue Han"],"pdf_url":"https://arxiv.org/pdf/2411.07579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08410v1","updated":"2024-11-13T07:57:19Z","published":"2024-11-13T07:57:19Z","title":"The VLLM Safety Paradox: Dual Ease in Jailbreak Attack and Defense","summary":" The vulnerability of Vision Large Language Models (VLLMs) to jailbreak\nattacks appears as no surprise. However, recent defense mechanisms against\nthese attacks have reached near-saturation performance on benchmarks, often\nwith minimal effort. This simultaneous high performance in both attack and\ndefense presents a perplexing paradox. Resolving it is critical for advancing\nthe development of trustworthy models. To address this research gap, we first\ninvestigate why VLLMs are prone to these attacks. We then make a key\nobservation: existing defense mechanisms suffer from an \\textbf{over-prudence}\nproblem, resulting in unexpected abstention even in the presence of benign\ninputs. Additionally, we find that the two representative evaluation methods\nfor jailbreak often exhibit chance agreement. This limitation makes it\npotentially misleading when evaluating attack strategies or defense mechanisms.\nBeyond these empirical observations, our another contribution in this work is\nto repurpose the guardrails of LLMs on the shelf, as an effective alternative\ndetector prior to VLLM response. 
We believe these findings offer useful\ninsights to rethink the foundational development of VLLM safety with respect to\nbenchmark datasets, evaluation methods, and defense strategies.\n","authors":["Yangyang Guo","Fangkai Jiao","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2411.08410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08402v1","updated":"2024-11-13T07:41:47Z","published":"2024-11-13T07:41:47Z","title":"V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with\n Denoising Diffusion","summary":" Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D\nobject detection using LiDAR and camera data. However, these methods suffer\nfrom performance degradation in adverse weather conditions. The weatherrobust\n4D radar provides Doppler and additional geometric information, raising the\npossibility of addressing this challenge. To this end, we present V2X-R, the\nfirst simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R\ncontains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point\nclouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes.\nSubsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for\n3D object detection and implement it with various fusion strategies. To achieve\nweather-robust detection, we additionally propose a Multi-modal Denoising\nDiffusion (MDD) module in our fusion pipeline. MDD utilizes weather-robust 4D\nradar feature as a condition to prompt the diffusion model to denoise noisy\nLiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline\ndemonstrates superior performance in the V2X-R dataset. Over and above this,\nour MDD module further improved the performance of basic fusion model by up to\n5.73%/6.70% in foggy/snowy conditions with barely disrupting normal\nperformance. 
The dataset and code will be publicly available at:\nhttps://github.com/ylwhxht/V2X-R.\n","authors":["Xun Huang","Jinlong Wang","Qiming Xia","Siheng Chen","Bisheng Yang","Cheng Wang","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2411.08402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08395v1","updated":"2024-11-13T07:27:56Z","published":"2024-11-13T07:27:56Z","title":"MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion\n Prompt for Ultrasound Needle Tracking","summary":" Ultrasound (US)-guided needle insertion is widely employed in percutaneous\ninterventions. However, providing feedback on the needle tip position via US\nimage presents challenges due to noise, artifacts, and the thin imaging plane\nof US, which degrades needle features and leads to intermittent tip visibility.\nIn this paper, a Mamba-based US needle tracker MambaXCTrack utilizing\nstructured state space models cross-correlation (SSMX-Corr) and implicit motion\nprompt is proposed, which is the first application of Mamba in US needle\ntracking. The SSMX-Corr enhances cross-correlation by long-range modeling and\nglobal searching of distant semantic features between template and search maps,\nbenefiting the tracking under noise and artifacts by implicitly learning\npotential distant semantic cues. By combining with cross-map interleaved scan\n(CIS), local pixel-wise interaction with positional inductive bias can also be\nintroduced to SSMX-Corr. The implicit low-level motion descriptor is proposed\nas a non-visual prompt to enhance tracking robustness, addressing the\nintermittent tip visibility problem. 
Extensive experiments on a dataset with\nmotorized needle insertion in both phantom and tissue samples demonstrate that\nthe proposed tracker outperforms other state-of-the-art trackers while ablation\nstudies further highlight the effectiveness of each proposed tracking module.\n","authors":["Yuelin Zhang","Qingpeng Ding","Long Lei","Jiwei Shan","Wenxuan Xie","Tianyi Zhang","Wanquan Yan","Raymond Shing-Yan Tang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.08395v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.07265v2","updated":"2024-11-13T07:26:33Z","published":"2024-11-09T13:13:49Z","title":"ViTOC: Vision Transformer and Object-aware Captioner","summary":" This paper presents ViTOC (Vision Transformer and Object-aware Captioner), a\nnovel vision-language model for image captioning that addresses the challenges\nof accuracy and diversity in generated descriptions. Unlike conventional\napproaches, ViTOC employs a dual-path architecture based on Vision Transformer\nand object detector, effectively fusing global visual features and local object\ninformation through learnable vectors. The model introduces an innovative\nobject-aware prompting strategy that significantly enhances its capability in\nhandling long-tail data. Experiments on the standard COCO dataset demonstrate\nthat ViTOC outperforms baseline models across all evaluation metrics.\nAdditionally, we propose a reference-free evaluation method based on CLIP to\nfurther validate the model's effectiveness. 
By utilizing pretrained visual\nmodel parameters, ViTOC achieves efficient end-to-end training.\n","authors":["Feiyang Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08380v1","updated":"2024-11-13T07:05:40Z","published":"2024-11-13T07:05:40Z","title":"EgoVid-5M: A Large-Scale Video-Action Dataset for Egocentric Video\n Generation","summary":" Video generation has emerged as a promising tool for world simulation,\nleveraging visual data to replicate real-world environments. Within this\ncontext, egocentric video generation, which centers on the human perspective,\nholds significant potential for enhancing applications in virtual reality,\naugmented reality, and gaming. However, the generation of egocentric videos\npresents substantial challenges due to the dynamic nature of egocentric\nviewpoints, the intricate diversity of actions, and the complex variety of\nscenes encountered. Existing datasets are inadequate for addressing these\nchallenges effectively. To bridge this gap, we present EgoVid-5M, the first\nhigh-quality dataset specifically curated for egocentric video generation.\nEgoVid-5M encompasses 5 million egocentric video clips and is enriched with\ndetailed action annotations, including fine-grained kinematic control and\nhigh-level textual descriptions. To ensure the integrity and usability of the\ndataset, we implement a sophisticated data cleaning pipeline designed to\nmaintain frame consistency, action coherence, and motion smoothness under\negocentric conditions. Furthermore, we introduce EgoDreamer, which is capable\nof generating egocentric videos driven simultaneously by action descriptions\nand kinematic control signals. 
The EgoVid-5M dataset, associated action\nannotations, and all data cleansing metadata will be released for the\nadvancement of research in egocentric video generation.\n","authors":["Xiaofeng Wang","Kang Zhao","Feng Liu","Jiayu Wang","Guosheng Zhao","Xiaoyi Bao","Zheng Zhu","Yingya Zhang","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08380v1.pdf","comment":"Project Page: https://egovid.github.io/"},{"id":"http://arxiv.org/abs/2411.08371v1","updated":"2024-11-13T06:42:03Z","published":"2024-11-13T06:42:03Z","title":"Multiscale Graph Construction Using Non-local Cluster Features","summary":" This paper presents a multiscale graph construction method using both graph\nand signal features. Multiscale graph is a hierarchical representation of the\ngraph, where a node at each level indicates a cluster in a finer resolution. To\nobtain the hierarchical clusters, existing methods often use graph clustering;\nhowever, they may ignore signal variations. As a result, these methods could\nfail to detect the clusters having similar features on nodes. In this paper, we\nconsider graph and node-wise features simultaneously for multiscale clustering\nof a graph. With given clusters of the graph, the clusters are merged\nhierarchically in three steps: 1) Feature vectors in the clusters are\nextracted. 2) Similarities among cluster features are calculated using optimal\ntransport. 3) A variable $k$-nearest neighbor graph (V$k$NNG) is constructed\nand graph spectral clustering is applied to the V$k$NNG to obtain clusters at a\ncoarser scale. Additionally, the multiscale graph in this paper has\n\\textit{non-local} characteristics: Nodes with similar features are merged even\nif they are spatially separated. 
In experiments on multiscale image and point\ncloud segmentation, we demonstrate the effectiveness of the proposed method.\n","authors":["Reina Kaneko","Hayate Kojima","Kenta Yanagiya","Junya Hara","Hiroshi Higashi","Yuichi Tanaka"],"pdf_url":"https://arxiv.org/pdf/2411.08371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18054v3","updated":"2024-11-13T06:25:23Z","published":"2024-06-26T04:12:34Z","title":"Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image\n Translation","summary":" The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology\nare Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides\noffer high quality histopathological images but require a labor-intensive\nacquisition process. In contrast, FF slides can be prepared quickly, but the\nimage quality is relatively poor. Our task is to translate FF images into FFPE\nstyle, thereby improving the image quality for diagnostic purposes. In this\npaper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological\nimage translation using a pre-trained diffusion model. Specifically, we utilize\na one-step diffusion model as the generator, which we fine-tune using LoRA\nadapters within an adversarial learning framework. To enable the model to\neffectively capture both global structural patterns and local details, we\nintroduce a multi-scale feature fusion module that leverages two VAE encoders\nto extract features at different image resolutions, performing feature fusion\nbefore inputting them into the UNet. Additionally, a pre-trained\nvision-language model for histopathology serves as the backbone for the\ndiscriminator, enhancing model performance. Our FF-to-FFPE translation\nexperiments on the TCGA-NSCLC dataset demonstrate that the proposed approach\noutperforms existing methods. 
The code and models are released at\nhttps://github.com/QilaiZhang/Diffusion-FFPE.\n","authors":["Qilai Zhang","Jiawen Li","Peiran Liao","Jiali Hu","Tian Guan","Anjia Han","Yonghong He"],"pdf_url":"https://arxiv.org/pdf/2406.18054v3.pdf","comment":"Accepted at IEEE BIBM 2024"},{"id":"http://arxiv.org/abs/2411.04493v2","updated":"2024-11-13T05:52:23Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08347v1","updated":"2024-11-13T05:38:55Z","published":"2024-11-13T05:38:55Z","title":"A Chinese Multi-label Affective Computing Dataset Based on Social Media\n Network Users","summary":" Emotion and personality are central elements in understanding human\npsychological states. 
Emotions reflect an individual subjective experiences,\nwhile personality reveals relatively stable behavioral and cognitive patterns.\nExisting affective computing datasets often annotate emotion and personality\ntraits separately, lacking fine-grained labeling of micro-emotions and emotion\nintensity in both single-label and multi-label classifications. Chinese emotion\ndatasets are extremely scarce, and datasets capturing Chinese user personality\ntraits are even more limited. To address these gaps, this study collected data\nfrom the major social media platform Weibo, screening 11,338 valid users from\nover 50,000 individuals with diverse MBTI personality labels and acquiring\n566,900 posts along with the user MBTI personality tags. Using the EQN method,\nwe compiled a multi-label Chinese affective computing dataset that integrates\nthe same user's personality traits with six emotions and micro-emotions, each\nannotated with intensity levels. Validation results across multiple NLP\nclassification models demonstrate the dataset strong utility. This dataset is\ndesigned to advance machine recognition of complex human emotions and provide\ndata support for research in psychology, education, marketing, finance, and\npolitics.\n","authors":["Jingyi Zhou","Senlin Luo","Haofan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18066v2","updated":"2024-11-13T05:15:53Z","published":"2024-02-28T05:52:25Z","title":"Six-Point Method for Multi-Camera Systems with Reduced Solution Space","summary":" Relative pose estimation using point correspondences (PC) is a widely used\ntechnique. A minimal configuration of six PCs is required for two views of\ngeneralized cameras. 
In this paper, we present several minimal solvers that use\nsix PCs to compute the 6DOF relative pose of multi-camera systems, including a\nminimal solver for the generalized camera and two minimal solvers for the\npractical configuration of two-camera rigs. The equation construction is based\non the decoupling of rotation and translation. Rotation is represented by\nCayley or quaternion parametrization, and translation can be eliminated by\nusing the hidden variable technique. Ray bundle constraints are found and\nproven when a subset of PCs relate the same cameras across two views. This is\nthe key to reducing the number of solutions and generating numerically stable\nsolvers. Moreover, all configurations of six-point problems for multi-camera\nsystems are enumerated. Extensive experiments demonstrate the superior accuracy\nand efficiency of our solvers compared to state-of-the-art six-point methods.\nThe code is available at https://github.com/jizhaox/relpose-6pt\n","authors":["Banglei Guan","Ji Zhao","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2402.18066v2.pdf","comment":"Accepted to the European Conference on Computer Vision (ECCV), 2024,\n for an oral presentation"},{"id":"http://arxiv.org/abs/2411.08340v1","updated":"2024-11-13T05:09:28Z","published":"2024-11-13T05:09:28Z","title":"DyConfidMatch: Dynamic Thresholding and Re-sampling for 3D\n Semi-supervised Learning","summary":" Semi-supervised learning (SSL) leverages limited labeled and abundant\nunlabeled data but often faces challenges with data imbalance, especially in 3D\ncontexts. This study investigates class-level confidence as an indicator of\nlearning status in 3D SSL, proposing a novel method that utilizes dynamic\nthresholding to better use unlabeled data, particularly from underrepresented\nclasses. A re-sampling strategy is also introduced to mitigate bias towards\nwell-represented classes, ensuring equitable class representation. 
Through\nextensive experiments in 3D SSL, our method surpasses state-of-the-art\ncounterparts in classification and detection tasks, highlighting its\neffectiveness in tackling data imbalance. This approach presents a significant\nadvancement in SSL for 3D datasets, providing a robust solution for data\nimbalance issues.\n","authors":["Zhimin Chen","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2411.08340v1.pdf","comment":"Accepted by Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2411.08335v1","updated":"2024-11-13T04:49:32Z","published":"2024-11-13T04:49:32Z","title":"DEEGITS: Deep Learning based Framework for Measuring Heterogenous\n Traffic State in Challenging Traffic Scenarios","summary":" This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State\nMeasurement), a comprehensive framework that leverages state-of-the-art\nconvolutional neural network (CNN) techniques to accurately and rapidly detect\nvehicles and pedestrians, as well as to measure traffic states in challenging\nscenarios (i.e., congestion, occlusion). In this study, we enhance the training\ndataset through data fusion, enabling simultaneous detection of vehicles and\npedestrians. Image preprocessing and augmentation are subsequently performed to\nimprove the quality and quantity of the dataset. Transfer learning is applied\non the YOLOv8 pretrained model to increase the model's capability to identify a\ndiverse array of vehicles. Optimal hyperparameters are obtained using the Grid\nSearch algorithm, with the Stochastic Gradient Descent (SGD) optimizer\noutperforming other optimizers under these settings. Extensive experimentation\nand evaluation demonstrate substantial accuracy within the detection framework,\nwith the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5\non the test set, surpassing previous benchmarks on similar datasets. 
The\nDeepSORT multi-object tracking algorithm is incorporated to track detected\nvehicles and pedestrians in this study. Finally, the framework is tested to\nmeasure heterogeneous traffic states in mixed traffic conditions. Two locations\nwith differing traffic compositions and congestion levels are selected: one\nmotorized-dominant location with moderate density and one\nnon-motorized-dominant location with higher density. Errors are statistically\ninsignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91\nto 0.97 for heterogeneous traffic flow and speed measurements, respectively.\n","authors":["Muttahirul Islam","Nazmul Haque","Md. Hadiuzzaman"],"pdf_url":"https://arxiv.org/pdf/2411.08335v1.pdf","comment":"Submitted for presentation at the 103 rd Annual Meeting of\n Transportation Research Board and publication in Transportation Research\n Record: Journal of Transportation Research Board"},{"id":"http://arxiv.org/abs/2411.08334v1","updated":"2024-11-13T04:32:58Z","published":"2024-11-13T04:32:58Z","title":"Enhancing Multimodal Query Representation via Visual Dialogues for\n End-to-End Knowledge Retrieval","summary":" Existing multimodal retrieval systems often rely on disjointed models for\nimage comprehension, such as object detectors and caption generators, leading\nto cumbersome implementations and training processes. To overcome this\nlimitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a\ntext retriever with the ability to understand multimodal queries via dynamic\nmodality interaction. Ret-XKnow leverages a partial convolution mechanism to\nfocus on visual information relevant to the given textual query, thereby\nenhancing multimodal query representations. To effectively learn multimodal\ninteraction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset\nautomatically constructed from visual dialogue datasets. 
Our dataset\nconstruction process ensures that the dialogues are transformed into suitable\ninformation retrieval tasks using a text retriever. We demonstrate that our\napproach not only significantly improves retrieval performance in zero-shot\nsettings but also achieves substantial improvements in fine-tuning scenarios.\nOur code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow.\n","authors":["Yeong-Joon Ju","Ho-Joong Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08333v1","updated":"2024-11-13T04:29:34Z","published":"2024-11-13T04:29:34Z","title":"SASE: A Searching Architecture for Squeeze and Excitation Operations","summary":" In the past few years, channel-wise and spatial-wise attention blocks have\nbeen widely adopted as supplementary modules in deep neural networks, enhancing\nnetwork representational abilities while introducing low complexity. Most\nattention modules follow a squeeze-and-excitation paradigm. However, to design\nsuch attention modules, requires a substantial amount of experiments and\ncomputational resources. Neural Architecture Search (NAS), meanwhile, is able\nto automate the design of neural networks and spares the numerous experiments\nrequired for an optimal architecture. This motivates us to design a search\narchitecture that can automatically find near-optimal attention modules through\nNAS. We propose SASE, a Searching Architecture for Squeeze and Excitation\noperations, to form a plug-and-play attention block by searching within certain\nsearch space. The search space is separated into 4 different sets, each\ncorresponds to the squeeze or excitation operation along the channel or spatial\ndimension. Additionally, the search sets include not only existing attention\nblocks but also other operations that have not been utilized in attention\nmechanisms before. 
To the best of our knowledge, SASE is the first attempt to\nsubdivide the attention search space and search for architectures beyond\ncurrently known attention modules. The searched attention module is tested with\nextensive experiments across a range of visual tasks. Experimental results\nindicate that visual backbone networks (ResNet-50/101) using the SASE attention\nmodule achieved the best performance compared to those using the current\nstate-of-the-art attention modules. Codes are included in the supplementary\nmaterial, and they will be made public later.\n","authors":["Hanming Wang","Yunlong Li","Zijun Wu","Huifen Wang","Yuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08328v1","updated":"2024-11-13T04:20:45Z","published":"2024-11-13T04:20:45Z","title":"Motion Control for Enhanced Complex Action Video Generation","summary":" Existing text-to-video (T2V) models often struggle with generating videos\nwith sufficiently pronounced or complex actions. A key limitation lies in the\ntext prompt's inability to precisely convey intricate motion details. To\naddress this, we propose a novel framework, MVideo, designed to produce\nlong-duration videos with precise, fluid actions. MVideo overcomes the\nlimitations of text prompts by incorporating mask sequences as an additional\nmotion condition input, providing a clearer, more accurate representation of\nintended actions. Leveraging foundational vision models such as GroundingDINO\nand SAM2, MVideo automatically generates mask sequences, enhancing both\nefficiency and robustness. Our results demonstrate that, after training, MVideo\neffectively aligns text prompts with motion conditions to produce videos that\nsimultaneously meet both criteria. This dual control mechanism allows for more\ndynamic video generation by enabling alterations to either the text prompt or\nmotion condition independently, or both in tandem. 
Furthermore, MVideo supports\nmotion condition editing and composition, facilitating the generation of videos\nwith more complex actions. MVideo thus advances T2V motion generation, setting\na strong benchmark for improved action depiction in current video diffusion\nmodels. Our project page is available at https://mvideo-v1.github.io/.\n","authors":["Qiang Zhou","Shaofeng Zhang","Nianzu Yang","Ye Qian","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2411.08328v1.pdf","comment":"Project page: https://mvideo-v1.github.io/"},{"id":"http://arxiv.org/abs/2411.07976v2","updated":"2024-11-13T03:56:10Z","published":"2024-11-12T17:55:39Z","title":"DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring","summary":" Coronary artery disease (CAD), one of the most common cause of mortality in\nthe world. Coronary artery calcium (CAC) scoring using computed tomography (CT)\nis key for risk assessment to prevent coronary disease. Previous studies on\nrisk assessment and calcification detection in CT scans primarily use\napproaches based on UNET architecture, frequently implemented on pre-built\nmodels. However, these models are limited by the availability of annotated CT\nscans containing CAC and suffering from imbalanced dataset, decreasing\nperformance of CAC segmentation and scoring. In this study, we extend this\napproach by incorporating the self-supervised learning (SSL) technique of DINO\n(self-distillation with no labels) to eliminate limitations of scarce annotated\ndata in CT scans. The DINO model's ability to train without requiring CAC area\nannotations enhances its robustness in generating distinct features. The DINO\nmodel is trained on to focus specifically on calcified areas by using labels,\naiming to generate features that effectively capture and highlight key\ncharacteristics. 
The label-guided DINO (DINO-LG) enhances classification by\ndistinguishing CT slices that contain calcification from those that do not,\nperforming 57% better than the standard DINO model in this task. CAC scoring\nand segmentation tasks are performed by a basic U-NET architecture, fed\nspecifically with CT slices containing calcified areas as identified by the\nDINO-LG model. This targeted identification performed by DINO-LG model improves\nCAC segmentation performance by approximately 10% and significant increase in\nCAC scoring accuracy.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Caner Ozcan"],"pdf_url":"https://arxiv.org/pdf/2411.07976v2.pdf","comment":"Developed by Center for Applied Artificial Intelligence (CAAI),\n University of Kentucky"},{"id":"http://arxiv.org/abs/2411.06106v2","updated":"2024-11-13T03:19:47Z","published":"2024-11-09T08:00:50Z","title":"Personalize to generalize: Towards a universal medical multi-modality\n generalization through personalization","summary":" The differences among medical imaging modalities, driven by distinct\nunderlying principles, pose significant challenges for generalization in\nmulti-modal medical tasks. Beyond modality gaps, individual variations, such as\ndifferences in organ size and metabolic rate, further impede a model's ability\nto generalize effectively across both modalities and diverse populations.\nDespite the importance of personalization, existing approaches to multi-modal\ngeneralization often neglect individual differences, focusing solely on common\nanatomical features. This limitation may result in weakened generalization in\nvarious medical tasks. In this paper, we unveil that personalization is\ncritical for multi-modal generalization. 
Specifically, we propose an approach\nto achieve personalized generalization through approximating the underlying\npersonalized invariant representation ${X}_h$ across various modalities by\nleveraging individual-level constraints and a learnable biological prior. We\nvalidate the feasibility and benefits of learning a personalized ${X}_h$,\nshowing that this representation is highly generalizable and transferable\nacross various multi-modal medical tasks. Extensive experimental results\nconsistently show that the additionally incorporated personalization\nsignificantly improves performance and generalization across diverse scenarios,\nconfirming its effectiveness.\n","authors":["Zhaorui Tan","Xi Yang","Tan Pan","Tianyi Liu","Chen Jiang","Xin Guo","Qiufeng Wang","Anh Nguyen","Yuan Qi","Kaizhu Huang","Yuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.06106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08305v1","updated":"2024-11-13T03:03:30Z","published":"2024-11-13T03:03:30Z","title":"Robust Divergence Learning for Missing-Modality Segmentation","summary":" Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary\ninformation for analyzing brain tumor subregions. While methods using four\ncommon MRI modalities for automatic segmentation have shown success, they often\nface challenges with missing modalities due to image quality issues,\ninconsistent protocols, allergic reactions, or cost factors. Thus, developing a\nsegmentation paradigm that handles missing modalities is clinically valuable. A\nnovel single-modality parallel processing network framework based on H\\\"older\ndivergence and mutual information is introduced. Each modality is independently\ninput into a shared network backbone for parallel processing, preserving unique\ninformation. Additionally, a dynamic sharing framework is introduced that\nadjusts network parameters based on modality availability. 
A H\\\"older\ndivergence and mutual information-based loss functions are used for evaluating\ndiscrepancies between predictions and labels. Extensive testing on the BraTS\n2018 and BraTS 2020 datasets demonstrates that our method outperforms existing\ntechniques in handling missing modalities and validates each component's\neffectiveness.\n","authors":["Runze Cheng","Zhongao Sun","Ye Zhang","Chun Li"],"pdf_url":"https://arxiv.org/pdf/2411.08305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v2","updated":"2024-11-13T02:39:12Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. 
Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at:\n\\url{https://github.com/chikap421/mseg_vcuq}\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v2.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.08293v1","updated":"2024-11-13T02:18:03Z","published":"2024-11-13T02:18:03Z","title":"Choix d'un espace de représentation image adapté à la détection\n de réseaux routiers","summary":" These last years, algorithms allowing to decompose an image into its\nstructures and textures components have emerged. In this paper, we present an\napplication of this type of decomposition to the problem road network detection\nin aerial or satelite imagery. 
The algorithmic procedure involves the image\ndecomposition (using a unique property), an alignment detection step based on\nthe Gestalt theory, and a refinement step using statistical active contours.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08293v1.pdf","comment":"in French language"},{"id":"http://arxiv.org/abs/2411.08292v1","updated":"2024-11-13T02:17:57Z","published":"2024-11-13T02:17:57Z","title":"Noisy image decomposition: a new structure, texture and noise model\n based on local adaptivity","summary":" These last few years, image decomposition algorithms have been proposed to\nsplit an image into two parts: the structures and the textures. These\nalgorithms are not adapted to the case of noisy images because the textures are\ncorrupted by noise. In this paper, we propose a new model which decomposes an\nimage into three parts (structures, textures and noise) based on a local\nregularization scheme. We compare our results with the recent work of Aujol and\nChambolle. We finish by giving another model which combines the advantages of\nthe two previous ones.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08292v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.05265"},{"id":"http://arxiv.org/abs/2411.08291v1","updated":"2024-11-13T02:17:52Z","published":"2024-11-13T02:17:52Z","title":"Restoration algorithms and system performance evaluation for active\n imagers","summary":" This paper deals with two fields related to active imaging system. First, we\nbegin to explore image processing algorithms to restore the artefacts like\nspeckle, scintillation and image dancing caused by atmospheric turbulence.\nNext, we examine how to evaluate the performance of this kind of systems. To do\nthis task, we propose a modified version of the german TRM3 metric which\npermits to get MTF-like measures. 
We use the database acquired during NATO-TG40\nfield trials to make our tests.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03644v2","updated":"2024-11-13T01:45:31Z","published":"2024-09-05T16:02:11Z","title":"RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in\n Generated Images","summary":" In recent years, diffusion models have revolutionized visual generation,\noutperforming traditional frameworks like Generative Adversarial Networks\n(GANs). However, generating images of humans with realistic semantic parts,\nsuch as hands and faces, remains a significant challenge due to their intricate\nstructural complexity. To address this issue, we propose a novel\npost-processing solution named RealisHuman. The RealisHuman framework operates\nin two stages. First, it generates realistic human parts, such as hands or\nfaces, using the original malformed parts as references, ensuring consistent\ndetails with the original image. Second, it seamlessly integrates the rectified\nhuman parts back into their corresponding positions by repainting the\nsurrounding areas to ensure smooth and realistic blending. The RealisHuman\nframework significantly enhances the realism of human generation, as\ndemonstrated by notable improvements in both qualitative and quantitative\nmetrics. Code is available at https://github.com/Wangbenzhi/RealisHuman.\n","authors":["Benzhi Wang","Jingkai Zhou","Jingqi Bai","Yang Yang","Weihua Chen","Fan Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2409.03644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. 
At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08279v1","updated":"2024-11-13T01:38:06Z","published":"2024-11-13T01:38:06Z","title":"MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields\n Representation","summary":" Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and\n3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in\nSimultaneous Localization and Mapping (SLAM) for photo-realistic rendering,\nparticularly when using high-quality video sequences as input. However,\nexisting methods struggle with motion-blurred frames, which are common in\nreal-world scenarios like low-light or long-exposure conditions. 
This often\nresults in a significant reduction in both camera localization accuracy and map\nreconstruction quality. To address this challenge, we propose a dense visual\nSLAM pipeline (i.e. MBA-SLAM) to handle severe motion-blurred inputs. Our\napproach integrates an efficient motion blur-aware tracker with either neural\nradiance fields or Gaussian Splatting based mapper. By accurately modeling the\nphysical image formation process of motion-blurred images, our method\nsimultaneously learns 3D scene representation and estimates the cameras' local\ntrajectory during exposure time, enabling proactive compensation for motion\nblur caused by camera movement. In our experiments, we demonstrate that\nMBA-SLAM surpasses previous state-of-the-art methods in both camera\nlocalization and map reconstruction, showcasing superior performance across a\nrange of datasets, including synthetic and real datasets featuring sharp images\nas well as those affected by motion blur, highlighting the versatility and\nrobustness of our approach. Code is available at\nhttps://github.com/WU-CVGL/MBA-SLAM.\n","authors":["Peng Wang","Lingzhe Zhao","Yin Zhang","Shiyu Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08272v1","updated":"2024-11-13T00:49:05Z","published":"2024-11-13T00:49:05Z","title":"LBONet: Supervised Spectral Descriptors for Shape Analysis","summary":" The Laplace-Beltrami operator has established itself in the field of\nnon-rigid shape analysis due to its many useful properties such as being\ninvariant under isometric transformation, having a countable eigensystem\nforming an orthonormal basis, and fully characterizing geodesic distances of\nthe manifold. However, this invariancy only applies under isometric\ndeformations, which leads to a performance breakdown in many real-world\napplications. 
In recent years emphasis has been placed upon extracting optimal\nfeatures using deep learning methods, however spectral signatures play a\ncrucial role and still add value. In this paper we take a step back, revisiting\nthe LBO and proposing a supervised way to learn several operators on a\nmanifold. Depending on the task, by applying these functions, we can train the\nLBO eigenbasis to be more task-specific. The optimization of the LBO leads to\nenormous improvements to established descriptors such as the heat kernel\nsignature in various tasks such as retrieval, classification, segmentation, and\ncorrespondence, proving the adaption of the LBO eigenbasis to both global and\nhighly local learning settings.\n","authors":["Oguzhan Yigit","Richard C. Wilson"],"pdf_url":"https://arxiv.org/pdf/2411.08272v1.pdf","comment":"14 pages, 13 figure"},{"id":"http://arxiv.org/abs/2404.09995v2","updated":"2024-11-13T00:41:01Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. 
During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v2.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2406.08164v3","updated":"2024-11-13T00:15:20Z","published":"2024-06-12T12:54:27Z","title":"ConMe: Rethinking Evaluation of Compositional Reasoning for Modern VLMs","summary":" Compositional Reasoning (CR) entails grasping the significance of attributes,\nrelations, and word order. Recent Vision-Language Models (VLMs), comprising a\nvisual encoder and a Large Language Model (LLM) decoder, have demonstrated\nremarkable proficiency in such reasoning tasks. This prompts a crucial\nquestion: have VLMs effectively tackled the CR challenge? We conjecture that\nexisting CR benchmarks may not adequately push the boundaries of modern VLMs\ndue to the reliance on an LLM-only negative text generation pipeline.\nConsequently, the negatives produced either appear as outliers from the natural\nlanguage distribution learned by VLMs' LLM decoders or as improbable within the\ncorresponding image context. To address these limitations, we introduce ConMe\n-- a compositional reasoning benchmark and a novel data generation pipeline\nleveraging VLMs to produce `hard CR Q&A'. Through a new concept of VLMs\nconversing with each other to collaboratively expose their weaknesses, our\npipeline autonomously generates, evaluates, and selects challenging\ncompositional reasoning questions, establishing a robust CR benchmark, also\nsubsequently validated manually. 
Our benchmark provokes a noteworthy, up to\n33%, decrease in CR performance compared to preceding benchmarks, reinstating\nthe CR challenge even for state-of-the-art VLMs.\n","authors":["Irene Huang","Wei Lin","M. Jehanzeb Mirza","Jacob A. Hansen","Sivan Doveh","Victor Ion Butoi","Roei Herzig","Assaf Arbelle","Hilde Kuehne","Trevor Darrell","Chuang Gan","Aude Oliva","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2406.08164v3.pdf","comment":"NeurIPS 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2411.09077v1","updated":"2024-11-13T23:09:53Z","published":"2024-11-13T23:09:53Z","title":"Drone Detection using Deep Neural Networks Trained on Pure Synthetic\n Data","summary":" Drone detection has benefited from improvements in deep neural networks, but\nlike many other applications, suffers from the availability of accurate data\nfor training. Synthetic data provides a potential for low-cost data generation\nand has been shown to improve data availability and quality. However, models\ntrained on synthetic datasets need to prove their ability to perform on\nreal-world data, known as the problem of sim-to-real transferability. Here, we\npresent a drone detection Faster-RCNN model trained on a purely synthetic\ndataset that transfers to real-world data. We found that it achieves an AP_50\nof 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones -\ncompared with 97.8% for an equivalent model trained on real-world data. Our\nresults show that using synthetic data for drone detection has the potential to\nreduce data collection costs and improve labelling quality. These findings\ncould be a starting point for more elaborate synthetic drone datasets. For\nexample, realistic recreations of specific scenarios could de-risk the dataset\ngeneration of safety-critical applications such as the detection of drones at\nairports. 
Further, synthetic data may enable reliable drone detection systems,\nwhich could benefit other areas, such as unmanned traffic management systems.\nThe code is available\nhttps://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the\ndatasets\nhttps://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection.\n","authors":["Mariusz Wisniewski","Zeeshan A. Rana","Ivan Petrunin","Alan Holt","Stephen Harman"],"pdf_url":"https://arxiv.org/pdf/2411.09077v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.09066v1","updated":"2024-11-13T22:47:24Z","published":"2024-11-13T22:47:24Z","title":"A multidimensional measurement of photorealistic avatar quality of\n experience","summary":" Photorealistic avatars are human avatars that look, move, and talk like real\npeople. The performance of photorealistic avatars has significantly improved\nrecently based on objective metrics such as PSNR, SSIM, LPIPS, FID, and FVD.\nHowever, recent photorealistic avatar publications do not provide subjective\ntests of the avatars to measure human usability factors. We provide an open\nsource test framework to subjectively measure photorealistic avatar performance\nin ten dimensions: realism, trust, comfortableness using, comfortableness\ninteracting with, appropriateness for work, creepiness, formality, affinity,\nresemblance to the person, and emotion accuracy. We show that the correlation\nof nine of these subjective metrics with PSNR, SSIM, LPIPS, FID, and FVD is\nweak, and moderate for emotion accuracy. The crowdsourced subjective test\nframework is highly reproducible and accurate when compared to a panel of\nexperts. We analyze a wide range of avatars from photorealistic to cartoon-like\nand show that some photorealistic avatars are approaching real video\nperformance based on these dimensions. We also find that for avatars above a\ncertain level of realism, eight of these measured dimensions are strongly\ncorrelated. 
In particular, for photorealistic avatars there is a linear\nrelationship between avatar affinity and realism; in other words, there is no\nuncanny valley effect for photorealistic avatars in the telecommunication\nscenario. We provide several extensions of this test framework for future work\nand discuss design implications for telecommunication systems. The test\nframework is available at https://github.com/microsoft/P.910.\n","authors":["Ross Cutler","Babak Naderi","Vishak Gopal","Dharmendar Palle"],"pdf_url":"https://arxiv.org/pdf/2411.09066v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2204.06784"},{"id":"http://arxiv.org/abs/2411.09062v1","updated":"2024-11-13T22:43:15Z","published":"2024-11-13T22:43:15Z","title":"Multimodal Object Detection using Depth and Image Data for Manufacturing\n Parts","summary":" Manufacturing requires reliable object detection methods for precise picking\nand handling of diverse types of manufacturing parts and components.\nTraditional object detection methods utilize either only 2D images from cameras\nor 3D data from lidars or similar 3D sensors. However, each of these sensors\nhave weaknesses and limitations. Cameras do not have depth perception and 3D\nsensors typically do not carry color information. These weaknesses can\nundermine the reliability and robustness of industrial manufacturing systems.\nTo address these challenges, this work proposes a multi-sensor system combining\nan red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are\ncalibrated for precise alignment of the multimodal data captured from the two\nhardware devices. A novel multimodal object detection method is developed to\nprocess both RGB and depth data. This object detector is based on the Faster\nR-CNN baseline that was originally designed to process only camera images. The\nresults show that the multimodal model significantly outperforms the depth-only\nand RGB-only baselines on established object detection metrics. 
More\nspecifically, the multimodal model improves mAP by 13% and raises Mean\nPrecision by 11.8% in comparison to the RGB-only baseline. Compared to the\ndepth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%.\nHence, this method facilitates more reliable and robust object detection in\nservice to smart manufacturing applications.\n","authors":["Nazanin Mahjourian","Vinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09037v1","updated":"2024-11-13T21:31:12Z","published":"2024-11-13T21:31:12Z","title":"A Transformer-Based Visual Piano Transcription Algorithm","summary":" Automatic music transcription (AMT) for musical performances is a long\nstanding problem in the field of Music Information Retrieval (MIR). Visual\npiano transcription (VPT) is a multimodal subproblem of AMT which focuses on\nextracting a symbolic representation of a piano performance from visual\ninformation only (e.g., from a top-down video of the piano keyboard). Inspired\nby the success of Transformers for audio-based AMT, as well as their recent\nsuccesses in other computer vision tasks, in this paper we present a\nTransformer based architecture for VPT. 
The proposed VPT system combines a\npiano bounding box detection model with an onset and pitch detection model,\nallowing our system to perform well in more naturalistic conditions like\nimperfect image crops around the piano and slightly tilted images.\n","authors":["Uros Zivanovic","Carlos Eduardo Cancino-Chacón"],"pdf_url":"https://arxiv.org/pdf/2411.09037v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09023v1","updated":"2024-11-13T21:00:28Z","published":"2024-11-13T21:00:28Z","title":"CoMiX: Cross-Modal Fusion with Deformable Convolutions for HSI-X\n Semantic Segmentation","summary":" Improving hyperspectral image (HSI) semantic segmentation by exploiting\ncomplementary information from a supplementary data type (referred to\nX-modality) is promising but challenging due to differences in imaging sensors,\nimage content, and resolution. Current techniques struggle to enhance\nmodality-specific and modality-shared information, as well as to capture\ndynamic interaction and fusion between different modalities. In response, this\nstudy proposes CoMiX, an asymmetric encoder-decoder architecture with\ndeformable convolutions (DCNs) for HSI-X semantic segmentation. CoMiX is\ndesigned to extract, calibrate, and fuse information from HSI and X data. Its\npipeline includes an encoder with two parallel and interacting backbones and a\nlightweight all-multilayer perceptron (ALL-MLP) decoder. The encoder consists\nof four stages, each incorporating 2D DCN blocks for the X model to accommodate\ngeometric variations and 3D DCN blocks for HSIs to adaptively aggregate\nspatial-spectral features. Additionally, each stage includes a Cross-Modality\nFeature enhancement and eXchange (CMFeX) module and a feature fusion module\n(FFM). 
CMFeX is designed to exploit spatial-spectral correlations from\ndifferent modalities to recalibrate and enhance modality-specific and\nmodality-shared features while adaptively exchanging complementary information\nbetween them. Outputs from CMFeX are fed into the FFM for fusion and passed to\nthe next stage for further information learning. Finally, the outputs from each\nFFM are integrated by the ALL-MLP decoder for final prediction. Extensive\nexperiments demonstrate that our CoMiX achieves superior performance and\ngeneralizes well to various multimodal recognition tasks. The CoMiX code will\nbe released.\n","authors":["Xuming Zhang","Xingfa Gu","Qingjiu Tian","Lorenzo Bruzzone"],"pdf_url":"https://arxiv.org/pdf/2411.09023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09018v1","updated":"2024-11-13T20:50:04Z","published":"2024-11-13T20:50:04Z","title":"Bridging the Visual Gap: Fine-Tuning Multimodal Models with\n Knowledge-Adapted Captions","summary":" Recent research increasingly focuses on training vision-language models\n(VLMs) with long, detailed image captions. However, small-scale VLMs often\nstruggle to balance the richness of these captions with the risk of\nhallucinating content during fine-tuning. In this paper, we explore how well\nVLMs adapt to such captions. To quantify caption quality, we propose Decomposed\nNLI (DNLI), an evaluation framework that breaks down generated captions into\nindividual propositions, assessing each in isolation. This fine-grained\nanalysis reveals a critical balance between capturing descriptive details and\npreventing hallucinations. Our findings show that simply reducing caption\ncomplexity or employing standard data curation techniques does not effectively\nresolve this issue. 
To tackle this challenge, we introduce Knowledge Adapted\n(KnowAda) fine-tuning, a data-centric approach that automatically adapts\ntraining data with the model's existing knowledge and visual understanding.\nKnowAda minimizes hallucinations while preserving high descriptiveness. We\nvalidate this approach across several small-scale VLMs (up to 7B parameters)\nand dense caption datasets, demonstrating that KnowAda effectively balances\nhallucination reduction and descriptiveness. Our results show that KnowAda\noutperforms various baselines in both automatic metrics and human evaluations.\nWe will release our code and models.\n","authors":["Moran Yanuka","Assaf Ben Kish","Yonatan Bitton","Idan Szpektor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2411.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15955v2","updated":"2024-11-13T20:30:13Z","published":"2024-06-22T22:43:10Z","title":"Beyond the Doors of Perception: Vision Transformers Represent Relations\n Between Objects","summary":" Though vision transformers (ViTs) have achieved state-of-the-art performance\nin a variety of settings, they exhibit surprising failures when performing\ntasks involving visual relations. This begs the question: how do ViTs attempt\nto perform tasks that require computing visual relations between objects? Prior\nefforts to interpret ViTs tend to focus on characterizing relevant low-level\nvisual features. In contrast, we adopt methods from mechanistic\ninterpretability to study the higher-level visual algorithms that ViTs use to\nperform abstract visual reasoning. We present a case study of a fundamental,\nyet surprisingly difficult, relational reasoning task: judging whether two\nvisual entities are the same or different. 
We find that pretrained ViTs\nfine-tuned on this task often exhibit two qualitatively different stages of\nprocessing despite having no obvious inductive biases to do so: 1) a perceptual\nstage wherein local object features are extracted and stored in a disentangled\nrepresentation, and 2) a relational stage wherein object representations are\ncompared. In the second stage, we find evidence that ViTs can learn to\nrepresent somewhat abstract visual relations, a capability that has long been\nconsidered out of reach for artificial neural networks. Finally, we demonstrate\nthat failures at either stage can prevent a model from learning a generalizable\nsolution to our fairly simple tasks. By understanding ViTs in terms of discrete\nprocessing stages, one can more precisely diagnose and rectify shortcomings of\nexisting and future models.\n","authors":["Michael A. Lepori","Alexa R. Tartaglini","Wai Keen Vong","Thomas Serre","Brenden M. Lake","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2406.15955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02501v3","updated":"2024-11-13T20:30:04Z","published":"2024-01-04T19:25:00Z","title":"A metric embedding kernel for live cell microscopy signaling patterns","summary":" Live cell microscopy captures 5-D $(x,y,z,channel,time)$ movies that display\npatterns of cellular motion and signaling dynamics. We present here a metric\nkernel function for spatiotemporal patterns of cell signaling dynamics in 5-D\nlive cell microscopy movies unique in requiring no a priori knowledge of\nexpected pattern dynamics, and no training data. The approach uses Kolmogorov\ncomplexity theory to compute a metric distance between movies and to measure\nthe meaningful information among subsets of movies. Cell signaling kymographs\nstore at each spatiotemporal cell centroid the cell signaling state, or a\nfunctional output such as velocity. Patterns of similarity are identified via\nthe metric normalized compression distance (NCD). 
The NCD is a reproducing\nkernel for a Hilbert space that represents the input cell signaling kymographs\nas points in a low dimensional embedding that optimally captures the pattern\nsimilarity identified by the NCD throughout the space. The only parameter is\nthe expected cell radii ($\\mu m$). A new formulation of the cluster structure\nfunction optimally estimates the meaningful information captured by the\nembedding. Also presented is the cell signaling structure function (SSF), a\nKolmogorov structure function that optimally measures cell signaling state as\nnuclear intensity w.r.t. surrounding cytoplasm, a significant improvement\ncompared to the current state-of-the-art cytonuclear ratio. Results are\npresented quantifying the impact of ERK and AKT signaling between different\noncogenic mutations, and by the relation between ERK signaling and cellular\nvelocity patterns for movies of 2-D monolayers of human breast epithelial\n(MCF10A) cells, 3-D MCF10A spheroids under optogenetic manipulation of ERK, and\nhuman induced pluripotent stem cells.\n","authors":["Layton Aho","Mark Winter","Marc DeCarlo","Agne Frismantiene","Yannick Blum","Paolo Armando Gagliardi","Olivier Pertz","Andrew R. Cohen"],"pdf_url":"https://arxiv.org/pdf/2401.02501v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09007v1","updated":"2024-11-13T20:17:30Z","published":"2024-11-13T20:17:30Z","title":"Scale Contrastive Learning with Selective Attentions for Blind Image\n Quality Assessment","summary":" Blind image quality assessment (BIQA) serves as a fundamental task in\ncomputer vision, yet it often fails to consistently align with human subjective\nperception. Recent advances show that multi-scale evaluation strategies are\npromising due to their ability to replicate the hierarchical structure of human\nvision. However, the effectiveness of these strategies is limited by a lack of\nunderstanding of how different image scales influence perceived quality. 
This\npaper addresses two primary challenges: the significant redundancy of\ninformation across different scales, and the confusion caused by combining\nfeatures from these scales, which may vary widely in quality. To this end, a\nnew multi-scale BIQA framework is proposed, namely Contrast-Constrained\nScale-Focused IQA Framework (CSFIQA). CSFIQA features a selective focus\nattention mechanism to minimize information redundancy and highlight critical\nquality-related information. Additionally, CSFIQA includes a scale-level\ncontrastive learning module equipped with a noise sample matching mechanism to\nidentify quality discrepancies across the same image content at different\nscales. By exploring the intrinsic relationship between image scales and the\nperceived quality, the proposed CSFIQA achieves leading performance on eight\nbenchmark datasets, e.g., achieving SRCC values of 0.967 (versus 0.947 in CSIQ)\nand 0.905 (versus 0.876 in LIVEC).\n","authors":["Zihao Huang","Xudong Li","Bohan Fu","Xiaohui Chu","Ke Li","Yunhang Shen","Yan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08995v1","updated":"2024-11-13T19:34:10Z","published":"2024-11-13T19:34:10Z","title":"Computed tomography using meta-optics","summary":" Computer vision tasks require processing large amounts of data to perform\nimage classification, segmentation, and feature extraction. Optical\npreprocessors can potentially reduce the number of floating point operations\nrequired by computer vision tasks, enabling low-power and low-latency\noperation. However, existing optical preprocessors are mostly learned and hence\nstrongly depend on the training data, and thus lack universal applicability. In\nthis paper, we present a metaoptic imager, which implements the Radon transform\nobviating the need for training the optics. 
High quality image reconstruction\nwith a large compression ratio of 0.6% is presented through the use of the\nSimultaneous Algebraic Reconstruction Technique. Image classification with 90%\naccuracy is presented on an experimentally measured Radon dataset through\nneural network trained on digitally transformed images.\n","authors":["Maksym Zhelyeznuyakov","Johannes E. Fröch","Shane Colburn","Steven L. Brunton","Arka Majumdar"],"pdf_url":"https://arxiv.org/pdf/2411.08995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08992v1","updated":"2024-11-13T19:33:08Z","published":"2024-11-13T19:33:08Z","title":"IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis","summary":" We present a new annotated microscopic cellular image dataset to improve the\neffectiveness of machine learning methods for cellular image analysis. Cell\ncounting is an important step in cell analysis. Typically, domain experts\nmanually count cells in a microscopic image. Automated cell counting can\npotentially eliminate this tedious, time-consuming process. However, a good,\nlabeled dataset is required for training an accurate machine learning model.\nOur dataset includes microscopic images of cells, and for each image, the cell\ncount and the location of individual cells. The data were collected as part of\nan ongoing study investigating the potential of electrical stimulation to\nmodulate stem cell differentiation and possible applications for neural repair.\nCompared to existing publicly available datasets, our dataset has more images\nof cells stained with more variety of antibodies (protein components of immune\nresponses against invaders) typically used for cell analysis. The experimental\nresults on this dataset indicate that none of the five existing models under\nthis study are able to achieve sufficiently accurate count to replace the\nmanual methods. 
The dataset is available at\nhttps://figshare.com/articles/dataset/Dataset/21970604.\n","authors":["Abdurahman Ali Mohammed","Catherine Fonder","Donald S. Sakaguchi","Wallapak Tavanapong","Surya K. Mallapragada","Azeez Idris"],"pdf_url":"https://arxiv.org/pdf/2411.08992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08975v1","updated":"2024-11-13T19:06:57Z","published":"2024-11-13T19:06:57Z","title":"Fluoroformer: Scaling multiple instance learning to multiplexed images\n via attention-based channel fusion","summary":" Though multiple instance learning (MIL) has been a foundational strategy in\ncomputational pathology for processing whole slide images (WSIs), current\napproaches are designed for traditional hematoxylin and eosin (H&E) slides\nrather than emerging multiplexed technologies. Here, we present an MIL\nstrategy, the Fluoroformer module, that is specifically tailored to multiplexed\nWSIs by leveraging scaled dot-product attention (SDPA) to interpretably fuse\ninformation across disparate channels. On a cohort of 434 non-small cell lung\ncancer (NSCLC) samples, we show that the Fluoroformer both obtains strong\nprognostic performance and recapitulates immuno-oncological hallmarks of NSCLC.\nOur technique thereby provides a path for adapting state-of-the-art AI\ntechniques to emerging spatial biology assays.\n","authors":["Marc Harary","Eliezer M. Van Allen","William Lotter"],"pdf_url":"https://arxiv.org/pdf/2411.08975v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 14 pages"},{"id":"http://arxiv.org/abs/1902.00615v6","updated":"2024-11-13T18:32:53Z","published":"2019-02-02T01:52:53Z","title":"Confidence Trigger Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. 
In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Panfeng Li","Qikai Yang","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v6.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.08835v1","updated":"2024-11-13T18:14:23Z","published":"2024-11-13T18:14:23Z","title":"Goal-oriented Semantic Communication for Robot Arm Reconstruction in\n Digital Twin: Feature and Temporal Selections","summary":" As one of the most promising technologies in industry, the Digital Twin (DT)\nfacilitates real-time monitoring and predictive analysis for real-world systems\nby precisely reconstructing virtual replicas of physical entities. However,\nthis reconstruction faces unprecedented challenges due to the everincreasing\ncommunication overhead, especially for digital robot arm reconstruction. 
To\nthis end, we propose a novel goal-oriented semantic communication (GSC)\nframework to extract the GSC information for the robot arm reconstruction task\nin the DT, with the aim of minimising the communication load under the strict\nand relaxed reconstruction error constraints. Unlike the traditional\nreconstruction framework that periodically transmits a reconstruction message\nfor real-time DT reconstruction, our framework implements a feature selection\n(FS) algorithm to extract the semantic information from the reconstruction\nmessage, and a deep reinforcement learning-based temporal selection algorithm\nto selectively transmit the semantic information over time. We validate our\nproposed GSC framework through both Pybullet simulations and lab experiments\nbased on the Franka Research 3 robot arm. For a range of distinct robotic\ntasks, simulation results show that our framework can reduce the communication\nload by at least 59.5% under strict reconstruction error constraints and 80%\nunder relaxed reconstruction error constraints, compared with traditional\ncommunication framework. Also, experimental results confirm the effectiveness\nof our framework, where the communication load is reduced by 53% in strict\nconstraint case and 74% in relaxed constraint case. 
The demo is available at:\nhttps://youtu.be/2OdeHKxcgnk.\n","authors":["Shutong Chen","Emmanouil Spyrakos-Papastavridis","Yichao Jin","Yansha Deng"],"pdf_url":"https://arxiv.org/pdf/2411.08835v1.pdf","comment":"Submitted to IEEE for potential publication"},{"id":"http://arxiv.org/abs/2411.08761v1","updated":"2024-11-13T16:48:37Z","published":"2024-11-13T16:48:37Z","title":"AI-Enhanced Inverter Fault and Anomaly Detection System for Distributed\n Energy Resources in Microgrids","summary":" The integration of Distributed Energy Resources (DERs) into power\ndistribution systems has made microgrids foundational to grid modernization.\nThese DERs, connected through power electronic inverters, create power\nelectronics dominated grid architecture, introducing unique challenges for\nfault detection. While external line faults are widely studied, inverter faults\nremain a critical yet underexplored issue. This paper proposes various data\nmining techniques for the effective detection and localization of inverter\nfaults-essential for preventing catastrophic grid failures. Furthermore, the\ndifficulty of differentiating between system anomalies and internal inverter\nfaults within Power Electronics-Driven Grids (PEDGs) is addressed. To enhance\ngrid resilience, this work applies advanced artificial intelligence methods to\ndistinguish anomalies from true internal faults, identifying the specific\nmalfunctioning switch. 
The proposed FaultNet-ML methodology is validated on a\n9-bus system dominated by inverters, illustrating its robustness in a PEDG\nenvironment.\n","authors":["Swetha Rani Kasimalla","Kuchan Park","Junho Hong","Young-Jin Kim","HyoJong Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08761v1.pdf","comment":"5 pages, 2 figures, submitted to 2025 IEEE Power and Energy Society\n General Meeting (PESGM 2025), Austin, TX"},{"id":"http://arxiv.org/abs/2411.08759v1","updated":"2024-11-13T16:46:10Z","published":"2024-11-13T16:46:10Z","title":"Clutter-Aware Target Detection for ISAC in a Millimeter-Wave Cell-Free\n Massive MIMO System","summary":" In this paper, we investigate the performance of an integrated sensing and\ncommunication (ISAC) system within a cell-free massive multiple-input\nmultiple-output (MIMO) system. Each access point (AP) operates in the\nmillimeter-wave (mmWave) frequency band. The APs jointly serve the user\nequipments (UEs) in the downlink while simultaneously detecting a target\nthrough dedicated sensing beams, which are directed toward a reconfigurable\nintelligent surface (RIS). Although the AP-RIS, RIS-target, and AP-target\nchannels have both line-of-sight (LoS) and non-line-of-sight (NLoS) parts, it\nis assumed only knowledge of the LoS paths is available. A key contribution of\nthis study is the consideration of clutter, which degrades the target detection\nif not handled. We propose an algorithm to alternatively optimize the transmit\npower allocation and the RIS phase-shift matrix, maximizing the target\nsignal-to-clutter-plus-noise ratio (SCNR) while ensuring a minimum\nsignal-to-interference-plus-noise ratio (SINR) for the UEs. 
Numerical results\ndemonstrate that exploiting clutter subspace significantly enhances detection\nprobability, particularly at high clutter-to-noise ratios, and reveal that an\nincreased number of transmit side clusters impair detection performance.\nFinally, we highlight the performance gains achieved using a dedicated sensing\nstream.\n","authors":["Steven Rivetti","Ozlem Tugfe Demir","Emil Bjornson","Mikael Skoglund"],"pdf_url":"https://arxiv.org/pdf/2411.08759v1.pdf","comment":"submitted to IEEE ICC25"},{"id":"http://arxiv.org/abs/2411.08754v1","updated":"2024-11-13T16:31:13Z","published":"2024-11-13T16:31:13Z","title":"Logic-based Knowledge Awareness for Autonomous Agents in Continuous\n Spaces","summary":" This paper presents a step towards a formal controller design method for\nautonomous agents based on knowledge awareness to improve decision-making. Our\napproach is to first create an organized repository of information (a knowledge\nbase) for autonomous agents which can be accessed and then translated into\ntemporal specifications. Secondly, to develop a controller with formal\nguarantees that meets a combination of mission-specific objective and the\nspecification from the knowledge base, we utilize an abstraction-based\ncontroller design (ABCD) approach, capable of managing both nonlinear dynamics\nand temporal requirements. Unlike the conventional offline ABCD approach, our\nmethod dynamically updates the controller whenever the knowledge base prompts\nchanges in the specifications. A three-dimensional nonlinear car model\nnavigating an urban road scenario with traffic signs and obstacles is\nconsidered for validation. 
Results show the effectiveness of the method in\nguiding the autonomous agents to the target while complying with the knowledge\nbase and the mission-specific objective.\n","authors":["Arabinda Ghosh","Mahmoud Salamati","Sadegh Soudjani"],"pdf_url":"https://arxiv.org/pdf/2411.08754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08734v1","updated":"2024-11-13T16:16:22Z","published":"2024-11-13T16:16:22Z","title":"Recommender systems and reinforcement learning for building control and\n occupant interaction: A text-mining driven review of scientific literature","summary":" The indoor environment greatly affects health and well-being; enhancing\nhealth and reducing energy use in these settings is a key research focus. With\nadvancing Information and Communication Technology (ICT), recommendation\nsystems and reinforcement learning have emerged as promising methods to induce\nbehavioral changes that improve indoor environments and building energy\nefficiency. This study employs text-mining and Natural Language Processing\n(NLP) to examine these approaches in building control and occupant interaction.\nAnalyzing approximately 27,000 articles from the ScienceDirect database, we\nfound extensive use of recommendation systems and reinforcement learning for\nspace optimization, location recommendations, and personalized control\nsuggestions. Despite broad applications, their use in optimizing indoor\nenvironments and energy efficiency is limited. Traditional recommendation\nalgorithms are commonly used, but optimizing indoor conditions and energy\nefficiency often requires advanced machine learning techniques like\nreinforcement and deep learning. This review highlights the potential for\nexpanding recommender systems and reinforcement learning applications in\nbuildings and indoor environments. 
Areas for innovation include predictive\nmaintenance, building-related product recommendations, and optimizing\nenvironments for specific needs like sleep and productivity enhancements based\non user feedback.\n","authors":["Wenhao Zhang","Matias Quintana","Clayton Miller"],"pdf_url":"https://arxiv.org/pdf/2411.08734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08678v1","updated":"2024-11-13T15:11:11Z","published":"2024-11-13T15:11:11Z","title":"Identification of Power Systems with Droop-Controlled Units Using Neural\n Ordinary Differential Equations","summary":" In future power systems, the detailed structure and dynamics may not always\nbe fully known. This is due to an increasing number of distributed energy\nresources, such as photovoltaic generators, battery storage systems, heat pumps\nand electric vehicles, as well as a shift towards active distribution grids.\nObtaining physically-based models for simulation and control synthesis can\ntherefore become challenging. Differential equations, where the right-hand side\nis represented by a neural network, i.e., neural ordinary differential\nequations (NODEs), have a great potential to serve as a data-driven black-box\nmodel to overcome this challenge. This paper explores their use in identifying\nthe dynamics of droop-controlled grid-forming units based on inputs and state\nmeasurements. In numerical studies, various NODE structures used with different\nnumerical solvers are trained and evaluated. Moreover, they are compared to the\nsparse identification of nonlinear dynamics (SINDy) method. The results\ndemonstrate that even though SINDy yields more accurate models, NODEs achieve\ngood prediction performance without prior knowledge about the system's\nnonlinearities which SINDy requires to work best.\n","authors":["Hannes M. H. Wolf","Christian A. 
Hans"],"pdf_url":"https://arxiv.org/pdf/2411.08678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13224v4","updated":"2024-11-13T14:54:18Z","published":"2024-02-20T18:37:11Z","title":"Controlling Large Electric Vehicle Charging Stations via User Behavior\n Modeling and Stochastic Programming","summary":" This paper introduces an Electric Vehicle Charging Station (EVCS) model that\nincorporates real-world constraints, such as slot power limitations, contract\nthreshold overruns penalties, or early disconnections of electric vehicles\n(EVs). We propose a formulation of the problem of EVCS control under\nuncertainty, and implement two Multi-Stage Stochastic Programming approaches\nthat leverage user-provided information, namely, Model Predictive Control and\nTwo-Stage Stochastic Programming. The model addresses uncertainties in charging\nsession start and end times, as well as in energy demand. A user's behavior\nmodel based on a sojourn-time-dependent stochastic process enhances cost\nreduction while maintaining customer satisfaction. The benefits of the two\nproposed methods are showcased against two baselines over a 22-day simulation\nusing a real-world dataset. The two-stage approach demonstrates robustness\nagainst early disconnections by considering a wider range of uncertainty\nscenarios for optimization. The algorithm prioritizing user satisfaction over\nelectricity cost achieves a 20% and 36% improvement in two user satisfaction\nmetrics compared to an industry-standard baseline. 
Additionally, the algorithm\nstriking the best balance between cost and user satisfaction exhibits a mere 3%\nrelative cost increase compared to the theoretically optimal baseline - for\nwhich the nonanticipativity constraint is relaxed - while attaining 94% and 84%\nof the user satisfaction performance in the two used satisfaction metrics.\n","authors":["Alban Puech","Tristan Rigaut","William Templier","Maud Tournoud"],"pdf_url":"https://arxiv.org/pdf/2402.13224v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15552v3","updated":"2024-11-13T14:52:21Z","published":"2024-02-23T17:21:21Z","title":"Morphological Symmetries in Robotics","summary":" We present a comprehensive framework for studying and leveraging\nmorphological symmetries in robotic systems. These are intrinsic properties of\nthe robot's morphology, frequently observed in animal biology and robotics,\nwhich stem from the replication of kinematic structures and the symmetrical\ndistribution of mass. We illustrate how these symmetries extend to the robot's\nstate space and both proprioceptive and exteroceptive sensor measurements,\nresulting in the equivariance of the robot's equations of motion and optimal\ncontrol policies. Thus, we recognize morphological symmetries as a relevant and\npreviously unexplored physics-informed geometric prior, with significant\nimplications for both data-driven and analytical methods used in modeling,\ncontrol, estimation and design in robotics. For data-driven methods, we\ndemonstrate that morphological symmetries can enhance the sample efficiency and\ngeneralization of machine learning models through data augmentation, or by\napplying equivariant/invariant constraints on the model's architecture. In the\ncontext of analytical methods, we employ abstract harmonic analysis to\ndecompose the robot's dynamics into a superposition of lower-dimensional,\nindependent dynamics. 
We substantiate our claims with both synthetic and\nreal-world experiments conducted on bipedal and quadrupedal robots. Lastly, we\nintroduce the repository MorphoSymm to facilitate the practical use of the\ntheory and applications outlined in this work.\n","authors":["Daniel Ordoñez-Apraez","Giulio Turrisi","Vladimir Kostic","Mario Martin","Antonio Agudo","Francesc Moreno-Noguer","Massimiliano Pontil","Claudio Semini","Carlos Mastalli"],"pdf_url":"https://arxiv.org/pdf/2402.15552v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.08634v1","updated":"2024-11-13T14:18:28Z","published":"2024-11-13T14:18:28Z","title":"On the Application of Model Predictive Control to a Weighted Coverage\n Path Planning Problem","summary":" This paper considers the application of Model Predictive Control (MPC) to a\nweighted coverage path planning (WCPP) problem. The problem appears in a wide\nrange of practical applications, such as search and rescue (SAR) missions. The\nbasic setup is that one (or multiple) agents can move around a given search\nspace and collect rewards from a given spatial distribution. Unlike an\nartificial potential field, each reward can only be collected once. In contrast\nto a Traveling Salesman Problem (TSP), the agent moves in a continuous space.\nMoreover, he is not obliged to cover all locations and/or may return to\npreviously visited locations. The WCPP problem is tackled by a new Model\nPredictive Control (MPC) formulation with so-called Coverage Constraints (CCs).\nIt is shown that the solution becomes more effective if the solver is\ninitialized with a TSP-based heuristic. 
With and without this initialization,\nthe proposed MPC approach clearly outperforms a naive MPC formulation, as\ndemonstrated in a small simulation study.\n","authors":["Kilian Schweppe","Ludmila Moshagen","Georg Schildbach"],"pdf_url":"https://arxiv.org/pdf/2411.08634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08618v1","updated":"2024-11-13T14:05:32Z","published":"2024-11-13T14:05:32Z","title":"Robust Optimal Power Flow Against Adversarial Attacks: A Tri-Level\n Optimization Approach","summary":" In power systems, unpredictable events like extreme weather, equipment\nfailures, and cyberattacks present significant challenges to ensuring safety\nand reliability. Ensuring resilience in the face of these uncertainties is\ncrucial for reliable and efficient operations. This paper presents a tri-level\noptimization approach for robust power system operations that effectively\naddress worst-case attacks. The first stage focuses on optimizing economic\ndispatch under normal operating conditions, aiming to minimize generation costs\nwhile maintaining the supply-demand balance. The second stage introduces an\nadversarial attack model, identifying worst-case scenarios that maximize the\nsystem's vulnerability by targeting distributed generation (DG). In the third\nstage, mitigation strategies are developed using fast-response energy storage\nsystems (ESS) to minimize disruptions caused by these attacks. By integrating\neconomic dispatch, vulnerability assessment, and mitigation into a unified\nframework, this approach provides a robust solution for enhancing power system\nresilience and safety against evolving adversarial threats. 
The approach is\nvalidated using the IEEE-33 node distribution system to demonstrate its\neffectiveness in achieving both cost efficiency and system resilience.\n","authors":["Saman Mazaheri Khamaneh","Tong Wu"],"pdf_url":"https://arxiv.org/pdf/2411.08618v1.pdf","comment":"This work has been submitted for possible publication"},{"id":"http://arxiv.org/abs/2411.08576v1","updated":"2024-11-13T12:47:53Z","published":"2024-11-13T12:47:53Z","title":"Future state prediction based on observer for missile system","summary":" Guided missile accuracy and precision is negatively impacted by seeker delay,\nmore specifically by the delay introduced by a mechanical seeker gimbal and the\ncomputational time taken to process the raw data. To meet the demands and\nexpectations of modern missiles systems, the impact of this hardware limitation\nmust be reduced. This paper presents a new observer design that predicts the\nfuture state of a seeker signal, augmenting the guidance system to mitigate the\neffects of this delay. The design is based on a novel two-step differentiator,\nwhich produces the estimated future time derivatives of the signal. The input\nsignal can be nonlinear and provides for simple integration into existing\nsystems. A bespoke numerical guided missile simulation is used to demonstrate\nthe performance of the observer within a missile guidance system. Both\nnon-manoeuvring and randomly manoeuvring target engagement scenarios are\nconsidered.\n","authors":["W. K. Smithson","Xinhua Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12249v2","updated":"2024-11-13T10:00:35Z","published":"2023-10-18T18:34:26Z","title":"A Link-Based Flow Model with Turn-Level Queue Transmission and\n Time-Varying Free-Flow Speed for Urban Road Networks","summary":" Macroscopic link-based flow models are efficient for simulating flow\npropagation in urban road networks. 
Existing link-based flow models described\ntraffic states of a link with two state variables of link inflow and outflow\nand assumed homogeneous traffic states within a whole link. Consequently, the\nturn-level queue length change within the link can not be captured, resulting\nin underrepresented queue spillback. Moreover, a constant link free-flow speed\nwas assumed to formulate models, restricting their applicability in modeling\nphenomena involving time-varying free-flow speed. This study proposed a new\nlink-based flow model by introducing an additional state variable of link queue\ninflow and adapting the link outflow to be free-flow speed-dependent. In our\nmodel, the vehicle propagation within each link is described by the link\ninflow, queue inflow, and outflow, which depends on the link free-flow speed\nchanges. A node model is further defined to capture the presence of signal\ncontrol and potential queue spillback, which estimates the constrained flow\npropagation between adjacent road segments. Simulation experiments were\nconducted on a single intersection and a network with consecutive intersections\nto verify the proposed model performance. Results demonstrate the predictive\npower of the proposed model in predicting traffic operations of intersections\nwith multiple turning movements and time-varying free-flow speed. Our model\noutperforms the baseline link-based flow model and preserves the computational\ntractability property of link-based flow models.\n","authors":["Lei Wei","S. 
Travis Waller","Yu Mei","Peng Chen","Yunpeng Wang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.12249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08436v1","updated":"2024-11-13T08:35:37Z","published":"2024-11-13T08:35:37Z","title":"Robust performance for switched systems with constrained switching and\n its application to weakly hard real-time control systems","summary":" Many cyber-physical systems can naturally be formulated as switched systems\nwith constrained switching. This includes systems where one of the signals in\nthe feedback loop may be lost. Possible sources for losses are shared or\nunreliable communication media in networked control systems, or signals which\nare discarded, e.g., when using a shared computation device such as a processor\nin real-time control applications. The use of switched systems with constrained\nswitching is not limited to cyber-physical systems but, includes many other\nrelevant applications such as power systems and modeling virus mutations. In\nthis chapter, we introduce a framework for analyzing and designing controllers\nwhich guarantee robust quadratic performance for switched systems with\nconstrained switching. The possible switching sequences are described by the\nlanguage of a labeled graph where the labels are linked to the different\nsubsystems. The subsystems are allowed to have different input and output\ndimensions, and their state-space representations can be affected by a broad\nclass of uncertainties in a rational way. The proposed framework exploits ideas\nfrom dissipativity-based linear control theory to derive analysis and synthesis\ninequalities given by linear matrix inequalities. 
We demonstrate how the\nproposed framework can be applied to the design of controllers for uncertain\nweakly hard real-time control systems - a system class naturally appearing in\nnetworked and real-time control.\n","authors":["Simon Lang","Marc Seidel","Frank Allgöwer"],"pdf_url":"https://arxiv.org/pdf/2411.08436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08413v1","updated":"2024-11-13T08:04:35Z","published":"2024-11-13T08:04:35Z","title":"Inference-Aware State Reconstruction for Industrial Metaverse under\n Synchronous/Asynchronous Short-Packet Transmission","summary":" We consider a real-time state reconstruction system for industrial metaverse.\nThe time-varying physical process states in real space are captured by multiple\nsensors via wireless links, and then reconstructed in virtual space. In this\npaper, we use the spatial-temporal correlation of the sensor data of interest\nto infer the real-time data of the target sensor to reduce the mean squared\nerror (MSE) of reconstruction for industrial metaverse under short-packet\ntransmission (SPT). Both synchronous and asynchronous transmission modes for\nmultiple sensors are considered. It is proved that the average MSE of\nreconstruction and average block error probability (BLEP) have a positive\ncorrelation under inference with synchronous transmission scheme, and they have\na negative correlation in some conditions under inference with asynchronous\ntransmission scheme. Also, it is proved that the average MSE of reconstruction\nwith inference can be significantly lower than that without inference, even\nunder weak mean squared spatial correlation (MSSC). In addition, closed-form\nMSSC thresholds are derived for the superiority regions of the inference with\nsynchronous transmission and inference with asynchronous transmission schemes,\nrespectively. 
Adaptations of blocklength and time shift of asynchronous\ntransmission are conducted to minimize the average MSE of reconstruction.\nSimulation results show that the two schemes significantly outperform the no\ninference case, with an average MSE reduction of more than 50%.\n","authors":["Qinqin Xiong","Jie Cao","Xu Zhu","Yufei Jiang","Nikolaos Pappas"],"pdf_url":"https://arxiv.org/pdf/2411.08413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09521v3","updated":"2024-11-13T05:46:12Z","published":"2023-12-15T04:02:57Z","title":"Multi-Objective Complementary Control","summary":" This paper proposes a novel multi-objective control framework for linear\ntime-invariant systems in which performance and robustness can be achieved in a\ncomplementary way instead of a trade-off. In particular, a state-space solution\nis first established for a new stabilizing control structure consisting of two\nindependently designed controllers coordinated with a Youla-type operator ${\\bm\nQ}$. It is then shown by performance analysis that these two independently\ndesigned controllers operate in a naturally complementary way for a tracking\ncontrol system, due to the coordination function of ${\\bm Q}$ driven by the\nresidual signal of a Luenberger observer. 
Moreover, it is pointed out that\n${\\bm Q}$ could be further optimized with an additional gain factor to achieve\nimproved performance, through a data-driven methodology for a measured cost\nfunction.\n","authors":["Jiapeng Xu","Xiang Chen","Ying Tan","Kemin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09521v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08329v1","updated":"2024-11-13T04:23:05Z","published":"2024-11-13T04:23:05Z","title":"Neural Network Certification Informed Power System Transient Stability\n Preventive Control with Renewable Energy","summary":" Existing machine learning-based surrogate modeling methods for transient\nstability constrained-optimal power flow (TSC-OPF) lack certifications in the\npresence of unseen disturbances or uncertainties. This may lead to divergence\nof TSC-OPF or insecure control strategies. This paper proposes a neural network\ncertification-informed power system transient stability preventive control\nmethod considering the impacts of various uncertainty resources, such as errors\nfrom measurements, fluctuations in renewable energy sources (RESs) and loads,\netc. A deep belief network (DBN) is trained to estimate the transient\nstability, replacing the time-consuming time-domain simulation-based\ncalculations. Then, DBN is embedded into the iterations of the primal-dual\ninterior-point method to solve TSC-OPF. To guarantee the robustness of the\nsolutions, the neural network verifier $\\alpha, \\beta$-CROWN to deal with\nuncertainties from RESs and loads is proposed. 
The yielded certification\nresults allow us to further adjust the transient stability safety margin under\nthe iterated TSC-OPF solution process, balancing system security and economics.\nNumerical results on a modified western South Carolina 500-bus system\ndemonstrate that the proposed method can efficiently and quickly obtain the\nsafety-verified preventive control strategy through RES curtailment and\ngenerator dispatch with only a slight increase in cost.\n","authors":["Tong Su","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07541v2","updated":"2024-11-13T02:17:55Z","published":"2023-08-15T03:01:41Z","title":"On-demand Cold Start Frequency Reduction with Off-Policy Reinforcement\n Learning in Serverless Computing","summary":" Function-as-a-Service (FaaS) is a cloud computing paradigm offering an\nevent-driven execution model to applications. It features serverless attributes\nby eliminating resource management responsibilities from developers, and offers\ntransparent and on-demand scalability of applications. To provide seamless\non-demand scalability, new function instances are prepared to serve the\nincoming workload in the absence or unavailability of function instances.\nHowever, FaaS platforms are known to suffer from cold starts, where this\nfunction provisioning process introduces a non-negligible delay in function\nresponse and reduces the end-user experience. Therefore, the presented work\nfocuses on reducing the frequent, on-demand cold starts on the platform by\nusing Reinforcement Learning(RL). The proposed approach uses model-free\nQ-learning that consider function metrics such as CPU utilization, existing\nfunction instances, and response failure rate, to proactively initialize\nfunctions, in advance, based on the expected demand. 
The proposed solution is\nimplemented on Kubeless and evaluated using an open-source function invocation\ntrace applied to a matrix multiplication function. The evaluation results\ndemonstrate a favourable performance of the RL-based agent when compared to\nKubeless' default policy and a function keep-alive policy by improving\nthroughput by up to 8.81% and reducing computation load and resource wastage by\nup to 55% and 37%, respectively, that is a direct outcome of reduced cold\nstarts.\n","authors":["Siddharth Agarwal","Maria A. Rodriguez","Rajkumar Buyya"],"pdf_url":"https://arxiv.org/pdf/2308.07541v2.pdf","comment":"13 figures, 24 pages, 3 tables"},{"id":"http://arxiv.org/abs/2208.04883v6","updated":"2024-11-13T23:34:19Z","published":"2022-08-09T16:25:49Z","title":"Neural-Rendezvous: Provably Robust Guidance and Control to Encounter\n Interstellar Objects","summary":" Interstellar objects (ISOs) are likely representatives of primitive materials\ninvaluable in understanding exoplanetary star systems. Due to their poorly\nconstrained orbits with generally high inclinations and relative velocities,\nhowever, exploring ISOs with conventional human-in-the-loop approaches is\nsignificantly challenging. This paper presents Neural-Rendezvous -- a deep\nlearning-based guidance and control framework for encountering fast-moving\nobjects, including ISOs, robustly, accurately, and autonomously in real time.\nIt uses pointwise minimum norm tracking control on top of a guidance policy\nmodeled by a spectrally-normalized deep neural network, where its\nhyperparameters are tuned with a loss function directly penalizing the MPC\nstate trajectory tracking error. We show that Neural-Rendezvous provides a high\nprobability exponential bound on the expected spacecraft delivery error, the\nproof of which leverages stochastic incremental stability analysis. 
In\nparticular, it is used to construct a non-negative function with a\nsupermartingale property, explicitly accounting for the ISO state uncertainty\nand the local nature of nonlinear state estimation guarantees. In numerical\nsimulations, Neural-Rendezvous is demonstrated to satisfy the expected error\nbound for 100 ISO candidates. This performance is also empirically validated\nusing our spacecraft simulator and in high-conflict and distributed UAV swarm\nreconfiguration with up to 20 UAVs.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Yashwanth Kumar Nakka","Benjamin Donitz","Declan Mages","Michel Ingham"],"pdf_url":"https://arxiv.org/pdf/2208.04883v6.pdf","comment":"Preprint Version, Accepted: October, 2024 (One-minute YouTube\n summary: https://youtu.be/q3e0LYS2IYQ, DOI:\n https://doi.org/10.2514/1.G007671)"},{"id":"http://arxiv.org/abs/2405.07587v2","updated":"2024-11-13T21:57:51Z","published":"2024-05-13T09:48:30Z","title":"Structure-Preserving Model Order Reduction for Nonlinear DAE Models of\n Power Networks","summary":" This paper deals with the joint reduction of the number of dynamic and\nalgebraic states of a nonlinear differential-algebraic equation (NDAE) model of\na power network. The dynamic states depict the internal states of generators,\nloads, renewables, whereas the algebraic ones define network states such as\nvoltages and phase angles. In the current literature of power system model\norder reduction (MOR), the algebraic constraints are usually neglected and the\npower network is commonly modeled via a set of ordinary differential equations\n(ODEs) instead of NDAEs. Thus, reduction is usually carried out for the dynamic\nstates only and the algebraic variables are kept intact. This leaves a\nsignificant part of the system's size and complexity unreduced. This paper\naddresses this aforementioned limitation by jointly reducing both dynamic and\nalgebraic variables. 
As compared to the literature the proposed MOR techniques\nare endowed with the following features: (i) no system linearization is\nrequired, (ii) require no transformation to an equivalent or approximate ODE\nrepresentation, (iii) guarantee that the reduced order model to be\nNDAE-structured and thus preserves the differential-algebraic structure of\noriginal power system model, and (iv) can seamlessly reduce both dynamic and\nalgebraic variables while maintaining high accuracy. Case studies performed on\na 2000-bus power system reveal that the proposed MOR techniques are able to\nreduce system order while maintaining accuracy.\n","authors":["Muhammad Nadeem","Ahmad F. Taha"],"pdf_url":"https://arxiv.org/pdf/2405.07587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11212v3","updated":"2024-11-13T20:13:35Z","published":"2024-08-20T22:14:19Z","title":"How many autonomous vehicles are required to stabilize traffic flow?","summary":" The collective behavior of human-driven vehicles (HVs) produces the\nwell-known stop-and-go waves potentially leading to higher fuel consumption and\nemissions. This paper investigates the stabilization of traffic flow via a\nminimum number of autonomous vehicles (AVs) subject to constraints on the\ncontrol parameters aiming to reduce the number of vehicles on the road while\nachieving lower fuel consumption and emissions. The unconstrained scenario has\nbeen well-studied in recent studies. The main motivation to investigate the\nconstrained scenario is that, in realistic engineering applications, lower and\nupper bounds exist on the control parameters. For the constrained scenario, we\noptimally find the minimum number of required AVs (via computing the optimal\nlower bound on the AV penetration rate) to stabilize traffic flow for a given\nnumber of HVs. 
As an immediate consequence, we conclude that for a given number\nof AVs, the number of HVs in the stabilized traffic flow may not be arbitrarily\nlarge in the constrained scenario unlike the unconstrained scenario studied in\nthe literature. We systematically propose a procedure to compute the optimal\nlower bound on the AV penetration rate using nonlinear optimization techniques.\nFinally, we validate the theoretical results via numerical simulations.\nNumerical simulations suggest that enlarging the constraint intervals makes a\nsmaller optimal lower bound on the AV penetration rate attainable. However, it\nleads to a slower transient response due to a dominant pole closer to the\norigin.\n","authors":["MirSaleh Bahavarnia","Ahmad F. Taha"],"pdf_url":"https://arxiv.org/pdf/2408.11212v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08999v1","updated":"2024-11-13T19:45:47Z","published":"2024-11-13T19:45:47Z","title":"Learning-Based Control Barrier Function with Provably Safe Guarantees:\n Reducing Conservatism with Heading-Aware Safety Margin","summary":" We propose a learning-based Control Barrier Function (CBF) to reduce\nconservatism in collision avoidance of car-like robots. Traditional CBFs often\nuse Euclidean distance between robots' centers as safety margin, neglecting\nheadings and simplifying geometries to circles. While this ensures smooth,\ndifferentiable safety functions required by CBFs, it can be overly conservative\nin tight environments. To address this limitation, we design a heading-aware\nsafety margin that accounts for the robots' orientations, enabling a less\nconservative and more accurate estimation of safe regions. Since the function\ncomputing this safety margin is non-differentiable, we approximate it with a\nneural network to ensure differentiability and facilitate integration with\nCBFs. 
We describe how we achieve bounded learning error and incorporate the\nupper bound into the CBF to provide formal safety guarantees through forward\ninvariance. We show that our CBF is a high-order CBF with relative degree two\nfor a system with two robots whose dynamics are modeled by the nonlinear\nkinematic bicycle model. Experimental results in overtaking and bypassing\nscenarios reveal a 33.5 % reduction in conservatism compared to traditional\nmethods, while maintaining safety. Code: https://github.com/bassamlab/sigmarl\n","authors":["Jianye Xu","Bassam Alrifaee"],"pdf_url":"https://arxiv.org/pdf/2411.08999v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.08981v1","updated":"2024-11-13T19:16:44Z","published":"2024-11-13T19:16:44Z","title":"Reliability, Resilience and Human Factors Engineering for Trustworthy AI\n Systems","summary":" As AI systems become integral to critical operations across industries and\nservices, ensuring their reliability and safety is essential. We offer a\nframework that integrates established reliability and resilience engineering\nprinciples into AI systems. By applying traditional metrics such as failure\nrate and Mean Time Between Failures (MTBF) along with resilience engineering\nand human reliability analysis, we propose an integrate framework to manage AI\nsystem performance, and prevent or efficiently recover from failures. Our work\nadapts classical engineering methods to AI systems and outlines a research\nagenda for future technical studies. We apply our framework to a real-world AI\nsystem, using system status data from platforms such as openAI, to demonstrate\nits practical applicability. This framework aligns with emerging global\nstandards and regulatory frameworks, providing a methodology to enhance the\ntrustworthiness of AI systems. 
Our aim is to guide policy, regulation, and the\ndevelopment of reliable, safe, and adaptable AI technologies capable of\nconsistent performance in real-world environments.\n","authors":["Saurabh Mishra","Anand Rao","Ramayya Krishnan","Bilal Ayyub","Amin Aria","Enrico Zio"],"pdf_url":"https://arxiv.org/pdf/2411.08981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08538v1","updated":"2024-11-13T11:36:02Z","published":"2024-11-13T11:36:02Z","title":"Intelligent Adaptive Metasurface in Complex Wireless Environments","summary":" The programmable metasurface is regarded as one of the most promising\ntransformative technologies for next-generation wireless system applications.\nDue to the lack of effective perception ability of the external electromagnetic\nenvironment, there are numerous challenges in the intelligent regulation of\nwireless channels, and it still relies on external sensors to reshape\nelectromagnetic environment as desired. To address that problem, we propose an\nadaptive metasurface (AMS) which integrates the capabilities of acquiring\nwireless environment information and manipulating reflected electromagnetic\n(EM) waves in a programmable manner. The proposed design endows the\nmetasurfaces with excellent capabilities to sense the complex electromagnetic\nfield distributions around them and then dynamically manipulate the waves and\nsignals in real time under the guidance of the sensed information, eliminating\nthe need for prior knowledge or external inputs about the wireless environment.\nFor verification, a prototype of the proposed AMS is constructed, and its dual\ncapabilities of sensing and manipulation are experimentally validated.\nAdditionally, different integrated sensing and communication (ISAC) scenarios\nwith and without the aid of the AMS are established. 
The effectiveness of the\nAMS in enhancing communication quality is well demonstrated in complex\nelectromagnetic environments, highlighting its beneficial application potential\nin future wireless systems.\n","authors":["Han Qing Yang","Jun Yan Dai","Hui Dong Li","Lijie Wu","Meng Zhen Zhang","Zi Hang Shen","Si Ran Wang","Zheng Xing Wang","Wankai Tang","Shi Jin","Jun Wei Wu","Qiang Cheng","Tie Jun Cui"],"pdf_url":"https://arxiv.org/pdf/2411.08538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14980v2","updated":"2024-11-13T23:33:06Z","published":"2022-10-26T18:49:43Z","title":"Interstellar Object Accessibility and Mission Design","summary":" Interstellar objects (ISOs) represent a compelling and under-explored\ncategory of celestial bodies, providing physical laboratories to understand the\nformation of our solar system and probe the composition and properties of\nmaterial formed in exoplanetary systems. In this work, we investigate existing\napproaches to designing successful flyby missions to ISOs, including a deep\nlearning-driven guidance and control algorithm for ISOs traveling at velocities\nover 60 km/s. We have generated spacecraft trajectories to a series of\nsynthetic representative ISOs, simulating a ground campaign to observe the\ntarget and resolve its state, thereby determining the cruise and close approach\ndelta-Vs required for the encounter. We discuss the accessibility of and\nmission design to ISOs with varying characteristics, with special focuses on 1)\nstate covariance estimation throughout the cruise, 2) handoffs from traditional\nnavigation approaches to novel autonomous navigation for fast flyby regimes,\nand 3) overall recommendations about preparing for the future in situ\nexploration of these targets. The lessons learned also apply to the fast flyby\nof other small bodies, e.g., long-period comets and potentially hazardous\nasteroids, which also require tactical responses with similar characteristics.\n","authors":["Benjamin P. 
S. Donitz","Declan Mages","Hiroyasu Tsukamoto","Peter Dixon","Damon Landau","Soon-Jo Chung","Erica Bufanda","Michel Ingham","Julie Castillo-Rogez"],"pdf_url":"https://arxiv.org/pdf/2210.14980v2.pdf","comment":"IEEE Aerospace Conference, Preprint Version, Accepted: November 2022"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.08878v1","updated":"2024-11-13T18:55:10Z","published":"2024-11-13T18:55:10Z","title":"A Short Note on Evaluating RepNet for Temporal Repetition Counting in\n Videos","summary":" We discuss some consistent issues on how RepNet has been evaluated in various\npapers. As a way to mitigate these issues, we report RepNet performance results\non different datasets, and release evaluation code and the RepNet checkpoint to\nobtain these results. Code URL:\nhttps://github.com/google-research/google-research/blob/master/repnet/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Pierre Sermanet","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.08878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08870v1","updated":"2024-11-13T18:50:13Z","published":"2024-11-13T18:50:13Z","title":"The Limited Impact of Medical Adaptation of Large Language and\n Vision-Language Models","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. 
In this paper, we compare ten\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting and supervised fine-tuning regimes for medical question-answering\n(QA). For instance, across all tasks and model pairs we consider in the 3-shot\nsetting, medical LLMs only outperform their base models in 22.7% of cases,\nreach a (statistical) tie in 36.8% of cases, and are significantly worse than\ntheir base models in the remaining 40.5% of cases. Our conclusions are based on\n(i) comparing each medical model head-to-head, directly against the\ncorresponding base model; (ii) optimizing the prompts for each model separately\nin zero-/few-shot prompting; and (iii) accounting for statistical uncertainty\nin comparisons. While these basic practices are not consistently adopted in the\nliterature, our ablations show that they substantially impact conclusions.\nMeanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs\ncan show performance improvements, but the benefits do not carry over to tasks\nbased on clinical notes. Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Pranav Mani","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.08870v1.pdf","comment":"Extended version of EMNLP 2024 paper arXiv:2411.04118. 
Includes\n additional results on clinical note QA tasks and supervised fine-tuning\n evaluations"},{"id":"http://arxiv.org/abs/2411.08867v1","updated":"2024-11-13T18:48:51Z","published":"2024-11-13T18:48:51Z","title":"Unsupervised Parameter-free Outlier Detection using HDBSCAN* Outlier\n Profiles","summary":" In machine learning and data mining, outliers are data points that\nsignificantly differ from the dataset and often introduce irrelevant\ninformation that can induce bias in its statistics and models. Therefore,\nunsupervised methods are crucial to detect outliers if there is limited or no\ninformation about them. Global-Local Outlier Scores based on Hierarchies\n(GLOSH) is an unsupervised outlier detection method within HDBSCAN*, a\nstate-of-the-art hierarchical clustering method. GLOSH estimates outlier scores\nfor each data point by comparing its density to the highest density of the\nregion they reside in the HDBSCAN* hierarchy. GLOSH may be sensitive to\nHDBSCAN*'s minpts parameter that influences density estimation. With limited\nknowledge about the data, choosing an appropriate minpts value beforehand is\nchallenging as one or some minpts values may better represent the underlying\ncluster structure than others. Additionally, in the process of searching for\n``potential outliers'', one has to define the number of outliers n a dataset\nhas, which may be impractical and is often unknown. In this paper, we propose\nan unsupervised strategy to find the ``best'' minpts value, leveraging the\nrange of GLOSH scores across minpts values to identify the value for which\nGLOSH scores can best identify outliers from the rest of the dataset. Moreover,\nwe propose an unsupervised strategy to estimate a threshold for classifying\npoints into inliers and (potential) outliers without the need to pre-define any\nvalue. 
Our experiments show that our strategies can automatically find the\nminpts value and threshold that yield the best or near best outlier detection\nresults using GLOSH.\n","authors":["Kushankur Ghosh","Murilo Coelho Naldi","Jörg Sander","Euijin Choo"],"pdf_url":"https://arxiv.org/pdf/2411.08867v1.pdf","comment":"Accepted at IEEE International Conference on Big Data, IEEE BigData\n 2024"},{"id":"http://arxiv.org/abs/2411.08862v1","updated":"2024-11-13T18:44:30Z","published":"2024-11-13T18:44:30Z","title":"LLMStinger: Jailbreaking LLMs using RL fine-tuned LLMs","summary":" We introduce LLMStinger, a novel approach that leverages Large Language\nModels (LLMs) to automatically generate adversarial suffixes for jailbreak\nattacks. Unlike traditional methods, which require complex prompt engineering\nor white-box access, LLMStinger uses a reinforcement learning (RL) loop to\nfine-tune an attacker LLM, generating new suffixes based on existing attacks\nfor harmful questions from the HarmBench benchmark. Our method significantly\noutperforms existing red-teaming approaches (we compared against 15 of the\nlatest methods), achieving a +57.2% improvement in Attack Success Rate (ASR) on\nLLaMA2-7B-chat and a +50.3% ASR increase on Claude 2, both models known for\ntheir extensive safety measures. Additionally, we achieved a 94.97% ASR on\nGPT-3.5 and 99.4% on Gemma-2B-it, demonstrating the robustness and adaptability\nof LLMStinger across open and closed-source models.\n","authors":["Piyush Jha","Arnav Arora","Vijay Ganesh"],"pdf_url":"https://arxiv.org/pdf/2411.08862v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2411.08861v1","updated":"2024-11-13T18:42:34Z","published":"2024-11-13T18:42:34Z","title":"Interaction Testing in Variation Analysis","summary":" Relationships of cause and effect are of prime importance for explaining\nscientific phenomena. 
Often, rather than just understanding the effects of\ncauses, researchers also wish to understand how a cause $X$ affects an outcome\n$Y$ mechanistically -- i.e., what are the causal pathways that are activated\nbetween $X$ and $Y$. For analyzing such questions, a range of methods has been\ndeveloped over decades under the rubric of causal mediation analysis.\nTraditional mediation analysis focuses on decomposing the average treatment\neffect (ATE) into direct and indirect effects, and therefore focuses on the ATE\nas the central quantity. This corresponds to providing explanations for\nassociations in the interventional regime, such as when the treatment $X$ is\nrandomized. Commonly, however, it is of interest to explain associations in the\nobservational regime, and not just in the interventional regime. In this paper,\nwe introduce \\text{variation analysis}, an extension of mediation analysis that\nfocuses on the total variation (TV) measure between $X$ and $Y$, written as\n$\\mathrm{E}[Y \\mid X=x_1] - \\mathrm{E}[Y \\mid X=x_0]$. The TV measure\nencompasses both causal and confounded effects, as opposed to the ATE which\nonly encompasses causal (direct and mediated) variations. In this way, the TV\nmeasure is suitable for providing explanations in the natural regime and\nanswering questions such as ``why is $X$ associated with $Y$?''. Our focus is\non decomposing the TV measure, in a way that explicitly includes direct,\nindirect, and confounded variations. Furthermore, we also decompose the TV\nmeasure to include interaction terms between these different pathways.\nSubsequently, interaction testing is introduced, involving hypothesis tests to\ndetermine if interaction terms are significantly different from zero. 
If\ninteractions are not significant, more parsimonious decompositions of the TV\nmeasure can be used.\n","authors":["Drago Plecko"],"pdf_url":"https://arxiv.org/pdf/2411.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v4","updated":"2024-11-13T18:31:18Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. 
Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v4.pdf","comment":"Accepted by 2024 5th International Conference on Computer Vision,\n Image and Deep Learning"},{"id":"http://arxiv.org/abs/2411.08849v1","updated":"2024-11-13T18:29:58Z","published":"2024-11-13T18:29:58Z","title":"Oblique Bayesian additive regression trees","summary":" Current implementations of Bayesian Additive Regression Trees (BART) are\nbased on axis-aligned decision rules that recursively partition the feature\nspace using a single feature at a time. Several authors have demonstrated that\noblique trees, whose decision rules are based on linear combinations of\nfeatures, can sometimes yield better predictions than axis-aligned trees and\nexhibit excellent theoretical properties. We develop an oblique version of BART\nthat leverages a data-adaptive decision rule prior that recursively partitions\nthe feature space along random hyperplanes. Using several synthetic and\nreal-world benchmark datasets, we systematically compared our oblique BART\nimplementation to axis-aligned BART and other tree ensemble methods, finding\nthat oblique BART was competitive with -- and sometimes much better than --\nthose methods.\n","authors":["Paul-Hieu V. Nguyen","Ryan Yee","Sameer K. 
Deshpande"],"pdf_url":"https://arxiv.org/pdf/2411.08849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. (3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. (4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. 
The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.08832v1","updated":"2024-11-13T18:12:15Z","published":"2024-11-13T18:12:15Z","title":"Offline Adaptation of Quadruped Locomotion using Diffusion Models","summary":" We present a diffusion-based approach to quadrupedal locomotion that\nsimultaneously addresses the limitations of learning and interpolating between\nmultiple skills and of (modes) offline adapting to new locomotion behaviours\nafter training. This is the first framework to apply classifier-free guided\ndiffusion to quadruped locomotion and demonstrate its efficacy by extracting\ngoal-conditioned behaviour from an originally unlabelled dataset. We show that\nthese capabilities are compatible with a multi-skill policy and can be applied\nwith little modification and minimal compute overhead, i.e., running entirely\non the robots onboard CPU. We verify the validity of our approach with hardware\nexperiments on the ANYmal quadruped platform.\n","authors":["Reece O'Mahoney","Alexander L. Mitchell","Wanming Yu","Ingmar Posner","Ioannis Havoutis"],"pdf_url":"https://arxiv.org/pdf/2411.08832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08821v1","updated":"2024-11-13T17:59:44Z","published":"2024-11-13T17:59:44Z","title":"Model agnostic local variable importance for locally dependent\n relationships","summary":" Global variable importance measures are commonly used to interpret machine\nlearning model results. 
Local variable importance techniques assess how\nvariables contribute to individual observations rather than the entire dataset.\nCurrent methods typically fail to accurately reflect locally dependent\nrelationships between variables and instead focus on marginal importance\nvalues. Additionally, they are not natively adapted for multi-class\nclassification problems. We propose a new model-agnostic method for calculating\nlocal variable importance, CLIQUE, that captures locally dependent\nrelationships, contains improvements over permutation-based methods, and can be\ndirectly applied to multi-class classification problems. Simulated and\nreal-world examples show that CLIQUE emphasizes locally dependent information\nand properly reduces bias in regions where variables do not affect the\nresponse.\n","authors":["Kelvyn K. Bladen","Adele Cutler","D. Richard Cutler","Kevin R. Moon"],"pdf_url":"https://arxiv.org/pdf/2411.08821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08814v1","updated":"2024-11-13T17:53:23Z","published":"2024-11-13T17:53:23Z","title":"Process-aware Human Activity Recognition","summary":" Humans naturally follow distinct patterns when conducting their daily\nactivities, which are driven by established practices and processes, such as\nproduction workflows, social norms and daily routines. Human activity\nrecognition (HAR) algorithms usually use neural networks or machine learning\ntechniques to analyse inherent relationships within the data. However, these\napproaches often overlook the contextual information in which the data are\ngenerated, potentially limiting their effectiveness. We propose a novel\napproach that incorporates process information from context to enhance the HAR\nperformance. Specifically, we align probabilistic events generated by machine\nlearning models with process models derived from contextual information. This\nalignment adaptively weighs these two sources of information to optimise HAR\naccuracy. 
Our experiments demonstrate that our approach achieves better\naccuracy and Macro F1-score compared to baseline models.\n","authors":["Jiawei Zheng","Petros Papapanagiotou","Jacques D. Fleuriot","Jane Hillston"],"pdf_url":"https://arxiv.org/pdf/2411.08814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01600v3","updated":"2024-11-13T17:41:43Z","published":"2024-08-02T23:11:42Z","title":"Physics-Informed Geometry-Aware Neural Operator","summary":" Engineering design problems often involve solving parametric Partial\nDifferential Equations (PDEs) under variable PDE parameters and domain\ngeometry. Recently, neural operators have shown promise in learning PDE\noperators and quickly predicting the PDE solutions. However, training these\nneural operators typically requires large datasets, the acquisition of which\ncan be prohibitively expensive. To overcome this, physics-informed training\noffers an alternative way of building neural operators, eliminating the high\ncomputational costs associated with Finite Element generation of training data.\nNevertheless, current physics-informed neural operators struggle with\nlimitations, either in handling varying domain geometries or varying PDE\nparameters. In this research, we introduce a novel method, the Physics-Informed\nGeometry-Aware Neural Operator (PI-GANO), designed to simultaneously generalize\nacross both PDE parameters and domain geometries. We adopt a geometry encoder\nto capture the domain geometry features, and design a novel pipeline to\nintegrate this component within the existing DCON architecture. Numerical\nresults demonstrate the accuracy and efficiency of the proposed method. 
All the\ncodes and data related to this work are available on GitHub:\nhttps://github.com/WeihengZ/Physics-informed-Neural-Foundation-Operator.\n","authors":["Weiheng Zhong","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2408.01600v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.13646"},{"id":"http://arxiv.org/abs/2411.08804v1","updated":"2024-11-13T17:38:07Z","published":"2024-11-13T17:38:07Z","title":"FinRobot: AI Agent for Equity Research and Valuation with Large Language\n Models","summary":" As financial markets grow increasingly complex, there is a rising need for\nautomated tools that can effectively assist human analysts in equity research,\nparticularly within sell-side research. While Generative AI (GenAI) has\nattracted significant attention in this field, existing AI solutions often fall\nshort due to their narrow focus on technical factors and limited capacity for\ndiscretionary judgment. These limitations hinder their ability to adapt to new\ndata in real-time and accurately assess risks, which diminishes their practical\nvalue for investors.\n This paper presents FinRobot, the first AI agent framework specifically\ndesigned for equity research. FinRobot employs a multi-agent Chain of Thought\n(CoT) system, integrating both quantitative and qualitative analyses to emulate\nthe comprehensive reasoning of a human analyst. The system is structured around\nthree specialized agents: the Data-CoT Agent, which aggregates diverse data\nsources for robust financial integration; the Concept-CoT Agent, which mimics\nan analysts reasoning to generate actionable insights; and the Thesis-CoT\nAgent, which synthesizes these insights into a coherent investment thesis and\nreport. FinRobot provides thorough company analysis supported by precise\nnumerical data, industry-appropriate valuation metrics, and realistic risk\nassessments. 
Its dynamically updatable data pipeline ensures that research\nremains timely and relevant, adapting seamlessly to new financial information.\nUnlike existing automated research tools, such as CapitalCube and Wright\nReports, FinRobot delivers insights comparable to those produced by major\nbrokerage firms and fundamental research vendors. We open-source FinRobot at\n\\url{https://github. com/AI4Finance-Foundation/FinRobot}.\n","authors":["Tianyu Zhou","Pinqiao Wang","Yilin Wu","Hongyang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08804v1.pdf","comment":"The 1st Workshop on LLMs and Generative AI for Finance, ICAIF 2024"},{"id":"http://arxiv.org/abs/2410.16527v2","updated":"2024-11-13T17:30:33Z","published":"2024-10-21T21:36:03Z","title":"Insights and Current Gaps in Open-Source LLM Vulnerability Scanners: A\n Comparative Analysis","summary":" This report presents a comparative analysis of open-source vulnerability\nscanners for conversational large language models (LLMs). As LLMs become\nintegral to various applications, they also present potential attack surfaces,\nexposed to security risks such as information leakage and jailbreak attacks.\nOur study evaluates prominent scanners - Garak, Giskard, PyRIT, and\nCyberSecEval - that adapt red-teaming practices to expose these\nvulnerabilities. We detail the distinctive features and practical use of these\nscanners, outline unifying principles of their design and perform quantitative\nevaluations to compare them. These evaluations uncover significant reliability\nissues in detecting successful attacks, highlighting a fundamental gap for\nfuture development. Additionally, we contribute a preliminary labelled dataset,\nwhich serves as an initial step to bridge this gap. 
Based on the above, we\nprovide strategic recommendations to assist organizations choose the most\nsuitable scanner for their red-teaming needs, accounting for customizability,\ntest suite comprehensiveness, and industry-specific use cases.\n","authors":["Jonathan Brokman","Omer Hofman","Oren Rachmil","Inderjeet Singh","Rathina Sabapathy Aishvariya Priya","Vikas Pahuja","Amit Giloni","Roman Vainshtein","Hisashi Kojima"],"pdf_url":"https://arxiv.org/pdf/2410.16527v2.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.24164v3","updated":"2024-11-13T17:30:10Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. 
We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v3.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2411.08800v1","updated":"2024-11-13T17:27:32Z","published":"2024-11-13T17:27:32Z","title":"Deep Learning Accelerated Quantum Transport Simulations in\n Nanoelectronics: From Break Junctions to Field-Effect Transistors","summary":" Quantum transport calculations are essential for understanding and designing\nnanoelectronic devices, yet the trade-off between accuracy and computational\nefficiency has long limited their practical applications. We present a general\nframework that combines the deep learning tight-binding Hamiltonian (DeePTB)\napproach with the non-equilibrium Green's Function (NEGF) method, enabling\nefficient quantum transport calculations while maintaining first-principles\naccuracy. 
We demonstrate the capabilities of the DeePTB-NEGF framework through\ntwo representative applications: comprehensive simulation of break junction\nsystems, where conductance histograms show good agreement with experimental\nmeasurements in both metallic contact and single-molecule junction cases; and\nsimulation of carbon nanotube field effect transistors through self-consistent\nNEGF-Poisson calculations, capturing essential physics including the\nelectrostatic potential and transfer characteristic curves under finite bias\nconditions. This framework bridges the gap between first-principles accuracy\nand computational efficiency, providing a powerful tool for high-throughput\nquantum transport simulations across different scales in nanoelectronics.\n","authors":["Jijie Zou","Zhanghao Zhouyin","Dongying Lin","Linfeng Zhang","Shimin Hou","Qiangqiang Gu"],"pdf_url":"https://arxiv.org/pdf/2411.08800v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.13646v4","updated":"2024-11-13T17:26:36Z","published":"2024-04-21T12:41:30Z","title":"Physics-informed Discretization-independent Deep Compositional Operator\n Network","summary":" Solving parametric Partial Differential Equations (PDEs) for a broad range of\nparameters is a critical challenge in scientific computing. To this end, neural\noperators, which \\textcolor{black}{predicts the PDE solution with variable PDE\nparameter inputs}, have been successfully used. However, the training of neural\noperators typically demands large training datasets, the acquisition of which\ncan be prohibitively expensive. To address this challenge, physics-informed\ntraining can offer a cost-effective strategy. However, current physics-informed\nneural operators face limitations, either in handling irregular domain shapes\nor in in generalizing to various discrete representations of PDE parameters. 
In\nthis research, we introduce a novel physics-informed model architecture which\ncan generalize to various discrete representations of PDE parameters and\nirregular domain shapes. Particularly, inspired by deep operator neural\nnetworks, our model involves a discretization-independent learning of parameter\nembedding repeatedly, and this parameter embedding is integrated with the\nresponse embeddings through multiple compositional layers, for more\nexpressivity. Numerical results demonstrate the accuracy and efficiency of the\nproposed method. All the codes and data related to this work are available on\nGitHub: https://github.com/WeihengZ/PI-DCON.\n","authors":["Weiheng Zhong","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2404.13646v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08798v1","updated":"2024-11-13T17:25:25Z","published":"2024-11-13T17:25:25Z","title":"Learning Gaussian Multi-Index Models with Gradient Flow: Time Complexity\n and Directional Convergence","summary":" This work focuses on the gradient flow dynamics of a neural network model\nthat uses correlation loss to approximate a multi-index function on\nhigh-dimensional standard Gaussian data. Specifically, the multi-index function\nwe consider is a sum of neurons $f^*(x) \\!=\\! \\sum_{j=1}^k \\! \\sigma^*(v_j^T\nx)$ where $v_1, \\dots, v_k$ are unit vectors, and $\\sigma^*$ lacks the first\nand second Hermite polynomials in its Hermite expansion. It is known that, for\nthe single-index case ($k\\!=\\!1$), overcoming the search phase requires\npolynomial time complexity. We first generalize this result to multi-index\nfunctions characterized by vectors in arbitrary directions. After the search\nphase, it is not clear whether the network neurons converge to the index\nvectors, or get stuck at a sub-optimal solution. When the index vectors are\northogonal, we give a complete characterization of the fixed points and prove\nthat neurons converge to the nearest index vectors. 
Therefore, using $n \\!\n\\asymp \\! k \\log k$ neurons ensures finding the full set of index vectors with\ngradient flow with high probability over random initialization. When $ v_i^T\nv_j \\!=\\! \\beta \\! \\geq \\! 0$ for all $i \\neq j$, we prove the existence of a\nsharp threshold $\\beta_c \\!=\\! c/(c+k)$ at which the fixed point that computes\nthe average of the index vectors transitions from a saddle point to a minimum.\nNumerical simulations show that using a correlation loss and a mild\noverparameterization suffices to learn all of the index vectors when they are\nnearly orthogonal, however, the correlation loss fails when the dot product\nbetween the index vectors exceeds a certain threshold.\n","authors":["Berfin Simsek","Amire Bendjeddou","Daniel Hsu"],"pdf_url":"https://arxiv.org/pdf/2411.08798v1.pdf","comment":"21 pages, 6 figures, under review by AISTATS 2025"},{"id":"http://arxiv.org/abs/2411.08791v1","updated":"2024-11-13T17:17:16Z","published":"2024-11-13T17:17:16Z","title":"Locally Private Sampling with Public Data","summary":" Local differential privacy (LDP) is increasingly employed in\nprivacy-preserving machine learning to protect user data before sharing it with\nan untrusted aggregator. Most LDP methods assume that users possess only a\nsingle data record, which is a significant limitation since users often gather\nextensive datasets (e.g., images, text, time-series data) and frequently have\naccess to public datasets. To address this limitation, we propose a locally\nprivate sampling framework that leverages both the private and public datasets\nof each user. Specifically, we assume each user has two distributions: $p$ and\n$q$ that represent their private dataset and the public dataset, respectively.\nThe objective is to design a mechanism that generates a private sample\napproximating $p$ while simultaneously preserving $q$. 
We frame this objective\nas a minimax optimization problem using $f$-divergence as the utility measure.\nWe fully characterize the minimax optimal mechanisms for general\n$f$-divergences provided that $p$ and $q$ are discrete distributions.\nRemarkably, we demonstrate that this optimal mechanism is universal across all\n$f$-divergences. Experiments validate the effectiveness of our minimax optimal\nsampler compared to the state-of-the-art locally private sampler.\n","authors":["Behnoosh Zamanlooy","Mario Diaz","Shahab Asoodeh"],"pdf_url":"https://arxiv.org/pdf/2411.08791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08790v1","updated":"2024-11-13T17:16:48Z","published":"2024-11-13T17:16:48Z","title":"Can sparse autoencoders be used to decompose and interpret steering\n vectors?","summary":" Steering vectors are a promising approach to control the behaviour of large\nlanguage models. However, their underlying mechanisms remain poorly understood.\nWhile sparse autoencoders (SAEs) may offer a potential method to interpret\nsteering vectors, recent findings show that SAE-reconstructed vectors often\nlack the steering properties of the original vectors. This paper investigates\nwhy directly applying SAEs to steering vectors yields misleading\ndecompositions, identifying two reasons: (1) steering vectors fall outside the\ninput distribution for which SAEs are designed, and (2) steering vectors can\nhave meaningful negative projections in feature directions, which SAEs are not\ndesigned to accommodate. 
These limitations hinder the direct use of SAEs for\ninterpreting steering vectors.\n","authors":["Harry Mayne","Yushi Yang","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.08790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19552v2","updated":"2024-11-13T17:12:34Z","published":"2024-09-29T04:41:10Z","title":"A Universal Deep Learning Framework for Materials X-ray Absorption\n Spectra","summary":" X-ray absorption spectroscopy (XAS) is a powerful characterization technique\nfor probing the local chemical environment of absorbing atoms. However,\nanalyzing XAS data presents significant challenges, often requiring extensive,\ncomputationally intensive simulations, as well as significant domain expertise.\nThese limitations hinder the development of fast, robust XAS analysis pipelines\nthat are essential in high-throughput studies and for autonomous\nexperimentation. We address these challenges with OmniXAS, a framework that\ncontains a suite of transfer learning approaches for XAS prediction, each\ncontributing to improved accuracy and efficiency, as demonstrated on K-edge\nspectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS\nframework is built upon three distinct strategies. First, we use M3GNet to\nderive latent representations of the local chemical environment of absorption\nsites as input for XAS prediction, achieving up to order-of-magnitude\nimprovements over conventional featurization techniques. Second, we employ a\nhierarchical transfer learning strategy, training a universal multi-task model\nacross elements before fine-tuning for element-specific predictions. Models\nbased on this cascaded approach after element-wise fine-tuning outperform\nelement-specific models by up to 69%. Third, we implement cross-fidelity\ntransfer learning, adapting a universal model to predict spectra generated by\nsimulation of a different fidelity with a higher computational cost. 
This\napproach improves prediction accuracy by up to 11% over models trained on the\ntarget fidelity alone. Our approach boosts the throughput of XAS modeling by\norders of magnitude versus first-principles simulations and is extendable to\nXAS prediction for a broader range of elements. This transfer learning\nframework is generalizable to enhance deep-learning models that target other\nproperties in materials research.\n","authors":["Shubha R. Kharel","Fanchen Meng","Xiaohui Qu","Matthew R. Carbone","Deyu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.19552v2.pdf","comment":"Main manuscript: 22 pages, 11 figures. Supplemental material (12\n pages, 6 figures) available as a separate file in arXiv ancillary files\n (additional downloadable files)"},{"id":"http://arxiv.org/abs/2402.03271v3","updated":"2024-11-13T17:10:20Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. 
In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09322v2","updated":"2024-11-13T17:08:34Z","published":"2024-06-13T17:00:30Z","title":"Active Inference Meeting Energy-Efficient Control of Parallel and\n Identical Machines","summary":" We investigate the application of active inference in developing\nenergy-efficient control agents for manufacturing systems. Active inference,\nrooted in neuroscience, provides a unified probabilistic framework integrating\nperception, learning, and action, with inherent uncertainty quantification\nelements. Our study explores deep active inference, an emerging field that\ncombines deep learning with the active inference decision-making framework.\nLeveraging a deep active inference agent, we focus on controlling parallel and\nidentical machine workstations to enhance energy efficiency. We address\nchallenges posed by the problem's stochastic nature and delayed policy response\nby introducing tailored enhancements to existing agent architectures.\nSpecifically, we introduce multi-step transition and hybrid horizon methods to\nmitigate the need for complex planning. 
Our experimental results demonstrate\nthe effectiveness of these enhancements and highlight the potential of the\nactive inference-based approach.\n","authors":["Yavar Taheri Yeganeh","Mohsen Jafari","Andrea Matta"],"pdf_url":"https://arxiv.org/pdf/2406.09322v2.pdf","comment":"Accepted at the 10th International Conference on Machine Learning,\n Optimization, and Data Science"},{"id":"http://arxiv.org/abs/2411.08773v1","updated":"2024-11-13T16:58:51Z","published":"2024-11-13T16:58:51Z","title":"Optimal Oblivious Subspace Embeddings with Near-optimal Sparsity","summary":" An oblivious subspace embedding is a random $m\\times n$ matrix $\\Pi$ such\nthat, for any $d$-dimensional subspace, with high probability $\\Pi$ preserves\nthe norms of all vectors in that subspace within a $1\\pm\\epsilon$ factor. In\nthis work, we give an oblivious subspace embedding with the optimal dimension\n$m=\\Theta(d/\\epsilon^2)$ that has a near-optimal sparsity of $\\tilde\nO(1/\\epsilon)$ non-zero entries per column of $\\Pi$. This is the first result\nto nearly match the conjecture of Nelson and Nguyen [FOCS 2013] in terms of the\nbest sparsity attainable by an optimal oblivious subspace embedding, improving\non a prior bound of $\\tilde O(1/\\epsilon^6)$ non-zeros per column [Chenakkod et\nal., STOC 2024]. We further extend our approach to the non-oblivious setting,\nproposing a new family of Leverage Score Sparsified embeddings with Independent\nColumns, which yield faster runtimes for matrix approximation and regression\ntasks.\n In our analysis, we develop a new method which uses a decoupling argument\ntogether with the cumulant method for bounding the edge universality error of\nisotropic random matrices. 
To achieve near-optimal sparsity, we combine this\ngeneral-purpose approach with new traces inequalities that leverage the\nspecific structure of our subspace embedding construction.\n","authors":["Shabarish Chenakkod","Michał Dereziński","Xiaoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2411.08773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08766v1","updated":"2024-11-13T16:52:30Z","published":"2024-11-13T16:52:30Z","title":"Mapping Methane -- The Impact of Dairy Farm Practices on Emissions\n Through Satellite Data and Machine Learning","summary":" This study investigates the correlation between dairy farm characteristics\nand methane concentrations as derived from satellite observations in Eastern\nCanada. Utilizing data from 11 dairy farms collected between January 2020 and\nDecember 2022, we integrated Sentinel-5P satellite methane data with critical\nfarm-level attributes, including herd genetics, feeding practices, and\nmanagement strategies. Initial analyses revealed significant correlations with\nmethane concentrations, leading to the application of Variance Inflation Factor\n(VIF) and Principal Component Analysis (PCA) to address multicollinearity and\nenhance model stability. Subsequently, machine learning models - specifically\nRandom Forest and Neural Networks - were employed to evaluate feature\nimportance and predict methane emissions. Our findings indicate a strong\nnegative correlation between the Estimated Breeding Value (EBV) for protein\npercentage and methane concentrations, suggesting that genetic selection for\nhigher milk protein content could be an effective strategy for emissions\nreduction. The integration of atmospheric transport models with satellite data\nfurther refined our emission estimates, significantly enhancing accuracy and\nspatial resolution. 
This research underscores the potential of advanced\nsatellite monitoring, machine learning techniques, and atmospheric modeling in\nimproving methane emission assessments within the dairy sector. It emphasizes\nthe critical role of farm-specific characteristics in developing effective\nmitigation strategies. Future investigations should focus on expanding the\ndataset and incorporating inversion modeling for more precise emission\nquantification. Balancing ecological impacts with economic viability will be\nessential for fostering sustainable dairy farming practices.\n","authors":["Hanqing Bi","Suresh Neethirajan"],"pdf_url":"https://arxiv.org/pdf/2411.08766v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.08764v1","updated":"2024-11-13T16:49:56Z","published":"2024-11-13T16:49:56Z","title":"Flow reconstruction in time-varying geometries using graph neural\n networks","summary":" The paper presents a Graph Attention Convolutional Network (GACN) for flow\nreconstruction from very sparse data in time-varying geometries. The model\nincorporates a feature propagation algorithm as a preprocessing step to handle\nextremely sparse inputs, leveraging information from neighboring nodes to\ninitialize missing features. In addition, a binary indicator is introduced as a\nvalidity mask to distinguish between the original and propagated data points,\nenabling more effective learning from sparse inputs. Trained on a unique data\nset of Direct Numerical Simulations (DNS) of a motored engine at a technically\nrelevant operating condition, the GACN shows robust performance across\ndifferent resolutions and domain sizes and can effectively handle unstructured\ndata and variable input sizes. The model is tested on previously unseen DNS\ndata as well as on an experimental data set from Particle Image Velocimetry\n(PIV) measurements that were not considered during training. 
A comparative\nanalysis shows that the GACN consistently outperforms both a conventional\nConvolutional Neural Network (CNN) and cubic interpolation methods on the DNS\nand PIV test sets by achieving lower reconstruction errors and better capturing\nfine-scale turbulent structures. In particular, the GACN effectively\nreconstructs flow fields from domains up to 14 times larger than those observed\nduring training, with the performance advantage increasing for larger domains.\n","authors":["Bogdan A. Danciu","Vito A. Pagone","Benjamin Böhm","Marius Schmidt","Christos E. Frouzakis"],"pdf_url":"https://arxiv.org/pdf/2411.08764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08760v1","updated":"2024-11-13T16:47:34Z","published":"2024-11-13T16:47:34Z","title":"Energy Dissipation Preserving Physics Informed Neural Network for\n Allen-Cahn Equations","summary":" This paper investigates a numerical solution of Allen-Cahn equation with\nconstant and degenerate mobility, with polynomial and logarithmic energy\nfunctionals, with deterministic and random initial functions, and with\nadvective term in one, two, and three spatial dimensions, based on the\nphysics-informed neural network (PINN). To improve the learning capacity of the\nPINN, we incorporate the energy dissipation property of the Allen-Cahn equation\nas a penalty term into the loss function of the network. To facilitate the\nlearning process of random initials, we employ a continuous analogue of the\ninitial random condition by utilizing the Fourier series expansion. Adaptive\nmethods from traditional numerical analysis are also integrated to enhance the\neffectiveness of the proposed PINN. 
Numerical results indicate a consistent\ndecrease in the discrete energy, while also revealing phenomena such as phase\nseparation and metastability.\n","authors":["Mustafa Kütük","Hamdullah Yücel"],"pdf_url":"https://arxiv.org/pdf/2411.08760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13150v2","updated":"2024-11-13T16:46:23Z","published":"2024-03-19T20:58:38Z","title":"On Training Survival Models with Scoring Rules","summary":" Scoring rules are an established way of comparing predictive performances\nacross model classes. In the context of survival analysis, they require\nadaptation in order to accommodate censoring. This work investigates using\nscoring rules for model training rather than evaluation. Doing so, we establish\na general framework for training survival models that is model agnostic and can\nlearn event time distributions parametrically or non-parametrically. In\naddition, our framework is not restricted to any specific scoring rule. While\nwe focus on neural network-based implementations, we also provide\nproof-of-concept implementations using gradient boosting, generalized additive\nmodels, and trees. Empirical comparisons on synthetic and real-world data\nindicate that scoring rules can be successfully incorporated into model\ntraining and yield competitive predictive performance with established\ntime-to-event models.\n","authors":["Philipp Kopper","David Rügamer","Raphael Sonabend","Bernd Bischl","Andreas Bender"],"pdf_url":"https://arxiv.org/pdf/2403.13150v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.02538v2","updated":"2024-11-13T16:44:07Z","published":"2024-07-01T23:24:05Z","title":"CGRclust: Chaos Game Representation for Twin Contrastive Clustering of\n Unlabelled DNA Sequences","summary":" This study proposes CGRclust, a novel combination of unsupervised twin\ncontrastive clustering of Chaos Game Representations (CGR) of DNA sequences,\nwith convolutional neural networks (CNNs). 
To the best of our knowledge,\nCGRclust is the first method to use unsupervised learning for image\nclassification (herein applied to two-dimensional CGR images) for clustering\ndatasets of DNA sequences. CGRclust overcomes the limitations of traditional\nsequence classification methods by leveraging unsupervised twin contrastive\nlearning to detect distinctive sequence patterns, without requiring DNA\nsequence alignment or biological/taxonomic labels. CGRclust accurately\nclustered twenty-five diverse datasets, with sequence lengths ranging from 664\nbp to 100 kbp, including mitochondrial genomes of fish, fungi, and protists, as\nwell as viral whole genome assemblies and synthetic DNA sequences. Compared\nwith three recent clustering methods for DNA sequences (DeLUCS, iDeLUCS, and\nMeShClust v3.0.), CGRclust is the only method that surpasses 81.70% accuracy\nacross all four taxonomic levels tested for mitochondrial DNA genomes of fish.\nMoreover, CGRclust also consistently demonstrates superior performance across\nall the viral genomic datasets. The high clustering accuracy of CGRclust on\nthese twenty-five datasets, which vary significantly in terms of sequence\nlength, number of genomes, number of clusters, and level of taxonomy,\ndemonstrates its robustness, scalability, and versatility.\n","authors":["Fatemeh Alipour","Kathleen A. Hill","Lila Kari"],"pdf_url":"https://arxiv.org/pdf/2407.02538v2.pdf","comment":"28 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.08758v1","updated":"2024-11-13T16:42:59Z","published":"2024-11-13T16:42:59Z","title":"ScaleNet: Scale Invariance Learning in Directed Graphs","summary":" Graph Neural Networks (GNNs) have advanced relational data analysis but lack\ninvariance learning techniques common in image classification. In node\nclassification with GNNs, it is actually the ego-graph of the center node that\nis classified. 
This research extends the scale invariance concept to node\nclassification by drawing an analogy to image processing: just as scale\ninvariance being used in image classification to capture multi-scale features,\nwe propose the concept of ``scaled ego-graphs''. Scaled ego-graphs generalize\ntraditional ego-graphs by replacing undirected single-edges with\n``scaled-edges'', which are ordered sequences of multiple directed edges. We\nempirically assess the performance of the proposed scale invariance in graphs\non seven benchmark datasets, across both homophilic and heterophilic\nstructures. Our scale-invariance-based graph learning outperforms inception\nmodels derived from random walks by being simpler, faster, and more accurate.\nThe scale invariance explains inception models' success on homophilic graphs\nand limitations on heterophilic graphs. To ensure applicability of inception\nmodel to heterophilic graphs as well, we further present ScaleNet, an\narchitecture that leverages multi-scaled features. ScaleNet achieves\nstate-of-the-art results on five out of seven datasets (four homophilic and one\nheterophilic) and matches top performance on the remaining two, demonstrating\nits excellent applicability. This represents a significant advance in graph\nlearning, offering a unified framework that enhances node classification across\nvarious graph types. 
Our code is available at\nhttps://github.com/Qin87/ScaleNet/tree/July25.\n","authors":["Qin Jiang","Chengjia Wang","Michael Lones","Wei Pang"],"pdf_url":"https://arxiv.org/pdf/2411.08758v1.pdf","comment":"Scale invariance in node classification is demonstrated and applied\n in graph transformation to develop ScaleNet, which achieves state-of-the-art\n performance on both homophilic and heterophilic directed graphs"},{"id":"http://arxiv.org/abs/2310.10545v3","updated":"2024-11-13T16:42:52Z","published":"2023-10-16T16:14:43Z","title":"Optimal vintage factor analysis with deflation varimax","summary":" Vintage factor analysis is one important type of factor analysis that aims to\nfirst find a low-dimensional representation of the original data, and then to\nseek a rotation such that the rotated low-dimensional representation is\nscientifically meaningful. The most widely used vintage factor analysis is the\nPrincipal Component Analysis (PCA) followed by the varimax rotation. Despite\nits popularity, little theoretical guarantee can be provided to date mainly\nbecause varimax rotation requires to solve a non-convex optimization over the\nset of orthogonal matrices.\n In this paper, we propose a deflation varimax procedure that solves each row\nof an orthogonal matrix sequentially. In addition to its net computational gain\nand flexibility, we are able to fully establish theoretical guarantees for the\nproposed procedure in a broader context. Adopting this new deflation varimax as\nthe second step after PCA, we further analyze this two step procedure under a\ngeneral class of factor models. Our results show that it estimates the factor\nloading matrix in the minimax optimal rate when the signal-to-noise-ratio (SNR)\nis moderate or large. In the low SNR regime, we offer possible improvement over\nusing PCA and the deflation varimax when the additive noise under the factor\nmodel is structured. The modified procedure is shown to be minimax optimal in\nall SNR regimes. 
Our theory is valid for finite sample and allows the number of\nthe latent factors to grow with the sample size as well as the ambient\ndimension to grow with, or even exceed, the sample size. Extensive simulation\nand real data analysis further corroborate our theoretical findings.\n","authors":["Xin Bing","Dian Jin","Yuqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.10545v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03679v6","updated":"2024-11-13T16:42:22Z","published":"2024-06-06T01:49:29Z","title":"On the Effects of Data Scale on UI Control Agents","summary":" Autonomous agents that control computer interfaces to accomplish human tasks\nare emerging. Leveraging LLMs to power such agents has been of special\ninterest, but unless fine-tuned on human-collected task demonstrations,\nperformance is still relatively low. In this work we study whether fine-tuning\nalone is a viable approach for building real-world computer control agents. In\nparticularly, we investigate how performance measured on both high and\nlow-level tasks in domain and out of domain scales as more training data is\ncollected. To this end we collect and release a new dataset, AndroidControl,\nconsisting of 15,283 demonstrations of everyday tasks with Android apps.\nCompared to existing datasets, each AndroidControl task instance includes both\nhigh and low-level human-generated instructions, allowing us to explore the\nlevel of task complexity an agent can handle. Moreover, AndroidControl is the\nmost diverse computer control dataset to date, including 14,548 unique tasks\nover 833 Android apps, thus allowing us to conduct in-depth analysis of the\nmodel performance in and out of the domain of the training data. Using the\ndataset, we find that when tested in domain fine-tuned models outperform zero\nand few-shot baselines and scale in such a way that robust performance might\nfeasibly be obtained simply by collecting more data. 
Out of domain, performance\nscales significantly more slowly and suggests that in particular for high-level\ntasks, fine-tuning on more data alone may be insufficient for achieving robust\nout-of-domain performance.\n","authors":["Wei Li","William Bishop","Alice Li","Chris Rawles","Folawiyo Campbell-Ajala","Divya Tyamagundlu","Oriana Riva"],"pdf_url":"https://arxiv.org/pdf/2406.03679v6.pdf","comment":"NeurIPS 2024 (Datasets and Benchmarks)"},{"id":"http://arxiv.org/abs/2404.10420v3","updated":"2024-11-13T16:42:16Z","published":"2024-04-16T09:37:41Z","title":"AudioProtoPNet: An interpretable deep learning model for bird sound\n classification","summary":" Deep learning models have significantly advanced acoustic bird monitoring by\nbeing able to recognize numerous bird species based on their vocalizations.\nHowever, traditional deep learning models are black boxes that provide no\ninsight into their underlying computations, limiting their usefulness to\nornithologists and machine learning engineers. Explainable models could\nfacilitate debugging, knowledge discovery, trust, and interdisciplinary\ncollaboration. This study introduces AudioProtoPNet, an adaptation of the\nPrototypical Part Network (ProtoPNet) for multi-label bird sound\nclassification. It is an inherently interpretable model that uses a ConvNeXt\nbackbone to extract embeddings, with the classification layer replaced by a\nprototype learning classifier trained on these embeddings. The classifier\nlearns prototypical patterns of each bird species' vocalizations from\nspectrograms of training instances. During inference, audio recordings are\nclassified by comparing them to the learned prototypes in the embedding space,\nproviding explanations for the model's decisions and insights into the most\ninformative embeddings of each bird species. The model was trained on the\nBirdSet training dataset, which consists of 9,734 bird species and over 6,800\nhours of recordings. 
Its performance was evaluated on the seven test datasets\nof BirdSet, covering different geographical regions. AudioProtoPNet\noutperformed the state-of-the-art model Perch, achieving an average AUROC of\n0.90 and a cmAP of 0.42, with relative improvements of 7.1% and 16.7% over\nPerch, respectively. These results demonstrate that even for the challenging\ntask of multi-label bird sound classification, it is possible to develop\npowerful yet inherently interpretable deep learning models that provide\nvaluable insights for ornithologists and machine learning engineers.\n","authors":["René Heinrich","Lukas Rauch","Bernhard Sick","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2404.10420v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.08755v1","updated":"2024-11-13T16:33:27Z","published":"2024-11-13T16:33:27Z","title":"Weakly-Supervised Anomaly Detection in Surveillance Videos Based on\n Two-Stream I3D Convolution Network","summary":" The widespread implementation of urban surveillance systems has necessitated\nmore sophisticated techniques for anomaly detection to ensure enhanced public\nsafety. This paper presents a significant advancement in the field of anomaly\ndetection through the application of Two-Stream Inflated 3D (I3D) Convolutional\nNetworks. These networks substantially outperform traditional 3D Convolutional\nNetworks (C3D) by more effectively extracting spatial and temporal features\nfrom surveillance videos, thus improving the precision of anomaly detection.\nOur research advances the field by implementing a weakly supervised learning\nframework based on Multiple Instance Learning (MIL), which uniquely\nconceptualizes surveillance videos as collections of 'bags' that contain\ninstances (video clips). Each instance is innovatively processed through a\nranking mechanism that prioritizes clips based on their potential to display\nanomalies. 
This novel strategy not only enhances the accuracy and precision of\nanomaly detection but also significantly diminishes the dependency on extensive\nmanual annotations. Moreover, through meticulous optimization of model\nsettings, including the choice of optimizer, our approach not only establishes\nnew benchmarks in the performance of anomaly detection systems but also offers\na scalable and efficient solution for real-world surveillance applications.\nThis paper contributes significantly to the field of computer vision by\ndelivering a more adaptable, efficient, and context-aware anomaly detection\nsystem, which is poised to redefine practices in urban surveillance.\n","authors":["Sareh Soltani Nejad","Anwar Haque"],"pdf_url":"https://arxiv.org/pdf/2411.08755v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.08750v1","updated":"2024-11-13T16:29:33Z","published":"2024-11-13T16:29:33Z","title":"Optimal Transport-Based Displacement Interpolation with Data\n Augmentation for Reduced Order Modeling of Nonlinear Dynamical Systems","summary":" We present a novel reduced-order Model (ROM) that leverages optimal transport\n(OT) theory and displacement interpolation to enhance the representation of\nnonlinear dynamics in complex systems. While traditional ROM techniques face\nchallenges in this scenario, especially when data (i.e., observational\nsnapshots) is limited, our method addresses these issues by introducing a data\naugmentation strategy based on OT principles. The proposed framework generates\ninterpolated solutions tracing geodesic paths in the space of probability\ndistributions, enriching the training dataset for the ROM. A key feature of our\napproach is its ability to provide a continuous representation of the\nsolution's dynamics by exploiting a virtual-to-real time mapping. This enables\nthe reconstruction of solutions at finer temporal scales than those provided by\nthe original data. 
To further improve prediction accuracy, we employ Gaussian\nProcess Regression to learn the residual and correct the representation between\nthe interpolated snapshots and the physical solution. We demonstrate the\neffectiveness of our methodology with atmospheric mesoscale benchmarks\ncharacterized by highly nonlinear, advection-dominated dynamics. Our results\nshow improved accuracy and efficiency in predicting complex system behaviors,\nindicating the potential of this approach for a wide range of applications in\ncomputational physics and engineering.\n","authors":["Moaad Khamlich","Federico Pichi","Michele Girfoglio","Annalisa Quaini","Gianluigi Rozza"],"pdf_url":"https://arxiv.org/pdf/2411.08750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08739v1","updated":"2024-11-13T16:18:57Z","published":"2024-11-13T16:18:57Z","title":"Bayesian Comparisons Between Representations","summary":" Which neural networks are similar is a fundamental question for both machine\nlearning and neuroscience. Our novel method compares representations based on\nBayesian statistics about linear readouts from the representations. Concretely,\nwe suggest to use the total variation distance or Jensen-Shannon distance\nbetween prior predictive distributions to compare representations. The prior\npredictive distribution is a full description of the inductive bias and\ngeneralization of a model in Bayesian statistics, making it a great basis for\ncomparisons. As Jensen-Shannon distance and total variation distance are\nmetrics our dissimilarity measures are pseudo-metrics for representations. For\na linear readout, our metrics just depend on the linear kernel matrix of the\nrepresentations. Thus, our metrics connects linear read-out based comparisons\nto kernel based metrics like centered kernel alignment and representational\nsimilarity analysis. We apply our new metrics to deep neural networks trained\non ImageNet-1k. 
Our new metrics can be computed efficiently including a\nstochastic gradient without dimensionality reductions of the representations.\nIt broadly agrees with existing metrics, but is more stringent. It varies less\nacross different random image samples, and it measures how well two\nrepresentations could be distinguished based on a linear read out. Thus our\nmetric nicely extends our toolkit for comparing representations.\n","authors":["Heiko H. Schütt"],"pdf_url":"https://arxiv.org/pdf/2411.08739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08734v1","updated":"2024-11-13T16:16:22Z","published":"2024-11-13T16:16:22Z","title":"Recommender systems and reinforcement learning for building control and\n occupant interaction: A text-mining driven review of scientific literature","summary":" The indoor environment greatly affects health and well-being; enhancing\nhealth and reducing energy use in these settings is a key research focus. With\nadvancing Information and Communication Technology (ICT), recommendation\nsystems and reinforcement learning have emerged as promising methods to induce\nbehavioral changes that improve indoor environments and building energy\nefficiency. This study employs text-mining and Natural Language Processing\n(NLP) to examine these approaches in building control and occupant interaction.\nAnalyzing approximately 27,000 articles from the ScienceDirect database, we\nfound extensive use of recommendation systems and reinforcement learning for\nspace optimization, location recommendations, and personalized control\nsuggestions. Despite broad applications, their use in optimizing indoor\nenvironments and energy efficiency is limited. Traditional recommendation\nalgorithms are commonly used, but optimizing indoor conditions and energy\nefficiency often requires advanced machine learning techniques like\nreinforcement and deep learning. 
This review highlights the potential for\nexpanding recommender systems and reinforcement learning applications in\nbuildings and indoor environments. Areas for innovation include predictive\nmaintenance, building-related product recommendations, and optimizing\nenvironments for specific needs like sleep and productivity enhancements based\non user feedback.\n","authors":["Wenhao Zhang","Matias Quintana","Clayton Miller"],"pdf_url":"https://arxiv.org/pdf/2411.08734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12763v3","updated":"2024-11-13T16:07:47Z","published":"2024-06-18T16:30:51Z","title":"Implicit Bias of Mirror Flow on Separable Data","summary":" We examine the continuous-time counterpart of mirror descent, namely mirror\nflow, on classification problems which are linearly separable. Such problems\nare minimised `at infinity' and have many possible solutions; we study which\nsolution is preferred by the algorithm depending on the mirror potential. For\nexponential tailed losses and under mild assumptions on the potential, we show\nthat the iterates converge in direction towards a $\\phi_\\infty$-maximum margin\nclassifier. The function $\\phi_\\infty$ is the \\textit{horizon function} of the\nmirror potential and characterises its shape `at infinity'. When the potential\nis separable, a simple formula allows to compute this function. We analyse\nseveral examples of potentials and provide numerical experiments highlighting\nour results.\n","authors":["Scott Pesme","Radu-Alexandru Dragomir","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2406.12763v3.pdf","comment":"Neurips camera ready. 
Minor changes from the previous versions.\n Mainly added full iterate trajectories (Figure 4)"},{"id":"http://arxiv.org/abs/2307.05284v4","updated":"2024-11-13T15:53:37Z","published":"2023-07-11T14:25:10Z","title":"Rethinking Distribution Shifts: Empirical Analysis and Inductive\n Modeling for Tabular Data","summary":" Different distribution shifts require different interventions, and algorithms\nmust be grounded in the specific shifts they address. However, methodological\ndevelopment for robust algorithms typically relies on structural assumptions\nthat lack empirical validation. Advocating for an empirically grounded\ndata-driven approach to research, we build an empirical testbed comprising\nnatural shifts across 5 tabular datasets and 60,000 method configurations\nencompassing imbalanced learning and distributionally robust optimization (DRO)\nmethods. We find $Y|X$-shifts are most prevalent on our testbed, in stark\ncontrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The\nperformance of robust algorithms varies significantly over shift types, and is\nno better than that of vanilla methods. To understand why, we conduct an\nin-depth empirical analysis of DRO methods and find that although often\nneglected by researchers, implementation details -- such as the choice of\nunderlying model class (e.g., XGBoost) and hyperparameter selection -- have a\nbigger impact on performance than the ambiguity set or its radius. 
To further\nbridge that gap between methodological research and practice, we design case\nstudies that illustrate how such a data-driven, inductive understanding of\ndistribution shifts can enhance both data-centric and algorithmic\ninterventions.\n","authors":["Jiashuo Liu","Tianyu Wang","Peng Cui","Hongseok Namkoong"],"pdf_url":"https://arxiv.org/pdf/2307.05284v4.pdf","comment":"Conference version appeared in NeurIPS 2023, previously titled \"On\n the Need for a Language Describing Distribution Shifts: Illustrations on\n Tabular Datasets\""},{"id":"http://arxiv.org/abs/2411.08706v1","updated":"2024-11-13T15:50:32Z","published":"2024-11-13T15:50:32Z","title":"Searching Latent Program Spaces","summary":" Program synthesis methods aim to automatically generate programs restricted\nto a language that can explain a given specification of input-output pairs.\nWhile purely symbolic approaches suffer from a combinatorial search space,\nrecent methods leverage neural networks to learn distributions over program\nstructures to narrow this search space significantly, enabling more efficient\nsearch. However, for challenging problems, it remains difficult to train models\nto perform program synthesis in one shot, making test-time search essential.\nMost neural methods lack structured search mechanisms during inference, relying\ninstead on stochastic sampling or gradient updates, which can be inefficient.\nIn this work, we propose the Latent Program Network (LPN), a general algorithm\nfor program induction that learns a distribution over latent programs in a\ncontinuous space, enabling efficient search and test-time adaptation. We\nexplore how to train these networks to optimize for test-time computation and\ndemonstrate the use of gradient-based search both during training and at test\ntime. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates\nperformance by generalizing programs to new inputs rather than explaining the\nunderlying specification. 
We show that LPN can generalize beyond its training\ndistribution and adapt to unseen tasks by utilizing test-time computation,\noutperforming algorithms without test-time adaptation mechanisms.\n","authors":["Clément Bonnet","Matthew V Macfarlane"],"pdf_url":"https://arxiv.org/pdf/2411.08706v1.pdf","comment":"Code available at https://github.com/clement-bonnet/lpn"},{"id":"http://arxiv.org/abs/2408.00838v2","updated":"2024-11-13T15:48:34Z","published":"2024-08-01T18:00:05Z","title":"Calibrating Bayesian Generative Machine Learning for Bayesiamplification","summary":" Recently, combinations of generative and Bayesian machine learning have been\nintroduced in particle physics for both fast detector simulation and inference\ntasks. These neural networks aim to quantify the uncertainty on the generated\ndistribution originating from limited training statistics. The interpretation\nof a distribution-wide uncertainty however remains ill-defined. We show a clear\nscheme for quantifying the calibration of Bayesian generative machine learning\nmodels. For a Continuous Normalizing Flow applied to a low-dimensional toy\nexample, we evaluate the calibration of Bayesian uncertainties from either a\nmean-field Gaussian weight posterior, or Monte Carlo sampling network weights,\nto gauge their behaviour on unsteady distribution edges. 
Well calibrated\nuncertainties can then be used to roughly estimate the number of uncorrelated\ntruth samples that are equivalent to the generated sample and clearly indicate\ndata amplification for smooth features of the distribution.\n","authors":["Sebastian Bieringer","Sascha Diefenbacher","Gregor Kasieczka","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2408.00838v2.pdf","comment":"15 pages, 6 figures, updated references, fixed typo"},{"id":"http://arxiv.org/abs/2411.08703v1","updated":"2024-11-13T15:45:46Z","published":"2024-11-13T15:45:46Z","title":"MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics\n Classification","summary":" The distinct characteristics of multiomics data, including complex\ninteractions within and across biological layers and disease heterogeneity\n(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop\nnovel designs to address unique challenges in multiomics prediction. In this\npaper, we propose the multi-view knowledge transfer learning (MVKTrans)\nframework, which transfers intra- and inter-omics knowledge in an adaptive\nmanner by reviewing data heterogeneity and suppressing bias transfer, thereby\nenhancing classification performance. Specifically, we design a graph\ncontrastive module that is trained on unlabeled data to effectively learn and\ntransfer the underlying intra-omics patterns to the supervised task. This\nunsupervised pretraining promotes learning general and unbiased representations\nfor each modality, regardless of the downstream tasks. In light of the varying\ndiscriminative capacities of modalities across different diseases and/or\nsamples, we introduce an adaptive and bi-directional cross-omics distillation\nmodule. This module automatically identifies richer modalities and facilitates\ndynamic knowledge transfer from more informative to less informative omics,\nthereby enabling a more robust and generalized integration. 
Extensive\nexperiments on four real biomedical datasets demonstrate the superior\nperformance and robustness of MVKTrans compared to the state-of-the-art. Code\nand data are available at https://github.com/Yaolab-fantastic/MVKTrans.\n","authors":["Shan Cong","Zhiling Sang","Hongwei Liu","Haoran Luo","Xin Wang","Hong Liang","Jie Hao","Xiaohui Yao"],"pdf_url":"https://arxiv.org/pdf/2411.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08701v1","updated":"2024-11-13T15:42:28Z","published":"2024-11-13T15:42:28Z","title":"TRACE: Transformer-based Risk Assessment for Clinical Evaluation","summary":" We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation),\na novel method for clinical risk assessment based on clinical data, leveraging\nthe self-attention mechanism for enhanced feature interaction and result\ninterpretation. Our approach is able to handle different data modalities,\nincluding continuous, categorical and multiple-choice (checkbox) attributes.\nThe proposed architecture features a shared representation of the clinical data\nobtained by integrating specialized embeddings of each data modality, enabling\nthe detection of high-risk individuals using Transformer encoder layers. To\nassess the effectiveness of the proposed method, a strong baseline based on\nnon-negative multi-layer perceptrons (MLPs) is introduced. The proposed method\noutperforms various baselines widely used in the domain of clinical risk\nassessment, while effectively handling missing values. 
In terms of\nexplainability, our Transformer-based method offers easily interpretable\nresults via attention weights, further enhancing the clinicians'\ndecision-making process.\n","authors":["Dionysis Christopoulos","Sotiris Spanos","Valsamis Ntouskos","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2411.08701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08700v1","updated":"2024-11-13T15:42:13Z","published":"2024-11-13T15:42:13Z","title":"Rethinking negative sampling in content-based news recommendation","summary":" News recommender systems are hindered by the brief lifespan of articles, as\nthey undergo rapid relevance decay. Recent studies have demonstrated the\npotential of content-based neural techniques in tackling this problem. However,\nthese models often involve complex neural architectures and often lack\nconsideration for negative examples. In this study, we posit that the careful\nsampling of negative examples has a big impact on the model's outcome. We\ndevise a negative sampling technique that not only improves the accuracy of the\nmodel but also facilitates the decentralization of the recommendation system.\nThe experimental results obtained using the MIND dataset demonstrate that the\naccuracy of the method under consideration can compete with that of\nState-of-the-Art models. The utilization of the sampling technique is essential\nin reducing model complexity and accelerating the training process, while\nmaintaining a high level of accuracy. 
Finally, we discuss how decentralized\nmodels can help improve privacy and scalability.\n","authors":["Miguel Ângelo Rebelo","João Vinagre","Ivo Pereira","Álvaro Figueira"],"pdf_url":"https://arxiv.org/pdf/2411.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08699v1","updated":"2024-11-13T15:42:09Z","published":"2024-11-13T15:42:09Z","title":"FedSub: Introducing class-aware Subnetworks Fusion to Enhance\n Personalized Federated Learning in Ubiquitous Systems","summary":" Personalized Federated Learning is essential in AI-driven ubiquitous systems,\nsupporting the distributed development of models able to adapt to diverse and\nevolving user behaviors while safeguarding privacy. Despite addressing\nheterogeneous user data distributions in collaborative model training, existing\nmethods often face limitations balancing personalization and generalization,\noversimplifying user similarities, or relying heavily on global models. In this\npaper, we propose FedSub, a novel federated approach designed to enhance\npersonalization through the use of class-aware prototypes and model\nsubnetworks. Prototypes serve as compact representations of user data,\nclustered on the server to identify similarities based on specific label\npatterns. Concurrently, subnetworks -- model components necessary to process\neach class -- are extracted locally and fused by the server according to these\nclusters, producing highly tailored model updates for each user. This\nfine-grained, class-specific aggregation of clients' models allows FedSub to\ncapture the unique characteristics of individual user data patterns. The\neffectiveness of FedSub is validated in three real-world scenarios\ncharacterized by high data heterogeneity, derived from human activity\nrecognition and mobile health applications. 
Experimental evaluations\ndemonstrate FedSub's performance improvements with respect to the\nstate-of-the-art and significant advancements in personalization for ubiquitous\nsystems based on personal mobile and wearable devices.\n","authors":["Mattia Giovanni Campana","Franca Delmastro"],"pdf_url":"https://arxiv.org/pdf/2411.08699v1.pdf","comment":"Submitted to Proceedings of the ACM on Interactive, Mobile, Wearable\n and Ubiquitous Technologies (IMWUT)"},{"id":"http://arxiv.org/abs/2405.15732v2","updated":"2024-11-13T15:30:50Z","published":"2024-05-24T17:20:18Z","title":"Neural Persistence Dynamics","summary":" We consider the problem of learning the dynamics in the topology of\ntime-evolving point clouds, the prevalent spatiotemporal model for systems\nexhibiting collective behavior, such as swarms of insects and birds or\nparticles in physics. In such systems, patterns emerge from (local)\ninteractions among self-propelled entities. While several well-understood\ngoverning equations for motion and interaction exist, they are notoriously\ndifficult to fit to data, as most prior work requires knowledge about\nindividual motion trajectories, i.e., a requirement that is challenging to\nsatisfy with an increasing number of entities. To evade such confounding\nfactors, we investigate collective behavior from a $\\textit{topological\nperspective}$, but instead of summarizing entire observation sequences (as done\npreviously), we propose learning a latent dynamical model from topological\nfeatures $\\textit{per time point}$. The latter is then used to formulate a\ndownstream regression task to predict the parametrization of some a priori\nspecified governing equation. We implement this idea based on a latent ODE\nlearned from vectorized (static) persistence diagrams and show that a\ncombination of recent stability results for persistent homology justifies this\nmodeling choice. 
Various (ablation) experiments not only demonstrate the\nrelevance of each model component but provide compelling empirical evidence\nthat our proposed model - $\\textit{Neural Persistence Dynamics}$ -\nsubstantially outperforms the state-of-the-art across a diverse set of\nparameter regression tasks.\n","authors":["Sebastian Zeng","Florian Graf","Martin Uray","Stefan Huber","Roland Kwitt"],"pdf_url":"https://arxiv.org/pdf/2405.15732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08687v1","updated":"2024-11-13T15:22:33Z","published":"2024-11-13T15:22:33Z","title":"Measuring similarity between embedding spaces using induced neighborhood\n graphs","summary":" Deep Learning techniques have excelled at generating embedding spaces that\ncapture semantic similarities between items. Often these representations are\npaired, enabling experiments with analogies (pairs within the same domain) and\ncross-modality (pairs across domains). These experiments are based on specific\nassumptions about the geometry of embedding spaces, which allow finding paired\nitems by extrapolating the positional relationships between embedding pairs in\nthe training dataset, allowing for tasks such as finding new analogies, and\nmultimodal zero-shot classification. In this work, we propose a metric to\nevaluate the similarity between paired item representations. Our proposal is\nbuilt from the structural similarity between the nearest-neighbors induced\ngraphs of each representation, and can be configured to compare spaces based on\ndifferent distance metrics and on different neighborhood sizes. We demonstrate\nthat our proposal can be used to identify similar structures at different\nscales, which is hard to achieve with kernel methods such as Centered Kernel\nAlignment (CKA). We further illustrate our method with two case studies: an\nanalogy task using GloVe embeddings, and zero-shot classification in the\nCIFAR-100 dataset using CLIP embeddings. 
Our results show that accuracy in both\nanalogy and zero-shot classification tasks correlates with the embedding\nsimilarity. These findings can help explain performance differences in these\ntasks, and may lead to improved design of paired-embedding models in the\nfuture.\n","authors":["Tiago F. Tavares","Fabio Ayres","Paris Smaragdis"],"pdf_url":"https://arxiv.org/pdf/2411.08687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09604v2","updated":"2024-11-13T15:17:20Z","published":"2024-08-18T22:11:24Z","title":"Circuit design in biology and machine learning. I. Random networks and\n dimensional reduction","summary":" A biological circuit is a neural or biochemical cascade, taking inputs and\nproducing outputs. How have biological circuits learned to solve environmental\nchallenges over the history of life? The answer certainly follows Dobzhansky's\nfamous quote that ``nothing in biology makes sense except in the light of\nevolution.'' But that quote leaves out the mechanistic basis by which natural\nselection's trial-and-error learning happens, which is exactly what we have to\nunderstand. How does the learning process that designs biological circuits\nactually work? How much insight can we gain about the form and function of\nbiological circuits by studying the processes that have made those circuits?\nBecause life's circuits must often solve the same problems as those faced by\nmachine learning, such as environmental tracking, homeostatic control,\ndimensional reduction, or classification, we can begin by considering how\nmachine learning designs computational circuits to solve problems. We can then\nask: How much insight do those computational circuits provide about the design\nof biological circuits? How much does biology differ from computers in the\nparticular circuit designs that it uses to solve problems? 
This article steps\nthrough two classic machine learning models to set the foundation for analyzing\nbroad questions about the design of biological circuits. One insight is the\nsurprising power of randomly connected networks. Another is the central role of\ninternal models of the environment embedded within biological circuits,\nillustrated by a model of dimensional reduction and trend prediction. Overall,\nmany challenges in biology have machine learning analogs, suggesting hypotheses\nabout how biology's circuits are designed.\n","authors":["Steven A. Frank"],"pdf_url":"https://arxiv.org/pdf/2408.09604v2.pdf","comment":"Added background info in two text boxes and new figure, edited\n throughout"},{"id":"http://arxiv.org/abs/2402.16187v3","updated":"2024-11-13T15:14:38Z","published":"2024-02-25T20:24:07Z","title":"No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design\n Choices","summary":" Advances in generative models have made it possible for AI-generated text,\ncode, and images to mirror human-generated content in many applications.\nWatermarking, a technique that aims to embed information in the output of a\nmodel to verify its source, is useful for mitigating the misuse of such\nAI-generated content. However, we show that common design choices in LLM\nwatermarking schemes make the resulting systems surprisingly susceptible to\nattack -- leading to fundamental trade-offs in robustness, utility, and\nusability. 
To navigate these trade-offs, we rigorously study a set of simple\nyet effective attacks on common watermarking systems, and propose guidelines\nand defenses for LLM watermarking in practice.\n","authors":["Qi Pang","Shengyuan Hu","Wenting Zheng","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2402.16187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08664v1","updated":"2024-11-13T14:55:08Z","published":"2024-11-13T14:55:08Z","title":"UniMat: Unifying Materials Embeddings through Multi-modal Learning","summary":" Materials science datasets are inherently heterogeneous and are available in\ndifferent modalities such as characterization spectra, atomic structures,\nmicroscopic images, and text-based synthesis conditions. The advancements in\nmulti-modal learning, particularly in vision and language models, have opened\nnew avenues for integrating data in different forms. In this work, we evaluate\ncommon techniques in multi-modal learning (alignment and fusion) in unifying\nsome of the most important modalities in materials science: atomic structure,\nX-ray diffraction patterns (XRD), and composition. We show that structure graph\nmodality can be enhanced by aligning with XRD patterns. Additionally, we show\nthat aligning and fusing more experimentally accessible data formats, such as\nXRD patterns and compositions, can create more robust joint embeddings than\nindividual modalities across various tasks. This lays the groundwork for future\nstudies aiming to exploit the full potential of multi-modal data in materials\nscience, facilitating more informed decision-making in materials design and\ndiscovery.\n","authors":["Janghoon Ock","Joseph Montoya","Daniel Schweigert","Linda Hung","Santosh K. 
Suram","Weike Ye"],"pdf_url":"https://arxiv.org/pdf/2411.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13224v4","updated":"2024-11-13T14:54:18Z","published":"2024-02-20T18:37:11Z","title":"Controlling Large Electric Vehicle Charging Stations via User Behavior\n Modeling and Stochastic Programming","summary":" This paper introduces an Electric Vehicle Charging Station (EVCS) model that\nincorporates real-world constraints, such as slot power limitations, contract\nthreshold overruns penalties, or early disconnections of electric vehicles\n(EVs). We propose a formulation of the problem of EVCS control under\nuncertainty, and implement two Multi-Stage Stochastic Programming approaches\nthat leverage user-provided information, namely, Model Predictive Control and\nTwo-Stage Stochastic Programming. The model addresses uncertainties in charging\nsession start and end times, as well as in energy demand. A user's behavior\nmodel based on a sojourn-time-dependent stochastic process enhances cost\nreduction while maintaining customer satisfaction. The benefits of the two\nproposed methods are showcased against two baselines over a 22-day simulation\nusing a real-world dataset. The two-stage approach demonstrates robustness\nagainst early disconnections by considering a wider range of uncertainty\nscenarios for optimization. The algorithm prioritizing user satisfaction over\nelectricity cost achieves a 20% and 36% improvement in two user satisfaction\nmetrics compared to an industry-standard baseline. 
Additionally, the algorithm\nstriking the best balance between cost and user satisfaction exhibits a mere 3%\nrelative cost increase compared to the theoretically optimal baseline - for\nwhich the nonanticipativity constraint is relaxed - while attaining 94% and 84%\nof the user satisfaction performance in the two used satisfaction metrics.\n","authors":["Alban Puech","Tristan Rigaut","William Templier","Maud Tournoud"],"pdf_url":"https://arxiv.org/pdf/2402.13224v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08652v1","updated":"2024-11-13T14:42:32Z","published":"2024-11-13T14:42:32Z","title":"Accelerating Quasi-Static Time Series Simulations with Foundation Models","summary":" Quasi-static time series (QSTS) simulations have great potential for\nevaluating the grid's ability to accommodate the large-scale integration of\ndistributed energy resources. However, as grids expand and operate closer to\ntheir limits, iterative power flow solvers, central to QSTS simulations, become\ncomputationally prohibitive and face increasing convergence issues. Neural\npower flow solvers provide a promising alternative, speeding up power flow\ncomputations by 3 to 4 orders of magnitude, though they are costly to train. In\nthis paper, we envision how recently introduced grid foundation models could\nimprove the economic viability of neural power flow solvers. Conceptually,\nthese models amortize training costs by serving as a foundation for a range of\ngrid operation and planning tasks beyond power flow solving, with only minimal\nfine-tuning required. We call for collaboration between the AI and power grid\ncommunities to develop and open-source these models, enabling all operators,\neven those with limited resources, to benefit from AI without building\nsolutions from scratch.\n","authors":["Alban Puech","François Mirallès","Jonas Weiss","Vincent Mai","Alexandre Blondin Massé","Martin de Montigny","Thomas Brunschwiler","Hendrik F. 
Hamann"],"pdf_url":"https://arxiv.org/pdf/2411.08652v1.pdf","comment":"Equal contributors: A.P. and F.M.; Lead contact: A.P"},{"id":"http://arxiv.org/abs/2306.16028v2","updated":"2024-11-13T14:41:20Z","published":"2023-06-28T08:55:56Z","title":"Exponential separations between classical and quantum learners","summary":" Despite significant effort, the quantum machine learning community has only\ndemonstrated quantum learning advantages for artificial cryptography-inspired\ndatasets when dealing with classical data. In this paper we address the\nchallenge of finding learning problems where quantum learning algorithms can\nachieve a provable exponential speedup over classical learning algorithms. We\nreflect on computational learning theory concepts related to this question and\ndiscuss how subtle differences in definitions can result in significantly\ndifferent requirements and tasks for the learner to meet and solve. We examine\nexisting learning problems with provable quantum speedups and find that they\nlargely rely on the classical hardness of evaluating the function that\ngenerates the data, rather than identifying it. To address this, we present two\nnew learning separations where the classical difficulty primarily lies in\nidentifying the function generating the data. Furthermore, we explore\ncomputational hardness assumptions that can be leveraged to prove quantum\nspeedups in scenarios where data is quantum-generated, which implies likely\nquantum advantages in a plethora of more natural settings (e.g., in condensed\nmatter and high energy physics). 
We also discuss the limitations of the\nclassical shadow paradigm in the context of learning separations, and how\nphysically-motivated settings such as characterizing phases of matter and\nHamiltonian learning fit in the computational learning framework.\n","authors":["Casper Gyurik","Vedran Dunjko"],"pdf_url":"https://arxiv.org/pdf/2306.16028v2.pdf","comment":"this article supersedes arXiv:2208.06339"},{"id":"http://arxiv.org/abs/2411.08651v1","updated":"2024-11-13T14:40:51Z","published":"2024-11-13T14:40:51Z","title":"Estimating unknown parameters in differential equations with a\n reinforcement learning based PSO method","summary":" Differential equations offer a foundational yet powerful framework for\nmodeling interactions within complex dynamic systems and are widely applied\nacross numerous scientific fields. One common challenge in this area is\nestimating the unknown parameters of these dynamic relationships. However,\ntraditional numerical optimization methods rely on the selection of initial\nparameter values, making them prone to local optima. Meanwhile, deep learning\nand Bayesian methods require training models on specific differential\nequations, resulting in poor versatility. This paper reformulates the parameter\nestimation problem of differential equations as an optimization problem by\nintroducing the concept of particles from the particle swarm optimization\nalgorithm. Building on reinforcement learning-based particle swarm optimization\n(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown\nparameters of differential equations. We compared its performance on three\ntypical ordinary differential equations with the state-of-the-art methods,\nincluding the RLLPSO algorithm, traditional numerical methods, deep learning\napproaches, and Bayesian methods. 
The experimental results demonstrate that our\nDERLPSO consistently outperforms other methods in terms of performance,\nachieving an average Mean Square Error of 1.13e-05, which reduces the error by\napproximately 4 orders of magnitude compared to other methods. Apart from\nordinary differential equations, our DERLPSO also show great promise for\nestimating unknown parameters of partial differential equations. The DERLPSO\nmethod proposed in this paper has high accuracy, is independent of initial\nparameter values, and possesses strong versatility and stability. This work\nprovides new insights into unknown parameter estimation for differential\nequations.\n","authors":["Wenkui Sun","Xiaoya Fan","Lijuan Jia","Tinyi Chu","Shing-Tung Yau","Rongling Wu","Zhong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07444v2","updated":"2024-11-13T14:39:42Z","published":"2023-11-13T16:18:58Z","title":"On the Robustness of Neural Collapse and the Neural Collapse of\n Robustness","summary":" Neural Collapse refers to the curious phenomenon in the end of training of a\nneural network, where feature vectors and classification weights converge to a\nvery simple geometrical arrangement (a simplex). While it has been observed\nempirically in various cases and has been theoretically motivated, its\nconnection with crucial properties of neural networks, like their\ngeneralization and robustness, remains unclear. In this work, we study the\nstability properties of these simplices. We find that the simplex structure\ndisappears under small adversarial attacks, and that perturbed examples \"leap\"\nbetween simplex vertices. 
We further analyze the geometry of networks that are\noptimized to be robust against adversarial perturbations of the input, and find\nthat Neural Collapse is a pervasive phenomenon in these cases as well, with\nclean and perturbed representations forming aligned simplices, and giving rise\nto a robust simple nearest-neighbor classifier. By studying the propagation of\nthe amount of collapse inside the network, we identify novel properties of both\nrobust and non-robust machine learning models, and show that earlier, unlike\nlater layers maintain reliable simplices on perturbed data. Our code is\navailable at https://github.com/JingtongSu/robust_neural_collapse .\n","authors":["Jingtong Su","Ya Shi Zhang","Nikolaos Tsilivis","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2311.07444v2.pdf","comment":"Transactions on Machine Learning Research, 2024"},{"id":"http://arxiv.org/abs/2411.08640v1","updated":"2024-11-13T14:31:52Z","published":"2024-11-13T14:31:52Z","title":"Towards Secure Intelligent O-RAN Architecture: Vulnerabilities, Threats\n and Promising Technical Solutions using LLMs","summary":" The evolution of wireless communication systems will be fundamentally\nimpacted by an open radio access network (O-RAN), a new concept defining an\nintelligent architecture with enhanced flexibility, openness, and the ability\nto slice services more efficiently. For all its promises, and like any\ntechnological advancement, O-RAN is not without risks that need to be carefully\nassessed and properly addressed to accelerate its wide adoption in future\nmobile networks. In this paper, we present an in-depth security analysis of the\nO-RAN architecture, discussing the potential threats that may arise in the\ndifferent O-RAN architecture layers and their impact on the Confidentiality,\nIntegrity, and Availability (CIA) triad. 
We also promote the potential of zero\ntrust, Moving Target Defense (MTD), blockchain, and large language models(LLM)\ntechnologies in fortifying O-RAN's security posture. Furthermore, we\nnumerically demonstrate the effectiveness of MTD in empowering robust deep\nreinforcement learning methods for dynamic network slice admission control in\nthe O-RAN architecture. Moreover, we examine the effect of explainable AI (XAI)\nbased on LLMs in securing the system.\n","authors":["Mojdeh Karbalaee Motalleb","Chafika Benzaid","Tarik Taleb","Marcos Katz","Vahid Shah-Mansouri","JaeSeung Song"],"pdf_url":"https://arxiv.org/pdf/2411.08640v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.08638v1","updated":"2024-11-13T14:26:04Z","published":"2024-11-13T14:26:04Z","title":"Gaussian Mixture Models Based Augmentation Enhances GNN Generalization","summary":" Graph Neural Networks (GNNs) have shown great promise in tasks like node and\ngraph classification, but they often struggle to generalize, particularly to\nunseen or out-of-distribution (OOD) data. These challenges are exacerbated when\ntraining data is limited in size or diversity. To address these issues, we\nintroduce a theoretical framework using Rademacher complexity to compute a\nregret bound on the generalization error and then characterize the effect of\ndata augmentation. This framework informs the design of GMM-GDA, an efficient\ngraph data augmentation (GDA) algorithm leveraging the capability of Gaussian\nMixture Models (GMMs) to approximate any distribution. Our approach not only\noutperforms existing augmentation techniques in terms of generalization but\nalso offers improved time complexity, making it highly suitable for real-world\napplications.\n","authors":["Yassine Abbahaddou","Fragkiskos D. Malliaros","Johannes F. 
Lutzeyer","Amine Mohamed Aboussalah","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2411.08638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08637v1","updated":"2024-11-13T14:24:47Z","published":"2024-11-13T14:24:47Z","title":"Robot See, Robot Do: Imitation Reward for Noisy Financial Environments","summary":" The sequential nature of decision-making in financial asset trading aligns\nnaturally with the reinforcement learning (RL) framework, making RL a common\napproach in this domain. However, the low signal-to-noise ratio in financial\nmarkets results in noisy estimates of environment components, including the\nreward function, which hinders effective policy learning by RL agents. Given\nthe critical importance of reward function design in RL problems, this paper\nintroduces a novel and more robust reward function by leveraging imitation\nlearning, where a trend labeling algorithm acts as an expert. We integrate\nimitation (expert's) feedback with reinforcement (agent's) feedback in a\nmodel-free RL algorithm, effectively embedding the imitation learning problem\nwithin the RL paradigm to handle the stochasticity of reward signals. Empirical\nresults demonstrate that this novel approach improves financial performance\nmetrics compared to traditional benchmarks and RL agents trained solely using\nreinforcement feedback.\n","authors":["Sven Goluža","Tomislav Kovačević","Stjepan Begušić","Zvonko Kostanjčar"],"pdf_url":"https://arxiv.org/pdf/2411.08637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08631v1","updated":"2024-11-13T14:17:26Z","published":"2024-11-13T14:17:26Z","title":"Deep Generative Demand Learning for Newsvendor and Pricing","summary":" We consider data-driven inventory and pricing decisions in the feature-based\nnewsvendor problem, where demand is influenced by both price and contextual\nfeatures and is modeled without any structural assumptions. 
The unknown demand\ndistribution results in a challenging conditional stochastic optimization\nproblem, further complicated by decision-dependent uncertainty and the\nintegration of features. Inspired by recent advances in deep generative\nlearning, we propose a novel approach leveraging conditional deep generative\nmodels (cDGMs) to address these challenges. cDGMs learn the demand distribution\nand generate probabilistic demand forecasts conditioned on price and features.\nThis generative approach enables accurate profit estimation and supports the\ndesign of algorithms for two key objectives: (1) optimizing inventory for\narbitrary prices, and (2) jointly determining optimal pricing and inventory\nlevels. We provide theoretical guarantees for our approach, including the\nconsistency of profit estimation and convergence of our decisions to the\noptimal solution. Extensive simulations-ranging from simple to complex\nscenarios, including one involving textual features-and a real-world case study\ndemonstrate the effectiveness of our approach. Our method opens a new paradigm\nin management science and operations research, is adaptable to extensions of\nthe newsvendor and pricing problems, and holds potential for solving other\nconditional stochastic optimization problems.\n","authors":["Shijin Gong","Huihang Liu","Xinyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08631v1.pdf","comment":"30 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.16336v2","updated":"2024-11-13T14:13:58Z","published":"2024-03-25T00:21:34Z","title":"Predictive Inference in Multi-environment Scenarios","summary":" We address the challenge of constructing valid confidence intervals and sets\nin problems of prediction across multiple environments. We investigate two\ntypes of coverage suitable for these problems, extending the jackknife and\nsplit-conformal methods to show how to obtain distribution-free coverage in\nsuch non-traditional, potentially hierarchical data-generating scenarios. 
We\ndemonstrate a novel resizing method to adapt to problem difficulty, which\napplies both to existing approaches for predictive inference and the methods we\ndevelop; this reduces prediction set sizes using limited information from the\ntest environment, a key to the methods' practical performance, which we\nevaluate through neurochemical sensing and species classification datasets. Our\ncontributions also include extensions for settings with non-real-valued\nresponses, a theory of consistency for predictive inference in these general\nproblems, and insights on the limits of conditional coverage.\n","authors":["John C. Duchi","Suyash Gupta","Kuanhao Jiang","Pragya Sur"],"pdf_url":"https://arxiv.org/pdf/2403.16336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08610v1","updated":"2024-11-13T13:53:10Z","published":"2024-11-13T13:53:10Z","title":"Dynamic Subset Tuning: Expanding the Operational Range of\n Parameter-Efficient Training for Large Language Models","summary":" We propose a novel parameter-efficient training (PET) method for large\nlanguage models that adapts models to downstream tasks by optimizing a small\nsubset of the existing model parameters. Unlike prior methods, this subset is\nnot fixed in location but rather which parameters are modified evolves over the\ncourse of training. This dynamic parameter selection can yield good performance\nwith many fewer parameters than extant methods. Our method enables a seamless\nscaling of the subset size across an arbitrary proportion of the total model\nsize, while popular PET approaches like prompt tuning and LoRA cover only a\nsmall part of this spectrum. 
We match or outperform prompt tuning and LoRA in\nmost cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given\nparameter budget across different model families and sizes.\n","authors":["Felix Stahlberg","Jared Lichtarge","Shankar Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08610v1.pdf","comment":"NeurIPS 2024 Workshop on Adaptive Foundation Models"},{"id":"http://arxiv.org/abs/2411.08013v2","updated":"2024-11-13T13:36:05Z","published":"2024-11-12T18:43:27Z","title":"Investigating the Effectiveness of Explainability Methods in Parkinson's\n Detection from Speech","summary":" Speech impairments in Parkinson's disease (PD) provide significant early\nindicators for diagnosis. While models for speech-based PD detection have shown\nstrong performance, their interpretability remains underexplored. This study\nsystematically evaluates several explainability methods to identify PD-specific\nspeech features, aiming to support the development of accurate, interpretable\nmodels for clinical decision-making in PD diagnosis and monitoring. Our\nmethodology involves (i) obtaining attributions and saliency maps using\nmainstream interpretability techniques, (ii) quantitatively evaluating the\nfaithfulness of these maps and their combinations obtained via union and\nintersection through a range of established metrics, and (iii) assessing the\ninformation conveyed by the saliency maps for PD detection from an auxiliary\nclassifier. 
Our results reveal that, while explanations are aligned with the\nclassifier, they often fail to provide valuable information for domain experts.\n","authors":["Eleonora Mancini","Francesco Paissan","Paolo Torroni","Mirco Ravanelli","Cem Subakan"],"pdf_url":"https://arxiv.org/pdf/2411.08013v2.pdf","comment":"The first two authors contributed equally to this research: author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2411.08599v1","updated":"2024-11-13T13:30:21Z","published":"2024-11-13T13:30:21Z","title":"XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL","summary":" To tackle the challenges of large language model performance in natural\nlanguage to SQL tasks, we introduce XiYan-SQL, an innovative framework that\nemploys a multi-generator ensemble strategy to improve candidate generation. We\nintroduce M-Schema, a semi-structured schema representation method designed to\nenhance the understanding of database structures. To enhance the quality and\ndiversity of generated candidate SQL queries, XiYan-SQL integrates the\nsignificant potential of in-context learning (ICL) with the precise control of\nsupervised fine-tuning. On one hand, we propose a series of training strategies\nto fine-tune models to generate high-quality candidates with diverse\npreferences. On the other hand, we implement the ICL approach with an example\nselection method based on named entity recognition to prevent overemphasis on\nentities. The refiner optimizes each candidate by correcting logical or\nsyntactical errors. 
To address the challenge of identifying the best candidate,\nwe fine-tune a selection model to distinguish nuances of candidate SQL queries.\nThe experimental results on multiple dialect datasets demonstrate the\nrobustness of XiYan-SQL in addressing challenges across different scenarios.\nOverall, our proposed XiYan-SQL achieves the state-of-the-art execution\naccuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on\nNL2GQL, and a competitive score of 72.23% on the Bird development benchmark.\nThe proposed framework not only enhances the quality and diversity of SQL\nqueries but also outperforms previous methods.\n","authors":["Yingqi Gao","Yifu Liu","Xiaoxia Li","Xiaorong Shi","Yin Zhu","Yiming Wang","Shiqi Li","Wei Li","Yuntao Hong","Zhiling Luo","Jinyang Gao","Liyu Mou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.08599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13178v2","updated":"2024-11-13T13:14:19Z","published":"2024-10-17T02:58:57Z","title":"GeSubNet: Gene Interaction Inference for Disease Subtype Network\n Generation","summary":" Retrieving gene functional networks from knowledge databases presents a\nchallenge due to the mismatch between disease networks and subtype-specific\nvariations. Current solutions, including statistical and deep learning methods,\noften fail to effectively integrate gene interaction knowledge from databases\nor explicitly learn subtype-specific interactions. To address this mismatch, we\npropose GeSubNet, which learns a unified representation capable of predicting\ngene interactions while distinguishing between different disease subtypes.\nGraphs generated by such representations can be considered subtype-specific\nnetworks. GeSubNet is a multi-step representation learning framework with three\nmodules: First, a deep generative model learns distinct disease subtypes from\npatient gene expression profiles. 
Second, a graph neural network captures\nrepresentations of prior gene networks from knowledge databases, ensuring\naccurate physical gene interactions. Finally, we integrate these two\nrepresentations using an inference loss that leverages graph generation\ncapabilities, conditioned on the patient separation loss, to refine\nsubtype-specific information in the learned representation. GeSubNet\nconsistently outperforms traditional methods, with average improvements of\n30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged\nover four cancer datasets. Particularly, we conduct a biological simulation\nexperiment to assess how the behavior of selected genes from over 11,000\ncandidates affects subtypes or patient distributions. The results show that the\ngenerated network has the potential to identify subtype-specific genes with an\n83% likelihood of impacting patient distribution shifts. The GeSubNet resource\nis available: https://anonymous.4open.science/r/GeSubNet/\n","authors":["Ziwei Yang","Zheng Chen","Xin Liu","Rikuto Kotoge","Peng Chen","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2410.13178v2.pdf","comment":"Under review as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2411.08590v1","updated":"2024-11-13T13:13:07Z","published":"2024-11-13T13:13:07Z","title":"Hopfield-Fenchel-Young Networks: A Unified Framework for Associative\n Memory Retrieval","summary":" Associative memory models, such as Hopfield networks and their modern\nvariants, have garnered renewed interest due to advancements in memory capacity\nand connections with self-attention in transformers. In this work, we introduce\na unified framework-Hopfield-Fenchel-Young networks-which generalizes these\nmodels to a broader family of energy functions. 
Our energies are formulated as\nthe difference between two Fenchel-Young losses: one, parameterized by a\ngeneralized entropy, defines the Hopfield scoring mechanism, while the other\napplies a post-transformation to the Hopfield output. By utilizing Tsallis and\nnorm entropies, we derive end-to-end differentiable update rules that enable\nsparse transformations, uncovering new connections between loss margins,\nsparsity, and exact retrieval of single memory patterns. We further extend this\nframework to structured Hopfield networks using the SparseMAP transformation,\nallowing the retrieval of pattern associations rather than a single pattern.\nOur framework unifies and extends traditional and modern Hopfield networks and\nprovides an energy minimization perspective for widely used\npost-transformations like $\\ell_2$-normalization and layer normalization-all\nthrough suitable choices of Fenchel-Young losses and by using convex analysis\nas a building block. Finally, we validate our Hopfield-Fenchel-Young networks\non diverse memory recall tasks, including free and sequential recall.\nExperiments on simulated data, image retrieval, multiple instance learning, and\ntext rationalization demonstrate the effectiveness of our approach.\n","authors":["Saul Santos","Vlad Niculae","Daniel McNamee","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2411.08590v1.pdf","comment":"49 pages, 14 figures. arXiv admin note: text overlap with\n arXiv:2402.13725"},{"id":"http://arxiv.org/abs/2411.08587v1","updated":"2024-11-13T13:11:49Z","published":"2024-11-13T13:11:49Z","title":"DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning\n Methods","summary":" Assessing the quality of aleatoric uncertainty estimates from uncertainty\nquantification (UQ) deep learning methods is important in scientific contexts,\nwhere uncertainty is physically meaningful and important to characterize and\ninterpret exactly. 
We systematically compare aleatoric uncertainty measured by\ntwo UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER).\nOur method focuses on both zero-dimensional (0D) and two-dimensional (2D) data,\nto explore how the UQ methods function for different data dimensionalities. We\ninvestigate uncertainty injected on the input and output variables and include\na method to propagate uncertainty in the case of input uncertainty so that we\ncan compare the predicted aleatoric uncertainty to the known values. We\nexperiment with three levels of noise. The aleatoric uncertainty predicted\nacross all models and experiments scales with the injected noise level.\nHowever, the predicted uncertainty is miscalibrated to $\\rm{std}(\\sigma_{\\rm\nal})$ with the true uncertainty for half of the DE experiments and almost all\nof the DER experiments. The predicted uncertainty is the least accurate for\nboth UQ methods for the 2D input uncertainty experiment and the high-noise\nlevel. While these results do not apply to more complex data, they highlight\nthat further research on post-facto calibration for these methods would be\nbeneficial, particularly for high-noise and high-dimensional settings.\n","authors":["Rebecca Nevin","Aleksandra Ćiprijanović","Brian D. Nord"],"pdf_url":"https://arxiv.org/pdf/2411.08587v1.pdf","comment":"Accepted to the Machine Learning for Physical Sciences workshop at\n NeurIPS 2024; 11 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.08582v1","updated":"2024-11-13T13:01:44Z","published":"2024-11-13T13:01:44Z","title":"Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors","summary":" The application of machine learning (ML) algorithms in the intelligent\ndiagnosis of three-phase engines has the potential to significantly enhance\ndiagnostic performance and accuracy. 
Traditional methods largely rely on\nsignature analysis, which, despite being a standard practice, can benefit from\nthe integration of advanced ML techniques. In our study, we innovate by\ncombining state of the art algorithms with a novel unsupervised anomaly\ngeneration methodology that takes into account physics model of the engine.\nThis hybrid approach leverages the strengths of both supervised ML and\nunsupervised signature analysis, achieving superior diagnostic accuracy and\nreliability along with a wide industrial application. Our experimental results\ndemonstrate that this method significantly outperforms existing ML and non-ML\nstate-of-the-art approaches while retaining the practical advantages of an\nunsupervised methodology. The findings highlight the potential of our approach\nto significantly contribute to the field of engine diagnostics, offering a\nrobust and efficient solution for real-world applications.\n","authors":["Stepan Svirin","Artem Ryzhikov","Saraa Ali","Denis Derkach"],"pdf_url":"https://arxiv.org/pdf/2411.08582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16561v2","updated":"2024-11-13T13:01:19Z","published":"2024-10-21T22:40:42Z","title":"Gradient Normalization Provably Benefits Nonconvex SGD under\n Heavy-Tailed Noise","summary":" This paper investigates the roles of gradient normalization and clipping in\nensuring the convergence of Stochastic Gradient Descent (SGD) under\nheavy-tailed noise. While existing approaches consider gradient clipping\nindispensable for SGD convergence, we theoretically demonstrate that gradient\nnormalization alone without clipping is sufficient to ensure convergence.\nFurthermore, we establish that combining gradient normalization with clipping\noffers significantly improved convergence rates compared to using either\ntechnique in isolation, particularly as gradient noise diminishes. 
With these\nresults, our work provides the first theoretical evidence demonstrating the\nbenefits of gradient normalization in SGD under heavy-tailed noise. Finally, we\nintroduce an accelerated SGD variant that incorporates both gradient\nnormalization and clipping, further enhancing convergence rates under\nheavy-tailed noise.\n","authors":["Tao Sun","Xinwang Liu","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2410.16561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07743v3","updated":"2024-11-13T12:43:33Z","published":"2023-06-13T13:00:10Z","title":"V-LoL: A Diagnostic Dataset for Visual Logical Learning","summary":" Despite the successes of recent developments in visual AI, different\nshortcomings still exist; from missing exact logical reasoning, to abstract\ngeneralization abilities, to understanding complex and noisy scenes.\nUnfortunately, existing benchmarks, were not designed to capture more than a\nfew of these aspects. Whereas deep learning datasets focus on visually complex\ndata but simple visual reasoning tasks, inductive logic datasets involve\ncomplex logical learning tasks, however, lack the visual component. To address\nthis, we propose the diagnostic visual logical learning dataset, V-LoL, that\nseamlessly combines visual and logical challenges. Notably, we introduce the\nfirst instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic\nbenchmark in symbolic AI, the Michalski train problem. By incorporating\nintricate visual scenes and flexible logical reasoning tasks within a versatile\nframework, V-LoL-Train provides a platform for investigating a wide range of\nvisual logical learning challenges. We evaluate a variety of AI systems\nincluding traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our\nevaluations demonstrate that even SOTA AI faces difficulties in dealing with\nvisual logical learning challenges, highlighting unique advantages and\nlimitations of each methodology. 
Overall, V-LoL opens up new avenues for\nunderstanding and enhancing current abilities in visual logical learning for AI\nsystems.\n","authors":["Lukas Helff","Wolfgang Stammer","Hikaru Shindo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2306.07743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v2","updated":"2024-11-13T12:37:09Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\nlots of attraction in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. 
Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hanwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2410.10929v4","updated":"2024-11-13T12:27:38Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). 
Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v4.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.08566v1","updated":"2024-11-13T12:26:08Z","published":"2024-11-13T12:26:08Z","title":"Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space\n Exploration by Reinforcement Learning Agent","summary":" Grasping by a robot in unstructured environments is deemed a critical\nchallenge because of the requirement for effective adaptation to a wide\nvariation in object geometries, material properties, and other environmental\nfactors. In this paper, we propose a novel framework for robotic grasping based\non the idea of compressing high-dimensional target and gripper features in a\ncommon latent space using a set of autoencoders. Our approach simplifies\ngrasping by using three autoencoders dedicated to the target, the gripper, and\na third one that fuses their latent representations. This allows the RL agent\nto achieve higher learning rates at the initial stages of exploration of a new\nenvironment, as well as at non-zero shot grasp attempts. The agent explores the\nlatent space of the third autoencoder for better quality grasp without explicit\nreconstruction of objects. By implementing the PoWER algorithm into the RL\ntraining process, updates on the agent's policy will be made through the\nperturbation in the reward-weighted latent space. The successful exploration\nefficiently constrains both position and pose integrity for feasible executions\nof grasps. 
We evaluate our system on a diverse set of objects, demonstrating\nthe high success rate in grasping with minimum computational overhead. We found\nthat approach enhances the adaptation of the RL agent by more than 35 \\% in\nsimulation experiments.\n","authors":["Leonidas Askianakis"],"pdf_url":"https://arxiv.org/pdf/2411.08566v1.pdf","comment":"Submitted for review at IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2411.08557v1","updated":"2024-11-13T12:13:15Z","published":"2024-11-13T12:13:15Z","title":"Learning Locally Adaptive Metrics that Enhance Structural Representation\n with $\\texttt{LAMINAR}$","summary":" We present $\\texttt{LAMINAR}$, a novel unsupervised machine learning pipeline\ndesigned to enhance the representation of structure within data via producing a\nmore-informative distance metric. Analysis methods in the physical sciences\noften rely on standard metrics to define geometric relationships in data, which\nmay fail to capture the underlying structure of complex data sets.\n$\\texttt{LAMINAR}$ addresses this by using a continuous-normalising-flow and\ninverse-transform-sampling to define a Riemannian manifold in the data space\nwithout the need for the user to specify a metric over the data a-priori. The\nresult is a locally-adaptive-metric that produces structurally-informative\ndensity-based distances. We demonstrate the utility of $\\texttt{LAMINAR}$ by\ncomparing its output to the Euclidean metric for structured data sets.\n","authors":["Christian Kleiber","William H. Oliver","Tobias Buck"],"pdf_url":"https://arxiv.org/pdf/2411.08557v1.pdf","comment":"Accepted to the NeurIPS 2024 Machine Learning and the Physical\n Sciences workshop. 
6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08552v1","updated":"2024-11-13T12:03:39Z","published":"2024-11-13T12:03:39Z","title":"Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with\n Variational Quantum Circuits","summary":" Quantum Machine Learning (QML) offers tremendous potential but is currently\nlimited by the availability of qubits. We introduce an innovative approach that\nutilizes pre-trained neural networks to enhance Variational Quantum Circuits\n(VQC). This technique effectively separates approximation error from qubit\ncount and removes the need for restrictive conditions, making QML more viable\nfor real-world applications. Our method significantly improves parameter\noptimization for VQC while delivering notable gains in representation and\ngeneralization capabilities, as evidenced by rigorous theoretical analysis and\nextensive empirical testing on quantum dot classification tasks. Moreover, our\nresults extend to applications such as human genome analysis, demonstrating the\nbroad applicability of our approach. 
By addressing the constraints of current\nquantum hardware, our work paves the way for a new era of advanced QML\napplications, unlocking the full potential of quantum computing in fields such\nas machine learning, materials science, medicine, mimetics, and various\ninterdisciplinary areas.\n","authors":["Jun Qi","Chao-Han Yang","Samuel Yen-Chi Chen","Pin-Yu Chen","Hector Zenil","Jesper Tegner"],"pdf_url":"https://arxiv.org/pdf/2411.08552v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2411.08550v1","updated":"2024-11-13T11:59:40Z","published":"2024-11-13T11:59:40Z","title":"Graph Neural Networks in Supply Chain Analytics and Optimization:\n Concepts, Perspectives, Dataset and Benchmarks","summary":" Graph Neural Networks (GNNs) have recently gained traction in transportation,\nbioinformatics, language and image processing, but research on their\napplication to supply chain management remains limited. Supply chains are\ninherently graph-like, making them ideal for GNN methodologies, which can\noptimize and solve complex problems. The barriers include a lack of proper\nconceptual foundations, familiarity with graph applications in SCM, and\nreal-world benchmark datasets for GNN-based supply chain research. To address\nthis, we discuss and connect supply chains with graph structures for effective\nGNN application, providing detailed formulations, examples, mathematical\ndefinitions, and task guidelines. Additionally, we present a multi-perspective\nreal-world benchmark dataset from a leading FMCG company in Bangladesh,\nfocusing on supply chain planning. We discuss various supply chain tasks using\nGNNs and benchmark several state-of-the-art models on homogeneous and\nheterogeneous graphs across six supply chain analytics tasks. 
Our analysis\nshows that GNN-based models consistently outperform statistical Machine\nLearning and other Deep Learning models by around 10-30% in regression, 10-30%\nin classification and detection tasks, and 15-40% in anomaly detection tasks on\ndesignated metrics. With this work, we lay the groundwork for solving supply\nchain problems using GNNs, supported by conceptual discussions, methodological\ninsights, and a comprehensive dataset.\n","authors":["Azmine Toushik Wasi","MD Shafikul Islam","Adipto Raihan Akib","Mahathir Mohammad Bappy"],"pdf_url":"https://arxiv.org/pdf/2411.08550v1.pdf","comment":"27 Pages. Extended journal version of SupplyGraph (arXiv:2401.15299).\n In Review"},{"id":"http://arxiv.org/abs/2411.08537v1","updated":"2024-11-13T11:35:39Z","published":"2024-11-13T11:35:39Z","title":"MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal\n Lymphatic Vessel Segmentation","summary":" Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste\nproducts from the human brain. An impairment in their functionality has been\nassociated with aging as well as brain disorders like multiple sclerosis and\nAlzheimer's disease. However, MLVs have only recently been described for the\nfirst time in magnetic resonance imaging (MRI), and their ramified structure\nrenders manual segmentation particularly difficult. Further, as there is no\nconsistent notion of their appearance, human-annotated MLV structures contain a\nhigh inter-rater variability that most automatic segmentation methods cannot\ntake into account. In this work, we propose a new rater-aware training scheme\nfor the popular nnU-Net model, and we explore rater-based ensembling strategies\nfor accurate and consistent segmentation of MLVs. This enables us to boost\nnnU-Net's performance while obtaining explicit predictions in different\nannotation styles and a rater-based uncertainty estimation. 
Our final model,\nMLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to\nthe human reference standard. The model further matches the human inter-rater\nreliability and replicates age-related associations with MLV volume.\n","authors":["Fabian Bongratz","Markus Karmann","Adrian Holz","Moritz Bonhoeffer","Viktor Neumaier","Sarah Deli","Benita Schmitz-Koep","Claus Zimmer","Christian Sorg","Melissa Thalhammer","Dennis M Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2411.08537v1.pdf","comment":"ML4H 2024"},{"id":"http://arxiv.org/abs/2411.08530v1","updated":"2024-11-13T11:24:12Z","published":"2024-11-13T11:24:12Z","title":"Efficient Whole Slide Image Classification through Fisher Vector\n Representation","summary":" The advancement of digital pathology, particularly through computational\nanalysis of whole slide images (WSI), is poised to significantly enhance\ndiagnostic precision and efficiency. However, the large size and complexity of\nWSIs make it difficult to analyze and classify them using computers. This study\nintroduces a novel method for WSI classification by automating the\nidentification and examination of the most informative patches, thus\neliminating the need to process the entire slide. Our method involves\ntwo-stages: firstly, it extracts only a few patches from the WSIs based on\ntheir pathological significance; and secondly, it employs Fisher vectors (FVs)\nfor representing features extracted from these patches, which is known for its\nrobustness in capturing fine-grained details. This approach not only\naccentuates key pathological features within the WSI representation but also\nsignificantly reduces computational overhead, thus making the process more\nefficient and scalable. We have rigorously evaluated the proposed method across\nmultiple datasets to benchmark its performance against comprehensive WSI\nanalysis and contemporary weakly-supervised learning methodologies. 
The\nempirical results indicate that our focused analysis of select patches,\ncombined with Fisher vector representation, not only aligns with, but at times\nsurpasses, the classification accuracy of standard practices. Moreover, this\nstrategy notably diminishes computational load and resource expenditure,\nthereby establishing an efficient and precise framework for WSI analysis in the\nrealm of digital pathology.\n","authors":["Ravi Kant Gupta","Dadi Dharani","Shambhavi Shanker","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10040v3","updated":"2024-11-13T11:13:56Z","published":"2024-05-16T12:22:41Z","title":"SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation","summary":" It is often desirable to distill the capabilities of large language models\n(LLMs) into smaller student models due to compute and memory constraints. One\nway to do this for classification tasks is via dataset synthesis, which can be\naccomplished by generating examples of each label from the LLM. Prior\napproaches to synthesis use few-shot prompting, which relies on the LLM's\nparametric knowledge to generate usable examples. However, this leads to issues\nof repetition, bias towards popular entities, and stylistic differences from\nhuman text. In this work, we propose Synthesize by Retrieval and Refinement\n(SynthesizRR), which uses retrieval augmentation to introduce variety into the\ndataset synthesis process: as retrieved passages vary, the LLM is seeded with\ndifferent content to generate its examples. We empirically study the synthesis\nof six datasets, covering topic classification, sentiment analysis, tone\ndetection, and humor, requiring complex synthesis strategies. We find that\nSynthesizRR greatly improves lexical and semantic diversity, similarity to\nhuman-written text, and distillation performance, when compared to 32-shot\nprompting and four prior approaches. 
We release our code to perform all steps\nat https://github.com/amazon-science/synthesizrr\n","authors":["Abhishek Divekar","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2405.10040v3.pdf","comment":"Published as a main conference paper at EMNLP 2024. Code available at\n https://github.com/amazon-science/synthesizrr"},{"id":"http://arxiv.org/abs/2411.08521v1","updated":"2024-11-13T11:08:28Z","published":"2024-11-13T11:08:28Z","title":"SAD-TIME: a Spatiotemporal-fused network for depression detection with\n Automated multi-scale Depth-wise and TIME-interval-related common feature\n extractor","summary":" Background and Objective: Depression is a severe mental disorder, and\naccurate diagnosis is pivotal to the cure and rehabilitation of people with\ndepression. However, the current questionnaire-based diagnostic methods could\nbring subjective biases and may be denied by subjects. In search of a more\nobjective means of diagnosis, researchers have begun to experiment with deep\nlearning-based methods for identifying depressive disorders in recent years.\nMethods: In this study, a novel Spatiotemporal-fused network with Automated\nmulti-scale Depth-wise and TIME-interval-related common feature extractor\n(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common\nfeatures extractor (CFE), a spatial sector (SpS), a modified temporal sector\n(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale\ndepth-wise 1D-convolutional neural network and a time-interval embedding\ngenerator, where the unique information of each channel is preserved. The SpS\nfuses the functional connectivity with the distance-based connectivity\ncontaining spatial position of EEG electrodes. A multi-head-attention graph\nconvolutional network is also applied in the SpS to fuse the features from\ndifferent EEG channels. 
The TeS is based on long short-term memory and graph\ntransformer networks, where the temporal information of different time-windows\nis fused. Moreover, the DAL is used after the SpS to obtain the\ndomain-invariant feature. Results: Experimental results under tenfold\ncross-validation show that the proposed SAD-TIME method achieves 92.00% and\n94.00% depression classification accuracies on two datasets, respectively, in\ncross-subject mode. Conclusion: SAD-TIME is a robust depression detection\nmodel, where the automatedly-generated features, the SpS and the TeS assist the\nclassification performance with the fusion of the innate spatiotemporal\ninformation in the EEG signals.\n","authors":["Han-Guang Wang","Hui-Rang Hou","Li-Cheng Jin","Chen-Yang Xu","Zhong-Yi Zhang","Qing-Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08521v1.pdf","comment":"21pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.15166v2","updated":"2024-11-13T11:05:04Z","published":"2024-09-23T16:20:21Z","title":"Harmonic Path Integral Diffusion","summary":" In this manuscript, we present a novel approach for sampling from a\ncontinuous multivariate probability distribution, which may either be\nexplicitly known (up to a normalization factor) or represented via empirical\nsamples. Our method constructs a time-dependent bridge from a delta function\ncentered at the origin of the state space at $t=0$, optimally transforming it\ninto the target distribution at $t=1$. We formulate this as a Stochastic\nOptimal Control problem of the Path Integral Control type, with a cost function\ncomprising (in its basic form) a quadratic control term, a quadratic state\nterm, and a terminal constraint. This framework, which we refer to as Harmonic\nPath Integral Diffusion (H-PID), leverages an analytical solution through a\nmapping to an auxiliary quantum harmonic oscillator in imaginary time.\n The H-PID framework results in a set of efficient sampling algorithms,\nwithout the incorporation of Neural Networks. 
The algorithms are validated on\ntwo standard use cases: a mixture of Gaussians over a grid and images from\nCIFAR-10. The transparency of the method allows us to analyze the algorithms in\ndetail, particularly revealing that the current weighted state is an order\nparameter for the dynamic phase transition, signaling earlier, at $t<1$, that\nthe sample generation process is almost complete. We contrast these algorithms\nwith other sampling methods, particularly simulated annealing and path integral\nsampling, highlighting their advantages in terms of analytical control,\naccuracy, and computational efficiency on benchmark problems.\n Additionally, we extend the methodology to more general cases where the\nunderlying stochastic differential equation includes an external deterministic,\npossibly non-conservative force, and where the cost function incorporates a\ngauge potential term.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2409.15166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08506v1","updated":"2024-11-13T10:43:31Z","published":"2024-11-13T10:43:31Z","title":"An Information Theoretic Approach to Operationalize Right to Data\n Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. 
Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v1.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2301.10369v4","updated":"2024-11-13T10:35:25Z","published":"2023-01-25T00:50:28Z","title":"Exact Fractional Inference via Re-Parametrization & Interpolation\n between Tree-Re-Weighted- and Belief Propagation- Algorithms","summary":" Computing the partition function, $Z$, of an Ising model over a graph of $N$\n\\enquote{spins} is most likely exponential in $N$. Efficient variational\nmethods, such as Belief Propagation (BP) and Tree Re-Weighted (TRW) algorithms,\ncompute $Z$ approximately by minimizing the respective (BP- or TRW-) free\nenergy. We generalize the variational scheme by building a $\\lambda$-fractional\ninterpolation, $Z^{(\\lambda)}$, where $\\lambda=0$ and $\\lambda=1$ correspond to\nTRW- and BP-approximations, respectively. 
This fractional scheme -- coined\nFractional Belief Propagation (FBP) -- guarantees that in the attractive\n(ferromagnetic) case $Z^{(TRW)} \\geq Z^{(\\lambda)} \\geq Z^{(BP)}$, and there\nexists a unique (\\enquote{exact}) $\\lambda_*$ such that $Z=Z^{(\\lambda_*)}$.\nGeneralizing the re-parametrization approach of\n\\citep{wainwright_tree-based_2002} and the loop series approach of\n\\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\\forall\n\\lambda:\\ Z=Z^{(\\lambda)}{\\tilde Z}^{(\\lambda)}$, where the multiplicative\ncorrection, ${\\tilde Z}^{(\\lambda)}$, is an expectation over a node-independent\nprobability distribution built from node-wise fractional marginals. Our\ntheoretical analysis is complemented by extensive experiments with models from\nIsing ensembles over planar and random graphs of medium and large sizes. Our\nempirical study yields a number of interesting observations, such as the\nability to estimate ${\\tilde Z}^{(\\lambda)}$ with $O(N^{2::4})$ fractional\nsamples and suppression of variation in $\\lambda_*$ estimates with an increase\nin $N$ for instances from a particular random Ising ensemble, where $[2::4]$\nindicates a range from $2$ to $4$. We also discuss the applicability of this\napproach to the problem of image de-noising.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2301.10369v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06058v3","updated":"2024-11-13T10:09:25Z","published":"2023-03-10T16:43:48Z","title":"A General Recipe for the Analysis of Randomized Multi-Armed Bandit\n Algorithms","summary":" In this paper we propose a general methodology to derive regret bounds for\nrandomized multi-armed bandit algorithms. It consists in checking a set of\nsufficient conditions on the sampling probability of each arm and on the family\nof distributions to prove a logarithmic regret. 
As a direct application we\nrevisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and\nThompson Sampling (TS), under various models for the distributions including\nsingle parameter exponential families, Gaussian distributions, bounded\ndistributions, or distributions satisfying some conditions on their moments. In\nparticular, we prove that MED is asymptotically optimal for all these models,\nbut also provide a simple regret analysis of some TS algorithms for which the\noptimality is already known. We then further illustrate the interest of our\napproach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to\nsome families of unbounded reward distributions with a bounded h-moment. This\nmodel can for instance capture some non-parametric families of distributions\nwhose variance is upper bounded by a known constant.\n","authors":["Dorian Baudry","Kazuya Suzuki","Junya Honda"],"pdf_url":"https://arxiv.org/pdf/2303.06058v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14487v3","updated":"2024-11-13T10:09:23Z","published":"2024-08-19T18:47:07Z","title":"Active learning of digenic functions with boolean matrix logic\n programming","summary":" We apply logic-based machine learning techniques to facilitate cellular\nengineering and drive biological discovery, based on comprehensive databases of\nmetabolic processes called genome-scale metabolic network models (GEMs).\nPredicted host behaviours are not always correctly described by GEMs. Learning\nthe intricate genetic interactions within GEMs presents computational and\nempirical challenges. To address these, we describe a novel approach called\nBoolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to\nevaluate large logic programs. We introduce a new system, $BMLP_{active}$,\nwhich efficiently explores the genomic hypothesis space by guiding informative\nexperimentation through active learning. 
In contrast to sub-symbolic methods,\n$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial\nhost in an interpretable and logical representation using datalog logic\nprograms. Notably, $BMLP_{active}$ can successfully learn the interaction\nbetween a gene pair with fewer training examples than random experimentation,\novercoming the increase in experimental design space. $BMLP_{active}$ enables\nrapid optimisation of metabolic models and offers a realistic approach to a\nself-driving lab for microbial engineering.\n","authors":["Lun Ai","Stephen H. Muggleton","Shi-shun Liang","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2408.14487v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.06724"},{"id":"http://arxiv.org/abs/2410.17851v2","updated":"2024-11-13T10:01:38Z","published":"2024-10-23T13:20:42Z","title":"The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty\n Quantification","summary":" Tsetlin Machines (TMs) have emerged as a compelling alternative to\nconventional deep learning methods, offering notable advantages such as smaller\nmemory footprint, faster inference, fault-tolerant properties, and\ninterpretability. Although various adaptations of TMs have expanded their\napplicability across diverse domains, a fundamental gap remains in\nunderstanding how TMs quantify uncertainty in their predictions. In response,\nthis paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed\nat providing a robust, reliable, and interpretable approach for uncertainty\nquantification. Unlike the original TM, the PTM learns the probability of\nstaying on each state of each Tsetlin Automaton (TA) across all clauses. These\nprobabilities are updated using the feedback tables that are part of the TM\nframework: Type I and Type II feedback. 
During inference, TAs decide their\nactions by sampling states based on learned probability distributions, akin to\nBayesian neural networks when generating weight values. In our experimental\nanalysis, we first illustrate the spread of the probabilities across TA states\nfor the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models\nusing both simulated and real-world datasets. The experiments on the simulated\ndataset reveal the PTM's effectiveness in uncertainty quantification,\nparticularly in delineating decision boundaries and identifying regions of high\nuncertainty. Moreover, when applied to multiclass classification tasks using\nthe Iris dataset, the PTM demonstrates competitive performance in terms of\npredictive entropy and expected calibration error, showcasing its potential as\na reliable tool for uncertainty estimation. Our findings underscore the\nimportance of selecting appropriate models for accurate uncertainty\nquantification in predictive tasks, with the PTM offering a particularly\ninterpretable and effective solution.\n","authors":["K. Darshana Abeyrathna","Sara El Mekkaoui","Andreas Hafver","Christian Agrell"],"pdf_url":"https://arxiv.org/pdf/2410.17851v2.pdf","comment":"12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024,\n London"},{"id":"http://arxiv.org/abs/2411.08482v1","updated":"2024-11-13T10:01:33Z","published":"2024-11-13T10:01:33Z","title":"Methodology for a Statistical Analysis of Influencing Factors on 3D\n Object Detection Performance","summary":" In autonomous driving, object detection is an essential task to perceive the\nenvironment by localizing and classifying objects. Most object detection\nalgorithms rely on deep learning for their superior performance. However, their\nblack box nature makes it challenging to ensure safety. 
In this paper, we\npropose a first-of-its-kind methodology for statistical analysis of the\ninfluence of various factors related to the objects to detect or the\nenvironment on the detection performance of both LiDAR- and camera-based 3D\nobject detectors. We perform a univariate analysis between each of the factors\nand the detection error in order to compare the strength of influence. To\nbetter identify potential sources of detection errors, we also analyze the\nperformance in dependency of the influencing factors and examine the\ninterdependencies between the different influencing factors. Recognizing the\nfactors that influence detection performance helps identify robustness issues\nin the trained object detector and supports the safety approval of object\ndetection systems.\n","authors":["Anton Kuznietsov","Dirk Schweickard","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2411.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08478v1","updated":"2024-11-13T09:55:59Z","published":"2024-11-13T09:55:59Z","title":"Learning Model Agnostic Explanations via Constraint Programming","summary":" Interpretable Machine Learning faces a recurring challenge of explaining the\npredictions made by opaque classifiers such as ensemble models, kernel methods,\nor neural networks in terms that are understandable to humans. When the model\nis viewed as a black box, the objective is to identify a small set of features\nthat jointly determine the black box response with minimal error. However,\nfinding such model-agnostic explanations is computationally demanding, as the\nproblem is intractable even for binary classifiers. In this paper, the task is\nframed as a Constraint Optimization Problem, where the constraint solver seeks\nan explanation of minimum error and bounded size for an input data instance and\na set of samples generated by the black box. 
From a theoretical perspective,\nthis constraint programming approach offers PAC-style guarantees for the output\nexplanation. We evaluate the approach empirically on various datasets and show\nthat it statistically outperforms the state-of-the-art heuristic Anchors\nmethod.\n","authors":["Frederic Koriche","Jean-Marie Lagniez","Stefan Mengel","Chi Tran"],"pdf_url":"https://arxiv.org/pdf/2411.08478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07979v2","updated":"2024-11-13T09:52:45Z","published":"2024-11-12T17:58:40Z","title":"Exact, Tractable Gauss-Newton Optimization in Deep Reversible\n Architectures Reveal Poor Generalization","summary":" Second-order optimization has been shown to accelerate the training of deep\nneural networks in many applications, often yielding faster progress per\niteration on the training loss compared to first-order optimizers. However, the\ngeneralization properties of second-order methods are still being debated.\nTheoretical investigations have proved difficult to carry out outside the\ntractable settings of heavily simplified model classes -- thus, the relevance\nof existing theories to practical deep learning applications remains unclear.\nSimilarly, empirical studies in large-scale models and real datasets are\nsignificantly confounded by the necessity to approximate second-order updates\nin practice. It is often unclear whether the observed generalization behaviour\narises specifically from the second-order nature of the parameter updates, or\ninstead reflects the specific structured (e.g.\\ Kronecker) approximations used\nor any damping-based interpolation towards first-order updates. Here, we show\nfor the first time that exact Gauss-Newton (GN) updates take on a tractable\nform in a class of deep reversible architectures that are sufficiently\nexpressive to be meaningfully applied to common benchmark datasets. We exploit\nthis novel setting to study the training and generalization properties of the\nGN optimizer. 
We find that exact GN generalizes poorly. In the mini-batch\ntraining setting, this manifests as rapidly saturating progress even on the\n\\emph{training} loss, with parameter updates found to overfit each\nmini-batchatch without producing the features that would support generalization\nto other mini-batches. We show that our experiments run in the ``lazy'' regime,\nin which the neural tangent kernel (NTK) changes very little during the course\nof training. This behaviour is associated with having no significant changes in\nneural representations, explaining the lack of generalization.\n","authors":["Davide Buffelli","Jamie McGowan","Wangkun Xu","Alexandru Cioba","Da-shan Shiu","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2411.07979v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.17804v3","updated":"2024-11-13T09:50:48Z","published":"2024-06-22T15:24:33Z","title":"A Review of Electromagnetic Elimination Methods for low-field portable\n MRI scanner","summary":" This paper analyzes conventional and deep learning methods for eliminating\nelectromagnetic interference (EMI) in MRI systems. We compare traditional\nanalytical and adaptive techniques with advanced deep learning approaches. Key\nstrengths and limitations of each method are highlighted. Recent advancements\nin active EMI elimination, such as external EMI receiver coils, are discussed\nalongside deep learning methods, which show superior EMI suppression by\nleveraging neural networks trained on MRI data. While deep learning improves\nEMI elimination and diagnostic capabilities, it introduces security and safety\nconcerns, particularly in commercial applications. 
A balanced approach,\nintegrating conventional reliability with deep learning's advanced\ncapabilities, is proposed for more effective EMI suppression in MRI systems.\n","authors":["Wanyu Bian","Panfeng Li","Mengyao Zheng","Chihang Wang","Anying Li","Ying Li","Haowei Ni","Zixuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.17804v3.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2304.08310v2","updated":"2024-11-13T09:47:41Z","published":"2023-04-17T14:27:19Z","title":"TreeC: a method to generate interpretable energy management systems\n using a metaheuristic algorithm","summary":" Energy management systems (EMS) have traditionally been implemented using\nrule-based control (RBC) and model predictive control (MPC) methods. However,\nrecent research has explored the use of reinforcement learning (RL) as a\npromising alternative. This paper introduces TreeC, a machine learning method\nthat utilizes the covariance matrix adaptation evolution strategy metaheuristic\nalgorithm to generate an interpretable EMS modeled as a decision tree. Unlike\nRBC and MPC approaches, TreeC learns the decision strategy of the EMS based on\nhistorical data, adapting the control model to the controlled energy grid. The\ndecision strategy is represented as a decision tree, providing interpretability\ncompared to RL methods that often rely on black-box models like neural\nnetworks. TreeC is evaluated against MPC with perfect forecast and RL EMSs in\ntwo case studies taken from literature: an electric grid case and a household\nheating case. In the electric grid case, TreeC achieves an average energy loss\nand constraint violation score of 19.2, which is close to MPC and RL EMSs that\nachieve scores of 14.4 and 16.2 respectively. All three methods control the\nelectric grid well especially when compared to the random EMS, which obtains an\naverage score of 12 875. 
In the household heating case, TreeC performs\nsimilarly to MPC on the adjusted and averaged electricity cost and total\ndiscomfort (0.033 EUR/m$^2$ and 0.42 Kh for TreeC compared to 0.037 EUR/m$^2$\nand 2.91 kH for MPC), while outperforming RL (0.266 EUR/m$^2$ and 24.41 Kh).\n","authors":["Julian Ruddick","Luis Ramirez Camargo","Muhammad Andy Putratama","Maarten Messagie","Thierry Coosemans"],"pdf_url":"https://arxiv.org/pdf/2304.08310v2.pdf","comment":"Accepted version Knowledge based system"},{"id":"http://arxiv.org/abs/2411.08460v1","updated":"2024-11-13T09:31:06Z","published":"2024-11-13T09:31:06Z","title":"Trap-MID: Trapdoor-based Defense against Model Inversion Attacks","summary":" Model Inversion (MI) attacks pose a significant threat to the privacy of Deep\nNeural Networks by recovering training data distribution from well-trained\nmodels. While existing defenses often rely on regularization techniques to\nreduce information leakage, they remain vulnerable to recent attacks. In this\npaper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to\nmislead MI attacks. A trapdoor is integrated into the model to predict a\nspecific label when the input is injected with the corresponding trigger.\nConsequently, this trapdoor information serves as the \"shortcut\" for MI\nattacks, leading them to extract trapdoor triggers rather than private data. We\nprovide theoretical insights into the impacts of trapdoor's effectiveness and\nnaturalness on deceiving MI attacks. In addition, empirical experiments\ndemonstrate the state-of-the-art defense performance of Trap-MID against\nvarious MI attacks without the requirements for extra data or large\ncomputational overhead. 
Our source code is publicly available at\nhttps://github.com/ntuaislab/Trap-MID.\n","authors":["Zhen-Ting Liu","Shang-Tse Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08460v1.pdf","comment":"Accepted by Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2307.12594v2","updated":"2024-11-13T09:29:36Z","published":"2023-07-24T08:11:59Z","title":"The effect of dataset size and the process of big data mining for\n investigating solar-thermal desalination by using machine learning","summary":" Machine learning's application in solar-thermal desalination is limited by\ndata shortage and inconsistent analysis. This study develops an optimized\ndataset collection and analysis process for the representative solar still. By\nultra-hydrophilic treatment on the condensation cover, the dataset collection\nprocess reduces the collection time by 83.3%. Over 1,000 datasets are\ncollected, which is nearly one order of magnitude larger than up-to-date works.\nThen, a new interdisciplinary process flow is proposed. Some meaningful results\nare obtained that were not addressed by previous studies. It is found that\nRadom Forest might be a better choice for datasets larger than 1,000 due to\nboth high accuracy and fast speed. Besides, the dataset range affects the\nquantified importance (weighted value) of factors significantly, with up to a\n115% increment. Moreover, the results show that machine learning has a high\naccuracy on the extrapolation prediction of productivity, where the minimum\nmean relative prediction error is just around 4%. The results of this work not\nonly show the necessity of the dataset characteristics' effect but also provide\na standard process for studying solar-thermal desalination by machine learning,\nwhich would pave the way for interdisciplinary study.\n","authors":["Guilong Peng","Senshan Sun","Zhenwei Xu","Juxin Du","Yangjun Qin","Swellam W. Sharshir","A. W. Kandel","A. E. 
Kabeel","Nuo Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00393v4","updated":"2024-11-13T09:27:41Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08443v1","updated":"2024-11-13T08:56:35Z","published":"2024-11-13T08:56:35Z","title":"Machine Unlearning on Pre-trained Models by Residual Feature Alignment\n Using LoRA","summary":" Machine unlearning is new emerged technology that removes a subset of the\ntraining data from a trained model without affecting the model performance on\nthe remaining data. This topic is becoming increasingly important in protecting\nuser privacy and eliminating harmful or outdated data. 
The key challenge lies\nin effectively and efficiently unlearning specific information without\ncompromising the model's utility on the retained data. For the pre-trained\nmodels, fine-tuning is an important way to achieve the unlearning target.\nPrevious work typically fine-tuned the entire model's parameters, which incurs\nsignificant computation costs. In addition, the fine-tuning process may cause\nshifts in the intermediate layer features, affecting the model's overall\nutility. In this work, we propose a novel and efficient machine unlearning\nmethod on pre-trained models. We term the method as Residual Feature Alignment\nUnlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose\nthe model's intermediate features into pre-trained features and residual\nfeatures. By adjusting the residual features, we align the unlearned model with\nthe pre-trained model at the intermediate feature level to achieve both\nunlearning and remaining targets. The method aims to learn the zero residuals\non the retained set and shifted residuals on the unlearning set. Extensive\nexperiments on numerous datasets validate the effectiveness of our approach.\n","authors":["Laiqiao Qin","Tianqing Zhu","Linlin Wang","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06979v2","updated":"2024-11-13T08:41:50Z","published":"2024-06-11T06:18:29Z","title":"AudioMarkBench: Benchmarking Robustness of Audio Watermarking","summary":" The increasing realism of synthetic speech, driven by advancements in\ntext-to-speech models, raises ethical concerns regarding impersonation and\ndisinformation. Audio watermarking offers a promising solution via embedding\nhuman-imperceptible watermarks into AI-generated audios. However, the\nrobustness of audio watermarking against common/adversarial perturbations\nremains understudied. 
We present AudioMarkBench, the first systematic benchmark\nfor evaluating the robustness of audio watermarking against watermark removal\nand watermark forgery. AudioMarkBench includes a new dataset created from\nCommon-Voice across languages, biological sexes, and ages, 3 state-of-the-art\nwatermarking methods, and 15 types of perturbations. We benchmark the\nrobustness of these methods against the perturbations in no-box, black-box, and\nwhite-box settings. Our findings highlight the vulnerabilities of current\nwatermarking techniques and emphasize the need for more robust and fair audio\nwatermarking solutions. Our dataset and code are publicly available at\nhttps://github.com/moyangkuo/AudioMarkBench.\n","authors":["Hongbin Liu","Moyang Guo","Zhengyuan Jiang","Lun Wang","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2406.06979v2.pdf","comment":"To appear in NeurIPS Datasets and Benchmarks, 2024"},{"id":"http://arxiv.org/abs/2410.21283v2","updated":"2024-11-13T08:33:17Z","published":"2024-10-11T03:19:44Z","title":"pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2","summary":" Recent advancements in protein structure prediction, particularly AlphaFold2,\nhave revolutionized structural biology by achieving near-experimental accuracy\n($\\text{average RMSD} < 1.5\\text{\\AA}$). However, the computational demands of\nthese models (approximately 30 minutes per protein on an RTX 4090)\nsignificantly limit their application in high-throughput protein screening.\nWhile large language models like ESM (Evolutionary Scale Modeling) have shown\npromise in extracting structural information directly from protein sequences,\nrapid assessment of protein structure quality for large-scale analyses remains\na major challenge.\n We introduce pLDDT-Predictor, a high-speed protein screening tool that\nachieves a $250,000\\times$ speedup compared to AlphaFold2 by leveraging\npre-trained ESM2 protein embeddings and a Transformer architecture. 
Our model\npredicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores\nwith a Pearson correlation of 0.7891 and processes proteins in just 0.007\nseconds on average. Using a comprehensive dataset of 1.5 million diverse\nprotein sequences (ranging from 50 to 2048 amino acids), we demonstrate that\npLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70)\nwith 91.2\\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's\npredictions.\n The source code and pre-trained models are freely available at\n\\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research\ncommunity to perform rapid, large-scale protein structure quality assessments.\n","authors":["Joongwon Chae","Zhenyu Wang","Ijaz Gul","Jiansong Ji","Zhenglin Chen","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2410.21283v2.pdf","comment":"6 pages main topic, 8 pages including citiation, 4 figures"},{"id":"http://arxiv.org/abs/2411.08432v1","updated":"2024-11-13T08:32:42Z","published":"2024-11-13T08:32:42Z","title":"One STEP at a time: Language Agents are Stepwise Planners","summary":" Language agents have shown promising adaptability in dynamic environments to\nperform complex tasks. However, despite the versatile knowledge embedded in\nlarge language models, these agents still fall short when it comes to tasks\nthat require planning. We introduce STEP, a novel framework designed to\nefficiently learn from previous experiences to enhance the planning\ncapabilities of language agents in future steps. Concretely, STEP functions\nthrough four interconnected components. First, the Planner takes on the task,\nbreaks it down into subtasks and provides relevant insights. Then the Executor\ngenerates action candidates, while the Evaluator ensures the actions align with\nlearned rules from previous experiences. Lastly, Memory stores experiences to\ninform future decisions. 
In the ScienceWorld benchmark, our results show that\nSTEP consistently outperforms state-of-the-art models, achieving an overall\nscore of 67.4 and successfully completing 12 out of 18 tasks. These findings\nhighlight STEP's potential as a framework for enhancing planning capabilities\nin language agents, paving the way for more sophisticated task-solving in\ndynamic environments.\n","authors":["Minh Nguyen","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2411.08432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03364v2","updated":"2024-11-13T08:30:59Z","published":"2024-11-05T06:54:38Z","title":"DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural\n Networks","summary":" Graph has become increasingly integral to the advancement of recommendation\nsystems, particularly with the fast development of graph neural network(GNN).\nBy exploring the virtue of rich node features and link information, GNN is\ndesigned to provide personalized and accurate suggestions. Meanwhile, the\nprivacy leakage of GNN in such contexts has also captured special attention.\nPrior work has revealed that a malicious user can utilize auxiliary knowledge\nto extract sensitive link data of the target graph, integral to recommendation\nsystems, via the decision made by the target GNN model. This poses a\nsignificant risk to the integrity and confidentiality of data used in\nrecommendation system. Though important, previous works on GNN's privacy\nleakage are still challenged in three aspects, i.e., limited stealing attack\nscenarios, sub-optimal attack performance, and adaptation against defense. To\naddress these issues, we propose a diffusion model based link stealing attack,\nnamed DM4Steal. It differs previous work from three critical aspects. (i)\nGenerality: aiming at six attack scenarios with limited auxiliary knowledge, we\npropose a novel training strategy for diffusion models so that DM4Steal is\ntransferable to diverse attack scenarios. 
(ii) Effectiveness: benefiting from\nthe retention of semantic structure in the diffusion model during the training\nprocess, DM4Steal is capable to learn the precise topology of the target graph\nthrough the GNN decision process. (iii) Adaptation: when GNN is defensive\n(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling\nthe score model multiple times to keep performance degradation to a minimum,\nthus DM4Steal implements successful adaptive attack on defensive GNN.\n","authors":["Jinyin Chen","Haonan Ma","Haibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.03364v2.pdf","comment":"We found that there were critical problems in our paper, and we\n needed to redo the experiment, which was incomplete"},{"id":"http://arxiv.org/abs/2411.07501v2","updated":"2024-11-13T08:30:52Z","published":"2024-11-12T02:57:15Z","title":"LAuReL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v2.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.01137v2","updated":"2024-11-13T08:24:09Z","published":"2024-11-02T04:48:41Z","title":"Data movement limits to frontier model training","summary":" We present a theoretical model of distributed training, and use it to analyze\nhow far dense and sparse training runs can be scaled. Under our baseline\nassumptions, given a three month training duration, data movement bottlenecks\nbegin to significantly lower hardware utilization for training runs exceeding\nabout $10^{28}$ FLOP, two orders of magnitude above the largest training run to\ndate, suggesting the arrival of fundamental barriers to scaling in three years\ngiven recent rates of growth. A training run exceeding about $10^{31}$ FLOP is\ninfeasible even at low utilization. However, more aggressive batch size scaling\nand/or shorter and fatter model shapes, if achievable, have the potential to\npermit much larger training runs.\n","authors":["Ege Erdil","David Schneider-Joseph"],"pdf_url":"https://arxiv.org/pdf/2411.01137v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08425v1","updated":"2024-11-13T08:18:03Z","published":"2024-11-13T08:18:03Z","title":"Properties of fairness measures in the context of varying class\n imbalance and protected group ratios","summary":" Society is increasingly relying on predictive models in fields like criminal\njustice, credit risk management, or hiring. 
To prevent such automated systems\nfrom discriminating against people belonging to certain groups, fairness\nmeasures have become a crucial component in socially relevant applications of\nmachine learning. However, existing fairness measures have been designed to\nassess the bias between predictions for protected groups without considering\nthe imbalance in the classes of the target variable. Current research on the\npotential effect of class imbalance on fairness focuses on practical\napplications rather than dataset-independent measure properties. In this paper,\nwe study the general properties of fairness measures for changing class and\nprotected group proportions. For this purpose, we analyze the probability mass\nfunctions of six of the most popular group fairness measures. We also measure\nhow the probability of achieving perfect fairness changes for varying class\nimbalance ratios. Moreover, we relate the dataset-independent properties of\nfairness measures described in this paper to classifier fairness in real-life\ntasks. Our results show that measures such as Equal Opportunity and Positive\nPredictive Parity are more sensitive to changes in class imbalance than\nAccuracy Equality. These findings can help guide researchers and practitioners\nin choosing the most appropriate fairness measures for their classification\nproblems.\n","authors":["Dariusz Brzezinski","Julia Stachowiak","Jerzy Stefanowski","Izabela Szczech","Robert Susmaga","Sofya Aksenyuk","Uladzimir Ivashka","Oleksandr Yasinskyi"],"pdf_url":"https://arxiv.org/pdf/2411.08425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15586v2","updated":"2024-11-13T08:17:38Z","published":"2024-05-24T14:14:24Z","title":"DAGER: Exact Gradient Inversion for Large Language Models","summary":" Federated learning works by aggregating locally computed gradients from\nmultiple clients, thus enabling collaborative training without sharing private\nclient data. 
However, prior work has shown that the data can actually be\nrecovered by the server using so-called gradient inversion attacks. While these\nattacks perform well when applied on images, they are limited in the text\ndomain and only permit approximate reconstruction of small batches and short\ninput sequences. In this work, we propose DAGER, the first algorithm to recover\nwhole batches of input text exactly. DAGER leverages the low-rank structure of\nself-attention layer gradients and the discrete nature of token embeddings to\nefficiently check if a given token sequence is part of the client data. We use\nthis check to exactly recover full batches in the honest-but-curious setting\nwithout any prior on the data for both encoder- and decoder-based architectures\nusing exhaustive heuristic search and a greedy approach, respectively. We\nprovide an efficient GPU implementation of DAGER and show experimentally that\nit recovers full batches of size up to 128 on large language models (LLMs),\nbeating prior attacks in speed (20x at same batch size), scalability (10x\nlarger batches), and reconstruction quality (ROUGE-1/2 > 0.99).\n","authors":["Ivo Petrov","Dimitar I. Dimitrov","Maximilian Baader","Mark Niklas Müller","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2405.15586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21063v2","updated":"2024-11-13T08:13:36Z","published":"2024-05-31T17:51:07Z","title":"Neural Network Verification with Branch-and-Bound for General\n Nonlinearities","summary":" Branch-and-bound (BaB) is among the most effective techniques for neural\nnetwork (NN) verification. However, existing works on BaB for NN verification\nhave mostly focused on NNs with piecewise linear activations, especially ReLU\nnetworks. In this paper, we develop a general framework, named GenBaB, to\nconduct BaB on general nonlinearities to verify NNs with general architectures,\nbased on linear bound propagation for NN verification. 
To decide which neuron\nto branch, we design a new branching heuristic which leverages linear bounds as\nshortcuts to efficiently estimate the potential improvement after branching. To\ndecide nontrivial branching points for general nonlinear functions, we propose\nto pre-optimize branching points, which can be efficiently leveraged during\nverification with a lookup table. We demonstrate the effectiveness of our\nGenBaB on verifying a wide range of NNs, including NNs with activation\nfunctions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving\nmulti-dimensional nonlinear operations such as multiplications in LSTMs and\nVision Transformers. Our framework also allows the verification of general\nnonlinear computation graphs and enables verification applications beyond\nsimple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of\nthe latest $\\alpha,\\!\\beta$-CROWN, the winner of the 4th and the 5th\nInternational Verification of Neural Networks Competition (VNN-COMP 2023 and\n2024).\n","authors":["Zhouxing Shi","Qirui Jin","Zico Kolter","Suman Jana","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.21063v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.08414v1","updated":"2024-11-13T08:07:21Z","published":"2024-11-13T08:07:21Z","title":"Material Property Prediction with Element Attribute Knowledge Graphs and\n Multimodal Representation Learning","summary":" Machine learning has become a crucial tool for predicting the properties of\ncrystalline materials. However, existing methods primarily represent material\ninformation by constructing multi-edge graphs of crystal structures, often\noverlooking the chemical and physical properties of elements (such as atomic\nradius, electronegativity, melting point, and ionization energy), which have a\nsignificant impact on material performance. 
To address this limitation, we\nfirst constructed an element property knowledge graph and utilized an embedding\nmodel to encode the element attributes within the knowledge graph. Furthermore,\nwe propose a multimodal fusion framework, ESNet, which integrates element\nproperty features with crystal structure features to generate joint multimodal\nrepresentations. This provides a more comprehensive perspective for predicting\nthe performance of crystalline materials, enabling the model to consider both\nmicrostructural composition and chemical characteristics of the materials. We\nconducted experiments on the Materials Project benchmark dataset, which showed\nleading performance in the bandgap prediction task and achieved results on a\npar with existing benchmarks in the formation energy prediction task.\n","authors":["Chao Huang","Chunyan Chen","Ling Shi","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08404v1","updated":"2024-11-13T07:45:40Z","published":"2024-11-13T07:45:40Z","title":"Quantifying Qualitative Insights: Leveraging LLMs to Market Predict","summary":" Recent advancements in Large Language Models (LLMs) have the potential to\ntransform financial analytics by integrating numerical and textual data.\nHowever, challenges such as insufficient context when fusing multimodal\ninformation and the difficulty in measuring the utility of qualitative outputs,\nwhich LLMs generate as text, have limited their effectiveness in tasks such as\nfinancial forecasting. This study addresses these challenges by leveraging\ndaily reports from securities firms to create high-quality contextual\ninformation. The reports are segmented into text-based key factors and combined\nwith numerical data, such as price information, to form context sets. 
By\ndynamically updating few-shot examples based on the query time, the sets\nincorporate the latest information, forming a highly relevant set closely\naligned with the query point. Additionally, a crafted prompt is designed to\nassign scores to the key factors, converting qualitative insights into\nquantitative results. The derived scores undergo a scaling process,\ntransforming them into real-world values that are used for prediction. Our\nexperiments demonstrate that LLMs outperform time-series models in market\nforecasting, though challenges such as imperfect reproducibility and limited\nexplainability remain.\n","authors":["Hoyoung Lee","Youngsoo Choi","Yuhee Kwon"],"pdf_url":"https://arxiv.org/pdf/2411.08404v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.08397v1","updated":"2024-11-13T07:32:58Z","published":"2024-11-13T07:32:58Z","title":"CLaSP: Learning Concepts for Time-Series Signals from Natural Language\n Supervision","summary":" This paper proposes a foundation model called \"CLaSP\" that can search time\nseries signals using natural language that describes the characteristics of the\nsignals as queries. Previous efforts to represent time series signal data in\nnatural language have had challenges in designing a conventional class of time\nseries signal characteristics, formulating their quantification, and creating a\ndictionary of synonyms. To overcome these limitations, the proposed method\nintroduces a neural network based on contrastive learning. This network is\nfirst trained using the datasets TRUCE and SUSHI, which consist of time series\nsignals and their corresponding natural language descriptions. Previous studies\nhave proposed vocabularies that data analysts use to describe signal\ncharacteristics, and SUSHI was designed to cover these terms. We believe that a\nneural network trained on these datasets will enable data analysts to search\nusing natural language vocabulary. 
Furthermore, our method does not require a\ndictionary of predefined synonyms, and it leverages common sense knowledge\nembedded in a large-scale language model (LLM). Experimental results\ndemonstrate that CLaSP enables natural language search of time series signal\ndata and can accurately learn the points at which signal data changes.\n","authors":["Aoi Ito","Kota Dohi","Yohei Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2411.08397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08384v1","updated":"2024-11-13T07:10:18Z","published":"2024-11-13T07:10:18Z","title":"Interpretable Syntactic Representations Enable Hierarchical Word Vectors","summary":" The distributed representations currently used are dense and uninterpretable,\nleading to interpretations that themselves are relative, overcomplete, and hard\nto interpret. We propose a method that transforms these word vectors into\nreduced syntactic representations. The resulting representations are compact\nand interpretable allowing better visualization and comparison of the word\nvectors and we successively demonstrate that the drawn interpretations are in\nline with human judgment. The syntactic representations are then used to create\nhierarchical word vectors using an incremental learning approach similar to the\nhierarchical aspect of human learning. As these representations are drawn from\npre-trained vectors, the generation process and learning approach are\ncomputationally efficient. 
Most importantly, we find out that syntactic\nrepresentations provide a plausible interpretation of the vectors and\nsubsequent hierarchical vectors outperform the original vectors in benchmark\ntests.\n","authors":["Biraj Silwal"],"pdf_url":"https://arxiv.org/pdf/2411.08384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08378v1","updated":"2024-11-13T07:03:47Z","published":"2024-11-13T07:03:47Z","title":"Physics Informed Distillation for Diffusion Models","summary":" Diffusion models have recently emerged as a potent tool in generative\nmodeling. However, their inherent iterative nature often results in sluggish\nimage generation due to the requirement for multiple model evaluations. Recent\nprogress has unveiled the intrinsic link between diffusion models and\nProbability Flow Ordinary Differential Equations (ODEs), thus enabling us to\nconceptualize diffusion models as ODE systems. Simultaneously, Physics Informed\nNeural Networks (PINNs) have substantiated their effectiveness in solving\nintricate differential equations through implicit modeling of their solutions.\nBuilding upon these foundational insights, we introduce Physics Informed\nDistillation (PID), which employs a student model to represent the solution of\nthe ODE system corresponding to the teacher diffusion model, akin to the\nprinciples employed in PINNs. Through experiments on CIFAR 10 and ImageNet\n64x64, we observe that PID achieves performance comparable to recent\ndistillation methods. Notably, it demonstrates predictable trends concerning\nmethod-specific hyperparameters and eliminates the need for synthetic dataset\ngeneration during the distillation process. Both of which contribute to its\neasy-to-use nature as a distillation approach for Diffusion Models. 
Our code\nand pre-trained checkpoint are publicly available at:\nhttps://github.com/pantheon5100/pid_diffusion.git.\n","authors":["Joshua Tian Jin Tee","Kang Zhang","Hee Suk Yoon","Dhananjaya Nagaraja Gowda","Chanwoo Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2411.08378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.02775v4","updated":"2024-11-13T06:57:35Z","published":"2022-01-08T06:18:17Z","title":"ADI: Adversarial Dominating Inputs in Vertical Federated Learning\n Systems","summary":" Vertical federated learning (VFL) system has recently become prominent as a\nconcept to process data distributed across many individual sources without the\nneed to centralize it. Multiple participants collaboratively train models based\non their local data in a privacy-aware manner. To date, VFL has become a de\nfacto solution to securely learn a model among organizations, allowing\nknowledge to be shared without compromising privacy of any individuals. Despite\nthe prosperous development of VFL systems, we find that certain inputs of a\nparticipant, named adversarial dominating inputs (ADIs), can dominate the joint\ninference towards the direction of the adversary's will and force other\n(victim) participants to make negligible contributions, losing rewards that are\nusually offered regarding the importance of their contributions in federated\nlearning scenarios. We conduct a systematic study on ADIs by first proving\ntheir existence in typical VFL systems. We then propose gradient-based methods\nto synthesize ADIs of various formats and exploit common VFL systems. We\nfurther launch greybox fuzz testing, guided by the saliency score of ``victim''\nparticipants, to perturb adversary-controlled inputs and systematically explore\nthe VFL attack surface in a privacy-preserving manner. We conduct an in-depth\nstudy on the influence of critical parameters and settings in synthesizing\nADIs. 
Our study reveals new VFL attack opportunities, promoting the\nidentification of unknown threats before breaches and building more secure VFL\nsystems.\n","authors":["Qi Pang","Yuanyuan Yuan","Shuai Wang","Wenting Zheng"],"pdf_url":"https://arxiv.org/pdf/2201.02775v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08374v1","updated":"2024-11-13T06:54:05Z","published":"2024-11-13T06:54:05Z","title":"Federated Graph Learning with Graphless Clients","summary":" Federated Graph Learning (FGL) is tasked with training machine learning\nmodels, such as Graph Neural Networks (GNNs), for multiple clients, each with\nits own graph data. Existing methods usually assume that each client has both\nnode features and graph structure of its graph data. In real-world scenarios,\nhowever, there exist federated systems where only a part of the clients have\nsuch data while other clients (i.e. graphless clients) may only have node\nfeatures. This naturally leads to a novel problem in FGL: how to jointly train\na model over distributed graph data with graphless clients? In this paper, we\npropose a novel framework FedGLS to tackle the problem in FGL with graphless\nclients. In FedGLS, we devise a local graph learner on each graphless client\nwhich learns the local graph structure with the structure knowledge transferred\nfrom other clients. To enable structure knowledge transfer, we design a GNN\nmodel and a feature encoder on each client. During local training, the feature\nencoder retains the local graph structure knowledge together with the GNN model\nvia knowledge distillation, and the structure knowledge is transferred among\nclients in global update. 
Our extensive experiments demonstrate the superiority\nof the proposed FedGLS over five baselines.\n","authors":["Xingbo Fu","Song Wang","Yushun Dong","Binchi Zhang","Chen Chen","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2411.08374v1.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2410.21564v2","updated":"2024-11-13T06:35:53Z","published":"2024-10-28T21:54:44Z","title":"Mitigating Gradient Overlap in Deep Residual Networks with Gradient\n Normalization for Improved Non-Convex Optimization","summary":" In deep learning, Residual Networks (ResNets) have proven effective in\naddressing the vanishing gradient problem, allowing for the successful training\nof very deep networks. However, skip connections in ResNets can lead to\ngradient overlap, where gradients from both the learned transformation and the\nskip connection combine, potentially resulting in overestimated gradients. This\noverestimation can cause inefficiencies in optimization, as some updates may\novershoot optimal regions, affecting weight updates. To address this, we\nexamine Z-score Normalization (ZNorm) as a technique to manage gradient\noverlap. ZNorm adjusts the gradient scale, standardizing gradients across\nlayers and reducing the negative impact of overlapping gradients. Our\nexperiments demonstrate that ZNorm improves training process, especially in\nnon-convex optimization scenarios common in deep learning, where finding\noptimal solutions is challenging. 
These findings suggest that ZNorm can affect\nthe gradient flow, enhancing performance in large-scale data processing where\naccuracy is critical.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2410.21564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07934v2","updated":"2024-11-13T06:34:07Z","published":"2024-11-12T17:04:56Z","title":"Doubly Mild Generalization for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) suffers from the extrapolation error and\nvalue overestimation. From a generalization perspective, this issue can be\nattributed to the over-generalization of value functions or policies towards\nout-of-distribution (OOD) actions. Significant efforts have been devoted to\nmitigating such generalization, and recent in-sample learning approaches have\nfurther succeeded in entirely eschewing it. Nevertheless, we show that mild\ngeneralization beyond the dataset can be trusted and leveraged to improve\nperformance under certain conditions. To appropriately exploit generalization\nin offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild\naction generalization and (ii) mild generalization propagation. The former\nrefers to selecting actions in a close neighborhood of the dataset to maximize\nthe Q values. Even so, the potential erroneous generalization can still be\npropagated, accumulated, and exacerbated by bootstrapping. In light of this,\nthe latter concept is introduced to mitigate the generalization propagation\nwithout impeding the propagation of RL learning signals. Theoretically, DMG\nguarantees better performance than the in-sample optimal policy in the oracle\ngeneralization scenario. Even under worst-case generalization, DMG can still\ncontrol value overestimation at a certain level and lower bound the\nperformance. Empirically, DMG achieves state-of-the-art performance across\nGym-MuJoCo locomotion tasks and challenging AntMaze tasks. 
Moreover, benefiting\nfrom its flexibility in both generalization aspects, DMG enjoys a seamless\ntransition from offline to online learning and attains strong online\nfine-tuning performance.\n","authors":["Yixiu Mao","Qi Wang","Yun Qu","Yuhang Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2411.07934v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08367v1","updated":"2024-11-13T06:32:17Z","published":"2024-11-13T06:32:17Z","title":"Surprisingly Popular Voting for Concentric Rank-Order Models","summary":" An important problem on social information sites is the recovery of ground\ntruth from individual reports when the experts are in the minority. The wisdom\nof the crowd, i.e. the collective opinion of a group of individuals fails in\nsuch a scenario. However, the surprisingly popular (SP)\nalgorithm~\\cite{prelec2017solution} can recover the ground truth even when the\nexperts are in the minority, by asking the individuals to report additional\nprediction reports--their beliefs about the reports of others. Several recent\nworks have extended the surprisingly popular algorithm to an equivalent voting\nrule (SP-voting) to recover the ground truth ranking over a set of $m$\nalternatives. However, we are yet to fully understand when SP-voting can\nrecover the ground truth ranking, and if so, how many samples (votes and\npredictions) it needs. We answer this question by proposing two rank-order\nmodels and analyzing the sample complexity of SP-voting under these models. In\nparticular, we propose concentric mixtures of Mallows and Plackett-Luce models\nwith $G (\\ge 2)$ groups. Our models generalize previously proposed concentric\nmixtures of Mallows models with $2$ groups, and we highlight the importance of\n$G > 2$ groups by identifying three distinct groups (expert, intermediate, and\nnon-expert) from existing datasets. 
Next, we provide conditions on the\nparameters of the underlying models so that SP-voting can recover ground-truth\nrankings with high probability, and also derive sample complexities under the\nsame. We complement the theoretical results by evaluating SP-voting on\nsimulated and real datasets.\n","authors":["Hadi Hosseini","Debmalya Mandal","Amrit Puhan"],"pdf_url":"https://arxiv.org/pdf/2411.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08360v1","updated":"2024-11-13T06:16:12Z","published":"2024-11-13T06:16:12Z","title":"Coverage Analysis for Digital Cousin Selection -- Improving\n Multi-Environment Q-Learning","summary":" Q-learning is widely employed for optimizing various large-dimensional\nnetworks with unknown system dynamics. Recent advancements include\nmulti-environment mixed Q-learning (MEMQ) algorithms, which utilize multiple\nindependent Q-learning algorithms across multiple, structurally related but\ndistinct environments and outperform several state-of-the-art Q-learning\nalgorithms in terms of accuracy, complexity, and robustness. We herein conduct\na comprehensive probabilistic coverage analysis to ensure optimal data coverage\nconditions for MEMQ algorithms. First, we derive upper and lower bounds on the\nexpectation and variance of different coverage coefficients (CC) for MEMQ\nalgorithms. Leveraging these bounds, we develop a simple way of comparing the\nutilities of multiple environments in MEMQ algorithms. This approach appears to\nbe near optimal versus our previously proposed partial ordering approach. We\nalso present a novel CC-based MEMQ algorithm to improve the accuracy and\ncomplexity of existing MEMQ algorithms. Numerical experiments are conducted\nusing random network graphs with four different graph properties. Our algorithm\ncan reduce the average policy error (APE) by 65% compared to partial ordering\nand is 95% faster than the exhaustive search. 
It also achieves 60% less APE\nthan several state-of-the-art reinforcement learning and prior MEMQ algorithms.\nAdditionally, we numerically verify the theoretical results and show their\nscalability with the action-space size.\n","authors":["Talha Bozkus","Tara Javidi","Urbashi Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.08360v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2409.18696v2","updated":"2024-11-13T06:01:51Z","published":"2024-09-27T12:34:08Z","title":"Rethinking the Power of Timestamps for Robust Time Series Forecasting: A\n Global-Local Fusion Perspective","summary":" Time series forecasting has played a pivotal role across various industries,\nincluding finance, transportation, energy, healthcare, and climate. Due to the\nabundant seasonal information they contain, timestamps possess the potential to\noffer robust global guidance for forecasting techniques. However, existing\nworks primarily focus on local observations, with timestamps being treated\nmerely as an optional supplement that remains underutilized. When data gathered\nfrom the real world is polluted, the absence of global information will damage\nthe robust prediction capability of these algorithms. To address these\nproblems, we propose a novel framework named GLAFF. Within this framework, the\ntimestamps are modeled individually to capture the global dependencies. Working\nas a plugin, GLAFF adaptively adjusts the combined weights for global and local\ninformation, enabling seamless collaboration with any time series forecasting\nbackbone. 
Extensive experiments conducted on nine real-world datasets\ndemonstrate that GLAFF significantly enhances the average performance of widely\nused mainstream forecasting models by 12.5%, surpassing the previous\nstate-of-the-art method by 5.5%.\n","authors":["Chengsen Wang","Qi Qi","Jingyu Wang","Haifeng Sun","Zirui Zhuang","Jinming Wu","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2409.18696v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08355v1","updated":"2024-11-13T05:59:04Z","published":"2024-11-13T05:59:04Z","title":"Communication Efficient Decentralization for Smoothed Online Convex\n Optimization","summary":" We study the multi-agent Smoothed Online Convex Optimization (SOCO) problem,\nwhere $N$ agents interact through a communication graph. In each round, each\nagent $i$ receives a strongly convex hitting cost function $f^i_t$ in an online\nfashion and selects an action $x^i_t \\in \\mathbb{R}^d$. The objective is to\nminimize the global cumulative cost, which includes the sum of individual\nhitting costs $f^i_t(x^i_t)$, a temporal \"switching cost\" for changing\ndecisions, and a spatial \"dissimilarity cost\" that penalizes deviations in\ndecisions among neighboring agents. We propose the first decentralized\nalgorithm for multi-agent SOCO and prove its asymptotic optimality. Our\napproach allows each agent to operate using only local information from its\nimmediate neighbors in the graph. For finite-time performance, we establish\nthat the optimality gap in competitive ratio decreases with the time horizon\n$T$ and can be conveniently tuned based on the per-round computation available\nto each agent. Moreover, our results hold even when the communication graph\nchanges arbitrarily and adaptively over time. 
Finally, we establish that the\ncomputational complexity per round depends only logarithmically on the number\nof agents and almost linearly on their degree within the graph, ensuring\nscalability for large-system implementations.\n","authors":["Neelkamal Bhuyan","Debankur Mukherjee","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2411.08355v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2411.04493v2","updated":"2024-11-13T05:52:23Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. 
Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08344v1","updated":"2024-11-13T05:22:45Z","published":"2024-11-13T05:22:45Z","title":"Bangla Grammatical Error Detection Leveraging Transformer-based Token\n Classification","summary":" Bangla is the seventh most spoken language by a total number of speakers in\nthe world, and yet the development of an automated grammar checker in this\nlanguage is an understudied problem. Bangla grammatical error detection is a\ntask of detecting sub-strings of a Bangla text that contain grammatical,\npunctuation, or spelling errors, which is crucial for developing an automated\nBangla typing assistant. Our approach involves breaking down the task as a\ntoken classification problem and utilizing state-of-the-art transformer-based\nmodels. Finally, we combine the output of these models and apply rule-based\npost-processing to generate a more reliable and comprehensive result. Our\nsystem is evaluated on a dataset consisting of over 25,000 texts from various\nsources. Our best model achieves a Levenshtein distance score of 1.04. 
Finally,\nwe provide a detailed analysis of different components of our system.\n","authors":["Shayekh Bin Islam","Ridwanul Hasan Tanvir","Sihat Afnan"],"pdf_url":"https://arxiv.org/pdf/2411.08344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03497v3","updated":"2024-11-13T04:41:31Z","published":"2024-08-07T01:37:10Z","title":"Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and\n Tabnet with SMOTEENN","summary":" Bank credit risk is a significant challenge in modern financial transactions,\nand the ability to identify qualified credit card holders among a large number\nof applicants is crucial for the profitability of a bank'sbank's credit card\nbusiness. In the past, screening applicants'applicants' conditions often\nrequired a significant amount of manual labor, which was time-consuming and\nlabor-intensive. Although the accuracy and reliability of previously used ML\nmodels have been continuously improving, the pursuit of more reliable and\npowerful AI intelligent models is undoubtedly the unremitting pursuit by major\nbanks in the financial industry. In this study, we used a dataset of over\n40,000 records provided by a commercial bank as the research object. We\ncompared various dimensionality reduction techniques such as PCA and T-SNE for\npreprocessing high-dimensional datasets and performed in-depth adaptation and\ntuning of distributed models such as LightGBM and XGBoost, as well as deep\nmodels like Tabnet. After a series of research and processing, we obtained\nexcellent research results by combining SMOTEENN with these techniques. 
The\nexperiments demonstrated that LightGBM combined with PCA and SMOTEENN\ntechniques can assist banks in accurately predicting potential high-quality\ncustomers, showing relatively outstanding performance compared to other models.\n","authors":["Chang Yu","Yixin Jin","Qianwen Xing","Ye Zhang","Shaobo Guo","Shuchen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.03497v3.pdf","comment":"8 pagess on IEEE ICPICS"},{"id":"http://arxiv.org/abs/2411.08332v1","updated":"2024-11-13T04:27:25Z","published":"2024-11-13T04:27:25Z","title":"Learning-Augmented Algorithms for Online Concave Packing and Convex\n Covering Problems","summary":" Learning-augmented algorithms have been extensively studied across the\ncomputer science community in the recent years, driven by advances in machine\nlearning predictors, which can provide additional information to augment\nclassical algorithms. Such predictions are especially powerful in the context\nof online problems, where decisions have to be made without knowledge of the\nfuture, and which traditionally exhibits impossibility results bounding the\nperformance of any online algorithm. The study of learning-augmented algorithms\nthus aims to use external advice prudently, to overcome classical impossibility\nresults when the advice is accurate, and still perform comparably to the\nstate-of-the-art online algorithms even when the advice is inaccurate.\n In this paper, we present learning-augmented algorithmic frameworks for two\nfundamental optimizations settings, extending and generalizing prior works. For\nonline packing with concave objectives, we present a simple but overarching\nstrategy that switches between the advice and the state-of-the-art online\nalgorithm. For online covering with convex objectives, we greatly extend\nprimal-dual methods for online convex covering programs by Azar et al. (FOCS\n2016) and previous learning-augmented framework for online covering linear\nprograms from the literature, to many new applications. 
We show that our\nalgorithms break impossibility results when the advice is accurate, while\nmaintaining comparable performance with state-of-the-art classical online\nalgorithms even when the advice is erroneous.\n","authors":["Elena Grigorescu","Young-San Lin","Maoyuan Song"],"pdf_url":"https://arxiv.org/pdf/2411.08332v1.pdf","comment":"38 pages. In submission"},{"id":"http://arxiv.org/abs/2411.08326v1","updated":"2024-11-13T04:20:29Z","published":"2024-11-13T04:20:29Z","title":"Neural Conjugate Flows: Physics-informed architectures with flow\n structure","summary":" We introduce Neural Conjugate Flows (NCF), a class of neural network\narchitectures equipped with exact flow structure. By leveraging topological\nconjugation, we prove that these networks are not only naturally isomorphic to\na continuous group, but are also universal approximators for flows of ordinary\ndifferential equation (ODEs). Furthermore, topological properties of these\nflows can be enforced by the architecture in an interpretable manner. We\ndemonstrate in numerical experiments how this topological group structure leads\nto concrete computational gains over other physics informed neural networks in\nestimating and extrapolating latent dynamics of ODEs, while training up to five\ntimes faster than other flow-based architectures.\n","authors":["Arthur Bizzi","Lucas Nissenbaum","João M. Pereira"],"pdf_url":"https://arxiv.org/pdf/2411.08326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08324v1","updated":"2024-11-13T04:20:20Z","published":"2024-11-13T04:20:20Z","title":"Are LLMs Prescient? A Continuous Evaluation using Daily News as the\n Oracle","summary":" Many existing evaluation benchmarks for Large Language Models (LLMs) quickly\nbecome outdated due to the emergence of new models and training data. These\nbenchmarks also fall short in assessing how LLM performance changes over time,\nas they consist of static questions without a temporal dimension. 
To address\nthese limitations, we propose using future event prediction as a continuous\nevaluation method to assess LLMs' temporal generalization and forecasting\nabilities. Our benchmark, Daily Oracle, automatically generates question-answer\n(QA) pairs from daily news, challenging LLMs to predict \"future\" event\noutcomes. Our findings reveal that as pre-training data becomes outdated, LLM\nperformance degrades over time. While Retrieval Augmented Generation (RAG) has\nthe potential to enhance prediction accuracy, the performance degradation\npattern persists, highlighting the need for continuous model updates.\n","authors":["Hui Dai","Ryan Teehan","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2411.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07207v2","updated":"2024-11-13T04:15:38Z","published":"2024-11-11T18:32:44Z","title":"General Geospatial Inference with a Population Dynamics Foundation Model","summary":" Supporting the health and well-being of dynamic populations around the world\nrequires governmental agencies, organizations and researchers to understand and\nreason over complex relationships between human behavior and local contexts in\norder to identify high-risk groups and strategically allocate limited\nresources. Traditional approaches to these classes of problems often entail\ndeveloping manually curated, task-specific features and models to represent\nhuman behavior and the natural and built environment, which can be challenging\nto adapt to new, or even, related tasks. To address this, we introduce a\nPopulation Dynamics Foundation Model (PDFM) that aims to capture the\nrelationships between diverse data modalities and is applicable to a broad\nrange of geospatial tasks. 
We first construct a geo-indexed dataset for postal\ncodes and counties across the United States, capturing rich aggregated\ninformation on human behavior from maps, busyness, and aggregated search\ntrends, and environmental factors such as weather and air quality. We then\nmodel this data and the complex relationships between locations using a graph\nneural network, producing embeddings that can be adapted to a wide range of\ndownstream tasks using relatively simple models. We evaluate the effectiveness\nof our approach by benchmarking it on 27 downstream tasks spanning three\ndistinct domains: health indicators, socioeconomic factors, and environmental\nmeasurements. The approach achieves state-of-the-art performance on all 27\ngeospatial interpolation tasks, and on 25 out of the 27 extrapolation and\nsuper-resolution tasks. We combined the PDFM with a state-of-the-art\nforecasting foundation model, TimesFM, to predict unemployment and poverty,\nachieving performance that surpasses fully supervised forecasting. 
The full set\nof embeddings and sample code are publicly available for researchers.\n","authors":["Mohit Agarwal","Mimi Sun","Chaitanya Kamath","Arbaaz Muslim","Prithul Sarker","Joydeep Paul","Hector Yee","Marcin Sieniek","Kim Jablonski","Yael Mayer","David Fork","Sheila de Guia","Jamie McPike","Adam Boulanger","Tomer Shekel","David Schottlander","Yao Xiao","Manjit Chakravarthy Manukonda","Yun Liu","Neslihan Bulut","Sami Abu-el-haija","Arno Eigenwillig","Parth Kothari","Bryan Perozzi","Monica Bharel","Von Nguyen","Luke Barrington","Niv Efron","Yossi Matias","Greg Corrado","Krish Eswaran","Shruthi Prabhakara","Shravya Shetty","Gautam Prasad"],"pdf_url":"https://arxiv.org/pdf/2411.07207v2.pdf","comment":"28 pages, 16 figures, preprint; v2: updated github url"},{"id":"http://arxiv.org/abs/2411.08314v1","updated":"2024-11-13T03:42:55Z","published":"2024-11-13T03:42:55Z","title":"Conditional Variable Flow Matching: Transforming Conditional Densities\n with Amortized Conditional Optimal Transport","summary":" Forecasting stochastic nonlinear dynamical systems under the influence of\nconditioning variables is a fundamental challenge repeatedly encountered across\nthe biological and physical sciences. While flow-based models can impressively\npredict the temporal evolution of probability distributions representing\npossible outcomes of a specific process, existing frameworks cannot\nsatisfactorily account for the impact of conditioning variables on these\ndynamics. Amongst several limitations, existing methods require training data\nwith paired conditions and are developed for discrete conditioning variables.\nWe propose Conditional Variable Flow Matching (CVFM), a framework for learning\nflows transforming conditional distributions with amortization across\ncontinuous conditioning variables - permitting predictions across the\nconditional density manifold. 
This is accomplished through several novel\nadvances, in particular, simultaneous sample conditioned flows over the main\nand conditioning variables, alongside a conditional Wasserstein distance and\nkernel facilitating conditional optimal transport. Collectively, these advances\nallow for learning system dynamics provided measurement data whose states and\nconditioning variables are not in correspondence. We demonstrate CVFM on a\nsuite of increasingly challenging problems, including discrete and continuous\nconditional mapping benchmarks, image-to-image domain transfer, and modeling\nthe temporal evolution of materials internal structure during manufacturing\nprocesses. We observe that CVFM results in improved performance and convergence\ncharacteristics over alternative conditional variants.\n","authors":["Adam P. Generale","Andreas E. Robertson","Surya R. Kalidindi"],"pdf_url":"https://arxiv.org/pdf/2411.08314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08621v3","updated":"2024-11-13T03:24:24Z","published":"2024-02-13T17:42:27Z","title":"A Unified Framework for Analyzing Meta-algorithms in Online Convex\n Optimization","summary":" In this paper, we analyze the problem of online convex optimization in\ndifferent settings, including different feedback types\n(full-information/semi-bandit/bandit/etc) in either stochastic or\nnon-stochastic setting and different notions of regret (static adversarial\nregret/dynamic regret/adaptive regret). This is done through a framework which\nallows us to systematically propose and analyze meta-algorithms for the various\nsettings described above. We show that any algorithm for online linear\noptimization with fully adaptive adversaries is an algorithm for online convex\noptimization. We also show that any such algorithm that requires\nfull-information feedback may be transformed to an algorithm with semi-bandit\nfeedback with comparable regret bound. 
We further show that algorithms that are\ndesigned for fully adaptive adversaries using deterministic semi-bandit\nfeedback can obtain similar bounds using only stochastic semi-bandit feedback\nwhen facing oblivious adversaries. We use this to describe general\nmeta-algorithms to convert first order algorithms to zeroth order algorithms\nwith comparable regret bounds. Our framework allows us to analyze online\noptimization in various settings, recovers several results in the literature\nwith a simplified proof technique, and provides new results.\n","authors":["Mohammad Pedramfar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2402.08621v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08306v1","updated":"2024-11-13T03:08:33Z","published":"2024-11-13T03:08:33Z","title":"SDDBench: A Benchmark for Synthesizable Drug Design","summary":" A significant challenge in wet lab experiments with current drug design\ngenerative models is the trade-off between pharmacological properties and\nsynthesizability. Molecules predicted to have highly desirable properties are\noften difficult to synthesize, while those that are easily synthesizable tend\nto exhibit less favorable properties. As a result, evaluating the\nsynthesizability of molecules in general drug design scenarios remains a\nsignificant challenge in the field of drug discovery. The commonly used\nsynthetic accessibility (SA) score aims to evaluate the ease of synthesizing\ngenerated molecules, but it falls short of guaranteeing that synthetic routes\ncan actually be found. Inspired by recent advances in top-down synthetic route\ngeneration, we propose a new, data-driven metric to evaluate molecule\nsynthesizability. Our approach directly assesses the feasibility of synthetic\nroutes for a given molecule through our proposed round-trip score. 
This novel\nmetric leverages the synergistic duality between retrosynthetic planners and\nreaction predictors, both of which are trained on extensive reaction datasets.\nTo demonstrate the efficacy of our method, we conduct a comprehensive\nevaluation of round-trip scores alongside search success rate across a range of\nrepresentative molecule generative models. Code is available at\nhttps://github.com/SongtaoLiu0823/SDDBench.\n","authors":["Songtao Liu","Zhengkai Tu","Hanjun Dai","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07468v2","updated":"2024-11-13T03:07:36Z","published":"2024-11-12T01:09:52Z","title":"Privacy-Preserving Verifiable Neural Network Inference Service","summary":" Machine learning has revolutionized data analysis and pattern recognition,\nbut its resource-intensive training has limited accessibility. Machine Learning\nas a Service (MLaaS) simplifies this by enabling users to delegate their data\nsamples to an MLaaS provider and obtain the inference result using a\npre-trained model. Despite its convenience, leveraging MLaaS poses significant\nprivacy and reliability concerns to the client. Specifically, sensitive\ninformation from the client inquiry data can be leaked to an adversarial MLaaS\nprovider. Meanwhile, the lack of a verifiability guarantee can potentially\nresult in biased inference results or even unfair payment issues. While\nexisting trustworthy machine learning techniques, such as those relying on\nverifiable computation or secure computation, offer solutions to privacy and\nreliability concerns, they fall short of simultaneously protecting the privacy\nof client data and providing provable inference verifiability.\n In this paper, we propose vPIN, a privacy-preserving and verifiable CNN\ninference scheme that preserves privacy for client data samples while ensuring\nverifiability for the inference. 
vPIN makes use of partial homomorphic\nencryption and commit-and-prove succinct non-interactive argument of knowledge\ntechniques to achieve desirable security properties. In vPIN, we develop\nvarious optimization techniques to minimize the proving circuit for homomorphic\ninference evaluation thereby, improving the efficiency and performance of our\ntechnique. We fully implemented and evaluated our vPIN scheme on standard\ndatasets (e.g., MNIST, CIFAR-10). Our experimental results show that vPIN\nachieves high efficiency in terms of proving time, verification time, and proof\nsize, while providing client data privacy guarantees and provable\nverifiability.\n","authors":["Arman Riasi","Jorge Guajardo","Thang Hoang"],"pdf_url":"https://arxiv.org/pdf/2411.07468v2.pdf","comment":"Accepted at the Annual Computer Security Applications Conference\n (ACSAC) 2024. Source code: github.com/vt-asaplab/vPIN"},{"id":"http://arxiv.org/abs/2411.07954v2","updated":"2024-11-13T02:56:56Z","published":"2024-11-12T17:30:31Z","title":"Learning Memory Mechanisms for Decision Making through Demonstrations","summary":" In Partially Observable Markov Decision Processes, integrating an agent's\nhistory into memory poses a significant challenge for decision-making.\nTraditional imitation learning, relying on observation-action pairs for expert\ndemonstrations, fails to capture the expert's memory mechanisms used in\ndecision-making. To capture memory processes as demonstrations, we introduce\nthe concept of memory dependency pairs $(p, q)$ indicating that events at time\n$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner\nto leverage memory dependency pairs in Transformers and find significant\nimprovements across several tasks compared to standard Transformers when\nevaluated on Memory Gym and the Long-term Memory Benchmark. 
Code is available\nat https://github.com/WilliamYue37/AttentionTuner.\n","authors":["William Yue","Bo Liu","Peter Stone"],"pdf_url":"https://arxiv.org/pdf/2411.07954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v2","updated":"2024-11-13T02:39:12Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. 
The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at:\n\\url{https://github.com/chikap421/mseg_vcuq}\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v2.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.07249v2","updated":"2024-11-13T02:38:02Z","published":"2024-10-26T21:27:53Z","title":"SPDIM: Source-Free Unsupervised Conditional and Label Shift Adaptation\n in EEG","summary":" The non-stationary nature of electroencephalography (EEG) introduces\ndistribution shifts across domains (e.g., days and subjects), posing a\nsignificant challenge to EEG-based neurotechnology generalization. Without\nlabeled calibration data for target domains, the problem is a source-free\nunsupervised domain adaptation (SFUDA) problem. For scenarios with constant\nlabel distribution, Riemannian geometry-aware statistical alignment frameworks\non the symmetric positive definite (SPD) manifold are considered\nstate-of-the-art. However, many practical scenarios, including EEG-based sleep\nstaging, exhibit label shifts. Here, we propose a geometric deep learning\nframework for SFUDA problems under specific distribution shifts, including\nlabel shifts. We introduce a novel, realistic generative model and show that\nprior Riemannian statistical alignment methods on the SPD manifold can\ncompensate for specific marginal and conditional distribution shifts but hurt\ngeneralization under label shifts. 
As a remedy, we propose a\nparameter-efficient manifold optimization strategy termed SPDIM. SPDIM uses the\ninformation maximization principle to learn a single SPD-manifold-constrained\nparameter per target domain. In simulations, we demonstrate that SPDIM can\ncompensate for the shifts under our generative model. Moreover, using public\nEEG-based brain-computer interface and sleep staging datasets, we show that\nSPDIM outperforms prior approaches.\n","authors":["Shanglin Li","Motoaki Kawanabe","Reinmar J. Kobler"],"pdf_url":"https://arxiv.org/pdf/2411.07249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08297v1","updated":"2024-11-13T02:32:38Z","published":"2024-11-13T02:32:38Z","title":"TowerDebias: A Novel Debiasing Method based on the Tower Property","summary":" Decision-making processes have increasingly come to rely on sophisticated\nmachine learning tools, raising concerns about the fairness of their\npredictions with respect to any sensitive groups. The widespread use of\ncommercial black-box machine learning models necessitates careful consideration\nof their legal and ethical implications on consumers. In situations where users\nhave access to these \"black-box\" models, a key question emerges: how can we\nmitigate or eliminate the influence of sensitive attributes, such as race or\ngender? We propose towerDebias (tDB), a novel approach designed to reduce the\ninfluence of sensitive variables in predictions made by black-box models. Using\nthe Tower Property from probability theory, tDB aims to improve prediction\nfairness during the post-processing stage in a manner amenable to the\nFairness-Utility Tradeoff. This method is highly flexible, requiring no prior\nknowledge of the original model's internal structure, and can be extended to a\nrange of different applications. 
We provide a formal improvement theorem for\ntDB and demonstrate its effectiveness in both regression and classification\ntasks, underscoring its impact on the fairness-utility tradeoff.\n","authors":["Norman Matloff","Aditya Mittal"],"pdf_url":"https://arxiv.org/pdf/2411.08297v1.pdf","comment":"To be submitted to a journal soon"},{"id":"http://arxiv.org/abs/2411.08290v1","updated":"2024-11-13T02:17:03Z","published":"2024-11-13T02:17:03Z","title":"RESOLVE: Relational Reasoning with Symbolic and Object-Level Features\n Using Vector Symbolic Processing","summary":" Modern transformer-based encoder-decoder architectures struggle with\nreasoning tasks due to their inability to effectively extract relational\ninformation between input objects (data/tokens). Recent work introduced the\nAbstractor module, embedded between transformer layers, to address this gap.\nHowever, the Abstractor layer while excelling at capturing relational\ninformation (pure relational reasoning), faces challenges in tasks that require\nboth object and relational-level reasoning (partial relational reasoning). To\naddress this, we propose RESOLVE, a neuro-vector symbolic architecture that\ncombines object-level features with relational representations in\nhigh-dimensional spaces, using fast and efficient operations such as bundling\n(summation) and binding (Hadamard product) allowing both object-level features\nand relational representations to coexist within the same structure without\ninterfering with one another. RESOLVE is driven by a novel attention mechanism\nthat operates in a bipolar high dimensional space, allowing fast attention\nscore computation compared to the state-of-the-art. By leveraging this design,\nthe model achieves both low compute latency and memory efficiency. 
RESOLVE also\noffers better generalizability while achieving higher accuracy in purely\nrelational reasoning tasks such as sorting as well as partial relational\nreasoning tasks such as math problem-solving compared to state-of-the-art\nmethods.\n","authors":["Mohamed Mejri","Chandramouli Amarnath","Abhijit Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2411.08290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02349v2","updated":"2024-11-13T02:03:01Z","published":"2023-11-04T08:28:33Z","title":"Sample Complexity of Opinion Formation on Networks with Linear\n Regression Models","summary":" Consider public health officials aiming to spread awareness about a new\nvaccine in a community interconnected by a social network. How can they\ndistribute information with minimal resources, so as to avoid polarization and\nensure community-wide convergence of opinion? To tackle such challenges, we\ninitiate the study of sample complexity of opinion convergence in networks. Our\nframework is built on the recognized opinion formation game, where we regard\nthe opinion of each agent as a data-derived model, unlike previous works that\ntreat opinions as data-independent scalars. The opinion model for every agent\nis initially learned from its local samples and evolves game-theoretically as\nall agents communicate with neighbors and revise their models towards an\nequilibrium. Our focus is on the sample complexity needed to ensure that the\nopinions converge to an equilibrium such that the final model of every agent\nhas low generalization error.\n Our paper has two main technical results. First, we present a novel\npolynomial time optimization framework to quantify the total sample complexity\nfor arbitrary networks, when the underlying learning problem is (generalized)\nlinear regression. Second, we leverage this optimization to study the network\ngain which measures the improvement of sample complexity when learning over a\nnetwork compared to that in isolation. 
Towards this end, we derive network gain\nbounds for various network classes including cliques, star graphs, and random\nregular graphs. Additionally, our framework provides a method to study sample\ndistribution within the network, suggesting that it is sufficient to allocate\nsamples inversely to the degree. Empirical results on both synthetic and\nreal-world networks strongly support our theoretical findings.\n","authors":["Haolin Liu","Rajmohan Rajaraman","Ravi Sundaram","Anil Vullikanti","Omer Wasim","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.02349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08286v1","updated":"2024-11-13T02:02:52Z","published":"2024-11-13T02:02:52Z","title":"Hashing for Protein Structure Similarity Search","summary":" Protein structure similarity search (PSSS), which tries to search proteins\nwith similar structures, plays a crucial role across diverse domains from drug\ndesign to protein function prediction and molecular evolution. Traditional\nalignment-based PSSS methods, which directly calculate alignment on the protein\nstructures, are highly time-consuming with high memory cost. Recently,\nalignment-free methods, which represent protein structures as fixed-length\nreal-valued vectors, are proposed for PSSS. Although these methods have lower\ntime and memory cost than alignment-based methods, their time and memory cost\nis still too high for large-scale PSSS, and their accuracy is unsatisfactory.\nIn this paper, we propose a novel method, called\n$\\underline{\\text{p}}$r$\\underline{\\text{o}}$tein\n$\\underline{\\text{s}}$tructure $\\underline{\\text{h}}$ashing (POSH), for PSSS.\nPOSH learns a binary vector representation for each protein structure, which\ncan dramatically reduce the time and memory cost for PSSS compared with\nreal-valued vector representation based methods. 
Furthermore, in POSH we also\npropose expressive hand-crafted features and a structure encoder to well model\nboth node and edge interactions in proteins. Experimental results on real\ndatasets show that POSH can outperform other methods to achieve\nstate-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more\nthan six times and speed improvement of more than four times, compared with\nother methods.\n","authors":["Jin Han","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2411.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12723v4","updated":"2024-11-13T01:45:11Z","published":"2024-06-18T15:45:21Z","title":"BIOSCAN-5M: A Multimodal Dataset for Insect Biodiversity","summary":" As part of an ongoing worldwide effort to comprehend and monitor insect\nbiodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine\nlearning community and establish several benchmark tasks. BIOSCAN-5M is a\ncomprehensive dataset containing multi-modal information for over 5 million\ninsect specimens, and it significantly expands existing image-based biological\ndatasets by including taxonomic labels, raw nucleotide barcode sequences,\nassigned barcode index numbers, geographical, and size information. We propose\nthree benchmark experiments to demonstrate the impact of the multi-modal data\ntypes on the classification and clustering accuracy. First, we pretrain a\nmasked language model on the DNA barcode sequences of the BIOSCAN-5M dataset,\nand demonstrate the impact of using this large reference library on species-\nand genus-level classification performance. Second, we propose a zero-shot\ntransfer learning task applied to images and DNA barcodes to cluster feature\nembeddings obtained from self-supervised learning, to investigate whether\nmeaningful clusters can be derived from these representation embeddings. Third,\nwe benchmark multi-modality by performing contrastive learning on DNA barcodes,\nimage data, and taxonomic information. 
This yields a general shared embedding\nspace enabling taxonomic classification using multiple types of information and\nmodalities. The code repository of the BIOSCAN-5M Insect dataset is available\nat https://github.com/bioscan-ml/BIOSCAN-5M.\n","authors":["Zahra Gharaee","Scott C. Lowe","ZeMing Gong","Pablo Millan Arias","Nicholas Pellegrino","Austin T. Wang","Joakim Bruslund Haurum","Iuliia Zarubiieva","Lila Kari","Dirk Steinke","Graham W. Taylor","Paul Fieguth","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2406.12723v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. 
The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14798v2","updated":"2024-11-13T01:36:33Z","published":"2024-06-21T00:16:55Z","title":"Probabilistic Emulation of a Global Climate Model with Spherical\n DYffusion","summary":" Data-driven deep learning models are transforming global weather forecasting.\nIt is an open question if this success can extend to climate modeling, where\nthe complexity of the data and long inference rollouts pose significant\nchallenges. Here, we present the first conditional generative model that\nproduces accurate and physically consistent global climate ensemble simulations\nby emulating a coarse version of the United States' primary operational global\nforecast model, FV3GFS. Our model integrates the dynamics-informed diffusion\nframework (DYffusion) with the Spherical Fourier Neural Operator (SFNO)\narchitecture, enabling stable 100-year simulations at 6-hourly timesteps while\nmaintaining low computational overhead compared to single-step deterministic\nbaselines. The model achieves near gold-standard performance for climate model\nemulation, outperforming existing approaches and demonstrating promising\nensemble skill. This work represents a significant advance towards efficient,\ndata-driven climate simulations that can enhance our understanding of the\nclimate system and inform adaptation strategies.\n","authors":["Salva Rühling Cachay","Brian Henn","Oliver Watt-Meyer","Christopher S. 
Bretherton","Rose Yu"],"pdf_url":"https://arxiv.org/pdf/2406.14798v2.pdf","comment":"NeurIPS 2024; Code is available at\n https://github.com/Rose-STL-Lab/spherical-dyffusion"},{"id":"http://arxiv.org/abs/2411.08267v1","updated":"2024-11-13T00:42:40Z","published":"2024-11-13T00:42:40Z","title":"Least Squares Training of Quadratic Convolutional Neural Networks with\n Applications to System Theory","summary":" This paper provides a least squares formulation for the training of a 2-layer\nconvolutional neural network using quadratic activation functions, a 2-norm\nloss function, and no regularization term. Using this method, an analytic\nexpression for the globally optimal weights is obtained alongside a quadratic\ninput-output equation for the network. These properties make the network a\nviable tool in system theory by enabling further analysis, such as the\nsensitivity of the output to perturbations in the input, which is crucial for\nsafety-critical systems such as aircraft or autonomous vehicles.The least\nsquares method is compared to previously proposed strategies for training\nquadratic networks and to a back-propagation-trained ReLU network. The proposed\nmethod is applied to a system identification problem and a GPS position\nestimation problem. The least squares network is shown to have a significantly\nreduced training time with minimal compromises on prediction accuracy alongside\nthe advantages of having an analytic input-output equation. 
Although these\nresults only apply to 2-layer networks, this paper motivates the exploration of\ndeeper quadratic networks in the context of system theory.\n","authors":["Zachary Yetman Van Egmond","Luis Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2411.08267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09995v2","updated":"2024-11-13T00:41:01Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v2.pdf","comment":"Accepted to ECCV 2024. 
Project page:\n https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2402.17457v2","updated":"2024-11-13T00:38:48Z","published":"2024-02-27T12:28:01Z","title":"Super Consistency of Neural Network Landscapes and Learning Rate\n Transfer","summary":" Recently, there has been growing evidence that if the width and depth of a\nneural network are scaled toward the so-called rich feature learning limit\n(\\mup and its depth extension), then some hyperparameters -- such as the\nlearning rate -- exhibit transfer from small to very large models. From an\noptimization perspective, this phenomenon is puzzling, as it implies that the\nloss landscape is consistently similar across very different model sizes. In\nthis work, we study the landscape through the lens of the loss Hessian, with a\nfocus on its largest eigenvalue (i.e. the sharpness), and find that certain\nspectral properties under $\\mu$P are largely independent of the size of the\nnetwork, and remain consistent as training progresses. We name this property\nSuper Consistency of the landscape. On the other hand, we show that in the\nNeural Tangent Kernel (NTK) and other scaling regimes, the sharpness exhibits\nvery different dynamics at different scales. But what causes these differences\nin the sharpness dynamics? Through a connection between the Hessian's and the\nNTK's spectrum, we argue that the cause lies in the presence (for $\\mu$P) or\nprogressive absence (for the NTK scaling) of feature learning. We corroborate\nour claims with a substantial suite of experiments, covering a wide range of\ndatasets and architectures: from ResNets and Vision Transformers trained on\nbenchmark vision datasets to Transformers-based language models trained on\nWikiText.\n","authors":["Lorenzo Noci","Alexandru Meterez","Thomas Hofmann","Antonio Orvieto"],"pdf_url":"https://arxiv.org/pdf/2402.17457v2.pdf","comment":"The paper has been accepted at Neurips 2024. 
This is a revised\n version of the paper previously titled \"Why do Learning Rates Transfer?\n Reconciling Optimization and Scaling Limits for Deep Learning\""},{"id":"http://arxiv.org/abs/2410.01272v2","updated":"2024-11-13T00:19:34Z","published":"2024-10-02T06:30:49Z","title":"\"No Matter What You Do\": Purifying GNN Models via Backdoor Unlearning","summary":" Recent studies have exposed that GNNs are vulnerable to several adversarial\nattacks, among which backdoor attack is one of the toughest. Similar to Deep\nNeural Networks (DNNs), backdoor attacks in GNNs lie in the fact that the\nattacker modifies a portion of graph data by embedding triggers and enforces\nthe model to learn the trigger feature during the model training process.\nDespite the massive prior backdoor defense works on DNNs, defending against\nbackdoor attacks in GNNs is largely unexplored, severely hindering the\nwidespread application of GNNs in real-world tasks. To bridge this gap, we\npresent GCleaner, the first backdoor mitigation method on GNNs. GCleaner can\nmitigate the presence of the backdoor logic within backdoored GNNs by reversing\nthe backdoor learning procedure, aiming to restore the model performance to a\nlevel similar to that is directly trained on the original clean dataset. To\nachieve this objective, we ask: How to recover universal and hard backdoor\ntriggers in GNNs? How to unlearn the backdoor trigger feature while maintaining\nthe model performance? We conduct the graph trigger recovery via the\nexplanation method to identify optimal trigger locations, facilitating the\nsearch of universal and hard backdoor triggers in the feature space of the\nbackdoored model through maximal similarity. 
Subsequently, we introduce the\nbackdoor unlearning mechanism, which combines knowledge distillation and\ngradient-based explainable knowledge for fine-grained backdoor erasure.\nExtensive experimental evaluations on four benchmark datasets demonstrate that\nGCleaner can reduce the backdoor attack success rate to 10% with only 1% of\nclean data, and has almost negligible degradation in model performance, which\nfar outperforms the state-of-the-art (SOTA) defense methods.\n","authors":["Jiale Zhang","Chengcheng Zhu","Bosen Rao","Hao Sui","Xiaobing Sun","Bing Chen","Chunyi Zhou","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2410.01272v2.pdf","comment":"18 pages, 12 figures, 9 tables"},{"id":"http://arxiv.org/abs/2409.18164v2","updated":"2024-11-13T00:15:46Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. 
We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Constantin Adam","Abdulhamid Adebayo","Sungeun An","Yuan Chi Chang","Xuan-Hong Dang","Nirmit Desai","Michele Dolfi","Hajar Emami-Gohari","Revital Eres","Takuya Goto","Dhiraj Joshi","Yan Koyfman","Mohammad Nassar","Hima Patel","Paramesvaran Selvam","Yousaf Shah","Saptha Surendran","Daiki Tsuzuku","Petros Zerfos","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.08257v1","updated":"2024-11-13T00:14:09Z","published":"2024-11-13T00:14:09Z","title":"GPTree: Towards Explainable Decision-Making via LLM-powered Decision\n Trees","summary":" Traditional decision tree algorithms are explainable but struggle with\nnon-linear, high-dimensional data, limiting its applicability in complex\ndecision-making. Neural networks excel at capturing complex patterns but\nsacrifice explainability in the process. In this work, we present GPTree, a\nnovel framework combining explainability of decision trees with the advanced\nreasoning capabilities of LLMs. GPTree eliminates the need for feature\nengineering and prompt chaining, requiring only a task-specific prompt and\nleveraging a tree-based structure to dynamically split samples. We also\nintroduce an expert-in-the-loop feedback mechanism to further enhance\nperformance by enabling human intervention to refine and rebuild decision\npaths, emphasizing the harmony between human expertise and machine\nintelligence. 
Our decision tree achieved a 7.8% precision rate for identifying\n\"unicorn\" startups at the inception stage of a startup, surpassing gpt-4o with\nfew-shot learning as well as the best human decision-makers (3.1% to 5.6%).\n","authors":["Sichao Xiong","Yigit Ihlamur","Fuat Alican","Aaron Ontoyin Yin"],"pdf_url":"https://arxiv.org/pdf/2411.08257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04883v6","updated":"2024-11-13T23:34:19Z","published":"2022-08-09T16:25:49Z","title":"Neural-Rendezvous: Provably Robust Guidance and Control to Encounter\n Interstellar Objects","summary":" Interstellar objects (ISOs) are likely representatives of primitive materials\ninvaluable in understanding exoplanetary star systems. Due to their poorly\nconstrained orbits with generally high inclinations and relative velocities,\nhowever, exploring ISOs with conventional human-in-the-loop approaches is\nsignificantly challenging. This paper presents Neural-Rendezvous -- a deep\nlearning-based guidance and control framework for encountering fast-moving\nobjects, including ISOs, robustly, accurately, and autonomously in real time.\nIt uses pointwise minimum norm tracking control on top of a guidance policy\nmodeled by a spectrally-normalized deep neural network, where its\nhyperparameters are tuned with a loss function directly penalizing the MPC\nstate trajectory tracking error. We show that Neural-Rendezvous provides a high\nprobability exponential bound on the expected spacecraft delivery error, the\nproof of which leverages stochastic incremental stability analysis. In\nparticular, it is used to construct a non-negative function with a\nsupermartingale property, explicitly accounting for the ISO state uncertainty\nand the local nature of nonlinear state estimation guarantees. In numerical\nsimulations, Neural-Rendezvous is demonstrated to satisfy the expected error\nbound for 100 ISO candidates. 
This performance is also empirically validated\nusing our spacecraft simulator and in high-conflict and distributed UAV swarm\nreconfiguration with up to 20 UAVs.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Yashwanth Kumar Nakka","Benjamin Donitz","Declan Mages","Michel Ingham"],"pdf_url":"https://arxiv.org/pdf/2208.04883v6.pdf","comment":"Preprint Version, Accepted: October, 2024 (One-minute YouTube\n summary: https://youtu.be/q3e0LYS2IYQ, DOI:\n https://doi.org/10.2514/1.G007671)"},{"id":"http://arxiv.org/abs/2411.09077v1","updated":"2024-11-13T23:09:53Z","published":"2024-11-13T23:09:53Z","title":"Drone Detection using Deep Neural Networks Trained on Pure Synthetic\n Data","summary":" Drone detection has benefited from improvements in deep neural networks, but\nlike many other applications, suffers from the availability of accurate data\nfor training. Synthetic data provides a potential for low-cost data generation\nand has been shown to improve data availability and quality. However, models\ntrained on synthetic datasets need to prove their ability to perform on\nreal-world data, known as the problem of sim-to-real transferability. Here, we\npresent a drone detection Faster-RCNN model trained on a purely synthetic\ndataset that transfers to real-world data. We found that it achieves an AP_50\nof 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones -\ncompared with 97.8% for an equivalent model trained on real-world data. Our\nresults show that using synthetic data for drone detection has the potential to\nreduce data collection costs and improve labelling quality. These findings\ncould be a starting point for more elaborate synthetic drone datasets. For\nexample, realistic recreations of specific scenarios could de-risk the dataset\ngeneration of safety-critical applications such as the detection of drones at\nairports. 
Further, synthetic data may enable reliable drone detection systems,\nwhich could benefit other areas, such as unmanned traffic management systems.\nThe code is available\nhttps://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the\ndatasets\nhttps://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection.\n","authors":["Mariusz Wisniewski","Zeeshan A. Rana","Ivan Petrunin","Alan Holt","Stephen Harman"],"pdf_url":"https://arxiv.org/pdf/2411.09077v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.07217v3","updated":"2024-11-13T23:09:38Z","published":"2024-11-11T18:38:22Z","title":"Feature Selection Based on Wasserstein Distance","summary":" This paper presents a novel feature selection method leveraging the\nWasserstein distance to improve feature selection in machine learning. Unlike\ntraditional methods based on correlation or Kullback-Leibler (KL) divergence,\nour approach uses the Wasserstein distance to assess feature similarity,\ninherently capturing class relationships and making it robust to noisy labels.\nWe introduce a Markov blanket-based feature selection algorithm and demonstrate\nits effectiveness. Our analysis shows that the Wasserstein distance-based\nfeature selection method effectively reduces the impact of noisy labels without\nrelying on specific noise models. 
We provide a lower bound on its\neffectiveness, which remains meaningful even in the presence of noise.\nExperimental results across multiple datasets demonstrate that our approach\nconsistently outperforms traditional methods, particularly in noisy settings.\n","authors":["Fuwei Li"],"pdf_url":"https://arxiv.org/pdf/2411.07217v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14131v2","updated":"2024-11-13T23:07:44Z","published":"2024-05-23T03:11:07Z","title":"Statistical Advantages of Perturbing Cosine Router in Mixture of Experts","summary":" The cosine router in Mixture of Experts (MoE) has recently emerged as an\nattractive alternative to the conventional linear router. Indeed, the cosine\nrouter demonstrates favorable performance in image and language tasks and\nexhibits better ability to mitigate the representation collapse issue, which\noften leads to parameter redundancy and limited representation potentials.\nDespite its empirical success, a comprehensive analysis of the cosine router in\nMoE has been lacking. Considering the least square estimation of the cosine\nrouting MoE, we demonstrate that due to the intrinsic interaction of the model\nparameters in the cosine router via some partial differential equations,\nregardless of the structures of the experts, the estimation rates of experts\nand model parameters can be as slow as $\\mathcal{O}(1/\\log^{\\tau}(n))$ where\n$\\tau > 0$ is some constant and $n$ is the sample size. Surprisingly, these\npessimistic non-polynomial convergence rates can be circumvented by the widely\nused technique in practice to stabilize the cosine router -- simply adding\nnoises to the $L^2$ norms in the cosine router, which we refer to as\n\\textit{perturbed cosine router}. Under the strongly identifiable settings of\nthe expert functions, we prove that the estimation rates for both the experts\nand model parameters under the perturbed cosine routing MoE are significantly\nimproved to polynomial rates. 
Finally, we conduct extensive simulation studies\nin both synthetic and real data settings to empirically validate our\ntheoretical results.\n","authors":["Huy Nguyen","Pedram Akbarian","Trang Pham","Trang Nguyen","Shujian Zhang","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2405.14131v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2411.09073v1","updated":"2024-11-13T22:56:00Z","published":"2024-11-13T22:56:00Z","title":"Code-mixed LLM: Improve Large Language Models' Capability to Handle\n Code-Mixing through Reinforcement Learning from AI Feedback","summary":" Code-mixing(CM) or code-switching(CSW) refers to the juxtaposition of\nlinguistic units from two or more languages during the conversation or\nsometimes even a single utterance. Code-mixing introduces unique challenges in\ndaily life, such as syntactic mismatches and semantic blending, that are rarely\nencountered in monolingual settings. Large language models (LLMs) have\nrevolutionized the field of natural language processing (NLP) by offering\nunprecedented capabilities in understanding human languages. However, the\neffectiveness of current state-of-the-art multilingual LLMs has not yet been\nfully explored in the CM scenario. To fill this gap, we first benchmark the\nperformance of multilingual LLMs on various code-mixing NLP tasks. Then we\npropose to improve the multilingual LLMs' ability to understand code-mixing\nthrough reinforcement learning from human feedback (RLHF) and code-mixed\nmachine translation tasks. Given the high-cost and time-consuming preference\nlabeling procedure, we improve this by utilizing LLMs as annotators to perform\nthe reinforcement learning from AI feedback (RLAIF). 
The experiments show the\neffectiveness of the proposed method.\n","authors":["Wenbo Zhang","Aditya Majumdar","Amulya Yadav"],"pdf_url":"https://arxiv.org/pdf/2411.09073v1.pdf","comment":"initial version: 5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09072v1","updated":"2024-11-13T22:55:45Z","published":"2024-11-13T22:55:45Z","title":"Continuous GNN-based Anomaly Detection on Edge using Efficient Adaptive\n Knowledge Graph Learning","summary":" The increasing demand for robust security solutions across various industries\nhas made Video Anomaly Detection (VAD) a critical task in applications such as\nintelligent surveillance, evidence investigation, and violence detection.\nTraditional approaches to VAD often rely on finetuning large pre-trained\nmodels, which can be computationally expensive and impractical for real-time or\nresource-constrained environments. To address this, MissionGNN introduced a\nmore efficient method by training a graph neural network (GNN) using a fixed\nknowledge graph (KG) derived from large language models (LLMs) like GPT-4.\nWhile this approach demonstrated significant efficiency in computational power\nand memory, it faces limitations in dynamic environments where frequent updates\nto the KG are necessary due to evolving behavior trends and shifting data\npatterns. These updates typically require cloud-based computation, posing\nchallenges for edge computing applications. In this paper, we propose a novel\nframework that facilitates continuous KG adaptation directly on edge devices,\novercoming the limitations of cloud dependency. Our method dynamically modifies\nthe KG through a three-phase process: pruning, alternating, and creating nodes,\nenabling real-time adaptation to changing data trends. 
This continuous learning\napproach enhances the robustness of anomaly detection models, making them more\nsuitable for deployment in dynamic and resource-constrained environments.\n","authors":["Sanggeon Yun","Ryozo Masukawa","William Youngwoo Chung","Minhyoung Na","Nathaniel Bastian","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2411.09072v1.pdf","comment":"Accepted to DATE 2025"},{"id":"http://arxiv.org/abs/2405.20194v6","updated":"2024-11-13T22:51:10Z","published":"2024-05-30T15:58:22Z","title":"Occam Gradient Descent","summary":" Deep learning neural network models must be large enough to adapt to their\nproblem domain, while small enough to avoid overfitting training data during\ngradient descent. To balance these competing demands, overprovisioned deep\nlearning models such as transformers are trained for a single epoch on large\ndata sets, and hence inefficient with both computing resources and training\ndata. In response to these inefficiencies, we exploit learning theory to derive\nOccam Gradient Descent, an algorithm that interleaves adaptive reduction of\nmodel size to minimize generalization error, with gradient descent on model\nweights to minimize fitting error. In contrast, traditional gradient descent\ngreedily minimizes fitting error without regard to generalization error. Our\nalgorithm simultaneously descends the space of weights and topological size of\nany neural network without modification. 
With respect to loss, compute and\nmodel size, our experiments show (a) on image classification benchmarks, linear\nand convolutional neural networks trained with Occam Gradient Descent\noutperform traditional gradient descent with or without post-train pruning; (b)\non a range of tabular data classification tasks, neural networks trained with\nOccam Gradient Descent outperform traditional gradient descent, as well as\nRandom Forests; (c) on natural language transformers, Occam Gradient Descent\noutperforms traditional gradient descent.\n","authors":["B. N. Kausik"],"pdf_url":"https://arxiv.org/pdf/2405.20194v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09065v1","updated":"2024-11-13T22:45:52Z","published":"2024-11-13T22:45:52Z","title":"Language-Model Prior Overcomes Cold-Start Items","summary":" The growth of recommender systems (RecSys) is driven by digitization and the\nneed for personalized content in areas such as e-commerce and video streaming.\nThe content in these systems often changes rapidly and therefore they\nconstantly face the ongoing cold-start problem, where new items lack\ninteraction data and are hard to value. Existing solutions for the cold-start\nproblem, such as content-based recommenders and hybrid methods, leverage item\nmetadata to determine item similarities. The main challenge with these methods\nis their reliance on structured and informative metadata to capture detailed\nitem similarities, which may not always be available. This paper introduces a\nnovel approach for cold-start item recommendation that utilizes the language\nmodel (LM) to estimate item similarities, which are further integrated as a\nBayesian prior with classic recommender systems. This approach is generic and\nable to boost the performance of various recommenders. 
Specifically, our\nexperiments integrate it with both sequential and collaborative filtering-based\nrecommender and evaluate it on two real-world datasets, demonstrating the\nenhanced performance of the proposed approach.\n","authors":["Shiyu Wang","Hao Ding","Yupeng Gu","Sergul Aydore","Kousha Kalantari","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2411.09065v1.pdf","comment":"This paper is dedicated to cold-start item recommendation using\n language-model priors"},{"id":"http://arxiv.org/abs/2411.09064v1","updated":"2024-11-13T22:44:25Z","published":"2024-11-13T22:44:25Z","title":"Minimax Optimal Two-Sample Testing under Local Differential Privacy","summary":" We explore the trade-off between privacy and statistical utility in private\ntwo-sample testing under local differential privacy (LDP) for both multinomial\nand continuous data. We begin by addressing the multinomial case, where we\nintroduce private permutation tests using practical privacy mechanisms such as\nLaplace, discrete Laplace, and Google's RAPPOR. We then extend our multinomial\napproach to continuous data via binning and study its uniform separation rates\nunder LDP over H\\\"older and Besov smoothness classes. The proposed tests for\nboth discrete and continuous cases rigorously control the type I error for any\nfinite sample size, strictly adhere to LDP constraints, and achieve minimax\nseparation rates under LDP. The attained minimax rates reveal inherent\nprivacy-utility trade-offs that are unavoidable in private testing. To address\nscenarios with unknown smoothness parameters in density testing, we propose an\nadaptive test based on a Bonferroni-type approach that ensures robust\nperformance without prior knowledge of the smoothness parameters. 
We validate\nour theoretical findings with extensive numerical experiments and demonstrate\nthe practical relevance and effectiveness of our proposed methods.\n","authors":["Jongmin Mun","Seungwoo Kwak","Ilmun Kim"],"pdf_url":"https://arxiv.org/pdf/2411.09064v1.pdf","comment":"59 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09056v1","updated":"2024-11-13T22:29:23Z","published":"2024-11-13T22:29:23Z","title":"Optimisation Strategies for Ensuring Fairness in Machine Learning: With\n and Without Demographics","summary":" Ensuring fairness has emerged as one of the primary concerns in AI and its\nrelated algorithms. Over time, the field of machine learning fairness has\nevolved to address these issues. This paper provides an extensive overview of\nthis field and introduces two formal frameworks to tackle open questions in\nmachine learning fairness.\n In one framework, operator-valued optimisation and min-max objectives are\nemployed to address unfairness in time-series problems. This approach showcases\nstate-of-the-art performance on the notorious COMPAS benchmark dataset,\ndemonstrating its effectiveness in real-world scenarios.\n In the second framework, the challenge of lacking sensitive attributes, such\nas gender and race, in commonly used datasets is addressed. This issue is\nparticularly pressing because existing algorithms in this field predominantly\nrely on the availability or estimations of such attributes to assess and\nmitigate unfairness. 
Here, a framework for a group-blind bias-repair is\nintroduced, aiming to mitigate bias without relying on sensitive attributes.\nThe efficacy of this approach is showcased through analyses conducted on the\nAdult Census Income dataset.\n Additionally, detailed algorithmic analyses for both frameworks are provided,\naccompanied by convergence guarantees, ensuring the robustness and reliability\nof the proposed methodologies.\n","authors":["Quan Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.09056v1.pdf","comment":"PhD thesis. arXiv admin note: text overlap with arXiv:2310.11407"},{"id":"http://arxiv.org/abs/2411.09055v1","updated":"2024-11-13T22:28:05Z","published":"2024-11-13T22:28:05Z","title":"SAFELOC: Overcoming Data Poisoning Attacks in Heterogeneous Federated\n Machine Learning for Indoor Localization","summary":" Machine learning (ML) based indoor localization solutions are critical for\nmany emerging applications, yet their efficacy is often compromised by\nhardware/software variations across mobile devices (i.e., device heterogeneity)\nand the threat of ML data poisoning attacks. Conventional methods aimed at\ncountering these challenges show limited resilience to the uncertainties\ncreated by these phenomena. In response, in this paper, we introduce SAFELOC, a\nnovel framework that not only minimizes localization errors under these\nchallenging conditions but also ensures model compactness for efficient mobile\ndevice deployment. Our framework targets a distributed and co-operative\nlearning environment that uses federated learning (FL) to preserve user data\nprivacy and assumes heterogeneous mobile devices carried by users (just like in\nmost real-world scenarios). Within this heterogeneous FL context, SAFELOC\nintroduces a novel fused neural network architecture that performs data\npoisoning detection and localization, with a low model footprint. 
Additionally,\na dynamic saliency map-based aggregation strategy is designed to adapt based on\nthe severity of the detected data poisoning scenario. Experimental evaluations\ndemonstrate that SAFELOC achieves improvements of up to 5.9x in mean\nlocalization error, 7.8x in worst-case localization error, and a 2.1x reduction\nin model inference latency compared to state-of-the-art indoor localization\nframeworks, across diverse building floorplans, mobile devices, and ML data\npoisoning attack scenarios.\n","authors":["Akhil Singampalli","Danish Gufran","Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2411.09055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09052v1","updated":"2024-11-13T22:15:31Z","published":"2024-11-13T22:15:31Z","title":"ClevrSkills: Compositional Language and Visual Reasoning in Robotics","summary":" Robotics tasks are highly compositional by nature. For example, to perform a\nhigh-level task like cleaning the table a robot must employ low-level\ncapabilities of moving the effectors to the objects on the table, pick them up\nand then move them off the table one-by-one, while re-evaluating the\nconsequently dynamic scenario in the process. Given that large vision language\nmodels (VLMs) have shown progress on many tasks that require high level,\nhuman-like reasoning, we ask the question: if the models are taught the\nrequisite low-level capabilities, can they compose them in novel ways to\nachieve interesting high-level tasks like cleaning the table without having to\nbe explicitly taught so? To this end, we present ClevrSkills - a benchmark\nsuite for compositional reasoning in robotics. ClevrSkills is an environment\nsuite developed on top of the ManiSkill2 simulator and an accompanying dataset.\nThe dataset contains trajectories generated on a range of robotics tasks with\nlanguage and visual annotations as well as multi-modal prompts as task\nspecification. 
The suite includes a curriculum of tasks with three levels of\ncompositional understanding, starting with simple tasks requiring basic motor\nskills. We benchmark multiple different VLM baselines on ClevrSkills and show\nthat even after being pre-trained on large numbers of tasks, these models fail\non compositional reasoning in robotics tasks.\n","authors":["Sanjay Haresh","Daniel Dijkman","Apratim Bhattacharyya","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2411.09052v1.pdf","comment":"To appear at NeurIPS 2024 (D&B track)"},{"id":"http://arxiv.org/abs/2411.09047v1","updated":"2024-11-13T22:04:19Z","published":"2024-11-13T22:04:19Z","title":"Anomaly Detection in Large-Scale Cloud Systems: An Industry Case and\n Dataset","summary":" As Large-Scale Cloud Systems (LCS) become increasingly complex, effective\nanomaly detection is critical for ensuring system reliability and performance.\nHowever, there is a shortage of large-scale, real-world datasets available for\nbenchmarking anomaly detection methods.\n To address this gap, we introduce a new high-dimensional dataset from IBM\nCloud, collected over 4.5 months from the IBM Cloud Console. This dataset\ncomprises 39,365 rows and 117,448 columns of telemetry data. Additionally, we\ndemonstrate the application of machine learning models for anomaly detection\nand discuss the key challenges faced in this process.\n This study and the accompanying dataset provide a resource for researchers\nand practitioners in cloud system monitoring. 
It facilitates more efficient\ntesting of anomaly detection methods in real-world data, helping to advance the\ndevelopment of robust solutions to maintain the health and performance of\nlarge-scale cloud infrastructures.\n","authors":["Mohammad Saiful Islam","Mohamed Sami Rakha","William Pourmajidi","Janakan Sivaloganathan","John Steinbacher","Andriy Miranskyy"],"pdf_url":"https://arxiv.org/pdf/2411.09047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06635v2","updated":"2024-11-13T21:20:06Z","published":"2024-11-11T00:10:48Z","title":"Mixed Effects Deep Learning for the interpretable analysis of single\n cell RNA sequencing data by quantifying and visualizing batch effects","summary":" Single-cell RNA sequencing (scRNA-seq) data are often confounded by technical\nor biological batch effects. Existing deep learning models mitigate these\neffects but often discard batch-specific information, potentially losing\nvaluable biological insights. We propose a Mixed Effects Deep Learning (MEDL)\nautoencoder framework that separately models batch-invariant (fixed effects)\nand batch-specific (random effects) components. By decoupling batch-invariant\nbiological states from batch variations, our framework integrates both into\npredictive models. Our approach also generates 2D visualizations of how the\nsame cell appears across batches, enhancing interpretability. Retaining both\nfixed and random effect latent spaces improves classification accuracy.\n We applied our framework to three datasets spanning the cardiovascular system\n(Healthy Heart), Autism Spectrum Disorder (ASD), and Acute Myeloid Leukemia\n(AML). With 147 batches in the Healthy Heart dataset, far exceeding typical\nnumbers, we tested our framework's ability to handle many batches. In the ASD\ndataset, our approach captured donor heterogeneity between autistic and healthy\nindividuals. 
In the AML dataset, it distinguished donor heterogeneity despite\nmissing cell types and diseased donors exhibiting both healthy and malignant\ncells. These results highlight our framework's ability to characterize fixed\nand random effects, enhance batch effect visualization, and improve prediction\naccuracy across diverse datasets.\n","authors":["Aixa X. Andrade","Son Nguyen","Albert Montillo"],"pdf_url":"https://arxiv.org/pdf/2411.06635v2.pdf","comment":"Main manuscript: 29 pages, including 10 figures and 8 tables.\n Supplemental material: 17 pages"},{"id":"http://arxiv.org/abs/2411.09027v1","updated":"2024-11-13T21:09:55Z","published":"2024-11-13T21:09:55Z","title":"Transformer-based Time-Series Biomarker Discovery for COPD Diagnosis","summary":" Chronic Obstructive Pulmonary Disorder (COPD) is an irreversible and\nprogressive disease which is highly heritable. Clinically, COPD is defined\nusing the summary measures derived from a spirometry test but these are not\nalways adequate. Here we show that using the high-dimensional raw spirogram can\nprovide a richer signal compared to just using the summary measures. We design\na transformer-based deep learning technique to process the raw spirogram values\nalong with demographic information and predict clinically-relevant endpoints\nrelated to COPD. Our method is able to perform better than prior works while\nbeing more computationally efficient. Using the weights learned by the model,\nwe make the framework more interpretable by identifying parts of the spirogram\nthat are important for the model predictions. 
Pairing up with a board-certified\npulmonologist, we also provide clinical insights into the different aspects of\nthe spirogram and show that the explanations obtained from the model align with\nunderlying medical knowledge.\n","authors":["Soham Gadgil","Joshua Galanter","Mohammadreza Negahdar"],"pdf_url":"https://arxiv.org/pdf/2411.09027v1.pdf","comment":"Accepted as a workshop paper to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09018v1","updated":"2024-11-13T20:50:04Z","published":"2024-11-13T20:50:04Z","title":"Bridging the Visual Gap: Fine-Tuning Multimodal Models with\n Knowledge-Adapted Captions","summary":" Recent research increasingly focuses on training vision-language models\n(VLMs) with long, detailed image captions. However, small-scale VLMs often\nstruggle to balance the richness of these captions with the risk of\nhallucinating content during fine-tuning. In this paper, we explore how well\nVLMs adapt to such captions. To quantify caption quality, we propose Decomposed\nNLI (DNLI), an evaluation framework that breaks down generated captions into\nindividual propositions, assessing each in isolation. This fine-grained\nanalysis reveals a critical balance between capturing descriptive details and\npreventing hallucinations. Our findings show that simply reducing caption\ncomplexity or employing standard data curation techniques does not effectively\nresolve this issue. To tackle this challenge, we introduce Knowledge Adapted\n(KnowAda) fine-tuning, a data-centric approach that automatically adapts\ntraining data with the model's existing knowledge and visual understanding.\nKnowAda minimizes hallucinations while preserving high descriptiveness. We\nvalidate this approach across several small-scale VLMs (up to 7B parameters)\nand dense caption datasets, demonstrating that KnowAda effectively balances\nhallucination reduction and descriptiveness. 
Our results show that KnowAda\noutperforms various baselines in both automatic metrics and human evaluations.\nWe will release our code and models.\n","authors":["Moran Yanuka","Assaf Ben Kish","Yonatan Bitton","Idan Szpektor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2411.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00759v2","updated":"2024-11-13T20:48:26Z","published":"2024-11-01T17:35:09Z","title":"Minibatch Optimal Transport and Perplexity Bound Estimation in Discrete\n Flow Matching","summary":" Outperforming autoregressive models on categorical data distributions, such\nas textual data, remains challenging for continuous diffusion and flow models.\nDiscrete flow matching, a recent framework for modeling categorical data, has\nshown competitive performance with autoregressive models. Despite its\nsimilarities with continuous flow matching, the rectification strategy applied\nin the continuous version does not directly extend to the discrete one due to\nthe inherent stochasticity of discrete paths. This limitation necessitates\nexploring alternative methods to minimize state transitions during generation.\nTo address this, we propose a dynamic-optimal-transport-like minimization\nobjective for discrete flows with convex interpolants and derive its equivalent\nKantorovich formulation. The latter defines transport cost solely in terms of\ninter-state similarity and is optimized using a minibatch strategy. Another\nlimitation we address in the discrete flow framework is model evaluation.\nUnlike continuous flows, wherein the instantaneous change of variables enables\ndensity estimation, discrete models lack a similar mechanism due to the\ninherent non-determinism and discontinuity of their paths. To alleviate this\nissue, we propose an upper bound on the perplexity of discrete flow models,\nenabling performance evaluation and comparison with other methods.\n","authors":["Etrit Haxholli","Yeti Z. 
Gürbüz","Oğul Can","Eli Waxman"],"pdf_url":"https://arxiv.org/pdf/2411.00759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09009v1","updated":"2024-11-13T20:30:15Z","published":"2024-11-13T20:30:15Z","title":"Cut Your Losses in Large-Vocabulary Language Models","summary":" As language models grow ever larger, so do their vocabularies. This has\nshifted the memory footprint of LLMs during training disproportionately to one\nsingle layer: the cross-entropy in the loss computation. Cross-entropy builds\nup a logit matrix with entries for each pair of input tokens and vocabulary\nitems and, for small models, consumes an order of magnitude more memory than\nthe rest of the LLM combined. We propose Cut Cross-Entropy (CCE), a method that\ncomputes the cross-entropy loss without materializing the logits for all tokens\ninto global memory. Rather, CCE only computes the logit for the correct token\nand evaluates the log-sum-exp over all logits on the fly. We implement a custom\nkernel that performs the matrix multiplications and the log-sum-exp reduction\nover the vocabulary in flash memory, making global memory consumption for the\ncross-entropy computation negligible. This has a dramatic effect. Taking the\nGemma 2 (2B) model as an example, CCE reduces the memory footprint of the loss\ncomputation from 24 GB to 1 MB, and the total training-time memory consumption\nof the classifier head from 28 GB to 1 GB. To improve the throughput of CCE, we\nleverage the inherent sparsity of softmax and propose to skip elements of the\ngradient computation that have a negligible (i.e., below numerical precision)\ncontribution to the gradient. 
Experiments demonstrate that the dramatic\nreduction in memory consumption is accomplished without sacrificing training\nspeed or convergence.\n","authors":["Erik Wijmans","Brody Huval","Alexander Hertzberg","Vladlen Koltun","Philipp Krähenbühl"],"pdf_url":"https://arxiv.org/pdf/2411.09009v1.pdf","comment":"Code is available at https://github.com/apple/ml-cross-entropy"},{"id":"http://arxiv.org/abs/2401.02501v3","updated":"2024-11-13T20:30:04Z","published":"2024-01-04T19:25:00Z","title":"A metric embedding kernel for live cell microscopy signaling patterns","summary":" Live cell microscopy captures 5-D $(x,y,z,channel,time)$ movies that display\npatterns of cellular motion and signaling dynamics. We present here a metric\nkernel function for spatiotemporal patterns of cell signaling dynamics in 5-D\nlive cell microscopy movies unique in requiring no a priori knowledge of\nexpected pattern dynamics, and no training data. The approach uses Kolmogorov\ncomplexity theory to compute a metric distance between movies and to measure\nthe meaningful information among subsets of movies. Cell signaling kymographs\nstore at each spatiotemporal cell centroid the cell signaling state, or a\nfunctional output such as velocity. Patterns of similarity are identified via\nthe metric normalized compression distance (NCD). The NCD is a reproducing\nkernel for a Hilbert space that represents the input cell signaling kymographs\nas points in a low dimensional embedding that optimally captures the pattern\nsimilarity identified by the NCD throughout the space. The only parameter is\nthe expected cell radii ($\\mu m$). A new formulation of the cluster structure\nfunction optimally estimates the meaningful information captured by the\nembedding. Also presented is the cell signaling structure function (SSF), a\nKolmogorov structure function that optimally measures cell signaling state as\nnuclear intensity w.r.t. 
surrounding cytoplasm, a significant improvement\ncompared to the current state-of-the-art cytonuclear ratio. Results are\npresented quantifying the impact of ERK and AKT signaling between different\noncogenic mutations, and by the relation between ERK signaling and cellular\nvelocity patterns for movies of 2-D monolayers of human breast epithelial\n(MCF10A) cells, 3-D MCF10A spheroids under optogenetic manipulation of ERK, and\nhuman induced pluripotent stem cells.\n","authors":["Layton Aho","Mark Winter","Marc DeCarlo","Agne Frismantiene","Yannick Blum","Paolo Armando Gagliardi","Olivier Pertz","Andrew R. Cohen"],"pdf_url":"https://arxiv.org/pdf/2401.02501v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20367v2","updated":"2024-11-13T20:29:35Z","published":"2024-07-29T18:31:42Z","title":"Mixed Newton Method for Optimization in Complex Spaces","summary":" In this paper, we modify and apply the recently introduced Mixed Newton\nMethod, which is originally designed for minimizing real-valued functions of\ncomplex variables, to the minimization of real-valued functions of real\nvariables by extending the functions to complex space. We show that arbitrary\nregularizations preserve the favorable local convergence properties of the\nmethod, and construct a special type of regularization used to prevent\nconvergence to complex minima. 
We compare several variants of the method\napplied to training neural networks with real and complex parameters.\n","authors":["Nikita Yudin","Roland Hildebrand","Sergey Bakhurin","Alexander Degtyarev","Anna Lisachenko","Ilya Kuruzov","Andrei Semenov","Mohammad Alkousa"],"pdf_url":"https://arxiv.org/pdf/2407.20367v2.pdf","comment":"16 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2406.14775v2","updated":"2024-11-13T20:28:24Z","published":"2024-06-20T22:57:38Z","title":"Machine Learning Global Simulation of Nonlocal Gravity Wave Propagation","summary":" Global climate models typically operate at a grid resolution of hundreds of\nkilometers and fail to resolve atmospheric mesoscale processes, e.g., clouds,\nprecipitation, and gravity waves (GWs). Model representation of these processes\nand their sources is essential to the global circulation and planetary energy\nbudget, but subgrid scale contributions from these processes are often only\napproximately represented in models using parameterizations. These\nparameterizations are subject to approximations and idealizations, which limit\ntheir capability and accuracy. The most drastic of these approximations is the\n\"single-column approximation\" which completely neglects the horizontal\nevolution of these processes, resulting in key biases in current climate\nmodels. With a focus on atmospheric GWs, we present the first-ever global\nsimulation of atmospheric GW fluxes using machine learning (ML) models trained\non the WINDSET dataset to emulate global GW emulation in the atmosphere, as an\nalternative to traditional single-column parameterizations. 
Using an Attention\nU-Net-based architecture trained on globally resolved GW momentum fluxes, we\nillustrate the importance and effectiveness of global nonlocality, when\nsimulating GWs using data-driven schemes.\n","authors":["Aman Gupta","Aditi Sheshadri","Sujit Roy","Vishal Gaur","Manil Maskey","Rahul Ramachandran"],"pdf_url":"https://arxiv.org/pdf/2406.14775v2.pdf","comment":"International Conference on Machine Learning 2024"},{"id":"http://arxiv.org/abs/2406.03230v4","updated":"2024-11-13T20:18:19Z","published":"2024-06-05T13:06:33Z","title":"Defending Large Language Models Against Attacks With Residual Stream\n Activation Analysis","summary":" The widespread adoption of Large Language Models (LLMs), exemplified by\nOpenAI's ChatGPT, brings to the forefront the imperative to defend against\nadversarial threats on these models. These attacks, which manipulate an LLM's\noutput by introducing malicious inputs, undermine the model's integrity and the\ntrust users place in its outputs. In response to this challenge, our paper\npresents an innovative defensive strategy, given white box access to an LLM,\nthat harnesses residual activation analysis between transformer layers of the\nLLM. We apply a novel methodology for analyzing distinctive activation patterns\nin the residual streams for attack prompt classification. We curate multiple\ndatasets to demonstrate how this method of classification has high accuracy\nacross multiple types of attack scenarios, including our newly-created attack\ndataset. Furthermore, we enhance the model's resilience by integrating safety\nfine-tuning techniques for LLMs in order to measure its effect on our\ncapability to detect attacks. 
The results underscore the effectiveness of our\napproach in enhancing the detection and mitigation of adversarial inputs,\nadvancing the security framework within which LLMs operate.\n","authors":["Amelia Kawasaki","Andrew Davis","Houssam Abbas"],"pdf_url":"https://arxiv.org/pdf/2406.03230v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09003v1","updated":"2024-11-13T20:12:55Z","published":"2024-11-13T20:12:55Z","title":"Refusal in LLMs is an Affine Function","summary":" We propose affine concept editing (ACE) as an approach for steering language\nmodels' behavior by intervening directly in activations. We begin with an\naffine decomposition of model activation vectors and show that prior methods\nfor steering model behavior correspond to subsets of terms of this\ndecomposition. We then provide a derivation of ACE and test it on refusal using\nLlama 3 8B and Hermes Eagle RWKV v5. ACE ultimately combines affine subspace\nprojection and activation addition to reliably control the model's refusal\nresponses across prompt types. We evaluate the results using LLM-based scoring\non a collection of harmful and harmless prompts. Our experiments demonstrate\nthat ACE consistently achieves more precise control over model behavior and\ngeneralizes to models where directional ablation via affine subspace projection\nalone produces incoherent outputs. Code for reproducing our results is\navailable at https://github.com/EleutherAI/steering-llama3 .\n","authors":["Thomas Marshall","Adam Scherlis","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2411.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08998v1","updated":"2024-11-13T19:37:49Z","published":"2024-11-13T19:37:49Z","title":"Microfoundation Inference for Strategic Prediction","summary":" Often in prediction tasks, the predictive model itself can influence the\ndistribution of the target variable, a phenomenon termed performative\nprediction. 
Generally, this influence stems from strategic actions taken by\nstakeholders with a vested interest in predictive models. A key challenge that\nhinders the widespread adaptation of performative prediction in machine\nlearning is that practitioners are generally unaware of the social impacts of\ntheir predictions. To address this gap, we propose a methodology for learning\nthe distribution map that encapsulates the long-term impacts of predictive\nmodels on the population. Specifically, we model agents' responses as a\ncost-adjusted utility maximization problem and propose estimates for said cost.\nOur approach leverages optimal transport to align pre-model exposure (ex ante)\nand post-model exposure (ex post) distributions. We provide a rate of\nconvergence for this proposed estimate and assess its quality through empirical\ndemonstrations on a credit-scoring dataset.\n","authors":["Daniele Bracale","Subha Maity","Felipe Maia Polo","Seamus Somerstep","Moulinath Banerjee","Yuekai Sun"],"pdf_url":"https://arxiv.org/pdf/2411.08998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23463v2","updated":"2024-11-13T19:34:22Z","published":"2024-10-30T21:08:07Z","title":"MDCure: A Scalable Pipeline for Multi-Document Instruction-Following","summary":" Multi-document (MD) processing is crucial for LLMs to handle real-world tasks\nsuch as summarization and question-answering across large sets of documents.\nWhile LLMs have improved at processing long inputs, MD contexts still present\nchallenges, such as managing inter-document dependencies, redundancy, and\nincoherent structures. We introduce MDCure, a scalable and effective\nfine-tuning pipeline to enhance the MD capabilities of LLMs without the\ncomputational cost of pre-training or reliance on human annotated data. MDCure\nis based on generation of high-quality synthetic MD instruction data from sets\nof related articles via targeted prompts. 
We further introduce MDCureRM, a\nmulti-objective reward model which filters generated data based on their\ntraining utility for MD settings. With MDCure, we fine-tune a variety of LLMs,\nfrom the FlanT5, Qwen2, and LLAMA3.1 model families, up to 70B parameters in\nsize. Extensive evaluations on a wide range of MD and long-context benchmarks\nspanning various tasks show MDCure consistently improves performance over\npre-trained baselines and over corresponding base models by up to 75.5%. Our\ncode, datasets, and models are available at https://github.com/yale-nlp/MDCure.\n","authors":["Gabrielle Kaili-May Liu","Bowen Shi","Avi Caciularu","Idan Szpektor","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2410.23463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08993v1","updated":"2024-11-13T19:33:47Z","published":"2024-11-13T19:33:47Z","title":"Parameter Inference via Differentiable Diffusion Bridge Importance\n Sampling","summary":" We introduce a methodology for performing parameter inference in\nhigh-dimensional, non-linear diffusion processes. We illustrate its\napplicability for obtaining insights into the evolution of and relationships\nbetween species, including ancestral state reconstruction. Estimation is\nperformed by utilising score matching to approximate diffusion bridges, which\nare subsequently used in an importance sampler to estimate log-likelihoods. The\nentire setup is differentiable, allowing gradient ascent on approximated\nlog-likelihoods. This allows both parameter inference and diffusion mean\nestimation. 
This novel, numerically stable, score matching-based parameter\ninference framework is presented and demonstrated on biological two- and\nthree-dimensional morphometry data.\n","authors":["Nicklas Boserup","Gefan Yang","Michael Lind Severinsen","Christy Anna Hipsley","Stefan Sommer"],"pdf_url":"https://arxiv.org/pdf/2411.08993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08987v1","updated":"2024-11-13T19:22:34Z","published":"2024-11-13T19:22:34Z","title":"Non-Euclidean High-Order Smooth Convex Optimization","summary":" We develop algorithms for the optimization of convex objectives that have\nH\\\"older continuous $q$-th derivatives with respect to a $p$-norm by using a\n$q$-th order oracle, for $p, q \\geq 1$. We can also optimize other structured\nfunctions. We do this by developing a non-Euclidean inexact accelerated\nproximal point method that makes use of an inexact uniformly convex\nregularizer. We also provide nearly matching lower bounds for any deterministic\nalgorithm that interacts with the function via a local oracle.\n","authors":["Juan Pablo Contreras","Cristóbal Guzmán","David Martínez-Rubio"],"pdf_url":"https://arxiv.org/pdf/2411.08987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08982v1","updated":"2024-11-13T19:18:08Z","published":"2024-11-13T19:18:08Z","title":"Lynx: Enabling Efficient MoE Inference through Dynamic Batch-Aware\n Expert Selection","summary":" Mixture-of-Experts (MoE) architectures have recently gained popularity in\nenabling efficient scaling of large language models. However, we uncover a\nfundamental tension: while MoEs are designed for selective expert activation,\nproduction serving requires request batching, which forces the activation of\nall experts and negates MoE's efficiency benefits during the decode phase. We\npresent Lynx, a system that enables efficient MoE inference through dynamic,\nbatch-aware expert selection. 
Our key insight is that expert importance varies\nsignificantly across tokens and inference phases, creating opportunities for\nruntime optimization. Lynx leverages this insight through a lightweight\nframework that dynamically reduces active experts while preserving model\naccuracy. Our evaluations show that Lynx achieves up to 1.55x reduction in\ninference latency while maintaining negligible accuracy loss from baseline\nmodel across complex code generation and mathematical reasoning tasks.\n","authors":["Vima Gupta","Kartik Sinha","Ada Gavrilovska","Anand Padmanabha Iyer"],"pdf_url":"https://arxiv.org/pdf/2411.08982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08968v1","updated":"2024-11-13T19:02:36Z","published":"2024-11-13T19:02:36Z","title":"Sparse Upcycling: Inference Inefficient Finetuning","summary":" Small, highly trained, open-source large language models are widely used due\nto their inference efficiency, but further improving their quality remains a\nchallenge. Sparse upcycling is a promising approach that transforms a\npretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing\nthe model's parameter count and quality. In this work, we compare the\neffectiveness of sparse upcycling against continued pretraining (CPT) across\ndifferent model sizes, compute budgets, and pretraining durations. Our\nexperiments show that sparse upcycling can achieve better quality, with\nimprovements of over 20% relative to CPT in certain scenarios. However, this\ncomes with a significant inference cost, leading to 40% slowdowns in\nhigh-demand inference settings for larger models. 
Our findings highlight the\ntrade-off between model quality and inference efficiency, offering insights for\npractitioners seeking to balance model quality and deployment constraints.\n","authors":["Sasha Doubov","Nikhil Sardana","Vitaliy Chiley"],"pdf_url":"https://arxiv.org/pdf/2411.08968v1.pdf","comment":"12 pages, 4 figures, To appear in the 4th NeurIPS Workshop on\n Efficient Natural Language and Speech Processing (ENLSP), 2024"},{"id":"http://arxiv.org/abs/2411.08954v1","updated":"2024-11-13T19:00:02Z","published":"2024-11-13T19:00:02Z","title":"Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply\n Better Samples","summary":" Although diffusion models can generate remarkably high-quality samples, they\nare intrinsically bottlenecked by their expensive iterative sampling procedure.\nConsistency models (CMs) have recently emerged as a promising diffusion model\ndistillation method, reducing the cost of sampling by generating high-fidelity\nsamples in just a few iterations. Consistency model distillation aims to solve\nthe probability flow ordinary differential equation (ODE) defined by an\nexisting diffusion model. CMs are not directly trained to minimize error\nagainst an ODE solver, rather they use a more computationally tractable\nobjective. As a way to study how effectively CMs solve the probability flow\nODE, and the effect that any induced error has on the quality of generated\nsamples, we introduce Direct CMs, which \\textit{directly} minimize this error.\nIntriguingly, we find that Direct CMs reduce the ODE solving error compared to\nCMs but also result in significantly worse sample quality, calling into\nquestion why exactly CMs work well in the first place. Full code is available\nat: https://github.com/layer6ai-labs/direct-cms.\n","authors":["Noël Vouitsis","Rasa Hosseinzadeh","Brendan Leigh Ross","Valentin Villecroze","Satya Krishna Gorti","Jesse C. 
Cresswell","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2411.08954v1.pdf","comment":"NeurIPS 2024 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2411.05196v2","updated":"2024-11-13T18:58:46Z","published":"2024-11-07T21:43:29Z","title":"Explainable AI through a Democratic Lens: DhondtXAI for Proportional\n Feature Importance Using the D'Hondt Method","summary":" In democratic societies, electoral systems play a crucial role in translating\npublic preferences into political representation. Among these, the D'Hondt\nmethod is widely used to ensure proportional representation, balancing fair\nrepresentation with governmental stability. Recently, there has been a growing\ninterest in applying similar principles of proportional representation to\nenhance interpretability in machine learning, specifically in Explainable AI\n(XAI). This study investigates the integration of D'Hondt-based voting\nprinciples in the DhondtXAI method, which leverages resource allocation\nconcepts to interpret feature importance within AI models. Through a comparison\nof SHAP (Shapley Additive Explanations) and DhondtXAI, we evaluate their\neffectiveness in feature attribution within CatBoost and XGBoost models for\nbreast cancer and diabetes prediction, respectively. The DhondtXAI approach\nallows for alliance formation and thresholding to enhance interpretability,\nrepresenting feature importance as seats in a parliamentary view. Statistical\ncorrelation analyses between SHAP values and DhondtXAI allocations support the\nconsistency of interpretations, demonstrating DhondtXAI's potential as a\ncomplementary tool for understanding feature importance in AI models. 
The\nresults highlight that integrating electoral principles, such as proportional\nrepresentation and alliances, into AI explainability can improve user\nunderstanding, especially in high-stakes fields like healthcare.\n","authors":["Turker Berk Donmez"],"pdf_url":"https://arxiv.org/pdf/2411.05196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1902.00615v6","updated":"2024-11-13T18:32:53Z","published":"2019-02-02T01:52:53Z","title":"Confidence Trigger Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. 
Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Panfeng Li","Qikai Yang","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v6.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2210.14980v2","updated":"2024-11-13T23:33:06Z","published":"2022-10-26T18:49:43Z","title":"Interstellar Object Accessibility and Mission Design","summary":" Interstellar objects (ISOs) represent a compelling and under-explored\ncategory of celestial bodies, providing physical laboratories to understand the\nformation of our solar system and probe the composition and properties of\nmaterial formed in exoplanetary systems. In this work, we investigate existing\napproaches to designing successful flyby missions to ISOs, including a deep\nlearning-driven guidance and control algorithm for ISOs traveling at velocities\nover 60 km/s. We have generated spacecraft trajectories to a series of\nsynthetic representative ISOs, simulating a ground campaign to observe the\ntarget and resolve its state, thereby determining the cruise and close approach\ndelta-Vs required for the encounter. We discuss the accessibility of and\nmission design to ISOs with varying characteristics, with special focuses on 1)\nstate covariance estimation throughout the cruise, 2) handoffs from traditional\nnavigation approaches to novel autonomous navigation for fast flyby regimes,\nand 3) overall recommendations about preparing for the future in situ\nexploration of these targets. The lessons learned also apply to the fast flyby\nof other small bodies, e.g., long-period comets and potentially hazardous\nasteroids, which also require tactical responses with similar characteristics.\n","authors":["Benjamin P. 
S. Donitz","Declan Mages","Hiroyasu Tsukamoto","Peter Dixon","Damon Landau","Soon-Jo Chung","Erica Bufanda","Michel Ingham","Julie Castillo-Rogez"],"pdf_url":"https://arxiv.org/pdf/2210.14980v2.pdf","comment":"IEEE Aerospace Conference, Preprint Version, Accepted: November 2022"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.08034v2","updated":"2024-11-13T18:59:44Z","published":"2024-11-12T18:59:35Z","title":"Scaling Properties of Diffusion Models for Perceptual Tasks","summary":" In this paper, we argue that iterative computation with diffusion models\noffers a powerful paradigm for not only generation but also visual perception\ntasks. We unify tasks such as depth estimation, optical flow, and amodal\nsegmentation under the framework of image-to-image translation, and show how\ndiffusion models benefit from scaling training and test-time compute for these\nperceptual tasks. Through a careful analysis of these scaling properties, we\nformulate compute-optimal training and inference recipes to scale diffusion\nmodels for visual perception tasks. Our models achieve competitive performance\nto state-of-the-art methods using significantly less data and compute. To\naccess our code and models, see https://scaling-diffusion-perception.github.io .\n","authors":["Rahul Ravishankar","Zeeshan Patel","Jathushan Rajasegaran","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08879v1","updated":"2024-11-13T18:56:39Z","published":"2024-11-13T18:56:39Z","title":"4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization","summary":" Novel view synthesis of dynamic scenes is becoming important in various\napplications, including augmented and virtual reality. We propose a novel 4D\nGaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded\nmonocular videos. 
To overcome the overfitting problem of existing work for\nthese real-world videos, we introduce an uncertainty-aware regularization that\nidentifies uncertain regions with few observations and selectively imposes\nadditional priors based on diffusion models and depth smoothness on such\nregions. This approach improves both the performance of novel view synthesis\nand the quality of training image reconstruction. We also identify the\ninitialization problem of 4DGS in fast-moving dynamic regions, where the\nStructure from Motion (SfM) algorithm fails to provide reliable 3D landmarks.\nTo initialize Gaussian primitives in such regions, we present a dynamic region\ndensification method using the estimated depth maps and scene flow. Our\nexperiments show that the proposed method improves the performance of 4DGS\nreconstruction from a video captured by a handheld monocular camera and also\nexhibits promising results in few-shot static scene reconstruction.\n","authors":["Mijeong Kim","Jongwoo Lim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2411.08879v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08878v1","updated":"2024-11-13T18:55:10Z","published":"2024-11-13T18:55:10Z","title":"A Short Note on Evaluating RepNet for Temporal Repetition Counting in\n Videos","summary":" We discuss some consistent issues on how RepNet has been evaluated in various\npapers. As a way to mitigate these issues, we report RepNet performance results\non different datasets, and release evaluation code and the RepNet checkpoint to\nobtain these results. 
Code URL:\nhttps://github.com/google-research/google-research/blob/master/repnet/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Pierre Sermanet","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.08878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08875v1","updated":"2024-11-13T18:52:42Z","published":"2024-11-13T18:52:42Z","title":"Causal Explanations for Image Classifiers","summary":" Existing algorithms for explaining the output of image classifiers use\ndifferent definitions of explanations and a variety of techniques to extract\nthem. However, none of the existing tools use a principled approach based on\nformal definitions of causes and explanations for the explanation extraction.\nIn this paper we present a novel black-box approach to computing explanations\ngrounded in the theory of actual causality. We prove relevant theoretical\nresults and present an algorithm for computing approximate explanations based\non these definitions. We prove termination of our algorithm and discuss its\ncomplexity and the amount of approximation compared to the precise definition.\nWe implemented the framework in a tool rex and we present experimental results\nand a comparison with state-of-the-art tools. We demonstrate that rex is the\nmost efficient tool and produces the smallest explanations, in addition to\noutperforming other black-box tools on standard quality measures.\n","authors":["Hana Chockler","David A. 
Kelly","Daniel Kroening","Youcheng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.08875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08870v1","updated":"2024-11-13T18:50:13Z","published":"2024-11-13T18:50:13Z","title":"The Limited Impact of Medical Adaptation of Large Language and\n Vision-Language Models","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. In this paper, we compare ten\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting and supervised fine-tuning regimes for medical question-answering\n(QA). For instance, across all tasks and model pairs we consider in the 3-shot\nsetting, medical LLMs only outperform their base models in 22.7% of cases,\nreach a (statistical) tie in 36.8% of cases, and are significantly worse than\ntheir base models in the remaining 40.5% of cases. Our conclusions are based on\n(i) comparing each medical model head-to-head, directly against the\ncorresponding base model; (ii) optimizing the prompts for each model separately\nin zero-/few-shot prompting; and (iii) accounting for statistical uncertainty\nin comparisons. While these basic practices are not consistently adopted in the\nliterature, our ablations show that they substantially impact conclusions.\nMeanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs\ncan show performance improvements, but the benefits do not carry over to tasks\nbased on clinical notes. 
Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Pranav Mani","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.08870v1.pdf","comment":"Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes\n additional results on clinical note QA tasks and supervised fine-tuning\n evaluations"},{"id":"http://arxiv.org/abs/2411.08861v1","updated":"2024-11-13T18:42:34Z","published":"2024-11-13T18:42:34Z","title":"Interaction Testing in Variation Analysis","summary":" Relationships of cause and effect are of prime importance for explaining\nscientific phenomena. Often, rather than just understanding the effects of\ncauses, researchers also wish to understand how a cause $X$ affects an outcome\n$Y$ mechanistically -- i.e., what are the causal pathways that are activated\nbetween $X$ and $Y$. For analyzing such questions, a range of methods has been\ndeveloped over decades under the rubric of causal mediation analysis.\nTraditional mediation analysis focuses on decomposing the average treatment\neffect (ATE) into direct and indirect effects, and therefore focuses on the ATE\nas the central quantity. This corresponds to providing explanations for\nassociations in the interventional regime, such as when the treatment $X$ is\nrandomized. Commonly, however, it is of interest to explain associations in the\nobservational regime, and not just in the interventional regime. In this paper,\nwe introduce \\text{variation analysis}, an extension of mediation analysis that\nfocuses on the total variation (TV) measure between $X$ and $Y$, written as\n$\\mathrm{E}[Y \\mid X=x_1] - \\mathrm{E}[Y \\mid X=x_0]$. 
The TV measure\nencompasses both causal and confounded effects, as opposed to the ATE which\nonly encompasses causal (direct and mediated) variations. In this way, the TV\nmeasure is suitable for providing explanations in the natural regime and\nanswering questions such as ``why is $X$ associated with $Y$?''. Our focus is\non decomposing the TV measure, in a way that explicitly includes direct,\nindirect, and confounded variations. Furthermore, we also decompose the TV\nmeasure to include interaction terms between these different pathways.\nSubsequently, interaction testing is introduced, involving hypothesis tests to\ndetermine if interaction terms are significantly different from zero. If\ninteractions are not significant, more parsimonious decompositions of the TV\nmeasure can be used.\n","authors":["Drago Plecko"],"pdf_url":"https://arxiv.org/pdf/2411.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v4","updated":"2024-11-13T18:31:18Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. 
Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v4.pdf","comment":"Accepted by 2024 5th International Conference on Computer Vision,\n Image and Deep Learning"},{"id":"http://arxiv.org/abs/2411.08843v1","updated":"2024-11-13T18:21:56Z","published":"2024-11-13T18:21:56Z","title":"Data-driven Surface Solar Irradiance Estimation using Neural Operators\n at Global Scale","summary":" Accurate surface solar irradiance (SSI) forecasting is essential for\noptimizing renewable energy systems, particularly in the context of long-term\nenergy planning on a global scale. This paper presents a pioneering approach to\nsolar radiation forecasting that leverages recent advancements in numerical\nweather prediction (NWP) and data-driven machine learning weather models. These\nadvances facilitate long, stable rollouts and enable large ensemble forecasts,\nenhancing the reliability of predictions. Our flexible model utilizes variables\nforecast by these NWP and AI weather models to estimate 6-hourly SSI at global\nscale. Developed using NVIDIA Modulus, our model represents the first adaptive\nglobal framework capable of providing long-term SSI forecasts. 
Furthermore, it\ncan be fine-tuned using satellite data, which significantly enhances its\nperformance in the fine-tuned regions, while maintaining accuracy elsewhere.\nThe improved accuracy of these forecasts has substantial implications for the\nintegration of solar energy into power grids, enabling more efficient energy\nmanagement and contributing to the global transition to renewable energy\nsources.\n","authors":["Alberto Carpentieri","Jussi Leinonen","Jeff Adie","Boris Bonev","Doris Folini","Farah Hariri"],"pdf_url":"https://arxiv.org/pdf/2411.08843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08842v1","updated":"2024-11-13T18:20:29Z","published":"2024-11-13T18:20:29Z","title":"AstroM$^3$: A self-supervised multimodal model for astronomy","summary":" While machine-learned models are now routinely employed to facilitate\nastronomical inquiry, model inputs tend to be limited to a primary data source\n(namely images or time series) and, in the more advanced approaches, some\nmetadata. Yet with the growing use of wide-field, multiplexed observational\nresources, individual sources of interest often have a broad range of\nobservational modes available. Here we construct an astronomical multimodal\ndataset and propose AstroM$^3$, a self-supervised pre-training approach that\nenables a model to learn from multiple modalities simultaneously. Specifically,\nwe extend the CLIP (Contrastive Language-Image Pretraining) model to a trimodal\nsetting, allowing the integration of time-series photometry data, spectra, and\nastrophysical metadata. In a fine-tuning supervised setting, our results\ndemonstrate that CLIP pre-training improves classification performance for\ntime-series photometry, where accuracy increases from 84.6% to 91.5%.\nFurthermore, CLIP boosts classification accuracy by up to 12.6% when the\navailability of labeled data is limited, showing the effectiveness of\nleveraging larger corpora of unlabeled data. 
In addition to fine-tuned\nclassification, we can use the trained model in other downstream tasks that are\nnot explicitly contemplated during the construction of the self-supervised\nmodel. In particular we show the efficacy of using the learned embeddings for\nmisclassifications identification, similarity search, and anomaly detection.\nOne surprising highlight is the \"rediscovery\" of Mira subtypes and two\nRotational variable subclasses using manifold learning and dimension reduction\nalgorithm. To our knowledge this is the first construction of an $n>2$ mode\nmodel in astronomy. Extensions to $n>3$ modes is naturally anticipated with\nthis approach.\n","authors":["Mariia Rizhko","Joshua S. Bloom"],"pdf_url":"https://arxiv.org/pdf/2411.08842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08832v1","updated":"2024-11-13T18:12:15Z","published":"2024-11-13T18:12:15Z","title":"Offline Adaptation of Quadruped Locomotion using Diffusion Models","summary":" We present a diffusion-based approach to quadrupedal locomotion that\nsimultaneously addresses the limitations of learning and interpolating between\nmultiple skills and of (modes) offline adapting to new locomotion behaviours\nafter training. This is the first framework to apply classifier-free guided\ndiffusion to quadruped locomotion and demonstrate its efficacy by extracting\ngoal-conditioned behaviour from an originally unlabelled dataset. We show that\nthese capabilities are compatible with a multi-skill policy and can be applied\nwith little modification and minimal compute overhead, i.e., running entirely\non the robots onboard CPU. We verify the validity of our approach with hardware\nexperiments on the ANYmal quadruped platform.\n","authors":["Reece O'Mahoney","Alexander L. 
Mitchell","Wanming Yu","Ingmar Posner","Ioannis Havoutis"],"pdf_url":"https://arxiv.org/pdf/2411.08832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08814v1","updated":"2024-11-13T17:53:23Z","published":"2024-11-13T17:53:23Z","title":"Process-aware Human Activity Recognition","summary":" Humans naturally follow distinct patterns when conducting their daily\nactivities, which are driven by established practices and processes, such as\nproduction workflows, social norms and daily routines. Human activity\nrecognition (HAR) algorithms usually use neural networks or machine learning\ntechniques to analyse inherent relationships within the data. However, these\napproaches often overlook the contextual information in which the data are\ngenerated, potentially limiting their effectiveness. We propose a novel\napproach that incorporates process information from context to enhance the HAR\nperformance. Specifically, we align probabilistic events generated by machine\nlearning models with process models derived from contextual information. This\nalignment adaptively weighs these two sources of information to optimise HAR\naccuracy. Our experiments demonstrate that our approach achieves better\naccuracy and Macro F1-score compared to baseline models.\n","authors":["Jiawei Zheng","Petros Papapanagiotou","Jacques D. Fleuriot","Jane Hillston"],"pdf_url":"https://arxiv.org/pdf/2411.08814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08813v1","updated":"2024-11-13T17:51:57Z","published":"2024-11-13T17:51:57Z","title":"Rethinking CyberSecEval: An LLM-Aided Approach to Evaluation Critique","summary":" A key development in the cybersecurity evaluations space is the work carried\nout by Meta, through their CyberSecEval approach. While this work is\nundoubtedly a useful contribution to a nascent field, there are notable\nfeatures that limit its utility. Key drawbacks focus on the insecure code\ndetection part of Meta's methodology. 
We explore these limitations, and use our\nexploration as a test case for LLM-assisted benchmark analysis.\n","authors":["Suhas Hariharan","Zainab Ali Majid","Jaime Raldua Veuthey","Jacob Haimes"],"pdf_url":"https://arxiv.org/pdf/2411.08813v1.pdf","comment":"NeurIPS 2024, 2 pages"},{"id":"http://arxiv.org/abs/2411.03887v2","updated":"2024-11-13T17:37:55Z","published":"2024-11-01T18:46:03Z","title":"OML: Open, Monetizable, and Loyal AI","summary":" Artificial Intelligence (AI) has steadily improved across a wide range of\ntasks. However, the development and deployment of AI are almost entirely\ncontrolled by a few powerful organizations that are racing to create Artificial\nGeneral Intelligence (AGI). The centralized entities make decisions with little\npublic oversight, shaping the future of humanity, often with unforeseen\nconsequences. In this paper, we propose OML, which stands for Open,\nMonetizable, and Loyal AI, an approach designed to democratize AI development.\nOML is realized through an interdisciplinary framework spanning AI, blockchain,\nand cryptography. We present several ideas for constructing OML using\ntechnologies such as Trusted Execution Environments (TEE), traditional\ncryptographic primitives like fully homomorphic encryption and functional\nencryption, obfuscation, and AI-native solutions rooted in the sample\ncomplexity and intrinsic hardness of AI tasks. A key innovation of our work is\nintroducing a new scientific field: AI-native cryptography. Unlike conventional\ncryptography, which focuses on discrete data and binary security guarantees,\nAI-native cryptography exploits the continuous nature of AI data\nrepresentations and their low-dimensional manifolds, focusing on improving\napproximate performance. One core idea is to transform AI attack methods, such\nas data poisoning, into security tools. This novel approach serves as a\nfoundation for OML 1.0 which uses model fingerprinting to protect the integrity\nand ownership of AI models. 
The spirit of OML is to establish a decentralized,\nopen, and transparent platform for AI development, enabling the community to\ncontribute, monetize, and take ownership of AI models. By decentralizing\ncontrol and ensuring transparency through blockchain technology, OML prevents\nthe concentration of power and provides accountability in AI development that\nhas not been possible before.\n","authors":["Zerui Cheng","Edoardo Contente","Ben Finch","Oleg Golev","Jonathan Hayase","Andrew Miller","Niusha Moshrefi","Anshul Nasery","Sandeep Nailwal","Sewoong Oh","Himanshu Tyagi","Pramod Viswanath"],"pdf_url":"https://arxiv.org/pdf/2411.03887v2.pdf","comment":"60 pages, 22 figures"},{"id":"http://arxiv.org/abs/2411.07315v2","updated":"2024-11-13T17:28:15Z","published":"2024-11-11T19:15:29Z","title":"Harnessing Smartphone Sensors for Enhanced Road Safety: A Comprehensive\n Dataset and Review","summary":" Severe collisions can result from aggressive driving and poor road\nconditions, emphasizing the need for effective monitoring to ensure safety.\nSmartphones, with their array of built-in sensors, offer a practical and\naffordable solution for road-sensing. However, the lack of reliable,\nstandardized datasets has hindered progress in assessing road conditions and\ndriving patterns. This study addresses this gap by introducing a comprehensive\ndataset derived from smartphone sensors, which surpasses existing datasets by\nincorporating a diverse range of sensors including accelerometer, gyroscope,\nmagnetometer, GPS, gravity, orientation, and uncalibrated sensors. These\nsensors capture extensive parameters such as acceleration force, gravitation,\nrotation rate, magnetic field strength, and vehicle speed, providing a detailed\nunderstanding of road conditions and driving behaviors. The dataset is designed\nto enhance road safety, infrastructure maintenance, traffic management, and\nurban planning. 
By making this dataset available to the community, the study\naims to foster collaboration, inspire further research, and facilitate the\ndevelopment of innovative solutions in intelligent transportation systems.\n","authors":["Amith Khandakar","David G. Michelson","Mansura Naznine","Abdus Salam","Md. Nahiduzzaman","Khaled M. Khan","Ponnuthurai Nagaratnam Suganthan","Mohamed Arselene Ayari","Hamid Menouar","Julfikar Haider"],"pdf_url":"https://arxiv.org/pdf/2411.07315v2.pdf","comment":"29 pages, 14 Figures, journal paper, submitted into Scientific Data\n Journal"},{"id":"http://arxiv.org/abs/2411.08794v1","updated":"2024-11-13T17:19:32Z","published":"2024-11-13T17:19:32Z","title":"Evaluating World Models with LLM for Decision Making","summary":" World model emerges as a key module in decision making, where MuZero and\nDreamer achieve remarkable successes in complex tasks. Recent work leverages\nLarge Language Models (LLMs) as general world simulators to simulate the\ndynamics of the world due to their generalizability. LLMs also serve as the\nworld model for deliberative reasoning in Reasoning via Planning (RAP) and Tree\nof Thought (ToT). However, the world models are either evaluated as a general\nworld simulator, or as a functional module of the agent, i.e., predicting the\ntransitions to assist the planning. In this work, we propose a comprehensive\nevaluation of the world models with LLMs from the decision making perspective.\nSpecifically, we leverage the 31 diverse environments from (Wang et al.,\n2023;2024) and curate the rule-based policy of each environment for the diverse\nevaluation. Then, we design three main tasks, i.e., policy verification, action\nproposal, and policy planning, where the world models can be used for decision\nmaking solely. Finally, we conduct the comprehensive evaluation of the advanced\nLLMs, i.e., GPT-4o and GPT-4o-mini, on the environments for the three main\ntasks under various settings. 
The key observations include: i) GPT-4o\nsignificantly outperforms GPT-4o-mini on the three main tasks, especially for\nthe tasks which require the domain knowledge, ii) the performance of the world\nmodel with LLM will be decreased for long-term decision-making tasks, and iii)\nthe combination of different functionalities of the world model will brings\nadditional unstabilities of the performance.\n","authors":["Chang Yang","Xinrun Wang","Junzhe Jiang","Qinggang Zhang","Xiao Huang"],"pdf_url":"https://arxiv.org/pdf/2411.08794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08790v1","updated":"2024-11-13T17:16:48Z","published":"2024-11-13T17:16:48Z","title":"Can sparse autoencoders be used to decompose and interpret steering\n vectors?","summary":" Steering vectors are a promising approach to control the behaviour of large\nlanguage models. However, their underlying mechanisms remain poorly understood.\nWhile sparse autoencoders (SAEs) may offer a potential method to interpret\nsteering vectors, recent findings show that SAE-reconstructed vectors often\nlack the steering properties of the original vectors. This paper investigates\nwhy directly applying SAEs to steering vectors yields misleading\ndecompositions, identifying two reasons: (1) steering vectors fall outside the\ninput distribution for which SAEs are designed, and (2) steering vectors can\nhave meaningful negative projections in feature directions, which SAEs are not\ndesigned to accommodate. 
These limitations hinder the direct use of SAEs for\ninterpreting steering vectors.\n","authors":["Harry Mayne","Yushi Yang","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.08790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17101v8","updated":"2024-11-13T17:14:29Z","published":"2024-03-25T18:38:54Z","title":"AI Consciousness is Inevitable: A Theoretical Computer Science\n Perspective","summary":" We look at consciousness through the lens of Theoretical Computer Science, a\nbranch of mathematics that studies computation under resource limitations. From\nthis perspective, we develop a formal machine model for consciousness. The\nmodel is inspired by Alan Turing's simple yet powerful model of computation and\nBernard Baars' theater model of consciousness. Though extremely simple, the\nmodel aligns at a high level with many of the major scientific theories of\nhuman and animal consciousness, supporting our claim that machine consciousness\nis inevitable.\n","authors":["Lenore Blum","Manuel Blum"],"pdf_url":"https://arxiv.org/pdf/2403.17101v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08785v1","updated":"2024-11-13T17:13:25Z","published":"2024-11-13T17:13:25Z","title":"Zero-shot Cross-lingual Transfer Learning with Multiple Source and\n Target Languages for Information Extraction: Language Selection and\n Adversarial Training","summary":" The majority of previous researches addressing multi-lingual IE are limited\nto zero-shot cross-lingual single-transfer (one-to-one) setting, with\nhigh-resource languages predominantly as source training data. As a result,\nthese works provide little understanding and benefit for the realistic goal of\ndeveloping a multi-lingual IE system that can generalize to as many languages\nas possible. Our study aims to fill this gap by providing a detailed analysis\non Cross-Lingual Multi-Transferability (many-to-many transfer learning), for\nthe recent IE corpora that cover a diverse set of languages. 
Specifically, we\nfirst determine the correlation between single-transfer performance and a wide\nrange of linguistic-based distances. From the obtained insights, a combined\nlanguage distance metric can be developed that is not only highly correlated\nbut also robust across different tasks and model scales. Next, we investigate\nthe more general zero-shot multi-lingual transfer settings where multiple\nlanguages are involved in the training and evaluation processes. Language\nclustering based on the newly defined distance can provide directions for\nachieving the optimal cost-performance trade-off in data (languages) selection\nproblem. Finally, a relational-transfer setting is proposed to further\nincorporate multi-lingual unlabeled data based on adversarial training using\nthe relation induced from the above linguistic distance.\n","authors":["Nghia Trung Ngo","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.08785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19552v2","updated":"2024-11-13T17:12:34Z","published":"2024-09-29T04:41:10Z","title":"A Universal Deep Learning Framework for Materials X-ray Absorption\n Spectra","summary":" X-ray absorption spectroscopy (XAS) is a powerful characterization technique\nfor probing the local chemical environment of absorbing atoms. However,\nanalyzing XAS data presents significant challenges, often requiring extensive,\ncomputationally intensive simulations, as well as significant domain expertise.\nThese limitations hinder the development of fast, robust XAS analysis pipelines\nthat are essential in high-throughput studies and for autonomous\nexperimentation. We address these challenges with OmniXAS, a framework that\ncontains a suite of transfer learning approaches for XAS prediction, each\ncontributing to improved accuracy and efficiency, as demonstrated on K-edge\nspectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS\nframework is built upon three distinct strategies. 
First, we use M3GNet to\nderive latent representations of the local chemical environment of absorption\nsites as input for XAS prediction, achieving up to order-of-magnitude\nimprovements over conventional featurization techniques. Second, we employ a\nhierarchical transfer learning strategy, training a universal multi-task model\nacross elements before fine-tuning for element-specific predictions. Models\nbased on this cascaded approach after element-wise fine-tuning outperform\nelement-specific models by up to 69%. Third, we implement cross-fidelity\ntransfer learning, adapting a universal model to predict spectra generated by\nsimulation of a different fidelity with a higher computational cost. This\napproach improves prediction accuracy by up to 11% over models trained on the\ntarget fidelity alone. Our approach boosts the throughput of XAS modeling by\norders of magnitude versus first-principles simulations and is extendable to\nXAS prediction for a broader range of elements. This transfer learning\nframework is generalizable to enhance deep-learning models that target other\nproperties in materials research.\n","authors":["Shubha R. Kharel","Fanchen Meng","Xiaohui Qu","Matthew R. Carbone","Deyu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.19552v2.pdf","comment":"Main manuscript: 22 pages, 11 figures. Supplemental material (12\n pages, 6 figures) available as a separate file in arXiv ancillary files\n (additional downloadable files)"},{"id":"http://arxiv.org/abs/2402.03271v3","updated":"2024-11-13T17:10:20Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. 
In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09322v2","updated":"2024-11-13T17:08:34Z","published":"2024-06-13T17:00:30Z","title":"Active Inference Meeting Energy-Efficient Control of Parallel and\n Identical Machines","summary":" We investigate the application of active inference in developing\nenergy-efficient control agents for manufacturing systems. 
Active inference,\nrooted in neuroscience, provides a unified probabilistic framework integrating\nperception, learning, and action, with inherent uncertainty quantification\nelements. Our study explores deep active inference, an emerging field that\ncombines deep learning with the active inference decision-making framework.\nLeveraging a deep active inference agent, we focus on controlling parallel and\nidentical machine workstations to enhance energy efficiency. We address\nchallenges posed by the problem's stochastic nature and delayed policy response\nby introducing tailored enhancements to existing agent architectures.\nSpecifically, we introduce multi-step transition and hybrid horizon methods to\nmitigate the need for complex planning. Our experimental results demonstrate\nthe effectiveness of these enhancements and highlight the potential of the\nactive inference-based approach.\n","authors":["Yavar Taheri Yeganeh","Mohsen Jafari","Andrea Matta"],"pdf_url":"https://arxiv.org/pdf/2406.09322v2.pdf","comment":"Accepted at the 10th International Conference on Machine Learning,\n Optimization, and Data Science"},{"id":"http://arxiv.org/abs/2304.11123v2","updated":"2024-11-13T17:06:10Z","published":"2023-04-21T16:47:51Z","title":"China and the U.S. produce more impactful AI research when collaborating\n together","summary":" Artificial Intelligence (AI) has become a disruptive technology, promising to\ngrant a significant economic and strategic advantage to nations that harness\nits power. China, with its recent push towards AI adoption, is challenging the\nU.S.'s position as the global leader in this field. Given AI's massive\npotential, as well as the fierce geopolitical tensions between China and the\nU.S., several recent policies have been put in place to discourage AI\nscientists from migrating to, or collaborating with, the other nation.\nNevertheless, the extent of talent migration and cross-border collaboration are\nnot fully understood. 
Here, we analyze a dataset of over 350,000 AI scientists\nand 5,000,000 AI papers. We find that since 2000, China and the U.S. have led\nthe field in terms of impact, novelty, productivity, and workforce. Most AI\nscientists who move to China come from the U.S., and most who move to the U.S.\ncome from China, highlighting a notable bidirectional talent migration.\nMoreover, the vast majority of those moving in either direction have Asian\nancestry. Upon moving, those scientists continue to collaborate frequently with\nthose in the origin country. Although the number of collaborations between the\ntwo countries has increased since the dawn of the millennium, such\ncollaborations continue to be relatively rare. A matching experiment reveals\nthat the two countries have always been more impactful when collaborating than\nwhen each works without the other. These findings suggest that instead of\nsuppressing cross-border migration and collaboration between the two nations,\nthe science could benefit from promoting such activities.\n","authors":["Bedoor AlShebli","Shahan Ali Memon","James A. Evans","Talal Rahwan"],"pdf_url":"https://arxiv.org/pdf/2304.11123v2.pdf","comment":"38 pages, 15 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.08768v1","updated":"2024-11-13T16:53:29Z","published":"2024-11-13T16:53:29Z","title":"Sharingan: Extract User Action Sequence from Desktop Recordings","summary":" Video recordings of user activities, particularly desktop recordings, offer a\nrich source of data for understanding user behaviors and automating processes.\nHowever, despite advancements in Vision-Language Models (VLMs) and their\nincreasing use in video analysis, extracting user actions from desktop\nrecordings remains an underexplored area. 
This paper addresses this gap by\nproposing two novel VLM-based methods for user action extraction: the Direct\nFrame-Based Approach (DF), which inputs sampled frames directly into VLMs, and\nthe Differential Frame-Based Approach (DiffF), which incorporates explicit\nframe differences detected via computer vision techniques. We evaluate these\nmethods using a basic self-curated dataset and an advanced benchmark adapted\nfrom prior work. Our results show that the DF approach achieves an accuracy of\n70% to 80% in identifying user actions, with the extracted action sequences\nbeing re-playable though Robotic Process Automation. We find that while VLMs\nshow potential, incorporating explicit UI changes can degrade performance,\nmaking the DF approach more reliable. This work represents the first\napplication of VLMs for extracting user action sequences from desktop\nrecordings, contributing new methods, benchmarks, and insights for future\nresearch.\n","authors":["Yanting Chen","Yi Ren","Xiaoting Qin","Jue Zhang","Kehong Yuan","Lu Han","Qingwei Lin","Dongmei Zhang","Saravan Rajmohan","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08767v1","updated":"2024-11-13T16:53:14Z","published":"2024-11-13T16:53:14Z","title":"SANDWICH: Towards an Offline, Differentiable, Fully-Trainable Wireless\n Neural Ray-Tracing Surrogate","summary":" Wireless ray-tracing (RT) is emerging as a key tool for three-dimensional\n(3D) wireless channel modeling, driven by advances in graphical rendering.\nCurrent approaches struggle to accurately model beyond 5G (B5G) network\nsignaling, which often operates at higher frequencies and is more susceptible\nto environmental conditions and changes. Existing online learning solutions\nrequire real-time environmental supervision during training, which is both\ncostly and incompatible with GPU-based processing. 
In response, we propose a\nnovel approach that redefines ray trajectory generation as a sequential\ndecision-making problem, leveraging generative models to jointly learn the\noptical, physical, and signal properties within each designated environment.\nOur work introduces the Scene-Aware Neural Decision Wireless Channel Raytracing\nHierarchy (SANDWICH), an innovative offline, fully differentiable approach that\ncan be trained entirely on GPUs. SANDWICH offers superior performance compared\nto existing online learning methods, outperforms the baseline by 4e^-2 radian\nin RT accuracy, and only fades 0.5 dB away from toplined channel gain\nestimation.\n","authors":["Yifei Jin","Ali Maatouk","Sarunas Girdzijauskas","Shugong Xu","Leandros Tassiulas","Rex Ying"],"pdf_url":"https://arxiv.org/pdf/2411.08767v1.pdf","comment":"Submitted in ICASSP 2025"},{"id":"http://arxiv.org/abs/2411.08764v1","updated":"2024-11-13T16:49:56Z","published":"2024-11-13T16:49:56Z","title":"Flow reconstruction in time-varying geometries using graph neural\n networks","summary":" The paper presents a Graph Attention Convolutional Network (GACN) for flow\nreconstruction from very sparse data in time-varying geometries. The model\nincorporates a feature propagation algorithm as a preprocessing step to handle\nextremely sparse inputs, leveraging information from neighboring nodes to\ninitialize missing features. In addition, a binary indicator is introduced as a\nvalidity mask to distinguish between the original and propagated data points,\nenabling more effective learning from sparse inputs. Trained on a unique data\nset of Direct Numerical Simulations (DNS) of a motored engine at a technically\nrelevant operating condition, the GACN shows robust performance across\ndifferent resolutions and domain sizes and can effectively handle unstructured\ndata and variable input sizes. 
The model is tested on previously unseen DNS\ndata as well as on an experimental data set from Particle Image Velocimetry\n(PIV) measurements that were not considered during training. A comparative\nanalysis shows that the GACN consistently outperforms both a conventional\nConvolutional Neural Network (CNN) and cubic interpolation methods on the DNS\nand PIV test sets by achieving lower reconstruction errors and better capturing\nfine-scale turbulent structures. In particular, the GACN effectively\nreconstructs flow fields from domains up to 14 times larger than those observed\nduring training, with the performance advantage increasing for larger domains.\n","authors":["Bogdan A. Danciu","Vito A. Pagone","Benjamin Böhm","Marius Schmidt","Christos E. Frouzakis"],"pdf_url":"https://arxiv.org/pdf/2411.08764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13150v2","updated":"2024-11-13T16:46:23Z","published":"2024-03-19T20:58:38Z","title":"On Training Survival Models with Scoring Rules","summary":" Scoring rules are an established way of comparing predictive performances\nacross model classes. In the context of survival analysis, they require\nadaptation in order to accommodate censoring. This work investigates using\nscoring rules for model training rather than evaluation. Doing so, we establish\na general framework for training survival models that is model agnostic and can\nlearn event time distributions parametrically or non-parametrically. In\naddition, our framework is not restricted to any specific scoring rule. While\nwe focus on neural network-based implementations, we also provide\nproof-of-concept implementations using gradient boosting, generalized additive\nmodels, and trees. 
Empirical comparisons on synthetic and real-world data\nindicate that scoring rules can be successfully incorporated into model\ntraining and yield competitive predictive performance with established\ntime-to-event models.\n","authors":["Philipp Kopper","David Rügamer","Raphael Sonabend","Bernd Bischl","Andreas Bender"],"pdf_url":"https://arxiv.org/pdf/2403.13150v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.03679v6","updated":"2024-11-13T16:42:22Z","published":"2024-06-06T01:49:29Z","title":"On the Effects of Data Scale on UI Control Agents","summary":" Autonomous agents that control computer interfaces to accomplish human tasks\nare emerging. Leveraging LLMs to power such agents has been of special\ninterest, but unless fine-tuned on human-collected task demonstrations,\nperformance is still relatively low. In this work we study whether fine-tuning\nalone is a viable approach for building real-world computer control agents. In\nparticularly, we investigate how performance measured on both high and\nlow-level tasks in domain and out of domain scales as more training data is\ncollected. To this end we collect and release a new dataset, AndroidControl,\nconsisting of 15,283 demonstrations of everyday tasks with Android apps.\nCompared to existing datasets, each AndroidControl task instance includes both\nhigh and low-level human-generated instructions, allowing us to explore the\nlevel of task complexity an agent can handle. Moreover, AndroidControl is the\nmost diverse computer control dataset to date, including 14,548 unique tasks\nover 833 Android apps, thus allowing us to conduct in-depth analysis of the\nmodel performance in and out of the domain of the training data. Using the\ndataset, we find that when tested in domain fine-tuned models outperform zero\nand few-shot baselines and scale in such a way that robust performance might\nfeasibly be obtained simply by collecting more data. 
Out of domain, performance\nscales significantly more slowly and suggests that in particular for high-level\ntasks, fine-tuning on more data alone may be insufficient for achieving robust\nout-of-domain performance.\n","authors":["Wei Li","William Bishop","Alice Li","Chris Rawles","Folawiyo Campbell-Ajala","Divya Tyamagundlu","Oriana Riva"],"pdf_url":"https://arxiv.org/pdf/2406.03679v6.pdf","comment":"NeurIPS 2024 (Datasets and Benchmarks)"},{"id":"http://arxiv.org/abs/2402.04032v4","updated":"2024-11-13T16:33:33Z","published":"2024-02-06T14:26:22Z","title":"ProactivePIM: Accelerating Weight-Sharing Embedding Layer with PIM for\n Scalable Recommendation System","summary":" The personalized recommendation system's continuous size growth poses new\nchallenges for model inference. Although weight-sharing algorithms have been\nproposed to reduce embedding table capacity, they increase memory access.\nRecent advancements in processing-in-memory (PIM) successfully enhance the\nrecommendation system's throughput by exploiting memory parallelism, but our\nanalysis shows that those algorithms introduce CPU-PIM communication overhead\ninto prior PIM systems, compromising the PIM throughput. We propose\nProactivePIM, a specialized memory architecture integrated with PIM technology\ntailored to accelerate the weight-sharing algorithms. 
ProacitvePIM integrates\nan SRAM cache within the PIM with an efficient prefetching scheme to leverage a\nunique locality of the algorithm and eliminate CPU-PIM communication.\n","authors":["Youngsuk Kim","Junghwan Lim","Hyuk-Jae Lee","Chae Eun Rhee"],"pdf_url":"https://arxiv.org/pdf/2402.04032v4.pdf","comment":"7 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.08745v1","updated":"2024-11-13T16:26:19Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v1.pdf","comment":"12 pages, 10 figures, previously published under the title \"How Do\n Llamas Process Multilingual Text? 
A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2411.08728v1","updated":"2024-11-13T16:10:14Z","published":"2024-11-13T16:10:14Z","title":"Polymetis:Large Language Modeling for Multiple Material Domains","summary":" As the application of large language models in various fields continues to\nexpand, materials science also ushers in opportunities for AI-driven\ninnovation. The traditional way of relying on manual search for materials\nscience-related information is now using artificial intelligence technology as\nan auxiliary tool to improve the efficiency of materials science research. To\naccelerate researchers' knowledge acquisition and intelligent decision-making\nsupport in materials science research, this paper proposes a large language\nmodel Polymetis model for a variety of materials fields, aiming to provide\nhighly professional knowledge answers in the field of materials, covering\nenergy materials, functional materials, alloy materials, physical chemistry,\nbiology, and other material directions. The model uses a dataset of about 2\nmillion material knowledge instructions, and in the process of building the\ndataset, we developed the Intelligent Extraction Large Model (IELM), which is\nspecially used to extract and form structured knowledge from scientific texts,\navoiding a large number of costs that need to be manually annotated, and\nimproving efficiency. We inject this data into the GLM4-9B model for learning\nto enhance its inference capabilities in a variety of material domains. 
In\naddition, we have introduced enhanced prompt strategies to ensure that the\nanswers to the model are more organized and comprehensive, providing efficient\nand comprehensive intelligent support for the diverse needs of materials\nscience exploration, and promoting the development of material science.\n","authors":["Chao Huang","Huichen Xiao","Chen Chen","Chunyan Chen","Yi Zhao","Shiyu Du","Yiming Zhang","He Sha","Ruixin Gu"],"pdf_url":"https://arxiv.org/pdf/2411.08728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05284v4","updated":"2024-11-13T15:53:37Z","published":"2023-07-11T14:25:10Z","title":"Rethinking Distribution Shifts: Empirical Analysis and Inductive\n Modeling for Tabular Data","summary":" Different distribution shifts require different interventions, and algorithms\nmust be grounded in the specific shifts they address. However, methodological\ndevelopment for robust algorithms typically relies on structural assumptions\nthat lack empirical validation. Advocating for an empirically grounded\ndata-driven approach to research, we build an empirical testbed comprising\nnatural shifts across 5 tabular datasets and 60,000 method configurations\nencompassing imbalanced learning and distributionally robust optimization (DRO)\nmethods. We find $Y|X$-shifts are most prevalent on our testbed, in stark\ncontrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The\nperformance of robust algorithms varies significantly over shift types, and is\nno better than that of vanilla methods. To understand why, we conduct an\nin-depth empirical analysis of DRO methods and find that although often\nneglected by researchers, implementation details -- such as the choice of\nunderlying model class (e.g., XGBoost) and hyperparameter selection -- have a\nbigger impact on performance than the ambiguity set or its radius. 
To further\nbridge that gap between methodological research and practice, we design case\nstudies that illustrate how such a data-driven, inductive understanding of\ndistribution shifts can enhance both data-centric and algorithmic\ninterventions.\n","authors":["Jiashuo Liu","Tianyu Wang","Peng Cui","Hongseok Namkoong"],"pdf_url":"https://arxiv.org/pdf/2307.05284v4.pdf","comment":"Conference version appeared in NeurIPS 2023, previously titled \"On\n the Need for a Language Describing Distribution Shifts: Illustrations on\n Tabular Datasets\""},{"id":"http://arxiv.org/abs/2411.08706v1","updated":"2024-11-13T15:50:32Z","published":"2024-11-13T15:50:32Z","title":"Searching Latent Program Spaces","summary":" Program synthesis methods aim to automatically generate programs restricted\nto a language that can explain a given specification of input-output pairs.\nWhile purely symbolic approaches suffer from a combinatorial search space,\nrecent methods leverage neural networks to learn distributions over program\nstructures to narrow this search space significantly, enabling more efficient\nsearch. However, for challenging problems, it remains difficult to train models\nto perform program synthesis in one shot, making test-time search essential.\nMost neural methods lack structured search mechanisms during inference, relying\ninstead on stochastic sampling or gradient updates, which can be inefficient.\nIn this work, we propose the Latent Program Network (LPN), a general algorithm\nfor program induction that learns a distribution over latent programs in a\ncontinuous space, enabling efficient search and test-time adaptation. We\nexplore how to train these networks to optimize for test-time computation and\ndemonstrate the use of gradient-based search both during training and at test\ntime. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates\nperformance by generalizing programs to new inputs rather than explaining the\nunderlying specification. 
We show that LPN can generalize beyond its training\ndistribution and adapt to unseen tasks by utilizing test-time computation,\noutperforming algorithms without test-time adaptation mechanisms.\n","authors":["Clément Bonnet","Matthew V Macfarlane"],"pdf_url":"https://arxiv.org/pdf/2411.08706v1.pdf","comment":"Code available at https://github.com/clement-bonnet/lpn"},{"id":"http://arxiv.org/abs/2408.00838v2","updated":"2024-11-13T15:48:34Z","published":"2024-08-01T18:00:05Z","title":"Calibrating Bayesian Generative Machine Learning for Bayesiamplification","summary":" Recently, combinations of generative and Bayesian machine learning have been\nintroduced in particle physics for both fast detector simulation and inference\ntasks. These neural networks aim to quantify the uncertainty on the generated\ndistribution originating from limited training statistics. The interpretation\nof a distribution-wide uncertainty however remains ill-defined. We show a clear\nscheme for quantifying the calibration of Bayesian generative machine learning\nmodels. For a Continuous Normalizing Flow applied to a low-dimensional toy\nexample, we evaluate the calibration of Bayesian uncertainties from either a\nmean-field Gaussian weight posterior, or Monte Carlo sampling network weights,\nto gauge their behaviour on unsteady distribution edges. 
Well calibrated\nuncertainties can then be used to roughly estimate the number of uncorrelated\ntruth samples that are equivalent to the generated sample and clearly indicate\ndata amplification for smooth features of the distribution.\n","authors":["Sebastian Bieringer","Sascha Diefenbacher","Gregor Kasieczka","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2408.00838v2.pdf","comment":"15 pages, 6 figures, updated references, fixed typo"},{"id":"http://arxiv.org/abs/2402.10705v3","updated":"2024-11-13T15:46:08Z","published":"2024-02-16T14:04:56Z","title":"AutoSAT: Automatically Optimize SAT Solvers via Large Language Models","summary":" Conflict-Driven Clause Learning (CDCL) is the mainstream framework for\nsolving the Satisfiability problem (SAT), and CDCL solvers typically rely on\nvarious heuristics, which have a significant impact on their performance.\nModern CDCL solvers, such as MiniSat and Kissat, commonly incorporate several\nheuristics and select one to use according to simple rules, requiring\nsignificant time and expert effort to fine-tune in practice. The pervasion of\nLarge Language Models (LLMs) provides a potential solution to address this\nissue. However, generating a CDCL solver from scratch is not effective due to\nthe complexity and context volume of SAT solvers. Instead, we propose AutoSAT,\na framework that automatically optimizes heuristics in a pre-defined modular\nsearch space based on existing CDCL solvers. Unlike existing automated\nalgorithm design approaches focusing on hyperparameter tuning and operator\nselection, AutoSAT can generate new efficient heuristics. In this first attempt\nat optimizing SAT solvers using LLMs, several strategies including the greedy\nhill climber and (1+1) Evolutionary Algorithm are employed to guide LLMs to\nsearch for better heuristics. Experimental results demonstrate that LLMs can\ngenerally enhance the performance of CDCL solvers. 
A realization of AutoSAT\noutperforms MiniSat on 9 out of 12 datasets and even surpasses the\nstate-of-the-art hybrid solver Kissat on 4 datasets.\n","authors":["Yiwen Sun","Furong Ye","Xianyin Zhang","Shiyu Huang","Bingzhen Zhang","Ke Wei","Shaowei Cai"],"pdf_url":"https://arxiv.org/pdf/2402.10705v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08703v1","updated":"2024-11-13T15:45:46Z","published":"2024-11-13T15:45:46Z","title":"MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics\n Classification","summary":" The distinct characteristics of multiomics data, including complex\ninteractions within and across biological layers and disease heterogeneity\n(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop\nnovel designs to address unique challenges in multiomics prediction. In this\npaper, we propose the multi-view knowledge transfer learning (MVKTrans)\nframework, which transfers intra- and inter-omics knowledge in an adaptive\nmanner by reviewing data heterogeneity and suppressing bias transfer, thereby\nenhancing classification performance. Specifically, we design a graph\ncontrastive module that is trained on unlabeled data to effectively learn and\ntransfer the underlying intra-omics patterns to the supervised task. This\nunsupervised pretraining promotes learning general and unbiased representations\nfor each modality, regardless of the downstream tasks. In light of the varying\ndiscriminative capacities of modalities across different diseases and/or\nsamples, we introduce an adaptive and bi-directional cross-omics distillation\nmodule. This module automatically identifies richer modalities and facilitates\ndynamic knowledge transfer from more informative to less informative omics,\nthereby enabling a more robust and generalized integration. Extensive\nexperiments on four real biomedical datasets demonstrate the superior\nperformance and robustness of MVKTrans compared to the state-of-the-art. 
Code\nand data are available at https://github.com/Yaolab-fantastic/MVKTrans.\n","authors":["Shan Cong","Zhiling Sang","Hongwei Liu","Haoran Luo","Xin Wang","Hong Liang","Jie Hao","Xiaohui Yao"],"pdf_url":"https://arxiv.org/pdf/2411.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07879v4","updated":"2024-11-13T15:45:31Z","published":"2023-11-14T03:18:28Z","title":"Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting\n Volunteer Content Moderators","summary":" Extensive efforts in automated approaches for content moderation have been\nfocused on developing models to identify toxic, offensive, and hateful content\nwith the aim of lightening the load for moderators. Yet, it remains uncertain\nwhether improvements on those tasks have truly addressed moderators' needs in\naccomplishing their work. In this paper, we surface gaps between past research\nefforts that have aimed to provide automation for aspects of content moderation\nand the needs of volunteer content moderators, regarding identifying violations\nof various moderation rules. To do so, we conduct a model review on Hugging\nFace to reveal the availability of models to cover various moderation rules and\nguidelines from three exemplar forums. We further put state-of-the-art LLMs to\nthe test, evaluating how well these models perform in flagging violations of\nplatform rules from one particular forum. Finally, we conduct a user survey\nstudy with volunteer moderators to gain insight into their perspectives on\nuseful moderation models. Overall, we observe a non-trivial gap, as missing\ndeveloped models and LLMs exhibit moderate to low performance on a significant\nportion of the rules. 
Moderators' reports provide guides for future work on\ndeveloping moderation assistant models.\n","authors":["Yang Trista Cao","Lovely-Frances Domingo","Sarah Ann Gilbert","Michelle Mazurek","Katie Shilton","Hal Daumé III"],"pdf_url":"https://arxiv.org/pdf/2311.07879v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08701v1","updated":"2024-11-13T15:42:28Z","published":"2024-11-13T15:42:28Z","title":"TRACE: Transformer-based Risk Assessment for Clinical Evaluation","summary":" We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation),\na novel method for clinical risk assessment based on clinical data, leveraging\nthe self-attention mechanism for enhanced feature interaction and result\ninterpretation. Our approach is able to handle different data modalities,\nincluding continuous, categorical and multiple-choice (checkbox) attributes.\nThe proposed architecture features a shared representation of the clinical data\nobtained by integrating specialized embeddings of each data modality, enabling\nthe detection of high-risk individuals using Transformer encoder layers. To\nassess the effectiveness of the proposed method, a strong baseline based on\nnon-negative multi-layer perceptrons (MLPs) is introduced. The proposed method\noutperforms various baselines widely used in the domain of clinical risk\nassessment, while effectively handling missing values. 
In terms of\nexplainability, our Transformer-based method offers easily interpretable\nresults via attention weights, further enhancing the clinicians'\ndecision-making process.\n","authors":["Dionysis Christopoulos","Sotiris Spanos","Valsamis Ntouskos","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2411.08701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08700v1","updated":"2024-11-13T15:42:13Z","published":"2024-11-13T15:42:13Z","title":"Rethinking negative sampling in content-based news recommendation","summary":" News recommender systems are hindered by the brief lifespan of articles, as\nthey undergo rapid relevance decay. Recent studies have demonstrated the\npotential of content-based neural techniques in tackling this problem. However,\nthese models often involve complex neural architectures and often lack\nconsideration for negative examples. In this study, we posit that the careful\nsampling of negative examples has a big impact on the model's outcome. We\ndevise a negative sampling technique that not only improves the accuracy of the\nmodel but also facilitates the decentralization of the recommendation system.\nThe experimental results obtained using the MIND dataset demonstrate that the\naccuracy of the method under consideration can compete with that of\nState-of-the-Art models. The utilization of the sampling technique is essential\nin reducing model complexity and accelerating the training process, while\nmaintaining a high level of accuracy. 
Finally, we discuss how decentralized\nmodels can help improve privacy and scalability.\n","authors":["Miguel Ângelo Rebelo","João Vinagre","Ivo Pereira","Álvaro Figueira"],"pdf_url":"https://arxiv.org/pdf/2411.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08696v1","updated":"2024-11-13T15:34:52Z","published":"2024-11-13T15:34:52Z","title":"Scholarly Wikidata: Population and Exploration of Conference Data in\n Wikidata using LLMs","summary":" Several initiatives have been undertaken to conceptually model the domain of\nscholarly data using ontologies and to create respective Knowledge Graphs. Yet,\nthe full potential seems unleashed, as automated means for automatic population\nof said ontologies are lacking, and respective initiatives from the Semantic\nWeb community are not necessarily connected: we propose to make scholarly data\nmore sustainably accessible by leveraging Wikidata's infrastructure and\nautomating its population in a sustainable manner through LLMs by tapping into\nunstructured sources like conference Web sites and proceedings texts as well as\nalready existing structured conference datasets. While an initial analysis\nshows that Semantic Web conferences are only minimally represented in Wikidata,\nwe argue that our methodology can help to populate, evolve and maintain\nscholarly data as a community within Wikidata. Our main contributions include\n(a) an analysis of ontologies for representing scholarly data to identify gaps\nand relevant entities/properties in Wikidata, (b) semi-automated extraction --\nrequiring (minimal) manual validation -- of conference metadata (e.g.,\nacceptance rates, organizer roles, programme committee members, best paper\nawards, keynotes, and sponsors) from websites and proceedings texts using LLMs.\nFinally, we discuss (c) extensions to visualization tools in the Wikidata\ncontext for data exploration of the generated scholarly data. 
Our study focuses\non data from 105 Semantic Web-related conferences and extends/adds more than\n6000 entities in Wikidata. It is important to note that the method can be more\ngenerally applicable beyond Semantic Web-related conferences for enhancing\nWikidata's utility as a comprehensive scholarly resource.\n Source Repository: https://github.com/scholarly-wikidata/\n DOI: https://doi.org/10.5281/zenodo.10989709\n License: Creative Commons CC0 (Data), MIT (Code)\n","authors":["Nandana Mihindukulasooriya","Sanju Tiwari","Daniil Dobriy","Finn Årup Nielsen","Tek Raj Chhetri","Axel Polleres"],"pdf_url":"https://arxiv.org/pdf/2411.08696v1.pdf","comment":"17 pages, accepted at EKAW-24"},{"id":"http://arxiv.org/abs/2407.21164v2","updated":"2024-11-13T15:22:32Z","published":"2024-07-30T20:10:59Z","title":"Extending choice assessments to choice functions: An algorithm for\n computing the natural extension","summary":" We study how to infer new choices from prior choices using the framework of\nchoice functions, a unifying mathematical framework for decision-making based\non sets of preference orders. In particular, we define the natural (most\nconservative) extension of a given choice assessment to a coherent choice\nfunction -- whenever possible -- and use this natural extension to make new\nchoices. We provide a practical algorithm for computing this natural extension\nand various ways to improve scalability. 
Finally, we test these algorithms for\ndifferent types of choice assessments.\n","authors":["Arne Decadt","Alexander Erreygers","Jasper De Bock"],"pdf_url":"https://arxiv.org/pdf/2407.21164v2.pdf","comment":"40 pages, 8 figures, pre-print for International Journal of\n Approximate Reasoning"},{"id":"http://arxiv.org/abs/2411.08684v1","updated":"2024-11-13T15:20:14Z","published":"2024-11-13T15:20:14Z","title":"Analogical Reasoning Within a Conceptual Hyperspace","summary":" We propose an approach to analogical inference that marries the\nneuro-symbolic computational power of complex-sampled hyperdimensional\ncomputing (HDC) with Conceptual Spaces Theory (CST), a promising theory of\nsemantic meaning. CST sketches, at an abstract level, approaches to analogical\ninference that go beyond the standard predicate-based structure mapping\ntheories. But it does not describe how such an approach can be operationalized.\nWe propose a concrete HDC-based architecture that computes several types of\nanalogy classified by CST. We present preliminary proof-of-concept experimental\nresults within a toy domain and describe how it can perform category-based and\nproperty-based analogical reasoning.\n","authors":["Howard Goldowsky","Vasanth Sarathy"],"pdf_url":"https://arxiv.org/pdf/2411.08684v1.pdf","comment":"Analogy-angle workshop full paper at IJCAI 2024"},{"id":"http://arxiv.org/abs/2411.08666v1","updated":"2024-11-13T14:59:41Z","published":"2024-11-13T14:59:41Z","title":"A Survey on Vision Autoregressive Model","summary":" Autoregressive models have demonstrated great performance in natural language\nprocessing (NLP) with impressive scalability, adaptability and\ngeneralizability. 
Inspired by their notable success in NLP field,\nautoregressive models have been intensively investigated recently for computer\nvision, which perform next-token predictions by representing visual data as\nvisual tokens and enables autoregressive modelling for a wide range of vision\ntasks, ranging from visual generation and visual understanding to the very\nrecent multimodal generation that unifies visual generation and understanding\nwith a single autoregressive model. This paper provides a systematic review of\nvision autoregressive models, including the development of a taxonomy of\nexisting methods and highlighting their major contributions, strengths, and\nlimitations, covering various vision tasks such as image generation, video\ngeneration, image editing, motion generation, medical image analysis, 3D\ngeneration, robotic manipulation, unified multimodal generation, etc. Besides,\nwe investigate and analyze the latest advancements in autoregressive models,\nincluding thorough benchmarking and discussion of existing methods across\nvarious evaluation datasets. Finally, we outline key challenges and promising\ndirections for future research, offering a roadmap to guide further\nadvancements in vision autoregressive models.\n","authors":["Kai Jiang","Jiaxing Huang"],"pdf_url":"https://arxiv.org/pdf/2411.08666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13224v4","updated":"2024-11-13T14:54:18Z","published":"2024-02-20T18:37:11Z","title":"Controlling Large Electric Vehicle Charging Stations via User Behavior\n Modeling and Stochastic Programming","summary":" This paper introduces an Electric Vehicle Charging Station (EVCS) model that\nincorporates real-world constraints, such as slot power limitations, contract\nthreshold overruns penalties, or early disconnections of electric vehicles\n(EVs). 
We propose a formulation of the problem of EVCS control under\nuncertainty, and implement two Multi-Stage Stochastic Programming approaches\nthat leverage user-provided information, namely, Model Predictive Control and\nTwo-Stage Stochastic Programming. The model addresses uncertainties in charging\nsession start and end times, as well as in energy demand. A user's behavior\nmodel based on a sojourn-time-dependent stochastic process enhances cost\nreduction while maintaining customer satisfaction. The benefits of the two\nproposed methods are showcased against two baselines over a 22-day simulation\nusing a real-world dataset. The two-stage approach demonstrates robustness\nagainst early disconnections by considering a wider range of uncertainty\nscenarios for optimization. The algorithm prioritizing user satisfaction over\nelectricity cost achieves a 20% and 36% improvement in two user satisfaction\nmetrics compared to an industry-standard baseline. Additionally, the algorithm\nstriking the best balance between cost and user satisfaction exhibits a mere 3%\nrelative cost increase compared to the theoretically optimal baseline - for\nwhich the nonanticipativity constraint is relaxed - while attaining 94% and 84%\nof the user satisfaction performance in the two used satisfaction metrics.\n","authors":["Alban Puech","Tristan Rigaut","William Templier","Maud Tournoud"],"pdf_url":"https://arxiv.org/pdf/2402.13224v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15552v3","updated":"2024-11-13T14:52:21Z","published":"2024-02-23T17:21:21Z","title":"Morphological Symmetries in Robotics","summary":" We present a comprehensive framework for studying and leveraging\nmorphological symmetries in robotic systems. These are intrinsic properties of\nthe robot's morphology, frequently observed in animal biology and robotics,\nwhich stem from the replication of kinematic structures and the symmetrical\ndistribution of mass. 
We illustrate how these symmetries extend to the robot's\nstate space and both proprioceptive and exteroceptive sensor measurements,\nresulting in the equivariance of the robot's equations of motion and optimal\ncontrol policies. Thus, we recognize morphological symmetries as a relevant and\npreviously unexplored physics-informed geometric prior, with significant\nimplications for both data-driven and analytical methods used in modeling,\ncontrol, estimation and design in robotics. For data-driven methods, we\ndemonstrate that morphological symmetries can enhance the sample efficiency and\ngeneralization of machine learning models through data augmentation, or by\napplying equivariant/invariant constraints on the model's architecture. In the\ncontext of analytical methods, we employ abstract harmonic analysis to\ndecompose the robot's dynamics into a superposition of lower-dimensional,\nindependent dynamics. We substantiate our claims with both synthetic and\nreal-world experiments conducted on bipedal and quadrupedal robots. Lastly, we\nintroduce the repository MorphoSymm to facilitate the practical use of the\ntheory and applications outlined in this work.\n","authors":["Daniel Ordoñez-Apraez","Giulio Turrisi","Vladimir Kostic","Mario Martin","Antonio Agudo","Francesc Moreno-Noguer","Massimiliano Pontil","Claudio Semini","Carlos Mastalli"],"pdf_url":"https://arxiv.org/pdf/2402.15552v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.08651v1","updated":"2024-11-13T14:40:51Z","published":"2024-11-13T14:40:51Z","title":"Estimating unknown parameters in differential equations with a\n reinforcement learning based PSO method","summary":" Differential equations offer a foundational yet powerful framework for\nmodeling interactions within complex dynamic systems and are widely applied\nacross numerous scientific fields. One common challenge in this area is\nestimating the unknown parameters of these dynamic relationships. 
However,\ntraditional numerical optimization methods rely on the selection of initial\nparameter values, making them prone to local optima. Meanwhile, deep learning\nand Bayesian methods require training models on specific differential\nequations, resulting in poor versatility. This paper reformulates the parameter\nestimation problem of differential equations as an optimization problem by\nintroducing the concept of particles from the particle swarm optimization\nalgorithm. Building on reinforcement learning-based particle swarm optimization\n(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown\nparameters of differential equations. We compared its performance on three\ntypical ordinary differential equations with the state-of-the-art methods,\nincluding the RLLPSO algorithm, traditional numerical methods, deep learning\napproaches, and Bayesian methods. The experimental results demonstrate that our\nDERLPSO consistently outperforms other methods in terms of performance,\nachieving an average Mean Square Error of 1.13e-05, which reduces the error by\napproximately 4 orders of magnitude compared to other methods. Apart from\nordinary differential equations, our DERLPSO also show great promise for\nestimating unknown parameters of partial differential equations. The DERLPSO\nmethod proposed in this paper has high accuracy, is independent of initial\nparameter values, and possesses strong versatility and stability. 
This work\nprovides new insights into unknown parameter estimation for differential\nequations.\n","authors":["Wenkui Sun","Xiaoya Fan","Lijuan Jia","Tinyi Chu","Shing-Tung Yau","Rongling Wu","Zhong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10534v2","updated":"2024-11-13T14:36:47Z","published":"2024-04-12T21:41:50Z","title":"Into the Fog: Evaluating Robustness of Multiple Object Tracking","summary":" State-of-the-art Multiple Object Tracking (MOT) approaches have shown\nremarkable performance when trained and evaluated on current benchmarks.\nHowever, these benchmarks primarily consist of clear weather scenarios,\noverlooking adverse atmospheric conditions such as fog, haze, smoke and dust.\nAs a result, the robustness of trackers against these challenging conditions\nremains underexplored. To address this gap, we introduce physics-based\nvolumetric fog simulation method for arbitrary MOT datasets, utilizing\nframe-by-frame monocular depth estimation and a fog formation optical model. We\nenhance our simulation by rendering both homogeneous and heterogeneous fog and\npropose to use the dark channel prior method to estimate atmospheric light,\nshowing promising results even in night and indoor scenes. We present the\nleading benchmark MOTChallenge (third release) augmented with fog (smoke for\nindoor scenes) of various intensities and conduct a comprehensive evaluation of\nMOT methods, revealing their limitations under fog and fog-like challenges.\n","authors":["Nadezda Kirillova","M. 
Jehanzeb Mirza","Horst Bischof","Horst Possegger"],"pdf_url":"https://arxiv.org/pdf/2404.10534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08645v1","updated":"2024-11-13T14:36:12Z","published":"2024-11-13T14:36:12Z","title":"A System Level Performance Evaluation for Superconducting Digital\n Systems","summary":" Superconducting Digital (SCD) technology offers significant potential for\nenhancing the performance of next generation large scale compute workloads. By\nleveraging advanced lithography and a 300 mm platform, SCD devices can reduce\nenergy consumption and boost computational power. This paper presents a\ncross-layer modeling approach to evaluate the system-level performance benefits\nof SCD architectures for Large Language Model (LLM) training and inference. Our\nfindings, based on experimental data and Pulse Conserving Logic (PCL) design\nprinciples, demonstrate substantial performance gain in both training and\ninference. We are, thus, able to convincingly show that the SCD technology can\naddress memory and interconnect limitations of present day solutions for\nnext-generation compute systems.\n","authors":["Joyjit Kundu","Debjyoti Bhattacharjee","Nathan Josephsen","Ankit Pokhrel","Udara De Silva","Wenzhe Guo","Steven Van Winckel","Steven Brebels","Manu Perumkunnil","Quentin Herr","Anna Herr"],"pdf_url":"https://arxiv.org/pdf/2411.08645v1.pdf","comment":"8 figures"},{"id":"http://arxiv.org/abs/2410.05538v2","updated":"2024-11-13T14:34:10Z","published":"2024-10-07T22:36:40Z","title":"Online Dynamic Pricing for Electric Vehicle Charging Stations with\n Reservations","summary":" The transition to electric vehicles (EVs), coupled with the rise of renewable\nenergy sources, will significantly impact the electric grid. Unlike\nconventional fuel sources, electricity for EVs is constrained by grid capacity,\nprice fluctuations, and long EV charging times, requiring new pricing solutions\nto manage demand and supply. 
This paper proposes a model for online dynamic\npricing of reserved EV charging services, including reservation, parking, and\ncharging as a bundled service priced as a whole. Our approach focuses on the\nindividual charging station operator, employing a stochastic demand model and\nonline dynamic pricing based on expected demand. The proposed model uses a\nMarkov Decision Process (MDP) formulation to optimize sequential pricing\ndecisions for charging session requests. A key contribution is the novel\ndefinition and quantification of discretization error introduced by the\ndiscretization of the Poisson process for use in the MDP. The model's viability\nis demonstrated with a heuristic solution method based on Monte-Carlo tree\nsearch, offering a viable path for real-world application.\n","authors":["Jan Mrkos","Antonín Komenda","David Fiedler","Jiří Vokřínek"],"pdf_url":"https://arxiv.org/pdf/2410.05538v2.pdf","comment":"45 pages, 11 figure, prepared for submission to IEEE Transactions on\n Intelligent Transportation Systems (T-ITS)"},{"id":"http://arxiv.org/abs/2411.08642v1","updated":"2024-11-13T14:32:28Z","published":"2024-11-13T14:32:28Z","title":"Towards More Accurate Fake Detection on Images Generated from Advanced\n Generative and Neural Rendering Models","summary":" The remarkable progress in neural-network-driven visual data generation,\nespecially with neural rendering techniques like Neural Radiance Fields and 3D\nGaussian splatting, offers a powerful alternative to GANs and diffusion models.\nThese methods can produce high-fidelity images and lifelike avatars,\nhighlighting the need for robust detection methods. In response, an\nunsupervised training technique is proposed that enables the model to extract\ncomprehensive features from the Fourier spectrum magnitude, thereby overcoming\nthe challenges of reconstructing the spectrum due to its centrosymmetric\nproperties. 
By leveraging the spectral domain and dynamically combining it with\nspatial domain information, we create a robust multimodal detector that\ndemonstrates superior generalization capabilities in identifying challenging\nsynthetic images generated by the latest image synthesis techniques. To address\nthe absence of a 3D neural rendering-based fake image database, we develop a\ncomprehensive database that includes images generated by diverse neural\nrendering techniques, providing a robust foundation for evaluating and\nadvancing detection methods.\n","authors":["Chengdong Dong","Vijayakumar Bhagavatula","Zhenyu Zhou","Ajay Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08642v1.pdf","comment":"13 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2411.08641v1","updated":"2024-11-13T14:32:10Z","published":"2024-11-13T14:32:10Z","title":"DipMe: Haptic Recognition of Granular Media for Tangible Interactive\n Applications","summary":" While tangible user interface has shown its power in naturally interacting\nwith rigid or soft objects, users cannot conveniently use different types of\ngranular materials as the interaction media. We introduce DipMe as a smart\ndevice to recognize the types of granular media in real time, which can be used\nto connect the granular materials in the physical world with various virtual\ncontent. Other than vision-based solutions, we propose a dip operation of our\ndevice and exploit the haptic signals to recognize different types of granular\nmaterials. With modern machine learning tools, we find the haptic signals from\ndifferent granular media are distinguishable by DipMe. 
With the online granular\nobject recognition, we build several tangible interactive applications,\ndemonstrating the effects of DipMe in perceiving granular materials and its\npotential in developing a tangible user interface with granular objects as the\nnew media.\n","authors":["Xinkai Wang","Shuo Zhang","Ziyi Zhao","Lifeng Zhu","Aiguo Song"],"pdf_url":"https://arxiv.org/pdf/2411.08641v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.08622v1","updated":"2024-11-13T14:08:58Z","published":"2024-11-13T14:08:58Z","title":"Precision-Focused Reinforcement Learning Model for Robotic Object\n Pushing","summary":" Non-prehensile manipulation, such as pushing objects to a desired target\nposition, is an important skill for robots to assist humans in everyday\nsituations. However, the task is challenging due to the large variety of\nobjects with different and sometimes unknown physical properties, such as\nshape, size, mass, and friction. This can lead to the object overshooting its\ntarget position, requiring fast corrective movements of the robot around the\nobject, especially in cases where objects need to be precisely pushed. In this\npaper, we improve the state-of-the-art by introducing a new memory-based\nvision-proprioception RL model to push objects more precisely to target\npositions using fewer corrective movements.\n","authors":["Lara Bergmann","David Leins","Robert Haschke","Klaus Neumann"],"pdf_url":"https://arxiv.org/pdf/2411.08622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08605v1","updated":"2024-11-13T13:45:54Z","published":"2024-11-13T13:45:54Z","title":"Lo-MARVE: A Low Cost Autonomous Underwater Vehicle for Marine\n Exploration","summary":" This paper presents Low-cost Marine Autonomous Robotic Vehicle Explorer\n(Lo-MARVE), a novel autonomous underwater vehicle (AUV) designed to provide a\nlow cost solution for underwater exploration and environmental monitoring in\nshallow water environments. 
Lo-MARVE offers a cost-effective alternative to\nexisting AUVs, featuring a modular design, low-cost sensors, and wireless\ncommunication capabilities. The total cost of Lo-MARVE is approximately EUR\n500. Lo-MARVE is developed using the Raspberry Pi 4B microprocessor, with\ncontrol software written in Python. The proposed AUV was validated through\nfield testing outside of a laboratory setting, in the freshwater environment of\nthe River Corrib in Galway, Ireland. This demonstrates its ability to navigate\nautonomously, collect data, and communicate effectively outside of a controlled\nlaboratory setting. The successful deployment of Lo-MARVE in a real-world\nenvironment validates its proof of concept.\n","authors":["Karl Mason","Daniel Kelly"],"pdf_url":"https://arxiv.org/pdf/2411.08605v1.pdf","comment":"This paper was presented at the 12th International Conference on\n Control, Mechatronics and Automation (ICCMA 2024), held in London, UK, from\n November 11-13, 2024"},{"id":"http://arxiv.org/abs/2411.08013v2","updated":"2024-11-13T13:36:05Z","published":"2024-11-12T18:43:27Z","title":"Investigating the Effectiveness of Explainability Methods in Parkinson's\n Detection from Speech","summary":" Speech impairments in Parkinson's disease (PD) provide significant early\nindicators for diagnosis. While models for speech-based PD detection have shown\nstrong performance, their interpretability remains underexplored. This study\nsystematically evaluates several explainability methods to identify PD-specific\nspeech features, aiming to support the development of accurate, interpretable\nmodels for clinical decision-making in PD diagnosis and monitoring. 
Our\nmethodology involves (i) obtaining attributions and saliency maps using\nmainstream interpretability techniques, (ii) quantitatively evaluating the\nfaithfulness of these maps and their combinations obtained via union and\nintersection through a range of established metrics, and (iii) assessing the\ninformation conveyed by the saliency maps for PD detection from an auxiliary\nclassifier. Our results reveal that, while explanations are aligned with the\nclassifier, they often fail to provide valuable information for domain experts.\n","authors":["Eleonora Mancini","Francesco Paissan","Paolo Torroni","Mirco Ravanelli","Cem Subakan"],"pdf_url":"https://arxiv.org/pdf/2411.08013v2.pdf","comment":"The first two authors contributed equally to this research: author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2411.08599v1","updated":"2024-11-13T13:30:21Z","published":"2024-11-13T13:30:21Z","title":"XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL","summary":" To tackle the challenges of large language model performance in natural\nlanguage to SQL tasks, we introduce XiYan-SQL, an innovative framework that\nemploys a multi-generator ensemble strategy to improve candidate generation. We\nintroduce M-Schema, a semi-structured schema representation method designed to\nenhance the understanding of database structures. To enhance the quality and\ndiversity of generated candidate SQL queries, XiYan-SQL integrates the\nsignificant potential of in-context learning (ICL) with the precise control of\nsupervised fine-tuning. On one hand, we propose a series of training strategies\nto fine-tune models to generate high-quality candidates with diverse\npreferences. On the other hand, we implement the ICL approach with an example\nselection method based on named entity recognition to prevent overemphasis on\nentities. The refiner optimizes each candidate by correcting logical or\nsyntactical errors. 
To address the challenge of identifying the best candidate,\nwe fine-tune a selection model to distinguish nuances of candidate SQL queries.\nThe experimental results on multiple dialect datasets demonstrate the\nrobustness of XiYan-SQL in addressing challenges across different scenarios.\nOverall, our proposed XiYan-SQL achieves the state-of-the-art execution\naccuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on\nNL2GQL, and a competitive score of 72.23% on the Bird development benchmark.\nThe proposed framework not only enhances the quality and diversity of SQL\nqueries but also outperforms previous methods.\n","authors":["Yingqi Gao","Yifu Liu","Xiaoxia Li","Xiaorong Shi","Yin Zhu","Yiming Wang","Shiqi Li","Wei Li","Yuntao Hong","Zhiling Luo","Jinyang Gao","Liyu Mou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.08599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13178v2","updated":"2024-11-13T13:14:19Z","published":"2024-10-17T02:58:57Z","title":"GeSubNet: Gene Interaction Inference for Disease Subtype Network\n Generation","summary":" Retrieving gene functional networks from knowledge databases presents a\nchallenge due to the mismatch between disease networks and subtype-specific\nvariations. Current solutions, including statistical and deep learning methods,\noften fail to effectively integrate gene interaction knowledge from databases\nor explicitly learn subtype-specific interactions. To address this mismatch, we\npropose GeSubNet, which learns a unified representation capable of predicting\ngene interactions while distinguishing between different disease subtypes.\nGraphs generated by such representations can be considered subtype-specific\nnetworks. GeSubNet is a multi-step representation learning framework with three\nmodules: First, a deep generative model learns distinct disease subtypes from\npatient gene expression profiles. 
Second, a graph neural network captures\nrepresentations of prior gene networks from knowledge databases, ensuring\naccurate physical gene interactions. Finally, we integrate these two\nrepresentations using an inference loss that leverages graph generation\ncapabilities, conditioned on the patient separation loss, to refine\nsubtype-specific information in the learned representation. GeSubNet\nconsistently outperforms traditional methods, with average improvements of\n30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged\nover four cancer datasets. Particularly, we conduct a biological simulation\nexperiment to assess how the behavior of selected genes from over 11,000\ncandidates affects subtypes or patient distributions. The results show that the\ngenerated network has the potential to identify subtype-specific genes with an\n83% likelihood of impacting patient distribution shifts. The GeSubNet resource\nis available: https://anonymous.4open.science/r/GeSubNet/\n","authors":["Ziwei Yang","Zheng Chen","Xin Liu","Rikuto Kotoge","Peng Chen","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2410.13178v2.pdf","comment":"Under review as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2411.08587v1","updated":"2024-11-13T13:11:49Z","published":"2024-11-13T13:11:49Z","title":"DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning\n Methods","summary":" Assessing the quality of aleatoric uncertainty estimates from uncertainty\nquantification (UQ) deep learning methods is important in scientific contexts,\nwhere uncertainty is physically meaningful and important to characterize and\ninterpret exactly. 
We systematically compare aleatoric uncertainty measured by\ntwo UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER).\nOur method focuses on both zero-dimensional (0D) and two-dimensional (2D) data,\nto explore how the UQ methods function for different data dimensionalities. We\ninvestigate uncertainty injected on the input and output variables and include\na method to propagate uncertainty in the case of input uncertainty so that we\ncan compare the predicted aleatoric uncertainty to the known values. We\nexperiment with three levels of noise. The aleatoric uncertainty predicted\nacross all models and experiments scales with the injected noise level.\nHowever, the predicted uncertainty is miscalibrated to $\\rm{std}(\\sigma_{\\rm\nal})$ with the true uncertainty for half of the DE experiments and almost all\nof the DER experiments. The predicted uncertainty is the least accurate for\nboth UQ methods for the 2D input uncertainty experiment and the high-noise\nlevel. While these results do not apply to more complex data, they highlight\nthat further research on post-facto calibration for these methods would be\nbeneficial, particularly for high-noise and high-dimensional settings.\n","authors":["Rebecca Nevin","Aleksandra Ćiprijanović","Brian D. Nord"],"pdf_url":"https://arxiv.org/pdf/2411.08587v1.pdf","comment":"Accepted to the Machine Learning for Physical Sciences workshop at\n NeurIPS 2024; 11 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.08586v1","updated":"2024-11-13T13:09:14Z","published":"2024-11-13T13:09:14Z","title":"Optimizing Automatic Summarization of Long Clinical Records Using\n Dynamic Context Extension:Testing and Evaluation of the NBCE Method","summary":" Summarizing patient clinical notes is vital for reducing documentation\nburdens. Current manual summarization makes medical staff struggle. 
We propose\nan automatic method using LLMs, but long inputs cause LLMs to lose context,\nreducing output quality especially in small size model. We used a 7B model,\nopen-calm-7b, enhanced with Native Bayes Context Extend and a redesigned\ndecoding mechanism to reference one sentence at a time, keeping inputs within\ncontext windows, 2048 tokens. Our improved model achieved near parity with\nGoogle's over 175B Gemini on ROUGE-L metrics with 200 samples, indicating\nstrong performance using less resources, enhancing automated EMR summarization\nfeasibility.\n","authors":["Guoqing Zhang","Keita Fukuyama","Kazumasa Kishimoto","Tomohiro Kuroda"],"pdf_url":"https://arxiv.org/pdf/2411.08586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08583v1","updated":"2024-11-13T13:03:49Z","published":"2024-11-13T13:03:49Z","title":"An Empirical Examination of the Evaluative AI Framework","summary":" This study empirically examines the \"Evaluative AI\" framework, which aims to\nenhance the decision-making process for AI users by transitioning from a\nrecommendation-based approach to a hypothesis-driven one. Rather than offering\ndirect recommendations, this framework presents users pro and con evidence for\nhypotheses to support more informed decisions. However, findings from the\ncurrent behavioral experiment reveal no significant improvement in\ndecision-making performance and limited user engagement with the evidence\nprovided, resulting in cognitive processes similar to those observed in\ntraditional AI systems. 
Despite these results, the framework still holds\npromise for further exploration in future research.\n","authors":["Jaroslaw Kornowicz"],"pdf_url":"https://arxiv.org/pdf/2411.08583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08582v1","updated":"2024-11-13T13:01:44Z","published":"2024-11-13T13:01:44Z","title":"Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors","summary":" The application of machine learning (ML) algorithms in the intelligent\ndiagnosis of three-phase engines has the potential to significantly enhance\ndiagnostic performance and accuracy. Traditional methods largely rely on\nsignature analysis, which, despite being a standard practice, can benefit from\nthe integration of advanced ML techniques. In our study, we innovate by\ncombining state of the art algorithms with a novel unsupervised anomaly\ngeneration methodology that takes into account physics model of the engine.\nThis hybrid approach leverages the strengths of both supervised ML and\nunsupervised signature analysis, achieving superior diagnostic accuracy and\nreliability along with a wide industrial application. Our experimental results\ndemonstrate that this method significantly outperforms existing ML and non-ML\nstate-of-the-art approaches while retaining the practical advantages of an\nunsupervised methodology. 
The findings highlight the potential of our approach\nto significantly contribute to the field of engine diagnostics, offering a\nrobust and efficient solution for real-world applications.\n","authors":["Stepan Svirin","Artem Ryzhikov","Saraa Ali","Denis Derkach"],"pdf_url":"https://arxiv.org/pdf/2411.08582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07743v3","updated":"2024-11-13T12:43:33Z","published":"2023-06-13T13:00:10Z","title":"V-LoL: A Diagnostic Dataset for Visual Logical Learning","summary":" Despite the successes of recent developments in visual AI, different\nshortcomings still exist; from missing exact logical reasoning, to abstract\ngeneralization abilities, to understanding complex and noisy scenes.\nUnfortunately, existing benchmarks, were not designed to capture more than a\nfew of these aspects. Whereas deep learning datasets focus on visually complex\ndata but simple visual reasoning tasks, inductive logic datasets involve\ncomplex logical learning tasks, however, lack the visual component. To address\nthis, we propose the diagnostic visual logical learning dataset, V-LoL, that\nseamlessly combines visual and logical challenges. Notably, we introduce the\nfirst instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic\nbenchmark in symbolic AI, the Michalski train problem. By incorporating\nintricate visual scenes and flexible logical reasoning tasks within a versatile\nframework, V-LoL-Train provides a platform for investigating a wide range of\nvisual logical learning challenges. We evaluate a variety of AI systems\nincluding traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our\nevaluations demonstrate that even SOTA AI faces difficulties in dealing with\nvisual logical learning challenges, highlighting unique advantages and\nlimitations of each methodology. 
Overall, V-LoL opens up new avenues for\nunderstanding and enhancing current abilities in visual logical learning for AI\nsystems.\n","authors":["Lukas Helff","Wolfgang Stammer","Hikaru Shindo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2306.07743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v2","updated":"2024-11-13T12:37:09Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\nlots of attraction in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. 
Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hanwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2410.10929v4","updated":"2024-11-13T12:27:38Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). 
Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v4.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.08563v1","updated":"2024-11-13T12:21:13Z","published":"2024-11-13T12:21:13Z","title":"Leveraging LLMs for Predictive Insights in Food Policy and Behavioral\n Interventions","summary":" Food consumption and production contribute significantly to global greenhouse\ngas emissions, making them crucial entry points for mitigating climate change\nand maintaining a liveable planet. Over the past two decades, food policy\ninitiatives have explored interventions to reshape production and consumption\npatterns, focusing on reducing food waste and curbing ruminant meat\nconsumption. While the evidence of \"what works\" improves, evaluating which\npolicies are appropriate and effective in specific contexts remains difficult\ndue to external validity challenges. This paper demonstrates that a fine-tuned\nlarge language model (LLM) can accurately predict the direction of outcomes in\napproximately 80\\% of empirical studies measuring dietary-based impacts (e.g.\nfood choices, sales, waste) resulting from behavioral interventions and\npolicies. Approximately 75 prompts were required to achieve optimal results,\nwith performance showing signs of catastrophic loss beyond this point. Our\nfindings indicate that greater input detail enhances predictive accuracy,\nalthough the model still faces challenges with unseen studies, underscoring the\nimportance of a representative training sample. 
As LLMs continue to improve and\ndiversify, they hold promise for advancing data-driven, evidence-based\npolicymaking.\n","authors":["Micha Kaiser","Paul Lohmann","Peter Ochieng","Billy Shi","Cass R. Sunstein","Lucia A. Reisch"],"pdf_url":"https://arxiv.org/pdf/2411.08563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08562v1","updated":"2024-11-13T12:19:46Z","published":"2024-11-13T12:19:46Z","title":"Neural Corrective Machine Unranking","summary":" Machine unlearning in neural information retrieval (IR) systems requires\nremoving specific data whilst maintaining model performance. Applying existing\nmachine unlearning methods to IR may compromise retrieval effectiveness or\ninadvertently expose unlearning actions due to the removal of particular items\nfrom the retrieved results presented to users. We formalise corrective\nunranking, which extends machine unlearning in (neural) IR context by\nintegrating substitute documents to preserve ranking integrity, and propose a\nnovel teacher-student framework, Corrective unRanking Distillation (CuRD), for\nthis task. CuRD (1) facilitates forgetting by adjusting the (trained) neural IR\nmodel such that its output relevance scores of to-be-forgotten samples mimic\nthose of low-ranking, non-retrievable samples; (2) enables correction by\nfine-tuning the relevance scores for the substitute samples to match those of\ncorresponding to-be-forgotten samples closely; (3) seeks to preserve\nperformance on samples that are not targeted for forgetting. We evaluate CuRD\non four neural IR models (BERTcat, BERTdot, ColBERT, PARADE) using MS MARCO and\nTREC CAR datasets. 
Experiments with forget set sizes from 1 % and 20 % of the\ntraining dataset demonstrate that CuRD outperforms seven state-of-the-art\nbaselines in terms of forgetting and correction while maintaining model\nretention and generalisation capabilities.\n","authors":["Jingrui Hou","Axel Finke","Georgina Cosma"],"pdf_url":"https://arxiv.org/pdf/2411.08562v1.pdf","comment":"submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2411.08561v1","updated":"2024-11-13T12:18:00Z","published":"2024-11-13T12:18:00Z","title":"LogLLM: Log-based Anomaly Detection Using Large Language Models","summary":" Software systems often record important runtime information in logs to help\nwith troubleshooting. Log-based anomaly detection has become a key research\narea that aims to identify system issues through log data, ultimately enhancing\nthe reliability of software systems. Traditional deep learning methods often\nstruggle to capture the semantic information embedded in log data, which is\ntypically organized in natural language. In this paper, we propose LogLLM, a\nlog-based anomaly detection framework that leverages large language models\n(LLMs). LogLLM employs BERT for extracting semantic vectors from log messages,\nwhile utilizing Llama, a transformer decoder-based model, for classifying log\nsequences. Additionally, we introduce a projector to align the vector\nrepresentation spaces of BERT and Llama, ensuring a cohesive understanding of\nlog semantics. Unlike conventional methods that require log parsers to extract\ntemplates, LogLLM preprocesses log messages with regular expressions,\nstreamlining the entire process. Our framework is trained through a novel\nthree-stage procedure designed to enhance performance and adaptability.\nExperimental results across four public datasets demonstrate that LogLLM\noutperforms state-of-the-art methods. 
Even when handling unstable logs, it\neffectively captures the semantic meaning of log messages and detects anomalies\naccurately.\n","authors":["Wei Guan","Jian Cao","Shiyou Qian","Jianqi Gao"],"pdf_url":"https://arxiv.org/pdf/2411.08561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08552v1","updated":"2024-11-13T12:03:39Z","published":"2024-11-13T12:03:39Z","title":"Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with\n Variational Quantum Circuits","summary":" Quantum Machine Learning (QML) offers tremendous potential but is currently\nlimited by the availability of qubits. We introduce an innovative approach that\nutilizes pre-trained neural networks to enhance Variational Quantum Circuits\n(VQC). This technique effectively separates approximation error from qubit\ncount and removes the need for restrictive conditions, making QML more viable\nfor real-world applications. Our method significantly improves parameter\noptimization for VQC while delivering notable gains in representation and\ngeneralization capabilities, as evidenced by rigorous theoretical analysis and\nextensive empirical testing on quantum dot classification tasks. Moreover, our\nresults extend to applications such as human genome analysis, demonstrating the\nbroad applicability of our approach. 
By addressing the constraints of current\nquantum hardware, our work paves the way for a new era of advanced QML\napplications, unlocking the full potential of quantum computing in fields such\nas machine learning, materials science, medicine, mimetics, and various\ninterdisciplinary areas.\n","authors":["Jun Qi","Chao-Han Yang","Samuel Yen-Chi Chen","Pin-Yu Chen","Hector Zenil","Jesper Tegner"],"pdf_url":"https://arxiv.org/pdf/2411.08552v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2411.08544v1","updated":"2024-11-13T11:45:39Z","published":"2024-11-13T11:45:39Z","title":"Deeper Insights into Learning Performance of Stochastic Configuration\n Networks","summary":" Stochastic Configuration Networks (SCNs) are a class of randomized neural\nnetworks that integrate randomized algorithms within an incremental learning\nframework. A defining feature of SCNs is the supervisory mechanism, which\nadaptively adjusts the distribution to generate effective random basis\nfunctions, thereby enabling error-free learning. In this paper, we present a\ncomprehensive analysis of the impact of the supervisory mechanism on the\nlearning performance of SCNs. Our findings reveal that the current SCN\nframework evaluates the effectiveness of each random basis function in reducing\nresidual errors using a lower bound on its error reduction potential, which\nconstrains SCNs' overall learning efficiency. Specifically, SCNs may fail to\nconsistently select the most effective random candidate as the new basis\nfunction during each training iteration. To overcome this problem, we propose a\nnovel method for evaluating the hidden layer's output matrix, supported by a\nnew supervisory mechanism that accurately assesses the error reduction\npotential of random basis functions without requiring the computation of the\nMoore-Penrose inverse of the output matrix. 
This approach enhances the\nselection of basis functions, reducing computational complexity and improving\nthe overall scalability and learning capabilities of SCNs. We introduce a\nRecursive Moore-Penrose Inverse-SCN (RMPI-SCN) training scheme based on the new\nsupervisory mechanism and demonstrate its effectiveness through simulations\nover some benchmark datasets. Experiments show that RMPI-SCN outperforms the\nconventional SCN in terms of learning capability, underscoring its potential to\nadvance the SCN framework for large-scale data modeling applications.\n","authors":["Xiufeng Yan","Dianhui Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08537v1","updated":"2024-11-13T11:35:39Z","published":"2024-11-13T11:35:39Z","title":"MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal\n Lymphatic Vessel Segmentation","summary":" Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste\nproducts from the human brain. An impairment in their functionality has been\nassociated with aging as well as brain disorders like multiple sclerosis and\nAlzheimer's disease. However, MLVs have only recently been described for the\nfirst time in magnetic resonance imaging (MRI), and their ramified structure\nrenders manual segmentation particularly difficult. Further, as there is no\nconsistent notion of their appearance, human-annotated MLV structures contain a\nhigh inter-rater variability that most automatic segmentation methods cannot\ntake into account. In this work, we propose a new rater-aware training scheme\nfor the popular nnU-Net model, and we explore rater-based ensembling strategies\nfor accurate and consistent segmentation of MLVs. This enables us to boost\nnnU-Net's performance while obtaining explicit predictions in different\nannotation styles and a rater-based uncertainty estimation. 
Our final model,\nMLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to\nthe human reference standard. The model further matches the human inter-rater\nreliability and replicates age-related associations with MLV volume.\n","authors":["Fabian Bongratz","Markus Karmann","Adrian Holz","Moritz Bonhoeffer","Viktor Neumaier","Sarah Deli","Benita Schmitz-Koep","Claus Zimmer","Christian Sorg","Melissa Thalhammer","Dennis M Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2411.08537v1.pdf","comment":"ML4H 2024"},{"id":"http://arxiv.org/abs/2411.08533v1","updated":"2024-11-13T11:29:14Z","published":"2024-11-13T11:29:14Z","title":"ACROSS: A Deformation-Based Cross-Modal Representation for Robotic\n Tactile Perception","summary":" Tactile perception is essential for human interaction with the environment\nand is becoming increasingly crucial in robotics. Tactile sensors like the\nBioTac mimic human fingertips and provide detailed interaction data. Despite\nits utility in applications like slip detection and object identification, this\nsensor is now deprecated, making many existing valuable datasets obsolete.\nHowever, recreating similar datasets with newer sensor technologies is both\ntedious and time-consuming. Therefore, it is crucial to adapt these existing\ndatasets for use with new setups and modalities. In response, we introduce\nACROSS, a novel framework for translating data between tactile sensors by\nexploiting sensor deformation information. We demonstrate the approach by\ntranslating BioTac signals into the DIGIT sensor. Our framework consists of\nfirst converting the input signals into 3D deformation meshes. We then\ntransition from the 3D deformation mesh of one sensor to the mesh of another,\nand finally convert the generated 3D deformation mesh into the corresponding\noutput space. 
We demonstrate our approach to the most challenging problem of\ngoing from a low-dimensional tactile representation to a high-dimensional one.\nIn particular, we transfer the tactile signals of a BioTac sensor to DIGIT\ntactile images. Our approach enables the continued use of valuable datasets and\nthe exchange of data between groups with different setups.\n","authors":["Wadhah Zai El Amri","Malte Kuhlmann","Nicolás Navarro-Guerrero"],"pdf_url":"https://arxiv.org/pdf/2411.08533v1.pdf","comment":"Paper Submitted to ICRA2025. arXiv admin note: text overlap with\n arXiv:2410.14310"},{"id":"http://arxiv.org/abs/2411.07268v2","updated":"2024-11-13T11:28:07Z","published":"2024-11-09T15:59:59Z","title":"Target-driven Attack for Large Language Models","summary":" Current large language models (LLM) provide a strong foundation for\nlarge-scale user-oriented natural language tasks. Many users can easily inject\nadversarial text or instructions through the user interface, thus causing LLM\nmodel security challenges like the language model not giving the correct\nanswer. Although there is currently a large amount of research on black-box\nattacks, most of these black-box attacks use random and heuristic strategies.\nIt is unclear how these strategies relate to the success rate of attacks and\nthus effectively improve model robustness. To solve this problem, we propose\nour target-driven black-box attack method to maximize the KL divergence between\nthe conditional probabilities of the clean text and the attack text to redefine\nthe attack's goal. We transform the distance maximization problem into two\nconvex optimization problems based on the attack goal to solve the attack text\nand estimate the covariance. Furthermore, the projected gradient descent\nalgorithm solves the vector corresponding to the attack text. Our target-driven\nblack-box attack approach includes two attack strategies: token manipulation\nand misinformation attack. 
Experimental results on multiple Large Language\nModels and datasets demonstrate the effectiveness of our attack method.\n","authors":["Chong Zhang","Mingyu Jin","Dong Shu","Taowen Wang","Dongfang Liu","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2411.07268v2.pdf","comment":"12 pages, 7 figures. This work is an extension of the\n arXiv:2404.07234 work. We propose new methods. 27th European Conference on\n Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2411.08526v1","updated":"2024-11-13T11:20:09Z","published":"2024-11-13T11:20:09Z","title":"Gendered Words and Grant Rates: A Textual Analysis of Disparate Outcomes\n in the Patent System","summary":" This study examines gender disparities in patent law by analyzing the textual\ncontent of patent applications. While prior research has primarily focused on\nthe study of metadata (i.e., filing year or technological class), we employ\nmachine learning and natural language processing techniques to derive latent\ninformation from patent texts. In particular, these methods are used to predict\ninventor gender based on textual characteristics. We find that gender can be\nidentified with notable accuracy - even without knowing the inventor's name.\nThis ability to discern gender through text suggests that anonymized patent\nexamination - often proposed as a solution to mitigate disparities in patent\ngrant rate - may not fully address gender-specific outcomes in securing a\npatent. Our analysis additionally identifies gendered differences in textual\nchoices within patent documents and the fields in which inventors choose to\nwork. These findings highlight the complex interaction between textual choices,\ngender, and success in securing a patent. As discussed herein, this raises\ncritical questions about the efficacy of current proposals aimed at achieving\ngender parity and efficiency in the patent system.\n","authors":["Deborah Gerhardt","Miriam Marcowitz-Bitton","W. 
Michael Schuster","Avshalom Elmalech","Omri Suissa","Moshe Mash"],"pdf_url":"https://arxiv.org/pdf/2411.08526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10040v3","updated":"2024-11-13T11:13:56Z","published":"2024-05-16T12:22:41Z","title":"SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation","summary":" It is often desirable to distill the capabilities of large language models\n(LLMs) into smaller student models due to compute and memory constraints. One\nway to do this for classification tasks is via dataset synthesis, which can be\naccomplished by generating examples of each label from the LLM. Prior\napproaches to synthesis use few-shot prompting, which relies on the LLM's\nparametric knowledge to generate usable examples. However, this leads to issues\nof repetition, bias towards popular entities, and stylistic differences from\nhuman text. In this work, we propose Synthesize by Retrieval and Refinement\n(SynthesizRR), which uses retrieval augmentation to introduce variety into the\ndataset synthesis process: as retrieved passages vary, the LLM is seeded with\ndifferent content to generate its examples. We empirically study the synthesis\nof six datasets, covering topic classification, sentiment analysis, tone\ndetection, and humor, requiring complex synthesis strategies. We find that\nSynthesizRR greatly improves lexical and semantic diversity, similarity to\nhuman-written text, and distillation performance, when compared to 32-shot\nprompting and four prior approaches. We release our code to perform all steps\nat https://github.com/amazon-science/synthesizrr\n","authors":["Abhishek Divekar","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2405.10040v3.pdf","comment":"Published as a main conference paper at EMNLP 2024. 
Code available at\n https://github.com/amazon-science/synthesizrr"},{"id":"http://arxiv.org/abs/2411.01078v3","updated":"2024-11-13T11:10:18Z","published":"2024-11-01T23:19:05Z","title":"Effective ML Model Versioning in Edge Networks","summary":" Machine learning (ML) models, data and software need to be regularly updated\nwhenever essential version updates are released and feasible for integration.\nThis is a basic but most challenging requirement to satisfy in the edge, due to\nthe various system constraints and the major impact that an update can have on\nrobustness and stability. In this paper, we formulate for the first time the ML\nmodel versioning optimization problem, and propose effective solutions,\nincluding the update automation with reinforcement learning (RL) based\nalgorithm. We study the edge network environment due to the known constraints\nin performance, response time, security, and reliability, which make updates\nespecially challenging. The performance study shows that model version updates\ncan be fully and effectively automated with reinforcement learning method. We\nshow that for every range of server load values, the proper versioning can be\nfound that improves security, reliability and/or ML model accuracy, while\nassuring a comparably lower response time.\n","authors":["Fin Gentzen","Mounir Bensalem","Admela Jukan"],"pdf_url":"https://arxiv.org/pdf/2411.01078v3.pdf","comment":"This paper is uploaded here for research community, thus it is for\n non-commercial purposes"},{"id":"http://arxiv.org/abs/2411.08521v1","updated":"2024-11-13T11:08:28Z","published":"2024-11-13T11:08:28Z","title":"SAD-TIME: a Spatiotemporal-fused network for depression detection with\n Automated multi-scale Depth-wise and TIME-interval-related common feature\n extractor","summary":" Background and Objective: Depression is a severe mental disorder, and\naccurate diagnosis is pivotal to the cure and rehabilitation of people with\ndepression. 
However, the current questionnaire-based diagnostic methods could\nbring subjective biases and may be denied by subjects. In search of a more\nobjective means of diagnosis, researchers have begun to experiment with deep\nlearning-based methods for identifying depressive disorders in recent years.\nMethods: In this study, a novel Spatiotemporal-fused network with Automated\nmulti-scale Depth-wise and TIME-interval-related common feature extractor\n(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common\nfeatures extractor (CFE), a spatial sector (SpS), a modified temporal sector\n(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale\ndepth-wise 1D-convolutional neural network and a time-interval embedding\ngenerator, where the unique information of each channel is preserved. The SpS\nfuses the functional connectivity with the distance-based connectivity\ncontaining spatial position of EEG electrodes. A multi-head-attention graph\nconvolutional network is also applied in the SpS to fuse the features from\ndifferent EEG channels. The TeS is based on long short-term memory and graph\ntransformer networks, where the temporal information of different time-windows\nis fused. Moreover, the DAL is used after the SpS to obtain the\ndomain-invariant feature. Results: Experimental results under tenfold\ncross-validation show that the proposed SAD-TIME method achieves 92.00% and\n94.00% depression classification accuracies on two datasets, respectively, in\ncross-subject mode. 
Conclusion: SAD-TIME is a robust depression detection\nmodel, where the automatedly-generated features, the SpS and the TeS assist the\nclassification performance with the fusion of the innate spatiotemporal\ninformation in the EEG signals.\n","authors":["Han-Guang Wang","Hui-Rang Hou","Li-Cheng Jin","Chen-Yang Xu","Zhong-Yi Zhang","Qing-Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08521v1.pdf","comment":"21pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.13929v4","updated":"2024-11-13T10:57:21Z","published":"2024-05-22T18:58:58Z","title":"Vikhr: Constructing a State-of-the-art Bilingual Open-Source\n Instruction-Following Large Language Model for Russian","summary":" There has been a surge in developing various Large Language Models (LLMs).\nHowever, text generation for languages other than English often faces\nsignificant challenges, including poor generation quality and reduced\ncomputational performance due to the disproportionate representation of tokens\nin the model's vocabulary. In this work, we address these issues by developing\na pipeline for adapting English-oriented pre-trained models to other languages\nand constructing efficient bilingual LLMs. Using this pipeline, we construct\nVikhr, a state-of-the-art bilingual open-source instruction-following LLM\ndesigned specifically for the Russian language. \"Vikhr\" refers to the name of\nthe Mistral LLM series and means a \"strong gust of wind.\" Unlike previous\nRussian-language models that typically rely on LoRA adapters on top of\nEnglish-oriented models, sacrificing performance for lower training costs,\nVikhr features an adapted tokenizer vocabulary and undergoes continued\npre-training and instruction tuning of all weights. This not only enhances the\nmodel's performance but also significantly improves its computational and\ncontextual efficiency. 
The remarkable performance of Vikhr across various\nRussian-language benchmarks can also be attributed to our efforts in expanding\ninstruction datasets and corpora for continued pre-training. Vikhr not only\nsets a new state of the art among open-source LLMs for Russian but even\noutperforms some proprietary closed-source models on certain benchmarks. The\nmodel weights, instruction sets, and code are publicly available.\n","authors":["Aleksandr Nikolich","Konstantin Korolev","Sergei Bratchikov","Igor Kiselev","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2405.13929v4.pdf","comment":"Accepted at WMRL @ EMNLP-2024"},{"id":"http://arxiv.org/abs/2411.08514v1","updated":"2024-11-13T10:53:07Z","published":"2024-11-13T10:53:07Z","title":"Explainers' Mental Representations of Explainees' Needs in Everyday\n Explanations","summary":" In explanations, explainers have mental representations of explainees'\ndeveloping knowledge and shifting interests regarding the explanandum. These\nmental representations are dynamic in nature and develop over time, thereby\nenabling explainers to react to explainees' needs by adapting and customizing\nthe explanation. XAI should be able to react to explainees' needs in a similar\nmanner. Therefore, a component that incorporates aspects of explainers' mental\nrepresentations of explainees is required. In this study, we took first steps\nby investigating explainers' mental representations in everyday explanations of\ntechnological artifacts. According to the dual nature theory, technological\nartifacts require explanations with two distinct perspectives, namely\nobservable and measurable features addressing \"Architecture\" or interpretable\naspects addressing \"Relevance\". We conducted extended semi structured pre-,\npost- and video recall-interviews with explainers (N=9) in the context of an\nexplanation. The transcribed interviews were analyzed utilizing qualitative\ncontent analysis. 
The explainers' answers regarding the explainees' knowledge\nand interests with regard to the technological artifact emphasized the\nvagueness of early assumptions of explainers toward strong beliefs in the\ncourse of explanations. The assumed knowledge of explainees in the beginning is\ncentered around Architecture and develops toward knowledge with regard to both\nArchitecture and Relevance. In contrast, explainers assumed higher interests in\nRelevance in the beginning to interests regarding both Architecture and\nRelevance in the further course of explanations. Further, explainers often\nfinished the explanation despite their perception that explainees still had\ngaps in knowledge. These findings are transferred into practical implications\nrelevant for user models for adaptive explainable systems.\n","authors":["Michael Erol Schaffer","Lutz Terfloth","Carsten Schulte","Heike M. Buhl"],"pdf_url":"https://arxiv.org/pdf/2411.08514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08506v1","updated":"2024-11-13T10:43:31Z","published":"2024-11-13T10:43:31Z","title":"An Information Theoretic Approach to Operationalize Right to Data\n Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. 
Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v1.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2411.08504v1","updated":"2024-11-13T10:42:11Z","published":"2024-11-13T10:42:11Z","title":"Towards Objective and Unbiased Decision Assessments with LLM-Enhanced\n Hierarchical Attention Networks","summary":" How objective and unbiased are we while making decisions? This work\ninvestigates cognitive bias identification in high-stake decision making\nprocess by human experts, questioning its effectiveness in real-world settings,\nsuch as candidates assessments for university admission. We begin with a\nstatistical analysis assessing correlations among different decision points\namong in the current process, which discovers discrepancies that imply\ncognitive bias and inconsistency in decisions. This motivates our exploration\nof bias-aware AI-augmented workflow that surpass human judgment. We propose\nBGM-HAN, a hierarchical attention network enhanced by byte-pair encoding,\nmulti-head attention and gated residual connection. 
Using it as backbone model,\nwe further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which\nsimulate real-world decision-making. In our experiments, both the proposed\nmodel and the agentic workflow significantly improves on both human judgment\nand alternative models, validated with real-world data.\n","authors":["Junhua Liu","Kwan Hui Lim","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07940v2","updated":"2024-11-13T10:29:51Z","published":"2024-11-12T17:09:20Z","title":"Automatic dataset shift identification to support root cause analysis of\n AI performance drift","summary":" Shifts in data distribution can substantially harm the performance of\nclinical AI models. Hence, various methods have been developed to detect the\npresence of such shifts at deployment time. However, root causes of dataset\nshifts are varied, and the choice of shift mitigation strategies is highly\ndependent on the precise type of shift encountered at test time. As such,\ndetecting test-time dataset shift is not sufficient: precisely identifying\nwhich type of shift has occurred is critical. In this work, we propose the\nfirst unsupervised dataset shift identification framework, effectively\ndistinguishing between prevalence shift (caused by a change in the label\ndistribution), covariate shift (caused by a change in input characteristics)\nand mixed shifts (simultaneous prevalence and covariate shifts). We discuss the\nimportance of self-supervised encoders for detecting subtle covariate shifts\nand propose a novel shift detector leveraging both self-supervised encoders and\ntask model outputs for improved shift detection. 
We report promising results\nfor the proposed shift identification framework across three different imaging\nmodalities (chest radiography, digital mammography, and retinal fundus images)\non five types of real-world dataset shifts, using four large publicly available\ndatasets.\n","authors":["Mélanie Roschewitz","Raghav Mehta","Charles Jones","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2411.07940v2.pdf","comment":"Code available at\n https://github.com/biomedia-mira/shift_identification"},{"id":"http://arxiv.org/abs/2408.14487v3","updated":"2024-11-13T10:09:23Z","published":"2024-08-19T18:47:07Z","title":"Active learning of digenic functions with boolean matrix logic\n programming","summary":" We apply logic-based machine learning techniques to facilitate cellular\nengineering and drive biological discovery, based on comprehensive databases of\nmetabolic processes called genome-scale metabolic network models (GEMs).\nPredicted host behaviours are not always correctly described by GEMs. Learning\nthe intricate genetic interactions within GEMs presents computational and\nempirical challenges. To address these, we describe a novel approach called\nBoolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to\nevaluate large logic programs. We introduce a new system, $BMLP_{active}$,\nwhich efficiently explores the genomic hypothesis space by guiding informative\nexperimentation through active learning. In contrast to sub-symbolic methods,\n$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial\nhost in an interpretable and logical representation using datalog logic\nprograms. Notably, $BMLP_{active}$ can successfully learn the interaction\nbetween a gene pair with fewer training examples than random experimentation,\novercoming the increase in experimental design space. 
$BMLP_{active}$ enables\nrapid optimisation of metabolic models and offers a realistic approach to a\nself-driving lab for microbial engineering.\n","authors":["Lun Ai","Stephen H. Muggleton","Shi-shun Liang","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2408.14487v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.06724"},{"id":"http://arxiv.org/abs/2409.15503v3","updated":"2024-11-13T10:02:25Z","published":"2024-09-23T19:46:19Z","title":"From Text to Treatment Effects: A Meta-Learning Approach to Handling\n Text-Based Confounding","summary":" One of the central goals of causal machine learning is the accurate\nestimation of heterogeneous treatment effects from observational data. In\nrecent years, meta-learning has emerged as a flexible, model-agnostic paradigm\nfor estimating conditional average treatment effects (CATE) using any\nsupervised model. This paper examines the performance of meta-learners when the\nconfounding variables are expressed in text. Through synthetic data\nexperiments, we show that learners using pre-trained text representations of\nconfounders, in addition to tabular background variables, achieve improved CATE\nestimates compared to those relying solely on the tabular variables,\nparticularly when sufficient data is available. However, due to the entangled\nnature of the text embeddings, these models do not fully match the performance\nof meta-learners with perfect confounder knowledge. 
These findings highlight\nboth the potential and the limitations of pre-trained text representations for\ncausal inference and open up interesting avenues for future research.\n","authors":["Henri Arno","Paloma Rabaey","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2409.15503v3.pdf","comment":"Presented at the NeurIPS 2024 Workshop on Causal Representation\n Learning"},{"id":"http://arxiv.org/abs/2410.17851v2","updated":"2024-11-13T10:01:38Z","published":"2024-10-23T13:20:42Z","title":"The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty\n Quantification","summary":" Tsetlin Machines (TMs) have emerged as a compelling alternative to\nconventional deep learning methods, offering notable advantages such as smaller\nmemory footprint, faster inference, fault-tolerant properties, and\ninterpretability. Although various adaptations of TMs have expanded their\napplicability across diverse domains, a fundamental gap remains in\nunderstanding how TMs quantify uncertainty in their predictions. In response,\nthis paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed\nat providing a robust, reliable, and interpretable approach for uncertainty\nquantification. Unlike the original TM, the PTM learns the probability of\nstaying on each state of each Tsetlin Automaton (TA) across all clauses. These\nprobabilities are updated using the feedback tables that are part of the TM\nframework: Type I and Type II feedback. During inference, TAs decide their\nactions by sampling states based on learned probability distributions, akin to\nBayesian neural networks when generating weight values. In our experimental\nanalysis, we first illustrate the spread of the probabilities across TA states\nfor the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models\nusing both simulated and real-world datasets. 
The experiments on the simulated\ndataset reveal the PTM's effectiveness in uncertainty quantification,\nparticularly in delineating decision boundaries and identifying regions of high\nuncertainty. Moreover, when applied to multiclass classification tasks using\nthe Iris dataset, the PTM demonstrates competitive performance in terms of\npredictive entropy and expected calibration error, showcasing its potential as\na reliable tool for uncertainty estimation. Our findings underscore the\nimportance of selecting appropriate models for accurate uncertainty\nquantification in predictive tasks, with the PTM offering a particularly\ninterpretable and effective solution.\n","authors":["K. Darshana Abeyrathna","Sara El Mekkaoui","Andreas Hafver","Christian Agrell"],"pdf_url":"https://arxiv.org/pdf/2410.17851v2.pdf","comment":"12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024,\n London"},{"id":"http://arxiv.org/abs/2411.08478v1","updated":"2024-11-13T09:55:59Z","published":"2024-11-13T09:55:59Z","title":"Learning Model Agnostic Explanations via Constraint Programming","summary":" Interpretable Machine Learning faces a recurring challenge of explaining the\npredictions made by opaque classifiers such as ensemble models, kernel methods,\nor neural networks in terms that are understandable to humans. When the model\nis viewed as a black box, the objective is to identify a small set of features\nthat jointly determine the black box response with minimal error. However,\nfinding such model-agnostic explanations is computationally demanding, as the\nproblem is intractable even for binary classifiers. In this paper, the task is\nframed as a Constraint Optimization Problem, where the constraint solver seeks\nan explanation of minimum error and bounded size for an input data instance and\na set of samples generated by the black box. From a theoretical perspective,\nthis constraint programming approach offers PAC-style guarantees for the output\nexplanation. 
We evaluate the approach empirically on various datasets and show\nthat it statistically outperforms the state-of-the-art heuristic Anchors\nmethod.\n","authors":["Frederic Koriche","Jean-Marie Lagniez","Stefan Mengel","Chi Tran"],"pdf_url":"https://arxiv.org/pdf/2411.08478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07979v2","updated":"2024-11-13T09:52:45Z","published":"2024-11-12T17:58:40Z","title":"Exact, Tractable Gauss-Newton Optimization in Deep Reversible\n Architectures Reveal Poor Generalization","summary":" Second-order optimization has been shown to accelerate the training of deep\nneural networks in many applications, often yielding faster progress per\niteration on the training loss compared to first-order optimizers. However, the\ngeneralization properties of second-order methods are still being debated.\nTheoretical investigations have proved difficult to carry out outside the\ntractable settings of heavily simplified model classes -- thus, the relevance\nof existing theories to practical deep learning applications remains unclear.\nSimilarly, empirical studies in large-scale models and real datasets are\nsignificantly confounded by the necessity to approximate second-order updates\nin practice. It is often unclear whether the observed generalization behaviour\narises specifically from the second-order nature of the parameter updates, or\ninstead reflects the specific structured (e.g.\\ Kronecker) approximations used\nor any damping-based interpolation towards first-order updates. Here, we show\nfor the first time that exact Gauss-Newton (GN) updates take on a tractable\nform in a class of deep reversible architectures that are sufficiently\nexpressive to be meaningfully applied to common benchmark datasets. We exploit\nthis novel setting to study the training and generalization properties of the\nGN optimizer. We find that exact GN generalizes poorly. 
In the mini-batch\ntraining setting, this manifests as rapidly saturating progress even on the\n\\emph{training} loss, with parameter updates found to overfit each\nmini-batchatch without producing the features that would support generalization\nto other mini-batches. We show that our experiments run in the ``lazy'' regime,\nin which the neural tangent kernel (NTK) changes very little during the course\nof training. This behaviour is associated with having no significant changes in\nneural representations, explaining the lack of generalization.\n","authors":["Davide Buffelli","Jamie McGowan","Wangkun Xu","Alexandru Cioba","Da-shan Shiu","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2411.07979v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.17804v3","updated":"2024-11-13T09:50:48Z","published":"2024-06-22T15:24:33Z","title":"A Review of Electromagnetic Elimination Methods for low-field portable\n MRI scanner","summary":" This paper analyzes conventional and deep learning methods for eliminating\nelectromagnetic interference (EMI) in MRI systems. We compare traditional\nanalytical and adaptive techniques with advanced deep learning approaches. Key\nstrengths and limitations of each method are highlighted. Recent advancements\nin active EMI elimination, such as external EMI receiver coils, are discussed\nalongside deep learning methods, which show superior EMI suppression by\nleveraging neural networks trained on MRI data. While deep learning improves\nEMI elimination and diagnostic capabilities, it introduces security and safety\nconcerns, particularly in commercial applications. 
A balanced approach,\nintegrating conventional reliability with deep learning's advanced\ncapabilities, is proposed for more effective EMI suppression in MRI systems.\n","authors":["Wanyu Bian","Panfeng Li","Mengyao Zheng","Chihang Wang","Anying Li","Ying Li","Haowei Ni","Zixuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.17804v3.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2411.04555v2","updated":"2024-11-13T09:44:33Z","published":"2024-11-07T09:26:54Z","title":"An Axiomatic Study of the Evaluation of Enthymeme Decoding in Weighted\n Structured Argumentation","summary":" An argument can be seen as a pair consisting of a set of premises and a claim\nsupported by them. Arguments used by humans are often enthymemes, i.e., some\npremises are implicit. To better understand, evaluate, and compare enthymemes,\nit is essential to decode them, i.e., to find the missing premisses. Many\nenthymeme decodings are possible. We need to distinguish between reasonable\ndecodings and unreasonable ones. However, there is currently no research in the\nliterature on \"How to evaluate decodings?\". To pave the way and achieve this\ngoal, we introduce seven criteria related to decoding, based on different\nresearch areas. Then, we introduce the notion of criterion measure, the\nobjective of which is to evaluate a decoding with regard to a certain\ncriterion. Since such measures need to be validated, we introduce several\ndesirable properties for them, called axioms. Another main contribution of the\npaper is the construction of certain criterion measures that are validated by\nour axioms. 
Such measures can be used to identify the best enthymemes\ndecodings.\n","authors":["Jonathan Ben-Naim","Victor David","Anthony Hunter"],"pdf_url":"https://arxiv.org/pdf/2411.04555v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.08469v1","updated":"2024-11-13T09:40:37Z","published":"2024-11-13T09:40:37Z","title":"Building Trustworthy AI: Transparent AI Systems via Large Language\n Models, Ontologies, and Logical Reasoning (TranspNet)","summary":" Growing concerns over the lack of transparency in AI, particularly in\nhigh-stakes fields like healthcare and finance, drive the need for explainable\nand trustworthy systems. While Large Language Models (LLMs) perform\nexceptionally well in generating accurate outputs, their \"black box\" nature\nposes significant challenges to transparency and trust. To address this, the\npaper proposes the TranspNet pipeline, which integrates symbolic AI with LLMs.\nBy leveraging domain expert knowledge, retrieval-augmented generation (RAG),\nand formal reasoning frameworks like Answer Set Programming (ASP), TranspNet\nenhances LLM outputs with structured reasoning and verification. This approach\nensures that AI systems deliver not only accurate but also explainable and\ntrustworthy results, meeting regulatory demands for transparency and\naccountability. TranspNet provides a comprehensive solution for developing AI\nsystems that are reliable and interpretable, making it suitable for real-world\napplications where trust is critical.\n","authors":["Fadi Al Machot","Martin Thomas Horsch","Habib Ullah"],"pdf_url":"https://arxiv.org/pdf/2411.08469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08464v1","updated":"2024-11-13T09:36:50Z","published":"2024-11-13T09:36:50Z","title":"Crystal Structure Generation Based On Material Properties","summary":" The discovery of new materials is very important to the field of materials\nscience. 
When researchers explore new materials, they often have expected\nperformance requirements for their crystal structure. In recent years,\ndata-driven methods have made great progress in the direction plane of crystal\nstructure generation, but there is still a lack of methods that can effectively\nmap material properties to crystal structure. In this paper, we propose a\nCrystal DiT model to generate the crystal structure from the expected material\nproperties by embedding the material properties and combining the symmetry\ninformation predicted by the large language model. Experimental verification\nshows that our proposed method has good performance.\n","authors":["Chao Huang","JiaHui Chen","HongRui Liang","ChunYan Chen","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08463v1","updated":"2024-11-13T09:33:33Z","published":"2024-11-13T09:33:33Z","title":"Symbolic-AI-Fusion Deep Learning (SAIF-DL): Encoding Knowledge into\n Training with Answer Set Programming Loss Penalties by a Novel Loss Function\n Approach","summary":" This paper presents a hybrid methodology that enhances the training process\nof deep learning (DL) models by embedding domain expert knowledge using\nontologies and answer set programming (ASP). By integrating these symbolic AI\nmethods, we encode domain-specific constraints, rules, and logical reasoning\ndirectly into the model's learning process, thereby improving both performance\nand trustworthiness. The proposed approach is flexible and applicable to both\nregression and classification tasks, demonstrating generalizability across\nvarious fields such as healthcare, autonomous systems, engineering, and battery\nmanufacturing applications. Unlike other state-of-the-art methods, the strength\nof our approach lies in its scalability across different domains. 
The design\nallows for the automation of the loss function by simply updating the ASP\nrules, making the system highly scalable and user-friendly. This facilitates\nseamless adaptation to new domains without significant redesign, offering a\npractical solution for integrating expert knowledge into DL models in\nindustrial settings such as battery manufacturing.\n","authors":["Fadi Al Machot","Martin Thomas Horsch","Habib Ullah"],"pdf_url":"https://arxiv.org/pdf/2411.08463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06098v2","updated":"2024-11-13T09:31:14Z","published":"2024-11-09T07:19:56Z","title":"LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning","summary":" Deep long-tailed recognition has been widely studied to address the issue of\nimbalanced data distributions in real-world scenarios. However, there has been\ninsufficient focus on the design of neural architectures, despite empirical\nevidence suggesting that architecture can significantly impact performance. In\nthis paper, we attempt to mitigate long-tailed issues through architectural\nimprovements. To simplify the design process, we utilize Differential\nArchitecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS\nmethods struggle to perform well in long-tailed scenarios. To tackle this\nchallenge, we introduce Long-Tailed Differential Architecture Search\n(LT-DARTS). Specifically, we conduct extensive experiments to explore\narchitectural components that demonstrate better performance on long-tailed\ndata and propose a new search space based on our observations. This ensures\nthat the architecture obtained through our search process incorporates superior\ncomponents. Additionally, we propose replacing the learnable linear classifier\nwith an Equiangular Tight Frame (ETF) classifier to further enhance our method.\nThis classifier effectively alleviates the biased search process and prevents\nperformance collapse. 
Extensive experimental evaluations demonstrate that our\napproach consistently improves upon existing methods from an orthogonal\nperspective and achieves state-of-the-art results with simple enhancements.\n","authors":["Yuhan Pan","Yanan Sun","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2411.06098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08460v1","updated":"2024-11-13T09:31:06Z","published":"2024-11-13T09:31:06Z","title":"Trap-MID: Trapdoor-based Defense against Model Inversion Attacks","summary":" Model Inversion (MI) attacks pose a significant threat to the privacy of Deep\nNeural Networks by recovering training data distribution from well-trained\nmodels. While existing defenses often rely on regularization techniques to\nreduce information leakage, they remain vulnerable to recent attacks. In this\npaper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to\nmislead MI attacks. A trapdoor is integrated into the model to predict a\nspecific label when the input is injected with the corresponding trigger.\nConsequently, this trapdoor information serves as the \"shortcut\" for MI\nattacks, leading them to extract trapdoor triggers rather than private data. We\nprovide theoretical insights into the impacts of trapdoor's effectiveness and\nnaturalness on deceiving MI attacks. In addition, empirical experiments\ndemonstrate the state-of-the-art defense performance of Trap-MID against\nvarious MI attacks without the requirements for extra data or large\ncomputational overhead. 
Our source code is publicly available at\nhttps://github.com/ntuaislab/Trap-MID.\n","authors":["Zhen-Ting Liu","Shang-Tse Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08460v1.pdf","comment":"Accepted by Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2411.00393v4","updated":"2024-11-13T09:27:41Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. 
Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08447v1","updated":"2024-11-13T08:59:53Z","published":"2024-11-13T08:59:53Z","title":"Learning Dynamic Cognitive Map with Autonomous Navigation","summary":" Inspired by animal navigation strategies, we introduce a novel computational\nmodel to navigate and map a space rooted in biologically inspired principles.\nAnimals exhibit extraordinary navigation prowess, harnessing memory,\nimagination, and strategic decision-making to traverse complex and aliased\nenvironments adeptly. Our model aims to replicate these capabilities by\nincorporating a dynamically expanding cognitive map over predicted poses within\nan Active Inference framework, enhancing our agent's generative model\nplasticity to novelty and environmental changes. Through structure learning and\nactive inference navigation, our model demonstrates efficient exploration and\nexploitation, dynamically expanding its model capacity in response to\nanticipated novel un-visited locations and updating the map given new evidence\ncontradicting previous beliefs. Comparative analyses in mini-grid environments\nwith the Clone-Structured Cognitive Graph model (CSCG), which shares similar\nobjectives, highlight our model's ability to rapidly learn environmental\nstructures within a single episode, with minimal navigation overlap. 
Our model\nachieves this without prior knowledge of observation and world dimensions,\nunderscoring its robustness and efficacy in navigating intricate environments.\n","authors":["Daria de Tinguy","Tim Verbelen","Bart Dhoedt"],"pdf_url":"https://arxiv.org/pdf/2411.08447v1.pdf","comment":"under submission at Frontiers Computer Neuroscience"},{"id":"http://arxiv.org/abs/2410.21991v4","updated":"2024-11-13T08:59:31Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, most of them were\nblack-box systems which faced challenges regarding explainability during\ntraining and inference processes. An important question is how to incorporate\nexplicit knowledge into these implicit models, thereby designing expert-driven\nand interpretable violence surveillance systems. This paper proposes a new\nparadigm for weakly supervised violence monitoring (WSVM) called Rule base\nViolence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure\nwith different designs for images and text. One of the branches is called the\nimplicit branch, which uses only visual features for coarse-grained binary\nclassification. In this branch, image feature extraction is divided into two\nchannels: one responsible for extracting scene frames and the other focusing on\nextracting actions. The other branch is called the explicit branch, which\nutilizes language-image alignment to perform fine-grained classification. For\nthe language channel design in the explicit branch, the proposed RuleCLIP uses\nthe state-of-the-art YOLO-World model to detect objects in video frames, and\nassociation rules are identified through data mining methods as descriptions of\nthe video. 
Leveraging the dual-branch architecture, RuleVM achieves\ninterpretable coarse-grained and fine-grained violence surveillance. Extensive\nexperiments were conducted on two commonly used benchmarks, and the results\nshow that RuleCLIP achieved the best performance in both coarse-grained and\nfine-grained monitoring, significantly outperforming existing state-of-the-art\nmethods. Moreover, interpretability experiments uncovered some interesting\nrules, such as the observation that as the number of people increases, the risk\nlevel of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Ssu-Chi Kuai","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v4.pdf","comment":"12 pages,7 figures IEEE TSMCA (Under review)"},{"id":"http://arxiv.org/abs/2411.08438v1","updated":"2024-11-13T08:43:37Z","published":"2024-11-13T08:43:37Z","title":"Towards Optimizing a Retrieval Augmented Generation using Large Language\n Model on Academic Data","summary":" Given the growing trend of many organizations integrating Retrieval Augmented\nGeneration (RAG) into their operations, we assess RAG on domain-specific data\nand test state-of-the-art models across various optimization techniques. We\nincorporate four optimizations; Multi-Query, Child-Parent-Retriever, Ensemble\nRetriever, and In-Context-Learning, to enhance the functionality and\nperformance in the academic domain. We focus on data retrieval, specifically\ntargeting various study programs at a large technical university. We\nadditionally introduce a novel evaluation approach, the RAG Confusion Matrix\ndesigned to assess the effectiveness of various configurations within the RAG\nframework. By exploring the integration of both open-source (e.g., Llama2,\nMistral) and closed-source (GPT-3.5 and GPT-4) Large Language Models, we offer\nvaluable insights into the application and optimization of RAG frameworks in\ndomain-specific contexts. 
Our experiments show a significant performance\nincrease when including multi-query in the retrieval phase.\n","authors":["Anum Afzal","Juraj Vladika","Gentrit Fazlija","Andrei Staradubets","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2411.08438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08433v1","updated":"2024-11-13T08:34:07Z","published":"2024-11-13T08:34:07Z","title":"3D Multi-Object Tracking with Semi-Supervised GRU-Kalman Filter","summary":" 3D Multi-Object Tracking (MOT), a fundamental component of environmental\nperception, is essential for intelligent systems like autonomous driving and\nrobotic sensing. Although Tracking-by-Detection frameworks have demonstrated\nexcellent performance in recent years, their application in real-world\nscenarios faces significant challenges. Object movement in complex environments\nis often highly nonlinear, while existing methods typically rely on linear\napproximations of motion. Furthermore, system noise is frequently modeled as a\nGaussian distribution, which fails to capture the true complexity of the noise\ndynamics. These oversimplified modeling assumptions can lead to significant\nreductions in tracking precision. To address this, we propose a GRU-based MOT\nmethod, which introduces a learnable Kalman filter into the motion module. This\napproach is able to learn object motion characteristics through data-driven\nlearning, thereby avoiding the need for manual model design and model error. At\nthe same time, to avoid abnormal supervision caused by the wrong association\nbetween annotations and trajectories, we design a semi-supervised learning\nstrategy to accelerate the convergence speed and improve the robustness of the\nmodel. 
Evaluation experiment on the nuScenes and Argoverse2 datasets\ndemonstrates that our system exhibits superior performance and significant\npotential compared to traditional TBD methods.\n","authors":["Xiaoxiang Wang","Jiaxin Liu","Miaojie Feng","Zhaoxing Zhang","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21283v2","updated":"2024-11-13T08:33:17Z","published":"2024-10-11T03:19:44Z","title":"pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2","summary":" Recent advancements in protein structure prediction, particularly AlphaFold2,\nhave revolutionized structural biology by achieving near-experimental accuracy\n($\\text{average RMSD} < 1.5\\text{\\AA}$). However, the computational demands of\nthese models (approximately 30 minutes per protein on an RTX 4090)\nsignificantly limit their application in high-throughput protein screening.\nWhile large language models like ESM (Evolutionary Scale Modeling) have shown\npromise in extracting structural information directly from protein sequences,\nrapid assessment of protein structure quality for large-scale analyses remains\na major challenge.\n We introduce pLDDT-Predictor, a high-speed protein screening tool that\nachieves a $250,000\\times$ speedup compared to AlphaFold2 by leveraging\npre-trained ESM2 protein embeddings and a Transformer architecture. Our model\npredicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores\nwith a Pearson correlation of 0.7891 and processes proteins in just 0.007\nseconds on average. 
Using a comprehensive dataset of 1.5 million diverse\nprotein sequences (ranging from 50 to 2048 amino acids), we demonstrate that\npLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70)\nwith 91.2\\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's\npredictions.\n The source code and pre-trained models are freely available at\n\\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research\ncommunity to perform rapid, large-scale protein structure quality assessments.\n","authors":["Joongwon Chae","Zhenyu Wang","Ijaz Gul","Jiansong Ji","Zhenglin Chen","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2410.21283v2.pdf","comment":"6 pages main topic, 8 pages including citiation, 4 figures"},{"id":"http://arxiv.org/abs/2411.08432v1","updated":"2024-11-13T08:32:42Z","published":"2024-11-13T08:32:42Z","title":"One STEP at a time: Language Agents are Stepwise Planners","summary":" Language agents have shown promising adaptability in dynamic environments to\nperform complex tasks. However, despite the versatile knowledge embedded in\nlarge language models, these agents still fall short when it comes to tasks\nthat require planning. We introduce STEP, a novel framework designed to\nefficiently learn from previous experiences to enhance the planning\ncapabilities of language agents in future steps. Concretely, STEP functions\nthrough four interconnected components. First, the Planner takes on the task,\nbreaks it down into subtasks and provides relevant insights. Then the Executor\ngenerates action candidates, while the Evaluator ensures the actions align with\nlearned rules from previous experiences. Lastly, Memory stores experiences to\ninform future decisions. In the ScienceWorld benchmark, our results show that\nSTEP consistently outperforms state-of-the-art models, achieving an overall\nscore of 67.4 and successfully completing 12 out of 18 tasks. 
These findings\nhighlight STEP's potential as a framework for enhancing planning capabilities\nin language agents, paving the way for more sophisticated task-solving in\ndynamic environments.\n","authors":["Minh Nguyen","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2411.08432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07501v2","updated":"2024-11-13T08:30:52Z","published":"2024-11-12T02:57:15Z","title":"LAuReL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v2.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.01137v2","updated":"2024-11-13T08:24:09Z","published":"2024-11-02T04:48:41Z","title":"Data movement limits to frontier model training","summary":" We present a theoretical model of distributed training, and use it to analyze\nhow far dense and sparse training runs can be scaled. Under our baseline\nassumptions, given a three month training duration, data movement bottlenecks\nbegin to significantly lower hardware utilization for training runs exceeding\nabout $10^{28}$ FLOP, two orders of magnitude above the largest training run to\ndate, suggesting the arrival of fundamental barriers to scaling in three years\ngiven recent rates of growth. A training run exceeding about $10^{31}$ FLOP is\ninfeasible even at low utilization. However, more aggressive batch size scaling\nand/or shorter and fatter model shapes, if achievable, have the potential to\npermit much larger training runs.\n","authors":["Ege Erdil","David Schneider-Joseph"],"pdf_url":"https://arxiv.org/pdf/2411.01137v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08424v1","updated":"2024-11-13T08:17:52Z","published":"2024-11-13T08:17:52Z","title":"A Heterogeneous Graph Neural Network Fusing Functional and Structural\n Connectivity for MCI Diagnosis","summary":" Brain connectivity alternations associated with brain disorders have been\nwidely reported in resting-state functional imaging (rs-fMRI) and diffusion\ntensor imaging (DTI). 
While many dual-modal fusion methods based on graph\nneural networks (GNNs) have been proposed, they generally follow homogenous\nfusion ways ignoring rich heterogeneity of dual-modal information. To address\nthis issue, we propose a novel method that integrates functional and structural\nconnectivity based on heterogeneous graph neural networks (HGNNs) to better\nleverage the rich heterogeneity in dual-modal images. We firstly use blood\noxygen level dependency and whiter matter structure information provided by\nrs-fMRI and DTI to establish homo-meta-path, capturing node relationships\nwithin the same modality. At the same time, we propose to establish\nhetero-meta-path based on structure-function coupling and brain community\nsearching to capture relations among cross-modal nodes. Secondly, we further\nintroduce a heterogeneous graph pooling strategy that automatically balances\nhomo- and hetero-meta-path, effectively leveraging heterogeneous information\nand preventing feature confusion after pooling. Thirdly, based on the\nflexibility of heterogeneous graphs, we propose a heterogeneous graph data\naugmentation approach that can conveniently address the sample imbalance issue\ncommonly seen in clinical diagnosis. We evaluate our method on ADNI-3 dataset\nfor mild cognitive impairment (MCI) diagnosis. Experimental results indicate\nthe proposed method is effective and superior to other algorithms, with a mean\nclassification accuracy of 93.3%.\n","authors":["Feiyu Yin","Yu Lei","Siyuan Dai","Wenwen Zeng","Guoqing Wu","Liang Zhan","Jinhua Yu"],"pdf_url":"https://arxiv.org/pdf/2411.08424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08418v1","updated":"2024-11-13T08:13:41Z","published":"2024-11-13T08:13:41Z","title":"Enhanced Classroom Dialogue Sequences Analysis with a Hybrid AI Agent:\n Merging Expert Rule-Base with Large Language Models","summary":" Classroom dialogue plays a crucial role in fostering student engagement and\ndeeper learning. 
However, analysing dialogue sequences has traditionally relied\non either theoretical frameworks or empirical descriptions of practice, with\nlimited integration between the two. This study addresses this gap by\ndeveloping a comprehensive rule base of dialogue sequences and an Artificial\nIntelligence (AI) agent that combines expert-informed rule-based systems with a\nlarge language model (LLM). The agent applies expert knowledge while adapting\nto the complexities of natural language, enabling accurate and flexible\ncategorisation of classroom dialogue sequences. By synthesising findings from\nover 30 studies, we established a comprehensive framework for dialogue\nanalysis. The agent was validated against human expert coding, achieving high\nlevels of precision and reliability. The results demonstrate that the agent\nprovides theory-grounded and adaptive functions, tremendously enhancing the\nefficiency and scalability of classroom dialogue analysis, offering significant\npotential in improving classroom teaching practices and supporting teacher\nprofessional development.\n","authors":["Yun Long","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21063v2","updated":"2024-11-13T08:13:36Z","published":"2024-05-31T17:51:07Z","title":"Neural Network Verification with Branch-and-Bound for General\n Nonlinearities","summary":" Branch-and-bound (BaB) is among the most effective techniques for neural\nnetwork (NN) verification. However, existing works on BaB for NN verification\nhave mostly focused on NNs with piecewise linear activations, especially ReLU\nnetworks. In this paper, we develop a general framework, named GenBaB, to\nconduct BaB on general nonlinearities to verify NNs with general architectures,\nbased on linear bound propagation for NN verification. 
To decide which neuron\nto branch, we design a new branching heuristic which leverages linear bounds as\nshortcuts to efficiently estimate the potential improvement after branching. To\ndecide nontrivial branching points for general nonlinear functions, we propose\nto pre-optimize branching points, which can be efficiently leveraged during\nverification with a lookup table. We demonstrate the effectiveness of our\nGenBaB on verifying a wide range of NNs, including NNs with activation\nfunctions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving\nmulti-dimensional nonlinear operations such as multiplications in LSTMs and\nVision Transformers. Our framework also allows the verification of general\nnonlinear computation graphs and enables verification applications beyond\nsimple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of\nthe latest $\\alpha,\\!\\beta$-CROWN, the winner of the 4th and the 5th\nInternational Verification of Neural Networks Competition (VNN-COMP 2023 and\n2024).\n","authors":["Zhouxing Shi","Qirui Jin","Zico Kolter","Suman Jana","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.21063v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.08414v1","updated":"2024-11-13T08:07:21Z","published":"2024-11-13T08:07:21Z","title":"Material Property Prediction with Element Attribute Knowledge Graphs and\n Multimodal Representation Learning","summary":" Machine learning has become a crucial tool for predicting the properties of\ncrystalline materials. However, existing methods primarily represent material\ninformation by constructing multi-edge graphs of crystal structures, often\noverlooking the chemical and physical properties of elements (such as atomic\nradius, electronegativity, melting point, and ionization energy), which have a\nsignificant impact on material performance. 
To address this limitation, we\nfirst constructed an element property knowledge graph and utilized an embedding\nmodel to encode the element attributes within the knowledge graph. Furthermore,\nwe propose a multimodal fusion framework, ESNet, which integrates element\nproperty features with crystal structure features to generate joint multimodal\nrepresentations. This provides a more comprehensive perspective for predicting\nthe performance of crystalline materials, enabling the model to consider both\nmicrostructural composition and chemical characteristics of the materials. We\nconducted experiments on the Materials Project benchmark dataset, which showed\nleading performance in the bandgap prediction task and achieved results on a\npar with existing benchmarks in the formation energy prediction task.\n","authors":["Chao Huang","Chunyan Chen","Ling Shi","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08409v1","updated":"2024-11-13T07:55:41Z","published":"2024-11-13T07:55:41Z","title":"DiVR: incorporating context from diverse VR scenes for human trajectory\n prediction","summary":" Virtual environments provide a rich and controlled setting for collecting\ndetailed data on human behavior, offering unique opportunities for predicting\nhuman trajectories in dynamic scenes. However, most existing approaches have\noverlooked the potential of these environments, focusing instead on static\ncontexts without considering userspecific factors. Employing the CREATTIVE3D\ndataset, our work models trajectories recorded in virtual reality (VR) scenes\nfor diverse situations including road-crossing tasks with user interactions and\nsimulated visual impairments. We propose Diverse Context VR Human Motion\nPrediction (DiVR), a cross-modal transformer based on the Perceiver\narchitecture that integrates both static and dynamic scene context using a\nheterogeneous graph convolution network. 
We conduct extensive experiments\ncomparing DiVR against existing architectures including MLP, LSTM, and\ntransformers with gaze and point cloud context. Additionally, we also stress\ntest our model's generalizability across different users, tasks, and scenes.\nResults show that DiVR achieves higher accuracy and adaptability compared to\nother models and to static graphs. This work highlights the advantages of using\nVR datasets for context-aware human trajectory modeling, with potential\napplications in enhancing user experiences in the metaverse. Our source code is\npublicly available at https://gitlab.inria.fr/ffrancog/creattive3d-divr-model.\n","authors":["Franz Franco Gallo","Hui-Yin Wu","Lucile Sassatelli"],"pdf_url":"https://arxiv.org/pdf/2411.08409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08400v1","updated":"2024-11-13T07:38:24Z","published":"2024-11-13T07:38:24Z","title":"BAMAX: Backtrack Assisted Multi-Agent Exploration using Reinforcement\n Learning","summary":" Autonomous robots collaboratively exploring an unknown environment is still\nan open problem. The problem has its roots in coordination among non-stationary\nagents, each with only a partial view of information. The problem is compounded\nwhen the multiple robots must completely explore the environment. In this\npaper, we introduce Backtrack Assisted Multi-Agent Exploration using\nReinforcement Learning (BAMAX), a method for collaborative exploration in\nmulti-agent systems which attempts to explore an entire virtual environment. As\nin the name, BAMAX leverages backtrack assistance to enhance the performance of\nagents in exploration tasks. To evaluate BAMAX against traditional approaches,\nwe present the results of experiments conducted across multiple hexagonal\nshaped grids sizes, ranging from 10x10 to 60x60. 
The results demonstrate that\nBAMAX outperforms other methods in terms of faster coverage and less\nbacktracking across these environments.\n","authors":["Geetansh Kalra","Amit Patel","Atul Chaudhari","Divye Singh"],"pdf_url":"https://arxiv.org/pdf/2411.08400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08392v1","updated":"2024-11-13T07:24:14Z","published":"2024-11-13T07:24:14Z","title":"RLInspect: An Interactive Visual Approach to Assess Reinforcement\n Learning Algorithm","summary":" Reinforcement Learning (RL) is a rapidly growing area of machine learning\nthat finds its application in a broad range of domains, from finance and\nhealthcare to robotics and gaming. Compared to other machine learning\ntechniques, RL agents learn from their own experiences using trial and error,\nand improve their performance over time. However, assessing RL models can be\nchallenging, which makes it difficult to interpret their behaviour. While\nreward is a widely used metric to evaluate RL models, it may not always provide\nan accurate measure of training performance. In some cases, the reward may seem\nincreasing while the model's performance is actually decreasing, leading to\nmisleading conclusions about the effectiveness of the training. To overcome\nthis limitation, we have developed RLInspect - an interactive visual analytic\ntool, that takes into account different components of the RL model - state,\naction, agent architecture and reward, and provides a more comprehensive view\nof the RL training. 
By using RLInspect, users can gain insights into the\nmodel's behaviour, identify issues during training, and potentially correct\nthem effectively, leading to a more robust and reliable RL system.\n","authors":["Geetansh Kalra","Divye Singh","Justin Jose"],"pdf_url":"https://arxiv.org/pdf/2411.08392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02288v2","updated":"2024-11-13T07:13:36Z","published":"2024-08-05T07:54:01Z","title":"Spin glass model of in-context learning","summary":" Large language models show a surprising in-context learning ability -- being\nable to use a prompt to form a prediction for a query, yet without additional\ntraining, in stark contrast to old-fashioned supervised learning. Providing a\nmechanistic interpretation and linking the empirical phenomenon to physics are\nthus challenging and remain unsolved. We study a simple yet expressive\ntransformer with linear attention and map this structure to a spin glass model\nwith real-valued spins, where the couplings and fields explain the intrinsic\ndisorder in data. The spin glass model explains how the weight parameters\ninteract with each other during pre-training, and further clarifies why an\nunseen function can be predicted by providing only a prompt yet without further\ntraining. Our theory reveals that for single-instance learning, increasing the\ntask diversity leads to the emergence of in-context learning, by allowing the\nBoltzmann distribution to converge to a unique correct solution of weight\nparameters. Therefore the pre-trained transformer displays a prediction power\nin a novel prompt setting. 
The proposed analytically tractable model thus\noffers a promising avenue for thinking about how to interpret many intriguing\nbut puzzling properties of large language models.\n","authors":["Yuhao Li","Ruoran Bai","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2408.02288v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08378v1","updated":"2024-11-13T07:03:47Z","published":"2024-11-13T07:03:47Z","title":"Physics Informed Distillation for Diffusion Models","summary":" Diffusion models have recently emerged as a potent tool in generative\nmodeling. However, their inherent iterative nature often results in sluggish\nimage generation due to the requirement for multiple model evaluations. Recent\nprogress has unveiled the intrinsic link between diffusion models and\nProbability Flow Ordinary Differential Equations (ODEs), thus enabling us to\nconceptualize diffusion models as ODE systems. Simultaneously, Physics Informed\nNeural Networks (PINNs) have substantiated their effectiveness in solving\nintricate differential equations through implicit modeling of their solutions.\nBuilding upon these foundational insights, we introduce Physics Informed\nDistillation (PID), which employs a student model to represent the solution of\nthe ODE system corresponding to the teacher diffusion model, akin to the\nprinciples employed in PINNs. Through experiments on CIFAR 10 and ImageNet\n64x64, we observe that PID achieves performance comparable to recent\ndistillation methods. Notably, it demonstrates predictable trends concerning\nmethod-specific hyperparameters and eliminates the need for synthetic dataset\ngeneration during the distillation process. Both of which contribute to its\neasy-to-use nature as a distillation approach for Diffusion Models. 
Our code\nand pre-trained checkpoint are publicly available at:\nhttps://github.com/pantheon5100/pid_diffusion.git.\n","authors":["Joshua Tian Jin Tee","Kang Zhang","Hee Suk Yoon","Dhananjaya Nagaraja Gowda","Chanwoo Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2411.08378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08375v1","updated":"2024-11-13T06:55:18Z","published":"2024-11-13T06:55:18Z","title":"Developing an Effective Training Dataset to Enhance the Performance of\n AI-based Speaker Separation Systems","summary":" This paper addresses the challenge of speaker separation, which remains an\nactive research topic despite the promising results achieved in recent years.\nThese results, however, often degrade in real recording conditions due to the\npresence of noise, echo, and other interferences. This is because neural models\nare typically trained on synthetic datasets consisting of mixed audio signals\nand their corresponding ground truths, which are generated using computer\nsoftware and do not fully represent the complexities of real-world recording\nscenarios. The lack of realistic training sets for speaker separation remains a\nmajor hurdle, as obtaining individual sounds from mixed audio signals is a\nnontrivial task. To address this issue, we propose a novel method for\nconstructing a realistic training set that includes mixture signals and\ncorresponding ground truths for each speaker. We evaluate this dataset on a\ndeep learning model and compare it to a synthetic dataset. We got a 1.65 dB\nimprovement in Scale Invariant Signal to Distortion Ratio (SI-SDR) for speaker\nseparation accuracy in realistic mixing. 
Our findings highlight the potential\nof realistic training sets for enhancing the performance of speaker separation\nmodels in real-world scenarios.\n","authors":["Rawad Melhem","Assef Jafar","Oumayma Al Dakkak"],"pdf_url":"https://arxiv.org/pdf/2411.08375v1.pdf","comment":"in Arabic language"},{"id":"http://arxiv.org/abs/2402.11892v2","updated":"2024-11-13T06:54:05Z","published":"2024-02-19T07:07:44Z","title":"Towards Reliable Evaluation of Neural Program Repair with Natural\n Robustness Testing","summary":" In this paper, we propose shifting the focus of robustness evaluation for\nNeural Program Repair (NPR) techniques toward naturally-occurring data\ntransformations. To accomplish this, we first examine the naturalness of\nsemantic-preserving transformations through a two-stage human study. This study\nincludes (1) interviews with senior software developers to establish concrete\ncriteria for evaluating the naturalness of these transformations, and (2) a\nsurvey involving 10 developers to assess the naturalness of 1,178\ntransformations, i.e., pairs of original and transformed programs, applied to\n225 real-world bugs. Our findings show that only 60% of these transformations\nare deemed natural, while 20% are considered unnatural, with strong agreement\namong annotators. Moreover, the unnaturalness of these transformations\nsignificantly impacts both their applicability to benchmarks and the\nconclusions drawn from robustness testing. Next, we conduct natural robustness\ntesting on NPR techniques to assess their true effectiveness against real-world\ndata variations. Our experimental results reveal a substantial number of\nprediction changes in NPR techniques, leading to significant reductions in both\nplausible and correct patch rates when comparing performance on the original\nand transformed datasets. 
Additionally, we observe notable differences in\nperformance improvements between NPR techniques, suggesting potential biases on\nNPR evaluation introduced by limited datasets. Finally, we propose an LLM-based\nmetric to automate the assessment of transformation naturalness, ensuring the\nscalability of natural robustness testing.\n","authors":["Thanh Le-Cong","Dat Nguyen","Bach Le","Toby Murray"],"pdf_url":"https://arxiv.org/pdf/2402.11892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08370v1","updated":"2024-11-13T06:40:17Z","published":"2024-11-13T06:40:17Z","title":"A Fuzzy Reinforcement LSTM-based Long-term Prediction Model for Fault\n Conditions in Nuclear Power Plants","summary":" Early fault detection and timely maintenance scheduling can significantly\nmitigate operational risks in NPPs and enhance the reliability of operator\ndecision-making. Therefore, it is necessary to develop an efficient Prognostics\nand Health Management (PHM) multi-step prediction model for predicting of\nsystem health status and prompt execution of maintenance operations. In this\nstudy, we propose a novel predictive model that integrates reinforcement\nlearning with Long Short-Term Memory (LSTM) neural networks and the Expert\nFuzzy Evaluation Method. The model is validated using parameter data for 20\ndifferent breach sizes in the Main Steam Line Break (MSLB) accident condition\nof the CPR1000 pressurized water reactor simulation model and it demonstrates a\nremarkable capability in accurately forecasting NPP parameter changes up to 128\nsteps ahead (with a time interval of 10 seconds per step, i.e., 1280 seconds),\nthereby satisfying the temporal advance requirement for fault prognostics in\nNPPs. 
Furthermore, this method provides an effective reference solution for PHM\napplications such as anomaly detection and remaining useful life prediction.\n","authors":["Siwei Li","Jiayan Fang","Yichun Wua","Wei Wang","Chengxin Li","Jiangwen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21564v2","updated":"2024-11-13T06:35:53Z","published":"2024-10-28T21:54:44Z","title":"Mitigating Gradient Overlap in Deep Residual Networks with Gradient\n Normalization for Improved Non-Convex Optimization","summary":" In deep learning, Residual Networks (ResNets) have proven effective in\naddressing the vanishing gradient problem, allowing for the successful training\nof very deep networks. However, skip connections in ResNets can lead to\ngradient overlap, where gradients from both the learned transformation and the\nskip connection combine, potentially resulting in overestimated gradients. This\noverestimation can cause inefficiencies in optimization, as some updates may\novershoot optimal regions, affecting weight updates. To address this, we\nexamine Z-score Normalization (ZNorm) as a technique to manage gradient\noverlap. ZNorm adjusts the gradient scale, standardizing gradients across\nlayers and reducing the negative impact of overlapping gradients. Our\nexperiments demonstrate that ZNorm improves training process, especially in\nnon-convex optimization scenarios common in deep learning, where finding\noptimal solutions is challenging. 
These findings suggest that ZNorm can affect\nthe gradient flow, enhancing performance in large-scale data processing where\naccuracy is critical.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2410.21564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07934v2","updated":"2024-11-13T06:34:07Z","published":"2024-11-12T17:04:56Z","title":"Doubly Mild Generalization for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) suffers from the extrapolation error and\nvalue overestimation. From a generalization perspective, this issue can be\nattributed to the over-generalization of value functions or policies towards\nout-of-distribution (OOD) actions. Significant efforts have been devoted to\nmitigating such generalization, and recent in-sample learning approaches have\nfurther succeeded in entirely eschewing it. Nevertheless, we show that mild\ngeneralization beyond the dataset can be trusted and leveraged to improve\nperformance under certain conditions. To appropriately exploit generalization\nin offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild\naction generalization and (ii) mild generalization propagation. The former\nrefers to selecting actions in a close neighborhood of the dataset to maximize\nthe Q values. Even so, the potential erroneous generalization can still be\npropagated, accumulated, and exacerbated by bootstrapping. In light of this,\nthe latter concept is introduced to mitigate the generalization propagation\nwithout impeding the propagation of RL learning signals. Theoretically, DMG\nguarantees better performance than the in-sample optimal policy in the oracle\ngeneralization scenario. Even under worst-case generalization, DMG can still\ncontrol value overestimation at a certain level and lower bound the\nperformance. Empirically, DMG achieves state-of-the-art performance across\nGym-MuJoCo locomotion tasks and challenging AntMaze tasks. 
Moreover, benefiting\nfrom its flexibility in both generalization aspects, DMG enjoys a seamless\ntransition from offline to online learning and attains strong online\nfine-tuning performance.\n","authors":["Yixiu Mao","Qi Wang","Yun Qu","Yuhang Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2411.07934v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08367v1","updated":"2024-11-13T06:32:17Z","published":"2024-11-13T06:32:17Z","title":"Surprisingly Popular Voting for Concentric Rank-Order Models","summary":" An important problem on social information sites is the recovery of ground\ntruth from individual reports when the experts are in the minority. The wisdom\nof the crowd, i.e. the collective opinion of a group of individuals fails in\nsuch a scenario. However, the surprisingly popular (SP)\nalgorithm~\\cite{prelec2017solution} can recover the ground truth even when the\nexperts are in the minority, by asking the individuals to report additional\nprediction reports--their beliefs about the reports of others. Several recent\nworks have extended the surprisingly popular algorithm to an equivalent voting\nrule (SP-voting) to recover the ground truth ranking over a set of $m$\nalternatives. However, we are yet to fully understand when SP-voting can\nrecover the ground truth ranking, and if so, how many samples (votes and\npredictions) it needs. We answer this question by proposing two rank-order\nmodels and analyzing the sample complexity of SP-voting under these models. In\nparticular, we propose concentric mixtures of Mallows and Plackett-Luce models\nwith $G (\\ge 2)$ groups. Our models generalize previously proposed concentric\nmixtures of Mallows models with $2$ groups, and we highlight the importance of\n$G > 2$ groups by identifying three distinct groups (expert, intermediate, and\nnon-expert) from existing datasets. 
Next, we provide conditions on the\nparameters of the underlying models so that SP-voting can recover ground-truth\nrankings with high probability, and also derive sample complexities under the\nsame. We complement the theoretical results by evaluating SP-voting on\nsimulated and real datasets.\n","authors":["Hadi Hosseini","Debmalya Mandal","Amrit Puhan"],"pdf_url":"https://arxiv.org/pdf/2411.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08347v1","updated":"2024-11-13T05:38:55Z","published":"2024-11-13T05:38:55Z","title":"A Chinese Multi-label Affective Computing Dataset Based on Social Media\n Network Users","summary":" Emotion and personality are central elements in understanding human\npsychological states. Emotions reflect an individual subjective experiences,\nwhile personality reveals relatively stable behavioral and cognitive patterns.\nExisting affective computing datasets often annotate emotion and personality\ntraits separately, lacking fine-grained labeling of micro-emotions and emotion\nintensity in both single-label and multi-label classifications. Chinese emotion\ndatasets are extremely scarce, and datasets capturing Chinese user personality\ntraits are even more limited. To address these gaps, this study collected data\nfrom the major social media platform Weibo, screening 11,338 valid users from\nover 50,000 individuals with diverse MBTI personality labels and acquiring\n566,900 posts along with the user MBTI personality tags. Using the EQN method,\nwe compiled a multi-label Chinese affective computing dataset that integrates\nthe same user's personality traits with six emotions and micro-emotions, each\nannotated with intensity levels. Validation results across multiple NLP\nclassification models demonstrate the dataset strong utility. 
This dataset is\ndesigned to advance machine recognition of complex human emotions and provide\ndata support for research in psychology, education, marketing, finance, and\npolitics.\n","authors":["Jingyi Zhou","Senlin Luo","Haofan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08341v1","updated":"2024-11-13T05:15:25Z","published":"2024-11-13T05:15:25Z","title":"Generative AI for Data Augmentation in Wireless Networks: Analysis,\n Applications, and Case Study","summary":" Data augmentation is a powerful technique to mitigate data scarcity. However,\nowing to fundamental differences in wireless data structures, traditional data\naugmentation techniques may not be suitable for wireless data. Fortunately,\nGenerative Artificial Intelligence (GenAI) can be an effective alternative to\nwireless data augmentation due to its excellent data generation capability.\nThis article systemically explores the potential and effectiveness of\nGenAI-driven data augmentation in wireless networks. We first briefly review\ndata augmentation techniques, discuss their limitations in wireless networks,\nand introduce generative data augmentation, including reviewing GenAI models\nand their applications in data augmentation. We then explore the application\nprospects of GenAI-driven data augmentation in wireless networks from the\nphysical, network, and application layers, which provides a GenAI-driven data\naugmentation architecture for each application. Subsequently, we propose a\ngeneral generative diffusion model-based data augmentation framework for Wi-Fi\ngesture recognition, which uses transformer-based diffusion models to generate\nhigh-quality channel state information data. Furthermore, we develop residual\nneural network models for Wi-Fi gesture recognition to evaluate the role of\naugmented data and conduct a case study based on a real dataset. 
Simulation\nresults demonstrate the effectiveness of the proposed framework. Finally, we\ndiscuss research directions for generative data augmentation.\n","authors":["Jinbo Wen","Jiawen Kang","Dusit Niyato","Yang Zhang","Jiacheng Wang","Biplab Sikdar","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10758v3","updated":"2024-11-13T05:11:05Z","published":"2024-10-14T17:38:37Z","title":"Arrhythmia Classification Using Graph Neural Networks Based on\n Correlation Matrix","summary":" With the advancements in graph neural network, there has been increasing\ninterest in applying this network to ECG signal analysis. In this study, we\ngenerated an adjacency matrix using correlation matrix of extracted features\nand applied a graph neural network to classify arrhythmias. The proposed model\nwas compared with existing approaches from the literature. The results\ndemonstrated that precision and recall for all arrhythmia classes exceeded 50%,\nsuggesting that this method can be considered an approach for arrhythmia\nclassification.\n","authors":["Seungwoo Han"],"pdf_url":"https://arxiv.org/pdf/2410.10758v3.pdf","comment":"Accepted for BIBM 2024 AIBH Workshop"},{"id":"http://arxiv.org/abs/2410.17439v3","updated":"2024-11-13T04:57:08Z","published":"2024-10-22T21:30:58Z","title":"Evaluating AI-Generated Essays with GRE Analytical Writing Assessment","summary":" The recent revolutionary advance in generative AI enables the generation of\nrealistic and coherent texts by large language models (LLMs). Despite many\nexisting evaluation metrics on the quality of the generated texts, there is\nstill a lack of rigorous assessment of how well LLMs perform in complex and\ndemanding writing assessments. This study examines essays generated by ten\nleading LLMs for the analytical writing assessment of the Graduate Record Exam\n(GRE). 
We assessed these essays using both human raters and the e-rater\nautomated scoring engine as used in the GRE scoring pipeline. Notably, the\ntop-performing Gemini and GPT-4o received an average score of 4.78 and 4.67,\nrespectively, falling between \"generally thoughtful, well-developed analysis of\nthe issue and conveys meaning clearly\" and \"presents a competent analysis of\nthe issue and conveys meaning with acceptable clarity\" according to the GRE\nscoring guideline. We also evaluated the detection accuracy of these essays,\nwith detectors trained on essays generated by the same and different LLMs.\n","authors":["Yang Zhong","Jiangang Hao","Michael Fauss","Chen Li","Yuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17439v3.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08335v1","updated":"2024-11-13T04:49:32Z","published":"2024-11-13T04:49:32Z","title":"DEEGITS: Deep Learning based Framework for Measuring Heterogenous\n Traffic State in Challenging Traffic Scenarios","summary":" This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State\nMeasurement), a comprehensive framework that leverages state-of-the-art\nconvolutional neural network (CNN) techniques to accurately and rapidly detect\nvehicles and pedestrians, as well as to measure traffic states in challenging\nscenarios (i.e., congestion, occlusion). In this study, we enhance the training\ndataset through data fusion, enabling simultaneous detection of vehicles and\npedestrians. Image preprocessing and augmentation are subsequently performed to\nimprove the quality and quantity of the dataset. Transfer learning is applied\non the YOLOv8 pretrained model to increase the model's capability to identify a\ndiverse array of vehicles. Optimal hyperparameters are obtained using the Grid\nSearch algorithm, with the Stochastic Gradient Descent (SGD) optimizer\noutperforming other optimizers under these settings. 
Extensive experimentation\nand evaluation demonstrate substantial accuracy within the detection framework,\nwith the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5\non the test set, surpassing previous benchmarks on similar datasets. The\nDeepSORT multi-object tracking algorithm is incorporated to track detected\nvehicles and pedestrians in this study. Finally, the framework is tested to\nmeasure heterogeneous traffic states in mixed traffic conditions. Two locations\nwith differing traffic compositions and congestion levels are selected: one\nmotorized-dominant location with moderate density and one\nnon-motorized-dominant location with higher density. Errors are statistically\ninsignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91\nto 0.97 for heterogeneous traffic flow and speed measurements, respectively.\n","authors":["Muttahirul Islam","Nazmul Haque","Md. Hadiuzzaman"],"pdf_url":"https://arxiv.org/pdf/2411.08335v1.pdf","comment":"Submitted for presentation at the 103 rd Annual Meeting of\n Transportation Research Board and publication in Transportation Research\n Record: Journal of Transportation Research Board"},{"id":"http://arxiv.org/abs/2408.03497v3","updated":"2024-11-13T04:41:31Z","published":"2024-08-07T01:37:10Z","title":"Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and\n Tabnet with SMOTEENN","summary":" Bank credit risk is a significant challenge in modern financial transactions,\nand the ability to identify qualified credit card holders among a large number\nof applicants is crucial for the profitability of a bank'sbank's credit card\nbusiness. In the past, screening applicants'applicants' conditions often\nrequired a significant amount of manual labor, which was time-consuming and\nlabor-intensive. 
Although the accuracy and reliability of previously used ML\nmodels have been continuously improving, the pursuit of more reliable and\npowerful AI intelligent models is undoubtedly the unremitting pursuit by major\nbanks in the financial industry. In this study, we used a dataset of over\n40,000 records provided by a commercial bank as the research object. We\ncompared various dimensionality reduction techniques such as PCA and T-SNE for\npreprocessing high-dimensional datasets and performed in-depth adaptation and\ntuning of distributed models such as LightGBM and XGBoost, as well as deep\nmodels like Tabnet. After a series of research and processing, we obtained\nexcellent research results by combining SMOTEENN with these techniques. The\nexperiments demonstrated that LightGBM combined with PCA and SMOTEENN\ntechniques can assist banks in accurately predicting potential high-quality\ncustomers, showing relatively outstanding performance compared to other models.\n","authors":["Chang Yu","Yixin Jin","Qianwen Xing","Ye Zhang","Shaobo Guo","Shuchen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.03497v3.pdf","comment":"8 pagess on IEEE ICPICS"},{"id":"http://arxiv.org/abs/2411.08334v1","updated":"2024-11-13T04:32:58Z","published":"2024-11-13T04:32:58Z","title":"Enhancing Multimodal Query Representation via Visual Dialogues for\n End-to-End Knowledge Retrieval","summary":" Existing multimodal retrieval systems often rely on disjointed models for\nimage comprehension, such as object detectors and caption generators, leading\nto cumbersome implementations and training processes. To overcome this\nlimitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a\ntext retriever with the ability to understand multimodal queries via dynamic\nmodality interaction. Ret-XKnow leverages a partial convolution mechanism to\nfocus on visual information relevant to the given textual query, thereby\nenhancing multimodal query representations. 
To effectively learn multimodal\ninteraction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset\nautomatically constructed from visual dialogue datasets. Our dataset\nconstruction process ensures that the dialogues are transformed into suitable\ninformation retrieval tasks using a text retriever. We demonstrate that our\napproach not only significantly improves retrieval performance in zero-shot\nsettings but also achieves substantial improvements in fine-tuning scenarios.\nOur code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow.\n","authors":["Yeong-Joon Ju","Ho-Joong Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08324v1","updated":"2024-11-13T04:20:20Z","published":"2024-11-13T04:20:20Z","title":"Are LLMs Prescient? A Continuous Evaluation using Daily News as the\n Oracle","summary":" Many existing evaluation benchmarks for Large Language Models (LLMs) quickly\nbecome outdated due to the emergence of new models and training data. These\nbenchmarks also fall short in assessing how LLM performance changes over time,\nas they consist of static questions without a temporal dimension. To address\nthese limitations, we propose using future event prediction as a continuous\nevaluation method to assess LLMs' temporal generalization and forecasting\nabilities. Our benchmark, Daily Oracle, automatically generates question-answer\n(QA) pairs from daily news, challenging LLMs to predict \"future\" event\noutcomes. Our findings reveal that as pre-training data becomes outdated, LLM\nperformance degrades over time. 
While Retrieval Augmented Generation (RAG) has\nthe potential to enhance prediction accuracy, the performance degradation\npattern persists, highlighting the need for continuous model updates.\n","authors":["Hui Dai","Ryan Teehan","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2411.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08320v1","updated":"2024-11-13T04:06:09Z","published":"2024-11-13T04:06:09Z","title":"Responsible AI in Construction Safety: Systematic Evaluation of Large\n Language Models and Prompt Engineering","summary":" Construction remains one of the most hazardous sectors. Recent advancements\nin AI, particularly Large Language Models (LLMs), offer promising opportunities\nfor enhancing workplace safety. However, responsible integration of LLMs\nrequires systematic evaluation, as deploying them without understanding their\ncapabilities and limitations risks generating inaccurate information, fostering\nmisplaced confidence, and compromising worker safety. This study evaluates the\nperformance of two widely used LLMs, GPT-3.5 and GPT-4o, across three\nstandardized exams administered by the Board of Certified Safety Professionals\n(BCSP). Using 385 questions spanning seven safety knowledge areas, the study\nanalyzes the models' accuracy, consistency, and reliability. Results show that\nboth models consistently exceed the BCSP benchmark, with GPT-4o achieving an\naccuracy rate of 84.6% and GPT-3.5 reaching 73.8%. Both models demonstrate\nstrengths in safety management systems and hazard identification and control,\nbut exhibit weaknesses in science, mathematics, emergency response, and fire\nprevention. An error analysis identifies four primary limitations affecting LLM\nperformance: lack of knowledge, reasoning flaws, memory issues, and calculation\nerrors. 
Our study also highlights the impact of prompt engineering strategies,\nwith variations in accuracy reaching 13.5% for GPT-3.5 and 7.9% for GPT-4o.\nHowever, no single prompt configuration proves universally effective. This\nresearch advances knowledge in three ways: by identifying areas where LLMs can\nsupport safety practices and where human oversight remains essential, by\noffering practical insights into improving LLM implementation through prompt\nengineering, and by providing evidence-based direction for future research and\ndevelopment. These contributions support the responsible integration of AI in\nconstruction safety management toward achieving zero injuries.\n","authors":["Farouq Sammour","Jia Xu","Xi Wang","Mo Hu","Zhenyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08320v1.pdf","comment":"29 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.07521v2","updated":"2024-11-13T04:03:54Z","published":"2024-11-12T03:37:53Z","title":"Fair Summarization: Bridging Quality and Diversity in Extractive\n Summaries","summary":" Fairness in multi-document summarization of user-generated content remains a\ncritical challenge in natural language processing (NLP). Existing summarization\nmethods often fail to ensure equitable representation across different social\ngroups, leading to biased outputs. In this paper, we introduce two novel\nmethods for fair extractive summarization: FairExtract, a clustering-based\napproach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints.\nWe evaluate these methods using Divsumm summarization dataset of White-aligned,\nHispanic, and African-American dialect tweets and compare them against relevant\nbaselines. 
The results obtained using a comprehensive set of summarization\nquality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well\nas a fairness metric F, demonstrate that FairExtract and FairGPT achieve\nsuperior fairness while maintaining competitive summarization quality.\nAdditionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that\nintegrate quality and fairness into a single evaluation framework, offering a\nmore nuanced understanding of the trade-offs between these objectives. This\nwork highlights the importance of fairness in summarization and sets a\nbenchmark for future research in fairness-aware NLP models.\n","authors":["Sina Bagheri Nezhad","Sayan Bandyapadhyay","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.07521v2.pdf","comment":"Accepted at Algorithmic Fairness through the Lens of Metrics and\n Evaluation Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.07976v2","updated":"2024-11-13T03:56:10Z","published":"2024-11-12T17:55:39Z","title":"DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring","summary":" Coronary artery disease (CAD), one of the most common cause of mortality in\nthe world. Coronary artery calcium (CAC) scoring using computed tomography (CT)\nis key for risk assessment to prevent coronary disease. Previous studies on\nrisk assessment and calcification detection in CT scans primarily use\napproaches based on UNET architecture, frequently implemented on pre-built\nmodels. However, these models are limited by the availability of annotated CT\nscans containing CAC and suffering from imbalanced dataset, decreasing\nperformance of CAC segmentation and scoring. In this study, we extend this\napproach by incorporating the self-supervised learning (SSL) technique of DINO\n(self-distillation with no labels) to eliminate limitations of scarce annotated\ndata in CT scans. 
The DINO model's ability to train without requiring CAC area\nannotations enhances its robustness in generating distinct features. The DINO\nmodel is trained on to focus specifically on calcified areas by using labels,\naiming to generate features that effectively capture and highlight key\ncharacteristics. The label-guided DINO (DINO-LG) enhances classification by\ndistinguishing CT slices that contain calcification from those that do not,\nperforming 57% better than the standard DINO model in this task. CAC scoring\nand segmentation tasks are performed by a basic U-NET architecture, fed\nspecifically with CT slices containing calcified areas as identified by the\nDINO-LG model. This targeted identification performed by DINO-LG model improves\nCAC segmentation performance by approximately 10% and significant increase in\nCAC scoring accuracy.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Caner Ozcan"],"pdf_url":"https://arxiv.org/pdf/2411.07976v2.pdf","comment":"Developed by Center for Applied Artificial Intelligence (CAAI),\n University of Kentucky"},{"id":"http://arxiv.org/abs/2411.06106v2","updated":"2024-11-13T03:19:47Z","published":"2024-11-09T08:00:50Z","title":"Personalize to generalize: Towards a universal medical multi-modality\n generalization through personalization","summary":" The differences among medical imaging modalities, driven by distinct\nunderlying principles, pose significant challenges for generalization in\nmulti-modal medical tasks. Beyond modality gaps, individual variations, such as\ndifferences in organ size and metabolic rate, further impede a model's ability\nto generalize effectively across both modalities and diverse populations.\nDespite the importance of personalization, existing approaches to multi-modal\ngeneralization often neglect individual differences, focusing solely on common\nanatomical features. This limitation may result in weakened generalization in\nvarious medical tasks. 
In this paper, we unveil that personalization is\ncritical for multi-modal generalization. Specifically, we propose an approach\nto achieve personalized generalization through approximating the underlying\npersonalized invariant representation ${X}_h$ across various modalities by\nleveraging individual-level constraints and a learnable biological prior. We\nvalidate the feasibility and benefits of learning a personalized ${X}_h$,\nshowing that this representation is highly generalizable and transferable\nacross various multi-modal medical tasks. Extensive experimental results\nconsistently show that the additionally incorporated personalization\nsignificantly improves performance and generalization across diverse scenarios,\nconfirming its effectiveness.\n","authors":["Zhaorui Tan","Xi Yang","Tan Pan","Tianyi Liu","Chen Jiang","Xin Guo","Qiufeng Wang","Anh Nguyen","Yuan Qi","Kaizhu Huang","Yuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.06106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08307v1","updated":"2024-11-13T03:14:10Z","published":"2024-11-13T03:14:10Z","title":"PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for\n Long-Term Expressive Symbolic Music Generation","summary":" Music generation has progressed significantly, especially in the domain of\naudio generation. However, generating symbolic music that is both\nlong-structured and expressive remains a significant challenge. In this paper,\nwe propose PerceiverS (Segmentation and Scale), a novel architecture designed\nto address this issue by leveraging both Effective Segmentation and Multi-Scale\nattention mechanisms. Our approach enhances symbolic music generation by\nsimultaneously learning long-term structural dependencies and short-term\nexpressive details. By combining cross-attention and self-attention in a\nMulti-Scale setting, PerceiverS captures long-range musical structure while\npreserving performance nuances. 
The proposed model, evaluated on datasets like\nMaestro, demonstrates improvements in generating coherent and diverse music\nwith both structural consistency and expressive variation. The project demos\nand the generated music samples can be accessed through the link:\nhttps://perceivers.github.io.\n","authors":["Yungang Yi","Weihua Li","Matthew Kuo","Quan Bai"],"pdf_url":"https://arxiv.org/pdf/2411.08307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08302v1","updated":"2024-11-13T02:45:21Z","published":"2024-11-13T02:45:21Z","title":"R3HF: Reward Redistribution for Enhancing Reinforcement Learning from\n Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) provides a paradigm for\naligning large language models (LLMs) with human preferences. This involves the\ninitial training of a reward model based on pairwise human feedback. The reward\nmodel is subsequently utilized in reinforcement learning to assess the scores\nof each generated sentence as a whole, further guiding the optimization of\nLLMs. However, current approaches have a significant shortcoming: \\emph{They\nallocate a single, sparse, and delayed reward to an entire sequence of output}.\nThis may overlook some significant individual contributions of each token\ntowards the desired outcome. To overcome this limitation, our paper proposes a\nnovel reward redistribution method called R3HF, which facilitates a more\nfine-grained, token-level reward allocation. Specifically, our method treats\nthe reward prediction task of the reward model as a regression problem. As a\nresult, the redistributed rewards are computed by evaluating the specific\ncontribution of each token to the reward model's output. This detailed approach\nimproves the model's understanding of language nuances, leading to more precise\nenhancements in its performance. 
Our method is crafted to integrate seamlessly\nwith most current techniques while incurring minimal computational costs.\nThrough comprehensive experiments across diverse datasets and tasks, we have\nverified the effectiveness and superiority of our approach.\n","authors":["Jiahui Li","Tai-wei Chang","Fengda Zhang","Kun Kuang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08299v1","updated":"2024-11-13T02:41:02Z","published":"2024-11-13T02:41:02Z","title":"DNN Task Assignment in UAV Networks: A Generative AI Enhanced\n Multi-Agent Reinforcement Learning Approach","summary":" Unmanned Aerial Vehicles (UAVs) possess high mobility and flexible deployment\ncapabilities, prompting the development of UAVs for various application\nscenarios within the Internet of Things (IoT). The unique capabilities of UAVs\ngive rise to increasingly critical and complex tasks in uncertain and\npotentially harsh environments. The substantial amount of data generated from\nthese applications necessitates processing and analysis through deep neural\nnetworks (DNNs). However, UAVs encounter challenges due to their limited\ncomputing resources when managing DNN models. This paper presents a joint\napproach that combines multiple-agent reinforcement learning (MARL) and\ngenerative diffusion models (GDM) for assigning DNN tasks to a UAV swarm, aimed\nat reducing latency from task capture to result output. To address these\nchallenges, we first consider the task size of the target area to be inspected\nand the shortest flying path as optimization constraints, employing a greedy\nalgorithm to resolve the subproblem with a focus on minimizing the UAV's flying\npath and the overall system cost. In the second stage, we introduce a novel DNN\ntask assignment algorithm, termed GDM-MADDPG, which utilizes the reverse\ndenoising process of GDM to replace the actor network in multi-agent deep\ndeterministic policy gradient (MADDPG). 
This approach generates specific DNN\ntask assignment actions based on agents' observations in a dynamic environment.\nSimulation results indicate that our algorithm performs favorably compared to\nbenchmarks in terms of path planning, Age of Information (AoI), energy\nconsumption, and task load balancing.\n","authors":["Xin Tang","Qian Chen","Wenjie Weng","Binhan Liao","Jiacheng Wang","Xianbin Cao","Xiaohuan Li"],"pdf_url":"https://arxiv.org/pdf/2411.08299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08297v1","updated":"2024-11-13T02:32:38Z","published":"2024-11-13T02:32:38Z","title":"TowerDebias: A Novel Debiasing Method based on the Tower Property","summary":" Decision-making processes have increasingly come to rely on sophisticated\nmachine learning tools, raising concerns about the fairness of their\npredictions with respect to any sensitive groups. The widespread use of\ncommercial black-box machine learning models necessitates careful consideration\nof their legal and ethical implications on consumers. In situations where users\nhave access to these \"black-box\" models, a key question emerges: how can we\nmitigate or eliminate the influence of sensitive attributes, such as race or\ngender? We propose towerDebias (tDB), a novel approach designed to reduce the\ninfluence of sensitive variables in predictions made by black-box models. Using\nthe Tower Property from probability theory, tDB aims to improve prediction\nfairness during the post-processing stage in a manner amenable to the\nFairness-Utility Tradeoff. This method is highly flexible, requiring no prior\nknowledge of the original model's internal structure, and can be extended to a\nrange of different applications. 
We provide a formal improvement theorem for\ntDB and demonstrate its effectiveness in both regression and classification\ntasks, underscoring its impact on the fairness-utility tradeoff.\n","authors":["Norman Matloff","Aditya Mittal"],"pdf_url":"https://arxiv.org/pdf/2411.08297v1.pdf","comment":"To be submitted to a journal soon"},{"id":"http://arxiv.org/abs/2308.07541v2","updated":"2024-11-13T02:17:55Z","published":"2023-08-15T03:01:41Z","title":"On-demand Cold Start Frequency Reduction with Off-Policy Reinforcement\n Learning in Serverless Computing","summary":" Function-as-a-Service (FaaS) is a cloud computing paradigm offering an\nevent-driven execution model to applications. It features serverless attributes\nby eliminating resource management responsibilities from developers, and offers\ntransparent and on-demand scalability of applications. To provide seamless\non-demand scalability, new function instances are prepared to serve the\nincoming workload in the absence or unavailability of function instances.\nHowever, FaaS platforms are known to suffer from cold starts, where this\nfunction provisioning process introduces a non-negligible delay in function\nresponse and reduces the end-user experience. Therefore, the presented work\nfocuses on reducing the frequent, on-demand cold starts on the platform by\nusing Reinforcement Learning(RL). The proposed approach uses model-free\nQ-learning that consider function metrics such as CPU utilization, existing\nfunction instances, and response failure rate, to proactively initialize\nfunctions, in advance, based on the expected demand. The proposed solution is\nimplemented on Kubeless and evaluated using an open-source function invocation\ntrace applied to a matrix multiplication function. 
The evaluation results\ndemonstrate a favourable performance of the RL-based agent when compared to\nKubeless' default policy and a function keep-alive policy by improving\nthroughput by up to 8.81% and reducing computation load and resource wastage by\nup to 55% and 37%, respectively, that is a direct outcome of reduced cold\nstarts.\n","authors":["Siddharth Agarwal","Maria A. Rodriguez","Rajkumar Buyya"],"pdf_url":"https://arxiv.org/pdf/2308.07541v2.pdf","comment":"13 figures, 24 pages, 3 tables"},{"id":"http://arxiv.org/abs/2411.08290v1","updated":"2024-11-13T02:17:03Z","published":"2024-11-13T02:17:03Z","title":"RESOLVE: Relational Reasoning with Symbolic and Object-Level Features\n Using Vector Symbolic Processing","summary":" Modern transformer-based encoder-decoder architectures struggle with\nreasoning tasks due to their inability to effectively extract relational\ninformation between input objects (data/tokens). Recent work introduced the\nAbstractor module, embedded between transformer layers, to address this gap.\nHowever, the Abstractor layer while excelling at capturing relational\ninformation (pure relational reasoning), faces challenges in tasks that require\nboth object and relational-level reasoning (partial relational reasoning). To\naddress this, we propose RESOLVE, a neuro-vector symbolic architecture that\ncombines object-level features with relational representations in\nhigh-dimensional spaces, using fast and efficient operations such as bundling\n(summation) and binding (Hadamard product) allowing both object-level features\nand relational representations to coexist within the same structure without\ninterfering with one another. RESOLVE is driven by a novel attention mechanism\nthat operates in a bipolar high dimensional space, allowing fast attention\nscore computation compared to the state-of-the-art. By leveraging this design,\nthe model achieves both low compute latency and memory efficiency. 
RESOLVE also\noffers better generalizability while achieving higher accuracy in purely\nrelational reasoning tasks such as sorting as well as partial relational\nreasoning tasks such as math problem-solving compared to state-of-the-art\nmethods.\n","authors":["Mohamed Mejri","Chandramouli Amarnath","Abhijit Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2411.08290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16744v2","updated":"2024-11-13T02:04:10Z","published":"2024-01-30T04:48:43Z","title":"ShaRP: A Novel Feature Importance Framework for Ranking","summary":" Algorithmic decisions in critical domains such as hiring, college admissions,\nand lending are often based on rankings. Because of the impact these decisions\nhave on individuals, organizations, and population groups, there is a need to\nunderstand them: to help individuals improve their position in a ranking,\ndesign better ranking procedures, and check whether a procedure is legally\ncompliant. In this paper, we present ShaRP -- Shapley for Rankings and\nPreferences -- a framework that explains the contributions of features to\ndifferent aspects of a ranked outcome and is based on Shapley values. Using\nShaRP, we show that even when the scoring function used by an algorithmic\nranker is known and linear, the feature weights do not correspond to their\nShapley value contribution. The contributions instead depend on the feature\ndistributions and the subtle local interactions between the scoring features.\n ShaRP builds on the Quantitative Input Influence framework to compute the\ncontributions of features for multiple -- ranking specific -- Quantities of\nInterest, including score, rank, pair-wise preference, and top-k. We show the\nresults of an extensive experimental validation of ShaRP using real and\nsynthetic datasets. 
We demonstrate that feature importance can be computed\nefficiently, and that ShaRP compares favorably to several prior local feature\nimportance methods, in terms of both generality and quality of explanations.\nAmong our results, we highlight a case study on the CS Rankings dataset.\nContrary to expectation, we find that a strong track record in Systems research\nis much more important than AI research for placing a CS department among the\ntop-10%. ShaRP is available at latex for matplotlib\ntogetherhttps://github.com/DataResponsibly/ShaRP.\n","authors":["Venetia Pliatsika","Joao Fonseca","Kateryna Akhynko","Ivan Shevchenko","Julia Stoyanovich"],"pdf_url":"https://arxiv.org/pdf/2401.16744v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2411.08286v1","updated":"2024-11-13T02:02:52Z","published":"2024-11-13T02:02:52Z","title":"Hashing for Protein Structure Similarity Search","summary":" Protein structure similarity search (PSSS), which tries to search proteins\nwith similar structures, plays a crucial role across diverse domains from drug\ndesign to protein function prediction and molecular evolution. Traditional\nalignment-based PSSS methods, which directly calculate alignment on the protein\nstructures, are highly time-consuming with high memory cost. Recently,\nalignment-free methods, which represent protein structures as fixed-length\nreal-valued vectors, are proposed for PSSS. Although these methods have lower\ntime and memory cost than alignment-based methods, their time and memory cost\nis still too high for large-scale PSSS, and their accuracy is unsatisfactory.\nIn this paper, we propose a novel method, called\n$\\underline{\\text{p}}$r$\\underline{\\text{o}}$tein\n$\\underline{\\text{s}}$tructure $\\underline{\\text{h}}$ashing (POSH), for PSSS.\nPOSH learns a binary vector representation for each protein structure, which\ncan dramatically reduce the time and memory cost for PSSS compared with\nreal-valued vector representation based methods. 
Furthermore, in POSH we also\npropose expressive hand-crafted features and a structure encoder to well model\nboth node and edge interactions in proteins. Experimental results on real\ndatasets show that POSH can outperform other methods to achieve\nstate-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more\nthan six times and speed improvement of more than four times, compared with\nother methods.\n","authors":["Jin Han","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2411.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14798v2","updated":"2024-11-13T01:36:33Z","published":"2024-06-21T00:16:55Z","title":"Probabilistic Emulation of a Global Climate Model with Spherical\n DYffusion","summary":" Data-driven deep learning models are transforming global weather forecasting.\nIt is an open question if this success can extend to climate modeling, where\nthe complexity of the data and long inference rollouts pose significant\nchallenges. Here, we present the first conditional generative model that\nproduces accurate and physically consistent global climate ensemble simulations\nby emulating a coarse version of the United States' primary operational global\nforecast model, FV3GFS. Our model integrates the dynamics-informed diffusion\nframework (DYffusion) with the Spherical Fourier Neural Operator (SFNO)\narchitecture, enabling stable 100-year simulations at 6-hourly timesteps while\nmaintaining low computational overhead compared to single-step deterministic\nbaselines. The model achieves near gold-standard performance for climate model\nemulation, outperforming existing approaches and demonstrating promising\nensemble skill. This work represents a significant advance towards efficient,\ndata-driven climate simulations that can enhance our understanding of the\nclimate system and inform adaptation strategies.\n","authors":["Salva Rühling Cachay","Brian Henn","Oliver Watt-Meyer","Christopher S. 
Bretherton","Rose Yu"],"pdf_url":"https://arxiv.org/pdf/2406.14798v2.pdf","comment":"NeurIPS 2024; Code is available at\n https://github.com/Rose-STL-Lab/spherical-dyffusion"},{"id":"http://arxiv.org/abs/2411.08278v1","updated":"2024-11-13T01:33:05Z","published":"2024-11-13T01:33:05Z","title":"Knowledge Bases in Support of Large Language Models for Processing Web\n News","summary":" Large Language Models (LLMs) have received considerable interest in wide\napplications lately. During pre-training via massive datasets, such a model\nimplicitly memorizes the factual knowledge of trained datasets in its hidden\nparameters. However, knowledge held implicitly in parameters often makes its\nuse by downstream applications ineffective due to the lack of common-sense\nreasoning. In this article, we introduce a general framework that permits to\nbuild knowledge bases with an aid of LLMs, tailored for processing Web news.\nThe framework applies a rule-based News Information Extractor (NewsIE) to news\nitems for extracting their relational tuples, referred to as knowledge bases,\nwhich are then graph-convoluted with the implicit knowledge facts of news items\nobtained by LLMs, for their classification. It involves two lightweight\ncomponents: 1) NewsIE: for extracting the structural information of every news\nitem, in the form of relational tuples; 2) BERTGraph: for graph convoluting the\nimplicit knowledge facts with relational tuples extracted by NewsIE. 
We have\nevaluated our framework under different news-related datasets for news category\nclassification, with promising experimental results.\n","authors":["Yihe Zhang","Nabin Pakka","Nian-feng Tzeng"],"pdf_url":"https://arxiv.org/pdf/2411.08278v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.04872v2","updated":"2024-11-13T01:06:35Z","published":"2024-11-07T17:07:35Z","title":"FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning\n in AI","summary":" We introduce FrontierMath, a benchmark of hundreds of original, exceptionally\nchallenging mathematics problems crafted and vetted by expert mathematicians.\nThe questions cover most major branches of modern mathematics -- from\ncomputationally intensive problems in number theory and real analysis to\nabstract questions in algebraic geometry and category theory. Solving a typical\nproblem requires multiple hours of effort from a researcher in the relevant\nbranch of mathematics, and for the upper end questions, multiple days.\nFrontierMath uses new, unpublished problems and automated verification to\nreliably evaluate models while minimizing risk of data contamination. Current\nstate-of-the-art AI models solve under 2% of problems, revealing a vast gap\nbetween AI capabilities and the prowess of the mathematical community. 
As AI\nsystems advance toward expert-level mathematical abilities, FrontierMath offers\na rigorous testbed that quantifies their progress.\n","authors":["Elliot Glazer","Ege Erdil","Tamay Besiroglu","Diego Chicharro","Evan Chen","Alex Gunning","Caroline Falkman Olsson","Jean-Stanislas Denain","Anson Ho","Emily de Oliveira Santos","Olli Järviniemi","Matthew Barnett","Robert Sandler","Matej Vrzala","Jaime Sevilla","Qiuyu Ren","Elizabeth Pratt","Lionel Levine","Grant Barkley","Natalie Stewart","Bogdan Grechuk","Tetiana Grechuk","Shreepranav Varma Enugandla","Mark Wildon"],"pdf_url":"https://arxiv.org/pdf/2411.04872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09995v2","updated":"2024-11-13T00:41:01Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. 
Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v2.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2409.18164v2","updated":"2024-11-13T00:15:46Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. 
We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Constantin Adam","Abdulhamid Adebayo","Sungeun An","Yuan Chi Chang","Xuan-Hong Dang","Nirmit Desai","Michele Dolfi","Hajar Emami-Gohari","Revital Eres","Takuya Goto","Dhiraj Joshi","Yan Koyfman","Mohammad Nassar","Hima Patel","Paramesvaran Selvam","Yousaf Shah","Saptha Surendran","Daiki Tsuzuku","Petros Zerfos","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.08257v1","updated":"2024-11-13T00:14:09Z","published":"2024-11-13T00:14:09Z","title":"GPTree: Towards Explainable Decision-Making via LLM-powered Decision\n Trees","summary":" Traditional decision tree algorithms are explainable but struggle with\nnon-linear, high-dimensional data, limiting its applicability in complex\ndecision-making. Neural networks excel at capturing complex patterns but\nsacrifice explainability in the process. In this work, we present GPTree, a\nnovel framework combining explainability of decision trees with the advanced\nreasoning capabilities of LLMs. GPTree eliminates the need for feature\nengineering and prompt chaining, requiring only a task-specific prompt and\nleveraging a tree-based structure to dynamically split samples. We also\nintroduce an expert-in-the-loop feedback mechanism to further enhance\nperformance by enabling human intervention to refine and rebuild decision\npaths, emphasizing the harmony between human expertise and machine\nintelligence. 
Our decision tree achieved a 7.8% precision rate for identifying\n\"unicorn\" startups at the inception stage of a startup, surpassing gpt-4o with\nfew-shot learning as well as the best human decision-makers (3.1% to 5.6%).\n","authors":["Sichao Xiong","Yigit Ihlamur","Fuat Alican","Aaron Ontoyin Yin"],"pdf_url":"https://arxiv.org/pdf/2411.08257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08254v1","updated":"2024-11-13T00:07:32Z","published":"2024-11-13T00:07:32Z","title":"VALTEST: Automated Validation of Language Model Generated Test Cases","summary":" Large Language Models (LLMs) have demonstrated significant potential in\nautomating software testing, specifically in generating unit test cases.\nHowever, the validation of LLM-generated test cases remains a challenge,\nparticularly when the ground truth is unavailable. This paper introduces\nVALTEST, a novel framework designed to automatically validate test cases\ngenerated by LLMs by leveraging token probabilities. We evaluate VALTEST using\nnine test suites generated from three datasets (HumanEval, MBPP, and LeetCode)\nacross three LLMs (GPT-4o, GPT-3.5-turbo, and LLama3.1 8b). By extracting\nstatistical features from token probabilities, we train a machine learning\nmodel to predict test case validity. VALTEST increases the validity rate of\ntest cases by 6.2% to 24%, depending on the dataset and LLM. Our results\nsuggest that token probabilities are reliable indicators for distinguishing\nbetween valid and invalid test cases, which provides a robust solution for\nimproving the correctness of LLM-generated test cases in software testing. 
In\naddition, we found that replacing the identified invalid test cases by VALTEST,\nusing a Chain-of-Thought prompting results in a more effective test suite while\nkeeping the high validity rates.\n","authors":["Hamed Taherkhani","Hadi Hemmati"],"pdf_url":"https://arxiv.org/pdf/2411.08254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09089v1","updated":"2024-11-13T23:43:01Z","published":"2024-11-13T23:43:01Z","title":"Set-Based Retrograde Analysis: Precomputing the Solution to 24-card\n Bridge Double Dummy Deals","summary":" Retrograde analysis is used in game-playing programs to solve states at the\nend of a game, working backwards toward the start of the game. The algorithm\niterates through and computes the perfect-play value for as many states as\nresources allow. We introduce setrograde analysis which achieves the same\nresults by operating on sets of states that have the same game value. The\nalgorithm is demonstrated by computing exact solutions for Bridge double dummy\ncard-play. For deals with 24 cards remaining to be played ($10^{27}$ states,\nwhich can be reduced to $10^{15}$ states using preexisting techniques), we\nstrongly solve all deals. The setrograde algorithm performs a factor of $10^3$\nfewer search operations than a standard retrograde algorithm, producing a\ndatabase with a factor of $10^4$ fewer entries. For applicable domains, this\nallows retrograde searching to reach unprecedented search depths.\n","authors":["Isaac Stone","Nathan R. Sturtevant","Jonathan Schaeffer"],"pdf_url":"https://arxiv.org/pdf/2411.09089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04883v6","updated":"2024-11-13T23:34:19Z","published":"2022-08-09T16:25:49Z","title":"Neural-Rendezvous: Provably Robust Guidance and Control to Encounter\n Interstellar Objects","summary":" Interstellar objects (ISOs) are likely representatives of primitive materials\ninvaluable in understanding exoplanetary star systems. 
Due to their poorly\nconstrained orbits with generally high inclinations and relative velocities,\nhowever, exploring ISOs with conventional human-in-the-loop approaches is\nsignificantly challenging. This paper presents Neural-Rendezvous -- a deep\nlearning-based guidance and control framework for encountering fast-moving\nobjects, including ISOs, robustly, accurately, and autonomously in real time.\nIt uses pointwise minimum norm tracking control on top of a guidance policy\nmodeled by a spectrally-normalized deep neural network, where its\nhyperparameters are tuned with a loss function directly penalizing the MPC\nstate trajectory tracking error. We show that Neural-Rendezvous provides a high\nprobability exponential bound on the expected spacecraft delivery error, the\nproof of which leverages stochastic incremental stability analysis. In\nparticular, it is used to construct a non-negative function with a\nsupermartingale property, explicitly accounting for the ISO state uncertainty\nand the local nature of nonlinear state estimation guarantees. In numerical\nsimulations, Neural-Rendezvous is demonstrated to satisfy the expected error\nbound for 100 ISO candidates. 
This performance is also empirically validated\nusing our spacecraft simulator and in high-conflict and distributed UAV swarm\nreconfiguration with up to 20 UAVs.\n","authors":["Hiroyasu Tsukamoto","Soon-Jo Chung","Yashwanth Kumar Nakka","Benjamin Donitz","Declan Mages","Michel Ingham"],"pdf_url":"https://arxiv.org/pdf/2208.04883v6.pdf","comment":"Preprint Version, Accepted: October, 2024 (One-minute YouTube\n summary: https://youtu.be/q3e0LYS2IYQ, DOI:\n https://doi.org/10.2514/1.G007671)"},{"id":"http://arxiv.org/abs/2411.09077v1","updated":"2024-11-13T23:09:53Z","published":"2024-11-13T23:09:53Z","title":"Drone Detection using Deep Neural Networks Trained on Pure Synthetic\n Data","summary":" Drone detection has benefited from improvements in deep neural networks, but\nlike many other applications, suffers from the availability of accurate data\nfor training. Synthetic data provides a potential for low-cost data generation\nand has been shown to improve data availability and quality. However, models\ntrained on synthetic datasets need to prove their ability to perform on\nreal-world data, known as the problem of sim-to-real transferability. Here, we\npresent a drone detection Faster-RCNN model trained on a purely synthetic\ndataset that transfers to real-world data. We found that it achieves an AP_50\nof 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones -\ncompared with 97.8% for an equivalent model trained on real-world data. Our\nresults show that using synthetic data for drone detection has the potential to\nreduce data collection costs and improve labelling quality. These findings\ncould be a starting point for more elaborate synthetic drone datasets. For\nexample, realistic recreations of specific scenarios could de-risk the dataset\ngeneration of safety-critical applications such as the detection of drones at\nairports. 
Further, synthetic data may enable reliable drone detection systems,\nwhich could benefit other areas, such as unmanned traffic management systems.\nThe code is available\nhttps://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the\ndatasets\nhttps://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection.\n","authors":["Mariusz Wisniewski","Zeeshan A. Rana","Ivan Petrunin","Alan Holt","Stephen Harman"],"pdf_url":"https://arxiv.org/pdf/2411.09077v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.07870v2","updated":"2024-11-13T23:02:41Z","published":"2024-11-12T15:26:17Z","title":"Trustful LLMs: Customizing and Grounding Text Generation with Knowledge\n Bases and Dual Decoders","summary":" Although people are impressed by the content generation skills of large\nlanguage models, the use of LLMs, such as ChatGPT, is limited by the domain\ngrounding of the content. The correctness and groundedness of the generated\ncontent need to be based on a verified context, such as results from\nRetrieval-Augmented Generation (RAG). One important issue when adapting LLMs to\na customized domain is that the generated responses are often incomplete, or\nthe additions are not verified and may even be hallucinated. Prior studies on\nhallucination detection have focused on evaluation metrics, which are not\neasily adaptable to dynamic domains and can be vulnerable to attacks like\njail-breaking. 
In this work, we propose 1) a post-processing algorithm that\nleverages knowledge triplets in RAG context to correct hallucinations and 2) a\ndual-decoder model that fuses RAG context to guide the generation process.\n","authors":["Xiaofeng Zhu","Jaya Krishna Mandivarapu"],"pdf_url":"https://arxiv.org/pdf/2411.07870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09073v1","updated":"2024-11-13T22:56:00Z","published":"2024-11-13T22:56:00Z","title":"Code-mixed LLM: Improve Large Language Models' Capability to Handle\n Code-Mixing through Reinforcement Learning from AI Feedback","summary":" Code-mixing(CM) or code-switching(CSW) refers to the juxtaposition of\nlinguistic units from two or more languages during the conversation or\nsometimes even a single utterance. Code-mixing introduces unique challenges in\ndaily life, such as syntactic mismatches and semantic blending, that are rarely\nencountered in monolingual settings. Large language models (LLMs) have\nrevolutionized the field of natural language processing (NLP) by offering\nunprecedented capabilities in understanding human languages. However, the\neffectiveness of current state-of-the-art multilingual LLMs has not yet been\nfully explored in the CM scenario. To fill this gap, we first benchmark the\nperformance of multilingual LLMs on various code-mixing NLP tasks. Then we\npropose to improve the multilingual LLMs' ability to understand code-mixing\nthrough reinforcement learning from human feedback (RLHF) and code-mixed\nmachine translation tasks. Given the high-cost and time-consuming preference\nlabeling procedure, we improve this by utilizing LLMs as annotators to perform\nthe reinforcement learning from AI feedback (RLAIF). 
The experiments show the\neffectiveness of the proposed method.\n","authors":["Wenbo Zhang","Aditya Majumdar","Amulya Yadav"],"pdf_url":"https://arxiv.org/pdf/2411.09073v1.pdf","comment":"initial version: 5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09068v1","updated":"2024-11-13T22:49:16Z","published":"2024-11-13T22:49:16Z","title":"Liner Shipping Network Design with Reinforcement Learning","summary":" This paper proposes a novel reinforcement learning framework to address the\nLiner Shipping Network Design Problem (LSNDP), a challenging combinatorial\noptimization problem focused on designing cost-efficient maritime shipping\nroutes. Traditional methods for solving the LSNDP typically involve decomposing\nthe problem into sub-problems, such as network design and multi-commodity flow,\nwhich are then tackled using approximate heuristics or large neighborhood\nsearch (LNS) techniques. In contrast, our approach employs a model-free\nreinforcement learning algorithm on the network design, integrated with a\nheuristic-based multi-commodity flow solver, to produce competitive results on\nthe publicly available LINERLIB benchmark. Additionally, our method also\ndemonstrates generalization capabilities by producing competitive solutions on\nthe benchmark instances after training on perturbed instances.\n","authors":["Utsav Dutta","Yifan Lin","Zhaoyang Larry Jin"],"pdf_url":"https://arxiv.org/pdf/2411.09068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09065v1","updated":"2024-11-13T22:45:52Z","published":"2024-11-13T22:45:52Z","title":"Language-Model Prior Overcomes Cold-Start Items","summary":" The growth of recommender systems (RecSys) is driven by digitization and the\nneed for personalized content in areas such as e-commerce and video streaming.\nThe content in these systems often changes rapidly and therefore they\nconstantly face the ongoing cold-start problem, where new items lack\ninteraction data and are hard to value. 
Existing solutions for the cold-start\nproblem, such as content-based recommenders and hybrid methods, leverage item\nmetadata to determine item similarities. The main challenge with these methods\nis their reliance on structured and informative metadata to capture detailed\nitem similarities, which may not always be available. This paper introduces a\nnovel approach for cold-start item recommendation that utilizes the language\nmodel (LM) to estimate item similarities, which are further integrated as a\nBayesian prior with classic recommender systems. This approach is generic and\nable to boost the performance of various recommenders. Specifically, our\nexperiments integrate it with both sequential and collaborative filtering-based\nrecommender and evaluate it on two real-world datasets, demonstrating the\nenhanced performance of the proposed approach.\n","authors":["Shiyu Wang","Hao Ding","Yupeng Gu","Sergul Aydore","Kousha Kalantari","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2411.09065v1.pdf","comment":"This paper is dedicated to cold-start item recommendation using\n language-model priors"},{"id":"http://arxiv.org/abs/2411.09062v1","updated":"2024-11-13T22:43:15Z","published":"2024-11-13T22:43:15Z","title":"Multimodal Object Detection using Depth and Image Data for Manufacturing\n Parts","summary":" Manufacturing requires reliable object detection methods for precise picking\nand handling of diverse types of manufacturing parts and components.\nTraditional object detection methods utilize either only 2D images from cameras\nor 3D data from lidars or similar 3D sensors. However, each of these sensors\nhave weaknesses and limitations. Cameras do not have depth perception and 3D\nsensors typically do not carry color information. 
These weaknesses can\nundermine the reliability and robustness of industrial manufacturing systems.\nTo address these challenges, this work proposes a multi-sensor system combining\nan red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are\ncalibrated for precise alignment of the multimodal data captured from the two\nhardware devices. A novel multimodal object detection method is developed to\nprocess both RGB and depth data. This object detector is based on the Faster\nR-CNN baseline that was originally designed to process only camera images. The\nresults show that the multimodal model significantly outperforms the depth-only\nand RGB-only baselines on established object detection metrics. More\nspecifically, the multimodal model improves mAP by 13% and raises Mean\nPrecision by 11.8% in comparison to the RGB-only baseline. Compared to the\ndepth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%.\nHence, this method facilitates more reliable and robust object detection in\nservice to smart manufacturing applications.\n","authors":["Nazanin Mahjourian","Vinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09055v1","updated":"2024-11-13T22:28:05Z","published":"2024-11-13T22:28:05Z","title":"SAFELOC: Overcoming Data Poisoning Attacks in Heterogeneous Federated\n Machine Learning for Indoor Localization","summary":" Machine learning (ML) based indoor localization solutions are critical for\nmany emerging applications, yet their efficacy is often compromised by\nhardware/software variations across mobile devices (i.e., device heterogeneity)\nand the threat of ML data poisoning attacks. Conventional methods aimed at\ncountering these challenges show limited resilience to the uncertainties\ncreated by these phenomena. 
In response, in this paper, we introduce SAFELOC, a\nnovel framework that not only minimizes localization errors under these\nchallenging conditions but also ensures model compactness for efficient mobile\ndevice deployment. Our framework targets a distributed and co-operative\nlearning environment that uses federated learning (FL) to preserve user data\nprivacy and assumes heterogeneous mobile devices carried by users (just like in\nmost real-world scenarios). Within this heterogeneous FL context, SAFELOC\nintroduces a novel fused neural network architecture that performs data\npoisoning detection and localization, with a low model footprint. Additionally,\na dynamic saliency map-based aggregation strategy is designed to adapt based on\nthe severity of the detected data poisoning scenario. Experimental evaluations\ndemonstrate that SAFELOC achieves improvements of up to 5.9x in mean\nlocalization error, 7.8x in worst-case localization error, and a 2.1x reduction\nin model inference latency compared to state-of-the-art indoor localization\nframeworks, across diverse building floorplans, mobile devices, and ML data\npoisoning attack scenarios.\n","authors":["Akhil Singampalli","Danish Gufran","Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2411.09055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09050v1","updated":"2024-11-13T22:10:07Z","published":"2024-11-13T22:10:07Z","title":"The Systems Engineering Approach in Times of Large Language Models","summary":" Using Large Language Models (LLMs) to address critical societal problems\nrequires adopting this novel technology into socio-technical systems. However,\nthe complexity of such systems and the nature of LLMs challenge such a vision.\nIt is unlikely that the solution to such challenges will come from the\nArtificial Intelligence (AI) community itself. 
Instead, the Systems Engineering\napproach is better equipped to facilitate the adoption of LLMs by prioritising\nthe problems and their context before any other aspects. This paper introduces\nthe challenges LLMs generate and surveys systems research efforts for\nengineering AI-based systems. We reveal how the systems engineering principles\nhave supported addressing similar issues to the ones LLMs pose and discuss our\nfindings to provide future directions for adopting LLMs.\n","authors":["Christian Cabrera","Viviana Bastidas","Jennifer Schooling","Neil D. Lawrence"],"pdf_url":"https://arxiv.org/pdf/2411.09050v1.pdf","comment":"This paper has been accepted for the upcoming 58th Hawaii\n International Conference on System Sciences (HICSS-58)"},{"id":"http://arxiv.org/abs/2410.06865v2","updated":"2024-11-13T20:41:14Z","published":"2024-10-09T13:24:06Z","title":"Students' Perceptions and Use of Generative AI Tools for Programming\n Across Different Computing Courses","summary":" Investigation of students' perceptions and opinions on the use of generative\nartificial intelligence (GenAI) in education is a topic gaining much interest.\nStudies addressing this are typically conducted with large heterogeneous\ngroups, at one moment in time. However, how students perceive and use GenAI\ntools can potentially depend on many factors, including their background\nknowledge, familiarity with the tools, and the learning goals and policies of\nthe courses they are taking.\n In this study we explore how students following computing courses use GenAI\nfor programming-related tasks across different programs and courses: Bachelor\nand Master, in courses in which learning programming is the learning goal,\ncourses that require programming as a means to achieve another goal, and in\ncourses in which programming is optional, but can be useful. 
We are also\ninterested in changes over time, since GenAI capabilities are changing at a\nfast pace, and users are adopting GenAI increasingly.\n We conducted three consecutive surveys (fall `23, winter `23, and spring `24)\namong students of all computing programs of a large European research\nuniversity. We asked questions on the use in education, ethics, and job\nprospects, and we included specific questions on the (dis)allowed use of GenAI\ntools in the courses they were taking at the time.\n We received 264 responses, which we quantitatively and qualitatively\nanalyzed, to find out how students have employed GenAI tools across 59\ndifferent computing courses, and whether the opinion of an average student\nabout these tools evolves over time. Our study contributes to the emerging\ndiscussion of how to differentiate GenAI use across different courses, and how\nto align its use with the learning goals of a computing course.\n","authors":["Hieke Keuning","Isaac Alpizar-Chacon","Ioanna Lykourentzou","Lauren Beehler","Christian Köppe","Imke de Jong","Sergey Sosnovsky"],"pdf_url":"https://arxiv.org/pdf/2410.06865v2.pdf","comment":"Accepted to Koli Calling 24. Numbers in Table 1, row 1 updated"},{"id":"http://arxiv.org/abs/2406.15955v2","updated":"2024-11-13T20:30:13Z","published":"2024-06-22T22:43:10Z","title":"Beyond the Doors of Perception: Vision Transformers Represent Relations\n Between Objects","summary":" Though vision transformers (ViTs) have achieved state-of-the-art performance\nin a variety of settings, they exhibit surprising failures when performing\ntasks involving visual relations. This begs the question: how do ViTs attempt\nto perform tasks that require computing visual relations between objects? Prior\nefforts to interpret ViTs tend to focus on characterizing relevant low-level\nvisual features. 
In contrast, we adopt methods from mechanistic\ninterpretability to study the higher-level visual algorithms that ViTs use to\nperform abstract visual reasoning. We present a case study of a fundamental,\nyet surprisingly difficult, relational reasoning task: judging whether two\nvisual entities are the same or different. We find that pretrained ViTs\nfine-tuned on this task often exhibit two qualitatively different stages of\nprocessing despite having no obvious inductive biases to do so: 1) a perceptual\nstage wherein local object features are extracted and stored in a disentangled\nrepresentation, and 2) a relational stage wherein object representations are\ncompared. In the second stage, we find evidence that ViTs can learn to\nrepresent somewhat abstract visual relations, a capability that has long been\nconsidered out of reach for artificial neural networks. Finally, we demonstrate\nthat failures at either stage can prevent a model from learning a generalizable\nsolution to our fairly simple tasks. By understanding ViTs in terms of discrete\nprocessing stages, one can more precisely diagnose and rectify shortcomings of\nexisting and future models.\n","authors":["Michael A. Lepori","Alexa R. Tartaglini","Wai Keen Vong","Thomas Serre","Brenden M. Lake","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2406.15955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09001v1","updated":"2024-11-13T20:02:17Z","published":"2024-11-13T20:02:17Z","title":"Virtual teaching assistant for undergraduate students using natural\n language processing & deep learning","summary":" Online education's popularity has been continuously increasing over the past\nfew years. Many universities were forced to switch to online education as a\nresult of COVID-19. In many cases, even after more than two years of online\ninstruction, colleges were unable to resume their traditional classroom\nprograms. 
A growing number of institutions are considering blended learning\nwith some parts in-person and the rest of the learning taking place online.\nNevertheless, many online education systems are inefficient, and this results\nin a poor rate of student retention. In this paper, we are offering a primary\ndataset, the initial implementation of a virtual teaching assistant named\nVTA-bot, and its system architecture. Our primary implementation of the\nsuggested system consists of a chatbot that can be queried about the content\nand topics of the fundamental python programming language course. Students in\ntheir first year of university will be benefited from this strategy, which aims\nto increase student participation and involvement in online education.\n","authors":["Sadman Jashim Sakib","Baktiar Kabir Joy","Zahin Rydha","Md. Nuruzzaman","Annajiat Alim Rasel"],"pdf_url":"https://arxiv.org/pdf/2411.09001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08992v1","updated":"2024-11-13T19:33:08Z","published":"2024-11-13T19:33:08Z","title":"IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis","summary":" We present a new annotated microscopic cellular image dataset to improve the\neffectiveness of machine learning methods for cellular image analysis. Cell\ncounting is an important step in cell analysis. Typically, domain experts\nmanually count cells in a microscopic image. Automated cell counting can\npotentially eliminate this tedious, time-consuming process. However, a good,\nlabeled dataset is required for training an accurate machine learning model.\nOur dataset includes microscopic images of cells, and for each image, the cell\ncount and the location of individual cells. 
The data were collected as part of\nan ongoing study investigating the potential of electrical stimulation to\nmodulate stem cell differentiation and possible applications for neural repair.\nCompared to existing publicly available datasets, our dataset has more images\nof cells stained with more variety of antibodies (protein components of immune\nresponses against invaders) typically used for cell analysis. The experimental\nresults on this dataset indicate that none of the five existing models under\nthis study are able to achieve sufficiently accurate count to replace the\nmanual methods. The dataset is available at\nhttps://figshare.com/articles/dataset/Dataset/21970604.\n","authors":["Abdurahman Ali Mohammed","Catherine Fonder","Donald S. Sakaguchi","Wallapak Tavanapong","Surya K. Mallapragada","Azeez Idris"],"pdf_url":"https://arxiv.org/pdf/2411.08992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08981v1","updated":"2024-11-13T19:16:44Z","published":"2024-11-13T19:16:44Z","title":"Reliability, Resilience and Human Factors Engineering for Trustworthy AI\n Systems","summary":" As AI systems become integral to critical operations across industries and\nservices, ensuring their reliability and safety is essential. We offer a\nframework that integrates established reliability and resilience engineering\nprinciples into AI systems. By applying traditional metrics such as failure\nrate and Mean Time Between Failures (MTBF) along with resilience engineering\nand human reliability analysis, we propose an integrate framework to manage AI\nsystem performance, and prevent or efficiently recover from failures. Our work\nadapts classical engineering methods to AI systems and outlines a research\nagenda for future technical studies. We apply our framework to a real-world AI\nsystem, using system status data from platforms such as openAI, to demonstrate\nits practical applicability. 
This framework aligns with emerging global\nstandards and regulatory frameworks, providing a methodology to enhance the\ntrustworthiness of AI systems. Our aim is to guide policy, regulation, and the\ndevelopment of reliable, safe, and adaptable AI technologies capable of\nconsistent performance in real-world environments.\n","authors":["Saurabh Mishra","Anand Rao","Ramayya Krishnan","Bilal Ayyub","Amin Aria","Enrico Zio"],"pdf_url":"https://arxiv.org/pdf/2411.08981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08979v1","updated":"2024-11-13T19:12:02Z","published":"2024-11-13T19:12:02Z","title":"CoCoP: Enhancing Text Classification with LLM through Code Completion\n Prompt","summary":" Text classification is a fundamental task in natural language processing\n(NLP), and large language models (LLMs) have demonstrated their capability to\nperform this task across various domains. However, the performance of LLMs\nheavily depends on the quality of their input prompts. Recent studies have also\nshown that LLMs exhibit remarkable results in code-related tasks. To leverage\nthe capabilities of LLMs in text classification, we propose the Code Completion\nPrompt (CoCoP) method, which transforms the text classification problem into a\ncode completion task. CoCoP significantly improves text classification\nperformance across diverse datasets by utilizing LLMs' code-completion\ncapability. For instance, CoCoP enhances the accuracy of the SST2 dataset by\nmore than 20%. Moreover, when CoCoP integrated with LLMs specifically designed\nfor code-related tasks (code models), such as CodeLLaMA, this method\ndemonstrates better or comparable performance to few-shot learning techniques\nwhile using only one-tenth of the model size. 
The source code of our proposed\nmethod will be available to the public upon the acceptance of the paper.\n","authors":["Mohammad Mahdi Mohajeri","Mohammad Javad Dousti","Majid Nili Ahmadabadi"],"pdf_url":"https://arxiv.org/pdf/2411.08979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08975v1","updated":"2024-11-13T19:06:57Z","published":"2024-11-13T19:06:57Z","title":"Fluoroformer: Scaling multiple instance learning to multiplexed images\n via attention-based channel fusion","summary":" Though multiple instance learning (MIL) has been a foundational strategy in\ncomputational pathology for processing whole slide images (WSIs), current\napproaches are designed for traditional hematoxylin and eosin (H&E) slides\nrather than emerging multiplexed technologies. Here, we present an MIL\nstrategy, the Fluoroformer module, that is specifically tailored to multiplexed\nWSIs by leveraging scaled dot-product attention (SDPA) to interpretably fuse\ninformation across disparate channels. On a cohort of 434 non-small cell lung\ncancer (NSCLC) samples, we show that the Fluoroformer both obtains strong\nprognostic performance and recapitulates immuno-oncological hallmarks of NSCLC.\nOur technique thereby provides a path for adapting state-of-the-art AI\ntechniques to emerging spatial biology assays.\n","authors":["Marc Harary","Eliezer M. 
Van Allen","William Lotter"],"pdf_url":"https://arxiv.org/pdf/2411.08975v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 14 pages"},{"id":"http://arxiv.org/abs/2411.08954v1","updated":"2024-11-13T19:00:02Z","published":"2024-11-13T19:00:02Z","title":"Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply\n Better Samples","summary":" Although diffusion models can generate remarkably high-quality samples, they\nare intrinsically bottlenecked by their expensive iterative sampling procedure.\nConsistency models (CMs) have recently emerged as a promising diffusion model\ndistillation method, reducing the cost of sampling by generating high-fidelity\nsamples in just a few iterations. Consistency model distillation aims to solve\nthe probability flow ordinary differential equation (ODE) defined by an\nexisting diffusion model. CMs are not directly trained to minimize error\nagainst an ODE solver, rather they use a more computationally tractable\nobjective. As a way to study how effectively CMs solve the probability flow\nODE, and the effect that any induced error has on the quality of generated\nsamples, we introduce Direct CMs, which \\textit{directly} minimize this error.\nIntriguingly, we find that Direct CMs reduce the ODE solving error compared to\nCMs but also result in significantly worse sample quality, calling into\nquestion why exactly CMs work well in the first place. Full code is available\nat: https://github.com/layer6ai-labs/direct-cms.\n","authors":["Noël Vouitsis","Rasa Hosseinzadeh","Brendan Leigh Ross","Valentin Villecroze","Satya Krishna Gorti","Jesse C. 
Cresswell","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2411.08954v1.pdf","comment":"NeurIPS 2024 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2411.05196v2","updated":"2024-11-13T18:58:46Z","published":"2024-11-07T21:43:29Z","title":"Explainable AI through a Democratic Lens: DhondtXAI for Proportional\n Feature Importance Using the D'Hondt Method","summary":" In democratic societies, electoral systems play a crucial role in translating\npublic preferences into political representation. Among these, the D'Hondt\nmethod is widely used to ensure proportional representation, balancing fair\nrepresentation with governmental stability. Recently, there has been a growing\ninterest in applying similar principles of proportional representation to\nenhance interpretability in machine learning, specifically in Explainable AI\n(XAI). This study investigates the integration of D'Hondt-based voting\nprinciples in the DhondtXAI method, which leverages resource allocation\nconcepts to interpret feature importance within AI models. Through a comparison\nof SHAP (Shapley Additive Explanations) and DhondtXAI, we evaluate their\neffectiveness in feature attribution within CatBoost and XGBoost models for\nbreast cancer and diabetes prediction, respectively. The DhondtXAI approach\nallows for alliance formation and thresholding to enhance interpretability,\nrepresenting feature importance as seats in a parliamentary view. Statistical\ncorrelation analyses between SHAP values and DhondtXAI allocations support the\nconsistency of interpretations, demonstrating DhondtXAI's potential as a\ncomplementary tool for understanding feature importance in AI models. 
The\nresults highlight that integrating electoral principles, such as proportional\nrepresentation and alliances, into AI explainability can improve user\nunderstanding, especially in high-stakes fields like healthcare.\n","authors":["Turker Berk Donmez"],"pdf_url":"https://arxiv.org/pdf/2411.05196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1902.00615v6","updated":"2024-11-13T18:32:53Z","published":"2019-02-02T01:52:53Z","title":"Confidence Trigger Detection: Accelerating Real-time\n Tracking-by-detection Systems","summary":" Real-time object tracking necessitates a delicate balance between speed and\naccuracy, a challenge exacerbated by the computational demands of deep learning\nmethods. In this paper, we propose Confidence-Triggered Detection (CTD), an\ninnovative approach that strategically bypasses object detection for frames\nclosely resembling intermediate states, leveraging tracker confidence scores.\nCTD not only enhances tracking speed but also preserves accuracy, surpassing\nexisting tracking algorithms. Through extensive evaluation across various\ntracker confidence thresholds, we identify an optimal trade-off between\ntracking speed and accuracy, providing crucial insights for parameter\nfine-tuning and enhancing CTD's practicality in real-world scenarios. 
Our\nexperiments across diverse detection models underscore the robustness and\nversatility of the CTD framework, demonstrating its potential to enable\nreal-time tracking in resource-constrained environments.\n","authors":["Zhicheng Ding","Zhixin Lai","Siyang Li","Panfeng Li","Qikai Yang","Edward Wong"],"pdf_url":"https://arxiv.org/pdf/1902.00615v6.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.08870v1","updated":"2024-11-13T18:50:13Z","published":"2024-11-13T18:50:13Z","title":"The Limited Impact of Medical Adaptation of Large Language and\n Vision-Language Models","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. In this paper, we compare ten\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting and supervised fine-tuning regimes for medical question-answering\n(QA). For instance, across all tasks and model pairs we consider in the 3-shot\nsetting, medical LLMs only outperform their base models in 22.7% of cases,\nreach a (statistical) tie in 36.8% of cases, and are significantly worse than\ntheir base models in the remaining 40.5% of cases. 
Our conclusions are based on\n(i) comparing each medical model head-to-head, directly against the\ncorresponding base model; (ii) optimizing the prompts for each model separately\nin zero-/few-shot prompting; and (iii) accounting for statistical uncertainty\nin comparisons. While these basic practices are not consistently adopted in the\nliterature, our ablations show that they substantially impact conclusions.\nMeanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs\ncan show performance improvements, but the benefits do not carry over to tasks\nbased on clinical notes. Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Pranav Mani","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.08870v1.pdf","comment":"Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes\n additional results on clinical note QA tasks and supervised fine-tuning\n evaluations"},{"id":"http://arxiv.org/abs/2411.08868v1","updated":"2024-11-13T18:49:35Z","published":"2024-11-13T18:49:35Z","title":"CamemBERT 2.0: A Smarter French Language Model Aged to Perfection","summary":" French language models, such as CamemBERT, have been widely adopted across\nindustries for natural language processing (NLP) tasks, with models like\nCamemBERT seeing over 4 million downloads per month. However, these models face\nchallenges due to temporal concept drift, where outdated training data leads to\na decline in performance, especially when encountering new topics and\nterminology. This issue emphasizes the need for updated models that reflect\ncurrent linguistic trends. In this paper, we introduce two new versions of the\nCamemBERT base model-CamemBERTav2 and CamemBERTv2-designed to address these\nchallenges. 
CamemBERTav2 is based on the DeBERTaV3 architecture and makes use\nof the Replaced Token Detection (RTD) objective for better contextual\nunderstanding, while CamemBERTv2 is built on RoBERTa, which uses the Masked\nLanguage Modeling (MLM) objective. Both models are trained on a significantly\nlarger and more recent dataset with longer context length and an updated\ntokenizer that enhances tokenization performance for French. We evaluate the\nperformance of these models on both general-domain NLP tasks and\ndomain-specific applications, such as medical field tasks, demonstrating their\nversatility and effectiveness across a range of use cases. Our results show\nthat these updated models vastly outperform their predecessors, making them\nvaluable tools for modern NLP systems. All our new models, as well as\nintermediate checkpoints, are made openly available on Huggingface.\n","authors":["Wissam Antoun","Francis Kulumba","Rian Touchent","Éric de la Clergerie","Benoît Sagot","Djamé Seddah"],"pdf_url":"https://arxiv.org/pdf/2411.08868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. 
(3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. (4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.02538v2","updated":"2024-11-13T18:04:44Z","published":"2024-11-04T19:17:17Z","title":"MILU: A Multi-task Indic Language Understanding Benchmark","summary":" Evaluating Large Language Models (LLMs) in low-resource and linguistically\ndiverse languages remains a significant challenge in NLP, particularly for\nlanguages using non-Latin scripts like those spoken in India. Existing\nbenchmarks predominantly focus on English, leaving substantial gaps in\nassessing LLM capabilities in these languages. 
We introduce MILU, a Multi task\nIndic Language Understanding Benchmark, a comprehensive evaluation benchmark\ndesigned to address this gap. MILU spans 8 domains and 42 subjects across 11\nIndic languages, reflecting both general and culturally specific knowledge.\nWith an India-centric design, incorporates material from regional and\nstate-level examinations, covering topics such as local history, arts,\nfestivals, and laws, alongside standard subjects like science and mathematics.\nWe evaluate over 45 LLMs, and find that current LLMs struggle with MILU, with\nGPT-4o achieving the highest average accuracy at 72 percent. Open multilingual\nmodels outperform language-specific fine-tuned models, which perform only\nslightly better than random baselines. Models also perform better in high\nresource languages as compared to low resource ones. Domain-wise analysis\nindicates that models perform poorly in culturally relevant areas like Arts and\nHumanities, Law and Governance compared to general fields like STEM. To the\nbest of our knowledge, MILU is the first of its kind benchmark focused on Indic\nlanguages, serving as a crucial step towards comprehensive cultural evaluation.\nAll code, benchmarks, and artifacts are publicly available to foster open\nresearch.\n","authors":["Sshubam Verma","Mohammed Safi Ur Rahman Khan","Vishwajeet Kumar","Rudra Murthy","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2411.02538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18346v4","updated":"2024-11-13T17:17:43Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). 
Despite their impressive capabilities,\nMLLMs often suffer from over-reliance on unimodal biases (e.g., language bias\nand vision bias), leading to incorrect answers or hallucinations in complex\nmultimodal tasks. To investigate this issue, we propose a causal framework to\ninterpret the biases in Visual Question Answering (VQA) problems. Within this\nframework, we conduct an in-depth causal analysis to assess the causal effect\nof these biases on MLLM predictions. Based on the analysis, we introduce 1) a\nnovel MORE dataset with 12,000 challenging VQA instances requiring multi-hop\nreasoning and overcoming unimodal biases. 2) a causality-enhanced agent\nframework CAVE that guides models to comprehensively integrate information from\ndifferent modalities and mitigate biases. Our experiments show that MLLMs\nperform poorly on MORE, indicating strong unimodal biases and limited semantic\nunderstanding. However, when integrated with our CAVE, promising improvements\nin reasoning and bias mitigation can be seen. These findings provide important\ninsights for the development of more robust MLLMs and contribute to the broader\ngoal of advancing multimodal AI systems capable of deeper understanding and\nreasoning. Our project page is at https://github.com/OpenCausaLab/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08790v1","updated":"2024-11-13T17:16:48Z","published":"2024-11-13T17:16:48Z","title":"Can sparse autoencoders be used to decompose and interpret steering\n vectors?","summary":" Steering vectors are a promising approach to control the behaviour of large\nlanguage models. However, their underlying mechanisms remain poorly understood.\nWhile sparse autoencoders (SAEs) may offer a potential method to interpret\nsteering vectors, recent findings show that SAE-reconstructed vectors often\nlack the steering properties of the original vectors. 
This paper investigates\nwhy directly applying SAEs to steering vectors yields misleading\ndecompositions, identifying two reasons: (1) steering vectors fall outside the\ninput distribution for which SAEs are designed, and (2) steering vectors can\nhave meaningful negative projections in feature directions, which SAEs are not\ndesigned to accommodate. These limitations hinder the direct use of SAEs for\ninterpreting steering vectors.\n","authors":["Harry Mayne","Yushi Yang","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.08790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08785v1","updated":"2024-11-13T17:13:25Z","published":"2024-11-13T17:13:25Z","title":"Zero-shot Cross-lingual Transfer Learning with Multiple Source and\n Target Languages for Information Extraction: Language Selection and\n Adversarial Training","summary":" The majority of previous researches addressing multi-lingual IE are limited\nto zero-shot cross-lingual single-transfer (one-to-one) setting, with\nhigh-resource languages predominantly as source training data. As a result,\nthese works provide little understanding and benefit for the realistic goal of\ndeveloping a multi-lingual IE system that can generalize to as many languages\nas possible. Our study aims to fill this gap by providing a detailed analysis\non Cross-Lingual Multi-Transferability (many-to-many transfer learning), for\nthe recent IE corpora that cover a diverse set of languages. Specifically, we\nfirst determine the correlation between single-transfer performance and a wide\nrange of linguistic-based distances. From the obtained insights, a combined\nlanguage distance metric can be developed that is not only highly correlated\nbut also robust across different tasks and model scales. Next, we investigate\nthe more general zero-shot multi-lingual transfer settings where multiple\nlanguages are involved in the training and evaluation processes. 
Language\nclustering based on the newly defined distance can provide directions for\nachieving the optimal cost-performance trade-off in data (languages) selection\nproblem. Finally, a relational-transfer setting is proposed to further\nincorporate multi-lingual unlabeled data based on adversarial training using\nthe relation induced from the above linguistic distance.\n","authors":["Nghia Trung Ngo","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.08785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03271v3","updated":"2024-11-13T17:10:20Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. 
In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08752v1","updated":"2024-11-13T16:30:41Z","published":"2024-11-13T16:30:41Z","title":"Multi-Perspective Stance Detection","summary":" Subjective NLP tasks usually rely on human annotations provided by multiple\nannotators, whose judgments may vary due to their diverse backgrounds and life\nexperiences. Traditional methods often aggregate multiple annotations into a\nsingle ground truth, disregarding the diversity in perspectives that arises\nfrom annotator disagreement. In this preliminary study, we examine the effect\nof including multiple annotations on model accuracy in classification. Our\nmethodology investigates the performance of perspective-aware classification\nmodels in stance detection task and further inspects if annotator disagreement\naffects the model confidence. The results show that multi-perspective approach\nyields better classification performance outperforming the baseline which uses\nthe single label. 
This entails that designing more inclusive perspective-aware\nAI models is not only an essential first step in implementing responsible and\nethical AI, but it can also achieve superior results than using the traditional\napproaches.\n","authors":["Benedetta Muscato","Praveen Bushipaka","Gizem Gezici","Lucia Passaro","Fosca Giannotti"],"pdf_url":"https://arxiv.org/pdf/2411.08752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07140v2","updated":"2024-11-13T16:27:43Z","published":"2024-11-11T17:10:56Z","title":"Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language\n Models","summary":" New LLM evaluation benchmarks are important to align with the rapid\ndevelopment of Large Language Models (LLMs). In this work, we present Chinese\nSimpleQA, the first comprehensive Chinese benchmark to evaluate the factuality\nability of language models to answer short questions, and Chinese SimpleQA\nmainly has five properties (i.e., Chinese, Diverse, High-quality, Static,\nEasy-to-evaluate). Specifically, first, we focus on the Chinese language over 6\nmajor topics with 99 diverse subtopics. Second, we conduct a comprehensive\nquality control process to achieve high-quality questions and answers, where\nthe reference answers are static and cannot be changed over time. Third,\nfollowing SimpleQA, the questions and answers are very short, and the grading\nprocess is easy-to-evaluate based on OpenAI API. Based on Chinese SimpleQA, we\nperform a comprehensive evaluation on the factuality abilities of existing\nLLMs. 
Finally, we hope that Chinese SimpleQA could guide the developers to\nbetter understand the Chinese factuality abilities of their models and\nfacilitate the growth of foundation models.\n","authors":["Yancheng He","Shilong Li","Jiaheng Liu","Yingshui Tan","Weixun Wang","Hui Huang","Xingyuan Bu","Hangyu Guo","Chengwei Hu","Boren Zheng","Zhuoran Lin","Xuepeng Liu","Dekai Sun","Shirong Lin","Zhicheng Zheng","Xiaoyong Zhu","Wenbo Su","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.07140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08745v1","updated":"2024-11-13T16:26:19Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. 
Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v1.pdf","comment":"12 pages, 10 figures, previously published under the title \"How Do\n Llamas Process Multilingual Text? A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2411.08742v1","updated":"2024-11-13T16:20:20Z","published":"2024-11-13T16:20:20Z","title":"A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks\n with Large Language Models","summary":" With the rise of Speech Large Language Models (Speech LLMs), there has been\ngrowing interest in discrete speech tokens for their ability to integrate with\ntext-based tokens seamlessly. Compared to most studies that focus on continuous\nspeech features, although discrete-token based LLMs have shown promising\nresults on certain tasks, the performance gap between these two paradigms is\nrarely explored. In this paper, we present a fair and thorough comparison\nbetween discrete and continuous features across a variety of semantic-related\ntasks using a light-weight LLM (Qwen1.5-0.5B). Our findings reveal that\ncontinuous features generally outperform discrete tokens, particularly in tasks\nrequiring fine-grained semantic understanding. Moreover, this study goes beyond\nsurface-level comparison by identifying key factors behind the\nunder-performance of discrete tokens, such as limited token granularity and\ninefficient information retention. To enhance the performance of discrete\ntokens, we explore potential aspects based on our analysis. 
We hope our results\ncan offer new insights into the opportunities for advancing discrete speech\ntokens in Speech LLMs.\n","authors":["Dingdong Wang","Mingyu Cui","Dongchao Yang","Xueyuan Chen","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08742v1.pdf","comment":"5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2411.08733v1","updated":"2024-11-13T16:15:38Z","published":"2024-11-13T16:15:38Z","title":"Dynamic Rewarding with Prompt Optimization Enables Tuning-free\n Self-Alignment of Language Models","summary":" Aligning Large Language Models (LLMs) traditionally relies on costly training\nand human preference annotations. Self-alignment seeks to reduce these expenses\nby enabling models to align themselves. To further lower costs and achieve\nalignment without any expensive tuning or annotations, we introduce a new\ntuning-free approach for self-alignment, Dynamic Rewarding with Prompt\nOptimization (\\ours). Our approach leverages a search-based optimization\nframework that allows LLMs to iteratively self-improve and craft the optimal\nalignment instructions, all without additional training or human intervention.\nThe core of \\ours is a dynamic rewarding mechanism, which identifies and\nrectifies model-specific alignment weaknesses, allowing LLMs to adapt\nefficiently to diverse alignment challenges. Empirical evaluations on eight\nrecent LLMs, both open- and closed-sourced, demonstrate that \\ours\nsignificantly enhances alignment performance, with base models outperforming\ntheir SFT/RLHF-tuned counterparts. Moreover, the prompts automatically\noptimized by \\ours surpass those curated by human experts, further validating\nthe effectiveness of our approach. Our findings highlight the great potential\nof current LLMs to achieve adaptive self-alignment through inference-time\noptimization, complementing tuning-based alignment methods.\n","authors":["Somanshu Singla","Zhen Wang","Tianyang Liu","Abdullah Ashfaq","Zhiting Hu","Eric P. 
Xing"],"pdf_url":"https://arxiv.org/pdf/2411.08733v1.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2411.08726v1","updated":"2024-11-13T16:08:40Z","published":"2024-11-13T16:08:40Z","title":"Analyst Reports and Stock Performance: Evidence from the Chinese Market","summary":" This article applies natural language processing (NLP) to extract and\nquantify textual information to predict stock performance. Using an extensive\ndataset of Chinese analyst reports and employing a customized BERT deep\nlearning model for Chinese text, this study categorizes the sentiment of the\nreports as positive, neutral, or negative. The findings underscore the\npredictive capacity of this sentiment indicator for stock volatility, excess\nreturns, and trading volume. Specifically, analyst reports with strong positive\nsentiment will increase excess return and intraday volatility, and vice versa,\nreports with strong negative sentiment also increase volatility and trading\nvolume, but decrease future excess return. The magnitude of this effect is\ngreater for positive sentiment reports than for negative sentiment reports.\nThis article contributes to the empirical literature on sentiment analysis and\nthe response of the stock market to news in the Chinese stock market.\n","authors":["Rui Liu","Jiayou Liang","Haolong Chen","Yujia Hu"],"pdf_url":"https://arxiv.org/pdf/2411.08726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08708v1","updated":"2024-11-13T15:50:38Z","published":"2024-11-13T15:50:38Z","title":"Are Triggers Needed for Document-Level Event Extraction?","summary":" Most existing work on event extraction has focused on sentence-level texts\nand presumes the identification of a trigger-span -- a word or phrase in the\ninput that evokes the occurrence of an event of interest. Event arguments are\nthen extracted with respect to the trigger. Indeed, triggers are treated as\nintegral to, and trigger detection as an essential component of, event\nextraction. 
In this paper, we provide the first investigation of the role of\ntriggers for the more difficult and much less studied task of document-level\nevent extraction. We analyze their usefulness in multiple end-to-end and\npipelined neural event extraction models for three document-level event\nextraction datasets, measuring performance using triggers of varying quality\n(human-annotated, LLM-generated, keyword-based, and random). Our research shows\nthat trigger effectiveness varies based on the extraction task's\ncharacteristics and data quality, with basic, automatically-generated triggers\nserving as a viable alternative to human-annotated ones. Furthermore, providing\ndetailed event descriptions to the extraction model helps maintain robust\nperformance even when trigger quality degrades. Perhaps surprisingly, we also\nfind that the mere existence of trigger input, even random ones, is important\nfor prompt-based LLM approaches to the task.\n","authors":["Shaden Shaar","Wayne Chen","Maitreyi Chatterjee","Barry Wang","Wenting Zhao","Claire Cardie"],"pdf_url":"https://arxiv.org/pdf/2411.08708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07879v4","updated":"2024-11-13T15:45:31Z","published":"2023-11-14T03:18:28Z","title":"Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting\n Volunteer Content Moderators","summary":" Extensive efforts in automated approaches for content moderation have been\nfocused on developing models to identify toxic, offensive, and hateful content\nwith the aim of lightening the load for moderators. Yet, it remains uncertain\nwhether improvements on those tasks have truly addressed moderators' needs in\naccomplishing their work. In this paper, we surface gaps between past research\nefforts that have aimed to provide automation for aspects of content moderation\nand the needs of volunteer content moderators, regarding identifying violations\nof various moderation rules. 
To do so, we conduct a model review on Hugging\nFace to reveal the availability of models to cover various moderation rules and\nguidelines from three exemplar forums. We further put state-of-the-art LLMs to\nthe test, evaluating how well these models perform in flagging violations of\nplatform rules from one particular forum. Finally, we conduct a user survey\nstudy with volunteer moderators to gain insight into their perspectives on\nuseful moderation models. Overall, we observe a non-trivial gap, as missing\ndeveloped models and LLMs exhibit moderate to low performance on a significant\nportion of the rules. Moderators' reports provide guides for future work on\ndeveloping moderation assistant models.\n","authors":["Yang Trista Cao","Lovely-Frances Domingo","Sarah Ann Gilbert","Michelle Mazurek","Katie Shilton","Hal Daumé III"],"pdf_url":"https://arxiv.org/pdf/2311.07879v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15339v3","updated":"2024-11-13T15:25:32Z","published":"2024-07-22T02:53:18Z","title":"Deep Learning for Economists","summary":" Deep learning provides powerful methods to impute structured information from\nlarge-scale, unstructured text and image datasets. For example, economists\nmight wish to detect the presence of economic activity in satellite images, or\nto measure the topics or entities mentioned in social media, the congressional\nrecord, or firm filings. This review introduces deep neural networks, covering\nmethods such as classifiers, regression models, generative AI, and embedding\nmodels. Applications include classification, document digitization, record\nlinkage, and methods for data exploration in massive scale text and image\ncorpora. When suitable methods are used, deep learning models can be cheap to\ntune and can scale affordably to problems involving millions or billions of\ndata points.. 
The review is accompanied by a companion website, EconDL, with\nuser-friendly demo notebooks, software resources, and a knowledge base that\nprovides technical details and additional applications.\n","authors":["Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2407.15339v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16187v3","updated":"2024-11-13T15:14:38Z","published":"2024-02-25T20:24:07Z","title":"No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design\n Choices","summary":" Advances in generative models have made it possible for AI-generated text,\ncode, and images to mirror human-generated content in many applications.\nWatermarking, a technique that aims to embed information in the output of a\nmodel to verify its source, is useful for mitigating the misuse of such\nAI-generated content. However, we show that common design choices in LLM\nwatermarking schemes make the resulting systems surprisingly susceptible to\nattack -- leading to fundamental trade-offs in robustness, utility, and\nusability. To navigate these trade-offs, we rigorously study a set of simple\nyet effective attacks on common watermarking systems, and propose guidelines\nand defenses for LLM watermarking in practice.\n","authors":["Qi Pang","Shengyuan Hu","Wenting Zheng","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2402.16187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08671v1","updated":"2024-11-13T15:04:02Z","published":"2024-11-13T15:04:02Z","title":"Theoretical Analysis of Byte-Pair Encoding","summary":" Byte-Pair Encoding (BPE) is a widely used method for subword tokenization,\nwith origins in grammar-based text compression. It is employed in a variety of\nlanguage processing tasks such as machine translation or large language model\n(LLM) pretraining, to create a token dictionary of a prescribed size. 
Most\nevaluations of BPE to date are empirical, and the reasons for its good\npractical performance are not well understood.\n In this paper we focus on the optimization problem underlying BPE: finding a\npair encoding that achieves optimal compression utility. We show that this\nproblem is APX-complete, indicating that it is unlikely to admit a\npolynomial-time approximation scheme. This answers, in a stronger form, a\nquestion recently raised by Zouhar et al.\n On the positive side, we show that BPE approximates the compression utility\nof the optimal pair encoding to a worst-case factor between $0.333$ and\n$0.625$. Our results aim to explain the ongoing success of BPE and are, to our\nknowledge, the first rigorous guarantees on its compression utility that hold\nfor all inputs.\n","authors":["László Kozma","Johannes Voderholzer"],"pdf_url":"https://arxiv.org/pdf/2411.08671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15736v2","updated":"2024-11-13T14:05:18Z","published":"2024-03-23T06:03:36Z","title":"General LLMs as Instructors for Domain-Specific LLMs: A Sequential\n Fusion Method to Integrate Extraction and Editing","summary":" The substantial interest in updating Large Language Models (LLMs) without\nretraining from scratch is accompanied by several challenges. This is\nparticularly true when updating LLMs with datasets that necessitate\ndomain-expert reasoning across extensive texts, despite limited samples. We\ntermed the scenario as the Few-Shot Domain-Expert Reasoning for Updating LLMs\n(FDoR-UL). Traditional methods such as Low-Rank Adaptation (LoRA) and Retrieval\nAugmented Generation (RAG) are inadequate for addressing this critical issue,\nparticularly evident in our exploration of a specific medical dataset that\nepitomizes the distinct needs of FDoR-UL. To tackle this challenge, we\nintroduce a Sequential Fusion method to integrate knowledge from complex\ncontexts into LLMs. 
This method employs a two-stage framework: initially\nleveraging general LLMs to perform relation extraction for knowledge\nacquisition from complex texts, followed by updating domain-specific LLMs\nthrough Knowledge Editing (KE). Employing our method, domain-specific LLMs\nachieved a 71.7% accuracy (an average gain of 39.1%) in question-answering\ntasks. Furthermore, we expanded our evaluation to a novel economics-management\ndataset we developed, where our method achieved a 75.0% accuracy (an average\ngain of 45.0%). These findings underscore the effectiveness and flexibility of\nour approach in FDoR-UL across various domains.\n","authors":["Xin Zhang","Tianjie Ju","Huijia Liang","Ying Fu","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.15736v2.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2411.08610v1","updated":"2024-11-13T13:53:10Z","published":"2024-11-13T13:53:10Z","title":"Dynamic Subset Tuning: Expanding the Operational Range of\n Parameter-Efficient Training for Large Language Models","summary":" We propose a novel parameter-efficient training (PET) method for large\nlanguage models that adapts models to downstream tasks by optimizing a small\nsubset of the existing model parameters. Unlike prior methods, this subset is\nnot fixed in location but rather which parameters are modified evolves over the\ncourse of training. This dynamic parameter selection can yield good performance\nwith many fewer parameters than extant methods. Our method enables a seamless\nscaling of the subset size across an arbitrary proportion of the total model\nsize, while popular PET approaches like prompt tuning and LoRA cover only a\nsmall part of this spectrum. 
We match or outperform prompt tuning and LoRA in\nmost cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given\nparameter budget across different model families and sizes.\n","authors":["Felix Stahlberg","Jared Lichtarge","Shankar Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08610v1.pdf","comment":"NeurIPS 2024 Workshop on Adaptive Foundation Models"},{"id":"http://arxiv.org/abs/2410.20513v2","updated":"2024-11-13T13:40:19Z","published":"2024-10-27T16:52:21Z","title":"Is Moral Self-correction An Innate Capability of Large Language Models?\n A Mechanistic Analysis to Self-correction","summary":" Though intensive attentions to the self-correction capability of Large\nLanguage Models (LLMs), the underlying mechanism of this capability is still\nunder-explored. In this paper, we aim to answer two fundamental questions for\nmoral self-correction: (1) how different components in self-correction, such as\nChain-of-Thought (CoT) reasoning, external feedback, and instructional prompts,\ninteract to enable moral self-correction; and (2) is the self-correction one of\nLLMs' innate capabilities? To answer the first question, we examine how\ndifferent self-correction components interact to intervene the embedded\nmorality within hidden states, therefore contributing to different performance.\nFor the second question, we (i) evaluate the robustness of moral\nself-correction by introducing natural language interventions of weak evidence\ninto prompts; (ii) propose a validation framework, self-distinguish, that\nrequires effective self-correction to enable LLMs to distinguish between\ndesirable and undesirable outputs. Our experimental results indicate that there\nis no universally optimal self-correction method for the tasks considered,\nalthough external feedback and CoT can contribute to additional performance\ngains. 
However, our mechanistic analysis reveals negative interactions among\ninstructional prompts, CoT, and external feedback, suggesting a conflict\nbetween internal knowledge and external feedback. The self-distinguish\nexperiments demonstrate that while LLMs can self-correct their responses, they\nare unable to reliably distinguish between desired and undesired outputs. With\nour empirical evidence, we can conclude that moral self-correction is not an\ninnate capability of LLMs acquired during pretraining.\n","authors":["Zimo Qi","Guangliang Liu","Kristen Marie Johnson","Lu Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.20513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08599v1","updated":"2024-11-13T13:30:21Z","published":"2024-11-13T13:30:21Z","title":"XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL","summary":" To tackle the challenges of large language model performance in natural\nlanguage to SQL tasks, we introduce XiYan-SQL, an innovative framework that\nemploys a multi-generator ensemble strategy to improve candidate generation. We\nintroduce M-Schema, a semi-structured schema representation method designed to\nenhance the understanding of database structures. To enhance the quality and\ndiversity of generated candidate SQL queries, XiYan-SQL integrates the\nsignificant potential of in-context learning (ICL) with the precise control of\nsupervised fine-tuning. On one hand, we propose a series of training strategies\nto fine-tune models to generate high-quality candidates with diverse\npreferences. On the other hand, we implement the ICL approach with an example\nselection method based on named entity recognition to prevent overemphasis on\nentities. The refiner optimizes each candidate by correcting logical or\nsyntactical errors. 
To address the challenge of identifying the best candidate,\nwe fine-tune a selection model to distinguish nuances of candidate SQL queries.\nThe experimental results on multiple dialect datasets demonstrate the\nrobustness of XiYan-SQL in addressing challenges across different scenarios.\nOverall, our proposed XiYan-SQL achieves the state-of-the-art execution\naccuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on\nNL2GQL, and a competitive score of 72.23% on the Bird development benchmark.\nThe proposed framework not only enhances the quality and diversity of SQL\nqueries but also outperforms previous methods.\n","authors":["Yingqi Gao","Yifu Liu","Xiaoxia Li","Xiaorong Shi","Yin Zhu","Yiming Wang","Shiqi Li","Wei Li","Yuntao Hong","Zhiling Luo","Jinyang Gao","Liyu Mou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.08599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17073v3","updated":"2024-11-13T12:46:54Z","published":"2024-09-25T16:32:35Z","title":"Enhancing Post-Hoc Attributions in Long Document Comprehension via\n Coarse Grained Answer Decomposition","summary":" Accurately attributing answer text to its source document is crucial for\ndeveloping a reliable question-answering system. However, attribution for long\ndocuments remains largely unexplored. Post-hoc attribution systems are designed\nto map answer text back to the source document, yet the granularity of this\nmapping has not been addressed. Furthermore, a critical question arises: What\nexactly should be attributed? This involves identifying the specific\ninformation units within an answer that require grounding. In this paper, we\npropose and investigate a novel approach to the factual decomposition of\ngenerated answers for attribution, employing template-based in-context\nlearning. To accomplish this, we utilize the question and integrate negative\nsampling during few-shot in-context learning for decomposition. 
This approach\nenhances the semantic understanding of both abstractive and extractive answers.\nWe examine the impact of answer decomposition by providing a thorough\nexamination of various attribution approaches, ranging from retrieval-based\ntechniques to LLM-based attributors.\n","authors":["Pritika Ramu","Koustava Goswami","Apoorv Saxena","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2409.17073v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v2","updated":"2024-11-13T12:37:09Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\nlots of attraction in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. 
Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hanwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2411.08553v1","updated":"2024-11-13T12:09:23Z","published":"2024-11-13T12:09:23Z","title":"CorrSynth -- A Correlated Sampling Method for Diverse Dataset Generation\n from LLMs","summary":" Large language models (LLMs) have demonstrated remarkable performance in\ndiverse tasks using zero-shot and few-shot prompting. Even though their\ncapabilities of data synthesis have been studied well in recent years, the\ngenerated data suffers from a lack of diversity, less adherence to the prompt,\nand potential biases that creep into the data from the generator model. In this\nwork, we tackle the challenge of generating datasets with high diversity, upon\nwhich a student model is trained for downstream tasks. Taking the route of\ndecoding-time guidance-based approaches, we propose CorrSynth, which generates\ndata that is more diverse and faithful to the input prompt using a correlated\nsampling strategy. Further, our method overcomes the complexity drawbacks of\nsome other guidance-based techniques like classifier-based guidance. With\nextensive experiments, we show the effectiveness of our approach and\nsubstantiate our claims. In particular, we perform intrinsic evaluation to show\nthe improvements in diversity. 
Our experiments show that CorrSynth improves\nboth student metrics and intrinsic metrics upon competitive baselines across\nfour datasets, showing the innate advantage of our method.\n","authors":["Suhas S Kowshik","Abhishek Divekar","Vijit Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08553v1.pdf","comment":"Published as a main conference paper at EMNLP 2024; First two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2411.08534v1","updated":"2024-11-13T11:31:02Z","published":"2024-11-13T11:31:02Z","title":"Neural Topic Modeling with Large Language Models in the Loop","summary":" Topic modeling is a fundamental task in natural language processing, allowing\nthe discovery of latent thematic structures in text corpora. While Large\nLanguage Models (LLMs) have demonstrated promising capabilities in topic\ndiscovery, their direct application to topic modeling suffers from issues such\nas incomplete topic coverage, misalignment of topics, and inefficiency. To\naddress these limitations, we propose LLM-ITL, a novel LLM-in-the-loop\nframework that integrates LLMs with many existing Neural Topic Models (NTMs).\nIn LLM-ITL, global topics and document representations are learned through the\nNTM, while an LLM refines the topics via a confidence-weighted Optimal\nTransport (OT)-based alignment objective. This process enhances the\ninterpretability and coherence of the learned topics, while maintaining the\nefficiency of NTMs. 
Extensive experiments demonstrate that LLM-ITL can help\nNTMs significantly improve their topic interpretability while maintaining the\nquality of document representation.\n","authors":["Xiaohao Yang","He Zhao","Weijie Xu","Yuanyuan Qi","Jueqing Lu","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2411.08534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07268v2","updated":"2024-11-13T11:28:07Z","published":"2024-11-09T15:59:59Z","title":"Target-driven Attack for Large Language Models","summary":" Current large language models (LLM) provide a strong foundation for\nlarge-scale user-oriented natural language tasks. Many users can easily inject\nadversarial text or instructions through the user interface, thus causing LLM\nmodel security challenges like the language model not giving the correct\nanswer. Although there is currently a large amount of research on black-box\nattacks, most of these black-box attacks use random and heuristic strategies.\nIt is unclear how these strategies relate to the success rate of attacks and\nthus effectively improve model robustness. To solve this problem, we propose\nour target-driven black-box attack method to maximize the KL divergence between\nthe conditional probabilities of the clean text and the attack text to redefine\nthe attack's goal. We transform the distance maximization problem into two\nconvex optimization problems based on the attack goal to solve the attack text\nand estimate the covariance. Furthermore, the projected gradient descent\nalgorithm solves the vector corresponding to the attack text. Our target-driven\nblack-box attack approach includes two attack strategies: token manipulation\nand misinformation attack. 
Experimental results on multiple Large Language\nModels and datasets demonstrate the effectiveness of our attack method.\n","authors":["Chong Zhang","Mingyu Jin","Dong Shu","Taowen Wang","Dongfang Liu","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2411.07268v2.pdf","comment":"12 pages, 7 figures. This work is an extension of the\n arXiv:2404.07234 work. We propose new methods. 27th European Conference on\n Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2405.10040v3","updated":"2024-11-13T11:13:56Z","published":"2024-05-16T12:22:41Z","title":"SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation","summary":" It is often desirable to distill the capabilities of large language models\n(LLMs) into smaller student models due to compute and memory constraints. One\nway to do this for classification tasks is via dataset synthesis, which can be\naccomplished by generating examples of each label from the LLM. Prior\napproaches to synthesis use few-shot prompting, which relies on the LLM's\nparametric knowledge to generate usable examples. However, this leads to issues\nof repetition, bias towards popular entities, and stylistic differences from\nhuman text. In this work, we propose Synthesize by Retrieval and Refinement\n(SynthesizRR), which uses retrieval augmentation to introduce variety into the\ndataset synthesis process: as retrieved passages vary, the LLM is seeded with\ndifferent content to generate its examples. We empirically study the synthesis\nof six datasets, covering topic classification, sentiment analysis, tone\ndetection, and humor, requiring complex synthesis strategies. We find that\nSynthesizRR greatly improves lexical and semantic diversity, similarity to\nhuman-written text, and distillation performance, when compared to 32-shot\nprompting and four prior approaches. 
We release our code to perform all steps\nat https://github.com/amazon-science/synthesizrr\n","authors":["Abhishek Divekar","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2405.10040v3.pdf","comment":"Published as a main conference paper at EMNLP 2024. Code available at\n https://github.com/amazon-science/synthesizrr"},{"id":"http://arxiv.org/abs/2411.08516v1","updated":"2024-11-13T11:02:04Z","published":"2024-11-13T11:02:04Z","title":"Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale\n Table Understanding","summary":" The ubiquity and value of tables as semi-structured data across various\ndomains necessitate advanced methods for understanding their complexity and\nvast amounts of information. Despite the impressive capabilities of large\nlanguage models (LLMs) in advancing the natural language understanding\nfrontier, their application to large-scale tabular data presents significant\nchallenges, specifically regarding table size and complex intricate\nrelationships. Existing works have shown promise with small-scale tables but\noften flounder when tasked with the complex reasoning required by larger,\ninterconnected tables found in real-world scenarios. To address this gap, we\nintroduce \"Tree-of-Table\", a novel approach designed to enhance LLMs' reasoning\ncapabilities over large and complex tables. Our method employs Table\nCondensation and Decomposition to distill and reorganize relevant data into a\nmanageable format, followed by the construction of a hierarchical Table-Tree\nthat facilitates tree-structured reasoning. Through a meticulous Table-Tree\nExecution process, we systematically unravel the tree-structured reasoning\nchain to derive the solutions. 
Experiments across diverse datasets, including\nWikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new\nbenchmark with superior performance, showcasing remarkable efficiency and\ngeneralization capabilities in large-scale table reasoning.\n","authors":["Deyi Ji","Lanyun Zhu","Siqi Gao","Peng Xu","Hongtao Lu","Jieping Ye","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13929v4","updated":"2024-11-13T10:57:21Z","published":"2024-05-22T18:58:58Z","title":"Vikhr: Constructing a State-of-the-art Bilingual Open-Source\n Instruction-Following Large Language Model for Russian","summary":" There has been a surge in developing various Large Language Models (LLMs).\nHowever, text generation for languages other than English often faces\nsignificant challenges, including poor generation quality and reduced\ncomputational performance due to the disproportionate representation of tokens\nin the model's vocabulary. In this work, we address these issues by developing\na pipeline for adapting English-oriented pre-trained models to other languages\nand constructing efficient bilingual LLMs. Using this pipeline, we construct\nVikhr, a state-of-the-art bilingual open-source instruction-following LLM\ndesigned specifically for the Russian language. \"Vikhr\" refers to the name of\nthe Mistral LLM series and means a \"strong gust of wind.\" Unlike previous\nRussian-language models that typically rely on LoRA adapters on top of\nEnglish-oriented models, sacrificing performance for lower training costs,\nVikhr features an adapted tokenizer vocabulary and undergoes continued\npre-training and instruction tuning of all weights. This not only enhances the\nmodel's performance but also significantly improves its computational and\ncontextual efficiency. 
The remarkable performance of Vikhr across various\nRussian-language benchmarks can also be attributed to our efforts in expanding\ninstruction datasets and corpora for continued pre-training. Vikhr not only\nsets a new state of the art among open-source LLMs for Russian but even\noutperforms some proprietary closed-source models on certain benchmarks. The\nmodel weights, instruction sets, and code are publicly available.\n","authors":["Aleksandr Nikolich","Konstantin Korolev","Sergei Bratchikov","Igor Kiselev","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2405.13929v4.pdf","comment":"Accepted at WMRL @ EMNLP-2024"},{"id":"http://arxiv.org/abs/2411.08506v1","updated":"2024-11-13T10:43:31Z","published":"2024-11-13T10:43:31Z","title":"An Information Theoretic Approach to Operationalize Right to Data\n Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. 
Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v1.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2411.08504v1","updated":"2024-11-13T10:42:11Z","published":"2024-11-13T10:42:11Z","title":"Towards Objective and Unbiased Decision Assessments with LLM-Enhanced\n Hierarchical Attention Networks","summary":" How objective and unbiased are we while making decisions? This work\ninvestigates cognitive bias identification in high-stake decision making\nprocess by human experts, questioning its effectiveness in real-world settings,\nsuch as candidates assessments for university admission. We begin with a\nstatistical analysis assessing correlations among different decision points\namong in the current process, which discovers discrepancies that imply\ncognitive bias and inconsistency in decisions. This motivates our exploration\nof bias-aware AI-augmented workflow that surpass human judgment. We propose\nBGM-HAN, a hierarchical attention network enhanced by byte-pair encoding,\nmulti-head attention and gated residual connection. Using it as backbone model,\nwe further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which\nsimulate real-world decision-making. 
In our experiments, both the proposed\nmodel and the agentic workflow significantly improves on both human judgment\nand alternative models, validated with real-world data.\n","authors":["Junhua Liu","Kwan Hui Lim","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08449v1","updated":"2024-11-13T09:11:56Z","published":"2024-11-13T09:11:56Z","title":"Towards Evaluating Large Language Models for Graph Query Generation","summary":" Large Language Models (LLMs) are revolutionizing the landscape of Generative\nArtificial Intelligence (GenAI), with innovative LLM-backed solutions emerging\nrapidly. However, when applied to database technologies, specifically query\ngeneration for graph databases and Knowledge Graphs (KGs), LLMs still face\nsignificant challenges. While research on LLM-driven query generation for\nStructured Query Language (SQL) exists, similar systems for graph databases\nremain underdeveloped. This paper presents a comparative study addressing the\nchallenge of generating Cypher queries a powerful language for interacting with\ngraph databases using open-access LLMs. We rigorously evaluate several LLM\nagents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a\nlocally deployed Llama 3.1 8B) using a designed few-shot learning prompt and\nRetrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT)\nreasoning. 
Our empirical analysis of query generation accuracy reveals that\nClaude Sonnet 3.5 outperforms its counterparts in this specific domain.\nFurther, we highlight promising future research directions to address the\nidentified limitations and advance LLM-driven query generation for graph\ndatabases.\n","authors":["Siraj Munir","Alessandro Aldini"],"pdf_url":"https://arxiv.org/pdf/2411.08449v1.pdf","comment":"Paper accepted and will be presented at CSCI2024 in December 2024,\n Later will be published at Springer LNCS"},{"id":"http://arxiv.org/abs/2404.17808v3","updated":"2024-11-13T08:51:04Z","published":"2024-04-27T07:12:07Z","title":"Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models\n with Simple and Effective Scaffold Token Removal","summary":" Byte Pair Encoding (BPE) serves as a foundation method for text tokenization\nin the Natural Language Processing (NLP) field. Despite its wide adoption, the\noriginal BPE algorithm harbors an inherent flaw: it inadvertently introduces a\nfrequency imbalance for tokens in the text corpus. Since BPE iteratively merges\nthe most frequent token pair in the text corpus to generate a new token and\nkeeps all generated tokens in the vocabulary, it unavoidably holds tokens that\nprimarily act as components of a longer token and appear infrequently on their\nown. We term such tokens as Scaffold Tokens. Due to their infrequent\noccurrences in the text corpus, Scaffold Tokens pose a learning imbalance\nissue. To address that issue, we propose Scaffold-BPE, which incorporates a\ndynamic scaffold token removal mechanism by parameter-free, computation-light,\nand easy-to-implement modifications to the original BPE method. This novel\napproach ensures the exclusion of low-frequency Scaffold Tokens from the token\nrepresentations for given texts, thereby mitigating the issue of frequency\nimbalance and facilitating model training. 
On extensive experiments across\nlanguage modeling and even machine translation, Scaffold-BPE consistently\noutperforms the original BPE, well demonstrating its effectiveness.\n","authors":["Haoran Lian","Yizhe Xiong","Jianwei Niu","Shasha Mo","Zhenpeng Su","Zijia Lin","Hui Chen","Peng Liu","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2404.17808v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08432v1","updated":"2024-11-13T08:32:42Z","published":"2024-11-13T08:32:42Z","title":"One STEP at a time: Language Agents are Stepwise Planners","summary":" Language agents have shown promising adaptability in dynamic environments to\nperform complex tasks. However, despite the versatile knowledge embedded in\nlarge language models, these agents still fall short when it comes to tasks\nthat require planning. We introduce STEP, a novel framework designed to\nefficiently learn from previous experiences to enhance the planning\ncapabilities of language agents in future steps. Concretely, STEP functions\nthrough four interconnected components. First, the Planner takes on the task,\nbreaks it down into subtasks and provides relevant insights. Then the Executor\ngenerates action candidates, while the Evaluator ensures the actions align with\nlearned rules from previous experiences. Lastly, Memory stores experiences to\ninform future decisions. In the ScienceWorld benchmark, our results show that\nSTEP consistently outperforms state-of-the-art models, achieving an overall\nscore of 67.4 and successfully completing 12 out of 18 tasks. 
These findings\nhighlight STEP's potential as a framework for enhancing planning capabilities\nin language agents, paving the way for more sophisticated task-solving in\ndynamic environments.\n","authors":["Minh Nguyen","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2411.08432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08397v1","updated":"2024-11-13T07:32:58Z","published":"2024-11-13T07:32:58Z","title":"CLaSP: Learning Concepts for Time-Series Signals from Natural Language\n Supervision","summary":" This paper proposes a foundation model called \"CLaSP\" that can search time\nseries signals using natural language that describes the characteristics of the\nsignals as queries. Previous efforts to represent time series signal data in\nnatural language have had challenges in designing a conventional class of time\nseries signal characteristics, formulating their quantification, and creating a\ndictionary of synonyms. To overcome these limitations, the proposed method\nintroduces a neural network based on contrastive learning. This network is\nfirst trained using the datasets TRUCE and SUSHI, which consist of time series\nsignals and their corresponding natural language descriptions. Previous studies\nhave proposed vocabularies that data analysts use to describe signal\ncharacteristics, and SUSHI was designed to cover these terms. We believe that a\nneural network trained on these datasets will enable data analysts to search\nusing natural language vocabulary. Furthermore, our method does not require a\ndictionary of predefined synonyms, and it leverages common sense knowledge\nembedded in a large-scale language model (LLM). 
Experimental results\ndemonstrate that CLaSP enables natural language search of time series signal\ndata and can accurately learn the points at which signal data changes.\n","authors":["Aoi Ito","Kota Dohi","Yohei Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2411.08397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02288v2","updated":"2024-11-13T07:13:36Z","published":"2024-08-05T07:54:01Z","title":"Spin glass model of in-context learning","summary":" Large language models show a surprising in-context learning ability -- being\nable to use a prompt to form a prediction for a query, yet without additional\ntraining, in stark contrast to old-fashioned supervised learning. Providing a\nmechanistic interpretation and linking the empirical phenomenon to physics are\nthus challenging and remain unsolved. We study a simple yet expressive\ntransformer with linear attention and map this structure to a spin glass model\nwith real-valued spins, where the couplings and fields explain the intrinsic\ndisorder in data. The spin glass model explains how the weight parameters\ninteract with each other during pre-training, and further clarifies why an\nunseen function can be predicted by providing only a prompt yet without further\ntraining. Our theory reveals that for single-instance learning, increasing the\ntask diversity leads to the emergence of in-context learning, by allowing the\nBoltzmann distribution to converge to a unique correct solution of weight\nparameters. Therefore the pre-trained transformer displays a prediction power\nin a novel prompt setting. 
The proposed analytically tractable model thus\noffers a promising avenue for thinking about how to interpret many intriguing\nbut puzzling properties of large language models.\n","authors":["Yuhao Li","Ruoran Bai","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2408.02288v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08384v1","updated":"2024-11-13T07:10:18Z","published":"2024-11-13T07:10:18Z","title":"Interpretable Syntactic Representations Enable Hierarchical Word Vectors","summary":" The distributed representations currently used are dense and uninterpretable,\nleading to interpretations that themselves are relative, overcomplete, and hard\nto interpret. We propose a method that transforms these word vectors into\nreduced syntactic representations. The resulting representations are compact\nand interpretable allowing better visualization and comparison of the word\nvectors and we successively demonstrate that the drawn interpretations are in\nline with human judgment. The syntactic representations are then used to create\nhierarchical word vectors using an incremental learning approach similar to the\nhierarchical aspect of human learning. As these representations are drawn from\npre-trained vectors, the generation process and learning approach are\ncomputationally efficient. 
Most importantly, we find out that syntactic\nrepresentations provide a plausible interpretation of the vectors and\nsubsequent hierarchical vectors outperform the original vectors in benchmark\ntests.\n","authors":["Biraj Silwal"],"pdf_url":"https://arxiv.org/pdf/2411.08384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07820v2","updated":"2024-11-13T05:43:58Z","published":"2024-11-12T14:12:45Z","title":"Query Optimization for Parametric Knowledge Refinement in\n Retrieval-Augmented Large Language Models","summary":" We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel\napproach designed to bridge the pre-retrieval information gap in\nRetrieval-Augmented Generation (RAG) systems through query optimization\ntailored to meet the specific knowledge requirements of Large Language Models\n(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR\nframework begins by extracting parametric knowledge from LLMs, followed by\nusing a specialized query optimizer for refining these queries. This process\nensures the retrieval of only the most pertinent information essential for\ngenerating accurate responses. Moreover, to enhance flexibility and reduce\ncomputational costs, we propose a trainable scheme for our pipeline that\nutilizes a smaller, tunable model as the query optimizer, which is refined\nthrough knowledge distillation from a larger teacher model. 
Our evaluations on\nvarious question-answering (QA) datasets and with different retrieval systems\nshow that ERRR consistently outperforms existing baselines, proving to be a\nversatile and cost-effective module for improving the utility and accuracy of\nRAG systems.\n","authors":["Youan Cong","Cheng Wang","Pritom Saha Akash","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2411.07820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08348v1","updated":"2024-11-13T05:40:24Z","published":"2024-11-13T05:40:24Z","title":"Refining Translations with LLMs: A Constraint-Aware Iterative Prompting\n Approach","summary":" Large language models (LLMs) have demonstrated remarkable proficiency in\nmachine translation (MT), even without specific training on the languages in\nquestion. However, translating rare words in low-resource or domain-specific\ncontexts remains challenging for LLMs. To address this issue, we propose a\nmulti-step prompt chain that enhances translation faithfulness by prioritizing\nkey terms crucial for semantic accuracy. Our method first identifies these\nkeywords and retrieves their translations from a bilingual dictionary,\nintegrating them into the LLM's context using Retrieval-Augmented Generation\n(RAG). We further mitigate potential output hallucinations caused by long\nprompts through an iterative self-checking mechanism, where the LLM refines its\ntranslations based on lexical and semantic constraints. 
Experiments using Llama\nand Qwen as base models on the FLORES-200 and WMT datasets demonstrate\nsignificant improvements over baselines, highlighting the effectiveness of our\napproach in enhancing translation faithfulness and robustness, particularly in\nlow-resource scenarios.\n","authors":["Shangfeng Chen","Xiayang Shi","Pu Li","Yinlin Li","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08347v1","updated":"2024-11-13T05:38:55Z","published":"2024-11-13T05:38:55Z","title":"A Chinese Multi-label Affective Computing Dataset Based on Social Media\n Network Users","summary":" Emotion and personality are central elements in understanding human\npsychological states. Emotions reflect an individual subjective experiences,\nwhile personality reveals relatively stable behavioral and cognitive patterns.\nExisting affective computing datasets often annotate emotion and personality\ntraits separately, lacking fine-grained labeling of micro-emotions and emotion\nintensity in both single-label and multi-label classifications. Chinese emotion\ndatasets are extremely scarce, and datasets capturing Chinese user personality\ntraits are even more limited. To address these gaps, this study collected data\nfrom the major social media platform Weibo, screening 11,338 valid users from\nover 50,000 individuals with diverse MBTI personality labels and acquiring\n566,900 posts along with the user MBTI personality tags. Using the EQN method,\nwe compiled a multi-label Chinese affective computing dataset that integrates\nthe same user's personality traits with six emotions and micro-emotions, each\nannotated with intensity levels. Validation results across multiple NLP\nclassification models demonstrate the dataset strong utility. 
This dataset is\ndesigned to advance machine recognition of complex human emotions and provide\ndata support for research in psychology, education, marketing, finance, and\npolitics.\n","authors":["Jingyi Zhou","Senlin Luo","Haofan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08344v1","updated":"2024-11-13T05:22:45Z","published":"2024-11-13T05:22:45Z","title":"Bangla Grammatical Error Detection Leveraging Transformer-based Token\n Classification","summary":" Bangla is the seventh most spoken language by a total number of speakers in\nthe world, and yet the development of an automated grammar checker in this\nlanguage is an understudied problem. Bangla grammatical error detection is a\ntask of detecting sub-strings of a Bangla text that contain grammatical,\npunctuation, or spelling errors, which is crucial for developing an automated\nBangla typing assistant. Our approach involves breaking down the task as a\ntoken classification problem and utilizing state-of-the-art transformer-based\nmodels. Finally, we combine the output of these models and apply rule-based\npost-processing to generate a more reliable and comprehensive result. Our\nsystem is evaluated on a dataset consisting of over 25,000 texts from various\nsources. Our best model achieves a Levenshtein distance score of 1.04. Finally,\nwe provide a detailed analysis of different components of our system.\n","authors":["Shayekh Bin Islam","Ridwanul Hasan Tanvir","Sihat Afnan"],"pdf_url":"https://arxiv.org/pdf/2411.08344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17439v3","updated":"2024-11-13T04:57:08Z","published":"2024-10-22T21:30:58Z","title":"Evaluating AI-Generated Essays with GRE Analytical Writing Assessment","summary":" The recent revolutionary advance in generative AI enables the generation of\nrealistic and coherent texts by large language models (LLMs). 
Despite many\nexisting evaluation metrics on the quality of the generated texts, there is\nstill a lack of rigorous assessment of how well LLMs perform in complex and\ndemanding writing assessments. This study examines essays generated by ten\nleading LLMs for the analytical writing assessment of the Graduate Record Exam\n(GRE). We assessed these essays using both human raters and the e-rater\nautomated scoring engine as used in the GRE scoring pipeline. Notably, the\ntop-performing Gemini and GPT-4o received an average score of 4.78 and 4.67,\nrespectively, falling between \"generally thoughtful, well-developed analysis of\nthe issue and conveys meaning clearly\" and \"presents a competent analysis of\nthe issue and conveys meaning with acceptable clarity\" according to the GRE\nscoring guideline. We also evaluated the detection accuracy of these essays,\nwith detectors trained on essays generated by the same and different LLMs.\n","authors":["Yang Zhong","Jiangang Hao","Michael Fauss","Chen Li","Yuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17439v3.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.15553v2","updated":"2024-11-13T04:26:13Z","published":"2024-10-21T00:59:47Z","title":"Multi-IF: Benchmarking LLMs on Multi-Turn and Multilingual Instructions\n Following","summary":" Large Language Models (LLMs) have demonstrated impressive capabilities in\nvarious tasks, including instruction following, which is crucial for aligning\nmodel outputs with user expectations. However, evaluating LLMs' ability to\nfollow instructions remains challenging due to the complexity and subjectivity\nof human language. Current benchmarks primarily focus on single-turn,\nmonolingual instructions, which do not adequately reflect the complexities of\nreal-world applications that require handling multi-turn and multilingual\ninteractions. 
To address this gap, we introduce Multi-IF, a new benchmark\ndesigned to assess LLMs' proficiency in following multi-turn and multilingual\ninstructions. Multi-IF, which utilizes a hybrid framework combining LLM and\nhuman annotators, expands upon the IFEval by incorporating multi-turn sequences\nand translating the English prompts into another 7 languages, resulting in a\ndataset of 4,501 multilingual conversations, where each has three turns. Our\nevaluation of 14 state-of-the-art LLMs on Multi-IF reveals that it presents a\nsignificantly more challenging task than existing benchmarks. All the models\ntested showed a higher rate of failure in executing instructions correctly with\neach additional turn. For example, o1-preview drops from 0.877 at the first\nturn to 0.707 at the third turn in terms of average accuracy over all\nlanguages. Moreover, languages with non-Latin scripts (Hindi, Russian, and\nChinese) generally exhibit higher error rates, suggesting potential limitations\nin the models' multilingual capabilities. We release Multi-IF prompts and the\nevaluation code base to encourage further research in this critical area.\n","authors":["Yun He","Di Jin","Chaoqi Wang","Chloe Bi","Karishma Mandyam","Hejia Zhang","Chen Zhu","Ning Li","Tengyu Xu","Hongjiang Lv","Shruti Bhosale","Chenguang Zhu","Karthik Abinav Sankararaman","Eryk Helenowski","Melanie Kambadur","Aditya Tayade","Hao Ma","Han Fang","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.15553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08324v1","updated":"2024-11-13T04:20:20Z","published":"2024-11-13T04:20:20Z","title":"Are LLMs Prescient? A Continuous Evaluation using Daily News as the\n Oracle","summary":" Many existing evaluation benchmarks for Large Language Models (LLMs) quickly\nbecome outdated due to the emergence of new models and training data. 
These\nbenchmarks also fall short in assessing how LLM performance changes over time,\nas they consist of static questions without a temporal dimension. To address\nthese limitations, we propose using future event prediction as a continuous\nevaluation method to assess LLMs' temporal generalization and forecasting\nabilities. Our benchmark, Daily Oracle, automatically generates question-answer\n(QA) pairs from daily news, challenging LLMs to predict \"future\" event\noutcomes. Our findings reveal that as pre-training data becomes outdated, LLM\nperformance degrades over time. While Retrieval Augmented Generation (RAG) has\nthe potential to enhance prediction accuracy, the performance degradation\npattern persists, highlighting the need for continuous model updates.\n","authors":["Hui Dai","Ryan Teehan","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2411.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11216v2","updated":"2024-11-13T04:16:21Z","published":"2024-10-15T03:02:03Z","title":"Experiences from Creating a Benchmark for Sentiment Classification for\n Varieties of English","summary":" Existing benchmarks often fail to account for linguistic diversity, like\nlanguage variants of English. In this paper, we share our experiences from our\nongoing project of building a sentiment classification benchmark for three\nvariants of English: Australian (en-AU), Indian (en-IN), and British (en-UK)\nEnglish. Using Google Places reviews, we explore the effects of various\nsampling techniques based on label semantics, review length, and sentiment\nproportion and report performances on three fine-tuned BERT-based models. Our\ninitial evaluation reveals significant performance variations influenced by\nsample characteristics, label semantics, and language variety, highlighting the\nneed for nuanced benchmark design. 
We offer actionable insights for researchers\nto create robust benchmarks, emphasising the importance of diverse sampling,\ncareful label definition, and comprehensive evaluation across linguistic\nvarieties.\n","authors":["Dipankar Srirag","Jordan Painter","Aditya Joshi","Diptesh Kanojia"],"pdf_url":"https://arxiv.org/pdf/2410.11216v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.07521v2","updated":"2024-11-13T04:03:54Z","published":"2024-11-12T03:37:53Z","title":"Fair Summarization: Bridging Quality and Diversity in Extractive\n Summaries","summary":" Fairness in multi-document summarization of user-generated content remains a\ncritical challenge in natural language processing (NLP). Existing summarization\nmethods often fail to ensure equitable representation across different social\ngroups, leading to biased outputs. In this paper, we introduce two novel\nmethods for fair extractive summarization: FairExtract, a clustering-based\napproach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints.\nWe evaluate these methods using Divsumm summarization dataset of White-aligned,\nHispanic, and African-American dialect tweets and compare them against relevant\nbaselines. The results obtained using a comprehensive set of summarization\nquality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well\nas a fairness metric F, demonstrate that FairExtract and FairGPT achieve\nsuperior fairness while maintaining competitive summarization quality.\nAdditionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that\nintegrate quality and fairness into a single evaluation framework, offering a\nmore nuanced understanding of the trade-offs between these objectives. 
This\nwork highlights the importance of fairness in summarization and sets a\nbenchmark for future research in fairness-aware NLP models.\n","authors":["Sina Bagheri Nezhad","Sayan Bandyapadhyay","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.07521v2.pdf","comment":"Accepted at Algorithmic Fairness through the Lens of Metrics and\n Evaluation Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08302v1","updated":"2024-11-13T02:45:21Z","published":"2024-11-13T02:45:21Z","title":"R3HF: Reward Redistribution for Enhancing Reinforcement Learning from\n Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) provides a paradigm for\naligning large language models (LLMs) with human preferences. This involves the\ninitial training of a reward model based on pairwise human feedback. The reward\nmodel is subsequently utilized in reinforcement learning to assess the scores\nof each generated sentence as a whole, further guiding the optimization of\nLLMs. However, current approaches have a significant shortcoming: \\emph{They\nallocate a single, sparse, and delayed reward to an entire sequence of output}.\nThis may overlook some significant individual contributions of each token\ntowards the desired outcome. To overcome this limitation, our paper proposes a\nnovel reward redistribution method called R3HF, which facilitates a more\nfine-grained, token-level reward allocation. Specifically, our method treats\nthe reward prediction task of the reward model as a regression problem. As a\nresult, the redistributed rewards are computed by evaluating the specific\ncontribution of each token to the reward model's output. This detailed approach\nimproves the model's understanding of language nuances, leading to more precise\nenhancements in its performance. 
Our method is crafted to integrate seamlessly\nwith most current techniques while incurring minimal computational costs.\nThrough comprehensive experiments across diverse datasets and tasks, we have\nverified the effectiveness and superiority of our approach.\n","authors":["Jiahui Li","Tai-wei Chang","Fengda Zhang","Kun Kuang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. 
The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08278v1","updated":"2024-11-13T01:33:05Z","published":"2024-11-13T01:33:05Z","title":"Knowledge Bases in Support of Large Language Models for Processing Web\n News","summary":" Large Language Models (LLMs) have received considerable interest in wide\napplications lately. During pre-training via massive datasets, such a model\nimplicitly memorizes the factual knowledge of trained datasets in its hidden\nparameters. However, knowledge held implicitly in parameters often makes its\nuse by downstream applications ineffective due to the lack of common-sense\nreasoning. In this article, we introduce a general framework that permits to\nbuild knowledge bases with an aid of LLMs, tailored for processing Web news.\nThe framework applies a rule-based News Information Extractor (NewsIE) to news\nitems for extracting their relational tuples, referred to as knowledge bases,\nwhich are then graph-convoluted with the implicit knowledge facts of news items\nobtained by LLMs, for their classification. It involves two lightweight\ncomponents: 1) NewsIE: for extracting the structural information of every news\nitem, in the form of relational tuples; 2) BERTGraph: for graph convoluting the\nimplicit knowledge facts with relational tuples extracted by NewsIE. 
We have\nevaluated our framework under different news-related datasets for news category\nclassification, with promising experimental results.\n","authors":["Yihe Zhang","Nabin Pakka","Nian-feng Tzeng"],"pdf_url":"https://arxiv.org/pdf/2411.08278v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.08275v1","updated":"2024-11-13T01:12:35Z","published":"2024-11-13T01:12:35Z","title":"A Large-Scale Study of Relevance Assessments with Large Language Models:\n An Initial Look","summary":" The application of large language models to provide relevance assessments\npresents exciting opportunities to advance information retrieval, natural\nlanguage processing, and beyond, but to date many unknowns remain. This paper\nreports on the results of a large-scale evaluation (the TREC 2024 RAG Track)\nwhere four different relevance assessment approaches were deployed in situ: the\n\"standard\" fully manual process that NIST has implemented for decades and three\ndifferent alternatives that take advantage of LLMs to different extents using\nthe open-source UMBRELA tool. This setup allows us to correlate system rankings\ninduced by the different approaches to characterize tradeoffs between cost and\nquality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system\nrankings induced by automatically generated relevance assessments from UMBRELA\ncorrelate highly with those induced by fully manual assessments across a\ndiverse set of 77 runs from 19 teams. Our results suggest that automatically\ngenerated UMBRELA judgments can replace fully manual judgments to accurately\ncapture run-level effectiveness. Surprisingly, we find that LLM assistance does\nnot appear to increase correlation with fully manual assessments, suggesting\nthat costs associated with human-in-the-loop processes do not bring obvious\ntangible benefits. Overall, human assessors appear to be stricter than UMBRELA\nin applying relevance criteria. 
Our work validates the use of LLMs in academic\nTREC-style evaluations and provides the foundation for future studies.\n","authors":["Shivani Upadhyay","Ronak Pradeep","Nandan Thakur","Daniel Campos","Nick Craswell","Ian Soboroff","Hoa Trang Dang","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.08275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18164v2","updated":"2024-11-13T00:15:46Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. 
We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Constantin Adam","Abdulhamid Adebayo","Sungeun An","Yuan Chi Chang","Xuan-Hong Dang","Nirmit Desai","Michele Dolfi","Hajar Emami-Gohari","Revital Eres","Takuya Goto","Dhiraj Joshi","Yan Koyfman","Mohammad Nassar","Hima Patel","Paramesvaran Selvam","Yousaf Shah","Saptha Surendran","Daiki Tsuzuku","Petros Zerfos","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.07870v2","updated":"2024-11-13T23:02:41Z","published":"2024-11-12T15:26:17Z","title":"Trustful LLMs: Customizing and Grounding Text Generation with Knowledge\n Bases and Dual Decoders","summary":" Although people are impressed by the content generation skills of large\nlanguage models, the use of LLMs, such as ChatGPT, is limited by the domain\ngrounding of the content. The correctness and groundedness of the generated\ncontent need to be based on a verified context, such as results from\nRetrieval-Augmented Generation (RAG). One important issue when adapting LLMs to\na customized domain is that the generated responses are often incomplete, or\nthe additions are not verified and may even be hallucinated. Prior studies on\nhallucination detection have focused on evaluation metrics, which are not\neasily adaptable to dynamic domains and can be vulnerable to attacks like\njail-breaking. 
In this work, we propose 1) a post-processing algorithm that\nleverages knowledge triplets in RAG context to correct hallucinations and 2) a\ndual-decoder model that fuses RAG context to guide the generation process.\n","authors":["Xiaofeng Zhu","Jaya Krishna Mandivarapu"],"pdf_url":"https://arxiv.org/pdf/2411.07870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09073v1","updated":"2024-11-13T22:56:00Z","published":"2024-11-13T22:56:00Z","title":"Code-mixed LLM: Improve Large Language Models' Capability to Handle\n Code-Mixing through Reinforcement Learning from AI Feedback","summary":" Code-mixing(CM) or code-switching(CSW) refers to the juxtaposition of\nlinguistic units from two or more languages during the conversation or\nsometimes even a single utterance. Code-mixing introduces unique challenges in\ndaily life, such as syntactic mismatches and semantic blending, that are rarely\nencountered in monolingual settings. Large language models (LLMs) have\nrevolutionized the field of natural language processing (NLP) by offering\nunprecedented capabilities in understanding human languages. However, the\neffectiveness of current state-of-the-art multilingual LLMs has not yet been\nfully explored in the CM scenario. To fill this gap, we first benchmark the\nperformance of multilingual LLMs on various code-mixing NLP tasks. Then we\npropose to improve the multilingual LLMs' ability to understand code-mixing\nthrough reinforcement learning from human feedback (RLHF) and code-mixed\nmachine translation tasks. Given the high-cost and time-consuming preference\nlabeling procedure, we improve this by utilizing LLMs as annotators to perform\nthe reinforcement learning from AI feedback (RLAIF). 
The experiments show the\neffectiveness of the proposed method.\n","authors":["Wenbo Zhang","Aditya Majumdar","Amulya Yadav"],"pdf_url":"https://arxiv.org/pdf/2411.09073v1.pdf","comment":"initial version: 5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09018v1","updated":"2024-11-13T20:50:04Z","published":"2024-11-13T20:50:04Z","title":"Bridging the Visual Gap: Fine-Tuning Multimodal Models with\n Knowledge-Adapted Captions","summary":" Recent research increasingly focuses on training vision-language models\n(VLMs) with long, detailed image captions. However, small-scale VLMs often\nstruggle to balance the richness of these captions with the risk of\nhallucinating content during fine-tuning. In this paper, we explore how well\nVLMs adapt to such captions. To quantify caption quality, we propose Decomposed\nNLI (DNLI), an evaluation framework that breaks down generated captions into\nindividual propositions, assessing each in isolation. This fine-grained\nanalysis reveals a critical balance between capturing descriptive details and\npreventing hallucinations. Our findings show that simply reducing caption\ncomplexity or employing standard data curation techniques does not effectively\nresolve this issue. To tackle this challenge, we introduce Knowledge Adapted\n(KnowAda) fine-tuning, a data-centric approach that automatically adapts\ntraining data with the model's existing knowledge and visual understanding.\nKnowAda minimizes hallucinations while preserving high descriptiveness. We\nvalidate this approach across several small-scale VLMs (up to 7B parameters)\nand dense caption datasets, demonstrating that KnowAda effectively balances\nhallucination reduction and descriptiveness. 
Our results show that KnowAda\noutperforms various baselines in both automatic metrics and human evaluations.\nWe will release our code and models.\n","authors":["Moran Yanuka","Assaf Ben Kish","Yonatan Bitton","Idan Szpektor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2411.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09009v1","updated":"2024-11-13T20:30:15Z","published":"2024-11-13T20:30:15Z","title":"Cut Your Losses in Large-Vocabulary Language Models","summary":" As language models grow ever larger, so do their vocabularies. This has\nshifted the memory footprint of LLMs during training disproportionately to one\nsingle layer: the cross-entropy in the loss computation. Cross-entropy builds\nup a logit matrix with entries for each pair of input tokens and vocabulary\nitems and, for small models, consumes an order of magnitude more memory than\nthe rest of the LLM combined. We propose Cut Cross-Entropy (CCE), a method that\ncomputes the cross-entropy loss without materializing the logits for all tokens\ninto global memory. Rather, CCE only computes the logit for the correct token\nand evaluates the log-sum-exp over all logits on the fly. We implement a custom\nkernel that performs the matrix multiplications and the log-sum-exp reduction\nover the vocabulary in flash memory, making global memory consumption for the\ncross-entropy computation negligible. This has a dramatic effect. Taking the\nGemma 2 (2B) model as an example, CCE reduces the memory footprint of the loss\ncomputation from 24 GB to 1 MB, and the total training-time memory consumption\nof the classifier head from 28 GB to 1 GB. To improve the throughput of CCE, we\nleverage the inherent sparsity of softmax and propose to skip elements of the\ngradient computation that have a negligible (i.e., below numerical precision)\ncontribution to the gradient. 
Experiments demonstrate that the dramatic\nreduction in memory consumption is accomplished without sacrificing training\nspeed or convergence.\n","authors":["Erik Wijmans","Brody Huval","Alexander Hertzberg","Vladlen Koltun","Philipp Krähenbühl"],"pdf_url":"https://arxiv.org/pdf/2411.09009v1.pdf","comment":"Code is available at https://github.com/apple/ml-cross-entropy"},{"id":"http://arxiv.org/abs/2411.09003v1","updated":"2024-11-13T20:12:55Z","published":"2024-11-13T20:12:55Z","title":"Refusal in LLMs is an Affine Function","summary":" We propose affine concept editing (ACE) as an approach for steering language\nmodels' behavior by intervening directly in activations. We begin with an\naffine decomposition of model activation vectors and show that prior methods\nfor steering model behavior correspond to subsets of terms of this\ndecomposition. We then provide a derivation of ACE and test it on refusal using\nLlama 3 8B and Hermes Eagle RWKV v5. ACE ultimately combines affine subspace\nprojection and activation addition to reliably control the model's refusal\nresponses across prompt types. We evaluate the results using LLM-based scoring\non a collection of harmful and harmless prompts. Our experiments demonstrate\nthat ACE consistently achieves more precise control over model behavior and\ngeneralizes to models where directional ablation via affine subspace projection\nalone produces incoherent outputs. 
Code for reproducing our results is\navailable at https://github.com/EleutherAI/steering-llama3 .\n","authors":["Thomas Marshall","Adam Scherlis","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2411.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23463v2","updated":"2024-11-13T19:34:22Z","published":"2024-10-30T21:08:07Z","title":"MDCure: A Scalable Pipeline for Multi-Document Instruction-Following","summary":" Multi-document (MD) processing is crucial for LLMs to handle real-world tasks\nsuch as summarization and question-answering across large sets of documents.\nWhile LLMs have improved at processing long inputs, MD contexts still present\nchallenges, such as managing inter-document dependencies, redundancy, and\nincoherent structures. We introduce MDCure, a scalable and effective\nfine-tuning pipeline to enhance the MD capabilities of LLMs without the\ncomputational cost of pre-training or reliance on human annotated data. MDCure\nis based on generation of high-quality synthetic MD instruction data from sets\nof related articles via targeted prompts. We further introduce MDCureRM, a\nmulti-objective reward model which filters generated data based on their\ntraining utility for MD settings. With MDCure, we fine-tune a variety of LLMs,\nfrom the FlanT5, Qwen2, and LLAMA3.1 model families, up to 70B parameters in\nsize. Extensive evaluations on a wide range of MD and long-context benchmarks\nspanning various tasks show MDCure consistently improves performance over\npre-trained baselines and over corresponding base models by up to 75.5%. 
Our\ncode, datasets, and models are available at https://github.com/yale-nlp/MDCure.\n","authors":["Gabrielle Kaili-May Liu","Bowen Shi","Avi Caciularu","Idan Szpektor","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2410.23463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08979v1","updated":"2024-11-13T19:12:02Z","published":"2024-11-13T19:12:02Z","title":"CoCoP: Enhancing Text Classification with LLM through Code Completion\n Prompt","summary":" Text classification is a fundamental task in natural language processing\n(NLP), and large language models (LLMs) have demonstrated their capability to\nperform this task across various domains. However, the performance of LLMs\nheavily depends on the quality of their input prompts. Recent studies have also\nshown that LLMs exhibit remarkable results in code-related tasks. To leverage\nthe capabilities of LLMs in text classification, we propose the Code Completion\nPrompt (CoCoP) method, which transforms the text classification problem into a\ncode completion task. CoCoP significantly improves text classification\nperformance across diverse datasets by utilizing LLMs' code-completion\ncapability. For instance, CoCoP enhances the accuracy of the SST2 dataset by\nmore than 20%. Moreover, when CoCoP integrated with LLMs specifically designed\nfor code-related tasks (code models), such as CodeLLaMA, this method\ndemonstrates better or comparable performance to few-shot learning techniques\nwhile using only one-tenth of the model size. 
The source code of our proposed\nmethod will be available to the public upon the acceptance of the paper.\n","authors":["Mohammad Mahdi Mohajeri","Mohammad Javad Dousti","Majid Nili Ahmadabadi"],"pdf_url":"https://arxiv.org/pdf/2411.08979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08977v1","updated":"2024-11-13T19:08:23Z","published":"2024-11-13T19:08:23Z","title":"Robustness and Confounders in the Demographic Alignment of LLMs with\n Human Perceptions of Offensiveness","summary":" Large language models (LLMs) are known to exhibit demographic biases, yet few\nstudies systematically evaluate these biases across multiple datasets or\naccount for confounding factors. In this work, we examine LLM alignment with\nhuman annotations in five offensive language datasets, comprising approximately\n220K annotations. Our findings reveal that while demographic traits,\nparticularly race, influence alignment, these effects are inconsistent across\ndatasets and often entangled with other factors. 
Confounders -- such as\ndocument difficulty, annotator sensitivity, and within-group agreement --\naccount for more variation in alignment patterns than demographic traits alone.\nSpecifically, alignment increases with higher annotator sensitivity and group\nagreement, while greater document difficulty corresponds to reduced alignment.\nOur results underscore the importance of multi-dataset analyses and\nconfounder-aware methodologies in developing robust measures of demographic\nbias in LLMs.\n","authors":["Shayan Alipour","Indira Sen","Mattia Samory","Tanushree Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.08977v1.pdf","comment":"18 pages, 8 figures, ACL'25"},{"id":"http://arxiv.org/abs/2411.08968v1","updated":"2024-11-13T19:02:36Z","published":"2024-11-13T19:02:36Z","title":"Sparse Upcycling: Inference Inefficient Finetuning","summary":" Small, highly trained, open-source large language models are widely used due\nto their inference efficiency, but further improving their quality remains a\nchallenge. Sparse upcycling is a promising approach that transforms a\npretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing\nthe model's parameter count and quality. In this work, we compare the\neffectiveness of sparse upcycling against continued pretraining (CPT) across\ndifferent model sizes, compute budgets, and pretraining durations. Our\nexperiments show that sparse upcycling can achieve better quality, with\nimprovements of over 20% relative to CPT in certain scenarios. However, this\ncomes with a significant inference cost, leading to 40% slowdowns in\nhigh-demand inference settings for larger models. 
Our findings highlight the\ntrade-off between model quality and inference efficiency, offering insights for\npractitioners seeking to balance model quality and deployment constraints.\n","authors":["Sasha Doubov","Nikhil Sardana","Vitaliy Chiley"],"pdf_url":"https://arxiv.org/pdf/2411.08968v1.pdf","comment":"12 pages, 4 figures, To appear in the 4th NeurIPS Workshop on\n Efficient Natural Language and Speech Processing (ENLSP), 2024"}]},"2024-11-14T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.09658v1","updated":"2024-11-14T18:29:31Z","published":"2024-11-14T18:29:31Z","title":"Motion Before Action: Diffusing Object Motion as Manipulation Condition","summary":" Inferring object motion representations from observations enhances the\nperformance of robotic manipulation tasks. This paper introduces a new paradigm\nfor robot imitation learning that generates action sequences by reasoning about\nobject motion from visual observations. We propose MBA (Motion Before Action),\na novel module that employs two cascaded diffusion processes for object motion\ngeneration and robot action generation under object motion guidance. MBA first\npredicts the future pose sequence of the object based on observations, then\nuses this sequence as a condition to guide robot action generation. Designed as\na plug-and-play component, MBA can be flexibly integrated into existing robotic\nmanipulation policies with diffusion action heads. 
Extensive experiments in\nboth simulated and real-world environments demonstrate that our approach\nsubstantially improves the performance of existing policies across a wide range\nof manipulation tasks.\n","authors":["Yup Su","Xinyu Zhan","Hongjie Fang","Yong-Lu Li","Cewu Lu","Lixin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09643v1","updated":"2024-11-14T18:10:10Z","published":"2024-11-14T18:10:10Z","title":"Modular Fault Diagnosis Framework for Complex Autonomous Driving Systems","summary":" Fault diagnosis is crucial for complex autonomous mobile systems, especially\nfor modern-day autonomous driving (AD). Different actors, numerous use cases,\nand complex heterogeneous components motivate a fault diagnosis of the system\nand overall system integrity. AD systems are composed of many heterogeneous\ncomponents, each with different functionality and possibly using a different\nalgorithm (e.g., rule-based vs. AI components). In addition, these components\nare subject to the vehicle's driving state and are highly dependent. This\npaper, therefore, faces this problem by presenting the concept of a modular\nfault diagnosis framework for AD systems. The concept suggests modular state\nmonitoring and diagnosis elements, together with a state- and dependency-aware\naggregation method. Our proposed classification scheme allows for the\ncategorization of the fault diagnosis modules. The concept is implemented on AD\nshuttle buses and evaluated to demonstrate its capabilities.\n","authors":["Stefan Orf","Sven Ochs","Jens Doll","Albert Schotschneider","Marc Heinrich","Marc René Zofka","J. 
Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2411.09643v1.pdf","comment":"Accepted at 2024 IEEE 20th International Conference on Intelligent\n Computer Communication and Processing (ICCP 2024)"},{"id":"http://arxiv.org/abs/2411.09627v1","updated":"2024-11-14T17:54:43Z","published":"2024-11-14T17:54:43Z","title":"One-Shot Manipulation Strategy Learning by Making Contact Analogies","summary":" We present a novel approach, MAGIC (manipulation analogies for generalizable\nintelligent contacts), for one-shot learning of manipulation strategies with\nfast and extensive generalization to novel objects. By leveraging a reference\naction trajectory, MAGIC effectively identifies similar contact points and\nsequences of actions on novel objects to replicate a demonstrated strategy,\nsuch as using different hooks to retrieve distant objects of different shapes\nand sizes. Our method is based on a two-stage contact-point matching process\nthat combines global shape matching using pretrained neural features with local\ncurvature analysis to ensure precise and physically plausible contact points.\nWe experiment with three tasks including scooping, hanging, and hooking\nobjects. MAGIC demonstrates superior performance over existing methods,\nachieving significant improvements in runtime speed and generalization to\ndifferent object categories. Website: https://magic-2024.github.io/ .\n","authors":["Yuyao Liu","Jiayuan Mao","Joshua Tenenbaum","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2411.09627v1.pdf","comment":"CoRL LEAP Workshop, 2024"},{"id":"http://arxiv.org/abs/2411.09623v1","updated":"2024-11-14T17:47:54Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. 
Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07104v2","updated":"2024-11-14T17:28:37Z","published":"2024-11-11T16:27:25Z","title":"Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal\n Pushing","summary":" Recently, quadrupedal locomotion has achieved significant success, but their\nmanipulation capabilities, particularly in handling large objects, remain\nlimited, restricting their usefulness in demanding real-world applications such\nas search and rescue, construction, industrial automation, and room\norganization. 
This paper tackles the task of obstacle-aware, long-horizon\npushing by multiple quadrupedal robots. We propose a hierarchical multi-agent\nreinforcement learning framework with three levels of control. The high-level\ncontroller integrates an RRT planner and a centralized adaptive policy to\ngenerate subgoals, while the mid-level controller uses a decentralized\ngoal-conditioned policy to guide the robots toward these sub-goals. A\npre-trained low-level locomotion policy executes the movement commands. We\nevaluate our method against several baselines in simulation, demonstrating\nsignificant improvements over baseline approaches, with 36.0% higher success\nrates and 24.5% reduction in completion time than the best baseline. Our\nframework successfully enables long-horizon, obstacle-aware manipulation tasks\nlike Push-Cuboid and Push-T on Go1 robots in the real world.\n","authors":["Yuming Feng","Chuye Hong","Yaru Niu","Shiqi Liu","Yuxiang Yang","Wenhao Yu","Tingnan Zhang","Jie Tan","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.07104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09603v1","updated":"2024-11-14T17:22:11Z","published":"2024-11-14T17:22:11Z","title":"Smart Automation in Luxury Leather Shoe Polishing: A Human Centric\n Robotic Approach","summary":" The polishing of luxury leather shoes is a delicate, labor intensive process\ntraditionally performed by skilled craftsmen. Footwear companies aim to\nautomate parts of this process to enhance quality, productivity, and operator\nwell-being, but the unique nature of luxury shoe production presents\nchallenges. This paper introduces a solution involving a collaborative robotic\ncell to assist in shoe polishing. A collaborative robotic manipulator, equipped\nwith a specialized tool and governed by force control, executes the polishing\ntasks. Key factors such as trajectory design, applied force, polishing speed,\nand polish amount were analyzed. 
Polishing trajectories are designed using CAM\nsoftware and transferred to the robot control system. Human operators design\nthe process, supervise the robot, and perform final finishing, ensuring their\nexpertise is integral to achieving quality. Extensive testing on various shoe\nmodels showed significant improvements in quality and reliability, leading to\nsuccessful implementation on an industrial production line.\n","authors":["Matteo Forlini","Marianna Ciccarelli","Luca Carbonari","Alessandra Papetti","Giacomo Palmieri"],"pdf_url":"https://arxiv.org/pdf/2411.09603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16677v3","updated":"2024-11-14T16:54:02Z","published":"2024-07-23T17:44:54Z","title":"From Imitation to Refinement -- Residual RL for Precise Assembly","summary":" Advances in behavior cloning (BC), like action-chunking and diffusion, have\nenabled impressive capabilities. Still, imitation alone remains insufficient\nfor learning reliable policies for tasks requiring precise aligning and\ninserting of objects, like assembly. Our key insight is that chunked BC\npolicies effectively function as trajectory planners, enabling long-horizon\ntasks. Conversely, as they execute action chunks open-loop, they lack the\nfine-grained reactivity necessary for reliable execution. Further, we find that\nthe performance of BC policies saturates despite increasing data. Reinforcement\nlearning (RL) is a natural way to overcome BC's limitations, but it is not\nstraightforward to apply directly to action-chunked models like diffusion\npolicies. We present a simple yet effective method, ResiP (Residual for Precise\nManipulation), that sidesteps these challenges by augmenting a frozen, chunked\nBC model with a fully closed-loop residual policy trained with RL. 
The residual\npolicy is trained via on-policy RL, addressing distribution shifts and\nintroducing reactive control without altering the BC trajectory planner.\nEvaluation on high-precision manipulation tasks demonstrates strong performance\nof ResiP over BC methods and direct RL fine-tuning. Videos, code, and data are\navailable at https://residual-assembly.github.io.\n","authors":["Lars Ankile","Anthony Simeonov","Idan Shenfeld","Marcel Torne","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2407.16677v3.pdf","comment":"Project website: https://residual-assembly.github.io"},{"id":"http://arxiv.org/abs/2406.01767v3","updated":"2024-11-14T16:46:17Z","published":"2024-06-03T20:16:35Z","title":"Region-aware Grasp Framework with Normalized Grasp Space for Efficient\n 6-DoF Grasping","summary":" A series of region-based methods succeed in extracting regional features and\nenhancing grasp detection quality. However, faced with a cluttered scene with\npotential collision, the definition of the grasp-relevant region stays\ninconsistent, and the relationship between grasps and regional spaces remains\nincompletely investigated. In this paper, we propose Normalized Grasp Space\n(NGS) from a novel region-aware viewpoint, unifying the grasp representation\nwithin a normalized regional space and benefiting the generalizability of\nmethods. Leveraging the NGS, we find that CNNs are underestimated for 3D\nfeature extraction and 6-DoF grasp detection in clutter scenes and build a\nhighly efficient Region-aware Normalized Grasp Network (RNGNet). Experiments on\nthe public benchmark show that our method achieves significant >20% performance\ngains while attaining a real-time inference speed of approximately 50 FPS.\nReal-world cluttered scene clearance experiments underscore the effectiveness\nof our method. 
Further, human-to-robot handover and dynamic object grasping\nexperiments demonstrate the potential of our proposed method for closed-loop\ngrasping in dynamic scenarios.\n","authors":["Siang Chen","Pengwei Xie","Wei Tang","Dingchang Hu","Yixiang Dai","Guijin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.01767v3.pdf","comment":"Accepted by CoRL2024, final camera-ready version will be updated soon"},{"id":"http://arxiv.org/abs/2410.22980v2","updated":"2024-11-14T16:40:00Z","published":"2024-10-30T12:45:12Z","title":"Efficient End-to-End 6-Dof Grasp Detection Framework for Edge Devices\n with Hierarchical Heatmaps and Feature Propagation","summary":" 6-DoF grasp detection is critically important for the advancement of\nintelligent embodied systems, as it provides feasible robot poses for object\ngrasping. Various methods have been proposed to detect 6-DoF grasps through the\nextraction of 3D geometric features from RGBD or point cloud data. However,\nmost of these approaches encounter challenges during real robot deployment due\nto their significant computational demands, which can be particularly\nproblematic for mobile robot platforms, especially those reliant on edge\ncomputing devices. This paper presents an Efficient End-to-End Grasp Detection\nNetwork (E3GNet) for 6-DoF grasp detection utilizing hierarchical heatmap\nrepresentations. E3GNet effectively identifies high-quality and diverse grasps\nin cluttered real-world environments. Benefiting from our end-to-end\nmethodology and efficient network design, our approach surpasses previous\nmethods in model inference efficiency and achieves real-time 6-Dof grasp\ndetection on edge devices. 
Furthermore, real-world experiments validate the\neffectiveness of our method, achieving a satisfactory 94% object grasping\nsuccess rate.\n","authors":["Kaiqin Yang","Yixiang Dai","Guijin Wang","Siang Chen"],"pdf_url":"https://arxiv.org/pdf/2410.22980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06542v2","updated":"2024-11-14T16:22:51Z","published":"2024-11-10T17:48:26Z","title":"Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing\n Contact-Rich Plans?","summary":" Designing planners and controllers for contact-rich manipulation is extremely\nchallenging as contact violates the smoothness conditions that many\ngradient-based controller synthesis tools assume. Contact smoothing\napproximates a non-smooth system with a smooth one, allowing one to use these\nsynthesis tools more effectively. However, applying classical control synthesis\nmethods to smoothed contact dynamics remains relatively under-explored. This\npaper analyzes the efficacy of linear controller synthesis using differential\nsimulators based on contact smoothing. We introduce natural baselines for\nleveraging contact smoothing to compute (a) open-loop plans robust to uncertain\nconditions and/or dynamics, and (b) feedback gains to stabilize around\nopen-loop plans. Using robotic bimanual whole-body manipulation as a testbed,\nwe perform extensive empirical experiments on over 300 trajectories and analyze\nwhy LQR seems insufficient for stabilizing contact-rich plans. The video\nsummarizing this paper and hardware experiments is found here:\nhttps://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9.\n","authors":["Yuki Shirai","Tong Zhao","H. J. 
Terry Suh","Huaijiang Zhu","Xinpei Ni","Jiuguang Wang","Max Simchowitz","Tao Pang"],"pdf_url":"https://arxiv.org/pdf/2411.06542v2.pdf","comment":"Under review for ICRA2025"},{"id":"http://arxiv.org/abs/2411.09565v1","updated":"2024-11-14T16:16:16Z","published":"2024-11-14T16:16:16Z","title":"Vlimb: A Wire-Driven Wearable Robot for Bodily Extension, Balancing\n Powerfulness and Reachability","summary":" Numerous wearable robots have been developed to meet the demands of physical\nassistance and entertainment. These wearable robots range from body-enhancing\ntypes that assist human arms and legs to body-extending types that have extra\narms. This study focuses specifically on wearable robots of the latter\ncategory, aimed at bodily extension. However, they have not yet achieved the\nlevel of powerfulness and reachability equivalent to that of human limbs,\nlimiting their application to entertainment and manipulation tasks involving\nlightweight objects. Therefore, in this study, we develop an body-extending\nwearable robot, Vlimb, which has enough powerfulness to lift a human and can\nperform manipulation. Leveraging the advantages of tendon-driven mechanisms,\nVlimb incorporates a wire routing mechanism capable of accommodating both\ndelicate manipulations and robust lifting tasks. Moreover, by introducing a\npassive ring structure to overcome the limited reachability inherent in\ntendon-driven mechanisms, Vlimb achieves both the powerfulness and reachability\ncomparable to that of humans. 
This paper outlines the design methodology of\nVlimb, conducts preliminary manipulation and lifting tasks, and verifies its\neffectiveness.\n","authors":["Shogo Sawaguchi","Temma Suzuki","Akihiro Miki","Kento Kawaharazuka","Sota Yuzaki","Shunnosuke Yoshimura","Yoshimoto Ribayashi","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2411.09565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16713v4","updated":"2024-11-14T16:12:56Z","published":"2024-06-24T15:15:25Z","title":"ShanghaiTech Mapping Robot is All You Need: Robot System for Collecting\n Universal Ground Vehicle Datasets","summary":" This paper presents the ShanghaiTech Mapping Robot, a state-of-the-art\nunmanned ground vehicle (UGV) designed for collecting comprehensive\nmulti-sensor datasets to support research in robotics, Simultaneous\nLocalization and Mapping (SLAM), computer vision, and autonomous driving. The\nrobot is equipped with a wide array of sensors including RGB cameras, RGB-D\ncameras, event-based cameras, IR cameras, LiDARs, mmWave radars, IMUs,\nultrasonic range finders, and a GNSS RTK receiver. The sensor suite is\nintegrated onto a specially designed mechanical structure with a centralized\npower system and a synchronization mechanism to ensure spatial and temporal\nalignment of the sensor data. A 16-node on-board computing cluster handles\nsensor control, data collection, and storage. We describe the hardware and\nsoftware architecture of the robot in detail and discuss the calibration\nprocedures for the various sensors and investigate the interference for LiDAR\nand RGB-D sensors. 
The capabilities of the platform are demonstrated through an\nextensive outdoor dataset collected in a diverse campus environment.\nExperiments with two LiDAR-based and two RGB-based SLAM approaches showcase the\npotential of the dataset to support development and benchmarking for robotics.\nTo facilitate research, we make the dataset publicly available along with the\nassociated robot sensor calibration data:\nhttps://slam-hive.net/wiki/ShanghaiTech_Datasets\n","authors":["Bowen Xu","Xiting Zhao","Delin Feng","Yuanyuan Yang","Sören Schwertfeger"],"pdf_url":"https://arxiv.org/pdf/2406.16713v4.pdf","comment":"19 pages, 27 figures. Submitted to IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2411.09524v1","updated":"2024-11-14T15:40:16Z","published":"2024-11-14T15:40:16Z","title":"FlowNav: Learning Efficient Navigation Policies via Conditional Flow\n Matching","summary":" Effective robot navigation in dynamic environments is a challenging task that\ndepends on generating precise control actions at high frequencies. Recent\nadvancements have framed navigation as a goal-conditioned control problem.\nCurrent state-of-the-art methods for goal-based navigation, such as diffusion\npolicies, either generate sub-goal images or robot control actions to guide\nrobots. However, despite their high accuracy, these methods incur substantial\ncomputational costs, which limits their practicality for real-time\napplications. Recently, Conditional Flow Matching(CFM) has emerged as a more\nefficient and robust generalization of diffusion. In this work we explore the\nuse of CFM to learn action policies that help the robot navigate its\nenvironment. Our results demonstrate that CFM is able to generate highly\naccurate robot actions. CFM not only matches the accuracy of diffusion policies\nbut also significantly improves runtime performance. 
This makes it particularly\nadvantageous for real-time robot navigation, where swift, reliable action\ngeneration is vital for collision avoidance and smooth operation. By leveraging\nCFM, we provide a pathway to more scalable, responsive robot navigation systems\ncapable of handling the demands of dynamic and unpredictable environments.\n","authors":["Samiran Gode","Abhijeet Nayak","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2411.09524v1.pdf","comment":"Accepted at CoRL 2024 workshop on Learning Effective Abstractions for\n Planning (LEAP) and workshop on Differentiable Optimization Everywhere:\n Simulation, Estimation, Learning, and Control. 7 pages + 2 pages of\n references, 7 figures"},{"id":"http://arxiv.org/abs/2410.01440v3","updated":"2024-11-14T15:04:33Z","published":"2024-10-02T11:42:49Z","title":"Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence\n Modeling","summary":" In the endeavor to make autonomous robots take actions, task planning is a\nmajor challenge that requires translating high-level task descriptions into\nlong-horizon action sequences. Despite recent advances in language model\nagents, they remain prone to planning errors and limited in their ability to\nplan ahead. To address these limitations in robotic planning, we advocate a\nself-refining scheme that iteratively refines a draft plan until an equilibrium\nis reached. Remarkably, this process can be optimized end-to-end from an\nanalytical perspective without the need to curate additional verifiers or\nreward models, allowing us to train self-refining planners in a simple\nsupervised learning fashion. Meanwhile, a nested equilibrium sequence modeling\nprocedure is devised for efficient closed-loop planning that incorporates\nuseful feedback from the environment (or an internal world model). Our method\nis evaluated on the VirtualHome-Env benchmark, showing advanced performance\nwith better scaling for inference computation. 
Code is available at\nhttps://github.com/Singularity0104/equilibrium-planner.\n","authors":["Jinghan Li","Zhicheng Sun","Fei Li","Cao Sheng","Jiazhong Yu","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2410.01440v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09493v1","updated":"2024-11-14T15:00:14Z","published":"2024-11-14T15:00:14Z","title":"Strategic Sacrifice: Self-Organized Robot Swarm Localization for\n Inspection Productivity","summary":" Robot swarms offer significant potential for inspecting diverse\ninfrastructure, ranging from bridges to space stations. However, effective\ninspection requires accurate robot localization, which demands substantial\ncomputational resources and limits productivity. Inspired by biological\nsystems, we introduce a novel cooperative localization mechanism that minimizes\ncollective computation expenditure through self-organized sacrifice. Here, a\nfew agents bear the computational burden of localization; through local\ninteractions, they improve the inspection productivity of the swarm. Our\napproach adaptively maximizes inspection productivity for unconstrained\ntrajectories in dynamic interaction and environmental settings. 
We demonstrate\nthe optimality and robustness using mean-field analytical models, multi-agent\nsimulations, and hardware experiments with metal climbing robots inspecting a\n3D cylinder.\n","authors":["Sneha Ramshanker","Hungtang Ko","Radhika Nagpal"],"pdf_url":"https://arxiv.org/pdf/2411.09493v1.pdf","comment":"14 pages, 10 figures, 17th International Symposium on Distributed\n Autonomous Robotic Systems (DARS'24)"},{"id":"http://arxiv.org/abs/2409.01083v2","updated":"2024-11-14T14:52:54Z","published":"2024-09-02T09:11:28Z","title":"Affordance-based Robot Manipulation with Flow Matching","summary":" We present a framework for assistive robot manipulation, which focuses on two\nfundamental challenges: first, efficiently adapting large-scale models to\ndownstream scene affordance understanding tasks, especially in daily living\nscenarios where gathering multi-task data involving humans requires strenuous\neffort; second, effectively learning robot trajectories by grounding the visual\naffordance model. We tackle the first challenge by employing a\nparameter-efficient prompt tuning method that prepends learnable text prompts\nto the frozen vision model to predict manipulation affordances in multi-task\nscenarios. Then we propose to learn robot trajectories guided by affordances in\na supervised Flow Matching method. Flow matching represents a robot visuomotor\npolicy as a conditional process of flowing random waypoints to desired robot\ntrajectories. Finally, we introduce a real-world dataset with 10 tasks across\nActivities of Daily Living to test our framework. Our extensive evaluation\nhighlights that the proposed prompt tuning method for learning manipulation\naffordance with language prompter achieves competitive performance and even\noutperforms other finetuning protocols across data scales, while satisfying\nparameter efficiency. 
Learning multi-task robot trajectories with flow matching\npolicy also leads to consistently better generalization performance and faster\ninference than alternative behavior cloning methods, especially given\nmultimodal robot action distributions. Our framework seamlessly unifies\naffordance model learning and trajectory generation with flow matching for\nrobot manipulation.\n","authors":["Fan Zhang","Michael Gienger"],"pdf_url":"https://arxiv.org/pdf/2409.01083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09451v1","updated":"2024-11-14T13:56:02Z","published":"2024-11-14T13:56:02Z","title":"DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous\n Vehicle Testing","summary":" Generating realistic and diverse road scenarios is essential for autonomous\nvehicle testing and validation. Nevertheless, owing to the complexity and\nvariability of real-world road environments, creating authentic and varied\nscenarios for intelligent driving testing is challenging. In this paper, we\npropose DiffRoad, a novel diffusion model designed to produce controllable and\nhigh-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities\nof diffusion models to synthesize road layouts from white noise through an\ninverse denoising process, preserving real-world spatial features. To enhance\nthe quality of generated scenarios, we design the Road-UNet architecture,\noptimizing the balance between backbone and skip connections for high-realism\nscenario generation. Furthermore, we introduce a road scenario evaluation\nmodule that screens adequate and reasonable scenarios for intelligent driving\ntesting using two critical metrics: road continuity and road reasonableness.\nExperimental results on multiple real-world datasets demonstrate DiffRoad's\nability to generate realistic and smooth road structures while maintaining the\noriginal distribution. 
Additionally, the generated scenarios can be fully\nautomated into the OpenDRIVE format, facilitating generalized autonomous\nvehicle simulation testing. DiffRoad provides a rich and diverse scenario\nlibrary for large-scale autonomous vehicle testing and offers valuable insights\nfor future infrastructure designs that are better suited for autonomous\nvehicles.\n","authors":["Junjie Zhou","Lin Wang","Qiang Meng","Xiaofan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09451v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.05953v2","updated":"2024-11-14T13:47:06Z","published":"2024-04-09T02:21:23Z","title":"3D Branch Point Cloud Completion for Robotic Pruning in Apple Orchards","summary":" Robotic branch pruning is a significantly growing research area to cope with\nthe shortage of labor force in the context of agriculture. One fundamental\nrequirement in robotic pruning is the perception of detailed geometry and\ntopology of branches. However, the point clouds obtained in agricultural\nsettings often exhibit incompleteness due to several constraints, thereby\nrestricting the accuracy of downstream robotic pruning. In this work, we\naddressed the issue of point cloud quality through a simulation-based deep\nneural network, leveraging a Real-to-Simulation (Real2Sim) data generation\npipeline that not only eliminates the need for manual parameterization but also\nguarantees the realism of simulated data. The simulation-based neural network\nwas applied to jointly perform point cloud completion and skeletonization on\nreal-world partial branches, without additional real-world training. The\nSim2Real qualitative completion and skeletonization results showed the model's\nremarkable capability for geometry reconstruction and topology prediction.\nAdditionally, we quantitatively evaluated the Sim2Real performance by comparing\nbranch-level trait characterization errors using raw incomplete data and\ncomplete data. 
The Mean Absolute Error (MAE) reduced by 75% and 8% for branch\ndiameter and branch angle estimation, respectively, using the best complete\ndata, which indicates the effectiveness of the Real2Sim data in a zero-shot\ngeneralization setting. The characterization improvements contributed to the\nprecision and efficacy of robotic branch pruning.\n","authors":["Tian Qiu","Alan Zoubi","Nikolai Spine","Lailiang Cheng","Yu Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.05953v2.pdf","comment":"Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2411.09441v1","updated":"2024-11-14T13:40:41Z","published":"2024-11-14T13:40:41Z","title":"A ROS~2-based Navigation and Simulation Stack for the Robotino","summary":" The Robotino, developed by Festo Didactic, serves as a versatile platform in\neducation and research for mobile robotics tasks. However, there currently is\nno ROS2 integration for the Robotino available. In this paper, we describe our\nwork on a Webots simulation environment for a Robotino platform extended by\nLIDAR sensors. A ROS2 integration and a pre-configured setup for localization\nand navigation using existing ROS packages from the Nav2 suite are provided. We\nvalidate our setup by comparing simulations with real-world experiments\nconducted by three Robotinos in a logistics environment in our lab.\nAdditionally, we tested the setup using a ROS 2 hardware driver for the\nRobotino developed by team GRIPS of the RoboCup Logistics League. 
The results\ndemonstrate the feasibility of using ROS2 and Nav2 for navigation tasks on the\nRobotino platform showing great consistency between simulation and real-world\nperformance.\n","authors":["Saurabh Borse","Tarik Viehmann","Alexander Ferrein","Gerhard Lakemeyer"],"pdf_url":"https://arxiv.org/pdf/2411.09441v1.pdf","comment":"Published at RoboCup 2024: Robot World Cup XXVII, Springer-Verlag,\n 2024"},{"id":"http://arxiv.org/abs/2411.09436v1","updated":"2024-11-14T13:34:16Z","published":"2024-11-14T13:34:16Z","title":"Robot Tasks with Fuzzy Time Requirements from Natural Language\n Instructions","summary":" Natural language allows robot programming to be accessible to everyone.\nHowever, the inherent fuzziness in natural language poses challenges for\ninflexible, traditional robot systems. We focus on instructions with fuzzy time\nrequirements (e.g., \"start in a few minutes\"). Building on previous robotics\nresearch, we introduce fuzzy skills. These define an execution by the robot\nwith so-called satisfaction functions representing vague execution time\nrequirements. Such functions express a user's satisfaction over potential\nstarting times for skill execution. When the robot handles multiple fuzzy\nskills, the satisfaction function provides a temporal tolerance window for\nexecution, thus, enabling optimal scheduling based on satisfaction. We\ngeneralized such functions based on individual user expectations with a user\nstudy. The participants rated their satisfaction with an instruction's\nexecution at various times. Our investigations reveal that trapezoidal\nfunctions best approximate the users' satisfaction. 
Additionally, the results\nsuggest that users are more lenient if the execution is specified further into\nthe future.\n","authors":["Sascha Sucker","Michael Neubauer","Dominik Henrich"],"pdf_url":"https://arxiv.org/pdf/2411.09436v1.pdf","comment":"9 pages, 8 figures, to be published in 2024 IEEE International\n Conference on Robotic Computing (IRC)"},{"id":"http://arxiv.org/abs/2406.17586v2","updated":"2024-11-14T13:27:45Z","published":"2024-06-25T14:28:21Z","title":"Benchmarking SLAM Algorithms in the Cloud: The SLAM Hive Benchmarking\n Suite","summary":" Evaluating the performance of Simultaneous Localization and Mapping (SLAM)\nalgorithms is essential for scientists and users of robotic systems alike. But\nthere are a multitude of different permutations of possible options of hardware\nsetups and algorithm configurations, as well as different datasets and\nalgorithms, such that it was previously infeasible to thoroughly compare SLAM\nsystems against the full state of the art. To solve that we present the SLAM\nHive Benchmarking Suite, which is able to analyze SLAM algorithms in 1000's of\nmapping runs, through its utilization of container technology and deployment in\nthe cloud. This paper presents the architecture and open source implementation\nof SLAM Hive and compares it to existing efforts on SLAM evaluation. We perform\nmapping runs with popular visual, RGBD and LiDAR based SLAM algorithms against\ncommonly used datasets and show how SLAM Hive can be used to conveniently\nanalyze the results against various aspects. Through this we envision that SLAM\nHive can become an essential tool for proper comparisons and evaluations of\nSLAM algorithms and thus drive the scientific development in the research on\nSLAM. 
The open source software as well as a demo to show the live analysis of\n1000's of mapping runs can be found on our SLAM Hive website.\n","authors":["Xinzhe Liu","Yuanyuan Yang","Bowen Xu","Delin Feng","Sören Schwertfeger"],"pdf_url":"https://arxiv.org/pdf/2406.17586v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.11854"},{"id":"http://arxiv.org/abs/2410.20549v2","updated":"2024-11-14T13:21:27Z","published":"2024-10-27T18:34:49Z","title":"Comparing the Consistency of User Studies Conducted in Simulations and\n Laboratory Settings","summary":" Human-robot collaboration enables highly adaptive co-working. The variety of\nresulting workflows makes it difficult to measure metrics as, e.g. makespans or\nidle times for multiple systems and tasks in a comparable manner. This issue\ncan be addressed with virtual commissioning, where arbitrary numbers of\nnon-deterministic human-robot workflows in assembly tasks can be simulated. To\nthis end, data-driven models of human decisions are needed. Gathering the\nrequired large corpus of data with on-site user studies is quite\ntime-consuming. In comparison, simulation-based studies (e.g., by\ncrowdsourcing) would allow us to access a large pool of study participants with\nless effort. To rely on respective study results, human action sequences\nobserved in a browser-based simulation environment must be shown to match those\ngathered in a laboratory setting. To this end, this work aims to understand to\nwhat extent cooperative assembly work in a simulated environment differs from\nthat in an on-site laboratory setting. We show how a simulation environment can\nbe aligned with a laboratory setting in which a robot and a human perform\npick-and-place tasks together. 
A user study (N=29) indicates that participants'\nassembly decisions and perception of the situation are consistent across these\ndifferent environments.\n","authors":["Jonathan Hümmer","Dominik Riedelbauch","Dominik Henrich"],"pdf_url":"https://arxiv.org/pdf/2410.20549v2.pdf","comment":"Accepted for presentation at 2024 IEEE International Conference on\n Robotic Computing (IRC)"},{"id":"http://arxiv.org/abs/2409.12514v4","updated":"2024-11-14T12:03:37Z","published":"2024-09-19T07:10:18Z","title":"TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for\n Robotic Manipulation","summary":" Vision-Language-Action (VLA) models have shown remarkable potential in\nvisuomotor control and instruction comprehension through end-to-end learning\nprocesses. However, current VLA models face significant challenges: they are\nslow during inference and require extensive pre-training on large amounts of\nrobotic data, making real-world deployment difficult. In this paper, we\nintroduce a new family of compact vision-language-action models, called\nTinyVLA, which offers two key advantages over existing VLA models: (1) faster\ninference speeds, and (2) improved data efficiency, eliminating the need for\npre-training stage. Our framework incorporates two essential components to\nbuild TinyVLA: (1) initializing the policy backbone with robust, high-speed\nmultimodal models, and (2) integrating a diffusion policy decoder during\nfine-tuning to enable precise robot actions. We conducted extensive evaluations\nof TinyVLA in both simulation and on real robots, demonstrating that our\napproach significantly outperforms the state-of-the-art VLA model, OpenVLA, in\nterms of speed and data efficiency, while delivering comparable or superior\nperformance. 
Additionally, TinyVLA exhibits strong generalization capabilities\nacross various dimensions, including language instructions, novel objects,\nunseen positions, changes in object appearance, background variations, and\nenvironmental shifts, often matching or exceeding the performance of OpenVLA.\nWe believe that \\methodname offers an interesting perspective on utilizing\npre-trained multimodal models for policy learning. Our project is at\nhttps://tiny-vla.github.io.\n","authors":["Junjie Wen","Yichen Zhu","Jinming Li","Minjie Zhu","Kun Wu","Zhiyuan Xu","Ning Liu","Ran Cheng","Chaomin Shen","Yaxin Peng","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.12514v4.pdf","comment":"add more citations"},{"id":"http://arxiv.org/abs/2409.14411v2","updated":"2024-11-14T11:59:09Z","published":"2024-09-22T12:14:16Z","title":"Scaling Diffusion Policy in Transformer to 1 Billion Parameters for\n Robotic Manipulation","summary":" Diffusion Policy is a powerful technique tool for learning end-to-end\nvisuomotor robot control. It is expected that Diffusion Policy possesses\nscalability, a key attribute for deep neural networks, typically suggesting\nthat increasing model size would lead to enhanced performance. However, our\nobservations indicate that Diffusion Policy in transformer architecture (\\DP)\nstruggles to scale effectively; even minor additions of layers can deteriorate\ntraining outcomes. To address this issue, we introduce Scalable Diffusion\nTransformer Policy for visuomotor learning. Our proposed method, namely\n\\textbf{\\methodname}, introduces two modules that improve the training dynamic\nof Diffusion Policy and allow the network to better handle multimodal action\ndistribution. First, we identify that \\DP~suffers from large gradient issues,\nmaking the optimization of Diffusion Policy unstable. To resolve this issue, we\nfactorize the feature embedding of observation into multiple affine layers, and\nintegrate it into the transformer blocks. 
Additionally, our utilize non-causal\nattention which allows the policy network to \\enquote{see} future actions\nduring prediction, helping to reduce compounding errors. We demonstrate that\nour proposed method successfully scales the Diffusion Policy from 10 million to\n1 billion parameters. This new model, named \\methodname, can effectively scale\nup the model size with improved performance and generalization. We benchmark\n\\methodname~across 50 different tasks from MetaWorld and find that our largest\n\\methodname~outperforms \\DP~with an average improvement of 21.6\\%. Across 7\nreal-world robot tasks, our ScaleDP demonstrates an average improvement of\n36.25\\% over DP-T on four single-arm tasks and 75\\% on three bimanual tasks. We\nbelieve our work paves the way for scaling up models for visuomotor learning.\nThe project page is available at scaling-diffusion-policy.github.io.\n","authors":["Minjie Zhu","Yichen Zhu","Jinming Li","Junjie Wen","Zhiyuan Xu","Ning Liu","Ran Cheng","Chaomin Shen","Yaxin Peng","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.14411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10959v4","updated":"2024-11-14T11:23:50Z","published":"2024-07-15T17:55:36Z","title":"A Unified Probabilistic Approach to Traffic Conflict Detection","summary":" Traffic conflict detection is essential for proactive road safety by\nidentifying potential collisions before they occur. Existing methods rely on\nsurrogate safety measures tailored to specific interactions (e.g.,\ncar-following, side-swiping, or path-crossing) and require varying thresholds\nin different traffic conditions. This variation leads to inconsistencies and\nlimited adaptability of conflict detection in evolving traffic environments.\nConsequently, a need persists for consistent detection of traffic conflicts\nacross interaction contexts. To address this need, this study proposes a\nunified probabilistic approach. 
The proposed approach establishes a unified\nframework of traffic conflict detection, where traffic conflicts are formulated\nas context-dependent extreme events of road user interactions. The detection of\nconflicts is then decomposed into a series of statistical learning tasks:\nrepresenting interaction contexts, inferring proximity distributions, and\nassessing extreme collision risk. The unified formulation accommodates diverse\nhypotheses of traffic conflicts and the learning tasks enable data-driven\nanalysis of factors such as motion states of road users, environment\nconditions, and participant characteristics. Jointly, this approach supports\nconsistent and comprehensive evaluation of the collision risk emerging in road\nuser interactions. Our experiments using real-world trajectory data show that\nthe approach provides effective collision warnings, generalises across distinct\ndatasets and traffic environments, covers a broad range of conflict types, and\ncaptures a long-tailed distribution of conflict intensity. The findings\nhighlight its potential to enhance the safety assessment of traffic\ninfrastructures and policies, improve collision warning systems for autonomous\ndriving, and deepen the understanding of road user behaviour in safety-critical\ninteractions.\n","authors":["Yiru Jiao","Simeon C. Calvert","Sander van Cranenburgh","Hans van Lint"],"pdf_url":"https://arxiv.org/pdf/2407.10959v4.pdf","comment":"21 pages, 10 figures, under revision"},{"id":"http://arxiv.org/abs/2411.09360v1","updated":"2024-11-14T11:08:44Z","published":"2024-11-14T11:08:44Z","title":"D4W: Dependable Data-Driven Dynamics for Wheeled Robots","summary":" Wheeled robots have gained significant attention due to their wide range of\napplications in manufacturing, logistics, and service industries. 
However, due\nto the difficulty of building a highly accurate dynamics model for wheeled\nrobots, developing and testing control algorithms for them remains challenging\nand time-consuming, requiring extensive physical experimentation. To address\nthis problem, we propose D4W, i.e., Dependable Data-Driven Dynamics for Wheeled\nRobots, a simulation framework incorporating data-driven methods to accelerate\nthe development and evaluation of algorithms for wheeled robots. The key\ncontribution of D4W is a solution that utilizes real-world sensor data to learn\naccurate models of robot dynamics. The learned dynamics can capture complex\nrobot behaviors and interactions with the environment throughout simulations,\nsurpassing the limitations of analytical methods, which only work in simplified\nscenarios. Experimental results show that D4W achieves the best simulation\naccuracy compared to traditional approaches, allowing for rapid iteration of\nwheel robot algorithms with less or no need for fine-tuning in reality. We\nfurther verify the usability and practicality of the proposed framework through\nintegration with existing simulators and controllers.\n","authors":["Yunfeng Lin","Minghuan Liu","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09360v1.pdf","comment":"The Fifth International Conference on Distributed Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2411.09299v1","updated":"2024-11-14T09:17:58Z","published":"2024-11-14T09:17:58Z","title":"Hearing the Robot's Mind: Sonification for Explicit Feedback in\n Human-Robot Interaction","summary":" Social robots are required not only to understand human intentions but also\nto effectively communicate their intentions or own internal states to users.\nThis study explores the use of sonification to provide explicit auditory\nfeedback, enhancing mutual understanding in HRI. 
We introduce a novel\nsonification approach that conveys the robot's internal state, linked to its\nperception of nearby individuals and their interaction intentions. The approach\nis evaluated through a two-fold user study: an online video-based survey with\n$26$ participants and live experiments with $10$ participants. Results indicate\nthat while sonification improves the robot's expressivity and communication\neffectiveness, the design of the auditory feedback needs refinement to enhance\nuser experience. Participants found the auditory cues useful but described the\nsounds as uninteresting and unpleasant. These findings underscore the\nimportance of carefully designed auditory feedback in developing more effective\nand engaging HRI systems.\n","authors":["Simone Arreghini","Antonio Paolillo","Gabriele Abbate","Alessandro Giusti"],"pdf_url":"https://arxiv.org/pdf/2411.09299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09294v1","updated":"2024-11-14T09:12:38Z","published":"2024-11-14T09:12:38Z","title":"Learning Hand State Estimation for a Light Exoskeleton","summary":" We propose a machine learning-based estimator of the hand state for\nrehabilitation purposes, using light exoskeletons. These devices are easy to\nuse and useful for delivering domestic and frequent therapies. We build a\nsupervised approach using information from the muscular activity of the forearm\nand the motion of the exoskeleton to reconstruct the hand's opening degree and\ncompliance level. Such information can be used to evaluate the therapy progress\nand develop adaptive control behaviors. Our approach is validated with a real\nlight exoskeleton. The experiments demonstrate good predictive performance of\nour approach when trained on data coming from a single user and tested on the\nsame user, even across different sessions. 
This generalization capability makes\nour system promising for practical use in real rehabilitation.\n","authors":["Gabriele Abbate","Alessandro Giusti","Luca Randazzo","Antonio Paolillo"],"pdf_url":"https://arxiv.org/pdf/2411.09294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09241v1","updated":"2024-11-14T07:15:24Z","published":"2024-11-14T07:15:24Z","title":"BlueME: Robust Underwater Robot-to-Robot Communication Using Compact\n Magnetoelectric Antennas","summary":" We present the design, development, and experimental validation of BlueME, a\ncompact magnetoelectric (ME) antenna array system for underwater robot-to-robot\ncommunication. BlueME employs ME antennas operating at their natural mechanical\nresonance frequency to efficiently transmit and receive very-low-frequency\n(VLF) electromagnetic signals underwater. To evaluate its performance, we\ndeployed BlueME on an autonomous surface vehicle (ASV) and a remotely operated\nvehicle (ROV) in open-water field trials. Our tests demonstrate that BlueME\nmaintains reliable signal transmission at distances beyond 200 meters while\nconsuming only 1 watt of power. Field trials show that the system operates\neffectively in challenging underwater conditions such as turbidity, obstacles,\nand multipath interference-- that generally affect acoustics and optics. Our\nanalysis also examines the impact of complete submersion on system performance\nand identifies key deployment considerations. This work represents the first\npractical underwater deployment of ME antennas outside the laboratory and\nimplements the largest VLF ME array system to date. 
BlueME demonstrates\nsignificant potential for marine robotics and automation in multi-robot\ncooperative systems and remote sensor networks.\n","authors":["Mehron Talebi","Sultan Mahmud","Adam Khalifa","Md Jahidul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.09241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11420v2","updated":"2024-11-14T06:22:20Z","published":"2024-07-16T06:26:30Z","title":"iKalibr: Unified Targetless Spatiotemporal Calibration for Resilient\n Integrated Inertial Systems","summary":" The integrated inertial system, typically integrating an IMU and an\nexteroceptive sensor such as radar, LiDAR, and camera, has been widely accepted\nand applied in modern robotic applications for ego-motion estimation, motion\ncontrol, or autonomous exploration. To improve system accuracy, robustness, and\nfurther usability, both multiple and various sensors are generally resiliently\nintegrated, which benefits the system performance regarding failure tolerance,\nperception capability, and environment compatibility. For such systems,\naccurate and consistent spatiotemporal calibration is required to maintain a\nunique spatiotemporal framework for multi-sensor fusion. Considering most\nexisting calibration methods (i) are generally oriented to specific integrated\ninertial systems, (ii) often only focus on spatial determination, (iii) usually\nrequire artificial targets, lacking convenience and usability, we propose\niKalibr: a unified targetless spatiotemporal calibration framework for\nresilient integrated inertial systems, which overcomes the above issues, and\nenables both accurate and consistent calibration. Altogether four commonly\nemployed sensors are supported in iKalibr currently, namely IMU, radar, LiDAR,\nand camera. The proposed method starts with a rigorous and efficient dynamic\ninitialization, where all parameters in the estimator would be accurately\nrecovered. 
Subsequently, several continuous-time batch optimizations are\nconducted to refine the initialized parameters toward better states. Sufficient\nreal-world experiments were conducted to verify the feasibility and evaluate\nthe calibration performance of iKalibr. The results demonstrate that iKalibr\ncan achieve accurate resilient spatiotemporal calibration. We open-source our\nimplementations at (https://github.com/Unsigned-Long/iKalibr) to benefit the\nresearch community.\n","authors":["Shuolong Chen","Xingxing Li","Shengyu Li","Yuxuan Zhou","Xiaoteng Yang"],"pdf_url":"https://arxiv.org/pdf/2407.11420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09198v1","updated":"2024-11-14T05:39:29Z","published":"2024-11-14T05:39:29Z","title":"Risk-aware MPPI for Stochastic Hybrid Systems","summary":" Path Planning for stochastic hybrid systems presents a unique challenge of\npredicting distributions of future states subject to a state-dependent dynamics\nswitching function. In this work, we propose a variant of Model Predictive Path\nIntegral Control (MPPI) to plan kinodynamic paths for such systems. Monte Carlo\nmay be inaccurate when few samples are chosen to predict future states under\nstate-dependent disturbances. We employ recently proposed Unscented\nTransform-based methods to capture stochasticity in the states as well as the\nstate-dependent switching surfaces. This is in contrast to previous works that\nperform switching based only on the mean of predicted states. We focus our\nmotion planning application on the navigation of a mobile robot in the presence\nof dynamically moving agents whose responses are based on sensor-constrained\nattention zones. 
We evaluate our framework on a simulated mobile robot and show\nfaster convergence to a goal without collisions when the robot exploits the\nhybrid human dynamics versus when it does not.\n","authors":["Hardik Parwana","Mitchell Black","Bardh Hoxha","Hideki Okamoto","Georgios Fainekos","Danil Prokhorov","Dimitra Panagou"],"pdf_url":"https://arxiv.org/pdf/2411.09198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09160v1","updated":"2024-11-14T03:28:02Z","published":"2024-11-14T03:28:02Z","title":"Rationality based Innate-Values-driven Reinforcement Learning","summary":" Innate values describe agents' intrinsic motivations, which reflect their\ninherent interests and preferences to pursue goals and drive them to develop\ndiverse skills satisfying their various needs. The essence of reinforcement\nlearning (RL) is learning from interaction based on reward-driven behaviors,\nmuch like natural agents. It is an excellent model to describe the\ninnate-values-driven (IV) behaviors of AI agents. Especially developing the\nawareness of the AI agent through balancing internal and external utilities\nbased on its needs in different tasks is a crucial problem for individuals\nlearning to support AI agents integrating human society with safety and harmony\nin the long term. This paper proposes a hierarchical compound intrinsic value\nreinforcement learning model -- innate-values-driven reinforcement learning\ntermed IVRL to describe the complex behaviors of AI agents' interaction. We\nformulated the IVRL model and proposed two IVRL models: DQN and A2C. 
By\ncomparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the\nRole-Playing Game (RPG) reinforcement learning test platform VIZDoom, we\ndemonstrated that rationally organizing various individual needs can\neffectively achieve better performance.\n","authors":["Qin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09160v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.05572"},{"id":"http://arxiv.org/abs/2411.09153v1","updated":"2024-11-14T03:13:26Z","published":"2024-11-14T03:13:26Z","title":"VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for\n Effective Robot Manipulation","summary":" Recent advancements utilizing large-scale video data for learning video\ngeneration models demonstrate significant potential in understanding complex\nphysical dynamics. It suggests the feasibility of leveraging diverse robot\ntrajectory data to develop a unified, dynamics-aware model to enhance robot\nmanipulation. However, given the relatively small amount of available robot\ndata, directly fitting data without considering the relationship between visual\nobservations and actions could lead to suboptimal data utilization. To this\nend, we propose VidMan (Video Diffusion for Robot Manipulation), a novel\nframework that employs a two-stage training mechanism inspired by dual-process\ntheory from neuroscience to enhance stability and improve data utilization\nefficiency. Specifically, in the first stage, VidMan is pre-trained on the Open\nX-Embodiment dataset (OXE) for predicting future visual trajectories in a video\ndenoising diffusion manner, enabling the model to develop a long horizontal\nawareness of the environment's dynamics. In the second stage, a flexible yet\neffective layer-wise self-attention adapter is introduced to transform VidMan\ninto an efficient inverse dynamics model that predicts action modulated by the\nimplicit dynamics knowledge via parameter sharing. 
Our VidMan framework\noutperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark,\nachieving a 11.7% relative improvement, and demonstrates over 9% precision\ngains on the OXE small-scale dataset. These results provide compelling evidence\nthat world models can significantly enhance the precision of robot action\nprediction. Codes and models will be public.\n","authors":["Youpeng Wen","Junfan Lin","Yi Zhu","Jianhua Han","Hang Xu","Shen Zhao","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.09153v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09145v1","updated":"2024-11-14T02:57:11Z","published":"2024-11-14T02:57:11Z","title":"UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for\n Egocentric Hand Object Interaction Videos","summary":" Egocentric Hand Object Interaction (HOI) videos provide valuable insights\ninto human interactions with the physical world, attracting growing interest\nfrom the computer vision and robotics communities. A key task in fully\nunderstanding the geometry and dynamics of HOI scenes is dense pointclouds\nsequence reconstruction. However, the inherent motion of both hands and the\ncamera makes this challenging. Current methods often rely on time-consuming\ntest-time optimization, making them impractical for reconstructing\ninternet-scale videos. To address this, we introduce UniHOI, a model that\nunifies the estimation of all variables necessary for dense 4D reconstruction,\nincluding camera intrinsic, camera poses, and video depth, for egocentric HOI\nscene in a fast feed-forward manner. We end-to-end optimize all these variables\nto improve their consistency in 3D space. Furthermore, our model could be\ntrained solely on large-scale monocular video dataset, overcoming the\nlimitation of scarce labeled HOI data. 
We evaluate UniHOI with both in-domain\nand zero-shot generalization setting, surpassing all baselines in pointclouds\nsequence reconstruction and long-term 3D scene flow recovery. UniHOI is the\nfirst approach to offer fast, dense, and generalizable monocular egocentric HOI\nscene reconstruction in the presence of motion. Code and trained model will be\nreleased in the future.\n","authors":["Chengbo Yuan","Geng Chen","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.09145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09110v1","updated":"2024-11-14T00:55:49Z","published":"2024-11-14T00:55:49Z","title":"Information-Optimal Multi-Spacecraft Positioning for Interstellar Object\n Exploration","summary":" Interstellar objects (ISOs), astronomical objects not gravitationally bound\nto the sun, could present valuable opportunities to advance our understanding\nof the universe's formation and composition. In response to the unpredictable\nnature of their discoveries that inherently come with large and rapidly\nchanging uncertainty in their state, this paper proposes a novel\nmulti-spacecraft framework for locally maximizing information to be gained\nthrough ISO encounters with formal probabilistic guarantees. Given some\napproximated control and estimation policies for fully autonomous spacecraft\noperations, we first construct an ellipsoid around its terminal position, where\nthe ISO would be located with a finite probability. The large state uncertainty\nof the ISO is formally handled here through the hierarchical property in\nstochastically contracting nonlinear systems. We then propose a method to find\nthe terminal positions of the multiple spacecraft optimally distributed around\nthe ellipsoid, which locally maximizes the information we can get from all the\npoints of interest (POIs). 
This utilizes a probabilistic information cost\nfunction that accounts for spacecraft positions, camera specifications, and ISO\nposition uncertainty, where the information is defined as visual data collected\nby cameras. Numerical simulations demonstrate the efficacy of this approach\nusing synthetic ISO candidates generated from quasi-realistic empirical\npopulations. Our method allows each spacecraft to optimally select its terminal\nstate and determine the ideal number of POIs to investigate, potentially\nenhancing the ability to study these rare and fleeting interstellar visitors\nwhile minimizing resource utilization.\n","authors":["Arna Bhardwaj","Shishir Bhatta","Hiroyasu Tsukamoto"],"pdf_url":"https://arxiv.org/pdf/2411.09110v1.pdf","comment":"IEEE Aerospace Conference, Preprint Version, Accepted: November 2024"},{"id":"http://arxiv.org/abs/2411.09241v1","updated":"2024-11-14T07:15:24Z","published":"2024-11-14T07:15:24Z","title":"BlueME: Robust Underwater Robot-to-Robot Communication Using Compact\n Magnetoelectric Antennas","summary":" We present the design, development, and experimental validation of BlueME, a\ncompact magnetoelectric (ME) antenna array system for underwater robot-to-robot\ncommunication. BlueME employs ME antennas operating at their natural mechanical\nresonance frequency to efficiently transmit and receive very-low-frequency\n(VLF) electromagnetic signals underwater. To evaluate its performance, we\ndeployed BlueME on an autonomous surface vehicle (ASV) and a remotely operated\nvehicle (ROV) in open-water field trials. Our tests demonstrate that BlueME\nmaintains reliable signal transmission at distances beyond 200 meters while\nconsuming only 1 watt of power. Field trials show that the system operates\neffectively in challenging underwater conditions such as turbidity, obstacles,\nand multipath interference -- that generally affect acoustics and optics. 
Our\nanalysis also examines the impact of complete submersion on system performance\nand identifies key deployment considerations. This work represents the first\npractical underwater deployment of ME antennas outside the laboratory and\nimplements the largest VLF ME array system to date. BlueME demonstrates\nsignificant potential for marine robotics and automation in multi-robot\ncooperative systems and remote sensor networks.\n","authors":["Mehron Talebi","Sultan Mahmud","Adam Khalifa","Md Jahidul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.09241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.20435v2","updated":"2024-11-14T22:27:15Z","published":"2024-09-30T15:53:46Z","title":"ALLO: A Photorealistic Dataset and Data Generation Pipeline for Anomaly\n Detection During Robotic Proximity Operations in Lunar Orbit","summary":" NASA's forthcoming Lunar Gateway space station, which will be uncrewed most\nof the time, will need to operate with an unprecedented level of autonomy.\nEnhancing autonomy on the Gateway presents several unique challenges, one of\nwhich is to equip the Canadarm3, the Gateway's external robotic system, with\nthe capability to perform worksite monitoring. Monitoring will involve using\nthe arm's inspection cameras to detect any anomalies within the operating\nenvironment, a task complicated by the widely-varying lighting conditions in\nspace. In this paper, we introduce the visual anomaly detection and\nlocalization task for space applications and establish a benchmark with our\nnovel synthetic dataset called ALLO (for Anomaly Localization in Lunar Orbit).\nWe develop a complete data generation pipeline to create ALLO, which we use to\nevaluate the performance of state-of-the-art visual anomaly detection\nalgorithms. 
Given the low tolerance for risk during space operations and the\nlack of relevant data, we emphasize the need for novel, robust, and accurate\nanomaly detection methods to handle the challenging visual conditions found in\nlunar orbit and beyond.\n","authors":["Selina Leveugle","Chang Won Lee","Svetlana Stolpner","Chris Langley","Paul Grouchy","Steven Waslander","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2409.20435v2.pdf","comment":"Submitted to International Conference on Robotics and Automation\n (ICRA'25), Atlanta, USA, May 19-23, 2025"},{"id":"http://arxiv.org/abs/2411.09810v1","updated":"2024-11-14T21:00:20Z","published":"2024-11-14T21:00:20Z","title":"Robustness Assessment of Static Structures for Efficient Object Handling","summary":" This work establishes a solution to the problem of assessing the robustness\nof multi-object assemblies to external forces. Our physically-grounded approach\nhandles arbitrary static structures made from rigid objects of any shape and\nmass distribution without relying on heuristics or approximations. The result\nis a method that provides a foundation for autonomous robot decision-making\nwhen interacting with objects in frictional contact. Our strategy decouples\nslipping from toppling, enabling independent assessments of these two\nphenomena, with a shared robustness representation being key to combining the\nresults into an accurate robustness assessment. Our algorithms can be used by\nmotion planners to produce efficient assembly transportation plans, and by\nobject placement planners to select poses that improve the strength of an\nassembly. Compared to prior work, our approach is more generally applicable\nthan commonly used heuristics and more efficient than dynamics simulations.\n","authors":["Philippe Nadeau","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2411.09810v1.pdf","comment":"Submitted to IEEE Transactions on Robotics. 
Contains 16 pages, 13\n figures, and 3 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.09703v1","updated":"2024-11-14T18:59:57Z","published":"2024-11-14T18:59:57Z","title":"MagicQuill: An Intelligent Interactive Image Editing System","summary":" Image editing involves a variety of complex tasks and requires efficient and\nprecise manipulation techniques. In this paper, we present MagicQuill, an\nintegrated image editing system that enables swift actualization of creative\nideas. Our system features a streamlined yet functionally robust interface,\nallowing for the articulation of editing operations (e.g., inserting elements,\nerasing objects, altering color) with minimal input. These interactions are\nmonitored by a multimodal large language model (MLLM) to anticipate editing\nintentions in real time, bypassing the need for explicit prompt entry. Finally,\nwe apply a powerful diffusion prior, enhanced by a carefully learned two-branch\nplug-in module, to process editing requests with precise control. Experimental\nresults demonstrate the effectiveness of MagicQuill in achieving high-quality\nimage edits. Please visit https://magic-quill.github.io to try out our system.\n","authors":["Zichen Liu","Yue Yu","Hao Ouyang","Qiuyu Wang","Ka Leong Cheng","Wen Wang","Zhiheng Liu","Qifeng Chen","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2411.09703v1.pdf","comment":"Code and demo available at https://magic-quill.github.io"},{"id":"http://arxiv.org/abs/2411.09702v1","updated":"2024-11-14T18:59:40Z","published":"2024-11-14T18:59:40Z","title":"On the Surprising Effectiveness of Attention Transfer for Vision\n Transformers","summary":" Conventional wisdom suggests that pre-training Vision Transformers (ViT)\nimproves downstream performance by learning useful representations. Is this\nactually true? We investigate this question and find that the features and\nrepresentations learned during pre-training are not essential. 
Surprisingly,\nusing only the attention patterns from pre-training (i.e., guiding how\ninformation flows between tokens) is sufficient for models to learn high\nquality features from scratch and achieve comparable downstream performance. We\nshow this by introducing a simple method called attention transfer, where only\nthe attention patterns from a pre-trained teacher ViT are transferred to a\nstudent, either by copying or distilling the attention maps. Since attention\ntransfer lets the student learn its own features, ensembling it with a\nfine-tuned teacher also further improves accuracy on ImageNet. We\nsystematically study various aspects of our findings on the sufficiency of\nattention maps, including distribution shift settings where they underperform\nfine-tuning. We hope our exploration provides a better understanding of what\npre-training accomplishes and leads to a useful alternative to the standard\npractice of fine-tuning\n","authors":["Alexander C. Li","Yuandong Tian","Beidi Chen","Deepak Pathak","Xinlei Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09702v1.pdf","comment":"NeurIPS 2024. Code:\n https://github.com/alexlioralexli/attention-transfer"},{"id":"http://arxiv.org/abs/2411.09693v1","updated":"2024-11-14T18:58:02Z","published":"2024-11-14T18:58:02Z","title":"CropCraft: Inverse Procedural Modeling for 3D Reconstruction of Crop\n Plants","summary":" The ability to automatically build 3D digital twins of plants from images has\ncountless applications in agriculture, environmental science, robotics, and\nother fields. However, current 3D reconstruction methods fail to recover\ncomplete shapes of plants due to heavy occlusion and complex geometries. In\nthis work, we present a novel method for 3D reconstruction of agricultural\ncrops based on optimizing a parametric model of plant morphology via inverse\nprocedural modeling. 
Our method first estimates depth maps by fitting a neural\nradiance field and then employs Bayesian optimization to estimate plant\nmorphological parameters that result in consistent depth renderings. The\nresulting 3D model is complete and biologically plausible. We validate our\nmethod on a dataset of real images of agricultural fields, and demonstrate that\nthe reconstructions can be used for a variety of monitoring and simulation\napplications.\n","authors":["Albert J. Zhai","Xinlei Wang","Kaiyuan Li","Zhao Jiang","Junxiong Zhou","Sheng Wang","Zhenong Jin","Kaiyu Guan","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09693v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.09691v1","updated":"2024-11-14T18:57:07Z","published":"2024-11-14T18:57:07Z","title":"Advancing Fine-Grained Visual Understanding with Multi-Scale Alignment\n in Multi-Modal Models","summary":" Multi-modal large language models (MLLMs) have achieved remarkable success in\nfine-grained visual understanding across a range of tasks. However, they often\nencounter significant challenges due to inadequate alignment for fine-grained\nknowledge, which restricts their ability to accurately capture local details\nand attain a comprehensive global perception. While recent advancements have\nfocused on aligning object expressions with grounding information, they\ntypically lack explicit integration of object images, which contain affluent\ninformation beyond mere texts or coordinates. To bridge this gap, we introduce\na novel fine-grained visual knowledge alignment method that effectively aligns\nand integrates multi-scale knowledge of objects, including texts, coordinates,\nand images. 
This innovative method is underpinned by our multi-scale\nfine-grained enhancement data synthesis pipeline, which provides over 300K\nessential training data to enhance alignment and improve overall performance.\nFurthermore, we present TinyGroundingGPT, a series of compact models optimized\nfor high-level alignments. With a scale of approximately 3B parameters,\nTinyGroundingGPT achieves outstanding results in grounding tasks while\ndelivering performance comparable to larger MLLMs in complex visual scenarios.\n","authors":["Wei Wang","Zhaowei Li","Qi Xu","Linfeng Li","YiQing Cai","Botian Jiang","Hang Song","Xingcan Hu","Pengyu Wang","Li Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.09691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14022v3","updated":"2024-11-14T18:44:25Z","published":"2024-05-22T21:55:58Z","title":"I2I-Mamba: Multi-modal medical image synthesis via selective state space\n modeling","summary":" In recent years, deep learning models comprising transformer components have\npushed the performance envelope in medical image synthesis tasks. Contrary to\nconvolutional neural networks (CNNs) that use static, local filters,\ntransformers use self-attention mechanisms to permit adaptive, non-local\nfiltering to sensitively capture long-range context. However, this sensitivity\ncomes at the expense of substantial model complexity, which can compromise\nlearning efficacy particularly on relatively modest-sized imaging datasets.\nHere, we propose a novel adversarial model for multi-modal medical image\nsynthesis, I2I-Mamba, that leverages selective state space modeling (SSM) to\nefficiently capture long-range context while maintaining local precision. To do\nthis, I2I-Mamba injects channel-mixed Mamba (cmMamba) blocks in the bottleneck\nof a convolutional backbone. In cmMamba blocks, SSM layers are used to learn\ncontext across the spatial dimension and channel-mixing layers are used to\nlearn context across the channel dimension of feature maps. 
Comprehensive\ndemonstrations are reported for imputing missing images in multi-contrast MRI\nand MRI-CT protocols. Our results indicate that I2I-Mamba offers superior\nperformance against state-of-the-art CNN- and transformer-based methods in\nsynthesizing target-modality images.\n","authors":["Omer F. Atli","Bilal Kabas","Fuat Arslan","Mahmut Yurt","Onat Dalmaz","Tolga Çukur"],"pdf_url":"https://arxiv.org/pdf/2405.14022v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.03060v3","updated":"2024-11-14T18:09:39Z","published":"2024-01-05T20:32:40Z","title":"Super-resolution multi-contrast unbiased eye atlases with deep\n probabilistic refinement","summary":" Purpose: Eye morphology varies significantly across the population,\nespecially for the orbit and optic nerve. These variations limit the\nfeasibility and robustness of generalizing population-wise features of eye\norgans to an unbiased spatial reference.\n Approach: To tackle these limitations, we propose a process for creating\nhigh-resolution unbiased eye atlases. First, to restore spatial details from\nscans with a low through-plane resolution compared to a high in-plane\nresolution, we apply a deep learning-based super-resolution algorithm. Then, we\ngenerate an initial unbiased reference with an iterative metric-based\nregistration using a small portion of subject scans. We register the remaining\nscans to this template and refine the template using an unsupervised deep\nprobabilistic approach that generates a more expansive deformation field to\nenhance the organ boundary alignment. 
We demonstrate this framework using\nmagnetic resonance images across four different tissue contrasts, generating\nfour atlases in separate spatial alignments.\n Results: For each tissue contrast, we find a significant improvement using\nthe Wilcoxon signed-rank test in the average Dice score across four labeled\nregions compared to a standard registration framework consisting of rigid,\naffine, and deformable transformations. These results highlight the effective\nalignment of eye organs and boundaries using our proposed process.\n Conclusions: By combining super-resolution preprocessing and deep\nprobabilistic models, we address the challenge of generating an eye atlas to\nserve as a standardized reference across a largely variable population.\n","authors":["Ho Hin Lee","Adam M. Saunders","Michael E. Kim","Samuel W. Remedios","Lucas W. Remedios","Yucheng Tang","Qi Yang","Xin Yu","Shunxing Bao","Chloe Cho","Louise A. Mawn","Tonia S. Rex","Kevin L. Schey","Blake E. Dewey","Jeffrey M. Spraggins","Jerry L. Prince","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2401.03060v3.pdf","comment":"Published in SPIE Journal of Medical Imaging\n (https://doi.org/10.1117/1.JMI.11.6.064004). 27 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09627v1","updated":"2024-11-14T17:54:43Z","published":"2024-11-14T17:54:43Z","title":"One-Shot Manipulation Strategy Learning by Making Contact Analogies","summary":" We present a novel approach, MAGIC (manipulation analogies for generalizable\nintelligent contacts), for one-shot learning of manipulation strategies with\nfast and extensive generalization to novel objects. By leveraging a reference\naction trajectory, MAGIC effectively identifies similar contact points and\nsequences of actions on novel objects to replicate a demonstrated strategy,\nsuch as using different hooks to retrieve distant objects of different shapes\nand sizes. 
Our method is based on a two-stage contact-point matching process\nthat combines global shape matching using pretrained neural features with local\ncurvature analysis to ensure precise and physically plausible contact points.\nWe experiment with three tasks including scooping, hanging, and hooking\nobjects. MAGIC demonstrates superior performance over existing methods,\nachieving significant improvements in runtime speed and generalization to\ndifferent object categories. Website: https://magic-2024.github.io/ .\n","authors":["Yuyao Liu","Jiayuan Mao","Joshua Tenenbaum","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2411.09627v1.pdf","comment":"CoRL LEAP Workshop, 2024"},{"id":"http://arxiv.org/abs/2411.09623v1","updated":"2024-11-14T17:47:54Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. 
The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20559v2","updated":"2024-11-14T17:40:16Z","published":"2024-05-31T00:57:58Z","title":"Information-driven design of imaging systems","summary":" Most modern imaging systems process the data they capture algorithmically\nbefore-or instead of-human viewing. As a result, performance depends not on how\ninterpretable the measurements appear, but how effectively they encode details\nfor algorithmic processing. Information theory provides mathematical tools to\nanalyze this, but developing methods that can handle the complexity of\nreal-world measurements yet remain practical enough for widespread use has\nproven challenging. We introduce a data-driven approach for estimating the\ninformation content of imaging system measurements. Our framework requires only\nexperimental measurements and noise characterization, with no need for ground\ntruth data. We demonstrate that these information estimates reliably predict\nsystem performance across diverse imaging modalities, including color\nphotography, radio astronomy, lensless imaging, and label-free microscopy. To\nautomate the process of designing imaging systems that maximize information\ncapture we introduce an optimization technique called Information-Driven\nEncoder Analysis Learning (IDEAL). 
The tools we develop in this work unlock\ninformation theory as a powerful, practical tool for analyzing and designing\nimaging systems across a broad range of applications.\n A video summarizing this work can be found at\nhttps://waller-lab.github.io/EncodingInformationWebsite/\n","authors":["Henry Pinkard","Leyla Kabuli","Eric Markley","Tiffany Chien","Jiantao Jiao","Laura Waller"],"pdf_url":"https://arxiv.org/pdf/2405.20559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09604v1","updated":"2024-11-14T17:22:16Z","published":"2024-11-14T17:22:16Z","title":"Local-Global Attention: An Adaptive Mechanism for Multi-Scale Feature\n Integration","summary":" In recent years, attention mechanisms have significantly enhanced the\nperformance of object detection by focusing on key feature information.\nHowever, prevalent methods still encounter difficulties in effectively\nbalancing local and global features. This imbalance hampers their ability to\ncapture both fine-grained details and broader contextual information-two\ncritical elements for achieving accurate object detection.To address these\nchallenges, we propose a novel attention mechanism, termed Local-Global\nAttention, which is designed to better integrate both local and global\ncontextual features. Specifically, our approach combines multi-scale\nconvolutions with positional encoding, enabling the model to focus on local\ndetails while concurrently considering the broader global context.\nAdditionally, we introduce a learnable parameters, which allow the model to\ndynamically adjust the relative importance of local and global attention,\ndepending on the specific requirements of the task, thereby optimizing feature\nrepresentations across multiple scales.We have thoroughly evaluated the\nLocal-Global Attention mechanism on several widely used object detection and\nclassification datasets. 
Our experimental results demonstrate that this\napproach significantly enhances the detection of objects at various scales,\nwith particularly strong performance on multi-class and small object detection\ntasks. In comparison to existing attention mechanisms, Local-Global Attention\nconsistently outperforms them across several key metrics, all while maintaining\ncomputational efficiency.\n","authors":["Yifan Shao"],"pdf_url":"https://arxiv.org/pdf/2411.09604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09598v1","updated":"2024-11-14T17:15:51Z","published":"2024-11-14T17:15:51Z","title":"Assessing the Performance of the DINOv2 Self-supervised Learning Vision\n Transformer Model for the Segmentation of the Left Atrium from MRI Images","summary":" Accurate left atrium (LA) segmentation from pre-operative scans is crucial\nfor diagnosing atrial fibrillation, treatment planning, and supporting surgical\ninterventions. While deep learning models are key in medical image\nsegmentation, they often require extensive manually annotated data. Foundation\nmodels trained on larger datasets have reduced this dependency, enhancing\ngeneralizability and robustness through transfer learning. We explore DINOv2, a\nself-supervised learning vision transformer trained on natural images, for LA\nsegmentation using MRI. The challenges for LA's complex anatomy, thin\nboundaries, and limited annotated data make accurate segmentation difficult\nbefore & during the image-guided intervention. We demonstrate DINOv2's ability\nto provide accurate & consistent segmentation, achieving a mean Dice score of\n.871 & a Jaccard Index of .792 for end-to-end fine-tuning. Through few-shot\nlearning across various data sizes & patient counts, DINOv2 consistently\noutperforms baseline models. 
These results suggest that DINOv2 effectively\nadapts to MRI with limited data, highlighting its potential as a competitive\ntool for segmentation & encouraging broader use in medical imaging.\n","authors":["Bipasha Kundu","Bidur Khanal","Richard Simon","Cristian A. Linte"],"pdf_url":"https://arxiv.org/pdf/2411.09598v1.pdf","comment":"6 pages, 3 figures, SPIE Medical Imaging, 2025"},{"id":"http://arxiv.org/abs/2411.09595v1","updated":"2024-11-14T17:08:23Z","published":"2024-11-14T17:08:23Z","title":"LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models","summary":" This work explores expanding the capabilities of large language models (LLMs)\npretrained on text to generate 3D meshes within a unified model. This offers\nkey advantages of (1) leveraging spatial knowledge already embedded in LLMs,\nderived from textual sources like 3D tutorials, and (2) enabling conversational\n3D generation and mesh understanding. A primary challenge is effectively\ntokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly.\nTo address this, we introduce LLaMA-Mesh, a novel approach that represents the\nvertex coordinates and face definitions of 3D meshes as plain text, allowing\ndirect integration with LLMs without expanding the vocabulary. We construct a\nsupervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate\n3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs\nas required, and (3) understand and interpret 3D meshes. Our work is the first\nto demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge\nfor 3D mesh generation in a text-based format, effectively unifying the 3D and\ntext modalities. 
LLaMA-Mesh achieves mesh generation quality on par with models\ntrained from scratch while maintaining strong text generation performance.\n","authors":["Zhengyi Wang","Jonathan Lorraine","Yikai Wang","Hang Su","Jun Zhu","Sanja Fidler","Xiaohui Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09595v1.pdf","comment":"See the project website at\n https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/"},{"id":"http://arxiv.org/abs/2410.18958v2","updated":"2024-11-14T17:06:55Z","published":"2024-10-24T17:55:52Z","title":"Stable Consistency Tuning: Understanding and Improving Consistency\n Models","summary":" Diffusion models achieve superior generation quality but suffer from slow\ngeneration speed due to the iterative nature of denoising. In contrast,\nconsistency models, a new generative family, achieve competitive performance\nwith significantly faster sampling. These models are trained either through\nconsistency distillation, which leverages pretrained diffusion models, or\nconsistency training/tuning directly from raw data. In this work, we propose a\nnovel framework for understanding consistency models by modeling the denoising\nprocess of the diffusion model as a Markov Decision Process (MDP) and framing\nconsistency model training as the value estimation through Temporal\nDifference~(TD) Learning. More importantly, this framework allows us to analyze\nthe limitations of current consistency training/tuning strategies. Built upon\nEasy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT),\nwhich incorporates variance-reduced learning using the score identity. SCT\nleads to significant performance improvements on benchmarks such as CIFAR-10\nand ImageNet-64. 
On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID\n1.55, a new SoTA for consistency models.\n","authors":["Fu-Yun Wang","Zhengyang Geng","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2410.18958v2.pdf","comment":"Code is available at\n https://github.com/G-U-N/Stable-Consistency-Tuning"},{"id":"http://arxiv.org/abs/2411.09593v1","updated":"2024-11-14T17:06:00Z","published":"2024-11-14T17:06:00Z","title":"SMILE-UHURA Challenge -- Small Vessel Segmentation at Mesoscopic Scale\n from Ultra-High Resolution 7T Magnetic Resonance Angiograms","summary":" The human brain receives nutrients and oxygen through an intricate network of\nblood vessels. Pathology affecting small vessels, at the mesoscopic scale,\nrepresents a critical vulnerability within the cerebral blood supply and can\nlead to severe conditions, such as Cerebral Small Vessel Diseases. The advent\nof 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution\nimages, making it possible to visualise such vessels in the brain. However, the\nlack of publicly available annotated datasets has impeded the development of\nrobust, machine learning-driven segmentation algorithms. To address this, the\nSMILE-UHURA challenge was organised. This challenge, held in conjunction with\nthe ISBI 2023, in Cartagena de Indias, Colombia, aimed to provide a platform\nfor researchers working on related topics. The SMILE-UHURA challenge addresses\nthe gap in publicly available annotated datasets by providing an annotated\ndataset of Time-of-Flight angiography acquired with 7T MRI. This dataset was\ncreated through a combination of automated pre-segmentation and extensive\nmanual refinement. 
In this manuscript, sixteen submitted methods and two\nbaseline methods are compared both quantitatively and qualitatively on two\ndifferent datasets: held-out test MRAs from the same dataset as the training\ndata (with labels kept secret) and a separate 7T ToF MRA dataset where both\ninput volumes and labels are kept secret. The results demonstrate that most of\nthe submitted deep learning methods, trained on the provided training dataset,\nachieved reliable segmentation performance. Dice scores reached up to 0.838\n$\\pm$ 0.066 and 0.716 $\\pm$ 0.125 on the respective datasets, with an average\nperformance of up to 0.804 $\\pm$ 0.15.\n","authors":["Soumick Chatterjee","Hendrik Mattern","Marc Dörner","Alessandro Sciarra","Florian Dubost","Hannes Schnurre","Rupali Khatun","Chun-Chih Yu","Tsung-Lin Hsieh","Yi-Shan Tsai","Yi-Zeng Fang","Yung-Ching Yang","Juinn-Dar Huang","Marshall Xu","Siyu Liu","Fernanda L. Ribeiro","Saskia Bollmann","Karthikesh Varma Chintalapati","Chethan Mysuru Radhakrishna","Sri Chandana Hudukula Ram Kumara","Raviteja Sutrave","Abdul Qayyum","Moona Mazher","Imran Razzak","Cristobal Rodero","Steven Niederren","Fengming Lin","Yan Xia","Jiacheng Wang","Riyu Qiu","Liansheng Wang","Arya Yazdan Panah","Rosana El Jurdi","Guanghui Fu","Janan Arslan","Ghislain Vaillant","Romain Valabregue","Didier Dormont","Bruno Stankoff","Olivier Colliot","Luisa Vargas","Isai Daniel Chacón","Ioannis Pitsiorlas","Pablo Arbeláez","Maria A. Zuluaga","Stefanie Schreiber","Oliver Speck","Andreas Nürnberger"],"pdf_url":"https://arxiv.org/pdf/2411.09593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09439v1","updated":"2024-11-14T16:58:19Z","published":"2024-11-14T16:58:19Z","title":"Spider: Any-to-Many Multimodal LLM","summary":" Multimodal LLMs (MLLMs) have emerged as an extension of Large Language Models\n(LLMs), enabling the integration of various modalities. 
However, Any-to-Any\nMLLMs are limited to generating pairwise modalities 'Text + X' within a single\nresponse, such as Text + {Image or Audio or Video}. To address this limitation,\nwe introduce Spider, a novel efficient Any-to-Many Modalities Generation (AMMG)\nframework, which can generate an arbitrary combination of modalities 'Text +\nXs', such as Text + {Image and Audio and Video}. To achieve efficient AMMG, our\nSpider integrates three core components: a Base Model for basic X-to-X (i.e.,\nAny-to-Any) modality processing, a novel Efficient Decoders-Controller for\ncontrolling multimodal Decoders to generate Xs (many-modal) contents, and an\nAny-to-Many Instruction Template designed for producing Xs signal prompts. To\ntrain Spider, we constructed a novel Text-formatted Many-Modal (TMM) dataset,\nwhich facilitates the learning of the X-to-Xs (i.e., Any-to-Many) capability\nnecessary for AMMG. Ultimately, the well-trained Spider generates a pseudo\nX-to-Xs dataset, the first-ever X-to-Xs many-modal dataset, enhancing the\npotential for AMMG task in future research. Overall, this work not only pushes\nthe boundary of multimodal interaction but also provides rich data support for\nadvancing the field.\n","authors":["Jinxiang Lai","Jie Zhang","Jun Liu","Jian Li","Xiaocheng Lu","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2411.09439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16824v4","updated":"2024-11-14T16:35:48Z","published":"2024-04-25T17:59:45Z","title":"V2A-Mark: Versatile Deep Visual-Audio Watermarking for Manipulation\n Localization and Copyright Protection","summary":" AI-generated video has revolutionized short video production, filmmaking, and\npersonalized media, making video local editing an essential tool. However, this\nprogress also blurs the line between reality and fiction, posing challenges in\nmultimedia forensics. 
To solve this urgent issue, V2A-Mark is proposed to\naddress the limitations of current video tampering forensics, such as poor\ngeneralizability, singular function, and single modality focus. Combining the\nfragility of video-into-video steganography with deep robust watermarking, our\nmethod can embed invisible visual-audio localization watermarks and copyright\nwatermarks into the original video frames and audio, enabling precise\nmanipulation localization and copyright protection. We also design a temporal\nalignment and fusion module and degradation prompt learning to enhance the\nlocalization accuracy and decoding robustness. Meanwhile, we introduce a\nsample-level audio localization method and a cross-modal copyright extraction\nmechanism to couple the information of audio and video frames. The\neffectiveness of V2A-Mark has been verified on a visual-audio tampering\ndataset, emphasizing its superiority in localization precision and copyright\naccuracy, crucial for the sustainable development of video editing in the AIGC\nvideo era.\n","authors":["Xuanyu Zhang","Youmin Xu","Runyi Li","Jiwen Yu","Weiqi Li","Zhipei Xu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16824v4.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2411.09572v1","updated":"2024-11-14T16:29:45Z","published":"2024-11-14T16:29:45Z","title":"Dynamic Reconstruction of Hand-Object Interaction with Distributed\n Force-aware Contact Representation","summary":" We present ViTaM-D, a novel visual-tactile framework for dynamic hand-object\ninteraction reconstruction, integrating distributed tactile sensing for more\naccurate contact modeling. While existing methods focus primarily on visual\ninputs, they struggle with capturing detailed contact interactions such as\nobject deformation. Our approach leverages distributed tactile sensors to\naddress this limitation by introducing DF-Field. 
This distributed force-aware\ncontact representation models both kinetic and potential energy in hand-object\ninteraction. ViTaM-D first reconstructs hand-object interactions using a\nvisual-only network, VDT-Net, and then refines contact details through a\nforce-aware optimization (FO) process, enhancing object deformation modeling.\nTo benchmark our approach, we introduce the HOT dataset, which features 600\nsequences of hand-object interactions, including deformable objects, built in a\nhigh-precision simulation environment. Extensive experiments on both the DexYCB\nand HOT datasets demonstrate significant improvements in accuracy over previous\nstate-of-the-art methods such as gSDF and HOTrack. Our results highlight the\nsuperior performance of ViTaM-D in both rigid and deformable object\nreconstruction, as well as the effectiveness of DF-Field in refining hand\nposes. This work offers a comprehensive solution to dynamic hand-object\ninteraction reconstruction by seamlessly integrating visual and tactile data.\nCodes, models, and datasets will be available.\n","authors":["Zhenjun Yu","Wenqiang Xu","Pengfei Xie","Yutong Li","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2411.09572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09567v1","updated":"2024-11-14T16:21:47Z","published":"2024-11-14T16:21:47Z","title":"VPBSD:Vessel-Pattern-Based Semi-Supervised Distillation for Efficient 3D\n Microscopic Cerebrovascular Segmentation","summary":" 3D microscopic cerebrovascular images are characterized by their high\nresolution, presenting significant annotation challenges, large data volumes,\nand intricate variations in detail. Together, these factors make achieving\nhigh-quality, efficient whole-brain segmentation particularly demanding. In\nthis paper, we propose a novel Vessel-Pattern-Based Semi-Supervised\nDistillation pipeline (VpbSD) to address the challenges of 3D microscopic\ncerebrovascular segmentation. 
This pipeline initially constructs a\nvessel-pattern codebook that captures diverse vascular structures from\nunlabeled data during the teacher model's pretraining phase. In the knowledge\ndistillation stage, the codebook facilitates the transfer of rich knowledge\nfrom a heterogeneous teacher model to a student model, while the\nsemi-supervised approach further enhances the student model's exposure to\ndiverse learning samples. Experimental results on real-world data, including\ncomparisons with state-of-the-art methods and ablation studies, demonstrate\nthat our pipeline and its individual components effectively address the\nchallenges inherent in microscopic cerebrovascular segmentation.\n","authors":["Xi Lin","Shixuan Zhao","Xinxu Wei","Amir Shmuel","Yongjie Li"],"pdf_url":"https://arxiv.org/pdf/2411.09567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06503v2","updated":"2024-11-14T16:15:20Z","published":"2024-11-10T15:57:53Z","title":"Diffusion Sampling Correction via Approximately 10 Parameters","summary":" Diffusion Probabilistic Models (DPMs) have demonstrated exceptional\nperformance in generative tasks, but this comes at the expense of sampling\nefficiency. To enhance sampling speed without sacrificing quality, various\ndistillation-based accelerated sampling algorithms have been recently proposed.\nHowever, they typically require significant additional training costs and model\nparameter storage, which limit their practical application. In this work, we\npropose PCA-based Adaptive Search (PAS), which optimizes existing solvers for\nDPMs with minimal learnable parameters and training costs. 
Specifically, we\nfirst employ PCA to obtain a few orthogonal unit basis vectors to span the\nhigh-dimensional sampling space, which enables us to learn just a set of\ncoordinates to correct the sampling direction; furthermore, based on the\nobservation that the cumulative truncation error exhibits an ``S''-shape, we\ndesign an adaptive search strategy that further enhances the sampling\nefficiency and reduces the number of stored parameters to approximately 10.\nExtensive experiments demonstrate that PAS can significantly enhance existing\nfast solvers in a plug-and-play manner with negligible costs. For instance, on\nCIFAR10, PAS requires only 12 parameters and less than 1 minute of training on\na single NVIDIA A100 GPU to optimize the DDIM from 15.69 FID (NFE=10) to 4.37.\n","authors":["Guangyi Wang","Wei Peng","Lijiang Li","Wenyu Chen","Yuren Cai","Songzhi Su"],"pdf_url":"https://arxiv.org/pdf/2411.06503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09558v1","updated":"2024-11-14T16:10:15Z","published":"2024-11-14T16:10:15Z","title":"Adaptive Deviation Learning for Visual Anomaly Detection with Data\n Contamination","summary":" Visual anomaly detection targets to detect images that notably differ from\nnormal pattern, and it has found extensive application in identifying defective\nparts within the manufacturing industry. These anomaly detection paradigms\npredominantly focus on training detection models using only clean, unlabeled\nnormal samples, assuming an absence of contamination; a condition often unmet\nin real-world scenarios. The performance of these methods significantly depends\non the quality of the data and usually decreases when exposed to noise. We\nintroduce a systematic adaptive method that employs deviation learning to\ncompute anomaly scores end-to-end while addressing data contamination by\nassigning relative importance to the weights of individual instances. 
In this\napproach, the anomaly scores for normal instances are designed to approximate\nscalar scores obtained from the known prior distribution. Meanwhile, anomaly\nscores for anomaly examples are adjusted to exhibit statistically significant\ndeviations from these reference scores. Our approach incorporates a constrained\noptimization problem within the deviation learning framework to update instance\nweights, resolving this problem for each mini-batch. Comprehensive experiments\non the MVTec and VisA benchmark datasets indicate that our proposed method\nsurpasses competing techniques and exhibits both stability and robustness in\nthe presence of data contamination.\n","authors":["Anindya Sundar Das","Guansong Pang","Monowar Bhuyan"],"pdf_url":"https://arxiv.org/pdf/2411.09558v1.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV 2025)"},{"id":"http://arxiv.org/abs/2411.09555v1","updated":"2024-11-14T16:07:04Z","published":"2024-11-14T16:07:04Z","title":"Image Processing for Motion Magnification","summary":" Motion Magnification (MM) is a collection of relative recent techniques\nwithin the realm of Image Processing. The main motivation of introducing these\ntechniques in to support the human visual system to capture relevant\ndisplacements of an object of interest; these motions can be in object color\nand in object location. In fact, the goal is to opportunely process a video\nsequence to obtain as output a new video in which motions are magnified and\nvisible to the viewer. We propose a numerical technique using the Phase-Based\nMotion Magnification which analyses the video sequence in the Fourier Domain\nand rely on the Fourier Shifting Property. We describe the mathematical\nfoundation of this method and the corresponding implementation in a numerical\nalgorithm. 
We present preliminary experiments, focusing on some basic test made\nup using synthetic images.\n","authors":["Nadaniela Egidi","Josephin Giacomini","Paolo Leonesi","Pierluigi Maponi","Federico Mearelli","Edin Trebovic"],"pdf_url":"https://arxiv.org/pdf/2411.09555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09553v1","updated":"2024-11-14T16:06:30Z","published":"2024-11-14T16:06:30Z","title":"OOD-SEG: Out-Of-Distribution detection for image SEGmentation with\n sparse multi-class positive-only annotations","summary":" Despite significant advancements, segmentation based on deep neural networks\nin medical and surgical imaging faces several challenges, two of which we aim\nto address in this work. First, acquiring complete pixel-level segmentation\nlabels for medical images is time-consuming and requires domain expertise.\nSecond, typical segmentation pipelines cannot detect out-of-distribution (OOD)\npixels, leaving them prone to spurious outputs during deployment. In this work,\nwe propose a novel segmentation approach exploiting OOD detection that learns\nonly from sparsely annotated pixels from multiple positive-only classes. %but\n\\emph{no background class} annotation. These multi-class positive annotations\nnaturally fall within the in-distribution (ID) set. Unlabelled pixels may\ncontain positive classes but also negative ones, including what is typically\nreferred to as \\emph{background} in standard segmentation formulations. Here,\nwe forgo the need for background annotation and consider these together with\nany other unseen classes as part of the OOD set. Our framework can integrate,\nat a pixel-level, any OOD detection approaches designed for classification\ntasks. To address the lack of existing OOD datasets and established evaluation\nmetric for medical image segmentation, we propose a cross-validation strategy\nthat treats held-out labelled classes as OOD. 
Extensive experiments on both\nmulti-class hyperspectral and RGB surgical imaging datasets demonstrate the\nrobustness and generalisation capability of our proposed framework.\n","authors":["Junwen Wang","Zhonghao Wang","Oscar MacCormac","Jonathan Shapey","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2411.09553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09551v1","updated":"2024-11-14T16:06:10Z","published":"2024-11-14T16:06:10Z","title":"MFTIQ: Multi-Flow Tracker with Independent Matching Quality Estimation","summary":" In this work, we present MFTIQ, a novel dense long-term tracking model that\nadvances the Multi-Flow Tracker (MFT) framework to address challenges in\npoint-level visual tracking in video sequences. MFTIQ builds upon the\nflow-chaining concepts of MFT, integrating an Independent Quality (IQ) module\nthat separates correspondence quality estimation from optical flow\ncomputations. This decoupling significantly enhances the accuracy and\nflexibility of the tracking process, allowing MFTIQ to maintain reliable\ntrajectory predictions even in scenarios of prolonged occlusions and complex\ndynamics. Designed to be \"plug-and-play\", MFTIQ can be employed with any\noff-the-shelf optical flow method without the need for fine-tuning or\narchitectural modifications. Experimental validations on the TAP-Vid Davis\ndataset show that MFTIQ with RoMa optical flow not only surpasses MFT but also\nperforms comparably to state-of-the-art trackers while having substantially\nfaster processing speed. 
Code and models available at\nhttps://github.com/serycjon/MFTIQ .\n","authors":["Jonas Serych","Michal Neoral","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2411.09551v1.pdf","comment":"accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2411.09540v1","updated":"2024-11-14T15:56:11Z","published":"2024-11-14T15:56:11Z","title":"Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models","summary":" Visual prompting (VP) is a new technique that adapts well-trained frozen\nmodels for source domain tasks to target domain tasks. This study examines VP's\nbenefits for black-box model-level backdoor detection. The visual prompt in VP\nmaps class subspaces between source and target domains. We identify a\nmisalignment, termed class subspace inconsistency, between clean and poisoned\ndatasets. Based on this, we introduce \\textsc{BProm}, a black-box model-level\ndetection method to identify backdoors in suspicious models, if any.\n\\textsc{BProm} leverages the low classification accuracy of prompted models\nwhen backdoors are present. Extensive experiments confirm \\textsc{BProm}'s\neffectiveness.\n","authors":["Zi-Xuan Huang","Jia-Wei Chen","Zhi-Peng Zhang","Chia-Mu Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09538v1","updated":"2024-11-14T15:55:21Z","published":"2024-11-14T15:55:21Z","title":"Marker-free Human Gait Analysis using a Smart Edge Sensor System","summary":" The human gait is a complex interplay between the neuronal and the muscular\nsystems, reflecting an individual's neurological and physiological condition.\nThis makes gait analysis a valuable tool for biomechanics and medical experts.\nTraditional observational gait analysis is cost-effective but lacks reliability\nand accuracy, while instrumented gait analysis, particularly using marker-based\noptical systems, provides accurate data but is expensive and time-consuming. 
In\nthis paper, we introduce a novel markerless approach for gait analysis using a\nmulti-camera setup with smart edge sensors to estimate 3D body poses without\nfiducial markers. We propose a Siamese embedding network with triplet loss\ncalculation to identify individuals by their gait pattern. This network\neffectively maps gait sequences to an embedding space that enables clustering\nsequences from the same individual or activity closely together while\nseparating those of different ones. Our results demonstrate the potential of\nthe proposed system for efficient automated gait analysis in diverse real-world\nenvironments, facilitating a wide range of applications.\n","authors":["Eva Katharina Bauer","Simon Bultmann","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2411.09538v1.pdf","comment":"accepted for SII 2025"},{"id":"http://arxiv.org/abs/2408.08700v2","updated":"2024-11-14T15:47:59Z","published":"2024-08-16T12:27:46Z","title":"HyCoT: A Transformer-Based Autoencoder for Hyperspectral Image\n Compression","summary":" The development of learning-based hyperspectral image (HSI) compression\nmodels has recently attracted significant interest. Existing models\npredominantly utilize convolutional filters, which capture only local\ndependencies. Furthermore,they often incur high training costs and exhibit\nsubstantial computational complexity. To address these limitations, in this\npaper we propose Hyperspectral Compression Transformer (HyCoT) that is a\ntransformer-based autoencoder for pixelwise HSI compression. Additionally, we\napply a simple yet effective training set reduction approach to accelerate the\ntraining process. Experimental results on the HySpecNet-11k dataset demonstrate\nthat HyCoT surpasses the state of the art across various compression ratios by\nover 1 dB of PSNR with significantly reduced computational requirements. 
Our\ncode and pre-trained weights are publicly available at\nhttps://git.tu-berlin.de/rsim/hycot .\n","authors":["Martin Hermann Paul Fuchs","Behnood Rasti","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2408.08700v2.pdf","comment":"Accepted at 14th IEEE GRSS Workshop on Hyperspectral Image and Signal\n Processing: Evolution in Remote Sensing (WHISPERS), 2024"},{"id":"http://arxiv.org/abs/2405.14325v4","updated":"2024-11-14T15:47:04Z","published":"2024-05-23T08:55:20Z","title":"Dinomaly: The Less Is More Philosophy in Multi-Class Unsupervised\n Anomaly Detection","summary":" Recent studies highlighted a practical setting of unsupervised anomaly\ndetection (UAD) that builds a unified model for multi-class images. Despite\nvarious advancements addressing this challenging task, the detection\nperformance under the multi-class setting still lags far behind\nstate-of-the-art class-separated models. Our research aims to bridge this\nsubstantial performance gap. In this paper, we introduce a minimalistic\nreconstruction-based anomaly detection framework, namely Dinomaly, which\nleverages pure Transformer architectures without relying on complex designs,\nadditional modules, or specialized tricks. Given this powerful framework\nconsisted of only Attentions and MLPs, we found four simple components that are\nessential to multi-class anomaly detection: (1) Foundation Transformers that\nextracts universal and discriminative features, (2) Noisy Bottleneck where\npre-existing Dropouts do all the noise injection tricks, (3) Linear Attention\nthat naturally cannot focus, and (4) Loose Reconstruction that does not force\nlayer-to-layer and point-by-point reconstruction. Extensive experiments are\nconducted across popular anomaly detection benchmarks including MVTec-AD, VisA,\nand Real-IAD. 
Our proposed Dinomaly achieves impressive image-level AUROC of\n99.6%, 98.7%, and 89.3% on the three datasets respectively, which is not only\nsuperior to state-of-the-art multi-class UAD methods, but also achieves the\nmost advanced class-separated UAD records.\n","authors":["Jia Guo","Shuai Lu","Weihang Zhang","Fang Chen","Hongen Liao","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2405.14325v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07635v2","updated":"2024-11-14T15:40:59Z","published":"2024-11-12T08:30:59Z","title":"Breaking the Low-Rank Dilemma of Linear Attention","summary":" The Softmax attention mechanism in Transformer models is notoriously\ncomputationally expensive, particularly due to its quadratic complexity, posing\nsignificant challenges in vision applications. In contrast, linear attention\nprovides a far more efficient solution by reducing the complexity to linear\nlevels. However, compared to Softmax attention, linear attention often\nexperiences significant performance degradation. Our experiments indicate that\nthis performance drop is due to the low-rank nature of linear attention's\nfeature map, which hinders its ability to adequately model complex spatial\ninformation. In this paper, to break the low-rank dilemma of linear attention,\nwe conduct rank analysis from two perspectives: the KV buffer and the output\nfeatures. Consequently, we introduce Rank-Augmented Linear Attention (RALA),\nwhich rivals the performance of Softmax attention while maintaining linear\ncomplexity and high efficiency. Based on RALA, we construct the Rank-Augmented\nVision Linear Transformer (RAVLT). Extensive experiments demonstrate that RAVLT\nachieves excellent performance across various vision tasks. Specifically,\nwithout using any additional labels, data, or supervision during training,\nRAVLT achieves an 84.4% Top-1 accuracy on ImageNet-1k with only 26M parameters\nand 4.6G FLOPs. 
This result significantly surpasses previous linear attention\nmechanisms, fully illustrating the potential of RALA. Code will be available at\nhttps://github.com/qhfan/RALA.\n","authors":["Qihang Fan","Huaibo Huang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.07635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08514v4","updated":"2024-11-14T15:39:54Z","published":"2023-05-15T10:23:14Z","title":"Generative Adversarial Networks for Spatio-Spectral Compression of\n Hyperspectral Images","summary":" The development of deep learning-based models for the compression of\nhyperspectral images (HSIs) has recently attracted great attention in remote\nsensing due to the sharp growing of hyperspectral data archives. Most of the\nexisting models achieve either spectral or spatial compression, and do not\njointly consider the spatio-spectral redundancies present in HSIs. To address\nthis problem, in this paper we focus our attention on the High Fidelity\nCompression (HiFiC) model (which is proven to be highly effective for spatial\ncompression problems) and adapt it to perform spatio-spectral compression of\nHSIs. In detail, we introduce two new models: i) HiFiC using Squeeze and\nExcitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC with 3D\nconvolutions (denoted as HiFiC$_{3D}$) in the framework of compression of HSIs.\nWe analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$ in compressing\nthe spatio-spectral redundancies with channel attention and inter-dependency\nanalysis. Experimental results show the efficacy of the proposed models in\nperforming spatio-spectral compression, while reconstructing images at reduced\nbitrates with higher reconstruction quality. 
The code of the proposed models is\npublicly available at https://git.tu-berlin.de/rsim/HSI-SSC .\n","authors":["Martin Hermann Paul Fuchs","Akshara Preethy Byju","Alisa Walda","Behnood Rasti","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2305.08514v4.pdf","comment":"Accepted at 14th IEEE GRSS Workshop on Hyperspectral Image and Signal\n Processing: Evolution in Remote Sensing (WHISPERS), 2024"},{"id":"http://arxiv.org/abs/2409.07271v3","updated":"2024-11-14T15:36:29Z","published":"2024-09-11T13:46:35Z","title":"CFCPalsy: Facial Image Synthesis with Cross-Fusion Cycle Diffusion Model\n for Facial Paralysis Individuals","summary":" Currently, the diagnosis of facial paralysis remains a challenging task,\noften relying heavily on the subjective judgment and experience of clinicians,\nwhich can introduce variability and uncertainty in the assessment process. One\npromising application in real-life situations is the automatic estimation of\nfacial paralysis. However, the scarcity of facial paralysis datasets limits the\ndevelopment of robust machine learning models for automated diagnosis and\ntherapeutic interventions. To this end, this study aims to synthesize a\nhigh-quality facial paralysis dataset to address this gap, enabling more\naccurate and efficient algorithm training. Specifically, a novel Cross-Fusion\nCycle Palsy Expression Generative Model (CFCPalsy) based on the diffusion model\nis proposed to combine different features of facial information and enhance the\nvisual details of facial appearance and texture in facial regions, thus\ncreating synthetic facial images that accurately represent various degrees and\ntypes of facial paralysis. We have qualitatively and quantitatively evaluated\nthe proposed method on the commonly used public clinical datasets of facial\nparalysis to demonstrate its effectiveness. 
Experimental results indicate that\nthe proposed method surpasses state-of-the-art methods, generating more\nrealistic facial images and maintaining identity consistency.\n","authors":["Weixiang Gao","Yifan Xia"],"pdf_url":"https://arxiv.org/pdf/2409.07271v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09512v1","updated":"2024-11-14T15:26:10Z","published":"2024-11-14T15:26:10Z","title":"GAN-Based Architecture for Low-dose Computed Tomography Imaging\n Denoising","summary":" Generative Adversarial Networks (GANs) have surfaced as a revolutionary\nelement within the domain of low-dose computed tomography (LDCT) imaging,\nproviding an advanced resolution to the enduring issue of reconciling radiation\nexposure with image quality. This comprehensive review synthesizes the rapid\nadvancements in GAN-based LDCT denoising techniques, examining the evolution\nfrom foundational architectures to state-of-the-art models incorporating\nadvanced features such as anatomical priors, perceptual loss functions, and\ninnovative regularization strategies. We critically analyze various GAN\narchitectures, including conditional GANs (cGANs), CycleGANs, and\nSuper-Resolution GANs (SRGANs), elucidating their unique strengths and\nlimitations in the context of LDCT denoising. The evaluation provides both\nqualitative and quantitative results related to the improvements in performance\nin benchmark and clinical datasets with metrics such as PSNR, SSIM, and LPIPS.\nAfter highlighting the positive results, we discuss some of the challenges\npreventing a wider clinical use, including the interpretability of the images\ngenerated by GANs, synthetic artifacts, and the need for clinically relevant\nmetrics. 
The review concludes by highlighting the essential significance of\nGAN-based methodologies in the progression of precision medicine via tailored\nLDCT denoising models, underlining the transformative possibilities presented\nby artificial intelligence within contemporary radiological practice.\n","authors":["Yunuo Wang","Ningning Yang","Jialin Li"],"pdf_url":"https://arxiv.org/pdf/2411.09512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05612v2","updated":"2024-11-14T15:22:27Z","published":"2023-06-09T01:11:50Z","title":"Spatial Re-parameterization for N:M Sparsity","summary":" This paper presents a Spatial Re-parameterization (SpRe) method for the N:M\nsparsity in CNNs. SpRe is stemmed from an observation regarding the restricted\nvariety in spatial sparsity present in N:M sparsity compared with unstructured\nsparsity. Particularly, N:M sparsity exhibits a fixed sparsity rate within the\nspatial domains due to its distinctive pattern that mandates N non-zero\ncomponents among M successive weights in the input channel dimension of\nconvolution filters. On the contrary, we observe that unstructured sparsity\ndisplays a substantial divergence in sparsity across the spatial domains, which\nwe experimentally verified to be very crucial for its robust performance\nretention compared with N:M sparsity. Therefore, SpRe employs the\nspatial-sparsity distribution of unstructured sparsity to assign an extra\nbranch in conjunction with the original N:M branch at training time, which\nallows the N:M sparse network to sustain a similar distribution of spatial\nsparsity with unstructured sparsity. During inference, the extra branch can be\nfurther re-parameterized into the main N:M branch, without exerting any\ndistortion on the sparse pattern or additional computation costs. 
SpRe has\nachieved a commendable feat by matching the performance of N:M sparsity methods\nwith state-of-the-art unstructured sparsity methods across various benchmarks.\nCode and models are anonymously available at\n\\url{https://github.com/zyxxmu/SpRe}.\n","authors":["Yuxin Zhang","Mingliang Xu","Yonghong Tian","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2306.05612v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.09502v1","updated":"2024-11-14T15:13:13Z","published":"2024-11-14T15:13:13Z","title":"Golden Noise for Diffusion Models: A Learning Framework","summary":" Text-to-image diffusion model is a popular paradigm that synthesizes\npersonalized images by providing a text prompt and a random Gaussian noise.\nWhile people observe that some noises are ``golden noises'' that can achieve\nbetter text-image alignment and higher human preference than others, we still\nlack a machine learning framework to obtain those golden noises. To learn\ngolden noises for diffusion sampling, we mainly make three contributions in\nthis paper. First, we identify a new concept termed the \\textit{noise prompt},\nwhich aims at turning a random Gaussian noise into a golden noise by adding a\nsmall desirable perturbation derived from the text prompt. Following the\nconcept, we first formulate the \\textit{noise prompt learning} framework that\nsystematically learns ``prompted'' golden noise associated with a text prompt\nfor diffusion models. Second, we design a noise prompt data collection pipeline\nand collect a large-scale \\textit{noise prompt dataset}~(NPD) that contains\n100k pairs of random noises and golden noises with the associated text prompts.\nWith the prepared NPD as the training dataset, we trained a small \\textit{noise\nprompt network}~(NPNet) that can directly learn to transform a random noise\ninto a golden noise. 
The learned golden noise perturbation can be considered as\na kind of prompt for noise, as it is rich in semantic information and tailored\nto the given text prompt. Third, our extensive experiments demonstrate the\nimpressive effectiveness and generalization of NPNet on improving the quality\nof synthesized images across various diffusion models, including SDXL,\nDreamShaper-xl-v2-turbo, and Hunyuan-DiT. Moreover, NPNet is a small and\nefficient controller that acts as a plug-and-play module with very limited\nadditional inference and computational costs, as it just provides a golden\nnoise instead of a random noise without accessing the original pipeline.\n","authors":["Zikai Zhou","Shitong Shao","Lichen Bai","Zhiqiang Xu","Bo Han","Zeke Xie"],"pdf_url":"https://arxiv.org/pdf/2411.09502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24006v2","updated":"2024-11-14T14:58:26Z","published":"2024-10-31T15:09:36Z","title":"DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination","summary":" In the ever-evolving adversarial machine learning landscape, developing\neffective defenses against patch attacks has become a critical challenge,\nnecessitating reliable solutions to safeguard real-world AI systems. Although\ndiffusion models have shown remarkable capacity in image synthesis and have\nbeen recently utilized to counter $\\ell_p$-norm bounded attacks, their\npotential in mitigating localized patch attacks remains largely underexplored.\nIn this work, we propose DiffPAD, a novel framework that harnesses the power of\ndiffusion models for adversarial patch decontamination. DiffPAD first performs\nsuper-resolution restoration on downsampled input images, then adopts\nbinarization, dynamic thresholding scheme and sliding window for effective\nlocalization of adversarial patches. 
Such a design is inspired by the\ntheoretically derived correlation between patch size and diffusion restoration\nerror that is generalized across diverse patch attack scenarios. Finally,\nDiffPAD applies inpainting techniques to the original input images with the\nestimated patch region being masked. By integrating closed-form solutions for\nsuper-resolution restoration and image inpainting into the conditional reverse\nsampling process of a pre-trained diffusion model, DiffPAD obviates the need\nfor text guidance or fine-tuning. Through comprehensive experiments, we\ndemonstrate that DiffPAD not only achieves state-of-the-art adversarial\nrobustness against patch attacks but also excels in recovering naturalistic\nimages without patch remnants. The source code is available at\nhttps://github.com/JasonFu1998/DiffPAD.\n","authors":["Jia Fu","Xiao Zhang","Sepideh Pashami","Fatemeh Rahimian","Anders Holst"],"pdf_url":"https://arxiv.org/pdf/2410.24006v2.pdf","comment":"Accepted to 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2406.01395v3","updated":"2024-11-14T14:39:07Z","published":"2024-06-03T14:58:49Z","title":"TE-NeXt: A LiDAR-Based 3D Sparse Convolutional Network for\n Traversability Estimation","summary":" This paper presents TE-NeXt, a novel and efficient architecture for\nTraversability Estimation (TE) from sparse LiDAR point clouds based on a\nresidual convolution block. TE-NeXt block fuses notions of current trends such\nas attention mechanisms and 3D sparse convolutions. TE-NeXt aims to demonstrate\nhigh capacity for generalisation in a variety of urban and natural\nenvironments, using well-known and accessible datasets such as SemanticKITTI,\nRellis-3D and SemanticUSL. 
Thus, the designed architecture ouperforms\nstate-of-the-art methods in the problem of semantic segmentation, demonstrating\nbetter results in unstructured environments and maintaining high reliability\nand robustness in urbans environments, which leads to better abstraction.\nImplementation is available in a open repository to the scientific community\nwith the aim of ensuring the reproducibility of results.\n","authors":["Antonio Santo","Juan J. Cabrera","David Valiente","Carlos Viegas","Arturo Gil"],"pdf_url":"https://arxiv.org/pdf/2406.01395v3.pdf","comment":"This work has been submitted to the Expert Systems With applications"},{"id":"http://arxiv.org/abs/2411.09484v1","updated":"2024-11-14T14:37:50Z","published":"2024-11-14T14:37:50Z","title":"Image Matching Filtering and Refinement by Planes and Beyond","summary":" This paper introduces a modular, non-deep learning method for filtering and\nrefining sparse correspondences in image matching. Assuming that motion flow\nwithin the scene can be approximated by local homography transformations,\nmatches are aggregated into overlapping clusters corresponding to virtual\nplanes using an iterative RANSAC-based approach, with non-conforming\ncorrespondences discarded. Moreover, the underlying planar structural design\nprovides an explicit map between local patches associated with the matches,\nenabling optional refinement of keypoint positions through cross-correlation\ntemplate matching after patch reprojection. Finally, to enhance robustness and\nfault-tolerance against violations of the piece-wise planar approximation\nassumption, a further strategy is designed for minimizing relative patch\ndistortion in the plane reprojection by introducing an intermediate homography\nthat projects both patches into a common plane. The proposed method is\nextensively evaluated on standard datasets and image matching pipelines, and\ncompared with state-of-the-art approaches. 
Unlike other current comparisons,\nthe proposed benchmark also takes into account the more general, real, and\npractical cases where camera intrinsics are unavailable. Experimental results\ndemonstrate that our proposed non-deep learning, geometry-based approach\nachieves performances that are either superior to or on par with recent\nstate-of-the-art deep learning methods. Finally, this study suggests that there\nare still development potential in actual image matching solutions in the\nconsidered research direction, which could be in the future incorporated in\nnovel deep image matching architectures.\n","authors":["Fabio Bellavia","Zhenjun Zhao","Luca Morelli","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2411.09484v1.pdf","comment":"project page: https://github.com/fb82/MiHo"},{"id":"http://arxiv.org/abs/2410.03979v3","updated":"2024-11-14T14:34:13Z","published":"2024-10-04T23:37:21Z","title":"Improving Arabic Multi-Label Emotion Classification using Stacked\n Embeddings and Hybrid Loss Function","summary":" In multi-label emotion classification, particularly for low-resource\nlanguages like Arabic, the challenges of class imbalance and label correlation\nhinder model performance, especially in accurately predicting minority\nemotions. To address these issues, this study proposes a novel approach that\ncombines stacked embeddings, meta-learning, and a hybrid loss function to\nenhance multi-label emotion classification for the Arabic language. The study\nextracts contextual embeddings from three fine-tuned language\nmodels-ArabicBERT, MarBERT, and AraBERT-which are then stacked to form enriched\nembeddings. A meta-learner is trained on these stacked embeddings, and the\nresulting concatenated representations are provided as input to a Bi-LSTM\nmodel, followed by a fully connected neural network for multi-label\nclassification. 
To further improve performance, a hybrid loss function is\nintroduced, incorporating class weighting, label correlation matrix, and\ncontrastive learning, effectively addressing class imbalances and improving the\nhandling of label correlations. Extensive experiments validate the proposed\nmodel's performance across key metrics such as Precision, Recall, F1-Score,\nJaccard Accuracy, and Hamming Loss. The class-wise performance analysis\ndemonstrates the hybrid loss function's ability to significantly reduce\ndisparities between majority and minority classes, resulting in a more balanced\nemotion classification. An ablation study highlights the contribution of each\ncomponent, showing the superiority of the model compared to baseline approaches\nand other loss functions. This study not only advances multi-label emotion\nclassification for Arabic but also presents a generalizable framework that can\nbe adapted to other languages and domains, providing a significant step forward\nin addressing the challenges of low-resource emotion classification tasks.\n","authors":["Muhammad Azeem Aslam","Wang Jun","Nisar Ahmed","Muhammad Imran Zaman","Li Yanan","Hu Hongfei","Wang Shiyu","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2410.03979v3.pdf","comment":"The paper is submitted in Scientific Reports and is currently under\n review"},{"id":"http://arxiv.org/abs/2411.09471v1","updated":"2024-11-14T14:21:49Z","published":"2024-11-14T14:21:49Z","title":"Renal Cell Carcinoma subtyping: learning from multi-resolution\n localization","summary":" Renal Cell Carcinoma is typically asymptomatic at the early stages for many\npatients. This leads to a late diagnosis of the tumor, where the curability\nlikelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high,\nwith respect to its incidence rate. To increase the survival chance, a fast and\ncorrect categorization of the tumor subtype is paramount. 
Nowadays,\ncomputerized methods, based on artificial intelligence, represent an\ninteresting opportunity to improve the productivity and the objectivity of the\nmicroscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their\nexploitation is hampered by the paucity of annotated dataset, essential for a\nproficient training of supervised machine learning technologies. This study\nsets out to investigate a novel self supervised training strategy for machine\nlearning diagnostic tools, based on the multi-resolution nature of the\nhistological samples. We aim at reducing the need of annotated dataset, without\nsignificantly reducing the accuracy of the tool. We demonstrate the\nclassification capability of our tool on a whole slide imaging dataset for\nRenal Cancer subtyping, and we compare our solution with several\nstate-of-the-art classification counterparts.\n","authors":["Mohamad Mohamad","Francesco Ponzio","Santa Di Cataldo","Damien Ambrosetti","Xavier Descombes"],"pdf_url":"https://arxiv.org/pdf/2411.09471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09462v1","updated":"2024-11-14T14:12:16Z","published":"2024-11-14T14:12:16Z","title":"SINETRA: a Versatile Framework for Evaluating Single Neuron Tracking in\n Behaving Animals","summary":" Accurately tracking neuronal activity in behaving animals presents\nsignificant challenges due to complex motions and background noise. The lack of\nannotated datasets limits the evaluation and improvement of such tracking\nalgorithms. To address this, we developed SINETRA, a versatile simulator that\ngenerates synthetic tracking data for particles on a deformable background,\nclosely mimicking live animal recordings. This simulator produces annotated 2D\nand 3D videos that reflect the intricate movements seen in behaving animals\nlike Hydra Vulgaris. 
We evaluated four state-of-the-art tracking algorithms\nhighlighting the current limitations of these methods in challenging scenarios\nand paving the way for improved cell tracking techniques in dynamic biological\nsystems.\n","authors":["Raphael Reme","Alasdair Newson","Elsa Angelini","Jean-Christophe Olivo-Marin","Thibault Lagach"],"pdf_url":"https://arxiv.org/pdf/2411.09462v1.pdf","comment":"5 pages, 3 figures, submitted at 2025 IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2402.03227v4","updated":"2024-11-14T14:11:57Z","published":"2024-02-05T17:38:49Z","title":"IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of\n brain MR images","summary":" In MRI studies, the aggregation of imaging data from multiple acquisition\nsites enhances sample size but may introduce site-related variabilities that\nhinder consistency in subsequent analyses. Deep learning methods for image\ntranslation have emerged as a solution for harmonizing MR images across sites.\nIn this study, we introduce IGUANe (Image Generation with Unified Adversarial\nNetworks), an original 3D model that leverages the strengths of domain\ntranslation and straightforward application of style transfer methods for\nmulticenter brain MR image harmonization. IGUANe extends CycleGAN by\nintegrating an arbitrary number of domains for training through a many-to-one\narchitecture. The framework based on domain pairs enables the implementation of\nsampling strategies that prevent confusion between site-related and biological\nvariabilities. During inference, the model can be applied to any image, even\nfrom an unknown acquisition site, making it a universal generator for\nharmonization. Trained on a dataset comprising T1-weighted images from 11\ndifferent scanners, IGUANe was evaluated on data from unseen sites. 
The\nassessments included the transformation of MR images with traveling subjects,\nthe preservation of pairwise distances between MR images within domains, the\nevolution of volumetric patterns related to age and Alzheimer$'$s disease (AD),\nand the performance in age regression and patient classification tasks.\nComparisons with other harmonization and normalization methods suggest that\nIGUANe better preserves individual information in MR images and is more\nsuitable for maintaining and reinforcing variabilities related to age and AD.\nFuture studies may further assess IGUANe in other multicenter contexts, either\nusing the same model or retraining it for applications to different image\nmodalities. IGUANe is available at\nhttps://github.com/RocaVincent/iguane_harmonization.git.\n","authors":["Vincent Roca","Grégory Kuchcinski","Jean-Pierre Pruvo","Dorian Manouvriez","Renaud Lopes"],"pdf_url":"https://arxiv.org/pdf/2402.03227v4.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.08656v2","updated":"2024-11-14T14:11:06Z","published":"2024-11-13T14:46:41Z","title":"MikuDance: Animating Character Art with Mixed Motion Dynamics","summary":" We propose MikuDance, a diffusion-based pipeline incorporating mixed motion\ndynamics to animate stylized character art. MikuDance consists of two key\ntechniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the\nchallenges of high-dynamic motion and reference-guidance misalignment in\ncharacter art animation. Specifically, a Scene Motion Tracking strategy is\npresented to explicitly model the dynamic camera in pixel-wise space, enabling\nunified character-scene motion modeling. 
Building on this, the Mixed-Control\nDiffusion implicitly aligns the scale and body shape of diverse characters with\nmotion guidance, allowing flexible control of local character motion.\nSubsequently, a Motion-Adaptive Normalization module is incorporated to\neffectively inject global scene motion, paving the way for comprehensive\ncharacter art animation. Through extensive experiments, we demonstrate the\neffectiveness and generalizability of MikuDance across various character art\nand motion guidance, consistently producing high-quality animations with\nremarkable motion dynamics.\n","authors":["Jiaxu Zhang","Xianfang Zeng","Xin Chen","Wei Zuo","Gang Yu","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2411.08656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09453v1","updated":"2024-11-14T13:59:01Z","published":"2024-11-14T13:59:01Z","title":"Long-Tailed Object Detection Pre-training: Dynamic Rebalancing\n Contrastive Learning with Dual Reconstruction","summary":" Pre-training plays a vital role in various vision tasks, such as object\nrecognition and detection. Commonly used pre-training methods, which typically\nrely on randomized approaches like uniform or Gaussian distributions to\ninitialize model parameters, often fall short when confronted with long-tailed\ndistributions, especially in detection tasks. This is largely due to extreme\ndata imbalance and the issue of simplicity bias. In this paper, we introduce a\nnovel pre-training framework for object detection, called Dynamic Rebalancing\nContrastive Learning with Dual Reconstruction (2DRCL). Our method builds on a\nHolistic-Local Contrastive Learning mechanism, which aligns pre-training with\nobject detection by capturing both global contextual semantics and detailed\nlocal patterns. 
To tackle the imbalance inherent in long-tailed data, we design\na dynamic rebalancing strategy that adjusts the sampling of underrepresented\ninstances throughout the pre-training process, ensuring better representation\nof tail classes. Moreover, Dual Reconstruction addresses simplicity bias by\nenforcing a reconstruction task aligned with the self-consistency principle,\nspecifically benefiting underrepresented tail classes. Experiments on COCO and\nLVIS v1.0 datasets demonstrate the effectiveness of our method, particularly in\nimproving the mAP/AP scores for tail classes.\n","authors":["Chen-Long Duan","Yong Li","Xiu-Shen Wei","Lin Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09453v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09449v1","updated":"2024-11-14T13:52:43Z","published":"2024-11-14T13:52:43Z","title":"Image Regeneration: Evaluating Text-to-Image Model via Generating\n Identical Image with Multimodal Large Language Models","summary":" Diffusion models have revitalized the image generation domain, playing\ncrucial roles in both academic research and artistic expression. With the\nemergence of new diffusion models, assessing the performance of text-to-image\nmodels has become increasingly important. Current metrics focus on directly\nmatching the input text with the generated image, but due to cross-modal\ninformation asymmetry, this leads to unreliable or incomplete assessment\nresults. Motivated by this, we introduce the Image Regeneration task in this\nstudy to assess text-to-image models by tasking the T2I model with generating\nan image according to the reference image. We use GPT4V to bridge the gap\nbetween the reference image and the text input for the T2I model, allowing T2I\nmodels to understand image content. This evaluation process is simplified as\ncomparisons between the generated image and the reference image are\nstraightforward. 
Two regeneration datasets spanning content-diverse and\nstyle-diverse evaluation dataset are introduced to evaluate the leading\ndiffusion models currently available. Additionally, we present ImageRepainter\nframework to enhance the quality of generated images by improving content\ncomprehension via MLLM guided iterative generation and revision. Our\ncomprehensive experiments have showcased the effectiveness of this framework in\nassessing the generative capabilities of models. By leveraging MLLM, we have\ndemonstrated that a robust T2M can produce images more closely resembling the\nreference image.\n","authors":["Chutian Meng","Fan Ma","Jiaxu Miao","Chi Zhang","Yi Yang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2411.09449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06651v2","updated":"2024-11-14T13:26:35Z","published":"2024-11-11T01:36:48Z","title":"Machine learning-enabled velocity model building with uncertainty\n quantification","summary":" Accurately characterizing migration velocity models is crucial for a wide\nrange of geophysical applications, from hydrocarbon exploration to monitoring\nof CO2 sequestration projects. Traditional velocity model building methods such\nas Full-Waveform Inversion (FWI) are powerful but often struggle with the\ninherent complexities of the inverse problem, including noise, limited\nbandwidth, receiver aperture and computational constraints. To address these\nchallenges, we propose a scalable methodology that integrates generative\nmodeling, in the form of Diffusion networks, with physics-informed summary\nstatistics, making it suitable for complicated imaging problems including field\ndatasets. By defining these summary statistics in terms of subsurface-offset\nimage volumes for poor initial velocity models, our approach allows for\ncomputationally efficient generation of Bayesian posterior samples for\nmigration velocity models that offer a useful assessment of uncertainty. 
To\nvalidate our approach, we introduce a battery of tests that measure the quality\nof the inferred velocity models, as well as the quality of the inferred\nuncertainties. With modern synthetic datasets, we reconfirm gains from using\nsubsurface-image gathers as the conditioning observable. For complex velocity\nmodel building involving salt, we propose a new iterative workflow that refines\namortized posterior approximations with salt flooding and demonstrate how the\nuncertainty in the velocity model can be propagated to the final product\nreverse time migrated images. Finally, we present a proof of concept on field\ndatasets to show that our method can scale to industry-sized problems.\n","authors":["Rafael Orozco","Huseyin Tuna Erdinc","Yunlin Zeng","Mathias Louboutin","Felix J. Herrmann"],"pdf_url":"https://arxiv.org/pdf/2411.06651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09420v1","updated":"2024-11-14T13:15:27Z","published":"2024-11-14T13:15:27Z","title":"SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph\n Attention for Vision Transformers","summary":" Image classification is a computer vision task where a model analyzes an\nimage to categorize it into a specific label. Vision Transformers (ViT) improve\nthis task by leveraging self-attention to capture complex patterns and long\nrange relationships between image patches. However, a key challenge for ViTs is\nefficiently incorporating multiscale feature representations, which is inherent\nin CNNs through their hierarchical structure. In this paper, we introduce the\nScale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework\nthat addresses this challenge by integrating multi-scale features. Using\nEfficientNet as a backbone, the model extracts multi-scale feature maps, which\nare divided into patches to preserve semantic information. 
These patches are\norganized into a graph based on spatial and feature similarities, with a Graph\nAttention Network (GAT) refining the node embeddings. Finally, a Transformer\nencoder captures long-range dependencies and complex interactions. The SAG-ViT\nis evaluated on benchmark datasets, demonstrating its effectiveness in\nenhancing image classification performance.\n","authors":["Shravan Venkatraman","Jaskaran Singh Walia","Joe Dhanith P R"],"pdf_url":"https://arxiv.org/pdf/2411.09420v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.20155v2","updated":"2024-11-14T13:07:26Z","published":"2024-05-30T15:30:38Z","title":"MotionDreamer: Exploring Semantic Video Diffusion features for Zero-Shot\n 3D Mesh Animation","summary":" Animation techniques bring digital 3D worlds and characters to life. However,\nmanual animation is tedious and automated techniques are often specialized to\nnarrow shape classes. In our work, we propose a technique for automatic\nre-animation of various 3D shapes based on a motion prior extracted from a\nvideo diffusion model. Unlike existing 4D generation methods, we focus solely\non the motion, and we leverage an explicit mesh-based representation compatible\nwith existing computer-graphics pipelines. Furthermore, our utilization of\ndiffusion features enhances accuracy of our motion fitting. We analyze efficacy\nof these features for animation fitting and we experimentally validate our\napproach for two different diffusion models and four animation models. Finally,\nwe demonstrate that our time-efficient zero-shot method achieves a superior\nperformance re-animating a diverse set of 3D shapes when compared to existing\ntechniques in a user study. 
The project website is located at\nhttps://lukas.uzolas.com/MotionDreamer.\n","authors":["Lukas Uzolas","Elmar Eisemann","Petr Kellnhofer"],"pdf_url":"https://arxiv.org/pdf/2405.20155v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09413v1","updated":"2024-11-14T13:07:19Z","published":"2024-11-14T13:07:19Z","title":"Script-centric behavior understanding for assisted autism spectrum\n disorder diagnosis","summary":" Observing and analyzing children's social behaviors is crucial for the early\ndiagnosis of Autism Spectrum Disorders (ASD). This work focuses on\nautomatically detecting ASD using computer vision techniques and large language\nmodels (LLMs). Existing methods typically rely on supervised learning. However,\nthe scarcity of ASD diagnostic datasets and the lack of interpretability in\ndiagnostic results significantly limits its clinical application. To address\nthese challenges, we introduce a novel unsupervised approach based on\nscript-centric behavior understanding. Our pipeline converts video content into\nscripts that describe the behavior of characters, leveraging the\ngeneralizability of large language models to detect ASD in a zero-shot or\nfew-shot manner. Specifically, we propose a scripts transcription module for\nmultimodal behavior data textualization and a domain prompts module to bridge\nLLMs. Our method achieves an accuracy of 92.00\\% in diagnosing ASD in children\nwith an average age of 24 months, surpassing the performance of supervised\nlearning methods by 3.58\\% absolutely. 
Extensive experiments confirm the\neffectiveness of our approach and suggest its potential for advancing ASD\nresearch through LLMs.\n","authors":["Wenxing Liu","Yueran Pan","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2411.09413v1.pdf","comment":"5 pages, 4 figures, submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2411.09411v1","updated":"2024-11-14T13:06:18Z","published":"2024-11-14T13:06:18Z","title":"Building Height Estimation Using Shadow Length in Satellite Imagery","summary":" Estimating building height from satellite imagery poses significant\nchallenges, especially when monocular images are employed, resulting in a loss\nof essential 3D information during imaging. This loss of spatial depth further\ncomplicates the height estimation process. We addressed this issue by using\nshadow length as an additional cue to compensate for the loss of building\nheight estimation using single-view imagery. We proposed a novel method that\nfirst localized a building and its shadow in the given satellite image. After\nlocalization, the shadow length is estimated using a regression model. To\nestimate the final height of each building, we utilize the principles of\nphotogrammetry, specifically considering the relationship between the solar\nelevation angle, the vertical edge length of the building, and the length of\nthe building's shadow. For the localization of buildings in our model, we\nutilized a modified YOLOv7 detector, and to regress the shadow length for each\nbuilding we utilized the ResNet18 as backbone architecture. Finally, we\nestimated the associated building height using solar elevation with shadow\nlength through analytical formulation. 
We evaluated our method on 42 different\ncities and the results showed that the proposed framework surpasses the\nstate-of-the-art methods with a suitable margin.\n","authors":["Mahd Qureshi","Shayaan Chaudhry","Sana Jabba","Murtaza Taj"],"pdf_url":"https://arxiv.org/pdf/2411.09411v1.pdf","comment":"6 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2410.17856v2","updated":"2024-11-14T12:29:41Z","published":"2024-10-23T13:26:59Z","title":"ROCKET-1: Mastering Open-World Interaction with Visual-Temporal Context\n Prompting","summary":" Vision-language models (VLMs) have excelled in multimodal tasks, but adapting\nthem to embodied decision-making in open-world environments presents\nchallenges. One critical issue is bridging the gap between discrete entities in\nlow-level observations and the abstract concepts required for effective\nplanning. A common solution is building hierarchical agents, where VLMs serve\nas high-level reasoners that break down tasks into executable sub-tasks,\ntypically specified using language. However, language suffers from the\ninability to communicate detailed spatial information. We propose\nvisual-temporal context prompting, a novel communication protocol between VLMs\nand policy models. This protocol leverages object segmentation from past\nobservations to guide policy-environment interactions. Using this approach, we\ntrain ROCKET-1, a low-level policy that predicts actions based on concatenated\nvisual observations and segmentation masks, supported by real-time object\ntracking from SAM-2. Our method unlocks the potential of VLMs, enabling them to\ntackle complex tasks that demand spatial reasoning. Experiments in Minecraft\nshow that our approach enables agents to achieve previously unattainable tasks,\nwith a $\\mathbf{76}\\%$ absolute improvement in open-world interaction\nperformance. 
Codes and demos are now available on the project page:\nhttps://craftjarvis.github.io/ROCKET-1.\n","authors":["Shaofei Cai","Zihao Wang","Kewei Lian","Zhancun Mu","Xiaojian Ma","Anji Liu","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2410.17856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09402v1","updated":"2024-11-14T12:27:31Z","published":"2024-11-14T12:27:31Z","title":"Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast\n Computed Tomography Images for Enhanced Treatment and Prognosis","summary":" Stroke is the second leading cause of death worldwide, and is increasingly\nprevalent in low- and middle-income countries (LMICs). Timely interventions can\nsignificantly influence stroke survivability and the quality of life after\ntreatment. However, the standard and most widely available imaging method for\nconfirming strokes and their sub-types, the NCCT, is more challenging and\ntime-consuming to employ in cases of ischemic stroke. For this reason, we\ndeveloped an automated method for ischemic stroke lesion segmentation in NCCTs\nusing the nnU-Net frame work, aimed at enhancing early treatment and improving\nthe prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and\nIntersection over Union (IoU) scores of 0.501 on the sampled dataset. After\nadjusting for outliers, these scores improved to 0.752 for the Dice score and\n0.643 for the IoU. 
Proper delineation of the region of infarction can help\nclinicians better assess the potential impact of the infarction, and guide\ntreatment procedures.\n","authors":["Toufiq Musah","Prince Ebenezer Adjei","Kojo Obed Otoo"],"pdf_url":"https://arxiv.org/pdf/2411.09402v1.pdf","comment":"7 pages, 3 figures, MICCAI Meets Africa Workshop"},{"id":"http://arxiv.org/abs/2410.21991v5","updated":"2024-11-14T12:19:26Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, most of them were\nblack-box systems which faced challenges regarding explainability during\ntraining and inference processes. An important question is how to incorporate\nexplicit knowledge into these implicit models, thereby designing expertdriven\nand interpretable violence surveillance systems. This paper proposes a new\nparadigm for weakly supervised violence monitoring (WSVM) called Rule base\nViolence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure\nwith different designs for images and text. One of the branches is called the\nimplicit branch, which uses only visual features for coarse-grained binary\nclassification. In this branch, image feature extraction is divided into two\nchannels: one responsible for extracting scene frames and the other focusing on\nextracting actions. The other branch is called the explicit branch, which\nutilizes language-image alignment to perform fine-grained classification. For\nthe language channel design in the explicit branch, the proposed RuleVM uses\nthe state-of-the-art YOLOWorld model to detect objects in video frames, and\nassociation rules are identified through data mining methods as descriptions of\nthe video. 
Leveraging the dual-branch architecture, RuleVM achieves\ninterpretable coarse-grained and fine-grained violence surveillance. Extensive\nexperiments were conducted on two commonly used benchmarks, and the results\nshow that RuleVM achieved the best performance in both coarse-grained and\nfinegrained monitoring, significantly outperforming existing state-ofthe-art\nmethods. Moreover, interpretability experiments uncovered some interesting\nrules, such as the observation that as the number of people increases, the risk\nlevel of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Ssu-Chi Kuai","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v5.pdf","comment":"12 pages,7 figures IEEE TSMCA (Under review)"},{"id":"http://arxiv.org/abs/2310.03525v4","updated":"2024-11-14T12:17:17Z","published":"2023-10-05T13:19:48Z","title":"V2X Cooperative Perception for Autonomous Driving: Recent Advances and\n Challenges","summary":" Achieving fully autonomous driving with heightened safety and efficiency\ndepends on vehicle-to-everything (V2X) cooperative perception (CP), which\nallows vehicles to share perception data, thereby enhancing situational\nawareness and overcoming the limitations of the sensing ability of individual\nvehicles. V2X CP is crucial for extending perception range, improving accuracy,\nand strengthening the decision-making and control capabilities of autonomous\nvehicles in complex environments. This paper provides a comprehensive survey of\nrecent advances in V2X CP, introducing mathematical models of CP processes\nacross various collaboration strategies. We examine essential techniques for\nreliable perception sharing, including agent selection, data alignment, and\nfusion methods. Key issues are analyzed, such as agent and model heterogeneity,\nperception uncertainty, and the impact of V2X communication constraints like\ndelays and data loss on CP effectiveness. 
To inspire further advancements in\nV2X CP, we outline promising avenues, including privacy-preserving artificial\nintelligence (AI), collaborative AI, and integrated sensing frameworks, as\npathways to enhance CP capabilities.\n","authors":["Tao Huang","Jianan Liu","Xi Zhou","Dinh C. Nguyen","Mostafa Rahimi Azghadi","Yuxuan Xia","Qing-Long Han","Sumei Sun"],"pdf_url":"https://arxiv.org/pdf/2310.03525v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12514v4","updated":"2024-11-14T12:03:37Z","published":"2024-09-19T07:10:18Z","title":"TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for\n Robotic Manipulation","summary":" Vision-Language-Action (VLA) models have shown remarkable potential in\nvisuomotor control and instruction comprehension through end-to-end learning\nprocesses. However, current VLA models face significant challenges: they are\nslow during inference and require extensive pre-training on large amounts of\nrobotic data, making real-world deployment difficult. In this paper, we\nintroduce a new family of compact vision-language-action models, called\nTinyVLA, which offers two key advantages over existing VLA models: (1) faster\ninference speeds, and (2) improved data efficiency, eliminating the need for\npre-training stage. Our framework incorporates two essential components to\nbuild TinyVLA: (1) initializing the policy backbone with robust, high-speed\nmultimodal models, and (2) integrating a diffusion policy decoder during\nfine-tuning to enable precise robot actions. We conducted extensive evaluations\nof TinyVLA in both simulation and on real robots, demonstrating that our\napproach significantly outperforms the state-of-the-art VLA model, OpenVLA, in\nterms of speed and data efficiency, while delivering comparable or superior\nperformance. 
Additionally, TinyVLA exhibits strong generalization capabilities\nacross various dimensions, including language instructions, novel objects,\nunseen positions, changes in object appearance, background variations, and\nenvironmental shifts, often matching or exceeding the performance of OpenVLA.\nWe believe that \\methodname offers an interesting perspective on utilizing\npre-trained multimodal models for policy learning. Our project is at\nhttps://tiny-vla.github.io.\n","authors":["Junjie Wen","Yichen Zhu","Jinming Li","Minjie Zhu","Kun Wu","Zhiyuan Xu","Ning Liu","Ran Cheng","Chaomin Shen","Yaxin Peng","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.12514v4.pdf","comment":"add more citations"},{"id":"http://arxiv.org/abs/2411.09387v1","updated":"2024-11-14T12:02:01Z","published":"2024-11-14T12:02:01Z","title":"Instruction-Driven Fusion of Infrared-Visible Images: Tailoring for\n Diverse Downstream Tasks","summary":" The primary value of infrared and visible image fusion technology lies in\napplying the fusion results to downstream tasks. However, existing methods face\nchallenges such as increased training complexity and significantly compromised\nperformance of individual tasks when addressing multiple downstream tasks\nsimultaneously. To tackle this, we propose Task-Oriented Adaptive Regulation\n(T-OAR), an adaptive mechanism specifically designed for multi-task\nenvironments. Additionally, we introduce the Task-related Dynamic Prompt\nInjection (T-DPI) module, which generates task-specific dynamic prompts from\nuser-input text instructions and integrates them into target representations.\nThis guides the feature extraction module to produce representations that are\nmore closely aligned with the specific requirements of downstream tasks. By\nincorporating the T-DPI module into the T-OAR framework, our approach generates\nfusion images tailored to task-specific requirements without the need for\nseparate training or task-specific weights. 
This not only reduces computational\ncosts but also enhances adaptability and performance across multiple tasks.\nExperimental results show that our method excels in object detection, semantic\nsegmentation, and salient object detection, demonstrating its strong\nadaptability, flexibility, and task specificity. This provides an efficient\nsolution for image fusion in multi-task environments, highlighting the\ntechnology's potential across diverse applications.\n","authors":["Zengyi Yang","Yafei Zhang","Huafeng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09387v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.05767v2","updated":"2024-11-14T11:27:41Z","published":"2024-10-08T07:48:34Z","title":"Grounding is All You Need? Dual Temporal Grounding for Video Dialog","summary":" In the realm of video dialog response generation, the understanding of video\ncontent and the temporal nuances of conversation history are paramount. While a\nsegment of current research leans heavily on large-scale pretrained\nvisual-language models and often overlooks temporal dynamics, another delves\ndeep into spatial-temporal relationships within videos but demands intricate\nobject trajectory pre-extractions and sidelines dialog temporal dynamics. This\npaper introduces the Dual Temporal Grounding-enhanced Video Dialog model\n(DTGVD), strategically designed to merge the strengths of both dominant\napproaches. It emphasizes dual temporal relationships by predicting dialog\nturn-specific temporal regions, filtering video content accordingly, and\ngrounding responses in both video and dialog contexts. One standout feature of\nDTGVD is its heightened attention to chronological interplay. By recognizing\nand acting upon the dependencies between different dialog turns, it captures\nmore nuanced conversational dynamics. To further bolster the alignment between\nvideo and dialog temporal dynamics, we've implemented a list-wise contrastive\nlearning strategy. 
Within this framework, accurately grounded turn-clip\npairings are designated as positive samples, while less precise pairings are\ncategorized as negative. This refined classification is then funneled into our\nholistic end-to-end response generation mechanism. Evaluations using\nAVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our\nmethodology.\n","authors":["You Qin","Wei Ji","Xinze Lan","Hao Fei","Xun Yang","Dan Guo","Roger Zimmermann","Lizi Liao"],"pdf_url":"https://arxiv.org/pdf/2410.05767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09373v1","updated":"2024-11-14T11:27:15Z","published":"2024-11-14T11:27:15Z","title":"Are nuclear masks all you need for improved out-of-domain\n generalisation? A closer look at cancer classification in histopathology","summary":" Domain generalisation in computational histopathology is challenging because\nthe images are substantially affected by differences among hospitals due to\nfactors like fixation and staining of tissue and imaging equipment. We\nhypothesise that focusing on nuclei can improve the out-of-domain (OOD)\ngeneralisation in cancer detection. We propose a simple approach to improve OOD\ngeneralisation for cancer detection by focusing on nuclear morphology and\norganisation, as these are domain-invariant features critical in cancer\ndetection. Our approach integrates original images with nuclear segmentation\nmasks during training, encouraging the model to prioritise nuclei and their\nspatial arrangement. Going beyond mere data augmentation, we introduce a\nregularisation technique that aligns the representations of masks and original\nimages. We show, using multiple datasets, that our method improves OOD\ngeneralisation and also leads to increased robustness to image corruptions and\nadversarial attacks. 
The source code is available at\nhttps://github.com/undercutspiky/SFL/\n","authors":["Dhananjay Tomar","Alexander Binder","Andreas Kleppe"],"pdf_url":"https://arxiv.org/pdf/2411.09373v1.pdf","comment":"Poster at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09371v1","updated":"2024-11-14T11:25:32Z","published":"2024-11-14T11:25:32Z","title":"DSCformer: A Dual-Branch Network Integrating Enhanced Dynamic Snake\n Convolution and SegFormer for Crack Segmentation","summary":" In construction quality monitoring, accurately detecting and segmenting\ncracks in concrete structures is paramount for safety and maintenance. Current\nconvolutional neural networks (CNNs) have demonstrated strong performance in\ncrack segmentation tasks, yet they often struggle with complex backgrounds and\nfail to capture fine-grained tubular structures fully. In contrast,\nTransformers excel at capturing global context but lack precision in detailed\nfeature extraction. We introduce DSCformer, a novel hybrid model that\nintegrates an enhanced Dynamic Snake Convolution (DSConv) with a Transformer\narchitecture for crack segmentation to address these challenges. Our key\ncontributions include the enhanced DSConv through a pyramid kernel for adaptive\noffset computation and a simultaneous bi-directional learnable offset\niteration, significantly improving the model's performance to capture intricate\ncrack patterns. Additionally, we propose a Weighted Convolutional Attention\nModule (WCAM), which refines channel attention, allowing for more precise and\nadaptive feature attention. We evaluate DSCformer on the Crack3238 and FIND\ndatasets, achieving IoUs of 59.22\\% and 87.24\\%, respectively. 
The experimental\nresults suggest that our DSCformer outperforms state-of-the-art methods across\ndifferent datasets.\n","authors":["Kaiwei Yu","I-Ming Chen","Jing Wu"],"pdf_url":"https://arxiv.org/pdf/2411.09371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12778v2","updated":"2024-11-14T11:21:09Z","published":"2024-03-19T14:45:17Z","title":"ViTGaze: Gaze Following with Interaction Features in Vision Transformers","summary":" Gaze following aims to interpret human-scene interactions by predicting the\nperson's focal point of gaze. Prevailing approaches often adopt a two-stage\nframework, whereby multi-modality information is extracted in the initial stage\nfor gaze target prediction. Consequently, the efficacy of these methods highly\ndepends on the precision of the preceding modality extraction. Others use a\nsingle-modality approach with complex decoders, increasing network\ncomputational load. Inspired by the remarkable success of pre-trained plain\nvision transformers (ViTs), we introduce a novel single-modality gaze following\nframework called ViTGaze. In contrast to previous methods, it creates a novel\ngaze following framework based mainly on powerful encoders (relative decoder\nparameters less than 1%). Our principal insight is that the inter-token\ninteractions within self-attention can be transferred to interactions between\nhumans and scenes. Leveraging this presumption, we formulate a framework\nconsisting of a 4D interaction encoder and a 2D spatial guidance module to\nextract human-scene interaction information from self-attention maps.\nFurthermore, our investigation reveals that ViT with self-supervised\npre-training has an enhanced ability to extract correlation information. Many\nexperiments have been conducted to demonstrate the performance of the proposed\nmethod. 
Our method achieves state-of-the-art (SOTA) performance among all\nsingle-modality methods (3.4% improvement in the area under curve (AUC) score,\n5.1% improvement in the average precision (AP)) and very comparable performance\nagainst multi-modality methods with 59% number of parameters less.\n","authors":["Yuehao Song","Xinggang Wang","Jingfeng Yao","Wenyu Liu","Jinglin Zhang","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2403.12778v2.pdf","comment":"15 pages; Accepted by Visual Intelligence"},{"id":"http://arxiv.org/abs/2411.09361v1","updated":"2024-11-14T11:08:54Z","published":"2024-11-14T11:08:54Z","title":"Time-to-Event Pretraining for 3D Medical Imaging","summary":" With the rise of medical foundation models and the growing availability of\nimaging data, scalable pretraining techniques offer a promising way to identify\nimaging biomarkers predictive of future disease risk. While current\nself-supervised methods for 3D medical imaging models capture local structural\nfeatures like organ morphology, they fail to link pixel biomarkers with\nlong-term health outcomes due to a missing context problem. Current approaches\nlack the temporal context necessary to identify biomarkers correlated with\ndisease progression, as they rely on supervision derived only from images and\nconcurrent text descriptions. To address this, we introduce time-to-event\npretraining, a pretraining framework for 3D medical imaging models that\nleverages large-scale temporal supervision from paired, longitudinal electronic\nhealth records (EHRs). Using a dataset of 18,945 CT scans (4.2 million 2D\nimages) and time-to-event distributions across thousands of EHR-derived tasks,\nour method improves outcome prediction, achieving an average AUROC increase of\n23.7% and a 29.4% gain in Harrell's C-index across 8 benchmark tasks.\nImportantly, these gains are achieved without sacrificing diagnostic\nclassification performance. 
This study lays the foundation for integrating\nlongitudinal EHR and 3D imaging data to advance clinical risk prediction.\n","authors":["Zepeng Huo","Jason Alan Fries","Alejandro Lozano","Jeya Maria Jose Valanarasu","Ethan Steinberg","Louis Blankemeier","Akshay S. Chaudhari","Curtis Langlotz","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2411.09361v1.pdf","comment":"34 pages, 19 figures"},{"id":"http://arxiv.org/abs/2411.09344v1","updated":"2024-11-14T10:47:01Z","published":"2024-11-14T10:47:01Z","title":"Adaptively Augmented Consistency Learning: A Semi-supervised\n Segmentation Framework for Remote Sensing","summary":" Remote sensing (RS) involves the acquisition of data about objects or areas\nfrom a distance, primarily to monitor environmental changes, manage resources,\nand support planning and disaster response. A significant challenge in RS\nsegmentation is the scarcity of high-quality labeled images due to the\ndiversity and complexity of RS image, which makes pixel-level annotation\ndifficult and hinders the development of effective supervised segmentation\nalgorithms. To solve this problem, we propose Adaptively Augmented Consistency\nLearning (AACL), a semi-supervised segmentation framework designed to enhances\nRS segmentation accuracy under condictions of limited labeled data. AACL\nextracts additional information embedded in unlabeled images through the use of\nUniform Strength Augmentation (USAug) and Adaptive Cut-Mix (AdaCM). 
Evaluations\nacross various RS datasets demonstrate that AACL achieves competitive\nperformance in semi-supervised segmentation, showing up to a 20% improvement in\nspecific categories and 2% increase in overall performance compared to\nstate-of-the-art frameworks.\n","authors":["Hui Ye","Haodong Chen","Xiaoming Chen","Vera Chung"],"pdf_url":"https://arxiv.org/pdf/2411.09344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02562v2","updated":"2024-11-14T10:45:32Z","published":"2024-09-04T09:29:24Z","title":"One Homography is All You Need: IMM-based Joint Homography and Multiple\n Object State Estimation","summary":" A novel online MOT algorithm, IMM Joint Homography State Estimation\n(IMM-JHSE), is proposed. IMM-JHSE uses an initial homography estimate as the\nonly additional 3D information, whereas other 3D MOT methods use regular 3D\nmeasurements. By jointly modelling the homography matrix and its dynamics as\npart of track state vectors, IMM-JHSE removes the explicit influence of camera\nmotion compensation techniques on predicted track position states, which was\nprevalent in previous approaches. Expanding upon this, static and dynamic\ncamera motion models are combined using an IMM filter. A simple bounding box\nmotion model is used to predict bounding box positions to incorporate image\nplane information. In addition to applying an IMM to camera motion, a\nnon-standard IMM approach is applied where bounding-box-based BIoU scores are\nmixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to\nperform association only, making IMM-JHSE robust to motion away from the ground\nplane. Finally, IMM-JHSE makes use of dynamic process and measurement noise\nestimation techniques. 
IMM-JHSE improves upon related techniques, including\nUCMCTrack, OC-SORT, C-BIoU and ByteTrack on the DanceTrack and KITTI-car\ndatasets, increasing HOTA by 2.64 and 2.11, respectively, while offering\ncompetitive performance on the MOT17, MOT20 and KITTI-pedestrian datasets.\nUsing publicly available detections, IMM-JHSE outperforms almost all other 2D\nMOT methods and is outperformed only by 3D MOT methods -- some of which are\noffline -- on the KITTI-car dataset. Compared to tracking-by-attention methods,\nIMM-JHSE shows remarkably similar performance on the DanceTrack dataset and\noutperforms them on the MOT17 dataset. The code is publicly available:\n\\url{https://github.com/Paulkie99/imm-jhse}.\n","authors":["Paul Johannes Claasen","Johan Pieter de Villiers"],"pdf_url":"https://arxiv.org/pdf/2409.02562v2.pdf","comment":"Preprint submitted to Information Fusion"},{"id":"http://arxiv.org/abs/2405.18839v3","updated":"2024-11-14T10:27:51Z","published":"2024-05-29T07:40:31Z","title":"MEGA: Masked Generative Autoencoder for Human Mesh Recovery","summary":" Human Mesh Recovery (HMR) from a single RGB image is a highly ambiguous\nproblem, as an infinite set of 3D interpretations can explain the 2D\nobservation equally well. Nevertheless, most HMR methods overlook this issue\nand make a single prediction without accounting for this ambiguity. A few\napproaches generate a distribution of human meshes, enabling the sampling of\nmultiple predictions; however, none of them is competitive with the latest\nsingle-output model when making a single prediction. This work proposes a new\napproach based on masked generative modeling. By tokenizing the human pose and\nshape, we formulate the HMR task as generating a sequence of discrete tokens\nconditioned on an input image. We introduce MEGA, a MaskEd Generative\nAutoencoder trained to recover human meshes from images and partial human mesh\ntoken sequences. 
Given an image, our flexible generation scheme allows us to\npredict a single human mesh in deterministic mode or to generate multiple human\nmeshes in stochastic mode. Experiments on in-the-wild benchmarks show that MEGA\nachieves state-of-the-art performance in deterministic and stochastic modes,\noutperforming single-output and multi-output approaches.\n","authors":["Guénolé Fiche","Simon Leglaive","Xavier Alameda-Pineda","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2405.18839v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19513v3","updated":"2024-11-14T09:40:00Z","published":"2024-04-30T12:45:41Z","title":"A Smartphone-Based Method for Assessing Tomato Nutrient Status through\n Trichome Density Measurement","summary":" Early detection of fertilizer-induced stress in tomato plants is crucial for\noptimizing crop yield through timely management interventions. While\nconventional optical methods struggle to detect fertilizer stress in young\nleaves, these leaves contain valuable diagnostic information through their\nmicroscopic hair-like structures, particularly trichomes, which existing\napproaches have overlooked. This study introduces a smartphone-based\nnoninvasive technique that leverages mobile computing and digital imaging\ncapabilities to quantify trichome density on young leaves with superior\ndetection latency. Our method uniquely combines augmented reality technology\nwith image processing algorithms to analyze trichomes transferred onto\nspecialized measurement paper. A robust automated pipeline processes these\nimages through region extraction, perspective transformation, and illumination\ncorrection to precisely quantify trichome density. Validation experiments on\nhydroponically grown tomatoes under varying fertilizer conditions demonstrated\nthe method's effectiveness. 
Leave-one-out cross-validation revealed strong\npredictive performance with the area under the precision-recall curve (PR-AUC:\n0.82) and area under the receiver operating characteristic curve (ROC-AUC:\n0.64), while the predicted and observed trichome densities exhibited high\ncorrelation ($r = 0.79$). This innovative approach transforms smartphones into\nprecise diagnostic tools for plant nutrition assessment, offering a practical,\ncost-effective solution for precision agriculture.\n","authors":["Sho Ueda","Xujun Ye"],"pdf_url":"https://arxiv.org/pdf/2404.19513v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09310v1","updated":"2024-11-14T09:38:29Z","published":"2024-11-14T09:38:29Z","title":"Exploring Zero-Shot Anomaly Detection with CLIP in Medical Imaging: Are\n We There Yet?","summary":" Zero-shot anomaly detection (ZSAD) offers potential for identifying anomalies\nin medical imaging without task-specific training. In this paper, we evaluate\nCLIP-based models, originally developed for industrial tasks, on brain tumor\ndetection using the BraTS-MET dataset. Our analysis examines their ability to\ndetect medical-specific anomalies with no or minimal supervision, addressing\nthe challenges posed by limited data annotation. While these models show\npromise in transferring general knowledge to medical tasks, their performance\nfalls short of the precision required for clinical use. 
Our findings highlight\nthe need for further adaptation before CLIP-based models can be reliably\napplied to medical anomaly detection.\n","authors":["Aldo Marzullo","Marta Bianca Maria Ranzini"],"pdf_url":"https://arxiv.org/pdf/2411.09310v1.pdf","comment":"accepted at 3rd AIxIA Workshop on Artificial Intelligence for\n Healthcare and 5th Data4SmartHealth"},{"id":"http://arxiv.org/abs/2411.09308v1","updated":"2024-11-14T09:34:36Z","published":"2024-11-14T09:34:36Z","title":"DT-JRD: Deep Transformer based Just Recognizable Difference Prediction\n Model for Video Coding for Machines","summary":" Just Recognizable Difference (JRD) represents the minimum visual difference\nthat is detectable by machine vision, which can be exploited to promote machine\nvision oriented visual signal processing. In this paper, we propose a Deep\nTransformer based JRD (DT-JRD) prediction model for Video Coding for Machines\n(VCM), where the accurately predicted JRD can be used reduce the coding bit\nrate while maintaining the accuracy of machine tasks. Firstly, we model the JRD\nprediction as a multi-class classification and propose a DT-JRD prediction\nmodel that integrates an improved embedding, a content and distortion feature\nextraction, a multi-class classification and a novel learning strategy.\nSecondly, inspired by the perception property that machine vision exhibits a\nsimilar response to distortions near JRD, we propose an asymptotic JRD loss by\nusing Gaussian Distribution-based Soft Labels (GDSL), which significantly\nextends the number of training labels and relaxes classification boundaries.\nFinally, we propose a DT-JRD based VCM to reduce the coding bits while\nmaintaining the accuracy of object detection. Extensive experimental results\ndemonstrate that the mean absolute error of the predicted JRD by the DT-JRD is\n5.574, outperforming the state-of-the-art JRD prediction model by 13.1%. 
Coding\nexperiments shows that comparing with the VVC, the DT-JRD based VCM achieves an\naverage of 29.58% bit rate reduction while maintaining the object detection\naccuracy.\n","authors":["Junqi Liu","Yun Zhang","Xiaoqi Wang","Xu Long","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2411.09308v1.pdf","comment":"Submitted to IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2411.09301v1","updated":"2024-11-14T09:23:40Z","published":"2024-11-14T09:23:40Z","title":"LHRS-Bot-Nova: Improved Multimodal Large Language Model for Remote\n Sensing Vision-Language Interpretation","summary":" Automatically and rapidly understanding Earth's surface is fundamental to our\ngrasp of the living environment and informed decision-making. This underscores\nthe need for a unified system with comprehensive capabilities in analyzing\nEarth's surface to address a wide range of human needs. The emergence of\nmultimodal large language models (MLLMs) has great potential in boosting the\nefficiency and convenience of intelligent Earth observation. These models can\nengage in human-like conversations, serve as unified platforms for\nunderstanding images, follow diverse instructions, and provide insightful\nfeedbacks. In this study, we introduce LHRS-Bot-Nova, an MLLM specialized in\nunderstanding remote sensing (RS) images, designed to expertly perform a wide\nrange of RS understanding tasks aligned with human instructions. LHRS-Bot-Nova\nfeatures an enhanced vision encoder and a novel bridge layer, enabling\nefficient visual compression and better language-vision alignment. To further\nenhance RS-oriented vision-language alignment, we propose a large-scale RS\nimage-caption dataset, generated through feature-guided image recaptioning.\nAdditionally, we introduce an instruction dataset specifically designed to\nimprove spatial recognition abilities. Extensive experiments demonstrate\nsuperior performance of LHRS-Bot-Nova across various RS image understanding\ntasks. 
We also evaluate different MLLM performances in complex RS perception\nand instruction following using a complicated multi-choice question evaluation\nbenchmark, providing a reliable guide for future model selection and\nimprovement. Data, code, and models will be available at\nhttps://github.com/NJU-LHRS/LHRS-Bot.\n","authors":["Zhenshi Li","Dilxat Muhtar","Feng Gu","Xueliang Zhang","Pengfeng Xiao","Guangjun He","Xiaoxiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.09301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09293v1","updated":"2024-11-14T09:12:18Z","published":"2024-11-14T09:12:18Z","title":"LLV-FSR: Exploiting Large Language-Vision Prior for Face\n Super-resolution","summary":" Existing face super-resolution (FSR) methods have made significant\nadvancements, but they primarily super-resolve face with limited visual\ninformation, original pixel-wise space in particular, commonly overlooking the\npluralistic clues, like the higher-order depth and semantics, as well as\nnon-visual inputs (text caption and description). Consequently, these methods\nstruggle to produce a unified and meaningful representation from the input\nface. We suppose that introducing the language-vision pluralistic\nrepresentation into unexplored potential embedding space could enhance FSR by\nencoding and exploiting the complementarity across language-vision prior. This\nmotivates us to propose a new framework called LLV-FSR, which marries the power\nof large vision-language model and higher-order visual prior with the\nchallenging task of FSR. Specifically, besides directly absorbing knowledge\nfrom original input, we introduce the pre-trained vision-language model to\ngenerate pluralistic priors, involving the image caption, descriptions, face\nsemantic mask and depths. These priors are then employed to guide the more\ncritical feature representation, facilitating realistic and high-quality face\nsuper-resolution. 
Experimental results demonstrate that our proposed framework\nsignificantly improves both the reconstruction quality and perceptual quality,\nsurpassing the SOTA by 0.43dB in terms of PSNR on the MMCelebA-HQ dataset.\n","authors":["Chenyang Wang","Wenjie An","Kui Jiang","Xianming Liu","Junjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.09293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09283v1","updated":"2024-11-14T08:40:08Z","published":"2024-11-14T08:40:08Z","title":"Leveraging Auxiliary Classification for Rib Fracture Segmentation","summary":" Thoracic trauma often results in rib fractures, which demand swift and\naccurate diagnosis for effective treatment. However, detecting these fractures\non rib CT scans poses considerable challenges, involving the analysis of many\nimage slices in sequence. Despite notable advancements in algorithms for\nautomated fracture segmentation, the persisting challenges stem from the\ndiverse shapes and sizes of these fractures. To address these issues, this\nstudy introduces a sophisticated deep-learning model with an auxiliary\nclassification task designed to enhance the accuracy of rib fracture\nsegmentation. The auxiliary classification task is crucial in distinguishing\nbetween fractured ribs and negative regions, encompassing non-fractured ribs\nand surrounding tissues, from the patches obtained from CT scans. By leveraging\nthis auxiliary task, the model aims to improve feature representation at the\nbottleneck layer by highlighting the regions of interest. 
Experimental results\non the RibFrac dataset demonstrate significant improvement in segmentation\nperformance.\n","authors":["Harini G.","Aiman Farooq","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2411.09283v1.pdf","comment":"Accepted at ICVGIP'24"},{"id":"http://arxiv.org/abs/2411.08756v2","updated":"2024-11-14T08:36:22Z","published":"2024-11-13T16:42:07Z","title":"Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation","summary":" In view of the fact that semi- and self-supervised learning share a\nfundamental principle, effectively modeling knowledge from unlabeled data,\nvarious semi-supervised semantic segmentation methods have integrated\nrepresentative self-supervised learning paradigms for further regularization.\nHowever, the potential of the state-of-the-art generative self-supervised\nparadigm, masked image modeling, has been scarcely studied. This paradigm\nlearns the knowledge through establishing connections between the masked and\nvisible parts of masked image, during the pixel reconstruction process. By\ninheriting and extending this insight, we successfully leverage masked image\nmodeling to boost semi-supervised semantic segmentation. Specifically, we\nintroduce a novel class-wise masked image modeling that independently\nreconstructs different image regions according to their respective classes. In\nthis way, the mask-induced connections are established within each class,\nmitigating the semantic confusion that arises from plainly reconstructing\nimages in basic masked image modeling. To strengthen these intra-class\nconnections, we further develop a feature aggregation strategy that minimizes\nthe distances between features corresponding to the masked and visible parts\nwithin the same class. Additionally, in semantic space, we explore the\napplication of masked image modeling to enhance regularization. 
Extensive\nexperiments conducted on well-known benchmarks demonstrate that our approach\nachieves state-of-the-art performance. The code will be available at\nhttps://github.com/haoxt/S4MIM.\n","authors":["Yangyang Li","Xuanting Hao","Ronghua Shang","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2411.08756v2.pdf","comment":"13 pages. This work has been submitted to the IEEE for possible\n publication"},{"id":"http://arxiv.org/abs/2411.09268v1","updated":"2024-11-14T08:12:16Z","published":"2024-11-14T08:12:16Z","title":"LES-Talker: Fine-Grained Emotion Editing for Talking Head Generation in\n Linear Emotion Space","summary":" While existing one-shot talking head generation models have achieved progress\nin coarse-grained emotion editing, there is still a lack of fine-grained\nemotion editing models with high interpretability. We argue that for an\napproach to be considered fine-grained, it needs to provide clear definitions\nand sufficiently detailed differentiation. We present LES-Talker, a novel\none-shot talking head generation model with high interpretability, to achieve\nfine-grained emotion editing across emotion types, emotion levels, and facial\nunits. We propose a Linear Emotion Space (LES) definition based on Facial\nAction Units to characterize emotion transformations as vector transformations.\nWe design the Cross-Dimension Attention Net (CDAN) to deeply mine the\ncorrelation between LES representation and 3D model representation. Through\nmining multiple relationships across different feature and structure\ndimensions, we enable LES representation to guide the controllable deformation\nof 3D model. In order to adapt the multimodal data with deviations to the LES\nand enhance visual quality, we utilize specialized network design and training\nstrategies. 
Experiments show that our method provides high visual quality along\nwith multilevel and interpretable fine-grained emotion editing, outperforming\nmainstream methods.\n","authors":["Guanwen Feng","Zhihao Qian","Yunan Li","Siyu Jin","Qiguang Miao","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2411.09268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09266v1","updated":"2024-11-14T08:07:02Z","published":"2024-11-14T08:07:02Z","title":"How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative\n Study of ChatGPT, AI Models and Human Perception","summary":" Multimodal deepfakes involving audiovisual manipulations are a growing threat\nbecause they are difficult to detect with the naked eye or using unimodal deep\nlearningbased forgery detection methods. Audiovisual forensic models, while\nmore capable than unimodal models, require large training datasets and are\ncomputationally expensive for training and inference. Furthermore, these models\nlack interpretability and often do not generalize well to unseen manipulations.\nIn this study, we examine the detection capabilities of a large language model\n(LLM) (i.e., ChatGPT) to identify and account for any possible visual and\nauditory artifacts and manipulations in audiovisual deepfake content. Extensive\nexperiments are conducted on videos from a benchmark multimodal deepfake\ndataset to evaluate the detection performance of ChatGPT and compare it with\nthe detection capabilities of state-of-the-art multimodal forensic models and\nhumans. Experimental results demonstrate the importance of domain knowledge and\nprompt engineering for video forgery detection tasks using LLMs. Unlike\napproaches based on end-to-end learning, ChatGPT can account for spatial and\nspatiotemporal artifacts and inconsistencies that may exist within or across\nmodalities. 
Additionally, we discuss the limitations of ChatGPT for multimedia\nforensic tasks.\n","authors":["Sahibzada Adil Shahzad","Ammarah Hashmi","Yan-Tsung Peng","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09265v1","updated":"2024-11-14T08:05:34Z","published":"2024-11-14T08:05:34Z","title":"BEARD: Benchmarking the Adversarial Robustness for Dataset Distillation","summary":" Dataset Distillation (DD) is an emerging technique that compresses\nlarge-scale datasets into significantly smaller synthesized datasets while\npreserving high test performance and enabling the efficient training of large\nmodels. However, current research primarily focuses on enhancing evaluation\naccuracy under limited compression ratios, often overlooking critical security\nconcerns such as adversarial robustness. A key challenge in evaluating this\nrobustness lies in the complex interactions between distillation methods, model\narchitectures, and adversarial attack strategies, which complicate standardized\nassessments. To address this, we introduce BEARD, an open and unified benchmark\ndesigned to systematically assess the adversarial robustness of DD methods,\nincluding DM, IDM, and BACON. BEARD encompasses a variety of adversarial\nattacks (e.g., FGSM, PGD, C&W) on distilled datasets like CIFAR-10/100 and\nTinyImageNet. Utilizing an adversarial game framework, it introduces three key\nmetrics: Robustness Ratio (RR), Attack Efficiency Ratio (AE), and Comprehensive\nRobustness-Efficiency Index (CREI). Our analysis includes unified benchmarks,\nvarious Images Per Class (IPC) settings, and the effects of adversarial\ntraining. Results are available on the BEARD Leaderboard, along with a library\nproviding model and dataset pools to support reproducible research. 
Access the\ncode at BEARD.\n","authors":["Zheng Zhou","Wenquan Feng","Shuchang Lyu","Guangliang Cheng","Xiaowei Huang","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09265v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09263v1","updated":"2024-11-14T08:02:14Z","published":"2024-11-14T08:02:14Z","title":"Rethinking Weight-Averaged Model-merging","summary":" Weight-averaged model-merging has emerged as a powerful approach in deep\nlearning, capable of enhancing model performance without fine-tuning or\nretraining. However, the underlying mechanisms that explain its effectiveness\nremain largely unexplored. In this paper, we investigate this technique from\nthree novel perspectives to provide deeper insights into how and why\nweight-averaged model-merging works: (1) we examine the intrinsic patterns\ncaptured by the learning of the model weights, through the visualizations of\ntheir patterns on several datasets, showing that these weights often encode\nstructured and interpretable patterns; (2) we investigate model ensemble\nmerging strategies based on averaging on weights versus averaging on features,\nproviding detailed analyses across diverse architectures and datasets; and (3)\nwe explore the impact on model-merging prediction stability in terms of\nchanging the parameter magnitude, revealing insights into the way of weight\naveraging works as regularization by showing the robustness across different\nparameter scales. 
Our findings shed light on the \"black box\" of weight-averaged\nmodel-merging, offering valuable insights and practical recommendations that\nadvance the model-merging process.\n","authors":["Hu Wang","Congbo Ma","Ibrahim Almakky","Ian Reid","Gustavo Carneiro","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2411.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09259v1","updated":"2024-11-14T07:51:51Z","published":"2024-11-14T07:51:51Z","title":"Jailbreak Attacks and Defenses against Multimodal Generative Models: A\n Survey","summary":" The rapid evolution of multimodal foundation models has led to significant\nadvancements in cross-modal understanding and generation across diverse\nmodalities, including text, images, audio, and video. However, these models\nremain susceptible to jailbreak attacks, which can bypass built-in safety\nmechanisms and induce the production of potentially harmful content.\nConsequently, understanding the methods of jailbreak attacks and existing\ndefense mechanisms is essential to ensure the safe deployment of multimodal\ngenerative models in real-world scenarios, particularly in security-sensitive\napplications. To provide comprehensive insight into this topic, this survey\nreviews jailbreak and defense in multimodal generative models. First, given the\ngeneralized lifecycle of multimodal jailbreak, we systematically explore\nattacks and corresponding defense strategies across four levels: input,\nencoder, generator, and output. Based on this analysis, we present a detailed\ntaxonomy of attack methods, defense mechanisms, and evaluation frameworks\nspecific to multimodal generative models. Additionally, we cover a wide range\nof input-output configurations, including modalities such as Any-to-Text,\nAny-to-Vision, and Any-to-Any within generative systems. 
Finally, we highlight\ncurrent research challenges and propose potential directions for future\nresearch.The open-source repository corresponding to this work can be found at\nhttps://github.com/liuxuannan/Awesome-Multimodal-Jailbreak.\n","authors":["Xuannan Liu","Xing Cui","Peipei Li","Zekun Li","Huaibo Huang","Shuhan Xia","Miaoxuan Zhang","Yueying Zou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.09259v1.pdf","comment":"ongoing work"},{"id":"http://arxiv.org/abs/2311.10126v2","updated":"2024-11-14T07:43:14Z","published":"2023-11-16T13:07:47Z","title":"I&S-ViT: An Inclusive & Stable Method for Pushing the Limit of\n Post-Training ViTs Quantization","summary":" Albeit the scalable performance of vision transformers (ViTs), the dense\ncomputational costs (training & inference) undermine their position in\nindustrial applications. Post-training quantization (PTQ), tuning ViTs with a\ntiny dataset and running in a low-bit format, well addresses the cost issue but\nunluckily bears more performance drops in lower-bit cases. In this paper, we\nintroduce I&S-ViT, a novel method that regulates the PTQ of ViTs in an\ninclusive and stable fashion. I&S-ViT first identifies two issues in the PTQ of\nViTs: (1) Quantization inefficiency in the prevalent log2 quantizer for\npost-Softmax activations; (2) Rugged and magnified loss landscape in\ncoarse-grained quantization granularity for post-LayerNorm activations. Then,\nI&S-ViT addresses these issues by introducing: (1) A novel shift-uniform-log2\nquantizer (SULQ) that incorporates a shift mechanism followed by uniform\nquantization to achieve both an inclusive domain representation and accurate\ndistribution approximation; (2) A three-stage smooth optimization strategy\n(SOS) that amalgamates the strengths of channel-wise and layer-wise\nquantization to enable stable learning. 
Comprehensive evaluations across\ndiverse vision tasks validate I&S-ViT' superiority over existing PTQ of ViTs\nmethods, particularly in low-bit scenarios. For instance, I&S-ViT elevates the\nperformance of 3-bit ViT-B by an impressive 50.68%.\n","authors":["Yunshan Zhong","Jiawei Hu","Mengzhao Chen","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2311.10126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14955v4","updated":"2024-11-14T07:37:28Z","published":"2024-04-23T12:00:20Z","title":"A Comprehensive Survey for Hyperspectral Image Classification: The\n Evolution from Conventional to Transformers and Mamba Models","summary":" Hyperspectral Image Classification (HSC) presents significant challenges\nowing to the high dimensionality and intricate nature of Hyperspectral (HS)\ndata. While traditional Machine Learning (TML) approaches have demonstrated\neffectiveness, they often encounter substantial obstacles in real-world\napplications, including the variability of optimal feature sets, subjectivity\nin human-driven design, inherent biases, and methodological limitations.\nSpecifically, TML suffers from the curse of dimensionality, difficulties in\nfeature selection and extraction, insufficient consideration of spatial\ninformation, limited robustness against noise, scalability issues, and\ninadequate adaptability to complex data distributions. In recent years, Deep\nLearning (DL) techniques have emerged as robust solutions to address these\nchallenges. This survey offers a comprehensive overview of current trends and\nfuture prospects in HSC, emphasizing advancements from DL models to the\nincreasing adoption of Transformer and Mamba Model architectures. We\nsystematically review key concepts, methodologies, and state-of-the-art\napproaches in DL for HSC. Furthermore, we investigate the potential of\nTransformer-based models and the Mamba Model in HSC, detailing their advantages\nand challenges. 
Emerging trends in HSC are explored, including in-depth\ndiscussions on Explainable AI and Interoperability concepts, alongside\nDiffusion Models for image denoising, feature extraction, and image fusion.\nComprehensive experimental results were conducted on three HS datasets to\nsubstantiate the efficacy of various conventional DL models and Transformers.\nAdditionally, we identify several open challenges and pertinent research\nquestions in the field of HSC. Finally, we outline future research directions\nand potential applications aimed at enhancing the accuracy and efficiency of\nHSC.\n","authors":["Muhammad Ahmad","Salvatore Distifano","Adil Mehmood Khan","Manuel Mazzara","Chenyu Li","Hao Li","Jagannath Aryal","Yao Ding","Gemine Vivone","Danfeng Hong"],"pdf_url":"https://arxiv.org/pdf/2404.14955v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09251v1","updated":"2024-11-14T07:34:31Z","published":"2024-11-14T07:34:31Z","title":"Cross Space and Time: A Spatio-Temporal Unitized Model for Traffic Flow\n Forecasting","summary":" Predicting spatio-temporal traffic flow presents significant challenges due\nto complex interactions between spatial and temporal factors. Existing\napproaches often address these dimensions in isolation, neglecting their\ncritical interdependencies. In this paper, we introduce the Spatio-Temporal\nUnitized Model (STUM), a unified framework designed to capture both spatial and\ntemporal dependencies while addressing spatio-temporal heterogeneity through\ntechniques such as distribution alignment and feature fusion. It also ensures\nboth predictive accuracy and computational efficiency. Central to STUM is the\nAdaptive Spatio-temporal Unitized Cell (ASTUC), which utilizes low-rank\nmatrices to seamlessly store, update, and interact with space, time, as well as\ntheir correlations. 
Our framework is also modular, allowing it to integrate\nwith various spatio-temporal graph neural networks through components such as\nbackbone models, feature extractors, residual fusion blocks, and predictive\nmodules to collectively enhance forecasting outcomes. Experimental results\nacross multiple real-world datasets demonstrate that STUM consistently improves\nprediction performance with minimal computational cost. These findings are\nfurther supported by hyperparameter optimization, pre-training analysis, and\nresult visualization. We provide our source code for reproducibility at\nhttps://anonymous.4open.science/r/STUM-E4F0.\n","authors":["Weilin Ruan","Wenzhuo Wang","Siru Zhong","Wei Chen","Li Liu","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.09251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09250v1","updated":"2024-11-14T07:31:12Z","published":"2024-11-14T07:31:12Z","title":"Embedding Space Allocation with Angle-Norm Joint Classifiers for\n Few-Shot Class-Incremental Learning","summary":" Few-shot class-incremental learning (FSCIL) aims to continually learn new\nclasses from only a few samples without forgetting previous ones, requiring\nintelligent agents to adapt to dynamic environments. FSCIL combines the\ncharacteristics and challenges of class-incremental learning and few-shot\nlearning: (i) Current classes occupy the entire feature space, which is\ndetrimental to learning new classes. (ii) The small number of samples in\nincremental rounds is insufficient for fully training. In existing mainstream\nvirtual class methods, for addressing the challenge (i), they attempt to use\nvirtual classes as placeholders. However, new classes may not necessarily align\nwith the virtual classes. For the challenge (ii), they replace trainable fully\nconnected layers with Nearest Class Mean (NCM) classifiers based on cosine\nsimilarity, but NCM classifiers do not account for sample imbalance issues. 
To\naddress these issues in previous methods, we propose the class-center guided\nembedding Space Allocation with Angle-Norm joint classifiers (SAAN) learning\nframework, which provides balanced space for all classes and leverages norm\ndifferences caused by sample imbalance to enhance classification criteria.\nSpecifically, for challenge (i), SAAN divides the feature space into multiple\nsubspaces and allocates a dedicated subspace for each session by guiding\nsamples with the pre-set category centers. For challenge (ii), SAAN establishes\na norm distribution for each class and generates angle-norm joint logits.\nExperiments demonstrate that SAAN can achieve state-of-the-art performance and\nit can be directly embedded into other SOTA methods as a plug-in, further\nenhancing their performance.\n","authors":["Dunwei Tu","Huiyu Yi","Tieyi Zhang","Ruotong Li","Furao Shen","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06526v2","updated":"2024-11-14T07:29:33Z","published":"2024-06-10T17:59:55Z","title":"GaussianCity: Generative Gaussian Splatting for Unbounded 3D City\n Generation","summary":" 3D city generation with NeRF-based methods shows promising generation results\nbut is computationally inefficient. Recently 3D Gaussian Splatting (3D-GS) has\nemerged as a highly efficient alternative for object-level 3D generation.\nHowever, adapting 3D-GS from finite-scale 3D objects and humans to\ninfinite-scale 3D cities is non-trivial. Unbounded 3D city generation entails\nsignificant storage overhead (out-of-memory issues), arising from the need to\nexpand points to billions, often demanding hundreds of Gigabytes of VRAM for a\ncity scene spanning 10km^2. In this paper, we propose GaussianCity, a\ngenerative Gaussian Splatting framework dedicated to efficiently synthesizing\nunbounded 3D cities with a single feed-forward pass. 
Our key insights are\ntwo-fold: 1) Compact 3D Scene Representation: We introduce BEV-Point as a\nhighly compact intermediate representation, ensuring that the growth in VRAM\nusage for unbounded scenes remains constant, thus enabling unbounded city\ngeneration. 2) Spatial-aware Gaussian Attribute Decoder: We present\nspatial-aware BEV-Point decoder to produce 3D Gaussian attributes, which\nleverages Point Serializer to integrate the structural and contextual\ncharacteristics of BEV points. Extensive experiments demonstrate that\nGaussianCity achieves state-of-the-art results in both drone-view and\nstreet-view 3D city generation. Notably, compared to CityDreamer, GaussianCity\nexhibits superior performance with a speedup of 60 times (10.72 FPS v.s. 0.18\nFPS).\n","authors":["Haozhe Xie","Zhaoxi Chen","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.06526v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16725v2","updated":"2024-11-14T07:15:06Z","published":"2024-07-23T12:53:38Z","title":"Category-Extensible Out-of-Distribution Detection via Hierarchical\n Context Descriptions","summary":" The key to OOD detection has two aspects: generalized feature representation\nand precise category description. Recently, vision-language models such as CLIP\nprovide significant advances in both two issues, but constructing precise\ncategory descriptions is still in its infancy due to the absence of unseen\ncategories. This work introduces two hierarchical contexts, namely perceptual\ncontext and spurious context, to carefully describe the precise category\nboundary through automatic prompt tuning. Specifically, perceptual contexts\nperceive the inter-category difference (e.g., cats vs apples) for current\nclassification tasks, while spurious contexts further identify spurious\n(similar but exactly not) OOD samples for every single category (e.g., cats vs\npanthers, apples vs peaches). 
The two contexts hierarchically construct the\nprecise description for a certain category, which is, first roughly classifying\na sample to the predicted category and then delicately identifying whether it\nis truly an ID sample or actually OOD. Moreover, the precise descriptions for\nthose categories within the vision-language framework present a novel\napplication: CATegory-EXtensible OOD detection (CATEX). One can efficiently\nextend the set of recognizable categories by simply merging the hierarchical\ncontexts learned under different sub-task settings. And extensive experiments\nare conducted to demonstrate CATEX's effectiveness, robustness, and\ncategory-extensibility. For instance, CATEX consistently surpasses the rivals\nby a large margin with several protocols on the challenging ImageNet-1K\ndataset. In addition, we offer new insights on how to efficiently scale up the\nprompt engineering in vision-language models to recognize thousands of object\ncategories, as well as how to incorporate large language models (like GPT-3) to\nboost zero-shot applications. Code is publicly available at\nhttps://github.com/alibaba/catex.\n","authors":["Kai Liu","Zhihang Fu","Chao Chen","Sheng Jin","Ze Chen","Mingyuan Tao","Rongxin Jiang","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2407.16725v2.pdf","comment":"Accepted by 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023). Code is available at https://github.com/alibaba/catex"},{"id":"http://arxiv.org/abs/2411.07579v3","updated":"2024-11-14T07:02:03Z","published":"2024-11-12T06:29:48Z","title":"Projecting Gaussian Ellipsoids While Avoiding Affine Projection\n Approximation","summary":" Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its\nreal-time rendering speed and state-of-the-art rendering quality. 
However,\nduring the rendering process, the use of the Jacobian of the affine\napproximation of the projection transformation leads to inevitable errors,\nresulting in blurriness, artifacts and a lack of scene consistency in the final\nrendered images. To address this issue, we introduce an ellipsoid-based\nprojection method to calculate the projection of Gaussian ellipsoid onto the\nimage plane, which is the primitive of 3D Gaussian Splatting. As our proposed\nellipsoid-based projection method cannot handle Gaussian ellipsoids with camera\norigins inside them or parts lying below $z=0$ plane in the camera space, we\ndesigned a pre-filtering strategy. Experiments over multiple widely adopted\nbenchmark datasets show that our ellipsoid-based projection method can enhance\nthe rendering quality of 3D Gaussian Splatting and its extensions.\n","authors":["Han Qi","Tao Cai","Xiyue Han"],"pdf_url":"https://arxiv.org/pdf/2411.07579v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07032v2","updated":"2024-11-14T06:36:57Z","published":"2024-03-11T04:56:10Z","title":"STARFlow: Spatial Temporal Feature Re-embedding with Attentive Learning\n for Real-world Scene Flow","summary":" Scene flow prediction is a crucial underlying task in understanding dynamic\nscenes as it offers fundamental motion information. However, contemporary scene\nflow methods encounter three major challenges. Firstly, flow estimation solely\nbased on local receptive fields lacks long-dependency matching of point pairs.\nTo address this issue, we propose global attentive flow embedding to match\nall-to-all point pairs in both feature space and Euclidean space, providing\nglobal initialization before local refinement. Secondly, there are deformations\nexisting in non-rigid objects after warping, which leads to variations in the\nspatiotemporal relation between the consecutive frames. 
For a more precise\nestimation of residual flow, a spatial temporal feature re-embedding module is\ndevised to acquire the sequence features after deformation. Furthermore,\nprevious methods perform poor generalization due to the significant domain gap\nbetween the synthesized and LiDAR-scanned datasets. We leverage novel domain\nadaptive losses to effectively bridge the gap of motion inference from\nsynthetic to real-world. Experiments demonstrate that our approach achieves\nstate-of-the-art performance across various datasets, with particularly\noutstanding results on real-world LiDAR-scanned datasets. Our code is available\nat https://github.com/O-VIGIA/StarFlow.\n","authors":["Zhiyang Lu","Qinghan Chen","Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.07032v2.pdf","comment":"This paper was renamed to:\"SSRFlow: Semantic-aware Fusion with\n Spatial Temporal Re-embedding for Real-world Scene Flow\" [arXiv:2408.07825]\n and was accepted in 3DV 2025"},{"id":"http://arxiv.org/abs/2411.09219v1","updated":"2024-11-14T06:31:20Z","published":"2024-11-14T06:31:20Z","title":"Harnessing Vision Foundation Models for High-Performance, Training-Free\n Open Vocabulary Segmentation","summary":" While Contrastive Language-Image Pre-training (CLIP) has advanced\nopen-vocabulary predictions, its performance on semantic segmentation remains\nsuboptimal. This shortfall primarily stems from its spatial-invariant semantic\nfeatures and constrained resolution. While previous adaptations addressed\nspatial invariance semantic by modifying the self-attention in CLIP's image\nencoder, the issue of limited resolution remains unexplored. Different from\nprevious segment-then-splice methods that segment sub-images via a sliding\nwindow and splice the results, we introduce a splice-then-segment paradigm that\nincorporates Segment-Anything Model (SAM) to tackle the resolution issue since\nSAM excels at extracting fine-grained semantic correlations from\nhigh-resolution images. 
Specifically, we introduce Trident, a training-free\nframework that first splices features extracted by CLIP and DINO from\nsub-images, then leverages SAM's encoder to create a correlation matrix for\nglobal aggregation, enabling a broadened receptive field for effective\nsegmentation. Besides, we propose a refinement strategy for CLIP's coarse\nsegmentation outputs by transforming them into prompts for SAM, further\nenhancing the segmentation performance. Trident achieves a significant\nimprovement in the mIoU across eight benchmarks compared with the current SOTA,\nincreasing from 44.4 to 48.6.Code is available at\nhttps://github.com/YuHengsss/Trident.\n","authors":["Yuheng Shi","Minjing Dong","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2411.09219v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09209v1","updated":"2024-11-14T06:13:05Z","published":"2024-11-14T06:13:05Z","title":"JoyVASA: Portrait and Animal Image Animation with Diffusion-Based\n Audio-Driven Facial Dynamics and Head Motion Generation","summary":" Audio-driven portrait animation has made significant advances with\ndiffusion-based models, improving video quality and lipsync accuracy. However,\nthe increasing complexity of these models has led to inefficiencies in training\nand inference, as well as constraints on video length and inter-frame\ncontinuity. In this paper, we propose JoyVASA, a diffusion-based method for\ngenerating facial dynamics and head motion in audio-driven facial animation.\nSpecifically, in the first stage, we introduce a decoupled facial\nrepresentation framework that separates dynamic facial expressions from static\n3D facial representations. This decoupling allows the system to generate longer\nvideos by combining any static 3D facial representation with dynamic motion\nsequences. Then, in the second stage, a diffusion transformer is trained to\ngenerate motion sequences directly from audio cues, independent of character\nidentity. 
Finally, a generator trained in the first stage uses the 3D facial\nrepresentation and the generated motion sequences as inputs to render\nhigh-quality animations. With the decoupled facial representation and the\nidentity-independent motion generation process, JoyVASA extends beyond human\nportraits to animate animal faces seamlessly. The model is trained on a hybrid\ndataset of private Chinese and public English data, enabling multilingual\nsupport. Experimental results validate the effectiveness of our approach.\nFuture work will focus on improving real-time performance and refining\nexpression control, further expanding the applications in portrait animation.\nThe code will be available at: https://jdhalgo.github.io/JoyVASA.\n","authors":["Xuyang Cao","Sheng Shi","Jun Zhao","Yang Yao","Jintao Fei","Minyu Gao","Guoxin Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08481v2","updated":"2024-11-14T05:13:21Z","published":"2024-09-13T02:13:11Z","title":"USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in\n 2020s","summary":" Image/video coding has been a remarkable research area for both academia and\nindustry for many years. Testing datasets, especially high-quality image/video\ndatasets are desirable for the justified evaluation of coding-related research,\npractical applications, and standardization activities. We put forward a test\ndataset namely USTC-TD, which has been successfully adopted in the practical\nend-to-end image/video coding challenge of the IEEE International Conference on\nVisual Communications and lmage Processing (VCIP) in 2022 and 2023. USTC-TD\ncontains 40 images at 4K spatial resolution and 10 video sequences at 1080p\nspatial resolution, featuring various content due to the diverse environmental\nfactors (e.g. scene type, texture, motion, view) and the designed imaging\nfactors (e.g. illumination, lens, shadow). 
We quantitatively evaluate USTC-TD\non different image/video features (spatial, temporal, color, lightness), and\ncompare it with the previous image/video test datasets, which verifies the\nwider coverage and more diversity of the proposed dataset. We also evaluate\nboth classic standardized and recent learned image/video coding schemes on\nUSTC-TD with PSNR and MS-SSIM, and provide an extensive benchmark for the\nevaluated schemes. Based on the characteristics and specific design of the\nproposed test dataset, we analyze the benchmark performance and shed light on\nthe future research and development of image/video coding. All the data are\nreleased online: https://esakak.github.io/USTC-TD .\n","authors":["Zhuoyuan Li","Junqi Liao","Chuanbo Tang","Haotian Zhang","Yuqi Li","Yifan Bian","Xihua Sheng","Xinmin Feng","Yao Li","Changsheng Gao","Li Li","Dong Liu","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08481v2.pdf","comment":"23 pages. Project Page: https://esakak.github.io/USTC-TD"},{"id":"http://arxiv.org/abs/2411.09180v1","updated":"2024-11-14T04:39:10Z","published":"2024-11-14T04:39:10Z","title":"LEAP:D - A Novel Prompt-based Approach for Domain-Generalized Aerial\n Object Detection","summary":" Drone-captured images present significant challenges in object detection due\nto varying shooting conditions, which can alter object appearance and shape.\nFactors such as drone altitude, angle, and weather cause these variations,\ninfluencing the performance of object detection algorithms. To tackle these\nchallenges, we introduce an innovative vision-language approach using learnable\nprompts. This shift from conventional manual prompts aims to reduce\ndomain-specific knowledge interference, ultimately improving object detection\ncapabilities. Furthermore, we streamline the training process with a one-step\napproach, updating the learnable prompt concurrently with model training,\nenhancing efficiency without compromising performance. 
Our study contributes to\ndomain-generalized object detection by leveraging learnable prompts and\noptimizing training processes. This enhances model robustness and adaptability\nacross diverse environments, leading to more effective aerial object detection.\n","authors":["Chanyeong Park","Heegwang Kim","Joonki Paik"],"pdf_url":"https://arxiv.org/pdf/2411.09180v1.pdf","comment":"ICIP 2024 Workshop accepted paper"},{"id":"http://arxiv.org/abs/2411.09176v1","updated":"2024-11-14T04:29:07Z","published":"2024-11-14T04:29:07Z","title":"Gazing at Rewards: Eye Movements as a Lens into Human and AI\n Decision-Making in Hybrid Visual Foraging","summary":" Imagine searching a collection of coins for quarters ($0.25$), dimes\n($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where\nobservers look for multiple instances of multiple target types. In such tasks,\nhow do target values and their prevalence influence foraging and eye movement\nbehaviors (e.g., should you prioritize rare quarters or common nickels)? To\nexplore this, we conducted human psychophysics experiments, revealing that\nhumans are proficient reward foragers. Their eye fixations are drawn to regions\nwith higher average rewards, fixation durations are longer on more valuable\ntargets, and their cumulative rewards exceed chance, approaching the upper\nbound of optimal foragers. To probe these decision-making processes of humans,\nwe developed a transformer-based Visual Forager (VF) model trained via\nreinforcement learning. Our VF model takes a series of targets, their\ncorresponding values, and the search image as inputs, processes the images\nusing foveated vision, and produces a sequence of eye movements along with\ndecisions on whether to collect each fixated item. Our model outperforms all\nbaselines, achieves cumulative rewards comparable to those of humans, and\napproximates human foraging behavior in eye movements and foraging biases\nwithin time-limited environments. 
Furthermore, stress tests on\nout-of-distribution tasks with novel targets, unseen values, and varying set\nsizes demonstrate the VF model's effective generalization. Our work offers\nvaluable insights into the relationship between eye movements and\ndecision-making, with our model serving as a powerful tool for further\nexploration of this connection. All data, code, and models will be made\npublicly available.\n","authors":["Bo Wang","Dingwei Tan","Yen-Ling Kuo","Zhaowei Sun","Jeremy M. Wolfe","Tat-Jen Cham","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09174v1","updated":"2024-11-14T04:23:28Z","published":"2024-11-14T04:23:28Z","title":"Advancing Diffusion Models: Alias-Free Resampling and Enhanced\n Rotational Equivariance","summary":" Recent advances in image generation, particularly via diffusion models, have\nled to impressive improvements in image synthesis quality. Despite this,\ndiffusion models are still challenged by model-induced artifacts and limited\nstability in image fidelity. In this work, we hypothesize that the primary\ncause of this issue is the improper resampling operation that introduces\naliasing in the diffusion model and a careful alias-free resampling dictated by\nimage processing theory can improve the model's performance in image synthesis.\nWe propose the integration of alias-free resampling layers into the UNet\narchitecture of diffusion models without adding extra trainable parameters,\nthereby maintaining computational efficiency. We then assess whether these\ntheory-driven modifications enhance image quality and rotational equivariance.\nOur experimental results on benchmark datasets, including CIFAR-10, MNIST, and\nMNIST-M, reveal consistent gains in image quality, particularly in terms of FID\nand KID scores. Furthermore, we propose a modified diffusion process that\nenables user-controlled rotation of generated images without requiring\nadditional training. 
Our findings highlight the potential of theory-driven\nenhancements such as alias-free resampling in generative models to improve\nimage quality while maintaining model efficiency and pioneer future research\ndirections to incorporate them into video-generating diffusion models, enabling\ndeeper exploration of the applications of alias-free resampling in generative\nmodeling.\n","authors":["Md Fahim Anjum"],"pdf_url":"https://arxiv.org/pdf/2411.09174v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.18684v2","updated":"2024-11-14T04:13:08Z","published":"2024-05-29T01:25:43Z","title":"Learning Diffeomorphism for Image Registration with Time-Continuous\n Networks using Semigroup Regularization","summary":" Diffeomorphic image registration (DIR) is a critical task in 3D medical image\nanalysis, aimed at finding topology preserving deformations between pairs of\nimages. Focusing on the solution of the flow map differential equation as the\ndiffeomorphic deformation, recent methods use discrete timesteps along with\nvarious regularization terms to penalize the negative determinant of Jacobian\nand impose smoothness of the solution vector field. In this paper, we propose a\nnovel learning-based approach for diffeomorphic 3D-image registration which\nfinds the diffeomorphisms in the time continuum with only a single\nregularization term and no additional integration. As one of the fundamental\nproperties of flow maps, we exploit the semigroup property as the only form of\nregularization, ensuring temporally continuous diffeomorphic flows between\npairs of images. Leveraging this property, our method alleviates the need for\nadditional regularization terms and scaling and squaring integration during\nboth training and evaluation. To achieve time-continuous diffeomorphisms, we\nemploy time-embedded UNets, an architecture commonly utilized in diffusion\nmodels. 
The proposed method reveals that ensuring diffeomorphism in a\ncontinuous time interval leads to better registration results. Experimental\nresults on four public datasets demonstrate the superiority of our model over\nboth learning-based and optimization-based methods.\n","authors":["Mohammadjavad Matinkia","Nilanjan Ray"],"pdf_url":"https://arxiv.org/pdf/2405.18684v2.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.14289v3","updated":"2024-11-14T03:26:46Z","published":"2024-09-22T01:42:01Z","title":"Deep Learning Technology for Face Forgery Detection: A Survey","summary":" Currently, the rapid development of computer vision and deep learning has\nenabled the creation or manipulation of high-fidelity facial images and videos\nvia deep generative approaches. This technology, also known as deepfake, has\nachieved dramatic progress and become increasingly popular in social media.\nHowever, the technology can generate threats to personal privacy and national\nsecurity by spreading misinformation. To diminish the risks of deepfake, it is\ndesirable to develop powerful forgery detection methods to distinguish fake\nfaces from real faces. This paper presents a comprehensive survey of recent\ndeep learning-based approaches for facial forgery detection. We attempt to\nprovide the reader with a deeper understanding of the current advances as well\nas the major challenges for deepfake detection based on deep learning. We\npresent an overview of deepfake techniques and analyse the characteristics of\nvarious deepfake datasets. We then provide a systematic review of different\ncategories of deepfake detection and state-of-the-art deepfake detection\nmethods. 
The drawbacks of existing detection methods are analyzed, and future\nresearch directions are discussed to address the challenges in improving both\nthe performance and generalization of deepfake detection.\n","authors":["Lixia Ma","Puning Yang","Yuting Xu","Ziming Yang","Peipei Li","Huaibo Huang"],"pdf_url":"https://arxiv.org/pdf/2409.14289v3.pdf","comment":"The paper \"Deep Learning Technology for Face Forgery Detection: A\n Survey\" is hereby formally withdrawn. The reason for this withdrawal is that\n I did not adequately consult and obtain proper authorization from the\n corresponding author during the submission process. I sincerely apologize for\n any inconvenience this may have caused the journal, reviewers, and readers"},{"id":"http://arxiv.org/abs/2411.05836v2","updated":"2024-11-14T03:21:12Z","published":"2024-11-06T12:28:26Z","title":"Prion-ViT: Prions-Inspired Vision Transformers for Temperature\n prediction with Specklegrams","summary":" Fiber Specklegram Sensors (FSS) are vital for environmental monitoring due to\ntheir high temperature sensitivity, but their complex data poses challeng-es\nfor predictive models. This study introduces Prion-ViT, a prion-inspired Vision\nTransformer model, inspired by biological prion memory mecha-nisms, to improve\nlong-term dependency modeling and temperature prediction accuracy using FSS\ndata. Prion-ViT leverages a persistent memory state to retain and propagate key\nfeatures across layers, reducing mean absolute error (MAE) to 0.52{\\deg}C and\noutperforming models like ResNet, Inception Net V2, and standard vision\ntransformers. 
This work highlights Prion-ViT's potential for real-time\nindustrial temperature monitoring and broader optical sensing applications.\n","authors":["Abhishek Sebastian","Pragna R"],"pdf_url":"https://arxiv.org/pdf/2411.05836v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09156v1","updated":"2024-11-14T03:19:57Z","published":"2024-11-14T03:19:57Z","title":"DyGASR: Dynamic Generalized Exponential Splatting with Surface Alignment\n for Accelerated 3D Mesh Reconstruction","summary":" Recent advancements in 3D Gaussian Splatting (3DGS), which lead to\nhigh-quality novel view synthesis and accelerated rendering, have remarkably\nimproved the quality of radiance field reconstruction. However, the extraction\nof mesh from a massive number of minute 3D Gaussian points remains great\nchallenge due to the large volume of Gaussians and difficulty of representation\nof sharp signals caused by their inherent low-pass characteristics. To address\nthis issue, we propose DyGASR, which utilizes generalized exponential function\ninstead of traditional 3D Gaussian to decrease the number of particles and\ndynamically optimize the representation of the captured signal. In addition, it\nis observed that reconstructing mesh with Generalized Exponential\nSplatting(GES) without modifications frequently leads to failures since the\ngeneralized exponential distribution centroids may not precisely align with the\nscene surface. 
To overcome this, we adopt Sugar's approach and introduce\nGeneralized Surface Regularization (GSR), which reduces the smallest scaling\nvector of each point cloud to zero and ensures normal alignment perpendicular\nto the surface, facilitating subsequent Poisson surface mesh reconstruction.\nAdditionally, we propose a dynamic resolution adjustment strategy that utilizes\na cosine schedule to gradually increase image resolution from low to high\nduring the training stage, thus avoiding constant full resolution, which\nsignificantly boosts the reconstruction speed. Our approach surpasses existing\n3DGS-based mesh reconstruction methods, as evidenced by extensive evaluations\non various scene datasets, demonstrating a 25\\% increase in speed, and a 30\\%\nreduction in memory usage.\n","authors":["Shengchao Zhao","Yundong Li"],"pdf_url":"https://arxiv.org/pdf/2411.09156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09153v1","updated":"2024-11-14T03:13:26Z","published":"2024-11-14T03:13:26Z","title":"VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for\n Effective Robot Manipulation","summary":" Recent advancements utilizing large-scale video data for learning video\ngeneration models demonstrate significant potential in understanding complex\nphysical dynamics. It suggests the feasibility of leveraging diverse robot\ntrajectory data to develop a unified, dynamics-aware model to enhance robot\nmanipulation. However, given the relatively small amount of available robot\ndata, directly fitting data without considering the relationship between visual\nobservations and actions could lead to suboptimal data utilization. To this\nend, we propose VidMan (Video Diffusion for Robot Manipulation), a novel\nframework that employs a two-stage training mechanism inspired by dual-process\ntheory from neuroscience to enhance stability and improve data utilization\nefficiency. 
Specifically, in the first stage, VidMan is pre-trained on the Open\nX-Embodiment dataset (OXE) for predicting future visual trajectories in a video\ndenoising diffusion manner, enabling the model to develop a long horizontal\nawareness of the environment's dynamics. In the second stage, a flexible yet\neffective layer-wise self-attention adapter is introduced to transform VidMan\ninto an efficient inverse dynamics model that predicts action modulated by the\nimplicit dynamics knowledge via parameter sharing. Our VidMan framework\noutperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark,\nachieving a 11.7% relative improvement, and demonstrates over 9% precision\ngains on the OXE small-scale dataset. These results provide compelling evidence\nthat world models can significantly enhance the precision of robot action\nprediction. Codes and models will be public.\n","authors":["Youpeng Wen","Junfan Lin","Yi Zhu","Jianhua Han","Hang Xu","Shen Zhao","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.09153v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09151v1","updated":"2024-11-14T03:01:36Z","published":"2024-11-14T03:01:36Z","title":"Mono2Stereo: Monocular Knowledge Transfer for Enhanced Stereo Matching","summary":" The generalization and performance of stereo matching networks are limited\ndue to the domain gap of the existing synthetic datasets and the sparseness of\nGT labels in the real datasets. In contrast, monocular depth estimation has\nachieved significant advancements, benefiting from large-scale depth datasets\nand self-supervised strategies. To bridge the performance gap between monocular\ndepth estimation and stereo matching, we propose leveraging monocular knowledge\ntransfer to enhance stereo matching, namely Mono2Stereo. We introduce knowledge\ntransfer with a two-stage training process, comprising synthetic data\npre-training and real-world data fine-tuning. 
In the pre-training stage, we\ndesign a data generation pipeline that synthesizes stereo training data from\nmonocular images. This pipeline utilizes monocular depth for warping and novel\nview synthesis and employs our proposed Edge-Aware (EA) inpainting module to\nfill in missing contents in the generated images. In the fine-tuning stage, we\nintroduce a Sparse-to-Dense Knowledge Distillation (S2DKD) strategy encouraging\nthe distributions of predictions to align with dense monocular depths. This\nstrategy mitigates issues with edge blurring in sparse real-world labels and\nenhances overall consistency. Experimental results demonstrate that our\npre-trained model exhibits strong zero-shot generalization capabilities.\nFurthermore, domain-specific fine-tuning using our pre-trained model and S2DKD\nstrategy significantly increments in-domain performance. The code will be made\navailable soon.\n","authors":["Yuran Wang","Yingping Liang","Hesong Li","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2411.09151v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09145v1","updated":"2024-11-14T02:57:11Z","published":"2024-11-14T02:57:11Z","title":"UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for\n Egocentric Hand Object Interaction Videos","summary":" Egocentric Hand Object Interaction (HOI) videos provide valuable insights\ninto human interactions with the physical world, attracting growing interest\nfrom the computer vision and robotics communities. A key task in fully\nunderstanding the geometry and dynamics of HOI scenes is dense pointclouds\nsequence reconstruction. However, the inherent motion of both hands and the\ncamera makes this challenging. Current methods often rely on time-consuming\ntest-time optimization, making them impractical for reconstructing\ninternet-scale videos. 
To address this, we introduce UniHOI, a model that\nunifies the estimation of all variables necessary for dense 4D reconstruction,\nincluding camera intrinsic, camera poses, and video depth, for egocentric HOI\nscene in a fast feed-forward manner. We end-to-end optimize all these variables\nto improve their consistency in 3D space. Furthermore, our model could be\ntrained solely on large-scale monocular video dataset, overcoming the\nlimitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain\nand zero-shot generalization setting, surpassing all baselines in pointclouds\nsequence reconstruction and long-term 3D scene flow recovery. UniHOI is the\nfirst approach to offer fast, dense, and generalizable monocular egocentric HOI\nscene reconstruction in the presence of motion. Code and trained model will be\nreleased in the future.\n","authors":["Chengbo Yuan","Geng Chen","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.09145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09140v1","updated":"2024-11-14T02:40:34Z","published":"2024-11-14T02:40:34Z","title":"Adversarial Vessel-Unveiling Semi-Supervised Segmentation for\n Retinopathy of Prematurity Diagnosis","summary":" Accurate segmentation of retinal images plays a crucial role in aiding\nophthalmologists in diagnosing retinopathy of prematurity (ROP) and assessing\nits severity. However, due to their underdeveloped, thinner vessels, manual\nannotation in infant fundus images is very complex, and this presents\nchallenges for fully supervised learning. 
To address the scarcity of\nannotations, we propose a semi supervised segmentation framework designed to\nadvance ROP studies without the need for extensive manual vessel annotation.\nUnlike previous methods that rely solely on limited labeled data, our approach\nleverages teacher student learning by integrating two powerful components: an\nuncertainty weighted vessel unveiling module and domain adversarial learning.\nThe vessel unveiling module helps the model effectively reveal obscured and\nhard to detect vessel structures, while adversarial training aligns feature\nrepresentations across different domains, ensuring robust and generalizable\nvessel segmentations. We validate our approach on public datasets (CHASEDB,\nSTARE) and an in-house ROP dataset, demonstrating its superior performance\nacross multiple evaluation metrics. Additionally, we extend the model's utility\nto a downstream task of ROP multi-stage classification, where vessel masks\nextracted by our segmentation model improve diagnostic accuracy. The promising\nresults in classification underscore the model's potential for clinical\napplication, particularly in early-stage ROP diagnosis and intervention.\nOverall, our work offers a scalable solution for leveraging unlabeled data in\npediatric ophthalmology, opening new avenues for biomarker discovery and\nclinical research.\n","authors":["Gozde Merve Demirci","Jiachen Yao","Ming-Chih Ho","Xiaoling Hu","Wei-Chi Wu","Chao Chen","Chia-Ling Tsai"],"pdf_url":"https://arxiv.org/pdf/2411.09140v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09137v1","updated":"2024-11-14T02:25:23Z","published":"2024-11-14T02:25:23Z","title":"Fast probabilistic snake algorithm","summary":" Few people use the probability theory in order to achieve image segmentation\nwith snake models. In this article, we are presenting an active contour\nalgorithm based on a probability approach inspired by A. 
Blake work and P.\nR{\\'e}fr{\\'e}gier's team research in France. Our algorithm, both very fast and\nhighly accurate as far as contour description is concerned, is easily adaptable\nto any specific application.\n","authors":["Jérôme Gilles","Bertrand Collin"],"pdf_url":"https://arxiv.org/pdf/2411.09137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09133v1","updated":"2024-11-14T02:13:25Z","published":"2024-11-14T02:13:25Z","title":"Computational metaoptics for imaging","summary":" Metasurfaces -- ultrathin structures composed of subwavelength optical\nelements -- have revolutionized light manipulation by enabling precise control\nover electromagnetic waves' amplitude, phase, polarization, and spectral\nproperties. Concurrently, computational imaging leverages algorithms to\nreconstruct images from optically processed signals, overcoming limitations of\ntraditional imaging systems. This review explores the synergistic integration\nof metaoptics and computational imaging, \"computational metaoptics,\" which\ncombines the physical wavefront shaping ability of metasurfaces with advanced\ncomputational algorithms to enhance imaging performance beyond conventional\nlimits. We discuss how computational metaoptics addresses the inherent\nlimitations of single-layer metasurfaces in achieving multifunctionality\nwithout compromising efficiency. By treating metasurfaces as physical\npreconditioners and co-designing them with reconstruction algorithms through\nend-to-end (inverse) design, it is possible to jointly optimize the optical\nhardware and computational software. This holistic approach allows for the\nautomatic discovery of optimal metasurface designs and reconstruction methods\nthat significantly improve imaging capabilities. 
Advanced applications enabled\nby computational metaoptics are highlighted, including phase imaging and\nquantum state measurement, which benefit from the metasurfaces' ability to\nmanipulate complex light fields and the computational algorithms' capacity to\nreconstruct high-dimensional information. We also examine performance\nevaluation challenges, emphasizing the need for new metrics that account for\nthe combined optical and computational nature of these systems. Finally, we\nidentify new frontiers in computational metaoptics which point toward a future\nwhere computational metaoptics may play a central role in advancing imaging\nscience and technology.\n","authors":["Charles Roques-Carmes","Kai Wang","Yuanmu Yang","Arka Majumdar","Zin Lin"],"pdf_url":"https://arxiv.org/pdf/2411.09133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06727v2","updated":"2024-11-14T02:11:56Z","published":"2024-11-11T05:44:48Z","title":"Can KAN Work? Exploring the Potential of Kolmogorov-Arnold Networks in\n Computer Vision","summary":" Kolmogorov-Arnold Networks(KANs), as a theoretically efficient neural network\narchitecture, have garnered attention for their potential in capturing complex\npatterns. However, their application in computer vision remains relatively\nunexplored. This study first analyzes the potential of KAN in computer vision\ntasks, evaluating the performance of KAN and its convolutional variants in\nimage classification and semantic segmentation. The focus is placed on\nexamining their characteristics across varying data scales and noise levels.\nResults indicate that while KAN exhibits stronger fitting capabilities, it is\nhighly sensitive to noise, limiting its robustness. To address this challenge,\nwe propose a smoothness regularization method and introduce a Segment\nDeactivation technique. 
Both approaches enhance KAN's stability and\ngeneralization, demonstrating its potential in handling complex visual data\ntasks.\n","authors":["Yueyang Cang","Yu hang liu","Li Shi"],"pdf_url":"https://arxiv.org/pdf/2411.06727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00289v3","updated":"2024-11-14T02:08:40Z","published":"2023-09-30T07:45:50Z","title":"Pubic Symphysis-Fetal Head Segmentation Using Pure Transformer with\n Bi-level Routing Attention","summary":" In this paper, we propose a method, named BRAU-Net, to solve the pubic\nsymphysis-fetal head segmentation task. The method adopts a U-Net-like pure\nTransformer architecture with bi-level routing attention and skip connections,\nwhich effectively learns local-global semantic information. The proposed\nBRAU-Net was evaluated on transperineal Ultrasound images dataset from the\npubic symphysis-fetal head segmentation and angle of progression (FH-PS-AOP)\nchallenge. The results demonstrate that the proposed BRAU-Net achieves\ncomparable a final score. The codes will be available at\nhttps://github.com/Caipengzhou/BRAU-Net.\n","authors":["Pengzhou Cai","Lu Jiang","Yanxin Li","Libin Lan"],"pdf_url":"https://arxiv.org/pdf/2310.00289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09417v3","updated":"2024-11-14T02:00:33Z","published":"2024-01-17T18:56:18Z","title":"Vision Mamba: Efficient Visual Representation Learning with\n Bidirectional State Space Model","summary":" Recently the state space models (SSMs) with efficient hardware-aware designs,\ni.e., the Mamba deep learning model, have shown great potential for long\nsequence modeling. Meanwhile building efficient and generic vision backbones\npurely upon SSMs is an appealing direction. However, representing visual data\nis challenging for SSMs due to the position-sensitivity of visual data and the\nrequirement of global context for visual understanding. 
In this paper, we show\nthat the reliance on self-attention for visual representation learning is not\nnecessary and propose a new generic vision backbone with bidirectional Mamba\nblocks (Vim), which marks the image sequences with position embeddings and\ncompresses the visual representation with bidirectional state space models. On\nImageNet classification, COCO object detection, and ADE20k semantic\nsegmentation tasks, Vim achieves higher performance compared to\nwell-established vision transformers like DeiT, while also demonstrating\nsignificantly improved computation & memory efficiency. For example, Vim is\n2.8$\\times$ faster than DeiT and saves 86.8% GPU memory when performing batch\ninference to extract features on images with a resolution of 1248$\\times$1248.\nThe results demonstrate that Vim is capable of overcoming the computation &\nmemory constraints on performing Transformer-style understanding for\nhigh-resolution images and it has great potential to be the next-generation\nbackbone for vision foundation models. Code is available at\nhttps://github.com/hustvl/Vim.\n","authors":["Lianghui Zhu","Bencheng Liao","Qian Zhang","Xinlong Wang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09417v3.pdf","comment":"Vision Mamba (Vim) is accepted by ICML 2024. Code is available at\n https://github.com/hustvl/Vim"},{"id":"http://arxiv.org/abs/2411.09126v1","updated":"2024-11-14T01:53:17Z","published":"2024-11-14T01:53:17Z","title":"SCAN: Bootstrapping Contrastive Pre-training for Data Efficiency","summary":" While contrastive pre-training is widely employed, its data efficiency\nproblem has remained relatively under-explored thus far. Existing methods often\nrely on static coreset selection algorithms to pre-identify important data for\ntraining. However, this static nature renders them unable to dynamically track\nthe data usefulness throughout pre-training, leading to subpar pre-trained\nmodels. 
To address this challenge, our paper introduces a novel dynamic\nbootstrapping dataset pruning method. It involves pruning data preparation\nfollowed by dataset mutation operations, both of which undergo iterative and\ndynamic updates. We apply this method to two prevalent contrastive pre-training\nframeworks: \\textbf{CLIP} and \\textbf{MoCo}, representing vision-language and\nvision-centric domains, respectively. In particular, we individually pre-train\nseven CLIP models on two large-scale image-text pair datasets, and two MoCo\nmodels on the ImageNet dataset, resulting in a total of 16 pre-trained models.\nWith a data pruning rate of 30-35\\% across all 16 models, our method exhibits\nonly marginal performance degradation (less than \\textbf{1\\%} on average)\ncompared to corresponding models trained on the full dataset counterparts\nacross various downstream datasets, and also surpasses several baselines with a\nlarge performance margin. Additionally, the byproduct from our method, \\ie\ncoresets derived from the original datasets after pre-training, also\ndemonstrates significant superiority in terms of downstream performance over\nother static coreset selection approaches.\n","authors":["Yangyang Guo","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2411.09126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04997v2","updated":"2024-11-14T01:36:12Z","published":"2024-11-07T18:59:16Z","title":"LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation","summary":" CLIP is one of the most important multimodal foundational models today. What\npowers CLIP's capabilities? The rich supervision signals provided by natural\nlanguage, the carrier of human knowledge, shape a powerful cross-modal\nrepresentation space. However, with the rapid advancements in large language\nmodels LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and\ngeneration are continually being pushed. 
This raises an intriguing question:\ncan the capabilities of LLMs be harnessed to further improve multimodal\nrepresentation learning? The potential benefits of incorporating LLMs into CLIP\nare clear. LLMs' strong textual understanding can fundamentally improve CLIP's\nability to handle image captions, drastically enhancing its ability to process\nlong and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs\nare trained on a vast corpus of text, possessing open-world knowledge. This\nallows them to expand on caption information during training, increasing the\nefficiency of the learning process. In this paper, we propose LLM2CLIP, a novel\napproach that embraces the power of LLMs to unlock CLIP's potential. By\nfine-tuning the LLM in the caption space with contrastive learning, we extract\nits textual capabilities into the output embeddings, significantly improving\nthe output layer's textual discriminability. We then design an efficient\ntraining process where the fine-tuned LLM acts as a powerful teacher for CLIP's\nvisual encoder. Thanks to the LLM's presence, we can now incorporate longer and\nmore complex captions without being restricted by vanilla CLIP's text encoder's\ncontext window and ability limitations. Our experiments demonstrate that this\napproach brings substantial improvements in cross-modal tasks.\n","authors":["Weiquan Huang","Aoqi Wu","Yifan Yang","Xufang Luo","Yuqing Yang","Liang Hu","Qi Dai","Xiyang Dai","Dongdong Chen","Chong Luo","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2411.04997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15104v4","updated":"2024-11-14T01:32:30Z","published":"2024-06-21T12:45:07Z","title":"Deciphering the Definition of Adversarial Robustness for post-hoc OOD\n Detectors","summary":" Detecting out-of-distribution (OOD) inputs is critical for safely deploying\ndeep learning models in real-world scenarios. 
In recent years, many OOD\ndetectors have been developed, and even the benchmarking has been standardized,\ni.e. OpenOOD. The number of post-hoc detectors is growing fast. They are\nshowing an option to protect a pre-trained classifier against natural\ndistribution shifts and claim to be ready for real-world scenarios. However,\nits effectiveness in dealing with adversarial examples (AdEx) has been\nneglected in most studies. In cases where an OOD detector includes AdEx in its\nexperiments, the lack of uniform parameters for AdEx makes it difficult to\naccurately evaluate the performance of the OOD detector. This paper\ninvestigates the adversarial robustness of 16 post-hoc detectors against\nvarious evasion attacks. It also discusses a roadmap for adversarial defense in\nOOD detectors that would help adversarial robustness. We believe that level 1\n(AdEx on a unified dataset) should be added to any OOD detector to see the\nlimitations. The last level in the roadmap (defense against adaptive attacks)\nwe added for integrity from an adversarial machine learning (AML) point of\nview, which we do not believe is the ultimate goal for OOD detectors.\n","authors":["Peter Lorenz","Mario Fernandez","Jens Müller","Ullrich Köthe"],"pdf_url":"https://arxiv.org/pdf/2406.15104v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09105v1","updated":"2024-11-14T00:26:26Z","published":"2024-11-14T00:26:26Z","title":"VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges\n in Video Cognition","summary":" Recent advancements in Large Video-Language Models (LVLMs) have driven the\ndevelopment of benchmarks designed to assess cognitive abilities in video-based\ntasks. However, most existing benchmarks heavily rely on web-collected videos\npaired with human annotations or model-generated questions, which limit control\nover the video content and fall short in evaluating advanced cognitive\nabilities involving symbolic elements and abstract concepts. 
To address these\nlimitations, we introduce VCBench, a controllable benchmark to assess LVLMs'\ncognitive abilities, involving symbolic and abstract concepts at varying\ndifficulty levels. By generating video data with the Python-based engine,\nVCBench allows for precise control over the video content, creating dynamic,\ntask-oriented videos that feature complex scenes and abstract concepts. Each\ntask pairs with tailored question templates that target specific cognitive\nchallenges, providing a rigorous evaluation test. Our evaluation reveals that\neven state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple\nvideo cognition tasks involving abstract concepts, with performance sharply\ndropping by 19% as video complexity rises. These findings reveal the current\nlimitations of LVLMs in advanced cognitive tasks and highlight the critical\nrole of VCBench in driving research toward more robust LVLMs for complex video\ncognition challenges.\n","authors":["Chenglin Li","Qianglong Chen","Zhi Li","Feng Tao","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09101v1","updated":"2024-11-14T00:18:04Z","published":"2024-11-14T00:18:04Z","title":"Heuristical Comparison of Vision Transformers Against Convolutional\n Neural Networks for Semantic Segmentation on Remote Sensing Imagery","summary":" Vision Transformers (ViT) have recently brought a new wave of research in the\nfield of computer vision. These models have done particularly well in the field\nof image classification and segmentation. Research on semantic and instance\nsegmentation has emerged to accelerate with the inception of the new\narchitecture, with over 80\\% of the top 20 benchmarks for the iSAID dataset\nbeing either based on the ViT architecture or the attention mechanism behind\nits success. 
This paper focuses on the heuristic comparison of three key\nfactors of using (or not using) ViT for semantic segmentation of remote sensing\naerial images on the iSAID. The experimental results observed during the course\nof the research were under the scrutinization of the following objectives: 1.\nUse of weighted fused loss function for the maximum mean Intersection over\nUnion (mIoU) score, Dice score, and minimization or conservation of entropy or\nclass representation, 2. Comparison of transfer learning on Meta's MaskFormer,\na ViT-based semantic segmentation model, against generic UNet Convolutional\nNeural Networks (CNNs) judged over mIoU, Dice scores, training efficiency, and\ninference time, and 3. What do we lose for what we gain? i.e., the comparison\nof the two models against current state-of-art segmentation models. We show the\nuse of the novel combined weighted loss function significantly boosts the CNN\nmodel's performance capacities as compared to transfer learning the ViT. The\ncode for this implementation can be found on\n\\url{https://github.com/ashimdahal/ViT-vs-CNN-ImageSegmentation}.\n","authors":["Ashim Dahal","Saydul Akbar Murad","Nick Rahimi"],"pdf_url":"https://arxiv.org/pdf/2411.09101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09180v1","updated":"2024-11-14T04:39:10Z","published":"2024-11-14T04:39:10Z","title":"LEAP:D -- A Novel Prompt-based Approach for Domain-Generalized Aerial\n Object Detection","summary":" Drone-captured images present significant challenges in object detection due\nto varying shooting conditions, which can alter object appearance and shape.\nFactors such as drone altitude, angle, and weather cause these variations,\ninfluencing the performance of object detection algorithms. To tackle these\nchallenges, we introduce an innovative vision-language approach using learnable\nprompts. 
This shift from conventional manual prompts aims to reduce\ndomain-specific knowledge interference, ultimately improving object detection\ncapabilities. Furthermore, we streamline the training process with a one-step\napproach, updating the learnable prompt concurrently with model training,\nenhancing efficiency without compromising performance. Our study contributes to\ndomain-generalized object detection by leveraging learnable prompts and\noptimizing training processes. This enhances model robustness and adaptability\nacross diverse environments, leading to more effective aerial object detection.\n","authors":["Chanyeong Park","Heegwang Kim","Joonki Paik"],"pdf_url":"https://arxiv.org/pdf/2411.09180v1.pdf","comment":"ICIP 2024 Workshop accepted paper"},{"id":"http://arxiv.org/abs/2203.17255v7","updated":"2024-11-14T01:06:47Z","published":"2022-03-29T22:28:30Z","title":"A Cognitive Architecture for Machine Consciousness and Artificial\n Superintelligence: Thought Is Structured by the Iterative Updating of Working\n Memory","summary":" This article provides an analytical framework for how to simulate human-like\nthought processes within a computer. It describes how attention and memory\nshould be structured, updated, and utilized to search for associative additions\nto the stream of thought. The focus is on replicating the dynamics of the\nmammalian working memory system, which features two forms of persistent\nactivity: sustained firing (preserving information on the order of seconds) and\nsynaptic potentiation (preserving information from minutes to hours). The\narticle uses a series of figures to systematically demonstrate how the\niterative updating of these working memory stores provides functional\norganization to behavior, cognition, and awareness.\n In a machine learning implementation, these two memory stores should be\nupdated continuously and in an iterative fashion. 
This means each state should\npreserve a proportion of the coactive representations from the state before it\n(where each representation is an ensemble of neural network nodes). This makes\neach state a revised iteration of the preceding state and causes successive\nconfigurations to overlap and blend with respect to the information they\ncontain. Thus, the set of concepts in working memory will evolve gradually and\nincrementally over time. Transitions between states happen as persistent\nactivity spreads activation energy throughout the hierarchical network,\nsearching long-term memory for the most appropriate representation to be added\nto the global workspace. The result is a chain of associatively linked\nintermediate states capable of advancing toward a solution or goal. Iterative\nupdating is conceptualized here as an information processing strategy, a model\nof working memory, a theory of consciousness, and an algorithm for designing\nand programming artificial intelligence (AI, AGI, and ASI).\n","authors":["Jared Edward Reser"],"pdf_url":"https://arxiv.org/pdf/2203.17255v7.pdf","comment":"88 pages and 53 figures"},{"id":"http://arxiv.org/abs/2405.13800v2","updated":"2024-11-14T23:53:05Z","published":"2024-05-22T16:25:03Z","title":"Dense Connector for MLLMs","summary":" Do we fully leverage the potential of visual encoder in Multimodal Large\nLanguage Models (MLLMs)? The recent outstanding performance of MLLMs in\nmultimodal understanding has garnered broad attention from both academia and\nindustry. In the current MLLM rat race, the focus seems to be predominantly on\nthe linguistic side. We witness the rise of larger and higher-quality\ninstruction datasets, as well as the involvement of larger-sized LLMs. Yet,\nscant attention has been directed towards the visual signals utilized by MLLMs,\noften assumed to be the final high-level features extracted by a frozen visual\nencoder. 
In this paper, we introduce the Dense Connector - a simple, effective,\nand plug-and-play vision-language connector that significantly enhances\nexisting MLLMs by leveraging multi-layer visual features, with minimal\nadditional computational overhead. Building on this, we also propose the\nEfficient Dense Connector, which achieves performance comparable to LLaVA-v1.5\nwith only 25% of the visual tokens. Furthermore, our model, trained solely on\nimages, showcases remarkable zero-shot capabilities in video understanding as\nwell. Experimental results across various vision encoders, image resolutions,\ntraining dataset scales, varying sizes of LLMs (2.7B->70B), and diverse\narchitectures of MLLMs (e.g., LLaVA-v1.5, LLaVA-NeXT and Mini-Gemini) validate\nthe versatility and scalability of our approach, achieving state-of-the-art\nperformance across 19 image and video benchmarks. We hope that this work will\nprovide valuable experience and serve as a basic module for future MLLM\ndevelopment. Code is available at https://github.com/HJYao00/DenseConnector .\n","authors":["Huanjin Yao","Wenhao Wu","Taojiannan Yang","YuXin Song","Mengxi Zhang","Haocheng Feng","Yifan Sun","Zhiheng Li","Wanli Ouyang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.13800v2.pdf","comment":"27 pages, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.04268v2","updated":"2024-11-14T23:46:34Z","published":"2024-08-08T07:11:57Z","title":"Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs\n Gaussian-Based Methods","summary":" Exploring the capabilities of Neural Radiance Fields (NeRF) and\nGaussian-based methods in the context of 3D scene reconstruction, this study\ncontrasts these modern approaches with traditional Simultaneous Localization\nand Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we\nassess performance based on tracking accuracy, mapping fidelity, and view\nsynthesis. 
Findings reveal that NeRF excels in view synthesis, offering unique\ncapabilities in generating new perspectives from existing data, albeit at\nslower processing speeds. Conversely, Gaussian-based methods provide rapid\nprocessing and significant expressiveness but lack comprehensive scene\ncompletion. Enhanced by global optimization and loop closure techniques, newer\nmethods like NICE-SLAM and SplaTAM not only surpass older frameworks such as\nORB-SLAM2 in terms of robustness but also demonstrate superior performance in\ndynamic and complex environments. This comparative analysis bridges theoretical\nresearch with practical implications, shedding light on future developments in\nrobust 3D scene reconstruction across various real-world applications.\n","authors":["Yiming Zhou","Zixuan Zeng","Andi Chen","Xiaofan Zhou","Haowei Ni","Shiyao Zhang","Panfeng Li","Liangxi Liu","Mengyao Zheng","Xupeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.04268v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.09838v1","updated":"2024-11-14T23:11:45Z","published":"2024-11-14T23:11:45Z","title":"OneNet: A Channel-Wise 1D Convolutional U-Net","summary":" Many state-of-the-art computer vision architectures leverage U-Net for its\nadaptability and efficient feature extraction. However, the multi-resolution\nconvolutional design often leads to significant computational demands, limiting\ndeployment on edge devices. We present a streamlined alternative: a 1D\nconvolutional encoder that retains accuracy while enhancing its suitability for\nedge applications. Our novel encoder architecture achieves semantic\nsegmentation through channel-wise 1D convolutions combined with pixel-unshuffle\noperations. 
By incorporating PixelShuffle, known for improving accuracy in\nsuper-resolution tasks while reducing computational load, OneNet captures\nspatial relationships without requiring 2D convolutions, reducing parameters by\nup to 47%. Additionally, we explore a fully 1D encoder-decoder that achieves a\n71% reduction in size, albeit with some accuracy loss. We benchmark our\napproach against U-Net variants across diverse mask-generation tasks,\ndemonstrating that it preserves accuracy effectively. Although focused on image\nsegmentation, this architecture is adaptable to other convolutional\napplications. Code for the project is available at\nhttps://github.com/shbyun080/OneNet .\n","authors":["Sanghyun Byun","Kayvan Shah","Ayushi Gang","Christopher Apton","Jacob Song","Woo Seong Chung"],"pdf_url":"https://arxiv.org/pdf/2411.09838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09823v1","updated":"2024-11-14T22:15:48Z","published":"2024-11-14T22:15:48Z","title":"Architect: Generating Vivid and Interactive 3D Scenes with Hierarchical\n 2D Inpainting","summary":" Creating large-scale interactive 3D environments is essential for the\ndevelopment of Robotics and Embodied AI research. Current methods, including\nmanual design, procedural generation, diffusion-based scene generation, and\nlarge language model (LLM) guided scene design, are hindered by limitations\nsuch as excessive human effort, reliance on predefined rules or training\ndatasets, and limited 3D spatial reasoning ability. Since pre-trained 2D image\ngenerative models better capture scene and object configuration than LLMs, we\naddress these challenges by introducing Architect, a generative framework that\ncreates complex and realistic 3D embodied environments leveraging\ndiffusion-based 2D image inpainting. 
In detail, we utilize foundation visual\nperception models to obtain each generated object from the image and leverage\npre-trained depth estimation models to lift the generated 2D image to 3D space.\nOur pipeline is further extended to a hierarchical and iterative inpainting\nprocess to continuously generate placement of large furniture and small objects\nto enrich the scene. This iterative structure brings the flexibility for our\nmethod to generate or refine scenes from various starting points, such as text,\nfloor plans, or pre-arranged environments.\n","authors":["Yian Wang","Xiaowen Qiu","Jiageng Liu","Zhehuan Chen","Jiting Cai","Yufei Wang","Tsun-Hsuan Wang","Zhou Xian","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2411.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09822v1","updated":"2024-11-14T22:00:37Z","published":"2024-11-14T22:00:37Z","title":"A Self-Supervised Model for Multi-modal Stroke Risk Prediction","summary":" Predicting stroke risk is a complex challenge that can be enhanced by\nintegrating diverse clinically available data modalities. This study introduces\na self-supervised multimodal framework that combines 3D brain imaging, clinical\ndata, and image-derived features to improve stroke risk prediction prior to\nonset. By leveraging large unannotated clinical datasets, the framework\ncaptures complementary and synergistic information across image and tabular\ndata modalities. Our approach is based on a contrastive learning framework that\ncouples contrastive language-image pretraining with an image-tabular matching\nmodule, to better align multimodal data representations in a shared latent\nspace. The model is trained on the UK Biobank, which includes structural brain\nMRI and clinical data. We benchmark its performance against state-of-the-art\nunimodal and multimodal methods using tabular, image, and image-tabular\ncombinations under diverse frozen and trainable model settings. 
The proposed\nmodel outperformed self-supervised tabular (image) methods by 2.6% (2.6%) in\nROC-AUC and by 3.3% (5.6%) in balanced accuracy. Additionally, it showed a 7.6%\nincrease in balanced accuracy compared to the best multimodal supervised model.\nThrough interpretable tools, our approach demonstrated better integration of\ntabular and image data, providing richer and more aligned embeddings.\nGradient-weighted Class Activation Mapping heatmaps further revealed activated\nbrain regions commonly associated in the literature with brain aging, stroke\nrisk, and clinical outcomes. This robust self-supervised multimodal framework\nsurpasses state-of-the-art methods for stroke risk prediction and offers a\nstrong foundation for future studies integrating diverse data modalities to\nadvance clinical predictive modelling.\n","authors":["Camille Delgrange","Olga Demler","Samia Mora","Bjoern Menze","Ezequiel de la Rosa","Neda Davoudi"],"pdf_url":"https://arxiv.org/pdf/2411.09822v1.pdf","comment":"Accepted as oral paper at AIM-FM workshop, Neurips 2024"},{"id":"http://arxiv.org/abs/2411.09821v1","updated":"2024-11-14T21:53:46Z","published":"2024-11-14T21:53:46Z","title":"Automatic Classification of General Movements in Newborns","summary":" General movements (GMs) are spontaneous, coordinated body movements in\ninfants that offer valuable insights into the developing nervous system.\nAssessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors\nfor neurodevelopmental disorders. However, GMA requires specifically trained\nclinicians, who are limited in number. To scale up newborn screening, there is\na need for an algorithm that can automatically classify GMs from infant video\nrecordings. This data poses challenges, including variability in recording\nlength, device type, and setting, with each video coarsely annotated for\noverall movement quality. 
In this work, we introduce a tool for extracting\nfeatures from these recordings and explore various machine learning techniques\nfor automated GM classification.\n","authors":["Daphné Chopard","Sonia Laguna","Kieran Chin-Cheong","Annika Dietz","Anna Badura","Sven Wellmann","Julia E Vogt"],"pdf_url":"https://arxiv.org/pdf/2411.09821v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages"},{"id":"http://arxiv.org/abs/2406.05191v4","updated":"2024-11-14T21:26:33Z","published":"2024-06-07T18:17:17Z","title":"DiffusionPID: Interpreting Diffusion via Partial Information\n Decomposition","summary":" Text-to-image diffusion models have made significant progress in generating\nnaturalistic images from textual inputs, and demonstrate the capacity to learn\nand represent complex visual-semantic relationships. While these diffusion\nmodels have achieved remarkable success, the underlying mechanisms driving\ntheir performance are not yet fully accounted for, with many unanswered\nquestions surrounding what they learn, how they represent visual-semantic\nrelationships, and why they sometimes fail to generalize. Our work presents\nDiffusion Partial Information Decomposition (DiffusionPID), a novel technique\nthat applies information-theoretic principles to decompose the input text\nprompt into its elementary components, enabling a detailed examination of how\nindividual tokens and their interactions shape the generated image. We\nintroduce a formal approach to analyze the uniqueness, redundancy, and synergy\nterms by applying PID to the denoising model at both the image and pixel level.\nThis approach enables us to characterize how individual tokens and their\ninteractions affect the model output. 
We first present a fine-grained analysis\nof characteristics utilized by the model to uniquely localize specific\nconcepts, we then apply our approach in bias analysis and show it can recover\ngender and ethnicity biases. Finally, we use our method to visually\ncharacterize word ambiguity and similarity from the model's perspective and\nillustrate the efficacy of our method for prompt intervention. Our results show\nthat PID is a potent tool for evaluating and diagnosing text-to-image diffusion\nmodels.\n","authors":["Rushikesh Zawar","Shaurya Dewan","Prakanshul Saxena","Yingshan Chang","Andrew Luo","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2406.05191v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v3","updated":"2024-11-14T21:20:34Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. 
The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at\nhttps://github.com/chikap421/mseg_vcuq\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v3.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.09798v1","updated":"2024-11-14T20:15:25Z","published":"2024-11-14T20:15:25Z","title":"Video Denoising in Fluorescence Guided Surgery","summary":" Fluorescence guided surgery (FGS) is a promising surgical technique that\ngives surgeons a unique view of tissue that is used to guide their practice by\ndelineating tissue types and diseased areas. As new fluorescent contrast agents\nare developed that have low fluorescent photon yields, it becomes increasingly\nimportant to develop computational models to allow FGS systems to maintain good\nvideo quality in real time environments. 
To further complicate this task, FGS\nhas a difficult bias noise term from laser leakage light (LLL) that represents\nunfiltered excitation light that can be on the order of the fluorescent signal.\nMost conventional video denoising methods focus on zero mean noise, and\nnon-causal processing, both of which are violated in FGS. Luckily in FGS, often\na co-located reference video is also captured which we use to simulate the LLL\nand assist in the denoising processes. In this work, we propose an accurate\nnoise simulation pipeline that includes LLL and propose three baseline deep\nlearning based algorithms for FGS video denoising.\n","authors":["Trevor Seets","Andreas Velten"],"pdf_url":"https://arxiv.org/pdf/2411.09798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07066v3","updated":"2024-11-14T20:13:19Z","published":"2024-07-09T17:42:26Z","title":"Explainable Differential Privacy-Hyperdimensional Computing for\n Balancing Privacy and Transparency in Additive Manufacturing Monitoring","summary":" Machine Learning (ML) models combined with in-situ sensing offer a powerful\nsolution to address defect detection challenges in Additive Manufacturing (AM),\nyet this integration raises critical data privacy concerns, such as data\nleakage and sensor data compromise, potentially exposing sensitive information\nabout part design and material composition. Differential Privacy (DP), which\nadds mathematically controlled noise to ML models, provides a way to balance\ndata utility with privacy by concealing identifiable traces from sensor data.\nHowever, introducing noise into ML models, especially black-box Artificial\nIntelligence (AI) models, complicates the prediction of how noise impacts model\naccuracy. This study presents the Differential Privacy-Hyperdimensional\nComputing (DP-HD) framework, which leverages Explainable AI (XAI) and the\nvector symbolic paradigm to quantify noise effects on accuracy. 
By defining a\nSignal-to-Noise Ratio (SNR) metric, DP-HD assesses the contribution of training\ndata relative to DP noise, allowing selection of an optimal balance between\naccuracy and privacy. Experimental results using high-speed melt pool data for\nanomaly detection in AM demonstrate that DP-HD achieves superior operational\nefficiency, prediction accuracy, and privacy protection. For instance, with a\nprivacy budget set at 1, DP-HD achieves 94.43% accuracy, outperforming\nstate-of-the-art ML models. Furthermore, DP-HD maintains high accuracy under\nsubstantial noise additions to enhance privacy, unlike current models that\nexperience significant accuracy declines under stringent privacy constraints.\n","authors":["Fardin Jalil Piran","Prathyush P. Poduval","Hamza Errahmouni Barkam","Mohsen Imani","Farhad Imani"],"pdf_url":"https://arxiv.org/pdf/2407.07066v3.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.09767v1","updated":"2024-11-14T19:24:46Z","published":"2024-11-14T19:24:46Z","title":"Deep Learning for Fetal Inflammatory Response Diagnosis in the Umbilical\n Cord","summary":" Inflammation of the umbilical cord can be seen as a result of ascending\nintrauterine infection or other inflammatory stimuli. Acute fetal inflammatory\nresponse (FIR) is characterized by infiltration of the umbilical cord by fetal\nneutrophils, and can be associated with neonatal sepsis or fetal inflammatory\nresponse syndrome. Recent advances in deep learning in digital pathology have\ndemonstrated favorable performance across a wide range of clinical tasks, such\nas diagnosis and prognosis. In this study we classified FIR from whole slide\nimages (WSI). We digitized 4100 histological slides of umbilical cord stained\nwith hematoxylin and eosin(H&E) and extracted placental diagnoses from the\nelectronic health record. We build models using attention-based whole slide\nlearning models. 
We compared strategies between features extracted by a model\n(ConvNeXtXLarge) pretrained on non-medical images (ImageNet), and one\npretrained using histopathology images (UNI). We trained multiple iterations of\neach model and combined them into an ensemble. The predictions from the\nensemble of models trained using UNI achieved an overall balanced accuracy of\n0.836 on the test dataset. In comparison, the ensembled predictions using\nConvNeXtXLarge had a lower balanced accuracy of 0.7209. Heatmaps generated from\ntop accuracy model appropriately highlighted arteritis in cases of FIR 2. In\nFIR 1, the highest performing model assigned high attention to areas of\nactivated-appearing stroma in Wharton's Jelly. However, other high-performing\nmodels assigned attention to umbilical vessels. We developed models for\ndiagnosis of FIR from placental histology images, helping reduce interobserver\nvariability among pathologists. Future work may examine the utility of these\nmodels for identifying infants at risk of systemic inflammatory response or\nearly onset neonatal sepsis.\n","authors":["Marina A. Ayad","Ramin Nateghi","Abhishek Sharma","Lawrence Chillrud","Tilly Seesillapachai","Lee A. D. Cooper","Jeffery A. Goldstein"],"pdf_url":"https://arxiv.org/pdf/2411.09767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09766v1","updated":"2024-11-14T19:22:36Z","published":"2024-11-14T19:22:36Z","title":"NACNet: A Histology Context-aware Transformer Graph Convolution Network\n for Predicting Treatment Response to Neoadjuvant Chemotherapy in Triple\n Negative Breast Cancer","summary":" Neoadjuvant chemotherapy (NAC) response prediction for triple negative breast\ncancer (TNBC) patients is a challenging task clinically as it requires\nunderstanding complex histology interactions within the tumor microenvironment\n(TME). 
Digital whole slide images (WSIs) capture detailed tissue information,\nbut their giga-pixel size necessitates computational methods based on multiple\ninstance learning, which typically analyze small, isolated image tiles without\nthe spatial context of the TME. To address this limitation and incorporate TME\nspatial histology interactions in predicting NAC response for TNBC patients, we\ndeveloped a histology context-aware transformer graph convolution network\n(NACNet). Our deep learning method identifies the histopathological labels on\nindividual image tiles from WSIs, constructs a spatial TME graph, and\nrepresents each node with features derived from tissue texture and social\nnetwork analysis. It predicts NAC response using a transformer graph\nconvolution network model enhanced with graph isomorphism network layers. We\nevaluate our method with WSIs of a cohort of TNBC patient (N=105) and compared\nits performance with multiple state-of-the-art machine learning and deep\nlearning models, including both graph and non-graph approaches. Our NACNet\nachieves 90.0% accuracy, 96.0% sensitivity, 88.0% specificity, and an AUC of\n0.82, through eight-fold cross-validation, outperforming baseline models. 
These\ncomprehensive experimental results suggest that NACNet holds strong potential\nfor stratifying TNBC patients by NAC response, thereby helping to prevent\novertreatment, improve patient quality of life, reduce treatment cost, and\nenhance clinical outcomes, marking an important advancement toward personalized\nbreast cancer treatment.\n","authors":["Qiang Li","George Teodoro","Yi Jiang","Jun Kong"],"pdf_url":"https://arxiv.org/pdf/2411.09766v1.pdf","comment":"This paper is accepted by Computerized Medical Imaging and Graphics\n (Nov 07 2024)"},{"id":"http://arxiv.org/abs/2411.09758v1","updated":"2024-11-14T19:16:01Z","published":"2024-11-14T19:16:01Z","title":"Partial Multi-View Clustering via Meta-Learning and Contrastive Feature\n Alignment","summary":" Partial multi-view clustering (PVC) presents significant challenges practical\nresearch problem for data analysis in real-world applications, especially when\nsome views of the data are partially missing. Existing clustering methods\nstruggle to handle incomplete views effectively, leading to suboptimal\nclustering performance. In this paper, we propose a novel dual optimization\nframework based on contrastive learning, which aims to maximize the consistency\nof latent features in incomplete multi-view data and improve clustering\nperformance through deep learning models. By combining a fine-tuned Vision\nTransformer and k-nearest neighbors (KNN), we fill in missing views and\ndynamically adjust view weights using self-supervised learning and\nmeta-learning. 
Experimental results demonstrate that our framework outperforms\nstate-of-the-art clustering models on the BDGP and HW datasets, particularly in\nhandling complex and incomplete multi-view data.\n","authors":["BoHao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06463v3","updated":"2024-11-14T19:06:05Z","published":"2024-05-10T13:15:42Z","title":"MRSegmentator: Multi-Modality Segmentation of 40 Classes in MRI and CT","summary":" Purpose: To develop and evaluate a deep learning model for multi-organ\nsegmentation of MRI scans.\n Materials and Methods: The model was trained on 1,200 manually annotated 3D\naxial MRI scans from the UK Biobank, 221 in-house MRI scans, and 1228 CT scans\nfrom the TotalSegmentator dataset. A human-in-the-loop annotation workflow was\nemployed, leveraging cross-modality transfer learning from an existing CT\nsegmentation model to segment 40 anatomical structures. The annotation process\nbegan with a model based on transfer learning between CT and MR, which was\niteratively refined based on manual corrections to predicted segmentations. The\nmodel's performance was evaluated on MRI examinations obtained from the German\nNational Cohort (NAKO) study (n=900) from the AMOS22 dataset (n=60) and from\nthe TotalSegmentator-MRI test data (n=29). The Dice Similarity Coefficient\n(DSC) and Hausdorff Distance (HD) were used to assess segmentation quality,\nstratified by organ and scan type. The model and its weights will be\nopen-sourced.\n Results: MRSegmentator demonstrated high accuracy for well-defined organs\n(lungs: DSC 0.96, heart: DSC 0.94) and organs with anatomic variability (liver:\nDSC 0.96, kidneys: DSC 0.95). Smaller structures showed lower accuracy\n(portal/splenic veins: DSC 0.64, adrenal glands: DSC 0.69). On external\nvalidation using NAKO data, mean DSC ranged from 0.85 $\\pm$ 0.08 for T2-HASTE\nto 0.91 $\\pm$ 0.05 for in-phase sequences. 
The model generalized well to CT,\nachieving mean DSC of 0.84 $\\pm$ 0.11 on AMOS CT data.\n Conclusion: MRSegmentator accurately segments 40 anatomical structures in MRI\nacross diverse datasets and imaging protocols, with additional generalizability\nto CT images. This open-source model will provide a valuable tool for automated\nmulti-organ segmentation in medical imaging research. It can be downloaded from\nhttps://github.com/hhaentze/MRSegmentator.\n","authors":["Hartmut Häntze","Lina Xu","Christian J. Mertens","Felix J. Dorfner","Leonhard Donle","Felix Busch","Avan Kader","Sebastian Ziegelmayer","Nadine Bayerl","Nassir Navab","Daniel Rueckert","Julia Schnabel","Hugo JWL Aerts","Daniel Truhn","Fabian Bamberg","Jakob Weiß","Christopher L. Schlett","Steffen Ringhof","Thoralf Niendorf","Tobias Pischon","Hans-Ulrich Kauczor","Tobias Nonnenmacher","Thomas Kröncke","Henry Völzke","Jeanette Schulz-Menger","Klaus Maier-Hein","Mathias Prokop","Bram van Ginneken","Alessa Hering","Marcus R. Makowski","Lisa C. Adams","Keno K. Bressem"],"pdf_url":"https://arxiv.org/pdf/2405.06463v3.pdf","comment":"17 pages, 6 figures; updated data; completed co-author info"},{"id":"http://arxiv.org/abs/2411.09751v1","updated":"2024-11-14T19:05:47Z","published":"2024-11-14T19:05:47Z","title":"Analyzing the AI Nudification Application Ecosystem","summary":" Given a source image of a clothed person (an image subject), AI-based\nnudification applications can produce nude (undressed) images of that person.\nMoreover, not only do such applications exist, but there is ample evidence of\nthe use of such applications in the real world and without the consent of an\nimage subject. Still, despite the growing awareness of the existence of such\napplications and their potential to violate the rights of image subjects and\ncause downstream harms, there has been no systematic study of the nudification\napplication ecosystem across multiple applications. 
We conduct such a study\nhere, focusing on 20 popular and easy-to-find nudification websites. We study\nthe positioning of these web applications (e.g., finding that most sites\nexplicitly target the nudification of women, not all people), the features that\nthey advertise (e.g., ranging from undressing-in-place to the rendering of\nimage subjects in sexual positions, as well as differing user-privacy options),\nand their underlying monetization infrastructure (e.g., credit cards and\ncryptocurrencies). We believe this work will empower future, data-informed\nconversations -- within the scientific, technical, and policy communities -- on\nhow to better protect individuals' rights and minimize harm in the face of\nmodern (and future) AI-based nudification applications. Content warning: This\npaper includes descriptions of web applications that can be used to create\nsynthetic non-consensual explicit AI-created imagery (SNEACI). This paper also\nincludes an artistic rendering of a user interface for such an application.\n","authors":["Cassidy Gibson","Daniel Olszewski","Natalie Grace Brigham","Anna Crowder","Kevin R. B. Butler","Patrick Traynor","Elissa M. Redmiles","Tadayoshi Kohno"],"pdf_url":"https://arxiv.org/pdf/2411.09751v1.pdf","comment":"22 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.09749v1","updated":"2024-11-14T19:03:11Z","published":"2024-11-14T19:03:11Z","title":"Adversarial Attacks Using Differentiable Rendering: A Survey","summary":" Differentiable rendering methods have emerged as a promising means for\ngenerating photo-realistic and physically plausible adversarial attacks by\nmanipulating 3D objects and scenes that can deceive deep neural networks\n(DNNs). 
Recently, differentiable rendering capabilities have evolved\nsignificantly into a diverse landscape of libraries, such as Mitsuba,\nPyTorch3D, and methods like Neural Radiance Fields and 3D Gaussian Splatting\nfor solving inverse rendering problems that share conceptually similar\nproperties commonly used to attack DNNs, such as back-propagation and\noptimization. However, the adversarial machine learning research community has\nnot yet fully explored or understood such capabilities for generating attacks.\nSome key reasons are that researchers often have different attack goals, such\nas misclassification or misdetection, and use different tasks to accomplish\nthese goals by manipulating different representation in a scene, such as the\nmesh or texture of an object. This survey adopts a task-oriented unifying\nframework that systematically summarizes common tasks, such as manipulating\ntextures, altering illumination, and modifying 3D meshes to exploit\nvulnerabilities in DNNs. Our framework enables easy comparison of existing\nworks, reveals research gaps and spotlights exciting future research directions\nin this rapidly evolving field. 
Through focusing on how these tasks enable\nattacks on various DNNs such as image classification, facial recognition,\nobject detection, optical flow and depth estimation, our survey helps\nresearchers and practitioners better understand the vulnerabilities of computer\nvision systems against photorealistic adversarial attacks that could threaten\nreal-world applications.\n","authors":["Matthew Hull","Chao Zhang","Zsolt Kira","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2411.09749v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.09653v1","updated":"2024-11-14T18:21:05Z","published":"2024-11-14T18:21:05Z","title":"How to implement the Bayes' formula in the age of ML?","summary":" This chapter contains a self-contained introduction to the significance of\nBayes' formula in the context of nonlinear filtering problems. Both\ndiscrete-time and continuous-time settings of the problem are considered in a\nunified manner. In control theory, the focus on optimization-based solution\napproaches is stressed together with a discussion of historical developments in\nthis area (from 1960s onwards). The heart of this chapter contains a\npresentation of a novel optimal transportation formulation for the Bayes\nformula (developed recently by the first author) and its relationship to some\nof the prior joint work (feedback particle filter) from the authors. The\npresentation highlights how optimal transportation theory is leveraged to\novercome some of the numerical challenges of implementing Bayes' law by\nenabling the use of machine learning (ML) tools.\n","authors":["Amirhossein Taghvaei","Prashant G. 
Mehta"],"pdf_url":"https://arxiv.org/pdf/2411.09653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09636v1","updated":"2024-11-14T18:03:12Z","published":"2024-11-14T18:03:12Z","title":"Nash equilibrium seeking for a class of quadratic-bilinear Wasserstein\n distributionally robust games","summary":" We consider a class of Wasserstein distributionally robust Nash equilibrium\nproblems, where agents construct heterogeneous data-driven Wasserstein\nambiguity sets using private samples and radii, in line with their individual\nrisk-averse behaviour. By leveraging relevant properties of this class of\ngames, we show that equilibria of the original seemingly infinite-dimensional\nproblem can be obtained as a solution to a finite-dimensional Nash equilibrium\nproblem. We then reformulate the problem as a finite-dimensional variational\ninequality and establish the connection between the corresponding solution\nsets. Our reformulation has scalable behaviour with respect to the data size\nand maintains a fixed number of constraints, independently of the number of\nsamples. To compute a solution, we leverage two algorithms, based on the golden\nratio algorithm. The efficiency of both algorithmic schemes is corroborated\nthrough extensive simulation studies on an illustrative example and a\nstochastic portfolio allocation game, where behavioural coupling among\ninvestors is modeled.\n","authors":["Georgios Pantazis","Reza Rahimi Bahbadorani","Sergio Grammatico"],"pdf_url":"https://arxiv.org/pdf/2411.09636v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09582v1","updated":"2024-11-14T16:49:43Z","published":"2024-11-14T16:49:43Z","title":"Safety Filter for Robust Disturbance Rejection via Online Optimization","summary":" Disturbance rejection in high-precision control applications can be\nsignificantly improved upon via online convex optimization (OCO). 
This includes\nclassical techniques such as recursive least squares (RLS) and more recent,\nregret-based formulations. However, these methods can cause instabilities in\nthe presence of model uncertainty. This paper introduces a safety filter for\nsystems with OCO in the form of adaptive finite impulse response (FIR)\nfiltering to ensure robust disturbance rejection. The safety filter enforces a\nrobust stability constraint on the FIR coefficients while minimally altering\nthe OCO command in the $\\infty$-norm cost. Additionally, we show that the\ninduced $\\ell_\\infty$-norm allows for easy online implementation of the safety\nfilter by directly limiting the OCO command. The constraint can be tuned to\ntrade off robustness and performance. We provide a simple example to\ndemonstrate the safety filter.\n","authors":["Joyce Lai","Peter Seiler"],"pdf_url":"https://arxiv.org/pdf/2411.09582v1.pdf","comment":"Submitted to the 2025 European Control Conference. This paper builds\n on the work done in arXiv:2405.07037"},{"id":"http://arxiv.org/abs/2411.06542v2","updated":"2024-11-14T16:22:51Z","published":"2024-11-10T17:48:26Z","title":"Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing\n Contact-Rich Plans?","summary":" Designing planners and controllers for contact-rich manipulation is extremely\nchallenging as contact violates the smoothness conditions that many\ngradient-based controller synthesis tools assume. Contact smoothing\napproximates a non-smooth system with a smooth one, allowing one to use these\nsynthesis tools more effectively. However, applying classical control synthesis\nmethods to smoothed contact dynamics remains relatively under-explored. This\npaper analyzes the efficacy of linear controller synthesis using differential\nsimulators based on contact smoothing. 
We introduce natural baselines for\nleveraging contact smoothing to compute (a) open-loop plans robust to uncertain\nconditions and/or dynamics, and (b) feedback gains to stabilize around\nopen-loop plans. Using robotic bimanual whole-body manipulation as a testbed,\nwe perform extensive empirical experiments on over 300 trajectories and analyze\nwhy LQR seems insufficient for stabilizing contact-rich plans. The video\nsummarizing this paper and hardware experiments is found here:\nhttps://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9.\n","authors":["Yuki Shirai","Tong Zhao","H. J. Terry Suh","Huaijiang Zhu","Xinpei Ni","Jiuguang Wang","Max Simchowitz","Tao Pang"],"pdf_url":"https://arxiv.org/pdf/2411.06542v2.pdf","comment":"Under review for ICRA2025"},{"id":"http://arxiv.org/abs/2309.11477v3","updated":"2024-11-14T16:13:31Z","published":"2023-09-20T17:22:28Z","title":"Multi-Agent Control Synthesis from Global Temporal Logic Tasks with\n Synchronous Satisfaction Requirements","summary":" This paper addresses the multi-agent control problem under global temporal\nlogic tasks, considering agents with heterogeneous capabilities. These global\ntasks involve not only absolute and relative temporal and spatial constraints,\nbut also group behaviors, including task completion times, agent capabilities,\nand task interdependencies such as the need for synchronous execution. The\nglobal tasks are formally formulated into global signal temporal logic (STL)\nformulae, and a synchronous robustness metric is designed to evaluate the\nsynchronization quality with real values. A mixed-integer linear programming\n(MILP) encoding method is further proposed to realize task-satisfied motion\nplanning with high synchronicity and minimum control efforts. 
The encoding\nmethod uses a logarithmic number of binary variables to fully capture\nsynchronous robustness, leading to only linear computational complexity.\nSimulations are conducted to demonstrate the efficiency of the proposed control\nstrategy.\n","authors":["Tiange Yang","Yuanyuan Zou","Jinfeng Liu","Shaoyuan Li","Xiaohu Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.11477v3.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.09550v1","updated":"2024-11-14T16:05:33Z","published":"2024-11-14T16:05:33Z","title":"A small-gain criterion for 2-contraction of large scale interconnected\n systems","summary":" Despite modular conditions to guarantee stability for large-scale systems\nhave been widely studied, few methods are available to tackle the case of\nnetworks with multiple equilibria. This paper introduces small-gain like\nsufficient conditions for 2-contraction of large-scale interconnected systems\non the basis of a family of upper-bounds to the $L_2$ gains that arise from the\ngains computed on individual channels of the second additive variational\nequation. Such a condition guarantee the 2-additive compound of the system's\nJacobian to be exponentially contractive, thus implying convergence towards\nequilibria of the system's solutions. The gains are obtained by solving\nsuitable Linear Matrix Inequalities. 
Three interconnected Thomas' systems are\nconsidered in order to illustrate the application of the theory and the degree\nof conservatism.\n","authors":["David Angeli","Davide Martini","Giacomo Innocenti","Alberto Tesi"],"pdf_url":"https://arxiv.org/pdf/2411.09550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09546v1","updated":"2024-11-14T16:01:05Z","published":"2024-11-14T16:01:05Z","title":"Architectural Exploration of Application-Specific Resonant SRAM\n Compute-in-Memory (rCiM)","summary":" While general-purpose computing follows Von Neumann's architecture, the data\nmovement between memory and processor elements dictates the processor's\nperformance. The evolving compute-in-memory (CiM) paradigm tackles this issue\nby facilitating simultaneous processing and storage within static random-access\nmemory (SRAM) elements. Numerous design decisions taken at different levels of\nhierarchy affect the figure of merits (FoMs) of SRAM, such as power,\nperformance, area, and yield. The absence of a rapid assessment mechanism for\nthe impact of changes at different hierarchy levels on global FoMs poses a\nchallenge to accurately evaluating innovative SRAM designs. This paper presents\nan automation tool designed to optimize the energy and latency of SRAM designs\nincorporating diverse implementation strategies for executing logic operations\nwithin the SRAM. The tool structure allows easy comparison across different\narray topologies and various design strategies to result in energy-efficient\nimplementations. Our study involves a comprehensive comparison of over 6900+\ndistinct design implementation strategies for EPFL combinational benchmark\ncircuits on the energy-recycling resonant compute-in-memory (rCiM) architecture\ndesigned using TSMC 28 nm technology. When provided with a combinational\ncircuit, the tool aims to generate an energy-efficient implementation strategy\ntailored to the specified input memory and latency constraints. 
The tool\nreduces 80.9% of energy consumption on average across all benchmarks while\nusing the six-topology implementation compared to baseline implementation of\nsingle-macro topology by considering the parallel processing capability of rCiM\ncache size ranging from 4KB to 192KB.\n","authors":["Dhandeep Challagundla","Ignatius Bezzam","Riadul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.09546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00921v2","updated":"2024-11-14T14:52:19Z","published":"2023-09-02T12:02:18Z","title":"An iterative scheme for finite horizon model reduction of\n continuous-time linear time-varying systems","summary":" In this paper, we obtain the functional derivatives of a finite horizon error\nnorm between a full-order and a reduced-order continuous-time linear\ntime-varying (LTV) system. Based on the functional derivatives, first-order\nnecessary conditions for optimality of the error norm are derived, and a\nprojection-based iterative scheme for model reduction is proposed. The\niterative scheme upon convergence produces reduced-order models satisfying the\noptimality conditions. Finally, through a numerical example, we demonstrate the\nbetter performance of the proposed model reduction scheme in comparison to the\nfinite horizon balanced truncation algorithm for continuous-time LTV systems.\n","authors":["Kasturi Das","Srinivasan Krishnaswamy","Somanath Majhi"],"pdf_url":"https://arxiv.org/pdf/2309.00921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01970v2","updated":"2024-11-14T14:01:29Z","published":"2023-12-04T15:33:00Z","title":"CaRL: Cascade Reinforcement Learning with State Space Splitting for\n O-RAN based Traffic Steering","summary":" The Open Radio Access Network (O-RAN) architecture empowers intelligent and\nautomated optimization of the RAN through applications deployed on the RAN\nIntelligent Controller (RIC) platform, enabling capabilities beyond what is\nachievable with traditional RAN solutions. 
Within this paradigm, Traffic\nSteering (TS) emerges as a pivotal RIC application that focuses on optimizing\ncell-level mobility settings in near-real-time, aiming to significantly improve\nnetwork spectral efficiency. In this paper, we design a novel TS algorithm\nbased on a Cascade Reinforcement Learning (CaRL) framework. We propose state\nspace factorization and policy decomposition to reduce the need for large\nmodels and well-labeled datasets. For each sub-state space, an RL sub-policy\nwill be trained to learn an optimized mapping onto the action space. To apply\nCaRL on new network regions, we propose a knowledge transfer approach to\ninitialize a new sub-policy based on knowledge learned by the trained policies.\nTo evaluate CaRL, we build a data-driven and scalable RIC digital twin (DT)\nthat is modeled using important real-world data, including network\nconfiguration, user geo-distribution, and traffic demand, among others, from a\ntier-1 mobile operator in the US. We evaluate CaRL on two DT scenarios\nrepresenting two network clusters in two different cities and compare its\nperformance with the business-as-usual (BAU) policy and other competing\noptimization approaches using heuristic and Q-table algorithms. Benchmarking\nresults show that CaRL performs the best and improves the average\ncluster-aggregated downlink throughput over the BAU policy by 24% and 18% in\nthese two scenarios, respectively.\n","authors":["Chuanneng Sun","Gueyoung Jung","Tuyen Xuan Tran","Dario Pompili"],"pdf_url":"https://arxiv.org/pdf/2312.01970v2.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.13175v2","updated":"2024-11-14T13:49:52Z","published":"2023-12-20T16:35:55Z","title":"Nonlinear moving horizon estimation for robust state and parameter\n estimation - extended version","summary":" We propose a moving horizon estimation scheme to estimate the states and the\nunknown constant parameters of general nonlinear uncertain discrete-time\nsystems. 
The proposed framework and analysis explicitly do not involve the a\npriori verification of a particular excitation condition for the parameters.\nInstead, we use online information about the actual excitation of the\nparameters at any time during operation and ensure that the regularization term\nin the cost function is always automatically selected appropriately. This\nensures that the state and parameter estimation error is bounded for all times,\neven if the parameters are never (or only rarely) excited during operation.\nRobust exponential stability of the state and parameter estimation error\nemerges under an additional uniform condition on the maximum duration of\ninsufficient excitation. The theoretical results are illustrated by a numerical\nexample.\n","authors":["Julian D. Schiller","Matthias A. Müller"],"pdf_url":"https://arxiv.org/pdf/2312.13175v2.pdf","comment":"Replaced by revised version"},{"id":"http://arxiv.org/abs/2404.01901v2","updated":"2024-11-14T12:23:33Z","published":"2024-04-02T12:40:02Z","title":"Learning-based model augmentation with LFRs","summary":" Nonlinear system identification (NL-SI) has proven to be effective in\nobtaining accurate models for highly complex systems. Especially, recent\nencoder-based methods for artificial neural networks state-space (ANN-SS)\nmodels have achieved state-of-the-art performance on various benchmarks, while\noffering consistency and computational efficiency. The inclusion of prior\nknowledge of the system can be exploited to increase (i) estimation speed, (ii)\naccuracy, and (iii) interpretability of the resulting models. This paper\nproposes an encoder based model augmentation method incorporating prior\nknowledge from first-principles (FP) models. 
We introduce a novel\nlinear-fractional-representation (LFR) model structure that allows for the\nunified representation of various augmentation structures including the ones\nthat are commonly used in the literature, and an identification algorithm for\nestimating the proposed structure together with appropriate initialization\nmethods. The performance and generalization capabilities of the proposed method\nare demonstrated on a hardening mass-spring-damper simulation.\n","authors":["Jan H. Hoekstra","Chris Verhoek","Roland Tóth","Maarten Schoukens"],"pdf_url":"https://arxiv.org/pdf/2404.01901v2.pdf","comment":"Submitted for ECC 2025"},{"id":"http://arxiv.org/abs/2408.00341v2","updated":"2024-11-14T11:30:49Z","published":"2024-08-01T07:25:15Z","title":"Enhancing Attack Resilience in Real-Time Systems through Variable\n Control Task Sampling Rates","summary":" Cyber-physical systems (CPSs) in modern real-time applications integrate\nnumerous control units linked through communication networks, each responsible\nfor executing a mix of real-time safety-critical and non-critical tasks. To\nensure predictable timing behaviour, most safety-critical tasks are scheduled\nwith fixed sampling periods, which supports rigorous safety and performance\nanalyses. However, this deterministic execution can be exploited by attackers\nto launch inference-based attacks on safety-critical tasks. This paper\naddresses the challenge of preventing such timing inference or schedule-based\nattacks by dynamically adjusting the execution rates of safety-critical tasks\nwhile maintaining their performance. We propose a novel schedule vulnerability\nanalysis methodology, enabling runtime switching between valid schedules for\nvarious control task sampling rates. Leveraging this approach, we present the\nMulti-Rate Attack-Aware Randomized Scheduling (MAARS) framework for preemptive\nfixed-priority schedulers, designed to reduce the success rate of timing\ninference attacks on real-time systems. 
To our knowledge, this is the first\nmethod that combines attack-aware schedule randomization with preserved control\nand scheduling integrity. The framework's efficacy in attack prevention is\nevaluated on automotive benchmarks using a Hardware-in-the-Loop (HiL) setup.\n","authors":["Arkaprava Sain","Sunandan Adhikary","Ipsita Koley","Soumyajit Dey"],"pdf_url":"https://arxiv.org/pdf/2408.00341v2.pdf","comment":"12 pages including references, Total 10 figures (with 3 having\n subfigures)"},{"id":"http://arxiv.org/abs/2411.09335v1","updated":"2024-11-14T10:31:29Z","published":"2024-11-14T10:31:29Z","title":"Experimental Demonstration of Remote Synchronization in Coupled\n Nonlinear Oscillator","summary":" This study investigates remote synchronization in scale-free networks of\ncoupled nonlinear oscillators inspired by synchronization observed in the\nbrain's cortical regions and power grid. We employ the Master Stability\nFunction (MSF) approach to analyze network stability across various oscillator\nmodels. Synchronization results are obtained for a star network using\nlinearization techniques and extended to arbitrary networks with benchmark\noscillators, verifying consistent behavior. Stable synchronous solutions emerge\nas the Floquet multiplier decreases and the MSF becomes negative. Additionally,\nwe demonstrate remote synchronization in a star network, where peripheral\noscillators communicate exclusively through a central hub, drawing parallels to\nneuronal synchronization in the brain. Experimental validation is achieved\nthrough an electronic circuit testbed, supported by nonlinear ODE modeling and\nLTspice simulation. 
Future work will extend the investigation to arbitrary\nnetwork topologies, further elucidating synchronization dynamics in complex\nsystems.\n","authors":["Sanjeev Kumar Pandey"],"pdf_url":"https://arxiv.org/pdf/2411.09335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09320v1","updated":"2024-11-14T10:01:53Z","published":"2024-11-14T10:01:53Z","title":"AMARETTO: Enabling Efficient Quantum Algorithm Emulation on Low-Tier\n FPGAs","summary":" Researchers and industries are increasingly drawn to quantum computing for\nits computational potential. However, validating new quantum algorithms is\nchallenging due to the limitations of current quantum devices. Software\nsimulators are time and memory-consuming, making hardware emulators an\nattractive alternative. This article introduces AMARETTO (quAntuM ARchitecture\nEmulaTion TechnOlogy), designed for quantum computing emulation on low-tier\nField-Programmable gate arrays (FPGAs), supporting Clifford+T and rotational\ngate sets. It simplifies and accelerates the verification of quantum algorithms\nusing a Reduced-Instruction-Set-Computer (RISC)-like structure and efficient\nhandling of sparse quantum gates. A dedicated compiler translates OpenQASM 2.0\ninto RISC-like instructions. AMARETTO is validated against the Qiskit\nsimulators. Our results show successful emulation of sixteen qubits on a AMD\nKria KV260 SoM. 
This approach rivals other works in emulated qubit capacity on\na smaller, more affordable FPGA\n","authors":["Christian Conti","Deborah Volpe","Mariagrazia Graziano","Maurizio Zamboni","Giovanna Turvani"],"pdf_url":"https://arxiv.org/pdf/2411.09320v1.pdf","comment":"paper accepted at the IEEE International Conference on Electronics\n Circuits and Systems 2024 conference, 4 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09307v1","updated":"2024-11-14T09:33:29Z","published":"2024-11-14T09:33:29Z","title":"Model-Based Event-Triggered Implementation of Hybrid Controllers Using\n Finite-Time Convergent Observers","summary":" In this paper, we explore the conditions for asymptotic stability of the\nhybrid closed-loop system resulting from the interconnection of a nonlinear\nplant, an intelligent sensor that generates finite-time convergent estimates of\nthe plant state, and a controller node that receives opportunistic samples from\nthe sensor node when certain model-based event-triggering conditions are met.\nThe proposed method is endowed with a degree of separation, in the sense that\nthe controller design is independent of the sensor design. This is achieved\nunder mild regularity conditions imposed on the hybrid closed-loop system and\nthe existence of persistently flowing solutions. We demonstrate the versatility\nof the method by implementing it on: 1) a sampled-data controller for\nregulation of linear plants; 2) a synergistic controller for attitude\nstabilization of rigid bodies. 
The effectiveness of these novel controllers is\ndemonstrated through numerical simulations.\n","authors":["Xuanzhi Zhu","Pedro Casau","Carlos Silvestre"],"pdf_url":"https://arxiv.org/pdf/2411.09307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09279v1","updated":"2024-11-14T08:30:52Z","published":"2024-11-14T08:30:52Z","title":"A Comparative Analysis of Electricity Consumption Flexibility in\n Different Industrial Plant Configurations","summary":" The flexibility of industrial power consumption plays a key role in the\ntransition to renewable energy systems, contributing to grid stability, cost\nreduction and decarbonization efforts. This paper presents a novel methodology\nto quantify and optimize the flexibility of electricity consumption in\nmanufacturing plants. The proposed model is applied to actual cement and steel\nplant configurations. Comparative simulations performed with the model reveal\nsignificant differences in flexibility and cost-effectiveness, driven by\nfactors such as production capacity, downstream process demand, storage\ncapacity, and operational constraints. A comprehensive sensitivity analysis\nfurther clarifies the impact of various parameters on production optimization\nand flexibility savings. Specifically, as demand approaches production levels,\nflexibility decreases. Although increasing storage capacity typically reduces\nproduction costs, the benefits diminish above a certain threshold. 
The results\nprovide valuable information for industrial operators wishing to improve\noperational efficiency, reduce costs and increase the flexibility of their\noperations.\n","authors":["Sebastián Rojas-Innocenti","Enrique Baeyens","Alejandro Martín-Crespo","Sergio Saludes-Rodil","Fernando Frechoso"],"pdf_url":"https://arxiv.org/pdf/2411.09279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09254v1","updated":"2024-11-14T07:36:37Z","published":"2024-11-14T07:36:37Z","title":"Are the flows of complex-valued Laplacians and their pseudoinverses\n related?","summary":" Laplacian flows model the rate of change of each node's state as being\nproportional to the difference between its value and that of its neighbors.\nTypically, these flows capture diffusion or synchronization dynamics and are\nwell-studied. Expanding on these classical flows, we introduce a pseudoinverse\nLaplacian flow system, substituting the Laplacian with its pseudoinverse within\ncomplex-valued networks. Interestingly, for undirected graphs and unsigned\nweight-balanced digraphs, Laplacian and the pseudoinverse Laplacian flows\nexhibit an interdependence in terms of consensus. To show this relation, we\nfirst present the conditions for achieving consensus in the pseudoinverse\nLaplacian flow system using the property of real eventually exponentially\npositivity. Thereafter, we show that the pseudoinverse Laplacian flow system\nconverges to consensus if and only if the Laplacian flow system achieves\nconsensus in the above-mentioned networks. However, these are only the\nsufficient conditions for digraphs. 
Further, we illustrate the efficacy of the\nproposed approach through examples, focusing primarily on power networks.\n","authors":["Aditi Saxena","Twinkle Tripathy","Rajasekhar Anguluri"],"pdf_url":"https://arxiv.org/pdf/2411.09254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09237v1","updated":"2024-11-14T07:09:36Z","published":"2024-11-14T07:09:36Z","title":"Unsupervised Physics-Informed Neural Network-based Nonlinear Observer\n design for autonomous systems using contraction analysis","summary":" Contraction analysis offers, through elegant mathematical developments, a\nunified way of designing observers for a general class of nonlinear systems,\nwhere the observer correction term is obtained by solving an infinite\ndimensional inequality that guarantees global exponential convergence. However,\nsolving the matrix partial differential inequality involved in contraction\nanalysis design is both analytically and numerically challenging and represents\na long-lasting challenge that prevented its wide use. Therefore, the present\npaper proposes a novel approach that relies on an unsupervised Physics Informed\nNeural Network (PINN) to design the observer's correction term by enforcing the\npartial differential inequality in the loss function. 
The performance of the\nproposed PINN-based nonlinear observer is assessed in numerical simulation as\nwell as its robustness to measurement noise and neural network approximation\nerror.\n","authors":["Yasmine Marani","Israel Filho","Tareq Al-Naffouri","Taous-Meriem Laleg-Kirati"],"pdf_url":"https://arxiv.org/pdf/2411.09237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08121v2","updated":"2024-11-14T07:04:03Z","published":"2024-08-15T12:48:40Z","title":"Optimizing Highway Ramp Merge Safety and Efficiency via Spatio-Temporal\n Cooperative Control and Vehicle-Road Coordination","summary":" In view of existing automatic driving is difficult to accurately and timely\nobtain the status and driving intention of other vehicles and the safety risk\nand urgency of autonomous vehicles in the absence of collision are evaluated.\nAs a result, while vehicles generally maintain safe distances, accidents still\nfrequently occur, particularly in merging areas. To ensure safety, improve road\nefficiency, this paper presents a pre-programmed technique for managing\nvehicles' spatiotemporal trajectories to proactively mitigate conflicts among\nvehicles. Firstly, the study focuses on the calculation of safe distances under\nvarying spatiotemporal conditions, taking into account differences in vehicle\nspeed. Subsequently, an evaluation model for vehicle conflict risk is\ndeveloped, which incorporates critical parameters such as collision\nacceleration and emergency acceleration. The methodology further identifies the\nmain line vehicles that are potentially in conflict with on-ramp vehicles and\ndetermines the target gap for the latter. Based on this selected target gap, a\ncooperative control method is formulated, enabling the pre-programming of\nvehicle trajectories. Using highway ramp merging as a case study, the paper\nintroduces a mainline priority spatiotemporal cooperative control method and\nvalidates its efficacy through rigorous simulations. 
The analysis indicates\nthat the average delay time can be reduced by 97.96%, and fuel consumption by\n6.01%. The mainline priority strategy demonstrates increased speed, low latency\nand low fuel consumption.\n","authors":["Ting Peng","Xiaoxue Xu","Yuan Li","Jie WU","Tao Li","Xiang Dong","Yincai Cai","Peng Wu","Sana Ullah"],"pdf_url":"https://arxiv.org/pdf/2408.08121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00476v3","updated":"2024-11-14T06:06:09Z","published":"2024-06-29T15:47:28Z","title":"Large Language Models for Power Scheduling: A User-Centric Approach","summary":" While traditional optimization and scheduling schemes are designed to meet\nfixed, predefined system requirements, future systems are moving toward\nuser-driven approaches and personalized services, aiming to achieve high\nquality-of-experience (QoE) and flexibility. This challenge is particularly\npronounced in wireless and digitalized energy networks, where users'\nrequirements have largely not been taken into consideration due to the lack of\na common language between users and machines. The emergence of powerful large\nlanguage models (LLMs) marks a radical departure from traditional\nsystem-centric methods into more advanced user-centric approaches by providing\na natural communication interface between users and devices. In this paper, for\nthe first time, we introduce a novel architecture for resource scheduling\nproblems by constructing three LLM agents to convert an arbitrary user's voice\nrequest (VRQ) into a resource allocation vector. Specifically, we design an LLM\nintent recognition agent to translate the request into an optimization problem\n(OP), an LLM OP parameter identification agent, and an LLM OP solving agent. To\nevaluate system performance, we construct a database of typical VRQs in the\ncontext of electric vehicle (EV) charging. As a proof of concept, we primarily\nuse Llama 3 8B. 
Through testing with different prompt engineering scenarios,\nthe obtained results demonstrate the efficiency of the proposed architecture.\nThe conducted performance analysis allows key insights to be extracted. For\ninstance, having a larger set of candidate OPs to model the real-world problem\nmight degrade the final performance because of a higher recognition/OP\nclassification noise level. All results and codes are open source.\n","authors":["Thomas Mongaillard","Samson Lasaulce","Othman Hicheur","Chao Zhang","Lina Bariah","Vineeth S. Varma","Hang Zou","Qiyang Zhao","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2407.00476v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14116v3","updated":"2024-11-14T05:00:46Z","published":"2024-10-18T02:01:47Z","title":"Robustness to Model Approximation, Empirical Model Learning, and Sample\n Complexity in Wasserstein Regular MDPs","summary":" The paper studies the robustness properties of discrete-time stochastic\noptimal control under Wasserstein model approximation for both discounted cost\nand average cost criteria. Specifically, we study the performance loss when\napplying an optimal policy designed for an approximate model to the true\ndynamics compared with the optimal cost for the true model under the\nsup-norm-induced metric, and relate it to the Wasserstein-1 distance between\nthe approximate and true transition kernels. A primary motivation of this\nanalysis is empirical model learning, as well as empirical noise distribution\nlearning, where Wasserstein convergence holds under mild conditions but\nstronger convergence criteria, such as total variation, may not. We discuss\napplications of the results to the disturbance estimation problem, where sample\ncomplexity bounds are given, and also to a general empirical model learning\napproach, obtained under either Markov or i.i.d.~learning settings. 
Further\napplications regarding the continuity of invariant probability measures with\nrespect to transition kernels are also discussed.\n","authors":["Yichen Zhou","Yanglei Song","Serdar Yüksel"],"pdf_url":"https://arxiv.org/pdf/2410.14116v3.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2411.09177v1","updated":"2024-11-14T04:30:42Z","published":"2024-11-14T04:30:42Z","title":"Enhancing reinforcement learning for population setpoint tracking in\n co-cultures","summary":" Efficient multiple setpoint tracking can enable advanced biotechnological\napplications, such as maintaining desired population levels in co-cultures for\noptimal metabolic division of labor. In this study, we employ reinforcement\nlearning as a control method for population setpoint tracking in co-cultures,\nfocusing on policy-gradient techniques where the control policy is\nparameterized by neural networks. However, achieving accurate tracking across\nmultiple setpoints is a significant challenge in reinforcement learning, as the\nagent must effectively balance the contributions of various setpoints to\nmaximize the expected system performance. Traditional return functions, such as\nthose based on a quadratic cost, often yield suboptimal performance due to\ntheir inability to efficiently guide the agent toward the simultaneous\nsatisfaction of all setpoints. To overcome this, we propose a novel return\nfunction that rewards the simultaneous satisfaction of multiple setpoints and\ndiminishes overall reward gains otherwise, accounting for both stage and\nterminal system performance. This return function includes parameters to\nfine-tune the desired smoothness and steepness of the learning process. 
We\ndemonstrate our approach considering an $\\textit{Escherichia coli}$ co-culture\nin a chemostat with optogenetic control over amino acid synthesis pathways,\nleveraging auxotrophies to modulate growth.\n","authors":["Sebastián Espinel-Ríos","Joyce Qiaoxi Mo","Dongda Zhang","Ehecatl Antonio del Rio-Chanona","José L. Avalos"],"pdf_url":"https://arxiv.org/pdf/2411.09177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04391v2","updated":"2024-11-14T01:18:18Z","published":"2024-04-05T20:19:51Z","title":"Adaptive Power Flow Approximations with Second-Order Sensitivity\n Insights","summary":" The power flow equations are fundamental to power system planning, analysis,\nand control. However, the inherent non-linearity and non-convexity of these\nequations present formidable obstacles in problem-solving processes. To\nmitigate these challenges, recent research has proposed adaptive power flow\nlinearizations that aim to achieve accuracy over wide operating ranges. The\naccuracy of these approximations inherently depends on the curvature of the\npower flow equations within these ranges, which necessitates considering\nsecond-order sensitivities. In this paper, we leverage second-order\nsensitivities to both analyze and improve power flow approximations. We\nevaluate the curvature across broad operational ranges and subsequently utilize\nthis information to inform the computation of various sample-based power flow\napproximation techniques. Additionally, we leverage second-order sensitivities\nto guide the development of rational approximations that yield linear\nconstraints in optimization problems. This approach is extended to enhance\naccuracy beyond the limitations of linear functions across varied operational\nscenarios.\n","authors":["Paprapee Buason","Sidhant Misra","Jean-Paul Watson","Daniel K. 
Molzahn"],"pdf_url":"https://arxiv.org/pdf/2404.04391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09110v1","updated":"2024-11-14T00:55:49Z","published":"2024-11-14T00:55:49Z","title":"Information-Optimal Multi-Spacecraft Positioning for Interstellar Object\n Exploration","summary":" Interstellar objects (ISOs), astronomical objects not gravitationally bound\nto the sun, could present valuable opportunities to advance our understanding\nof the universe's formation and composition. In response to the unpredictable\nnature of their discoveries that inherently come with large and rapidly\nchanging uncertainty in their state, this paper proposes a novel\nmulti-spacecraft framework for locally maximizing information to be gained\nthrough ISO encounters with formal probabilistic guarantees. Given some\napproximated control and estimation policies for fully autonomous spacecraft\noperations, we first construct an ellipsoid around its terminal position, where\nthe ISO would be located with a finite probability. The large state uncertainty\nof the ISO is formally handled here through the hierarchical property in\nstochastically contracting nonlinear systems. We then propose a method to find\nthe terminal positions of the multiple spacecraft optimally distributed around\nthe ellipsoid, which locally maximizes the information we can get from all the\npoints of interest (POIs). This utilizes a probabilistic information cost\nfunction that accounts for spacecraft positions, camera specifications, and ISO\nposition uncertainty, where the information is defined as visual data collected\nby cameras. Numerical simulations demonstrate the efficacy of this approach\nusing synthetic ISO candidates generated from quasi-realistic empirical\npopulations. 
Our method allows each spacecraft to optimally select its terminal\nstate and determine the ideal number of POIs to investigate, potentially\nenhancing the ability to study these rare and fleeting interstellar visitors\nwhile minimizing resource utilization.\n","authors":["Arna Bhardwaj","Shishir Bhatta","Hiroyasu Tsukamoto"],"pdf_url":"https://arxiv.org/pdf/2411.09110v1.pdf","comment":"IEEE Aerospace Conference, Preprint Version, Accepted: November 2024"},{"id":"http://arxiv.org/abs/2312.13175v2","updated":"2024-11-14T13:49:52Z","published":"2023-12-20T16:35:55Z","title":"Nonlinear moving horizon estimation for robust state and parameter\n estimation -- extended version","summary":" We propose a moving horizon estimation scheme to estimate the states and the\nunknown constant parameters of general nonlinear uncertain discrete-time\nsystems. The proposed framework and analysis explicitly do not involve the a\npriori verification of a particular excitation condition for the parameters.\nInstead, we use online information about the actual excitation of the\nparameters at any time during operation and ensure that the regularization term\nin the cost function is always automatically selected appropriately. This\nensures that the state and parameter estimation error is bounded for all times,\neven if the parameters are never (or only rarely) excited during operation.\nRobust exponential stability of the state and parameter estimation error\nemerges under an additional uniform condition on the maximum duration of\ninsufficient excitation. The theoretical results are illustrated by a numerical\nexample.\n","authors":["Julian D. Schiller","Matthias A. 
Müller"],"pdf_url":"https://arxiv.org/pdf/2312.13175v2.pdf","comment":"Replaced by revised version"},{"id":"http://arxiv.org/abs/2411.09812v1","updated":"2024-11-14T21:01:29Z","published":"2024-11-14T21:01:29Z","title":"Edge Caching Optimization with PPO and Transfer Learning for Dynamic\n Environments","summary":" This paper addresses the challenge of edge caching in dynamic environments,\nwhere rising traffic loads strain backhaul links and core networks. We propose\na Proximal Policy Optimization (PPO)-based caching strategy that fully\nincorporates key file attributes such as size, lifetime, importance, and\npopularity, while also considering random file request arrivals, reflecting\nmore realistic edge caching scenarios. In dynamic environments, changes such as\nshifts in content popularity and variations in request rates frequently occur,\nmaking previously learned policies less effective as they were optimized for\nearlier conditions. Without adaptation, caching efficiency and response times\ncan degrade. While learning a new policy from scratch in a new environment is\nan option, it is highly inefficient and computationally expensive. Thus,\nadapting an existing policy to these changes is critical. To address this, we\ndevelop a mechanism that detects changes in content popularity and request\nrates, ensuring timely adjustments to the caching strategy. We also propose a\ntransfer learning-based PPO algorithm that accelerates convergence in new\nenvironments by leveraging prior knowledge. 
Simulation results demonstrate the\nsignificant effectiveness of our approach, outperforming a recent Deep\nReinforcement Learning (DRL)-based method.\n","authors":["Farnaz Niknia","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09787v1","updated":"2024-11-14T19:56:17Z","published":"2024-11-14T19:56:17Z","title":"ART-Rx: A Proportional-Integral-Derivative (PID) Controlled Adaptive\n Real-Time Threshold Receiver for Molecular Communication","summary":" Molecular communication (MC) in microfluidic channels faces significant\nchallenges in signal detection due to the stochastic nature of molecule\npropagation and dynamic, noisy environments. Conventional detection methods\noften struggle under varying channel conditions, leading to high bit error\nrates (BER) and reduced communication efficiency. This paper introduces ART-Rx,\na novel Adaptive Real-Time Threshold Receiver for MC that addresses these\nchallenges. Implemented within a conceptual system-on-chip (SoC), ART-Rx\nemploys a Proportional-Integral-Derivative (PID) controller to dynamically\nadjust the detection threshold based on observed errors in real time.\nComprehensive simulations using MATLAB and Smoldyn compare ART-Rx's performance\nagainst a statistically optimal detection threshold across various scenarios,\nincluding different levels of interference, concentration shift keying (CSK)\nlevels, flow velocities, transmitter-receiver distances, diffusion\ncoefficients, and binding rates. The results demonstrate that ART-Rx\nsignificantly outperforms conventional methods, maintaining consistently low\nBER and bit error probabilities (BEP) even in high-noise conditions and extreme\nchannel environments. The system exhibits exceptional robustness to\ninterference and shows the potential to enable higher data rates in CSK\nmodulation. 
Furthermore, because ART-Rx is effectively adaptable to varying\nenvironmental conditions in microfluidic channels, it offers a computationally\nefficient and straightforward approach to enhance signal detection in nanoscale\ncommunication systems. This approach presents a promising control theory-based\nsolution to improve the reliability of data transmission in practical MC\nsystems, with potential applications in healthcare, brain-machine interfaces\n(BMI), and the Internet of Bio-Nano Things (IoBNT).\n","authors":["Hongbin Ni","Ozgur B. Akan"],"pdf_url":"https://arxiv.org/pdf/2411.09787v1.pdf","comment":"14 pages, 7 figures, submitted to IEEE Transactions on Molecular,\n Biological, and Multi-Scale Communications (TMBMC)"},{"id":"http://arxiv.org/abs/2411.09783v1","updated":"2024-11-14T19:49:33Z","published":"2024-11-14T19:49:33Z","title":"Exploring the Use of Autonomous Unmanned Vehicles for Supporting Power\n Grid Operations","summary":" This paper explores the use of autonomous unmanned vehicles for supporting\npower grid operations. With built-in batteries and the capability to carry\nadditional battery energy storage, the rising number of autonomous vehicles can\nrepresent a substantial amount of capacity that is currently underutilized in\nthe power grid. Unlike traditional electric vehicles which require drivers, the\noperations of autonomous vehicles can be performed without human intervention.\nTo guide idle vehicles to support power grids autonomously, we propose a\ntractable optimization-based method for effectively integrating these ``mobile\nbatteries'' into grid operations. During real-time operations, the vehicles are\nstrategically routed to target locations to help maintain system power balance\nand reduce operating costs. 
Numerical studies have confirmed both the validity\nand scalability of the proposed algorithm for efficiently integrating\nautonomous vehicles into routine power system operations.\n","authors":["Yuqi Zhou","Cong Feng","Mingzhi Zhang","Rui Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13142v2","updated":"2024-11-14T19:36:14Z","published":"2024-04-19T19:03:33Z","title":"Decentralized Coordination of Distributed Energy Resources through Local\n Energy Markets and Deep Reinforcement Learning","summary":" As distributed energy resources (DERs) grow, the electricity grid faces\nincreased net load variability at the grid edge, impacting operability and\nreliability. Transactive energy, facilitated through local energy markets,\noffers a decentralized, indirect demand response solution, with model-free\ncontrol techniques, such as deep reinforcement learning (DRL), enabling\nautomated, decentralized participation. However, existing studies largely\noverlook community-level net load variability, focusing instead on\nsocioeconomic metrics.\n This study addresses this gap by using DRL agents to automate end-user\nparticipation in a local energy market (ALEX), where agents act independently\nto minimize individual energy bills. Results reveal a strong link between bill\nreduction and decreased net load variability, assessed across metrics such as\nramping rate, load factor, and peak demand over various time horizons. Using a\nno-control baseline, DRL agents are benchmarked against a near-optimal dynamic\nprogramming approach. 
The dynamic programming benchmark achieves reductions of\n22.05 percent, 83.92 percent, and 24.09 percent in daily import, export, and\npeak demand, respectively, while the DRL agents show comparable or superior\nresults with reductions of 21.93 percent, 84.46 percent, and 27.02 percent.\n This study demonstrates the effectiveness of DRL in decentralized grid\nmanagement, highlighting its scalability and near-optimal performance in\nreducing net load variability within community-driven energy markets.\n","authors":["Daniel May","Matthew Taylor","Petr Musilek"],"pdf_url":"https://arxiv.org/pdf/2404.13142v2.pdf","comment":"preprint, submitted to Energy and AI"},{"id":"http://arxiv.org/abs/2411.09764v1","updated":"2024-11-14T19:21:24Z","published":"2024-11-14T19:21:24Z","title":"ModelPredictiveControl.jl: advanced process control made easy in Julia","summary":" Proprietary closed-source software is still the norm in advanced process\ncontrol. Transparency and reproducibility are key aspects of scientific\nresearch. Free and open-source toolkit can contribute to the development,\nsharing and advancement of new and efficient control approaches, and the\nindustrial sector will certainly benefit from them. This paper presents\nModelPredictiveControl.jl, an open-source software package for designing model\npredictive controllers in the Julia programming language. It is designed to be\neasy to use and modular, while providing advanced features like nonlinear\ncontrol and moving horizon estimation. It relies on powerful control system and\nmathematical optimization frameworks to simplify the construction and testing\nof state estimators and predictive controllers. It also integrates with the\nstandard plotting library to quickly visualize closed-loop data. The paper\npresents the main functionalities and illustrates them with two case studies in\nsimulation. The first example is a continuously stirred tank reactor described\nby linear dynamics. 
The second one implements a nonlinear, an economic, and a\nsuccessive linearization model predictive controllers for an inverted pendulum.\nThe solving times are benchmarked against equivalent implementations in MATLAB\nto show the efficiency of the package.\n","authors":["Francis Gagnon","Alex Thivierge","André Desbiens","Fredrik Bagge Carlson"],"pdf_url":"https://arxiv.org/pdf/2411.09764v1.pdf","comment":"11 pages, 11 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.09717v1","updated":"2024-11-14T06:35:50Z","published":"2024-11-14T06:35:50Z","title":"Integrating Fuzzy Set Theory with Pandora Temporal Fault Trees for\n Dynamic Failure Analysis of Complex Systems","summary":" Pandora temporal fault tree, as one notable extension of the fault tree,\nintroduces temporal gates and temporal laws. Pandora Temporal Fault Tree(TFT)\nenhances the capability of fault trees and enables the modeling of system\nfailure behavior that depends on sequences. The calculation of system failure\nprobability in Pandora TFT relies on precise probabilistic information on\ncomponent failures. However, obtaining such precise failure data can often be\nchallenging. The data may be uncertain as historical records are used to derive\nfailure data for system components. To mitigate this uncertainty, in this\nstudy, we proposed a method that integrates fuzzy set theory with Pandora TFT.\nThis integration aims to enable dynamic analysis of complex systems, even in\ncases where quantitative failure data of components is unreliable or imprecise.\nThe proposed work introduces the development of Fuzzy AND, Fuzzy OR, Fuzzy\nPAND, and Fuzzy POR logic gates for Pandora TFT. We also introduce a fuzzy\nimportance measure for criticality analysis of basic events. All events in our\nanalysis are assumed to have exponentially distributed failures, with their\nfailure rates represented as triangular fuzzy numbers. 
We illustrate the\nproposed method through a case study of the Aircraft Fuel Distribution System\n(AFDS), highlighting its practical application and effectiveness in analyzing\ncomplex systems. The results are compared with existing results from Petri net\nand Bayesian network techniques to validate the findings.\n","authors":["Hitesh Khungla","Mohit Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.09717v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.09702v1","updated":"2024-11-14T18:59:40Z","published":"2024-11-14T18:59:40Z","title":"On the Surprising Effectiveness of Attention Transfer for Vision\n Transformers","summary":" Conventional wisdom suggests that pre-training Vision Transformers (ViT)\nimproves downstream performance by learning useful representations. Is this\nactually true? We investigate this question and find that the features and\nrepresentations learned during pre-training are not essential. Surprisingly,\nusing only the attention patterns from pre-training (i.e., guiding how\ninformation flows between tokens) is sufficient for models to learn high\nquality features from scratch and achieve comparable downstream performance. We\nshow this by introducing a simple method called attention transfer, where only\nthe attention patterns from a pre-trained teacher ViT are transferred to a\nstudent, either by copying or distilling the attention maps. Since attention\ntransfer lets the student learn its own features, ensembling it with a\nfine-tuned teacher also further improves accuracy on ImageNet. We\nsystematically study various aspects of our findings on the sufficiency of\nattention maps, including distribution shift settings where they underperform\nfine-tuning. We hope our exploration provides a better understanding of what\npre-training accomplishes and leads to a useful alternative to the standard\npractice of fine-tuning\n","authors":["Alexander C. 
Li","Yuandong Tian","Beidi Chen","Deepak Pathak","Xinlei Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09702v1.pdf","comment":"NeurIPS 2024. Code:\n https://github.com/alexlioralexli/attention-transfer"},{"id":"http://arxiv.org/abs/2405.09596v2","updated":"2024-11-14T18:57:09Z","published":"2024-05-15T13:43:07Z","title":"Enhancing Maritime Trajectory Forecasting via H3 Index and Causal\n Language Modelling (CLM)","summary":" The prediction of ship trajectories is a growing field of study in artificial\nintelligence. Traditional methods rely on the use of LSTM, GRU networks, and\neven Transformer architectures for the prediction of spatio-temporal series.\nThis study proposes a viable alternative for predicting these trajectories\nusing only GNSS positions. It considers this spatio-temporal problem as a\nnatural language processing problem. The latitude/longitude coordinates of AIS\nmessages are transformed into cell identifiers using the H3 index. Thanks to\nthe pseudo-octal representation, it becomes easier for language models to learn\nthe spatial hierarchy of the H3 index. The method is compared with a classical\nKalman filter, widely used in the maritime domain, and introduces the Fr\\'echet\ndistance as the main evaluation metric. We show that it is possible to predict\nship trajectories quite precisely up to 8 hours ahead with 30 minutes of\ncontext, using solely GNSS positions, without relying on any additional\ninformation such as speed, course, or external conditions - unlike many\ntraditional methods. 
We demonstrate that this alternative works well enough to\npredict trajectories worldwide.\n","authors":["Nicolas Drapier","Aladine Chetouani","Aurélien Chateigner"],"pdf_url":"https://arxiv.org/pdf/2405.09596v2.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2411.09686v1","updated":"2024-11-14T18:53:51Z","published":"2024-11-14T18:53:51Z","title":"Conditional regression for the Nonlinear Single-Variable Model","summary":" Several statistical models for regression of a function $F$ on $\\mathbb{R}^d$\nwithout the statistical and computational curse of dimensionality exist, for\nexample by imposing and exploiting geometric assumptions on the distribution of\nthe data (e.g. that its support is low-dimensional), or strong smoothness\nassumptions on $F$, or a special structure $F$. Among the latter, compositional\nmodels assume $F=f\\circ g$ with $g$ mapping to $\\mathbb{R}^r$ with $r\\ll d$,\nhave been studied, and include classical single- and multi-index models and\nrecent works on neural networks. While the case where $g$ is linear is rather\nwell-understood, much less is known when $g$ is nonlinear, and in particular\nfor which $g$'s the curse of dimensionality in estimating $F$, or both $f$ and\n$g$, may be circumvented. In this paper, we consider a model\n$F(X):=f(\\Pi_\\gamma X) $ where $\\Pi_\\gamma:\\mathbb{R}^d\\to[0,\\rm{len}_\\gamma]$\nis the closest-point projection onto the parameter of a regular curve $\\gamma:\n[0,\\rm{len}_\\gamma]\\to\\mathbb{R}^d$ and $f:[0,\\rm{len}_\\gamma]\\to\\mathbb{R}^1$.\nThe input data $X$ is not low-dimensional, far from $\\gamma$, conditioned on\n$\\Pi_\\gamma(X)$ being well-defined. The distribution of the data, $\\gamma$ and\n$f$ are unknown. This model is a natural nonlinear generalization of the\nsingle-index model, which corresponds to $\\gamma$ being a line. 
We propose a\nnonparametric estimator, based on conditional regression, and show that under\nsuitable assumptions, the strongest of which being that $f$ is coarsely\nmonotone, it can achieve the $one$-$dimensional$ optimal min-max rate for\nnon-parametric regression, up to the level of noise in the observations, and be\nconstructed in time $\\mathcal{O}(d^2n\\log n)$. All the constants in the\nlearning bounds, in the minimal number of samples required for our bounds to\nhold, and in the computational complexity are at most low-order polynomials in\n$d$.\n","authors":["Yantao Wu","Mauro Maggioni"],"pdf_url":"https://arxiv.org/pdf/2411.09686v1.pdf","comment":"55 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.09683v1","updated":"2024-11-14T18:52:05Z","published":"2024-11-14T18:52:05Z","title":"Towards a Classification of Open-Source ML Models and Datasets for\n Software Engineering","summary":" Background: Open-Source Pre-Trained Models (PTMs) and datasets provide\nextensive resources for various Machine Learning (ML) tasks, yet these\nresources lack a classification tailored to Software Engineering (SE) needs.\nAims: We apply an SE-oriented classification to PTMs and datasets on a popular\nopen-source ML repository, Hugging Face (HF), and analyze the evolution of PTMs\nover time. Method: We conducted a repository mining study. We started with a\nsystematically gathered database of PTMs and datasets from the HF API. Our\nselection was refined by analyzing model and dataset cards and metadata, such\nas tags, and confirming SE relevance using Gemini 1.5 Pro. All analyses are\nreplicable, with a publicly accessible replication package. Results: The most\ncommon SE task among PTMs and datasets is code generation, with a primary focus\non software development and limited attention to software management. Popular\nPTMs and datasets mainly target software development. Among ML tasks, text\ngeneration is the most common in SE PTMs and datasets. 
There has been a marked\nincrease in PTMs for SE since 2023 Q2. Conclusions: This study underscores the\nneed for broader task coverage to enhance the integration of ML within SE\npractices.\n","authors":["Alexandra González","Xavier Franch","David Lo","Silverio Martínez-Fernández"],"pdf_url":"https://arxiv.org/pdf/2411.09683v1.pdf","comment":"5 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.09678v1","updated":"2024-11-14T18:44:31Z","published":"2024-11-14T18:44:31Z","title":"NeuralDEM - Real-time Simulation of Industrial Particulate Flows","summary":" Advancements in computing power have made it possible to numerically simulate\nlarge-scale fluid-mechanical and/or particulate systems, many of which are\nintegral to core industrial processes. Among the different numerical methods\navailable, the discrete element method (DEM) provides one of the most accurate\nrepresentations of a wide range of physical systems involving granular and\ndiscontinuous materials. Consequently, DEM has become a widely accepted\napproach for tackling engineering problems connected to granular flows and\npowder mechanics. Additionally, DEM can be integrated with grid-based\ncomputational fluid dynamics (CFD) methods, enabling the simulation of chemical\nprocesses taking place, e.g., in fluidized beds. However, DEM is\ncomputationally intensive because of the intrinsic multiscale nature of\nparticulate systems, restricting simulation duration or number of particles.\nTowards this end, NeuralDEM presents an end-to-end approach to replace slow\nnumerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM\nis capable of picturing long-term transport processes across different regimes\nusing macroscopic observables without any reference to microscopic model\nparameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an\nunderlying continuous field, while simultaneously modeling macroscopic behavior\ndirectly as additional auxiliary fields. 
Second, NeuralDEM introduces\nmulti-branch neural operators scalable to real-time modeling of\nindustrially-sized scenarios - from slow and pseudo-steady to fast and\ntransient. Such scenarios have previously posed insurmountable challenges for\ndeep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM\nfluidized bed reactors of 160k CFD cells and 500k DEM particles for\ntrajectories of 28s. NeuralDEM will open many new doors to advanced engineering\nand much faster process cycles.\n","authors":["Benedikt Alkin","Tobias Kronlachner","Samuele Papa","Stefan Pirker","Thomas Lichtenegger","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2411.09678v1.pdf","comment":"Project page: https://nx-ai.github.io/NeuralDEM/"},{"id":"http://arxiv.org/abs/2411.09648v1","updated":"2024-11-14T18:17:30Z","published":"2024-11-14T18:17:30Z","title":"Med-Bot: An AI-Powered Assistant to Provide Accurate and Reliable\n Medical Information","summary":" This paper introduces Med-Bot, an AI-powered chatbot designed to provide\nusers with accurate and reliable medical information. Utilizing advanced\nlibraries and frameworks such as PyTorch, Chromadb, Langchain and Autogptq,\nMed-Bot is built to handle the complexities of natural language understanding\nin a healthcare context. The integration of llamaassisted data processing and\nAutoGPT-Q provides enhanced performance in processing and responding to queries\nbased on PDFs of medical literature, ensuring that users receive precise and\ntrustworthy information. 
This research details the methodologies employed in\ndeveloping Med-Bot and evaluates its effectiveness in disseminating healthcare\ninformation.\n","authors":["Ahan Bhatt","Nandan Vaghela"],"pdf_url":"https://arxiv.org/pdf/2411.09648v1.pdf","comment":"3 figures, 5 pages Keywords-LLM, AI-powered healthcare, Medical\n chatbot, Context-based interaction, Llama-assisted data processing,\n AutoGPT-Q, PyTorch, TensorFlow, Reliable medical information, Machine\n learning in healthcare, Conversational AI"},{"id":"http://arxiv.org/abs/2411.09645v1","updated":"2024-11-14T18:14:32Z","published":"2024-11-14T18:14:32Z","title":"How do Machine Learning Models Change?","summary":" The proliferation of Machine Learning (ML) models and their open-source\nimplementations has transformed Artificial Intelligence research and\napplications. Platforms like Hugging Face (HF) enable the development, sharing,\nand deployment of these models, fostering an evolving ecosystem. While previous\nstudies have examined aspects of models hosted on platforms like HF, a\ncomprehensive longitudinal study of how these models change remains\nunderexplored. This study addresses this gap by utilizing both repository\nmining and longitudinal analysis methods to examine over 200,000 commits and\n1,200 releases from over 50,000 models on HF. We replicate and extend an ML\nchange taxonomy for classifying commits and utilize Bayesian networks to\nuncover patterns in commit and release activities over time. Our findings\nindicate that commit activities align with established data science\nmethodologies, such as CRISP-DM, emphasizing iterative refinement and\ncontinuous improvement. Additionally, release patterns tend to consolidate\nsignificant updates, particularly in documentation, distinguishing between\ngranular changes and milestone-based releases. 
Furthermore, projects with\nhigher popularity prioritize infrastructure enhancements early in their\nlifecycle, and those with intensive collaboration practices exhibit improved\ndocumentation standards. These and other insights enhance the understanding of\nmodel changes on community platforms and provide valuable guidance for best\npractices in model maintenance.\n","authors":["Joel Castaño","Rafael Cabañas","Antonio Salmerón","David Lo","Silverio Martínez-Fernández"],"pdf_url":"https://arxiv.org/pdf/2411.09645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04783v2","updated":"2024-11-14T18:14:00Z","published":"2024-03-02T16:52:22Z","title":"AutoDefense: Multi-Agent LLM Defense against Jailbreak Attacks","summary":" Despite extensive pre-training in moral alignment to prevent generating\nharmful information, large language models (LLMs) remain vulnerable to\njailbreak attacks. In this paper, we propose AutoDefense, a multi-agent defense\nframework that filters harmful responses from LLMs. With the response-filtering\nmechanism, our framework is robust against different jailbreak attack prompts,\nand can be used to defend different victim models. AutoDefense assigns\ndifferent roles to LLM agents and employs them to complete the defense task\ncollaboratively. The division in tasks enhances the overall\ninstruction-following of LLMs and enables the integration of other defense\ncomponents as tools. With AutoDefense, small open-source LMs can serve as\nagents and defend larger models against jailbreak attacks. Our experiments show\nthat AutoDefense can effectively defense against different jailbreak attacks,\nwhile maintaining the performance at normal user request. For example, we\nreduce the attack success rate on GPT-3.5 from 55.74% to 7.95% using\nLLaMA-2-13b with a 3-agent system. 
Our code and data are publicly available at\nhttps://github.com/XHMY/AutoDefense.\n","authors":["Yifan Zeng","Yiran Wu","Xiao Zhang","Huazheng Wang","Qingyun Wu"],"pdf_url":"https://arxiv.org/pdf/2403.04783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09644v1","updated":"2024-11-14T18:12:06Z","published":"2024-11-14T18:12:06Z","title":"Neural Operators Can Play Dynamic Stackelberg Games","summary":" Dynamic Stackelberg games are a broad class of two-player games in which the\nleader acts first, and the follower chooses a response strategy to the leader's\nstrategy. Unfortunately, only stylized Stackelberg games are explicitly\nsolvable since the follower's best-response operator (as a function of the\ncontrol of the leader) is typically analytically intractable. This paper\naddresses this issue by showing that the \\textit{follower's best-response\noperator} can be approximately implemented by an \\textit{attention-based neural\noperator}, uniformly on compact subsets of adapted open-loop controls for the\nleader. We further show that the value of the Stackelberg game where the\nfollower uses the approximate best-response operator approximates the value of\nthe original Stackelberg game. Our main result is obtained using our universal\napproximation theorem for attention-based neural operators between spaces of\nsquare-integrable adapted stochastic processes, as well as stability results\nfor a general class of Stackelberg games.\n","authors":["Guillermo Alvarez","Ibrahim Ekren","Anastasis Kratsios","Xuwei Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09642v1","updated":"2024-11-14T18:06:55Z","published":"2024-11-14T18:06:55Z","title":"On the Limits of Language Generation: Trade-Offs Between Hallucination\n and Mode Collapse","summary":" Specifying all desirable properties of a language model is challenging, but\ncertain requirements seem essential. 
Given samples from an unknown language,\nthe trained model should produce valid strings not seen in training and be\nexpressive enough to capture the language's full richness. Otherwise,\noutputting invalid strings constitutes \"hallucination,\" and failing to capture\nthe full range leads to \"mode collapse.\" We ask if a language model can meet\nboth requirements.\n We investigate this within a statistical language generation setting building\non Gold and Angluin. Here, the model receives random samples from a\ndistribution over an unknown language K, which belongs to a possibly infinite\ncollection of languages. The goal is to generate unseen strings from K. We say\nthe model generates from K with consistency and breadth if, as training size\nincreases, its output converges to all unseen strings in K.\n Kleinberg and Mullainathan [KM24] asked if consistency and breadth in\nlanguage generation are possible. We answer this negatively: for a large class\nof language models, including next-token prediction models, this is impossible\nfor most collections of candidate languages. This contrasts with [KM24]'s\nresult, showing consistent generation without breadth is possible for any\ncountable collection of languages. Our finding highlights that generation with\nbreadth fundamentally differs from generation without breadth.\n As a byproduct, we establish near-tight bounds on the number of samples\nneeded for generation with or without breadth.\n Finally, our results offer hope: consistent generation with breadth is\nachievable for any countable collection of languages when negative examples\n(strings outside K) are available alongside positive ones. 
This suggests that\npost-training feedback, which encodes negative examples, can be crucial in\nreducing hallucinations while limiting mode collapse.\n","authors":["Alkis Kalavasis","Anay Mehrotra","Grigoris Velegkas"],"pdf_url":"https://arxiv.org/pdf/2411.09642v1.pdf","comment":"Abstract shortened to fit arXiv limit"},{"id":"http://arxiv.org/abs/2411.09639v1","updated":"2024-11-14T18:03:44Z","published":"2024-11-14T18:03:44Z","title":"MCCE: Missingness-aware Causal Concept Explainer","summary":" Causal concept effect estimation is gaining increasing interest in the field\nof interpretable machine learning. This general approach explains the behaviors\nof machine learning models by estimating the causal effect of\nhuman-understandable concepts, which represent high-level knowledge more\ncomprehensibly than raw inputs like tokens. However, existing causal concept\neffect explanation methods assume complete observation of all concepts involved\nwithin the dataset, which can fail in practice due to incomplete annotations or\nmissing concept data. We theoretically demonstrate that unobserved concepts can\nbias the estimation of the causal effects of observed concepts. To address this\nlimitation, we introduce the Missingness-aware Causal Concept Explainer (MCCE),\na novel framework specifically designed to estimate causal concept effects when\nnot all concepts are observable. Our framework learns to account for residual\nbias resulting from missing concepts and utilizes a linear predictor to model\nthe relationships between these concepts and the outputs of black-box machine\nlearning models. It can offer explanations on both local and global levels. 
We\nconduct validations using a real-world dataset, demonstrating that MCCE\nachieves promising performance compared to state-of-the-art explanation methods\nin causal concept effect estimation.\n","authors":["Jifan Gao","Guanhua Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09635v1","updated":"2024-11-14T18:01:02Z","published":"2024-11-14T18:01:02Z","title":"Counterfactual Uncertainty Quantification of Factual Estimand of\n Efficacy from Before-and-After Treatment Repeated Measures Randomized\n Controlled Trials","summary":" The ideal estimand for comparing a new treatment $Rx$ with a control $C$ is\nthe $\\textit{counterfactual}$ efficacy $Rx:C$, the expected differential\noutcome between $Rx$ and $C$ if each patient were given $\\textit{both}$. While\ncounterfactual $\\textit{point estimation}$ from $\\textit{factual}$ Randomized\nControlled Trials (RCTs) has been available, this article shows\n$\\textit{counterfactual}$ uncertainty quantification (CUQ), quantifying\nuncertainty for factual point estimates but in a counterfactual setting, is\nsurprisingly achievable. We achieve CUQ whose variability is typically smaller\nthan factual UQ, by creating a new statistical modeling principle called ETZ\nwhich is applicable to RCTs with $\\textit{Before-and-After}$ treatment Repeated\nMeasures, common in many therapeutic areas.\n We urge caution when estimate of the unobservable true condition of a patient\nbefore treatment has measurement error, because that violation of standard\nregression assumption can cause attenuation in estimating treatment effects.\nFortunately, we prove that, for traditional medicine in general, and for\ntargeted therapy with efficacy defined as averaged over the population,\ncounterfactual point estimation is unbiased. 
However, for targeted therapy,\nboth Real Human and Digital Twins approaches should respect this limitation,\nlest predicted treatment effect in $\\textit{subgroups}$ will have bias.\n","authors":["Xingya Wang","Yang Han","Yushi Liu","Szu-Yu Tang","Jason C. Hsu"],"pdf_url":"https://arxiv.org/pdf/2411.09635v1.pdf","comment":"39 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.09625v1","updated":"2024-11-14T17:49:27Z","published":"2024-11-14T17:49:27Z","title":"Local deployment of large-scale music AI models on commodity hardware","summary":" We present the MIDInfinite, a web application capable of generating symbolic\nmusic using a large-scale generative AI model locally on commodity hardware.\nCreating this demo involved porting the Anticipatory Music Transformer, a large\nlanguage model (LLM) pre-trained on the Lakh MIDI dataset, to the Machine\nLearning Compilation (MLC) framework. Once the model is ported, MLC facilitates\ninference on a variety of runtimes including C++, mobile, and the browser. We\nenvision that MLC has the potential to bridge the gap between the landscape of\nincreasingly capable music AI models and technology more familiar to music\nsoftware developers. As a proof of concept, we build a web application that\nallows users to generate endless streams of multi-instrumental MIDI in the\nbrowser, either from scratch or conditioned on a prompt. 
On commodity hardware\n(an M3 Macbook Pro), our demo can generate 51 notes per second, which is faster\nthan real-time playback for 72.9% of generations, and increases to 86.3% with 2\nseconds of upfront buffering.\n","authors":["Xun Zhou","Charlie Ruan","Zihe Zhao","Tianqi Chen","Chris Donahue"],"pdf_url":"https://arxiv.org/pdf/2411.09625v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2411.09618v1","updated":"2024-11-14T17:37:19Z","published":"2024-11-14T17:37:19Z","title":"MICCAI-CDMRI 2023 QuantConn Challenge Findings on Achieving Robust\n Quantitative Connectivity through Harmonized Preprocessing of Diffusion MRI","summary":" White matter alterations are increasingly implicated in neurological diseases\nand their progression. International-scale studies use diffusion-weighted\nmagnetic resonance imaging (DW-MRI) to qualitatively identify changes in white\nmatter microstructure and connectivity. Yet, quantitative analysis of DW-MRI\ndata is hindered by inconsistencies stemming from varying acquisition\nprotocols. There is a pressing need to harmonize the preprocessing of DW-MRI\ndatasets to ensure the derivation of robust quantitative diffusion metrics\nacross acquisitions. In the MICCAI-CDMRI 2023 QuantConn challenge, participants\nwere provided raw data from the same individuals collected on the same scanner\nbut with two different acquisitions and tasked with preprocessing the DW-MRI to\nminimize acquisition differences while retaining biological variation.\nSubmissions are evaluated on the reproducibility and comparability of\ncross-acquisition bundle-wise microstructure measures, bundle shape features,\nand connectomics. 
The key innovations of the QuantConn challenge are that (1)\nwe assess bundles and tractography in the context of harmonization for the\nfirst time, (2) we assess connectomics in the context of harmonization for the\nfirst time, and (3) we have 10x additional subjects over prior harmonization\nchallenge, MUSHAC and 100x over SuperMUDI. We find that bundle surface area,\nfractional anisotropy, connectome assortativity, betweenness centrality, edge\ncount, modularity, nodal strength, and participation coefficient measures are\nmost biased by acquisition and that machine learning voxel-wise correction,\nRISH mapping, and NeSH methods effectively reduce these biases. In addition,\nmicrostructure measures AD, MD, RD, bundle length, connectome density,\nefficiency, and path length are least biased by these acquisition differences.\n","authors":["Nancy R. Newlin","Kurt Schilling","Serge Koudoro","Bramsh Qamar Chandio","Praitayini Kanakaraj","Daniel Moyer","Claire E. Kelly","Sila Genc","Jian Chen","Joseph Yuan-Mou Yang","Ye Wu","Yifei He","Jiawei Zhang","Qingrun Zeng","Fan Zhang","Nagesh Adluru","Vishwesh Nath","Sudhir Pathak","Walter Schneider","Anurag Gade","Yogesh Rathi","Tom Hendriks","Anna Vilanova","Maxime Chamberland","Tomasz Pieciak","Dominika Ciupek","Antonio Tristán Vega","Santiago Aja-Fernández","Maciej Malawski","Gani Ouedraogo","Julia Machnio","Christian Ewert","Paul M. Thompson","Neda Jahanshad","Eleftherios Garyfallidis","Bennett A. 
Landman"],"pdf_url":"https://arxiv.org/pdf/2411.09618v1.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024/019"},{"id":"http://arxiv.org/abs/2411.09612v1","updated":"2024-11-14T17:32:03Z","published":"2024-11-14T17:32:03Z","title":"The Moral Foundations Weibo Corpus","summary":" Moral sentiments expressed in natural language significantly influence both\nonline and offline environments, shaping behavioral styles and interaction\npatterns, including social media selfpresentation, cyberbullying, adherence to\nsocial norms, and ethical decision-making. To effectively measure moral\nsentiments in natural language processing texts, it is crucial to utilize\nlarge, annotated datasets that provide nuanced understanding for accurate\nanalysis and modeltraining. However, existing corpora, while valuable, often\nface linguistic limitations. To address this gap in the Chinese language\ndomain,we introduce the Moral Foundation Weibo Corpus. This corpus consists of\n25,671 Chinese comments on Weibo, encompassing six diverse topic areas. Each\ncomment is manually annotated by at least three systematically trained\nannotators based on ten moral categories derived from a grounded theory of\nmorality. To assess annotator reliability, we present the kappa testresults, a\ngold standard for measuring consistency. 
Additionally, we apply several the\nlatest large language models to supplement the manual annotations, conducting\nanalytical experiments to compare their performance and report baseline results\nfor moral sentiment classification.\n","authors":["Renjie Cao","Miaoyan Hu","Jiahan Wei","Baha Ihnaini"],"pdf_url":"https://arxiv.org/pdf/2411.09612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07104v2","updated":"2024-11-14T17:28:37Z","published":"2024-11-11T16:27:25Z","title":"Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal\n Pushing","summary":" Recently, quadrupedal locomotion has achieved significant success, but their\nmanipulation capabilities, particularly in handling large objects, remain\nlimited, restricting their usefulness in demanding real-world applications such\nas search and rescue, construction, industrial automation, and room\norganization. This paper tackles the task of obstacle-aware, long-horizon\npushing by multiple quadrupedal robots. We propose a hierarchical multi-agent\nreinforcement learning framework with three levels of control. The high-level\ncontroller integrates an RRT planner and a centralized adaptive policy to\ngenerate subgoals, while the mid-level controller uses a decentralized\ngoal-conditioned policy to guide the robots toward these sub-goals. A\npre-trained low-level locomotion policy executes the movement commands. We\nevaluate our method against several baselines in simulation, demonstrating\nsignificant improvements over baseline approaches, with 36.0% higher success\nrates and 24.5% reduction in completion time than the best baseline. 
Our\nframework successfully enables long-horizon, obstacle-aware manipulation tasks\nlike Push-Cuboid and Push-T on Go1 robots in the real world.\n","authors":["Yuming Feng","Chuye Hong","Yaru Niu","Shiqi Liu","Yuxiang Yang","Wenhao Yu","Tingnan Zhang","Jie Tan","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.07104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09600v1","updated":"2024-11-14T17:18:24Z","published":"2024-11-14T17:18:24Z","title":"Latency Optimization in LEO Satellite Communications with Hybrid Beam\n Pattern and Interference Control","summary":" The rapid advancement of low Earth orbit (LEO) satellite communication\nsystems has significantly enhanced global connectivity, offering high-capacity,\nlow-latency services crucial for next-generation applications. However, the\ndense configuration of LEO constellations poses challenges in resource\nallocation optimization and interference management, complicating coexistence\nwith other communication systems. To address these limitations, this paper\nproposes a novel framework for optimizing the beam scheduling and resource\nallocation in multi-beam LEO systems. To satisfy the uneven terrestrial traffic\ndemand, a hybrid beam pattern is employed to enhance the downlink quality of\nservice and minimize the transmission latency from LEO satellites to ground\nuser terminals. Additionally, a dynamic co-channel interference (CCI) control\nmechanism is developed to mitigate inter-beam interference within the LEO\nconstellation and limit cross-system interference affecting protected users\nfrom other networks. 
The problem of user-beam-frequency allocation with power\noptimization is formulated as a mixed-integer dynamic programming model and\nsolved using a low-complexity neural network-based graph generation algorithm.\nSimulation results show that the proposed approach outperforms the baseline\nmethods of full frequency reuse and single-channel transmission, and highlights\nthe potential for further performance improvement with multi-user\ntransmissions.\n","authors":["Qianqian Zhang","Ye Hu","Minchae Jung"],"pdf_url":"https://arxiv.org/pdf/2411.09600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09595v1","updated":"2024-11-14T17:08:23Z","published":"2024-11-14T17:08:23Z","title":"LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models","summary":" This work explores expanding the capabilities of large language models (LLMs)\npretrained on text to generate 3D meshes within a unified model. This offers\nkey advantages of (1) leveraging spatial knowledge already embedded in LLMs,\nderived from textual sources like 3D tutorials, and (2) enabling conversational\n3D generation and mesh understanding. A primary challenge is effectively\ntokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly.\nTo address this, we introduce LLaMA-Mesh, a novel approach that represents the\nvertex coordinates and face definitions of 3D meshes as plain text, allowing\ndirect integration with LLMs without expanding the vocabulary. We construct a\nsupervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate\n3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs\nas required, and (3) understand and interpret 3D meshes. Our work is the first\nto demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge\nfor 3D mesh generation in a text-based format, effectively unifying the 3D and\ntext modalities. 
LLaMA-Mesh achieves mesh generation quality on par with models\ntrained from scratch while maintaining strong text generation performance.\n","authors":["Zhengyi Wang","Jonathan Lorraine","Yikai Wang","Hang Su","Jun Zhu","Sanja Fidler","Xiaohui Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09595v1.pdf","comment":"See the project website at\n https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/"},{"id":"http://arxiv.org/abs/2410.18958v2","updated":"2024-11-14T17:06:55Z","published":"2024-10-24T17:55:52Z","title":"Stable Consistency Tuning: Understanding and Improving Consistency\n Models","summary":" Diffusion models achieve superior generation quality but suffer from slow\ngeneration speed due to the iterative nature of denoising. In contrast,\nconsistency models, a new generative family, achieve competitive performance\nwith significantly faster sampling. These models are trained either through\nconsistency distillation, which leverages pretrained diffusion models, or\nconsistency training/tuning directly from raw data. In this work, we propose a\nnovel framework for understanding consistency models by modeling the denoising\nprocess of the diffusion model as a Markov Decision Process (MDP) and framing\nconsistency model training as the value estimation through Temporal\nDifference~(TD) Learning. More importantly, this framework allows us to analyze\nthe limitations of current consistency training/tuning strategies. Built upon\nEasy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT),\nwhich incorporates variance-reduced learning using the score identity. SCT\nleads to significant performance improvements on benchmarks such as CIFAR-10\nand ImageNet-64. 
On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID\n1.55, a new SoTA for consistency models.\n","authors":["Fu-Yun Wang","Zhengyang Geng","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2410.18958v2.pdf","comment":"Code is available at\n https://github.com/G-U-N/Stable-Consistency-Tuning"},{"id":"http://arxiv.org/abs/2411.09591v1","updated":"2024-11-14T17:02:41Z","published":"2024-11-14T17:02:41Z","title":"Expert Study on Interpretable Machine Learning Models with Missing Data","summary":" Inherently interpretable machine learning (IML) models provide valuable\ninsights for clinical decision-making but face challenges when features have\nmissing values. Classical solutions like imputation or excluding incomplete\nrecords are often unsuitable in applications where values are missing at test\ntime. In this work, we conducted a survey with 71 clinicians from 29 trauma\ncenters across France, including 20 complete responses to study the interaction\nbetween medical professionals and IML applied to data with missing values. This\nprovided valuable insights into how missing data is interpreted in clinical\nmachine learning. We used the prediction of hemorrhagic shock as a concrete\nexample to gauge the willingness and readiness of the participants to adopt IML\nmodels from three classes of methods. Our findings show that, while clinicians\nvalue interpretability and are familiar with common IML methods, classical\nimputation techniques often misalign with their intuition, and that models that\nnatively handle missing values are preferred. These results emphasize the need\nto integrate clinical intuition into future IML models for better\nhuman-computer interaction.\n","authors":["Lena Stempfle","Arthur James","Julie Josse","Tobias Gauss","Fredrik D. 
Johansson"],"pdf_url":"https://arxiv.org/pdf/2411.09591v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 13 pages"},{"id":"http://arxiv.org/abs/2407.16677v3","updated":"2024-11-14T16:54:02Z","published":"2024-07-23T17:44:54Z","title":"From Imitation to Refinement -- Residual RL for Precise Assembly","summary":" Advances in behavior cloning (BC), like action-chunking and diffusion, have\nenabled impressive capabilities. Still, imitation alone remains insufficient\nfor learning reliable policies for tasks requiring precise aligning and\ninserting of objects, like assembly. Our key insight is that chunked BC\npolicies effectively function as trajectory planners, enabling long-horizon\ntasks. Conversely, as they execute action chunks open-loop, they lack the\nfine-grained reactivity necessary for reliable execution. Further, we find that\nthe performance of BC policies saturates despite increasing data. Reinforcement\nlearning (RL) is a natural way to overcome BC's limitations, but it is not\nstraightforward to apply directly to action-chunked models like diffusion\npolicies. We present a simple yet effective method, ResiP (Residual for Precise\nManipulation), that sidesteps these challenges by augmenting a frozen, chunked\nBC model with a fully closed-loop residual policy trained with RL. The residual\npolicy is trained via on-policy RL, addressing distribution shifts and\nintroducing reactive control without altering the BC trajectory planner.\nEvaluation on high-precision manipulation tasks demonstrates strong performance\nof ResiP over BC methods and direct RL fine-tuning. 
Videos, code, and data are\navailable at https://residual-assembly.github.io.\n","authors":["Lars Ankile","Anthony Simeonov","Idan Shenfeld","Marcel Torne","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2407.16677v3.pdf","comment":"Project website: https://residual-assembly.github.io"},{"id":"http://arxiv.org/abs/2402.02681v3","updated":"2024-11-14T16:30:13Z","published":"2024-02-05T02:35:11Z","title":"Equivariant Symmetry Breaking Sets","summary":" Equivariant neural networks (ENNs) have been shown to be extremely effective\nin applications involving underlying symmetries. By construction ENNs cannot\nproduce lower symmetry outputs given a higher symmetry input. However, symmetry\nbreaking occurs in many physical systems and we may obtain a less symmetric\nstable state from an initial highly symmetric one. Hence, it is imperative that\nwe understand how to systematically break symmetry in ENNs. In this work, we\npropose a novel symmetry breaking framework that is fully equivariant and is\nthe first which fully addresses spontaneous symmetry breaking. We emphasize\nthat our approach is general and applicable to equivariance under any group. To\nachieve this, we introduce the idea of symmetry breaking sets (SBS). Rather\nthan redesign existing networks, we design sets of symmetry breaking objects\nwhich we feed into our network based on the symmetry of our inputs and outputs.\nWe show there is a natural way to define equivariance on these sets, which\ngives an additional constraint. Minimizing the size of these sets equates to\ndata efficiency. We prove that minimizing these sets translates to a well\nstudied group theory problem, and tabulate solutions to this problem for the\npoint groups. Finally, we provide some examples of symmetry breaking to\ndemonstrate how our approach works in practice. 
The code for these examples is\navailable at \\url{https://github.com/atomicarchitects/equivariant-SBS}.\n","authors":["YuQing Xie","Tess Smidt"],"pdf_url":"https://arxiv.org/pdf/2402.02681v3.pdf","comment":"50 pages, 19 figures Published in Transactions on Machine Learning\n Research, October 2024"},{"id":"http://arxiv.org/abs/2411.01881v2","updated":"2024-11-14T16:17:40Z","published":"2024-11-04T08:24:56Z","title":"Causal Discovery and Classification Using Lempel-Ziv Complexity","summary":" Inferring causal relationships in the decision-making processes of machine\nlearning algorithms is a crucial step toward achieving explainable Artificial\nIntelligence (AI). In this research, we introduce a novel causality measure and\na distance metric derived from Lempel-Ziv (LZ) complexity. We explore how the\nproposed causality measure can be used in decision trees by enabling splits\nbased on features that most strongly \\textit{cause} the outcome. We further\nevaluate the effectiveness of the causality-based decision tree and the\ndistance-based decision tree in comparison to a traditional decision tree using\nGini impurity. While the proposed methods demonstrate comparable classification\nperformance overall, the causality-based decision tree significantly\noutperforms both the distance-based decision tree and the Gini-based decision\ntree on datasets generated from causal models. This result indicates that the\nproposed approach can capture insights beyond those of classical decision\ntrees, especially in causally structured data. 
Based on the features used in\nthe LZ causal measure based decision tree, we introduce a causal strength for\neach features in the dataset so as to infer the predominant causal variables\nfor the occurrence of the outcome.\n","authors":[" Dhruthi","Nithin Nagaraj","Harikrishnan N B"],"pdf_url":"https://arxiv.org/pdf/2411.01881v2.pdf","comment":"17 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2411.06503v2","updated":"2024-11-14T16:15:20Z","published":"2024-11-10T15:57:53Z","title":"Diffusion Sampling Correction via Approximately 10 Parameters","summary":" Diffusion Probabilistic Models (DPMs) have demonstrated exceptional\nperformance in generative tasks, but this comes at the expense of sampling\nefficiency. To enhance sampling speed without sacrificing quality, various\ndistillation-based accelerated sampling algorithms have been recently proposed.\nHowever, they typically require significant additional training costs and model\nparameter storage, which limit their practical application. In this work, we\npropose PCA-based Adaptive Search (PAS), which optimizes existing solvers for\nDPMs with minimal learnable parameters and training costs. Specifically, we\nfirst employ PCA to obtain a few orthogonal unit basis vectors to span the\nhigh-dimensional sampling space, which enables us to learn just a set of\ncoordinates to correct the sampling direction; furthermore, based on the\nobservation that the cumulative truncation error exhibits an ``S''-shape, we\ndesign an adaptive search strategy that further enhances the sampling\nefficiency and reduces the number of stored parameters to approximately 10.\nExtensive experiments demonstrate that PAS can significantly enhance existing\nfast solvers in a plug-and-play manner with negligible costs. 
For instance, on\nCIFAR10, PAS requires only 12 parameters and less than 1 minute of training on\na single NVIDIA A100 GPU to optimize the DDIM from 15.69 FID (NFE=10) to 4.37.\n","authors":["Guangyi Wang","Wei Peng","Lijiang Li","Wenyu Chen","Yuren Cai","Songzhi Su"],"pdf_url":"https://arxiv.org/pdf/2411.06503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09558v1","updated":"2024-11-14T16:10:15Z","published":"2024-11-14T16:10:15Z","title":"Adaptive Deviation Learning for Visual Anomaly Detection with Data\n Contamination","summary":" Visual anomaly detection targets to detect images that notably differ from\nnormal pattern, and it has found extensive application in identifying defective\nparts within the manufacturing industry. These anomaly detection paradigms\npredominantly focus on training detection models using only clean, unlabeled\nnormal samples, assuming an absence of contamination; a condition often unmet\nin real-world scenarios. The performance of these methods significantly depends\non the quality of the data and usually decreases when exposed to noise. We\nintroduce a systematic adaptive method that employs deviation learning to\ncompute anomaly scores end-to-end while addressing data contamination by\nassigning relative importance to the weights of individual instances. In this\napproach, the anomaly scores for normal instances are designed to approximate\nscalar scores obtained from the known prior distribution. Meanwhile, anomaly\nscores for anomaly examples are adjusted to exhibit statistically significant\ndeviations from these reference scores. Our approach incorporates a constrained\noptimization problem within the deviation learning framework to update instance\nweights, resolving this problem for each mini-batch. 
Comprehensive experiments\non the MVTec and VisA benchmark datasets indicate that our proposed method\nsurpasses competing techniques and exhibits both stability and robustness in\nthe presence of data contamination.\n","authors":["Anindya Sundar Das","Guansong Pang","Monowar Bhuyan"],"pdf_url":"https://arxiv.org/pdf/2411.09558v1.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV 2025)"},{"id":"http://arxiv.org/abs/2411.09545v1","updated":"2024-11-14T15:59:41Z","published":"2024-11-14T15:59:41Z","title":"Equation-informed data-driven identification of flow budgets and\n dynamics","summary":" Computational Fluid Dynamics (CFD) is an indispensable method of fluid\nmodelling in engineering applications, reducing the need for physical\nprototypes and testing for tasks such as design optimisation and performance\nanalysis. Depending on the complexity of the system under consideration, models\nranging from low to high fidelity can be used for prediction, allowing\nsignificant speed-up. However, the choice of model requires information about\nthe actual dynamics of the flow regime. Correctly identifying the\nregions/clusters of flow that share the same dynamics has been a challenging\nresearch topic to date. In this study, we propose a novel hybrid approach to\nflow clustering. It consists of characterising each sample point of the system\nwith equation-based features, i.e. features are budgets that represent the\ncontribution of each term from the original governing equation to the local\ndynamics at each sample point. This was achieved by applying the Sparse\nIdentification of Nonlinear Dynamical systems (SINDy) method pointwise to time\nevolution data. The method proceeds with equation-based clustering using the\nGirvan-Newman algorithm. This allows the detection of communities that share\nthe same physical dynamics. The algorithm is implemented in both Eulerian and\nLagrangian frameworks. In the Lagrangian, i.e. 
dynamic approach, the clustering\nis performed on the trajectory of each point, allowing the change of clusters\nto be represented also in time. The performance of the algorithm is first\ntested on a flow around a cylinder. The construction of the dynamic clusters in\nthis test case clearly shows the evolution of the wake from the steady state\nsolution through the transient to the oscillatory solution. Dynamic clustering\nwas then successfully tested on turbulent flow data. Two distinct and\nwell-defined clusters were identified and their temporal evolution was\nreconstructed.\n","authors":["Nataliya Sevryugina","Serena Costanzo","Steve de Bruyn Kops","Colm-cille Caulfield","Iraj Mortazavi","Taraneh Sayadi"],"pdf_url":"https://arxiv.org/pdf/2411.09545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09540v1","updated":"2024-11-14T15:56:11Z","published":"2024-11-14T15:56:11Z","title":"Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models","summary":" Visual prompting (VP) is a new technique that adapts well-trained frozen\nmodels for source domain tasks to target domain tasks. This study examines VP's\nbenefits for black-box model-level backdoor detection. The visual prompt in VP\nmaps class subspaces between source and target domains. We identify a\nmisalignment, termed class subspace inconsistency, between clean and poisoned\ndatasets. Based on this, we introduce \\textsc{BProm}, a black-box model-level\ndetection method to identify backdoors in suspicious models, if any.\n\\textsc{BProm} leverages the low classification accuracy of prompted models\nwhen backdoors are present. 
Extensive experiments confirm \\textsc{BProm}'s\neffectiveness.\n","authors":["Zi-Xuan Huang","Jia-Wei Chen","Zhi-Peng Zhang","Chia-Mu Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09539v1","updated":"2024-11-14T15:55:37Z","published":"2024-11-14T15:55:37Z","title":"A Practical Guide to Fine-tuning Language Models with Limited Data","summary":" Employing pre-trained Large Language Models (LLMs) has become the de facto\nstandard in Natural Language Processing (NLP) despite their extensive data\nrequirements. Motivated by the recent surge in research focused on training\nLLMs with limited data, particularly in low-resource domains and languages,\nthis paper surveys recent transfer learning approaches to optimize model\nperformance in downstream tasks where data is scarce. We first address initial\nand continued pre-training strategies to better leverage prior knowledge in\nunseen domains and languages. We then examine how to maximize the utility of\nlimited data during fine-tuning and few-shot learning. The final section takes\na task-specific perspective, reviewing models and methods suited for different\nlevels of data scarcity. Our goal is to provide practitioners with practical\nguidelines for overcoming the challenges posed by constrained data while also\nhighlighting promising directions for future research.\n","authors":["Márton Szép","Daniel Rueckert","Rüdiger von Eisenhart-Rothe","Florian Hinterwimmer"],"pdf_url":"https://arxiv.org/pdf/2411.09539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09517v1","updated":"2024-11-14T15:28:40Z","published":"2024-11-14T15:28:40Z","title":"Randomized Truthful Auctions with Learning Agents","summary":" We study a setting where agents use no-regret learning algorithms to\nparticipate in repeated auctions. 
\\citet{kolumbus2022auctions} showed, rather\nsurprisingly, that when bidders participate in second-price auctions using\nno-regret bidding algorithms, no matter how large the number of interactions\n$T$ is, the runner-up bidder may not converge to bidding truthfully. Our first\nresult shows that this holds for \\emph{general deterministic} truthful\nauctions. We also show that the ratio of the learning rates of the bidders can\n\\emph{qualitatively} affect the convergence of the bidders. Next, we consider\nthe problem of revenue maximization in this environment. In the setting with\nfully rational bidders, \\citet{myerson1981optimal} showed that revenue can be\nmaximized by using a second-price auction with reserves.We show that, in stark\ncontrast, in our setting with learning bidders, \\emph{randomized} auctions can\nhave strictly better revenue guarantees than second-price auctions with\nreserves, when $T$ is large enough. Finally, we study revenue maximization in\nthe non-asymptotic regime. We define a notion of {\\em auctioneer regret}\ncomparing the revenue generated to the revenue of a second price auction with\ntruthful bids. 
When the auctioneer has to use the same auction throughout the\ninteraction, we show an (almost) tight regret bound of $\\smash{\\widetilde\n\\Theta(T^{3/4})}.$ If the auctioneer can change auctions during the\ninteraction, but in a way that is oblivious to the bids, we show an (almost)\ntight bound of $\\smash{\\widetilde \\Theta(\\sqrt{T})}.$\n","authors":["Gagan Aggarwal","Anupam Gupta","Andres Perlroth","Grigoris Velegkas"],"pdf_url":"https://arxiv.org/pdf/2411.09517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09512v1","updated":"2024-11-14T15:26:10Z","published":"2024-11-14T15:26:10Z","title":"GAN-Based Architecture for Low-dose Computed Tomography Imaging\n Denoising","summary":" Generative Adversarial Networks (GANs) have surfaced as a revolutionary\nelement within the domain of low-dose computed tomography (LDCT) imaging,\nproviding an advanced resolution to the enduring issue of reconciling radiation\nexposure with image quality. This comprehensive review synthesizes the rapid\nadvancements in GAN-based LDCT denoising techniques, examining the evolution\nfrom foundational architectures to state-of-the-art models incorporating\nadvanced features such as anatomical priors, perceptual loss functions, and\ninnovative regularization strategies. We critically analyze various GAN\narchitectures, including conditional GANs (cGANs), CycleGANs, and\nSuper-Resolution GANs (SRGANs), elucidating their unique strengths and\nlimitations in the context of LDCT denoising. The evaluation provides both\nqualitative and quantitative results related to the improvements in performance\nin benchmark and clinical datasets with metrics such as PSNR, SSIM, and LPIPS.\nAfter highlighting the positive results, we discuss some of the challenges\npreventing a wider clinical use, including the interpretability of the images\ngenerated by GANs, synthetic artifacts, and the need for clinically relevant\nmetrics. 
The review concludes by highlighting the essential significance of\nGAN-based methodologies in the progression of precision medicine via tailored\nLDCT denoising models, underlining the transformative possibilities presented\nby artificial intelligence within contemporary radiological practice.\n","authors":["Yunuo Wang","Ningning Yang","Jialin Li"],"pdf_url":"https://arxiv.org/pdf/2411.09512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09510v1","updated":"2024-11-14T15:19:01Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong-Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09502v1","updated":"2024-11-14T15:13:13Z","published":"2024-11-14T15:13:13Z","title":"Golden Noise for Diffusion Models: A Learning Framework","summary":" Text-to-image diffusion model is a popular paradigm that synthesizes\npersonalized images by providing a text prompt and a random Gaussian noise.\nWhile people observe that some noises are ``golden noises'' that can achieve\nbetter text-image alignment and higher human preference than others, we still\nlack a machine learning framework to obtain those golden noises. 
To learn\ngolden noises for diffusion sampling, we mainly make three contributions in\nthis paper. First, we identify a new concept termed the \\textit{noise prompt},\nwhich aims at turning a random Gaussian noise into a golden noise by adding a\nsmall desirable perturbation derived from the text prompt. Following the\nconcept, we first formulate the \\textit{noise prompt learning} framework that\nsystematically learns ``prompted'' golden noise associated with a text prompt\nfor diffusion models. Second, we design a noise prompt data collection pipeline\nand collect a large-scale \\textit{noise prompt dataset}~(NPD) that contains\n100k pairs of random noises and golden noises with the associated text prompts.\nWith the prepared NPD as the training dataset, we trained a small \\textit{noise\nprompt network}~(NPNet) that can directly learn to transform a random noise\ninto a golden noise. The learned golden noise perturbation can be considered as\na kind of prompt for noise, as it is rich in semantic information and tailored\nto the given text prompt. Third, our extensive experiments demonstrate the\nimpressive effectiveness and generalization of NPNet on improving the quality\nof synthesized images across various diffusion models, including SDXL,\nDreamShaper-xl-v2-turbo, and Hunyuan-DiT. 
Moreover, NPNet is a small and\nefficient controller that acts as a plug-and-play module with very limited\nadditional inference and computational costs, as it just provides a golden\nnoise instead of a random noise without accessing the original pipeline.\n","authors":["Zikai Zhou","Shitong Shao","Lichen Bai","Zhiqiang Xu","Bo Han","Zeke Xie"],"pdf_url":"https://arxiv.org/pdf/2411.09502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09499v1","updated":"2024-11-14T15:06:50Z","published":"2024-11-14T15:06:50Z","title":"Developement of Reinforcement Learning based Optimisation Method for\n Side-Sill Design","summary":" Optimisation for crashworthiness is a critical part of the vehicle\ndevelopment process. Due to stringent regulations and increasing market\ndemands, multiple factors must be considered within a limited timeframe.\nHowever, for optimal crashworthiness design, multiobjective optimisation is\nnecessary, and for complex parts, multiple design parameters must be evaluated.\nThis crashworthiness analysis requires computationally intensive finite element\nsimulations. This challenge leads to the need for inverse multi-parameter\nmulti-objective optimisation. This challenge leads to the need for\nmulti-parameter, multi-objective inverse optimisation. This article\ninvestigates a machine learning-based method for this type of optimisation,\nfocusing on the design optimisation of a multi-cell side sill to improve\ncrashworthiness results. Furthermore, the optimiser is coupled with an FE\nsolver to achieve improved results.\n","authors":["Aditya Borse","Rutwik Gulakala","Marcus Stoffel"],"pdf_url":"https://arxiv.org/pdf/2411.09499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03648v3","updated":"2024-11-14T15:06:12Z","published":"2023-08-07T14:58:53Z","title":"Generative Forests","summary":" We focus on generative AI for a type of data that still represent one of the\nmost prevalent form of data: tabular data. 
Our paper introduces two key\ncontributions: a new powerful class of forest-based models fit for such tasks\nand a simple training algorithm with strong convergence guarantees in a\nboosting model that parallels that of the original weak / strong supervised\nlearning setting. This algorithm can be implemented by a few tweaks to the most\npopular induction scheme for decision tree induction (i.e. supervised learning)\nwith two classes. Experiments on the quality of generated data display\nsubstantial improvements compared to the state of the art. The losses our\nalgorithm minimize and the structure of our models make them practical for\nrelated tasks that require fast estimation of a density given a generative\nmodel and an observation (even partially specified): such tasks include missing\ndata imputation and density estimation. Additional experiments on these tasks\nreveal that our models can be notably good contenders to diverse state of the\nart methods, relying on models as diverse as (or mixing elements of) trees,\nneural nets, kernels or graphical models.\n","authors":["Richard Nock","Mathieu Guillame-Bert"],"pdf_url":"https://arxiv.org/pdf/2308.03648v3.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2410.01440v3","updated":"2024-11-14T15:04:33Z","published":"2024-10-02T11:42:49Z","title":"Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence\n Modeling","summary":" In the endeavor to make autonomous robots take actions, task planning is a\nmajor challenge that requires translating high-level task descriptions into\nlong-horizon action sequences. Despite recent advances in language model\nagents, they remain prone to planning errors and limited in their ability to\nplan ahead. To address these limitations in robotic planning, we advocate a\nself-refining scheme that iteratively refines a draft plan until an equilibrium\nis reached. 
Remarkably, this process can be optimized end-to-end from an\nanalytical perspective without the need to curate additional verifiers or\nreward models, allowing us to train self-refining planners in a simple\nsupervised learning fashion. Meanwhile, a nested equilibrium sequence modeling\nprocedure is devised for efficient closed-loop planning that incorporates\nuseful feedback from the environment (or an internal world model). Our method\nis evaluated on the VirtualHome-Env benchmark, showing advanced performance\nwith better scaling for inference computation. Code is available at\nhttps://github.com/Singularity0104/equilibrium-planner.\n","authors":["Jinghan Li","Zhicheng Sun","Fei Li","Cao Sheng","Jiazhong Yu","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2410.01440v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24006v2","updated":"2024-11-14T14:58:26Z","published":"2024-10-31T15:09:36Z","title":"DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination","summary":" In the ever-evolving adversarial machine learning landscape, developing\neffective defenses against patch attacks has become a critical challenge,\nnecessitating reliable solutions to safeguard real-world AI systems. Although\ndiffusion models have shown remarkable capacity in image synthesis and have\nbeen recently utilized to counter $\\ell_p$-norm bounded attacks, their\npotential in mitigating localized patch attacks remains largely underexplored.\nIn this work, we propose DiffPAD, a novel framework that harnesses the power of\ndiffusion models for adversarial patch decontamination. DiffPAD first performs\nsuper-resolution restoration on downsampled input images, then adopts\nbinarization, dynamic thresholding scheme and sliding window for effective\nlocalization of adversarial patches. Such a design is inspired by the\ntheoretically derived correlation between patch size and diffusion restoration\nerror that is generalized across diverse patch attack scenarios. 
Finally,\nDiffPAD applies inpainting techniques to the original input images with the\nestimated patch region being masked. By integrating closed-form solutions for\nsuper-resolution restoration and image inpainting into the conditional reverse\nsampling process of a pre-trained diffusion model, DiffPAD obviates the need\nfor text guidance or fine-tuning. Through comprehensive experiments, we\ndemonstrate that DiffPAD not only achieves state-of-the-art adversarial\nrobustness against patch attacks but also excels in recovering naturalistic\nimages without patch remnants. The source code is available at\nhttps://github.com/JasonFu1998/DiffPAD.\n","authors":["Jia Fu","Xiao Zhang","Sepideh Pashami","Fatemeh Rahimian","Anders Holst"],"pdf_url":"https://arxiv.org/pdf/2410.24006v2.pdf","comment":"Accepted to 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2411.09483v1","updated":"2024-11-14T14:37:47Z","published":"2024-11-14T14:37:47Z","title":"Sparse Bayesian Generative Modeling for Compressive Sensing","summary":" This work addresses the fundamental linear inverse problem in compressive\nsensing (CS) by introducing a new type of regularizing generative prior. Our\nproposed method utilizes ideas from classical dictionary-based CS and, in\nparticular, sparse Bayesian learning (SBL), to integrate a strong\nregularization towards sparse solutions. At the same time, by leveraging the\nnotion of conditional Gaussianity, it also incorporates the adaptability from\ngenerative models to training data. However, unlike most state-of-the-art\ngenerative models, it is able to learn from a few compressed and noisy data\nsamples and requires no optimization algorithm for solving the inverse problem.\nAdditionally, similar to Dirichlet prior networks, our model parameterizes a\nconjugate prior enabling its application for uncertainty quantification. 
We\nsupport our approach theoretically through the concept of variational inference\nand validate it empirically using different types of compressible signals.\n","authors":["Benedikt Böck","Sadaf Syed","Wolfgang Utschick"],"pdf_url":"https://arxiv.org/pdf/2411.09483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09481v1","updated":"2024-11-14T14:37:15Z","published":"2024-11-14T14:37:15Z","title":"What makes a good BIM design: quantitative linking between design\n behavior and quality","summary":" In the Architecture Engineering & Construction (AEC) industry, how design\nbehaviors impact design quality remains unclear. This study proposes a novel\napproach, which, for the first time, identifies and quantitatively describes\nthe relationship between design behaviors and quality of design based on\nBuilding Information Modeling (BIM). Real-time collection and log mining are\nintegrated to collect raw data of design behaviors. Feature engineering and\nvarious machine learning models are then utilized for quantitative modeling and\ninterpretation. Results confirm an existing quantifiable relationship which can\nbe learned by various models. The best-performing model using Extremely Random\nTrees achieved an R2 value of 0.88 on the test set. Behavioral features related\nto designer's skill level and changes of design intentions are identified to\nhave significant impacts on design quality. 
These findings deepen our\nunderstanding of the design process and help forming BIM designs with better\nquality.\n","authors":["Xiang-Rui Ni","Peng Pan","Jia-Rui Lin"],"pdf_url":"https://arxiv.org/pdf/2411.09481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09476v1","updated":"2024-11-14T14:31:52Z","published":"2024-11-14T14:31:52Z","title":"Graph Neural Networks and Differential Equations: A hybrid approach for\n data assimilation of fluid flows","summary":" This study presents a novel hybrid approach that combines Graph Neural\nNetworks (GNNs) with Reynolds-Averaged Navier Stokes (RANS) equations to\nenhance the accuracy of mean flow reconstruction across a range of fluid\ndynamics applications. Traditional purely data-driven Neural Networks (NNs)\nmodels, often struggle maintaining physical consistency. Moreover, they\ntypically require large datasets to achieve reliable performances. The GNN\nframework, which naturally handles unstructured data such as complex geometries\nin Computational Fluid Dynamics (CFD), is here integrated with RANS equations\nas a physical baseline model. The methodology leverages the adjoint method,\nenabling the use of RANS-derived gradients as optimization terms in the GNN\ntraining process. This ensures that the learned model adheres to the governing\nphysics, maintaining physical consistency while improving the prediction\naccuracy. We test our approach on multiple CFD scenarios, including cases\ninvolving generalization with respect to the Reynolds number, sparse\nmeasurements, denoising and inpainting of missing portions of the mean flow.\nThe results demonstrate significant improvements in the accuracy of the\nreconstructed mean flow compared to purely data-driven models, using limited\namounts of data in the training dataset. 
The key strengths of this study are\nthe integration of physical laws into the training process of the GNN, and the\nability to achieve high-accuracy predictions with a limited amount of data,\nmaking this approach particularly valuable for applications in fluid dynamics\nwhere data is often scarce.\n","authors":["M. Quattromini","M. A. Bucci","S. Cherubini","O. Semeraro"],"pdf_url":"https://arxiv.org/pdf/2411.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09475v1","updated":"2024-11-14T14:31:30Z","published":"2024-11-14T14:31:30Z","title":"ResidualDroppath: Enhancing Feature Reuse over Residual Connections","summary":" Residual connections are one of the most important components in neural\nnetwork architectures for mitigating the vanishing gradient problem and\nfacilitating the training of much deeper networks. One possible explanation for\nhow residual connections aid deeper network training is by promoting feature\nreuse. However, we identify and analyze the limitations of feature reuse with\nvanilla residual connections. To address these limitations, we propose\nmodifications in training methods. Specifically, we provide an additional\nopportunity for the model to learn feature reuse with residual connections\nthrough two types of iterations during training. The first type of iteration\ninvolves using droppath, which enforces feature reuse by randomly dropping a\nsubset of layers. The second type of iteration focuses on training the dropped\nparts of the model while freezing the undropped parts. As a result, the dropped\nparts learn in a way that encourages feature reuse, as the model relies on the\nundropped parts with feature reuse in mind. 
Overall, we demonstrated\nperformance improvements in models with residual connections for image\nclassification in certain cases.\n","authors":["Sejik Park"],"pdf_url":"https://arxiv.org/pdf/2411.09475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02407v3","updated":"2024-11-14T14:26:42Z","published":"2024-08-05T12:01:42Z","title":"Terracorder: Sense Long and Prosper","summary":" In-situ sensing devices need to be deployed in remote environments for long\nperiods of time; minimizing their power consumption is vital for maximising\nboth their operational lifetime and coverage. We introduce Terracorder -- a\nversatile multi-sensor device -- and showcase its exceptionally low power\nconsumption using an on-device reinforcement learning scheduler. We prototype a\nunique device setup for biodiversity monitoring and compare its battery life\nusing our scheduler against a number of fixed schedules; the scheduler captures\nmore than 80% of events at less than 50% of the number of activations of the\nbest-performing fixed schedule. We then explore how a collaborative scheduler\ncan maximise the useful operation of a network of devices, improving overall\nnetwork power consumption and robustness.\n","authors":["Josh Millar","Sarab Sethi","Hamed Haddadi","Anil Madhavapeddy"],"pdf_url":"https://arxiv.org/pdf/2408.02407v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.01013v2","updated":"2024-11-14T14:23:29Z","published":"2024-11-01T20:33:49Z","title":"A Similarity-Based Oversampling Method for Multi-label Imbalanced Text\n Data","summary":" In real-world applications, as data availability increases, obtaining labeled\ndata for machine learning (ML) projects remains challenging due to the high\ncosts and intensive efforts required for data annotation. Many ML projects,\nparticularly those focused on multi-label classification, also grapple with\ndata imbalance issues, where certain classes may lack sufficient data to train\neffective classifiers. 
This study introduces and examines a novel oversampling\nmethod for multi-label text classification, designed to address performance\nchallenges associated with data imbalance. The proposed method identifies\npotential new samples from unlabeled data by leveraging similarity measures\nbetween instances. By iteratively searching the unlabeled dataset, the method\nlocates instances similar to those in underrepresented classes and evaluates\ntheir contribution to classifier performance enhancement. Instances that\ndemonstrate performance improvement are then added to the labeled dataset.\nExperimental results indicate that the proposed approach effectively enhances\nclassifier performance post-oversampling.\n","authors":["Ismail Hakki Karaman","Gulser Koksal","Levent Eriskin","Salih Salihoglu"],"pdf_url":"https://arxiv.org/pdf/2411.01013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09471v1","updated":"2024-11-14T14:21:49Z","published":"2024-11-14T14:21:49Z","title":"Renal Cell Carcinoma subtyping: learning from multi-resolution\n localization","summary":" Renal Cell Carcinoma is typically asymptomatic at the early stages for many\npatients. This leads to a late diagnosis of the tumor, where the curability\nlikelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high,\nwith respect to its incidence rate. To increase the survival chance, a fast and\ncorrect categorization of the tumor subtype is paramount. Nowadays,\ncomputerized methods, based on artificial intelligence, represent an\ninteresting opportunity to improve the productivity and the objectivity of the\nmicroscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their\nexploitation is hampered by the paucity of annotated dataset, essential for a\nproficient training of supervised machine learning technologies. 
This study\nsets out to investigate a novel self supervised training strategy for machine\nlearning diagnostic tools, based on the multi-resolution nature of the\nhistological samples. We aim at reducing the need of annotated dataset, without\nsignificantly reducing the accuracy of the tool. We demonstrate the\nclassification capability of our tool on a whole slide imaging dataset for\nRenal Cancer subtyping, and we compare our solution with several\nstate-of-the-art classification counterparts.\n","authors":["Mohamad Mohamad","Francesco Ponzio","Santa Di Cataldo","Damien Ambrosetti","Xavier Descombes"],"pdf_url":"https://arxiv.org/pdf/2411.09471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09468v1","updated":"2024-11-14T14:16:50Z","published":"2024-11-14T14:16:50Z","title":"Harnessing Machine Learning for Single-Shot Measurement of Free Electron\n Laser Pulse Power","summary":" Electron beam accelerators are essential in many scientific and technological\nfields. Their operation relies heavily on the stability and precision of the\nelectron beam. Traditional diagnostic techniques encounter difficulties in\naddressing the complex and dynamic nature of electron beams. Particularly in\nthe context of free-electron lasers (FELs), it is fundamentally impossible to\nmeasure the lasing-on and lasingoff electron power profiles for a single\nelectron bunch. This is a crucial hurdle in the exact reconstruction of the\nphoton pulse profile. To overcome this hurdle, we developed a machine learning\nmodel that predicts the temporal power profile of the electron bunch in the\nlasing-off regime using machine parameters that can be obtained when lasing is\non. The model was statistically validated and showed superior predictions\ncompared to the state-of-the-art batch calibrations. 
The work we present here\nis a critical element for a virtual pulse reconstruction diagnostic (VPRD) tool\ndesigned to reconstruct the power profile of individual photon pulses without\nrequiring repeated measurements in the lasing-off regime. This promises to\nsignificantly enhance the diagnostic capabilities in FELs at large.\n","authors":["Till Korten","Vladimir Rybnikov","Mathias Vogt","Juliane Roensch-Schulenburg","Peter Steinbach","Najmeh Mirian"],"pdf_url":"https://arxiv.org/pdf/2411.09468v1.pdf","comment":"10 pages, 4 figures, Machine Learning and the Physical Sciences\n Workshop, NeurIPS 2024 https://neurips.cc/virtual/2024/100009"},{"id":"http://arxiv.org/abs/2402.03227v4","updated":"2024-11-14T14:11:57Z","published":"2024-02-05T17:38:49Z","title":"IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of\n brain MR images","summary":" In MRI studies, the aggregation of imaging data from multiple acquisition\nsites enhances sample size but may introduce site-related variabilities that\nhinder consistency in subsequent analyses. Deep learning methods for image\ntranslation have emerged as a solution for harmonizing MR images across sites.\nIn this study, we introduce IGUANe (Image Generation with Unified Adversarial\nNetworks), an original 3D model that leverages the strengths of domain\ntranslation and straightforward application of style transfer methods for\nmulticenter brain MR image harmonization. IGUANe extends CycleGAN by\nintegrating an arbitrary number of domains for training through a many-to-one\narchitecture. The framework based on domain pairs enables the implementation of\nsampling strategies that prevent confusion between site-related and biological\nvariabilities. During inference, the model can be applied to any image, even\nfrom an unknown acquisition site, making it a universal generator for\nharmonization. 
Trained on a dataset comprising T1-weighted images from 11\ndifferent scanners, IGUANe was evaluated on data from unseen sites. The\nassessments included the transformation of MR images with traveling subjects,\nthe preservation of pairwise distances between MR images within domains, the\nevolution of volumetric patterns related to age and Alzheimer$'$s disease (AD),\nand the performance in age regression and patient classification tasks.\nComparisons with other harmonization and normalization methods suggest that\nIGUANe better preserves individual information in MR images and is more\nsuitable for maintaining and reinforcing variabilities related to age and AD.\nFuture studies may further assess IGUANe in other multicenter contexts, either\nusing the same model or retraining it for applications to different image\nmodalities. IGUANe is available at\nhttps://github.com/RocaVincent/iguane_harmonization.git.\n","authors":["Vincent Roca","Grégory Kuchcinski","Jean-Pierre Pruvo","Dorian Manouvriez","Renaud Lopes"],"pdf_url":"https://arxiv.org/pdf/2402.03227v4.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.09459v1","updated":"2024-11-14T14:10:31Z","published":"2024-11-14T14:10:31Z","title":"Caravan MultiMet: Extending Caravan with Multiple Weather Nowcasts and\n Forecasts","summary":" The Caravan large-sample hydrology dataset (Kratzert et al., 2023) was\ncreated to standardize and harmonize streamflow data from various regional\ndatasets, combined with globally available meteorological forcing and catchment\nattributes. This community-driven project also allows researchers to\nconveniently extend the dataset for additional basins, as done 6 times to date\n(see https://github.com/kratzert/Caravan/discussions/10). 
We present a novel\nextension to Caravan, focusing on enriching the meteorological forcing data.\nOur extension adds three precipitation nowcast products (CPC, IMERG v07 Early,\nand CHIRPS) and three weather forecast products (ECMWF IFS HRES, GraphCast, and\nCHIRPS-GEFS) to the existing ERA5-Land reanalysis data. The inclusion of\ndiverse data sources, particularly weather forecasts, enables more robust\nevaluation and benchmarking of hydrological models, especially for real-time\nforecasting scenarios. To the best of our knowledge, this extension makes\nCaravan the first large-sample hydrology dataset to incorporate weather\nforecast data, significantly enhancing its capabilities and fostering\nadvancements in hydrological research, benchmarking, and real-time hydrologic\nforecasting. The data is publicly available under a CC-BY-4.0 license on Zenodo\nin two parts (https://zenodo.org/records/14161235,\nhttps://zenodo.org/records/14161281) and on Google Cloud Platform (GCP) - see\nmore under the Data Availability chapter.\n","authors":["Guy Shalev","Frederik Kratzert"],"pdf_url":"https://arxiv.org/pdf/2411.09459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09453v1","updated":"2024-11-14T13:59:01Z","published":"2024-11-14T13:59:01Z","title":"Long-Tailed Object Detection Pre-training: Dynamic Rebalancing\n Contrastive Learning with Dual Reconstruction","summary":" Pre-training plays a vital role in various vision tasks, such as object\nrecognition and detection. Commonly used pre-training methods, which typically\nrely on randomized approaches like uniform or Gaussian distributions to\ninitialize model parameters, often fall short when confronted with long-tailed\ndistributions, especially in detection tasks. This is largely due to extreme\ndata imbalance and the issue of simplicity bias. In this paper, we introduce a\nnovel pre-training framework for object detection, called Dynamic Rebalancing\nContrastive Learning with Dual Reconstruction (2DRCL). 
Our method builds on a\nHolistic-Local Contrastive Learning mechanism, which aligns pre-training with\nobject detection by capturing both global contextual semantics and detailed\nlocal patterns. To tackle the imbalance inherent in long-tailed data, we design\na dynamic rebalancing strategy that adjusts the sampling of underrepresented\ninstances throughout the pre-training process, ensuring better representation\nof tail classes. Moreover, Dual Reconstruction addresses simplicity bias by\nenforcing a reconstruction task aligned with the self-consistency principle,\nspecifically benefiting underrepresented tail classes. Experiments on COCO and\nLVIS v1.0 datasets demonstrate the effectiveness of our method, particularly in\nimproving the mAP/AP scores for tail classes.\n","authors":["Chen-Long Duan","Yong Li","Xiu-Shen Wei","Lin Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09453v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09451v1","updated":"2024-11-14T13:56:02Z","published":"2024-11-14T13:56:02Z","title":"DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous\n Vehicle Testing","summary":" Generating realistic and diverse road scenarios is essential for autonomous\nvehicle testing and validation. Nevertheless, owing to the complexity and\nvariability of real-world road environments, creating authentic and varied\nscenarios for intelligent driving testing is challenging. In this paper, we\npropose DiffRoad, a novel diffusion model designed to produce controllable and\nhigh-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities\nof diffusion models to synthesize road layouts from white noise through an\ninverse denoising process, preserving real-world spatial features. To enhance\nthe quality of generated scenarios, we design the Road-UNet architecture,\noptimizing the balance between backbone and skip connections for high-realism\nscenario generation. 
Furthermore, we introduce a road scenario evaluation\nmodule that screens adequate and reasonable scenarios for intelligent driving\ntesting using two critical metrics: road continuity and road reasonableness.\nExperimental results on multiple real-world datasets demonstrate DiffRoad's\nability to generate realistic and smooth road structures while maintaining the\noriginal distribution. Additionally, the generated scenarios can be fully\nautomated into the OpenDRIVE format, facilitating generalized autonomous\nvehicle simulation testing. DiffRoad provides a rich and diverse scenario\nlibrary for large-scale autonomous vehicle testing and offers valuable insights\nfor future infrastructure designs that are better suited for autonomous\nvehicles.\n","authors":["Junjie Zhou","Lin Wang","Qiang Meng","Xiaofan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09451v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.11166v4","updated":"2024-11-14T13:54:32Z","published":"2023-12-18T13:09:55Z","title":"Volume-Preserving Transformers for Learning Time Series Data with\n Structure","summary":" Two of the many trends in neural network research of the past few years have\nbeen (i) the learning of dynamical systems, especially with recurrent neural\nnetworks such as long short-term memory networks (LSTMs) and (ii) the\nintroduction of transformer neural networks for natural language processing\n(NLP) tasks.\n While some work has been performed on the intersection of these two trends,\nthose efforts were largely limited to using the vanilla transformer directly\nwithout adjusting its architecture for the setting of a physical system.\n In this work we develop a transformer-inspired neural network and use it to\nlearn a dynamical system. We (for the first time) change the activation\nfunction of the attention layer to imbue the transformer with\nstructure-preserving properties to improve long-term stability. 
This is shown\nto be of great advantage when applying the neural network to learning the\ntrajectory of a rigid body.\n","authors":["Benedikt Brantner","Guillaume de Romemont","Michael Kraus","Zeyuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.11166v4.pdf","comment":"Will be published as part of \"Cemracs Proceedings 2023\" (status:\n accepted)"},{"id":"http://arxiv.org/abs/2411.09444v1","updated":"2024-11-14T13:45:22Z","published":"2024-11-14T13:45:22Z","title":"Learning efficient and provably convergent splitting methods","summary":" Splitting methods are widely used for solving initial value problems (IVPs)\ndue to their ability to simplify complicated evolutions into more manageable\nsubproblems which can be solved efficiently and accurately. Traditionally,\nthese methods are derived using analytic and algebraic techniques from\nnumerical analysis, including truncated Taylor series and their Lie algebraic\nanalogue, the Baker--Campbell--Hausdorff formula. These tools enable the\ndevelopment of high-order numerical methods that provide exceptional accuracy\nfor small timesteps. Moreover, these methods often (nearly) conserve important\nphysical invariants, such as mass, unitarity, and energy. However, in many\npractical applications the computational resources are limited. Thus, it is\ncrucial to identify methods that achieve the best accuracy within a fixed\ncomputational budget, which might require taking relatively large timesteps. In\nthis regime, high-order methods derived with traditional methods often exhibit\nlarge errors since they are only designed to be asymptotically optimal. Machine\nLearning techniques offer a potential solution since they can be trained to\nefficiently solve a given IVP with less computational resources. However, they\nare often purely data-driven, come with limited convergence guarantees in the\nsmall-timestep regime and do not necessarily conserve physical invariants. 
In\nthis work, we propose a framework for finding machine learned splitting methods\nthat are computationally efficient for large timesteps and have provable\nconvergence and conservation guarantees in the small-timestep limit. We\ndemonstrate numerically that the learned methods, which by construction\nconverge quadratically in the timestep size, can be significantly more\nefficient than established methods for the Schr\\\"{o}dinger equation if the\ncomputational budget is limited.\n","authors":["L. M. Kreusser","H. E. Lockyer","E. H. Müller","P. Singh"],"pdf_url":"https://arxiv.org/pdf/2411.09444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06651v2","updated":"2024-11-14T13:26:35Z","published":"2024-11-11T01:36:48Z","title":"Machine learning-enabled velocity model building with uncertainty\n quantification","summary":" Accurately characterizing migration velocity models is crucial for a wide\nrange of geophysical applications, from hydrocarbon exploration to monitoring\nof CO2 sequestration projects. Traditional velocity model building methods such\nas Full-Waveform Inversion (FWI) are powerful but often struggle with the\ninherent complexities of the inverse problem, including noise, limited\nbandwidth, receiver aperture and computational constraints. To address these\nchallenges, we propose a scalable methodology that integrates generative\nmodeling, in the form of Diffusion networks, with physics-informed summary\nstatistics, making it suitable for complicated imaging problems including field\ndatasets. By defining these summary statistics in terms of subsurface-offset\nimage volumes for poor initial velocity models, our approach allows for\ncomputationally efficient generation of Bayesian posterior samples for\nmigration velocity models that offer a useful assessment of uncertainty. 
To\nvalidate our approach, we introduce a battery of tests that measure the quality\nof the inferred velocity models, as well as the quality of the inferred\nuncertainties. With modern synthetic datasets, we reconfirm gains from using\nsubsurface-image gathers as the conditioning observable. For complex velocity\nmodel building involving salt, we propose a new iterative workflow that refines\namortized posterior approximations with salt flooding and demonstrate how the\nuncertainty in the velocity model can be propagated to the final product\nreverse time migrated images. Finally, we present a proof of concept on field\ndatasets to show that our method can scale to industry-sized problems.\n","authors":["Rafael Orozco","Huseyin Tuna Erdinc","Yunlin Zeng","Mathias Louboutin","Felix J. Herrmann"],"pdf_url":"https://arxiv.org/pdf/2411.06651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09420v1","updated":"2024-11-14T13:15:27Z","published":"2024-11-14T13:15:27Z","title":"SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph\n Attention for Vision Transformers","summary":" Image classification is a computer vision task where a model analyzes an\nimage to categorize it into a specific label. Vision Transformers (ViT) improve\nthis task by leveraging self-attention to capture complex patterns and long\nrange relationships between image patches. However, a key challenge for ViTs is\nefficiently incorporating multiscale feature representations, which is inherent\nin CNNs through their hierarchical structure. In this paper, we introduce the\nScale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework\nthat addresses this challenge by integrating multi-scale features. Using\nEfficientNet as a backbone, the model extracts multi-scale feature maps, which\nare divided into patches to preserve semantic information. 
These patches are\norganized into a graph based on spatial and feature similarities, with a Graph\nAttention Network (GAT) refining the node embeddings. Finally, a Transformer\nencoder captures long-range dependencies and complex interactions. The SAG-ViT\nis evaluated on benchmark datasets, demonstrating its effectiveness in\nenhancing image classification performance.\n","authors":["Shravan Venkatraman","Jaskaran Singh Walia","Joe Dhanith P R"],"pdf_url":"https://arxiv.org/pdf/2411.09420v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.07974v3","updated":"2024-11-14T12:51:52Z","published":"2024-10-10T14:32:16Z","title":"Doob's Lagrangian: A Sample-Efficient Variational Approach to Transition\n Path Sampling","summary":" Rare event sampling in dynamical systems is a fundamental problem arising in\nthe natural sciences, which poses significant computational challenges due to\nan exponentially large space of trajectories. For settings where the dynamical\nsystem of interest follows a Brownian motion with known drift, the question of\nconditioning the process to reach a given endpoint or desired rare event is\ndefinitively answered by Doob's h-transform. However, the naive estimation of\nthis transform is infeasible, as it requires simulating sufficiently many\nforward trajectories to estimate rare event probabilities. In this work, we\npropose a variational formulation of Doob's h-transform as an optimization\nproblem over trajectories between a given initial point and the desired ending\npoint. To solve this optimization, we propose a simulation-free training\nobjective with a model parameterization that imposes the desired boundary\nconditions by design. Our approach significantly reduces the search space over\ntrajectories and avoids expensive trajectory simulation and inefficient\nimportance sampling estimators which are required in existing methods. 
We\ndemonstrate the ability of our method to find feasible transition paths on\nreal-world molecular simulation and protein folding tasks.\n","authors":["Yuanqi Du","Michael Plainer","Rob Brekelmans","Chenru Duan","Frank Noé","Carla P. Gomes","Alán Aspuru-Guzik","Kirill Neklyudov"],"pdf_url":"https://arxiv.org/pdf/2410.07974v3.pdf","comment":"Accepted as Spotlight at Conference on Neural Information Processing\n Systems (NeurIPS 2024); Alanine dipeptide results updated after fixing\n unphysical parameterization"},{"id":"http://arxiv.org/abs/2411.05757v2","updated":"2024-11-14T12:12:15Z","published":"2024-11-08T18:18:18Z","title":"Tract-RLFormer: A Tract-Specific RL policy based Decoder-only\n Transformer Network","summary":" Fiber tractography is a cornerstone of neuroimaging, enabling the detailed\nmapping of the brain's white matter pathways through diffusion MRI. This is\ncrucial for understanding brain connectivity and function, making it a valuable\ntool in neurological applications. Despite its importance, tractography faces\nchallenges due to its complexity and susceptibility to false positives,\nmisrepresenting vital pathways. To address these issues, recent strategies have\nshifted towards deep learning, utilizing supervised learning, which depends on\nprecise ground truth, or reinforcement learning, which operates without it. In\nthis work, we propose Tract-RLFormer, a network utilizing both supervised and\nreinforcement learning, in a two-stage policy refinement process that markedly\nimproves the accuracy and generalizability across various data-sets. By\nemploying a tract-specific approach, our network directly delineates the tracts\nof interest, bypassing the traditional segmentation process. 
Through rigorous\nvalidation on datasets such as TractoInferno, HCP, and ISMRM-2015, our\nmethodology demonstrates a leap forward in tractography, showcasing its ability\nto accurately map the brain's white matter tracts.\n","authors":["Ankita Joshi","Ashutosh Sharma","Anoushkrit Goel","Ranjeet Ranjan Jha","Chirag Ahuja","Arnav Bhavsar","Aditya Nigam"],"pdf_url":"https://arxiv.org/pdf/2411.05757v2.pdf","comment":"Accepted at 27th International Conference on Pattern Recognition\n (ICPR), 2024"},{"id":"http://arxiv.org/abs/2411.09393v1","updated":"2024-11-14T12:11:08Z","published":"2024-11-14T12:11:08Z","title":"Inherently Interpretable and Uncertainty-Aware Models for Online\n Learning in Cyber-Security Problems","summary":" In this paper, we address the critical need for interpretable and\nuncertainty-aware machine learning models in the context of online learning for\nhigh-risk industries, particularly cyber-security. While deep learning and\nother complex models have demonstrated impressive predictive capabilities,\ntheir opacity and lack of uncertainty quantification present significant\nquestions about their trustworthiness. We propose a novel pipeline for online\nsupervised learning problems in cyber-security, that harnesses the inherent\ninterpretability and uncertainty awareness of Additive Gaussian Processes\n(AGPs) models. Our approach aims to balance predictive performance with\ntransparency while improving the scalability of AGPs, which represents their\nmain drawback, potentially enabling security analysts to better validate threat\ndetection, troubleshoot and reduce false positives, and generally make\ntrustworthy, informed decisions. This work contributes to the growing field of\ninterpretable AI by proposing a class of models that can be significantly\nbeneficial for high-stake decision problems such as the ones typical of the\ncyber-security domain. 
The source code is available.\n","authors":["Benjamin Kolicic","Alberto Caron","Chris Hicks","Vasilios Mavroudis"],"pdf_url":"https://arxiv.org/pdf/2411.09393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09389v1","updated":"2024-11-14T12:05:35Z","published":"2024-11-14T12:05:35Z","title":"Less is More: Unseen Domain Fake News Detection via Causal Propagation\n Substructures","summary":" The spread of fake news on social media poses significant threats to\nindividuals and society. Text-based and graph-based models have been employed\nfor fake news detection by analysing news content and propagation networks,\nshowing promising results in specific scenarios. However, these data-driven\nmodels heavily rely on pre-existing in-distribution data for training, limiting\ntheir performance when confronted with fake news from emerging or previously\nunseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news\nis a challenging yet critical task. In this paper, we introduce the Causal\nSubgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to\nenhance zero-shot fake news detection by extracting causal substructures from\npropagation graphs using in-distribution data and generalising this approach to\nOOD data. The model employs a graph neural network based mask generation\nprocess to identify dominant nodes and edges within the propagation graph,\nusing these substructures for fake news detection. Additionally, the\nperformance of CSDA is further improved through contrastive learning in\nfew-shot scenarios, where a limited amount of OOD data is available for\ntraining. Extensive experiments on public social media datasets demonstrate\nthat CSDA effectively handles OOD fake news detection, achieving a 7 to 16\npercents accuracy improvement over other state-of-the-art models.\n","authors":["Shuzhi Gong","Richard O. 
Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2411.09389v1.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2411.09388v1","updated":"2024-11-14T12:05:08Z","published":"2024-11-14T12:05:08Z","title":"A survey of probabilistic generative frameworks for molecular\n simulations","summary":" Generative artificial intelligence is now a widely used tool in molecular\nscience. Despite the popularity of probabilistic generative models, numerical\nexperiments benchmarking their performance on molecular data are lacking. In\nthis work, we introduce and explain several classes of generative models,\nbroadly sorted into two categories: flow-based models and diffusion models. We\nselect three representative models: Neural Spline Flows, Conditional Flow\nMatching, and Denoising Diffusion Probabilistic Models, and examine their\naccuracy, computational cost, and generation speed across datasets with tunable\ndimensionality, complexity, and modal asymmetry. Our findings are varied, with\nno one framework being the best for all purposes. In a nutshell, (i) Neural\nSpline Flows do best at capturing mode asymmetry present in low-dimensional\ndata, (ii) Conditional Flow Matching outperforms other models for\nhigh-dimensional data with low complexity, and (iii) Denoising Diffusion\nProbabilistic Models appears the best for low-dimensional data with high\ncomplexity. Our datasets include a Gaussian mixture model and the dihedral\ntorsion angle distribution of the Aib\\textsubscript{9} peptide, generated via a\nmolecular dynamics simulation. 
We hope our taxonomy of probabilistic generative\nframeworks and numerical results may guide model selection for a wide range of\nmolecular tasks.\n","authors":["Richard John","Lukas Herron","Pratyush Tiwary"],"pdf_url":"https://arxiv.org/pdf/2411.09388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07506v2","updated":"2024-11-14T12:02:01Z","published":"2024-01-15T07:13:43Z","title":"SeMaScore : a new evaluation metric for automatic speech recognition\n tasks","summary":" In this study, we present SeMaScore, generated using a segment-wise mapping\nand scoring algorithm that serves as an evaluation metric for automatic speech\nrecognition tasks. SeMaScore leverages both the error rate and a more robust\nsimilarity score. We show that our algorithm's score generation improves upon\nthe state-of-the-art BERTScore. Our experimental results show that SeMaScore\ncorresponds well with expert human assessments, signal-to-noise ratio levels,\nand other natural language metrics. We outperform BERTScore by 41x in metric\ncomputation speed. Overall, we demonstrate that SeMaScore serves as a more\ndependable evaluation metric, particularly in real-world situations involving\natypical speech patterns.\n","authors":["Zitha Sasindran","Harsha Yelchuri","T. V. Prabhakar"],"pdf_url":"https://arxiv.org/pdf/2401.07506v2.pdf","comment":"Accepted at Interspeech 2024"},{"id":"http://arxiv.org/abs/2404.07940v3","updated":"2024-11-14T11:51:00Z","published":"2024-03-11T02:06:30Z","title":"InfiBench: Evaluating the Question-Answering Capabilities of Code Large\n Language Models","summary":" Large Language Models for code (code LLMs) have witnessed tremendous progress\nin recent years. With the rapid development of code LLMs, many popular\nevaluation benchmarks, such as HumanEval, DS-1000, and MBPP, have emerged to\nmeasure the performance of code LLMs with a particular focus on code generation\ntasks. 
However, they are insufficient to cover the full range of expected\ncapabilities of code LLMs, which span beyond code generation to answering\ndiverse coding-related questions. To fill this gap, we propose InfiBench, the\nfirst large-scale freeform question-answering (QA) benchmark for code to our\nknowledge, comprising 234 carefully selected high-quality Stack Overflow\nquestions that span across 15 programming languages. InfiBench uses four types\nof model-free automatic metrics to evaluate response correctness where domain\nexperts carefully concretize the criterion for each question. We conduct a\nsystematic evaluation for over 100 latest code LLMs on InfiBench, leading to a\nseries of novel and insightful findings. Our detailed analyses showcase\npotential directions for further advancement of code LLMs. InfiBench is fully\nopen source at https://infi-coder.github.io/infibench and continuously\nexpanding to foster more scientific and systematic practices for code LLM\nevaluation.\n","authors":["Linyi Li","Shijie Geng","Zhenwen Li","Yibo He","Hao Yu","Ziyue Hua","Guanghan Ning","Siwei Wang","Tao Xie","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2404.07940v3.pdf","comment":"31 pages. Appear at NeurIPS 2024 Datasets and Benchmarks track.\n Project website: https://infi-coder.github.io/infibench"},{"id":"http://arxiv.org/abs/2411.09373v1","updated":"2024-11-14T11:27:15Z","published":"2024-11-14T11:27:15Z","title":"Are nuclear masks all you need for improved out-of-domain\n generalisation? A closer look at cancer classification in histopathology","summary":" Domain generalisation in computational histopathology is challenging because\nthe images are substantially affected by differences among hospitals due to\nfactors like fixation and staining of tissue and imaging equipment. We\nhypothesise that focusing on nuclei can improve the out-of-domain (OOD)\ngeneralisation in cancer detection. 
We propose a simple approach to improve OOD\ngeneralisation for cancer detection by focusing on nuclear morphology and\norganisation, as these are domain-invariant features critical in cancer\ndetection. Our approach integrates original images with nuclear segmentation\nmasks during training, encouraging the model to prioritise nuclei and their\nspatial arrangement. Going beyond mere data augmentation, we introduce a\nregularisation technique that aligns the representations of masks and original\nimages. We show, using multiple datasets, that our method improves OOD\ngeneralisation and also leads to increased robustness to image corruptions and\nadversarial attacks. The source code is available at\nhttps://github.com/undercutspiky/SFL/\n","authors":["Dhananjay Tomar","Alexander Binder","Andreas Kleppe"],"pdf_url":"https://arxiv.org/pdf/2411.09373v1.pdf","comment":"Poster at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09365v1","updated":"2024-11-14T11:16:32Z","published":"2024-11-14T11:16:32Z","title":"Stability and Generalization for Distributed SGDA","summary":" Minimax optimization is gaining increasing attention in modern machine\nlearning applications. Driven by large-scale models and massive volumes of data\ncollected from edge devices, as well as the concern to preserve client privacy,\ncommunication-efficient distributed minimax optimization algorithms become\npopular, such as Local Stochastic Gradient Descent Ascent (Local-SGDA), and\nLocal Decentralized SGDA (Local-DSGDA). 
While most existing research on\ndistributed minimax algorithms focuses on convergence rates, computation\ncomplexity, and communication efficiency, the generalization performance\nremains underdeveloped, whereas generalization ability is a pivotal indicator\nfor evaluating the holistic performance of a model when fed with unknown data.\nIn this paper, we propose the stability-based generalization analytical\nframework for Distributed-SGDA, which unifies two popular distributed minimax\nalgorithms including Local-SGDA and Local-DSGDA, and conduct a comprehensive\nanalysis of stability error, generalization gap, and population risk across\ndifferent metrics under various settings, e.g., (S)C-(S)C, PL-SC, and NC-NC\ncases. Our theoretical results reveal the trade-off between the generalization\ngap and optimization error and suggest hyperparameters choice to obtain the\noptimal population risk. Numerical experiments for Local-SGDA and Local-DSGDA\nvalidate the theoretical results.\n","authors":["Miaoxi Zhu","Yan Sun","Li Shen","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.09365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09361v1","updated":"2024-11-14T11:08:54Z","published":"2024-11-14T11:08:54Z","title":"Time-to-Event Pretraining for 3D Medical Imaging","summary":" With the rise of medical foundation models and the growing availability of\nimaging data, scalable pretraining techniques offer a promising way to identify\nimaging biomarkers predictive of future disease risk. While current\nself-supervised methods for 3D medical imaging models capture local structural\nfeatures like organ morphology, they fail to link pixel biomarkers with\nlong-term health outcomes due to a missing context problem. Current approaches\nlack the temporal context necessary to identify biomarkers correlated with\ndisease progression, as they rely on supervision derived only from images and\nconcurrent text descriptions. 
To address this, we introduce time-to-event\npretraining, a pretraining framework for 3D medical imaging models that\nleverages large-scale temporal supervision from paired, longitudinal electronic\nhealth records (EHRs). Using a dataset of 18,945 CT scans (4.2 million 2D\nimages) and time-to-event distributions across thousands of EHR-derived tasks,\nour method improves outcome prediction, achieving an average AUROC increase of\n23.7% and a 29.4% gain in Harrell's C-index across 8 benchmark tasks.\nImportantly, these gains are achieved without sacrificing diagnostic\nclassification performance. This study lays the foundation for integrating\nlongitudinal EHR and 3D imaging data to advance clinical risk prediction.\n","authors":["Zepeng Huo","Jason Alan Fries","Alejandro Lozano","Jeya Maria Jose Valanarasu","Ethan Steinberg","Louis Blankemeier","Akshay S. Chaudhari","Curtis Langlotz","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2411.09361v1.pdf","comment":"34 pages, 19 figures"},{"id":"http://arxiv.org/abs/2410.21858v3","updated":"2024-11-14T10:54:53Z","published":"2024-10-29T08:42:22Z","title":"Joint Estimation of Conditional Mean and Covariance for Unbalanced\n Panels","summary":" We propose a nonparametric, kernel-based joint estimator for conditional mean\nand covariance matrices in large unbalanced panels. Our estimator, with proven\nconsistency and finite-sample guarantees, is applied to a comprehensive panel\nof monthly US stock excess returns from 1962 to 2021, conditioned on\nmacroeconomic and firm-specific covariates. The estimator captures time-varying\ncross-sectional dependencies effectively, demonstrating robust statistical\nperformance. 
In asset pricing, it generates conditional mean-variance efficient\nportfolios with out-of-sample Sharpe ratios that substantially exceed those of\nequal-weighted benchmarks.\n","authors":["Damir Filipovic","Paul Schneider"],"pdf_url":"https://arxiv.org/pdf/2410.21858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09341v1","updated":"2024-11-14T10:37:34Z","published":"2024-11-14T10:37:34Z","title":"Approximated Variational Bayesian Inverse Reinforcement Learning for\n Large Language Model Alignment","summary":" The alignment of large language models (LLMs) is crucial for generating\nhelpful and harmless content. Existing approaches leverage preference-based\nhuman feedback data to learn the reward function and align the LLM with the\nfeedback data. However, these approaches focus on modeling the reward\ndifference between the chosen and rejected demonstrations, rather than directly\nmodeling the true reward from each demonstration. Moreover, these approaches\nassume that the reward is only obtained at the end of the sentence, which\noverlooks the modeling of intermediate rewards. These issues lead to\ninsufficient use of training signals in the feedback data, limiting the\nrepresentation and generalization ability of the reward and potentially\nresulting in reward hacking. In this paper, we formulate LLM alignment as a\nBayesian Inverse Reinforcement Learning (BIRL) problem and propose a novel\ntraining objective, Approximated Variational Alignment (AVA), to perform LLM\nalignment through Approximated Variational Reward Imitation Learning (AVRIL).\nThe BIRL formulation facilitates intermediate reward modeling and direct reward\nmodeling on each single demonstration, which enhances the utilization of\ntraining signals in the feedback data. 
Experiments show that AVA outperforms\nexisting LLM alignment approaches in reward modeling, RL fine-tuning, and\ndirect optimization.\n","authors":["Yuang Cai","Yuyu Yuan","Jinsheng Shi","Qinhong Lin"],"pdf_url":"https://arxiv.org/pdf/2411.09341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02930v2","updated":"2024-11-14T10:24:06Z","published":"2024-02-05T11:52:23Z","title":"Embedding Hardware Approximations in Discrete Genetic-based Training for\n Printed MLPs","summary":" Printed Electronics (PE) stands out as a promisingtechnology for widespread\ncomputing due to its distinct attributes, such as low costs and flexible\nmanufacturing. Unlike traditional silicon-based technologies, PE enables\nstretchable, conformal,and non-toxic hardware. However, PE are constrained by\nlarger feature sizes, making it challenging to implement complex circuits such\nas machine learning (ML) classifiers. Approximate computing has been proven to\nreduce the hardware cost of ML circuits such as Multilayer Perceptrons (MLPs).\nIn this paper, we maximize the benefits of approximate computing by integrating\nhardware approximation into the MLP training process. Due to the discrete\nnature of hardware approximation, we propose and implement a genetic-based,\napproximate, hardware-aware training approach specifically designed for printed\nMLPs. For a 5% accuracy loss, our MLPs achieve over 5x area and power reduction\ncompared to the baseline while outperforming state of-the-art approximate and\nstochastic printed MLPs.\n","authors":["Florentia Afentaki","Michael Hefenbrock","Georgios Zervakis","Mehdi B. 
Tahoori"],"pdf_url":"https://arxiv.org/pdf/2402.02930v2.pdf","comment":"Accepted for publication at the 27th Design, Automation and Test in\n Europe Conference (DATE'24), Mar 25-27 2024, Valencia, Spain"},{"id":"http://arxiv.org/abs/2312.17612v3","updated":"2024-11-14T10:22:05Z","published":"2023-12-29T14:16:11Z","title":"Bespoke Approximation of Multiplication-Accumulation and Activation\n Targeting Printed Multilayer Perceptrons","summary":" Printed Electronics (PE) feature distinct and remarkable characteristics that\nmake them a prominent technology for achieving true ubiquitous computing. This\nis particularly relevant in application domains that require conformal and\nultra-low cost solutions, which have experienced limited penetration of\ncomputing until now. Unlike silicon-based technologies, PE offer unparalleled\nfeatures such as non-recurring engineering costs, ultra-low manufacturing cost,\nand on-demand fabrication of conformal, flexible, non-toxic, and stretchable\nhardware. However, PE face certain limitations due to their large feature\nsizes, that impede the realization of complex circuits, such as machine\nlearning classifiers. In this work, we address these limitations by leveraging\nthe principles of Approximate Computing and Bespoke (fully-customized) design.\nWe propose an automated framework for designing ultra-low power Multilayer\nPerceptron (MLP) classifiers which employs, for the first time, a holistic\napproach to approximate all functions of the MLP's neurons: multiplication,\naccumulation, and activation. 
Through comprehensive evaluation across various\nMLPs of varying size, our framework demonstrates the ability to enable\nbattery-powered operation of even the most intricate MLP architecture examined,\nsignificantly surpassing the current state of the art.\n","authors":["Florentia Afentaki","Gurol Saglam","Argyris Kokkinis","Kostas Siozios","Georgios Zervakis","Mehdi B Tahoori"],"pdf_url":"https://arxiv.org/pdf/2312.17612v3.pdf","comment":"Accepted for publication at the 42th IEEE/ACM International\n Conference on Computer Aided Design (ICCAD) 2023, San Francisco, USA"},{"id":"http://arxiv.org/abs/2411.09329v1","updated":"2024-11-14T10:21:41Z","published":"2024-11-14T10:21:41Z","title":"Improving hp-Variational Physics-Informed Neural Networks for\n Steady-State Convection-Dominated Problems","summary":" This paper proposes and studies two extensions of applying hp-variational\nphysics-informed neural networks, more precisely the FastVPINNs framework, to\nconvection-dominated convection-diffusion-reaction problems. First, a term in\nthe spirit of a SUPG stabilization is included in the loss functional and a\nnetwork architecture is proposed that predicts spatially varying stabilization\nparameters. 
Having observed that the selection of the indicator function in\nhard-constrained Dirichlet boundary conditions has a big impact on the accuracy\nof the computed solutions, the second novelty is the proposal of a network\narchitecture that learns good parameters for a class of indicator functions.\nNumerical studies show that both proposals lead to noticeably more accurate\nresults than approaches that can be found in the literature.\n","authors":["Thivin Anandh","Divij Ghose","Himanshu Jain","Pratham Sunkad","Sashikumaar Ganesan","Volker John"],"pdf_url":"https://arxiv.org/pdf/2411.09329v1.pdf","comment":"25 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.17196v4","updated":"2024-11-14T10:16:35Z","published":"2023-09-29T12:45:39Z","title":"ResBit: Residual Bit Vector for Categorical Values","summary":" One-hot vectors, a common method for representing discrete/categorical data,\nin machine learning are widely used because of their simplicity and\nintuitiveness. However, one-hot vectors suffer from a linear increase in\ndimensionality, posing computational and memory challenges, especially when\ndealing with datasets containing numerous categories. In this paper, we focus\non tabular data generation, and reveal the multinomial diffusion faces the mode\ncollapse phenomenon when the cardinality is high. Moreover, due to the\nlimitations of one-hot vectors, the training phase takes time longer in such a\nsituation. To address these issues, we propose Residual Bit Vectors (ResBit), a\ntechnique for densely representing categorical data. ResBit is an extension of\nanalog bits and overcomes limitations of analog bits when applied to tabular\ndata generation. Our experiments demonstrate that ResBit not only accelerates\ntraining but also maintains performance when compared with the situations\nbefore applying ResBit. 
Furthermore, our results indicate that many existing\nmethods struggle with high-cardinality data, underscoring the need for\nlower-dimensional representations, such as ResBit and latent vectors.\n","authors":["Masane Fuchi","Amar Zanashir","Hiroto Minami","Tomohiro Takagi"],"pdf_url":"https://arxiv.org/pdf/2309.17196v4.pdf","comment":"25 pages, 29 tables, and 10 figures"},{"id":"http://arxiv.org/abs/2407.02279v2","updated":"2024-11-14T10:15:35Z","published":"2024-07-02T14:08:23Z","title":"How to Boost Any Loss Function","summary":" Boosting is a highly successful ML-born optimization setting in which one is\nrequired to computationally efficiently learn arbitrarily good models based on\nthe access to a weak learner oracle, providing classifiers performing at least\nslightly differently from random guessing. A key difference with gradient-based\noptimization is that boosting's original model does not requires access to\nfirst order information about a loss, yet the decades long history of boosting\nhas quickly evolved it into a first order optimization setting -- sometimes\neven wrongfully defining it as such. Owing to recent progress extending\ngradient-based optimization to use only a loss' zeroth ($0^{th}$) order\ninformation to learn, this begs the question: what loss functions can be\nefficiently optimized with boosting and what is the information really needed\nfor boosting to meet the original boosting blueprint's requirements?\n We provide a constructive formal answer essentially showing that any loss\nfunction can be optimized with boosting and thus boosting can achieve a feat\nnot yet known to be possible in the classical $0^{th}$ order setting, since\nloss functions are not required to be be convex, nor differentiable or\nLipschitz -- and in fact not required to be continuous either. 
Some tools we\nuse are rooted in quantum calculus, the mathematical field -- not to be\nconfounded with quantum computation -- that studies calculus without passing to\nthe limit, and thus without using first order information.\n","authors":["Richard Nock","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2407.02279v2.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2411.09317v1","updated":"2024-11-14T09:50:41Z","published":"2024-11-14T09:50:41Z","title":"Pie: Pooling CPU Memory for LLM Inference","summary":" The rapid growth of LLMs has revolutionized natural language processing and\nAI analysis, but their increasing size and memory demands present significant\nchallenges. A common solution is to spill over to CPU memory; however,\ntraditional GPU-CPU memory swapping often results in higher latency and lower\nthroughput.\n This paper introduces Pie, an LLM inference framework that addresses these\nchallenges with performance-transparent swapping and adaptive expansion. By\nleveraging predictable memory access patterns and the high bandwidth of modern\nhardware like the NVIDIA GH200 Grace Hopper Superchip, Pie enables concurrent\ndata swapping without affecting foreground computation, expanding effective\nmemory without added latency. Adaptive expansion dynamically adjusts CPU memory\nallocation based on real-time information, optimizing memory usage and\nperformance under varying conditions.\n Pie maintains low computation latency, high throughput, and high elasticity.\nOur experimental evaluation demonstrates that Pie achieves optimal swapping\npolicy during cache warmup and effectively balances increased memory capacity\nwith negligible impact on computation. With its extended capacity, Pie\noutperforms vLLM by up to 1.9X in throughput and 2X in latency. Additionally,\nPie can reduce GPU memory usage by up to 1.67X while maintaining the same\nperformance. 
Compared to FlexGen, an offline profiling-based swapping solution,\nPie achieves magnitudes lower latency and 9.4X higher throughput.\n","authors":["Yi Xu","Ziming Mao","Xiangxi Mo","Shu Liu","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2411.09317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09312v1","updated":"2024-11-14T09:38:58Z","published":"2024-11-14T09:38:58Z","title":"Approximate Probabilistic Inference forTime-Series Data A Robust Latent\n Gaussian Model With Temporal Awareness","summary":" The development of robust generative models for highly varied non-stationary\ntime series data is a complex yet important problem. Traditional models for\ntime series data prediction, such as Long Short-Term Memory (LSTM), are\ninefficient and generalize poorly as they cannot capture complex temporal\nrelationships. In this paper, we present a probabilistic generative model that\ncan be trained to capture temporal information, and that is robust to data\nerrors. We call it Time Deep Latent Gaussian Model (tDLGM). Its novel\narchitecture is inspired by Deep Latent Gaussian Model (DLGM). Our model is\ntrained to minimize a loss function based on the negative log loss. One\ncontributing factor to Time Deep Latent Gaussian Model (tDLGM) robustness is\nour regularizer, which accounts for data trends. Experiments conducted show\nthat tDLGM is able to reconstruct and generate complex time series data, and\nthat it is robust against to noise and faulty data.\n","authors":["Anton Johansson","Arunselvan Ramaswamy"],"pdf_url":"https://arxiv.org/pdf/2411.09312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09311v1","updated":"2024-11-14T09:38:41Z","published":"2024-11-14T09:38:41Z","title":"Compression Method for Solar Polarization Spectra Collected from Hinode\n SOT/SP Observations","summary":" The complex structure and extensive details of solar spectral data, combined\nwith a recent surge in volume, present significant processing challenges. 
To\naddress this, we propose a deep learning-based compression technique using deep\nautoencoder (DAE) and 1D-convolutional autoencoder (CAE) models developed with\nHinode SOT/SP data. We focused on compressing Stokes I and V polarization\nspectra from the quiet Sun, as well as from active regions, providing a novel\ninsight into comprehensive spectral analysis by incorporating spectra from\nextreme magnetic fields. The results indicate that the CAE model outperforms\nthe DAE model in reconstructing Stokes profiles, demonstrating greater\nrobustness and achieving reconstruction errors around the observational noise\nlevel. The proposed method has proven effective in compressing Stokes I and V\nspectra from both the quiet Sun and active regions, highlighting its potential\nfor impactful applications in solar spectral analysis, such as detection of\nunusual spectral signals.\n","authors":["Jargalmaa Batmunkh","Yusuke Iida","Takayoshi Oba","Haruhisa Iijima"],"pdf_url":"https://arxiv.org/pdf/2411.09311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03017v2","updated":"2024-11-14T09:19:43Z","published":"2024-02-05T13:55:54Z","title":"Toward Green and Human-Like Artificial Intelligence: A Complete Survey\n on Contemporary Few-Shot Learning Approaches","summary":" Despite deep learning's widespread success, its data-hungry and\ncomputationally expensive nature makes it impractical for many data-constrained\nreal-world applications. Few-Shot Learning (FSL) aims to address these\nlimitations by enabling rapid adaptation to novel learning tasks, seeing\nsignificant growth in recent years. This survey provides a comprehensive\noverview of the field's latest advancements. Initially, FSL is formally\ndefined, and its relationship with different learning fields is presented. A\nnovel taxonomy is introduced, extending previously proposed ones, and\nreal-world applications in classic and novel fields are described. 
Finally,\nrecent trends shaping the field, outstanding challenges, and promising future\nresearch directions are discussed.\n","authors":["Georgios Tsoumplekas","Vladislav Li","Panagiotis Sarigiannidis","Vasileios Argyriou"],"pdf_url":"https://arxiv.org/pdf/2402.03017v2.pdf","comment":"35 pages, 9 figures. Submitted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2410.14979v4","updated":"2024-11-14T09:17:48Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. 
As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21862v2","updated":"2024-11-14T09:17:31Z","published":"2024-10-29T08:56:29Z","title":"Hierarchical mixtures of Unigram models for short text clustering: the\n role of Beta-Liouville priors","summary":" This paper presents a variant of the Multinomial mixture model tailored for\nthe unsupervised classification of short text data. Traditionally, the\nMultinomial probability vector in this hierarchical model is assigned a\nDirichlet prior distribution. Here, however, we explore an alternative\nprior--the Beta-Liouville distribution--which offers a more flexible\ncorrelation structure than the Dirichlet. We examine the theoretical properties\nof the Beta-Liouville distribution, focusing on its conjugacy with the\nMultinomial likelihood. This property enables the derivation of update\nequations for a CAVI (Coordinate Ascent Variational Inference) variational\nalgorithm, facilitating the approximate posterior estimation of model\nparameters. Additionally, we propose a stochastic variant of the CAVI algorithm\nthat enhances scalability. The paper concludes with data examples that\ndemonstrate effective strategies for setting the Beta-Liouville\nhyperparameters.\n","authors":["Massimo Bilancia","Samuele Magro"],"pdf_url":"https://arxiv.org/pdf/2410.21862v2.pdf","comment":"32 pages, 4 figures. Submitted"},{"id":"http://arxiv.org/abs/2411.09296v1","updated":"2024-11-14T09:15:28Z","published":"2024-11-14T09:15:28Z","title":"Enhancing generalization in high energy physics using white-box\n adversarial attacks","summary":" Machine learning is becoming increasingly popular in the context of particle\nphysics. 
Supervised learning, which uses labeled Monte Carlo (MC) simulations,\nremains one of the most widely used methods for discriminating signals beyond\nthe Standard Model. However, this paper suggests that supervised models may\ndepend excessively on artifacts and approximations from Monte Carlo\nsimulations, potentially limiting their ability to generalize well to real\ndata. This study aims to enhance the generalization properties of supervised\nmodels by reducing the sharpness of local minima. It reviews the application of\nfour distinct white-box adversarial attacks in the context of classifying Higgs\nboson decay signals. The attacks are divided into weight space attacks, and\nfeature space attacks. To study and quantify the sharpness of different local\nminima this paper presents two analysis methods: gradient ascent and reduced\nHessian eigenvalue analysis. The results show that white-box adversarial\nattacks significantly improve generalization performance, albeit with increased\ncomputational complexity.\n","authors":["Franck Rothen","Samuel Klein","Matthew Leigh","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2411.09296v1.pdf","comment":"10 pages, 4 figures, 8 tables, 3 algorithms, to be published in\n Physical Review D (PRD), presented at the ML4Jets 2024 conference"},{"id":"http://arxiv.org/abs/2404.08434v2","updated":"2024-11-14T09:11:26Z","published":"2024-04-12T12:31:06Z","title":"An improved tabular data generator with VAE-GMM integration","summary":" The rising use of machine learning in various fields requires robust methods\nto create synthetic tabular data. Data should preserve key characteristics\nwhile addressing data scarcity challenges. Current approaches based on\nGenerative Adversarial Networks, such as the state-of-the-art CTGAN model,\nstruggle with the complex structures inherent in tabular data. 
These data often\ncontain both continuous and discrete features with non-Gaussian distributions.\nTherefore, we propose a novel Variational Autoencoder (VAE)-based model that\naddresses these limitations. Inspired by the TVAE model, our approach\nincorporates a Bayesian Gaussian Mixture model (BGM) within the VAE\narchitecture. This avoids the limitations imposed by assuming a strictly\nGaussian latent space, allowing for a more accurate representation of the\nunderlying data distribution during data generation. Furthermore, our model\noffers enhanced flexibility by allowing the use of various differentiable\ndistributions for individual features, making it possible to handle both\ncontinuous and discrete data types. We thoroughly validate our model on three\nreal-world datasets with mixed data types, including two medically relevant\nones, based on their resemblance and utility. This evaluation demonstrates\nsignificant outperformance against CTGAN and TVAE, establishing its potential\nas a valuable tool for generating synthetic tabular data in various domains,\nparticularly in healthcare.\n","authors":["Patricia A. Apellániz","Juan Parras","Santiago Zazo"],"pdf_url":"https://arxiv.org/pdf/2404.08434v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.09286v1","updated":"2024-11-14T08:53:23Z","published":"2024-11-14T08:53:23Z","title":"A Centralized-Distributed Transfer Model for Cross-Domain Recommendation\n Based on Multi-Source Heterogeneous Transfer Learning","summary":" Cross-domain recommendation (CDR) methods are proposed to tackle the sparsity\nproblem in click through rate (CTR) estimation. Existing CDR methods directly\ntransfer knowledge from the source domains to the target domain and ignore the\nheterogeneities among domains, including feature dimensional heterogeneity and\nlatent space heterogeneity, which may lead to negative transfer. 
Besides, most\nof the existing methods are based on single-source transfer, which cannot\nsimultaneously utilize knowledge from multiple source domains to further\nimprove the model performance in the target domain. In this paper, we propose a\ncentralized-distributed transfer model (CDTM) for CDR based on multi-source\nheterogeneous transfer learning. To address the issue of feature dimension\nheterogeneity, we build a dual embedding structure: domain specific embedding\n(DSE) and global shared embedding (GSE) to model the feature representation in\nthe single domain and the commonalities in the global space,separately. To\nsolve the latent space heterogeneity, the transfer matrix and attention\nmechanism are used to map and combine DSE and GSE adaptively. Extensive offline\nand online experiments demonstrate the effectiveness of our model.\n","authors":["Ke Xu","Ziliang Wang","Wei Zheng","Yuhao Ma","Chenglin Wang","Nengxue Jiang","Cai Cao"],"pdf_url":"https://arxiv.org/pdf/2411.09286v1.pdf","comment":"Published in: 2022 IEEE International Conference on Data Mining\n (ICDM) (The authors were affiliated Hangzhou NetEase Cloud Music Technology\n Co., Ltd.)"},{"id":"http://arxiv.org/abs/2410.02367v2","updated":"2024-11-14T08:39:54Z","published":"2024-10-03T10:25:23Z","title":"SageAttention: Accurate 8-Bit Attention for Plug-and-play Inference\n Acceleration","summary":" The transformer architecture predominates across various models. As the heart\nof the transformer, attention has a computational complexity of O(N^2),\ncompared to O(N) for linear transformations. When handling large sequence\nlengths, attention becomes the primary time-consuming component. Although\nquantization has proven to be an effective method for accelerating model\ninference, existing quantization methods primarily focus on optimizing the\nlinear layer. In response, we first analyze the feasibility of quantization in\nattention detailedly. 
Following that, we propose SageAttention, a highly\nefficient and accurate quantization method for attention. The OPS (operations\nper second) of our approach outperforms FlashAttention2 and xformers by about\n2.1 times and 2.7 times, respectively. SageAttention also achieves superior\naccuracy performance over FlashAttention3. Comprehensive experiments confirm\nthat our approach incurs almost no end-to-end metrics loss across diverse\nmodels, including those for large language processing, image generation, and\nvideo generation. The codes are available at\nhttps://github.com/thu-ml/SageAttention.\n","authors":["Jintao Zhang","Jia wei","Haofeng Huang","Pengle Zhang","Jun Zhu","Jianfei Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14212v2","updated":"2024-11-14T08:31:10Z","published":"2024-10-18T07:01:56Z","title":"Comparative Evaluation of Clustered Federated Learning Methods","summary":" Over recent years, Federated Learning (FL) has proven to be one of the most\npromising methods of distributed learning which preserves data privacy. As the\nmethod evolved and was confronted to various real-world scenarios, new\nchallenges have emerged. One such challenge is the presence of highly\nheterogeneous (often referred as non-IID) data distributions among participants\nof the FL protocol. A popular solution to this hurdle is Clustered Federated\nLearning (CFL), which aims to partition clients into groups where the\ndistribution are homogeneous. In the literature, state-of-the-art CFL\nalgorithms are often tested using a few cases of data heterogeneities, without\nsystematically justifying the choices. Further, the taxonomy used for\ndifferentiating the different heterogeneity scenarios is not always\nstraightforward. In this paper, we explore the performance of two\nstate-of-theart CFL algorithms with respect to a proposed taxonomy of data\nheterogeneities in federated learning (FL). 
We work with three image\nclassification datasets and analyze the resulting clusters against the\nheterogeneity classes using extrinsic clustering metrics. Our objective is to\nprovide a clearer understanding of the relationship between CFL performances\nand data heterogeneity scenarios.\n","authors":["Michael Ben Ali","Omar El-Rifai","Imen Megdiche","André Peninou","Olivier Teste"],"pdf_url":"https://arxiv.org/pdf/2410.14212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07176v2","updated":"2024-11-14T08:20:22Z","published":"2024-11-11T17:56:28Z","title":"More Expressive Attention with Negative Weights","summary":" We propose a novel attention mechanism, named Cog Attention, that enables\nattention weights to be negative for enhanced expressiveness, which stems from\ntwo key factors: (1) Cog Attention can shift the token deletion and copying\nfunction from a static OV matrix to dynamic QK inner products, with the OV\nmatrix now focusing more on refinement or modification. The attention head can\nsimultaneously delete, copy, or retain tokens by assigning them negative,\npositive, or minimal attention weights, respectively. As a result, a single\nattention head becomes more flexible and expressive. (2) Cog Attention improves\nthe model's robustness against representational collapse, which can occur when\nearlier tokens are over-squashed into later positions, leading to homogeneous\nrepresentations. Negative weights reduce effective information paths from\nearlier to later tokens, helping to mitigate this issue. We develop\nTransformer-like models which use Cog Attention as attention modules, including\ndecoder-only models for language modeling and U-ViT diffusion models for image\ngeneration. 
Experiments show that models using Cog Attention exhibit superior\nperformance compared to those employing traditional softmax attention modules.\nOur approach suggests a promising research direction for rethinking and\nbreaking the entrenched constraints of traditional softmax attention, such as\nthe requirement for non-negative weights.\n","authors":["Ang Lv","Ruobing Xie","Shuaipeng Li","Jiayi Liao","Xingwu Sun","Zhanhui Kang","Di Wang","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2411.07176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09267v1","updated":"2024-11-14T08:08:25Z","published":"2024-11-14T08:08:25Z","title":"Towards efficient compression and communication for prototype-based\n decentralized learning","summary":" In prototype-based federated learning, the exchange of model parameters\nbetween clients and the master server is replaced by transmission of prototypes\nor quantized versions of the data samples to the aggregation server. A fully\ndecentralized deployment of prototype- based learning, without a central\nagregartor of prototypes, is more robust upon network failures and reacts\nfaster to changes in the statistical distribution of the data, suggesting\npotential advantages and quick adaptation in dynamic learning tasks, e.g., when\nthe data sources are IoT devices or when data is non-iid. In this paper, we\nconsider the problem of designing a communication-efficient decentralized\nlearning system based on prototypes. We address the challenge of prototype\nredundancy by leveraging on a twofold data compression technique, i.e., sending\nonly update messages if the prototypes are informationtheoretically useful (via\nthe Jensen-Shannon distance), and using clustering on the prototypes to\ncompress the update messages used in the gossip protocol. We also use parallel\ninstead of sequential gossiping, and present an analysis of its\nage-of-information (AoI). 
Our experimental results show that, with these\nimprovements, the communications load can be substantially reduced without\ndecreasing the convergence rate of the learning algorithm.\n","authors":["Pablo Fernández-Piñeiro","Manuel Ferández-Veiga","Rebeca P. Díaz-Redondo","Ana Fernández-Vilas","Martín González-Soto"],"pdf_url":"https://arxiv.org/pdf/2411.09267v1.pdf","comment":"15 pages, 2 tables, 7 figures, 6 algorithms"},{"id":"http://arxiv.org/abs/2411.09266v1","updated":"2024-11-14T08:07:02Z","published":"2024-11-14T08:07:02Z","title":"How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative\n Study of ChatGPT, AI Models and Human Perception","summary":" Multimodal deepfakes involving audiovisual manipulations are a growing threat\nbecause they are difficult to detect with the naked eye or using unimodal deep\nlearningbased forgery detection methods. Audiovisual forensic models, while\nmore capable than unimodal models, require large training datasets and are\ncomputationally expensive for training and inference. Furthermore, these models\nlack interpretability and often do not generalize well to unseen manipulations.\nIn this study, we examine the detection capabilities of a large language model\n(LLM) (i.e., ChatGPT) to identify and account for any possible visual and\nauditory artifacts and manipulations in audiovisual deepfake content. Extensive\nexperiments are conducted on videos from a benchmark multimodal deepfake\ndataset to evaluate the detection performance of ChatGPT and compare it with\nthe detection capabilities of state-of-the-art multimodal forensic models and\nhumans. Experimental results demonstrate the importance of domain knowledge and\nprompt engineering for video forgery detection tasks using LLMs. Unlike\napproaches based on end-to-end learning, ChatGPT can account for spatial and\nspatiotemporal artifacts and inconsistencies that may exist within or across\nmodalities. 
Additionally, we discuss the limitations of ChatGPT for multimedia\nforensic tasks.\n","authors":["Sahibzada Adil Shahzad","Ammarah Hashmi","Yan-Tsung Peng","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09276v2","updated":"2024-11-14T08:06:46Z","published":"2024-05-15T11:46:47Z","title":"Dual-Segment Clustering Strategy for Hierarchical Federated Learning in\n Heterogeneous Wireless Environments","summary":" Non-independent and identically distributed (Non- IID) data adversely affects\nfederated learning (FL) while heterogeneity in communication quality can\nundermine the reliability of model parameter transmission, potentially\ndegrading wireless FL convergence. This paper proposes a novel dual-segment\nclustering (DSC) strategy that jointly addresses communication and data\nheterogeneity in FL. This is achieved by defining a new signal-to-noise ratio\n(SNR) matrix and information quantity matrix to capture the communication and\ndata heterogeneity, respectively. The celebrated affinity propagation algorithm\nis leveraged to iteratively refine the clustering of clients based on the newly\ndefined matrices effectively enhancing model aggregation in heterogeneous\nenvironments. 
The convergence analysis and experimental results show that the\nDSC strategy can improve the convergence rate of wireless FL and demonstrate\nsuperior accuracy in heterogeneous environments compared to classical\nclustering methods.\n","authors":["Pengcheng Sun","Erwu Liu","Wei Ni","Kanglei Yu","Xinyu Qu","Rui Wang","Yanlong Bi","Chuanchun Zhang","Abbas Jamalipour"],"pdf_url":"https://arxiv.org/pdf/2405.09276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09263v1","updated":"2024-11-14T08:02:14Z","published":"2024-11-14T08:02:14Z","title":"Rethinking Weight-Averaged Model-merging","summary":" Weight-averaged model-merging has emerged as a powerful approach in deep\nlearning, capable of enhancing model performance without fine-tuning or\nretraining. However, the underlying mechanisms that explain its effectiveness\nremain largely unexplored. In this paper, we investigate this technique from\nthree novel perspectives to provide deeper insights into how and why\nweight-averaged model-merging works: (1) we examine the intrinsic patterns\ncaptured by the learning of the model weights, through the visualizations of\ntheir patterns on several datasets, showing that these weights often encode\nstructured and interpretable patterns; (2) we investigate model ensemble\nmerging strategies based on averaging on weights versus averaging on features,\nproviding detailed analyses across diverse architectures and datasets; and (3)\nwe explore the impact on model-merging prediction stability in terms of\nchanging the parameter magnitude, revealing insights into the way of weight\naveraging works as regularization by showing the robustness across different\nparameter scales. 
Our findings shed light on the \"black box\" of weight-averaged\nmodel-merging, offering valuable insights and practical recommendations that\nadvance the model-merging process.\n","authors":["Hu Wang","Congbo Ma","Ibrahim Almakky","Ian Reid","Gustavo Carneiro","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2411.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14193v2","updated":"2024-11-14T07:58:03Z","published":"2024-10-18T06:07:22Z","title":"xPerT: Extended Persistence Transformer","summary":" A persistence diagram provides a compact summary of persistent homology,\nwhich captures the topological features of a space at different scales.\nHowever, due to its nature as a set, incorporating it as a feature into a\nmachine learning framework is challenging. Several methods have been proposed\nto use persistence diagrams as input for machine learning models, but they\noften require complex preprocessing steps and extensive hyperparameter tuning.\nIn this paper, we propose a novel transformer architecture called the\n\\textit{Extended Persistence Transformer (xPerT)}, which is highly scalable\nthan the compared to Persformer, an existing transformer for persistence\ndiagrams. xPerT reduces GPU memory usage by over 90\\% and improves accuracy on\nmultiple datasets. Additionally, xPerT does not require complex preprocessing\nsteps or extensive hyperparameter tuning, making it easy to use in practice.\nOur code is available at https://github.com/sehunfromdaegu/xpert.\n","authors":["Sehun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.14193v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09242v1","updated":"2024-11-14T07:16:23Z","published":"2024-11-14T07:16:23Z","title":"FluidML: Fast and Memory Efficient Inference Optimization","summary":" Machine learning models deployed on edge devices have enabled numerous\nexciting new applications, such as humanoid robots, AR glasses, and autonomous\nvehicles. 
However, the computing resources available on these edge devices are\nnot catching up with the ever-growing number of parameters in these models. As\nthe models become bigger and more complicated, the novel yet sophisticated\nstructure challenges the inference runtime optimization. We present FluidML, a\ngeneric runtime memory management and optimization framework that can flexibly\ntransform the model execution blueprint to achieve faster and more\nmemory-efficient inference. Evaluations across different platforms show that\nFluidML can consistently reduce the end-to-end inference latency by up to\n25.38% for popular language models and reduce peak memory usage by up to\n41.47%, compared to state-of-the-art approaches. FluidML is of ~30K line of\ncodes, built for general-purpose usage, and will be released as an open-source\ninference runtime optimization framework to the community.\n","authors":["Jinjie Liu","Hang Qiu"],"pdf_url":"https://arxiv.org/pdf/2411.09242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09238v1","updated":"2024-11-14T07:13:08Z","published":"2024-11-14T07:13:08Z","title":"Rethinking the \"Heatmap + Monte Carlo Tree Search\" Paradigm for Solving\n Large Scale TSP","summary":" The Travelling Salesman Problem (TSP) remains a fundamental challenge in\ncombinatorial optimization, inspiring diverse algorithmic strategies. This\npaper revisits the \"heatmap + Monte Carlo Tree Search (MCTS)\" paradigm that has\nrecently gained traction for learning-based TSP solutions. Within this\nframework, heatmaps encode the likelihood of edges forming part of the optimal\ntour, and MCTS refines this probabilistic guidance to discover optimal\nsolutions. Contemporary approaches have predominantly emphasized the refinement\nof heatmap generation through sophisticated learning models, inadvertently\nsidelining the critical role of MCTS. 
Our extensive empirical analysis reveals\ntwo pivotal insights: 1) The configuration of MCTS strategies profoundly\ninfluences the solution quality, demanding meticulous tuning to leverage their\nfull potential; 2) Our findings demonstrate that a rudimentary and\nparameter-free heatmap, derived from the intrinsic $k$-nearest nature of TSP,\ncan rival or even surpass the performance of complicated heatmaps, with strong\ngeneralizability across various scales. Empirical evaluations across various\nTSP scales underscore the efficacy of our approach, achieving competitive\nresults. These observations challenge the prevailing focus on heatmap\nsophistication, advocating a reevaluation of the paradigm to harness both\ncomponents synergistically. Our code is available at:\nhttps://github.com/LOGO-CUHKSZ/rethink_mcts_tsp.\n","authors":["Xuanhao Pan","Chenguang Wang","Chaolong Ying","Ye Xue","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00996v2","updated":"2024-11-14T06:55:27Z","published":"2024-07-01T06:22:38Z","title":"Can Small Language Models Learn, Unlearn, and Retain Noise Patterns?","summary":" Small Language Models (SLMs) are generally considered more compact versions\nof large language models (LLMs). This study investigates the ability of SLMs\nwith parameters between 1 and 3 billion to learn, retain, and subsequently\neliminate different types of noise present in the data. Four pre-trained SLMs\nwere utilized for this: Olmo 1B, Qwen1.5 1.8B, Gemma 2B, and Phi2 2.7B. The\nmodels were instruction-tuned on noise-free data and tested using in-context\nexamples to determine if they could learn noise through examples. Subsequently,\nnoise patterns were introduced in instruction tuning to evaluate the noise\nlearning, unlearning, and retention capabilities of the models. Olmo, the\nsmallest model, was highly sensitive to noise, quickly adapting to noisy\npatterns. 
Phi2 resisted learning character-level and transliteration noise,\nlikely due to its carefully curated, structured, and high-quality pretraining\ndata. Gemma excelled with transliteration noise, likely benefiting from its\nmultilingual pretraining. The findings can be used to develop robust training\nstrategies for SLMs.\n","authors":["Nicy Scaria","Silvester John Joseph Kennedy","Deepak Subramani"],"pdf_url":"https://arxiv.org/pdf/2407.00996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15750v3","updated":"2024-11-14T06:33:26Z","published":"2024-09-24T05:12:10Z","title":"The Roles of Generative Artificial Intelligence in Internet of Electric\n Vehicles","summary":" With the advancements of generative artificial intelligence (GenAI) models,\ntheir capabilities are expanding significantly beyond content generation and\nthe models are increasingly being used across diverse applications.\nParticularly, GenAI shows great potential in addressing challenges in the\nelectric vehicle (EV) ecosystem ranging from charging management to\ncyber-attack prevention. In this paper, we specifically consider Internet of\nelectric vehicles (IoEV) and we categorize GenAI for IoEV into four different\nlayers namely, EV's battery layer, individual EV layer, smart grid layer, and\nsecurity layer. We introduce various GenAI techniques used in each layer of\nIoEV applications. Subsequently, public datasets available for training the\nGenAI models are summarized. Finally, we provide recommendations for future\ndirections. This survey not only categorizes the applications of GenAI in IoEV\nacross different layers but also serves as a valuable resource for researchers\nand practitioners by highlighting the design and implementation challenges\nwithin each layer. 
Furthermore, it provides a roadmap for future research\ndirections, enabling the development of more robust and efficient IoEV systems\nthrough the integration of advanced GenAI techniques.\n","authors":["Hanwen Zhang","Dusit Niyato","Wei Zhang","Changyuan Zhao","Hongyang Du","Abbas Jamalipour","Sumei Sun","Yiyang Pei"],"pdf_url":"https://arxiv.org/pdf/2409.15750v3.pdf","comment":"25 Pages"},{"id":"http://arxiv.org/abs/2411.09210v1","updated":"2024-11-14T06:14:39Z","published":"2024-11-14T06:14:39Z","title":"Classical Verification of Quantum Learning Advantages with Noises","summary":" Classical verification of quantum learning allows classical clients to\nreliably leverage quantum computing advantages by interacting with untrusted\nquantum servers. Yet, current quantum devices available in practice suffers\nfrom a variety of noises and whether existed classical verification protocols\ncarry over to noisy scenarios remains unclear. Here, we propose an efficient\nclassical error rectification algorithm to reconstruct the noise-free results\ngiven by the quantum Fourier sampling circuit with practical constant-level\nnoises. In particular, we prove that the error rectification algorithm can\nrestore the heavy Fourier coefficients by using a small number of noisy samples\nthat scales logarithmically with the problem size. We apply this algorithm to\nthe agnostic parity learning task with uniform input marginal and prove that\nthis task can be accomplished in an efficient way on noisy quantum devices with\nour algorithm. In addition, we prove that a classical client with access to the\nrandom example oracle can verify the agnostic parity learning results from the\nnoisy quantum prover in an efficient way, under the condition that the Fourier\ncoefficients are sparse. 
Our results demonstrate the feasibility of classical\nverification of quantum learning advantages with noises, which provide a\nvaluable guide for both theoretical studies and practical applications with\ncurrent noisy intermediate scale quantum devices.\n","authors":["Yinghao Ma","Jiaxi Su","Dong-Ling Deng"],"pdf_url":"https://arxiv.org/pdf/2411.09210v1.pdf","comment":"13 pages 1 figure"},{"id":"http://arxiv.org/abs/2411.09199v1","updated":"2024-11-14T05:43:42Z","published":"2024-11-14T05:43:42Z","title":"Ghost-Connect Net: A Generalization-Enhanced Guidance For Sparse Deep\n Networks Under Distribution Shifts","summary":" Sparse deep neural networks (DNNs) excel in real-world applications like\nrobotics and computer vision, by reducing computational demands that hinder\nusability. However, recent studies aim to boost DNN efficiency by trimming\nredundant neurons or filters based on task relevance, but neglect their\nadaptability to distribution shifts. We aim to enhance these existing\ntechniques by introducing a companion network, Ghost Connect-Net (GC-Net), to\nmonitor the connections in the original network with distribution\ngeneralization advantage. GC-Net's weights represent connectivity measurements\nbetween consecutive layers of the original network. After pruning GC-Net, the\npruned locations are mapped back to the original network as pruned connections,\nallowing for the combination of magnitude and connectivity-based pruning\nmethods. Experimental results using common DNN benchmarks, such as CIFAR-10,\nFashion MNIST, and Tiny ImageNet show promising results for hybridizing the\nmethod, and using GC-Net guidance for later layers of a network and direct\npruning on earlier layers. 
We provide theoretical foundations for GC-Net's\napproach to improving generalization under distribution shifts.\n","authors":["Mary Isabelle Wisell","Salimeh Yasaei Sekeh"],"pdf_url":"https://arxiv.org/pdf/2411.09199v1.pdf","comment":"21 pages, 4 figures, 3 subfigures, 42 tables"},{"id":"http://arxiv.org/abs/2406.05964v2","updated":"2024-11-14T05:00:13Z","published":"2024-06-10T01:46:42Z","title":"Distributionally Robust Safe Sample Elimination under Covariate Shift","summary":" We consider a machine learning setup where one training dataset is used to\ntrain multiple models across slightly different data distributions. This occurs\nwhen customized models are needed for various deployment environments. To\nreduce storage and training costs, we propose the DRSSS method, which combines\ndistributionally robust (DR) optimization and safe sample screening (SSS). The\nkey benefit of this method is that models trained on the reduced dataset will\nperform the same as those trained on the full dataset for all possible\ndifferent environments. In this paper, we focus on covariate shift as a type of\ndata distribution change and demonstrate the effectiveness of our method\nthrough experiments.\n","authors":["Hiroyuki Hanada","Tatsuya Aoyama","Satoshi Akahane","Tomonari Tanaka","Yoshito Okura","Yu Inatsu","Noriaki Hashimoto","Shion Takeno","Taro Murayama","Hanju Lee","Shinya Kojima","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2406.05964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08318v2","updated":"2024-11-14T04:52:16Z","published":"2024-05-14T04:58:23Z","title":"No-Regret Learning of Nash Equilibrium for Black-Box Games via Gaussian\n Processes","summary":" This paper investigates the challenge of learning in black-box games, where\nthe underlying utility function is unknown to any of the agents. 
While there is\nan extensive body of literature on the theoretical analysis of algorithms for\ncomputing the Nash equilibrium with complete information about the game,\nstudies on Nash equilibrium in black-box games are less common. In this paper,\nwe focus on learning the Nash equilibrium when the only available information\nabout an agent's payoff comes in the form of empirical queries. We provide a\nno-regret learning algorithm that utilizes Gaussian processes to identify the\nequilibrium in such games. Our approach not only ensures a theoretical\nconvergence rate but also demonstrates effectiveness across a variety\ncollection of games through experimental validation.\n","authors":["Minbiao Han","Fengxue Zhang","Yuxin Chen"],"pdf_url":"https://arxiv.org/pdf/2405.08318v2.pdf","comment":"40th Conference on Uncertainty in Artificial Intelligence (UAI 2024)"},{"id":"http://arxiv.org/abs/2411.09184v1","updated":"2024-11-14T04:46:08Z","published":"2024-11-14T04:46:08Z","title":"Dynamic technology impact analysis: A multi-task learning approach to\n patent citation prediction","summary":" Machine learning (ML) models are valuable tools for analyzing the impact of\ntechnology using patent citation information. However, existing ML-based\nmethods often struggle to account for the dynamic nature of the technology\nimpact over time and the interdependencies of these impacts across different\nperiods. This study proposes a multi-task learning (MTL) approach to enhance\nthe prediction of technology impact across various time frames by leveraging\nknowledge sharing and simultaneously monitoring the evolution of technology\nimpact. First, we quantify the technology impacts and identify patterns through\ncitation analysis over distinct time periods. Next, we develop MTL models to\npredict citation counts using multiple patent indicators over time. 
Finally, we\nexamine the changes in key input indicators and their patterns over different\nperiods using the SHapley Additive exPlanation method. We also offer guidelines\nfor validating and interpreting the results by employing statistical methods\nand natural language processing techniques. A case study on battery\ntechnologies demonstrates that our approach not only deepens the understanding\nof technology impact, but also improves prediction accuracy, yielding valuable\ninsights for both academia and industry.\n","authors":["Youngjin Seol","Jaewoong Choi","Seunghyun Lee","Janghyeok Yoon"],"pdf_url":"https://arxiv.org/pdf/2411.09184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09181v1","updated":"2024-11-14T04:39:30Z","published":"2024-11-14T04:39:30Z","title":"DeBaTeR: Denoising Bipartite Temporal Graph for Recommendation","summary":" Due to the difficulty of acquiring large-scale explicit user feedback,\nimplicit feedback (e.g., clicks or other interactions) is widely applied as an\nalternative source of data, where user-item interactions can be modeled as a\nbipartite graph. Due to the noisy and biased nature of implicit real-world\nuser-item interactions, identifying and rectifying noisy interactions are vital\nto enhance model performance and robustness. Previous works on purifying\nuser-item interactions in collaborative filtering mainly focus on mining the\ncorrelation between user/item embeddings and noisy interactions, neglecting the\nbenefit of temporal patterns in determining noisy interactions. Time\ninformation, while enhancing the model utility, also bears its natural\nadvantage in helping to determine noisy edges, e.g., if someone usually watches\nhorror movies at night and talk shows in the morning, a record of watching a\nhorror movie in the morning is more likely to be noisy interaction. 
Armed with\nthis observation, we introduce a simple yet effective mechanism for generating\ntime-aware user/item embeddings and propose two strategies for denoising\nbipartite temporal graph in recommender systems (DeBaTeR): the first is through\nreweighting the adjacency matrix (DeBaTeR-A), where a reliability score is\ndefined to reweight the edges through both soft assignment and hard assignment;\nthe second is through reweighting the loss function (DeBaTeR-L), where weights\nare generated to reweight user-item samples in the losses. Extensive\nexperiments have been conducted to demonstrate the efficacy of our methods and\nillustrate how time information indeed helps identifying noisy edges.\n","authors":["Xinyu He","Jose Sepulveda","Mostafa Rahmani","Alyssa Woo","Fei Wang","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2411.09181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09178v1","updated":"2024-11-14T04:36:12Z","published":"2024-11-14T04:36:12Z","title":"SAFES: Sequential Privacy and Fairness Enhancing Data Synthesis for\n Responsible AI","summary":" As data-driven and AI-based decision making gains widespread adoption in most\ndisciplines, it is crucial that both data privacy and decision fairness are\nappropriately addressed. While differential privacy (DP) provides a robust\nframework for guaranteeing privacy and several widely accepted methods have\nbeen proposed for improving fairness, the vast majority of existing literature\ntreats the two concerns independently. For methods that do consider privacy and\nfairness simultaneously, they often only apply to a specific machine learning\ntask, limiting their generalizability. In response, we introduce SAFES, a\nSequential PrivAcy and Fairness Enhancing data Synthesis procedure that\nsequentially combines DP data synthesis with a fairness-aware data\ntransformation. SAFES allows full control over the privacy-fairness-utility\ntrade-off via tunable privacy and fairness parameters. 
We illustrate SAFES by\ncombining AIM, a graphical model-based DP data synthesizer, with a popular\nfairness-aware data pre-processing transformation. Empirical evaluations on the\nAdult and COMPAS datasets demonstrate that for reasonable privacy loss,\nSAFES-generated synthetic data achieve significantly improved fairness metrics\nwith relatively low utility loss.\n","authors":["Spencer Giddens","Fang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09175v1","updated":"2024-11-14T04:26:47Z","published":"2024-11-14T04:26:47Z","title":"Hybrid deep additive neural networks","summary":" Traditional neural networks (multi-layer perceptrons) have become an\nimportant tool in data science due to their success across a wide range of\ntasks. However, their performance is sometimes unsatisfactory, and they often\nrequire a large number of parameters, primarily due to their reliance on the\nlinear combination structure. Meanwhile, additive regression has been a popular\nalternative to linear regression in statistics. In this work, we introduce\nnovel deep neural networks that incorporate the idea of additive regression.\nOur neural networks share architectural similarities with Kolmogorov-Arnold\nnetworks but are based on simpler yet flexible activation and basis functions.\nAdditionally, we introduce several hybrid neural networks that combine this\narchitecture with that of traditional neural networks. We derive their\nuniversal approximation properties and demonstrate their effectiveness through\nsimulation studies and a real-data application. 
The numerical results indicate\nthat our neural networks generally achieve better performance than traditional\nneural networks while using fewer parameters.\n","authors":["Gyu Min Kim","Jeong Min Jeon"],"pdf_url":"https://arxiv.org/pdf/2411.09175v1.pdf","comment":"29 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.09174v1","updated":"2024-11-14T04:23:28Z","published":"2024-11-14T04:23:28Z","title":"Advancing Diffusion Models: Alias-Free Resampling and Enhanced\n Rotational Equivariance","summary":" Recent advances in image generation, particularly via diffusion models, have\nled to impressive improvements in image synthesis quality. Despite this,\ndiffusion models are still challenged by model-induced artifacts and limited\nstability in image fidelity. In this work, we hypothesize that the primary\ncause of this issue is the improper resampling operation that introduces\naliasing in the diffusion model and a careful alias-free resampling dictated by\nimage processing theory can improve the model's performance in image synthesis.\nWe propose the integration of alias-free resampling layers into the UNet\narchitecture of diffusion models without adding extra trainable parameters,\nthereby maintaining computational efficiency. We then assess whether these\ntheory-driven modifications enhance image quality and rotational equivariance.\nOur experimental results on benchmark datasets, including CIFAR-10, MNIST, and\nMNIST-M, reveal consistent gains in image quality, particularly in terms of FID\nand KID scores. Furthermore, we propose a modified diffusion process that\nenables user-controlled rotation of generated images without requiring\nadditional training. 
Our findings highlight the potential of theory-driven\nenhancements such as alias-free resampling in generative models to improve\nimage quality while maintaining model efficiency and pioneer future research\ndirections to incorporate them into video-generating diffusion models, enabling\ndeeper exploration of the applications of alias-free resampling in generative\nmodeling.\n","authors":["Md Fahim Anjum"],"pdf_url":"https://arxiv.org/pdf/2411.09174v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.04204v2","updated":"2024-11-14T04:14:55Z","published":"2024-11-06T19:02:42Z","title":"Online Budgeted Matching with General Bids","summary":" Online Budgeted Matching (OBM) is a classic problem with important\napplications in online advertising, online service matching, revenue\nmanagement, and beyond. Traditional online algorithms typically assume a small\nbid setting, where the maximum bid-to-budget ratio (\\kappa) is infinitesimally\nsmall. While recent algorithms have tried to address scenarios with non-small\nor general bids, they often rely on the Fractional Last Matching (FLM)\nassumption, which allows for accepting partial bids when the remaining budget\nis insufficient. This assumption, however, does not hold for many applications\nwith indivisible bids. In this paper, we remove the FLM assumption and tackle\nthe open problem of OBM with general bids. We first establish an upper bound of\n1-\\kappa on the competitive ratio for any deterministic online algorithm. We\nthen propose a novel meta algorithm, called MetaAd, which reduces to different\nalgorithms with first known provable competitive ratios parameterized by the\nmaximum bid-to-budget ratio \\kappa \\in [0, 1]. As a by-product, we extend\nMetaAd to the FLM setting and get provable competitive algorithms. 
Finally, we\napply our competitive analysis to the design learning-augmented algorithms.\n","authors":["Jianyi Yang","Pengfei Li","Adam Wierman","Shaolei Ren"],"pdf_url":"https://arxiv.org/pdf/2411.04204v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09170v1","updated":"2024-11-14T04:12:47Z","published":"2024-11-14T04:12:47Z","title":"Towards Scalable Handwriting Communication via EEG Decoding and Latent\n Embedding Integration","summary":" In recent years, brain-computer interfaces have made advances in decoding\nvarious motor-related tasks, including gesture recognition and movement\nclassification, utilizing electroencephalogram (EEG) data. These developments\nare fundamental in exploring how neural signals can be interpreted to recognize\nspecific physical actions. This study centers on a written alphabet\nclassification task, where we aim to decode EEG signals associated with\nhandwriting. To achieve this, we incorporate hand kinematics to guide the\nextraction of the consistent embeddings from high-dimensional neural recordings\nusing auxiliary variables (CEBRA). These CEBRA embeddings, along with the EEG,\nare processed by a parallel convolutional neural network model that extracts\nfeatures from both data sources simultaneously. The model classifies nine\ndifferent handwritten characters, including symbols such as exclamation marks\nand commas, within the alphabet. We evaluate the model using a quantitative\nfive-fold cross-validation approach and explore the structure of the embedding\nspace through visualizations. 
Our approach achieves a classification accuracy\nof 91 % for the nine-class task, demonstrating the feasibility of fine-grained\nhandwriting decoding from EEG.\n","authors":["Jun-Young Kim","Deok-Seon Kim","Seo-Hyun Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09170v1.pdf","comment":"4 pages, 2 figures, 1 table, Name of Conference: International\n Conference on Brain-Computer Interface"},{"id":"http://arxiv.org/abs/2411.09160v1","updated":"2024-11-14T03:28:02Z","published":"2024-11-14T03:28:02Z","title":"Rationality based Innate-Values-driven Reinforcement Learning","summary":" Innate values describe agents' intrinsic motivations, which reflect their\ninherent interests and preferences to pursue goals and drive them to develop\ndiverse skills satisfying their various needs. The essence of reinforcement\nlearning (RL) is learning from interaction based on reward-driven behaviors,\nmuch like natural agents. It is an excellent model to describe the\ninnate-values-driven (IV) behaviors of AI agents. Especially developing the\nawareness of the AI agent through balancing internal and external utilities\nbased on its needs in different tasks is a crucial problem for individuals\nlearning to support AI agents integrating human society with safety and harmony\nin the long term. This paper proposes a hierarchical compound intrinsic value\nreinforcement learning model -- innate-values-driven reinforcement learning\ntermed IVRL to describe the complex behaviors of AI agents' interaction. We\nformulated the IVRL model and proposed two IVRL models: DQN and A2C. 
By\ncomparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the\nRole-Playing Game (RPG) reinforcement learning test platform VIZDoom, we\ndemonstrated that rationally organizing various individual needs can\neffectively achieve better performance.\n","authors":["Qin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09160v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.05572"},{"id":"http://arxiv.org/abs/2411.09152v1","updated":"2024-11-14T03:07:57Z","published":"2024-11-14T03:07:57Z","title":"GRAINRec: Graph and Attention Integrated Approach for Real-Time\n Session-Based Item Recommendations","summary":" Recent advancements in session-based recommendation models using deep\nlearning techniques have demonstrated significant performance improvements.\nWhile they can enhance model sophistication and improve the relevance of\nrecommendations, they also make it challenging to implement a scalable\nreal-time solution. To addressing this challenge, we propose GRAINRec- a Graph\nand Attention Integrated session-based recommendation model that generates\nrecommendations in real-time. Our scope of work is item recommendations in\nonline retail where a session is defined as an ordered sequence of digital\nguest actions, such as page views or adds to cart. The proposed model generates\nrecommendations by considering the importance of all items in the session\ntogether, letting us predict relevant recommendations dynamically as the\nsession evolves. We also propose a heuristic approach to implement real-time\ninferencing that meets Target platform's service level agreement (SLA). The\nproposed architecture lets us predict relevant recommendations dynamically as\nthe session evolves, rather than relying on pre-computed recommendations for\neach item. Evaluation results of the proposed model show an average improvement\nof 1.5% across all offline evaluation metrics. 
A/B tests done over a 2 week\nduration showed an increase of 10% in click through rate and 9% increase in\nattributable demand. Extensive ablation studies are also done to understand our\nmodel performance for different parameters.\n","authors":["Bhavtosh Rath","Pushkar Chennu","David Relyea","Prathyusha Kanmanth Reddy","Amit Pande"],"pdf_url":"https://arxiv.org/pdf/2411.09152v1.pdf","comment":"Accepted to the 2024 IEEE International Conference on Big Data (IEEE\n BigData 2024)"},{"id":"http://arxiv.org/abs/2411.09142v1","updated":"2024-11-14T02:52:47Z","published":"2024-11-14T02:52:47Z","title":"Laplace Transform Interpretation of Differential Privacy","summary":" We introduce a set of useful expressions of Differential Privacy (DP) notions\nin terms of the Laplace transform of the privacy loss distribution. Its bare\nform expression appears in several related works on analyzing DP, either as an\nintegral or an expectation. We show that recognizing the expression as a\nLaplace transform unlocks a new way to reason about DP properties by exploiting\nthe duality between time and frequency domains. Leveraging our interpretation,\nwe connect the $(q, \\rho(q))$-R\\'enyi DP curve and the $(\\epsilon,\n\\delta(\\epsilon))$-DP curve as being the Laplace and inverse-Laplace transforms\nof one another. This connection shows that the R\\'enyi divergence is\nwell-defined for complex orders $q = \\gamma + i \\omega$. Using our Laplace\ntransform-based analysis, we also prove an adaptive composition theorem for\n$(\\epsilon, \\delta)$-DP guarantees that is exactly tight (i.e., matches even in\nconstants) for all values of $\\epsilon$. 
Additionally, we resolve an issue\nregarding symmetry of $f$-DP on subsampling that prevented equivalence across\nall functional DP notions.\n","authors":["Rishav Chourasia","Uzair Javaid","Biplap Sikdar"],"pdf_url":"https://arxiv.org/pdf/2411.09142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09417v3","updated":"2024-11-14T02:00:33Z","published":"2024-01-17T18:56:18Z","title":"Vision Mamba: Efficient Visual Representation Learning with\n Bidirectional State Space Model","summary":" Recently the state space models (SSMs) with efficient hardware-aware designs,\ni.e., the Mamba deep learning model, have shown great potential for long\nsequence modeling. Meanwhile building efficient and generic vision backbones\npurely upon SSMs is an appealing direction. However, representing visual data\nis challenging for SSMs due to the position-sensitivity of visual data and the\nrequirement of global context for visual understanding. In this paper, we show\nthat the reliance on self-attention for visual representation learning is not\nnecessary and propose a new generic vision backbone with bidirectional Mamba\nblocks (Vim), which marks the image sequences with position embeddings and\ncompresses the visual representation with bidirectional state space models. On\nImageNet classification, COCO object detection, and ADE20k semantic\nsegmentation tasks, Vim achieves higher performance compared to\nwell-established vision transformers like DeiT, while also demonstrating\nsignificantly improved computation & memory efficiency. For example, Vim is\n2.8$\\times$ faster than DeiT and saves 86.8% GPU memory when performing batch\ninference to extract features on images with a resolution of 1248$\\times$1248.\nThe results demonstrate that Vim is capable of overcoming the computation &\nmemory constraints on performing Transformer-style understanding for\nhigh-resolution images and it has great potential to be the next-generation\nbackbone for vision foundation models. 
Code is available at\nhttps://github.com/hustvl/Vim.\n","authors":["Lianghui Zhu","Bencheng Liao","Qian Zhang","Xinlong Wang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09417v3.pdf","comment":"Vision Mamba (Vim) is accepted by ICML 2024. Code is available at\n https://github.com/hustvl/Vim"},{"id":"http://arxiv.org/abs/2411.09127v1","updated":"2024-11-14T02:00:22Z","published":"2024-11-14T02:00:22Z","title":"Complexity-Aware Training of Deep Neural Networks for Optimal Structure\n Discovery","summary":" We propose a novel algorithm for combined unit/filter and layer pruning of\ndeep neural networks that functions during training and without requiring a\npre-trained network to apply. Our algorithm optimally trades-off learning\naccuracy and pruning levels while balancing layer vs. unit/filter pruning and\ncomputational vs. parameter complexity using only three user-defined\nparameters, which are easy to interpret and tune. The optimal network structure\nis found as the solution of a stochastic optimization problem over the network\nweights and the parameters of variational Bernoulli distributions for 0/1\nRandom Variables scaling the units and layers of the network. Pruning occurs\nwhen a variational parameter converges to 0 rendering the corresponding\nstructure permanently inactive, thus saving computations during training and\nprediction. A key contribution of our approach is to define a cost function\nthat combines the objectives of prediction accuracy and network pruning in a\ncomputational/parameter complexity-aware manner and the automatic selection of\nthe many regularization parameters. We show that the solutions of the\noptimization problem to which the algorithm converges are deterministic\nnetworks. We analyze the ODE system that underlies our stochastic optimization\nalgorithm and establish domains of attraction around zero for the dynamics of\nthe network parameters. 
These results provide theoretical support for safely\npruning units/filters and/or layers during training and lead to practical\npruning conditions. We evaluate our method on the CIFAR-10/100 and ImageNet\ndatasets using ResNet architectures and demonstrate that our method improves\nupon layer only or unit only pruning and favorably competes with combined\nunit/filter and layer pruning algorithms requiring pre-trained networks with\nrespect to pruning ratios and test accuracy.\n","authors":["Valentin Frank Ingmar Guenter","Athanasios Sideris"],"pdf_url":"https://arxiv.org/pdf/2411.09127v1.pdf","comment":"28 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2411.09120v1","updated":"2024-11-14T01:41:00Z","published":"2024-11-14T01:41:00Z","title":"Neural Graph Simulator for Complex Systems","summary":" Numerical simulation is a predominant tool for studying the dynamics in\ncomplex systems, but large-scale simulations are often intractable due to\ncomputational limitations. Here, we introduce the Neural Graph Simulator (NGS)\nfor simulating time-invariant autonomous systems on graphs. Utilizing a graph\nneural network, the NGS provides a unified framework to simulate diverse\ndynamical systems with varying topologies and sizes without constraints on\nevaluation times through its non-uniform time step and autoregressive approach.\nThe NGS offers significant advantages over numerical solvers by not requiring\nprior knowledge of governing equations and effectively handling noisy or\nmissing data with a robust training scheme. It demonstrates superior\ncomputational efficiency over conventional methods, improving performance by\nover $10^5$ times in stiff problems. Furthermore, it is applied to real traffic\ndata, forecasting traffic flow with state-of-the-art accuracy. The versatility\nof the NGS extends beyond the presented cases, offering numerous potential\navenues for enhancement.\n","authors":["Hoyun Choi","Sungyeop Lee","B. 
Kahng","Junghyo Jo"],"pdf_url":"https://arxiv.org/pdf/2411.09120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09118v1","updated":"2024-11-14T01:37:24Z","published":"2024-11-14T01:37:24Z","title":"FxTS-Net: Fixed-Time Stable Learning Framework for Neural ODEs","summary":" Neural Ordinary Differential Equations (Neural ODEs), as a novel category of\nmodeling big data methods, cleverly link traditional neural networks and\ndynamical systems. However, it is challenging to ensure the dynamics system\nreaches a correctly predicted state within a user-defined fixed time. To\naddress this problem, we propose a new method for training Neural ODEs using\nfixed-time stability (FxTS) Lyapunov conditions. Our framework, called\nFxTS-Net, is based on the novel FxTS loss (FxTS-Loss) designed on Lyapunov\nfunctions, which aims to encourage convergence to accurate predictions in a\nuser-defined fixed time. We also provide an innovative approach for\nconstructing Lyapunov functions to meet various tasks and network architecture\nrequirements, achieved by leveraging supervised information during training. By\ndeveloping a more precise time upper bound estimation for bounded\nnon-vanishingly perturbed systems, we demonstrate that minimizing FxTS-Loss not\nonly guarantees FxTS behavior of the dynamics but also input perturbation\nrobustness. For optimising FxTS-Loss, we also propose a learning algorithm, in\nwhich the simulated perturbation sampling method can capture sample points in\ncritical regions to approximate FxTS-Loss. 
Experimentally, we find that\nFxTS-Net provides better prediction performance and better robustness under\ninput perturbation.\n","authors":["Chaoyang Luo","Yan Zou","Wanying Li","Nanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2411.09118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09117v1","updated":"2024-11-14T01:37:02Z","published":"2024-11-14T01:37:02Z","title":"Efficiently learning and sampling multimodal distributions with\n data-based initialization","summary":" We consider the problem of sampling a multimodal distribution with a Markov\nchain given a small number of samples from the stationary measure. Although\nmixing can be arbitrarily slow, we show that if the Markov chain has a $k$th\norder spectral gap, initialization from a set of $\\tilde O(k/\\varepsilon^2)$\nsamples from the stationary distribution will, with high probability over the\nsamples, efficiently generate a sample whose conditional law is\n$\\varepsilon$-close in TV distance to the stationary measure. In particular,\nthis applies to mixtures of $k$ distributions satisfying a Poincar\\'e\ninequality, with faster convergence when they satisfy a log-Sobolev inequality.\nOur bounds are stable to perturbations to the Markov chain, and in particular\nwork for Langevin diffusion over $\\mathbb R^d$ with score estimation error, as\nwell as Glauber dynamics combined with approximation error from\npseudolikelihood estimation. This justifies the success of data-based\ninitialization for score matching methods despite slow mixing for the data\ndistribution, and improves and generalizes the results of Koehler and Vuong\n(2023) to have linear, rather than exponential, dependence on $k$ and apply to\narbitrary semigroups. 
As a consequence of our results, we show for the first\ntime that a natural class of low-complexity Ising measures can be efficiently\nlearned from samples.\n","authors":["Frederic Koehler","Holden Lee","Thuy-Duong Vuong"],"pdf_url":"https://arxiv.org/pdf/2411.09117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13306v4","updated":"2024-11-14T01:18:01Z","published":"2023-01-30T21:59:30Z","title":"Autobidders with Budget and ROI Constraints: Efficiency, Regret, and\n Pacing Dynamics","summary":" We study a game between autobidding algorithms that compete in an online\nadvertising platform. Each autobidder is tasked with maximizing its\nadvertiser's total value over multiple rounds of a repeated auction, subject to\nbudget and return-on-investment constraints. We propose a gradient-based\nlearning algorithm that is guaranteed to satisfy all constraints and achieves\nvanishing individual regret. Our algorithm uses only bandit feedback and can be\nused with the first- or second-price auction, as well as with any\n\"intermediate\" auction format. Our main result is that when these autobidders\nplay against each other, the resulting expected liquid welfare over all rounds\nis at least half of the expected optimal liquid welfare achieved by any\nallocation. This holds whether or not the bidding dynamics converges to an\nequilibrium.\n","authors":["Brendan Lucier","Sarath Pattathil","Aleksandrs Slivkins","Mengxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.13306v4.pdf","comment":"Appeared at COLT 2024. 
Numerical experiments added since Jun'24\n version"},{"id":"http://arxiv.org/abs/2411.09111v1","updated":"2024-11-14T00:59:13Z","published":"2024-11-14T00:59:13Z","title":"Reducing Reasoning Costs - The Path of Optimization for Chain of Thought\n via Sparse Attention Mechanism","summary":" In order to address the chain of thought in the large language model\ninference cost surge, this research proposes to use a sparse attention\nmechanism that only focuses on a few relevant tokens. The researcher\nconstructed a new attention mechanism and used GiantRabbit trained with custom\nGPTs as an experimental tool. The experiment tested and compared the reasoning\ntime, correctness score and chain of thought length of this model and o1\nPreview in solving the linear algebra test questions of MIT OpenCourseWare. The\nresults show that GiantRabbit's reasoning time and chain of thought length are\nsignificantly lower than o1 Preview, confirming the feasibility of the sparse\nattention mechanism in reducing chain of thought reasoning. Detailed\narchitectural details and experimental process have been uploaded to Github,\nthe link is:https://github.com/brucewang123456789/GeniusTrail.git.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09111v1.pdf","comment":"The main text is 9 pages, totaling 13 pages; 5 figures, 3 tables;\n preprints have been submitted to NeurIPS 2024 Workshop MusIML and OpenReview"},{"id":"http://arxiv.org/abs/2410.13986v2","updated":"2024-11-14T00:09:09Z","published":"2024-10-17T19:32:25Z","title":"Recurrent Neural Goodness-of-Fit Test for Time Series","summary":" Time series data are crucial across diverse domains such as finance and\nhealthcare, where accurate forecasting and decision-making rely on advanced\nmodeling techniques. While generative models have shown great promise in\ncapturing the intricate dynamics inherent in time series, evaluating their\nperformance remains a major challenge. 
Traditional evaluation metrics fall\nshort due to the temporal dependencies and potential high dimensionality of the\nfeatures. In this paper, we propose the REcurrent NeurAL (RENAL)\nGoodness-of-Fit test, a novel and statistically rigorous framework for\nevaluating generative time series models. By leveraging recurrent neural\nnetworks, we transform the time series into conditionally independent data\npairs, enabling the application of a chi-square-based goodness-of-fit test to\nthe temporal dependencies within the data. This approach offers a robust,\ntheoretically grounded solution for assessing the quality of generative models,\nparticularly in settings with limited time sequences. We demonstrate the\nefficacy of our method across both synthetic and real-world datasets,\noutperforming existing methods in terms of reliability and accuracy. Our method\nfills a critical gap in the evaluation of time series generative models,\noffering a tool that is both practical and adaptable to high-stakes\napplications.\n","authors":["Aoran Zhang","Wenbin Zhou","Liyan Xie","Shixiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.13986v2.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.09678v1","updated":"2024-11-14T18:44:31Z","published":"2024-11-14T18:44:31Z","title":"NeuralDEM -- Real-time Simulation of Industrial Particulate Flows","summary":" Advancements in computing power have made it possible to numerically simulate\nlarge-scale fluid-mechanical and/or particulate systems, many of which are\nintegral to core industrial processes. Among the different numerical methods\navailable, the discrete element method (DEM) provides one of the most accurate\nrepresentations of a wide range of physical systems involving granular and\ndiscontinuous materials. Consequently, DEM has become a widely accepted\napproach for tackling engineering problems connected to granular flows and\npowder mechanics. 
Additionally, DEM can be integrated with grid-based\ncomputational fluid dynamics (CFD) methods, enabling the simulation of chemical\nprocesses taking place, e.g., in fluidized beds. However, DEM is\ncomputationally intensive because of the intrinsic multiscale nature of\nparticulate systems, restricting simulation duration or number of particles.\nTowards this end, NeuralDEM presents an end-to-end approach to replace slow\nnumerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM\nis capable of picturing long-term transport processes across different regimes\nusing macroscopic observables without any reference to microscopic model\nparameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an\nunderlying continuous field, while simultaneously modeling macroscopic behavior\ndirectly as additional auxiliary fields. Second, NeuralDEM introduces\nmulti-branch neural operators scalable to real-time modeling of\nindustrially-sized scenarios - from slow and pseudo-steady to fast and\ntransient. Such scenarios have previously posed insurmountable challenges for\ndeep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM\nfluidized bed reactors of 160k CFD cells and 500k DEM particles for\ntrajectories of 28s. 
NeuralDEM will open many new doors to advanced engineering\nand much faster process cycles.\n","authors":["Benedikt Alkin","Tobias Kronlachner","Samuele Papa","Stefan Pirker","Thomas Lichtenegger","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2411.09678v1.pdf","comment":"Project page: https://nx-ai.github.io/NeuralDEM/"},{"id":"http://arxiv.org/abs/2411.09267v1","updated":"2024-11-14T08:08:25Z","published":"2024-11-14T08:08:25Z","title":"Towards efficient compression and communication for prototype-based\n decentralized learning","summary":" In prototype-based federated learning, the exchange of model parameters\nbetween clients and the master server is replaced by transmission of prototypes\nor quantized versions of the data samples to the aggregation server. A fully\ndecentralized deployment of prototype-based learning, without a central\nagregartor of prototypes, is more robust upon network failures and reacts\nfaster to changes in the statistical distribution of the data, suggesting\npotential advantages and quick adaptation in dynamic learning tasks, e.g., when\nthe data sources are IoT devices or when data is non-iid. In this paper, we\nconsider the problem of designing a communication-efficient decentralized\nlearning system based on prototypes. We address the challenge of prototype\nredundancy by leveraging on a twofold data compression technique, i.e., sending\nonly update messages if the prototypes are informationtheoretically useful (via\nthe Jensen-Shannon distance), and using clustering on the prototypes to\ncompress the update messages used in the gossip protocol. We also use parallel\ninstead of sequential gossiping, and present an analysis of its\nage-of-information (AoI). Our experimental results show that, with these\nimprovements, the communications load can be substantially reduced without\ndecreasing the convergence rate of the learning algorithm.\n","authors":["Pablo Fernández-Piñeiro","Manuel Ferández-Veiga","Rebeca P. 
Díaz-Redondo","Ana Fernández-Vilas","Martín González-Soto"],"pdf_url":"https://arxiv.org/pdf/2411.09267v1.pdf","comment":"15 pages, 2 tables, 7 figures, 6 algorithms"},{"id":"http://arxiv.org/abs/2411.09152v1","updated":"2024-11-14T03:07:57Z","published":"2024-11-14T03:07:57Z","title":"GRAINRec: Graph and Attention Integrated Approach for Real-Time\n Session-Based Item Recommendations","summary":" Recent advancements in session-based recommendation models using deep\nlearning techniques have demonstrated significant performance improvements.\nWhile they can enhance model sophistication and improve the relevance of\nrecommendations, they also make it challenging to implement a scalable\nreal-time solution. To addressing this challenge, we propose GRAINRec: a Graph\nand Attention Integrated session-based recommendation model that generates\nrecommendations in real-time. Our scope of work is item recommendations in\nonline retail where a session is defined as an ordered sequence of digital\nguest actions, such as page views or adds to cart. The proposed model generates\nrecommendations by considering the importance of all items in the session\ntogether, letting us predict relevant recommendations dynamically as the\nsession evolves. We also propose a heuristic approach to implement real-time\ninferencing that meets Target platform's service level agreement (SLA). The\nproposed architecture lets us predict relevant recommendations dynamically as\nthe session evolves, rather than relying on pre-computed recommendations for\neach item. Evaluation results of the proposed model show an average improvement\nof 1.5% across all offline evaluation metrics. A/B tests done over a 2 week\nduration showed an increase of 10% in click through rate and 9% increase in\nattributable demand. 
Extensive ablation studies are also done to understand our\nmodel performance for different parameters.\n","authors":["Bhavtosh Rath","Pushkar Chennu","David Relyea","Prathyusha Kanmanth Reddy","Amit Pande"],"pdf_url":"https://arxiv.org/pdf/2411.09152v1.pdf","comment":"Accepted to the 2024 IEEE International Conference on Big Data (IEEE\n BigData 2024)"},{"id":"http://arxiv.org/abs/2411.09111v1","updated":"2024-11-14T00:59:13Z","published":"2024-11-14T00:59:13Z","title":"Reducing Reasoning Costs -- The Path of Optimization for Chain of\n Thought via Sparse Attention Mechanism","summary":" In order to address the chain of thought in the large language model\ninference cost surge, this research proposes to use a sparse attention\nmechanism that only focuses on a few relevant tokens. The researcher\nconstructed a new attention mechanism and used GiantRabbit trained with custom\nGPTs as an experimental tool. The experiment tested and compared the reasoning\ntime, correctness score and chain of thought length of this model and o1\nPreview in solving the linear algebra test questions of MIT OpenCourseWare. The\nresults show that GiantRabbit's reasoning time and chain of thought length are\nsignificantly lower than o1 Preview, confirming the feasibility of the sparse\nattention mechanism in reducing chain of thought reasoning. 
Detailed\narchitectural details and experimental process have been uploaded to Github,\nthe link is:https://github.com/brucewang123456789/GeniusTrail.git.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09111v1.pdf","comment":"The main text is 9 pages, totaling 13 pages; 5 figures, 3 tables;\n preprints have been submitted to NeurIPS 2024 Workshop MusIML and OpenReview"},{"id":"http://arxiv.org/abs/2411.09849v1","updated":"2024-11-14T23:56:57Z","published":"2024-11-14T23:56:57Z","title":"Self-Supervised Radio Pre-training: Toward Foundational Models for\n Spectrogram Learning","summary":" Foundational deep learning (DL) models are general models, trained on large,\ndiverse, and unlabelled datasets, typically using self-supervised learning\ntechniques have led to significant advancements especially in natural language\nprocessing. These pretrained models can be fine-tuned for related downstream\ntasks, offering faster development and reduced training costs, while often\nachieving improved performance. In this work, we introduce Masked Spectrogram\nModeling, a novel self-supervised learning approach for pretraining\nfoundational DL models on radio signals. Adopting a Convolutional LSTM\narchitecture for efficient spatio-temporal processing, we pretrain the model\nwith an unlabelled radio dataset collected from over-the-air measurements.\nSubsequently, the pretrained model is fine-tuned for two downstream tasks:\nspectrum forecasting and segmentation. 
Experimental results demonstrate that\nour methodology achieves competitive performance in both forecasting accuracy\nand segmentation, validating its effectiveness for developing foundational\nradio models.\n","authors":["Ahmed Aboulfotouh","Ashkan Eshaghbeigi","Dimitrios Karslidis","Hatem Abou-Zeid"],"pdf_url":"https://arxiv.org/pdf/2411.09849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01603v3","updated":"2024-11-14T23:56:22Z","published":"2024-06-26T17:33:21Z","title":"A Review of Large Language Models and Autonomous Agents in Chemistry","summary":" Large language models (LLMs) have emerged as powerful tools in chemistry,\nsignificantly impacting molecule design, property prediction, and synthesis\noptimization. This review highlights LLM capabilities in these domains and\ntheir potential to accelerate scientific discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to\ninteract with their surrounding environment. These agents perform diverse tasks\nsuch as paper scraping, interfacing with automated laboratories, and synthesis\nplanning. As agents are an emerging topic, we extend the scope of our review of\nagents beyond chemistry and discuss across any scientific domains. This review\ncovers the recent history, current capabilities, and design of LLMs and\nautonomous agents, addressing specific challenges, opportunities, and future\ndirections in chemistry. Key challenges include data quality and integration,\nmodel interpretability, and the need for standard benchmarks, while future\ndirections point towards more sophisticated multi-modal agents and enhanced\ncollaboration between agents and experimental methods. Due to the quick pace of\nthis field, a repository has been built to keep track of the latest studies:\nhttps://github.com/ur-whitelab/LLMs-in-science.\n","authors":["Mayk Caldas Ramos","Christopher J. Collison","Andrew D. 
White"],"pdf_url":"https://arxiv.org/pdf/2407.01603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13413v2","updated":"2024-11-14T23:54:29Z","published":"2024-05-22T07:48:24Z","title":"Boosted Neural Decoders: Achieving Extreme Reliability of LDPC Codes for\n 6G Networks","summary":" Ensuring extremely high reliability in channel coding is essential for 6G\nnetworks. The next-generation of ultra-reliable and low-latency communications\n(xURLLC) scenario within 6G networks requires frame error rate (FER) below\n$10^{-9}$. However, low-density parity-check (LDPC) codes, the standard in 5G\nnew radio (NR), encounter a challenge known as the error floor phenomenon,\nwhich hinders to achieve such low rates. To tackle this problem, we introduce\nan innovative solution: boosted neural min-sum (NMS) decoder. This decoder\noperates identically to conventional NMS decoders, but is trained by novel\ntraining methods including: i) boosting learning with uncorrected vectors, ii)\nblock-wise training schedule to address the vanishing gradient issue, iii)\ndynamic weight sharing to minimize the number of trainable weights, iv)\ntransfer learning to reduce the required sample count, and v) data augmentation\nto expedite the sampling process. Leveraging these training strategies, the\nboosted NMS decoder achieves the state-of-the art performance in reducing the\nerror floor as well as superior waterfall performance. Remarkably, we fulfill\nthe 6G xURLLC requirement for 5G LDPC codes without a severe error floor.\nAdditionally, the boosted NMS decoder, once its weights are trained, can\nperform decoding without additional modules, making it highly practical for\nimmediate application. 
The source code is available at\nhttps://github.com/ghy1228/LDPC_Error_Floor.\n","authors":["Hee-Youl Kwak","Dae-Young Yun","Yongjune Kim","Sang-Hyo Kim","Jong-Seon No"],"pdf_url":"https://arxiv.org/pdf/2405.13413v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.04268v2","updated":"2024-11-14T23:46:34Z","published":"2024-08-08T07:11:57Z","title":"Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs\n Gaussian-Based Methods","summary":" Exploring the capabilities of Neural Radiance Fields (NeRF) and\nGaussian-based methods in the context of 3D scene reconstruction, this study\ncontrasts these modern approaches with traditional Simultaneous Localization\nand Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we\nassess performance based on tracking accuracy, mapping fidelity, and view\nsynthesis. Findings reveal that NeRF excels in view synthesis, offering unique\ncapabilities in generating new perspectives from existing data, albeit at\nslower processing speeds. Conversely, Gaussian-based methods provide rapid\nprocessing and significant expressiveness but lack comprehensive scene\ncompletion. Enhanced by global optimization and loop closure techniques, newer\nmethods like NICE-SLAM and SplaTAM not only surpass older frameworks such as\nORB-SLAM2 in terms of robustness but also demonstrate superior performance in\ndynamic and complex environments. 
This comparative analysis bridges theoretical\nresearch with practical implications, shedding light on future developments in\nrobust 3D scene reconstruction across various real-world applications.\n","authors":["Yiming Zhou","Zixuan Zeng","Andi Chen","Xiaofan Zhou","Haowei Ni","Shiyao Zhang","Panfeng Li","Liangxi Liu","Mengyao Zheng","Xupeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.04268v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.09847v1","updated":"2024-11-14T23:34:38Z","published":"2024-11-14T23:34:38Z","title":"Towards a Fairer Non-negative Matrix Factorization","summary":" Topic modeling, or more broadly, dimensionality reduction, techniques provide\npowerful tools for uncovering patterns in large datasets and are widely applied\nacross various domains. We investigate how Non-negative Matrix Factorization\n(NMF) can introduce bias in the representation of data groups, such as those\ndefined by demographics or protected attributes. We present an approach, called\nFairer-NMF, that seeks to minimize the maximum reconstruction loss for\ndifferent groups relative to their size and intrinsic complexity. Further, we\npresent two algorithms for solving this problem. The first is an alternating\nminimization (AM) scheme and the second is a multiplicative updates (MU) scheme\nwhich demonstrates a reduced computational time compared to AM while still\nachieving similar performance. 
Lastly, we present numerical experiments on\nsynthetic and real datasets to evaluate the overall performance and trade-offs\nof Fairer-NMF\n","authors":["Lara Kassab","Erin George","Deanna Needell","Haowen Geng","Nika Jafar Nia","Aoxi Li"],"pdf_url":"https://arxiv.org/pdf/2411.09847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09844v1","updated":"2024-11-14T23:19:55Z","published":"2024-11-14T23:19:55Z","title":"Deep Autoencoders for Unsupervised Anomaly Detection in Wildfire\n Prediction","summary":" Wildfires pose a significantly increasing hazard to global ecosystems due to\nthe climate crisis. Due to its complex nature, there is an urgent need for\ninnovative approaches to wildfire prediction, such as machine learning. This\nresearch took a unique approach, differentiating from classical supervised\nlearning, and addressed the gap in unsupervised wildfire prediction using\nautoencoders and clustering techniques for anomaly detection. Historical\nweather and normalised difference vegetation index datasets of Australia for\n2005 - 2021 were utilised. Two main unsupervised approaches were analysed. The\nfirst used a deep autoencoder to obtain latent features, which were then fed\ninto clustering models, isolation forest, local outlier factor and one-class\nSVM for anomaly detection. The second approach used a deep autoencoder to\nreconstruct the input data and use reconstruction errors to identify anomalies.\nLong Short-Term Memory (LSTM) autoencoders and fully connected (FC)\nautoencoders were employed in this part, both in an unsupervised way learning\nonly from nominal data. The FC autoencoder outperformed its counterparts,\nachieving an accuracy of 0.71, an F1-score of 0.74, and an MCC of 0.42. 
These\nfindings highlight the practicality of this method, as it effectively predicts\nwildfires in the absence of ground truth, utilising an unsupervised learning\ntechnique.\n","authors":["İrem Üstek","Miguel Arana-Catania","Alexander Farr","Ivan Petrunin"],"pdf_url":"https://arxiv.org/pdf/2411.09844v1.pdf","comment":"33 pages, 18 figure, 16 tables. To appear in Earth and Space Science"},{"id":"http://arxiv.org/abs/2411.09842v1","updated":"2024-11-14T23:14:43Z","published":"2024-11-14T23:14:43Z","title":"FedRewind: Rewinding Continual Model Exchange for Decentralized\n Federated Learning","summary":" In this paper, we present FedRewind, a novel approach to decentralized\nfederated learning that leverages model exchange among nodes to address the\nissue of data distribution shift. Drawing inspiration from continual learning\n(CL) principles and cognitive neuroscience theories for memory retention,\nFedRewind implements a decentralized routing mechanism where nodes send/receive\nmodels to/from other nodes in the federation to address spatial distribution\nchallenges inherent in distributed learning (FL). During local training,\nfederation nodes periodically send their models back (i.e., rewind) to the\nnodes they received them from for a limited number of iterations. This strategy\nreduces the distribution shift between nodes' data, leading to enhanced\nlearning and generalization performance. We evaluate our method on multiple\nbenchmarks, demonstrating its superiority over standard decentralized federated\nlearning methods and those enforcing specific routing schemes within the\nfederation. 
Furthermore, the combination of federated and continual learning\nconcepts enables our method to tackle the more challenging federated continual\nlearning task, with data shifts over both space and time, surpassing existing\nbaselines.\n","authors":["Luca Palazzo","Matteo Pennisi","Federica Proietto Salanitri","Giovanni Bellitto","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2411.09842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10296v4","updated":"2024-11-14T23:07:03Z","published":"2024-04-16T05:40:30Z","title":"Interpolating neural network: A lightweight yet precise architecture for\n data training, equation solving, and parameter calibration","summary":" Artificial intelligence (AI) has revolutionized software development,\nshifting from task-specific codes (Software 1.0) to neural network-based\napproaches (Software 2.0). However, applying this transition in engineering\nsoftware presents challenges, including low surrogate model accuracy, the curse\nof dimensionality in inverse design, and rising complexity in physical\nsimulations. We introduce an interpolating neural network (INN), grounded in\ninterpolation theory and tensor decomposition, to realize Engineering Software\n2.0 by advancing data training, partial differential equation solving, and\nparameter calibration. INN offers orders of magnitude fewer trainable/solvable\nparameters for comparable model accuracy than traditional multi-layer\nperceptron (MLP) or physics-informed neural networks (PINN). Demonstrated in\nmetal additive manufacturing, INN rapidly constructs an accurate surrogate\nmodel of Laser Powder Bed Fusion (L-PBF) heat transfer simulation, achieving\nsub-10-micrometer resolution for a 10 mm path in under 15 minutes on a single\nGPU. This makes a transformative step forward across all domains essential to\nengineering software.\n","authors":["Chanwook Park","Sourav Saha","Jiachen Guo","Hantao Zhang","Xiaoyu Xie","Miguel A. 
Bessa","Dong Qian","Wei Chen","Gregory J. Wagner","Jian Cao","Wing Kam Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10296v4.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09837v1","updated":"2024-11-14T23:02:30Z","published":"2024-11-14T23:02:30Z","title":"Real-time Adapting Routing (RAR): Improving Efficiency Through\n Continuous Learning in Software Powered by Layered Foundation Models","summary":" To balance the quality and inference cost of a Foundation Model (FM, such as\nlarge language models (LLMs)) powered software, people often opt to train a\nrouting model that routes requests to FMs with different sizes and\ncapabilities. Existing routing models rely on learning the optimal routing\ndecision from carefully curated data, require complex computations to be\nupdated, and do not consider the potential evolution of weaker FMs. In this\npaper, we propose Real-time Adaptive Routing (RAR), an approach to continuously\nadapt FM routing decisions while using guided in-context learning to enhance\nthe capabilities of weaker FM. The goal is to reduce reliance on stronger, more\nexpensive FMs. We evaluate our approach on different subsets of the popular\nMMLU benchmark. Over time, our approach routes 50.2% fewer requests to\ncomputationally expensive models while maintaining around 90.5% of the general\nresponse quality. 
In addition, the guides generated from stronger models have\nshown intra-domain generalization and led to a better quality of responses\ncompared to an equivalent approach with a standalone weaker FM.\n","authors":["Kirill Vasilevski","Dayi Lin","Ahmed Hassan"],"pdf_url":"https://arxiv.org/pdf/2411.09837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09827v1","updated":"2024-11-14T22:24:59Z","published":"2024-11-14T22:24:59Z","title":"The Good, The Efficient and the Inductive Biases: Exploring Efficiency\n in Deep Learning Through the Use of Inductive Biases","summary":" The emergence of Deep Learning has marked a profound shift in machine\nlearning, driven by numerous breakthroughs achieved in recent years. However,\nas Deep Learning becomes increasingly present in everyday tools and\napplications, there is a growing need to address unresolved challenges related\nto its efficiency and sustainability. This dissertation delves into the role of\ninductive biases -- particularly, continuous modeling and symmetry preservation\n-- as strategies to enhance the efficiency of Deep Learning. It is structured\nin two main parts.\n The first part investigates continuous modeling as a tool to improve the\nefficiency of Deep Learning algorithms. Continuous modeling involves the idea\nof parameterizing neural operations in a continuous space. The research\npresented here demonstrates substantial benefits for the (i) computational\nefficiency -- in time and memory, (ii) the parameter efficiency, and (iii)\ndesign efficiency -- the complexity of designing neural architectures for new\ndatasets and tasks.\n The second focuses on the role of symmetry preservation on Deep Learning\nefficiency. Symmetry preservation involves designing neural operations that\nalign with the inherent symmetries of data. The research presented in this part\nhighlights significant gains both in data and parameter efficiency through the\nuse of symmetry preservation. 
However, it also acknowledges a resulting\ntrade-off of increased computational costs.\n The dissertation concludes with a critical evaluation of these findings,\nopenly discussing their limitations and proposing strategies to address them,\ninformed by literature and the author insights. It ends by identifying\npromising future research avenues in the exploration of inductive biases for\nefficiency, and their wider implications for Deep Learning.\n","authors":["David W. Romero"],"pdf_url":"https://arxiv.org/pdf/2411.09827v1.pdf","comment":"PhD Dissertation"},{"id":"http://arxiv.org/abs/2407.07333v3","updated":"2024-11-14T22:17:25Z","published":"2024-07-10T03:04:20Z","title":"Mitigating Partial Observability in Sequential Decision Processes via\n the Lambda Discrepancy","summary":" Reinforcement learning algorithms typically rely on the assumption that the\nenvironment dynamics and value function can be expressed in terms of a\nMarkovian state representation. However, when state information is only\npartially observable, how can an agent learn such a state representation, and\nhow can it detect when it has found one? We introduce a metric that can\naccomplish both objectives, without requiring access to -- or knowledge of --\nan underlying, unobservable state space. Our metric, the $\\lambda$-discrepancy,\nis the difference between two distinct temporal difference (TD) value\nestimates, each computed using TD($\\lambda$) with a different value of\n$\\lambda$. Since TD($\\lambda{=}0$) makes an implicit Markov assumption and\nTD($\\lambda{=}1$) does not, a discrepancy between these estimates is a\npotential indicator of a non-Markovian state representation. Indeed, we prove\nthat the $\\lambda$-discrepancy is exactly zero for all Markov decision\nprocesses and almost always non-zero for a broad class of partially observable\nenvironments. 
We also demonstrate empirically that, once detected, minimizing\nthe $\\lambda$-discrepancy can help with learning a memory function to mitigate\nthe corresponding partial observability. We then train a reinforcement learning\nagent that simultaneously constructs two recurrent value networks with\ndifferent $\\lambda$ parameters and minimizes the difference between them as an\nauxiliary loss. The approach scales to challenging partially observable\ndomains, where the resulting agent frequently performs significantly better\n(and never performs worse) than a baseline recurrent agent with only a single\nvalue network.\n","authors":["Cameron Allen","Aaron Kirtland","Ruo Yu Tao","Sam Lobel","Daniel Scott","Nicholas Petrocelli","Omer Gottesman","Ronald Parr","Michael L. Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2407.07333v3.pdf","comment":"GitHub URL: https://github.com/brownirl/lambda_discrepancy; Project\n page: https://lambda-discrepancy.github.io/"},{"id":"http://arxiv.org/abs/2411.09821v1","updated":"2024-11-14T21:53:46Z","published":"2024-11-14T21:53:46Z","title":"Automatic Classification of General Movements in Newborns","summary":" General movements (GMs) are spontaneous, coordinated body movements in\ninfants that offer valuable insights into the developing nervous system.\nAssessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors\nfor neurodevelopmental disorders. However, GMA requires specifically trained\nclinicians, who are limited in number. To scale up newborn screening, there is\na need for an algorithm that can automatically classify GMs from infant video\nrecordings. This data poses challenges, including variability in recording\nlength, device type, and setting, with each video coarsely annotated for\noverall movement quality. 
In this work, we introduce a tool for extracting\nfeatures from these recordings and explore various machine learning techniques\nfor automated GM classification.\n","authors":["Daphné Chopard","Sonia Laguna","Kieran Chin-Cheong","Annika Dietz","Anna Badura","Sven Wellmann","Julia E Vogt"],"pdf_url":"https://arxiv.org/pdf/2411.09821v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages"},{"id":"http://arxiv.org/abs/2411.09820v1","updated":"2024-11-14T21:49:41Z","published":"2024-11-14T21:49:41Z","title":"WelQrate: Defining the Gold Standard in Small Molecule Drug Discovery\n Benchmarking","summary":" While deep learning has revolutionized computer-aided drug discovery, the AI\ncommunity has predominantly focused on model innovation and placed less\nemphasis on establishing best benchmarking practices. We posit that without a\nsound model evaluation framework, the AI community's efforts cannot reach their\nfull potential, thereby slowing the progress and transfer of innovation into\nreal-world drug discovery. Thus, in this paper, we seek to establish a new gold\nstandard for small molecule drug discovery benchmarking, WelQrate.\nSpecifically, our contributions are threefold: WelQrate Dataset Collection - we\nintroduce a meticulously curated collection of 9 datasets spanning 5\ntherapeutic target classes. 
Our hierarchical curation pipelines, designed by\ndrug discovery experts, go beyond the primary high-throughput screen by\nleveraging additional confirmatory and counter screens along with rigorous\ndomain-driven preprocessing, such as Pan-Assay Interference Compounds (PAINS)\nfiltering, to ensure the high-quality data in the datasets; WelQrate Evaluation\nFramework - we propose a standardized model evaluation framework considering\nhigh-quality datasets, featurization, 3D conformation generation, evaluation\nmetrics, and data splits, which provides a reliable benchmarking for drug\ndiscovery experts conducting real-world virtual screening; Benchmarking - we\nevaluate model performance through various research questions using the\nWelQrate dataset collection, exploring the effects of different models, dataset\nquality, featurization methods, and data splitting strategies on the results.\nIn summary, we recommend adopting our proposed WelQrate as the gold standard in\nsmall molecule drug discovery benchmarking. The WelQrate dataset collection,\nalong with the curation codes, and experimental scripts are all publicly\navailable at WelQrate.org.\n","authors":[" Yunchao"," Liu","Ha Dong","Xin Wang","Rocco Moretti","Yu Wang","Zhaoqian Su","Jiawei Gu","Bobby Bodenheimer","Charles David Weaver","Jens Meiler","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2411.09820v1.pdf","comment":"* denotes equal contribution"},{"id":"http://arxiv.org/abs/2411.08133v2","updated":"2024-11-14T21:48:30Z","published":"2024-11-12T19:24:42Z","title":"Impactful Bit-Flip Search on Full-precision Models","summary":" Neural networks have shown remarkable performance in various tasks, yet they\nremain susceptible to subtle changes in their input or model parameters. One\nparticularly impactful vulnerability arises through the Bit-Flip Attack (BFA),\nwhere flipping a small number of critical bits in a model's parameters can\nseverely degrade its performance. 
A common technique for inducing bit flips in\nDRAM is the Row-Hammer attack, which exploits frequent uncached memory accesses\nto alter data. Identifying susceptible bits can be achieved through exhaustive\nsearch or progressive layer-by-layer analysis, especially in quantized\nnetworks. In this work, we introduce Impactful Bit-Flip Search (IBS), a novel\nmethod for efficiently pinpointing and flipping critical bits in full-precision\nnetworks. Additionally, we propose a Weight-Stealth technique that\nstrategically modifies the model's parameters in a way that maintains the float\nvalues within the original distribution, thereby bypassing simple range checks\noften used in tamper detection.\n","authors":["Nadav Benedek","Matan Levy","Mahmood Sharif"],"pdf_url":"https://arxiv.org/pdf/2411.08133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09816v1","updated":"2024-11-14T21:29:58Z","published":"2024-11-14T21:29:58Z","title":"Learning Parameter Sharing with Tensor Decompositions and Sparsity","summary":" Large neural networks achieve remarkable performance, but their size hinders\ndeployment on resource-constrained devices. While various compression\ntechniques exist, parameter sharing remains relatively unexplored. This paper\nintroduces Fine-grained Parameter Sharing (FiPS), a novel algorithm that\nleverages the relationship between parameter sharing, tensor decomposition, and\nsparsity to efficiently compress large vision transformer models. FiPS employs\na shared base and sparse factors to represent shared neurons across multi-layer\nperception (MLP) modules. Shared parameterization is initialized via Singular\nValue Decomposition (SVD) and optimized by minimizing block-wise reconstruction\nerror. 
Experiments demonstrate that FiPS compresses DeiT-B and Swin-L MLPs to\n25-40% of their original parameter count while maintaining accuracy within 1\npercentage point of the original models.\n","authors":["Cem Üyük","Mike Lasby","Mohamed Yassin","Utku Evci","Yani Ioannou"],"pdf_url":"https://arxiv.org/pdf/2411.09816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v3","updated":"2024-11-14T21:20:34Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. 
The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at\nhttps://github.com/chikap421/mseg_vcuq\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v3.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.09813v1","updated":"2024-11-14T21:07:52Z","published":"2024-11-14T21:07:52Z","title":"Can Features for Phishing URL Detection Be Trusted Across Diverse\n Datasets? A Case Study with Explainable AI","summary":" Phishing has been a prevalent cyber threat that manipulates users into\nrevealing sensitive private information through deceptive tactics, designed to\nmasquerade as trustworthy entities. Over the years, proactively detection of\nphishing URLs (or websites) has been established as an widely-accepted defense\napproach. In literature, we often find supervised Machine Learning (ML) models\nwith highly competitive performance for detecting phishing websites based on\nthe extracted features from both phishing and benign (i.e., legitimate)\nwebsites. However, it is still unclear if these features or indicators are\ndependent on a particular dataset or they are generalized for overall phishing\ndetection. In this paper, we delve deeper into this issue by analyzing two\npublicly available phishing URL datasets, where each dataset has its own set of\nunique and overlapping features related to URL string and website contents. 
We\nwant to investigate if overlapping features are similar in nature across\ndatasets and how does the model perform when trained on one dataset and tested\non the other. We conduct practical experiments and leverage explainable AI\n(XAI) methods such as SHAP plots to provide insights into different features'\ncontributions in case of phishing detection to answer our primary question,\n``Can features for phishing URL detection be trusted across diverse dataset?''.\nOur case study experiment results show that features for phishing URL detection\ncan often be dataset-dependent and thus may not be trusted across different\ndatasets even though they share same set of feature behaviors.\n","authors":["Maraz Mia","Darius Derakhshan","Mir Mehedi A. Pritom"],"pdf_url":"https://arxiv.org/pdf/2411.09813v1.pdf","comment":"8 pages, 10 figures, The 11th International Conference on Networking,\n Systems and Security, December 19-21, 2024"},{"id":"http://arxiv.org/abs/2411.09812v1","updated":"2024-11-14T21:01:29Z","published":"2024-11-14T21:01:29Z","title":"Edge Caching Optimization with PPO and Transfer Learning for Dynamic\n Environments","summary":" This paper addresses the challenge of edge caching in dynamic environments,\nwhere rising traffic loads strain backhaul links and core networks. We propose\na Proximal Policy Optimization (PPO)-based caching strategy that fully\nincorporates key file attributes such as size, lifetime, importance, and\npopularity, while also considering random file request arrivals, reflecting\nmore realistic edge caching scenarios. In dynamic environments, changes such as\nshifts in content popularity and variations in request rates frequently occur,\nmaking previously learned policies less effective as they were optimized for\nearlier conditions. Without adaptation, caching efficiency and response times\ncan degrade. 
While learning a new policy from scratch in a new environment is\nan option, it is highly inefficient and computationally expensive. Thus,\nadapting an existing policy to these changes is critical. To address this, we\ndevelop a mechanism that detects changes in content popularity and request\nrates, ensuring timely adjustments to the caching strategy. We also propose a\ntransfer learning-based PPO algorithm that accelerates convergence in new\nenvironments by leveraging prior knowledge. Simulation results demonstrate the\nsignificant effectiveness of our approach, outperforming a recent Deep\nReinforcement Learning (DRL)-based method.\n","authors":["Farnaz Niknia","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09812v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.09702v1","updated":"2024-11-14T18:59:40Z","published":"2024-11-14T18:59:40Z","title":"On the Surprising Effectiveness of Attention Transfer for Vision\n Transformers","summary":" Conventional wisdom suggests that pre-training Vision Transformers (ViT)\nimproves downstream performance by learning useful representations. Is this\nactually true? We investigate this question and find that the features and\nrepresentations learned during pre-training are not essential. Surprisingly,\nusing only the attention patterns from pre-training (i.e., guiding how\ninformation flows between tokens) is sufficient for models to learn high\nquality features from scratch and achieve comparable downstream performance. We\nshow this by introducing a simple method called attention transfer, where only\nthe attention patterns from a pre-trained teacher ViT are transferred to a\nstudent, either by copying or distilling the attention maps. Since attention\ntransfer lets the student learn its own features, ensembling it with a\nfine-tuned teacher also further improves accuracy on ImageNet. 
We\nsystematically study various aspects of our findings on the sufficiency of\nattention maps, including distribution shift settings where they underperform\nfine-tuning. We hope our exploration provides a better understanding of what\npre-training accomplishes and leads to a useful alternative to the standard\npractice of fine-tuning\n","authors":["Alexander C. Li","Yuandong Tian","Beidi Chen","Deepak Pathak","Xinlei Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09702v1.pdf","comment":"NeurIPS 2024. Code:\n https://github.com/alexlioralexli/attention-transfer"},{"id":"http://arxiv.org/abs/2405.09596v2","updated":"2024-11-14T18:57:09Z","published":"2024-05-15T13:43:07Z","title":"Enhancing Maritime Trajectory Forecasting via H3 Index and Causal\n Language Modelling (CLM)","summary":" The prediction of ship trajectories is a growing field of study in artificial\nintelligence. Traditional methods rely on the use of LSTM, GRU networks, and\neven Transformer architectures for the prediction of spatio-temporal series.\nThis study proposes a viable alternative for predicting these trajectories\nusing only GNSS positions. It considers this spatio-temporal problem as a\nnatural language processing problem. The latitude/longitude coordinates of AIS\nmessages are transformed into cell identifiers using the H3 index. Thanks to\nthe pseudo-octal representation, it becomes easier for language models to learn\nthe spatial hierarchy of the H3 index. The method is compared with a classical\nKalman filter, widely used in the maritime domain, and introduces the Fr\\'echet\ndistance as the main evaluation metric. We show that it is possible to predict\nship trajectories quite precisely up to 8 hours ahead with 30 minutes of\ncontext, using solely GNSS positions, without relying on any additional\ninformation such as speed, course, or external conditions - unlike many\ntraditional methods. 
We demonstrate that this alternative works well enough to\npredict trajectories worldwide.\n","authors":["Nicolas Drapier","Aladine Chetouani","Aurélien Chateigner"],"pdf_url":"https://arxiv.org/pdf/2405.09596v2.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2411.09689v1","updated":"2024-11-14T18:55:26Z","published":"2024-11-14T18:55:26Z","title":"LLM Hallucination Reasoning with Zero-shot Knowledge Test","summary":" LLM hallucination, where LLMs occasionally generate unfaithful text, poses\nsignificant challenges for their practical applications. Most existing\ndetection methods rely on external knowledge, LLM fine-tuning, or\nhallucination-labeled datasets, and they do not distinguish between different\ntypes of hallucinations, which are crucial for improving detection performance.\nWe introduce a new task, Hallucination Reasoning, which classifies\nLLM-generated text into one of three categories: aligned, misaligned, and\nfabricated. Our novel zero-shot method assesses whether LLM has enough\nknowledge about a given prompt and text. 
Our experiments conducted on new\ndatasets demonstrate the effectiveness of our method in hallucination reasoning\nand underscore its importance for enhancing detection performance.\n","authors":["Seongmin Lee","Hsiang Hsu","Chun-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09689v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09683v1","updated":"2024-11-14T18:52:05Z","published":"2024-11-14T18:52:05Z","title":"Towards a Classification of Open-Source ML Models and Datasets for\n Software Engineering","summary":" Background: Open-Source Pre-Trained Models (PTMs) and datasets provide\nextensive resources for various Machine Learning (ML) tasks, yet these\nresources lack a classification tailored to Software Engineering (SE) needs.\nAims: We apply an SE-oriented classification to PTMs and datasets on a popular\nopen-source ML repository, Hugging Face (HF), and analyze the evolution of PTMs\nover time. Method: We conducted a repository mining study. We started with a\nsystematically gathered database of PTMs and datasets from the HF API. Our\nselection was refined by analyzing model and dataset cards and metadata, such\nas tags, and confirming SE relevance using Gemini 1.5 Pro. All analyses are\nreplicable, with a publicly accessible replication package. Results: The most\ncommon SE task among PTMs and datasets is code generation, with a primary focus\non software development and limited attention to software management. Popular\nPTMs and datasets mainly target software development. Among ML tasks, text\ngeneration is the most common in SE PTMs and datasets. There has been a marked\nincrease in PTMs for SE since 2023 Q2. 
Conclusions: This study underscores the\nneed for broader task coverage to enhance the integration of ML within SE\npractices.\n","authors":["Alexandra González","Xavier Franch","David Lo","Silverio Martínez-Fernández"],"pdf_url":"https://arxiv.org/pdf/2411.09683v1.pdf","comment":"5 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.09678v1","updated":"2024-11-14T18:44:31Z","published":"2024-11-14T18:44:31Z","title":"NeuralDEM - Real-time Simulation of Industrial Particulate Flows","summary":" Advancements in computing power have made it possible to numerically simulate\nlarge-scale fluid-mechanical and/or particulate systems, many of which are\nintegral to core industrial processes. Among the different numerical methods\navailable, the discrete element method (DEM) provides one of the most accurate\nrepresentations of a wide range of physical systems involving granular and\ndiscontinuous materials. Consequently, DEM has become a widely accepted\napproach for tackling engineering problems connected to granular flows and\npowder mechanics. Additionally, DEM can be integrated with grid-based\ncomputational fluid dynamics (CFD) methods, enabling the simulation of chemical\nprocesses taking place, e.g., in fluidized beds. However, DEM is\ncomputationally intensive because of the intrinsic multiscale nature of\nparticulate systems, restricting simulation duration or number of particles.\nTowards this end, NeuralDEM presents an end-to-end approach to replace slow\nnumerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM\nis capable of picturing long-term transport processes across different regimes\nusing macroscopic observables without any reference to microscopic model\nparameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an\nunderlying continuous field, while simultaneously modeling macroscopic behavior\ndirectly as additional auxiliary fields. 
Second, NeuralDEM introduces\nmulti-branch neural operators scalable to real-time modeling of\nindustrially-sized scenarios - from slow and pseudo-steady to fast and\ntransient. Such scenarios have previously posed insurmountable challenges for\ndeep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM\nfluidized bed reactors of 160k CFD cells and 500k DEM particles for\ntrajectories of 28s. NeuralDEM will open many new doors to advanced engineering\nand much faster process cycles.\n","authors":["Benedikt Alkin","Tobias Kronlachner","Samuele Papa","Stefan Pirker","Thomas Lichtenegger","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2411.09678v1.pdf","comment":"Project page: https://nx-ai.github.io/NeuralDEM/"},{"id":"http://arxiv.org/abs/2411.05777v2","updated":"2024-11-14T18:35:19Z","published":"2024-11-08T18:43:15Z","title":"Quantitative Assessment of Intersectional Empathetic Bias and\n Understanding","summary":" A growing amount of literature critiques the current operationalizations of\nempathy based on loose definitions of the construct. Such definitions\nnegatively affect dataset quality, model robustness, and evaluation\nreliability. We propose an empathy evaluation framework that operationalizes\nempathy close to its psychological origins. The framework measures the variance\nin responses of LLMs to prompts using existing metrics for empathy and\nemotional valence. The variance is introduced through the controlled generation\nof the prompts by varying social biases affecting context understanding, thus\nimpacting empathetic understanding. The control over generation ensures high\ntheoretical validity of the constructs in the prompt dataset. Also, it makes\nhigh-quality translation, especially into languages that currently have\nlittle-to-no way of evaluating empathy or bias, such as the Slavonic family,\nmore manageable. 
Using chosen LLMs and various prompt types, we demonstrate the\nempathy evaluation with the framework, including multiple-choice answers and\nfree generation. The variance in our initial evaluation sample is small and we\nwere unable to measure convincing differences between the empathetic\nunderstanding in contexts given by different social groups. However, the\nresults are promising because the models showed significant alterations their\nreasoning chains needed to capture the relatively subtle changes in the\nprompts. This provides the basis for future research into the construction of\nthe evaluation sample and statistical methods for measuring the results.\n","authors":["Vojtech Formanek","Ondrej Sotolar"],"pdf_url":"https://arxiv.org/pdf/2411.05777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09648v1","updated":"2024-11-14T18:17:30Z","published":"2024-11-14T18:17:30Z","title":"Med-Bot: An AI-Powered Assistant to Provide Accurate and Reliable\n Medical Information","summary":" This paper introduces Med-Bot, an AI-powered chatbot designed to provide\nusers with accurate and reliable medical information. Utilizing advanced\nlibraries and frameworks such as PyTorch, Chromadb, Langchain and Autogptq,\nMed-Bot is built to handle the complexities of natural language understanding\nin a healthcare context. The integration of llamaassisted data processing and\nAutoGPT-Q provides enhanced performance in processing and responding to queries\nbased on PDFs of medical literature, ensuring that users receive precise and\ntrustworthy information. 
This research details the methodologies employed in\ndeveloping Med-Bot and evaluates its effectiveness in disseminating healthcare\ninformation.\n","authors":["Ahan Bhatt","Nandan Vaghela"],"pdf_url":"https://arxiv.org/pdf/2411.09648v1.pdf","comment":"3 figures, 5 pages Keywords-LLM, AI-powered healthcare, Medical\n chatbot, Context-based interaction, Llama-assisted data processing,\n AutoGPT-Q, PyTorch, TensorFlow, Reliable medical information, Machine\n learning in healthcare, Conversational AI"},{"id":"http://arxiv.org/abs/2411.09642v1","updated":"2024-11-14T18:06:55Z","published":"2024-11-14T18:06:55Z","title":"On the Limits of Language Generation: Trade-Offs Between Hallucination\n and Mode Collapse","summary":" Specifying all desirable properties of a language model is challenging, but\ncertain requirements seem essential. Given samples from an unknown language,\nthe trained model should produce valid strings not seen in training and be\nexpressive enough to capture the language's full richness. Otherwise,\noutputting invalid strings constitutes \"hallucination,\" and failing to capture\nthe full range leads to \"mode collapse.\" We ask if a language model can meet\nboth requirements.\n We investigate this within a statistical language generation setting building\non Gold and Angluin. Here, the model receives random samples from a\ndistribution over an unknown language K, which belongs to a possibly infinite\ncollection of languages. The goal is to generate unseen strings from K. We say\nthe model generates from K with consistency and breadth if, as training size\nincreases, its output converges to all unseen strings in K.\n Kleinberg and Mullainathan [KM24] asked if consistency and breadth in\nlanguage generation are possible. We answer this negatively: for a large class\nof language models, including next-token prediction models, this is impossible\nfor most collections of candidate languages. 
This contrasts with [KM24]'s\nresult, showing consistent generation without breadth is possible for any\ncountable collection of languages. Our finding highlights that generation with\nbreadth fundamentally differs from generation without breadth.\n As a byproduct, we establish near-tight bounds on the number of samples\nneeded for generation with or without breadth.\n Finally, our results offer hope: consistent generation with breadth is\nachievable for any countable collection of languages when negative examples\n(strings outside K) are available alongside positive ones. This suggests that\npost-training feedback, which encodes negative examples, can be crucial in\nreducing hallucinations while limiting mode collapse.\n","authors":["Alkis Kalavasis","Anay Mehrotra","Grigoris Velegkas"],"pdf_url":"https://arxiv.org/pdf/2411.09642v1.pdf","comment":"Abstract shortened to fit arXiv limit"},{"id":"http://arxiv.org/abs/2411.09627v1","updated":"2024-11-14T17:54:43Z","published":"2024-11-14T17:54:43Z","title":"One-Shot Manipulation Strategy Learning by Making Contact Analogies","summary":" We present a novel approach, MAGIC (manipulation analogies for generalizable\nintelligent contacts), for one-shot learning of manipulation strategies with\nfast and extensive generalization to novel objects. By leveraging a reference\naction trajectory, MAGIC effectively identifies similar contact points and\nsequences of actions on novel objects to replicate a demonstrated strategy,\nsuch as using different hooks to retrieve distant objects of different shapes\nand sizes. Our method is based on a two-stage contact-point matching process\nthat combines global shape matching using pretrained neural features with local\ncurvature analysis to ensure precise and physically plausible contact points.\nWe experiment with three tasks including scooping, hanging, and hooking\nobjects. 
MAGIC demonstrates superior performance over existing methods,\nachieving significant improvements in runtime speed and generalization to\ndifferent object categories. Website: https://magic-2024.github.io/ .\n","authors":["Yuyao Liu","Jiayuan Mao","Joshua Tenenbaum","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2411.09627v1.pdf","comment":"CoRL LEAP Workshop, 2024"},{"id":"http://arxiv.org/abs/2308.11738v3","updated":"2024-11-14T17:50:53Z","published":"2023-08-22T18:58:21Z","title":"Lifted Inference beyond First-Order Logic","summary":" Weighted First Order Model Counting (WFOMC) is fundamental to probabilistic\ninference in statistical relational learning models. As WFOMC is known to be\nintractable in general ($\\#$P-complete), logical fragments that admit\npolynomial time WFOMC are of significant interest. Such fragments are called\ndomain liftable. Recent works have shown that the two-variable fragment of\nfirst order logic extended with counting quantifiers ($\\mathrm{C^2}$) is\ndomain-liftable. However, many properties of real-world data, like acyclicity\nin citation networks and connectivity in social networks, cannot be modeled in\n$\\mathrm{C^2}$, or first order logic in general. In this work, we expand the\ndomain liftability of $\\mathrm{C^2}$ with multiple such properties. We show\nthat any $\\mathrm{C^2}$ sentence remains domain liftable when one of its\nrelations is restricted to represent a directed acyclic graph, a connected\ngraph, a tree (resp. a directed tree) or a forest (resp. a directed forest).\nAll our results rely on a novel and general methodology of \"counting by\nsplitting\". Besides their application to probabilistic inference, our results\nprovide a general framework for counting combinatorial structures. 
We expand a\nvast array of previous results in discrete mathematics literature on directed\nacyclic graphs, phylogenetic networks, etc.\n","authors":["Sagar Malhotra","Davide Bizzaro","Luciano Serafini"],"pdf_url":"https://arxiv.org/pdf/2308.11738v3.pdf","comment":"Under Review at the Artificial Intelligence Journal. Added two new\n lemmas for counting by splitting in the Main approach section. Added\n experiments with Markov Logic.arXiv admin note: text overlap with\n arXiv:2302.09830"},{"id":"http://arxiv.org/abs/2411.09623v1","updated":"2024-11-14T17:47:54Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. 
The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09613v1","updated":"2024-11-14T17:33:36Z","published":"2024-11-14T17:33:36Z","title":"PTR: Precision-Driven Tool Recommendation for Large Language Models","summary":" By augmenting Large Language Models (LLMs) with external tools, their\ncapacity to solve complex problems has been significantly enhanced. However,\ndespite ongoing advancements in the parsing capabilities of LLMs, incorporating\nall available tools simultaneously in the prompt remains impractical due to the\nvast number of external tools. Consequently, it is essential to provide LLMs\nwith a precise set of tools tailored to the specific task, considering both\nquantity and quality. Current tool retrieval methods primarily focus on\nrefining the ranking list of tools and directly packaging a fixed number of\ntop-ranked tools as the tool set. However, these approaches often fail to equip\nLLMs with the optimal set of tools prior to execution, since the optimal number\nof tools for different tasks could be different, resulting in inefficiencies\nsuch as redundant or unsuitable tools, which impede immediate access to the\nmost relevant tools. This paper addresses the challenge of recommending precise\ntoolsets for LLMs. 
We introduce the problem of tool recommendation, define its\nscope, and propose a novel Precision-driven Tool Recommendation (PTR) approach.\nPTR captures an initial, concise set of tools by leveraging historical tool\nbundle usage and dynamically adjusts the tool set by performing tool matching,\nculminating in a multi-view-based tool addition. Additionally, we present a new\ndataset, RecTools, and a metric, TRACC, designed to evaluate the effectiveness\nof tool recommendation for LLMs. We further validate our design choices through\ncomprehensive experiments, demonstrating promising accuracy across two open\nbenchmarks and our RecTools dataset.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07104v2","updated":"2024-11-14T17:28:37Z","published":"2024-11-11T16:27:25Z","title":"Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal\n Pushing","summary":" Recently, quadrupedal locomotion has achieved significant success, but their\nmanipulation capabilities, particularly in handling large objects, remain\nlimited, restricting their usefulness in demanding real-world applications such\nas search and rescue, construction, industrial automation, and room\norganization. This paper tackles the task of obstacle-aware, long-horizon\npushing by multiple quadrupedal robots. We propose a hierarchical multi-agent\nreinforcement learning framework with three levels of control. The high-level\ncontroller integrates an RRT planner and a centralized adaptive policy to\ngenerate subgoals, while the mid-level controller uses a decentralized\ngoal-conditioned policy to guide the robots toward these sub-goals. A\npre-trained low-level locomotion policy executes the movement commands. 
We\nevaluate our method against several baselines in simulation, demonstrating\nsignificant improvements over baseline approaches, with 36.0% higher success\nrates and 24.5% reduction in completion time than the best baseline. Our\nframework successfully enables long-horizon, obstacle-aware manipulation tasks\nlike Push-Cuboid and Push-T on Go1 robots in the real world.\n","authors":["Yuming Feng","Chuye Hong","Yaru Niu","Shiqi Liu","Yuxiang Yang","Wenhao Yu","Tingnan Zhang","Jie Tan","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.07104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09604v1","updated":"2024-11-14T17:22:16Z","published":"2024-11-14T17:22:16Z","title":"Local-Global Attention: An Adaptive Mechanism for Multi-Scale Feature\n Integration","summary":" In recent years, attention mechanisms have significantly enhanced the\nperformance of object detection by focusing on key feature information.\nHowever, prevalent methods still encounter difficulties in effectively\nbalancing local and global features. This imbalance hampers their ability to\ncapture both fine-grained details and broader contextual information-two\ncritical elements for achieving accurate object detection.To address these\nchallenges, we propose a novel attention mechanism, termed Local-Global\nAttention, which is designed to better integrate both local and global\ncontextual features. Specifically, our approach combines multi-scale\nconvolutions with positional encoding, enabling the model to focus on local\ndetails while concurrently considering the broader global context.\nAdditionally, we introduce a learnable parameters, which allow the model to\ndynamically adjust the relative importance of local and global attention,\ndepending on the specific requirements of the task, thereby optimizing feature\nrepresentations across multiple scales.We have thoroughly evaluated the\nLocal-Global Attention mechanism on several widely used object detection and\nclassification datasets. 
Our experimental results demonstrate that this\napproach significantly enhances the detection of objects at various scales,\nwith particularly strong performance on multi-class and small object detection\ntasks. In comparison to existing attention mechanisms, Local-Global Attention\nconsistently outperforms them across several key metrics, all while maintaining\ncomputational efficiency.\n","authors":["Yifan Shao"],"pdf_url":"https://arxiv.org/pdf/2411.09604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09601v1","updated":"2024-11-14T17:21:02Z","published":"2024-11-14T17:21:02Z","title":"Accelerating Knowledge Graph and Ontology Engineering with Large\n Language Models","summary":" Large Language Models bear the promise of significant acceleration of key\nKnowledge Graph and Ontology Engineering tasks, including ontology modeling,\nextension, modification, population, alignment, as well as entity\ndisambiguation. We lay out LLM-based Knowledge Graph and Ontology Engineering\nas a new and coming area of research, and argue that modular approaches to\nontologies will be of central importance.\n","authors":["Cogan Shimizu","Pascal Hitzler"],"pdf_url":"https://arxiv.org/pdf/2411.09601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09595v1","updated":"2024-11-14T17:08:23Z","published":"2024-11-14T17:08:23Z","title":"LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models","summary":" This work explores expanding the capabilities of large language models (LLMs)\npretrained on text to generate 3D meshes within a unified model. This offers\nkey advantages of (1) leveraging spatial knowledge already embedded in LLMs,\nderived from textual sources like 3D tutorials, and (2) enabling conversational\n3D generation and mesh understanding. 
A primary challenge is effectively\ntokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly.\nTo address this, we introduce LLaMA-Mesh, a novel approach that represents the\nvertex coordinates and face definitions of 3D meshes as plain text, allowing\ndirect integration with LLMs without expanding the vocabulary. We construct a\nsupervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate\n3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs\nas required, and (3) understand and interpret 3D meshes. Our work is the first\nto demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge\nfor 3D mesh generation in a text-based format, effectively unifying the 3D and\ntext modalities. LLaMA-Mesh achieves mesh generation quality on par with models\ntrained from scratch while maintaining strong text generation performance.\n","authors":["Zhengyi Wang","Jonathan Lorraine","Yikai Wang","Hang Su","Jun Zhu","Sanja Fidler","Xiaohui Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09595v1.pdf","comment":"See the project website at\n https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/"},{"id":"http://arxiv.org/abs/2411.09593v1","updated":"2024-11-14T17:06:00Z","published":"2024-11-14T17:06:00Z","title":"SMILE-UHURA Challenge -- Small Vessel Segmentation at Mesoscopic Scale\n from Ultra-High Resolution 7T Magnetic Resonance Angiograms","summary":" The human brain receives nutrients and oxygen through an intricate network of\nblood vessels. Pathology affecting small vessels, at the mesoscopic scale,\nrepresents a critical vulnerability within the cerebral blood supply and can\nlead to severe conditions, such as Cerebral Small Vessel Diseases. The advent\nof 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution\nimages, making it possible to visualise such vessels in the brain. 
However, the\nlack of publicly available annotated datasets has impeded the development of\nrobust, machine learning-driven segmentation algorithms. To address this, the\nSMILE-UHURA challenge was organised. This challenge, held in conjunction with\nthe ISBI 2023, in Cartagena de Indias, Colombia, aimed to provide a platform\nfor researchers working on related topics. The SMILE-UHURA challenge addresses\nthe gap in publicly available annotated datasets by providing an annotated\ndataset of Time-of-Flight angiography acquired with 7T MRI. This dataset was\ncreated through a combination of automated pre-segmentation and extensive\nmanual refinement. In this manuscript, sixteen submitted methods and two\nbaseline methods are compared both quantitatively and qualitatively on two\ndifferent datasets: held-out test MRAs from the same dataset as the training\ndata (with labels kept secret) and a separate 7T ToF MRA dataset where both\ninput volumes and labels are kept secret. The results demonstrate that most of\nthe submitted deep learning methods, trained on the provided training dataset,\nachieved reliable segmentation performance. Dice scores reached up to 0.838\n$\\pm$ 0.066 and 0.716 $\\pm$ 0.125 on the respective datasets, with an average\nperformance of up to 0.804 $\\pm$ 0.15.\n","authors":["Soumick Chatterjee","Hendrik Mattern","Marc Dörner","Alessandro Sciarra","Florian Dubost","Hannes Schnurre","Rupali Khatun","Chun-Chih Yu","Tsung-Lin Hsieh","Yi-Shan Tsai","Yi-Zeng Fang","Yung-Ching Yang","Juinn-Dar Huang","Marshall Xu","Siyu Liu","Fernanda L. 
Ribeiro","Saskia Bollmann","Karthikesh Varma Chintalapati","Chethan Mysuru Radhakrishna","Sri Chandana Hudukula Ram Kumara","Raviteja Sutrave","Abdul Qayyum","Moona Mazher","Imran Razzak","Cristobal Rodero","Steven Niederren","Fengming Lin","Yan Xia","Jiacheng Wang","Riyu Qiu","Liansheng Wang","Arya Yazdan Panah","Rosana El Jurdi","Guanghui Fu","Janan Arslan","Ghislain Vaillant","Romain Valabregue","Didier Dormont","Bruno Stankoff","Olivier Colliot","Luisa Vargas","Isai Daniel Chacón","Ioannis Pitsiorlas","Pablo Arbeláez","Maria A. Zuluaga","Stefanie Schreiber","Oliver Speck","Andreas Nürnberger"],"pdf_url":"https://arxiv.org/pdf/2411.09593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09590v1","updated":"2024-11-14T17:01:24Z","published":"2024-11-14T17:01:24Z","title":"Adopting RAG for LLM-Aided Future Vehicle Design","summary":" In this paper, we explore the integration of Large Language Models (LLMs)\nwith Retrieval-Augmented Generation (RAG) to enhance automated design and\nsoftware development in the automotive industry. We present two case studies: a\nstandardization compliance chatbot and a design copilot, both utilizing RAG to\nprovide accurate, context-aware responses. We evaluate four LLMs-GPT-4o,\nLLAMA3, Mistral, and Mixtral- comparing their answering accuracy and execution\ntime. Our results demonstrate that while GPT-4 offers superior performance,\nLLAMA3 and Mistral also show promising capabilities for local deployment,\naddressing data privacy concerns in automotive applications. 
This study\nhighlights the potential of RAG-augmented LLMs in improving design workflows\nand compliance in automotive engineering.\n","authors":["Vahid Zolfaghari","Nenad Petrovic","Fengjunjie Pan","Krzysztof Lebioda","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2411.09590v1.pdf","comment":"Conference paper accepted in IEEE FLLM 2024"},{"id":"http://arxiv.org/abs/2411.09580v1","updated":"2024-11-14T16:42:19Z","published":"2024-11-14T16:42:19Z","title":"Software Performance Engineering for Foundation Model-Powered Software\n (FMware)","summary":" The rise of Foundation Models (FMs) like Large Language Models (LLMs) is\nrevolutionizing software development. Despite the impressive prototypes,\ntransforming FMware into production-ready products demands complex engineering\nacross various domains. A critical but overlooked aspect is performance\nengineering, which aims at ensuring FMware meets performance goals such as\nthroughput and latency to avoid user dissatisfaction and financial loss. Often,\nperformance considerations are an afterthought, leading to costly optimization\nefforts post-deployment. FMware's high computational resource demands highlight\nthe need for efficient hardware use. Continuous performance engineering is\nessential to prevent degradation. This paper highlights the significance of\nSoftware Performance Engineering (SPE) in FMware, identifying four key\nchallenges: cognitive architecture design, communication protocols, tuning and\noptimization, and deployment. These challenges are based on literature surveys\nand experiences from developing an in-house FMware system. We discuss problems,\ncurrent practices, and innovative paths for the software engineering community.\n","authors":["Haoxiang Zhang","Shi Chang","Arthur Leung","Kishanthan Thangarajah","Boyuan Chen","Hanan Lutfiyya","Ahmed E. 
Hassan"],"pdf_url":"https://arxiv.org/pdf/2411.09580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09576v1","updated":"2024-11-14T16:35:15Z","published":"2024-11-14T16:35:15Z","title":"Automating Reformulation of Essence Specifications via Graph Rewriting","summary":" Formulating an effective constraint model of a parameterised problem class is\ncrucial to the efficiency with which instances of the class can subsequently be\nsolved. It is difficult to know beforehand which of a set of candidate models\nwill perform best in practice. This paper presents a system that employs graph\nrewriting to reformulate an input model for improved performance automatically.\nBy situating our work in the Essence abstract constraint specification\nlanguage, we can use the structure in its high level variable types to trigger\nrewrites directly. We implement our system via rewrite rules expressed in the\nGraph Programs 2 language, applied to the abstract syntax tree of an input\nspecification. We show how to automatically translate the solution of the\nreformulated problem into a solution of the original problem for verification\nand presentation. We demonstrate the efficacy of our system with a detailed\ncase study.\n","authors":["Ian Miguel","András Z. Salamon","Christopher Stone"],"pdf_url":"https://arxiv.org/pdf/2411.09576v1.pdf","comment":"Presented at the PTHG 2024 workshop"},{"id":"http://arxiv.org/abs/2402.02681v3","updated":"2024-11-14T16:30:13Z","published":"2024-02-05T02:35:11Z","title":"Equivariant Symmetry Breaking Sets","summary":" Equivariant neural networks (ENNs) have been shown to be extremely effective\nin applications involving underlying symmetries. By construction ENNs cannot\nproduce lower symmetry outputs given a higher symmetry input. However, symmetry\nbreaking occurs in many physical systems and we may obtain a less symmetric\nstable state from an initial highly symmetric one. 
Hence, it is imperative that\nwe understand how to systematically break symmetry in ENNs. In this work, we\npropose a novel symmetry breaking framework that is fully equivariant and is\nthe first which fully addresses spontaneous symmetry breaking. We emphasize\nthat our approach is general and applicable to equivariance under any group. To\nachieve this, we introduce the idea of symmetry breaking sets (SBS). Rather\nthan redesign existing networks, we design sets of symmetry breaking objects\nwhich we feed into our network based on the symmetry of our inputs and outputs.\nWe show there is a natural way to define equivariance on these sets, which\ngives an additional constraint. Minimizing the size of these sets equates to\ndata efficiency. We prove that minimizing these sets translates to a well\nstudied group theory problem, and tabulate solutions to this problem for the\npoint groups. Finally, we provide some examples of symmetry breaking to\ndemonstrate how our approach works in practice. The code for these examples is\navailable at \\url{https://github.com/atomicarchitects/equivariant-SBS}.\n","authors":["YuQing Xie","Tess Smidt"],"pdf_url":"https://arxiv.org/pdf/2402.02681v3.pdf","comment":"50 pages, 19 figures Published in Transactions on Machine Learning\n Research, October 2024"},{"id":"http://arxiv.org/abs/2411.04872v3","updated":"2024-11-14T16:26:03Z","published":"2024-11-07T17:07:35Z","title":"FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning\n in AI","summary":" We introduce FrontierMath, a benchmark of hundreds of original, exceptionally\nchallenging mathematics problems crafted and vetted by expert mathematicians.\nThe questions cover most major branches of modern mathematics -- from\ncomputationally intensive problems in number theory and real analysis to\nabstract questions in algebraic geometry and category theory. 
Solving a typical\nproblem requires multiple hours of effort from a researcher in the relevant\nbranch of mathematics, and for the upper end questions, multiple days.\nFrontierMath uses new, unpublished problems and automated verification to\nreliably evaluate models while minimizing risk of data contamination. Current\nstate-of-the-art AI models solve under 2% of problems, revealing a vast gap\nbetween AI capabilities and the prowess of the mathematical community. As AI\nsystems advance toward expert-level mathematical abilities, FrontierMath offers\na rigorous testbed that quantifies their progress.\n","authors":["Elliot Glazer","Ege Erdil","Tamay Besiroglu","Diego Chicharro","Evan Chen","Alex Gunning","Caroline Falkman Olsson","Jean-Stanislas Denain","Anson Ho","Emily de Oliveira Santos","Olli Järviniemi","Matthew Barnett","Robert Sandler","Matej Vrzala","Jaime Sevilla","Qiuyu Ren","Elizabeth Pratt","Lionel Levine","Grant Barkley","Natalie Stewart","Bogdan Grechuk","Tetiana Grechuk","Shreepranav Varma Enugandla","Mark Wildon"],"pdf_url":"https://arxiv.org/pdf/2411.04872v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06542v2","updated":"2024-11-14T16:22:51Z","published":"2024-11-10T17:48:26Z","title":"Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing\n Contact-Rich Plans?","summary":" Designing planners and controllers for contact-rich manipulation is extremely\nchallenging as contact violates the smoothness conditions that many\ngradient-based controller synthesis tools assume. Contact smoothing\napproximates a non-smooth system with a smooth one, allowing one to use these\nsynthesis tools more effectively. However, applying classical control synthesis\nmethods to smoothed contact dynamics remains relatively under-explored. This\npaper analyzes the efficacy of linear controller synthesis using differential\nsimulators based on contact smoothing. 
We introduce natural baselines for\nleveraging contact smoothing to compute (a) open-loop plans robust to uncertain\nconditions and/or dynamics, and (b) feedback gains to stabilize around\nopen-loop plans. Using robotic bimanual whole-body manipulation as a testbed,\nwe perform extensive empirical experiments on over 300 trajectories and analyze\nwhy LQR seems insufficient for stabilizing contact-rich plans. The video\nsummarizing this paper and hardware experiments is found here:\nhttps://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9.\n","authors":["Yuki Shirai","Tong Zhao","H. J. Terry Suh","Huaijiang Zhu","Xinpei Ni","Jiuguang Wang","Max Simchowitz","Tao Pang"],"pdf_url":"https://arxiv.org/pdf/2411.06542v2.pdf","comment":"Under review for ICRA2025"},{"id":"http://arxiv.org/abs/2411.09547v1","updated":"2024-11-14T16:01:33Z","published":"2024-11-14T16:01:33Z","title":"Piecing It All Together: Verifying Multi-Hop Multimodal Claims","summary":" Existing claim verification datasets often do not require systems to perform\ncomplex reasoning or effectively interpret multimodal evidence. To address\nthis, we introduce a new task: multi-hop multimodal claim verification. This\ntask challenges models to reason over multiple pieces of evidence from diverse\nsources, including text, images, and tables, and determine whether the combined\nmultimodal evidence supports or refutes a given claim. To study this task, we\nconstruct MMCV, a large-scale dataset comprising 16k multi-hop claims paired\nwith multimodal evidence, generated and refined using large language models,\nwith additional input from human feedback. We show that MMCV is challenging\neven for the latest state-of-the-art multimodal large language models,\nespecially as the number of reasoning hops increases. Additionally, we\nestablish a human performance benchmark on a subset of MMCV. 
We hope this\ndataset and its evaluation task will encourage future research in multimodal\nmulti-hop claim verification.\n","authors":["Haoran Wang","Aman Rangapur","Xiongxiao Xu","Yueqing Liang","Haroon Gharwi","Carl Yang","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2411.09547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09543v1","updated":"2024-11-14T15:58:46Z","published":"2024-11-14T15:58:46Z","title":"OpenGeMM: A High-Utilization GeMM Accelerator Generator with Lightweight\n RISC-V Control and Tight Memory Coupling","summary":" Deep neural networks (DNNs) face significant challenges when deployed on\nresource-constrained extreme edge devices due to their computational and\ndata-intensive nature. While standalone accelerators tailored for specific\napplication scenarios suffer from inflexible control and limited\nprogrammability, generic hardware acceleration platforms coupled with RISC-V\nCPUs can enable high reusability and flexibility, yet typically at the expense\nof system level efficiency and low utilization. To fill this gap, we propose\nOpenGeMM, an open-source acceleration platform, jointly demonstrating high\nefficiency and utilization, as well as ease of configurability and\nprogrammability. OpenGeMM encompasses a parameterized Chisel-coded GeMM\naccelerator, a lightweight RISC-V processor, and a tightly coupled multi-banked\nscratchpad memory. The GeMM core utilization and system efficiency are boosted\nthrough three mechanisms: configuration pre-loading, input pre-fetching with\noutput buffering, and programmable strided memory access. Experimental results\nshow that OpenGeMM can consistently achieve hardware utilization ranging from\n81.89% to 99.34% across diverse CNN and Transformer workloads. 
Compared to the\nSotA open-source Gemmini accelerator, OpenGeMM demonstrates a 3.58x to 16.40x\nspeedup on normalized throughput across a wide variety ofGeMM workloads, while\nachieving 4.68 TOPS/W system efficiency.\n","authors":["Xiaoling Yi","Ryan Antonio","Joren Dumoulin","Jiacong Sun","Josse Van Delm","Guilherme Paim","Marian Verhelst"],"pdf_url":"https://arxiv.org/pdf/2411.09543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09540v1","updated":"2024-11-14T15:56:11Z","published":"2024-11-14T15:56:11Z","title":"Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models","summary":" Visual prompting (VP) is a new technique that adapts well-trained frozen\nmodels for source domain tasks to target domain tasks. This study examines VP's\nbenefits for black-box model-level backdoor detection. The visual prompt in VP\nmaps class subspaces between source and target domains. We identify a\nmisalignment, termed class subspace inconsistency, between clean and poisoned\ndatasets. Based on this, we introduce \\textsc{BProm}, a black-box model-level\ndetection method to identify backdoors in suspicious models, if any.\n\\textsc{BProm} leverages the low classification accuracy of prompted models\nwhen backdoors are present. Extensive experiments confirm \\textsc{BProm}'s\neffectiveness.\n","authors":["Zi-Xuan Huang","Jia-Wei Chen","Zhi-Peng Zhang","Chia-Mu Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08278v2","updated":"2024-11-14T15:49:46Z","published":"2024-11-13T01:33:05Z","title":"Knowledge Bases in Support of Large Language Models for Processing Web\n News","summary":" Large Language Models (LLMs) have received considerable interest in wide\napplications lately. During pre-training via massive datasets, such a model\nimplicitly memorizes the factual knowledge of trained datasets in its hidden\nparameters. 
However, knowledge held implicitly in parameters often makes its\nuse by downstream applications ineffective due to the lack of common-sense\nreasoning. In this article, we introduce a general framework that permits to\nbuild knowledge bases with an aid of LLMs, tailored for processing Web news.\nThe framework applies a rule-based News Information Extractor (NewsIE) to news\nitems for extracting their relational tuples, referred to as knowledge bases,\nwhich are then graph-convoluted with the implicit knowledge facts of news items\nobtained by LLMs, for their classification. It involves two lightweight\ncomponents: 1) NewsIE: for extracting the structural information of every news\nitem, in the form of relational tuples; 2) BERTGraph: for graph convoluting the\nimplicit knowledge facts with relational tuples extracted by NewsIE. We have\nevaluated our framework under different news-related datasets for news category\nclassification, with promising experimental results.\n","authors":["Yihe Zhang","Nabin Pakka","Nian-Feng Tzeng"],"pdf_url":"https://arxiv.org/pdf/2411.08278v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09523v1","updated":"2024-11-14T15:40:04Z","published":"2024-11-14T15:40:04Z","title":"Navigating the Risks: A Survey of Security, Privacy, and Ethics Threats\n in LLM-Based Agents","summary":" With the continuous development of large language models (LLMs),\ntransformer-based models have made groundbreaking advances in numerous natural\nlanguage processing (NLP) tasks, leading to the emergence of a series of agents\nthat use LLMs as their control hub. While LLMs have achieved success in various\ntasks, they face numerous security and privacy threats, which become even more\nsevere in the agent scenarios. 
To enhance the reliability of LLM-based\napplications, a range of research has emerged to assess and mitigate these\nrisks from different perspectives.\n To help researchers gain a comprehensive understanding of various risks, this\nsurvey collects and analyzes the different threats faced by these agents. To\naddress the challenges posed by previous taxonomies in handling cross-module\nand cross-stage threats, we propose a novel taxonomy framework based on the\nsources and impacts. Additionally, we identify six key features of LLM-based\nagents, based on which we summarize the current research progress and analyze\ntheir limitations. Subsequently, we select four representative agents as case\nstudies to analyze the risks they may face in practical use. Finally, based on\nthe aforementioned analyses, we propose future research directions from the\nperspectives of data, methodology, and policy, respectively.\n","authors":["Yuyou Gan","Yong Yang","Zhe Ma","Ping He","Rui Zeng","Yiming Wang","Qingming Li","Chunyi Zhou","Songze Li","Ting Wang","Yunjun Gao","Yingcai Wu","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2411.09523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09510v1","updated":"2024-11-14T15:19:01Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. 
We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong-Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09507v1","updated":"2024-11-14T15:17:50Z","published":"2024-11-14T15:17:50Z","title":"Toward a Cohesive AI and Simulation Software Ecosystem for Scientific\n Innovation","summary":" In this paper, we discuss the need for an integrated software stack that\nunites artificial intelligence (AI) and modeling and simulation (ModSim) tools\nto advance scientific discovery. The authors advocate for a unified AI/ModSim\nsoftware ecosystem that ensures compatibility across a wide range of software\non diverse high-performance computing systems, promoting ease of deployment,\nversion management, and binary distribution. Key challenges highlighted include\nbalancing the distinct needs of AI and ModSim, especially in terms of software\nbuild practices, dependency management, and compatibility. The document\nunderscores the importance of continuous integration, community-driven\nstewardship, and collaboration with the Department of Energy (DOE) to develop a\nportable and cohesive scientific software ecosystem. Recommendations focus on\nsupporting standardized environments through initiatives like the Extreme-scale\nScientific Software Stack (E4S) and Spack to foster interdisciplinary\ninnovation and facilitate new scientific advancements.\n","authors":["Michael A. Heroux","Sameer Shende","Lois Curfman McInnes","Todd Gamblin","James M. 
Willenbring"],"pdf_url":"https://arxiv.org/pdf/2411.09507v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2411.09492v1","updated":"2024-11-14T14:58:38Z","published":"2024-11-14T14:58:38Z","title":"MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in\n LLMs","summary":" Large language models (LLMs) excel in high-resource languages but face\nnotable challenges in low-resource languages like Mongolian. This paper\naddresses these challenges by categorizing capabilities into language abilities\n(syntax and semantics) and cognitive abilities (knowledge and reasoning). To\nsystematically evaluate these areas, we developed MM-Eval, a specialized\ndataset based on Modern Mongolian Language Textbook I and enriched with WebQSP\nand MGSM datasets.\n Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat,\nLlama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models\nperformed better on syntactic tasks than semantic tasks, highlighting a gap in\ndeeper language understanding; and 2) knowledge tasks showed a moderate\ndecline, suggesting that models can transfer general knowledge from\nhigh-resource to low-resource contexts.\n The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge,\nand 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in\nlow-resource languages like Mongolian. 
The dataset is available at\nhttps://github.com/joenahm/MM-Eval.\n","authors":["Mengyuan Zhang","Ruihui Wang","Bo Xia","Yuan Sun","Xiaobing Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01083v2","updated":"2024-11-14T14:52:54Z","published":"2024-09-02T09:11:28Z","title":"Affordance-based Robot Manipulation with Flow Matching","summary":" We present a framework for assistive robot manipulation, which focuses on two\nfundamental challenges: first, efficiently adapting large-scale models to\ndownstream scene affordance understanding tasks, especially in daily living\nscenarios where gathering multi-task data involving humans requires strenuous\neffort; second, effectively learning robot trajectories by grounding the visual\naffordance model. We tackle the first challenge by employing a\nparameter-efficient prompt tuning method that prepends learnable text prompts\nto the frozen vision model to predict manipulation affordances in multi-task\nscenarios. Then we propose to learn robot trajectories guided by affordances in\na supervised Flow Matching method. Flow matching represents a robot visuomotor\npolicy as a conditional process of flowing random waypoints to desired robot\ntrajectories. Finally, we introduce a real-world dataset with 10 tasks across\nActivities of Daily Living to test our framework. Our extensive evaluation\nhighlights that the proposed prompt tuning method for learning manipulation\naffordance with language prompter achieves competitive performance and even\noutperforms other finetuning protocols across data scales, while satisfying\nparameter efficiency. Learning multi-task robot trajectories with flow matching\npolicy also leads to consistently better generalization performance and faster\ninference than alternative behavior cloning methods, especially given\nmultimodal robot action distributions. 
Our framework seamlessly unifies\naffordance model learning and trajectory generation with flow matching for\nrobot manipulation.\n","authors":["Fan Zhang","Michael Gienger"],"pdf_url":"https://arxiv.org/pdf/2409.01083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09475v1","updated":"2024-11-14T14:31:30Z","published":"2024-11-14T14:31:30Z","title":"ResidualDroppath: Enhancing Feature Reuse over Residual Connections","summary":" Residual connections are one of the most important components in neural\nnetwork architectures for mitigating the vanishing gradient problem and\nfacilitating the training of much deeper networks. One possible explanation for\nhow residual connections aid deeper network training is by promoting feature\nreuse. However, we identify and analyze the limitations of feature reuse with\nvanilla residual connections. To address these limitations, we propose\nmodifications in training methods. Specifically, we provide an additional\nopportunity for the model to learn feature reuse with residual connections\nthrough two types of iterations during training. The first type of iteration\ninvolves using droppath, which enforces feature reuse by randomly dropping a\nsubset of layers. The second type of iteration focuses on training the dropped\nparts of the model while freezing the undropped parts. As a result, the dropped\nparts learn in a way that encourages feature reuse, as the model relies on the\nundropped parts with feature reuse in mind. Overall, we demonstrated\nperformance improvements in models with residual connections for image\nclassification in certain cases.\n","authors":["Sejik Park"],"pdf_url":"https://arxiv.org/pdf/2411.09475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06900v5","updated":"2024-11-14T14:28:58Z","published":"2024-02-10T07:55:27Z","title":"Can LLMs Recognize Toxicity? 
A Structured Investigation Framework and\n Toxicity Metric","summary":" In the pursuit of developing Large Language Models (LLMs) that adhere to\nsocietal standards, it is imperative to detect the toxicity in the generated\ntext. The majority of existing toxicity metrics rely on encoder models trained\non specific toxicity datasets, which are susceptible to out-of-distribution\n(OOD) problems and depend on the dataset's definition of toxicity. In this\npaper, we introduce a robust metric grounded on LLMs to flexibly measure\ntoxicity according to the given definition. We first analyze the toxicity\nfactors, followed by an examination of the intrinsic toxic attributes of LLMs\nto ascertain their suitability as evaluators. Finally, we evaluate the\nperformance of our metric with detailed analysis. Our empirical results\ndemonstrate outstanding performance in measuring toxicity within verified\nfactors, improving on conventional metrics by 12 points in the F1 score. Our\nfindings also indicate that upstream toxicity significantly influences\ndownstream metrics, suggesting that LLMs are unsuitable for toxicity\nevaluations within unverified factors.\n","authors":["Hyukhun Koh","Dohyung Kim","Minwoo Lee","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2402.06900v5.pdf","comment":"8 page long"},{"id":"http://arxiv.org/abs/2411.01013v2","updated":"2024-11-14T14:23:29Z","published":"2024-11-01T20:33:49Z","title":"A Similarity-Based Oversampling Method for Multi-label Imbalanced Text\n Data","summary":" In real-world applications, as data availability increases, obtaining labeled\ndata for machine learning (ML) projects remains challenging due to the high\ncosts and intensive efforts required for data annotation. Many ML projects,\nparticularly those focused on multi-label classification, also grapple with\ndata imbalance issues, where certain classes may lack sufficient data to train\neffective classifiers. 
This study introduces and examines a novel oversampling\nmethod for multi-label text classification, designed to address performance\nchallenges associated with data imbalance. The proposed method identifies\npotential new samples from unlabeled data by leveraging similarity measures\nbetween instances. By iteratively searching the unlabeled dataset, the method\nlocates instances similar to those in underrepresented classes and evaluates\ntheir contribution to classifier performance enhancement. Instances that\ndemonstrate performance improvement are then added to the labeled dataset.\nExperimental results indicate that the proposed approach effectively enhances\nclassifier performance post-oversampling.\n","authors":["Ismail Hakki Karaman","Gulser Koksal","Levent Eriskin","Salih Salihoglu"],"pdf_url":"https://arxiv.org/pdf/2411.01013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09471v1","updated":"2024-11-14T14:21:49Z","published":"2024-11-14T14:21:49Z","title":"Renal Cell Carcinoma subtyping: learning from multi-resolution\n localization","summary":" Renal Cell Carcinoma is typically asymptomatic at the early stages for many\npatients. This leads to a late diagnosis of the tumor, where the curability\nlikelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high,\nwith respect to its incidence rate. To increase the survival chance, a fast and\ncorrect categorization of the tumor subtype is paramount. Nowadays,\ncomputerized methods, based on artificial intelligence, represent an\ninteresting opportunity to improve the productivity and the objectivity of the\nmicroscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their\nexploitation is hampered by the paucity of annotated dataset, essential for a\nproficient training of supervised machine learning technologies. 
This study\nsets out to investigate a novel self supervised training strategy for machine\nlearning diagnostic tools, based on the multi-resolution nature of the\nhistological samples. We aim at reducing the need of annotated dataset, without\nsignificantly reducing the accuracy of the tool. We demonstrate the\nclassification capability of our tool on a whole slide imaging dataset for\nRenal Cancer subtyping, and we compare our solution with several\nstate-of-the-art classification counterparts.\n","authors":["Mohamad Mohamad","Francesco Ponzio","Santa Di Cataldo","Damien Ambrosetti","Xavier Descombes"],"pdf_url":"https://arxiv.org/pdf/2411.09471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09469v1","updated":"2024-11-14T14:18:40Z","published":"2024-11-14T14:18:40Z","title":"An Explainable Attention Model for Cervical Precancer Risk\n Classification using Colposcopic Images","summary":" Cervical cancer remains a major worldwide health issue, with early\nidentification and risk assessment playing critical roles in effective\npreventive interventions. This paper presents the Cervix-AID-Net model for\ncervical precancer risk classification. The study designs and evaluates the\nproposed Cervix-AID-Net model based on patients colposcopy images. The model\ncomprises a Convolutional Block Attention Module (CBAM) and convolutional\nlayers that extract interpretable and representative features of colposcopic\nimages to distinguish high-risk and low-risk cervical precancer. In addition,\nthe proposed Cervix-AID-Net model integrates four explainable techniques,\nnamely gradient class activation maps, Local Interpretable Model-agnostic\nExplanations, CartoonX, and pixel rate distortion explanation based on output\nfeature maps and input features. The evaluation using holdout and ten-fold\ncross-validation techniques yielded a classification accuracy of 99.33\\% and\n99.81\\%. 
The analysis revealed that CartoonX provides meticulous explanations\nfor the decision of the Cervix-AID-Net model due to its ability to provide the\nrelevant piece-wise smooth part of the image. The effect of Gaussian noise and\nblur on the input shows that the performance remains unchanged up to Gaussian\nnoise of 3\\% and blur of 10\\%, while the performance reduces thereafter. A\ncomparison study of the proposed model's performance compared to other deep\nlearning approaches highlights the Cervix-AID-Net model's potential as a\nsupplemental tool for increasing the effectiveness of cervical precancer risk\nassessment. The proposed method, which incorporates the CBAM and explainable\nartificial integration, has the potential to influence cervical cancer\nprevention and early detection, improving patient outcomes and lowering the\nworldwide burden of this preventable disease.\n","authors":["Smith K. Khare","Berit Bargum Booth","Victoria Blanes-Vidal","Lone Kjeld Petersen","Esmaeil S. Nadimi"],"pdf_url":"https://arxiv.org/pdf/2411.09469v1.pdf","comment":"19 pages, 9 figure, and 7 tables"},{"id":"http://arxiv.org/abs/2402.03227v4","updated":"2024-11-14T14:11:57Z","published":"2024-02-05T17:38:49Z","title":"IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of\n brain MR images","summary":" In MRI studies, the aggregation of imaging data from multiple acquisition\nsites enhances sample size but may introduce site-related variabilities that\nhinder consistency in subsequent analyses. Deep learning methods for image\ntranslation have emerged as a solution for harmonizing MR images across sites.\nIn this study, we introduce IGUANe (Image Generation with Unified Adversarial\nNetworks), an original 3D model that leverages the strengths of domain\ntranslation and straightforward application of style transfer methods for\nmulticenter brain MR image harmonization. 
IGUANe extends CycleGAN by\nintegrating an arbitrary number of domains for training through a many-to-one\narchitecture. The framework based on domain pairs enables the implementation of\nsampling strategies that prevent confusion between site-related and biological\nvariabilities. During inference, the model can be applied to any image, even\nfrom an unknown acquisition site, making it a universal generator for\nharmonization. Trained on a dataset comprising T1-weighted images from 11\ndifferent scanners, IGUANe was evaluated on data from unseen sites. The\nassessments included the transformation of MR images with traveling subjects,\nthe preservation of pairwise distances between MR images within domains, the\nevolution of volumetric patterns related to age and Alzheimer$'$s disease (AD),\nand the performance in age regression and patient classification tasks.\nComparisons with other harmonization and normalization methods suggest that\nIGUANe better preserves individual information in MR images and is more\nsuitable for maintaining and reinforcing variabilities related to age and AD.\nFuture studies may further assess IGUANe in other multicenter contexts, either\nusing the same model or retraining it for applications to different image\nmodalities. IGUANe is available at\nhttps://github.com/RocaVincent/iguane_harmonization.git.\n","authors":["Vincent Roca","Grégory Kuchcinski","Jean-Pierre Pruvo","Dorian Manouvriez","Renaud Lopes"],"pdf_url":"https://arxiv.org/pdf/2402.03227v4.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.08586v2","updated":"2024-11-14T14:07:19Z","published":"2024-11-13T13:09:14Z","title":"Optimizing Automatic Summarization of Long Clinical Records Using\n Dynamic Context Extension:Testing and Evaluation of the NBCE Method","summary":" Summarizing patient clinical notes is vital for reducing documentation\nburdens. Current manual summarization makes medical staff struggle. 
We propose\nan automatic method using LLMs, but long inputs cause LLMs to lose context,\nreducing output quality especially in small size model. We used a 7B model,\nopen-calm-7b, enhanced with Native Bayes Context Extend and a redesigned\ndecoding mechanism to reference one sentence at a time, keeping inputs within\ncontext windows, 2048 tokens. Our improved model achieved near parity with\nGoogle's over 175B Gemini on ROUGE-L metrics with 200 samples, indicating\nstrong performance using less resources, enhancing automated EMR summarization\nfeasibility.\n","authors":["Guoqing Zhang","Keita Fukuyama","Kazumasa Kishimoto","Tomohiro Kuroda"],"pdf_url":"https://arxiv.org/pdf/2411.08586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09451v1","updated":"2024-11-14T13:56:02Z","published":"2024-11-14T13:56:02Z","title":"DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous\n Vehicle Testing","summary":" Generating realistic and diverse road scenarios is essential for autonomous\nvehicle testing and validation. Nevertheless, owing to the complexity and\nvariability of real-world road environments, creating authentic and varied\nscenarios for intelligent driving testing is challenging. In this paper, we\npropose DiffRoad, a novel diffusion model designed to produce controllable and\nhigh-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities\nof diffusion models to synthesize road layouts from white noise through an\ninverse denoising process, preserving real-world spatial features. To enhance\nthe quality of generated scenarios, we design the Road-UNet architecture,\noptimizing the balance between backbone and skip connections for high-realism\nscenario generation. 
Furthermore, we introduce a road scenario evaluation\nmodule that screens adequate and reasonable scenarios for intelligent driving\ntesting using two critical metrics: road continuity and road reasonableness.\nExperimental results on multiple real-world datasets demonstrate DiffRoad's\nability to generate realistic and smooth road structures while maintaining the\noriginal distribution. Additionally, the generated scenarios can be fully\nautomated into the OpenDRIVE format, facilitating generalized autonomous\nvehicle simulation testing. DiffRoad provides a rich and diverse scenario\nlibrary for large-scale autonomous vehicle testing and offers valuable insights\nfor future infrastructure designs that are better suited for autonomous\nvehicles.\n","authors":["Junjie Zhou","Lin Wang","Qiang Meng","Xiaofan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09451v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.09429v1","updated":"2024-11-14T13:25:04Z","published":"2024-11-14T13:25:04Z","title":"AI-driven inverse design of materials: Past, present and future","summary":" The discovery of advanced materials is the cornerstone of human technological\ndevelopment and progress. The structures of materials and their corresponding\nproperties are essentially the result of a complex interplay of multiple\ndegrees of freedom such as lattice, charge, spin, symmetry, and topology. This\nposes significant challenges for the inverse design methods of materials.\nHumans have long explored new materials through a large number of experiments\nand proposed corresponding theoretical systems to predict new material\nproperties and structures. With the improvement of computational power,\nresearchers have gradually developed various electronic structure calculation\nmethods, particularly such as the one based density functional theory, as well\nas high-throughput computational methods. 
Recently, the rapid development of\nartificial intelligence technology in the field of computer science has enabled\nthe effective characterization of the implicit association between material\nproperties and structures, thus opening up an efficient paradigm for the\ninverse design of functional materials. A significant progress has been made in\ninverse design of materials based on generative and discriminative models,\nattracting widespread attention from researchers. Considering this rapid\ntechnological progress, in this survey, we look back on the latest advancements\nin AI-driven inverse design of materials by introducing the background, key\nfindings, and mainstream technological development routes. In addition, we\nsummarize the remaining issues for future directions. This survey provides the\nlatest overview of AI-driven inverse design of materials, which can serve as a\nuseful resource for researchers.\n","authors":["Xiao-Qi Han","Xin-De Wang","Meng-Yuan Xu","Zhen Feng","Bo-Wen Yao","Peng-Jie Guo","Ze-Feng Gao","Zhong-Yi Lu"],"pdf_url":"https://arxiv.org/pdf/2411.09429v1.pdf","comment":"43 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.09422v1","updated":"2024-11-14T13:18:06Z","published":"2024-11-14T13:18:06Z","title":"An Adaptive Open-Source Dataset Generation Framework for Machine\n Learning Tasks in Logic Synthesis","summary":" This paper introduces an adaptive logic synthesis dataset generation\nframework designed to enhance machine learning applications within the logic\nsynthesis process. Unlike previous dataset generation flows that were tailored\nfor specific tasks or lacked integrated machine learning capabilities, the\nproposed framework supports a comprehensive range of machine learning tasks by\nencapsulating the three fundamental steps of logic synthesis: Boolean\nrepresentation, logic optimization, and technology mapping. 
It preserves the\noriginal information in the intermediate files that can be stored in both\nVerilog and Graphmal format. Verilog files enable semi-customizability,\nallowing researchers to add steps and incrementally refine the generated\ndataset. The framework also includes an adaptive circuit engine to facilitate\nthe loading of GraphML files for final dataset packaging and sub-dataset\nextraction. The generated OpenLS-D dataset comprises 46 combinational designs\nfrom established benchmarks, totaling over 966,000 Boolean circuits, with each\ndesign containing 21,000 circuits generated from 1000 synthesis recipes,\nincluding 7000 Boolean networks, 7000 ASIC netlists, and 7000 FPGA netlists.\nFurthermore, OpenLS-D supports integrating newly desired data features, making\nit more versatile for new challenges. The utility of OpenLS-D is demonstrated\nthrough four distinct downstream tasks: circuit classification, circuit\nranking, quality of results (QoR) prediction, and probability prediction. Each\ntask highlights different internal steps of logic synthesis, with the datasets\nextracted and relabeled from the OpenLS-D dataset using the circuit engine. The\nexperimental results confirm the dataset's diversity and extensive\napplicability. The source code and datasets are available at\nhttps://github.com/Logic-Factory/ACE/blob/master/OpenLS-D/readme.md.\n","authors":["Liwei Ni","Rui Wang","Miao Liu","Xingyu Meng","Xiaoze Lin","Junfeng Liu","Guojie Luo","Zhufei Chu","Weikang Qian","Xiaoyan Yang","Biwei Xie","Xingquan Li","Huawei Li"],"pdf_url":"https://arxiv.org/pdf/2411.09422v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.09420v1","updated":"2024-11-14T13:15:27Z","published":"2024-11-14T13:15:27Z","title":"SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph\n Attention for Vision Transformers","summary":" Image classification is a computer vision task where a model analyzes an\nimage to categorize it into a specific label. 
Vision Transformers (ViT) improve\nthis task by leveraging self-attention to capture complex patterns and long\nrange relationships between image patches. However, a key challenge for ViTs is\nefficiently incorporating multiscale feature representations, which is inherent\nin CNNs through their hierarchical structure. In this paper, we introduce the\nScale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework\nthat addresses this challenge by integrating multi-scale features. Using\nEfficientNet as a backbone, the model extracts multi-scale feature maps, which\nare divided into patches to preserve semantic information. These patches are\norganized into a graph based on spatial and feature similarities, with a Graph\nAttention Network (GAT) refining the node embeddings. Finally, a Transformer\nencoder captures long-range dependencies and complex interactions. The SAG-ViT\nis evaluated on benchmark datasets, demonstrating its effectiveness in\nenhancing image classification performance.\n","authors":["Shravan Venkatraman","Jaskaran Singh Walia","Joe Dhanith P R"],"pdf_url":"https://arxiv.org/pdf/2411.09420v1.pdf","comment":"10 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.09413v1","updated":"2024-11-14T13:07:19Z","published":"2024-11-14T13:07:19Z","title":"Script-centric behavior understanding for assisted autism spectrum\n disorder diagnosis","summary":" Observing and analyzing children's social behaviors is crucial for the early\ndiagnosis of Autism Spectrum Disorders (ASD). This work focuses on\nautomatically detecting ASD using computer vision techniques and large language\nmodels (LLMs). Existing methods typically rely on supervised learning. However,\nthe scarcity of ASD diagnostic datasets and the lack of interpretability in\ndiagnostic results significantly limits its clinical application. To address\nthese challenges, we introduce a novel unsupervised approach based on\nscript-centric behavior understanding. 
Our pipeline converts video content into\nscripts that describe the behavior of characters, leveraging the\ngeneralizability of large language models to detect ASD in a zero-shot or\nfew-shot manner. Specifically, we propose a scripts transcription module for\nmultimodal behavior data textualization and a domain prompts module to bridge\nLLMs. Our method achieves an accuracy of 92.00\\% in diagnosing ASD in children\nwith an average age of 24 months, surpassing the performance of supervised\nlearning methods by 3.58\\% absolutely. Extensive experiments confirm the\neffectiveness of our approach and suggest its potential for advancing ASD\nresearch through LLMs.\n","authors":["Wenxing Liu","Yueran Pan","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2411.09413v1.pdf","comment":"5 pages, 4 figures, submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2410.07974v3","updated":"2024-11-14T12:51:52Z","published":"2024-10-10T14:32:16Z","title":"Doob's Lagrangian: A Sample-Efficient Variational Approach to Transition\n Path Sampling","summary":" Rare event sampling in dynamical systems is a fundamental problem arising in\nthe natural sciences, which poses significant computational challenges due to\nan exponentially large space of trajectories. For settings where the dynamical\nsystem of interest follows a Brownian motion with known drift, the question of\nconditioning the process to reach a given endpoint or desired rare event is\ndefinitively answered by Doob's h-transform. However, the naive estimation of\nthis transform is infeasible, as it requires simulating sufficiently many\nforward trajectories to estimate rare event probabilities. In this work, we\npropose a variational formulation of Doob's h-transform as an optimization\nproblem over trajectories between a given initial point and the desired ending\npoint. 
To solve this optimization, we propose a simulation-free training\nobjective with a model parameterization that imposes the desired boundary\nconditions by design. Our approach significantly reduces the search space over\ntrajectories and avoids expensive trajectory simulation and inefficient\nimportance sampling estimators which are required in existing methods. We\ndemonstrate the ability of our method to find feasible transition paths on\nreal-world molecular simulation and protein folding tasks.\n","authors":["Yuanqi Du","Michael Plainer","Rob Brekelmans","Chenru Duan","Frank Noé","Carla P. Gomes","Alán Aspuru-Guzik","Kirill Neklyudov"],"pdf_url":"https://arxiv.org/pdf/2410.07974v3.pdf","comment":"Accepted as Spotlight at Conference on Neural Information Processing\n Systems (NeurIPS 2024); Alanine dipeptide results updated after fixing\n unphysical parameterization"},{"id":"http://arxiv.org/abs/2410.17856v2","updated":"2024-11-14T12:29:41Z","published":"2024-10-23T13:26:59Z","title":"ROCKET-1: Mastering Open-World Interaction with Visual-Temporal Context\n Prompting","summary":" Vision-language models (VLMs) have excelled in multimodal tasks, but adapting\nthem to embodied decision-making in open-world environments presents\nchallenges. One critical issue is bridging the gap between discrete entities in\nlow-level observations and the abstract concepts required for effective\nplanning. A common solution is building hierarchical agents, where VLMs serve\nas high-level reasoners that break down tasks into executable sub-tasks,\ntypically specified using language. However, language suffers from the\ninability to communicate detailed spatial information. We propose\nvisual-temporal context prompting, a novel communication protocol between VLMs\nand policy models. This protocol leverages object segmentation from past\nobservations to guide policy-environment interactions. 
Using this approach, we\ntrain ROCKET-1, a low-level policy that predicts actions based on concatenated\nvisual observations and segmentation masks, supported by real-time object\ntracking from SAM-2. Our method unlocks the potential of VLMs, enabling them to\ntackle complex tasks that demand spatial reasoning. Experiments in Minecraft\nshow that our approach enables agents to achieve previously unattainable tasks,\nwith a $\\mathbf{76}\\%$ absolute improvement in open-world interaction\nperformance. Codes and demos are now available on the project page:\nhttps://craftjarvis.github.io/ROCKET-1.\n","authors":["Shaofei Cai","Zihao Wang","Kewei Lian","Zhancun Mu","Xiaojian Ma","Anji Liu","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2410.17856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09403v1","updated":"2024-11-14T12:27:50Z","published":"2024-11-14T12:27:50Z","title":"Quantum Machine Learning: An Interplay Between Quantum Computing and\n Machine Learning","summary":" Quantum machine learning (QML) is a rapidly growing field that combines\nquantum computing principles with traditional machine learning. It seeks to\nrevolutionize machine learning by harnessing the unique capabilities of quantum\nmechanics and employs machine learning techniques to advance quantum computing\nresearch. This paper introduces quantum computing for the machine learning\nparadigm, where variational quantum circuits (VQC) are used to develop QML\narchitectures on noisy intermediate-scale quantum (NISQ) devices. We discuss\nmachine learning for the quantum computing paradigm, showcasing our recent\ntheoretical and empirical findings. 
In particular, we delve into future\ndirections for studying QML, exploring the potential industrial impacts of QML\nresearch.\n","authors":["Jun Qi","Chao-Han Yang","Samuel Yen-Chi Chen","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09403v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2411.09402v1","updated":"2024-11-14T12:27:31Z","published":"2024-11-14T12:27:31Z","title":"Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast\n Computed Tomography Images for Enhanced Treatment and Prognosis","summary":" Stroke is the second leading cause of death worldwide, and is increasingly\nprevalent in low- and middle-income countries (LMICs). Timely interventions can\nsignificantly influence stroke survivability and the quality of life after\ntreatment. However, the standard and most widely available imaging method for\nconfirming strokes and their sub-types, the NCCT, is more challenging and\ntime-consuming to employ in cases of ischemic stroke. For this reason, we\ndeveloped an automated method for ischemic stroke lesion segmentation in NCCTs\nusing the nnU-Net frame work, aimed at enhancing early treatment and improving\nthe prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and\nIntersection over Union (IoU) scores of 0.501 on the sampled dataset. After\nadjusting for outliers, these scores improved to 0.752 for the Dice score and\n0.643 for the IoU. 
Proper delineation of the region of infarction can help\nclinicians better assess the potential impact of the infarction, and guide\ntreatment procedures.\n","authors":["Toufiq Musah","Prince Ebenezer Adjei","Kojo Obed Otoo"],"pdf_url":"https://arxiv.org/pdf/2411.09402v1.pdf","comment":"7 pages, 3 figures, MICCAI Meets Africa Workshop"},{"id":"http://arxiv.org/abs/2411.09400v1","updated":"2024-11-14T12:19:28Z","published":"2024-11-14T12:19:28Z","title":"Imagined Speech and Visual Imagery as Intuitive Paradigms for\n Brain-Computer Interfaces","summary":" Recent advancements in brain-computer interface (BCI) technology have\nemphasized the promise of imagined speech and visual imagery as effective\nparadigms for intuitive communication. This study investigates the\nclassification performance and brain connectivity patterns associated with\nthese paradigms, focusing on decoding accuracy across selected word classes.\nSixteen participants engaged in tasks involving thirteen imagined speech and\nvisual imagery classes, revealing above-chance classification accuracy for both\nparadigms. Variability in classification accuracy across individual classes\nhighlights the influence of sensory and motor associations in imagined speech\nand vivid visual associations in visual imagery. Connectivity analysis further\ndemonstrated increased functional connectivity in language-related and sensory\nregions for imagined speech, whereas visual imagery activated spatial and\nvisual processing networks. These findings suggest the potential of imagined\nspeech and visual imagery as an intuitive and scalable paradigm for BCI\ncommunication when selecting optimal word classes. 
Further exploration of the\ndecoding outcomes for these two paradigms could provide insights for practical\nBCI communication.\n","authors":["Seo-Hyun Lee","Ji-Ha Park","Deok-Seon Kim"],"pdf_url":"https://arxiv.org/pdf/2411.09400v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2410.21991v5","updated":"2024-11-14T12:19:26Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, most of them were\nblack-box systems which faced challenges regarding explainability during\ntraining and inference processes. An important question is how to incorporate\nexplicit knowledge into these implicit models, thereby designing expertdriven\nand interpretable violence surveillance systems. This paper proposes a new\nparadigm for weakly supervised violence monitoring (WSVM) called Rule base\nViolence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure\nwith different designs for images and text. One of the branches is called the\nimplicit branch, which uses only visual features for coarse-grained binary\nclassification. In this branch, image feature extraction is divided into two\nchannels: one responsible for extracting scene frames and the other focusing on\nextracting actions. The other branch is called the explicit branch, which\nutilizes language-image alignment to perform fine-grained classification. For\nthe language channel design in the explicit branch, the proposed RuleVM uses\nthe state-of-the-art YOLOWorld model to detect objects in video frames, and\nassociation rules are identified through data mining methods as descriptions of\nthe video. Leveraging the dual-branch architecture, RuleVM achieves\ninterpretable coarse-grained and fine-grained violence surveillance. 
Extensive\nexperiments were conducted on two commonly used benchmarks, and the results\nshow that RuleVM achieved the best performance in both coarse-grained and\nfinegrained monitoring, significantly outperforming existing state-ofthe-art\nmethods. Moreover, interpretability experiments uncovered some interesting\nrules, such as the observation that as the number of people increases, the risk\nlevel of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Ssu-Chi Kuai","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v5.pdf","comment":"12 pages,7 figures IEEE TSMCA (Under review)"},{"id":"http://arxiv.org/abs/2411.09389v1","updated":"2024-11-14T12:05:35Z","published":"2024-11-14T12:05:35Z","title":"Less is More: Unseen Domain Fake News Detection via Causal Propagation\n Substructures","summary":" The spread of fake news on social media poses significant threats to\nindividuals and society. Text-based and graph-based models have been employed\nfor fake news detection by analysing news content and propagation networks,\nshowing promising results in specific scenarios. However, these data-driven\nmodels heavily rely on pre-existing in-distribution data for training, limiting\ntheir performance when confronted with fake news from emerging or previously\nunseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news\nis a challenging yet critical task. In this paper, we introduce the Causal\nSubgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to\nenhance zero-shot fake news detection by extracting causal substructures from\npropagation graphs using in-distribution data and generalising this approach to\nOOD data. The model employs a graph neural network based mask generation\nprocess to identify dominant nodes and edges within the propagation graph,\nusing these substructures for fake news detection. 
Additionally, the\nperformance of CSDA is further improved through contrastive learning in\nfew-shot scenarios, where a limited amount of OOD data is available for\ntraining. Extensive experiments on public social media datasets demonstrate\nthat CSDA effectively handles OOD fake news detection, achieving a 7 to 16\npercents accuracy improvement over other state-of-the-art models.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2411.09389v1.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2410.17283v2","updated":"2024-11-14T11:42:52Z","published":"2024-10-15T13:28:55Z","title":"Advancements in Visual Language Models for Remote Sensing: Datasets,\n Capabilities, and Enhancement Techniques","summary":" Recently, the remarkable success of ChatGPT has sparked a renewed wave of\ninterest in artificial intelligence (AI), and the advancements in visual\nlanguage models (VLMs) have pushed this enthusiasm to new heights. Differring\nfrom previous AI approaches that generally formulated different tasks as\ndiscriminative models, VLMs frame tasks as generative models and align language\nwith visual information, enabling the handling of more challenging problems.\nThe remote sensing (RS) field, a highly practical domain, has also embraced\nthis new trend and introduced several VLM-based RS methods that have\ndemonstrated promising performance and enormous potential. In this paper, we\nfirst review the fundamental theories related to VLM, then summarize the\ndatasets constructed for VLMs in remote sensing and the various tasks they\naddressed. Finally, we categorize the improvement methods into three main parts\naccording to the core components of VLMs and provide a detailed introduction\nand comparison of these methods. 
A project associated with this review has been\ncreated at https://github.com/taolijie11111/VLMs-in-RS-review.\n","authors":["Lijie Tao","Haokui Zhang","Haizhao Jing","Yu Liu","Kelu Yao","Chao Li","Xizhe Xue"],"pdf_url":"https://arxiv.org/pdf/2410.17283v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05767v2","updated":"2024-11-14T11:27:41Z","published":"2024-10-08T07:48:34Z","title":"Grounding is All You Need? Dual Temporal Grounding for Video Dialog","summary":" In the realm of video dialog response generation, the understanding of video\ncontent and the temporal nuances of conversation history are paramount. While a\nsegment of current research leans heavily on large-scale pretrained\nvisual-language models and often overlooks temporal dynamics, another delves\ndeep into spatial-temporal relationships within videos but demands intricate\nobject trajectory pre-extractions and sidelines dialog temporal dynamics. This\npaper introduces the Dual Temporal Grounding-enhanced Video Dialog model\n(DTGVD), strategically designed to merge the strengths of both dominant\napproaches. It emphasizes dual temporal relationships by predicting dialog\nturn-specific temporal regions, filtering video content accordingly, and\ngrounding responses in both video and dialog contexts. One standout feature of\nDTGVD is its heightened attention to chronological interplay. By recognizing\nand acting upon the dependencies between different dialog turns, it captures\nmore nuanced conversational dynamics. To further bolster the alignment between\nvideo and dialog temporal dynamics, we've implemented a list-wise contrastive\nlearning strategy. Within this framework, accurately grounded turn-clip\npairings are designated as positive samples, while less precise pairings are\ncategorized as negative. This refined classification is then funneled into our\nholistic end-to-end response generation mechanism. 
Evaluations using\nAVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our\nmethodology.\n","authors":["You Qin","Wei Ji","Xinze Lan","Hao Fei","Xun Yang","Dan Guo","Roger Zimmermann","Lizi Liao"],"pdf_url":"https://arxiv.org/pdf/2410.05767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09366v1","updated":"2024-11-14T11:17:06Z","published":"2024-11-14T11:17:06Z","title":"LTLf+ and PPLTL+: Extending LTLf and PPLTL to Infinite Traces","summary":" We introduce LTLf+ and PPLTL+, two logics to express properties of infinite\ntraces, that are based on the linear-time temporal logics LTLf and PPLTL on\nfinite traces. LTLf+/PPLTL+ use levels of Manna and Pnueli's LTL\nsafety-progress hierarchy, and thus have the same expressive power as LTL.\nHowever, they also retain a crucial characteristic of the reactive synthesis\nproblem for the base logics: the game arena for strategy extraction can be\nderived from deterministic finite automata (DFA). Consequently, these logics\ncircumvent the notorious difficulties associated with determinizing infinite\ntrace automata, typical of LTL reactive synthesis. We present DFA-based\nsynthesis techniques for LTLf+/PPLTL+, and show that synthesis is\n2EXPTIME-complete for LTLf+ (matching LTLf) and EXPTIME-complete for PPLTL+\n(matching PPLTL). Notably, while PPLTL+ retains the full expressive power of\nLTL, reactive synthesis is EXPTIME-complete instead of 2EXPTIME-complete. The\ntechniques are also adapted to optimally solve satisfiability, validity, and\nmodel-checking, to get EXPSPACE-complete for LTLf+ (extending a recent result\nfor the guarantee level using LTLf), and PSPACE-complete for PPLTL+.\n","authors":["Benjamin Aminof","Giuseppe De Giacomo","Sasha Rubin","Moshe Y. 
Vardi"],"pdf_url":"https://arxiv.org/pdf/2411.09366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17724v2","updated":"2024-11-14T11:06:36Z","published":"2024-05-28T00:42:18Z","title":"ClavaDDPM: Multi-relational Data Synthesis with Cluster-guided Diffusion\n Models","summary":" Recent research in tabular data synthesis has focused on single tables,\nwhereas real-world applications often involve complex data with tens or\nhundreds of interconnected tables. Previous approaches to synthesizing\nmulti-relational (multi-table) data fall short in two key aspects: scalability\nfor larger datasets and capturing long-range dependencies, such as correlations\nbetween attributes spread across different tables. Inspired by the success of\ndiffusion models in tabular data modeling, we introduce\n $\\textbf{C}luster$ $\\textbf{La}tent$ $\\textbf{Va}riable$ $guided$\n$\\textbf{D}enoising$ $\\textbf{D}iffusion$ $\\textbf{P}robabilistic$\n$\\textbf{M}odels$ (ClavaDDPM). This novel approach leverages clustering labels\nas intermediaries to model relationships between tables, specifically focusing\non foreign key constraints. ClavaDDPM leverages the robust generation\ncapabilities of diffusion models while incorporating efficient algorithms to\npropagate the learned latent variables across tables. 
This enables ClavaDDPM to\ncapture long-range dependencies effectively.\n Extensive evaluations on multi-table datasets of varying sizes show that\nClavaDDPM significantly outperforms existing methods for these long-range\ndependencies while remaining competitive on utility metrics for single-table\ndata.\n","authors":["Wei Pang","Masoumeh Shafieinejad","Lucy Liu","Stephanie Hazlewood","Xi He"],"pdf_url":"https://arxiv.org/pdf/2405.17724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09359v1","updated":"2024-11-14T11:06:34Z","published":"2024-11-14T11:06:34Z","title":"Your Fixed Watermark is Fragile: Towards Semantic-Aware Watermark for\n EaaS Copyright Protection","summary":" Embedding-as-a-Service (EaaS) has emerged as a successful business pattern\nbut faces significant challenges related to various forms of copyright\ninfringement, including API misuse and different attacks. Various studies have\nproposed backdoor-based watermarking schemes to protect the copyright of EaaS\nservices. In this paper, we reveal that previous watermarking schemes possess\nsemantic-independent characteristics and propose the Semantic Perturbation\nAttack (SPA). Our theoretical and experimental analyses demonstrate that this\nsemantic-independent nature makes current watermarking schemes vulnerable to\nadaptive attacks that exploit semantic perturbations test to bypass watermark\nverification. To address this vulnerability, we propose the Semantic Aware\nWatermarking (SAW) scheme, a robust defense mechanism designed to resist SPA,\nby injecting a watermark that adapts to the text semantics. Extensive\nexperimental results across multiple datasets demonstrate that the True\nPositive Rate (TPR) for detecting watermarked samples under SPA can reach up to\nmore than 95%, rendering previous watermarks ineffective. Meanwhile, our\nwatermarking scheme can resist such attack while ensuring the watermark\nverification capability. 
Our code is available at\nhttps://github.com/Zk4-ps/EaaS-Embedding-Watermark.\n","authors":["Zekun Fei","Biao Yi","Jianing Geng","Ruiqi He","Lihai Nie","Zheli Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09356v1","updated":"2024-11-14T11:01:45Z","published":"2024-11-14T11:01:45Z","title":"Multi-scale Generative Modeling for Fast Sampling","summary":" While working within the spatial domain can pose problems associated with\nill-conditioned scores caused by power-law decay, recent advances in\ndiffusion-based generative models have shown that transitioning to the wavelet\ndomain offers a promising alternative. However, within the wavelet domain, we\nencounter unique challenges, especially the sparse representation of\nhigh-frequency coefficients, which deviates significantly from the Gaussian\nassumptions in the diffusion process. To this end, we propose a multi-scale\ngenerative modeling in the wavelet domain that employs distinct strategies for\nhandling low and high-frequency bands. In the wavelet domain, we apply\nscore-based generative modeling with well-conditioned scores for low-frequency\nbands, while utilizing a multi-scale generative adversarial learning for\nhigh-frequency bands. As supported by the theoretical analysis and experimental\nresults, our model significantly improve performance and reduce the number of\ntrainable parameters, sampling steps, and time.\n","authors":["Xiongye Xiao","Shixuan Li","Luzhe Huang","Gengshuo Liu","Trung-Kien Nguyen","Yi Huang","Di Chang","Mykel J. 
Kochenderfer","Paul Bogdan"],"pdf_url":"https://arxiv.org/pdf/2411.09356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18406v2","updated":"2024-11-14T10:55:14Z","published":"2024-06-26T14:57:38Z","title":"IRCAN: Mitigating Knowledge Conflicts in LLM Generation via Identifying\n and Reweighting Context-Aware Neurons","summary":" It is widely acknowledged that large language models (LLMs) encode a vast\nreservoir of knowledge after being trained on mass data. Recent studies\ndisclose knowledge conflicts in LLM generation, wherein outdated or incorrect\nparametric knowledge (i.e., encoded knowledge) contradicts new knowledge\nprovided in the context. To mitigate such knowledge conflicts, we propose a\nnovel framework, IRCAN (Identifying and Reweighting Context-Aware Neurons) to\ncapitalize on neurons that are crucial in processing contextual cues.\nSpecifically, IRCAN first identifies neurons that significantly contribute to\ncontext processing, utilizing a context-aware attribution score derived from\nintegrated gradients. Subsequently, the identified context-aware neurons are\nstrengthened via reweighting. In doing so, we steer LLMs to generate\ncontext-sensitive outputs with respect to the new knowledge provided in the\ncontext. Extensive experiments conducted across a variety of models and tasks\ndemonstrate that IRCAN not only achieves remarkable improvements in handling\nknowledge conflicts but also offers a scalable, plug-and-play solution that can\nbe integrated seamlessly with existing models. 
Our codes are released at\nhttps://github.com/danshi777/IRCAN.\n","authors":["Dan Shi","Renren Jin","Tianhao Shen","Weilong Dong","Xinwei Wu","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.18406v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.13292v2","updated":"2024-11-14T10:20:53Z","published":"2024-06-19T07:31:47Z","title":"An interpretable generative multimodal neuroimaging-genomics framework\n for decoding Alzheimer's disease","summary":" Alzheimer's disease (AD) is the most prevalent form of dementia with a\nprogressive decline in cognitive abilities. The AD continuum encompasses a\nprodromal stage known as MCI, where patients may either progress to AD (MCIc)\nor remain stable (MCInc). Understanding AD mechanisms requires complementary\nanalyses relying on different data sources, leading to the development of\nmultimodal DL models. We leveraged structural and functional MRI to investigate\nthe disease-induced GM and functional network connectivity changes. Moreover,\nconsidering AD's strong genetic component, we introduced SNPs as a third\nchannel. Missing one or more modalities is a typical concern of multimodal\nmethods. We hence propose a novel DL-based classification framework where a\ngenerative module employing Cycle GAN was adopted for imputing missing data in\nthe latent space. Additionally, we adopted an XAI method, Integrated Gradients,\nto extract features' relevance, enhancing our understanding of the learned\nrepresentations. Two tasks were addressed: AD detection and MCI conversion\nprediction. Experimental results showed that our framework reached the SOA in\nthe classification of CN/AD with an average test accuracy of $0.926\\pm0.02$.\nFor the MCInc/MCIc task, we achieved an average prediction accuracy of\n$0.711\\pm0.01$ using the pre-trained model for CN and AD. 
The interpretability\nanalysis revealed that significant GM modulations led the classification\nperformance in cortical and subcortical brain areas well known for their\nassociation with AD. Impairments in sensory-motor and visual functional network\nconnectivity along AD, as well as mutations in SNPs defining biological\nprocesses linked to endocytosis, amyloid-beta, and cholesterol, were identified\nas contributors to the results. Overall, our integrative DL model shows promise\nfor AD detection and MCI prediction, while shading light on important\nbiological insights.\n","authors":["Giorgio Dolci","Federica Cruciani","Md Abdur Rahaman","Anees Abrol","Jiayu Chen","Zening Fu","Ilaria Boscolo Galazzo","Gloria Menegaz","Vince D. Calhoun"],"pdf_url":"https://arxiv.org/pdf/2406.13292v2.pdf","comment":"28 pages, 8 figures, submitted to a journal"},{"id":"http://arxiv.org/abs/2312.06231v4","updated":"2024-11-14T09:54:47Z","published":"2023-12-11T09:18:14Z","title":"Uncovering communities of pipelines in the task-fMRI analytical space","summary":" Analytical workflows in functional magnetic resonance imaging are highly\nflexible with limited best practices as to how to choose a pipeline. While it\nhas been shown that the use of different pipelines might lead to different\nresults, there is still a lack of understanding of the factors that drive these\ndifferences and of the stability of these differences across contexts. We use\ncommunity detection algorithms to explore the pipeline space and assess the\nstability of pipeline relationships across different contexts. We show that\nthere are subsets of pipelines that give similar results, especially those\nsharing specific parameters (e.g. number of motion regressors, software\npackages, etc.). Those pipeline-to-pipeline patterns are stable across groups\nof participants but not across different tasks. 
By visualizing the differences\nbetween communities, we show that the pipeline space is mainly driven by the\nsize of the activation area in the brain and the scale of statistic values in\nstatistic maps.\n","authors":["Elodie Germani","Elisa Fromont","Camille Maumet"],"pdf_url":"https://arxiv.org/pdf/2312.06231v4.pdf","comment":"Accepted at the 2024 IEEE International Conference on Image\n Processing"},{"id":"http://arxiv.org/abs/2206.04438v2","updated":"2024-11-14T09:37:20Z","published":"2022-06-09T11:59:42Z","title":"A taxonomy of explanations to support Explainability-by-Design","summary":" As automated decision-making solutions are increasingly applied to all\naspects of everyday life, capabilities to generate meaningful explanations for\na variety of stakeholders (i.e., decision-makers, recipients of decisions,\nauditors, regulators...) become crucial. In this paper, we present a taxonomy\nof explanations that was developed as part of a holistic\n'Explainability-by-Design' approach for the purposes of the project PLEAD. The\ntaxonomy was built with a view to produce explanations for a wide range of\nrequirements stemming from a variety of regulatory frameworks or policies set\nat the organizational level either to translate high-level compliance\nrequirements or to meet business needs. The taxonomy comprises nine dimensions.\nIt is used as a stand-alone classifier of explanations conceived as detective\ncontrols, in order to aid supportive automated compliance strategies. 
A\nmachinereadable format of the taxonomy is provided in the form of a light\nontology and the benefits of starting the Explainability-by-Design journey with\nsuch a taxonomy are demonstrated through a series of examples.\n","authors":["Niko Tsakalakis","Sophie Stalla-Bourdillon","Trung Dong Huynh","Luc Moreau"],"pdf_url":"https://arxiv.org/pdf/2206.04438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05521v2","updated":"2024-11-14T09:28:49Z","published":"2024-11-08T12:27:13Z","title":"SM3-Text-to-Query: Synthetic Multi-Model Medical Text-to-Query Benchmark","summary":" Electronic health records (EHRs) are stored in various database systems with\ndifferent database models on heterogeneous storage architectures, such as\nrelational databases, document stores, or graph databases. These different\ndatabase models have a big impact on query complexity and performance. While\nthis has been a known fact in database research, its implications for the\ngrowing number of Text-to-Query systems have surprisingly not been investigated\nso far. In this paper, we present SM3-Text-to-Query, the first multi-model\nmedical Text-to-Query benchmark based on synthetic patient data from Synthea,\nfollowing the SNOMED-CT taxonomy -- a widely used knowledge graph ontology\ncovering medical terminology. SM3-Text-to-Query provides data representations\nfor relational databases (PostgreSQL), document stores (MongoDB), and graph\ndatabases (Neo4j and GraphDB (RDF)), allowing the evaluation across four\npopular query languages, namely SQL, MQL, Cypher, and SPARQL. We systematically\nand manually develop 408 template questions, which we augment to construct a\nbenchmark of 10K diverse natural language question/query pairs for these four\nquery languages (40K pairs overall). On our dataset, we evaluate several common\nin-context-learning (ICL) approaches for a set of representative closed and\nopen-source LLMs. 
Our evaluation sheds light on the trade-offs between database\nmodels and query languages for different ICL strategies and LLMs. Last,\nSM3-Text-to-Query is easily extendable to additional query languages or real,\nstandard-based patient databases.\n","authors":["Sithursan Sivasubramaniam","Cedric Osei-Akoto","Yi Zhang","Kurt Stockinger","Jonathan Fuerst"],"pdf_url":"https://arxiv.org/pdf/2411.05521v2.pdf","comment":"NeurIPS 2024 Track Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.09302v1","updated":"2024-11-14T09:23:58Z","published":"2024-11-14T09:23:58Z","title":"EEG-Based Speech Decoding: A Novel Approach Using Multi-Kernel Ensemble\n Diffusion Models","summary":" In this study, we propose an ensemble learning framework for\nelectroencephalogram-based overt speech classification, leveraging denoising\ndiffusion probabilistic models with varying convolutional kernel sizes. The\nensemble comprises three models with kernel sizes of 51, 101, and 201,\neffectively capturing multi-scale temporal features inherent in signals. This\napproach improves the robustness and accuracy of speech decoding by\naccommodating the rich temporal complexity of neural signals. The ensemble\nmodels work in conjunction with conditional autoencoders that refine the\nreconstructed signals and maximize the useful information for downstream\nclassification tasks. The results indicate that the proposed ensemble-based\napproach significantly outperforms individual models and existing\nstate-of-the-art techniques. 
These findings demonstrate the potential of\nensemble methods in advancing brain signal decoding, offering new possibilities\nfor non-verbal communication applications, particularly in brain-computer\ninterface systems aimed at aiding individuals with speech impairments.\n","authors":["Soowon Kim","Ha-Na Jo","Eunyeong Ko"],"pdf_url":"https://arxiv.org/pdf/2411.09302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03017v2","updated":"2024-11-14T09:19:43Z","published":"2024-02-05T13:55:54Z","title":"Toward Green and Human-Like Artificial Intelligence: A Complete Survey\n on Contemporary Few-Shot Learning Approaches","summary":" Despite deep learning's widespread success, its data-hungry and\ncomputationally expensive nature makes it impractical for many data-constrained\nreal-world applications. Few-Shot Learning (FSL) aims to address these\nlimitations by enabling rapid adaptation to novel learning tasks, seeing\nsignificant growth in recent years. This survey provides a comprehensive\noverview of the field's latest advancements. Initially, FSL is formally\ndefined, and its relationship with different learning fields is presented. A\nnovel taxonomy is introduced, extending previously proposed ones, and\nreal-world applications in classic and novel fields are described. Finally,\nrecent trends shaping the field, outstanding challenges, and promising future\nresearch directions are discussed.\n","authors":["Georgios Tsoumplekas","Vladislav Li","Panagiotis Sarigiannidis","Vasileios Argyriou"],"pdf_url":"https://arxiv.org/pdf/2402.03017v2.pdf","comment":"35 pages, 9 figures. Submitted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2410.14979v4","updated":"2024-11-14T09:17:48Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? 
An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09294v1","updated":"2024-11-14T09:12:38Z","published":"2024-11-14T09:12:38Z","title":"Learning Hand State Estimation for a Light Exoskeleton","summary":" We propose a machine learning-based estimator of the hand state for\nrehabilitation purposes, using light exoskeletons. These devices are easy to\nuse and useful for delivering domestic and frequent therapies. 
We build a\nsupervised approach using information from the muscular activity of the forearm\nand the motion of the exoskeleton to reconstruct the hand's opening degree and\ncompliance level. Such information can be used to evaluate the therapy progress\nand develop adaptive control behaviors. Our approach is validated with a real\nlight exoskeleton. The experiments demonstrate good predictive performance of\nour approach when trained on data coming from a single user and tested on the\nsame user, even across different sessions. This generalization capability makes\nour system promising for practical use in real rehabilitation.\n","authors":["Gabriele Abbate","Alessandro Giusti","Luca Randazzo","Antonio Paolillo"],"pdf_url":"https://arxiv.org/pdf/2411.09294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08434v2","updated":"2024-11-14T09:11:26Z","published":"2024-04-12T12:31:06Z","title":"An improved tabular data generator with VAE-GMM integration","summary":" The rising use of machine learning in various fields requires robust methods\nto create synthetic tabular data. Data should preserve key characteristics\nwhile addressing data scarcity challenges. Current approaches based on\nGenerative Adversarial Networks, such as the state-of-the-art CTGAN model,\nstruggle with the complex structures inherent in tabular data. These data often\ncontain both continuous and discrete features with non-Gaussian distributions.\nTherefore, we propose a novel Variational Autoencoder (VAE)-based model that\naddresses these limitations. Inspired by the TVAE model, our approach\nincorporates a Bayesian Gaussian Mixture model (BGM) within the VAE\narchitecture. This avoids the limitations imposed by assuming a strictly\nGaussian latent space, allowing for a more accurate representation of the\nunderlying data distribution during data generation. 
Furthermore, our model\noffers enhanced flexibility by allowing the use of various differentiable\ndistributions for individual features, making it possible to handle both\ncontinuous and discrete data types. We thoroughly validate our model on three\nreal-world datasets with mixed data types, including two medically relevant\nones, based on their resemblance and utility. This evaluation demonstrates\nsignificant outperformance against CTGAN and TVAE, establishing its potential\nas a valuable tool for generating synthetic tabular data in various domains,\nparticularly in healthcare.\n","authors":["Patricia A. Apellániz","Juan Parras","Santiago Zazo"],"pdf_url":"https://arxiv.org/pdf/2404.08434v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.09289v1","updated":"2024-11-14T09:03:54Z","published":"2024-11-14T09:03:54Z","title":"StreamAdapter: Efficient Test Time Adaptation from Contextual Streams","summary":" In-context learning (ICL) allows large language models (LLMs) to adapt to new\ntasks directly from the given demonstrations without requiring gradient\nupdates. While recent advances have expanded context windows to accommodate\nmore demonstrations, this approach increases inference costs without\nnecessarily improving performance. To mitigate these issues, We propose\nStreamAdapter, a novel approach that directly updates model parameters from\ncontext at test time, eliminating the need for explicit in-context\ndemonstrations. StreamAdapter employs context mapping and weight absorption\nmechanisms to dynamically transform ICL demonstrations into parameter updates\nwith minimal additional parameters. By reducing reliance on numerous in-context\nexamples, StreamAdapter significantly reduce inference costs and allows for\nefficient inference with constant time complexity, regardless of demonstration\ncount. 
Extensive experiments across diverse tasks and model architectures\ndemonstrate that StreamAdapter achieves comparable or superior adaptation\ncapability to ICL while requiring significantly fewer demonstrations. The\nsuperior task adaptation and context encoding capabilities of StreamAdapter on\nboth language understanding and generation tasks provides a new perspective for\nadapting LLMs at test time using context, allowing for more efficient\nadaptation across scenarios and more cost-effective inference\n","authors":["Dilxat Muhtar","Yelong Shen","Yaming Yang","Xiaodong Liu","Yadong Lu","Jianfeng Liu","Yuefeng Zhan","Hao Sun","Weiwei Deng","Feng Sun","Xueliang Zhang","Jianfeng Gao","Weizhu Chen","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09289v1.pdf","comment":"22 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2411.09273v1","updated":"2024-11-14T08:22:42Z","published":"2024-11-14T08:22:42Z","title":"Cross-Modal Consistency in Multimodal Large Language Models","summary":" Recent developments in multimodal methodologies have marked the beginning of\nan exciting era for models adept at processing diverse data types, encompassing\ntext, audio, and visual content. Models like GPT-4V, which merge computer\nvision with advanced language processing, exhibit extraordinary proficiency in\nhandling intricate tasks that require a simultaneous understanding of both\ntextual and visual information. Prior research efforts have meticulously\nevaluated the efficacy of these Vision Large Language Models (VLLMs) in various\ndomains, including object detection, image captioning, and other related\nfields. 
However, existing analyses have often suffered from limitations,\nprimarily centering on the isolated evaluation of each modality's performance\nwhile neglecting to explore their intricate cross-modal interactions.\nSpecifically, the question of whether these models achieve the same level of\naccuracy when confronted with identical task instances across different\nmodalities remains unanswered. In this study, we take the initiative to delve\ninto the interaction and comparison among these modalities of interest by\nintroducing a novel concept termed cross-modal consistency. Furthermore, we\npropose a quantitative evaluation framework founded on this concept. Our\nexperimental findings, drawn from a curated collection of parallel\nvision-language datasets developed by us, unveil a pronounced inconsistency\nbetween the vision and language modalities within GPT-4V, despite its portrayal\nas a unified multimodal model. Our research yields insights into the\nappropriate utilization of such models and hints at potential avenues for\nenhancing their design.\n","authors":["Xiang Zhang","Senyu Li","Ning Shi","Bradley Hauer","Zijun Wu","Grzegorz Kondrak","Muhammad Abdul-Mageed","Laks V. S. Lakshmanan"],"pdf_url":"https://arxiv.org/pdf/2411.09273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07176v2","updated":"2024-11-14T08:20:22Z","published":"2024-11-11T17:56:28Z","title":"More Expressive Attention with Negative Weights","summary":" We propose a novel attention mechanism, named Cog Attention, that enables\nattention weights to be negative for enhanced expressiveness, which stems from\ntwo key factors: (1) Cog Attention can shift the token deletion and copying\nfunction from a static OV matrix to dynamic QK inner products, with the OV\nmatrix now focusing more on refinement or modification. The attention head can\nsimultaneously delete, copy, or retain tokens by assigning them negative,\npositive, or minimal attention weights, respectively. 
As a result, a single\nattention head becomes more flexible and expressive. (2) Cog Attention improves\nthe model's robustness against representational collapse, which can occur when\nearlier tokens are over-squashed into later positions, leading to homogeneous\nrepresentations. Negative weights reduce effective information paths from\nearlier to later tokens, helping to mitigate this issue. We develop\nTransformer-like models which use Cog Attention as attention modules, including\ndecoder-only models for language modeling and U-ViT diffusion models for image\ngeneration. Experiments show that models using Cog Attention exhibit superior\nperformance compared to those employing traditional softmax attention modules.\nOur approach suggests a promising research direction for rethinking and\nbreaking the entrenched constraints of traditional softmax attention, such as\nthe requirement for non-negative weights.\n","authors":["Ang Lv","Ruobing Xie","Shuaipeng Li","Jiayi Liao","Xingwu Sun","Zhanhui Kang","Di Wang","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2411.07176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09269v1","updated":"2024-11-14T08:12:36Z","published":"2024-11-14T08:12:36Z","title":"Harnessing multiple LLMs for Information Retrieval: A case study on Deep\n Learning methodologies in Biodiversity publications","summary":" Deep Learning (DL) techniques are increasingly applied in scientific studies\nacross various domains to address complex research questions. However, the\nmethodological details of these DL models are often hidden in the unstructured\ntext. As a result, critical information about how these models are designed,\ntrained, and evaluated is challenging to access and comprehend. 
To address this\nissue, in this work, we use five different open-source Large Language Models\n(LLMs): Llama-3 70B, Llama-3.1 70B, Mixtral-8x22B-Instruct-v0.1, Mixtral 8x7B,\nand Gemma 2 9B in combination with Retrieval-Augmented Generation (RAG)\napproach to extract and process DL methodological details from scientific\npublications automatically. We built a voting classifier from the outputs of\nfive LLMs to accurately report DL methodological information. We tested our\napproach using biodiversity publications, building upon our previous research.\nTo validate our pipeline, we employed two datasets of DL-related biodiversity\npublications: a curated set of 100 publications from our prior work and a set\nof 364 publications from the Ecological Informatics journal. Our results\ndemonstrate that the multi-LLM, RAG-assisted pipeline enhances the retrieval of\nDL methodological information, achieving an accuracy of 69.5% (417 out of 600\ncomparisons) based solely on textual content from publications. This\nperformance was assessed against human annotators who had access to code,\nfigures, tables, and other supplementary information. Although demonstrated in\nbiodiversity, our methodology is not limited to this field; it can be applied\nacross other scientific domains where detailed methodological reporting is\nessential for advancing knowledge and ensuring reproducibility. 
This study\npresents a scalable and reliable approach for automating information\nextraction, facilitating better reproducibility and knowledge transfer across\nstudies.\n","authors":["Vamsi Krishna Kommineni","Birgitta König-Ries","Sheeba Samuel"],"pdf_url":"https://arxiv.org/pdf/2411.09269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09266v1","updated":"2024-11-14T08:07:02Z","published":"2024-11-14T08:07:02Z","title":"How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative\n Study of ChatGPT, AI Models and Human Perception","summary":" Multimodal deepfakes involving audiovisual manipulations are a growing threat\nbecause they are difficult to detect with the naked eye or using unimodal deep\nlearningbased forgery detection methods. Audiovisual forensic models, while\nmore capable than unimodal models, require large training datasets and are\ncomputationally expensive for training and inference. Furthermore, these models\nlack interpretability and often do not generalize well to unseen manipulations.\nIn this study, we examine the detection capabilities of a large language model\n(LLM) (i.e., ChatGPT) to identify and account for any possible visual and\nauditory artifacts and manipulations in audiovisual deepfake content. Extensive\nexperiments are conducted on videos from a benchmark multimodal deepfake\ndataset to evaluate the detection performance of ChatGPT and compare it with\nthe detection capabilities of state-of-the-art multimodal forensic models and\nhumans. Experimental results demonstrate the importance of domain knowledge and\nprompt engineering for video forgery detection tasks using LLMs. Unlike\napproaches based on end-to-end learning, ChatGPT can account for spatial and\nspatiotemporal artifacts and inconsistencies that may exist within or across\nmodalities. 
Additionally, we discuss the limitations of ChatGPT for multimedia\nforensic tasks.\n","authors":["Sahibzada Adil Shahzad","Ammarah Hashmi","Yan-Tsung Peng","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09276v2","updated":"2024-11-14T08:06:46Z","published":"2024-05-15T11:46:47Z","title":"Dual-Segment Clustering Strategy for Hierarchical Federated Learning in\n Heterogeneous Wireless Environments","summary":" Non-independent and identically distributed (Non- IID) data adversely affects\nfederated learning (FL) while heterogeneity in communication quality can\nundermine the reliability of model parameter transmission, potentially\ndegrading wireless FL convergence. This paper proposes a novel dual-segment\nclustering (DSC) strategy that jointly addresses communication and data\nheterogeneity in FL. This is achieved by defining a new signal-to-noise ratio\n(SNR) matrix and information quantity matrix to capture the communication and\ndata heterogeneity, respectively. The celebrated affinity propagation algorithm\nis leveraged to iteratively refine the clustering of clients based on the newly\ndefined matrices effectively enhancing model aggregation in heterogeneous\nenvironments. 
The convergence analysis and experimental results show that the\nDSC strategy can improve the convergence rate of wireless FL and demonstrate\nsuperior accuracy in heterogeneous environments compared to classical\nclustering methods.\n","authors":["Pengcheng Sun","Erwu Liu","Wei Ni","Kanglei Yu","Xinyu Qu","Rui Wang","Yanlong Bi","Chuanchun Zhang","Abbas Jamalipour"],"pdf_url":"https://arxiv.org/pdf/2405.09276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09261v1","updated":"2024-11-14T07:58:44Z","published":"2024-11-14T07:58:44Z","title":"Automating Autograding: Large Language Models as Test Suite Generators\n for Introductory Programming","summary":" Automatically graded programming assignments provide instant feedback to\nstudents and significantly reduce manual grading time for instructors. However,\ncreating comprehensive suites of test cases for programming problems within\nautomatic graders can be time-consuming and complex. The effort needed to\ndefine test suites may deter some instructors from creating additional problems\nor lead to inadequate test coverage, potentially resulting in misleading\nfeedback on student solutions. Such limitations may reduce student access to\nthe well-documented benefits of timely feedback when learning programming.\n In this work, we evaluate the effectiveness of using Large Language Models\n(LLMs), as part of a larger workflow, to automatically generate test suites for\nCS1-level programming problems. Each problem's statement and reference solution\nare provided to GPT-4 to produce a test suite that can be used by an\nautograder. We evaluate our proposed approach using a sample of 26 problems,\nand more than 25,000 attempted solutions to those problems, submitted by\nstudents in an introductory programming course. We compare the performance of\nthe LLM-generated test suites against the instructor-created test suites for\neach problem. 
Our findings reveal that LLM-generated test suites can correctly\nidentify most valid solutions, and for most problems are at least as\ncomprehensive as the instructor test suites. Additionally, the LLM-generated\ntest suites exposed ambiguities in some problem statements, underscoring their\npotential to improve both autograding and instructional design.\n","authors":["Umar Alkafaween","Ibrahim Albluwi","Paul Denny"],"pdf_url":"https://arxiv.org/pdf/2411.09261v1.pdf","comment":"Submitted to Journal of Computer Assisted Learning"},{"id":"http://arxiv.org/abs/2411.09251v1","updated":"2024-11-14T07:34:31Z","published":"2024-11-14T07:34:31Z","title":"Cross Space and Time: A Spatio-Temporal Unitized Model for Traffic Flow\n Forecasting","summary":" Predicting spatio-temporal traffic flow presents significant challenges due\nto complex interactions between spatial and temporal factors. Existing\napproaches often address these dimensions in isolation, neglecting their\ncritical interdependencies. In this paper, we introduce the Spatio-Temporal\nUnitized Model (STUM), a unified framework designed to capture both spatial and\ntemporal dependencies while addressing spatio-temporal heterogeneity through\ntechniques such as distribution alignment and feature fusion. It also ensures\nboth predictive accuracy and computational efficiency. Central to STUM is the\nAdaptive Spatio-temporal Unitized Cell (ASTUC), which utilizes low-rank\nmatrices to seamlessly store, update, and interact with space, time, as well as\ntheir correlations. Our framework is also modular, allowing it to integrate\nwith various spatio-temporal graph neural networks through components such as\nbackbone models, feature extractors, residual fusion blocks, and predictive\nmodules to collectively enhance forecasting outcomes. Experimental results\nacross multiple real-world datasets demonstrate that STUM consistently improves\nprediction performance with minimal computational cost. 
These findings are\nfurther supported by hyperparameter optimization, pre-training analysis, and\nresult visualization. We provide our source code for reproducibility at\nhttps://anonymous.4open.science/r/STUM-E4F0.\n","authors":["Weilin Ruan","Wenzhuo Wang","Siru Zhong","Wei Chen","Li Liu","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.09251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09249v1","updated":"2024-11-14T07:28:09Z","published":"2024-11-14T07:28:09Z","title":"Enhancing Financial Domain Adaptation of Language Models via Model\n Augmentation","summary":" The domain adaptation of language models, including large language models\n(LLMs), has become increasingly important as the use of such models continues\nto expand. This study demonstrates the effectiveness of Composition to Augment\nLanguage Models (CALM) in adapting to the financial domain. CALM is a model to\nextend the capabilities of existing models by introducing cross-attention\nbetween two LLMs with different functions. In our experiments, we developed a\nCALM to enhance the financial performance of an LLM with strong response\ncapabilities by leveraging a financial-specialized LLM. Notably, the CALM was\ntrained using a financial dataset different from the one used to train the\nfinancial-specialized LLM, confirming CALM's ability to adapt to various\ndatasets. The models were evaluated through quantitative Japanese financial\nbenchmarks and qualitative response comparisons, demonstrating that CALM\nenables superior responses with higher scores than the original models and\nbaselines. Additionally, comparative experiments on connection points revealed\nthat connecting the middle layers of the models is most effective in\nfacilitating adaptation to the financial domain. 
These findings confirm that\nCALM is a practical approach for adapting LLMs to the financial domain.\n","authors":["Kota Tanabe","Masanori Hirano","Kazuki Matoya","Kentaro Imajo","Hiroki Sakaji","Itsuki Noda"],"pdf_url":"https://arxiv.org/pdf/2411.09249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09243v1","updated":"2024-11-14T07:20:08Z","published":"2024-11-14T07:20:08Z","title":"Towards Unified Neural Decoding of Perceived, Spoken and Imagined Speech\n from EEG Signals","summary":" Brain signals accompany various information relevant to human actions and\nmental imagery, making them crucial to interpreting and understanding human\nintentions. Brain-computer interface technology leverages this brain activity\nto generate external commands for controlling the environment, offering\ncritical advantages to individuals with paralysis or locked-in syndrome. Within\nthe brain-computer interface domain, brain-to-speech research has gained\nattention, focusing on the direct synthesis of audible speech from brain\nsignals. Most current studies decode speech from brain activity using invasive\ntechniques and emphasize spoken speech data. However, humans express various\nspeech states, and distinguishing these states through non-invasive approaches\nremains a significant yet challenging task. This research investigated the\neffectiveness of deep learning models for non-invasive-based neural signal\ndecoding, with an emphasis on distinguishing between different speech\nparadigms, including perceived, overt, whispered, and imagined speech, across\nmultiple frequency bands. The model utilizing the spatial conventional neural\nnetwork module demonstrated superior performance compared to other models,\nespecially in the gamma band. 
Additionally, imagined speech in the theta\nfrequency band, where deep learning also showed strong effects, exhibited\nstatistically significant differences compared to the other speech paradigms.\n","authors":["Jung-Sun Lee","Ha-Na Jo","Seo-Hyun Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09224v1","updated":"2024-11-14T06:40:55Z","published":"2024-11-14T06:40:55Z","title":"Programming with AI: Evaluating ChatGPT, Gemini, AlphaCode, and GitHub\n Copilot for Programmers","summary":" Our everyday lives now heavily rely on artificial intelligence (AI) powered\nlarge language models (LLMs). Like regular users, programmers are also\nbenefiting from the newest large language models. In response to the critical\nrole that AI models play in modern software development, this study presents a\nthorough evaluation of leading programming assistants, including ChatGPT,\nGemini(Bard AI), AlphaCode, and GitHub Copilot. The evaluation is based on\ntasks like natural language processing and code generation accuracy in\ndifferent programming languages like Java, Python and C++. Based on the\nresults, it has emphasized their strengths and weaknesses and the importance of\nfurther modifications to increase the reliability and accuracy of the latest\npopular models. Although these AI assistants illustrate a high level of\nprogress in language understanding and code generation, along with ethical\nconsiderations and responsible usage, they provoke a necessity for discussion.\nWith time, developing more refined AI technology is essential for achieving\nadvanced solutions in various fields, especially with the knowledge of the\nfeature intricacies of these models and their implications. This study offers a\ncomparison of different LLMs and provides essential feedback on the rapidly\nchanging area of AI models. 
It also emphasizes the need for ethical\ndevelopmental practices to actualize AI models' full potential.\n","authors":["Md Kamrul Siam","Huanying Gu","Jerry Q. Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.09224v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.07032v2","updated":"2024-11-14T06:36:57Z","published":"2024-03-11T04:56:10Z","title":"STARFlow: Spatial Temporal Feature Re-embedding with Attentive Learning\n for Real-world Scene Flow","summary":" Scene flow prediction is a crucial underlying task in understanding dynamic\nscenes as it offers fundamental motion information. However, contemporary scene\nflow methods encounter three major challenges. Firstly, flow estimation solely\nbased on local receptive fields lacks long-dependency matching of point pairs.\nTo address this issue, we propose global attentive flow embedding to match\nall-to-all point pairs in both feature space and Euclidean space, providing\nglobal initialization before local refinement. Secondly, there are deformations\nexisting in non-rigid objects after warping, which leads to variations in the\nspatiotemporal relation between the consecutive frames. For a more precise\nestimation of residual flow, a spatial temporal feature re-embedding module is\ndevised to acquire the sequence features after deformation. Furthermore,\nprevious methods perform poor generalization due to the significant domain gap\nbetween the synthesized and LiDAR-scanned datasets. We leverage novel domain\nadaptive losses to effectively bridge the gap of motion inference from\nsynthetic to real-world. Experiments demonstrate that our approach achieves\nstate-of-the-art performance across various datasets, with particularly\noutstanding results on real-world LiDAR-scanned datasets. 
Our code is available\nat https://github.com/O-VIGIA/StarFlow.\n","authors":["Zhiyang Lu","Qinghan Chen","Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.07032v2.pdf","comment":"This paper was renamed to:\"SSRFlow: Semantic-aware Fusion with\n Spatial Temporal Re-embedding for Real-world Scene Flow\" [arXiv:2408.07825]\n and was accepted in 3DV 2025"},{"id":"http://arxiv.org/abs/2409.15750v3","updated":"2024-11-14T06:33:26Z","published":"2024-09-24T05:12:10Z","title":"The Roles of Generative Artificial Intelligence in Internet of Electric\n Vehicles","summary":" With the advancements of generative artificial intelligence (GenAI) models,\ntheir capabilities are expanding significantly beyond content generation and\nthe models are increasingly being used across diverse applications.\nParticularly, GenAI shows great potential in addressing challenges in the\nelectric vehicle (EV) ecosystem ranging from charging management to\ncyber-attack prevention. In this paper, we specifically consider Internet of\nelectric vehicles (IoEV) and we categorize GenAI for IoEV into four different\nlayers namely, EV's battery layer, individual EV layer, smart grid layer, and\nsecurity layer. We introduce various GenAI techniques used in each layer of\nIoEV applications. Subsequently, public datasets available for training the\nGenAI models are summarized. Finally, we provide recommendations for future\ndirections. This survey not only categorizes the applications of GenAI in IoEV\nacross different layers but also serves as a valuable resource for researchers\nand practitioners by highlighting the design and implementation challenges\nwithin each layer. 
Furthermore, it provides a roadmap for future research\ndirections, enabling the development of more robust and efficient IoEV systems\nthrough the integration of advanced GenAI techniques.\n","authors":["Hanwen Zhang","Dusit Niyato","Wei Zhang","Changyuan Zhao","Hongyang Du","Abbas Jamalipour","Sumei Sun","Yiyang Pei"],"pdf_url":"https://arxiv.org/pdf/2409.15750v3.pdf","comment":"25 Pages"},{"id":"http://arxiv.org/abs/2411.09220v1","updated":"2024-11-14T06:32:31Z","published":"2024-11-14T06:32:31Z","title":"Transferable Adversarial Attacks against ASR","summary":" Given the extensive research and real-world applications of automatic speech\nrecognition (ASR), ensuring the robustness of ASR models against minor input\nperturbations becomes a crucial consideration for maintaining their\neffectiveness in real-time scenarios. Previous explorations into ASR model\nrobustness have predominantly revolved around evaluating accuracy on white-box\nsettings with full access to ASR models. Nevertheless, full ASR model details\nare often not available in real-world applications. Therefore, evaluating the\nrobustness of black-box ASR models is essential for a comprehensive\nunderstanding of ASR model resilience. In this regard, we thoroughly study the\nvulnerability of practical black-box attacks in cutting-edge ASR models and\npropose to employ two advanced time-domain-based transferable attacks alongside\nour differentiable feature extractor. We also propose a speech-aware gradient\noptimization approach (SAGO) for ASR, which forces mistranscription with\nminimal impact on human imperceptibility through voice activity detection rule\nand a speech-aware gradient-oriented optimizer. 
Our comprehensive experimental\nresults reveal performance enhancements compared to baseline approaches across\nfive models on two databases.\n","authors":["Xiaoxue Gao","Zexin Li","Yiming Chen","Cong Liu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2411.09220v1.pdf","comment":"IEEE SPL"},{"id":"http://arxiv.org/abs/2411.09213v1","updated":"2024-11-14T06:19:18Z","published":"2024-11-14T06:19:18Z","title":"Comprehensive and Practical Evaluation of Retrieval-Augmented Generation\n Systems for Medical Question Answering","summary":" Retrieval-augmented generation (RAG) has emerged as a promising approach to\nenhance the performance of large language models (LLMs) in knowledge-intensive\ntasks such as those from medical domain. However, the sensitive nature of the\nmedical domain necessitates a completely accurate and trustworthy system. While\nexisting RAG benchmarks primarily focus on the standard retrieve-answer\nsetting, they overlook many practical scenarios that measure crucial aspects of\na reliable medical system. This paper addresses this gap by providing a\ncomprehensive evaluation framework for medical question-answering (QA) systems\nin a RAG setting for these situations, including sufficiency, integration, and\nrobustness. We introduce Medical Retrieval-Augmented Generation Benchmark\n(MedRGB) that provides various supplementary elements to four medical QA\ndatasets for testing LLMs' ability to handle these specific scenarios.\nUtilizing MedRGB, we conduct extensive evaluations of both state-of-the-art\ncommercial LLMs and open-source models across multiple retrieval conditions.\nOur experimental results reveals current models' limited ability to handle\nnoise and misinformation in the retrieved documents. 
We further analyze the\nLLMs' reasoning processes to provides valuable insights and future directions\nfor developing RAG systems in this critical medical domain.\n","authors":["Nghia Trung Ngo","Chien Van Nguyen","Franck Dernoncourt","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09211v1","updated":"2024-11-14T06:15:05Z","published":"2024-11-14T06:15:05Z","title":"Dynamic Neural Communication: Convergence of Computer Vision and\n Brain-Computer Interface","summary":" Interpreting human neural signals to decode static speech intentions such as\ntext or images and dynamic speech intentions such as audio or video is showing\ngreat potential as an innovative communication tool. Human communication\naccompanies various features, such as articulatory movements, facial\nexpressions, and internal speech, all of which are reflected in neural signals.\nHowever, most studies only generate short or fragmented outputs, while\nproviding informative communication by leveraging various features from neural\nsignals remains challenging. In this study, we introduce a dynamic neural\ncommunication method that leverages current computer vision and brain-computer\ninterface technologies. Our approach captures the user's intentions from neural\nsignals and decodes visemes in short time steps to produce dynamic visual\noutputs. 
The results demonstrate the potential to rapidly capture and\nreconstruct lip movements during natural speech attempts from human neural\nsignals, enabling dynamic neural communication through the convergence of\ncomputer vision and brain--computer interface.\n","authors":["Ji-Ha Park","Seo-Hyun Lee","Soowon Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09211v1.pdf","comment":"4 pages, 2 figures, 1 table, Name of Conference: International\n Conference on Brain-Computer Interface"},{"id":"http://arxiv.org/abs/2411.09204v1","updated":"2024-11-14T06:03:54Z","published":"2024-11-14T06:03:54Z","title":"RibCageImp: A Deep Learning Framework for 3D Ribcage Implant Generation","summary":" The recovery of damaged or resected ribcage structures requires precise,\ncustom-designed implants to restore the integrity and functionality of the\nthoracic cavity. Traditional implant design methods rely mainly on manual\nprocesses, making them time-consuming and susceptible to variability. In this\nwork, we explore the feasibility of automated ribcage implant generation using\ndeep learning. We present a framework based on 3D U-Net architecture that\nprocesses CT scans to generate patient-specific implant designs. To the best of\nour knowledge, this is the first investigation into automated thoracic implant\ngeneration using deep learning approaches. Our preliminary results, while\nmoderate, highlight both the potential and the significant challenges in this\ncomplex domain. 
These findings establish a foundation for future research in\nautomated ribcage reconstruction and identify key technical challenges that\nneed to be addressed for practical implementation.\n","authors":["Gyanendra Chaubey","Aiman Farooq","Azad Singh","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2411.09204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08504v2","updated":"2024-11-14T05:51:26Z","published":"2024-11-13T10:42:11Z","title":"Towards Objective and Unbiased Decision Assessments with LLM-Enhanced\n Hierarchical Attention Networks","summary":" How objective and unbiased are we while making decisions? This work\ninvestigates cognitive bias identification in high-stake decision making\nprocess by human experts, questioning its effectiveness in real-world settings,\nsuch as candidates assessments for university admission. We begin with a\nstatistical analysis assessing correlations among different decision points\namong in the current process, which discovers discrepancies that imply\ncognitive bias and inconsistency in decisions. This motivates our exploration\nof bias-aware AI-augmented workflow that surpass human judgment. We propose\nBGM-HAN, an enhanced Hierarchical Attention Network with Byte-Pair Encoding,\nGated Residual Connections and Multi-Head Attention. Using it as a backbone\nmodel, we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow,\nwhich simulate real-world decision-making. 
In our experiments, both the\nproposed model and the agentic workflow significantly improves on both human\njudgment and alternative models, validated with real-world data.\n","authors":["Junhua Liu","Kwan Hui Lim","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08504v2.pdf","comment":"Source code is available at: https://github.com/junhua/bgm-han"},{"id":"http://arxiv.org/abs/2411.06493v2","updated":"2024-11-14T05:34:13Z","published":"2024-11-10T15:21:30Z","title":"LProtector: An LLM-driven Vulnerability Detection System","summary":" This paper presents LProtector, an automated vulnerability detection system\nfor C/C++ codebases driven by the large language model (LLM) GPT-4o and\nRetrieval-Augmented Generation (RAG). As software complexity grows, traditional\nmethods face challenges in detecting vulnerabilities effectively. LProtector\nleverages GPT-4o's powerful code comprehension and generation capabilities to\nperform binary classification and identify vulnerabilities within target\ncodebases. We conducted experiments on the Big-Vul dataset, showing that\nLProtector outperforms two state-of-the-art baselines in terms of F1 score,\ndemonstrating the potential of integrating LLMs with vulnerability detection.\n","authors":["Ze Sheng","Fenghua Wu","Xiangwu Zuo","Chao Li","Yuxin Qiao","Lei Hang"],"pdf_url":"https://arxiv.org/pdf/2411.06493v2.pdf","comment":"5 pages, 4 figures. This is a preprint version of the article. The\n final version will be published in the proceedings of the IEEE conference"},{"id":"http://arxiv.org/abs/2411.09189v1","updated":"2024-11-14T05:05:36Z","published":"2024-11-14T05:05:36Z","title":"Improvement and Implementation of a Speech Emotion Recognition Model\n Based on Dual-Layer LSTM","summary":" This paper builds upon an existing speech emotion recognition model by adding\nan additional LSTM layer to improve the accuracy and processing efficiency of\nemotion recognition from audio data. 
By capturing the long-term dependencies\nwithin audio sequences through a dual-layer LSTM network, the model can\nrecognize and classify complex emotional patterns more accurately. Experiments\nconducted on the RAVDESS dataset validated this approach, showing that the\nmodified dual layer LSTM model improves accuracy by 2% compared to the\nsingle-layer LSTM while significantly reducing recognition latency, thereby\nenhancing real-time performance. These results indicate that the dual-layer\nLSTM architecture is highly suitable for handling emotional features with\nlong-term dependencies, providing a viable optimization for speech emotion\nrecognition systems. This research provides a reference for practical\napplications in fields like intelligent customer service, sentiment analysis\nand human-computer interaction.\n","authors":["Xiaoran Yang","Shuhan Yu","Wenxi Xu"],"pdf_url":"https://arxiv.org/pdf/2411.09189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09184v1","updated":"2024-11-14T04:46:08Z","published":"2024-11-14T04:46:08Z","title":"Dynamic technology impact analysis: A multi-task learning approach to\n patent citation prediction","summary":" Machine learning (ML) models are valuable tools for analyzing the impact of\ntechnology using patent citation information. However, existing ML-based\nmethods often struggle to account for the dynamic nature of the technology\nimpact over time and the interdependencies of these impacts across different\nperiods. This study proposes a multi-task learning (MTL) approach to enhance\nthe prediction of technology impact across various time frames by leveraging\nknowledge sharing and simultaneously monitoring the evolution of technology\nimpact. First, we quantify the technology impacts and identify patterns through\ncitation analysis over distinct time periods. Next, we develop MTL models to\npredict citation counts using multiple patent indicators over time. 
Finally, we\nexamine the changes in key input indicators and their patterns over different\nperiods using the SHapley Additive exPlanation method. We also offer guidelines\nfor validating and interpreting the results by employing statistical methods\nand natural language processing techniques. A case study on battery\ntechnologies demonstrates that our approach not only deepens the understanding\nof technology impact, but also improves prediction accuracy, yielding valuable\ninsights for both academia and industry.\n","authors":["Youngjin Seol","Jaewoong Choi","Seunghyun Lee","Janghyeok Yoon"],"pdf_url":"https://arxiv.org/pdf/2411.09184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09181v1","updated":"2024-11-14T04:39:30Z","published":"2024-11-14T04:39:30Z","title":"DeBaTeR: Denoising Bipartite Temporal Graph for Recommendation","summary":" Due to the difficulty of acquiring large-scale explicit user feedback,\nimplicit feedback (e.g., clicks or other interactions) is widely applied as an\nalternative source of data, where user-item interactions can be modeled as a\nbipartite graph. Due to the noisy and biased nature of implicit real-world\nuser-item interactions, identifying and rectifying noisy interactions are vital\nto enhance model performance and robustness. Previous works on purifying\nuser-item interactions in collaborative filtering mainly focus on mining the\ncorrelation between user/item embeddings and noisy interactions, neglecting the\nbenefit of temporal patterns in determining noisy interactions. Time\ninformation, while enhancing the model utility, also bears its natural\nadvantage in helping to determine noisy edges, e.g., if someone usually watches\nhorror movies at night and talk shows in the morning, a record of watching a\nhorror movie in the morning is more likely to be noisy interaction. 
Armed with\nthis observation, we introduce a simple yet effective mechanism for generating\ntime-aware user/item embeddings and propose two strategies for denoising\nbipartite temporal graph in recommender systems (DeBaTeR): the first is through\nreweighting the adjacency matrix (DeBaTeR-A), where a reliability score is\ndefined to reweight the edges through both soft assignment and hard assignment;\nthe second is through reweighting the loss function (DeBaTeR-L), where weights\nare generated to reweight user-item samples in the losses. Extensive\nexperiments have been conducted to demonstrate the efficacy of our methods and\nillustrate how time information indeed helps identifying noisy edges.\n","authors":["Xinyu He","Jose Sepulveda","Mostafa Rahmani","Alyssa Woo","Fei Wang","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2411.09181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09180v1","updated":"2024-11-14T04:39:10Z","published":"2024-11-14T04:39:10Z","title":"LEAP:D - A Novel Prompt-based Approach for Domain-Generalized Aerial\n Object Detection","summary":" Drone-captured images present significant challenges in object detection due\nto varying shooting conditions, which can alter object appearance and shape.\nFactors such as drone altitude, angle, and weather cause these variations,\ninfluencing the performance of object detection algorithms. To tackle these\nchallenges, we introduce an innovative vision-language approach using learnable\nprompts. This shift from conventional manual prompts aims to reduce\ndomain-specific knowledge interference, ultimately improving object detection\ncapabilities. Furthermore, we streamline the training process with a one-step\napproach, updating the learnable prompt concurrently with model training,\nenhancing efficiency without compromising performance. Our study contributes to\ndomain-generalized object detection by leveraging learnable prompts and\noptimizing training processes. 
This enhances model robustness and adaptability\nacross diverse environments, leading to more effective aerial object detection.\n","authors":["Chanyeong Park","Heegwang Kim","Joonki Paik"],"pdf_url":"https://arxiv.org/pdf/2411.09180v1.pdf","comment":"ICIP 2024 Workshop accepted paper"},{"id":"http://arxiv.org/abs/2411.09176v1","updated":"2024-11-14T04:29:07Z","published":"2024-11-14T04:29:07Z","title":"Gazing at Rewards: Eye Movements as a Lens into Human and AI\n Decision-Making in Hybrid Visual Foraging","summary":" Imagine searching a collection of coins for quarters ($0.25$), dimes\n($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where\nobservers look for multiple instances of multiple target types. In such tasks,\nhow do target values and their prevalence influence foraging and eye movement\nbehaviors (e.g., should you prioritize rare quarters or common nickels)? To\nexplore this, we conducted human psychophysics experiments, revealing that\nhumans are proficient reward foragers. Their eye fixations are drawn to regions\nwith higher average rewards, fixation durations are longer on more valuable\ntargets, and their cumulative rewards exceed chance, approaching the upper\nbound of optimal foragers. To probe these decision-making processes of humans,\nwe developed a transformer-based Visual Forager (VF) model trained via\nreinforcement learning. Our VF model takes a series of targets, their\ncorresponding values, and the search image as inputs, processes the images\nusing foveated vision, and produces a sequence of eye movements along with\ndecisions on whether to collect each fixated item. Our model outperforms all\nbaselines, achieves cumulative rewards comparable to those of humans, and\napproximates human foraging behavior in eye movements and foraging biases\nwithin time-limited environments. 
Furthermore, stress tests on\nout-of-distribution tasks with novel targets, unseen values, and varying set\nsizes demonstrate the VF model's effective generalization. Our work offers\nvaluable insights into the relationship between eye movements and\ndecision-making, with our model serving as a powerful tool for further\nexploration of this connection. All data, code, and models will be made\npublicly available.\n","authors":["Bo Wang","Dingwei Tan","Yen-Ling Kuo","Zhaowei Sun","Jeremy M. Wolfe","Tat-Jen Cham","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09174v1","updated":"2024-11-14T04:23:28Z","published":"2024-11-14T04:23:28Z","title":"Advancing Diffusion Models: Alias-Free Resampling and Enhanced\n Rotational Equivariance","summary":" Recent advances in image generation, particularly via diffusion models, have\nled to impressive improvements in image synthesis quality. Despite this,\ndiffusion models are still challenged by model-induced artifacts and limited\nstability in image fidelity. In this work, we hypothesize that the primary\ncause of this issue is the improper resampling operation that introduces\naliasing in the diffusion model and a careful alias-free resampling dictated by\nimage processing theory can improve the model's performance in image synthesis.\nWe propose the integration of alias-free resampling layers into the UNet\narchitecture of diffusion models without adding extra trainable parameters,\nthereby maintaining computational efficiency. We then assess whether these\ntheory-driven modifications enhance image quality and rotational equivariance.\nOur experimental results on benchmark datasets, including CIFAR-10, MNIST, and\nMNIST-M, reveal consistent gains in image quality, particularly in terms of FID\nand KID scores. Furthermore, we propose a modified diffusion process that\nenables user-controlled rotation of generated images without requiring\nadditional training. 
Our findings highlight the potential of theory-driven\nenhancements such as alias-free resampling in generative models to improve\nimage quality while maintaining model efficiency and pioneer future research\ndirections to incorporate them into video-generating diffusion models, enabling\ndeeper exploration of the applications of alias-free resampling in generative\nmodeling.\n","authors":["Md Fahim Anjum"],"pdf_url":"https://arxiv.org/pdf/2411.09174v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.09170v1","updated":"2024-11-14T04:12:47Z","published":"2024-11-14T04:12:47Z","title":"Towards Scalable Handwriting Communication via EEG Decoding and Latent\n Embedding Integration","summary":" In recent years, brain-computer interfaces have made advances in decoding\nvarious motor-related tasks, including gesture recognition and movement\nclassification, utilizing electroencephalogram (EEG) data. These developments\nare fundamental in exploring how neural signals can be interpreted to recognize\nspecific physical actions. This study centers on a written alphabet\nclassification task, where we aim to decode EEG signals associated with\nhandwriting. To achieve this, we incorporate hand kinematics to guide the\nextraction of the consistent embeddings from high-dimensional neural recordings\nusing auxiliary variables (CEBRA). These CEBRA embeddings, along with the EEG,\nare processed by a parallel convolutional neural network model that extracts\nfeatures from both data sources simultaneously. The model classifies nine\ndifferent handwritten characters, including symbols such as exclamation marks\nand commas, within the alphabet. We evaluate the model using a quantitative\nfive-fold cross-validation approach and explore the structure of the embedding\nspace through visualizations. 
Our approach achieves a classification accuracy\nof 91 % for the nine-class task, demonstrating the feasibility of fine-grained\nhandwriting decoding from EEG.\n","authors":["Jun-Young Kim","Deok-Seon Kim","Seo-Hyun Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09170v1.pdf","comment":"4 pages, 2 figures, 1 table, Name of Conference: International\n Conference on Brain-Computer Interface"},{"id":"http://arxiv.org/abs/2411.09169v1","updated":"2024-11-14T04:06:26Z","published":"2024-11-14T04:06:26Z","title":"Artificial Theory of Mind and Self-Guided Social Organisation","summary":" One of the challenges artificial intelligence (AI) faces is how a collection\nof agents coordinate their behaviour to achieve goals that are not reachable by\nany single agent. In a recent article by Ozmen et al this was framed as one of\nsix grand challenges: That AI needs to respect human cognitive processes at the\nhuman-AI interaction frontier. We suggest that this extends to the AI-AI\nfrontier and that it should also reflect human psychology, as it is the only\nsuccessful framework we have from which to build out. In this extended abstract\nwe first make the case for collective intelligence in a general setting,\ndrawing on recent work from single neuron complexity in neural networks and ant\nnetwork adaptability in ant colonies. From there we introduce how species\nrelate to one another in an ecological network via niche selection, niche\nchoice, and niche conformity with the aim of forming an analogy with human\nsocial network development as new agents join together and coordinate. From\nthere we show how our social structures are influenced by our neuro-physiology,\nour psychology, and our language. This emphasises how individual people within\na social network influence the structure and performance of that network in\ncomplex tasks, and that cognitive faculties such as Theory of Mind play a\ncentral role. 
We finish by discussing the current state of the art in AI and\nwhere there is potential for further development of a socially embodied\ncollective artificial intelligence that is capable of guiding its own social\nstructures.\n","authors":["Michael S. Harré","Jaime Ruiz-Serra","Catherine Drysdale"],"pdf_url":"https://arxiv.org/pdf/2411.09169v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2411.09168v1","updated":"2024-11-14T03:58:50Z","published":"2024-11-14T03:58:50Z","title":"Theory of Mind Enhances Collective Intelligence","summary":" Collective Intelligence plays a central role in a large variety of fields,\nfrom economics and evolutionary theory to neural networks and eusocial insects,\nand it is also core to much of the work on emergence and self-organisation in\ncomplex systems theory. However, in human collective intelligence there is\nstill much more to be understood in the relationship between specific\npsychological processes at the individual level and the emergence of\nself-organised structures at the social level. Previously psychological factors\nhave played a relatively minor role in the study of collective intelligence as\nthe principles are often quite general and applicable to humans just as readily\nas insects or other agents without sophisticated psychologies. In this article\nwe emphasise, with examples from other complex adaptive systems, the broad\napplicability of collective intelligence principles while the mechanisms and\ntime-scales differ significantly between examples. We contend that flexible\ncollective intelligence in human social settings is improved by our use of a\nspecific cognitive tool: our Theory of Mind. We identify several key\ncharacteristics of psychologically mediated collective intelligence and show\nthat the development of a Theory of Mind is a crucial factor distinguishing\nsocial collective intelligence from general collective intelligence. 
We then\nplace these capabilities in the context of the next steps in artificial\nintelligence embedded in a future that includes an effective human-AI hybrid\nsocial ecology.\n","authors":["Michael S. Harré","Catherine Drysdale","Jaime Ruiz-Serra"],"pdf_url":"https://arxiv.org/pdf/2411.09168v1.pdf","comment":"20 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.09160v1","updated":"2024-11-14T03:28:02Z","published":"2024-11-14T03:28:02Z","title":"Rationality based Innate-Values-driven Reinforcement Learning","summary":" Innate values describe agents' intrinsic motivations, which reflect their\ninherent interests and preferences to pursue goals and drive them to develop\ndiverse skills satisfying their various needs. The essence of reinforcement\nlearning (RL) is learning from interaction based on reward-driven behaviors,\nmuch like natural agents. It is an excellent model to describe the\ninnate-values-driven (IV) behaviors of AI agents. Especially developing the\nawareness of the AI agent through balancing internal and external utilities\nbased on its needs in different tasks is a crucial problem for individuals\nlearning to support AI agents integrating human society with safety and harmony\nin the long term. This paper proposes a hierarchical compound intrinsic value\nreinforcement learning model -- innate-values-driven reinforcement learning\ntermed IVRL to describe the complex behaviors of AI agents' interaction. We\nformulated the IVRL model and proposed two IVRL models: DQN and A2C. 
By\ncomparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the\nRole-Playing Game (RPG) reinforcement learning test platform VIZDoom, we\ndemonstrated that rationally organizing various individual needs can\neffectively achieve better performance.\n","authors":["Qin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09160v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.05572"},{"id":"http://arxiv.org/abs/2411.09158v1","updated":"2024-11-14T03:24:45Z","published":"2024-11-14T03:24:45Z","title":"The \\emph{Optimist}: Towards Fully Automated Graph Theory Research","summary":" This paper introduces the \\emph{Optimist}, an autonomous system developed to\nadvance automated conjecture generation in graph theory. Leveraging\nmixed-integer programming (MIP) and heuristic methods, the \\emph{Optimist}\ngenerates conjectures that both rediscover established theorems and propose\nnovel inequalities. Through a combination of memory-based computation and\nagent-like adaptability, the \\emph{Optimist} iteratively refines its\nconjectures by integrating new data, enabling a feedback process with minimal\nhuman (\\emph{or machine}) intervention. Initial experiments reveal the\n\\emph{Optimist}'s potential to uncover foundational results in graph theory, as\nwell as to produce conjectures of interest for future exploration. 
This work\nalso outlines the \\emph{Optimist}'s evolving integration with a counterpart\nagent, the \\emph{Pessimist} (a human \\emph{or machine} agent), to establish a\ndueling system that will drive fully automated graph theory research.\n","authors":["Randy Davila"],"pdf_url":"https://arxiv.org/pdf/2411.09158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09134v1","updated":"2024-11-14T02:14:12Z","published":"2024-11-14T02:14:12Z","title":"ABCI 3.0: Evolution of the leading AI infrastructure in Japan","summary":" ABCI 3.0 is the latest version of the ABCI, a large-scale open AI\ninfrastructure that AIST has been operating since August 2018 and will be fully\noperational in January 2025. ABCI 3.0 consists of computing servers equipped\nwith 6128 of the NVIDIA H200 GPUs and an all-flash storage system. Its peak\nperformance is 6.22 exaflops in half precision and 3.0 exaflops in single\nprecision, which is 7 to 13 times faster than the previous system, ABCI 2.0. It\nalso more than doubles both storage capacity and theoretical read/write\nperformance. ABCI 3.0 is expected to accelerate research and development,\nevaluation, and workforce development of cutting-edge AI technologies, with a\nparticular focus on generative AI.\n","authors":["Ryousei Takano","Shinichiro Takizawa","Yusuke Tanimura","Hidemoto Nakada","Hirotaka Ogawa"],"pdf_url":"https://arxiv.org/pdf/2411.09134v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2410.19258v3","updated":"2024-11-14T01:56:11Z","published":"2024-10-25T02:22:00Z","title":"Not All Heads Matter: A Head-Level KV Cache Compression Method with\n Integrated Retrieval and Reasoning","summary":" Key-Value (KV) caching is a common technique to enhance the computational\nefficiency of Large Language Models (LLMs), but its memory overhead grows\nrapidly with input length. 
Prior work has shown that not all tokens are equally\nimportant for text generation, proposing layer-level KV cache compression to\nselectively retain key information. Recognizing the distinct roles of attention\nheads in generation, we propose HeadKV, a head-level KV cache compression\nmethod, and HeadKV-R2, which leverages a novel contextual reasoning ability\nestimation for compression. Our approach operates at the level of individual\nheads, estimating their importance for contextual QA tasks that require both\nretrieval and reasoning capabilities. Extensive experiments across diverse\nbenchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct,\nMistral-7B-Instruct), and long-context abilities tests demonstrate that our\nhead-level KV cache compression significantly outperforms strong baselines,\nparticularly in low-resource settings (KV size = 64 & 128). Notably, our method\nretains just 1.5% of the KV cache while achieving 97% of the performance of the\nfull KV cache on the contextual question answering benchmark.Codes are\navailable at https://github.com/FYYFU/HeadKV\n","authors":["Yu Fu","Zefan Cai","Abedelkadir Asi","Wayne Xiong","Yue Dong","Wen Xiao"],"pdf_url":"https://arxiv.org/pdf/2410.19258v3.pdf","comment":"18pages"},{"id":"http://arxiv.org/abs/2411.09125v1","updated":"2024-11-14T01:48:08Z","published":"2024-11-14T01:48:08Z","title":"DROJ: A Prompt-Driven Attack against Large Language Models","summary":" Large Language Models (LLMs) have demonstrated exceptional capabilities\nacross various natural language processing tasks. Due to their training on\ninternet-sourced datasets, LLMs can sometimes generate objectionable content,\nnecessitating extensive alignment with human feedback to avoid such outputs.\nDespite massive alignment efforts, LLMs remain susceptible to adversarial\njailbreak attacks, which usually are manipulated prompts designed to circumvent\nsafety mechanisms and elicit harmful responses. 
Here, we introduce a novel\napproach, Directed Rrepresentation Optimization Jailbreak (DROJ), which\noptimizes jailbreak prompts at the embedding level to shift the hidden\nrepresentations of harmful queries towards directions that are more likely to\nelicit affirmative responses from the model. Our evaluations on LLaMA-2-7b-chat\nmodel show that DROJ achieves a 100\\% keyword-based Attack Success Rate (ASR),\neffectively preventing direct refusals. However, the model occasionally\nproduces repetitive and non-informative responses. To mitigate this, we\nintroduce a helpfulness system prompt that enhances the utility of the model's\nresponses. Our code is available at\nhttps://github.com/Leon-Leyang/LLM-Safeguard.\n","authors":["Leyang Hu","Boran Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09105v1","updated":"2024-11-14T00:26:26Z","published":"2024-11-14T00:26:26Z","title":"VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges\n in Video Cognition","summary":" Recent advancements in Large Video-Language Models (LVLMs) have driven the\ndevelopment of benchmarks designed to assess cognitive abilities in video-based\ntasks. However, most existing benchmarks heavily rely on web-collected videos\npaired with human annotations or model-generated questions, which limit control\nover the video content and fall short in evaluating advanced cognitive\nabilities involving symbolic elements and abstract concepts. To address these\nlimitations, we introduce VCBench, a controllable benchmark to assess LVLMs'\ncognitive abilities, involving symbolic and abstract concepts at varying\ndifficulty levels. By generating video data with the Python-based engine,\nVCBench allows for precise control over the video content, creating dynamic,\ntask-oriented videos that feature complex scenes and abstract concepts. 
Each\ntask pairs with tailored question templates that target specific cognitive\nchallenges, providing a rigorous evaluation test. Our evaluation reveals that\neven state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple\nvideo cognition tasks involving abstract concepts, with performance sharply\ndropping by 19% as video complexity rises. These findings reveal the current\nlimitations of LVLMs in advanced cognitive tasks and highlight the critical\nrole of VCBench in driving research toward more robust LVLMs for complex video\ncognition challenges.\n","authors":["Chenglin Li","Qianglong Chen","Zhi Li","Feng Tao","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09102v1","updated":"2024-11-14T00:18:25Z","published":"2024-11-14T00:18:25Z","title":"Provocation: Who benefits from \"inclusion\" in Generative AI?","summary":" The demands for accurate and representative generative AI systems means there\nis an increased demand on participatory evaluation structures. While these\nparticipatory structures are paramount to to ensure non-dominant values,\nknowledge and material culture are also reflected in AI models and the media\nthey generate, we argue that dominant structures of community participation in\nAI development and evaluation are not explicit enough about the benefits and\nharms that members of socially marginalized groups may experience as a result\nof their participation. Without explicit interrogation of these benefits by AI\ndevelopers, as a community we may remain blind to the immensity of systemic\nchange that is needed as well. To support this provocation, we present a\nspeculative case study, developed from our own collective experiences as AI\nresearchers. 
We use this speculative context to itemize the barriers that need\nto be overcome in order for the proposed benefits to marginalized communities\nto be realized, and harms mitigated.\n","authors":["Nari Johnson","Siobhan Mackenzie Hall","Samantha Dalal"],"pdf_url":"https://arxiv.org/pdf/2411.09102v1.pdf","comment":"3 pages, 1 figure. Published as a Short Paper in the NeurIPS 2024\n Workshop on Evaluating Evaluations: Examining Best Practices for Measuring\n Broader Impacts of Generative AI"},{"id":"http://arxiv.org/abs/2411.09101v1","updated":"2024-11-14T00:18:04Z","published":"2024-11-14T00:18:04Z","title":"Heuristical Comparison of Vision Transformers Against Convolutional\n Neural Networks for Semantic Segmentation on Remote Sensing Imagery","summary":" Vision Transformers (ViT) have recently brought a new wave of research in the\nfield of computer vision. These models have done particularly well in the field\nof image classification and segmentation. Research on semantic and instance\nsegmentation has emerged to accelerate with the inception of the new\narchitecture, with over 80\\% of the top 20 benchmarks for the iSAID dataset\nbeing either based on the ViT architecture or the attention mechanism behind\nits success. This paper focuses on the heuristic comparison of three key\nfactors of using (or not using) ViT for semantic segmentation of remote sensing\naerial images on the iSAID. The experimental results observed during the course\nof the research were under the scrutinization of the following objectives: 1.\nUse of weighted fused loss function for the maximum mean Intersection over\nUnion (mIoU) score, Dice score, and minimization or conservation of entropy or\nclass representation, 2. Comparison of transfer learning on Meta's MaskFormer,\na ViT-based semantic segmentation model, against generic UNet Convolutional\nNeural Networks (CNNs) judged over mIoU, Dice scores, training efficiency, and\ninference time, and 3. What do we lose for what we gain? 
i.e., the comparison\nof the two models against current state-of-art segmentation models. We show the\nuse of the novel combined weighted loss function significantly boosts the CNN\nmodel's performance capacities as compared to transfer learning the ViT. The\ncode for this implementation can be found on\n\\url{https://github.com/ashimdahal/ViT-vs-CNN-ImageSegmentation}.\n","authors":["Ashim Dahal","Saydul Akbar Murad","Nick Rahimi"],"pdf_url":"https://arxiv.org/pdf/2411.09101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09678v1","updated":"2024-11-14T18:44:31Z","published":"2024-11-14T18:44:31Z","title":"NeuralDEM -- Real-time Simulation of Industrial Particulate Flows","summary":" Advancements in computing power have made it possible to numerically simulate\nlarge-scale fluid-mechanical and/or particulate systems, many of which are\nintegral to core industrial processes. Among the different numerical methods\navailable, the discrete element method (DEM) provides one of the most accurate\nrepresentations of a wide range of physical systems involving granular and\ndiscontinuous materials. Consequently, DEM has become a widely accepted\napproach for tackling engineering problems connected to granular flows and\npowder mechanics. Additionally, DEM can be integrated with grid-based\ncomputational fluid dynamics (CFD) methods, enabling the simulation of chemical\nprocesses taking place, e.g., in fluidized beds. However, DEM is\ncomputationally intensive because of the intrinsic multiscale nature of\nparticulate systems, restricting simulation duration or number of particles.\nTowards this end, NeuralDEM presents an end-to-end approach to replace slow\nnumerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM\nis capable of picturing long-term transport processes across different regimes\nusing macroscopic observables without any reference to microscopic model\nparameters. 
First, NeuralDEM treats the Lagrangian discretization of DEM as an\nunderlying continuous field, while simultaneously modeling macroscopic behavior\ndirectly as additional auxiliary fields. Second, NeuralDEM introduces\nmulti-branch neural operators scalable to real-time modeling of\nindustrially-sized scenarios - from slow and pseudo-steady to fast and\ntransient. Such scenarios have previously posed insurmountable challenges for\ndeep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM\nfluidized bed reactors of 160k CFD cells and 500k DEM particles for\ntrajectories of 28s. NeuralDEM will open many new doors to advanced engineering\nand much faster process cycles.\n","authors":["Benedikt Alkin","Tobias Kronlachner","Samuele Papa","Stefan Pirker","Thomas Lichtenegger","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2411.09678v1.pdf","comment":"Project page: https://nx-ai.github.io/NeuralDEM/"},{"id":"http://arxiv.org/abs/2411.09590v1","updated":"2024-11-14T17:01:24Z","published":"2024-11-14T17:01:24Z","title":"Adopting RAG for LLM-Aided Future Vehicle Design","summary":" In this paper, we explore the integration of Large Language Models (LLMs)\nwith Retrieval-Augmented Generation (RAG) to enhance automated design and\nsoftware development in the automotive industry. We present two case studies: a\nstandardization compliance chatbot and a design copilot, both utilizing RAG to\nprovide accurate, context-aware responses. We evaluate four LLMs-GPT-4o,\nLLAMA3, Mistral, and Mixtral -- comparing their answering accuracy and\nexecution time. Our results demonstrate that while GPT-4 offers superior\nperformance, LLAMA3 and Mistral also show promising capabilities for local\ndeployment, addressing data privacy concerns in automotive applications. 
This\nstudy highlights the potential of RAG-augmented LLMs in improving design\nworkflows and compliance in automotive engineering.\n","authors":["Vahid Zolfaghari","Nenad Petrovic","Fengjunjie Pan","Krzysztof Lebioda","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2411.09590v1.pdf","comment":"Conference paper accepted in IEEE FLLM 2024"},{"id":"http://arxiv.org/abs/2411.09180v1","updated":"2024-11-14T04:39:10Z","published":"2024-11-14T04:39:10Z","title":"LEAP:D -- A Novel Prompt-based Approach for Domain-Generalized Aerial\n Object Detection","summary":" Drone-captured images present significant challenges in object detection due\nto varying shooting conditions, which can alter object appearance and shape.\nFactors such as drone altitude, angle, and weather cause these variations,\ninfluencing the performance of object detection algorithms. To tackle these\nchallenges, we introduce an innovative vision-language approach using learnable\nprompts. This shift from conventional manual prompts aims to reduce\ndomain-specific knowledge interference, ultimately improving object detection\ncapabilities. Furthermore, we streamline the training process with a one-step\napproach, updating the learnable prompt concurrently with model training,\nenhancing efficiency without compromising performance. Our study contributes to\ndomain-generalized object detection by leveraging learnable prompts and\noptimizing training processes. 
This enhances model robustness and adaptability\nacross diverse environments, leading to more effective aerial object detection.\n","authors":["Chanyeong Park","Heegwang Kim","Joonki Paik"],"pdf_url":"https://arxiv.org/pdf/2411.09180v1.pdf","comment":"ICIP 2024 Workshop accepted paper"},{"id":"http://arxiv.org/abs/2411.09849v1","updated":"2024-11-14T23:56:57Z","published":"2024-11-14T23:56:57Z","title":"Self-Supervised Radio Pre-training: Toward Foundational Models for\n Spectrogram Learning","summary":" Foundational deep learning (DL) models are general models, trained on large,\ndiverse, and unlabelled datasets, typically using self-supervised learning\ntechniques have led to significant advancements especially in natural language\nprocessing. These pretrained models can be fine-tuned for related downstream\ntasks, offering faster development and reduced training costs, while often\nachieving improved performance. In this work, we introduce Masked Spectrogram\nModeling, a novel self-supervised learning approach for pretraining\nfoundational DL models on radio signals. Adopting a Convolutional LSTM\narchitecture for efficient spatio-temporal processing, we pretrain the model\nwith an unlabelled radio dataset collected from over-the-air measurements.\nSubsequently, the pretrained model is fine-tuned for two downstream tasks:\nspectrum forecasting and segmentation. 
Experimental results demonstrate that\nour methodology achieves competitive performance in both forecasting accuracy\nand segmentation, validating its effectiveness for developing foundational\nradio models.\n","authors":["Ahmed Aboulfotouh","Ashkan Eshaghbeigi","Dimitrios Karslidis","Hatem Abou-Zeid"],"pdf_url":"https://arxiv.org/pdf/2411.09849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01603v3","updated":"2024-11-14T23:56:22Z","published":"2024-06-26T17:33:21Z","title":"A Review of Large Language Models and Autonomous Agents in Chemistry","summary":" Large language models (LLMs) have emerged as powerful tools in chemistry,\nsignificantly impacting molecule design, property prediction, and synthesis\noptimization. This review highlights LLM capabilities in these domains and\ntheir potential to accelerate scientific discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to\ninteract with their surrounding environment. These agents perform diverse tasks\nsuch as paper scraping, interfacing with automated laboratories, and synthesis\nplanning. As agents are an emerging topic, we extend the scope of our review of\nagents beyond chemistry and discuss across any scientific domains. This review\ncovers the recent history, current capabilities, and design of LLMs and\nautonomous agents, addressing specific challenges, opportunities, and future\ndirections in chemistry. Key challenges include data quality and integration,\nmodel interpretability, and the need for standard benchmarks, while future\ndirections point towards more sophisticated multi-modal agents and enhanced\ncollaboration between agents and experimental methods. Due to the quick pace of\nthis field, a repository has been built to keep track of the latest studies:\nhttps://github.com/ur-whitelab/LLMs-in-science.\n","authors":["Mayk Caldas Ramos","Christopher J. Collison","Andrew D. 
White"],"pdf_url":"https://arxiv.org/pdf/2407.01603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13800v2","updated":"2024-11-14T23:53:05Z","published":"2024-05-22T16:25:03Z","title":"Dense Connector for MLLMs","summary":" Do we fully leverage the potential of visual encoder in Multimodal Large\nLanguage Models (MLLMs)? The recent outstanding performance of MLLMs in\nmultimodal understanding has garnered broad attention from both academia and\nindustry. In the current MLLM rat race, the focus seems to be predominantly on\nthe linguistic side. We witness the rise of larger and higher-quality\ninstruction datasets, as well as the involvement of larger-sized LLMs. Yet,\nscant attention has been directed towards the visual signals utilized by MLLMs,\noften assumed to be the final high-level features extracted by a frozen visual\nencoder. In this paper, we introduce the Dense Connector - a simple, effective,\nand plug-and-play vision-language connector that significantly enhances\nexisting MLLMs by leveraging multi-layer visual features, with minimal\nadditional computational overhead. Building on this, we also propose the\nEfficient Dense Connector, which achieves performance comparable to LLaVA-v1.5\nwith only 25% of the visual tokens. Furthermore, our model, trained solely on\nimages, showcases remarkable zero-shot capabilities in video understanding as\nwell. Experimental results across various vision encoders, image resolutions,\ntraining dataset scales, varying sizes of LLMs (2.7B->70B), and diverse\narchitectures of MLLMs (e.g., LLaVA-v1.5, LLaVA-NeXT and Mini-Gemini) validate\nthe versatility and scalability of our approach, achieving state-of-the-art\nperformance across 19 image and video benchmarks. We hope that this work will\nprovide valuable experience and serve as a basic module for future MLLM\ndevelopment. 
Code is available at https://github.com/HJYao00/DenseConnector .\n","authors":["Huanjin Yao","Wenhao Wu","Taojiannan Yang","YuXin Song","Mengxi Zhang","Haocheng Feng","Yifan Sun","Zhiheng Li","Wanli Ouyang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.13800v2.pdf","comment":"27 pages, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.04268v2","updated":"2024-11-14T23:46:34Z","published":"2024-08-08T07:11:57Z","title":"Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs\n Gaussian-Based Methods","summary":" Exploring the capabilities of Neural Radiance Fields (NeRF) and\nGaussian-based methods in the context of 3D scene reconstruction, this study\ncontrasts these modern approaches with traditional Simultaneous Localization\nand Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we\nassess performance based on tracking accuracy, mapping fidelity, and view\nsynthesis. Findings reveal that NeRF excels in view synthesis, offering unique\ncapabilities in generating new perspectives from existing data, albeit at\nslower processing speeds. Conversely, Gaussian-based methods provide rapid\nprocessing and significant expressiveness but lack comprehensive scene\ncompletion. Enhanced by global optimization and loop closure techniques, newer\nmethods like NICE-SLAM and SplaTAM not only surpass older frameworks such as\nORB-SLAM2 in terms of robustness but also demonstrate superior performance in\ndynamic and complex environments. 
This comparative analysis bridges theoretical\nresearch with practical implications, shedding light on future developments in\nrobust 3D scene reconstruction across various real-world applications.\n","authors":["Yiming Zhou","Zixuan Zeng","Andi Chen","Xiaofan Zhou","Haowei Ni","Shiyao Zhang","Panfeng Li","Liangxi Liu","Mengyao Zheng","Xupeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.04268v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.09844v1","updated":"2024-11-14T23:19:55Z","published":"2024-11-14T23:19:55Z","title":"Deep Autoencoders for Unsupervised Anomaly Detection in Wildfire\n Prediction","summary":" Wildfires pose a significantly increasing hazard to global ecosystems due to\nthe climate crisis. Due to its complex nature, there is an urgent need for\ninnovative approaches to wildfire prediction, such as machine learning. This\nresearch took a unique approach, differentiating from classical supervised\nlearning, and addressed the gap in unsupervised wildfire prediction using\nautoencoders and clustering techniques for anomaly detection. Historical\nweather and normalised difference vegetation index datasets of Australia for\n2005 - 2021 were utilised. Two main unsupervised approaches were analysed. The\nfirst used a deep autoencoder to obtain latent features, which were then fed\ninto clustering models, isolation forest, local outlier factor and one-class\nSVM for anomaly detection. The second approach used a deep autoencoder to\nreconstruct the input data and use reconstruction errors to identify anomalies.\nLong Short-Term Memory (LSTM) autoencoders and fully connected (FC)\nautoencoders were employed in this part, both in an unsupervised way learning\nonly from nominal data. The FC autoencoder outperformed its counterparts,\nachieving an accuracy of 0.71, an F1-score of 0.74, and an MCC of 0.42. 
These\nfindings highlight the practicality of this method, as it effectively predicts\nwildfires in the absence of ground truth, utilising an unsupervised learning\ntechnique.\n","authors":["İrem Üstek","Miguel Arana-Catania","Alexander Farr","Ivan Petrunin"],"pdf_url":"https://arxiv.org/pdf/2411.09844v1.pdf","comment":"33 pages, 18 figure, 16 tables. To appear in Earth and Space Science"},{"id":"http://arxiv.org/abs/2310.09383v2","updated":"2024-11-14T23:18:27Z","published":"2023-10-13T20:03:22Z","title":"Integrating Symbolic Reasoning into Neural Generative Models for Design\n Generation","summary":" Design generation requires tight integration of neural and symbolic\nreasoning, as good design must meet explicit user needs and honor implicit\nrules for aesthetics, utility, and convenience. Current automated design tools\ndriven by neural networks produce appealing designs but cannot satisfy user\nspecifications and utility requirements. Symbolic reasoning tools, such as\nconstraint programming, cannot perceive low-level visual information in images\nor capture subtle aspects such as aesthetics. We introduce the Spatial\nReasoning Integrated Generator (SPRING) for design generation. SPRING embeds a\nneural and symbolic integrated spatial reasoning module inside the deep\ngenerative network. The spatial reasoning module samples the set of locations\nof objects to be generated from a backtrack-free distribution. This\ndistribution modifies the implicit preference distribution, which is learned by\na recursive neural network to capture utility and aesthetics. Sampling from the\nbacktrack-free distribution is accomplished by a symbolic reasoning approach,\nSampleSearch, which zeros out the probability of sampling spatial locations\nviolating explicit user specifications. Embedding symbolic reasoning into\nneural generation guarantees that the output of SPRING satisfies user\nrequirements. 
Furthermore, SPRING offers interpretability, allowing users to\nvisualize and diagnose the generation process through the bounding boxes.\nSPRING is also adept at managing novel user specifications not encountered\nduring its training, thanks to its proficiency in zero-shot constraint\ntransfer. Quantitative evaluations and a human study reveal that SPRING\noutperforms baseline generative models, excelling in delivering high design\nquality and better meeting user specifications.\n","authors":["Maxwell Joseph Jacobson","Yexiang Xue"],"pdf_url":"https://arxiv.org/pdf/2310.09383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10296v4","updated":"2024-11-14T23:07:03Z","published":"2024-04-16T05:40:30Z","title":"Interpolating neural network: A lightweight yet precise architecture for\n data training, equation solving, and parameter calibration","summary":" Artificial intelligence (AI) has revolutionized software development,\nshifting from task-specific codes (Software 1.0) to neural network-based\napproaches (Software 2.0). However, applying this transition in engineering\nsoftware presents challenges, including low surrogate model accuracy, the curse\nof dimensionality in inverse design, and rising complexity in physical\nsimulations. We introduce an interpolating neural network (INN), grounded in\ninterpolation theory and tensor decomposition, to realize Engineering Software\n2.0 by advancing data training, partial differential equation solving, and\nparameter calibration. INN offers orders of magnitude fewer trainable/solvable\nparameters for comparable model accuracy than traditional multi-layer\nperceptron (MLP) or physics-informed neural networks (PINN). Demonstrated in\nmetal additive manufacturing, INN rapidly constructs an accurate surrogate\nmodel of Laser Powder Bed Fusion (L-PBF) heat transfer simulation, achieving\nsub-10-micrometer resolution for a 10 mm path in under 15 minutes on a single\nGPU. 
This makes a transformative step forward across all domains essential to\nengineering software.\n","authors":["Chanwook Park","Sourav Saha","Jiachen Guo","Hantao Zhang","Xiaoyu Xie","Miguel A. Bessa","Dong Qian","Wei Chen","Gregory J. Wagner","Jian Cao","Wing Kam Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10296v4.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09837v1","updated":"2024-11-14T23:02:30Z","published":"2024-11-14T23:02:30Z","title":"Real-time Adapting Routing (RAR): Improving Efficiency Through\n Continuous Learning in Software Powered by Layered Foundation Models","summary":" To balance the quality and inference cost of a Foundation Model (FM, such as\nlarge language models (LLMs)) powered software, people often opt to train a\nrouting model that routes requests to FMs with different sizes and\ncapabilities. Existing routing models rely on learning the optimal routing\ndecision from carefully curated data, require complex computations to be\nupdated, and do not consider the potential evolution of weaker FMs. In this\npaper, we propose Real-time Adaptive Routing (RAR), an approach to continuously\nadapt FM routing decisions while using guided in-context learning to enhance\nthe capabilities of weaker FM. The goal is to reduce reliance on stronger, more\nexpensive FMs. We evaluate our approach on different subsets of the popular\nMMLU benchmark. Over time, our approach routes 50.2% fewer requests to\ncomputationally expensive models while maintaining around 90.5% of the general\nresponse quality. 
In addition, the guides generated from stronger models have\nshown intra-domain generalization and led to a better quality of responses\ncompared to an equivalent approach with a standalone weaker FM.\n","authors":["Kirill Vasilevski","Dayi Lin","Ahmed Hassan"],"pdf_url":"https://arxiv.org/pdf/2411.09837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09834v1","updated":"2024-11-14T22:54:38Z","published":"2024-11-14T22:54:38Z","title":"A Benchmark for Long-Form Medical Question Answering","summary":" There is a lack of benchmarks for evaluating large language models (LLMs) in\nlong-form medical question answering (QA). Most existing medical QA evaluation\nbenchmarks focus on automatic metrics and multiple-choice questions. While\nvaluable, these benchmarks fail to fully capture or assess the complexities of\nreal-world clinical applications where LLMs are being deployed. Furthermore,\nexisting studies on evaluating long-form answer generation in medical QA are\nprimarily closed-source, lacking access to human medical expert annotations,\nwhich makes it difficult to reproduce results and enhance existing baselines.\nIn this work, we introduce a new publicly available benchmark featuring\nreal-world consumer medical questions with long-form answer evaluations\nannotated by medical doctors. We performed pairwise comparisons of responses\nfrom various open and closed-source medical and general-purpose LLMs based on\ncriteria such as correctness, helpfulness, harmfulness, and bias. Additionally,\nwe performed a comprehensive LLM-as-a-judge analysis to study the alignment\nbetween human judgments and LLMs. Our preliminary results highlight the strong\npotential of open LLMs in medical QA compared to leading closed models. Code &\nData: https://github.com/lavita-ai/medical-eval-sphere\n","authors":["Pedram Hosseini","Jessica M. Sin","Bing Ren","Bryceton G. 
Thomas","Elnaz Nouri","Ali Farahanchi","Saeed Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2411.09834v1.pdf","comment":"AIM-FM: Advancements in Medical Foundation Models Workshop, 38th\n Conference on Neural Information Processing Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2404.02611v2","updated":"2024-11-14T22:53:12Z","published":"2024-04-03T09:56:38Z","title":"X-SHIELD: Regularization for eXplainable Artificial Intelligence","summary":" As artificial intelligence systems become integral across domains, the demand\nfor explainability grows, the called eXplainable artificial intelligence (XAI).\nExisting efforts primarily focus on generating and evaluating explanations for\nblack-box models while a critical gap in directly enhancing models remains\nthrough these evaluations. It is important to consider the potential of this\nexplanation process to improve model quality with a feedback on training as\nwell. XAI may be used to improve model performance while boosting its\nexplainability. Under this view, this paper introduces Transformation -\nSelective Hidden Input Evaluation for Learning Dynamics (T-SHIELD), a\nregularization family designed to improve model quality by hiding features of\ninput, forcing the model to generalize without those features. Within this\nfamily, we propose the XAI - SHIELD(X-SHIELD), a regularization for explainable\nartificial intelligence, which uses explanations to select specific features to\nhide. In contrast to conventional approaches, X-SHIELD regularization\nseamlessly integrates into the objective function enhancing model\nexplainability while also improving performance. Experimental validation on\nbenchmark datasets underscores X-SHIELD's effectiveness in improving\nperformance and overall explainability. The improvement is validated through\nexperiments comparing models with and without the X-SHIELD regularization, with\nfurther analysis exploring the rationale behind its design choices. 
This\nestablishes X-SHIELD regularization as a promising pathway for developing\nreliable artificial intelligence regularization.\n","authors":["Iván Sevillano-García","Julián Luengo","Francisco Herrera"],"pdf_url":"https://arxiv.org/pdf/2404.02611v2.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2212.02098v4","updated":"2024-11-14T22:21:46Z","published":"2022-12-05T08:34:23Z","title":"A Machine with Short-Term, Episodic, and Semantic Memory Systems","summary":" Inspired by the cognitive science theory of the explicit human memory\nsystems, we have modeled an agent with short-term, episodic, and semantic\nmemory systems, each of which is modeled with a knowledge graph. To evaluate\nthis system and analyze the behavior of this agent, we designed and released\nour own reinforcement learning agent environment, \"the Room\", where an agent\nhas to learn how to encode, store, and retrieve memories to maximize its return\nby answering questions. We show that our deep Q-learning based agent\nsuccessfully learns whether a short-term memory should be forgotten, or rather\nbe stored in the episodic or semantic memory systems. Our experiments indicate\nthat an agent with human-like memory systems can outperform an agent without\nthis memory structure in the environment.\n","authors":["Taewoon Kim","Michael Cochez","Vincent François-Lavet","Mark Neerincx","Piek Vossen"],"pdf_url":"https://arxiv.org/pdf/2212.02098v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00888v2","updated":"2024-11-14T22:20:49Z","published":"2024-01-30T04:00:54Z","title":"Security and Privacy Challenges of Large Language Models: A Survey","summary":" Large Language Models (LLMs) have demonstrated extraordinary capabilities and\ncontributed to multiple fields, such as generating and summarizing text,\nlanguage translation, and question-answering. 
Nowadays, LLM is becoming a very\npopular tool in computerized language processing tasks, with the capability to\nanalyze complicated linguistic patterns and provide relevant and appropriate\nresponses depending on the context. While offering significant advantages,\nthese models are also vulnerable to security and privacy attacks, such as\njailbreaking attacks, data poisoning attacks, and Personally Identifiable\nInformation (PII) leakage attacks. This survey provides a thorough review of\nthe security and privacy challenges of LLMs for both training data and users,\nalong with the application-based risks in various domains, such as\ntransportation, education, and healthcare. We assess the extent of LLM\nvulnerabilities, investigate emerging security and privacy attacks for LLMs,\nand review the potential defense mechanisms. Additionally, the survey outlines\nexisting research gaps in this domain and highlights future research\ndirections.\n","authors":["Badhan Chandra Das","M. Hadi Amini","Yanzhao Wu"],"pdf_url":"https://arxiv.org/pdf/2402.00888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07333v3","updated":"2024-11-14T22:17:25Z","published":"2024-07-10T03:04:20Z","title":"Mitigating Partial Observability in Sequential Decision Processes via\n the Lambda Discrepancy","summary":" Reinforcement learning algorithms typically rely on the assumption that the\nenvironment dynamics and value function can be expressed in terms of a\nMarkovian state representation. However, when state information is only\npartially observable, how can an agent learn such a state representation, and\nhow can it detect when it has found one? We introduce a metric that can\naccomplish both objectives, without requiring access to -- or knowledge of --\nan underlying, unobservable state space. 
Our metric, the $\\lambda$-discrepancy,\nis the difference between two distinct temporal difference (TD) value\nestimates, each computed using TD($\\lambda$) with a different value of\n$\\lambda$. Since TD($\\lambda{=}0$) makes an implicit Markov assumption and\nTD($\\lambda{=}1$) does not, a discrepancy between these estimates is a\npotential indicator of a non-Markovian state representation. Indeed, we prove\nthat the $\\lambda$-discrepancy is exactly zero for all Markov decision\nprocesses and almost always non-zero for a broad class of partially observable\nenvironments. We also demonstrate empirically that, once detected, minimizing\nthe $\\lambda$-discrepancy can help with learning a memory function to mitigate\nthe corresponding partial observability. We then train a reinforcement learning\nagent that simultaneously constructs two recurrent value networks with\ndifferent $\\lambda$ parameters and minimizes the difference between them as an\nauxiliary loss. The approach scales to challenging partially observable\ndomains, where the resulting agent frequently performs significantly better\n(and never performs worse) than a baseline recurrent agent with only a single\nvalue network.\n","authors":["Cameron Allen","Aaron Kirtland","Ruo Yu Tao","Sam Lobel","Daniel Scott","Nicholas Petrocelli","Omer Gottesman","Ronald Parr","Michael L. Littman","George Konidaris"],"pdf_url":"https://arxiv.org/pdf/2407.07333v3.pdf","comment":"GitHub URL: https://github.com/brownirl/lambda_discrepancy; Project\n page: https://lambda-discrepancy.github.io/"},{"id":"http://arxiv.org/abs/2411.09822v1","updated":"2024-11-14T22:00:37Z","published":"2024-11-14T22:00:37Z","title":"A Self-Supervised Model for Multi-modal Stroke Risk Prediction","summary":" Predicting stroke risk is a complex challenge that can be enhanced by\nintegrating diverse clinically available data modalities. 
This study introduces\na self-supervised multimodal framework that combines 3D brain imaging, clinical\ndata, and image-derived features to improve stroke risk prediction prior to\nonset. By leveraging large unannotated clinical datasets, the framework\ncaptures complementary and synergistic information across image and tabular\ndata modalities. Our approach is based on a contrastive learning framework that\ncouples contrastive language-image pretraining with an image-tabular matching\nmodule, to better align multimodal data representations in a shared latent\nspace. The model is trained on the UK Biobank, which includes structural brain\nMRI and clinical data. We benchmark its performance against state-of-the-art\nunimodal and multimodal methods using tabular, image, and image-tabular\ncombinations under diverse frozen and trainable model settings. The proposed\nmodel outperformed self-supervised tabular (image) methods by 2.6% (2.6%) in\nROC-AUC and by 3.3% (5.6%) in balanced accuracy. Additionally, it showed a 7.6%\nincrease in balanced accuracy compared to the best multimodal supervised model.\nThrough interpretable tools, our approach demonstrated better integration of\ntabular and image data, providing richer and more aligned embeddings.\nGradient-weighted Class Activation Mapping heatmaps further revealed activated\nbrain regions commonly associated in the literature with brain aging, stroke\nrisk, and clinical outcomes. 
This robust self-supervised multimodal framework\nsurpasses state-of-the-art methods for stroke risk prediction and offers a\nstrong foundation for future studies integrating diverse data modalities to\nadvance clinical predictive modelling.\n","authors":["Camille Delgrange","Olga Demler","Samia Mora","Bjoern Menze","Ezequiel de la Rosa","Neda Davoudi"],"pdf_url":"https://arxiv.org/pdf/2411.09822v1.pdf","comment":"Accepted as oral paper at AIM-FM workshop, Neurips 2024"},{"id":"http://arxiv.org/abs/2411.09820v1","updated":"2024-11-14T21:49:41Z","published":"2024-11-14T21:49:41Z","title":"WelQrate: Defining the Gold Standard in Small Molecule Drug Discovery\n Benchmarking","summary":" While deep learning has revolutionized computer-aided drug discovery, the AI\ncommunity has predominantly focused on model innovation and placed less\nemphasis on establishing best benchmarking practices. We posit that without a\nsound model evaluation framework, the AI community's efforts cannot reach their\nfull potential, thereby slowing the progress and transfer of innovation into\nreal-world drug discovery. Thus, in this paper, we seek to establish a new gold\nstandard for small molecule drug discovery benchmarking, WelQrate.\nSpecifically, our contributions are threefold: WelQrate Dataset Collection - we\nintroduce a meticulously curated collection of 9 datasets spanning 5\ntherapeutic target classes. 
Our hierarchical curation pipelines, designed by\ndrug discovery experts, go beyond the primary high-throughput screen by\nleveraging additional confirmatory and counter screens along with rigorous\ndomain-driven preprocessing, such as Pan-Assay Interference Compounds (PAINS)\nfiltering, to ensure the high-quality data in the datasets; WelQrate Evaluation\nFramework - we propose a standardized model evaluation framework considering\nhigh-quality datasets, featurization, 3D conformation generation, evaluation\nmetrics, and data splits, which provides a reliable benchmarking for drug\ndiscovery experts conducting real-world virtual screening; Benchmarking - we\nevaluate model performance through various research questions using the\nWelQrate dataset collection, exploring the effects of different models, dataset\nquality, featurization methods, and data splitting strategies on the results.\nIn summary, we recommend adopting our proposed WelQrate as the gold standard in\nsmall molecule drug discovery benchmarking. The WelQrate dataset collection,\nalong with the curation codes, and experimental scripts are all publicly\navailable at WelQrate.org.\n","authors":[" Yunchao"," Liu","Ha Dong","Xin Wang","Rocco Moretti","Yu Wang","Zhaoqian Su","Jiawei Gu","Bobby Bodenheimer","Charles David Weaver","Jens Meiler","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2411.09820v1.pdf","comment":"* denotes equal contribution"},{"id":"http://arxiv.org/abs/2407.12176v4","updated":"2024-11-14T21:34:59Z","published":"2024-07-16T21:03:14Z","title":"GPT-4V Cannot Generate Radiology Reports Yet","summary":" GPT-4V's purported strong multimodal abilities raise interests in using it to\nautomate radiology report writing, but there lacks thorough evaluations. In\nthis work, we perform a systematic evaluation of GPT-4V in generating radiology\nreports on two chest X-ray report datasets: MIMIC-CXR and IU X-Ray. 
We attempt\nto directly generate reports using GPT-4V through different prompting\nstrategies and find that it fails terribly in both lexical metrics and clinical\nefficacy metrics. To understand the low performance, we decompose the task into\ntwo steps: 1) the medical image reasoning step of predicting medical condition\nlabels from images; and 2) the report synthesis step of generating reports from\n(groundtruth) conditions. We show that GPT-4V's performance in image reasoning\nis consistently low across different prompts. In fact, the distributions of\nmodel-predicted labels remain constant regardless of which groundtruth\nconditions are present on the image, suggesting that the model is not\ninterpreting chest X-rays meaningfully. Even when given groundtruth conditions\nin report synthesis, its generated reports are less correct and less\nnatural-sounding than a finetuned LLaMA-2. Altogether, our findings cast doubt\non the viability of using GPT-4V in a radiology workflow.\n","authors":["Yuyang Jiang","Chacha Chen","Dang Nguyen","Benjamin M. Mervak","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2407.12176v4.pdf","comment":"24 pages, 3 figures, code:\n https://github.com/ChicagoHAI/cxr-eval-gpt-4v Findings paper presented at\n Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024,\n Vancouver, Canada, 26 pages"},{"id":"http://arxiv.org/abs/2411.09807v1","updated":"2024-11-14T20:46:26Z","published":"2024-11-14T20:46:26Z","title":"Evaluating Loss Landscapes from a Topology Perspective","summary":" Characterizing the loss of a neural network with respect to model parameters,\ni.e., the loss landscape, can provide valuable insights into properties of that\nmodel. Various methods for visualizing loss landscapes have been proposed, but\nless emphasis has been placed on quantifying and extracting actionable and\nreproducible insights from these complex representations. 
Inspired by powerful\ntools from topological data analysis (TDA) for summarizing the structure of\nhigh-dimensional data, here we characterize the underlying shape (or topology)\nof loss landscapes, quantifying the topology to reveal new insights about\nneural networks. To relate our findings to the machine learning (ML)\nliterature, we compute simple performance metrics (e.g., accuracy, error), and\nwe characterize the local structure of loss landscapes using Hessian-based\nmetrics (e.g., largest eigenvalue, trace, eigenvalue spectral density).\nFollowing this approach, we study established models from image pattern\nrecognition (e.g., ResNets) and scientific ML (e.g., physics-informed neural\nnetworks), and we show how quantifying the shape of loss landscapes can provide\nnew insights into model performance and learning dynamics.\n","authors":["Tiankai Xie","Caleb Geniesse","Jiaqing Chen","Yaoqing Yang","Dmitriy Morozov","Michael W. Mahoney","Ross Maciejewski","Gunther H. Weber"],"pdf_url":"https://arxiv.org/pdf/2411.09807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16744v3","updated":"2024-11-14T20:02:05Z","published":"2024-01-30T04:48:43Z","title":"ShaRP: A Novel Feature Importance Framework for Ranking","summary":" Algorithmic decisions in critical domains such as hiring, college admissions,\nand lending are often based on rankings. Because of the impact these decisions\nhave on individuals, organizations, and population groups, there is a need to\nunderstand them: to help individuals improve their position in a ranking,\ndesign better ranking procedures, and check whether a procedure is legally\ncompliant. In this paper, we present ShaRP - Shapley for Rankings and\nPreferences - a framework that explains the contributions of features to\ndifferent aspects of a ranked outcome and is based on Shapley values. 
Using\nShaRP, we show that even when the scoring function used by an algorithmic\nranker is known and linear, the feature weights do not correspond to their\nShapley value contribution. The contributions instead depend on the feature\ndistributions and the subtle local interactions between the scoring features.\n ShaRP builds on the Quantitative Input Influence framework to compute the\ncontributions of features for multiple - ranking specific - Quantities of\nInterest, including score, rank, pair-wise preference, and top-k. We show the\nresults of an extensive experimental validation of ShaRP using real and\nsynthetic datasets. We demonstrate that feature importance can be computed\nefficiently, and that ShaRP compares favorably to several prior local feature\nimportance methods, in terms of both generality and quality of explanations.\nAmong our results, we highlight a case study on the CS Rankings dataset.\nContrary to expectation, we find that a strong track record in Systems research\nis much more important than AI research for placing a CS department among the\ntop-10%.\n ShaRP is available as an open-source library at\nhttps://github.com/DataResponsibly/ShaRP and is already used in teaching.\n","authors":["Venetia Pliatsika","Joao Fonseca","Kateryna Akhynko","Ivan Shevchenko","Julia Stoyanovich"],"pdf_url":"https://arxiv.org/pdf/2401.16744v3.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2404.13142v2","updated":"2024-11-14T19:36:14Z","published":"2024-04-19T19:03:33Z","title":"Decentralized Coordination of Distributed Energy Resources through Local\n Energy Markets and Deep Reinforcement Learning","summary":" As distributed energy resources (DERs) grow, the electricity grid faces\nincreased net load variability at the grid edge, impacting operability and\nreliability. 
Transactive energy, facilitated through local energy markets,\noffers a decentralized, indirect demand response solution, with model-free\ncontrol techniques, such as deep reinforcement learning (DRL), enabling\nautomated, decentralized participation. However, existing studies largely\noverlook community-level net load variability, focusing instead on\nsocioeconomic metrics.\n This study addresses this gap by using DRL agents to automate end-user\nparticipation in a local energy market (ALEX), where agents act independently\nto minimize individual energy bills. Results reveal a strong link between bill\nreduction and decreased net load variability, assessed across metrics such as\nramping rate, load factor, and peak demand over various time horizons. Using a\nno-control baseline, DRL agents are benchmarked against a near-optimal dynamic\nprogramming approach. The dynamic programming benchmark achieves reductions of\n22.05 percent, 83.92 percent, and 24.09 percent in daily import, export, and\npeak demand, respectively, while the DRL agents show comparable or superior\nresults with reductions of 21.93 percent, 84.46 percent, and 27.02 percent.\n This study demonstrates the effectiveness of DRL in decentralized grid\nmanagement, highlighting its scalability and near-optimal performance in\nreducing net load variability within community-driven energy markets.\n","authors":["Daniel May","Matthew Taylor","Petr Musilek"],"pdf_url":"https://arxiv.org/pdf/2404.13142v2.pdf","comment":"preprint, submitted to Energy and AI"},{"id":"http://arxiv.org/abs/2411.09767v1","updated":"2024-11-14T19:24:46Z","published":"2024-11-14T19:24:46Z","title":"Deep Learning for Fetal Inflammatory Response Diagnosis in the Umbilical\n Cord","summary":" Inflammation of the umbilical cord can be seen as a result of ascending\nintrauterine infection or other inflammatory stimuli. 
Acute fetal inflammatory\nresponse (FIR) is characterized by infiltration of the umbilical cord by fetal\nneutrophils, and can be associated with neonatal sepsis or fetal inflammatory\nresponse syndrome. Recent advances in deep learning in digital pathology have\ndemonstrated favorable performance across a wide range of clinical tasks, such\nas diagnosis and prognosis. In this study we classified FIR from whole slide\nimages (WSI). We digitized 4100 histological slides of umbilical cord stained\nwith hematoxylin and eosin(H&E) and extracted placental diagnoses from the\nelectronic health record. We build models using attention-based whole slide\nlearning models. We compared strategies between features extracted by a model\n(ConvNeXtXLarge) pretrained on non-medical images (ImageNet), and one\npretrained using histopathology images (UNI). We trained multiple iterations of\neach model and combined them into an ensemble. The predictions from the\nensemble of models trained using UNI achieved an overall balanced accuracy of\n0.836 on the test dataset. In comparison, the ensembled predictions using\nConvNeXtXLarge had a lower balanced accuracy of 0.7209. Heatmaps generated from\ntop accuracy model appropriately highlighted arteritis in cases of FIR 2. In\nFIR 1, the highest performing model assigned high attention to areas of\nactivated-appearing stroma in Wharton's Jelly. However, other high-performing\nmodels assigned attention to umbilical vessels. We developed models for\ndiagnosis of FIR from placental histology images, helping reduce interobserver\nvariability among pathologists. Future work may examine the utility of these\nmodels for identifying infants at risk of systemic inflammatory response or\nearly onset neonatal sepsis.\n","authors":["Marina A. Ayad","Ramin Nateghi","Abhishek Sharma","Lawrence Chillrud","Tilly Seesillapachai","Lee A. D. Cooper","Jeffery A. 
Goldstein"],"pdf_url":"https://arxiv.org/pdf/2411.09767v1.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.09694v1","updated":"2024-11-14T18:58:23Z","published":"2024-11-14T18:58:23Z","title":"A Bayesian Optimization Approach to Machine Translation Reranking","summary":" Reranking a list of candidates from a machine translation system with an\nexternal scoring model and returning the highest-scoring candidate remains a\nsimple and effective method for improving the overall output quality.\nTranslation scoring models continue to grow in size, with the best models being\ncomparable to generation models. Thus, reranking can add substantial\ncomputational cost to the translation pipeline. In this work, we pose reranking\nas a Bayesian optimization (BayesOpt) problem. By strategically selecting\ncandidates to score based on a balance of exploration and exploitation, we show\nthat it is possible to find top-scoring candidates when scoring only a fraction\nof the candidate list. For instance, our method achieves the same CometKiwi\nscore using only 70 scoring evaluations compared a baseline system using 180.\nWe present a multi-fidelity setting for BayesOpt, where the candidates are\nfirst scored with a cheaper but noisier proxy scoring model, which further\nimproves the cost-performance tradeoff when using smaller but well-trained\ndistilled proxy scorers.\n","authors":["Julius Cheng","Maike Züfle","Vilém Zouhar","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2411.09694v1.pdf","comment":"v1: Preprint version"},{"id":"http://arxiv.org/abs/2411.09689v1","updated":"2024-11-14T18:55:26Z","published":"2024-11-14T18:55:26Z","title":"LLM Hallucination Reasoning with Zero-shot Knowledge Test","summary":" LLM hallucination, where LLMs occasionally generate unfaithful text, poses\nsignificant challenges for their practical applications. 
Most existing\ndetection methods rely on external knowledge, LLM fine-tuning, or\nhallucination-labeled datasets, and they do not distinguish between different\ntypes of hallucinations, which are crucial for improving detection performance.\nWe introduce a new task, Hallucination Reasoning, which classifies\nLLM-generated text into one of three categories: aligned, misaligned, and\nfabricated. Our novel zero-shot method assesses whether LLM has enough\nknowledge about a given prompt and text. Our experiments conducted on new\ndatasets demonstrate the effectiveness of our method in hallucination reasoning\nand underscore its importance for enhancing detection performance.\n","authors":["Seongmin Lee","Hsiang Hsu","Chun-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09689v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.09688v1","updated":"2024-11-14T18:54:19Z","published":"2024-11-14T18:54:19Z","title":"Squeezed Attention: Accelerating Long Context Length LLM Inference","summary":" Emerging Large Language Model (LLM) applications require long input prompts\nto perform complex downstream tasks like document analysis and code generation.\nFor these long context length applications, the length of the input prompt\nposes a significant challenge in terms of inference efficiency since the\ninference costs increase linearly with sequence length. However, for many of\nthese applications, much of the context in the prompt is fixed across different\nuser inputs, thereby providing the opportunity to perform offline optimizations\nto process user inputs quickly, as they are received. In this work, we propose\nSqueezed Attention as a mechanism to accelerate LLM applications where a large\nportion of the input prompt is fixed. We first leverage K-means clustering\noffline to group the keys for the fixed context based on semantic similarity\nand represent each cluster with a single centroid value. 
During inference, we\ncompare query tokens from the user input with the centroids to predict which of\nthe keys from the fixed context are semantically relevant and need to be loaded\nduring inference. We then compute exact attention using only these important\nkeys from the fixed context, thereby reducing bandwidth and computational\ncosts. We also extend our method to use a hierarchical centroid lookup to\nidentify important keys, which can reduce the complexity of attention from\nlinear to logarithmic with respect to the context length. We implement\noptimized Triton kernels for centroid comparison and sparse FlashAttention with\nimportant keys, achieving more than 4x speedups during both the prefill and\ngeneration phases for long-context inference. Furthermore, we have extensively\nevaluated our method on various long-context benchmarks including LongBench,\nwhere it achieves a 3x reduction in KV cache budget without accuracy loss and\nup to an 8x reduction with <0.5 point accuracy gap for various models.\n","authors":["Coleman Hooper","Sehoon Kim","Hiva Mohammadzadeh","Monishwaran Maheswaran","June Paik","Michael W. Mahoney","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2411.09688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05777v2","updated":"2024-11-14T18:35:19Z","published":"2024-11-08T18:43:15Z","title":"Quantitative Assessment of Intersectional Empathetic Bias and\n Understanding","summary":" A growing amount of literature critiques the current operationalizations of\nempathy based on loose definitions of the construct. Such definitions\nnegatively affect dataset quality, model robustness, and evaluation\nreliability. We propose an empathy evaluation framework that operationalizes\nempathy close to its psychological origins. The framework measures the variance\nin responses of LLMs to prompts using existing metrics for empathy and\nemotional valence. 
The variance is introduced through the controlled generation\nof the prompts by varying social biases affecting context understanding, thus\nimpacting empathetic understanding. The control over generation ensures high\ntheoretical validity of the constructs in the prompt dataset. Also, it makes\nhigh-quality translation, especially into languages that currently have\nlittle-to-no way of evaluating empathy or bias, such as the Slavonic family,\nmore manageable. Using chosen LLMs and various prompt types, we demonstrate the\nempathy evaluation with the framework, including multiple-choice answers and\nfree generation. The variance in our initial evaluation sample is small and we\nwere unable to measure convincing differences between the empathetic\nunderstanding in contexts given by different social groups. However, the\nresults are promising because the models showed significant alterations their\nreasoning chains needed to capture the relatively subtle changes in the\nprompts. This provides the basis for future research into the construction of\nthe evaluation sample and statistical methods for measuring the results.\n","authors":["Vojtech Formanek","Ondrej Sotolar"],"pdf_url":"https://arxiv.org/pdf/2411.05777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09661v1","updated":"2024-11-14T18:31:39Z","published":"2024-11-14T18:31:39Z","title":"Adaptive Decoding via Latent Preference Optimization","summary":" During language model decoding, it is known that using higher temperature\nsampling gives more creative responses, while lower temperatures are more\nfactually accurate. However, such models are commonly applied to general\ninstruction following, which involves both creative and fact seeking tasks,\nusing a single fixed temperature across all examples and tokens. 
In this work,\nwe introduce Adaptive Decoding, a layer added to the model to select the\nsampling temperature dynamically at inference time, at either the token or\nexample level, in order to optimize performance. To learn its parameters we\nintroduce Latent Preference Optimization (LPO) a general approach to train\ndiscrete latent variables such as choices of temperature. Our method\noutperforms all fixed decoding temperatures across a range of tasks that\nrequire different temperatures, including UltraFeedback, Creative Story\nWriting, and GSM8K.\n","authors":["Shehzaad Dhuliawala","Ilia Kulikov","Ping Yu","Asli Celikyilmaz","Jason Weston","Sainbayar Sukhbaatar","Jack Lanchantin"],"pdf_url":"https://arxiv.org/pdf/2411.09661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03862v3","updated":"2024-11-14T18:27:39Z","published":"2024-04-05T02:27:09Z","title":"Verifiable by Design: Aligning Language Models to Quote from\n Pre-Training Data","summary":" To trust the fluent generations of large language models (LLMs), humans must\nbe able to verify their correctness against trusted, external sources. Recent\nefforts, such as providing citations via retrieved documents or post-hoc\nprovenance, enhance verifiability but provide no guarantees on their\ncorrectness. To address these limitations, we tackle the verifiability goal\nwith a different philosophy: trivializing the verification process by\ndeveloping models that quote verbatim statements from trusted sources in their\npre-training data. We propose Quote-Tuning, which demonstrates the feasibility\nof aligning models to quote. The core of Quote-Tuning is a fast membership\ninference function that efficiently verifies text against trusted corpora. We\nleverage this tool to design a reward function to quantify quotes in model\nresponses, and curate datasets for preference learning. 
Experiments show that\nQuote-Tuning significantly increases verbatim quotes from high-quality\ndocuments by up to 130% relative to base models while maintaining response\nquality. Quote-Tuning is applicable in different tasks, generalizes to\nout-of-domain data and diverse model families, and provides additional benefits\nto truthfulness. Our method not only serves as a hassle-free method to increase\nquoting but also opens up avenues for improving LLM trustworthiness through\nbetter verifiability.\n","authors":["Jingyu Zhang","Marc Marone","Tianjian Li","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.03862v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04783v2","updated":"2024-11-14T18:14:00Z","published":"2024-03-02T16:52:22Z","title":"AutoDefense: Multi-Agent LLM Defense against Jailbreak Attacks","summary":" Despite extensive pre-training in moral alignment to prevent generating\nharmful information, large language models (LLMs) remain vulnerable to\njailbreak attacks. In this paper, we propose AutoDefense, a multi-agent defense\nframework that filters harmful responses from LLMs. With the response-filtering\nmechanism, our framework is robust against different jailbreak attack prompts,\nand can be used to defend different victim models. AutoDefense assigns\ndifferent roles to LLM agents and employs them to complete the defense task\ncollaboratively. The division in tasks enhances the overall\ninstruction-following of LLMs and enables the integration of other defense\ncomponents as tools. With AutoDefense, small open-source LMs can serve as\nagents and defend larger models against jailbreak attacks. Our experiments show\nthat AutoDefense can effectively defense against different jailbreak attacks,\nwhile maintaining the performance at normal user request. For example, we\nreduce the attack success rate on GPT-3.5 from 55.74% to 7.95% using\nLLaMA-2-13b with a 3-agent system. 
Our code and data are publicly available at\nhttps://github.com/XHMY/AutoDefense.\n","authors":["Yifan Zeng","Yiran Wu","Xiao Zhang","Huazheng Wang","Qingyun Wu"],"pdf_url":"https://arxiv.org/pdf/2403.04783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09642v1","updated":"2024-11-14T18:06:55Z","published":"2024-11-14T18:06:55Z","title":"On the Limits of Language Generation: Trade-Offs Between Hallucination\n and Mode Collapse","summary":" Specifying all desirable properties of a language model is challenging, but\ncertain requirements seem essential. Given samples from an unknown language,\nthe trained model should produce valid strings not seen in training and be\nexpressive enough to capture the language's full richness. Otherwise,\noutputting invalid strings constitutes \"hallucination,\" and failing to capture\nthe full range leads to \"mode collapse.\" We ask if a language model can meet\nboth requirements.\n We investigate this within a statistical language generation setting building\non Gold and Angluin. Here, the model receives random samples from a\ndistribution over an unknown language K, which belongs to a possibly infinite\ncollection of languages. The goal is to generate unseen strings from K. We say\nthe model generates from K with consistency and breadth if, as training size\nincreases, its output converges to all unseen strings in K.\n Kleinberg and Mullainathan [KM24] asked if consistency and breadth in\nlanguage generation are possible. We answer this negatively: for a large class\nof language models, including next-token prediction models, this is impossible\nfor most collections of candidate languages. This contrasts with [KM24]'s\nresult, showing consistent generation without breadth is possible for any\ncountable collection of languages. 
Our finding highlights that generation with\nbreadth fundamentally differs from generation without breadth.\n As a byproduct, we establish near-tight bounds on the number of samples\nneeded for generation with or without breadth.\n Finally, our results offer hope: consistent generation with breadth is\nachievable for any countable collection of languages when negative examples\n(strings outside K) are available alongside positive ones. This suggests that\npost-training feedback, which encodes negative examples, can be crucial in\nreducing hallucinations while limiting mode collapse.\n","authors":["Alkis Kalavasis","Anay Mehrotra","Grigoris Velegkas"],"pdf_url":"https://arxiv.org/pdf/2411.09642v1.pdf","comment":"Abstract shortened to fit arXiv limit"},{"id":"http://arxiv.org/abs/2407.04573v2","updated":"2024-11-14T18:01:10Z","published":"2024-07-05T15:08:44Z","title":"VRSD: Rethinking Similarity and Diversity for Retrieval in Large\n Language Models","summary":" Vector retrieval algorithms are essential for semantic queries within the\nrapidly evolving landscape of Large Language Models (LLMs). The ability to\nretrieve vectors that satisfy both similarity and diversity criteria\nsubstantially enhances the performance of LLMs. Although Maximal Marginal\nRelevance (MMR) is widely employed in retrieval scenarios requiring relevance\nand diversity, variations in the parameter $\\lambda$ lead to fluctuations that\ncomplicate the optimization trajectory in vector spaces. This obscures the\ndirection of improvement and highlights the lack of a robust theoretical\nanalysis regarding similarity and diversity constraints in retrieval processes.\nTo address these challenges, this paper introduces a novel approach that\ncharacterizes both constraints through the relationship between the sum vector\nand the query vector. 
The proximity of these vectors ensures the similarity\nconstraint, while requiring individual vectors within the sum vector to diverge\nin their alignment with the query vector satisfies the diversity constraint. We\nfirst formulate a new combinatorial optimization problem, selecting k vectors\nfrom a candidate set such that their sum vector maximally aligns with the query\nvector, and demonstrate that this problem is NP-complete. This result\nunderscores the inherent difficulty of simultaneously achieving similarity and\ndiversity in vector retrieval, thereby providing a theoretical foundation for\nfuture research. Subsequently, we present the heuristic algorithm Vectors\nRetrieval with Similarity and Diversity, VRSD, which features a clear\noptimization objective and eliminates the need for preset parameters. VRSD also\nachieves a modest reduction in time complexity compared to MMR. Empirical\nvalidation confirms that VRSD significantly outperforms MMR across various\ndatasets.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17897v2","updated":"2024-11-14T17:46:04Z","published":"2024-10-23T14:15:07Z","title":"Value Residual Learning For Alleviating Attention Concentration In\n Transformers","summary":" Transformers can capture long-range dependencies using self-attention,\nallowing tokens to attend to all others directly. However, stacking multiple\nattention layers leads to attention concentration. One natural way to address\nthis issue is to use cross-layer attention, allowing information from earlier\nlayers to be directly accessible to later layers. However, this approach is\ncomputationally expensive. To address this problem, we propose Transformer with\nresidual value (ResFormer) which approximates cross-layer attention through\nadding a residual connection from the values of the the first layer to all\nsubsequent layers. 
Based on this method, one variant is the Transformer with\nsingle layer value (SVFormer), where all layers share the same value embedding\nfrom first layer, reducing the $KV$ cache by nearly 50\\%. Comprehensive\nempirical evidence demonstrates that ResFormer mitigates attention\nconcentration problem in deeper layers and enhances representation across most\nlayers, outperforming the vanilla Transformer, DenseFormer, and NeuTRENO in\ntraining error as well as downstream tasks. Further visualization results\nsuggest that Resformer alleviates attention sinks through avoiding value-state\ndrains. SVFormer trains significantly faster than the vanilla Transformer and\nperforms better than other methods like GQA and CLA, with performance\ninfluenced by sequence length and cumulative learning rate.\n","authors":["Zhanchao Zhou","Tianyi Wu","Zhiyun Jiang","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2410.17897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09613v1","updated":"2024-11-14T17:33:36Z","published":"2024-11-14T17:33:36Z","title":"PTR: Precision-Driven Tool Recommendation for Large Language Models","summary":" By augmenting Large Language Models (LLMs) with external tools, their\ncapacity to solve complex problems has been significantly enhanced. However,\ndespite ongoing advancements in the parsing capabilities of LLMs, incorporating\nall available tools simultaneously in the prompt remains impractical due to the\nvast number of external tools. Consequently, it is essential to provide LLMs\nwith a precise set of tools tailored to the specific task, considering both\nquantity and quality. Current tool retrieval methods primarily focus on\nrefining the ranking list of tools and directly packaging a fixed number of\ntop-ranked tools as the tool set. 
However, these approaches often fail to equip\nLLMs with the optimal set of tools prior to execution, since the optimal number\nof tools for different tasks could be different, resulting in inefficiencies\nsuch as redundant or unsuitable tools, which impede immediate access to the\nmost relevant tools. This paper addresses the challenge of recommending precise\ntoolsets for LLMs. We introduce the problem of tool recommendation, define its\nscope, and propose a novel Precision-driven Tool Recommendation (PTR) approach.\nPTR captures an initial, concise set of tools by leveraging historical tool\nbundle usage and dynamically adjusts the tool set by performing tool matching,\nculminating in a multi-view-based tool addition. Additionally, we present a new\ndataset, RecTools, and a metric, TRACC, designed to evaluate the effectiveness\nof tool recommendation for LLMs. We further validate our design choices through\ncomprehensive experiments, demonstrating promising accuracy across two open\nbenchmarks and our RecTools dataset.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09612v1","updated":"2024-11-14T17:32:03Z","published":"2024-11-14T17:32:03Z","title":"The Moral Foundations Weibo Corpus","summary":" Moral sentiments expressed in natural language significantly influence both\nonline and offline environments, shaping behavioral styles and interaction\npatterns, including social media selfpresentation, cyberbullying, adherence to\nsocial norms, and ethical decision-making. To effectively measure moral\nsentiments in natural language processing texts, it is crucial to utilize\nlarge, annotated datasets that provide nuanced understanding for accurate\nanalysis and modeltraining. However, existing corpora, while valuable, often\nface linguistic limitations. To address this gap in the Chinese language\ndomain,we introduce the Moral Foundation Weibo Corpus. 
This corpus consists of\n25,671 Chinese comments on Weibo, encompassing six diverse topic areas. Each\ncomment is manually annotated by at least three systematically trained\nannotators based on ten moral categories derived from a grounded theory of\nmorality. To assess annotator reliability, we present the kappa testresults, a\ngold standard for measuring consistency. Additionally, we apply several the\nlatest large language models to supplement the manual annotations, conducting\nanalytical experiments to compare their performance and report baseline results\nfor moral sentiment classification.\n","authors":["Renjie Cao","Miaoyan Hu","Jiahan Wei","Baha Ihnaini"],"pdf_url":"https://arxiv.org/pdf/2411.09612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09607v1","updated":"2024-11-14T17:25:43Z","published":"2024-11-14T17:25:43Z","title":"Initial Nugget Evaluation Results for the TREC 2024 RAG Track with the\n AutoNuggetizer Framework","summary":" This report provides an initial look at partial results from the TREC 2024\nRetrieval-Augmented Generation (RAG) Track. We have identified RAG evaluation\nas a barrier to continued progress in information access (and more broadly,\nnatural language processing and artificial intelligence), and it is our hope\nthat we can contribute to tackling the many challenges in this space. The\ncentral hypothesis we explore in this work is that the nugget evaluation\nmethodology, originally developed for the TREC Question Answering Track in\n2003, provides a solid foundation for evaluating RAG systems. As such, our\nefforts have focused on \"refactoring\" this methodology, specifically applying\nlarge language models to both automatically create nuggets and to automatically\nassign nuggets to system answers. 
We call this the AutoNuggetizer framework.\nWithin the TREC setup, we are able to calibrate our fully automatic process\nagainst a manual process whereby nuggets are created by human assessors\nsemi-manually and then assigned manually to system answers. Based on initial\nresults across 21 topics from 45 runs, we observe a strong correlation between\nscores derived from a fully automatic nugget evaluation and a (mostly) manual\nnugget evaluation by human assessors. This suggests that our fully automatic\nevaluation process can be used to guide future iterations of RAG systems.\n","authors":["Ronak Pradeep","Nandan Thakur","Shivani Upadhyay","Daniel Campos","Nick Craswell","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.09607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09595v1","updated":"2024-11-14T17:08:23Z","published":"2024-11-14T17:08:23Z","title":"LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models","summary":" This work explores expanding the capabilities of large language models (LLMs)\npretrained on text to generate 3D meshes within a unified model. This offers\nkey advantages of (1) leveraging spatial knowledge already embedded in LLMs,\nderived from textual sources like 3D tutorials, and (2) enabling conversational\n3D generation and mesh understanding. A primary challenge is effectively\ntokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly.\nTo address this, we introduce LLaMA-Mesh, a novel approach that represents the\nvertex coordinates and face definitions of 3D meshes as plain text, allowing\ndirect integration with LLMs without expanding the vocabulary. We construct a\nsupervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate\n3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs\nas required, and (3) understand and interpret 3D meshes. 
Our work is the first\nto demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge\nfor 3D mesh generation in a text-based format, effectively unifying the 3D and\ntext modalities. LLaMA-Mesh achieves mesh generation quality on par with models\ntrained from scratch while maintaining strong text generation performance.\n","authors":["Zhengyi Wang","Jonathan Lorraine","Yikai Wang","Hang Su","Jun Zhu","Sanja Fidler","Xiaohui Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09595v1.pdf","comment":"See the project website at\n https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/"},{"id":"http://arxiv.org/abs/2411.09587v1","updated":"2024-11-14T16:57:46Z","published":"2024-11-14T16:57:46Z","title":"BabyLM Challenge: Exploring the Effect of Variation Sets on Language\n Model Training Efficiency","summary":" While current large language models have achieved a remarkable success, their\ndata efficiency remains a challenge to overcome. Recently it has been suggested\nthat child-directed speech (CDS) can improve training data efficiency of modern\nlanguage models based on Transformer neural networks. However, it is not yet\nunderstood which specific properties of CDS are effective for training these\nmodels. In the context of the BabyLM Challenge, we focus on Variation Sets\n(VSs), sets of consecutive utterances expressing a similar intent with slightly\ndifferent words and structures, which are ubiquitous in CDS. To assess the\nimpact of VSs on training data efficiency, we augment CDS data with different\nproportions of artificial VSs and use these datasets to train an\nauto-regressive model, GPT-2. We find that the best proportion of VSs depends\non the evaluation benchmark: BLiMP and GLUE scores benefit from the presence of\nVSs, but EWOK scores do not. Additionally, the results vary depending on\nmultiple factors such as the number of epochs and the order of utterance\npresentation. 
Taken together, these findings suggest that VSs can have a\nbeneficial influence on language models, while leaving room for further\ninvestigation.\n","authors":["Akari Haga","Akiyo Fukatsu","Miyu Oba","Arianna Bisazza","Yohei Oseki"],"pdf_url":"https://arxiv.org/pdf/2411.09587v1.pdf","comment":"This paper accepted BabyLM challenge 2024 at CONLL 2024"},{"id":"http://arxiv.org/abs/2411.09547v1","updated":"2024-11-14T16:01:33Z","published":"2024-11-14T16:01:33Z","title":"Piecing It All Together: Verifying Multi-Hop Multimodal Claims","summary":" Existing claim verification datasets often do not require systems to perform\ncomplex reasoning or effectively interpret multimodal evidence. To address\nthis, we introduce a new task: multi-hop multimodal claim verification. This\ntask challenges models to reason over multiple pieces of evidence from diverse\nsources, including text, images, and tables, and determine whether the combined\nmultimodal evidence supports or refutes a given claim. To study this task, we\nconstruct MMCV, a large-scale dataset comprising 16k multi-hop claims paired\nwith multimodal evidence, generated and refined using large language models,\nwith additional input from human feedback. We show that MMCV is challenging\neven for the latest state-of-the-art multimodal large language models,\nespecially as the number of reasoning hops increases. Additionally, we\nestablish a human performance benchmark on a subset of MMCV. 
We hope this\ndataset and its evaluation task will encourage future research in multimodal\nmulti-hop claim verification.\n","authors":["Haoran Wang","Aman Rangapur","Xiongxiao Xu","Yueqing Liang","Haroon Gharwi","Carl Yang","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2411.09547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09539v1","updated":"2024-11-14T15:55:37Z","published":"2024-11-14T15:55:37Z","title":"A Practical Guide to Fine-tuning Language Models with Limited Data","summary":" Employing pre-trained Large Language Models (LLMs) has become the de facto\nstandard in Natural Language Processing (NLP) despite their extensive data\nrequirements. Motivated by the recent surge in research focused on training\nLLMs with limited data, particularly in low-resource domains and languages,\nthis paper surveys recent transfer learning approaches to optimize model\nperformance in downstream tasks where data is scarce. We first address initial\nand continued pre-training strategies to better leverage prior knowledge in\nunseen domains and languages. We then examine how to maximize the utility of\nlimited data during fine-tuning and few-shot learning. The final section takes\na task-specific perspective, reviewing models and methods suited for different\nlevels of data scarcity. Our goal is to provide practitioners with practical\nguidelines for overcoming the challenges posed by constrained data while also\nhighlighting promising directions for future research.\n","authors":["Márton Szép","Daniel Rueckert","Rüdiger von Eisenhart-Rothe","Florian Hinterwimmer"],"pdf_url":"https://arxiv.org/pdf/2411.09539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08278v2","updated":"2024-11-14T15:49:46Z","published":"2024-11-13T01:33:05Z","title":"Knowledge Bases in Support of Large Language Models for Processing Web\n News","summary":" Large Language Models (LLMs) have received considerable interest in wide\napplications lately. 
During pre-training via massive datasets, such a model\nimplicitly memorizes the factual knowledge of trained datasets in its hidden\nparameters. However, knowledge held implicitly in parameters often makes its\nuse by downstream applications ineffective due to the lack of common-sense\nreasoning. In this article, we introduce a general framework that permits to\nbuild knowledge bases with an aid of LLMs, tailored for processing Web news.\nThe framework applies a rule-based News Information Extractor (NewsIE) to news\nitems for extracting their relational tuples, referred to as knowledge bases,\nwhich are then graph-convoluted with the implicit knowledge facts of news items\nobtained by LLMs, for their classification. It involves two lightweight\ncomponents: 1) NewsIE: for extracting the structural information of every news\nitem, in the form of relational tuples; 2) BERTGraph: for graph convoluting the\nimplicit knowledge facts with relational tuples extracted by NewsIE. We have\nevaluated our framework under different news-related datasets for news category\nclassification, with promising experimental results.\n","authors":["Yihe Zhang","Nabin Pakka","Nian-Feng Tzeng"],"pdf_url":"https://arxiv.org/pdf/2411.08278v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09510v1","updated":"2024-11-14T15:19:01Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. 
We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong-Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09497v1","updated":"2024-11-14T15:04:17Z","published":"2024-11-14T15:04:17Z","title":"The Use of Readability Metrics in Legal Text: A Systematic Literature\n Review","summary":" Understanding the text in legal documents can be challenging due to their\ncomplex structure and the inclusion of domain-specific jargon. Laws and\nregulations are often crafted in such a manner that engagement with them\nrequires formal training, potentially leading to vastly different\ninterpretations of the same texts. Linguistic complexity is an important\ncontributor to the difficulties experienced by readers. Simplifying texts could\nenhance comprehension across a broader audience, not just among trained\nprofessionals. Various metrics have been developed to measure document\nreadability. Therefore, we adopted a systematic review approach to examine the\nlinguistic and readability metrics currently employed for legal and regulatory\ntexts. A total of 3566 initial papers were screened, with 34 relevant studies\nfound and further assessed. Our primary objective was to identify which current\nmetrics were applied for evaluating readability within the legal field. Sixteen\ndifferent metrics were identified, with the Flesch-Kincaid Grade Level being\nthe most frequently used method. The majority of studies (73.5%) were found in\nthe domain of \"informed consent forms\". 
From the analysis, it is clear that not\nall legal domains are well represented in terms of readability metrics and that\nthere is a further need to develop more consensus on which metrics should be\napplied for legal documents.\n","authors":["Yu Han","Aaron Ceross","Jeroen H. M. Bergmann"],"pdf_url":"https://arxiv.org/pdf/2411.09497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09492v1","updated":"2024-11-14T14:58:38Z","published":"2024-11-14T14:58:38Z","title":"MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in\n LLMs","summary":" Large language models (LLMs) excel in high-resource languages but face\nnotable challenges in low-resource languages like Mongolian. This paper\naddresses these challenges by categorizing capabilities into language abilities\n(syntax and semantics) and cognitive abilities (knowledge and reasoning). To\nsystematically evaluate these areas, we developed MM-Eval, a specialized\ndataset based on Modern Mongolian Language Textbook I and enriched with WebQSP\nand MGSM datasets.\n Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat,\nLlama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models\nperformed better on syntactic tasks than semantic tasks, highlighting a gap in\ndeeper language understanding; and 2) knowledge tasks showed a moderate\ndecline, suggesting that models can transfer general knowledge from\nhigh-resource to low-resource contexts.\n The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge,\nand 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in\nlow-resource languages like Mongolian. 
The dataset is available at\nhttps://github.com/joenahm/MM-Eval.\n","authors":["Mengyuan Zhang","Ruihui Wang","Bo Xia","Yuan Sun","Xiaobing Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03979v3","updated":"2024-11-14T14:34:13Z","published":"2024-10-04T23:37:21Z","title":"Improving Arabic Multi-Label Emotion Classification using Stacked\n Embeddings and Hybrid Loss Function","summary":" In multi-label emotion classification, particularly for low-resource\nlanguages like Arabic, the challenges of class imbalance and label correlation\nhinder model performance, especially in accurately predicting minority\nemotions. To address these issues, this study proposes a novel approach that\ncombines stacked embeddings, meta-learning, and a hybrid loss function to\nenhance multi-label emotion classification for the Arabic language. The study\nextracts contextual embeddings from three fine-tuned language\nmodels-ArabicBERT, MarBERT, and AraBERT-which are then stacked to form enriched\nembeddings. A meta-learner is trained on these stacked embeddings, and the\nresulting concatenated representations are provided as input to a Bi-LSTM\nmodel, followed by a fully connected neural network for multi-label\nclassification. To further improve performance, a hybrid loss function is\nintroduced, incorporating class weighting, label correlation matrix, and\ncontrastive learning, effectively addressing class imbalances and improving the\nhandling of label correlations. Extensive experiments validate the proposed\nmodel's performance across key metrics such as Precision, Recall, F1-Score,\nJaccard Accuracy, and Hamming Loss. The class-wise performance analysis\ndemonstrates the hybrid loss function's ability to significantly reduce\ndisparities between majority and minority classes, resulting in a more balanced\nemotion classification. 
An ablation study highlights the contribution of each\ncomponent, showing the superiority of the model compared to baseline approaches\nand other loss functions. This study not only advances multi-label emotion\nclassification for Arabic but also presents a generalizable framework that can\nbe adapted to other languages and domains, providing a significant step forward\nin addressing the challenges of low-resource emotion classification tasks.\n","authors":["Muhammad Azeem Aslam","Wang Jun","Nisar Ahmed","Muhammad Imran Zaman","Li Yanan","Hu Hongfei","Wang Shiyu","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2410.03979v3.pdf","comment":"The paper is submitted in Scientific Reports and is currently under\n review"},{"id":"http://arxiv.org/abs/2402.06900v5","updated":"2024-11-14T14:28:58Z","published":"2024-02-10T07:55:27Z","title":"Can LLMs Recognize Toxicity? A Structured Investigation Framework and\n Toxicity Metric","summary":" In the pursuit of developing Large Language Models (LLMs) that adhere to\nsocietal standards, it is imperative to detect the toxicity in the generated\ntext. The majority of existing toxicity metrics rely on encoder models trained\non specific toxicity datasets, which are susceptible to out-of-distribution\n(OOD) problems and depend on the dataset's definition of toxicity. In this\npaper, we introduce a robust metric grounded on LLMs to flexibly measure\ntoxicity according to the given definition. We first analyze the toxicity\nfactors, followed by an examination of the intrinsic toxic attributes of LLMs\nto ascertain their suitability as evaluators. Finally, we evaluate the\nperformance of our metric with detailed analysis. Our empirical results\ndemonstrate outstanding performance in measuring toxicity within verified\nfactors, improving on conventional metrics by 12 points in the F1 score. 
Our\nfindings also indicate that upstream toxicity significantly influences\ndownstream metrics, suggesting that LLMs are unsuitable for toxicity\nevaluations within unverified factors.\n","authors":["Hyukhun Koh","Dohyung Kim","Minwoo Lee","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2402.06900v5.pdf","comment":"8 page long"},{"id":"http://arxiv.org/abs/2409.15933v2","updated":"2024-11-14T13:59:15Z","published":"2024-09-24T09:57:25Z","title":"SLIMER-IT: Zero-Shot NER on Italian Language","summary":" Traditional approaches to Named Entity Recognition (NER) frame the task into\na BIO sequence labeling problem. Although these systems often excel in the\ndownstream task at hand, they require extensive annotated data and struggle to\ngeneralize to out-of-distribution input domains and unseen entity types. On the\ncontrary, Large Language Models (LLMs) have demonstrated strong zero-shot\ncapabilities. While several works address Zero-Shot NER in English, little has\nbeen done in other languages. In this paper, we define an evaluation framework\nfor Zero-Shot NER, applying it to the Italian language. Furthermore, we\nintroduce SLIMER-IT, the Italian version of SLIMER, an instruction-tuning\napproach for zero-shot NER leveraging prompts enriched with definition and\nguidelines. Comparisons with other state-of-the-art models, demonstrate the\nsuperiority of SLIMER-IT on never-seen-before entity tags.\n","authors":["Andrew Zamai","Leonardo Rigutini","Marco Maggini","Andrea Zugarini"],"pdf_url":"https://arxiv.org/pdf/2409.15933v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09436v1","updated":"2024-11-14T13:34:16Z","published":"2024-11-14T13:34:16Z","title":"Robot Tasks with Fuzzy Time Requirements from Natural Language\n Instructions","summary":" Natural language allows robot programming to be accessible to everyone.\nHowever, the inherent fuzziness in natural language poses challenges for\ninflexible, traditional robot systems. 
We focus on instructions with fuzzy time\nrequirements (e.g., \"start in a few minutes\"). Building on previous robotics\nresearch, we introduce fuzzy skills. These define an execution by the robot\nwith so-called satisfaction functions representing vague execution time\nrequirements. Such functions express a user's satisfaction over potential\nstarting times for skill execution. When the robot handles multiple fuzzy\nskills, the satisfaction function provides a temporal tolerance window for\nexecution, thus, enabling optimal scheduling based on satisfaction. We\ngeneralized such functions based on individual user expectations with a user\nstudy. The participants rated their satisfaction with an instruction's\nexecution at various times. Our investigations reveal that trapezoidal\nfunctions best approximate the users' satisfaction. Additionally, the results\nsuggest that users are more lenient if the execution is specified further into\nthe future.\n","authors":["Sascha Sucker","Michael Neubauer","Dominik Henrich"],"pdf_url":"https://arxiv.org/pdf/2411.09436v1.pdf","comment":"9 pages, 8 figures, to be published in 2024 IEEE International\n Conference on Robotic Computing (IRC)"},{"id":"http://arxiv.org/abs/2411.09431v1","updated":"2024-11-14T13:29:09Z","published":"2024-11-14T13:29:09Z","title":"Everyone deserves their voice to be heard: Analyzing Predictive Gender\n Bias in ASR Models Applied to Dutch Speech Data","summary":" Recent research has shown that state-of-the-art (SotA) Automatic Speech\nRecognition (ASR) systems, such as Whisper, often exhibit predictive biases\nthat disproportionately affect various demographic groups. This study focuses\non identifying the performance disparities of Whisper models on Dutch speech\ndata from the Common Voice dataset and the Dutch National Public Broadcasting\norganisation. We analyzed the word error rate, character error rate and a\nBERT-based semantic similarity across gender groups. 
We used the moral\nframework of Weerts et al. (2022) to assess quality of service harms and\nfairness, and to provide a nuanced discussion on the implications of these\nbiases, particularly for automatic subtitling. Our findings reveal substantial\ndisparities in word error rate (WER) among gender groups across all model\nsizes, with bias identified through statistical testing.\n","authors":["Rik Raes","Saskia Lensink","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2411.09431v1.pdf","comment":"Accepted at ECML PKDD 2024, 4th Workshop on Bias and Fairness in AI\n (BIAS)"},{"id":"http://arxiv.org/abs/2411.09389v1","updated":"2024-11-14T12:05:35Z","published":"2024-11-14T12:05:35Z","title":"Less is More: Unseen Domain Fake News Detection via Causal Propagation\n Substructures","summary":" The spread of fake news on social media poses significant threats to\nindividuals and society. Text-based and graph-based models have been employed\nfor fake news detection by analysing news content and propagation networks,\nshowing promising results in specific scenarios. However, these data-driven\nmodels heavily rely on pre-existing in-distribution data for training, limiting\ntheir performance when confronted with fake news from emerging or previously\nunseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news\nis a challenging yet critical task. In this paper, we introduce the Causal\nSubgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to\nenhance zero-shot fake news detection by extracting causal substructures from\npropagation graphs using in-distribution data and generalising this approach to\nOOD data. The model employs a graph neural network based mask generation\nprocess to identify dominant nodes and edges within the propagation graph,\nusing these substructures for fake news detection. 
Additionally, the\nperformance of CSDA is further improved through contrastive learning in\nfew-shot scenarios, where a limited amount of OOD data is available for\ntraining. Extensive experiments on public social media datasets demonstrate\nthat CSDA effectively handles OOD fake news detection, achieving a 7 to 16\npercents accuracy improvement over other state-of-the-art models.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2411.09389v1.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2406.18406v2","updated":"2024-11-14T10:55:14Z","published":"2024-06-26T14:57:38Z","title":"IRCAN: Mitigating Knowledge Conflicts in LLM Generation via Identifying\n and Reweighting Context-Aware Neurons","summary":" It is widely acknowledged that large language models (LLMs) encode a vast\nreservoir of knowledge after being trained on mass data. Recent studies\ndisclose knowledge conflicts in LLM generation, wherein outdated or incorrect\nparametric knowledge (i.e., encoded knowledge) contradicts new knowledge\nprovided in the context. To mitigate such knowledge conflicts, we propose a\nnovel framework, IRCAN (Identifying and Reweighting Context-Aware Neurons) to\ncapitalize on neurons that are crucial in processing contextual cues.\nSpecifically, IRCAN first identifies neurons that significantly contribute to\ncontext processing, utilizing a context-aware attribution score derived from\nintegrated gradients. Subsequently, the identified context-aware neurons are\nstrengthened via reweighting. In doing so, we steer LLMs to generate\ncontext-sensitive outputs with respect to the new knowledge provided in the\ncontext. Extensive experiments conducted across a variety of models and tasks\ndemonstrate that IRCAN not only achieves remarkable improvements in handling\nknowledge conflicts but also offers a scalable, plug-and-play solution that can\nbe integrated seamlessly with existing models. 
Our codes are released at\nhttps://github.com/danshi777/IRCAN.\n","authors":["Dan Shi","Renren Jin","Tianhao Shen","Weilong Dong","Xinwei Wu","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.18406v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09339v1","updated":"2024-11-14T10:36:19Z","published":"2024-11-14T10:36:19Z","title":"Re-Parameterization of Lightweight Transformer for On-Device Speech\n Emotion Recognition","summary":" With the increasing implementation of machine learning models on edge or\nInternet-of-Things (IoT) devices, deploying advanced models on\nresource-constrained IoT devices remains challenging. Transformer models, a\ncurrently dominant neural architecture, have achieved great success in broad\ndomains but their complexity hinders its deployment on IoT devices with limited\ncomputation capability and storage size. Although many model compression\napproaches have been explored, they often suffer from notorious performance\ndegradation. To address this issue, we introduce a new method, namely\nTransformer Re-parameterization, to boost the performance of lightweight\nTransformer models. It consists of two processes: the High-Rank Factorization\n(HRF) process in the training stage and the deHigh-Rank Factorization (deHRF)\nprocess in the inference stage. In the former process, we insert an additional\nlinear layer before the Feed-Forward Network (FFN) of the lightweight\nTransformer. It is supposed that the inserted HRF layers can enhance the model\nlearning capability. In the later process, the auxiliary HRF layer will be\nmerged together with the following FFN layer into one linear layer and thus\nrecover the original structure of the lightweight model. To examine the\neffectiveness of the proposed method, we evaluate it on three widely used\nTransformer variants, i.e., ConvTransformer, Conformer, and SpeechFormer\nnetworks, in the application of speech emotion recognition on the IEMOCAP, M3ED\nand DAIC-WOZ datasets. 
Experimental results show that our proposed method\nconsistently improves the performance of lightweight Transformers, even making\nthem comparable to large models. The proposed re-parameterization approach\nenables advanced Transformer models to be deployed on resource-constrained IoT\ndevices.\n","authors":["Zixing Zhang","Zhongren Dong","Weixiang Xu","Jing Han"],"pdf_url":"https://arxiv.org/pdf/2411.09339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09318v1","updated":"2024-11-14T10:00:33Z","published":"2024-11-14T10:00:33Z","title":"DriveThru: a Document Extraction Platform and Benchmark Datasets for\n Indonesian Local Language Archives","summary":" Indonesia is one of the most diverse countries linguistically. However,\ndespite this linguistic diversity, Indonesian languages remain underrepresented\nin Natural Language Processing (NLP) research and technologies. In the past two\nyears, several efforts have been conducted to construct NLP resources for\nIndonesian languages. However, most of these efforts have been focused on\ncreating manual resources thus difficult to scale to more languages. Although\nmany Indonesian languages do not have a web presence, locally there are\nresources that document these languages well in printed forms such as books,\nmagazines, and newspapers. Digitizing these existing resources will enable\nscaling of Indonesian language resource construction to many more languages. In\nthis paper, we propose an alternative method of creating datasets by digitizing\ndocuments, which have not previously been used to build digital language\nresources in Indonesia. DriveThru is a platform for extracting document content\nutilizing Optical Character Recognition (OCR) techniques in its system to\nprovide language resource building with less manual effort and cost. 
This paper\nalso studies the utility of current state-of-the-art LLM for post-OCR\ncorrection to show the capability of increasing the character accuracy rate\n(CAR) and word accuracy rate (WAR) compared to off-the-shelf OCR.\n","authors":["MohammadRifqi Farhansyah","Muhammad Zuhdi Fikri Johari","Afinzaki Amiral","Ayu Purwarianti","Kumara Ari Yuana","Derry Tanti Wijaya"],"pdf_url":"https://arxiv.org/pdf/2411.09318v1.pdf","comment":"12 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2410.14979v4","updated":"2024-11-14T09:17:48Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. 
As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09297v1","updated":"2024-11-14T09:16:48Z","published":"2024-11-14T09:16:48Z","title":"DTELS: Towards Dynamic Granularity of Timeline Summarization","summary":" The rapid proliferation of online news has posed significant challenges in\ntracking the continuous development of news topics. Traditional timeline\nsummarization constructs a chronological summary of the events but often lacks\nthe flexibility to meet the diverse granularity needs. To overcome this\nlimitation, we introduce a new paradigm, Dynamic-granularity TimELine\nSummarization, (DTELS), which aims to construct adaptive timelines based on\nuser instructions or requirements. This paper establishes a comprehensive\nbenchmark for DTLES that includes: (1) an evaluation framework grounded in\njournalistic standards to assess the timeline quality across four dimensions:\nInformativeness, Granular Consistency, Factuality, and Coherence; (2) a\nlarge-scale, multi-source dataset with multiple granularity timeline\nannotations based on a consensus process to facilitate authority; (3) extensive\nexperiments and analysis with two proposed solutions based on Large Language\nModels (LLMs) and existing state-of-the-art TLS methods. The experimental\nresults demonstrate the effectiveness of LLM-based solutions. 
However, even the\nmost advanced LLMs struggle to consistently generate timelines that are both\ninformative and granularly consistent, highlighting the challenges of the DTELS\ntask.\n","authors":["Chenlong Zhang","Tong Zhou","Pengfei Cao","Zhuoran Jin","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.09297v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.09289v1","updated":"2024-11-14T09:03:54Z","published":"2024-11-14T09:03:54Z","title":"StreamAdapter: Efficient Test Time Adaptation from Contextual Streams","summary":" In-context learning (ICL) allows large language models (LLMs) to adapt to new\ntasks directly from the given demonstrations without requiring gradient\nupdates. While recent advances have expanded context windows to accommodate\nmore demonstrations, this approach increases inference costs without\nnecessarily improving performance. To mitigate these issues, We propose\nStreamAdapter, a novel approach that directly updates model parameters from\ncontext at test time, eliminating the need for explicit in-context\ndemonstrations. StreamAdapter employs context mapping and weight absorption\nmechanisms to dynamically transform ICL demonstrations into parameter updates\nwith minimal additional parameters. By reducing reliance on numerous in-context\nexamples, StreamAdapter significantly reduce inference costs and allows for\nefficient inference with constant time complexity, regardless of demonstration\ncount. Extensive experiments across diverse tasks and model architectures\ndemonstrate that StreamAdapter achieves comparable or superior adaptation\ncapability to ICL while requiring significantly fewer demonstrations. 
The\nsuperior task adaptation and context encoding capabilities of StreamAdapter on\nboth language understanding and generation tasks provides a new perspective for\nadapting LLMs at test time using context, allowing for more efficient\nadaptation across scenarios and more cost-effective inference\n","authors":["Dilxat Muhtar","Yelong Shen","Yaming Yang","Xiaodong Liu","Yadong Lu","Jianfeng Liu","Yuefeng Zhan","Hao Sun","Weiwei Deng","Feng Sun","Xueliang Zhang","Jianfeng Gao","Weizhu Chen","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09289v1.pdf","comment":"22 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2411.09273v1","updated":"2024-11-14T08:22:42Z","published":"2024-11-14T08:22:42Z","title":"Cross-Modal Consistency in Multimodal Large Language Models","summary":" Recent developments in multimodal methodologies have marked the beginning of\nan exciting era for models adept at processing diverse data types, encompassing\ntext, audio, and visual content. Models like GPT-4V, which merge computer\nvision with advanced language processing, exhibit extraordinary proficiency in\nhandling intricate tasks that require a simultaneous understanding of both\ntextual and visual information. Prior research efforts have meticulously\nevaluated the efficacy of these Vision Large Language Models (VLLMs) in various\ndomains, including object detection, image captioning, and other related\nfields. However, existing analyses have often suffered from limitations,\nprimarily centering on the isolated evaluation of each modality's performance\nwhile neglecting to explore their intricate cross-modal interactions.\nSpecifically, the question of whether these models achieve the same level of\naccuracy when confronted with identical task instances across different\nmodalities remains unanswered. In this study, we take the initiative to delve\ninto the interaction and comparison among these modalities of interest by\nintroducing a novel concept termed cross-modal consistency. 
Furthermore, we\npropose a quantitative evaluation framework founded on this concept. Our\nexperimental findings, drawn from a curated collection of parallel\nvision-language datasets developed by us, unveil a pronounced inconsistency\nbetween the vision and language modalities within GPT-4V, despite its portrayal\nas a unified multimodal model. Our research yields insights into the\nappropriate utilization of such models and hints at potential avenues for\nenhancing their design.\n","authors":["Xiang Zhang","Senyu Li","Ning Shi","Bradley Hauer","Zijun Wu","Grzegorz Kondrak","Muhammad Abdul-Mageed","Laks V. S. Lakshmanan"],"pdf_url":"https://arxiv.org/pdf/2411.09273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07176v2","updated":"2024-11-14T08:20:22Z","published":"2024-11-11T17:56:28Z","title":"More Expressive Attention with Negative Weights","summary":" We propose a novel attention mechanism, named Cog Attention, that enables\nattention weights to be negative for enhanced expressiveness, which stems from\ntwo key factors: (1) Cog Attention can shift the token deletion and copying\nfunction from a static OV matrix to dynamic QK inner products, with the OV\nmatrix now focusing more on refinement or modification. The attention head can\nsimultaneously delete, copy, or retain tokens by assigning them negative,\npositive, or minimal attention weights, respectively. As a result, a single\nattention head becomes more flexible and expressive. (2) Cog Attention improves\nthe model's robustness against representational collapse, which can occur when\nearlier tokens are over-squashed into later positions, leading to homogeneous\nrepresentations. Negative weights reduce effective information paths from\nearlier to later tokens, helping to mitigate this issue. We develop\nTransformer-like models which use Cog Attention as attention modules, including\ndecoder-only models for language modeling and U-ViT diffusion models for image\ngeneration. 
Experiments show that models using Cog Attention exhibit superior\nperformance compared to those employing traditional softmax attention modules.\nOur approach suggests a promising research direction for rethinking and\nbreaking the entrenched constraints of traditional softmax attention, such as\nthe requirement for non-negative weights.\n","authors":["Ang Lv","Ruobing Xie","Shuaipeng Li","Jiayi Liao","Xingwu Sun","Zhanhui Kang","Di Wang","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2411.07176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09259v1","updated":"2024-11-14T07:51:51Z","published":"2024-11-14T07:51:51Z","title":"Jailbreak Attacks and Defenses against Multimodal Generative Models: A\n Survey","summary":" The rapid evolution of multimodal foundation models has led to significant\nadvancements in cross-modal understanding and generation across diverse\nmodalities, including text, images, audio, and video. However, these models\nremain susceptible to jailbreak attacks, which can bypass built-in safety\nmechanisms and induce the production of potentially harmful content.\nConsequently, understanding the methods of jailbreak attacks and existing\ndefense mechanisms is essential to ensure the safe deployment of multimodal\ngenerative models in real-world scenarios, particularly in security-sensitive\napplications. To provide comprehensive insight into this topic, this survey\nreviews jailbreak and defense in multimodal generative models. First, given the\ngeneralized lifecycle of multimodal jailbreak, we systematically explore\nattacks and corresponding defense strategies across four levels: input,\nencoder, generator, and output. Based on this analysis, we present a detailed\ntaxonomy of attack methods, defense mechanisms, and evaluation frameworks\nspecific to multimodal generative models. 
Additionally, we cover a wide range\nof input-output configurations, including modalities such as Any-to-Text,\nAny-to-Vision, and Any-to-Any within generative systems. Finally, we highlight\ncurrent research challenges and propose potential directions for future\nresearch.The open-source repository corresponding to this work can be found at\nhttps://github.com/liuxuannan/Awesome-Multimodal-Jailbreak.\n","authors":["Xuannan Liu","Xing Cui","Peipei Li","Zekun Li","Huaibo Huang","Shuhan Xia","Miaoxuan Zhang","Yueying Zou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2411.09259v1.pdf","comment":"ongoing work"},{"id":"http://arxiv.org/abs/2411.09255v1","updated":"2024-11-14T07:41:34Z","published":"2024-11-14T07:41:34Z","title":"DAHL: Domain-specific Automated Hallucination Evaluation of Long-Form\n Text through a Benchmark Dataset in Biomedicine","summary":" We introduce DAHL, a benchmark dataset and automated evaluation system\ndesigned to assess hallucination in long-form text generation, specifically\nwithin the biomedical domain. Our benchmark dataset, meticulously curated from\nbiomedical research papers, consists of 8,573 questions across 29 categories.\nDAHL evaluates fact-conflicting hallucinations in Large Language Models (LLMs)\nby deconstructing responses into atomic units, each representing a single piece\nof information. The accuracy of these responses is averaged to produce the DAHL\nScore, offering a more in-depth evaluation of hallucinations compared to\nprevious methods that rely on multiple-choice tasks. We conduct experiments\nwith 8 different models, finding that larger models tend to hallucinate less;\nhowever, beyond a model size of 7 to 8 billion parameters, further scaling does\nnot significantly improve factual accuracy. The DAHL Score holds potential as\nan efficient alternative to human-annotated preference labels, being able to be\nexpanded to other specialized domains. 
We release the dataset and code in\npublic.\n","authors":["Jean Seo","Jongwon Lim","Dongjun Jang","Hyopil Shin"],"pdf_url":"https://arxiv.org/pdf/2411.09255v1.pdf","comment":"EMNLP2024/FEVER"},{"id":"http://arxiv.org/abs/2411.09249v1","updated":"2024-11-14T07:28:09Z","published":"2024-11-14T07:28:09Z","title":"Enhancing Financial Domain Adaptation of Language Models via Model\n Augmentation","summary":" The domain adaptation of language models, including large language models\n(LLMs), has become increasingly important as the use of such models continues\nto expand. This study demonstrates the effectiveness of Composition to Augment\nLanguage Models (CALM) in adapting to the financial domain. CALM is a model to\nextend the capabilities of existing models by introducing cross-attention\nbetween two LLMs with different functions. In our experiments, we developed a\nCALM to enhance the financial performance of an LLM with strong response\ncapabilities by leveraging a financial-specialized LLM. Notably, the CALM was\ntrained using a financial dataset different from the one used to train the\nfinancial-specialized LLM, confirming CALM's ability to adapt to various\ndatasets. The models were evaluated through quantitative Japanese financial\nbenchmarks and qualitative response comparisons, demonstrating that CALM\nenables superior responses with higher scores than the original models and\nbaselines. Additionally, comparative experiments on connection points revealed\nthat connecting the middle layers of the models is most effective in\nfacilitating adaptation to the financial domain. 
These findings confirm that\nCALM is a practical approach for adapting LLMs to the financial domain.\n","authors":["Kota Tanabe","Masanori Hirano","Kazuki Matoya","Kentaro Imajo","Hiroki Sakaji","Itsuki Noda"],"pdf_url":"https://arxiv.org/pdf/2411.09249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10880v2","updated":"2024-11-14T07:01:07Z","published":"2024-06-16T10:04:19Z","title":"Exploring the Potential of Multimodal LLM with Knowledge-Intensive\n Multimodal ASR","summary":" Recent advancements in multimodal large language models (MLLMs) have made\nsignificant progress in integrating information across various modalities, yet\nreal-world applications in educational and scientific domains remain\nchallenging. This paper introduces the Multimodal Scientific ASR (MS-ASR) task,\nwhich focuses on transcribing scientific conference videos by leveraging visual\ninformation from slides to enhance the accuracy of technical terminologies.\nRealized that traditional metrics like WER fall short in assessing performance\naccurately, prompting the proposal of severity-aware WER (SWER) that considers\nthe content type and severity of ASR errors. We propose the Scientific Vision\nAugmented ASR (SciVASR) framework as a baseline method, enabling MLLMs to\nimprove transcript quality through post-editing. 
Evaluations of\nstate-of-the-art MLLMs, including GPT-4o, show a 45% improvement over\nspeech-only baselines, highlighting the importance of multimodal information\nintegration.\n","authors":["Minghan Wang","Yuxia Wang","Thuy-Trang Vu","Ehsan Shareghi","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2406.10880v2.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2407.00996v2","updated":"2024-11-14T06:55:27Z","published":"2024-07-01T06:22:38Z","title":"Can Small Language Models Learn, Unlearn, and Retain Noise Patterns?","summary":" Small Language Models (SLMs) are generally considered more compact versions\nof large language models (LLMs). This study investigates the ability of SLMs\nwith parameters between 1 and 3 billion to learn, retain, and subsequently\neliminate different types of noise present in the data. Four pre-trained SLMs\nwere utilized for this: Olmo 1B, Qwen1.5 1.8B, Gemma 2B, and Phi2 2.7B. The\nmodels were instruction-tuned on noise-free data and tested using in-context\nexamples to determine if they could learn noise through examples. Subsequently,\nnoise patterns were introduced in instruction tuning to evaluate the noise\nlearning, unlearning, and retention capabilities of the models. Olmo, the\nsmallest model, was highly sensitive to noise, quickly adapting to noisy\npatterns. Phi2 resisted learning character-level and transliteration noise,\nlikely due to its carefully curated, structured, and high-quality pretraining\ndata. Gemma excelled with transliteration noise, likely benefiting from its\nmultilingual pretraining. 
The findings can be used to develop robust training\nstrategies for SLMs.\n","authors":["Nicy Scaria","Silvester John Joseph Kennedy","Deepak Subramani"],"pdf_url":"https://arxiv.org/pdf/2407.00996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03735v4","updated":"2024-11-14T06:42:51Z","published":"2024-01-08T08:54:22Z","title":"Language Models Encode the Value of Numbers Linearly","summary":" Large language models (LLMs) have exhibited impressive competence in various\ntasks, but their internal mechanisms on mathematical problems are still\nunder-explored. In this paper, we study a fundamental question: how language\nmodels encode the value of numbers, a basic element in math. To study the\nquestion, we construct a synthetic dataset comprising addition problems and\nutilize linear probes to read out input numbers from the hidden states.\nExperimental results support the existence of encoded number values in LLMs on\ndifferent layers, and these values can be extracted via linear probes. Further\nexperiments show that LLMs store their calculation results in a similar manner,\nand we can intervene the output via simple vector additions, proving the causal\nconnection between encoded numbers and language model outputs. 
Our research\nprovides evidence that LLMs encode the value of numbers linearly, offering\ninsights for better exploring, designing, and utilizing numeric information in\nLLMs.\n","authors":["Fangwei Zhu","Damai Dai","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2401.03735v4.pdf","comment":"The code and data are available at\n https://github.com/solitaryzero/NumProbe"},{"id":"http://arxiv.org/abs/2411.09214v1","updated":"2024-11-14T06:20:21Z","published":"2024-11-14T06:20:21Z","title":"HateGPT: Unleashing GPT-3.5 Turbo to Combat Hate Speech on X","summary":" The widespread use of social media platforms like Twitter and Facebook has\nenabled people of all ages to share their thoughts and experiences, leading to\nan immense accumulation of user-generated content. However, alongside the\nbenefits, these platforms also face the challenge of managing hate speech and\noffensive content, which can undermine rational discourse and threaten\ndemocratic values. As a result, there is a growing need for automated methods\nto detect and mitigate such content, especially given the complexity of\nconversations that may require contextual analysis across multiple languages,\nincluding code-mixed languages like Hinglish, German-English, and Bangla. We\nparticipated in the English task where we have to classify English tweets into\ntwo categories namely Hate and Offensive and Non Hate-Offensive. In this work,\nwe experiment with state-of-the-art large language models like GPT-3.5 Turbo\nvia prompting to classify tweets into Hate and Offensive or Non Hate-Offensive.\nIn this study, we evaluate the performance of a classification model using\nMacro-F1 scores across three distinct runs. The Macro-F1 score, which balances\nprecision and recall across all classes, is used as the primary metric for\nmodel evaluation. The scores obtained are 0.756 for run 1, 0.751 for run 2, and\n0.754 for run 3, indicating a high level of performance with minimal variance\namong the runs. 
The results suggest that the model consistently performs well\nin terms of precision and recall, with run 1 showing the highest performance.\nThese findings highlight the robustness and reliability of the model across\ndifferent runs.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2411.09214v1.pdf","comment":"Accepted at FIRE 2024 (Track: Hate Speech and Offensive Content\n Identification in English and Indo-Aryan Languages (HASOC)). arXiv admin\n note: text overlap with arXiv:2411.05039, arXiv:2411.06946"},{"id":"http://arxiv.org/abs/2411.09213v1","updated":"2024-11-14T06:19:18Z","published":"2024-11-14T06:19:18Z","title":"Comprehensive and Practical Evaluation of Retrieval-Augmented Generation\n Systems for Medical Question Answering","summary":" Retrieval-augmented generation (RAG) has emerged as a promising approach to\nenhance the performance of large language models (LLMs) in knowledge-intensive\ntasks such as those from medical domain. However, the sensitive nature of the\nmedical domain necessitates a completely accurate and trustworthy system. While\nexisting RAG benchmarks primarily focus on the standard retrieve-answer\nsetting, they overlook many practical scenarios that measure crucial aspects of\na reliable medical system. This paper addresses this gap by providing a\ncomprehensive evaluation framework for medical question-answering (QA) systems\nin a RAG setting for these situations, including sufficiency, integration, and\nrobustness. 
We introduce Medical Retrieval-Augmented Generation Benchmark\n(MedRGB) that provides various supplementary elements to four medical QA\ndatasets for testing LLMs' ability to handle these specific scenarios.\nUtilizing MedRGB, we conduct extensive evaluations of both state-of-the-art\ncommercial LLMs and open-source models across multiple retrieval conditions.\nOur experimental results reveals current models' limited ability to handle\nnoise and misinformation in the retrieved documents. We further analyze the\nLLMs' reasoning processes to provides valuable insights and future directions\nfor developing RAG systems in this critical medical domain.\n","authors":["Nghia Trung Ngo","Chien Van Nguyen","Franck Dernoncourt","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10570v6","updated":"2024-11-14T06:09:47Z","published":"2023-10-16T16:45:12Z","title":"On Context Utilization in Summarization with Large Language Models","summary":" Large language models (LLMs) excel in abstractive summarization tasks,\ndelivering fluent and pertinent summaries. Recent advancements have extended\ntheir capabilities to handle long-input contexts, exceeding 100k tokens.\nHowever, in question answering, language models exhibit uneven utilization of\ntheir input context. They tend to favor the initial and final segments,\nresulting in a U-shaped performance pattern concerning where the answer is\nlocated within the input. This bias raises concerns, particularly in\nsummarization where crucial content may be dispersed throughout the source\ndocument(s). Besides, in summarization, mapping facts from the source to the\nsummary is not trivial as salient content is usually re-phrased. In this paper,\nwe conduct the first comprehensive study on context utilization and position\nbias in summarization. Our analysis encompasses 6 LLMs, 10 datasets, and 5\nevaluation metrics. 
We introduce a new evaluation benchmark called MiddleSum on\nthe which we benchmark two alternative inference methods to alleviate position\nbias: hierarchical summarization and incremental summarization. Our code and\ndata can be found here: https://github.com/ntunlp/MiddleSum.\n","authors":["Mathieu Ravaut","Aixin Sun","Nancy F. Chen","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2310.10570v6.pdf","comment":"ACL 2024. 9 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.00476v3","updated":"2024-11-14T06:06:09Z","published":"2024-06-29T15:47:28Z","title":"Large Language Models for Power Scheduling: A User-Centric Approach","summary":" While traditional optimization and scheduling schemes are designed to meet\nfixed, predefined system requirements, future systems are moving toward\nuser-driven approaches and personalized services, aiming to achieve high\nquality-of-experience (QoE) and flexibility. This challenge is particularly\npronounced in wireless and digitalized energy networks, where users'\nrequirements have largely not been taken into consideration due to the lack of\na common language between users and machines. The emergence of powerful large\nlanguage models (LLMs) marks a radical departure from traditional\nsystem-centric methods into more advanced user-centric approaches by providing\na natural communication interface between users and devices. In this paper, for\nthe first time, we introduce a novel architecture for resource scheduling\nproblems by constructing three LLM agents to convert an arbitrary user's voice\nrequest (VRQ) into a resource allocation vector. Specifically, we design an LLM\nintent recognition agent to translate the request into an optimization problem\n(OP), an LLM OP parameter identification agent, and an LLM OP solving agent. To\nevaluate system performance, we construct a database of typical VRQs in the\ncontext of electric vehicle (EV) charging. As a proof of concept, we primarily\nuse Llama 3 8B. 
Through testing with different prompt engineering scenarios,\nthe obtained results demonstrate the efficiency of the proposed architecture.\nThe conducted performance analysis allows key insights to be extracted. For\ninstance, having a larger set of candidate OPs to model the real-world problem\nmight degrade the final performance because of a higher recognition/OP\nclassification noise level. All results and codes are open source.\n","authors":["Thomas Mongaillard","Samson Lasaulce","Othman Hicheur","Chao Zhang","Lina Bariah","Vineeth S. Varma","Hang Zou","Qiyang Zhao","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2407.00476v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09593v4","updated":"2024-11-14T06:00:39Z","published":"2022-12-19T16:29:26Z","title":"Unsupervised Summarization Re-ranking","summary":" With the rise of task-specific pre-training objectives, abstractive\nsummarization models like PEGASUS offer appealing zero-shot performance on\ndownstream summarization tasks. However, the performance of such unsupervised\nmodels still lags significantly behind their supervised counterparts. Similarly\nto the supervised setup, we notice a very high variance in quality among\nsummary candidates from these models while only one candidate is kept as the\nsummary output. In this paper, we propose to re-rank summary candidates in an\nunsupervised manner, aiming to close the performance gap between unsupervised\nand supervised models. 
Our approach improves the unsupervised PEGASUS by up to\n7.27% and ChatGPT by up to 6.86% relative mean ROUGE across four widely-adopted\nsummarization benchmarks ; and achieves relative gains of 7.51% (up to 23.73%\nfrom XSum to WikiHow) averaged over 30 zero-shot transfer setups (finetuning on\na dataset, evaluating on another).\n","authors":["Mathieu Ravaut","Shafiq Joty","Nancy Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09593v4.pdf","comment":"9 pages, 1 figure, 10 tables, 23 appendix pages, ACL Findings 2023"},{"id":"http://arxiv.org/abs/2411.08504v2","updated":"2024-11-14T05:51:26Z","published":"2024-11-13T10:42:11Z","title":"Towards Objective and Unbiased Decision Assessments with LLM-Enhanced\n Hierarchical Attention Networks","summary":" How objective and unbiased are we while making decisions? This work\ninvestigates cognitive bias identification in high-stake decision making\nprocess by human experts, questioning its effectiveness in real-world settings,\nsuch as candidates assessments for university admission. We begin with a\nstatistical analysis assessing correlations among different decision points\namong in the current process, which discovers discrepancies that imply\ncognitive bias and inconsistency in decisions. This motivates our exploration\nof bias-aware AI-augmented workflow that surpass human judgment. We propose\nBGM-HAN, an enhanced Hierarchical Attention Network with Byte-Pair Encoding,\nGated Residual Connections and Multi-Head Attention. Using it as a backbone\nmodel, we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow,\nwhich simulate real-world decision-making. 
In our experiments, both the\nproposed model and the agentic workflow significantly improves on both human\njudgment and alternative models, validated with real-world data.\n","authors":["Junhua Liu","Kwan Hui Lim","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08504v2.pdf","comment":"Source code is available at: https://github.com/junhua/bgm-han"},{"id":"http://arxiv.org/abs/2407.12471v2","updated":"2024-11-14T05:49:31Z","published":"2024-07-17T10:49:47Z","title":"Characterization of Political Polarized Users Attacked by Language\n Toxicity on Twitter","summary":" Understanding the dynamics of language toxicity on social media is important\nfor us to investigate the propagation of misinformation and the development of\necho chambers for political scenarios such as U.S. presidential elections.\nRecent research has used large-scale data to investigate the dynamics across\nsocial media platforms. However, research on the toxicity dynamics is not\nenough. This study aims to provide a first exploration of the potential\nlanguage toxicity flow among Left, Right and Center users. Specifically, we aim\nto examine whether Left users were easier to be attacked by language toxicity.\nIn this study, more than 500M Twitter posts were examined. 
It was discovered\nthat Left users received much more toxic replies than Right and Center users.\n","authors":["Wentao Xu"],"pdf_url":"https://arxiv.org/pdf/2407.12471v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09166v1","updated":"2024-11-14T03:54:42Z","published":"2024-11-14T03:54:42Z","title":"Unstructured Text Enhanced Open-domain Dialogue System: A Systematic\n Survey","summary":" Incorporating external knowledge into dialogue generation has been proven to\nbenefit the performance of an open-domain Dialogue System (DS), such as\ngenerating informative or stylized responses, controlling conversation topics.\nIn this article, we study the open-domain DS that uses unstructured text as\nexternal knowledge sources (\\textbf{U}nstructured \\textbf{T}ext\n\\textbf{E}nhanced \\textbf{D}ialogue \\textbf{S}ystem, \\textbf{UTEDS}). The\nexistence of unstructured text entails distinctions between UTEDS and\ntraditional data-driven DS and we aim to analyze these differences. We first\ngive the definition of the UTEDS related concepts, then summarize the recently\nreleased datasets and models. We categorize UTEDS into Retrieval and Generative\nmodels and introduce them from the perspective of model components. The\nretrieval models consist of Fusion, Matching, and Ranking modules, while the\ngenerative models comprise Dialogue and Knowledge Encoding, Knowledge\nSelection, and Response Generation modules. We further summarize the evaluation\nmethods utilized in UTEDS and analyze the current models' performance. 
At last,\nwe discuss the future development trends of UTEDS, hoping to inspire new\nresearch in this field.\n","authors":["Longxuan Ma","Mingda Li","Weinan Zhang","Jiapeng Li","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09166v1.pdf","comment":"45 pages, 3 Figures, 11 Tables"},{"id":"http://arxiv.org/abs/2406.11214v3","updated":"2024-11-14T03:53:56Z","published":"2024-06-17T05:13:25Z","title":"Problematic Tokens: Tokenizer Bias in Large Language Models","summary":" Recent advancements in large language models(LLMs), such as GPT-4 and GPT-4o,\nhave shown exceptional performance, especially in languages with abundant\nresources like English, thanks to extensive datasets that ensure robust\ntraining. Conversely, these models exhibit limitations when processing\nunder-resourced languages such as Chinese and Korean, where issues including\nhallucinatory responses remain prevalent. This paper traces the roots of these\ndisparities to the tokenization process inherent to these models. Specifically,\nit explores how the tokenizers vocabulary, often used to speed up the\ntokenization process and reduce tokens but constructed independently of the\nactual model training data, inadequately represents non-English languages. This\nmisrepresentation results in the propagation of under-trained or untrained\ntokens, which perpetuate biases and pose serious concerns related to data\nsecurity and ethical standards. We aim to dissect the tokenization mechanics of\nGPT-4o, illustrating how its simplified token-handling methods amplify these\nrisks and offer strategic solutions to mitigate associated security and ethical\nissues. 
Through this study, we emphasize the critical need to rethink\ntokenization frameworks to foster more equitable and secure AI technologies.\nThe code and data are available at https://github.com/yeyimilk/LLMGPT4o\n","authors":["Jin Yang","Zhiqiang Wang","Yanbin Lin","Zunduo Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.11214v3.pdf","comment":"11th IEEE Special session on Privacy and Security of Big Data (PSBD\n 2024)"},{"id":"http://arxiv.org/abs/2406.12382v3","updated":"2024-11-14T03:10:45Z","published":"2024-06-18T08:14:28Z","title":"From Instance Training to Instruction Learning: Task Adapters Generation\n from Instructions","summary":" Large language models (LLMs) have acquired the ability to solve general tasks\nby utilizing instruction finetuning (IFT). However, IFT still relies heavily on\ninstance training of extensive task data, which greatly limits the adaptability\nof LLMs to real-world scenarios where labeled task instances are scarce and\nbroader task generalization becomes paramount. Contrary to LLMs, humans acquire\nskills and complete tasks not merely through repeated practice but also by\nunderstanding and following instructional guidelines. This paper is dedicated\nto simulating human learning to address the shortcomings of instance training,\nfocusing on instruction learning to enhance cross-task generalization. Within\nthis context, we introduce Task Adapters Generation from Instructions (TAGI),\nwhich automatically constructs the task-specific model in a parameter\ngeneration manner based on the given task instructions without retraining for\nunseen tasks. Specifically, we utilize knowledge distillation to enhance the\nconsistency between TAGI developed through Learning with Instruction and\ntask-specific models developed through Training with Instance, by aligning the\nlabels, output logits, and adapter parameters between them. 
TAGI is endowed\nwith cross-task generalization capabilities through a two-stage training\nprocess that includes hypernetwork pretraining and finetuning. We evaluate TAGI\non the Super-Natural Instructions and P3 datasets. The experimental results\ndemonstrate that TAGI can match or even outperform traditional meta-trained\nmodels and other hypernetwork models, while significantly reducing\ncomputational requirements.\n","authors":["Huanxuan Liao","Shizhu He","Yao Xu","Yuanzhe Zhang","Yanchao Hao","Shengping Liu","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.12382v3.pdf","comment":"accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08733v2","updated":"2024-11-14T02:36:58Z","published":"2024-11-13T16:15:38Z","title":"Dynamic Rewarding with Prompt Optimization Enables Tuning-free\n Self-Alignment of Language Models","summary":" Aligning Large Language Models (LLMs) traditionally relies on costly training\nand human preference annotations. Self-alignment seeks to reduce these expenses\nby enabling models to align themselves. To further lower costs and achieve\nalignment without any expensive tuning or annotations, we introduce a new\ntuning-free approach for self-alignment, Dynamic Rewarding with Prompt\nOptimization (DRPO). Our approach leverages a search-based optimization\nframework that allows LLMs to iteratively self-improve and craft the optimal\nalignment instructions, all without additional training or human intervention.\nThe core of DRPO is a dynamic rewarding mechanism, which identifies and\nrectifies model-specific alignment weaknesses, allowing LLMs to adapt\nefficiently to diverse alignment challenges. Empirical evaluations on eight\nrecent LLMs, both open- and closed-sourced, demonstrate that DRPO significantly\nenhances alignment performance, with base models outperforming their\nSFT/RLHF-tuned counterparts. 
Moreover, the prompts automatically optimized by\nDRPO surpass those curated by human experts, further validating the\neffectiveness of our approach. Our findings highlight the great potential of\ncurrent LLMs to achieve adaptive self-alignment through inference-time\noptimization, complementing tuning-based alignment methods.\n","authors":["Somanshu Singla","Zhen Wang","Tianyang Liu","Abdullah Ashfaq","Zhiting Hu","Eric P. Xing"],"pdf_url":"https://arxiv.org/pdf/2411.08733v2.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.19258v3","updated":"2024-11-14T01:56:11Z","published":"2024-10-25T02:22:00Z","title":"Not All Heads Matter: A Head-Level KV Cache Compression Method with\n Integrated Retrieval and Reasoning","summary":" Key-Value (KV) caching is a common technique to enhance the computational\nefficiency of Large Language Models (LLMs), but its memory overhead grows\nrapidly with input length. Prior work has shown that not all tokens are equally\nimportant for text generation, proposing layer-level KV cache compression to\nselectively retain key information. Recognizing the distinct roles of attention\nheads in generation, we propose HeadKV, a head-level KV cache compression\nmethod, and HeadKV-R2, which leverages a novel contextual reasoning ability\nestimation for compression. Our approach operates at the level of individual\nheads, estimating their importance for contextual QA tasks that require both\nretrieval and reasoning capabilities. Extensive experiments across diverse\nbenchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct,\nMistral-7B-Instruct), and long-context abilities tests demonstrate that our\nhead-level KV cache compression significantly outperforms strong baselines,\nparticularly in low-resource settings (KV size = 64 & 128). 
Notably, our method\nretains just 1.5% of the KV cache while achieving 97% of the performance of the\nfull KV cache on the contextual question answering benchmark.Codes are\navailable at https://github.com/FYYFU/HeadKV\n","authors":["Yu Fu","Zefan Cai","Abedelkadir Asi","Wayne Xiong","Yue Dong","Wen Xiao"],"pdf_url":"https://arxiv.org/pdf/2410.19258v3.pdf","comment":"18pages"},{"id":"http://arxiv.org/abs/2411.09125v1","updated":"2024-11-14T01:48:08Z","published":"2024-11-14T01:48:08Z","title":"DROJ: A Prompt-Driven Attack against Large Language Models","summary":" Large Language Models (LLMs) have demonstrated exceptional capabilities\nacross various natural language processing tasks. Due to their training on\ninternet-sourced datasets, LLMs can sometimes generate objectionable content,\nnecessitating extensive alignment with human feedback to avoid such outputs.\nDespite massive alignment efforts, LLMs remain susceptible to adversarial\njailbreak attacks, which usually are manipulated prompts designed to circumvent\nsafety mechanisms and elicit harmful responses. Here, we introduce a novel\napproach, Directed Rrepresentation Optimization Jailbreak (DROJ), which\noptimizes jailbreak prompts at the embedding level to shift the hidden\nrepresentations of harmful queries towards directions that are more likely to\nelicit affirmative responses from the model. Our evaluations on LLaMA-2-7b-chat\nmodel show that DROJ achieves a 100\\% keyword-based Attack Success Rate (ASR),\neffectively preventing direct refusals. However, the model occasionally\nproduces repetitive and non-informative responses. To mitigate this, we\nintroduce a helpfulness system prompt that enhances the utility of the model's\nresponses. 
Our code is available at\nhttps://github.com/Leon-Leyang/LLM-Safeguard.\n","authors":["Leyang Hu","Boran Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04997v2","updated":"2024-11-14T01:36:12Z","published":"2024-11-07T18:59:16Z","title":"LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation","summary":" CLIP is one of the most important multimodal foundational models today. What\npowers CLIP's capabilities? The rich supervision signals provided by natural\nlanguage, the carrier of human knowledge, shape a powerful cross-modal\nrepresentation space. However, with the rapid advancements in large language\nmodels LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and\ngeneration are continually being pushed. This raises an intriguing question:\ncan the capabilities of LLMs be harnessed to further improve multimodal\nrepresentation learning? The potential benefits of incorporating LLMs into CLIP\nare clear. LLMs' strong textual understanding can fundamentally improve CLIP's\nability to handle image captions, drastically enhancing its ability to process\nlong and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs\nare trained on a vast corpus of text, possessing open-world knowledge. This\nallows them to expand on caption information during training, increasing the\nefficiency of the learning process. In this paper, we propose LLM2CLIP, a novel\napproach that embraces the power of LLMs to unlock CLIP's potential. By\nfine-tuning the LLM in the caption space with contrastive learning, we extract\nits textual capabilities into the output embeddings, significantly improving\nthe output layer's textual discriminability. We then design an efficient\ntraining process where the fine-tuned LLM acts as a powerful teacher for CLIP's\nvisual encoder. 
Thanks to the LLM's presence, we can now incorporate longer and\nmore complex captions without being restricted by vanilla CLIP's text encoder's\ncontext window and ability limitations. Our experiments demonstrate that this\napproach brings substantial improvements in cross-modal tasks.\n","authors":["Weiquan Huang","Aoqi Wu","Yifan Yang","Xufang Luo","Yuqing Yang","Liang Hu","Qi Dai","Xiyang Dai","Dongdong Chen","Chong Luo","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2411.04997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09116v1","updated":"2024-11-14T01:29:36Z","published":"2024-11-14T01:29:36Z","title":"P-MMEval: A Parallel Multilingual Multitask Benchmark for Consistent\n Evaluation of LLMs","summary":" Recent advancements in large language models (LLMs) showcase varied\nmultilingual capabilities across tasks like translation, code generation, and\nreasoning. Previous assessments often limited their scope to fundamental\nnatural language processing (NLP) or isolated capability-specific tasks. To\nalleviate this drawback, we aim to present a comprehensive multilingual\nmultitask benchmark. First, we present a pipeline for selecting available and\nreasonable benchmarks from massive ones, addressing the oversight in previous\nwork regarding the utility of these benchmarks, i.e., their ability to\ndifferentiate between models being evaluated. Leveraging this pipeline, we\nintroduce P-MMEval, a large-scale benchmark covering effective fundamental and\ncapability-specialized datasets. Furthermore, P-MMEval delivers consistent\nlanguage coverage across various datasets and provides parallel samples.\nFinally, we conduct extensive experiments on representative multilingual model\nseries to compare performances across models, analyze dataset effectiveness,\nexamine prompt impacts on model performances, and explore the relationship\nbetween multilingual performances and factors such as tasks, model sizes, and\nlanguages. 
These insights offer valuable guidance for future research. The\ndataset is available at https://huggingface.co/datasets/Qwen/P-MMEval.\n","authors":["Yidan Zhang","Boyi Deng","Yu Wan","Baosong Yang","Haoran Wei","Fei Huang","Bowen Yu","Junyang Lin","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.09116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09109v1","updated":"2024-11-14T00:52:45Z","published":"2024-11-14T00:52:45Z","title":"Personalized Help for Optimizing Low-Skilled Users' Strategy","summary":" AIs can beat humans in game environments; however, how helpful those agents\nare to human remains understudied. We augment CICERO, a natural language agent\nthat demonstrates superhuman performance in Diplomacy, to generate both move\nand message advice based on player intentions. A dozen Diplomacy games with\nnovice and experienced players, with varying advice settings, show that some of\nthe generated advice is beneficial. It helps novices compete with experienced\nplayers and in some instances even surpass them. The mere presence of advice\ncan be advantageous, even if players do not follow it.\n","authors":["Feng Gu","Wichayaporn Wongkamjan","Jordan Lee Boyd-Graber","Jonathan K. Kummerfeld","Denis Peskoff","Jonathan May"],"pdf_url":"https://arxiv.org/pdf/2411.09109v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2203.02458v2","updated":"2024-11-14T10:15:37Z","published":"2022-03-04T17:41:39Z","title":"Continuous Rating as Reliable Human Evaluation of Simultaneous Speech\n Translation","summary":" Simultaneous speech translation (SST) can be evaluated on simulated online\nevents where human evaluators watch subtitled videos and continuously express\ntheir satisfaction by pressing buttons (so called Continuous Rating).\nContinuous Rating is easy to collect, but little is known about its\nreliability, or relation to comprehension of foreign language document by SST\nusers. 
In this paper, we contrast Continuous Rating with factual questionnaires\non judges with different levels of source language knowledge. Our results show\nthat Continuous Rating is easy and reliable SST quality assessment if the\njudges have at least limited knowledge of the source language. Our study\nindicates users' preferences on subtitle layout and presentation style and,\nmost importantly, provides a significant evidence that users with advanced\nsource language knowledge prefer low latency over fewer re-translations.\n","authors":["Dávid Javorský","Dominik Macháček","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2203.02458v2.pdf","comment":"Published at WMT 2022: https://aclanthology.org/2022.wmt-1.9/"},{"id":"http://arxiv.org/abs/2203.17255v7","updated":"2024-11-14T01:06:47Z","published":"2022-03-29T22:28:30Z","title":"A Cognitive Architecture for Machine Consciousness and Artificial\n Superintelligence: Thought Is Structured by the Iterative Updating of Working\n Memory","summary":" This article provides an analytical framework for how to simulate human-like\nthought processes within a computer. It describes how attention and memory\nshould be structured, updated, and utilized to search for associative additions\nto the stream of thought. The focus is on replicating the dynamics of the\nmammalian working memory system, which features two forms of persistent\nactivity: sustained firing (preserving information on the order of seconds) and\nsynaptic potentiation (preserving information from minutes to hours). The\narticle uses a series of figures to systematically demonstrate how the\niterative updating of these working memory stores provides functional\norganization to behavior, cognition, and awareness.\n In a machine learning implementation, these two memory stores should be\nupdated continuously and in an iterative fashion. 
This means each state should\npreserve a proportion of the coactive representations from the state before it\n(where each representation is an ensemble of neural network nodes). This makes\neach state a revised iteration of the preceding state and causes successive\nconfigurations to overlap and blend with respect to the information they\ncontain. Thus, the set of concepts in working memory will evolve gradually and\nincrementally over time. Transitions between states happen as persistent\nactivity spreads activation energy throughout the hierarchical network,\nsearching long-term memory for the most appropriate representation to be added\nto the global workspace. The result is a chain of associatively linked\nintermediate states capable of advancing toward a solution or goal. Iterative\nupdating is conceptualized here as an information processing strategy, a model\nof working memory, a theory of consciousness, and an algorithm for designing\nand programming artificial intelligence (AI, AGI, and ASI).\n","authors":["Jared Edward Reser"],"pdf_url":"https://arxiv.org/pdf/2203.17255v7.pdf","comment":"88 pages and 53 figures"},{"id":"http://arxiv.org/abs/2407.01603v3","updated":"2024-11-14T23:56:22Z","published":"2024-06-26T17:33:21Z","title":"A Review of Large Language Models and Autonomous Agents in Chemistry","summary":" Large language models (LLMs) have emerged as powerful tools in chemistry,\nsignificantly impacting molecule design, property prediction, and synthesis\noptimization. This review highlights LLM capabilities in these domains and\ntheir potential to accelerate scientific discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to\ninteract with their surrounding environment. These agents perform diverse tasks\nsuch as paper scraping, interfacing with automated laboratories, and synthesis\nplanning. 
As agents are an emerging topic, we extend the scope of our review of\nagents beyond chemistry and discuss across any scientific domains. This review\ncovers the recent history, current capabilities, and design of LLMs and\nautonomous agents, addressing specific challenges, opportunities, and future\ndirections in chemistry. Key challenges include data quality and integration,\nmodel interpretability, and the need for standard benchmarks, while future\ndirections point towards more sophisticated multi-modal agents and enhanced\ncollaboration between agents and experimental methods. Due to the quick pace of\nthis field, a repository has been built to keep track of the latest studies:\nhttps://github.com/ur-whitelab/LLMs-in-science.\n","authors":["Mayk Caldas Ramos","Christopher J. Collison","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2407.01603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09834v1","updated":"2024-11-14T22:54:38Z","published":"2024-11-14T22:54:38Z","title":"A Benchmark for Long-Form Medical Question Answering","summary":" There is a lack of benchmarks for evaluating large language models (LLMs) in\nlong-form medical question answering (QA). Most existing medical QA evaluation\nbenchmarks focus on automatic metrics and multiple-choice questions. While\nvaluable, these benchmarks fail to fully capture or assess the complexities of\nreal-world clinical applications where LLMs are being deployed. Furthermore,\nexisting studies on evaluating long-form answer generation in medical QA are\nprimarily closed-source, lacking access to human medical expert annotations,\nwhich makes it difficult to reproduce results and enhance existing baselines.\nIn this work, we introduce a new publicly available benchmark featuring\nreal-world consumer medical questions with long-form answer evaluations\nannotated by medical doctors. 
We performed pairwise comparisons of responses\nfrom various open and closed-source medical and general-purpose LLMs based on\ncriteria such as correctness, helpfulness, harmfulness, and bias. Additionally,\nwe performed a comprehensive LLM-as-a-judge analysis to study the alignment\nbetween human judgments and LLMs. Our preliminary results highlight the strong\npotential of open LLMs in medical QA compared to leading closed models. Code &\nData: https://github.com/lavita-ai/medical-eval-sphere\n","authors":["Pedram Hosseini","Jessica M. Sin","Bing Ren","Bryceton G. Thomas","Elnaz Nouri","Ali Farahanchi","Saeed Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2411.09834v1.pdf","comment":"AIM-FM: Advancements in Medical Foundation Models Workshop, 38th\n Conference on Neural Information Processing Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2407.05250v2","updated":"2024-11-14T22:51:46Z","published":"2024-07-07T03:41:51Z","title":"CLIMB: A Benchmark of Clinical Bias in Large Language Models","summary":" Large language models (LLMs) are increasingly applied to clinical\ndecision-making. However, their potential to exhibit bias poses significant\nrisks to clinical equity. Currently, there is a lack of benchmarks that\nsystematically evaluate such clinical bias in LLMs. While in downstream tasks,\nsome biases of LLMs can be avoided such as by instructing the model to answer\n\"I'm not sure...\", the internal bias hidden within the model still lacks deep\nstudies. We introduce CLIMB (shorthand for A Benchmark of Clinical Bias in\nLarge Language Models), a pioneering comprehensive benchmark to evaluate both\nintrinsic (within LLMs) and extrinsic (on downstream tasks) bias in LLMs for\nclinical decision tasks. Notably, for intrinsic bias, we introduce a novel\nmetric, AssocMAD, to assess the disparities of LLMs across multiple demographic\ngroups. 
Additionally, we leverage counterfactual intervention to evaluate\nextrinsic bias in a task of clinical diagnosis prediction. Our experiments\nacross popular and medically adapted LLMs, particularly from the Mistral and\nLLaMA families, unveil prevalent behaviors with both intrinsic and extrinsic\nbias. This work underscores the critical need to mitigate clinical bias and\nsets a new standard for future evaluations of LLMs' clinical bias.\n","authors":["Yubo Zhang","Shudi Hou","Mingyu Derek Ma","Wei Wang","Muhao Chen","Jieyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.05250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09826v1","updated":"2024-11-14T22:23:13Z","published":"2024-11-14T22:23:13Z","title":"Evaluating Gender Bias in Large Language Models","summary":" Gender bias in artificial intelligence has become an important issue,\nparticularly in the context of language models used in communication-oriented\napplications. This study examines the extent to which Large Language Models\n(LLMs) exhibit gender bias in pronoun selection in occupational contexts. The\nanalysis evaluates the models GPT-4, GPT-4o, PaLM 2 Text Bison and Gemini 1.0\nPro using a self-generated dataset. The jobs considered include a range of\noccupations, from those with a significant male presence to those with a\nnotable female concentration, as well as jobs with a relatively equal gender\ndistribution. Three different sentence processing methods were used to assess\npotential gender bias: masked tokens, unmasked sentences, and sentence\ncompletion. In addition, the LLMs suggested names of individuals in specific\noccupations, which were then examined for gender distribution. The results show\na positive correlation between the models' pronoun choices and the gender\ndistribution present in U.S. labor force data. Female pronouns were more often\nassociated with female-dominated occupations, while male pronouns were more\noften associated with male-dominated occupations. 
Sentence completion showed\nthe strongest correlation with actual gender distribution, while name\ngeneration resulted in a more balanced 'politically correct' gender\ndistribution, albeit with notable variations in predominantly male or female\noccupations. Overall, the prompting method had a greater impact on gender\ndistribution than the model selection itself, highlighting the complexity of\naddressing gender bias in LLMs. The findings highlight the importance of\nprompting in gender mapping.\n","authors":["Michael Döll","Markus Döhring","Andreas Müller"],"pdf_url":"https://arxiv.org/pdf/2411.09826v1.pdf","comment":"13 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2402.00888v2","updated":"2024-11-14T22:20:49Z","published":"2024-01-30T04:00:54Z","title":"Security and Privacy Challenges of Large Language Models: A Survey","summary":" Large Language Models (LLMs) have demonstrated extraordinary capabilities and\ncontributed to multiple fields, such as generating and summarizing text,\nlanguage translation, and question-answering. Nowadays, LLM is becoming a very\npopular tool in computerized language processing tasks, with the capability to\nanalyze complicated linguistic patterns and provide relevant and appropriate\nresponses depending on the context. While offering significant advantages,\nthese models are also vulnerable to security and privacy attacks, such as\njailbreaking attacks, data poisoning attacks, and Personally Identifiable\nInformation (PII) leakage attacks. This survey provides a thorough review of\nthe security and privacy challenges of LLMs for both training data and users,\nalong with the application-based risks in various domains, such as\ntransportation, education, and healthcare. We assess the extent of LLM\nvulnerabilities, investigate emerging security and privacy attacks for LLMs,\nand review the potential defense mechanisms. 
Additionally, the survey outlines\nexisting research gaps in this domain and highlights future research\ndirections.\n","authors":["Badhan Chandra Das","M. Hadi Amini","Yanzhao Wu"],"pdf_url":"https://arxiv.org/pdf/2402.00888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12176v4","updated":"2024-11-14T21:34:59Z","published":"2024-07-16T21:03:14Z","title":"GPT-4V Cannot Generate Radiology Reports Yet","summary":" GPT-4V's purported strong multimodal abilities raise interests in using it to\nautomate radiology report writing, but there lacks thorough evaluations. In\nthis work, we perform a systematic evaluation of GPT-4V in generating radiology\nreports on two chest X-ray report datasets: MIMIC-CXR and IU X-Ray. We attempt\nto directly generate reports using GPT-4V through different prompting\nstrategies and find that it fails terribly in both lexical metrics and clinical\nefficacy metrics. To understand the low performance, we decompose the task into\ntwo steps: 1) the medical image reasoning step of predicting medical condition\nlabels from images; and 2) the report synthesis step of generating reports from\n(groundtruth) conditions. We show that GPT-4V's performance in image reasoning\nis consistently low across different prompts. In fact, the distributions of\nmodel-predicted labels remain constant regardless of which groundtruth\nconditions are present on the image, suggesting that the model is not\ninterpreting chest X-rays meaningfully. Even when given groundtruth conditions\nin report synthesis, its generated reports are less correct and less\nnatural-sounding than a finetuned LLaMA-2. Altogether, our findings cast doubt\non the viability of using GPT-4V in a radiology workflow.\n","authors":["Yuyang Jiang","Chacha Chen","Dang Nguyen","Benjamin M. 
Mervak","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2407.12176v4.pdf","comment":"24 pages, 3 figures, code:\n https://github.com/ChicagoHAI/cxr-eval-gpt-4v Findings paper presented at\n Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024,\n Vancouver, Canada, 26 pages"},{"id":"http://arxiv.org/abs/2410.02521v2","updated":"2024-11-14T19:36:43Z","published":"2024-10-03T14:28:40Z","title":"Methods of Automatic Matrix Language Determination for Code-Switched\n Speech","summary":" Code-switching (CS) is the process of speakers interchanging between two or\nmore languages which in the modern world becomes increasingly common. In order\nto better describe CS speech the Matrix Language Frame (MLF) theory introduces\nthe concept of a Matrix Language, which is the language that provides the\ngrammatical structure for a CS utterance. In this work the MLF theory was used\nto develop systems for Matrix Language Identity (MLID) determination. The MLID\nof English/Mandarin and English/Spanish CS text and speech was compared to\nacoustic language identity (LID), which is a typical way to identify a language\nin monolingual utterances. MLID predictors from audio show higher correlation\nwith the textual principles than LID in all cases while also outperforming LID\nin an MLID recognition task based on F1 macro (60%) and correlation score\n(0.38). 
This novel approach has identified that non-English languages (Mandarin\nand Spanish) are preferred over the English language as the ML contrary to the\nmonolingual choice of LID.\n","authors":["Olga Iakovenko","Thomas Hain"],"pdf_url":"https://arxiv.org/pdf/2410.02521v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.09763v1","updated":"2024-11-14T19:20:33Z","published":"2024-11-14T19:20:33Z","title":"Evaluating the Predictive Capacity of ChatGPT for Academic Peer Review\n Outcomes Across Multiple Platforms","summary":" While previous studies have demonstrated that Large Language Models (LLMs)\ncan predict peer review outcomes to some extent, this paper builds on that by\nintroducing two new contexts and employing a more robust method - averaging\nmultiple ChatGPT scores. The findings that averaging 30 ChatGPT predictions,\nbased on reviewer guidelines and using only the submitted titles and abstracts,\nfailed to predict peer review outcomes for F1000Research (Spearman's rho=0.00).\nHowever, it produced mostly weak positive correlations with the quality\ndimensions of SciPost Physics (rho=0.25 for validity, rho=0.25 for originality,\nrho=0.20 for significance, and rho = 0.08 for clarity) and a moderate positive\ncorrelation for papers from the International Conference on Learning\nRepresentations (ICLR) (rho=0.38). Including the full text of articles\nsignificantly increased the correlation for ICLR (rho=0.46) and slightly\nimproved it for F1000Research (rho=0.09), while it had variable effects on the\nfour quality dimension correlations for SciPost LaTeX files. The use of\nchain-of-thought system prompts slightly increased the correlation for\nF1000Research (rho=0.10), marginally reduced it for ICLR (rho=0.37), and\nfurther decreased it for SciPost Physics (rho=0.16 for validity, rho=0.18 for\noriginality, rho=0.18 for significance, and rho=0.05 for clarity). 
Overall, the\nresults suggest that in some contexts, ChatGPT can produce weak pre-publication\nquality assessments. However, the effectiveness of these assessments and the\noptimal strategies for employing them vary considerably across different\nplatforms, journals, and conferences. Additionally, the most suitable inputs\nfor ChatGPT appear to differ depending on the platform.\n","authors":["Mike Thelwall","Abdullah Yaghi"],"pdf_url":"https://arxiv.org/pdf/2411.09763v1.pdf","comment":null}]},"2024-11-15T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.10446v1","updated":"2024-11-15T18:59:51Z","published":"2024-11-15T18:59:51Z","title":"VeriGraph: Scene Graphs for Execution Verifiable Robot Planning","summary":" Recent advancements in vision-language models (VLMs) offer potential for\nrobot task planning, but challenges remain due to VLMs' tendency to generate\nincorrect action sequences. To address these limitations, we propose VeriGraph,\na novel framework that integrates VLMs for robotic planning while verifying\naction feasibility. VeriGraph employs scene graphs as an intermediate\nrepresentation, capturing key objects and spatial relationships to improve plan\nverification and refinement. The system generates a scene graph from input\nimages and uses it to iteratively check and correct action sequences generated\nby an LLM-based task planner, ensuring constraints are respected and actions\nare executable. 
Our approach significantly enhances task completion rates\nacross diverse manipulation scenarios, outperforming baseline methods by 58%\nfor language-based tasks and 30% for image-based tasks.\n","authors":["Daniel Ekpo","Mara Levy","Saksham Suri","Chuong Huynh","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2411.10446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21497v2","updated":"2024-11-15T18:02:32Z","published":"2024-10-28T20:05:00Z","title":"Denoising Diffusion Planner: Learning Complex Paths from Low-Quality\n Demonstrations","summary":" Denoising Diffusion Probabilistic Models (DDPMs) are powerful generative deep\nlearning models that have been very successful at image generation, and, very\nrecently, in path planning and control. In this paper, we investigate how to\nleverage the generalization and conditional sampling capabilities of DDPMs to\ngenerate complex paths for a robotic end effector. We show that training a DDPM\nwith synthetic and low-quality demonstrations is sufficient for generating\nnontrivial paths reaching arbitrary targets and avoiding obstacles.\nAdditionally, we investigate different strategies for conditional sampling\ncombining classifier-free and classifier-guided approaches. Eventually, we\ndeploy the DDPM in a receding-horizon control scheme to enhance its planning\ncapabilities. 
The Denoising Diffusion Planner is experimentally validated\nthrough various experiments on a Franka Emika Panda robot.\n","authors":["Michiel Nikken","Nicolò Botteghi","Wesley Roozing","Federico Califano"],"pdf_url":"https://arxiv.org/pdf/2410.21497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10336v1","updated":"2024-11-15T16:35:26Z","published":"2024-11-15T16:35:26Z","title":"BMP: Bridging the Gap between B-Spline and Movement Primitives","summary":" This work introduces B-spline Movement Primitives (BMPs), a new Movement\nPrimitive (MP) variant that leverages B-splines for motion representation.\nB-splines are a well-known concept in motion planning due to their ability to\ngenerate complex, smooth trajectories with only a few control points while\nsatisfying boundary conditions, i.e., passing through a specified desired\nposition with desired velocity. However, current usages of B-splines tend to\nignore the higher-order statistics in trajectory distributions, which limits\ntheir usage in imitation learning (IL) and reinforcement learning (RL), where\nmodeling trajectory distribution is essential. In contrast, MPs are commonly\nused in IL and RL for their capacity to capture trajectory likelihoods and\ncorrelations. However, MPs are constrained by their abilities to satisfy\nboundary conditions and usually need extra terms in learning objectives to\nsatisfy velocity constraints. By reformulating B-splines as MPs, represented\nthrough basis functions and weight parameters, BMPs combine the strengths of\nboth approaches, allowing B-splines to capture higher-order statistics while\nretaining their ability to satisfy boundary conditions. 
Empirical results in IL\nand RL demonstrate that BMPs broaden the applicability of B-splines in robot\nlearning and offer greater expressiveness compared to existing MP variants.\n","authors":["Weiran Liao","Ge Li","Hongyi Zhou","Rudolf Lioutikov","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2411.10336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10316v1","updated":"2024-11-15T16:14:48Z","published":"2024-11-15T16:14:48Z","title":"M3TR: Generalist HD Map Construction with Variable Map Priors","summary":" Autonomous vehicles require road information for their operation, usually in\nform of HD maps. Since offline maps eventually become outdated or may only be\npartially available, online HD map construction methods have been proposed to\ninfer map information from live sensor data. A key issue remains how to exploit\nsuch partial or outdated map information as a prior. We introduce M3TR\n(Multi-Masking Map Transformer), a generalist approach for HD map construction\nboth with and without map priors. We address shortcomings in ground truth\ngeneration for Argoverse 2 and nuScenes and propose the first realistic\nscenarios with semantically diverse map priors. Examining various query\ndesigns, we use an improved method for integrating prior map elements into a HD\nmap construction model, increasing performance by +4.3 mAP. Finally, we show\nthat training across all prior scenarios yields a single Generalist model,\nwhose performance is on par with previous Expert models that can handle only\none specific type of map prior. 
M3TR thus is the first model capable of\nleveraging variable map priors, making it suitable for real-world deployment.\nCode is available at https://github.com/immel-f/m3tr\n","authors":["Fabian Immel","Richard Fehler","Frank Bieder","Jan-Hendrik Pauls","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2411.10316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02106v2","updated":"2024-11-15T15:51:20Z","published":"2024-10-03T00:00:54Z","title":"Safe Navigation in Unmapped Environments for Robotic Systems with Input\n Constraints","summary":" This paper presents an approach for navigation and control in unmapped\nenvironments under input and state constraints using a composite control\nbarrier function (CBF). We consider the scenario where real-time perception\nfeedback (e.g., LiDAR) is used online to construct a local CBF that models\nlocal state constraints (e.g., local safety constraints such as obstacles) in\nthe a priori unmapped environment. The approach employs a soft-maximum function\nto synthesize a single time-varying CBF from the N most recently obtained local\nCBFs. Next, the input constraints are transformed into controller-state\nconstraints through the use of control dynamics. Then, we use a soft-minimum\nfunction to compose the input constraints with the time-varying CBF that models\nthe a priori unmapped environment. This composition yields a single relaxed\nCBF, which is used in a constrained optimization to obtain an optimal control\nthat satisfies the state and input constraints. The approach is validated\nthrough simulations of a nonholonomic ground robot that is equipped with LiDAR\nand navigates an unmapped environment. The robot successfully navigates the\nenvironment while avoiding the a priori unmapped obstacles and satisfying both\nspeed and input constraints.\n","authors":["Amirsaeid Safari","Jesse B. 
Hoagg"],"pdf_url":"https://arxiv.org/pdf/2410.02106v2.pdf","comment":"Preprint submitted to 2025 American Control Conference (ACC). arXiv\n admin note: substantial text overlap with arXiv:2409.01458"},{"id":"http://arxiv.org/abs/2411.10291v1","updated":"2024-11-15T15:48:50Z","published":"2024-11-15T15:48:50Z","title":"Moving Forward: A Review of Autonomous Driving Software and Hardware\n Systems","summary":" With their potential to significantly reduce traffic accidents, enhance road\nsafety, optimize traffic flow, and decrease congestion, autonomous driving\nsystems are a major focus of research and development in recent years. Beyond\nthese immediate benefits, they offer long-term advantages in promoting\nsustainable transportation by reducing emissions and fuel consumption.\nAchieving a high level of autonomy across diverse conditions requires a\ncomprehensive understanding of the environment. This is accomplished by\nprocessing data from sensors such as cameras, radars, and LiDARs through a\nsoftware stack that relies heavily on machine learning algorithms. These ML\nmodels demand significant computational resources and involve large-scale data\nmovement, presenting challenges for hardware to execute them efficiently and at\nhigh speed. In this survey, we first outline and highlight the key components\nof self-driving systems, covering input sensors, commonly used datasets,\nsimulation platforms, and the software architecture. We then explore the\nunderlying hardware platforms that support the execution of these software\nsystems. 
By presenting a comprehensive view of autonomous driving systems and\ntheir increasing demands, particularly for higher levels of autonomy, we\nanalyze the performance and efficiency of scaled-up off-the-shelf GPU/CPU-based\nsystems, emphasizing the challenges within the computational components.\nThrough examples showcasing the diverse computational and memory requirements\nin the software stack, we demonstrate how more specialized hardware and\nprocessing closer to memory can enable more efficient execution with lower\nlatency. Finally, based on current trends and future demands, we conclude by\nspeculating what a future hardware platform for autonomous driving might look\nlike.\n","authors":["Xu Wang","Mohammad Ali Maleki","Muhammad Waqar Azhar","Pedro Trancoso"],"pdf_url":"https://arxiv.org/pdf/2411.10291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03189v2","updated":"2024-11-15T14:56:06Z","published":"2024-11-05T15:34:25Z","title":"Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a\n Hybrid Zonotope Constraint Representation","summary":" Uncrewed aerial systems have tightly coupled energy and motion dynamics which\nmust be accounted for by onboard planning algorithms. This work proposes a\nstrategy for coupled motion and energy planning using model predictive control\n(MPC). A reduced-order linear time-invariant model of coupled energy and motion\ndynamics is presented. Constrained zonotopes are used to represent state and\ninput constraints, and hybrid zonotopes are used to represent non-convex\nconstraints tied to a map of the environment. The structures of these\nconstraint representations are exploited within a mixed-integer quadratic\nprogram solver tailored to MPC motion planning problems. 
Results apply the\nproposed methodology to coupled motion and energy utilization planning problems\nfor 1) a hybrid-electric vehicle that must restrict engine usage when flying\nover regions with noise restrictions, and 2) an electric package delivery drone\nthat must track waysets with both position and battery state of charge\nrequirements. By leveraging the structure-exploiting solver, the proposed\nmixed-integer MPC formulations can be implemented in real time.\n","authors":["Joshua A. Robbins","Andrew F. Thompson","Sean Brennan","Herschel C. Pangborn"],"pdf_url":"https://arxiv.org/pdf/2411.03189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10203v1","updated":"2024-11-15T14:01:02Z","published":"2024-11-15T14:01:02Z","title":"Learning Generalizable 3D Manipulation With 10 Demonstrations","summary":" Learning robust and generalizable manipulation skills from demonstrations\nremains a key challenge in robotics, with broad applications in industrial\nautomation and service robotics. While recent imitation learning methods have\nachieved impressive results, they often require large amounts of demonstration\ndata and struggle to generalize across different spatial variants. In this\nwork, we present a novel framework that learns manipulation skills from as few\nas 10 demonstrations, yet still generalizes to spatial variants such as\ndifferent initial object positions and camera viewpoints. Our framework\nconsists of two key modules: Semantic Guided Perception (SGP), which constructs\ntask-focused, spatially aware 3D point cloud representations from RGB-D inputs;\nand Spatial Generalized Decision (SGD), an efficient diffusion-based\ndecision-making module that generates actions via denoising. To effectively\nlearn generalization ability from limited data, we introduce a critical\nspatially equivariant training strategy that captures the spatial knowledge\nembedded in expert demonstrations. 
We validate our framework through extensive\nexperiments on both simulation benchmarks and real-world robotic systems. Our\nmethod demonstrates a 60 percent improvement in success rates over\nstate-of-the-art approaches on a series of challenging tasks, even with\nsubstantial variations in object poses and camera viewpoints. This work shows\nsignificant potential for advancing efficient, generalizable manipulation skill\nlearning in real-world applications.\n","authors":["Yu Ren","Yang Cong","Ronghan Chen","Jiahao Long"],"pdf_url":"https://arxiv.org/pdf/2411.10203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10195v1","updated":"2024-11-15T13:51:54Z","published":"2024-11-15T13:51:54Z","title":"BEV-ODOM: Reducing Scale Drift in Monocular Visual Odometry with BEV\n Representation","summary":" Monocular visual odometry (MVO) is vital in autonomous navigation and\nrobotics, providing a cost-effective and flexible motion tracking solution, but\nthe inherent scale ambiguity in monocular setups often leads to cumulative\nerrors over time. In this paper, we present BEV-ODOM, a novel MVO framework\nleveraging the Bird's Eye View (BEV) Representation to address scale drift.\nUnlike existing approaches, BEV-ODOM integrates a depth-based perspective-view\n(PV) to BEV encoder, a correlation feature extraction neck, and a CNN-MLP-based\ndecoder, enabling it to estimate motion across three degrees of freedom without\nthe need for depth supervision or complex optimization techniques. 
Our\nframework reduces scale drift in long-term sequences and achieves accurate\nmotion estimation across various datasets, including NCLT, Oxford, and KITTI.\nThe results indicate that BEV-ODOM outperforms current MVO methods,\ndemonstrating reduced scale drift and higher accuracy.\n","authors":["Yufei Wei","Sha Lu","Fuzhang Han","Rong Xiong","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2411.10195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10176v1","updated":"2024-11-15T13:22:04Z","published":"2024-11-15T13:22:04Z","title":"Let people fail! Exploring the influence of explainable virtual and\n robotic agents in learning-by-doing tasks","summary":" Collaborative decision-making with artificial intelligence (AI) agents\npresents opportunities and challenges. While human-AI performance often\nsurpasses that of individuals, the impact of such technology on human behavior\nremains insufficiently understood, primarily when AI agents can provide\njustifiable explanations for their suggestions. This study compares the effects\nof classic vs. partner-aware explanations on human behavior and performance\nduring a learning-by-doing task. Three participant groups were involved: one\ninteracting with a computer, another with a humanoid robot, and a third one\nwithout assistance. Results indicated that partner-aware explanations\ninfluenced participants differently based on the type of artificial agents\ninvolved. With the computer, participants enhanced their task completion times.\nAt the same time, those interacting with the humanoid robot were more inclined\nto follow its suggestions, although they did not reduce their timing.\nInterestingly, participants autonomously performing the learning-by-doing task\ndemonstrated superior knowledge acquisition than those assisted by explainable\nAI (XAI). 
These findings raise profound questions and have significant\nimplications for automated tutoring and human-AI collaboration.\n","authors":["Marco Matarese","Francesco Rea","Katharina J. Rohlfing","Alessandra Sciutti"],"pdf_url":"https://arxiv.org/pdf/2411.10176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10171v1","updated":"2024-11-15T13:17:54Z","published":"2024-11-15T13:17:54Z","title":"Imagine-2-Drive: High-Fidelity World Modeling in CARLA for Autonomous\n Vehicles","summary":" In autonomous driving with image based state space, accurate prediction of\nfuture events and modeling diverse behavioral modes are essential for safety\nand effective decision-making. World model-based Reinforcement Learning (WMRL)\napproaches offers a promising solution by simulating future states from current\nstate and actions. However, utility of world models is often limited by typical\nRL policies being limited to deterministic or single gaussian distribution. By\nfailing to capture the full spectrum of possible actions, reduces their\nadaptability in complex, dynamic environments. In this work, we introduce\nImagine-2-Drive, a framework that consists of two components, VISTAPlan, a\nhigh-fidelity world model for accurate future prediction and Diffusion Policy\nActor (DPA), a diffusion based policy to model multi-modal behaviors for\ntrajectory prediction. We use VISTAPlan to simulate and evaluate trajectories\nfrom DPA and use Denoising Diffusion Policy Optimization (DDPO) to train DPA to\nmaximize the cumulative sum of rewards over the trajectories. We analyze the\nbenefits of each component and the framework as a whole in CARLA with standard\ndriving metrics. 
As a consequence of our twin novelties- VISTAPlan and DPA, we\nsignificantly outperform the state of the art (SOTA) world models on standard\ndriving metrics by 15% and 20% on Route Completion and Success Rate\nrespectively.\n","authors":["Anant Garg","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.10171v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2411.10170v1","updated":"2024-11-15T13:17:39Z","published":"2024-11-15T13:17:39Z","title":"Better Safe Than Sorry: Enhancing Arbitration Graphs for Safe and Robust\n Autonomous Decision-Making","summary":" This paper introduces an extension to the arbitration graph framework\ndesigned to enhance the safety and robustness of autonomous systems in complex,\ndynamic environments. Building on the flexibility and scalability of\narbitration graphs, the proposed method incorporates a verification step and\nstructured fallback layers in the decision-making process. This ensures that\nonly verified and safe commands are executed while enabling graceful\ndegradation in the presence of unexpected faults or bugs. The approach is\ndemonstrated using a Pac-Man simulation and further validated in the context of\nautonomous driving, where it shows significant reductions in accident risk and\nimprovements in overall system safety. The bottom-up design of arbitration\ngraphs allows for an incremental integration of new behavior components. The\nextension presented in this work enables the integration of experimental or\nimmature behavior components while maintaining system safety by clearly and\nprecisely defining the conditions under which behaviors are considered safe.\nThe proposed method is implemented as a ready to use header-only C++ library,\npublished under the MIT License. 
Together with the Pac-Man demo, it is\navailable at github.com/KIT-MRT/arbitration_graphs.\n","authors":["Piotr Spieker","Nick Le Large","Martin Lauer"],"pdf_url":"https://arxiv.org/pdf/2411.10170v1.pdf","comment":"7 pages, 5 figures, handed in for possible publication at IEEE ICRA\n 2025, source code available at github.com/KIT-MRT/arbitration_graphs"},{"id":"http://arxiv.org/abs/2411.10164v1","updated":"2024-11-15T13:12:47Z","published":"2024-11-15T13:12:47Z","title":"Evaluating Text-to-Image Diffusion Models for Texturing Synthetic Data","summary":" Building generic robotic manipulation systems often requires large amounts of\nreal-world data, which can be dificult to collect. Synthetic data generation\noffers a promising alternative, but limiting the sim-to-real gap requires\nsignificant engineering efforts. To reduce this engineering effort, we\ninvestigate the use of pretrained text-to-image diffusion models for texturing\nsynthetic images and compare this approach with using random textures, a common\ndomain randomization technique in synthetic data generation. We focus on\ngenerating object-centric representations, such as keypoints and segmentation\nmasks, which are important for robotic manipulation and require precise\nannotations. We evaluate the efficacy of the texturing methods by training\nmodels on the synthetic data and measuring their performance on real-world\ndatasets for three object categories: shoes, T-shirts, and mugs. Surprisingly,\nwe find that texturing using a diffusion model performs on par with random\ntextures, despite generating seemingly more realistic images. Our results\nsuggest that, for now, using diffusion models for texturing does not benefit\nsynthetic data generation for robotics. 
The code, data and trained models are\navailable at \\url{https://github.com/tlpss/diffusing-synthetic-data.git}.\n","authors":["Thomas Lips","Francis wyffels"],"pdf_url":"https://arxiv.org/pdf/2411.10164v1.pdf","comment":"Submitted to RA-L"},{"id":"http://arxiv.org/abs/2411.10148v1","updated":"2024-11-15T12:41:30Z","published":"2024-11-15T12:41:30Z","title":"Multi-UAV Search and Rescue in Wilderness Using Smart Agent-Based\n Probability Models","summary":" The application of Multiple Unmanned Aerial Vehicles (Multi-UAV) in\nWilderness Search and Rescue (WiSAR) significantly enhances mission success due\nto their rapid coverage of search areas from high altitudes and their\nadaptability to complex terrains. This capability is particularly crucial\nbecause time is a critical factor in searching for a lost person in the\nwilderness; as time passes, survival rates decrease and the search area\nexpands. The probability of success in such searches can be further improved if\nUAVs leverage terrain features to predict the lost person's position. In this\npaper, we aim to enhance search missions by proposing a smart agent-based\nprobability model that combines Monte Carlo simulations with an agent strategy\nlist, mimicking the behavior of a lost person in the wildness areas.\nFurthermore, we develop a distributed Multi-UAV receding horizon search\nstrategy with dynamic partitioning, utilizing the generated probability density\nmodel as prior information to prioritize locations where the lost person is\nmost likely to be found. 
Simulated search experiments across different terrains\nhave been conducted to validate the search efficiency of the proposed methods\ncompared to other benchmark methods.\n","authors":["Zijian Ge","Jingjing Jiang","Matthew Coombes"],"pdf_url":"https://arxiv.org/pdf/2411.10148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13802v2","updated":"2024-11-15T12:37:20Z","published":"2023-12-21T12:46:36Z","title":"A Dense Subframe-based SLAM Framework with Side-scan Sonar","summary":" Side-scan sonar (SSS) is a lightweight acoustic sensor that is commonly\ndeployed on autonomous underwater vehicles (AUVs) to provide high-resolution\nseafloor images. However, leveraging side-scan images for simultaneous\nlocalization and mapping (SLAM) presents a notable challenge, primarily due to\nthe difficulty of establishing sufficient amount of accurate correspondences\nbetween these images. To address this, we introduce a novel subframe-based\ndense SLAM framework utilizing side-scan sonar data, enabling effective dense\nmatching in overlapping regions of paired side-scan images. With each image\nbeing evenly divided into subframes, we propose a robust estimation pipeline to\nestimate the relative pose between each paired subframes, by using a good\ninlier set identified from dense correspondences. These relative poses are then\nintegrated as edge constraints in a factor graph to optimize the AUV pose\ntrajectory.\n The proposed framework is evaluated on three real datasets collected by a\nHugin AUV. Among one of them includes manually-annotated keypoint\ncorrespondences as ground truth and is used for evaluation of pose trajectory.\nWe also present a feasible way of evaluating mapping quality against multi-beam\nechosounder (MBES) data without the influence of pose. 
Experimental results\ndemonstrate that our approach effectively mitigates drift from the\ndead-reckoning (DR) system and enables quasi-dense bathymetry reconstruction.\nAn open-source implementation of this work is available.\n","authors":["Jun Zhang","Yiping Xie","Li Ling","John Folkesson"],"pdf_url":"https://arxiv.org/pdf/2312.13802v2.pdf","comment":"13 pages, 15 figures. Preprint version of manuscript accepted to IEEE\n Journal of Ocean Engineering. arXiv admin note: text overlap with\n arXiv:2304.01854"},{"id":"http://arxiv.org/abs/2411.09145v2","updated":"2024-11-15T12:27:39Z","published":"2024-11-14T02:57:11Z","title":"UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for\n Egocentric Hand Object Interaction Videos","summary":" Egocentric Hand Object Interaction (HOI) videos provide valuable insights\ninto human interactions with the physical world, attracting growing interest\nfrom the computer vision and robotics communities. A key task in fully\nunderstanding the geometry and dynamics of HOI scenes is dense pointclouds\nsequence reconstruction. However, the inherent motion of both hands and the\ncamera makes this challenging. Current methods often rely on time-consuming\ntest-time optimization, making them impractical for reconstructing\ninternet-scale videos. To address this, we introduce UniHOI, a model that\nunifies the estimation of all variables necessary for dense 4D reconstruction,\nincluding camera intrinsic, camera poses, and video depth, for egocentric HOI\nscene in a fast feed-forward manner. We end-to-end optimize all these variables\nto improve their consistency in 3D space. Furthermore, our model could be\ntrained solely on large-scale monocular video dataset, overcoming the\nlimitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain\nand zero-shot generalization setting, surpassing all baselines in pointclouds\nsequence reconstruction and long-term 3D scene flow recovery. 
UniHOI is the\nfirst approach to offer fast, dense, and generalizable monocular egocentric HOI\nscene reconstruction in the presence of motion. Code and trained model will be\nreleased in the future.\n","authors":["Chengbo Yuan","Geng Chen","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.09145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10049v1","updated":"2024-11-15T09:00:56Z","published":"2024-11-15T09:00:56Z","title":"SPLIT: SE(3)-diffusion via Local Geometry-based Score Prediction for 3D\n Scene-to-Pose-Set Matching Problems","summary":" To enable versatile robot manipulation, robots must detect task-relevant\nposes for different purposes from raw scenes. Currently, many perception\nalgorithms are designed for specific purposes, which limits the flexibility of\nthe perception module. We present a general problem formulation called 3D\nscene-to-pose-set matching, which directly matches the corresponding poses from\nthe scene without relying on task-specific heuristics. To address this, we\nintroduce SPLIT, an SE(3)-diffusion model for generating pose samples from a\nscene. The model's efficiency comes from predicting scores based on local\ngeometry with respect to the sample pose. Moreover, leveraging the conditioned\ngeneration capability of diffusion models, we demonstrate that SPLIT can\ngenerate the multi-purpose poses, required to complete both the mug\nreorientation and hanging manipulation within a single model.\n","authors":["Kanghyun Kim","Min Jun Kim"],"pdf_url":"https://arxiv.org/pdf/2411.10049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10038v1","updated":"2024-11-15T08:38:39Z","published":"2024-11-15T08:38:39Z","title":"Remote Life Support Robot Interface System for Global Task Planning and\n Local Action Expansion Using Foundation Models","summary":" Robot systems capable of executing tasks based on language instructions have\nbeen actively researched. 
It is challenging to convey uncertain information\nthat can only be determined on-site with a single language instruction to the\nrobot. In this study, we propose a system that includes ambiguous parts as\ntemplate variables in language instructions to communicate the information to\nbe collected and the options to be presented to the robot for predictable\nuncertain events. This study implements prompt generation for each robot action\nfunction based on template variables to collect information, and a feedback\nsystem for presenting and selecting options based on template variables for\nuser-to-robot communication. The effectiveness of the proposed system was\ndemonstrated through its application to real-life support tasks performed by\nthe robot.\n","authors":["Yoshiki Obinata","Haoyu Jia","Kento Kawaharazuka","Naoaki Kanazawa","Kei Okada"],"pdf_url":"https://arxiv.org/pdf/2411.10038v1.pdf","comment":"Accepted to 2024 IEEE-RAS International Conference on Humanoids\n Robots (Humanoids 2024)"},{"id":"http://arxiv.org/abs/2411.10016v1","updated":"2024-11-15T07:50:30Z","published":"2024-11-15T07:50:30Z","title":"'What did the Robot do in my Absence?' Video Foundation Models to\n Enhance Intermittent Supervision","summary":" This paper investigates the application of Video Foundation Models (ViFMs)\nfor generating robot data summaries to enhance intermittent human supervision\nof robot teams. We propose a novel framework that produces both generic and\nquery-driven summaries of long-duration robot vision data in three modalities:\nstoryboards, short videos, and text. Through a user study involving 30\nparticipants, we evaluate the efficacy of these summary methods in allowing\noperators to accurately retrieve the observations and actions that occurred\nwhile the robot was operating without supervision over an extended duration (40\nmin). 
Our findings reveal that query-driven summaries significantly improve\nretrieval accuracy compared to generic summaries or raw data, albeit with\nincreased task duration. Storyboards are found to be the most effective\npresentation modality, especially for object-related queries. This work\nrepresents, to our knowledge, the first zero-shot application of ViFMs for\ngenerating multi-modal robot-to-human communication in intermittent supervision\ncontexts, demonstrating both the promise and limitations of these models in\nhuman-robot interaction (HRI) scenarios.\n","authors":["Kavindie Katuwandeniya","Leimin Tian","Dana Kulić"],"pdf_url":"https://arxiv.org/pdf/2411.10016v1.pdf","comment":"This work has been submitted to the IEEE RAL for possible publication"},{"id":"http://arxiv.org/abs/2408.09468v2","updated":"2024-11-15T06:15:25Z","published":"2024-08-18T13:27:49Z","title":"Towards Safe and Robust Autonomous Vehicle Platooning: A Self-Organizing\n Cooperative Control Framework","summary":" In hybrid traffic environments where human-driven vehicles (HDVs) and\nautonomous vehicles (AVs) coexist, achieving safe and robust decision-making\nfor AV platooning remains a complex challenge. Existing platooning systems\noften struggle with dynamic formation management and adaptability, especially\nin unpredictable, mixed-traffic conditions. To enhance autonomous vehicle\nplatooning within these hybrid environments, this paper presents TriCoD, a\ntwin-world safety-enhanced Data-Model-Knowledge Triple-Driven Cooperative\nDecision-making Framework. This framework integrates deep reinforcement\nlearning (DRL) with model-driven approaches, enabling dynamic formation\ndissolution and reconfiguration through a safety-prioritized twin-world\ndeduction mechanism. The DRL component augments traditional model-driven\nmethods, enhancing both safety and operational efficiency, especially under\nemergency conditions. 
Additionally, an adaptive switching mechanism allows the\nsystem to seamlessly shift between data-driven and model-driven strategies\nbased on real-time traffic demands, thereby optimizing decision-making ability\nand adaptability. Simulation experiments and hardware-in-the-loop tests\ndemonstrate that the proposed framework significantly improves safety,\nrobustness, and flexibility. A detailed account of the validation results for\nthe model can be found in\n\\href{https://perfectxu88.github.io/towardssafeandrobust.github.io/}{Our\nWebsite}.\n","authors":["Chengkai Xu","Zihao Deng","Jiaqi Liu","Aijing Kong","Chao Huang","Peng Hang"],"pdf_url":"https://arxiv.org/pdf/2408.09468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09975v1","updated":"2024-11-15T06:15:02Z","published":"2024-11-15T06:15:02Z","title":"Express Yourself: Enabling large-scale public events involving\n multi-human-swarm interaction for social applications with MOSAIX","summary":" Robot swarms have the potential to help groups of people with social tasks,\ngiven their ability to scale to large numbers of robots and users. Developing\nmulti-human-swarm interaction is therefore crucial to support multiple people\ninteracting with the swarm simultaneously - which is an area that is scarcely\nresearched, unlike single-human, single-robot or single-human, multi-robot\ninteraction. Moreover, most robots are still confined to laboratory settings.\nIn this paper, we present our work with MOSAIX, a swarm of robot Tiles, that\nfacilitated ideation at a science museum. 63 robots were used as a swarm of\nsmart sticky notes, collecting input from the public and aggregating it based\non themes, providing an evolving visualization tool that engaged visitors and\nfostered their participation. Our contribution lies in creating a large-scale\n(63 robots and 294 attendees) public event, with a completely decentralized\nswarm system in real-life settings. 
We also discuss learnings we obtained that\nmight help future researchers create multi-human-swarm interaction with the\npublic.\n","authors":["Merihan Alhafnawi","Maca Gomez-Gutierrez","Edmund R. Hunt","Severin Lemaignan","Paul O'Dowd","Sabine Hauert"],"pdf_url":"https://arxiv.org/pdf/2411.09975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09971v1","updated":"2024-11-15T06:05:33Z","published":"2024-11-15T06:05:33Z","title":"Explanation for Trajectory Planning using Multi-modal Large Language\n Model for Autonomous Driving","summary":" End-to-end style autonomous driving models have been developed recently.\nThese models lack interpretability of decision-making process from perception\nto control of the ego vehicle, resulting in anxiety for passengers. To\nalleviate it, it is effective to build a model which outputs captions\ndescribing future behaviors of the ego vehicle and their reason. However, the\nexisting approaches generate reasoning text that inadequately reflects the\nfuture plans of the ego vehicle, because they train models to output captions\nusing momentary control signals as inputs. In this study, we propose a\nreasoning model that takes future planning trajectories of the ego vehicle as\ninputs to solve this limitation with the dataset newly collected.\n","authors":["Shota Yamazaki","Chenyu Zhang","Takuya Nanri","Akio Shigekane","Siyuan Wang","Jo Nishiyama","Tao Chu","Kohei Yokosawa"],"pdf_url":"https://arxiv.org/pdf/2411.09971v1.pdf","comment":"Accepted and presented at ECCV 2024 2nd Workshop on Vision-Centric\n Autonomous Driving (VCAD) on September 30, 2024. 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.11356v2","updated":"2024-11-15T05:28:37Z","published":"2024-10-15T07:25:51Z","title":"GSORB-SLAM: Gaussian Splatting SLAM benefits from ORB features and\n Transmittance information","summary":" The emergence of 3D Gaussian Splatting (3DGS) has recently sparked a renewed\nwave of dense visual SLAM research. 
However, current methods face challenges\nsuch as sensitivity to artifacts and noise, sub-optimal selection of training\nviewpoints, and a lack of light global optimization. In this paper, we propose\na dense SLAM system that tightly couples 3DGS with ORB features. We design a\njoint optimization approach for robust tracking and effectively reducing the\nimpact of noise and artifacts. This involves combining novel geometric\nobservations, derived from accumulated transmittance, with ORB features\nextracted from pixel data. Furthermore, to improve mapping quality, we propose\nan adaptive Gaussian expansion and regularization method that enables Gaussian\nprimitives to represent the scene compactly. This is coupled with a viewpoint\nselection strategy based on the hybrid graph to mitigate over-fitting effects\nand enhance convergence quality. Finally, our approach achieves compact and\nhigh-quality scene representations and accurate localization. GSORB-SLAM has\nbeen evaluated on different datasets, demonstrating outstanding performance.\nThe code will be available.\n","authors":["Wancai Zheng","Xinyi Yu","Jintao Rong","Linlin Ou","Yan Wei","Libo Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.11356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09953v1","updated":"2024-11-15T05:11:28Z","published":"2024-11-15T05:11:28Z","title":"Brain-inspired Action Generation with Spiking Transformer Diffusion\n Policy Model","summary":" Spiking Neural Networks (SNNs) has the ability to extract spatio-temporal\nfeatures due to their spiking sequence. While previous research has primarily\nfoucus on the classification of image and reinforcement learning. In our paper,\nwe put forward novel diffusion policy model based on Spiking Transformer Neural\nNetworks and Denoising Diffusion Probabilistic Model (DDPM): Spiking\nTransformer Modulate Diffusion Policy Model (STMDP), a new brain-inspired model\nfor generating robot action trajectories. 
In order to improve the performance\nof this model, we develop a novel decoder module: Spiking Modulate De coder\n(SMD), which replaces the traditional Decoder module within the Transformer\narchitecture. Additionally, we explored the substitution of DDPM with Denoising\nDiffusion Implicit Models (DDIM) in our frame work. We conducted experiments\nacross four robotic manipulation tasks and performed ablation studies on the\nmodulate block. Our model consistently outperforms existing Transformer-based\ndiffusion policy method. Especially in Can task, we achieved an improvement of\n8%. The proposed STMDP method integrates SNNs, dffusion model and Transformer\narchitecture, which offers new perspectives and promising directions for\nexploration in brain-inspired robotics.\n","authors":["Qianhao Wang","Yinqian Sun","Enmeng Lu","Qian Zhang","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09953v1.pdf","comment":"10 pages, 4 figures and 2 tables, conference submission"},{"id":"http://arxiv.org/abs/2411.09942v1","updated":"2024-11-15T04:42:15Z","published":"2024-11-15T04:42:15Z","title":"ALPHA-$α$ and Bi-ACT Are All You Need: Importance of Position and\n Force Information/Control for Imitation Learning of Unimanual and Bimanual\n Robotic Manipulation with Low-Cost System","summary":" Autonomous manipulation in everyday tasks requires flexible action generation\nto handle complex, diverse real-world environments, such as objects with\nvarying hardness and softness. Imitation Learning (IL) enables robots to learn\ncomplex tasks from expert demonstrations. However, a lot of existing methods\nrely on position/unilateral control, leaving challenges in tasks that require\nforce information/control, like carefully grasping fragile or varying-hardness\nobjects. As the need for diverse controls increases, there are demand for\nlow-cost bimanual robots that consider various motor inputs. 
To address these\nchallenges, we introduce Bilateral Control-Based Imitation Learning via Action\nChunking with Transformers(Bi-ACT) and\"A\" \"L\"ow-cost \"P\"hysical \"Ha\"rdware\nConsidering Diverse Motor Control Modes for Research in Everyday Bimanual\nRobotic Manipulation (ALPHA-$\\alpha$). Bi-ACT leverages bilateral control to\nutilize both position and force information, enhancing the robot's adaptability\nto object characteristics such as hardness, shape, and weight. The concept of\nALPHA-$\\alpha$ is affordability, ease of use, repairability, ease of assembly,\nand diverse control modes (position, velocity, torque), allowing\nresearchers/developers to freely build control systems using ALPHA-$\\alpha$. In\nour experiments, we conducted a detailed analysis of Bi-ACT in unimanual\nmanipulation tasks, confirming its superior performance and adaptability\ncompared to Bi-ACT without force control. Based on these results, we applied\nBi-ACT to bimanual manipulation tasks. Experimental results demonstrated high\nsuccess rates in coordinated bimanual operations across multiple tasks. The\neffectiveness of the Bi-ACT and ALPHA-$\\alpha$ can be seen through\ncomprehensive real-world experiments. Video available at:\nhttps://mertcookimg.github.io/alpha-biact/\n","authors":["Masato Kobayashi","Thanpimon Buamanee","Takumi Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2411.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09935v1","updated":"2024-11-15T04:20:00Z","published":"2024-11-15T04:20:00Z","title":"Whole-Body Impedance Coordinative Control of Wheel-Legged Robot on\n Uncertain Terrain","summary":" This article propose a whole-body impedance coordinative control framework\nfor a wheel-legged humanoid robot to achieve adaptability on complex terrains\nwhile maintaining robot upper body stability. The framework contains a bi-level\ncontrol strategy. 
The outer level is a variable damping impedance controller,\nwhich optimizes the damping parameters to ensure the stability of the upper\nbody while holding an object. The inner level employs Whole-Body Control (WBC)\noptimization that integrates real-time terrain estimation based on wheel-foot\nposition and force data. It generates motor torques while accounting for\ndynamic constraints, joint limits,friction cones, real-time terrain updates,\nand a model-free friction compensation strategy. The proposed whole-body\ncoordinative control method has been tested on a recently developed quadruped\nhumanoid robot. The results demonstrate that the proposed algorithm effectively\ncontrols the robot, maintaining upper body stability to successfully complete a\nwater-carrying task while adapting to varying terrains.\n","authors":["Lei Shi","Xinghua Yu","Cheng Zhou","Wanxin Jin","Wanchao Chi","Shenghao Zhang","Dongsheng Zhang","Xiong Li","Zhengyou Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09929v1","updated":"2024-11-15T04:07:54Z","published":"2024-11-15T04:07:54Z","title":"Autonomous Robotic Pepper Harvesting: Imitation Learning in Unstructured\n Agricultural Environments","summary":" Automating tasks in outdoor agricultural fields poses significant challenges\ndue to environmental variability, unstructured terrain, and diverse crop\ncharacteristics. We present a robotic system for autonomous pepper harvesting\ndesigned to operate in these unprotected, complex settings. Utilizing a custom\nhandheld shear-gripper, we collected 300 demonstrations to train a visuomotor\npolicy, enabling the system to adapt to varying field conditions and crop\ndiversity. We achieved a success rate of 28.95% with a cycle time of 31.71\nseconds, comparable to existing systems tested under more controlled conditions\nlike greenhouses. 
Our system demonstrates the feasibility and effectiveness of\nleveraging imitation learning for automated harvesting in unstructured\nagricultural environments. This work aims to advance scalable, automated\nrobotic solutions for agriculture in natural settings.\n","authors":["Chung Hee Kim","Abhisesh Silwal","George Kantor"],"pdf_url":"https://arxiv.org/pdf/2411.09929v1.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2407.05478v4","updated":"2024-11-15T03:13:00Z","published":"2024-07-07T19:33:30Z","title":"Sequential Gaussian Variational Inference for Nonlinear State Estimation\n and Its Application in Robot Navigation","summary":" Probabilistic state estimation is essential for robots navigating uncertain\nenvironments. Accurately and efficiently managing uncertainty in estimated\nstates is key to robust robotic operation. However, nonlinearities in robotic\nplatforms pose significant challenges that require advanced estimation\ntechniques. Gaussian variational inference (GVI) offers an optimization\nperspective on the estimation problem, providing analytically tractable\nsolutions and efficiencies derived from the geometry of Gaussian space. We\npropose a Sequential Gaussian Variational Inference (S-GVI) method to address\nnonlinearity and provide efficient sequential inference processes. Our approach\nintegrates sequential Bayesian principles into the GVI framework, which are\naddressed using statistical approximations and gradient updates on the\ninformation geometry. Validations through simulations and real-world\nexperiments demonstrate significant improvements in state estimation over the\nMaximum A Posteriori (MAP) estimation method.\n","authors":["Min-Won Seo","Solmaz S. 
Kia"],"pdf_url":"https://arxiv.org/pdf/2407.05478v4.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2411.09904v1","updated":"2024-11-15T02:59:16Z","published":"2024-11-15T02:59:16Z","title":"Self-Supervised Learning of Grasping Arbitrary Objects On-the-Move","summary":" Mobile grasping enhances manipulation efficiency by utilizing robots'\nmobility. This study aims to enable a commercial off-the-shelf robot for mobile\ngrasping, requiring precise timing and pose adjustments. Self-supervised\nlearning can develop a generalizable policy to adjust the robot's velocity and\ndetermine grasp position and orientation based on the target object's shape and\npose. Due to mobile grasping's complexity, action primitivization and\nstep-by-step learning are crucial to avoid data sparsity in learning from trial\nand error. This study simplifies mobile grasping into two grasp action\nprimitives and a moving action primitive, which can be operated with limited\ndegrees of freedom for the manipulator. This study introduces three fully\nconvolutional neural network (FCN) models to predict static grasp primitive,\ndynamic grasp primitive, and residual moving velocity error from visual inputs.\nA two-stage grasp learning approach facilitates seamless FCN model learning.\nThe ablation study demonstrated that the proposed method achieved the highest\ngrasping accuracy and pick-and-place efficiency. 
Furthermore, randomizing\nobject shapes and environments in the simulation effectively achieved\ngeneralizable mobile grasping.\n","authors":["Takuya Kiyokawa","Eiki Nagata","Yoshihisa Tsurumine","Yuhwan Kwon","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2411.09904v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.09892v1","updated":"2024-11-15T02:36:36Z","published":"2024-11-15T02:36:36Z","title":"Deep learning robotics using self-supervised spatial differentiation\n drive autonomous contact-based semiconductor characterization","summary":" Integrating autonomous contact-based robotic characterization into\nself-driving laboratories can enhance measurement quality, reliability, and\nthroughput. While deep learning models support robust autonomy, current methods\nlack pixel-precision positioning and require extensive labeled data. To\novercome these challenges, we propose a self-supervised convolutional neural\nnetwork with a spatially differentiable loss function, incorporating shape\npriors to refine predictions of optimal robot contact poses for semiconductor\ncharacterization. This network improves valid pose generation by 20.0%,\nrelative to existing models. We demonstrate our network's performance by\ndriving a 4-degree-of-freedom robot to characterize photoconductivity at 3,025\npredicted poses across a gradient of perovskite compositions, achieving\nthroughputs over 125 measurements per hour. Spatially mapping photoconductivity\nonto each drop-casted film reveals regions of inhomogeneity. With this\nself-supervised deep learning-driven robotic system, we enable high-precision\nand reliable automation of contact-based characterization techniques at high\nthroughputs, thereby allowing the measurement of previously inaccessible yet\nimportant semiconductor properties for self-driving laboratories.\n","authors":["Alexander E. 
Siemenn","Basita Das","Kangyu Ji","Fang Sheng","Tonio Buonassisi"],"pdf_url":"https://arxiv.org/pdf/2411.09892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09891v1","updated":"2024-11-15T02:35:20Z","published":"2024-11-15T02:35:20Z","title":"Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward\n Augmented Imitation","summary":" Training a policy in a source domain for deployment in the target domain\nunder a dynamics shift can be challenging, often resulting in performance\ndegradation. Previous work tackles this challenge by training on the source\ndomain with modified rewards derived by matching distributions between the\nsource and the target optimal trajectories. However, pure modified rewards only\nensure the behavior of the learned policy in the source domain resembles\ntrajectories produced by the target optimal policies, which does not guarantee\noptimal performance when the learned policy is actually deployed to the target\ndomain. In this work, we propose to utilize imitation learning to transfer the\npolicy learned from the reward modification to the target domain so that the\nnew policy can generate the same trajectories in the target domain. 
Our\napproach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL),\nutilizes the reward modification for domain adaptation and follows the general\nframework of generative adversarial imitation learning from observation (GAIfO)\nby applying a reward augmented estimator for the policy optimization step.\nTheoretically, we present an error bound for our method under a mild assumption\nregarding the dynamics shift to justify the motivation of our method.\nEmpirically, our method outperforms the pure modified reward method without\nimitation learning and also outperforms other baselines in benchmark\noff-dynamics environments.\n","authors":["Yihong Guo","Yixuan Wang","Yuanyuan Shi","Pan Xu","Anqi Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09891v1.pdf","comment":"Published at Neurips 2024"},{"id":"http://arxiv.org/abs/2411.09887v1","updated":"2024-11-15T02:18:35Z","published":"2024-11-15T02:18:35Z","title":"Planning by Simulation: Motion Planning with Learning-based Parallel\n Scenario Prediction for Autonomous Driving","summary":" Planning safe trajectories for autonomous vehicles is essential for\noperational safety but remains extremely challenging due to the complex\ninteractions among traffic participants. Recent autonomous driving frameworks\nhave focused on improving prediction accuracy to explicitly model these\ninteractions. However, some methods overlook the significant influence of the\nego vehicle's planning on the possible trajectories of other agents, which can\nalter prediction accuracy and lead to unsafe planning decisions. In this paper,\nwe propose a novel motion Planning approach by Simulation with learning-based\nparallel scenario prediction (PS). PS deduces predictions iteratively based on\nMonte Carlo Tree Search (MCTS), jointly inferring scenarios that cooperate with\nthe ego vehicle's planning set. Our method simulates possible scenes and\ncalculates their costs after the ego vehicle executes potential actions. 
To\nbalance and prune unreasonable actions and scenarios, we adopt MCTS as the\nfoundation to explore possible future interactions encoded within the\nprediction network. Moreover, the query-centric trajectory prediction\nstreamlines our scene generation, enabling a sophisticated framework that\ncaptures the mutual influence between other agents' predictions and the ego\nvehicle's planning. We evaluate our framework on the Argoverse 2 dataset, and\nthe results demonstrate that our approach effectively achieves parallel ego\nvehicle planning.\n","authors":["Tian Niu","Kaizhao Zhang","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2411.09887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09870v1","updated":"2024-11-15T01:22:28Z","published":"2024-11-15T01:22:28Z","title":"Impact-Aware Control using Time-Invariant Reference Spreading","summary":" With the goal of increasing the speed and efficiency in robotic manipulation,\na control approach is presented that aims to utilize intentional simultaneous\nimpacts to its advantage. This approach exploits the concept of the\ntime-invariant reference spreading framework, in which partly-overlapping ante-\nand post-impact reference vector fields are used. These vector fields are\ncoupled via an impact model in proximity of the expected impact area,\nminimizing the otherwise large impact-induced velocity errors and control\nefforts. We show how a nonsmooth physics engine can be used to construct this\nimpact model for complex scenarios, which warrants applicability to a large\nrange of possible impact states without requiring contact stiffness and damping\nparameters. In addition, a novel interim-impact control mode provides\nrobustness in the execution against the inevitable lack of exact impact\nsimultaneity and the corresponding unreliable velocity error during the time\nwhen contact is only partially established. 
This interim mode uses a position\nfeedback signal that is derived from the ante-impact velocity reference to\npromote contact completion, and smoothly transitions into the post-impact mode.\nAn experimental validation of time-invariant reference spreading control is\npresented for the first time through a set of 600 robotic hit-and-push and\ndual-arm grabbing experiments.\n","authors":["Jari van Steen","Nathan van de Wouw","Alessandro Saccon"],"pdf_url":"https://arxiv.org/pdf/2411.09870v1.pdf","comment":"15 pages, 10 figures. Submitted to IEEE Transactions on Robotics\n (T-RO)"},{"id":"http://arxiv.org/abs/2411.10603v1","updated":"2024-11-15T21:53:41Z","published":"2024-11-15T21:53:41Z","title":"A Novel MLLM-based Approach for Autonomous Driving in Different Weather\n Conditions","summary":" Autonomous driving (AD) technology promises to revolutionize daily\ntransportation by making it safer, more efficient, and more comfortable. Their\nrole in reducing traffic accidents and improving mobility will be vital to the\nfuture of intelligent transportation systems. Autonomous driving in harsh\nenvironmental conditions presents significant challenges that demand robust and\nadaptive solutions and require more investigation. In this context, we present\nin this paper a comprehensive performance analysis of an autonomous driving\nagent leveraging the capabilities of a Multi-modal Large Language Model (MLLM)\nusing GPT-4o within the LimSim++ framework that offers close loop interaction\nwith the CARLA driving simulator. We call it MLLM-AD-4o. Our study evaluates\nthe agent's decision-making, perception, and control under adverse conditions,\nincluding bad weather, poor visibility, and complex traffic scenarios. Our\nresults demonstrate the AD agent's ability to maintain high levels of safety\nand efficiency, even in challenging environments, underscoring the potential of\nGPT-4o to enhance autonomous driving systems (ADS) in any environment\ncondition. 
Moreover, we evaluate the performance of MLLM-AD-4o when different\nperception entities are used including either front cameras only, front and\nrear cameras, and when combined with LiDAR. The results of this work provide\nvaluable insights into integrating MLLMs with AD frameworks, paving the way for\nfuture advancements in this field.\n","authors":["Sonda Fourati","Wael Jaafar","Noura Baccar"],"pdf_url":"https://arxiv.org/pdf/2411.10603v1.pdf","comment":"9 pages, 6 figures; Submitted to IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2411.10585v1","updated":"2024-11-15T21:15:21Z","published":"2024-11-15T21:15:21Z","title":"Autonomous Sensor Exchange and Calibration for Cornstalk Nitrate\n Monitoring Robot","summary":" Interactive sensors are an important component of robotic systems but often\nrequire manual replacement due to wear and tear. Automating this process can\nenhance system autonomy and facilitate long-term deployment. We developed an\nautonomous sensor exchange and calibration system for an agriculture crop\nmonitoring robot that inserts a nitrate sensor into cornstalks. A novel gripper\nand replacement mechanism, featuring a reliable funneling design, were\ndeveloped to enable efficient and reliable sensor exchanges. To maintain\nconsistent nitrate sensor measurement, an on-board sensor calibration station\nwas integrated to provide in-field sensor cleaning and calibration. 
The system\nwas deployed at the Ames Curtis Farm in June 2024, where it successfully\ninserted nitrate sensors with high accuracy into 30 cornstalks with a 77$\\%$\nsuccess rate.\n","authors":["Janice Seungyeon Lee","Thomas Detlefsen","Shara Lawande","Saudamini Ghatge","Shrudhi Ramesh Shanthi","Sruthi Mukkamala","George Kantor","Oliver Kroemer"],"pdf_url":"https://arxiv.org/pdf/2411.10585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10546v1","updated":"2024-11-15T19:43:24Z","published":"2024-11-15T19:43:24Z","title":"The Oxford Spires Dataset: Benchmarking Large-Scale LiDAR-Visual\n Localisation, Reconstruction and Radiance Field Methods","summary":" This paper introduces a large-scale multi-modal dataset captured in and\naround well-known landmarks in Oxford using a custom-built multi-sensor\nperception unit as well as a millimetre-accurate map from a Terrestrial LiDAR\nScanner (TLS). The perception unit includes three synchronised global shutter\ncolour cameras, an automotive 3D LiDAR scanner, and an inertial sensor - all\nprecisely calibrated. We also establish benchmarks for tasks involving\nlocalisation, reconstruction, and novel-view synthesis, which enable the\nevaluation of Simultaneous Localisation and Mapping (SLAM) methods,\nStructure-from-Motion (SfM) and Multi-view Stereo (MVS) methods as well as\nradiance field methods such as Neural Radiance Fields (NeRF) and 3D Gaussian\nSplatting. To evaluate 3D reconstruction the TLS 3D models are used as ground\ntruth. Localisation ground truth is computed by registering the mobile LiDAR\nscans to the TLS 3D models. Radiance field methods are evaluated not only with\nposes sampled from the input trajectory, but also from viewpoints that are from\ntrajectories which are distant from the training poses. 
Our evaluation\ndemonstrates a key limitation of state-of-the-art radiance field methods: we\nshow that they tend to overfit to the training poses/images and do not\ngeneralise well to out-of-sequence poses. They also underperform in 3D\nreconstruction compared to MVS systems using the same visual inputs. Our\ndataset and benchmarks are intended to facilitate better integration of\nradiance field methods and SLAM systems. The raw and processed data, along with\nsoftware for parsing and evaluation, can be accessed at\nhttps://dynamic.robots.ox.ac.uk/datasets/oxford-spires/.\n","authors":["Yifu Tao","Miguel Ángel Muñoz-Bañón","Lintong Zhang","Jiahao Wang","Lanke Frank Tarimo Fu","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2411.10546v1.pdf","comment":"Website: https://dynamic.robots.ox.ac.uk/datasets/oxford-spires/"},{"id":"http://arxiv.org/abs/2411.10535v1","updated":"2024-11-15T19:11:58Z","published":"2024-11-15T19:11:58Z","title":"Advancing Autonomous Driving Perception: Analysis of Sensor Fusion and\n Computer Vision Techniques","summary":" In autonomous driving, perception systems are piv otal as they interpret\nsensory data to understand the envi ronment, which is essential for\ndecision-making and planning.\n Ensuring the safety of these perception systems is fundamental\n for achieving high-level autonomy, allowing us to confidently\n delegate driving and monitoring tasks to machines. This re port aims to\nenhance the safety of perception systems by\n examining and summarizing the latest advancements in vision\n based systems, and metrics for perception tasks in autonomous\n driving. The report also underscores significant achievements and\n recognized challenges faced by current research in this field. This\n project focuses on enhancing the understanding and navigation\n capabilities of self-driving robots through depth based perception\n and computer vision techniques. 
Specifically, it explores how we\n can perform better navigation into unknown map 2D map with\n existing detection and tracking algorithms and on top of that how\n depth based perception can enhance the navigation capabilities of\n the wheel based bots to improve autonomous driving perception.\n","authors":["Urvishkumar Bharti","Vikram Shahapur"],"pdf_url":"https://arxiv.org/pdf/2411.10535v1.pdf","comment":"7 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.10442v1","updated":"2024-11-15T18:59:27Z","published":"2024-11-15T18:59:27Z","title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via\n Mixed Preference Optimization","summary":" Existing open-source multimodal large language models (MLLMs) generally\nfollow a training process involving pre-training and supervised fine-tuning.\nHowever, these models suffer from distribution shifts, which limit their\nmultimodal reasoning, particularly in the Chain-of-Thought (CoT) performance.\nTo address this, we introduce a preference optimization (PO) process to enhance\nthe multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data\nside, we design an automated preference data construction pipeline to create\nMMPR, a high-quality, large-scale multimodal reasoning preference dataset. and\n(2) on the model side, we explore integrating PO with MLLMs, developing a\nsimple yet effective method, termed Mixed Preference Optimization (MPO), which\nboosts multimodal CoT performance. Our approach demonstrates improved\nperformance across multiple benchmarks, particularly in multimodal reasoning\ntasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on\nMathVista, outperforming InternVL2-8B by 8.7 points and achieving performance\ncomparable to the 10x larger InternVL2-76B. We hope this study could inspire\nfurther advancements in MLLMs. 
Code, data, and model shall be publicly\nreleased.\n","authors":["Weiyun Wang","Zhe Chen","Wenhai Wang","Yue Cao","Yangzhou Liu","Zhangwei Gao","Jinguo Zhu","Xizhou Zhu","Lewei Lu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2411.10442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10440v1","updated":"2024-11-15T18:58:31Z","published":"2024-11-15T18:58:31Z","title":"LLaVA-o1: Let Vision Language Models Reason Step-by-Step","summary":" Large language models have demonstrated substantial advancements in reasoning\ncapabilities, particularly through inference-time scaling, as illustrated by\nmodels such as OpenAI's o1. However, current Vision-Language Models (VLMs)\noften struggle to perform systematic and structured reasoning, especially when\nhandling complex visual question-answering tasks. In this work, we introduce\nLLaVA-o1, a novel VLM designed to conduct autonomous multistage reasoning.\nUnlike chain-of-thought prompting, LLaVA-o1 independently engages in sequential\nstages of summarization, visual interpretation, logical reasoning, and\nconclusion generation. This structured approach enables LLaVA-o1 to achieve\nmarked improvements in precision on reasoning-intensive tasks. To accomplish\nthis, we compile the LLaVA-o1-100k dataset, integrating samples from various\nvisual question answering sources and providing structured reasoning\nannotations. Besides, we propose an inference-time stage-level beam search\nmethod, which enables effective inference-time scaling. 
Remarkably, with only\n100k training samples and a simple yet effective inference time scaling method,\nLLaVA-o1 not only outperforms its base model by 8.9% on a wide range of\nmultimodal reasoning benchmarks, but also surpasses the performance of larger\nand even closed-source models, such as Gemini-1.5-pro, GPT-4o-mini, and\nLlama-3.2-90B-Vision-Instruct.\n","authors":["Guowei Xu","Peng Jin","Li Hao","Yibing Song","Lichao Sun","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.10440v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.10436v1","updated":"2024-11-15T18:56:01Z","published":"2024-11-15T18:56:01Z","title":"Mitigating Hallucination in Multimodal Large Language Model via\n Hallucination-targeted Direct Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) are known to hallucinate, which\nlimits their practical applications. Recent works have attempted to apply\nDirect Preference Optimization (DPO) to enhance the performance of MLLMs, but\nhave shown inconsistent improvements in mitigating hallucinations. To address\nthis issue more effectively, we introduce Hallucination-targeted Direct\nPreference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike\nprevious approaches, our method tackles hallucinations from their diverse forms\nand causes. 
Specifically, we develop three types of preference pair data\ntargeting the following causes of MLLM hallucinations: (1) insufficient visual\ncapabilities, (2) long context generation, and (3) multimodal conflicts.\nExperimental results demonstrate that our method achieves superior performance\nacross multiple hallucination evaluation datasets, surpassing most\nstate-of-the-art (SOTA) methods and highlighting the potential of our approach.\nAblation studies and in-depth analyses further confirm the effectiveness of our\nmethod and suggest the potential for further improvements through scaling up.\n","authors":["Yuhan Fu","Ruobing Xie","Xingwu Sun","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2411.10436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10433v1","updated":"2024-11-15T18:54:42Z","published":"2024-11-15T18:54:42Z","title":"M-VAR: Decoupled Scale-wise Autoregressive Modeling for High-Quality\n Image Generation","summary":" There exists recent work in computer vision, named VAR, that proposes a new\nautoregressive paradigm for image generation. Diverging from the vanilla\nnext-token prediction, VAR structurally reformulates the image generation into\na coarse to fine next-scale prediction. In this paper, we show that this\nscale-wise autoregressive framework can be effectively decoupled into\n\\textit{intra-scale modeling}, which captures local spatial dependencies within\neach scale, and \\textit{inter-scale modeling}, which models cross-scale\nrelationships progressively from coarse-to-fine scales. 
This decoupling\nstructure allows to rebuild VAR in a more computationally efficient manner.\nSpecifically, for intra-scale modeling -- crucial for generating high-fidelity\nimages -- we retain the original bidirectional self-attention design to ensure\ncomprehensive modeling; for inter-scale modeling, which semantically connects\ndifferent scales but is computationally intensive, we apply linear-complexity\nmechanisms like Mamba to substantially reduce computational overhead. We term\nthis new framework M-VAR. Extensive experiments demonstrate that our method\noutperforms existing models in both image quality and generation speed. For\nexample, our 1.5B model, with fewer parameters and faster inference speed,\noutperforms the largest VAR-d30-2B. Moreover, our largest model M-VAR-d32\nimpressively registers 1.78 FID on ImageNet 256$\\times$256 and outperforms the\nprior-art autoregressive models LlamaGen/VAR by 0.4/0.19 and popular diffusion\nmodels LDM/DiT by 1.82/0.49, respectively. Code is avaiable at\n\\url{https://github.com/OliverRensu/MVAR}.\n","authors":["Sucheng Ren","Yaodong Yu","Nataniel Ruiz","Feng Wang","Alan Yuille","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2411.10433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06169v2","updated":"2024-11-15T18:43:23Z","published":"2024-10-08T16:13:24Z","title":"Treat Visual Tokens as Text? But Your MLLM Only Needs Fewer Efforts to\n See","summary":" By treating visual tokens from visual encoders as text tokens, Multimodal\nLarge Language Models (MLLMs) have achieved remarkable progress across diverse\nvisual understanding tasks, leveraging the robust architectures of Large\nLanguage Models (LLMs). However, as token counts grow, the quadratic scaling of\ncomputation in LLMs introduces a significant efficiency bottleneck, impeding\nfurther scalability. 
Although recent approaches have explored pruning visual\ntokens or employing lighter LLM architectures, the computational overhead from\nan increasing number of visual tokens remains a substantial challenge.\n In this study, we investigate the redundancy in visual computation at both\nthe parameter and computational pattern levels within LLaVA, a representative\nMLLM, and introduce a suite of streamlined strategies to enhance efficiency.\nThese include neighbor-aware visual token attention, pruning of inactive visual\nattention heads, and selective layer dropping for visual computations. By\nimplementing these strategies in LLaVA, we achieve a reduction in computational\ndemands of 88% while maintaining model performance across key benchmarks.\nAdditionally, we validate the existence of visual computational redundancy in\nother MLLMs, such as Qwen2-VL-7B and InternVL-2.0-4B/8B/26B. These results\npresent a novel pathway for MLLMs to handle dense visual tokens with minimal\ncomputational costs. Code and model checkpoints will be released to support\nfurther research.\n","authors":["Zeliang Zhang","Phu Pham","Wentian Zhao","Kun Wan","Yu-Jhe Li","Jianing Zhou","Daniel Miranda","Ajinkya Kale","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.06169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10414v1","updated":"2024-11-15T18:34:07Z","published":"2024-11-15T18:34:07Z","title":"Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding\n Conversations","summary":" We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for\nhuman-AI conversations that involves image understanding: it can be used to\nsafeguard content for both multimodal LLM inputs (prompt classification) and\noutputs (response classification). 
Unlike the previous text-only Llama Guard\nversions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed\nto support image reasoning use cases and is optimized to detect harmful\nmultimodal (text and image) prompts and text responses to these prompts. Llama\nGuard 3 Vision is fine-tuned on Llama 3.2-Vision and demonstrates strong\nperformance on the internal benchmarks using the MLCommons taxonomy. We also\ntest its robustness against adversarial attacks. We believe that Llama Guard 3\nVision serves as a good starting point to build more capable and robust content\nmoderation tools for human-AI conversation with multimodal capabilities.\n","authors":["Jianfeng Chi","Ujjwal Karn","Hongyuan Zhan","Eric Smith","Javier Rando","Yiming Zhang","Kate Plawiak","Zacharie Delpierre Coudert","Kartikeya Upasani","Mahesh Pasupuleti"],"pdf_url":"https://arxiv.org/pdf/2411.10414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10411v1","updated":"2024-11-15T18:29:59Z","published":"2024-11-15T18:29:59Z","title":"Repurposing Stable Diffusion Attention for Training-Free Unsupervised\n Interactive Segmentation","summary":" Recent progress in interactive point prompt based Image Segmentation allows\nto significantly reduce the manual effort to obtain high quality semantic\nlabels. State-of-the-art unsupervised methods use self-supervised pre-trained\nmodels to obtain pseudo-labels which are used in training a prompt-based\nsegmentation model. In this paper, we propose a novel unsupervised and\ntraining-free approach based solely on the self-attention of Stable Diffusion.\nWe interpret the self-attention tensor as a Markov transition operator, which\nenables us to iteratively construct a Markov chain. Pixel-wise counting of the\nrequired number of iterations along the Markov-chain to reach a relative\nprobability threshold yields a Markov-iteration-map, which we simply call a\nMarkov-map. 
Compared to the raw attention maps, we show that our proposed\nMarkov-map has less noise, sharper semantic boundaries and more uniform values\nwithin semantically similar regions. We integrate the Markov-map in a simple\nyet effective truncated nearest neighbor framework to obtain interactive point\nprompt based segmentation. Despite being training-free, we experimentally show\nthat our approach yields excellent results in terms of Number of Clicks (NoC),\neven outperforming state-of-the-art training based unsupervised methods in most\nof the datasets.\n","authors":["Markus Karmann","Onay Urfalioglu"],"pdf_url":"https://arxiv.org/pdf/2411.10411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10403v1","updated":"2024-11-15T18:15:56Z","published":"2024-11-15T18:15:56Z","title":"On the Foundation Model for Cardiac MRI Reconstruction","summary":" In recent years, machine learning (ML) based reconstruction has been widely\ninvestigated and employed in cardiac magnetic resonance (CMR) imaging. ML-based\nreconstructions can deliver clinically acceptable image quality under\nsubstantially accelerated scans. ML-based reconstruction, however, also\nrequires substantial data and computational time to train the neural network,\nwhich is often optimized for a fixed acceleration rate or image contrast. In\npractice, imaging parameters are often tuned to best suit the diagnosis, which\nmay differ from the training data. This can result in degraded image quality,\nand multiple trained networks are needed to fulfill the clinical demands. In\nthis study, we propose a foundation model that uses adaptive unrolling,\nchannel-shifting, and Pattern and Contrast-Prompt-UNet (PCP-UNet) to tackle the\nproblem. In particular, the undersampled data goes through a different number\nof unrolled iterations according to its acceleration rate. Channel-shifting\nimproves reconstructed data quality. The PCP-UNet is equipped with an image\ncontrast and sampling pattern prompt. 
In vivo CMR experiments were performed\nusing mixed combinations of image contrasts, acceleration rates, and\n(under)sampling patterns. The proposed foundation model has significantly\nimproved image quality for a wide range of CMR protocols and outperforms the\nconventional ML-based method.\n","authors":["Chi Zhang","Michael Loecher","Cagan Alkan","Mahmut Yurt","Shreyas S. Vasanawala","Daniel B. Ennis"],"pdf_url":"https://arxiv.org/pdf/2411.10403v1.pdf","comment":"For MICCAI CMRxRecon Challenge 2024 team CardiAxs"},{"id":"http://arxiv.org/abs/2411.10389v1","updated":"2024-11-15T17:50:46Z","published":"2024-11-15T17:50:46Z","title":"Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets\n Using Key Point Localization","summary":" Internal crack detection has been a subject of focus in structural health\nmonitoring. By focusing on crack detection in structural datasets, it is\ndemonstrated that deep learning (DL) methods can effectively analyze seismic\nwave fields interacting with micro-scale cracks, which are beyond the\nresolution of conventional visual inspection. This work explores a novel\napplication of DL-based key point detection technique, where cracks are\nlocalized by predicting the coordinates of four key points that define a\nbounding region of the crack. The study not only opens new research directions\nfor non-visual applications but also effectively mitigates the impact of\nimbalanced data which poses a challenge for previous DL models, as it can be\nbiased toward predicting the majority class (non-crack regions). Popular DL\ntechniques, such as the Inception blocks, are used and investigated. 
The model\nshows an overall reduction in loss when applied to micro-scale crack detection\nand is reflected in the lower average deviation between the location of actual\nand predicted cracks, with an average Intersection over Union (IoU) being 0.511\nfor all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro\ncracks (greater than 4 micrometers).\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09484v2","updated":"2024-11-15T17:48:31Z","published":"2024-11-14T14:37:50Z","title":"Image Matching Filtering and Refinement by Planes and Beyond","summary":" This paper introduces a modular, non-deep learning method for filtering and\nrefining sparse correspondences in image matching. Assuming that motion flow\nwithin the scene can be approximated by local homography transformations,\nmatches are aggregated into overlapping clusters corresponding to virtual\nplanes using an iterative RANSAC-based approach, with non-conforming\ncorrespondences discarded. Moreover, the underlying planar structural design\nprovides an explicit map between local patches associated with the matches,\nenabling optional refinement of keypoint positions through cross-correlation\ntemplate matching after patch reprojection. Finally, to enhance robustness and\nfault-tolerance against violations of the piece-wise planar approximation\nassumption, a further strategy is designed for minimizing relative patch\ndistortion in the plane reprojection by introducing an intermediate homography\nthat projects both patches into a common plane. The proposed method is\nextensively evaluated on standard datasets and image matching pipelines, and\ncompared with state-of-the-art approaches. 
Unlike other current comparisons,\nthe proposed benchmark also takes into account the more general, real, and\npractical cases where camera intrinsics are unavailable. Experimental results\ndemonstrate that our proposed non-deep learning, geometry-based approach\nachieves performances that are either superior to or on par with recent\nstate-of-the-art deep learning methods. Finally, this study suggests that there\nare still development potential in actual image matching solutions in the\nconsidered research direction, which could be in the future incorporated in\nnovel deep image matching architectures.\n","authors":["Fabio Bellavia","Zhenjun Zhao","Luca Morelli","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2411.09484v2.pdf","comment":"project page: https://github.com/fb82/MiHo"},{"id":"http://arxiv.org/abs/2411.10377v1","updated":"2024-11-15T17:32:01Z","published":"2024-11-15T17:32:01Z","title":"Generation of synthetic gait data: application to multiple sclerosis\n patients' gait patterns","summary":" Multiple sclerosis (MS) is the leading cause of severe non-traumatic\ndisability in young adults and its incidence is increasing worldwide. The\nvariability of gait impairment in MS necessitates the development of a\nnon-invasive, sensitive, and cost-effective tool for quantitative gait\nevaluation. The eGait movement sensor, designed to characterize human gait\nthrough unit quaternion time series (QTS) representing hip rotations, is a\npromising approach. However, the small sample sizes typical of clinical studies\npose challenges for the stability of gait data analysis tools. To address these\nchallenges, this article presents two key scientific contributions. First, a\ncomprehensive framework is proposed for transforming QTS data into a form that\npreserves the essential geometric properties of gait while enabling the use of\nany tabular synthetic data generation method. 
Second, a synthetic data\ngeneration method is introduced, based on nearest neighbors weighting, which\nproduces high-fidelity synthetic QTS data suitable for small datasets and\nprivate data environments. The effectiveness of the proposed method, is\ndemonstrated through its application to MS gait data, showing very good\nfidelity and respect of the initial geometry of the data. Thanks to this work,\nwe are able to produce synthetic data sets and work on the stability of\nclustering methods.\n","authors":["Klervi Le Gall","Lise Bellanger","David Laplaud"],"pdf_url":"https://arxiv.org/pdf/2411.10377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10369v1","updated":"2024-11-15T17:19:18Z","published":"2024-11-15T17:19:18Z","title":"Towards High-Fidelity 3D Portrait Generation with Rich Details by\n Cross-View Prior-Aware Diffusion","summary":" Recent diffusion-based Single-image 3D portrait generation methods typically\nemploy 2D diffusion models to provide multi-view knowledge, which is then\ndistilled into 3D representations. However, these methods usually struggle to\nproduce high-fidelity 3D models, frequently yielding excessively blurred\ntextures. We attribute this issue to the insufficient consideration of\ncross-view consistency during the diffusion process, resulting in significant\ndisparities between different views and ultimately leading to blurred 3D\nrepresentations. In this paper, we address this issue by comprehensively\nexploiting multi-view priors in both the conditioning and diffusion procedures\nto produce consistent, detail-rich portraits. From the conditioning standpoint,\nwe propose a Hybrid Priors Diffsion model, which explicitly and implicitly\nincorporates multi-view priors as conditions to enhance the status consistency\nof the generated multi-view portraits. 
From the diffusion perspective,\nconsidering the significant impact of the diffusion noise distribution on\ndetailed texture generation, we propose a Multi-View Noise Resamplig Strategy\nintegrated within the optimization process leveraging cross-view priors to\nenhance representation consistency. Extensive experiments demonstrate that our\nmethod can produce 3D portraits with accurate geometry and rich details from a\nsingle image. The project page is at\n\\url{https://haoran-wei.github.io/Portrait-Diffusion}.\n","authors":["Haoran Wei","Wencheng Han","Xingping Dong","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.10369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10368v1","updated":"2024-11-15T17:17:46Z","published":"2024-11-15T17:17:46Z","title":"Mechanisms of Generative Image-to-Image Translation Networks","summary":" Generative Adversarial Networks (GANs) are a class of neural networks that\nhave been widely used in the field of image-to-image translation. In this\npaper, we propose a streamlined image-to-image translation network with a\nsimpler architecture compared to existing models. We investigate the\nrelationship between GANs and autoencoders and provide an explanation for the\nefficacy of employing only the GAN component for tasks involving image\ntranslation. We show that adversarial for GAN models yields results comparable\nto those of existing methods without additional complex loss penalties.\nSubsequently, we elucidate the rationale behind this phenomenon. 
We also\nincorporate experimental results to demonstrate the validity of our findings.\n","authors":["Guangzong Chen","Mingui Sun","Zhi-Hong Mao","Kangni Liu","Wenyan Jia"],"pdf_url":"https://arxiv.org/pdf/2411.10368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10357v1","updated":"2024-11-15T17:06:50Z","published":"2024-11-15T17:06:50Z","title":"Interactive Image-Based Aphid Counting in Yellow Water Traps under\n Stirring Actions","summary":" The current vision-based aphid counting methods in water traps suffer from\nundercounts caused by occlusions and low visibility arising from dense\naggregation of insects and other objects. To address this problem, we propose a\nnovel aphid counting method through interactive stirring actions. We use\ninteractive stirring to alter the distribution of aphids in the yellow water\ntrap and capture a sequence of images which are then used for aphid detection\nand counting through an optimized small object detection network based on\nYolov5. We also propose a counting confidence evaluation system to evaluate the\nconfidence of count-ing results. The final counting result is a weighted sum of\nthe counting results from all sequence images based on the counting confidence.\nExperimental results show that our proposed aphid detection network\nsignificantly outperforms the original Yolov5, with improvements of 33.9% in\nAP@0.5 and 26.9% in AP@[0.5:0.95] on the aphid test set. 
In addition, the aphid\ncounting test results using our proposed counting confidence evaluation system\nshow significant improvements over the static counting method, closely aligning\nwith manual counting results.\n","authors":["Xumin Gao","Mark Stevens","Grzegorz Cielniak"],"pdf_url":"https://arxiv.org/pdf/2411.10357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10346v1","updated":"2024-11-15T16:46:04Z","published":"2024-11-15T16:46:04Z","title":"BiDense: Binarization for Dense Prediction","summary":" Dense prediction is a critical task in computer vision. However, previous\nmethods often require extensive computational resources, which hinders their\nreal-world application. In this paper, we propose BiDense, a generalized binary\nneural network (BNN) designed for efficient and accurate dense prediction\ntasks. BiDense incorporates two key techniques: the Distribution-adaptive\nBinarizer (DAB) and the Channel-adaptive Full-precision Bypass (CFB). The DAB\nadaptively calculates thresholds and scaling factors for binarization,\neffectively retaining more information within BNNs. Meanwhile, the CFB\nfacilitates full-precision bypassing for binary convolutional layers undergoing\nvarious channel size transformations, which enhances the propagation of\nreal-valued signals and minimizes information loss. By leveraging these\ntechniques, BiDense preserves more real-valued information, enabling more\naccurate and detailed dense predictions in BNNs. 
Extensive experiments\ndemonstrate that our framework achieves performance levels comparable to\nfull-precision models while significantly reducing memory usage and\ncomputational costs.\n","authors":["Rui Yin","Haotong Qin","Yulun Zhang","Wenbo Li","Yong Guo","Jianjun Zhu","Cheng Wang","Biao Jia"],"pdf_url":"https://arxiv.org/pdf/2411.10346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10345v1","updated":"2024-11-15T16:45:08Z","published":"2024-11-15T16:45:08Z","title":"Comparative Analysis of Machine Learning Approaches for Bone Age\n Assessment: A Comprehensive Study on Three Distinct Models","summary":" Radiologists and doctors make use of X-ray images of the non-dominant hands\nof children and infants to assess the possibility of genetic conditions and\ngrowth abnormalities. This is done by assessing the difference between the\nactual extent of growth found using the X-rays and the chronological age of the\nsubject. The assessment was done conventionally using The Greulich Pyle (GP) or\nTanner Whitehouse (TW) approach. These approaches require a high level of\nexpertise and may often lead to observer bias. Hence, to automate the process\nof assessing the X-rays, and to increase its accuracy and efficiency, several\nmachine learning models have been developed. These machine-learning models have\nseveral differences in their accuracy and efficiencies, leading to an unclear\nchoice for the suitable model depending on their needs and available resources.\nMethods: In this study, we have analyzed the 3 most widely used models for the\nautomation of bone age prediction, which are the Xception model, VGG model and\nCNN model. These models were trained on the preprocessed dataset and the\naccuracy was measured using the MAE in terms of months for each model. Using\nthis, the comparison between the models was done. 
Results: The 3 models,\nXception, VGG, and CNN models have been tested for accuracy and other relevant\nfactors.\n","authors":["Nandavardhan R.","Somanathan R.","Vikram Suresh","Savaridassan P"],"pdf_url":"https://arxiv.org/pdf/2411.10345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10334v1","updated":"2024-11-15T16:33:59Z","published":"2024-11-15T16:33:59Z","title":"Y-MAP-Net: Real-time depth, normals, segmentation, multi-label\n captioning and 2D human pose in RGB images","summary":" We present Y-MAP-Net, a Y-shaped neural network architecture designed for\nreal-time multi-task learning on RGB images. Y-MAP-Net, simultaneously predicts\ndepth, surface normals, human pose, semantic segmentation and generates\nmulti-label captions, all from a single network evaluation. To achieve this, we\nadopt a multi-teacher, single-student training paradigm, where task-specific\nfoundation models supervise the network's learning, enabling it to distill\ntheir capabilities into a lightweight architecture suitable for real-time\napplications. Y-MAP-Net, exhibits strong generalization, simplicity and\ncomputational efficiency, making it ideal for robotics and other practical\nscenarios. To support future research, we will release our code publicly.\n","authors":["Ammar Qammaz","Nikolaos Vasilikopoulos","Iason Oikonomidis","Antonis A. Argyros"],"pdf_url":"https://arxiv.org/pdf/2411.10334v1.pdf","comment":"8 page paper, 6 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2411.10332v1","updated":"2024-11-15T16:32:34Z","published":"2024-11-15T16:32:34Z","title":"Number it: Temporal Grounding Videos like Flipping Manga","summary":" Video Large Language Models (Vid-LLMs) have made remarkable advancements in\ncomprehending video content for QA dialogue. However, they struggle to extend\nthis visual understanding to tasks requiring precise temporal localization,\nknown as Video Temporal Grounding (VTG). 
To address this gap, we introduce\nNumber-Prompt (NumPro), a novel method that empowers Vid-LLMs to bridge visual\ncomprehension with temporal grounding by adding unique numerical identifiers to\neach video frame. Treating a video as a sequence of numbered frame images,\nNumPro transforms VTG into an intuitive process: flipping through manga panels\nin sequence. This allows Vid-LLMs to \"read\" event timelines, accurately linking\nvisual content with corresponding temporal information. Our experiments\ndemonstrate that NumPro significantly boosts VTG performance of top-tier\nVid-LLMs without additional computational cost. Furthermore, fine-tuning on a\nNumPro-enhanced dataset defines a new state-of-the-art for VTG, surpassing\nprevious top-performing methods by up to 6.9\\% in mIoU for moment retrieval and\n8.5\\% in mAP for highlight detection. The code will be available at\nhttps://github.com/yongliang-wu/NumPro.\n","authors":["Yongliang Wu","Xinting Hu","Yuyang Sun","Yizhou Zhou","Wenbo Zhu","Fengyun Rao","Bernt Schiele","Xu Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10332v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.10330v1","updated":"2024-11-15T16:29:57Z","published":"2024-11-15T16:29:57Z","title":"CNN-Based Classification of Persian Miniature Paintings from Five\n Renowned Schools","summary":" This article addresses the gap in computational painting analysis focused on\nPersian miniature painting, a rich cultural and artistic heritage. It\nintroduces a novel approach using Convolutional Neural Networks (CNN) to\nclassify Persian miniatures from five schools: Herat, Tabriz-e Avval, Shiraz-e\nAvval, Tabriz-e Dovvom, and Qajar. The method achieves an average accuracy of\nover 91%. A meticulously curated dataset captures the distinct features of each\nschool, with a patch-based CNN approach classifying image segments\nindependently before merging results for enhanced accuracy. 
This research\ncontributes significantly to digital art analysis, providing detailed insights\ninto the dataset, CNN architecture, training, and validation processes. It\nhighlights the potential for future advancements in automated art analysis,\nbridging machine learning, art history, and digital humanities, thereby aiding\nthe preservation and understanding of Persian cultural heritage.\n","authors":["Mojtaba Shahi","Roozbeh Rajabi","Farnaz Masoumzadeh"],"pdf_url":"https://arxiv.org/pdf/2411.10330v1.pdf","comment":"20 pages, submitted to journal"},{"id":"http://arxiv.org/abs/2411.10323v1","updated":"2024-11-15T16:23:52Z","published":"2024-11-15T16:23:52Z","title":"The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer\n Use","summary":" The recently released model, Claude 3.5 Computer Use, stands out as the first\nfrontier AI model to offer computer use in public beta as a graphical user\ninterface (GUI) agent. As an early beta, its capability in the real-world\ncomplex environment remains unknown. In this case study to explore Claude 3.5\nComputer Use, we curate and organize a collection of carefully designed tasks\nspanning a variety of domains and software. Observations from these cases\ndemonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end\nlanguage to desktop actions. Along with this study, we provide an\nout-of-the-box agent framework for deploying API-based GUI automation models\nwith easy implementation. Our case studies aim to showcase a groundwork of\ncapabilities and limitations of Claude 3.5 Computer Use with detailed analyses\nand bring to the fore questions about planning, action, and critic, which must\nbe considered for future improvement. We hope this preliminary exploration will\ninspire future research into the GUI agent community. 
All the test cases in the\npaper can be tried through the project:\nhttps://github.com/showlab/computer_use_ootb.\n","authors":["Siyuan Hu","Mingyu Ouyang","Difei Gao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2411.10323v1.pdf","comment":"40 pages, 21 figures, preprint"},{"id":"http://arxiv.org/abs/2411.10322v1","updated":"2024-11-15T16:22:32Z","published":"2024-11-15T16:22:32Z","title":"Melanoma Detection with Uncertainty Quantification","summary":" Early detection of melanoma is crucial for improving survival rates. Current\ndetection tools often utilize data-driven machine learning methods but often\noverlook the full integration of multiple datasets. We combine publicly\navailable datasets to enhance data diversity, allowing numerous experiments to\ntrain and evaluate various classifiers. We then calibrate them to minimize\nmisdiagnoses by incorporating uncertainty quantification. Our experiments on\nbenchmark datasets show accuracies of up to 93.2% before and 97.8% after\napplying uncertainty-based rejection, leading to a reduction in misdiagnoses by\nover 40.5%. Our code and data are publicly available, and a web-based interface\nfor quick melanoma detection of user-supplied images is also provided.\n","authors":["SangHyuk Kim","Edward Gaibor","Brian Matejek","Daniel Haehn"],"pdf_url":"https://arxiv.org/pdf/2411.10322v1.pdf","comment":"5 pages, 5 figures, 3 tables, submitted to ISBI2025"},{"id":"http://arxiv.org/abs/2411.10321v1","updated":"2024-11-15T16:22:22Z","published":"2024-11-15T16:22:22Z","title":"Probabilistic Prior Driven Attention Mechanism Based on Diffusion Model\n for Imaging Through Atmospheric Turbulence","summary":" Atmospheric turbulence introduces severe spatial and geometric distortions,\nchallenging traditional image restoration methods. 
We propose the Probabilistic\nPrior Turbulence Removal Network (PPTRN), which combines probabilistic\ndiffusion-based prior modeling with Transformer-driven feature extraction to\naddress this issue. PPTRN employs a two-stage approach: first, a latent encoder\nand Transformer are jointly trained on clear images to establish robust feature\nrepresentations. Then, a Denoising Diffusion Probabilistic Model (DDPM) models\nprior distributions over latent vectors, guiding the Transformer in capturing\ndiverse feature variations essential for restoration. A key innovation in PPTRN\nis the Probabilistic Prior Driven Cross Attention mechanism, which integrates\nthe DDPM-generated prior with feature embeddings to reduce artifacts and\nenhance spatial coherence. Extensive experiments validate that PPTRN\nsignificantly improves restoration quality on turbulence-degraded images,\nsetting a new benchmark in clarity and structural fidelity.\n","authors":["Guodong Sun","Qixiang Ma","Liqiang Zhang","Hongwei Wang","Zixuan Gao","Haotian Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10316v1","updated":"2024-11-15T16:14:48Z","published":"2024-11-15T16:14:48Z","title":"M3TR: Generalist HD Map Construction with Variable Map Priors","summary":" Autonomous vehicles require road information for their operation, usually in\nform of HD maps. Since offline maps eventually become outdated or may only be\npartially available, online HD map construction methods have been proposed to\ninfer map information from live sensor data. A key issue remains how to exploit\nsuch partial or outdated map information as a prior. We introduce M3TR\n(Multi-Masking Map Transformer), a generalist approach for HD map construction\nboth with and without map priors. We address shortcomings in ground truth\ngeneration for Argoverse 2 and nuScenes and propose the first realistic\nscenarios with semantically diverse map priors. 
Examining various query\ndesigns, we use an improved method for integrating prior map elements into a HD\nmap construction model, increasing performance by +4.3 mAP. Finally, we show\nthat training across all prior scenarios yields a single Generalist model,\nwhose performance is on par with previous Expert models that can handle only\none specific type of map prior. M3TR thus is the first model capable of\nleveraging variable map priors, making it suitable for real-world deployment.\nCode is available at https://github.com/immel-f/m3tr\n","authors":["Fabian Immel","Richard Fehler","Frank Bieder","Jan-Hendrik Pauls","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2411.10316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10309v1","updated":"2024-11-15T16:05:01Z","published":"2024-11-15T16:05:01Z","title":"Modification Takes Courage: Seamless Image Stitching via\n Reference-Driven Inpainting","summary":" Current image stitching methods often produce noticeable seams in challenging\nscenarios such as uneven hue and large parallax. To tackle this problem, we\npropose the Reference-Driven Inpainting Stitcher (RDIStitcher), which\nreformulates the image fusion and rectangling as a reference-based inpainting\nmodel, incorporating a larger modification fusion area and stronger\nmodification intensity than previous methods. Furthermore, we introduce a\nself-supervised model training method, which enables the implementation of\nRDIStitcher without requiring labeled data by fine-tuning a Text-to-Image (T2I)\ndiffusion model. Recognizing difficulties in assessing the quality of stitched\nimages, we present the Multimodal Large Language Models (MLLMs)-based metrics,\noffering a new perspective on evaluating stitched image quality. Compared to\nthe state-of-the-art (SOTA) method, extensive experiments demonstrate that our\nmethod significantly enhances content coherence and seamless transitions in the\nstitched images. 
Especially in the zero-shot experiments, our method exhibits\nstrong generalization capabilities. Code:\nhttps://github.com/yayoyo66/RDIStitcher\n","authors":["Ziqi Xie","Xiao Lai","Weidong Zhao","Xianhui Liu","Wenlong Hou"],"pdf_url":"https://arxiv.org/pdf/2411.10309v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.10308v1","updated":"2024-11-15T16:04:01Z","published":"2024-11-15T16:04:01Z","title":"A Realistic Collimated X-Ray Image Simulation Pipeline","summary":" Collimator detection remains a challenging task in X-ray systems with\nunreliable or non-available information about the detectors position relative\nto the source. This paper presents a physically motivated image processing\npipeline for simulating the characteristics of collimator shadows in X-ray\nimages. By generating randomized labels for collimator shapes and locations,\nincorporating scattered radiation simulation, and including Poisson noise, the\npipeline enables the expansion of limited datasets for training deep neural\nnetworks. We validate the proposed pipeline by a qualitative and quantitative\ncomparison against real collimator shadows. Furthermore, it is demonstrated\nthat utilizing simulated data within our deep learning framework not only\nserves as a suitable substitute for actual collimators but also enhances the\ngeneralization performance when applied to real-world data.\n","authors":["Benjamin El-Zein","Dominik Eckert","Thomas Weber","Maximilian Rohleder","Ludwig Ritschl","Steffen Kappler","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2411.10308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09871v4","updated":"2024-11-15T16:01:39Z","published":"2024-03-14T21:01:06Z","title":"ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric\n Thermal Images","summary":" Designing egocentric 3D hand pose estimation systems that can perform\nreliably in complex, real-world scenarios is crucial for downstream\napplications. 
Previous approaches using RGB or NIR imagery struggle in\nchallenging conditions: RGB methods are susceptible to lighting variations and\nobstructions like handwear, while NIR techniques can be disrupted by sunlight\nor interference from other NIR-equipped devices. To address these limitations,\nwe present ThermoHands, the first benchmark focused on thermal image-based\negocentric 3D hand pose estimation, demonstrating the potential of thermal\nimaging to achieve robust performance under these conditions. The benchmark\nincludes a multi-view and multi-spectral dataset collected from 28 subjects\nperforming hand-object and hand-virtual interactions under diverse scenarios,\naccurately annotated with 3D hand poses through an automated process. We\nintroduce a new baseline method, TherFormer, utilizing dual transformer modules\nfor effective egocentric 3D hand pose estimation in thermal imagery. Our\nexperimental results highlight TherFormer's leading performance and affirm\nthermal imaging's effectiveness in enabling robust 3D hand pose estimation in\nadverse conditions.\n","authors":["Fangqiang Ding","Yunzhou Zhu","Xiangyu Wen","Gaowen Liu","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.09871v4.pdf","comment":"15 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.10293v1","updated":"2024-11-15T15:51:25Z","published":"2024-11-15T15:51:25Z","title":"RETR: Multi-View Radar Detection Transformer for Indoor Perception","summary":" Indoor radar perception has seen rising interest due to affordable costs\ndriven by emerging automotive imaging radar developments and the benefits of\nreduced privacy concerns and reliability under hazardous conditions (e.g., fire\nand smoke). However, existing radar perception pipelines fail to account for\ndistinctive characteristics of the multi-view radar setting. In this paper, we\npropose Radar dEtection TRansformer (RETR), an extension of the popular DETR\narchitecture, tailored for multi-view radar perception. 
RETR inherits the\nadvantages of DETR, eliminating the need for hand-crafted components for object\ndetection and segmentation in the image plane. More importantly, RETR\nincorporates carefully designed modifications such as 1) depth-prioritized\nfeature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss\nfrom both radar and camera coordinates; and 3) a learnable radar-to-camera\ntransformation via reparameterization, to account for the unique multi-view\nradar setting. Evaluated on two indoor radar perception datasets, our approach\noutperforms existing state-of-the-art methods by a margin of 15.38+ AP for\nobject detection and 11.77+ IoU for instance segmentation, respectively.\n","authors":["Ryoma Yataka","Adriano Cardace","Pu Perry Wang","Petros Boufounos","Ryuhei Takahashi"],"pdf_url":"https://arxiv.org/pdf/2411.10293v1.pdf","comment":"24 pages, Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10281v1","updated":"2024-11-15T15:36:48Z","published":"2024-11-15T15:36:48Z","title":"Multidimensional Byte Pair Encoding: Shortened Sequences for Improved\n Visual Data Generation","summary":" In language processing, transformers benefit greatly from text being\ncondensed. This is achieved through a larger vocabulary that captures word\nfragments instead of plain characters. This is often done with Byte Pair\nEncoding. In the context of images, tokenisation of visual data is usually\nlimited to regular grids obtained from quantisation methods, without global\ncontent awareness. Our work improves tokenisation of visual data by bringing\nByte Pair Encoding from 1D to multiple dimensions, as a complementary add-on to\nexisting compression. 
We achieve this through counting constellations of token\npairs and replacing the most frequent token pair with a newly introduced token.\nThe multidimensionality only increases the computation time by a factor of 2\nfor images, making it applicable even to large datasets like ImageNet within\nminutes on consumer hardware. This is a lossless preprocessing step. Our\nevaluation shows improved training and inference performance of transformers on\nvisual data achieved by compressing frequent constellations of tokens: The\nresulting sequences are shorter, with more uniformly distributed information\ncontent, e.g. condensing empty regions in an image into single tokens. As our\nexperiments show, these condensed sequences are easier to process. We\nadditionally introduce a strategy to amplify this compression further by\nclustering the vocabulary.\n","authors":["Tim Elsner","Paula Usinger","Julius Nehring-Wirxel","Gregor Kobsik","Victor Czech","Yanjiang He","Isaak Lim","Leif Kobbelt"],"pdf_url":"https://arxiv.org/pdf/2411.10281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10275v1","updated":"2024-11-15T15:31:58Z","published":"2024-11-15T15:31:58Z","title":"4DPV: 4D Pet from Videos by Coarse-to-Fine Non-Rigid Radiance Fields","summary":" We present a coarse-to-fine neural deformation model to simultaneously\nrecover the camera pose and the 4D reconstruction of an unknown object from\nmultiple RGB sequences in the wild. To that end, our approach does not consider\nany pre-built 3D template nor 3D training data as well as controlled\nillumination conditions, and can sort out the problem in a self-supervised\nmanner. Our model exploits canonical and image-variant spaces where both coarse\nand fine components are considered. We introduce a neural local quadratic model\nwith spatio-temporal consistency to encode fine details that is combined with\ncanonical embeddings in order to establish correspondences across sequences. 
We\nthoroughly validate the method on challenging scenarios with complex and\nreal-world deformations, providing both quantitative and qualitative\nevaluations, an ablation study and a comparison with respect to competing\napproaches. Our project is available at https://github.com/smontode24/4DPV.\n","authors":["Sergio M. de Paco","Antonio Agudo"],"pdf_url":"https://arxiv.org/pdf/2411.10275v1.pdf","comment":"17th Asian Conference on Computer Vision (ACCV 2024)"},{"id":"http://arxiv.org/abs/2303.07034v3","updated":"2024-11-15T15:31:44Z","published":"2023-03-13T11:53:40Z","title":"Pretrained ViTs Yield Versatile Representations For Medical Images","summary":" Convolutional Neural Networks (CNNs) have reigned for a decade as the de\nfacto approach to automated medical image diagnosis, pushing the\nstate-of-the-art in classification, detection and segmentation tasks. Over the\nlast years, vision transformers (ViTs) have appeared as a competitive\nalternative to CNNs, yielding impressive levels of performance in the natural\nimage domain, while possessing several interesting properties that could prove\nbeneficial for medical imaging tasks. In this work, we explore the benefits and\ndrawbacks of transformer-based models for medical image classification. We\nconduct a series of experiments on several standard 2D medical image benchmark\ndatasets and tasks. 
Our findings show that, while CNNs perform better if\ntrained from scratch, off-the-shelf vision transformers can perform on par with\nCNNs when pretrained on ImageNet, both in a supervised and self-supervised\nsetting, rendering them as a viable alternative to CNNs.\n","authors":["Christos Matsoukas","Johan Fredin Haslum","Moein Sorkhei","Magnus Söderberg","Kevin Smith"],"pdf_url":"https://arxiv.org/pdf/2303.07034v3.pdf","comment":"Extended version of arXiv:2108.09038 originally published at the ICCV\n 2021 Workshop on Computer Vision for Automated Medical Diagnosis"},{"id":"http://arxiv.org/abs/2411.10273v1","updated":"2024-11-15T15:31:06Z","published":"2024-11-15T15:31:06Z","title":"Fill in the blanks: Rethinking Interpretability in vision","summary":" Model interpretability is a key challenge that has yet to align with the\nadvancements observed in contemporary state-of-the-art deep learning models. In\nparticular, deep learning aided vision tasks require interpretability, in order\nfor their adoption in more specialized domains such as medical imaging.\nAlthough the field of explainable AI (XAI) developed methods for interpreting\nvision models along with early convolutional neural networks, recent XAI\nresearch has mainly focused on assigning attributes via saliency maps. As such,\nthese methods are restricted to providing explanations at a sample level, and\nmany explainability methods suffer from low adaptability across a wide range of\nvision models. In our work, we re-think vision-model explainability from a\nnovel perspective, to probe the general input structure that a model has learnt\nduring its training. To this end, we ask the question: \"How would a vision\nmodel fill-in a masked-image\". Experiments on standard vision datasets and\npre-trained models reveal consistent patterns, and could be intergrated as an\nadditional model-agnostic explainability tool in modern machine-learning\nplatforms. 
The code will be available at\n\\url{https://github.com/BoTZ-TND/FillingTheBlanks.git}\n","authors":["Pathirage N. Deelaka","Tharindu Wickremasinghe","Devin Y. De Silva","Lisara N. Gajaweera"],"pdf_url":"https://arxiv.org/pdf/2411.10273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14551v2","updated":"2024-11-15T15:16:56Z","published":"2024-02-22T13:45:01Z","title":"CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for\n Optimized Learning Fusion","summary":" State-of-the-art pre-trained image models predominantly adopt a two-stage\napproach: initial unsupervised pre-training on large-scale datasets followed by\ntask-specific fine-tuning using Cross-Entropy loss~(CE). However, it has been\ndemonstrated that CE can compromise model generalization and stability. While\nrecent works employing contrastive learning address some of these limitations\nby enhancing the quality of embeddings and producing better decision\nboundaries, they often overlook the importance of hard negative mining and rely\non resource intensive and slow training using large sample batches. To counter\nthese issues, we introduce a novel approach named CLCE, which integrates\nLabel-Aware Contrastive Learning with CE. Our approach not only maintains the\nstrengths of both loss functions but also leverages hard negative mining in a\nsynergistic way to enhance performance. Experimental results demonstrate that\nCLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks,\nachieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in\ntransfer learning settings with the BEiT-3 model. 
Importantly, our proposed\nCLCE approach effectively mitigates the dependency of contrastive learning on\nlarge batch sizes such as 4096 samples per batch, a limitation that has\npreviously constrained the application of contrastive learning in\nbudget-limited hardware environments.\n","authors":["Zijun Long","George Killick","Lipeng Zhuang","Gerardo Aragon-Camarasa","Zaiqiao Meng","Richard Mccreadie"],"pdf_url":"https://arxiv.org/pdf/2402.14551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10261v1","updated":"2024-11-15T15:08:04Z","published":"2024-11-15T15:08:04Z","title":"Partial Scene Text Retrieval","summary":" The task of partial scene text retrieval involves localizing and searching\nfor text instances that are the same or similar to a given query text from an\nimage gallery. However, existing methods can only handle text-line instances,\nleaving the problem of searching for partial patches within these text-line\ninstances unsolved due to a lack of patch annotations in the training data. To\naddress this issue, we propose a network that can simultaneously retrieve both\ntext-line instances and their partial patches. Our method embeds the two types\nof data (query text and scene text instances) into a shared feature space and\nmeasures their cross-modal similarities. To handle partial patches, our\nproposed approach adopts a Multiple Instance Learning (MIL) approach to learn\ntheir similarities with query text, without requiring extra annotations.\nHowever, constructing bags, which is a standard step of conventional MIL\napproaches, can introduce numerous noisy samples for training, and lower\ninference speed. To address this issue, we propose a Ranking MIL (RankMIL)\napproach to adaptively filter those noisy samples. Additionally, we present a\nDynamic Partial Match Algorithm (DPMA) that can directly search for the target\npartial patch from a text-line instance during the inference stage, without\nrequiring bags. 
This greatly improves the search efficiency and the performance\nof retrieving partial patches. The source code and dataset are available at\nhttps://github.com/lanfeng4659/PSTR.\n","authors":["Hao Wang","Minghui Liao","Zhouyi Xie","Wenyu Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2411.10261v1.pdf","comment":"Accepted on TPAMI"},{"id":"http://arxiv.org/abs/2411.10257v1","updated":"2024-11-15T15:04:04Z","published":"2024-11-15T15:04:04Z","title":"The Unreasonable Effectiveness of Guidance for Diffusion Models","summary":" Guidance is an error-correcting technique used to improve the perceptual\nquality of images generated by diffusion models. Typically, the correction is\nachieved by linear extrapolation, using an auxiliary diffusion model that has\nlower performance than the primary model. Using a 2D toy example, we show that\nit is highly beneficial when the auxiliary model exhibits similar errors as the\nprimary one but stronger. We verify this finding in higher dimensions, where we\nshow that competitive generative performance to state-of-the-art guidance\nmethods can be achieved when the auxiliary model differs from the primary one\nonly by having stronger weight regularization. As an independent contribution,\nwe investigate whether upweighting long-range spatial dependencies improves\nvisual fidelity. The result is a novel guidance method, which we call sliding\nwindow guidance (SWG), that guides the primary model with itself by\nconstraining its receptive field. Intriguingly, SWG aligns better with human\npreferences than state-of-the-art guidance methods while requiring neither\ntraining, architectural modifications, nor class conditioning. The code will be\nreleased.\n","authors":["Tim Kaiser","Nikolas Adaloglou","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2411.10257v1.pdf","comment":"Preprint. 
19 pages, 14 figures in total, including references and\n appendix"},{"id":"http://arxiv.org/abs/2411.10252v1","updated":"2024-11-15T15:02:06Z","published":"2024-11-15T15:02:06Z","title":"Visual-Linguistic Agent: Towards Collaborative Contextual Object\n Reasoning","summary":" Multimodal Large Language Models (MLLMs) excel at descriptive tasks within\nimages but often struggle with precise object localization, a critical element\nfor reliable visual interpretation. In contrast, traditional object detection\nmodels provide high localization accuracy but frequently generate detections\nlacking contextual coherence due to limited modeling of inter-object\nrelationships. To address this fundamental limitation, we introduce the\n\\textbf{Visual-Linguistic Agent (VLA), a collaborative framework that combines\nthe relational reasoning strengths of MLLMs with the precise localization\ncapabilities of traditional object detectors. In the VLA paradigm, the MLLM\nserves as a central Linguistic Agent, working collaboratively with specialized\nVision Agents for object detection and classification. The Linguistic Agent\nevaluates and refines detections by reasoning over spatial and contextual\nrelationships among objects, while the classification Vision Agent offers\ncorrective feedback to improve classification accuracy. This collaborative\napproach enables VLA to significantly enhance both spatial reasoning and object\nlocalization, addressing key challenges in multimodal understanding. 
Extensive\nevaluations on the COCO dataset demonstrate substantial performance\nimprovements across multiple detection models, highlighting VLA's potential to\nset a new benchmark in accurate and contextually coherent object detection.\n","authors":["Jingru Yang","Huan Yu","Yang Jingxin","Chentianye Xu","Yin Biao","Yu Sun","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2411.10252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10251v1","updated":"2024-11-15T15:01:00Z","published":"2024-11-15T15:01:00Z","title":"Morpho-Aware Global Attention for Image Matting","summary":" Vision Transformers (ViTs) and Convolutional Neural Networks (CNNs) face\ninherent challenges in image matting, particularly in preserving fine\nstructural details. ViTs, with their global receptive field enabled by the\nself-attention mechanism, often lose local details such as hair strands.\nConversely, CNNs, constrained by their local receptive field, rely on deeper\nlayers to approximate global context but struggle to retain fine structures at\ngreater depths.\n To overcome these limitations, we propose a novel Morpho-Aware Global\nAttention (MAGA) mechanism, designed to effectively capture the morphology of\nfine structures. MAGA employs Tetris-like convolutional patterns to align the\nlocal shapes of fine structures, ensuring optimal local correspondence while\nmaintaining sensitivity to morphological details. The extracted local\nmorphology information is used as query embeddings, which are projected onto\nglobal key embeddings to emphasize local details in a broader context.\nSubsequently, by projecting onto value embeddings, MAGA seamlessly integrates\nthese emphasized morphological details into a unified global structure.\n This approach enables MAGA to simultaneously focus on local morphology and\nunify these details into a coherent whole, effectively preserving fine\nstructures. 
Extensive experiments show that our MAGA-based ViT achieves\nsignificant performance gains, outperforming state-of-the-art methods across\ntwo benchmarks with average improvements of 4.3% in SAD and 39.5% in MSE.\n","authors":["Jingru Yang","Chengzhi Cao","Chentianye Xu","Zhongwei Xie","Kaixiang Huang","Yang Zhou","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2411.10251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11802v3","updated":"2024-11-15T14:54:58Z","published":"2024-07-16T14:53:35Z","title":"DCD: Discriminative and Consistent Representation Distillation","summary":" Knowledge Distillation (KD) aims to transfer knowledge from a large teacher\nmodel to a smaller student model. While contrastive learning has shown promise\nin self-supervised learning by creating discriminative representations, its\napplication in knowledge distillation remains limited and focuses primarily on\ndiscrimination, neglecting the structural relationships captured by the teacher\nmodel. To address this limitation, we propose Discriminative and Consistent\nDistillation (DCD), which employs a contrastive loss along with a consistency\nregularization to minimize the discrepancy between the distributions of teacher\nand student representations. Our method introduces learnable temperature and\nbias parameters that adapt during training to balance these complementary\nobjectives, replacing the fixed hyperparameters commonly used in contrastive\nlearning approaches. Through extensive experiments on CIFAR-100 and ImageNet\nILSVRC-2012, we demonstrate that DCD achieves state-of-the-art performance,\nwith the student model sometimes surpassing the teacher's accuracy.\nFurthermore, we show that DCD's learned representations exhibit superior\ncross-dataset generalization when transferred to Tiny ImageNet and STL-10. 
Code\nis available at https://github.com/giakoumoglou/distillers.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2407.11802v3.pdf","comment":"11 pages, 3 figures, 6 tables. The paper's title has been changed,\n again"},{"id":"http://arxiv.org/abs/2411.10237v1","updated":"2024-11-15T14:51:30Z","published":"2024-11-15T14:51:30Z","title":"ScribbleVS: Scribble-Supervised Medical Image Segmentation via Dynamic\n Competitive Pseudo Label Selection","summary":" In clinical medicine, precise image segmentation can provide substantial\nsupport to clinicians. However, achieving such precision often requires a large\namount of finely annotated data, which can be costly. Scribble annotation\npresents a more efficient alternative, boosting labeling efficiency. However,\nutilizing such minimal supervision for medical image segmentation training,\nespecially with scribble annotations, poses significant challenges. To address\nthese challenges, we introduce ScribbleVS, a novel framework that leverages\nscribble annotations. We introduce a Regional Pseudo Labels Diffusion Module to\nexpand the scope of supervision and reduce the impact of noise present in\npseudo labels. Additionally, we propose a Dynamic Competitive Selection module\nfor enhanced refinement in selecting pseudo labels. 
Experiments conducted on\nthe ACDC and MSCMRseg datasets have demonstrated promising results, achieving\nperformance levels that even exceed those of fully supervised methodologies.\nThe codes of this study are available at\nhttps://github.com/ortonwang/ScribbleVS.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.10237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10232v1","updated":"2024-11-15T14:45:58Z","published":"2024-11-15T14:45:58Z","title":"ColorEdit: Training-free Image-Guided Color editing with diffusion model","summary":" Text-to-image (T2I) diffusion models, with their impressive generative\ncapabilities, have been adopted for image editing tasks, demonstrating\nremarkable efficacy. However, due to attention leakage and collision between\nthe cross-attention map of the object and the new color attribute from the text\nprompt, text-guided image editing methods may fail to change the color of an\nobject, resulting in a misalignment between the resulting image and the text\nprompt. In this paper, we conduct an in-depth analysis on the process of\ntext-guided image synthesizing and what semantic information different\ncross-attention blocks have learned. We observe that the visual representation\nof an object is determined in the up-block of the diffusion model in the early\nstage of the denoising process, and color adjustment can be achieved through\nvalue matrices alignment in the cross-attention layer. Based on our findings,\nwe propose a straightforward, yet stable, and effective image-guided method to\nmodify the color of an object without requiring any additional fine-tuning or\ntraining. Lastly, we present a benchmark dataset called COLORBENCH, the first\nbenchmark to evaluate the performance of color change methods. 
Extensive\nexperiments validate the effectiveness of our method in object-level color\nediting and surpass the performance of popular text-guided image editing\napproaches in both synthesized and real images.\n","authors":["Xingxi Yin","Zhi Li","Jingfeng Zhang","Chenglin Li","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10231v1","updated":"2024-11-15T14:43:58Z","published":"2024-11-15T14:43:58Z","title":"A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image\n Super-Resolution with Transformers and TaylorShift","summary":" Transformer-based Super-Resolution (SR) models have recently advanced image\nreconstruction quality, yet challenges remain due to computational complexity\nand an over-reliance on large patch sizes, which constrain fine-grained detail\nenhancement. In this work, we propose TaylorIR to address these limitations by\nutilizing a patch size of 1x1, enabling pixel-level processing in any\ntransformer-based SR model. 
To address the significant computational demands\nunder the traditional self-attention mechanism, we employ the TaylorShift\nattention mechanism, a memory-efficient alternative based on Taylor series\nexpansion, achieving full token-to-token interactions with linear complexity.\nExperimental results demonstrate that our approach achieves new\nstate-of-the-art SR performance while reducing memory consumption by up to 60%\ncompared to traditional self-attention-based transformers.\n","authors":["Sanath Budakegowdanadoddi Nagaraju","Brian Bernhard Moser","Tobias Christian Nauen","Stanislav Frolov","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.10231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10224v1","updated":"2024-11-15T14:38:13Z","published":"2024-11-15T14:38:13Z","title":"MCL: Multi-view Enhanced Contrastive Learning for Chest X-ray Report\n Generation","summary":" Radiology reports are crucial for planning treatment strategies and enhancing\ndoctor-patient communication, yet manually writing these reports is burdensome\nfor radiologists. While automatic report generation offers a solution, existing\nmethods often rely on single-view radiographs, limiting diagnostic accuracy. To\naddress this problem, we propose MCL, a Multi-view enhanced Contrastive\nLearning method for chest X-ray report generation. Specifically, we first\nintroduce multi-view enhanced contrastive learning for visual representation by\nmaximizing agreements between multi-view radiographs and their corresponding\nreport. Subsequently, to fully exploit patient-specific indications (e.g.,\npatient's symptoms) for report generation, we add a transitional ``bridge\" for\nmissing indications to reduce embedding space discrepancies caused by their\npresence or absence. Additionally, we construct Multi-view CXR and Two-view CXR\ndatasets from public sources to support research on multi-view report\ngeneration. 
Our proposed MCL surpasses recent state-of-the-art methods across\nmultiple datasets, achieving a 5.0% F1 RadGraph improvement on MIMIC-CXR, a\n7.3% BLEU-1 improvement on MIMIC-ABN, a 3.1% BLEU-4 improvement on Multi-view\nCXR, and an 8.2% F1 CheXbert improvement on Two-view CXR.\n","authors":["Kang Liu","Zhuoqi Ma","Kun Xie","Zhicheng Jiao","Qiguang Miao"],"pdf_url":"https://arxiv.org/pdf/2411.10224v1.pdf","comment":"https://github.com/mk-runner/MCL"},{"id":"http://arxiv.org/abs/2411.10203v1","updated":"2024-11-15T14:01:02Z","published":"2024-11-15T14:01:02Z","title":"Learning Generalizable 3D Manipulation With 10 Demonstrations","summary":" Learning robust and generalizable manipulation skills from demonstrations\nremains a key challenge in robotics, with broad applications in industrial\nautomation and service robotics. While recent imitation learning methods have\nachieved impressive results, they often require large amounts of demonstration\ndata and struggle to generalize across different spatial variants. In this\nwork, we present a novel framework that learns manipulation skills from as few\nas 10 demonstrations, yet still generalizes to spatial variants such as\ndifferent initial object positions and camera viewpoints. Our framework\nconsists of two key modules: Semantic Guided Perception (SGP), which constructs\ntask-focused, spatially aware 3D point cloud representations from RGB-D inputs;\nand Spatial Generalized Decision (SGD), an efficient diffusion-based\ndecision-making module that generates actions via denoising. To effectively\nlearn generalization ability from limited data, we introduce a critical\nspatially equivariant training strategy that captures the spatial knowledge\nembedded in expert demonstrations. We validate our framework through extensive\nexperiments on both simulation benchmarks and real-world robotic systems. 
Our\nmethod demonstrates a 60 percent improvement in success rates over\nstate-of-the-art approaches on a series of challenging tasks, even with\nsubstantial variations in object poses and camera viewpoints. This work shows\nsignificant potential for advancing efficient, generalizable manipulation skill\nlearning in real-world applications.\n","authors":["Yu Ren","Yang Cong","Ronghan Chen","Jiahao Long"],"pdf_url":"https://arxiv.org/pdf/2411.10203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10200v1","updated":"2024-11-15T13:58:56Z","published":"2024-11-15T13:58:56Z","title":"Block based Adaptive Compressive Sensing with Sampling Rate Control","summary":" Compressive sensing (CS), acquiring and reconstructing signals below the\nNyquist rate, has great potential in image and video acquisition to exploit\ndata redundancy and greatly reduce the amount of sampled data. To further\nreduce the sampled data while keeping the video quality, this paper explores\nthe temporal redundancy in video CS and proposes a block based adaptive\ncompressive sensing framework with a sampling rate (SR) control strategy. To\navoid redundant compression of non-moving regions, we first incorporate moving\nblock detection between consecutive frames, and only transmit the measurements\nof moving blocks. The non-moving regions are reconstructed from the previous\nframe. 
In addition, we propose a block storage system and a dynamic threshold\nto achieve adaptive SR allocation to each frame based on the area of moving\nregions and target SR for controlling the average SR within the target SR.\nFinally, to reduce blocking artifacts and improve reconstruction quality, we\nadopt a cooperative reconstruction of the moving and non-moving blocks by\nreferring to the measurements of the non-moving blocks from the previous frame.\nExtensive experiments have demonstrated that this work is able to control SR\nand obtain better performance than existing works.\n","authors":["Kosuke Iwama","Ryugo Morita","Jinjia Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.10200v1.pdf","comment":"Accepted to MMAsia2024"},{"id":"http://arxiv.org/abs/2410.08119v2","updated":"2024-11-15T13:57:06Z","published":"2024-10-10T17:02:48Z","title":"Q-VLM: Post-training Quantization for Large Vision-Language Models","summary":" In this paper, we propose a post-training quantization framework of large\nvision-language models (LVLMs) for efficient multi-modal inference.\nConventional quantization methods sequentially search the layer-wise rounding\nfunctions by minimizing activation discretization errors, which fails to\nacquire optimal quantization strategy without considering cross-layer\ndependency. On the contrary, we mine the cross-layer dependency that\nsignificantly influences discretization errors of the entire vision-language\nmodel, and embed this dependency into optimal quantization strategy searching\nwith low search cost. Specifically, we observe the strong correlation between\nthe activation entropy and the cross-layer dependency concerning output\ndiscretization errors. Therefore, we employ the entropy as the proxy to\npartition blocks optimally, which aims to achieve satisfying trade-offs between\ndiscretization errors and the search cost. 
Moreover, we optimize the visual\nencoder to disentangle the cross-layer dependency for fine-grained\ndecomposition of search space, so that the search cost is further reduced\nwithout harming the quantization accuracy. Experimental results demonstrate\nthat our method compresses the memory by 2.78x and increase generate speed by\n1.44x about 13B LLaVA model without performance degradation on diverse\nmulti-modal reasoning tasks. Code is available at\nhttps://github.com/ChangyuanWang17/QVLM.\n","authors":["Changyuan Wang","Ziwei Wang","Xiuwei Xu","Yansong Tang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2410.08119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10198v1","updated":"2024-11-15T13:53:19Z","published":"2024-11-15T13:53:19Z","title":"STLight: a Fully Convolutional Approach for Efficient Predictive\n Learning by Spatio-Temporal joint Processing","summary":" Spatio-Temporal predictive Learning is a self-supervised learning paradigm\nthat enables models to identify spatial and temporal patterns by predicting\nfuture frames based on past frames. Traditional methods, which use recurrent\nneural networks to capture temporal patterns, have proven their effectiveness\nbut come with high system complexity and computational demand. Convolutions\ncould offer a more efficient alternative but are limited by their\ncharacteristic of treating all previous frames equally, resulting in poor\ntemporal characterization, and by their local receptive field, limiting the\ncapacity to capture distant correlations among frames. In this paper, we\npropose STLight, a novel method for spatio-temporal learning that relies solely\non channel-wise and depth-wise convolutions as learnable layers. STLight\novercomes the limitations of traditional convolutional approaches by\nrearranging spatial and temporal dimensions together, using a single\nconvolution to mix both types of features into a comprehensive spatio-temporal\npatch representation. 
This representation is then processed in a purely\nconvolutional framework, capable of focusing simultaneously on the interaction\namong near and distant patches, and subsequently allowing for efficient\nreconstruction of the predicted frames. Our architecture achieves\nstate-of-the-art performance on STL benchmarks across different datasets and\nsettings, while significantly improving computational efficiency in terms of\nparameters and computational FLOPs. The code is publicly available\n","authors":["Andrea Alfarano","Alberto Alfarano","Linda Friso","Andrea Bacciu","Irene Amerini","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2411.10198v1.pdf","comment":"Accepted at WACV 2025 conference"},{"id":"http://arxiv.org/abs/2411.10193v1","updated":"2024-11-15T13:47:33Z","published":"2024-11-15T13:47:33Z","title":"DiMoDif: Discourse Modality-information Differentiation for Audio-visual\n Deepfake Detection and Localization","summary":" Deepfake technology has rapidly advanced, posing significant threats to\ninformation integrity and societal trust. While significant progress has been\nmade in detecting deepfakes, the simultaneous manipulation of audio and visual\nmodalities, sometimes at small parts but still altering the meaning, presents a\nmore challenging detection scenario. We present a novel audio-visual deepfake\ndetection framework that leverages the inter-modality differences in machine\nperception of speech, based on the assumption that in real samples - in\ncontrast to deepfakes - visual and audio signals coincide in terms of\ninformation. Our framework leverages features from deep networks that\nspecialize in video and audio speech recognition to spot frame-level\ncross-modal incongruities, and in that way to temporally localize the deepfake\nforgery. 
To this end, DiMoDif employs a Transformer encoder-based architecture\nwith a feature pyramid scheme and local attention, and optimizes the detection\nmodel through a composite loss function accounting for frame-level detections\nand fake intervals localization. DiMoDif outperforms the state-of-the-art on\nthe Temporal Forgery Localization task by +47.88% AP@0.75 on AV-Deepfake1M, and\nperforms on-par on LAV-DF. On the Deepfake Detection task, it outperforms the\nstate-of-the-art by +30.5% AUC on AV-Deepfake1M, +2.8% AUC on FakeAVCeleb, and\nperforms on-par on LAV-DF. Code available at\nhttps://github.com/mever-team/dimodif.\n","authors":["Christos Koutlis","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2411.10193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10189v1","updated":"2024-11-15T13:42:57Z","published":"2024-11-15T13:42:57Z","title":"NeISF++: Neural Incident Stokes Field for Polarized Inverse Rendering of\n Conductors and Dielectrics","summary":" Recent inverse rendering methods have greatly improved shape, material, and\nillumination reconstruction by utilizing polarization cues. However, existing\nmethods only support dielectrics, ignoring conductors that are found everywhere\nin life. Since conductors and dielectrics have different reflection properties,\nusing previous conductor methods will lead to obvious errors. In addition,\nconductors are glossy, which may cause strong specular reflection and is hard\nto reconstruct. To solve the above issues, we propose NeISF++, an inverse\nrendering pipeline that supports conductors and dielectrics. The key ingredient\nfor our proposal is a general pBRDF that describes both conductors and\ndielectrics. As for the strong specular reflection problem, we propose a novel\ngeometry initialization method using DoLP images. 
This physical cue is\ninvariant to intensities and thus robust to strong specular reflections.\nExperimental results on our synthetic and real datasets show that our method\nsurpasses the existing polarized inverse rendering methods for geometry and\nmaterial decomposition as well as downstream tasks like relighting.\n","authors":["Chenhao Li","Taishi Ono","Takeshi Uemori","Sho Nitta","Hajime Mihara","Alexander Gatto","Hajime Nagahara","Yusuke Moriuchi"],"pdf_url":"https://arxiv.org/pdf/2411.10189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10187v1","updated":"2024-11-15T13:35:58Z","published":"2024-11-15T13:35:58Z","title":"Try-On-Adapter: A Simple and Flexible Try-On Paradigm","summary":" Image-based virtual try-on, widely used in online shopping, aims to generate\nimages of a naturally dressed person conditioned on certain garments, providing\nsignificant research and commercial potential. A key challenge of try-on is to\ngenerate realistic images of the model wearing the garments while preserving\nthe details of the garments. Previous methods focus on masking certain parts of\nthe original model's standing image, and then inpainting on masked areas to\ngenerate realistic images of the model wearing corresponding reference\ngarments, which treat the try-on task as an inpainting task. However, such\nimplements require the user to provide a complete, high-quality standing image,\nwhich is user-unfriendly in practical applications. In this paper, we propose\nTry-On-Adapter (TOA), an outpainting paradigm that differs from the existing\ninpainting paradigm. Our TOA can preserve the given face and garment, naturally\nimagine the rest parts of the image, and provide flexible control ability with\nvarious conditions, e.g., garment properties and human pose. In the\nexperiments, TOA shows excellent performance on the virtual try-on task even\ngiven relatively low-quality face and garment images in qualitative\ncomparisons. 
Additionally, TOA achieves the state-of-the-art performance of FID\nscores 5.56 and 7.23 for paired and unpaired on the VITON-HD dataset in\nquantitative comparisons.\n","authors":["Hanzhong Guo","Jianfeng Zhang","Cheng Zou","Jun Li","Meng Wang","Ruxue Wen","Pingzhong Tang","Jingdong Chen","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10187v1.pdf","comment":"Image virtual try-on, 7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.10185v1","updated":"2024-11-15T13:34:46Z","published":"2024-11-15T13:34:46Z","title":"Efficient Progressive Image Compression with Variance-aware Masking","summary":" Learned progressive image compression is gaining momentum as it allows\nimproved image reconstruction as more bits are decoded at the receiver. We\npropose a progressive image compression method in which an image is first\nrepresented as a pair of base-quality and top-quality latent representations.\nNext, a residual latent representation is encoded as the element-wise\ndifference between the top and base representations. Our scheme enables\nprogressive image compression with element-wise granularity by introducing a\nmasking system that ranks each element of the residual latent representation\nfrom most to least important, dividing it into complementary components, which\ncan be transmitted separately to the decoder in order to obtain different\nreconstruction quality. The masking system does not add further parameters nor\ncomplexity. At the receiver, any elements of the top latent representation\nexcluded from the transmitted components can be independently replaced with the\nmean predicted by the hyperprior architecture, ensuring reliable\nreconstructions at any intermediate quality level. We also introduced Rate\nEnhancement Modules (REMs), which refine the estimation of entropy parameters\nusing already decoded components. 
We obtain results competitive with\nstate-of-the-art competitors, while significantly reducing computational\ncomplexity, decoding time, and number of parameters.\n","authors":["Alberto Presta","Enzo Tartaglione","Attilio Fiandrotti","Marco Grangetto","Pamela Cosman"],"pdf_url":"https://arxiv.org/pdf/2411.10185v1.pdf","comment":"10 pages. Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.10183v1","updated":"2024-11-15T13:32:23Z","published":"2024-11-15T13:32:23Z","title":"Visual question answering based evaluation metrics for text-to-image\n generation","summary":" Text-to-image generation and text-guided image manipulation have received\nconsiderable attention in the field of image generation tasks. However, the\nmainstream evaluation methods for these tasks have difficulty in evaluating\nwhether all the information from the input text is accurately reflected in the\ngenerated images, and they mainly focus on evaluating the overall alignment\nbetween the input text and the generated images. This paper proposes new\nevaluation metrics that assess the alignment between input text and generated\nimages for every individual object. Firstly, according to the input text,\nchatGPT is utilized to produce questions for the generated images. After that,\nwe use Visual Question Answering(VQA) to measure the relevance of the generated\nimages to the input text, which allows for a more detailed evaluation of the\nalignment compared to existing methods. In addition, we use Non-Reference Image\nQuality Assessment(NR-IQA) to evaluate not only the text-image alignment but\nalso the quality of the generated images. 
Experimental results show that our\nproposed evaluation approach is the superior metric that can simultaneously\nassess finer text-image alignment and image quality while allowing for the\nadjustment of these ratios.\n","authors":["Mizuki Miyamoto","Ryugo Morita","Jinjia Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.10183v1.pdf","comment":"Accepted to ISCAS2024"},{"id":"http://arxiv.org/abs/2411.10180v1","updated":"2024-11-15T13:29:44Z","published":"2024-11-15T13:29:44Z","title":"CART: Compositional Auto-Regressive Transformer for Image Generation","summary":" In recent years, image synthesis has achieved remarkable advancements,\nenabling diverse applications in content creation, virtual reality, and beyond.\nWe introduce a novel approach to image generation using Auto-Regressive (AR)\nmodeling, which leverages a next-detail prediction strategy for enhanced\nfidelity and scalability. While AR models have achieved transformative success\nin language modeling, replicating this success in vision tasks has presented\nunique challenges due to the inherent spatial dependencies in images. Our\nproposed method addresses these challenges by iteratively adding finer details\nto an image compositionally, constructing it as a hierarchical combination of\nbase and detail image factors. This strategy is shown to be more effective than\nthe conventional next-token prediction and even surpasses the state-of-the-art\nnext-scale prediction approaches. 
A key advantage of this method is its\nscalability to higher resolutions without requiring full model retraining,\nmaking it a versatile solution for high-resolution image generation.\n","authors":["Siddharth Roheda"],"pdf_url":"https://arxiv.org/pdf/2411.10180v1.pdf","comment":"under review at CVPR 2025"},{"id":"http://arxiv.org/abs/2411.10175v1","updated":"2024-11-15T13:21:26Z","published":"2024-11-15T13:21:26Z","title":"The Surprising Ineffectiveness of Pre-Trained Visual Representations for\n Model-Based Reinforcement Learning","summary":" Visual Reinforcement Learning (RL) methods often require extensive amounts of\ndata. As opposed to model-free RL, model-based RL (MBRL) offers a potential\nsolution with efficient data utilization through planning. Additionally, RL\nlacks generalization capabilities for real-world tasks. Prior work has shown\nthat incorporating pre-trained visual representations (PVRs) enhances sample\nefficiency and generalization. While PVRs have been extensively studied in the\ncontext of model-free RL, their potential in MBRL remains largely unexplored.\nIn this paper, we benchmark a set of PVRs on challenging control tasks in a\nmodel-based RL setting. We investigate the data efficiency, generalization\ncapabilities, and the impact of different properties of PVRs on the performance\nof model-based agents. Our results, perhaps surprisingly, reveal that for MBRL\ncurrent PVRs are not more sample efficient than learning representations from\nscratch, and that they do not generalize better to out-of-distribution (OOD)\nsettings. To explain this, we analyze the quality of the trained dynamics\nmodel. 
Furthermore, we show that data diversity and network architecture are\nthe most important contributors to OOD generalization performance.\n","authors":["Moritz Schneider","Robert Krug","Narunas Vaskevicius","Luigi Palmieri","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2411.10175v1.pdf","comment":"Published at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/"},{"id":"http://arxiv.org/abs/2411.10161v1","updated":"2024-11-15T13:07:22Z","published":"2024-11-15T13:07:22Z","title":"SEAGULL: No-reference Image Quality Assessment for Regions of Interest\n via Vision-Language Instruction Tuning","summary":" Existing Image Quality Assessment (IQA) methods achieve remarkable success in\nanalyzing quality for overall image, but few works explore quality analysis for\nRegions of Interest (ROIs). The quality analysis of ROIs can provide\nfine-grained guidance for image quality improvement and is crucial for\nscenarios focusing on region-level quality. This paper proposes a novel\nnetwork, SEAGULL, which can SEe and Assess ROIs quality with GUidance from a\nLarge vision-Language model. SEAGULL incorporates a vision-language model\n(VLM), masks generated by Segment Anything Model (SAM) to specify ROIs, and a\nmeticulously designed Mask-based Feature Extractor (MFE) to extract global and\nlocal tokens for specified ROIs, enabling accurate fine-grained IQA for ROIs.\nMoreover, this paper constructs two ROI-based IQA datasets, SEAGULL-100w and\nSEAGULL-3k, for training and evaluating ROI-based IQA. SEAGULL-100w comprises\nabout 100w synthetic distortion images with 33 million ROIs for pre-training to\nimprove the model's ability of regional quality perception, and SEAGULL-3k\ncontains about 3k authentic distortion ROIs to enhance the model's ability to\nperceive real world distortions. 
After pre-training on SEAGULL-100w and\nfine-tuning on SEAGULL-3k, SEAGULL shows remarkable performance on fine-grained\nROI quality assessment. Code and datasets are publicly available at the\nhttps://github.com/chencn2020/Seagull.\n","authors":["Zewen Chen","Juan Wang","Wen Wang","Sunhan Xu","Hang Xiong","Yun Zeng","Jian Guo","Shuxun Wang","Chunfeng Yuan","Bing Li","Weiming Hu"],"pdf_url":"https://arxiv.org/pdf/2411.10161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10150v1","updated":"2024-11-15T12:43:38Z","published":"2024-11-15T12:43:38Z","title":"Outliers resistant image classification by anomaly detection","summary":" Various technologies, including computer vision models, are employed for the\nautomatic monitoring of manual assembly processes in production. These models\ndetect and classify events such as the presence of components in an assembly\narea or the connection of components. A major challenge with detection and\nclassification algorithms is their susceptibility to variations in\nenvironmental conditions and unpredictable behavior when processing objects\nthat are not included in the training dataset. As it is impractical to add all\npossible subjects in the training sample, an alternative solution is necessary.\nThis study proposes a model that simultaneously performs classification and\nanomaly detection, employing metric learning to generate vector representations\nof images in a multidimensional space, followed by classification using\ncross-entropy. For experimentation, a dataset of over 327,000 images was\nprepared. 
Experiments were conducted with various computer vision model\narchitectures, and the outcomes of each approach were compared.\n","authors":["Anton Sergeev","Victor Minchenkov","Aleksei Soldatov","Vasiliy Kakurin","Yaroslav Mazikov"],"pdf_url":"https://arxiv.org/pdf/2411.10150v1.pdf","comment":"19 pages, in Russian"},{"id":"http://arxiv.org/abs/2411.09462v2","updated":"2024-11-15T12:41:48Z","published":"2024-11-14T14:12:16Z","title":"SINETRA: a Versatile Framework for Evaluating Single Neuron Tracking in\n Behaving Animals","summary":" Accurately tracking neuronal activity in behaving animals presents\nsignificant challenges due to complex motions and background noise. The lack of\nannotated datasets limits the evaluation and improvement of such tracking\nalgorithms. To address this, we developed SINETRA, a versatile simulator that\ngenerates synthetic tracking data for particles on a deformable background,\nclosely mimicking live animal recordings. This simulator produces annotated 2D\nand 3D videos that reflect the intricate movements seen in behaving animals\nlike Hydra Vulgaris. We evaluated four state-of-the-art tracking algorithms\nhighlighting the current limitations of these methods in challenging scenarios\nand paving the way for improved cell tracking techniques in dynamic biological\nsystems.\n","authors":["Raphael Reme","Alasdair Newson","Elsa Angelini","Jean-Christophe Olivo-Marin","Thibault Lagache"],"pdf_url":"https://arxiv.org/pdf/2411.09462v2.pdf","comment":"5 pages, 3 figures, submitted at 2025 IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2411.10141v1","updated":"2024-11-15T12:30:24Z","published":"2024-11-15T12:30:24Z","title":"Matrix-Valued LogSumExp Approximation for Colour Morphology","summary":" Mathematical morphology is a part of image processing that uses a window that\nmoves across the image to change certain pixels according to certain\noperations. 
The concepts of supremum and infimum play a crucial role here, but\nit proves challenging to define them generally for higher-dimensional data,\nsuch as colour representations. Numerous approaches have therefore been taken\nto solve this problem with certain compromises. In this paper we will analyse\nthe construction of a new approach, which we have already presented\nexperimentally in paper [Kahra, M., Breu{\\ss}, M., Kleefeld, A., Welk, M., DGMM\n2024, pp. 325-337]. This is based on a method by Burgeth and Kleefeld [Burgeth,\nB., Kleefeld, A., ISMM 2013, pp. 243-254], who regard the colours as symmetric\n$2\\times2$ matrices and compare them by means of the Loewner order in a bi-cone\nthrough different suprema. However, we will replace the supremum with the\nLogExp approximation for the maximum instead. This allows us to transfer the\nassociativity of the dilation from the one-dimensional case to the\nhigher-dimensional case. In addition, we will investigate the minimality\nproperty and specify a relaxation to ensure that our approach is continuously\ndependent on the input data.\n","authors":["Marvin Kahra","Michael Breuß","Andreas Kleefeld","Martin Welk"],"pdf_url":"https://arxiv.org/pdf/2411.10141v1.pdf","comment":"42 pages, 10 figures, to be submitted in JMIV"},{"id":"http://arxiv.org/abs/2411.09145v2","updated":"2024-11-15T12:27:39Z","published":"2024-11-14T02:57:11Z","title":"UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for\n Egocentric Hand Object Interaction Videos","summary":" Egocentric Hand Object Interaction (HOI) videos provide valuable insights\ninto human interactions with the physical world, attracting growing interest\nfrom the computer vision and robotics communities. A key task in fully\nunderstanding the geometry and dynamics of HOI scenes is dense pointclouds\nsequence reconstruction. However, the inherent motion of both hands and the\ncamera makes this challenging. 
Current methods often rely on time-consuming\ntest-time optimization, making them impractical for reconstructing\ninternet-scale videos. To address this, we introduce UniHOI, a model that\nunifies the estimation of all variables necessary for dense 4D reconstruction,\nincluding camera intrinsic, camera poses, and video depth, for egocentric HOI\nscene in a fast feed-forward manner. We end-to-end optimize all these variables\nto improve their consistency in 3D space. Furthermore, our model could be\ntrained solely on large-scale monocular video dataset, overcoming the\nlimitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain\nand zero-shot generalization setting, surpassing all baselines in pointclouds\nsequence reconstruction and long-term 3D scene flow recovery. UniHOI is the\nfirst approach to offer fast, dense, and generalizable monocular egocentric HOI\nscene reconstruction in the presence of motion. Code and trained model will be\nreleased in the future.\n","authors":["Chengbo Yuan","Geng Chen","Li Yi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.09145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12677v3","updated":"2024-11-15T12:26:24Z","published":"2024-08-22T18:32:50Z","title":"GSFusion: Online RGB-D Mapping Where Gaussian Splatting Meets TSDF\n Fusion","summary":" Traditional volumetric fusion algorithms preserve the spatial structure of 3D\nscenes, which is beneficial for many tasks in computer vision and robotics.\nHowever, they often lack realism in terms of visualization. Emerging 3D\nGaussian splatting bridges this gap, but existing Gaussian-based reconstruction\nmethods often suffer from artifacts and inconsistencies with the underlying 3D\nstructure, and struggle with real-time optimization, unable to provide users\nwith immediate feedback in high quality. One of the bottlenecks arises from the\nmassive amount of Gaussian parameters that need to be updated during\noptimization. 
Instead of using 3D Gaussian as a standalone map representation,\nwe incorporate it into a volumetric mapping system to take advantage of\ngeometric information and propose to use a quadtree data structure on images to\ndrastically reduce the number of splats initialized. In this way, we\nsimultaneously generate a compact 3D Gaussian map with fewer artifacts and a\nvolumetric map on the fly. Our method, GSFusion, significantly enhances\ncomputational efficiency without sacrificing rendering quality, as demonstrated\non both synthetic and real datasets. Code will be available at\nhttps://github.com/goldoak/GSFusion.\n","authors":["Jiaxin Wei","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2408.12677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10136v1","updated":"2024-11-15T12:20:52Z","published":"2024-11-15T12:20:52Z","title":"CoSAM: Self-Correcting SAM for Domain Generalization in 2D Medical Image\n Segmentation","summary":" Medical images often exhibit distribution shifts due to variations in imaging\nprotocols and scanners across different medical centers. Domain Generalization\n(DG) methods aim to train models on source domains that can generalize to\nunseen target domains. Recently, the segment anything model (SAM) has\ndemonstrated strong generalization capabilities due to its prompt-based design,\nand has gained significant attention in image segmentation tasks. Existing\nSAM-based approaches attempt to address the need for manual prompts by\nintroducing prompt generators that automatically generate these prompts.\nHowever, we argue that auto-generated prompts may not be sufficiently accurate\nunder distribution shifts, potentially leading to incorrect predictions that\nstill require manual verification and correction by clinicians. To address this\nchallenge, we propose a method for 2D medical image segmentation called\nSelf-Correcting SAM (CoSAM). 
Our approach begins by generating coarse masks\nusing SAM in a prompt-free manner, providing prior prompts for the subsequent\nstages, and eliminating the need for prompt generators. To automatically refine\nthese coarse masks, we introduce a generalized error decoder that simulates the\ncorrection process typically performed by clinicians. Furthermore, we generate\ndiverse prompts as feedback based on the corrected masks, which are used to\niteratively refine the predictions within a self-correcting loop, enhancing the\ngeneralization performance of our model. Extensive experiments on two medical\nimage segmentation benchmarks across multiple scenarios demonstrate the\nsuperiority of CoSAM over state-of-the-art SAM-based methods.\n","authors":["Yihang Fu","Ziyang Chen","Yiwen Ye","Xingliang Lei","Zhisong Wang","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2411.10136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10133v1","updated":"2024-11-15T12:12:56Z","published":"2024-11-15T12:12:56Z","title":"Efficient Density Control for 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) excels in novel view synthesis, balancing\nadvanced rendering quality with real-time performance. However, in trained\nscenes, a large number of Gaussians with low opacity significantly increase\nrendering costs. This issue arises due to flaws in the split and clone\noperations during the densification process, which lead to extensive Gaussian\noverlap and subsequent opacity reduction. To enhance the efficiency of Gaussian\nutilization, we improve the adaptive density control of 3DGS. First, we\nintroduce a more efficient long-axis split operation to replace the original\nclone and split, which mitigates Gaussian overlap and improves densification\nefficiency.Second, we propose a simple adaptive pruning technique to reduce the\nnumber of low-opacity Gaussians. 
Finally, by dynamically lowering the splitting\nthreshold and applying importance weighting, the efficiency of Gaussian\nutilization is further improved.We evaluate our proposed method on various\nchallenging real-world datasets. Experimental results show that our Efficient\nDensity Control (EDC) can enhance both the rendering speed and quality.\n","authors":["Xiaobin Deng","Changyu Diao","Min Li","Ruohan Yu","Duanqing Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10130v1","updated":"2024-11-15T12:02:07Z","published":"2024-11-15T12:02:07Z","title":"Towards Multi-View Consistent Style Transfer with One-Step Diffusion via\n Vision Conditioning","summary":" The stylization of 3D scenes is an increasingly attractive topic in 3D\nvision. Although image style transfer has been extensively researched with\npromising results, directly applying 2D style transfer methods to 3D scenes\noften fails to preserve the structural and multi-view properties of 3D\nenvironments, resulting in unpleasant distortions in images from different\nviewpoints. To address these issues, we leverage the remarkable generative\nprior of diffusion-based models and propose a novel style transfer method,\nOSDiffST, based on a pre-trained one-step diffusion model (i.e., SD-Turbo) for\nrendering diverse styles in multi-view images of 3D scenes. To efficiently\nadapt the pre-trained model for multi-view style transfer on small datasets, we\nintroduce a vision condition module to extract style information from the\nreference style image to serve as conditional input for the diffusion model and\nemploy LoRA in diffusion model for adaptation. Additionally, we consider color\ndistribution alignment and structural similarity between the stylized and\ncontent images using two specific loss functions. As a result, our method\neffectively preserves the structural information and multi-view consistency in\nstylized images without any 3D information. 
Experiments show that our method\nsurpasses other promising style transfer methods in synthesizing various styles\nfor multi-view images of 3D scenes. Stylized images from different viewpoints\ngenerated by our method achieve superior visual quality, with better structural\nintegrity and less distortion. The source code is available at\nhttps://github.com/YushenZuo/OSDiffST.\n","authors":["Yushen Zuo","Jun Xiao","Kin-Chung Chan","Rongkang Dong","Cuixin Yang","Zongqi He","Hao Xie","Kin-Man Lam"],"pdf_url":"https://arxiv.org/pdf/2411.10130v1.pdf","comment":"Accepted by ECCV 2024 AI for Visual Arts Workshop and Challenges, 18\n pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.02331v2","updated":"2024-11-15T11:35:25Z","published":"2024-10-03T09:29:28Z","title":"Self-eXplainable AI for Medical Image Analysis: A Survey and New\n Outlooks","summary":" The increasing demand for transparent and reliable models, particularly in\nhigh-stakes decision-making areas such as medical image analysis, has led to\nthe emergence of eXplainable Artificial Intelligence (XAI). Post-hoc XAI\ntechniques, which aim to explain black-box models after training, have raised\nconcerns about their fidelity to model predictions. In contrast,\nSelf-eXplainable AI (S-XAI) offers a compelling alternative by incorporating\nexplainability directly into the training process of deep learning models. This\napproach allows models to generate inherent explanations that are closely\naligned with their internal decision-making processes, enhancing transparency\nand supporting the trustworthiness, robustness, and accountability of AI\nsystems in real-world medical applications. To facilitate the development of\nS-XAI methods for medical image analysis, this survey presents a comprehensive\nreview across various image modalities and clinical applications. 
It covers\nmore than 200 papers from three key perspectives: 1) input explainability\nthrough the integration of explainable feature engineering and knowledge graph,\n2) model explainability via attention-based learning, concept-based learning,\nand prototype-based learning, and 3) output explainability by providing textual\nand counterfactual explanations. This paper also outlines desired\ncharacteristics of explainability and evaluation methods for assessing\nexplanation quality, while discussing major challenges and future research\ndirections in developing S-XAI for medical image analysis.\n","authors":["Junlin Hou","Sicen Liu","Yequan Bie","Hongmei Wang","Andong Tan","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10100v1","updated":"2024-11-15T10:50:36Z","published":"2024-11-15T10:50:36Z","title":"Multi-Task Adversarial Variational Autoencoder for Estimating Biological\n Brain Age with Multimodal Neuroimaging","summary":" Despite advances in deep learning for estimating brain age from structural\nMRI data, incorporating functional MRI data is challenging due to its complex\nstructure and the noisy nature of functional connectivity measurements. To\naddress this, we present the Multitask Adversarial Variational Autoencoder, a\ncustom deep learning framework designed to improve brain age predictions\nthrough multimodal MRI data integration. This model separates latent variables\ninto generic and unique codes, isolating shared and modality-specific features.\nBy integrating multitask learning with sex classification as an additional\ntask, the model captures sex-specific aging patterns. Evaluated on the OpenBHB\ndataset, a large multisite brain MRI collection, the model achieves a mean\nabsolute error of 2.77 years, outperforming traditional methods. 
This success\npositions M-AVAE as a powerful tool for metaverse-based healthcare applications\nin brain age estimation.\n","authors":["Muhammad Usman","Azka Rehman","Abdullah Shahid","Abd Ur Rehman","Sung-Min Gho","Aleum Lee","Tariq M. Khan","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2411.10100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14022v4","updated":"2024-11-15T10:35:58Z","published":"2024-05-22T21:55:58Z","title":"I2I-Mamba: Multi-modal medical image synthesis via selective state space\n modeling","summary":" In recent years, deep learning models comprising transformer components have\npushed the performance envelope in medical image synthesis tasks. Contrary to\nconvolutional neural networks (CNNs) that use static, local filters,\ntransformers use self-attention mechanisms to permit adaptive, non-local\nfiltering to sensitively capture long-range context. However, this sensitivity\ncomes at the expense of substantial model complexity, which can compromise\nlearning efficacy particularly on relatively modest-sized imaging datasets.\nHere, we propose a novel adversarial model for multi-modal medical image\nsynthesis, I2I-Mamba, that leverages selective state space modeling (SSM) to\nefficiently capture long-range context while maintaining local precision. To do\nthis, I2I-Mamba injects channel-mixed Mamba (cmMamba) blocks in the bottleneck\nof a convolutional backbone. In cmMamba blocks, SSM layers are used to learn\ncontext across the spatial dimension and channel-mixing layers are used to\nlearn context across the channel dimension of feature maps. Comprehensive\ndemonstrations are reported for imputing missing images in multi-contrast MRI\nand MRI-CT protocols. Our results indicate that I2I-Mamba offers superior\nperformance against state-of-the-art CNN- and transformer-based methods in\nsynthesizing target-modality images.\n","authors":["Omer F. Atli","Bilal Kabas","Fuat Arslan","Arda C. 
Demirtas","Mahmut Yurt","Onat Dalmaz","Tolga Çukur"],"pdf_url":"https://arxiv.org/pdf/2405.14022v4.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.17297v2","updated":"2024-11-15T10:35:19Z","published":"2024-08-30T13:52:26Z","title":"BOP-Distrib: Revisiting 6D Pose Estimation Benchmark for Better\n Evaluation under Visual Ambiguities","summary":" 6D pose estimation aims at determining the pose of the object that best\nexplains the camera observation. The unique solution for a non-symmetrical\nobject can turn into a multi-modal pose distribution for a symmetrical object\nor when occlusions of symmetry-breaking elements happen, depending on the\nviewpoint. Currently, 6D pose estimation methods are benchmarked on datasets\nthat consider, for their ground truth annotations, visual ambiguities as only\nrelated to global object symmetries, whereas they should be defined per-image\nto account for the camera viewpoint. We thus first propose an automatic method\nto re-annotate those datasets with a 6D pose distribution specific to each\nimage, taking into account the visibility of the object surface in the image to\ncorrectly determine the visual ambiguities. Second, given this improved ground\ntruth, we re-evaluate the state-of-the-art single pose methods and show that\nthis greatly modifies the ranking of these methods. Third, as some recent works\nfocus on estimating the complete set of solutions, we derive a precision/recall\nformulation to evaluate them against our image-wise distribution ground truth,\nmaking it the first benchmark for pose distribution methods on real images. 
We\nwill make our annotations for the T-LESS dataset and our code publicly\navailable.\n","authors":["Boris Meden","Asma Brazi","Fabrice Mayran de Chamisso","Steve Bourgeois"],"pdf_url":"https://arxiv.org/pdf/2408.17297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10086v1","updated":"2024-11-15T10:14:55Z","published":"2024-11-15T10:14:55Z","title":"CorrCLIP: Reconstructing Correlations in CLIP with Off-the-Shelf\n Foundation Models for Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation aims to assign semantic labels to each\npixel without relying on a predefined set of categories. Contrastive\nLanguage-Image Pre-training (CLIP) demonstrates outstanding zero-shot\nclassification capabilities but struggles with the pixel-wise segmentation task\nas the captured inter-patch correlations correspond to no specific visual\nconcepts. Despite previous CLIP-based works improving inter-patch correlations\nby self-self attention, they still face the inherent limitation that image\npatches tend to have high similarity to outlier ones. In this work, we\nintroduce CorrCLIP, a training-free approach for open-vocabulary semantic\nsegmentation, which reconstructs significantly coherent inter-patch\ncorrelations utilizing foundation models. Specifically, it employs the Segment\nAnything Model (SAM) to define the scope of patch interactions, ensuring that\npatches interact only with semantically similar ones. Furthermore, CorrCLIP\nobtains an understanding of an image's semantic layout via self-supervised\nmodels to determine concrete similarity values between image patches, which\naddresses the similarity irregularity problem caused by the aforementioned\nrestricted patch interaction regime. Finally, CorrCLIP reuses the region masks\nproduced by SAM to update the segmentation map. 
As a training-free method,\nCorrCLIP achieves a notable improvement across eight challenging benchmarks\nregarding the averaged mean Intersection over Union, boosting it from 44.4% to\n51.0%.\n","authors":["Dengke Zhang","Fagui Liu","Quan Tang"],"pdf_url":"https://arxiv.org/pdf/2411.10086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09402v2","updated":"2024-11-15T09:52:20Z","published":"2024-11-14T12:27:31Z","title":"Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast\n Computed Tomography Images for Enhanced Treatment and Prognosis","summary":" Stroke is the second leading cause of death worldwide, and is increasingly\nprevalent in low- and middle-income countries (LMICs). Timely interventions can\nsignificantly influence stroke survivability and the quality of life after\ntreatment. However, the standard and most widely available imaging method for\nconfirming strokes and their sub-types, the NCCT, is more challenging and\ntime-consuming to employ in cases of ischemic stroke. For this reason, we\ndeveloped an automated method for ischemic stroke lesion segmentation in NCCTs\nusing the nnU-Net frame work, aimed at enhancing early treatment and improving\nthe prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and\nIntersection over Union (IoU) scores of 0.501 on the sampled dataset. After\nadjusting for outliers, these scores improved to 0.752 for the Dice score and\n0.643 for the IoU. 
Proper delineation of the region of infarction can help\nclinicians better assess the potential impact of the infarction, and guide\ntreatment procedures.\n","authors":["Toufiq Musah","Prince Ebenezer Adjei","Kojo Obed Otoo"],"pdf_url":"https://arxiv.org/pdf/2411.09402v2.pdf","comment":"7 pages, 3 figures, MICCAI Meets Africa Workshop"},{"id":"http://arxiv.org/abs/2411.10081v1","updated":"2024-11-15T09:50:31Z","published":"2024-11-15T09:50:31Z","title":"Influence of Depth Camera Noise Models on Respiration Estimation","summary":" Depth cameras are an interesting modality for capturing vital signs such as\nrespiratory rate. Plenty approaches exist to extract vital signs in a\ncontrolled setting, but in order to apply them more flexibly for example in\nmulti-camera settings, a simulated environment is needed to generate enough\ndata for training and testing of new algorithms. We show first results of a\n3D-rendering simulation pipeline that focuses on different noise models in\norder to generate realistic, depth-camera based respiratory signals using both\nsynthetic and real respiratory signals as a baseline. While most noise can be\naccurately modelled as Gaussian in this context, we can show that as soon as\nthe available image resolution is too low, the differences between different\nnoise models surface.\n","authors":["Maurice Rohr","Sebastian Dill"],"pdf_url":"https://arxiv.org/pdf/2411.10081v1.pdf","comment":"Poster Prague 2023 Conference, 4 pages"},{"id":"http://arxiv.org/abs/2407.19674v6","updated":"2024-11-15T09:48:05Z","published":"2024-07-29T03:30:09Z","title":"Advancing Prompt Learning through an External Layer","summary":" Prompt learning represents a promising method for adapting pre-trained\nvision-language models (VLMs) to various downstream tasks by learning a set of\ntext embeddings. One challenge inherent to these methods is the poor\ngeneralization performance due to the invalidity of the learned text embeddings\nfor unseen tasks. 
A straightforward approach to bridge this gap is to freeze\nthe text embeddings in prompts, which results in a lack of capacity to adapt\nVLMs for downstream tasks. To address this dilemma, we propose a paradigm\ncalled EnPrompt with a novel External Layer (EnLa). Specifically, we propose a\ntextual external layer and learnable visual embeddings for adapting VLMs to\ndownstream tasks. The learnable external layer is built upon valid embeddings\nof pre-trained CLIP. This design considers the balance of learning capabilities\nbetween the two branches. To align the textual and visual features, we propose\na novel two-pronged approach: i) we introduce the optimal transport as the\ndiscrepancy metric to align the vision and text modalities, and ii) we\nintroduce a novel strengthening feature to enhance the interaction between\nthese two modalities. Four representative experiments (i.e., base-to-novel\ngeneralization, few-shot learning, cross-dataset generalization, domain shifts\ngeneralization) across 15 datasets demonstrate that our method outperforms the\nexisting prompt learning method.\n","authors":["Fangming Cui","Xun Yang","Chao Wu","Liang Xiao","Xinmei Tian"],"pdf_url":"https://arxiv.org/pdf/2407.19674v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10077v1","updated":"2024-11-15T09:45:32Z","published":"2024-11-15T09:45:32Z","title":"Uncertainty-Weighted Mutual Distillation for Multi-View Fusion","summary":" Multi-view learning often faces challenges in effectively leveraging images\ncaptured from different angles and locations. This challenge is particularly\npronounced when addressing inconsistencies and uncertainties between views. In\nthis paper, we propose a novel Multi-View Uncertainty-Weighted Mutual\nDistillation (MV-UWMD) method. Our method enhances prediction consistency by\nperforming hierarchical mutual distillation across all possible view\ncombinations, including single-view, partial multi-view, and full multi-view\npredictions. 
This introduces an uncertainty-based weighting mechanism through\nmutual distillation, allowing effective exploitation of unique information from\neach view while mitigating the impact of uncertain predictions. We extend a\nCNN-Transformer hybrid architecture to facilitate robust feature learning and\nintegration across multiple view combinations. We conducted extensive\nexperiments using a large, unstructured dataset captured from diverse,\nnon-fixed viewpoints. The results demonstrate that MV-UWMD improves prediction\naccuracy and consistency compared to existing multi-view learning approaches.\n","authors":["Jiwoong Yang","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2411.10077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10074v1","updated":"2024-11-15T09:39:12Z","published":"2024-11-15T09:39:12Z","title":"Improving the accuracy of automated labeling of specimen images datasets\n via a confidence-based process","summary":" The digitization of natural history collections over the past three decades\nhas unlocked a treasure trove of specimen imagery and metadata. There is great\ninterest in making this data more useful by further labeling it with additional\ntrait data, and modern deep learning machine learning techniques utilizing\nconvolutional neural nets (CNNs) and similar networks show particular promise\nto reduce the amount of required manual labeling by human experts, making the\nprocess much faster and less expensive. However, in most cases, the accuracy of\nthese approaches is too low for reliable utilization of the automatic labeling,\ntypically in the range of 80-85% accuracy. In this paper, we present and\nvalidate an approach that can greatly improve this accuracy, essentially by\nexamining the confidence that the network has in the generated label as well as\nutilizing a user-defined threshold to reject labels that fall below a chosen\nlevel. 
We demonstrate that a naive model that produced 86% initial accuracy can\nachieve improved performance - over 95% accuracy (rejecting about 40% of the\nlabels) or over 99% accuracy (rejecting about 65%) by selecting higher\nconfidence thresholds. This gives flexibility to adapt existing models to the\nstatistical requirements of various types of research and has the potential to\nmove these automatic labeling approaches from being unusably inaccurate to\nbeing an invaluable new tool. After validating the approach in a number of\nways, we annotate the reproductive state of a large dataset of over 600,000\nherbarium specimens. The analysis of the results points at under-investigated\ncorrelations as well as general alignment with known trends. By sharing this\nnew dataset alongside this work, we want to allow ecologists to gather insights\nfor their own research questions, at their chosen point of accuracy/coverage\ntrade-off.\n","authors":["Quentin Bateux","Jonathan Koss","Patrick W. Sweeney","Erika Edwards","Nelson Rios","Aaron M. Dollar"],"pdf_url":"https://arxiv.org/pdf/2411.10074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10072v1","updated":"2024-11-15T09:37:49Z","published":"2024-11-15T09:37:49Z","title":"Real-Time AI-Driven People Tracking and Counting Using Overhead Cameras","summary":" Accurate people counting in smart buildings and intelligent transportation\nsystems is crucial for energy management, safety protocols, and resource\nallocation. This is especially critical during emergencies, where precise\noccupant counts are vital for safe evacuation. Existing methods struggle with\nlarge crowds, often losing accuracy with even a few additional people. To\naddress this limitation, this study proposes a novel approach combining a new\nobject tracking algorithm, a novel counting algorithm, and a fine-tuned object\ndetection model. 
This method achieves 97% accuracy in real-time people counting\nwith a frame rate of 20-27 FPS on a low-power edge computer.\n","authors":["Ishrath Ahamed","Chamith Dilshan Ranathunga","Dinuka Sandun Udayantha","Benny Kai Kiat Ng","Chau Yuen"],"pdf_url":"https://arxiv.org/pdf/2411.10072v1.pdf","comment":"This paper is accepted to IEEE Region 10 conference (TENCON) 2024"},{"id":"http://arxiv.org/abs/2411.10071v1","updated":"2024-11-15T09:34:28Z","published":"2024-11-15T09:34:28Z","title":"Evidential Federated Learning for Skin Lesion Image Classification","summary":" We introduce FedEvPrompt, a federated learning approach that integrates\nprinciples of evidential deep learning, prompt tuning, and knowledge\ndistillation for distributed skin lesion classification. FedEvPrompt leverages\ntwo sets of prompts: b-prompts (for low-level basic visual knowledge) and\nt-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision\nTransformer (ViT) models trained in an evidential learning framework to\nmaximize class evidences. Crucially, knowledge sharing across federation\nclients is achieved only through knowledge distillation on attention maps\ngenerated by the local ViT models, ensuring enhanced privacy preservation\ncompared to traditional parameter or synthetic image sharing methodologies.\nFedEvPrompt is optimized within a round-based learning paradigm, where each\nround involves training local models followed by attention maps sharing with\nall federation clients. Experimental validation conducted in a real distributed\nsetting, on the ISIC2019 dataset, demonstrates the superior performance of\nFedEvPrompt against baseline federated learning algorithms and knowledge\ndistillation methods, without sharing model parameters. 
In conclusion,\nFedEvPrompt offers a promising approach for federated learning, effectively\naddressing challenges such as data heterogeneity, imbalance, privacy\npreservation, and knowledge sharing.\n","authors":["Rutger Hendrix","Federica Proietto Salanitri","Concetto Spampinato","Simone Palazzo","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2411.10071v1.pdf","comment":"Published as a conference paper at ICPR 2024"},{"id":"http://arxiv.org/abs/2410.15658v2","updated":"2024-11-15T09:34:23Z","published":"2024-10-21T05:56:31Z","title":"Calibration of ordinal regression networks","summary":" Recent studies have shown that deep neural networks are not well-calibrated\nand often produce over-confident predictions. The miscalibration issue\nprimarily stems from using cross-entropy in classifications, which aims to\nalign predicted softmax probabilities with one-hot labels. In ordinal\nregression tasks, this problem is compounded by an additional challenge: the\nexpectation that softmax probabilities should exhibit unimodal distribution is\nnot met with cross-entropy. The ordinal regression literature has focused on\nlearning orders and overlooked calibration. To address both issues, we propose\na novel loss function that introduces order-aware calibration, ensuring that\nprediction confidence adheres to ordinal relationships between classes. It\nincorporates soft ordinal encoding and order-aware regularization to enforce\nboth calibration and unimodality. 
Extensive experiments across three popular\nordinal regression benchmarks demonstrate that our approach achieves\nstate-of-the-art calibration without compromising accuracy.\n","authors":["Daehwan Kim","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2410.15658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10070v1","updated":"2024-11-15T09:34:07Z","published":"2024-11-15T09:34:07Z","title":"Step-wise Distribution Alignment Guided Style Prompt Tuning for\n Source-free Cross-domain Few-shot Learning","summary":" Existing cross-domain few-shot learning (CDFSL) methods, which develop\nsource-domain training strategies to enhance model transferability, face\nchallenges with large-scale pre-trained models (LMs) due to inaccessible source\ndata and training strategies. Moreover, fine-tuning LMs for CDFSL demands\nsubstantial computational resources, limiting practicality. This paper\naddresses the source-free CDFSL (SF-CDFSL) problem, tackling few-shot learning\n(FSL) in the target domain using only pre-trained models and a few target\nsamples without source data or strategies. To overcome the challenge of\ninaccessible source data, this paper introduces Step-wise Distribution\nAlignment Guided Style Prompt Tuning (StepSPT), which implicitly narrows domain\ngaps through prediction distribution optimization. StepSPT proposes a style\nprompt to align target samples with the desired distribution and adopts a\ndual-phase optimization process. In the external process, a step-wise\ndistribution alignment strategy factorizes prediction distribution optimization\ninto a multi-step alignment problem to tune the style prompt. In the internal\nprocess, the classifier is updated using standard cross-entropy loss.\nEvaluations on five datasets demonstrate that StepSPT outperforms existing\nprompt tuning-based methods and SOTAs. Ablation studies further verify its\neffectiveness. 
Code will be made publicly available at\n\\url{https://github.com/xuhuali-mxj/StepSPT}.\n","authors":["Huali Xu","Yongxiang Liu","Li Liu","Shuaifeng Zhi","Shuzhou Sun","Tianpeng Liu","MingMing Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.10070v1.pdf","comment":"15 pages, 12 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.10068v1","updated":"2024-11-15T09:33:13Z","published":"2024-11-15T09:33:13Z","title":"Diachronic Document Dataset for Semantic Layout Analysis","summary":" We present a novel, open-access dataset designed for semantic layout\nanalysis, built to support document recreation workflows through mapping with\nthe Text Encoding Initiative (TEI) standard. This dataset includes 7,254\nannotated pages spanning a large temporal range (1600-2024) of digitised and\nborn-digital materials across diverse document types (magazines, papers from\nsciences and humanities, PhD theses, monographs, plays, administrative reports,\netc.) sorted into modular subsets. By incorporating content from different\nperiods and genres, it addresses varying layout complexities and historical\nchanges in document structure. The modular design allows domain-specific\nconfigurations. We evaluate object detection models on this dataset, examining\nthe impact of input size and subset-based training. 
Results show that a\n1280-pixel input size for YOLO is optimal and that training on subsets\ngenerally benefits from incorporating them into a generic model rather than\nfine-tuning pre-trained weights.\n","authors":["Thibault Clérice","Juliette Janes","Hugo Scheithauer","Sarah Bénière","Florian Cafiero","Laurent Romary","Simon Gabay","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2411.10068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10063v1","updated":"2024-11-15T09:26:00Z","published":"2024-11-15T09:26:00Z","title":"Federated Domain Generalization via Prompt Learning and Aggregation","summary":" Federated domain generalization (FedDG) aims to improve the global model\ngeneralization in unseen domains by addressing data heterogeneity under\nprivacy-preserving constraints. A common strategy in existing FedDG studies\ninvolves sharing domain-specific knowledge among clients, such as spectrum\ninformation, class prototypes, and data styles. However, this knowledge is\nextracted directly from local client samples, and sharing such sensitive\ninformation poses a potential risk of data leakage, which might not fully meet\nthe requirements of FedDG. In this paper, we introduce prompt learning to adapt\npre-trained vision-language models (VLMs) in the FedDG scenario, and leverage\nlocally learned prompts as a more secure bridge to facilitate knowledge\ntransfer among clients. Specifically, we propose a novel FedDG framework\nthrough Prompt Learning and AggregatioN (PLAN), which comprises two training\nstages to collaboratively generate local prompts and global prompts at each\nfederated round. First, each client performs both text and visual prompt\nlearning using their own data, with local prompts indirectly synchronized by\nregarding the global prompts as a common reference. Second, all domain-specific\nlocal prompts are exchanged among clients and selectively aggregated into the\nglobal prompts using lightweight attention-based aggregators. 
The global\nprompts are finally applied to adapt VLMs to unseen target domains. As our PLAN\nframework requires training only a limited number of prompts and lightweight\naggregators, it offers notable advantages in computational and communication\nefficiency for FedDG. Extensive experiments demonstrate the superior\ngeneralization ability of PLAN across four benchmark datasets.\n","authors":["Shuai Gong","Chaoran Cui","Chunyun Zhang","Wenna Wang","Xiushan Nie","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.10063v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.10061v1","updated":"2024-11-15T09:23:18Z","published":"2024-11-15T09:23:18Z","title":"EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation","summary":" Recent work on human animation usually involves audio, pose, or movement maps\nconditions, thereby achieves vivid animation quality. However, these methods\noften face practical challenges due to extra control conditions, cumbersome\ncondition injection modules, or limitation to head region driving. Hence, we\nask if it is possible to achieve striking half-body human animation while\nsimplifying unnecessary conditions. To this end, we propose a half-body human\nanimation method, dubbed EchoMimicV2, that leverages a novel Audio-Pose Dynamic\nHarmonization strategy, including Pose Sampling and Audio Diffusion, to enhance\nhalf-body details, facial and gestural expressiveness, and meanwhile reduce\nconditions redundancy. To compensate for the scarcity of half-body data, we\nutilize Head Partial Attention to seamlessly accommodate headshot data into our\ntraining framework, which can be omitted during inference, providing a free\nlunch for animation. Furthermore, we design the Phase-specific Denoising Loss\nto guide motion, detail, and low-level quality for animation in specific\nphases, respectively. 
Besides, we also present a novel benchmark for evaluating\nthe effectiveness of half-body human animation. Extensive experiments and\nanalyses demonstrate that EchoMimicV2 surpasses existing methods in both\nquantitative and qualitative evaluations.\n","authors":["Rang Meng","Xingyu Zhang","Yuming Li","Chenguang Ma"],"pdf_url":"https://arxiv.org/pdf/2411.10061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18610v2","updated":"2024-11-15T09:02:32Z","published":"2024-10-24T10:06:45Z","title":"A Joint Representation Using Continuous and Discrete Features for\n Cardiovascular Diseases Risk Prediction on Chest CT Scans","summary":" Cardiovascular diseases (CVD) remain a leading health concern and contribute\nsignificantly to global mortality rates. While clinical advancements have led\nto a decline in CVD mortality, accurately identifying individuals who could\nbenefit from preventive interventions remains an unsolved challenge in\npreventive cardiology. Current CVD risk prediction models, recommended by\nguidelines, are based on limited traditional risk factors or use CT imaging to\nacquire quantitative biomarkers, and still have limitations in predictive\naccuracy and applicability. On the other hand, end-to-end trained CVD risk\nprediction methods leveraging deep learning on CT images often fail to provide\ntransparent and explainable decision grounds for assisting physicians. In this\nwork, we proposed a novel joint representation that integrates discrete\nquantitative biomarkers and continuous deep features extracted from chest CT\nscans. Our approach initiated with a deep CVD risk classification model by\ncapturing comprehensive continuous deep learning features while jointly\nobtaining currently clinical-established quantitative biomarkers via\nsegmentation models. 
In the feature joint representation stage, we use an\ninstance-wise feature-gated mechanism to align the continuous and discrete\nfeatures, followed by a soft instance-wise feature interaction mechanism\nfostering independent and effective feature interaction for the final CVD risk\nprediction. Our method substantially improves CVD risk predictive performance\nand offers individual contribution analysis of each biomarker, which is\nimportant in assisting physicians' decision-making processes. We validated our\nmethod on a public chest low-dose CT dataset and a private external chest\nstandard-dose CT patient cohort of 17,207 CT volumes from 6,393 unique\nsubjects, and demonstrated superior predictive performance, achieving AUCs of\n0.875 and 0.843, respectively.\n","authors":["Minfeng Xu","Chen-Chen Fan","Yan-Jie Zhou","Wenchao Guo","Pan Liu","Jing Qi","Le Lu","Hanqing Chao","Kunlun He"],"pdf_url":"https://arxiv.org/pdf/2410.18610v2.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2306.17799v2","updated":"2024-11-15T08:44:58Z","published":"2023-06-16T16:02:44Z","title":"A Low-rank Matching Attention based Cross-modal Feature Fusion Method\n for Conversational Emotion Recognition","summary":" Conversational emotion recognition (CER) is an important research topic in\nhuman-computer interactions. {Although recent advancements in transformer-based\ncross-modal fusion methods have shown promise in CER tasks, they tend to\noverlook the crucial intra-modal and inter-modal emotional interaction or\nsuffer from high computational complexity. To address this, we introduce a\nnovel and lightweight cross-modal feature fusion method called Low-Rank\nMatching Attention Method (LMAM). LMAM effectively captures contextual\nemotional semantic information in conversations while mitigating the quadratic\ncomplexity issue caused by the self-attention mechanism. 
Specifically, by\nsetting a matching weight and calculating inter-modal features attention scores\nrow by row, LMAM requires only one-third of the parameters of self-attention\nmethods. We also employ the low-rank decomposition method on the weights to\nfurther reduce the number of parameters in LMAM. As a result, LMAM offers a\nlightweight model while avoiding overfitting problems caused by a large number\nof parameters. Moreover, LMAM is able to fully exploit the intra-modal\nemotional contextual information within each modality and integrates\ncomplementary emotional semantic information across modalities by computing and\nfusing similarities of intra-modal and inter-modal features simultaneously.\nExperimental results verify the superiority of LMAM compared with other popular\ncross-modal fusion methods on the premise of being more lightweight. Also, LMAM\ncan be embedded into any existing state-of-the-art CER methods in a\nplug-and-play manner, and can be applied to other multi-modal recognition\ntasks, e.g., session recommendation and humour detection, demonstrating its\nremarkable generalization ability.\n","authors":["Yuntao Shou","Huan Liu","Xiangyong Cao","Deyu Meng","Bo Dong"],"pdf_url":"https://arxiv.org/pdf/2306.17799v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06740v3","updated":"2024-11-15T08:36:51Z","published":"2024-08-13T09:00:35Z","title":"DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with\n Diffusion","summary":" Personalized text-to-image generation has gained significant attention for\nits capability to generate high-fidelity portraits of specific identities\nconditioned on user-defined prompts. Existing methods typically involve\ntest-time fine-tuning or incorporating an additional pre-trained branch.\nHowever, these approaches struggle to simultaneously address efficiency,\nidentity fidelity, and the preservation of the model's original generative\ncapabilities. 
In this paper, we propose DiffLoRA, an efficient method that\nleverages the diffusion model as a hypernetwork to predict personalized\nLow-Rank Adaptation (LoRA) weights based on the reference images. By\nincorporating these LoRA weights into the off-the-shelf text-to-image model,\nDiffLoRA enables zero-shot personalization during inference, eliminating the\nneed for post-processing optimization. Moreover, we introduce a novel\nidentity-oriented LoRA weights construction pipeline to facilitate the training\nprocess of DiffLoRA. The dataset generated through this pipeline enables\nDiffLoRA to produce consistently high-quality LoRA weights. Notably, the\ndistinctive properties of the diffusion model enhance the generation of\nsuperior weights by employing probabilistic modeling to capture intricate\nstructural patterns and thoroughly explore the weight space. Comprehensive\nexperimental results demonstrate that DiffLoRA outperforms existing\npersonalization approaches across multiple benchmarks, achieving both time\nefficiency and maintaining identity fidelity throughout the personalization\nprocess.\n","authors":["Yujia Wu","Yiming Shi","Jiwei Wei","Chengwei Sun","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2408.06740v3.pdf","comment":"9 pages,8 figures"},{"id":"http://arxiv.org/abs/2411.10036v1","updated":"2024-11-15T08:36:24Z","published":"2024-11-15T08:36:24Z","title":"Rethinking Normalization Strategies and Convolutional Kernels for\n Multimodal Image Fusion","summary":" Multimodal image fusion (MMIF) aims to integrate information from different\nmodalities to obtain a comprehensive image, aiding downstream tasks. However,\nexisting methods tend to prioritize natural image fusion and focus on\ninformation complementary and network training strategies. They ignore the\nessential distinction between natural and medical image fusion and the\ninfluence of underlying components. 
This paper dissects the significant\ndifferences between the two tasks regarding fusion goals, statistical\nproperties, and data distribution. Based on this, we rethink the suitability of\nthe normalization strategy and convolutional kernels for end-to-end\nMMIF.Specifically, this paper proposes a mixture of instance normalization and\ngroup normalization to preserve sample independence and reinforce intrinsic\nfeature correlation.This strategy promotes the potential of enriching feature\nmaps, thus boosting fusion performance. To this end, we further introduce the\nlarge kernel convolution, effectively expanding receptive fields and enhancing\nthe preservation of image detail. Moreover, the proposed multipath adaptive\nfusion module recalibrates the decoder input with features of various scales\nand receptive fields, ensuring the transmission of crucial information.\nExtensive experiments demonstrate that our method exhibits state-of-the-art\nperformance in multiple fusion tasks and significantly improves downstream\napplications. The code is available at https://github.com/HeDan-11/LKC-FUNet.\n","authors":["Dan He","Guofen Wang","Weisheng Li","Yucheng Shu","Wenbo Li","Lijian Yang","Yuping Huang","Feiyan Li"],"pdf_url":"https://arxiv.org/pdf/2411.10036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03519v2","updated":"2024-11-15T08:32:02Z","published":"2024-09-05T13:32:40Z","title":"Tissue Concepts: supervised foundation models in computational pathology","summary":" Due to the increasing workload of pathologists, the need for automation to\nsupport diagnostic tasks and quantitative biomarker evaluation is becoming more\nand more apparent. Foundation models have the potential to improve\ngeneralizability within and across centers and serve as starting points for\ndata efficient development of specialized yet robust AI models. However, the\ntraining foundation models themselves is usually very expensive in terms of\ndata, computation, and time. 
This paper proposes a supervised training method\nthat drastically reduces these expenses. The proposed method is based on\nmulti-task learning to train a joint encoder, by combining 16 different\nclassification, segmentation, and detection tasks on a total of 912,000\npatches. Since the encoder is capable of capturing the properties of the\nsamples, we term it the Tissue Concepts encoder. To evaluate the performance\nand generalizability of the Tissue Concepts encoder across centers,\nclassification of whole slide images from four of the most prevalent solid\ncancers - breast, colon, lung, and prostate - was used. The experiments show\nthat the Tissue Concepts model achieve comparable performance to models trained\nwith self-supervision, while requiring only 6% of the amount of training\npatches. Furthermore, the Tissue Concepts encoder outperforms an ImageNet\npre-trained encoder on both in-domain and out-of-domain data.\n","authors":["Till Nicke","Jan Raphael Schaefer","Henning Hoefener","Friedrich Feuerhake","Dorit Merhof","Fabian Kiessling","Johannes Lotz"],"pdf_url":"https://arxiv.org/pdf/2409.03519v2.pdf","comment":"22 Pages, 3 Figures, submitted to and under revision at Computers in\n Biology and Medicine"},{"id":"http://arxiv.org/abs/2411.10033v1","updated":"2024-11-15T08:25:14Z","published":"2024-11-15T08:25:14Z","title":"GSEditPro: 3D Gaussian Splatting Editing with Attention-based\n Progressive Localization","summary":" With the emergence of large-scale Text-to-Image(T2I) models and implicit 3D\nrepresentations like Neural Radiance Fields (NeRF), many text-driven generative\nediting methods based on NeRF have appeared. However, the implicit encoding of\ngeometric and textural information poses challenges in accurately locating and\ncontrolling objects during editing. Recently, significant advancements have\nbeen made in the editing methods of 3D Gaussian Splatting, a real-time\nrendering technology that relies on explicit representation. 
However, these\nmethods still suffer from issues including inaccurate localization and limited\nmanipulation over editing. To tackle these challenges, we propose GSEditPro, a\nnovel 3D scene editing framework which allows users to perform various creative\nand precise editing using text prompts only. Leveraging the explicit nature of\nthe 3D Gaussian distribution, we introduce an attention-based progressive\nlocalization module to add semantic labels to each Gaussian during rendering.\nThis enables precise localization on editing areas by classifying Gaussians\nbased on their relevance to the editing prompts derived from cross-attention\nlayers of the T2I model. Furthermore, we present an innovative editing\noptimization method based on 3D Gaussian Splatting, obtaining stable and\nrefined editing results through the guidance of Score Distillation Sampling and\npseudo ground truth. We prove the efficacy of our method through extensive\nexperiments.\n","authors":["Yanhao Sun","RunZe Tian","Xiao Han","XinYao Liu","Yan Zhang","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10033v1.pdf","comment":"Pacific Graphics 2024"},{"id":"http://arxiv.org/abs/2411.10032v1","updated":"2024-11-15T08:20:26Z","published":"2024-11-15T08:20:26Z","title":"VMID: A Multimodal Fusion LLM Framework for Detecting and Identifying\n Misinformation of Short Videos","summary":" Short video platforms have become important channels for news dissemination,\noffering a highly engaging and immediate way for users to access current events\nand share information. However, these platforms have also emerged as\nsignificant conduits for the rapid spread of misinformation, as fake news and\nrumors can leverage the visual appeal and wide reach of short videos to\ncirculate extensively among audiences. 
Existing fake news detection methods\nmainly rely on single-modal information, such as text or images, or apply only\nbasic fusion techniques, limiting their ability to handle the complex,\nmulti-layered information inherent in short videos. To address these\nlimitations, this paper presents a novel fake news detection method based on\nmultimodal information, designed to identify misinformation through a\nmulti-level analysis of video content. This approach effectively utilizes\ndifferent modal representations to generate a unified textual description,\nwhich is then fed into a large language model for comprehensive evaluation. The\nproposed framework successfully integrates multimodal features within videos,\nsignificantly enhancing the accuracy and reliability of fake news detection.\nExperimental results demonstrate that the proposed approach outperforms\nexisting models in terms of accuracy, robustness, and utilization of multimodal\ninformation, achieving an accuracy of 90.93%, which is significantly higher\nthan the best baseline model (SV-FEND) at 81.05%. Furthermore, case studies\nprovide additional evidence of the effectiveness of the approach in accurately\ndistinguishing between fake news, debunking content, and real incidents,\nhighlighting its reliability and robustness in real-world applications.\n","authors":["Weihao Zhong","Yinhao Xiao","Minghui Xu","Xiuzhen Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.10032v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.10973 by other authors"},{"id":"http://arxiv.org/abs/2411.10029v1","updated":"2024-11-15T08:17:08Z","published":"2024-11-15T08:17:08Z","title":"Toward Robust and Accurate Adversarial Camouflage Generation against\n Vehicle Detectors","summary":" Adversarial camouflage is a widely used physical attack against vehicle\ndetectors for its superiority in multi-view attack performance. 
One promising\napproach involves using differentiable neural renderers to facilitate\nadversarial camouflage optimization through gradient back-propagation. However,\nexisting methods often struggle to capture environmental characteristics during\nthe rendering process or produce adversarial textures that can precisely map to\nthe target vehicle. Moreover, these approaches neglect diverse weather\nconditions, reducing the efficacy of generated camouflage across varying\nweather scenarios. To tackle these challenges, we propose a robust and accurate\ncamouflage generation method, namely RAUCA. The core of RAUCA is a novel neural\nrendering component, End-to-End Neural Renderer Plus (E2E-NRP), which can\naccurately optimize and project vehicle textures and render images with\nenvironmental characteristics such as lighting and weather. In addition, we\nintegrate a multi-weather dataset for camouflage generation, leveraging the\nE2E-NRP to enhance the attack robustness. Experimental results on six popular\nobject detectors show that RAUCA-final outperforms existing methods in both\nsimulation and real-world settings.\n","authors":["Jiawei Zhou","Linye Lyu","Daojing He","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.10029v1.pdf","comment":"14 pages. arXiv admin note: substantial text overlap with\n arXiv:2402.15853"},{"id":"http://arxiv.org/abs/2411.10028v1","updated":"2024-11-15T08:17:05Z","published":"2024-11-15T08:17:05Z","title":"MOT\\_FCG++: Enhanced Representation of Motion and Appearance Features","summary":" The goal of multi-object tracking (MOT) is to detect and track all objects in\na scene across frames, while maintaining a unique identity for each object.\nMost existing methods rely on the spatial motion features and appearance\nembedding features of the detected objects in consecutive frames. Effectively\nand robustly representing the spatial and appearance features of long\ntrajectories has become a critical factor affecting the performance of MOT. 
We\npropose a novel approach for appearance and spatial feature representation,\nimproving upon the clustering association method MOT\\_FCG. For spatial motion\nfeatures, we propose Diagonal Modulated GIoU, which more accurately represents\nthe relationship between the position and shape of the objects. For appearance\nfeatures, we utilize a dynamic appearance representation that incorporates\nconfidence information, enabling the trajectory appearance features to be more\nrobust and global. Based on the baseline model MOT\\_FCG, we achieved 76.1 HOTA,\n80.4 MOTA and 81.3 IDF1 on the MOT17 validation set, and also achieved\ncompetitive performance on the MOT20 and DanceTrack validation sets.\n","authors":["Yanzhao Fang"],"pdf_url":"https://arxiv.org/pdf/2411.10028v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.18816v3","updated":"2024-11-15T08:10:51Z","published":"2024-05-29T06:56:12Z","title":"Flow Priors for Linear Inverse Problems via Iterative Corrupted\n Trajectory Matching","summary":" Generative models based on flow matching have attracted significant attention\nfor their simplicity and superior performance in high-resolution image\nsynthesis. By leveraging the instantaneous change-of-variables formula, one can\ndirectly compute image likelihoods from a learned flow, making them enticing\ncandidates as priors for downstream tasks such as inverse problems. In\nparticular, a natural approach would be to incorporate such image probabilities\nin a maximum-a-posteriori (MAP) estimation problem. A major obstacle, however,\nlies in the slow computation of the log-likelihood, as it requires\nbackpropagating through an ODE solver, which can be prohibitively slow for\nhigh-dimensional problems. In this work, we propose an iterative algorithm to\napproximate the MAP estimator efficiently to solve a variety of linear inverse\nproblems. 
Our algorithm is mathematically justified by the observation that the\nMAP objective can be approximated by a sum of $N$ ``local MAP'' objectives,\nwhere $N$ is the number of function evaluations. By leveraging Tweedie's\nformula, we show that we can perform gradient steps to sequentially optimize\nthese objectives. We validate our approach for various linear inverse problems,\nsuch as super-resolution, deblurring, inpainting, and compressed sensing, and\ndemonstrate that we can outperform other methods based on flow matching. Code\nis available at https://github.com/YasminZhang/ICTM.\n","authors":["Yasi Zhang","Peiyu Yu","Yaxuan Zhu","Yingshan Chang","Feng Gao","Ying Nian Wu","Oscar Leong"],"pdf_url":"https://arxiv.org/pdf/2405.18816v3.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.17484v3","updated":"2024-11-15T08:02:03Z","published":"2024-05-24T16:18:16Z","title":"Bridging The Gap between Low-rank and Orthogonal Adaptation via\n Householder Reflection Adaptation","summary":" While following different technical routes, both low-rank and orthogonal\nadaptation techniques can efficiently adapt large-scale pre-training models in\nspecific tasks or domains based on a small piece of trainable parameters. In\nthis study, we bridge the gap between these two techniques, proposing a simple\nbut effective adaptation method based on Householder reflections. Given a\npre-trained model, our method fine-tunes its layers by multiplying each frozen\nweight matrix with an orthogonal matrix constructed by a chain of learnable\nHouseholder reflections (HRs). This HR-based orthogonal fine-tuning is\nequivalent to an adaptive low-rank adaptation. Moreover, we show that the\northogonality of the reflection planes corresponding to the HRs impacts the\nmodel capacity and regularity. The analysis motivates us to regularize the\northogonality of the HRs, leading to different implementations of the proposed\nHouseholder reflection adaptation (HRA) method. 
Compared with state-of-the-art\nmethods, HRA achieves superior performance with fewer learnable parameters when\nadapting large language models and conditional image generators. The code of\nthe experiments is available at \\url{https://github.com/DaShenZi721/HRA}, and\nthe method has been merged into the\n\\href{https://github.com/huggingface/peft}{PEFT} package.\n","authors":["Shen Yuan","Haotian Liu","Hongteng Xu"],"pdf_url":"https://arxiv.org/pdf/2405.17484v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10019v1","updated":"2024-11-15T07:54:14Z","published":"2024-11-15T07:54:14Z","title":"Towards Utilising a Range of Neural Activations for Comprehending\n Representational Associations","summary":" Recent efforts to understand intermediate representations in deep neural\nnetworks have commonly attempted to label individual neurons and combinations\nof neurons that make up linear directions in the latent space by examining\nextremal neuron activations and the highest direction projections. In this\npaper, we show that this approach, although yielding a good approximation for\nmany purposes, fails to capture valuable information about the behaviour of a\nrepresentation. Neural network activations are generally dense, and so a more\ncomplex, but realistic scenario is that linear directions encode information at\nvarious levels of stimulation. We hypothesise that non-extremal level\nactivations contain complex information worth investigating, such as\nstatistical associations, and thus may be used to locate confounding human\ninterpretable concepts. We explore the value of studying a range of neuron\nactivations by taking the case of mid-level output neuron activations and\ndemonstrate on a synthetic dataset how they can inform us about aspects of\nrepresentations in the penultimate layer not evident through analysing maximal\nactivations alone. 
We use our findings to develop a method to curate data from\nmid-range logit samples for retraining to mitigate spurious correlations, or\nconfounding concepts in the penultimate layer, on real benchmark datasets. The\nsuccess of our method exemplifies the utility of inspecting non-maximal\nactivations to extract complex relationships learned by models.\n","authors":["Laura O'Mahony","Nikola S. Nikolov","David JP O'Sullivan"],"pdf_url":"https://arxiv.org/pdf/2411.10019v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.10015v1","updated":"2024-11-15T07:50:01Z","published":"2024-11-15T07:50:01Z","title":"MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field\n Analysis Using Deep Neural Networks through Feature Visualization","summary":" Micro Crack detection using deep neural networks (DNNs) through an automated\npipeline using wave fields interacting with the damaged areas is highly sought\nafter. These high-dimensional spatio-temporal crack data are limited, and these\ndatasets have large dimensions in the temporal domain. The dataset presents a\nsubstantial class imbalance, with crack pixels constituting an average of only\n5% of the total pixels per sample. This extreme class imbalance poses a\nchallenge for deep learning models with the different micro-scale cracks, as\nthe network can be biased toward predicting the majority class, generally\nleading to poor detection accuracy. This study builds upon the previous\nbenchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack\ndetection. The impact of various activation and loss functions were examined\nthrough feature space visualization using the manifold discovery and analysis\n(MDA) algorithm. 
The optimized architecture and training methodology achieved\nan accuracy of 86.85%.\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10013v1","updated":"2024-11-15T07:43:45Z","published":"2024-11-15T07:43:45Z","title":"Efficient Depth Estimation for Unstable Stereo Camera Systems on AR\n Glasses","summary":" Stereo depth estimation is a fundamental component in augmented reality (AR)\napplications. Although AR applications require very low latency for their\nreal-time applications, traditional depth estimation models often rely on\ntime-consuming preprocessing steps such as rectification to achieve high\naccuracy. Also, non standard ML operator based algorithms such as cost volume\nalso require significant latency, which is aggravated on compute\nresource-constrained mobile platforms. Therefore, we develop hardware-friendly\nalternatives to the costly cost volume and preprocessing and design two new\nmodels based on them, MultiHeadDepth and HomoDepth. Our approaches for cost\nvolume is replacing it with a new group-pointwise convolution-based operator\nand approximation of consine similarity based on layernorm and dot product. For\nonline stereo rectification (preprocessing), we introduce homograhy matrix\nprediction network with a rectification positional encoding (RPE), which\ndelivers both low latency and robustness to unrectified images, which\neliminates the needs for preprocessing. Our MultiHeadDepth, which includes\noptimized cost volume, provides 11.8-30.3% improvements in accuracy and\n22.9-25.2% reduction in latency compared to a state-of-the-art depth estimation\nmodel for AR glasses from industry. Our HomoDepth, which includes optimized\npreprocessing (Homograhpy + RPE) upon MultiHeadDepth, can process unrectified\nimages and reduce the end-to-end latency by 44.5%. 
We adopt a multi-task\nlearning framework to handle misaligned stereo inputs on HomoDepth, which\nreduces theAbsRel error by 10.0-24.3%. The results demonstrate the efficacy of\nour approaches in achieving both high model performance with low latency, which\nmakes a step forward toward practical depth estimation on future AR devices.\n","authors":["Yongfan Liu","Hyoukjun Kwon"],"pdf_url":"https://arxiv.org/pdf/2411.10013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10004v1","updated":"2024-11-15T07:30:53Z","published":"2024-11-15T07:30:53Z","title":"EyeDiff: text-to-image diffusion model improves rare eye disease\n diagnosis","summary":" The rising prevalence of vision-threatening retinal diseases poses a\nsignificant burden on the global healthcare systems. Deep learning (DL) offers\na promising solution for automatic disease screening but demands substantial\ndata. Collecting and labeling large volumes of ophthalmic images across various\nmodalities encounters several real-world challenges, especially for rare\ndiseases. Here, we introduce EyeDiff, a text-to-image model designed to\ngenerate multimodal ophthalmic images from natural language prompts and\nevaluate its applicability in diagnosing common and rare diseases. EyeDiff is\ntrained on eight large-scale datasets using the advanced latent diffusion\nmodel, covering 14 ophthalmic image modalities and over 80 ocular diseases, and\nis adapted to ten multi-country external datasets. The generated images\naccurately capture essential lesional characteristics, achieving high alignment\nwith text prompts as evaluated by objective metrics and human experts.\nFurthermore, integrating generated images significantly enhances the accuracy\nof detecting minority classes and rare eye diseases, surpassing traditional\noversampling methods in addressing data imbalance. 
EyeDiff effectively tackles\nthe issue of data imbalance and insufficiency typically encountered in rare\ndiseases and addresses the challenges of collecting large-scale annotated\nimages, offering a transformative solution to enhance the development of\nexpert-level diseases diagnosis models in ophthalmic field.\n","authors":["Ruoyu Chen","Weiyi Zhang","Bowen Liu","Xiaolan Chen","Pusheng Xu","Shunming Liu","Mingguang He","Danli Shi"],"pdf_url":"https://arxiv.org/pdf/2411.10004v1.pdf","comment":"28 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.14704v2","updated":"2024-11-15T07:19:03Z","published":"2024-09-23T04:50:36Z","title":"VLEU: a Method for Automatic Evaluation for Generalizability of\n Text-to-Image Models","summary":" Progress in Text-to-Image (T2I) models has significantly improved the\ngeneration of images from textual descriptions. However, existing evaluation\nmetrics do not adequately assess the models' ability to handle a diverse range\nof textual prompts, which is crucial for their generalizability. To address\nthis, we introduce a new metric called Visual Language Evaluation Understudy\n(VLEU). VLEU uses large language models to sample from the visual text domain,\nthe set of all possible input texts for T2I models, to generate a wide variety\nof prompts. The images generated from these prompts are evaluated based on\ntheir alignment with the input text using the CLIP model.VLEU quantifies a\nmodel's generalizability by computing the Kullback-Leibler divergence between\nthe marginal distribution of the visual text and the conditional distribution\nof the images generated by the model. 
This metric provides a quantitative way\nto compare different T2I models and track improvements during model finetuning.\nOur experiments demonstrate the effectiveness of VLEU in evaluating the\ngeneralization capability of various T2I models, positioning it as an essential\nmetric for future research in text-to-image synthesis.\n","authors":["Jingtao Cao","Zheng Zhang","Hongru Wang","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2409.14704v2.pdf","comment":"accepted by EMNLP2024(long paper,main conference)"},{"id":"http://arxiv.org/abs/2411.09998v1","updated":"2024-11-15T07:12:18Z","published":"2024-11-15T07:12:18Z","title":"Adaptive Non-Uniform Timestep Sampling for Diffusion Model Training","summary":" As a highly expressive generative model, diffusion models have demonstrated\nexceptional success across various domains, including image generation, natural\nlanguage processing, and combinatorial optimization. However, as data\ndistributions grow more complex, training these models to convergence becomes\nincreasingly computationally intensive. While diffusion models are typically\ntrained using uniform timestep sampling, our research shows that the variance\nin stochastic gradients varies significantly across timesteps, with\nhigh-variance timesteps becoming bottlenecks that hinder faster convergence. To\naddress this issue, we introduce a non-uniform timestep sampling method that\nprioritizes these more critical timesteps. Our method tracks the impact of\ngradient updates on the objective for each timestep, adaptively selecting those\nmost likely to minimize the objective effectively. Experimental results\ndemonstrate that this approach not only accelerates the training process, but\nalso leads to improved performance at convergence. 
Furthermore, our method\nshows robust performance across various datasets, scheduling strategies, and\ndiffusion architectures, outperforming previously proposed timestep sampling\nand weighting heuristics that lack this degree of robustness.\n","authors":["Myunsoo Kim","Donghyeon Ki","Seong-Woong Shim","Byung-Jun Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07747v2","updated":"2024-11-15T07:10:52Z","published":"2024-11-12T12:18:18Z","title":"Constraint Learning for Parametric Point Cloud","summary":" Parametric point clouds are sampled from CAD shapes, and have become\nincreasingly prevalent in industrial manufacturing. However, most existing\npoint cloud learning methods focus on the geometric features, such as\ndeveloping efficient convolution operations, overlooking the important\nattribute of constraints inherent in CAD shapes, which limits these methods'\nability to comprehend CAD shapes fully. To address this issue, we analyzed the\neffect of constraints, and proposed its deep learning-friendly representation,\nafter that, the Constraint Feature Learning Network (CstNet) was developed to\nextract and leverage constraints. Our CstNet includes two stages. Stage 1\nextracts constraints from B-Rep data or point cloud. Stage 2 leverages\ncoordinates and constraints to enhance the comprehension of CAD shapes.\nAdditionally, we built up the Parametric 20,000 Multi-modal Dataset for the\nscarcity of labeled B-Rep datasets. Experiments demonstrate that our CstNet\nachieved state-of-the-art performance on both public and proposed CAD shape\ndatasets. 
To the best of our knowledge, CstNet is the first constraint-based\nlearning method tailored for CAD shape analysis.\n","authors":["Xi Cheng","Ruiqi Lei","Di Huang","Zhichao Liao","Fengyuan Piao","Yan Chen","Pingfa Feng","Long Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.07747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09986v1","updated":"2024-11-15T06:43:49Z","published":"2024-11-15T06:43:49Z","title":"Unlocking Transfer Learning for Open-World Few-Shot Recognition","summary":" Few-Shot Open-Set Recognition (FSOSR) targets a critical real-world\nchallenge, aiming to categorize inputs into known categories, termed closed-set\nclasses, while identifying open-set inputs that fall outside these classes.\nAlthough transfer learning where a model is tuned to a given few-shot task has\nbecome a prominent paradigm in closed-world, we observe that it fails to expand\nto open-world. To unlock this challenge, we propose a two-stage method which\nconsists of open-set aware meta-learning with open-set free transfer learning.\nIn the open-set aware meta-learning stage, a model is trained to establish a\nmetric space that serves as a beneficial starting point for the subsequent\nstage. During the open-set free transfer learning stage, the model is further\nadapted to a specific target task through transfer learning. Additionally, we\nintroduce a strategy to simulate open-set examples by modifying the training\ndataset or generating pseudo open-set examples. The proposed method achieves\nstate-of-the-art performance on two widely recognized benchmarks, miniImageNet\nand tieredImageNet, with only a 1.5\\% increase in training effort. 
Our work\ndemonstrates the effectiveness of transfer learning in FSOSR.\n","authors":["Byeonggeun Kim","Juntae Lee","Kyuhong Shim","Simyung Chang"],"pdf_url":"https://arxiv.org/pdf/2411.09986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01483v3","updated":"2024-11-15T06:31:44Z","published":"2024-05-02T17:14:57Z","title":"MANTIS: Interleaved Multi-Image Instruction Tuning","summary":" Large multimodal models (LMMs) have shown great results in single-image\nvision language tasks. However, their abilities to solve multi-image visual\nlanguage tasks is yet to be improved. The existing LMMs like OpenFlamingo,\nEmu2, and Idefics gain their multi-image ability through pre-training on\nhundreds of millions of noisy interleaved image-text data from the web, which\nis neither efficient nor effective. In this paper, we aim to build strong\nmulti-image LMMs via instruction tuning with academic-level resources.\nTherefore, we meticulously construct Mantis-Instruct containing 721K\nmulti-image instruction data to train a family of Mantis models. The\ninstruction tuning empowers Mantis with different multi-image skills like\nco-reference, comparison, reasoning, and temporal understanding. We evaluate\nMantis on 8 multi-image benchmarks and 6 single-image benchmarks.\nMantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and\nbeat the strongest multi-image baseline, Idefics2-8B by an average of 13\nabsolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved\nmulti-image data, which is 200x larger than Mantis-Instruct. We observe that\nMantis performs equivalently well on the held-in and held-out benchmarks, which\nshows its generalization ability. We further evaluate Mantis on single-image\nbenchmarks and demonstrate that Mantis also maintains a strong single-image\nperformance on par with CogVLM and Emu2. 
Our results show that multi-image\nabilities are not necessarily gained through massive pre-training, instead,\nthey can be gained by low-cost instruction tuning. The training and evaluation\nof Mantis has paved the road for future work to improve LMMs' multi-image\nabilities.\n","authors":["Dongfu Jiang","Xuan He","Huaye Zeng","Cong Wei","Max Ku","Qian Liu","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01483v3.pdf","comment":"13 pages, 3 figures, 13 tables"},{"id":"http://arxiv.org/abs/2411.08933v2","updated":"2024-11-15T06:13:33Z","published":"2024-11-13T09:13:20Z","title":"Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for\n Certified Robustness","summary":" The remarkable advances in deep learning have led to the emergence of many\noff-the-shelf classifiers, e.g., large pre-trained models. However, since they\nare typically trained on clean data, they remain vulnerable to adversarial\nattacks. Despite this vulnerability, their superior performance and\ntransferability make off-the-shelf classifiers still valuable in practice,\ndemanding further work to provide adversarial robustness for them in a post-hoc\nmanner. A recently proposed method, denoised smoothing, leverages a denoiser\nmodel in front of the classifier to obtain provable robustness without\nadditional training. However, the denoiser often creates hallucination, i.e.,\nimages that have lost the semantics of their originally assigned class, leading\nto a drop in robustness. Furthermore, its noise-and-denoise procedure\nintroduces a significant distribution shift from the original distribution,\ncausing the denoised smoothing framework to achieve sub-optimal robustness. In\nthis paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image\nSelection (FT-CADIS), a novel fine-tuning scheme to enhance the certified\nrobustness of off-the-shelf classifiers. 
FT-CADIS is inspired by the\nobservation that the confidence of off-the-shelf classifiers can effectively\nidentify hallucinated images during denoised smoothing. Based on this, we\ndevelop a confidence-aware training objective to handle such hallucinated\nimages and improve the stability of fine-tuning from denoised images. In this\nway, the classifier can be fine-tuned using only images that are beneficial for\nadversarial robustness. We also find that such a fine-tuning can be done by\nupdating a small fraction of parameters of the classifier. Extensive\nexperiments demonstrate that FT-CADIS has established the state-of-the-art\ncertified robustness among denoised smoothing methods across all\n$\\ell_2$-adversary radius in various benchmarks.\n","authors":["Suhyeok Jang","Seojin Kim","Jinwoo Shin","Jongheon Jeong"],"pdf_url":"https://arxiv.org/pdf/2411.08933v2.pdf","comment":"26 pages; TMLR 2024; Code is available at\n https://github.com/suhyeok24/FT-CADIS"},{"id":"http://arxiv.org/abs/2411.09971v1","updated":"2024-11-15T06:05:33Z","published":"2024-11-15T06:05:33Z","title":"Explanation for Trajectory Planning using Multi-modal Large Language\n Model for Autonomous Driving","summary":" End-to-end style autonomous driving models have been developed recently.\nThese models lack interpretability of decision-making process from perception\nto control of the ego vehicle, resulting in anxiety for passengers. To\nalleviate it, it is effective to build a model which outputs captions\ndescribing future behaviors of the ego vehicle and their reason. However, the\nexisting approaches generate reasoning text that inadequately reflects the\nfuture plans of the ego vehicle, because they train models to output captions\nusing momentary control signals as inputs. 
In this study, we propose a\nreasoning model that takes future planning trajectories of the ego vehicle as\ninputs to solve this limitation with the dataset newly collected.\n","authors":["Shota Yamazaki","Chenyu Zhang","Takuya Nanri","Akio Shigekane","Siyuan Wang","Jo Nishiyama","Tao Chu","Kohei Yokosawa"],"pdf_url":"https://arxiv.org/pdf/2411.09971v1.pdf","comment":"Accepted and presented at ECCV 2024 2nd Workshop on Vision-Centric\n Autonomous Driving (VCAD) on September 30, 2024. 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.10806v2","updated":"2024-11-15T05:53:11Z","published":"2024-07-15T15:21:34Z","title":"Enhancing Robustness to Noise Corruption for Point Cloud Recognition via\n Spatial Sorting and Set-Mixing Aggregation Module","summary":" Current models for point cloud recognition demonstrate promising performance\non synthetic datasets. However, real-world point cloud data inevitably contains\nnoise, impacting model robustness. While recent efforts focus on enhancing\nrobustness through various strategies, there still remains a gap in\ncomprehensive analyzes from the standpoint of network architecture design.\nUnlike traditional methods that rely on generic techniques, our approach\noptimizes model robustness to noise corruption through network architecture\ndesign. Inspired by the token-mixing technique applied in 2D images, we propose\nSet-Mixer, a noise-robust aggregation module which facilitates communication\namong all points to extract geometric shape information and mitigating the\ninfluence of individual noise points. A sorting strategy is designed to enable\nour module to be invariant to point permutation, which also tackles the\nunordered structure of point cloud and introduces consistent relative spatial\ninformation. 
Experiments conducted on ModelNet40-C indicate that Set-Mixer\nsignificantly enhances the model performance on noisy point clouds,\nunderscoring its potential to advance real-world applicability in 3D\nrecognition and perception tasks.\n","authors":["Dingxin Zhang","Jianhui Yu","Tengfei Xue","Chaoyi Zhang","Dongnan Liu","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2407.10806v2.pdf","comment":"Accepted by ACCV2024"},{"id":"http://arxiv.org/abs/2411.09968v1","updated":"2024-11-15T05:51:29Z","published":"2024-11-15T05:51:29Z","title":"Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate\n Hallucination in LVLMs","summary":" The hallucination problem in multimodal large language models (MLLMs) remains\na common issue. Although image tokens occupy a majority of the input sequence\nof MLLMs, there is limited research to explore the relationship between image\ntokens and hallucinations. In this paper, we analyze the distribution of\nattention scores for image tokens across each layer and head of the model,\nrevealing an intriguing and common phenomenon: most hallucinations are closely\nlinked to the pattern of attention sinks in the self-attention matrix of image\ntokens, where shallow layers exhibit dense attention sinks and deeper layers\nshow sparse attention sinks. We further analyze the attention heads of\ndifferent layers and find that heads with high-density attention sink in the\nimage part play a positive role in alleviating hallucinations. In this paper,\nwe propose a training-free method named \\textcolor{red}{\\textbf{E}}nhancing\n\\textcolor{red}{\\textbf{A}}ttention \\textcolor{red}{\\textbf{H}}eads (EAH), an\napproach designed to enhance the convergence of image tokens attention sinks in\nthe shallow layers. EAH identifies the attention head that shows the vision\nsink in a shallow layer and extracts its attention matrix. 
This attention map\nis then broadcast to other heads in the layer, thereby strengthening the layer\nto pay more attention to the image itself. With extensive experiments, EAH\nshows significant hallucination-mitigating performance on different MLLMs and\nmetrics, proving its effectiveness and generality.\n","authors":["Xiaofeng Zhang","Yihao Quan","Chaochen Gu","Chen Shen","Xiaosong Yuan","Shaotian Yan","Hao Cheng","Kaijie Wu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2411.09968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01613v3","updated":"2024-11-15T05:50:55Z","published":"2024-05-31T00:08:09Z","title":"QuST: QuPath Extension for Integrative Whole Slide Image and Spatial\n Transcriptomics Analysis","summary":" The integration of AI in digital pathology, particularly in whole slide image\n(WSI) and spatial transcriptomics (ST) analysis, holds immense potential for\nenhancing our understanding of diseases. Despite challenges such as training\npattern preparation and resolution disparities, the convergence of these\ntechnologies can unlock new insights. We introduce QuST, a tool that bridges\nthe gap between WSI and ST, underscoring the transformative power of this\nintegrated approach in disease biology.\n","authors":["Chao-Hui Huang","Sara Lichtarge","Diane Fernandez"],"pdf_url":"https://arxiv.org/pdf/2406.01613v3.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.05531v3","updated":"2024-11-15T05:49:30Z","published":"2024-09-09T11:43:35Z","title":"HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion\n Field Alignment","summary":" Optical flow estimation is a fundamental and long-standing visual task. 
In\nthis work, we present a novel method, dubbed HMAFlow, to improve optical flow\nestimation in challenging scenes, particularly those involving small objects.\nThe proposed model mainly consists of two core components: a Hierarchical\nMotion Field Alignment (HMA) module and a Correlation Self-Attention (CSA)\nmodule. In addition, we rebuild 4D cost volumes by employing a Multi-Scale\nCorrelation Search (MCS) layer and replacing average pooling in common cost\nvolumes with a search strategy utilizing multiple search ranges. Experimental\nresults demonstrate that our model achieves the best generalization performance\ncompared to other state-of-the-art methods. Specifically, compared with RAFT,\nour method achieves relative error reductions of 14.2% and 3.4% on the clean\npass and final pass of the Sintel online benchmark, respectively. On the KITTI\ntest benchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by relative\nmargins of 6.8% and 7.7%, respectively. To facilitate future research, our code\nwill be made available at https://github.com/BooTurbo/HMAFlow.\n","authors":["Dianbo Ma","Kousuke Imamura","Ziyan Gao","Xiangjie Wang","Satoshi Yamane"],"pdf_url":"https://arxiv.org/pdf/2409.05531v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09955v1","updated":"2024-11-15T05:18:15Z","published":"2024-11-15T05:18:15Z","title":"Instruction-Guided Editing Controls for Images and Multimedia: A Survey\n in LLM era","summary":" The rapid advancement of large language models (LLMs) and multimodal learning\nhas transformed digital content creation and manipulation. Traditional visual\nediting tools require significant expertise, limiting accessibility. Recent\nstrides in instruction-based editing have enabled intuitive interaction with\nvisual content, using natural language as a bridge between user intent and\ncomplex editing operations. 
This survey provides an overview of these\ntechniques, focusing on how LLMs and multimodal models empower users to achieve\nprecise visual modifications without deep technical knowledge. By synthesizing\nover 100 publications, we explore methods from generative adversarial networks\nto diffusion models, examining multimodal integration for fine-grained content\ncontrol. We discuss practical applications across domains such as fashion, 3D\nscene manipulation, and video synthesis, highlighting increased accessibility\nand alignment with human intuition. Our survey compares existing literature,\nemphasizing LLM-empowered editing, and identifies key challenges to stimulate\nfurther research. We aim to democratize powerful visual editing across various\nindustries, from entertainment to education. Interested readers are encouraged\nto access our repository at\nhttps://github.com/tamlhp/awesome-instruction-editing.\n","authors":["Thanh Tam Nguyen","Zhao Ren","Trinh Pham","Phi Le Nguyen","Hongzhi Yin","Quoc Viet Hung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09952v1","updated":"2024-11-15T05:09:20Z","published":"2024-11-15T05:09:20Z","title":"GGAvatar: Reconstructing Garment-Separated 3D Gaussian Splatting Avatars\n from Monocular Video","summary":" Avatar modelling has broad applications in human animation and virtual\ntry-ons. Recent advancements in this field have focused on high-quality and\ncomprehensive human reconstruction but often overlook the separation of\nclothing from the body. To bridge this gap, this paper introduces GGAvatar\n(Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular\nvideos. Through advanced parameterized templates and unique phased training,\nthis model effectively achieves decoupled, editable, and realistic\nreconstruction of clothed humans. 
Comparative evaluations with other costly\nmodels confirm GGAvatar's superior quality and efficiency in modelling both\nclothed humans and separable garments. The paper also showcases applications in\nclothing editing, as illustrated in Figure 1, highlighting the model's benefits\nand the advantages of effective disentanglement. The code is available at\nhttps://github.com/J-X-Chen/GGAvatar/.\n","authors":["Jingxuan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09952v1.pdf","comment":"MMAsia'24 Accepted"},{"id":"http://arxiv.org/abs/2408.08092v3","updated":"2024-11-15T05:01:34Z","published":"2024-08-15T11:34:53Z","title":"SC3D: Label-Efficient Outdoor 3D Object Detection via Single Click\n Annotation","summary":" LiDAR-based outdoor 3D object detection has received widespread attention.\nHowever, training 3D detectors from the LiDAR point cloud typically relies on\nexpensive bounding box annotations. This paper presents SC3D, an innovative\nlabel-efficient method requiring only a single coarse click on the bird's eye\nview of the 3D point cloud for each frame. A key challenge here is the absence\nof complete geometric descriptions of the target objects from such simple click\nannotations. To address this issue, our proposed SC3D adopts a progressive\npipeline. Initially, we design a mixed pseudo-label generation module that\nexpands limited click annotations into a mixture of bounding box and semantic\nmask supervision. Next, we propose a mix-supervised teacher model, enabling the\ndetector to learn mixed supervision information. 
Finally, we introduce a\nmixed-supervised student network that leverages the teacher model's\ngeneralization ability to learn unclicked instances.Experimental results on the\nwidely used nuScenes and KITTI datasets demonstrate that our SC3D with only\ncoarse clicks, which requires only 0.2% annotation cost, achieves\nstate-of-the-art performance compared to weakly-supervised 3D detection\nmethods.The code will be made publicly available.\n","authors":["Qiming Xia","Hongwei Lin","Wei Ye","Hai Wu","Yadan Luo","Cheng Wang","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10347v2","updated":"2024-11-15T04:44:40Z","published":"2024-05-16T02:00:44Z","title":"Networking Systems for Video Anomaly Detection: A Tutorial and Survey","summary":" The increasing utilization of surveillance cameras in smart cities, coupled\nwith the surge of online video applications, has heightened concerns regarding\npublic security and privacy protection, which propelled automated Video Anomaly\nDetection (VAD) into a fundamental research task within the Artificial\nIntelligence (AI) community. With the advancements in deep learning and edge\ncomputing, VAD has made significant progress and advances synergized with\nemerging applications in smart cities and video internet, which has moved\nbeyond the conventional research scope of algorithm engineering to deployable\nNetworking Systems for VAD (NSVAD), a practical hotspot for intersection\nexploration in the AI, IoVT, and computing fields. In this article, we\ndelineate the foundational assumptions, learning frameworks, and applicable\nscenarios of various deep learning-driven VAD routes, offering an exhaustive\ntutorial for novices in NSVAD. 
This article elucidates core concepts by\nreviewing recent advances and typical solutions and aggregating available\nresearch resources accessible at https://github.com/fdjingliu/NSVAD.\nAdditionally, we showcase our latest NSVAD research in industrial IoT and smart\ncities, along with an end-cloud collaborative architecture for deployable\nNSVAD. Lastly, this article projects future development trends and discusses\nhow the integration of AI and computing technologies can address existing\nresearch challenges and promote open opportunities, serving as an insightful\nguide for prospective researchers and engineers.\n","authors":["Jing Liu","Yang Liu","Jieyu Lin","Jielin Li","Liang Cao","Peng Sun","Bo Hu","Liang Song","Azzedine Boukerche","Victor C. M. Leung"],"pdf_url":"https://arxiv.org/pdf/2405.10347v2.pdf","comment":"Revised to ACM Computing Surveys, under review, for more information\n and supplementary material, please see https://github.com/fdjingliu/NSVAD"},{"id":"http://arxiv.org/abs/2411.09933v1","updated":"2024-11-15T04:16:50Z","published":"2024-11-15T04:16:50Z","title":"JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by\n Evolutionary Optimization of Model Merging","summary":" With the rapid advancement of large language models (LLMs), foundational\nmodels (FMs) have seen significant advancements. Healthcare is one of the most\ncrucial application areas for these FMs, given the significant time and effort\nrequired for physicians to analyze large volumes of patient data. Recent\nefforts have focused on adapting multimodal FMs to the medical domain through\ntechniques like instruction-tuning, leading to the development of medical\nfoundation models (MFMs). 
However, these approaches typically require large\namounts of training data to effectively adapt models to the medical field.\nMoreover, most existing models are trained on English datasets, limiting their\npracticality in non-English-speaking regions where healthcare professionals and\npatients are not always fluent in English. The need for translation introduces\nadditional costs and inefficiencies. To address these challenges, we propose a\n\\textbf{J}apanese \\textbf{Radi}ology report generation model enhanced by\n\\textbf{Evo}lutionary optimization of model merging (JRadiEvo). This is the\nfirst attempt to extend a non-medical vision-language foundation model to the\nmedical domain through evolutionary optimization of model merging. We\nsuccessfully created a model that generates accurate Japanese reports from\nX-ray images using only 50 translated samples from publicly available data.\nThis model, developed with highly efficient use of limited data, outperformed\nleading models from recent research trained on much larger datasets.\nAdditionally, with only 8 billion parameters, this relatively compact\nfoundation model can be deployed locally within hospitals, making it a\npractical solution for environments where APIs and other external services\ncannot be used due to strict privacy and security requirements.\n","authors":["Kaito Baba","Ryota Yagi","Junichiro Takahashi","Risa Kishikawa","Satoshi Kodera"],"pdf_url":"https://arxiv.org/pdf/2411.09933v1.pdf","comment":"Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical\n Foundation Models: Explainability, Robustness, Security, and Beyond"},{"id":"http://arxiv.org/abs/2410.14994v2","updated":"2024-11-15T04:13:55Z","published":"2024-10-19T05:50:12Z","title":"Quanta Video Restoration","summary":" The proliferation of single-photon image sensors has opened the door to a\nplethora of high-speed and low-light imaging applications. 
However, data\ncollected by these sensors are often 1-bit or few-bit, and corrupted by noise\nand strong motion. Conventional video restoration methods are not designed to\nhandle this situation, while specialized quanta burst algorithms have limited\nperformance when the number of input frames is low. In this paper, we introduce\nQuanta Video Restoration (QUIVER), an end-to-end trainable network built on the\ncore ideas of classical quanta restoration methods, i.e., pre-filtering, flow\nestimation, fusion, and refinement. We also collect and publish I2-2000FPS, a\nhigh-speed video dataset with the highest temporal resolution of 2000\nframes-per-second, for training and testing. On simulated and real data, QUIVER\noutperforms existing quanta restoration methods by a significant margin. Code\nand dataset available at\nhttps://github.com/chennuriprateek/Quanta_Video_Restoration-QUIVER-\n","authors":["Prateek Chennuri","Yiheng Chi","Enze Jiang","G. M. Dilshan Godaliyadda","Abhiram Gnanasambandam","Hamid R. Sheikh","Istvan Gyongy","Stanley H. Chan"],"pdf_url":"https://arxiv.org/pdf/2410.14994v2.pdf","comment":"Accepted at European Conference on Computer Vision (ECCV) 2024,\n Milano, Italy, Sept 29 - Oct 4, 2024, Part XL, LNCS 15098"},{"id":"http://arxiv.org/abs/2408.07999v2","updated":"2024-11-15T04:09:34Z","published":"2024-08-15T07:56:02Z","title":"Co-Fix3D: Enhancing 3D Object Detection with Collaborative Refinement","summary":" 3D object detection in driving scenarios faces the challenge of complex road\nenvironments, which can lead to the loss or incompleteness of key features,\nthereby affecting perception performance. To address this issue, we propose an\nadvanced detection framework called Co-Fix3D. Co-Fix3D integrates Local and\nGlobal Enhancement (LGE) modules to refine Bird's Eye View (BEV) features. 
The\nLGE module uses Discrete Wavelet Transform (DWT) for pixel-level local\noptimization and incorporates an attention mechanism for global optimization.\nTo handle varying detection difficulties, we adopt multi-head LGE modules,\nenabling each module to focus on targets with different levels of detection\ncomplexity, thus further enhancing overall perception capability. Experimental\nresults show that on the nuScenes dataset's LiDAR benchmark, Co-Fix3D achieves\n69.4\\% mAP and 73.5\\% NDS, while on the multimodal benchmark, it achieves\n72.3\\% mAP and 74.7\\% NDS. The source code is publicly available at\n\\href{https://github.com/rubbish001/Co-Fix3d}{https://github.com/rubbish001/Co-Fix3d}.\n","authors":["Wenxuan Li","Qin Zou","Chi Chen","Bo Du","Long Chen","Jian Zhou","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2408.07999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09924v1","updated":"2024-11-15T03:47:36Z","published":"2024-11-15T03:47:36Z","title":"A Polarization Image Dehazing Method Based on the Principle of Physical\n Diffusion","summary":" Computer vision is increasingly used in areas such as unmanned vehicles,\nsurveillance systems and remote sensing. However, in foggy scenarios, image\ndegradation leads to loss of target details, which seriously affects the\naccuracy and effectiveness of these vision tasks. Polarized light, due to the\nfact that its electromagnetic waves vibrate in a specific direction, is able to\nresist scattering and refraction effects in complex media more effectively\ncompared to unpolarized light. As a result, polarized light has a greater\nability to maintain its polarization characteristics in complex transmission\nmedia and under long-distance imaging conditions. 
This property makes polarized\nimaging especially suitable for complex scenes such as outdoor and underwater,\nespecially in foggy environments, where higher quality images can be obtained.\nBased on this advantage, we propose an innovative semi-physical polarization\ndehazing method that does not rely on an external light source. The method\nsimulates the diffusion process of fog and designs a diffusion kernel that\ncorresponds to the image blurriness caused by this diffusion. By employing\nspatiotemporal Fourier transforms and deconvolution operations, the method\nrecovers the state of fog droplets prior to diffusion and the light inversion\ndistribution of objects. This approach effectively achieves dehazing and detail\nenhancement of the scene.\n","authors":["Zhenjun Zhang","Lijun Tang","Hongjin Wang","Lilian Zhang","Yunze He","Yaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09921v1","updated":"2024-11-15T03:45:09Z","published":"2024-11-15T03:45:09Z","title":"Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at\n Pixel Level","summary":" In this paper, we introduce Motion-Grounded Video Reasoning, a new motion\nunderstanding task that requires generating visual answers (video segmentation\nmasks) according to the input question, and hence needs implicit spatiotemporal\nreasoning and grounding. This task extends existing spatiotemporal grounding\nwork focusing on explicit action/motion grounding, to a more general format by\nenabling implicit reasoning via questions. To facilitate the development of the\nnew task, we collect a large-scale dataset called GROUNDMORE, which comprises\n1,715 video clips, 249K object masks that are deliberately designed with 4\nquestion types (Causal, Sequential, Counterfactual, and Descriptive) for\nbenchmarking deep and comprehensive motion reasoning abilities. 
GROUNDMORE\nuniquely requires models to generate visual answers, providing a more concrete\nand visually interpretable response than plain texts. It evaluates models on\nboth spatiotemporal grounding and reasoning, fostering to address complex\nchallenges in motion-related video reasoning, temporal perception, and\npixel-level understanding. Furthermore, we introduce a novel baseline model\nnamed Motion-Grounded Video Reasoning Assistant (MORA). MORA incorporates the\nmultimodal reasoning ability from the Multimodal LLM, the pixel-level\nperception capability from the grounding model (SAM), and the temporal\nperception ability from a lightweight localization head. MORA achieves\nrespectable performance on GROUNDMORE outperforming the best existing visual\ngrounding baseline model by an average of 21.5% relatively. We hope this novel\nand challenging task will pave the way for future advancements in robust and\ngeneral motion understanding via video reasoning segmentation\n","authors":["Andong Deng","Tongjia Chen","Shoubin Yu","Taojiannan Yang","Lincoln Spencer","Yapeng Tian","Ajmal Saeed Mian","Mohit Bansal","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02968v2","updated":"2024-11-15T03:28:16Z","published":"2024-06-05T05:52:20Z","title":"GSGAN: Adversarial Learning for Hierarchical Generation of 3D Gaussian\n Splats","summary":" Most advances in 3D Generative Adversarial Networks (3D GANs) largely depend\non ray casting-based volume rendering, which incurs demanding rendering costs.\nOne promising alternative is rasterization-based 3D Gaussian Splatting (3D-GS),\nproviding a much faster rendering speed and explicit 3D representation. In this\npaper, we exploit Gaussian as a 3D representation for 3D GANs by leveraging its\nefficient and explicit characteristics. 
However, in an adversarial framework,\nwe observe that a na\\\"ive generator architecture suffers from training\ninstability and lacks the capability to adjust the scale of Gaussians. This\nleads to model divergence and visual artifacts due to the absence of proper\nguidance for initialized positions of Gaussians and densification to manage\ntheir scales adaptively. To address these issues, we introduce a generator\narchitecture with a hierarchical multi-scale Gaussian representation that\neffectively regularizes the position and scale of generated Gaussians.\nSpecifically, we design a hierarchy of Gaussians where finer-level Gaussians\nare parameterized by their coarser-level counterparts; the position of\nfiner-level Gaussians would be located near their coarser-level counterparts,\nand the scale would monotonically decrease as the level becomes finer, modeling\nboth coarse and fine details of the 3D scene. Experimental results demonstrate\nthat ours achieves a significantly faster rendering speed (x100) compared to\nstate-of-the-art 3D consistent GANs with comparable 3D generation capability.\nProject page: https://hse1032.github.io/gsgan.\n","authors":["Sangeek Hyun","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2406.02968v2.pdf","comment":"NeurIPS 2024 / Project page: https://hse1032.github.io/gsgan"},{"id":"http://arxiv.org/abs/2411.09914v1","updated":"2024-11-15T03:22:44Z","published":"2024-11-15T03:22:44Z","title":"mmSpyVR: Exploiting mmWave Radar for Penetrating Obstacles to Uncover\n Privacy Vulnerability of Virtual Reality","summary":" Virtual reality (VR), while enhancing user experiences, introduces\nsignificant privacy risks. This paper reveals a novel vulnerability in VR\nsystems that allows attackers to capture VR privacy through obstacles utilizing\nmillimeter-wave (mmWave) signals without physical intrusion and virtual\nconnection with the VR devices. We propose mmSpyVR, a novel attack on VR user's\nprivacy via mmWave radar. 
The mmSpyVR framework encompasses two main parts: (i)\nA transfer learning-based feature extraction model to achieve VR feature\nextraction from mmWave signal. (ii) An attention-based VR privacy spying module\nto spy VR privacy information from the extracted feature. The mmSpyVR\ndemonstrates the capability to extract critical VR privacy from the mmWave\nsignals that have penetrated through obstacles. We evaluate mmSpyVR through\nIRB-approved user studies. Across 22 participants engaged in four experimental\nscenes utilizing VR devices from three different manufacturers, our system\nachieves an application recognition accuracy of 98.5\\% and keystroke\nrecognition accuracy of 92.6\\%. This newly discovered vulnerability has\nimplications across various domains, such as cybersecurity, privacy protection,\nand VR technology development. We also engage with VR manufacturer Meta to\ndiscuss and explore potential mitigation strategies. Data and code are publicly\navailable for scrutiny and research at https://github.com/luoyumei1-a/mmSpyVR/\n","authors":["Luoyu Mei","Ruofeng Liu","Zhimeng Yin","Qingchuan Zhao","Wenchao Jiang","Shuai Wang","Kangjie Lu","Tian He"],"pdf_url":"https://arxiv.org/pdf/2411.09914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07571v2","updated":"2024-11-15T03:20:57Z","published":"2024-10-10T03:12:03Z","title":"How Does Vision-Language Adaptation Impact the Safety of Vision Language\n Models?","summary":" Vision-Language adaptation (VL adaptation) transforms Large Language Models\n(LLMs) into Large Vision-Language Models (LVLMs) for multimodal tasks, but this\nprocess often compromises the inherent safety capabilities embedded in the\noriginal LLMs. Despite potential harmfulness due to weakened safety measures,\nin-depth analysis on the effects of VL adaptation on safety remains\nunder-explored. This study examines how VL adaptation influences safety and\nevaluates the impact of safety fine-tuning methods. 
Our analysis reveals that\nsafety degradation occurs during VL adaptation, even when the training data is\nsafe. While safety tuning techniques like supervised fine-tuning with safety\ndatasets or reinforcement learning from human feedback mitigate some risks,\nthey still lead to safety degradation and a reduction in helpfulness due to\nover-rejection issues. Further analysis of internal model weights suggests that\nVL adaptation may impact certain safety-related layers, potentially lowering\noverall safety levels. Additionally, our findings demonstrate that the\nobjectives of VL adaptation and safety tuning are divergent, which often\nresults in their simultaneous application being suboptimal. To address this, we\nsuggest the weight merging approach as an optimal solution effectively reducing\nsafety degradation while maintaining helpfulness. These insights help guide the\ndevelopment of more reliable and secure LVLMs for real-world applications.\n","authors":["Seongyun Lee","Geewook Kim","Jiyeon Kim","Hyunji Lee","Hoyeon Chang","Sue Hyun Park","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2410.07571v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2411.09911v1","updated":"2024-11-15T03:14:11Z","published":"2024-11-15T03:14:11Z","title":"DiffFNO: Diffusion Fourier Neural Operator","summary":" We introduce DiffFNO, a novel diffusion framework for arbitrary-scale\nsuper-resolution strengthened by a Weighted Fourier Neural Operator (WFNO).\nMode Re-balancing in WFNO effectively captures critical frequency components,\nsignificantly improving the reconstruction of high-frequency image details that\nare crucial for super-resolution tasks. Gated Fusion Mechanism (GFM) adaptively\ncomplements WFNO's spectral features with spatial features from an\nAttention-based Neural Operator (AttnNO). This enhances the network's\ncapability to capture both global structures and local details. 
Adaptive\nTime-Step (ATS) ODE solver, a deterministic sampling strategy, accelerates\ninference without sacrificing output quality by dynamically adjusting\nintegration step sizes ATS. Extensive experiments demonstrate that DiffFNO\nachieves state-of-the-art (SOTA) results, outperforming existing methods across\nvarious scaling factors by a margin of 2 to 4 dB in PSNR, including those\nbeyond the training distribution. It also achieves this at lower inference\ntime. Our approach sets a new standard in super-resolution, delivering both\nsuperior accuracy and computational efficiency.\n","authors":["Xiaoyi Liu","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2411.09911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13371v2","updated":"2024-11-15T03:07:13Z","published":"2024-10-17T09:23:30Z","title":"Feature Extraction Reimagined: Achieving Superior Accuracy in Camera\n Calibration","summary":" Camera calibration is crucial for 3D vision applications. This paper focuses\non improving the accuracy of feature extraction, which is a key step in\ncalibration. We address the aliasing problem of star-shaped pattern by\nintroducing a novel dynamic calibration target that synthesizes multiple\ncheckerboard patterns of different angle around pattern center, which\nsignificantly improves feature refinement accuracy. Additionally, we propose a\nnovel cost function of feature refinement that accounts for defocus effect,\noffering a more physically realistic model compared to existing symmetry based\nmethod, experiment on a large dataset demonstrate significant improvements in\ncalibration accuracy with reduced computation time. 
Our code is available from\nhttps://github.com/spdfghi/Feature-Extraction-Reimagined-Achieving-Superior-Accuracy-in-Camera-Calibration.git.\n","authors":["Zezhun Shi"],"pdf_url":"https://arxiv.org/pdf/2410.13371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17601v3","updated":"2024-11-15T02:56:58Z","published":"2024-09-26T07:35:23Z","title":"CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for\n Backdoor Defense in Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. 
Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03634v3","updated":"2024-11-15T02:40:34Z","published":"2024-07-04T04:54:03Z","title":"SOWA: Adapting Hierarchical Frozen Window Self-Attention to\n Visual-Language Models for Better Anomaly Detection","summary":" Visual anomaly detection is essential in industrial manufacturing, yet\ntraditional methods often rely heavily on extensive normal datasets and\ntask-specific models, limiting their scalability. Recent advancements in\nlarge-scale vision-language models have significantly enhanced zero- and\nfew-shot anomaly detection. However, these approaches may not fully leverage\nhierarchical features, potentially overlooking nuanced details crucial for\naccurate detection. To address this, we introduce a novel window self-attention\nmechanism based on the CLIP model, augmented with learnable prompts to process\nmulti-level features within a Soldier-Officer Window Self-Attention (SOWA)\nframework. Our method has been rigorously evaluated on five benchmark datasets,\nachieving superior performance by leading in 18 out of 20 metrics, setting a\nnew standard against existing state-of-the-art techniques.\n","authors":["Zongxiang Hu","Zhaosheng Zhang","Jianwen Xie"],"pdf_url":"https://arxiv.org/pdf/2407.03634v3.pdf","comment":"8 pages, 9 figures, conference"},{"id":"http://arxiv.org/abs/2411.09894v1","updated":"2024-11-15T02:38:00Z","published":"2024-11-15T02:38:00Z","title":"Free Lunch in Pathology Foundation Model: Task-specific Model Adaptation\n with Concept-Guided Feature Enhancement","summary":" Whole slide image (WSI) analysis is gaining prominence within the medical\nimaging field. 
Recent advances in pathology foundation models have shown the\npotential to extract powerful feature representations from WSIs for downstream\ntasks. However, these foundation models are usually designed for\ngeneral-purpose pathology image analysis and may not be optimal for specific\ndownstream tasks or cancer types. In this work, we present Concept\nAnchor-guided Task-specific Feature Enhancement (CATE), an adaptable paradigm\nthat can boost the expressivity and discriminativeness of pathology foundation\nmodels for specific downstream tasks. Based on a set of task-specific concepts\nderived from the pathology vision-language model with expert-designed prompts,\nwe introduce two interconnected modules to dynamically calibrate the generic\nimage features extracted by foundation models for certain tasks or cancer\ntypes. Specifically, we design a Concept-guided Information Bottleneck module\nto enhance task-relevant characteristics by maximizing the mutual information\nbetween image features and concept anchors while suppressing superfluous\ninformation. Moreover, a Concept-Feature Interference module is proposed to\nutilize the similarity between calibrated features and concept anchors to\nfurther generate discriminative task-specific features. The extensive\nexperiments on public WSI datasets demonstrate that CATE significantly enhances\nthe performance and generalizability of MIL models. Additionally, heatmap and\numap visualization results also reveal the effectiveness and interpretability\nof CATE. 
The source code is available at https://github.com/HKU-MedAI/CATE.\n","authors":["Yanyan Huang","Weiqin Zhao","Yihang Chen","Yu Fu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2411.09894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09893v1","updated":"2024-11-15T02:37:14Z","published":"2024-11-15T02:37:14Z","title":"Memory Proxy Maps for Visual Navigation","summary":" Visual navigation takes inspiration from humans, who navigate in previously\nunseen environments using vision without detailed environment maps. Inspired by\nthis, we introduce a novel no-RL, no-graph, no-odometry approach to visual\nnavigation using feudal learning to build a three tiered agent. Key to our\napproach is a memory proxy map (MPM), an intermediate representation of the\nenvironment learned in a self-supervised manner by the high-level manager agent\nthat serves as a simplified memory, approximating what the agent has seen. We\ndemonstrate that recording observations in this learned latent space is an\neffective and efficient memory proxy that can remove the need for graphs and\nodometry in visual navigation tasks. For the mid-level manager agent, we\ndevelop a waypoint network (WayNet) that outputs intermediate subgoals, or\nwaypoints, imitating human waypoint selection during local navigation. For the\nlow-level worker agent, we learn a classifier over a discrete action space that\navoids local obstacles and moves the agent towards the WayNet waypoint. 
The\nresulting feudal navigation network offers a novel approach with no RL, no\ngraph, no odometry, and no metric map; all while achieving SOTA results on the\nimage goal navigation task.\n","authors":["Faith Johnson","Bryan Bo Cao","Ashwin Ashok","Shubham Jain","Kristin Dana"],"pdf_url":"https://arxiv.org/pdf/2411.09893v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.12498"},{"id":"http://arxiv.org/abs/2406.13977v2","updated":"2024-11-15T02:32:57Z","published":"2024-06-20T03:54:41Z","title":"Similarity-aware Syncretic Latent Diffusion Model for Medical Image\n Translation with Representation Learning","summary":" Non-contrast CT (NCCT) imaging may reduce image contrast and anatomical\nvisibility, potentially increasing diagnostic uncertainty. In contrast,\ncontrast-enhanced CT (CECT) facilitates the observation of regions of interest\n(ROI). Leading generative models, especially the conditional diffusion model,\ndemonstrate remarkable capabilities in medical image modality transformation.\nTypical conditional diffusion models commonly generate images with guidance of\nsegmentation labels for medical modal transformation. Limited access to\nauthentic guidance and its low cardinality can pose challenges to the practical\nclinical application of conditional diffusion models. To achieve an equilibrium\nof generative quality and clinical practices, we propose a novel Syncretic\ngenerative model based on the latent diffusion model for medical image\ntranslation (S$^2$LDM), which can realize high-fidelity reconstruction without\ndemand of additional condition during inference. S$^2$LDM enhances the\nsimilarity in distinct modal images via syncretic encoding and diffusing,\npromoting amalgamated information in the latent space and generating medical\nimages with more details in contrast-enhanced regions. 
However, syncretic\nlatent spaces in the frequency domain tend to favor lower frequencies, commonly\nlocate in identical anatomic structures. Thus, S$^2$LDM applies adaptive\nsimilarity loss and dynamic similarity to guide the generation and supplements\nthe shortfall in high-frequency details throughout the training process.\nQuantitative experiments confirm the effectiveness of our approach in medical\nimage translation. Our code will release lately.\n","authors":["Tingyi Lin","Pengju Lyu","Jie Zhang","Yuqing Wang","Cheng Wang","Jianjun Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.13977v2.pdf","comment":"We decide to modify the majority of the content"},{"id":"http://arxiv.org/abs/2405.09083v2","updated":"2024-11-15T02:17:07Z","published":"2024-05-15T04:22:27Z","title":"RSHazeDiff: A Unified Fourier-aware Diffusion Model for Remote Sensing\n Image Dehazing","summary":" Haze severely degrades the visual quality of remote sensing images and\nhampers the performance of road extraction, vehicle detection, and traffic flow\nmonitoring. The emerging denoising diffusion probabilistic model (DDPM)\nexhibits the significant potential for dense haze removal with its strong\ngeneration ability. Since remote sensing images contain extensive small-scale\ntexture structures, it is important to effectively restore image details from\nhazy images. However, current wisdom of DDPM fails to preserve image details\nand color fidelity well, limiting its dehazing capacity for remote sensing\nimages. In this paper, we propose a novel unified Fourier-aware diffusion model\nfor remote sensing image dehazing, termed RSHazeDiff. From a new perspective,\nRSHazeDiff explores the conditional DDPM to improve image quality in dense hazy\nscenarios, and it makes three key contributions. First, RSHazeDiff refines the\ntraining phase of diffusion process by performing noise estimation and\nreconstruction constraints in a coarse-to-fine fashion. 
Thus, it remedies the\nunpleasing results caused by the simple noise estimation constraint in DDPM.\nSecond, by taking the frequency information as important prior knowledge during\niterative sampling steps, RSHazeDiff can preserve more texture details and\ncolor fidelity in dehazed images. Third, we design a global compensated\nlearning module to utilize the Fourier transform to capture the global\ndependency features of input images, which can effectively mitigate the effects\nof boundary artifacts when processing fixed-size patches. Experiments on both\nsynthetic and real-world benchmarks validate the favorable performance of\nRSHazeDiff over state-of-the-art methods. Source code will be released at\nhttps://github.com/jm-xiong/RSHazeDiff.\n","authors":["Jiamei Xiong","Xuefeng Yan","Yongzhen Wang","Wei Zhao","Xiao-Ping Zhang","Mingqiang Wei"],"pdf_url":"https://arxiv.org/pdf/2405.09083v2.pdf","comment":"IEEE TITS; 15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.15608v2","updated":"2024-11-15T01:43:35Z","published":"2023-05-24T22:51:52Z","title":"Semantic Segmentation by Semantic Proportions","summary":" Semantic segmentation is a critical task in computer vision aiming to\nidentify and classify individual pixels in an image, with numerous applications\nin for example autonomous driving and medical image analysis. However, semantic\nsegmentation can be highly challenging particularly due to the need for large\namounts of annotated data. Annotating images is a time-consuming and costly\nprocess, often requiring expert knowledge and significant effort; moreover,\nsaving the annotated images could dramatically increase the storage space. In\nthis paper, we propose a novel approach for semantic segmentation, requiring\nthe rough information of individual semantic class proportions, shortened as\nsemantic proportions, rather than the necessity of ground-truth segmentation\nmaps. 
This greatly simplifies the data annotation process and thus will\nsignificantly reduce the annotation time, cost and storage space, opening up\nnew possibilities for semantic segmentation tasks where obtaining the full\nground-truth segmentation maps may not be feasible or practical. Our proposed\nmethod of utilising semantic proportions can (i) further be utilised as a\nbooster in the presence of ground-truth segmentation maps to gain performance\nwithout extra data and model complexity, and (ii) also be seen as a\nparameter-free plug-and-play module, which can be attached to existing deep\nneural networks designed for semantic segmentation. Extensive experimental\nresults demonstrate the good performance of our method compared to benchmark\nmethods that rely on ground-truth segmentation maps. Utilising semantic\nproportions suggested in this work offers a promising direction for future\nsemantic segmentation research.\n","authors":["Halil Ibrahim Aysel","Xiaohao Cai","Adam Prügel-Bennett"],"pdf_url":"https://arxiv.org/pdf/2305.15608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09871v1","updated":"2024-11-15T01:32:19Z","published":"2024-11-15T01:32:19Z","title":"Content-Aware Preserving Image Generation","summary":" Remarkable progress has been achieved in image generation with the\nintroduction of generative models. However, precisely controlling the content\nin generated images remains a challenging task due to their fundamental\ntraining objective. This paper addresses this challenge by proposing a novel\nimage generation framework explicitly designed to incorporate desired content\nin output images. 
The framework utilizes advanced encoding techniques,\nintegrating subnetworks called content fusion and frequency encoding modules.\nThe frequency encoding module first captures features and structures of\nreference images by exclusively focusing on selected frequency components.\nSubsequently, the content fusion module generates a content-guiding vector that\nencapsulates desired content features. During the image generation process,\ncontent-guiding vectors from real images are fused with projected noise\nvectors. This ensures the production of generated images that not only maintain\nconsistent content from guiding images but also exhibit diverse stylistic\nvariations. To validate the effectiveness of the proposed framework in\npreserving content attributes, extensive experiments are conducted on widely\nused benchmark datasets, including Flickr-Faces-High Quality, Animal Faces High\nQuality, and Large-scale Scene Understanding datasets.\n","authors":["Giang H. Le","Anh Q. Nguyen","Byeongkeun Kang","Yeejin Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09871v1.pdf","comment":"35 pages, 12 figures, 1 table, journal"},{"id":"http://arxiv.org/abs/2411.09863v1","updated":"2024-11-15T01:00:00Z","published":"2024-11-15T01:00:00Z","title":"Face De-identification: State-of-the-art Methods and Comparative Studies","summary":" The widespread use of image acquisition technologies, along with advances in\nfacial recognition, has raised serious privacy concerns. Face de-identification\nusually refers to the process of concealing or replacing personal identifiers,\nwhich is regarded as an effective means to protect the privacy of facial\nimages. A significant number of methods for face de-identification have been\nproposed in recent years. In this survey, we provide a comprehensive review of\nstate-of-the-art face de-identification methods, categorized into three levels:\npixel-level, representation-level, and semantic-level techniques. 
We\nsystematically evaluate these methods based on two key criteria, the\neffectiveness of privacy protection and preservation of image utility,\nhighlighting their advantages and limitations. Our analysis includes\nqualitative and quantitative comparisons of the main algorithms, demonstrating\nthat deep learning-based approaches, particularly those using Generative\nAdversarial Networks (GANs) and diffusion models, have achieved significant\nadvancements in balancing privacy and utility. Experimental results reveal that\nwhile recent methods demonstrate strong privacy protection, trade-offs remain\nin visual fidelity and computational complexity. This survey not only\nsummarizes the current landscape but also identifies key challenges and future\nresearch directions in face de-identification.\n","authors":["Jingyi Cao","Xiangyi Chen","Bo Liu","Ming Ding","Rong Xie","Li Song","Zhu Li","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09858v1","updated":"2024-11-15T00:37:29Z","published":"2024-11-15T00:37:29Z","title":"Masked Image Contrastive Learning for Efficient Visual Conceptual\n Pre-training","summary":" This paper proposes a scalable and straightforward pre-training paradigm for\nefficient visual conceptual representation called masked image contrastive\nlearning (MiCL). Our MiCL approach is simple: we randomly mask patches to\ngenerate different views within an image and contrast them among a mini-batch\nof images. The core idea behind MiCL consists of two designs. 
First, masked\ntokens have the potential to significantly diminish the conceptual redundancy\ninherent in images, and create distinct views with substantial fine-grained\ndifferences on the semantic concept level instead of the instance level.\nSecond, contrastive learning is adept at extracting high-level semantic\nconceptual features during the pre-training, circumventing the high-frequency\ninterference and additional costs associated with image reconstruction.\nImportantly, MiCL learns highly semantic conceptual representations efficiently\nwithout relying on hand-crafted data augmentations or additional auxiliary\nmodules. Empirically, MiCL demonstrates high scalability with Vision\nTransformers, as the ViT-L/16 can complete pre-training in 133 hours using only\n4 A100 GPUs, achieving 85.8% accuracy in downstream fine-tuning tasks.\n","authors":["Xiaoyu Yang","Lijian Xu"],"pdf_url":"https://arxiv.org/pdf/2411.09858v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.09850v1","updated":"2024-11-15T00:06:57Z","published":"2024-11-15T00:06:57Z","title":"Enhancing Diffusion Posterior Sampling for Inverse Problems by\n Integrating Crafted Measurements","summary":" Diffusion models have emerged as a powerful foundation model for visual\ngeneration. With an appropriate sampling process, it can effectively serve as a\ngenerative prior to solve general inverse problems. Current posterior sampling\nbased methods take the measurement (i.e., degraded image sample) into the\nposterior sampling to infer the distribution of the target data (i.e., clean\nimage sample). However, in this manner, we show that high-frequency information\ncan be prematurely introduced during the early stages, which could induce\nlarger posterior estimate errors during the restoration sampling. 
To address\nthis issue, we first reveal that forming the log posterior gradient with the\nnoisy measurement ( i.e., samples from a diffusion forward process) instead of\nthe clean one can benefit the reverse process. Consequently, we propose a novel\ndiffusion posterior sampling method DPS-CM, which incorporates a Crafted\nMeasurement (i.e., samples generated by a reverse denoising process, compared\nto random sampling with noise in standard methods) to form the posterior\nestimate. This integration aims to mitigate the misalignment with the diffusion\nprior caused by cumulative posterior estimate errors. Experimental results\ndemonstrate that our approach significantly improves the overall capacity to\nsolve general and noisy inverse problems, such as Gaussian deblurring,\nsuper-resolution, inpainting, nonlinear deblurring, and tasks with Poisson\nnoise, relative to existing approaches.\n","authors":["Shijie Zhou","Huaisheng Zhu","Rohan Sharma","Ruiyi Zhang","Kaiyi Ji","Changyou Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06683v6","updated":"2024-11-15T00:00:24Z","published":"2023-01-17T03:53:29Z","title":"From Isolation to Collaboration: Federated Class-Heterogeneous Learning\n for Chest X-Ray Classification","summary":" Federated learning (FL) is a promising paradigm to collaboratively train a\nglobal chest x-ray (CXR) classification model using distributed datasets while\npreserving patient privacy. A significant, yet relatively underexplored,\nchallenge in FL is class-heterogeneity, where clients have different sets of\nclasses. We propose surgical aggregation, a FL method that uses selective\naggregation to collaboratively train a global model using distributed,\nclass-heterogeneous datasets. Unlike other methods, our method does not rely on\nthe assumption that clients share the same classes as other clients, know the\nclasses of other clients, or have access to a fully annotated dataset. 
We\nevaluate surgical aggregation using class-heterogeneous CXR datasets across IID\nand non-IID settings. Our results show that our method outperforms current\nmethods and has better generalizability.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2301.06683v6.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.10444v1","updated":"2024-11-15T18:59:45Z","published":"2024-11-15T18:59:45Z","title":"Balancing Passenger Transport and Power Distribution: A Distributed\n Dispatch Policy for Shared Autonomous Electric Vehicles","summary":" Shared autonomous electric vehicles can provide on-demand transportation for\npassengers while also interacting extensively with the electric distribution\nsystem. This interaction is especially beneficial after a disaster when the\nlarge battery capacity of the fleet can be used to restore critical electric\nloads. We develop a dispatch policy that balances the need to continue serving\npassengers (especially critical workers) and the ability to transfer energy\nacross the network. The model predictive control policy tracks both passenger\nand energy flows and provides maximum passenger throughput if any policy can.\nThe resulting mixed integer linear programming problem is difficult to solve\nfor large-scale problems, so a distributed solution approach is developed to\nimprove scalability, privacy, and resilience. We demonstrate that the proposed\nheuristic, based on the alternating direction method of multipliers, is\neffective in achieving near-optimal solutions quickly. The dispatch policy is\nexamined in simulation to demonstrate the ability of vehicles to balance these\ncompeting objectives with benefits to both systems. 
Finally, we compare several\ndispatch behaviors, demonstrating the importance of including operational\nconstraints and objectives from both the transportation and electric systems in\nthe model.\n","authors":["Jake Robbennolt","Meiyi Li","Javad Mohammadi","Stephen D. Boyles"],"pdf_url":"https://arxiv.org/pdf/2411.10444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10431v1","updated":"2024-11-15T18:53:08Z","published":"2024-11-15T18:53:08Z","title":"Mitigating Parameter Degeneracy using Joint Conditional Diffusion Model\n for WECC Composite Load Model in Power Systems","summary":" Data-driven modeling for dynamic systems has gained widespread attention in\nrecent years. Its inverse formulation, parameter estimation, aims to infer the\ninherent model parameters from observations. However, parameter degeneracy,\nwhere different combinations of parameters yield the same observable output,\nposes a critical barrier to accurately and uniquely identifying model\nparameters. In the context of WECC composite load model (CLM) in power systems,\nutility practitioners have observed that CLM parameters carefully selected for\none fault event may not perform satisfactorily in another fault. Here, we\ninnovate a joint conditional diffusion model-based inverse problem solver\n(JCDI), that incorporates a joint conditioning architecture with simultaneous\ninputs of multi-event observations to improve parameter generalizability.\nSimulation studies on the WECC CLM show that the proposed JCDI effectively\nreduces uncertainties of degenerate parameters, thus the parameter estimation\nerror is decreased by 42.1% compared to a single-event learning scheme. This\nenables the model to achieve high accuracy in predicting power trajectories\nunder different fault events, including electronic load tripping and motor\nstalling, outperforming standard deep reinforcement learning and supervised\nlearning approaches. 
We anticipate this work will contribute to mitigating\nparameter degeneracy in system dynamics, providing a general parameter\nestimation framework across various scientific domains.\n","authors":["Feiqin Zhu","Dmitrii Torbunov","Yihui Ren","Zhongjing Jiang","Tianqiao Zhao","Amirthagunaraj Yogarathnam","Meng Yue"],"pdf_url":"https://arxiv.org/pdf/2411.10431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07270v2","updated":"2024-11-15T17:35:27Z","published":"2024-03-12T02:56:31Z","title":"Long-term Hydrothermal Bid-based Market Simulator","summary":" Simulating long-term hydrothermal bid-based markets considering strategic\nagents is a challenging task. The representation of strategic agents\nconsidering intertemporal constraints within a stochastic framework brings\nadditional complexity to the already difficult single-period bilevel, thus,\nnon-convex, optimal bidding problem. Thus, we propose a simulation methodology\nthat effectively addresses these challenges for large-scale hydrothermal power\nsystems. We demonstrate the effectiveness of the framework through a case study\nwith real data from the large-scale Brazilian power system. In the case\nstudies, we show the effects of market concentration in power systems and how\ncontracts can be used to mitigate them. In particular, we show how market power\nmight affect the current setting in Brazil. 
The developed method can strongly\nbenefit policymakers, market monitors, and market designers as simulations can\nbe used to understand existing power systems and experiment with alternative\ndesigns.\n","authors":["Joaquim Dias Garcia","Alexandre Street","Mario Veiga Pereira"],"pdf_url":"https://arxiv.org/pdf/2403.07270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10359v1","updated":"2024-11-15T17:08:35Z","published":"2024-11-15T17:08:35Z","title":"Koopman-based control of nonlinear systems with closed-loop guarantees","summary":" In this paper, we provide a tutorial overview and an extension of a recently\ndeveloped framework for data-driven control of unknown nonlinear systems with\nrigorous closed-loop guarantees. The proposed approach relies on the Koopman\noperator representation of the nonlinear system, for which a bilinear surrogate\nmodel is estimated based on data. In contrast to existing Koopman-based\nestimation procedures, we state guaranteed bounds on the approximation error\nusing the stability- and certificate-oriented extended dynamic mode\ndecomposition (SafEDMD) framework. The resulting surrogate model and the\nuncertainty bounds allow us to design controllers via robust control theory and\nsum-of-squares optimization, guaranteeing desirable properties for the\nclosed-loop system. We present results on stabilization both in discrete and\ncontinuous time, and we derive a method for controller design with performance\nobjectives. 
The benefits of the presented framework over established approaches\nare demonstrated with a numerical example.\n","authors":["Robin Strässer","Julian Berberich","Manuel Schaller","Karl Worthmann","Frank Allgöwer"],"pdf_url":"https://arxiv.org/pdf/2411.10359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02106v2","updated":"2024-11-15T15:51:20Z","published":"2024-10-03T00:00:54Z","title":"Safe Navigation in Unmapped Environments for Robotic Systems with Input\n Constraints","summary":" This paper presents an approach for navigation and control in unmapped\nenvironments under input and state constraints using a composite control\nbarrier function (CBF). We consider the scenario where real-time perception\nfeedback (e.g., LiDAR) is used online to construct a local CBF that models\nlocal state constraints (e.g., local safety constraints such as obstacles) in\nthe a priori unmapped environment. The approach employs a soft-maximum function\nto synthesize a single time-varying CBF from the N most recently obtained local\nCBFs. Next, the input constraints are transformed into controller-state\nconstraints through the use of control dynamics. Then, we use a soft-minimum\nfunction to compose the input constraints with the time-varying CBF that models\nthe a priori unmapped environment. This composition yields a single relaxed\nCBF, which is used in a constrained optimization to obtain an optimal control\nthat satisfies the state and input constraints. The approach is validated\nthrough simulations of a nonholonomic ground robot that is equipped with LiDAR\nand navigates an unmapped environment. The robot successfully navigates the\nenvironment while avoiding the a priori unmapped obstacles and satisfying both\nspeed and input constraints.\n","authors":["Amirsaeid Safari","Jesse B. Hoagg"],"pdf_url":"https://arxiv.org/pdf/2410.02106v2.pdf","comment":"Preprint submitted to 2025 American Control Conference (ACC). 
arXiv\n admin note: substantial text overlap with arXiv:2409.01458"},{"id":"http://arxiv.org/abs/2411.10262v1","updated":"2024-11-15T15:09:35Z","published":"2024-11-15T15:09:35Z","title":"Observer-Based Safety Monitoring of Nonlinear Dynamical Systems with\n Neural Networks via Quadratic Constraint Approach","summary":" The safety monitoring for nonlinear dynamical systems with embedded neural\nnetwork components is addressed in this paper. The interval-observer-based\nsafety monitor is developed consisting of two auxiliary neural networks derived\nfrom the neural network components of the dynamical system. Due to the presence\nof nonlinear activation functions in neural networks, we use quadratic\nconstraints on the global sector to abstract the nonlinear activation functions\nin neural networks. By combining a quadratic constraint approach for the\nactivation function with Lyapunov theory, the interval observer design problem\nis transformed into a series of quadratic and linear programming feasibility\nproblems to make the interval observer operate with the ability to correctly\nestimate the system state with estimation errors within acceptable limits. The\napplicability of the proposed method is verified by simulation of the lateral\nvehicle control system.\n","authors":["Tao Wang","Yapeng Li","Zihao Mo","Wesley Cooke","Weiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2411.10262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03189v2","updated":"2024-11-15T14:56:06Z","published":"2024-11-05T15:34:25Z","title":"Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a\n Hybrid Zonotope Constraint Representation","summary":" Uncrewed aerial systems have tightly coupled energy and motion dynamics which\nmust be accounted for by onboard planning algorithms. This work proposes a\nstrategy for coupled motion and energy planning using model predictive control\n(MPC). 
A reduced-order linear time-invariant model of coupled energy and motion\ndynamics is presented. Constrained zonotopes are used to represent state and\ninput constraints, and hybrid zonotopes are used to represent non-convex\nconstraints tied to a map of the environment. The structures of these\nconstraint representations are exploited within a mixed-integer quadratic\nprogram solver tailored to MPC motion planning problems. Results apply the\nproposed methodology to coupled motion and energy utilization planning problems\nfor 1) a hybrid-electric vehicle that must restrict engine usage when flying\nover regions with noise restrictions, and 2) an electric package delivery drone\nthat must track waysets with both position and battery state of charge\nrequirements. By leveraging the structure-exploiting solver, the proposed\nmixed-integer MPC formulations can be implemented in real time.\n","authors":["Joshua A. Robbins","Andrew F. Thompson","Sean Brennan","Herschel C. Pangborn"],"pdf_url":"https://arxiv.org/pdf/2411.03189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10243v1","updated":"2024-11-15T14:55:46Z","published":"2024-11-15T14:55:46Z","title":"Data-Driven Decentralized Control Design for Discrete-Time Large-Scale\n Systems","summary":" In this paper, a data-driven approach is developed for controller design for\na class of discrete-time large-scale systems, where a large-scale system can be\nexpressed in an equivalent data-driven form and the decentralized controllers\ncan be parameterized by the data collected from its subsystems, i.e., system\nstate, control input, and interconnection input. Based on the developed\ndata-driven method and the Lyapunov approach, a data-driven semi-definite\nprogramming problem is constructed to obtain decentralized stabilizing\ncontrollers. 
The proposed approach has been validated on a mass-spring chain\nmodel, with the significant advantage of avoiding extensive modeling processes.\n","authors":["Jiaping Liao","Shuaizheng Lu","Tao Wang","Weiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2411.10243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10240v1","updated":"2024-11-15T14:53:34Z","published":"2024-11-15T14:53:34Z","title":"Efficient Neural Hybrid System Learning and Transition System\n Abstraction for Dynamical Systems","summary":" This paper proposes a neural network hybrid modeling framework for dynamics\nlearning to promote an interpretable, computationally efficient way of dynamics\nlearning and system identification. First, a low-level model will be trained to\nlearn the system dynamics, which utilizes multiple simple neural networks to\napproximate the local dynamics generated from data-driven partitions. Then,\nbased on the low-level model, a high-level model will be trained to abstract\nthe low-level neural hybrid system model into a transition system that allows\nComputational Tree Logic Verification to promote the model's ability with human\ninteraction and verification efficiency.\n","authors":["Yejiang Yang","Zihao Mo","Weiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2411.10240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01284v2","updated":"2024-11-15T14:50:22Z","published":"2024-09-02T14:27:35Z","title":"Analyzing electric vehicle, load and photovoltaic generation uncertainty\n using publicly available datasets","summary":" This paper aims to analyze three publicly available datasets for quantifying\nseasonal and annual uncertainty for efficient scenario creation. The datasets\nfrom Elaad, Elia and Fluvius are utilized to statistically analyze electric\nvehicle charging, normalized solar generation and low-voltage consumer load\nprofiles, respectively. Frameworks for scenario generation are also provided\nfor these datasets. 
The datasets for load profiles and solar generation\nanalyzed are for the year 2022, thus embedding seasonal information. An online\nrepository is created for the wider applicability of this work. Finally, the\nextreme load week(s) are identified and linked to the weather data measured at\nEnergyVille in Belgium.\n","authors":["Md Umar Hashmi","Domenico Gioffrè","Simon Nagels","Dirk Van Hertem"],"pdf_url":"https://arxiv.org/pdf/2409.01284v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06607v3","updated":"2024-11-15T14:09:05Z","published":"2024-09-10T16:00:22Z","title":"An Ontology-based Approach Towards Traceable Behavior Specifications in\n Automated Driving","summary":" Vehicles in public traffic that are equipped with Automated Driving Systems\nare subject to a number of expectations: Among other aspects, their behavior\nshould be safe, conforming to the rules of the road and provide mobility to\ntheir users. This poses challenges for the developers of such systems:\nDevelopers are responsible for specifying this behavior, for example, in terms\nof requirements at system design time. As we will discuss in the article, this\nspecification always involves the need for assumptions and trade-offs. As a\nresult, insufficiencies in such a behavior specification can occur that can\npotentially lead to unsafe system behavior. In order to support the\nidentification of specification insufficiencies, requirements and respective\nassumptions need to be made explicit. In this article, we propose the Semantic\nNorm Behavior Analysis as an ontology-based approach to specify the behavior\nfor an Automated Driving System equipped vehicle. We use ontologies to formally\nrepresent specified behavior for a targeted operational environment, and to\nestablish traceability between specified behavior and the addressed stakeholder\nneeds. 
Furthermore, we illustrate the application of the Semantic Norm Behavior\nAnalysis in a German legal context with two example scenarios and evaluate our\nresults. Our evaluation shows that the explicit documentation of assumptions in\nthe behavior specification supports both the identification of specification\ninsufficiencies and their treatment. Therefore, this article provides\nrequirements, terminology and an according methodology to facilitate\nontology-based behavior specifications in automated driving.\n","authors":["Nayel Fabian Salem","Marcus Nolte","Veronica Haber","Till Menzel","Hans Steege","Robert Graubohm","Markus Maurer"],"pdf_url":"https://arxiv.org/pdf/2409.06607v3.pdf","comment":"24 pages, 12 figures, submitted for publication"},{"id":"http://arxiv.org/abs/2403.00321v3","updated":"2024-11-15T13:42:28Z","published":"2024-03-01T06:48:58Z","title":"DEEP-IoT: Downlink-Enhanced Efficient-Power Internet of Things","summary":" At the heart of the Internet of Things (IoT) -- a domain witnessing explosive\ngrowth -- the imperative for energy efficiency and the extension of device\nlifespans has never been more pressing. This paper presents DEEP-IoT, an\ninnovative communication paradigm poised to redefine how IoT devices\ncommunicate. Through a pioneering feedback channel coding strategy, DEEP-IoT\nchallenges and transforms the traditional transmitter (IoT devices)-centric\ncommunication model to one where the receiver (the access point) play a pivotal\nrole, thereby cutting down energy use and boosting device longevity. We not\nonly conceptualize DEEP-IoT but also actualize it by integrating deep\nlearning-enhanced feedback channel codes within a narrow-band system.\nSimulation results show a significant enhancement in the operational lifespan\nof IoT cells -- surpassing traditional systems using Turbo and Polar codes by\nup to 52.71%. 
This leap signifies a paradigm shift in IoT communications,\nsetting the stage for a future where IoT devices boast unprecedented efficiency\nand durability.\n","authors":["Yulin Shao"],"pdf_url":"https://arxiv.org/pdf/2403.00321v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10166v1","updated":"2024-11-15T13:13:23Z","published":"2024-11-15T13:13:23Z","title":"Two-Stage Robust Optimal Operation of Distribution Networks using\n Confidence Level Based Distributionally Information Gap Decision","summary":" This paper presents a confidence level-based distributionally information gap\ndecision theory (CL-DIGDT) framework for the two-stage robust optimal operation\nof distribution networks, aiming at deriving an optimal operational scheme\ncapable of addressing uncertainties related to renewable energy and load\ndemands. Building on conventional IGDT, the proposed framework utilizes the\nconfidence level to capture the asymmetric characteristics of uncertainties and\nmaximize the risk-averse capability of the solution in a probabilistic manner.\nTo account for the probabilistic consideration, the imprecise Dirichlet model\nis employed to construct the ambiguity sets of uncertainties, reducing reliance\non precise probability distributions. Consequently, a two-stage robust optimal\noperation model for distribution networks using CL-DIGDT is developed. An\niterative method is proposed to solve the model and determine the upper and\nlower bounds of the objective function. Case study demonstrates that the\nproposed approach yields a more robust and statistically optimized solution\nwith required accuracy compared to existing method, contributing to a reduction\nin first-stage cost by 0.84%, second-stage average cost by 6.7%, and\nsignificantly increasing the reliability of the solution by 8%.\n","authors":["Zhisheng Xiong","Bo Zeng","Peter Palensky","Pedro P. 
Vergara"],"pdf_url":"https://arxiv.org/pdf/2411.10166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15498v3","updated":"2024-11-15T12:38:18Z","published":"2023-09-27T08:51:55Z","title":"A Control Theoretical Approach to Online Constrained Optimization","summary":" In this paper we focus on the solution of online problems with time-varying,\nlinear equality and inequality constraints. Our approach is to design a novel\nonline algorithm by leveraging the tools of control theory. In particular, for\nthe case of equality constraints only, using robust control we design an online\nalgorithm with asymptotic convergence to the optimal trajectory, differently\nfrom the alternatives that achieve non-zero tracking error. When also\ninequality constraints are present, we show how to modify the proposed\nalgorithm to account for the wind-up induced by the nonnegativity constraints\non the dual variables. We report numerical results that corroborate the\ntheoretical analysis, and show how the proposed approach outperforms\nstate-of-the-art algorithms both with equality and inequality constraints.\n","authors":["Umberto Casti","Nicola Bastianello","Ruggero Carli","Sandro Zampieri"],"pdf_url":"https://arxiv.org/pdf/2309.15498v3.pdf","comment":"To appear in Automatica"},{"id":"http://arxiv.org/abs/2411.10096v1","updated":"2024-11-15T10:44:29Z","published":"2024-11-15T10:44:29Z","title":"Neural Port-Hamiltonian Models for Nonlinear Distributed Control: An\n Unconstrained Parametrization Approach","summary":" The control of large-scale cyber-physical systems requires optimal\ndistributed policies relying solely on limited communication with neighboring\nagents. However, computing stabilizing controllers for nonlinear systems while\noptimizing complex costs remains a significant challenge. Neural Networks\n(NNs), known for their expressivity, can be leveraged to parametrize control\npolicies that yield good performance. 
However, NNs' sensitivity to small input\nchanges poses a risk of destabilizing the closed-loop system. Many existing\napproaches enforce constraints on the controllers' parameter space to guarantee\nclosed-loop stability, leading to computationally expensive optimization\nprocedures. To address these problems, we leverage the framework of\nport-Hamiltonian systems to design continuous-time distributed control policies\nfor nonlinear systems that guarantee closed-loop stability and finite\n$\\mathcal{L}_2$ or incremental $\\mathcal{L}_2$ gains, independent of the\noptimzation parameters of the controllers. This eliminates the need to\nconstrain parameters during optimization, allowing the use of standard\ntechniques such as gradient-based methods. Additionally, we discuss\ndiscretization schemes that preserve the dissipation properties of these\ncontrollers for implementation on embedded systems. The effectiveness of the\nproposed distributed controllers is demonstrated through consensus control of\nnon-holonomic mobile robots subject to collision avoidance and averaged voltage\nregulation with weighted power sharing in DC microgrids.\n","authors":["Muhammad Zakwan","Giancarlo Ferrari-Trecate"],"pdf_url":"https://arxiv.org/pdf/2411.10096v1.pdf","comment":"The paper has 15 pages, and has been submitted for a possible\n publication. arXiv admin note: text overlap with arXiv:2403.17785"},{"id":"http://arxiv.org/abs/2411.10058v1","updated":"2024-11-15T09:21:54Z","published":"2024-11-15T09:21:54Z","title":"Unsupervised Congestion Status Identification Using LMP Data","summary":" Having a better understanding of how locational marginal prices (LMPs) change\nhelps in price forecasting and market strategy making. This paper investigates\nthe fundamental distribution of the congestion part of LMPs in high-dimensional\nEuclidean space using an unsupervised approach. 
LMP models based on the\nlossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the\noverlapping subspace property of the LMP data. The congestion part of LMPs is\nspanned by certain row vectors of the power transfer distribution factor (PTDF)\nmatrix, and the subspace attributes of an LMP vector uniquely are found to\nreflect the instantaneous congestion status of all the transmission lines. The\nproposed method searches for the basis vectors that span the subspaces of\ncongestion LMP data in hierarchical ways. In the bottom-up search, the data\nbelonging to 1-dimensional subspaces are detected, and other data are projected\non the orthogonal subspaces. This procedure is repeated until all the basis\nvectors are found or the basis gap appears. Top-down searching is used to\naddress the basis gap by hyperplane detection with outliers. Once all the basis\nvectors are detected, the congestion status can be identified. Numerical\nexperiments based on the IEEE 30-bus system, IEEE 118-bus system, Illinois\n200-bus system, and Southwest Power Pool are conducted to show the performance\nof the proposed method.\n","authors":["Kedi Zheng","Qixin Chen","Yi Wang","Chongqing Kang","Le Xie"],"pdf_url":"https://arxiv.org/pdf/2411.10058v1.pdf","comment":"Paper accepted for IEEE Transactions on Smart Grid. Personal use of\n this material is permitted. Permission from IEEE must be obtained for all\n other uses"},{"id":"http://arxiv.org/abs/2401.06332v2","updated":"2024-11-15T08:31:31Z","published":"2024-01-12T02:48:20Z","title":"Distributed Solvers for Network Linear Equations with Scalarized\n Compression","summary":" Distributed computing is fundamental to multi-agent systems, with solving\ndistributed linear equations as a typical example. In this paper, we study\ndistributed solvers for network linear equations over a network with\nnode-to-node communication messages compressed as scalar values. 
Our key idea\nlies in a dimension compression scheme that includes a dimension-compressing\nvector and a data unfolding step. The compression vector applies to individual\nnode states as an inner product to generate a real-valued message for node\ncommunication. In the unfolding step, such scalar message is then plotted along\nthe subspace generated by the compression vector for the local computations. We\nfirst present a compressed consensus flow that relies only on such scalarized\ncommunication, and show that linear convergence can be achieved with well\nexcited signals for the compression vector. We then employ such a compressed\nconsensus flow as a fundamental consensus subroutine to develop distributed\ncontinuous-time and discrete-time solvers for network linear equations, and\nprove their linear convergence properties under scalar node communications.\nWith scalar communications, a direct benefit would be the reduced node-to-node\ncommunication channel burden for distributed computing. Numerical examples are\npresented to illustrate the effectiveness of the established theoretical\nresults.\n","authors":["Lei Wang","Zihao Ren","Deming Yuan","Guodong Shi"],"pdf_url":"https://arxiv.org/pdf/2401.06332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00002v2","updated":"2024-11-15T08:24:32Z","published":"2024-08-14T08:54:44Z","title":"Spatio-Temporal Communication Compression for Distributed Prime-Dual\n Optimization","summary":" Several data compressors have been proposed in distributed optimization\nframeworks of network systems to reduce communication overhead in large-scale\napplications. In this paper, we demonstrate that effective information\ncompression may occur over time or space during sequences of node\ncommunications in distributed algorithms, leading to the concept of\nspatio-temporal compressors. 
This abstraction classifies existing compressors\nas spatio-temporal compressors, with their effectiveness described by\nconstructive stability criteria from nonlinear system theory. Subsequently, we\napply these spatio-temporal compressors to standard continuous-time consensus\nflows and distributed prime-dual flows, establishing conditions ensuring\nconvergence. Additionally, we introduce a novel observer-based distributed\nprimal-dual continuous flow integrated with spatio-temporal compressors, which\nprovides broader convergence conditions. These continuous flows achieve\nexponential convergence to the global optimum when the objective function is\nstrongly convex and can be discretized using Euler approximations. Finally,\nnumerical simulations illustrate the versatility of the proposed\nspatio-temporal compressors and verify the convergence of algorithms.\n","authors":["Zihao Ren","Lei Wang","Xinlei Yi","Xi Wang","Deming Yuan","Tao Yang","Zhengguang Wu","Guodong Shi"],"pdf_url":"https://arxiv.org/pdf/2409.00002v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2408.02332"},{"id":"http://arxiv.org/abs/2411.10031v1","updated":"2024-11-15T08:19:07Z","published":"2024-11-15T08:19:07Z","title":"Enforcing Cooperative Safety for Reinforcement Learning-based\n Mixed-Autonomy Platoon Control","summary":" It is recognized that the control of mixed-autonomy platoons comprising\nconnected and automated vehicles (CAVs) and human-driven vehicles (HDVs) can\nenhance traffic flow. Among existing methods, Multi-Agent Reinforcement\nLearning (MARL) appears to be a promising control strategy because it can\nmanage complex scenarios in real time. However, current research on MARL-based\nmixed-autonomy platoon control suffers from several limitations. First,\nexisting MARL approaches address safety by penalizing safety violations in the\nreward function, thus lacking theoretical safety guarantees due to the\nblack-box nature of RL. 
Second, few studies have explored the cooperative\nsafety of multi-CAV platoons, where CAVs can be coordinated to further enhance\nthe system-level safety involving the safety of both CAVs and HDVs. Third,\nexisting work tends to make an unrealistic assumption that the behavior of HDVs\nand CAVs is publicly known and rationale. To bridge the research gaps, we\npropose a safe MARL framework for mixed-autonomy platoons. Specifically, this\nframework (i) characterizes cooperative safety by designing a cooperative\nControl Barrier Function (CBF), enabling CAVs to collaboratively improve the\nsafety of the entire platoon, (ii) provides a safety guarantee to the\nMARL-based controller by integrating the CBF-based safety constraints into MARL\nthrough a differentiable quadratic programming (QP) layer, and (iii)\nincorporates a conformal prediction module that enables each CAV to estimate\nthe unknown behaviors of the surrounding vehicles with uncertainty\nqualification. Simulation results show that our proposed control strategy can\neffectively enhance the system-level safety through CAV cooperation of a\nmixed-autonomy platoon with a minimal impact on control performance.\n","authors":["Jingyuan Zhou","Longhao Yan","Jinhao Liang","Kaidi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02332v2","updated":"2024-11-15T08:03:01Z","published":"2024-08-05T09:16:45Z","title":"Spatio-Temporal Communication Compression in Distributed Prime-Dual\n Flows","summary":" In this paper, we study distributed prime-dual flows for multi-agent\noptimization with spatio-temporal compressions. The central aim of multi-agent\noptimization is for a network of agents to collaboratively solve a system-level\noptimization problem with local objective functions and node-to-node\ncommunication by distributed algorithms. 
The scalability of such algorithms\ncrucially depends on the complexity of the communication messages, and a number\nof communication compressors for distributed optimization have recently been\nproposed in the literature. First of all, we introduce a general\nspatio-temporal compressor characterized by the stability of the resulting\ndynamical system along the vector field of the compressor. We show that several\nimportant distributed optimization compressors such as the greedy sparsifier,\nthe uniform quantizer, and the scalarizer all fall into the category of this\nspatio-temporal compressor. Next, we propose two distributed prime-dual flows\nwith the spatio-temporal compressors being applied to local node states and\nlocal error states, respectively, and prove (exponential) convergence of the\nnode trajectories to the global optimizer for (strongly) convex cost functions.\nFinally, a few numerical examples are present to illustrate our theoretical\nresults.\n","authors":["Zihao Ren","Lei Wang","Deming Yuan","Hongye Su","Guodong Shi"],"pdf_url":"https://arxiv.org/pdf/2408.02332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09990v1","updated":"2024-11-15T06:52:44Z","published":"2024-11-15T06:52:44Z","title":"Exploring the Influence of Residential Electric Vehicle Charging on\n Distribution System Hosting Capacity -- A Case-Study in Arizona","summary":" The installation of high-capacity fast chargers for electric vehicles (EVs)\nis posing a significant risk to the distribution grid as the increased demand\nfrom widespread residential EV charging could exceed the technical limits of\nthe distribution system. Addressing this issue is critical, given that current\ninfrastructure upgrades to enhance EV hosting capacity are both costly and\ntime-consuming. Moreover, the inherent uncertainties associated with EV\ncharging parameters make it challenging for power utilities to accurately\nassess the impact of EVs added to specific locations. 
To address these\nknowledge gaps, this study (a) introduces an algorithm to coordinate\nresidential EV charging, and (b) proposes a comprehensive framework that\nevaluates all transformers within a feeder. The proposed method is applied to a\nreal-world feeder, which includes 120 transformers of varying capacities. The\nresults demonstrate that this approach effectively manages a substantial number\nof EVs without overloading any of the transformers, while also pinpointing\nlocations that must be prioritized for future upgrades. This framework can\nserve as a valuable reference for utilities when conducting distribution system\nevaluations for supporting the growing EV penetration.\n","authors":["Mohammad Golgol","Anamitra Pal","Vijay Vittal","Christine Fini","Ernest Palomino","Kyle Girardi"],"pdf_url":"https://arxiv.org/pdf/2411.09990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09956v1","updated":"2024-11-15T05:22:19Z","published":"2024-11-15T05:22:19Z","title":"A Secure Estimator with Gaussian Bernoulli Mixture Model","summary":" The implementation of cyber-physical systems in real-world applications is\nchallenged by safety requirements in the presence of sensor threats. Most\ncyber-physical systems, in particular the vulnerable multi-sensor systems,\nstruggle to detect the attack in observation signals. In this paper, we tackle\nthis issue by proposing a Gaussian-Bernoulli Secure (GBS) estimator, which\neffectively transforms the assessment of sensor status into an optimal\nestimation problem concerning the system state and observation indicators. It\nencompasses two theoretical sub-problems: sequential state estimation with\npartial observations and estimation updates with disordered new observations.\nWithin the framework of Kalman filter, we derive closed-form solutions for\nthese two issues. 
However, due to their computational inefficiency, we propose\nthe iterative approach employing proximal gradient descent to accelerate the\nestimation update. We conduct comprehensive experiments from three\nperspectives: computational efficiency, detection and estimation performance,\nand characterization of observation error. Our GBS estimator shows the\nimprovements compared to other methods.\n","authors":["Xingzhou Chen","Nachuan Yang","Peihu Duan","Shilei Li","Ling Shi"],"pdf_url":"https://arxiv.org/pdf/2411.09956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09954v1","updated":"2024-11-15T05:11:39Z","published":"2024-11-15T05:11:39Z","title":"Reaching Resilient Leader-Follower Consensus in Time-Varying Networks\n via Multi-Hop Relays","summary":" We study resilient leader-follower consensus of multi-agent systems (MASs) in\nthe presence of adversarial agents, where agents' communication is modeled by\ntime-varying topologies. The objective is to develop distributed algorithms for\nthe nonfaulty/normal followers to track an arbitrary reference value propagated\nby a set of leaders while they are in interaction with the unknown adversarial\nagents. Our approaches are based on the weighted mean subsequence reduced\n(W-MSR) algorithms with agents being capable to communicate with multi-hop\nneighbors. Our algorithms can handle agents possessing first-order and\nsecond-order dynamics. Moreover, we characterize necessary and sufficient graph\nconditions for our algorithms to succeed by the novel notion of jointly robust\nfollowing graphs. Our graph condition is tighter than the sufficient conditions\nin the literature when agents use only one-hop communication (without relays).\nUsing multi-hop relays, we can enhance robustness of leader-follower networks\nwithout increasing communication links and obtain further relaxed graph\nrequirements for our algorithms to succeed. 
Numerical examples are given to\nverify the efficacy of our algorithms.\n","authors":["Liwei Yuan","Hideaki Ishii"],"pdf_url":"https://arxiv.org/pdf/2411.09954v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2411.09913v1","updated":"2024-11-15T03:20:24Z","published":"2024-11-15T03:20:24Z","title":"A Graph-based Strategic Sensor Deployment Approach for k-coverage in WSN","summary":" This paper studies a graph-based sensor deployment approach in wireless\nsensor networks (WSNs). Specifically, in today's world, where sensors are\neverywhere, detecting various attributes like temperature and movement, their\ndeteriorating lifetime is indeed a very concerning issue. In many scenarios,\nthese sensors are placed in extremely remote areas, where maintenance becomes\nchallenging. As a result, it is not very wise to depend on a single sensor to\nobtain data from a particular terrain or place. Hence, multiple sensors are\ndeployed in these places, such that no problem arises if one or few of them\nfail. In this work, this problem of intelligent placement of sensors is\nmodelled from the graph theoretic point of view. We propose a new sensor\ndeployment approach here, which results in lesser sensor density per unit area\nand less number of sensors as compared to the existing benchmark schemes.\nFinally, the numerical results also support our claims and provide insights\nregarding the selection of parameters that enhance the system performance.\n","authors":["Lakshmikanta Sau","Priyadarshi Mukherjee","Sasthi C. Ghosh"],"pdf_url":"https://arxiv.org/pdf/2411.09913v1.pdf","comment":"Submitted for a possible publication"},{"id":"http://arxiv.org/abs/2411.00461v2","updated":"2024-11-15T03:01:59Z","published":"2024-11-01T09:18:38Z","title":"A Multi-Granularity Supervised Contrastive Framework for Remaining\n Useful Life Prediction of Aero-engines","summary":" Accurate remaining useful life (RUL) predictions are critical to the safe\noperation of aero-engines. 
Currently, the RUL prediction task is mainly a\nregression paradigm with only mean square error as the loss function and lacks\nresearch on feature space structure, the latter of which has shown excellent\nperformance in a large number of studies. This paper develops a\nmulti-granularity supervised contrastive (MGSC) framework from plain intuition\nthat samples with the same RUL label should be aligned in the feature space,\nand address the problems of too large minibatch size and unbalanced samples in\nthe implementation. The RUL prediction with MGSC is implemented on using the\nproposed multi-phase training strategy. This paper also demonstrates a simple\nand scalable basic network structure and validates the proposed MGSC strategy\non the CMPASS dataset using a convolutional long short-term memory network as a\nbaseline, which effectively improves the accuracy of RUL prediction.\n","authors":["Zixuan He","Ziqian Kong","Zhengyu Chen","Yuling Zhan","Zijun Que","Zhengguo Xu"],"pdf_url":"https://arxiv.org/pdf/2411.00461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09906v1","updated":"2024-11-15T03:01:23Z","published":"2024-11-15T03:01:23Z","title":"A Survey of Machine Learning-based Physical-Layer Authentication in\n Wireless Communications","summary":" To ensure secure and reliable communication in wireless systems,\nauthenticating the identities of numerous nodes is imperative. Traditional\ncryptography-based authentication methods suffer from issues such as low\ncompatibility, reliability, and high complexity. Physical-Layer Authentication\n(PLA) is emerging as a promising complement due to its exploitation of unique\nproperties in wireless environments. Recently, Machine Learning (ML)-based PLA\nhas gained attention for its intelligence, adaptability, universality, and\nscalability compared to non-ML approaches. However, a comprehensive overview of\nstate-of-the-art ML-based PLA and its foundational aspects is lacking. 
This\npaper presents a comprehensive survey of characteristics and technologies that\ncan be used in the ML-based PLA. We categorize existing ML-based PLA schemes\ninto two main types: multi-device identification and attack detection schemes.\nIn deep learning-based multi-device identification schemes, Deep Neural\nNetworks are employed to train models, avoiding complex processing and expert\nfeature transformation. Deep learning-based multi-device identification schemes\nare further subdivided, with schemes based on Convolutional Neural Networks\nbeing extensively researched. In ML-based attack detection schemes, receivers\nutilize intelligent ML techniques to set detection thresholds automatically,\neliminating the need for manual calculation or knowledge of channel models.\nML-based attack detection schemes are categorized into three sub-types:\nSupervised Learning, Unsupervised Learning, and Reinforcement Learning.\nAdditionally, we summarize open-source datasets used for PLA, encompassing\nRadio Frequency fingerprints and channel fingerprints. Finally, this paper\noutlines future research directions to guide researchers in related fields.\n","authors":["Rui Meng","Bingxuan Xu","Xiaodong Xu","Mengying Sun","Bizhu Wanga","Shujun Han","Suyu Lv","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.09906v1.pdf","comment":"111 pages, 9 figures"},{"id":"http://arxiv.org/abs/2303.10253v2","updated":"2024-11-15T02:12:49Z","published":"2023-03-17T21:21:56Z","title":"Pricing for Multi-modal Pickup and Delivery Problems with Heterogeneous\n Users","summary":" In this paper, we study the pickup and delivery problem with multiple\ntransportation modalities, and address the challenge of efficiently allocating\ntransportation resources while price matching users with their desired delivery\nmodes. More precisely, we consider that orders are demanded by a heterogeneous\npopulation of users with varying trade-offs between price and latency. 
To\ncapture how prices affect the behavior of heterogeneous selfish users choosing\nbetween multiple delivery modes, we construct a congestion game taking place\nover a form of star network, where each source-sink pair is composed of\nparallel links connecting users with their preferred delivery method. Using the\nunique geometry of this network, we prove that one can set prices explicitly to\ninduce any desired network flow, i.e, given a desired allocation strategy, we\nhave a closed-form solution for the delivery prices. We conclude by performing\na case study on a meal delivery problem with multiple courier modalities using\ndata from real world instances.\n","authors":["Mark Beliaev","Negar Mehr","Ramtin Pedarsani"],"pdf_url":"https://arxiv.org/pdf/2303.10253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09881v1","updated":"2024-11-15T02:01:36Z","published":"2024-11-15T02:01:36Z","title":"Regulating Stability Margins in Symbiotic Control: A Low-Pass Filter\n Approach","summary":" Symbiotic control synergistically integrates fixed-gain control and adaptive\nlearning architectures to mitigate system uncertainties more predictably than\nadaptive learning alone and without requiring prior knowledge of uncertainty\nbounds as compared to fixed-gain control alone. Specifically, increasing the\nfixed-gain control parameter achieves a desired level of closed-loop system\nperformance while the adaptive law simultaneously learns and suppresses the\nsystem uncertainties. However, stability margins can be reduced when this\nparameter is large and this paper aims to address this practical challenge. To\nthis end, we propose a new fixed-gain control architecture predicated on a\nlow-pass filter approach to regulate stability margins in the symbiotic control\nframework. 
In addition to the presented system-theoretical results focusing on\nthe stability of the closed-loop system, we provide two illustrative numerical\nexamples to demonstrate how the low-pass filter parameters are chosen for the\nstability margin regulation problem without significantly compromising the\nclosed-loop system performance.\n","authors":["Emre Yildirim","Tansel Yucelen","John T. Hrynuk"],"pdf_url":"https://arxiv.org/pdf/2411.09881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12908v2","updated":"2024-11-15T22:04:33Z","published":"2024-05-21T16:27:04Z","title":"Extremum Seeking is Stable for Scalar Maps that are Strictly but Not\n Strongly Convex","summary":" For a map that is strictly but not strongly convex, model-based gradient\nextremum seeking has an eigenvalue of zero at the extremum, i.e., it fails at\nexponential convergence. Interestingly, perturbation-based model-free extremum\nseeking has a negative Jacobian, in the average, meaning that its (practical)\nconvergence is exponential, even though the map's Hessian is zero at the\nextremum. While these observations for the gradient algorithm are not trivial,\nwe focus in this paper on an even more nontrivial study of the same phenomenon\nfor Newton-based extremum seeking control (NESC).\n NESC is a second-order method which corrects for the unknown Hessian of the\nunknown map, not only in order to speed up parameter convergence, but also (1)\nto make the convergence rate user-assignable in spite of the unknown Hessian,\nand (2) to equalize the convergence rates in different directions for\nmultivariable maps. Previous NESC work established stability only for maps\nwhose Hessians are strictly positive definite everywhere, so the Hessian is\ninvertible everywhere. 
For a scalar map, we establish the rather unexpected\nproperty that, even when the map behind is strictly convex but not strongly\nconvex, i.e., when the Hessian may be zero, NESC guarantees practical\nasymptotic stability, semiglobally. While a model-based Newton-based algorithm\nwould run into non-invertibility of the Hessian, the perturbation-based NESC,\nsurprisingly, avoids this challenge by leveraging the fact that the average of\nthe perturbation-based Hessian estimate is always positive, even though the\nactual Hessian may be zero.\n","authors":["Patrick McNamee","Miroslav Krstić","Zahra Nili Ahmadabadi"],"pdf_url":"https://arxiv.org/pdf/2405.12908v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.10603v1","updated":"2024-11-15T21:53:41Z","published":"2024-11-15T21:53:41Z","title":"A Novel MLLM-based Approach for Autonomous Driving in Different Weather\n Conditions","summary":" Autonomous driving (AD) technology promises to revolutionize daily\ntransportation by making it safer, more efficient, and more comfortable. Their\nrole in reducing traffic accidents and improving mobility will be vital to the\nfuture of intelligent transportation systems. Autonomous driving in harsh\nenvironmental conditions presents significant challenges that demand robust and\nadaptive solutions and require more investigation. In this context, we present\nin this paper a comprehensive performance analysis of an autonomous driving\nagent leveraging the capabilities of a Multi-modal Large Language Model (MLLM)\nusing GPT-4o within the LimSim++ framework that offers close loop interaction\nwith the CARLA driving simulator. We call it MLLM-AD-4o. Our study evaluates\nthe agent's decision-making, perception, and control under adverse conditions,\nincluding bad weather, poor visibility, and complex traffic scenarios. 
Our\nresults demonstrate the AD agent's ability to maintain high levels of safety\nand efficiency, even in challenging environments, underscoring the potential of\nGPT-4o to enhance autonomous driving systems (ADS) in any environment\ncondition. Moreover, we evaluate the performance of MLLM-AD-4o when different\nperception entities are used including either front cameras only, front and\nrear cameras, and when combined with LiDAR. The results of this work provide\nvaluable insights into integrating MLLMs with AD frameworks, paving the way for\nfuture advancements in this field.\n","authors":["Sonda Fourati","Wael Jaafar","Noura Baccar"],"pdf_url":"https://arxiv.org/pdf/2411.10603v1.pdf","comment":"9 pages, 6 figures; Submitted to IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2411.10592v1","updated":"2024-11-15T21:29:36Z","published":"2024-11-15T21:29:36Z","title":"A Systematic LMI Approach to Design Multivariable Sliding Mode\n Controllers","summary":" This paper deals with sliding mode control for multivariable polytopic\nuncertain systems. We provide systematic procedures to design variable\nstructure controllers (VSCs) and unit-vector controllers (UVCs). Based on\nsuitable representations for the closed-loop system, we derive sufficient\nconditions in the form of linear matrix inequalities (LMIs) to design the\nrobust sliding mode controllers such that the origin of the closed-loop system\nis globally stable in finite time. Moreover, by noticing that the reaching time\ndepends on the initial condition and the decay rate, we provide convex\noptimization problems to design robust controllers by considering the\nminimization of the reaching time associated with a given set of initial\nconditions. 
Two examples illustrate the effectiveness of the proposed\napproaches.\n","authors":["Pedro Henrique Silva Coutinho","Iury Bessa","Victor Hugo Pereira Rodrigues","Tiago Roux Oliveira"],"pdf_url":"https://arxiv.org/pdf/2411.10592v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.10580v1","updated":"2024-11-15T21:04:40Z","published":"2024-11-15T21:04:40Z","title":"Gradient-Based Stochastic Extremum-Seeking Control for Multivariable\n Systems with Distinct Input Delays","summary":" This paper addresses the design and analysis of a multivariable\ngradient-based stochastic extremum-seeking control method for multi-input\nsystems with arbitrary input delays. The approach accommodates systems with\ndistinct time delays across input channels and achieves local exponential\nstability of the closed-loop system, guaranteeing convergence to a small\nneighborhood around the extremum point. By incorporating phase compensation for\ndither signals and a novel predictor-feedback mechanism with averaging-based\nestimates of the unknown gradient and Hessian, the proposed method overcomes\ntraditional challenges associated with arbitrary, distinct input delays. Unlike\nprevious work on deterministic multiparameter extremum-seeking with distinct\ninput delays, this stability analysis is achieved without using backstepping\ntransformations, simplifying the predictor design and enabling a more\nstraightforward implementation. Specifically, the direct application of\nArtstein's reduction approach results in delay- and\nsystem-dimension-independent convergence rates, enhancing practical\napplicability. 
A numerical example illustrates the robust performance and\nadvantages of the proposed delay-compensated stochastic extremum-seeking\nmethod.\n","authors":["Paulo Cesar Souza Silva","Paulo Cesar Pellanda","Tiago Roux Oliveira"],"pdf_url":"https://arxiv.org/pdf/2411.10580v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.10528v1","updated":"2024-11-15T19:02:59Z","published":"2024-11-15T19:02:59Z","title":"AC-Informed DC Optimal Transmission Switching Problems via Parameter\n Optimization","summary":" Optimal Transmission Switching (OTS) problems minimize operational costs\nwhile treating both the transmission line energization statuses and generator\nsetpoints as decision variables. The combination of nonlinearities from an AC\npower flow model and discrete variables associated with line statuses makes\nAC-OTS a computationally challenging Mixed-Integer Nonlinear Program (MINLP).\nTo address these challenges, the DC power flow approximation is often used to\nobtain a DC-OTS formulation expressed as a Mixed-Integer Linear Program (MILP).\nHowever, this approximation often leads to suboptimal or infeasible switching\ndecisions when evaluated with an AC power flow model. This paper proposes an\nenhanced DC-OTS formulation that leverages techniques for training machine\nlearning models to optimize the DC power flow model's parameters. By optimally\nselecting parameter values that align flows in the DC power flow model with\napparent power flows -- incorporating both real and reactive components -- from\nAC Optimal Power Flow (OPF) solutions, our method more accurately captures line\ncongestion behavior. Integrating these optimized parameters into the DC-OTS\nformulation significantly improves the accuracy of switching decisions and\nreduces discrepancies between DC-OTS and AC-OTS solutions. We compare our\noptimized DC-OTS model against traditional OTS approaches, including DC-OTS,\nLinear Programming AC (LPAC)-OTS, and Quadratic Convex (QC)-OTS. 
Numeric\nresults show that switching decisions from our model yield better performance\nwhen evaluated using an AC power flow model, with up to $44\\%$ cost reductions\nin some cases.\n","authors":["Babak Taheri","Daniel K. Molzahn"],"pdf_url":"https://arxiv.org/pdf/2411.10528v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.10438v1","updated":"2024-11-15T18:57:39Z","published":"2024-11-15T18:57:39Z","title":"MARS: Unleashing the Power of Variance Reduction for Training Large\n Models","summary":" Training deep neural networks--and more recently, large models--demands\nefficient and scalable optimizers. Adaptive gradient algorithms like Adam,\nAdamW, and their variants have been central to this task. Despite the\ndevelopment of numerous variance reduction algorithms in the past decade aimed\nat accelerating stochastic optimization in both convex and nonconvex settings,\nvariance reduction has not found widespread success in training deep neural\nnetworks or large language models. Consequently, it has remained a less favored\napproach in modern AI. In this paper, to unleash the power of variance\nreduction for efficient training of large models, we propose a unified\noptimization framework, MARS (Make vAriance Reduction Shine), which reconciles\npreconditioned gradient methods with variance reduction via a scaled stochastic\nrecursive momentum technique. Within our framework, we introduce three\ninstances of MARS that leverage preconditioned gradient updates based on AdamW,\nLion, and Shampoo, respectively. We also draw a connection between our\nalgorithms and existing optimizers. 
Experimental results on training GPT-2\nmodels indicate that MARS consistently outperforms AdamW by a large margin.\n","authors":["Huizhuo Yuan","Yifeng Liu","Shuang Wu","Xun Zhou","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2411.10438v1.pdf","comment":"23 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2405.13712v4","updated":"2024-11-15T18:57:14Z","published":"2024-05-22T15:04:06Z","title":"Learning Diffusion Priors from Observations by Expectation Maximization","summary":" Diffusion models recently proved to be remarkable priors for Bayesian inverse\nproblems. However, training these models typically requires access to large\namounts of clean data, which could prove difficult in some settings. In this\nwork, we present a novel method based on the expectation-maximization algorithm\nfor training diffusion models from incomplete and noisy observations only.\nUnlike previous works, our method leads to proper diffusion models, which is\ncrucial for downstream tasks. As part of our method, we propose and motivate an\nimproved posterior sampling scheme for unconditional diffusion models. We\npresent empirical evidence supporting the effectiveness of our method.\n","authors":["François Rozet","Gérôme Andry","François Lanusse","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2405.13712v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10435v1","updated":"2024-11-15T18:56:00Z","published":"2024-11-15T18:56:00Z","title":"The Spatial Complexity of Optical Computing and How to Reduce It","summary":" Similar to algorithms, which consume time and memory to run, hardware\nrequires resources to function. 
For devices processing physical waves,\nimplementing operations needs sufficient \"space,\" as dictated by wave physics.\nHow much space is needed to perform a certain function is a fundamental\nquestion in optics, with recent research addressing it for given mathematical\noperations, but not for more general computing tasks, e.g., classification.\nInspired by computational complexity theory, we study the \"spatial complexity\"\nof optical computing systems in terms of scaling laws - specifically, how their\nphysical dimensions must scale as the dimension of the mathematical operation\nincreases - and propose a new paradigm for designing optical computing systems:\nspace-efficient neuromorphic optics, based on structural sparsity constraints\nand neural pruning methods motivated by wave physics (notably, the concept of\n\"overlapping nonlocality\"). On two mainstream platforms, free-space optics and\non-chip integrated photonics, our methods demonstrate substantial size\nreductions (to 1%-10% the size of conventional designs) with minimal compromise\non performance. Our theoretical and computational results reveal a trend of\ndiminishing returns on accuracy as structure dimensions increase, providing a\nnew perspective for interpreting and approaching the ultimate limits of optical\ncomputing - a balanced trade-off between device size and accuracy.\n","authors":["Yandong Li","Francesco Monticone"],"pdf_url":"https://arxiv.org/pdf/2411.10435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10429v1","updated":"2024-11-15T18:50:53Z","published":"2024-11-15T18:50:53Z","title":"Private Counterfactual Retrieval With Immutable Features","summary":" In a classification task, counterfactual explanations provide the minimum\nchange needed for an input to be classified into a favorable class. 
We consider\nthe problem of privately retrieving the exact closest counterfactual from a\ndatabase of accepted samples while enforcing that certain features of the input\nsample cannot be changed, i.e., they are \\emph{immutable}. An applicant (user)\nwhose feature vector is rejected by a machine learning model wants to retrieve\nthe sample closest to them in the database without altering a private subset of\ntheir features, which constitutes the immutable set. While doing this, the user\nshould keep their feature vector, immutable set and the resulting\ncounterfactual index information-theoretically private from the institution. We\nrefer to this as immutable private counterfactual retrieval (I-PCR) problem\nwhich generalizes PCR to a more practical setting. In this paper, we propose\ntwo I-PCR schemes by leveraging techniques from private information retrieval\n(PIR) and characterize their communication costs. Further, we quantify the\ninformation that the user learns about the database and compare it for the\nproposed schemes.\n","authors":["Shreya Meel","Pasan Dissanayake","Mohamed Nomeir","Sanghamitra Dutta","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2411.10429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10423v1","updated":"2024-11-15T18:43:29Z","published":"2024-11-15T18:43:29Z","title":"Back to Supervision: Boosting Word Boundary Detection through Frame\n Classification","summary":" Speech segmentation at both word and phoneme levels is crucial for various\nspeech processing tasks. It significantly aids in extracting meaningful units\nfrom an utterance, thus enabling the generation of discrete elements. In this\nwork we propose a model-agnostic framework to perform word boundary detection\nin a supervised manner also employing a labels augmentation technique and an\noutput-frame selection strategy. 
We trained and tested on the Buckeye dataset\nand only tested on TIMIT one, using state-of-the-art encoder models, including\npre-trained solutions (Wav2Vec 2.0 and HuBERT), as well as convolutional and\nconvolutional recurrent networks. Our method, with the HuBERT encoder,\nsurpasses the performance of other state-of-the-art architectures, whether\ntrained in supervised or self-supervised settings on the same datasets.\nSpecifically, we achieved F-values of 0.8427 on the Buckeye dataset and 0.7436\non the TIMIT dataset, along with R-values of 0.8489 and 0.7807, respectively.\nThese results establish a new state-of-the-art for both datasets. Beyond the\nimmediate task, our approach offers a robust and efficient preprocessing method\nfor future research in audio tokenization.\n","authors":["Simone Carnemolla","Salvatore Calcagno","Simone Palazzo","Daniela Giordano"],"pdf_url":"https://arxiv.org/pdf/2411.10423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10418v1","updated":"2024-11-15T18:38:18Z","published":"2024-11-15T18:38:18Z","title":"Multiscale Dubuc: A New Similarity Measure for Time Series","summary":" Quantifying similarities between time series in a meaningful way remains a\nchallenge in time series analysis, despite many advances in the field. Most\nreal-world solutions still rely on a few popular measures, such as Euclidean\nDistance (EuD), Longest Common Subsequence (LCSS), and Dynamic Time Warping\n(DTW). The strengths and weaknesses of these measures have been studied\nextensively, and incremental improvements have been proposed. In this study,\nhowever, we present a different similarity measure that fuses the notion of\nDubuc's variation from fractal analysis with the Intersection-over-Union (IoU)\nmeasure which is widely used in object recognition (also known as the Jaccard\nIndex). 
In this proof-of-concept paper, we introduce the Multiscale Dubuc\nDistance (MDD) measure and prove that it is a metric, possessing desirable\nproperties such as the triangle inequality. We use 95 datasets from the UCR\nTime Series Classification Archive to compare MDD's performance with EuD, LCSS,\nand DTW. Our experiments show that MDD's overall success, without any\ncase-specific customization, is comparable to DTW with optimized window sizes\nper dataset. We also highlight several datasets where MDD's performance\nimproves significantly when its single parameter is customized. This\ncustomization serves as a powerful tool for gauging MDD's sensitivity to noise.\nLastly, we show that MDD's running time is linear in the length of the time\nseries, which is crucial for real-world applications involving very large\ndatasets.\n","authors":["Mahsa Khazaei","Azim Ahmadzadeh","Krishna Rukmini Puthucode"],"pdf_url":"https://arxiv.org/pdf/2411.10418v1.pdf","comment":"6 pages, 3 figures, IEEE Big Data 2024"},{"id":"http://arxiv.org/abs/2411.05817v2","updated":"2024-11-15T18:36:30Z","published":"2024-11-01T18:21:37Z","title":"Demo: Multi-Modal Seizure Prediction System","summary":" This demo presents SeizNet, an innovative system for predicting epileptic\nseizures benefiting from a multi-modal sensor network and utilizing Deep\nLearning (DL) techniques. Epilepsy affects approximately 65 million people\nworldwide, many of whom experience drug-resistant seizures. SeizNet aims at\nproviding highly accurate alerts, allowing individuals to take preventive\nmeasures without being disturbed by false alarms. SeizNet uses a combination of\ndata collected through either invasive (intracranial electroencephalogram\n(iEEG)) or non-invasive (electroencephalogram (EEG) and electrocardiogram\n(ECG)) sensors, and processed by advanced DL algorithms that are optimized for\nreal-time inference at the edge, ensuring privacy and minimizing data\ntransmission. 
SeizNet achieves > 97% accuracy in seizure prediction while\nkeeping the size and energy restrictions of an implantable device.\n","authors":["Ali Saeizadeh","Pietro Brach del Prever","Douglas Schonholtz","Raffaele Guida","Emrecan Demirors","Jorge M. Jimenez","Pedram Johari","Tommaso Melodia"],"pdf_url":"https://arxiv.org/pdf/2411.05817v2.pdf","comment":"1 page, 1 figure, Proceedings of the IEEE 20th International\n Conference on Body Sensor Networks (BSN), October 2024"},{"id":"http://arxiv.org/abs/2403.19572v2","updated":"2024-11-15T18:31:46Z","published":"2024-03-28T16:56:39Z","title":"Swarm Characteristics Classification Using Neural Networks","summary":" Understanding the characteristics of swarming autonomous agents is critical\nfor defense and security applications. This article presents a study on using\nsupervised neural network time series classification (NN TSC) to predict key\nattributes and tactics of swarming autonomous agents for military contexts.\nSpecifically, NN TSC is applied to infer two binary attributes - communication\nand proportional navigation - which combine to define four mutually exclusive\nswarm tactics. We identify a gap in literature on using NNs for swarm\nclassification and demonstrate the effectiveness of NN TSC in rapidly deducing\nintelligence about attacking swarms to inform counter-maneuvers. Through\nsimulated swarm-vs-swarm engagements, we evaluate NN TSC performance in terms\nof observation window requirements, noise robustness, and scalability to swarm\nsize. Key findings show NNs can predict swarm behaviors with 97% accuracy using\nshort observation windows of 20 time steps, while also demonstrating graceful\ndegradation down to 80% accuracy under 50% noise, as well as excellent\nscalability to swarm sizes from 10 to 100 agents. These capabilities are\npromising for real-time decision-making support in defense scenarios by rapidly\ninferring insights about swarm behavior.\n","authors":["Donald W. 
Peltier III","Isaac Kaminer","Abram Clark","Marko Orescanin"],"pdf_url":"https://arxiv.org/pdf/2403.19572v2.pdf","comment":"Added funding acknowledgment and author bios"},{"id":"http://arxiv.org/abs/2411.10397v1","updated":"2024-11-15T18:03:52Z","published":"2024-11-15T18:03:52Z","title":"Features that Make a Difference: Leveraging Gradients for Improved\n Dictionary Learning","summary":" Sparse Autoencoders (SAEs) are a promising approach for extracting neural\nnetwork representations by learning a sparse and overcomplete decomposition of\nthe network's internal activations. However, SAEs are traditionally trained\nconsidering only activation values and not the effect those activations have on\ndownstream computations. This limits the information available to learn\nfeatures, and biases the autoencoder towards neglecting features which are\nrepresented with small activation values but strongly influence model outputs.\nTo address this, we introduce Gradient SAEs (g-SAEs), which modify the\n$k$-sparse autoencoder architecture by augmenting the TopK activation function\nto rely on the gradients of the input activation when selecting the $k$\nelements. For a given sparsity level, g-SAEs produce reconstructions that are\nmore faithful to original network performance when propagated through the\nnetwork. Additionally, we find evidence that g-SAEs learn latents that are on\naverage more effective at steering models in arbitrary contexts. By considering\nthe downstream effects of activations, our approach leverages the dual nature\nof neural network features as both $\\textit{representations}$, retrospectively,\nand $\\textit{actions}$, prospectively. 
While previous methods have approached\nthe problem of feature discovery primarily focused on the former aspect, g-SAEs\nrepresent a step towards accounting for the latter as well.\n","authors":["Jeffrey Olmo","Jared Wilson","Max Forsey","Bryce Hepner","Thomas Vin Howe","David Wingate"],"pdf_url":"https://arxiv.org/pdf/2411.10397v1.pdf","comment":"9 pages, 8 figures. Submitted to NAACL 2025"},{"id":"http://arxiv.org/abs/2410.17142v2","updated":"2024-11-15T18:02:00Z","published":"2024-10-22T16:19:13Z","title":"Coniferest: a complete active anomaly detection framework","summary":" We present coniferest, an open source generic purpose active anomaly\ndetection framework written in Python. The package design and implemented\nalgorithms are described. Currently, static outlier detection analysis is\nsupported via the Isolation forest algorithm. Moreover, Active Anomaly\nDiscovery (AAD) and Pineforest algorithms are available to tackle active\nanomaly detection problems. The algorithms and package performance are\nevaluated on a series of synthetic datasets. We also describe a few success\ncases which resulted from applying the package to real astronomical data in\nactive anomaly detection tasks within the SNAD project.\n","authors":["M. V. Kornilov","V. S. Korolev","K. L. Malanchev","A. D. Lavrukhina","E. Russeil","T. A. Semenikhin","E. Gangler","E. E. O. Ishida","M. V. Pruzhinskaya","A. A. Volnova","S. Sreejith"],"pdf_url":"https://arxiv.org/pdf/2410.17142v2.pdf","comment":"13 pages, 1 figure"},{"id":"http://arxiv.org/abs/2410.13986v3","updated":"2024-11-15T17:58:35Z","published":"2024-10-17T19:32:25Z","title":"Recurrent Neural Goodness-of-Fit Test for Time Series","summary":" Time series data are crucial across diverse domains such as finance and\nhealthcare, where accurate forecasting and decision-making rely on advanced\nmodeling techniques. 
While generative models have shown great promise in\ncapturing the intricate dynamics inherent in time series, evaluating their\nperformance remains a major challenge. Traditional evaluation metrics fall\nshort due to the temporal dependencies and potential high dimensionality of the\nfeatures. In this paper, we propose the REcurrent NeurAL (RENAL)\nGoodness-of-Fit test, a novel and statistically rigorous framework for\nevaluating generative time series models. By leveraging recurrent neural\nnetworks, we transform the time series into conditionally independent data\npairs, enabling the application of a chi-square-based goodness-of-fit test to\nthe temporal dependencies within the data. This approach offers a robust,\ntheoretically grounded solution for assessing the quality of generative models,\nparticularly in settings with limited time sequences. We demonstrate the\nefficacy of our method across both synthetic and real-world datasets,\noutperforming existing methods in terms of reliability and accuracy. Our method\nfills a critical gap in the evaluation of time series generative models,\noffering a tool that is both practical and adaptable to high-stakes\napplications.\n","authors":["Aoran Zhang","Wenbin Zhou","Liyan Xie","Shixiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.13986v3.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.10389v1","updated":"2024-11-15T17:50:46Z","published":"2024-11-15T17:50:46Z","title":"Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets\n Using Key Point Localization","summary":" Internal crack detection has been a subject of focus in structural health\nmonitoring. By focusing on crack detection in structural datasets, it is\ndemonstrated that deep learning (DL) methods can effectively analyze seismic\nwave fields interacting with micro-scale cracks, which are beyond the\nresolution of conventional visual inspection. 
This work explores a novel\napplication of DL-based key point detection technique, where cracks are\nlocalized by predicting the coordinates of four key points that define a\nbounding region of the crack. The study not only opens new research directions\nfor non-visual applications but also effectively mitigates the impact of\nimbalanced data which poses a challenge for previous DL models, as it can be\nbiased toward predicting the majority class (non-crack regions). Popular DL\ntechniques, such as the Inception blocks, are used and investigated. The model\nshows an overall reduction in loss when applied to micro-scale crack detection\nand is reflected in the lower average deviation between the location of actual\nand predicted cracks, with an average Intersection over Union (IoU) being 0.511\nfor all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro\ncracks (greater than 4 micrometers).\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10385v1","updated":"2024-11-15T17:48:06Z","published":"2024-11-15T17:48:06Z","title":"Low-Latency Task-Oriented Communications with Multi-Round, Multi-Task\n Deep Learning","summary":" In this paper, we address task-oriented (or goal-oriented) communications\nwhere an encoder at the transmitter learns compressed latent representations of\ndata, which are then transmitted over a wireless channel. At the receiver, a\ndecoder performs a machine learning task, specifically for classifying the\nreceived signals. The deep neural networks corresponding to the encoder-decoder\npair are jointly trained, taking both channel and data characteristics into\naccount. Our objective is to achieve high accuracy in completing the underlying\ntask while minimizing the number of channel uses determined by the encoder's\noutput size. 
To this end, we propose a multi-round, multi-task learning (MRMTL)\napproach for the dynamic update of channel uses in multi-round transmissions.\nThe transmitter incrementally sends an increasing number of encoded samples\nover the channel based on the feedback from the receiver, and the receiver\nutilizes the signals from a previous round to enhance the task performance,\nrather than only considering the latest transmission. This approach employs\nmulti-task learning to jointly optimize accuracy across varying number of\nchannel uses, treating each configuration as a distinct task. By evaluating the\nconfidence of the receiver in task decisions, MRMTL decides on whether to\nallocate additional channel uses in multiple rounds. We characterize both the\naccuracy and the delay (total number of channel uses) of MRMTL, demonstrating\nthat it achieves the accuracy close to that of conventional methods requiring\nlarge numbers of channel uses, but with reduced delay by incorporating signals\nfrom a prior round. We consider the CIFAR-10 dataset, convolutional neural\nnetwork architectures, and AWGN and Rayleigh channel models for performance\nevaluation. We show that MRMTL significantly improves the efficiency of\ntask-oriented communications, balancing accuracy and latency effectively.\n","authors":["Yalin E. Sagduyu","Tugba Erpek","Aylin Yener","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2411.10385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10383v1","updated":"2024-11-15T17:46:42Z","published":"2024-11-15T17:46:42Z","title":"Framework for Co-distillation Driven Federated Learning to Address Class\n Imbalance in Healthcare","summary":" Federated Learning (FL) is a pioneering approach in distributed machine\nlearning, enabling collaborative model training across multiple clients while\nretaining data privacy. 
However, the inherent heterogeneity due to imbalanced\nresource representations across multiple clients poses significant challenges,\noften introducing bias towards the majority class. This issue is particularly\nprevalent in healthcare settings, where hospitals acting as clients share\nmedical images. To address class imbalance and reduce bias, we propose a\nco-distillation driven framework in a federated healthcare setting. Unlike\ntraditional federated setups with a designated server client, our framework\npromotes knowledge sharing among clients to collectively improve learning\noutcomes. Our experiments demonstrate that in a federated healthcare setting,\nco-distillation outperforms other federated methods in handling class\nimbalance. Additionally, we demonstrate that our framework has the least\nstandard deviation with increasing imbalance while outperforming other\nbaselines, signifying the robustness of our framework for FL in healthcare.\n","authors":["Suraj Racha","Shubh Gupta","Humaira Firdowse","Aastik Solanki","Ganesh Ramakrishnan","Kshitij S. Jadhav"],"pdf_url":"https://arxiv.org/pdf/2411.10383v1.pdf","comment":"Accepted at CODS COMAD'24 and to be published in the Discover Data\n Journal(https://link.springer.com/journal/44248)"},{"id":"http://arxiv.org/abs/2410.23472v2","updated":"2024-11-15T17:18:57Z","published":"2024-10-30T21:32:56Z","title":"Risk Sources and Risk Management Measures in Support of Standards for\n General-Purpose AI Systems","summary":" There is an urgent need to identify both short and long-term risks from newly\nemerging types of Artificial Intelligence (AI), as well as available risk\nmanagement measures. In response, and to support global efforts in regulating\nAI and writing safety standards, we compile an extensive catalog of risk\nsources and risk management measures for general-purpose AI (GPAI) systems,\ncomplete with descriptions and supporting examples where relevant. 
This work\ninvolves identifying technical, operational, and societal risks across model\ndevelopment, training, and deployment stages, as well as surveying established\nand experimental methods for managing these risks. To the best of our\nknowledge, this paper is the first of its kind to provide extensive\ndocumentation of both GPAI risk sources and risk management measures that are\ndescriptive, self-contained and neutral with respect to any existing regulatory\nframework. This work intends to help AI providers, standards experts,\nresearchers, policymakers, and regulators in identifying and mitigating\nsystemic risks from GPAI systems. For this reason, the catalog is released\nunder a public domain license for ease of direct use by stakeholders in AI\ngovernance and standards.\n","authors":["Rokas Gipiškis","Ayrton San Joaquin","Ze Shen Chin","Adrian Regenfuß","Ariel Gil","Koen Holtman"],"pdf_url":"https://arxiv.org/pdf/2410.23472v2.pdf","comment":"92 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.10367v1","updated":"2024-11-15T17:17:06Z","published":"2024-11-15T17:17:06Z","title":"Continual Adversarial Reinforcement Learning (CARL) of False Data\n Injection detection: forgetting and explainability","summary":" False data injection attacks (FDIAs) on smart inverters are a growing concern\nlinked to increased renewable energy production. While data-based FDIA\ndetection methods are also actively developed, we show that they remain\nvulnerable to impactful and stealthy adversarial examples that can be crafted\nusing Reinforcement Learning (RL). We propose to include such adversarial\nexamples in data-based detection training procedure via a continual adversarial\nRL (CARL) approach. This way, one can pinpoint the deficiencies of data-based\ndetection, thereby offering explainability during their incremental\nimprovement. 
We show that a continual learning implementation is subject to\ncatastrophic forgetting, and additionally show that forgetting can be addressed\nby employing a joint training strategy on all generated FDIA scenarios.\n","authors":["Pooja Aslami","Kejun Chen","Timothy M. Hansen","Malik Hassanaly"],"pdf_url":"https://arxiv.org/pdf/2411.10367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10356v1","updated":"2024-11-15T17:05:33Z","published":"2024-11-15T17:05:33Z","title":"Weakly-Supervised Multimodal Learning on MIMIC-CXR","summary":" Multimodal data integration and label scarcity pose significant challenges\nfor machine learning in medical settings. To address these issues, we conduct\nan in-depth evaluation of the newly proposed Multimodal Variational\nMixture-of-Experts (MMVM) VAE on the challenging MIMIC-CXR dataset. Our\nanalysis demonstrates that the MMVM VAE consistently outperforms other\nmultimodal VAEs and fully supervised approaches, highlighting its strong\npotential for real-world medical applications.\n","authors":["Andrea Agostini","Daphné Chopard","Yang Meng","Norbert Fortin","Babak Shahbaba","Stephan Mandt","Thomas M. Sutter","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2411.10356v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 13 pages. arXiv\n admin note: text overlap with arXiv:2403.05300"},{"id":"http://arxiv.org/abs/2311.13060v3","updated":"2024-11-15T16:55:47Z","published":"2023-11-21T23:49:51Z","title":"Training Deep 3D Convolutional Neural Networks to Extract BSM Physics\n Parameters Directly from HEP Data: a Proof-of-Concept Study Using Monte Carlo\n Simulations","summary":" We report on a novel application of computer vision techniques to extract\nbeyond the Standard Model parameters directly from high energy physics flavor\ndata. 
We propose a simple but novel data representation that transforms the\nangular and kinematic distributions into \"quasi-images\", which are used to\ntrain a convolutional neural network to perform regression tasks, similar to\nfitting. As a proof-of-concept, we train a 34-layer Residual Neural Network to\nregress on these images and determine information about the Wilson Coefficient\n$C_{9}$ in Monte Carlo simulations of $B^0 \\rightarrow K^{*0}\\mu^{+}\\mu^{-}$\ndecays. The method described here can be generalized and may find applicability\nacross a variety of experiments.\n","authors":["S. Dubey","T. E. Browder","S. Kohani","R. Mandal","A. Sibidanov","R. Sinha"],"pdf_url":"https://arxiv.org/pdf/2311.13060v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10345v1","updated":"2024-11-15T16:45:08Z","published":"2024-11-15T16:45:08Z","title":"Comparative Analysis of Machine Learning Approaches for Bone Age\n Assessment: A Comprehensive Study on Three Distinct Models","summary":" Radiologists and doctors make use of X-ray images of the non-dominant hands\nof children and infants to assess the possibility of genetic conditions and\ngrowth abnormalities. This is done by assessing the difference between the\nactual extent of growth found using the X-rays and the chronological age of the\nsubject. The assessment was done conventionally using The Greulich Pyle (GP) or\nTanner Whitehouse (TW) approach. These approaches require a high level of\nexpertise and may often lead to observer bias. Hence, to automate the process\nof assessing the X-rays, and to increase its accuracy and efficiency, several\nmachine learning models have been developed. 
These machine-learning models have\nseveral differences in their accuracy and efficiencies, leading to an unclear\nchoice for the suitable model depending on their needs and available resources.\nMethods: In this study, we have analyzed the 3 most widely used models for the\nautomation of bone age prediction, which are the Xception model, VGG model and\nCNN model. These models were trained on the preprocessed dataset and the\naccuracy was measured using the MAE in terms of months for each model. Using\nthis, the comparison between the models was done. Results: The 3 models,\nXception, VGG, and CNN models have been tested for accuracy and other relevant\nfactors.\n","authors":["Nandavardhan R.","Somanathan R.","Vikram Suresh","Savaridassan P"],"pdf_url":"https://arxiv.org/pdf/2411.10345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10337v1","updated":"2024-11-15T16:36:21Z","published":"2024-11-15T16:36:21Z","title":"On the Cost of Model-Serving Frameworks: An Experimental Evaluation","summary":" In machine learning (ML), the inference phase is the process of applying\npre-trained models to new, unseen data with the objective of making\npredictions. During the inference phase, end-users interact with ML services to\ngain insights, recommendations, or actions based on the input data. For this\nreason, serving strategies are nowadays crucial for deploying and managing\nmodels in production environments effectively. These strategies ensure that\nmodels are available, scalable, reliable, and performant for real-world\napplications, such as time series forecasting, image classification, natural\nlanguage processing, and so on. In this paper, we evaluate the performances of\nfive widely-used model serving frameworks (TensorFlow Serving, TorchServe,\nMLServer, MLflow, and BentoML) under four different scenarios (malware\ndetection, cryptocoin prices forecasting, image classification, and sentiment\nanalysis). 
We demonstrate that TensorFlow Serving is able to outperform all the\nother frameworks in serving deep learning (DL) models. Moreover, we show that\nDL-specific frameworks (TensorFlow Serving and TorchServe) display\nsignificantly lower latencies than the three general-purpose ML frameworks\n(BentoML, MLFlow, and MLServer).\n","authors":["Pasquale De Rosa","Yérom-David Bromberg","Pascal Felber","Djob Mvondo","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2411.10337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10325v1","updated":"2024-11-15T16:28:03Z","published":"2024-11-15T16:28:03Z","title":"Bitcoin Research with a Transaction Graph Dataset","summary":" Bitcoin, launched in 2008 by Satoshi Nakamoto, established a new digital\neconomy where value can be stored and transferred in a fully decentralized\nmanner - alleviating the need for a central authority. This paper introduces a\nlarge scale dataset in the form of a transactions graph representing\ntransactions between Bitcoin users along with a set of tasks and baselines. The\ngraph includes 252 million nodes and 785 million edges, covering a time span of\nnearly 13 years of and 670 million transactions. Each node and edge is\ntimestamped. As for supervised tasks we provide two labeled sets i. a 33,000\nnodes based on entity type and ii. nearly 100,000 Bitcoin addresses labeled\nwith an entity name and an entity type. This is the largest publicly available\ndata set of bitcoin transactions designed to facilitate advanced research and\nexploration in this domain, overcoming the limitations of existing datasets.\nVarious graph neural network models are trained to predict node labels,\nestablishing a baseline for future research. 
In addition, several use cases are\npresented to demonstrate the dataset's applicability beyond Bitcoin analysis.\nFinally, all data and source code is made publicly available to enable\nreproducibility of the results.\n","authors":["Hugo Schnoering","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2411.10325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05818v2","updated":"2024-11-15T16:23:17Z","published":"2024-11-02T12:02:09Z","title":"Open LLMs are Necessary for Current Private Adaptations and Outperform\n their Closed Alternatives","summary":" While open Large Language Models (LLMs) have made significant progress, they\nstill fall short of matching the performance of their closed, proprietary\ncounterparts, making the latter attractive even for the use on highly private\ndata. Recently, various new methods have been proposed to adapt closed LLMs to\nprivate data without leaking private information to third parties and/or the\nLLM provider. In this work, we analyze the privacy protection and performance\nof the four most recent methods for private adaptation of closed LLMs. By\nexamining their threat models and thoroughly comparing their performance under\ndifferent privacy levels according to differential privacy (DP), various LLM\narchitectures, and multiple datasets for classification and generation tasks,\nwe find that: (1) all the methods leak query data, i.e., the (potentially\nsensitive) user data that is queried at inference time, to the LLM provider,\n(2) three out of four methods also leak large fractions of private training\ndata to the LLM provider while the method that protects private data requires a\nlocal open LLM, (3) all the methods exhibit lower performance compared to three\nprivate gradient-based adaptation methods for local open LLMs, and (4) the\nprivate adaptation methods for closed LLMs incur higher monetary training and\nquery costs than running the alternative methods on local open LLMs. 
This\nyields the conclusion that, to achieve truly privacy-preserving LLM adaptations\nthat yield high performance and more privacy at lower costs, taking into\naccount current methods and models, one should use open LLMs.\n","authors":["Vincent Hanke","Tom Blanchard","Franziska Boenisch","Iyiola Emmanuel Olatunji","Michael Backes","Adam Dziedzic"],"pdf_url":"https://arxiv.org/pdf/2411.05818v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.14377v2","updated":"2024-11-15T16:23:15Z","published":"2024-06-20T14:45:13Z","title":"CE-SSL: Computation-Efficient Semi-Supervised Learning for ECG-based\n Cardiovascular Diseases Detection","summary":" The label scarcity problem is the main challenge that hinders the wide\napplication of deep learning systems in automatic cardiovascular diseases\n(CVDs) detection using electrocardiography (ECG). Tuning pre-trained models\nalleviates this problem by transferring knowledge learned from large datasets\nto downstream small datasets. However, bottlenecks in computational efficiency\nand detection performance limit its clinical applications. It is difficult to\nimprove the detection performance without significantly sacrificing the\ncomputational efficiency during model training. Here, we propose a\ncomputation-efficient semi-supervised learning paradigm (CE-SSL) for robust and\ncomputation-efficient CVDs detection using ECG. It enables a robust adaptation\nof pre-trained models on downstream datasets with limited supervision and high\ncomputational efficiency. First, a random-deactivation technique is developed\nto achieve robust and fast low-rank adaptation of pre-trained weights.\nSubsequently, we propose a one-shot rank allocation module to determine the\noptimal ranks for the update matrices of the pre-trained weights. Finally, a\nlightweight semi-supervised learning pipeline is introduced to enhance model\nperformance by leveraging labeled and unlabeled data with high computational\nefficiency. 
Extensive experiments on four downstream datasets demonstrate that\nCE-SSL not only outperforms the state-of-the-art methods in multi-label CVDs\ndetection but also consumes fewer GPU footprints, training time, and parameter\nstorage space. As such, this paradigm provides an effective solution for\nachieving high computational efficiency and robust detection performance in the\nclinical applications of pre-trained models under limited supervision. Code and\nSupplementary Materials are available at https://github.com/KAZABANA/CE-SSL\n","authors":["Rushuang Zhou","Lei Clifton","Zijun Liu","Kannie W. Y. Chan","David A. Clifton","Yuan-Ting Zhang","Yining Dong"],"pdf_url":"https://arxiv.org/pdf/2406.14377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00514v2","updated":"2024-11-15T16:15:46Z","published":"2024-11-01T11:16:37Z","title":"Label Cluster Chains for Multi-Label Classification","summary":" Multi-label classification is a type of supervised machine learning that can\nsimultaneously assign multiple labels to an instance. To solve this task, some\nmethods divide the original problem into several sub-problems (local approach),\nothers learn all labels at once (global approach), and others combine several\nclassifiers (ensemble approach). Regardless of the approach used, exploring and\nlearning label correlations is important to improve the classifier predictions.\nEnsemble of Classifier Chains (ECC) is a well-known multi-label method that\nconsiders label correlations and can achieve good overall performance on\nseveral multi-label datasets and evaluation measures. However, one of the\nchallenges when working with ECC is the high dimensionality of the label space,\nwhich can impose limitations for fully-cascaded chains as the complexity\nincreases regarding feature space expansion. To improve classifier chains, we\npropose a method to chain disjoint correlated label clusters obtained by\napplying a partition method in the label space. 
During the training phase, the\nground truth labels of each cluster are used as new features for all of the\nfollowing clusters. During the test phase, the predicted labels of clusters are\nused as new features for all the following clusters. Our proposal, called Label\nCluster Chains for Multi-Label Classification (LCC-ML), uses multi-label Random\nForests as base classifiers in each cluster, combining their predictions to\nobtain a final multi-label classification. Our proposal obtained better results\ncompared to the original ECC. This shows that learning and chaining disjoint\ncorrelated label clusters can better explore and learn label correlations.\n","authors":["Elaine Cecília Gatto","Felipe Nakano Kenji","Jesse Read","Mauri Ferrandin","Ricardo Cerri","Celine Vens"],"pdf_url":"https://arxiv.org/pdf/2411.00514v2.pdf","comment":"The article was submitted prematurely, and after it was published on\n arXiv, we identified aspects that require attention, adjustments, and\n improvements. We are working to review and significantly improve the content.\n Therefore, we request its temporary withdrawal to avoid the dissemination of\n information that may be incomplete or incorrectly interpreted"},{"id":"http://arxiv.org/abs/2405.14089v2","updated":"2024-11-15T16:08:55Z","published":"2024-05-23T01:34:12Z","title":"Improved Canonicalization for Model Agnostic Equivariance","summary":" This work introduces a novel approach to achieving architecture-agnostic\nequivariance in deep learning, particularly addressing the limitations of\ntraditional layerwise equivariant architectures and the inefficiencies of the\nexisting architecture-agnostic methods. Building equivariant models using\ntraditional methods requires designing equivariant versions of existing models\nand training them from scratch, a process that is both impractical and\nresource-intensive. 
Canonicalization has emerged as a promising alternative for\ninducing equivariance without altering model architecture, but it suffers from\nthe need for highly expressive and expensive equivariant networks to learn\ncanonical orientations accurately. We propose a new optimization-based method\nthat employs any non-equivariant network for canonicalization. Our method uses\ncontrastive learning to efficiently learn a canonical orientation and offers\nmore flexibility for the choice of canonicalization network. We empirically\ndemonstrate that this approach outperforms existing methods in achieving\nequivariance for large pretrained models and significantly speeds up the\ncanonicalization process, making it up to 2 times faster.\n","authors":["Siba Smarak Panigrahi","Arnab Kumar Mondal"],"pdf_url":"https://arxiv.org/pdf/2405.14089v2.pdf","comment":"Accepted to EquiVision workshop, CVPR 2024. 8 pages, 2 figures, 2\n tables"},{"id":"http://arxiv.org/abs/2411.08954v2","updated":"2024-11-15T16:06:23Z","published":"2024-11-13T19:00:02Z","title":"Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply\n Better Samples","summary":" Although diffusion models can generate remarkably high-quality samples, they\nare intrinsically bottlenecked by their expensive iterative sampling procedure.\nConsistency models (CMs) have recently emerged as a promising diffusion model\ndistillation method, reducing the cost of sampling by generating high-fidelity\nsamples in just a few iterations. Consistency model distillation aims to solve\nthe probability flow ordinary differential equation (ODE) defined by an\nexisting diffusion model. CMs are not directly trained to minimize error\nagainst an ODE solver, rather they use a more computationally tractable\nobjective. 
As a way to study how effectively CMs solve the probability flow\nODE, and the effect that any induced error has on the quality of generated\nsamples, we introduce Direct CMs, which \\textit{directly} minimize this error.\nIntriguingly, we find that Direct CMs reduce the ODE solving error compared to\nCMs but also result in significantly worse sample quality, calling into\nquestion why exactly CMs work well in the first place. Full code is available\nat: https://github.com/layer6ai-labs/direct-cms.\n","authors":["Noël Vouitsis","Rasa Hosseinzadeh","Brendan Leigh Ross","Valentin Villecroze","Satya Krishna Gorti","Jesse C. Cresswell","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2411.08954v2.pdf","comment":"NeurIPS 2024 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2403.09871v4","updated":"2024-11-15T16:01:39Z","published":"2024-03-14T21:01:06Z","title":"ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric\n Thermal Images","summary":" Designing egocentric 3D hand pose estimation systems that can perform\nreliably in complex, real-world scenarios is crucial for downstream\napplications. Previous approaches using RGB or NIR imagery struggle in\nchallenging conditions: RGB methods are susceptible to lighting variations and\nobstructions like handwear, while NIR techniques can be disrupted by sunlight\nor interference from other NIR-equipped devices. To address these limitations,\nwe present ThermoHands, the first benchmark focused on thermal image-based\negocentric 3D hand pose estimation, demonstrating the potential of thermal\nimaging to achieve robust performance under these conditions. The benchmark\nincludes a multi-view and multi-spectral dataset collected from 28 subjects\nperforming hand-object and hand-virtual interactions under diverse scenarios,\naccurately annotated with 3D hand poses through an automated process. 
We\nintroduce a new baseline method, TherFormer, utilizing dual transformer modules\nfor effective egocentric 3D hand pose estimation in thermal imagery. Our\nexperimental results highlight TherFormer's leading performance and affirm\nthermal imaging's effectiveness in enabling robust 3D hand pose estimation in\nadverse conditions.\n","authors":["Fangqiang Ding","Yunzhou Zhu","Xiangyu Wen","Gaowen Liu","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.09871v4.pdf","comment":"15 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.10293v1","updated":"2024-11-15T15:51:25Z","published":"2024-11-15T15:51:25Z","title":"RETR: Multi-View Radar Detection Transformer for Indoor Perception","summary":" Indoor radar perception has seen rising interest due to affordable costs\ndriven by emerging automotive imaging radar developments and the benefits of\nreduced privacy concerns and reliability under hazardous conditions (e.g., fire\nand smoke). However, existing radar perception pipelines fail to account for\ndistinctive characteristics of the multi-view radar setting. In this paper, we\npropose Radar dEtection TRansformer (RETR), an extension of the popular DETR\narchitecture, tailored for multi-view radar perception. RETR inherits the\nadvantages of DETR, eliminating the need for hand-crafted components for object\ndetection and segmentation in the image plane. More importantly, RETR\nincorporates carefully designed modifications such as 1) depth-prioritized\nfeature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss\nfrom both radar and camera coordinates; and 3) a learnable radar-to-camera\ntransformation via reparameterization, to account for the unique multi-view\nradar setting. 
Evaluated on two indoor radar perception datasets, our approach\noutperforms existing state-of-the-art methods by a margin of 15.38+ AP for\nobject detection and 11.77+ IoU for instance segmentation, respectively.\n","authors":["Ryoma Yataka","Adriano Cardace","Pu Perry Wang","Petros Boufounos","Ryuhei Takahashi"],"pdf_url":"https://arxiv.org/pdf/2411.10293v1.pdf","comment":"24 pages, Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10290v1","updated":"2024-11-15T15:47:32Z","published":"2024-11-15T15:47:32Z","title":"The ParClusterers Benchmark Suite (PCBS): A Fine-Grained Analysis of\n Scalable Graph Clustering","summary":" We introduce the ParClusterers Benchmark Suite (PCBS) -- a collection of\nhighly scalable parallel graph clustering algorithms and benchmarking tools\nthat streamline comparing different graph clustering algorithms and\nimplementations.\n The benchmark includes clustering algorithms that target a wide range of\nmodern clustering use cases, including community detection, classification, and\ndense subgraph mining.\n The benchmark toolkit makes it easy to run and evaluate multiple instances of\ndifferent clustering algorithms, which can be useful for fine-tuning the\nperformance of clustering on a given task, and for comparing different\nclustering algorithms based on different metrics of interest, including\nclustering quality and running time.\n Using PCBS, we evaluate a broad collection of real-world graph clustering\ndatasets. Somewhat surprisingly, we find that the best quality results are\nobtained by algorithms that not included in many popular graph clustering\ntoolkits. The PCBS provides a standardized way to evaluate and judge the\nquality-performance tradeoffs of the active research area of scalable graph\nclustering algorithms. 
We believe it will help enable fair, accurate, and\nnuanced evaluation of graph clustering algorithms in the future.\n","authors":["Shangdi Yu","Jessica Shi","Jamison Meindl","David Eisenstat","Xiaoen Ju","Sasan Tavakkol","Laxman Dhulipala","Jakub Łącki","Vahab Mirrokni","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2411.10290v1.pdf","comment":"This is a preliminary version of a paper that will appear at VLDB'25"},{"id":"http://arxiv.org/abs/2410.07364v2","updated":"2024-11-15T15:46:00Z","published":"2024-10-09T18:24:23Z","title":"Unlocking Real-Time Fluorescence Lifetime Imaging: Multi-Pixel\n Parallelism for FPGA-Accelerated Processing","summary":" Fluorescence lifetime imaging (FLI) is a widely used technique in the\nbiomedical field for measuring the decay times of fluorescent molecules,\nproviding insights into metabolic states, protein interactions, and\nligand-receptor bindings. However, its broader application in fast biological\nprocesses, such as dynamic activity monitoring, and clinical use, such as in\nguided surgery, is limited by long data acquisition times and computationally\ndemanding data processing. While deep learning has reduced post-processing\ntimes, time-resolved data acquisition remains a bottleneck for real-time\napplications. To address this, we propose a method to achieve real-time FLI\nusing an FPGA-based hardware accelerator. Specifically, we implemented a\nGRU-based sequence-to-sequence (Seq2Seq) model on an FPGA board compatible with\ntime-resolved cameras. The GRU model balances accurate processing with the\nresource constraints of FPGAs, which have limited DSP units and BRAM. The\nlimited memory and computational resources on the FPGA require efficient\nscheduling of operations and memory allocation to deploy deep learning models\nfor low-latency applications. We address these challenges by using STOMP, a\nqueue-based discrete-event simulator that automates and optimizes task\nscheduling and memory management on hardware. 
By integrating a GRU-based\nSeq2Seq model and its compressed version, called Seq2SeqLite, generated through\nknowledge distillation, we were able to process multiple pixels in parallel,\nreducing latency compared to sequential processing. We explore various levels\nof parallelism to achieve an optimal balance between performance and resource\nutilization. Our results indicate that the proposed techniques achieved a 17.7x\nand 52.0x speedup over manual scheduling for the Seq2Seq model and the\nSeq2SeqLite model, respectively.\n","authors":["Ismail Erbas","Aporva Amarnath","Vikas Pandey","Karthik Swaminathan","Naigang Wang","Xavier Intes"],"pdf_url":"https://arxiv.org/pdf/2410.07364v2.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09468v2","updated":"2024-11-15T15:38:17Z","published":"2024-11-14T14:16:50Z","title":"Harnessing Machine Learning for Single-Shot Measurement of Free Electron\n Laser Pulse Power","summary":" Electron beam accelerators are essential in many scientific and technological\nfields. Their operation relies heavily on the stability and precision of the\nelectron beam. Traditional diagnostic techniques encounter difficulties in\naddressing the complex and dynamic nature of electron beams. Particularly in\nthe context of free-electron lasers (FELs), it is fundamentally impossible to\nmeasure the lasing-on and lasingoff electron power profiles for a single\nelectron bunch. This is a crucial hurdle in the exact reconstruction of the\nphoton pulse profile. To overcome this hurdle, we developed a machine learning\nmodel that predicts the temporal power profile of the electron bunch in the\nlasing-off regime using machine parameters that can be obtained when lasing is\non. The model was statistically validated and showed superior predictions\ncompared to the state-of-the-art batch calibrations. 
The work we present here\nis a critical element for a virtual pulse reconstruction diagnostic (VPRD) tool\ndesigned to reconstruct the power profile of individual photon pulses without\nrequiring repeated measurements in the lasing-off regime. This promises to\nsignificantly enhance the diagnostic capabilities in FELs at large.\n","authors":["Till Korten","Vladimir Rybnikov","Mathias Vogt","Juliane Roensch-Schulenburg","Peter Steinbach","Najmeh Mirian"],"pdf_url":"https://arxiv.org/pdf/2411.09468v2.pdf","comment":"10 pages, 4 figures, Machine Learning and the Physical Sciences\n Workshop, NeurIPS 2024 https://neurips.cc/virtual/2024/100009"},{"id":"http://arxiv.org/abs/2411.10281v1","updated":"2024-11-15T15:36:48Z","published":"2024-11-15T15:36:48Z","title":"Multidimensional Byte Pair Encoding: Shortened Sequences for Improved\n Visual Data Generation","summary":" In language processing, transformers benefit greatly from text being\ncondensed. This is achieved through a larger vocabulary that captures word\nfragments instead of plain characters. This is often done with Byte Pair\nEncoding. In the context of images, tokenisation of visual data is usually\nlimited to regular grids obtained from quantisation methods, without global\ncontent awareness. Our work improves tokenisation of visual data by bringing\nByte Pair Encoding from 1D to multiple dimensions, as a complementary add-on to\nexisting compression. We achieve this through counting constellations of token\npairs and replacing the most frequent token pair with a newly introduced token.\nThe multidimensionality only increases the computation time by a factor of 2\nfor images, making it applicable even to large datasets like ImageNet within\nminutes on consumer hardware. This is a lossless preprocessing step. 
Our\nevaluation shows improved training and inference performance of transformers on\nvisual data achieved by compressing frequent constellations of tokens: The\nresulting sequences are shorter, with more uniformly distributed information\ncontent, e.g. condensing empty regions in an image into single tokens. As our\nexperiments show, these condensed sequences are easier to process. We\nadditionally introduce a strategy to amplify this compression further by\nclustering the vocabulary.\n","authors":["Tim Elsner","Paula Usinger","Julius Nehring-Wirxel","Gregor Kobsik","Victor Czech","Yanjiang He","Isaak Lim","Leif Kobbelt"],"pdf_url":"https://arxiv.org/pdf/2411.10281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10272v1","updated":"2024-11-15T15:28:42Z","published":"2024-11-15T15:28:42Z","title":"Scaling Law for Post-training after Model Pruning","summary":" Large language models (LLMs) based on the Transformer architecture are widely\nemployed across various domains and tasks. However, their increasing size\nimposes significant hardware demands, limiting practical deployment. To\nmitigate this, model pruning techniques have been developed to create more\nefficient models while maintaining high performance. Despite this,\npost-training after pruning is crucial for performance recovery and can be\nresource-intensive. This paper investigates the post-training requirements of\npruned LLMs and introduces a scaling law to determine the optimal amount of\npost-training data. 
Post-training experiments with the Llama-3 and Qwen-2.5\nseries models, pruned using depth pruning, width pruning, and 2:4\nsemi-structured pruning, show that higher pruning ratios necessitate more\npost-training data for performance recovery, whereas larger LLMs require less.\nThe proposed scaling law predicts a model's loss based on its parameter counts\nbefore and after pruning, as well as the post-training token counts.\nFurthermore, we find that the scaling law established from smaller LLMs can be\nreliably extrapolated to larger LLMs. This work provides valuable insights into\nthe post-training of pruned LLMs and offers a practical scaling law for\noptimizing post-training data usage.\n","authors":["Xiaodong Chen","Yuxuan Hu","Jing Zhang","Xiaokang Zhang","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10268v1","updated":"2024-11-15T15:18:57Z","published":"2024-11-15T15:18:57Z","title":"Towards Sample-Efficiency and Generalization of Transfer and Inverse\n Reinforcement Learning: A Comprehensive Literature Review","summary":" Reinforcement learning (RL) is a sub-domain of machine learning, mainly\nconcerned with solving sequential decision-making problems by a learning agent\nthat interacts with the decision environment to improve its behavior through\nthe reward it receives from the environment. This learning paradigm is,\nhowever, well-known for being time-consuming due to the necessity of collecting\na large amount of data, making RL suffer from sample inefficiency and difficult\ngeneralization. Furthermore, the construction of an explicit reward function\nthat accounts for the trade-off between multiple desiderata of a decision\nproblem is often a laborious task. These challenges have been recently\naddressed utilizing transfer and inverse reinforcement learning (T-IRL). 
In\nthis regard, this paper is devoted to a comprehensive review of realizing the\nsample efficiency and generalization of RL algorithms through T-IRL. Following\na brief introduction to RL, the fundamental T-IRL methods are presented and the\nmost recent advancements in each research field have been extensively reviewed.\nOur findings denote that a majority of recent research works have dealt with\nthe aforementioned challenges by utilizing human-in-the-loop and sim-to-real\nstrategies for the efficient transfer of knowledge from source domains to the\ntarget domain under the transfer learning scheme. Under the IRL structure,\ntraining schemes that require a low number of experience transitions and\nextension of such frameworks to multi-agent and multi-intention problems have\nbeen the priority of researchers in recent years.\n","authors":["Hossein Hassani","Roozbeh Razavi-Far","Mehrdad Saif","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2411.10268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14551v2","updated":"2024-11-15T15:16:56Z","published":"2024-02-22T13:45:01Z","title":"CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for\n Optimized Learning Fusion","summary":" State-of-the-art pre-trained image models predominantly adopt a two-stage\napproach: initial unsupervised pre-training on large-scale datasets followed by\ntask-specific fine-tuning using Cross-Entropy loss~(CE). However, it has been\ndemonstrated that CE can compromise model generalization and stability. While\nrecent works employing contrastive learning address some of these limitations\nby enhancing the quality of embeddings and producing better decision\nboundaries, they often overlook the importance of hard negative mining and rely\non resource intensive and slow training using large sample batches. To counter\nthese issues, we introduce a novel approach named CLCE, which integrates\nLabel-Aware Contrastive Learning with CE. 
Our approach not only maintains the\nstrengths of both loss functions but also leverages hard negative mining in a\nsynergistic way to enhance performance. Experimental results demonstrate that\nCLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks,\nachieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in\ntransfer learning settings with the BEiT-3 model. Importantly, our proposed\nCLCE approach effectively mitigates the dependency of contrastive learning on\nlarge batch sizes such as 4096 samples per batch, a limitation that has\npreviously constrained the application of contrastive learning in\nbudget-limited hardware environments.\n","authors":["Zijun Long","George Killick","Lipeng Zhuang","Gerardo Aragon-Camarasa","Zaiqiao Meng","Richard Mccreadie"],"pdf_url":"https://arxiv.org/pdf/2402.14551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10258v1","updated":"2024-11-15T15:05:01Z","published":"2024-11-15T15:05:01Z","title":"MDHP-Net: Detecting Injection Attacks on In-vehicle Network using\n Multi-Dimensional Hawkes Process and Temporal Model","summary":" The integration of intelligent and connected technologies in modern vehicles,\nwhile offering enhanced functionalities through Electronic Control Unit and\ninterfaces like OBD-II and telematics, also exposes the vehicle's in-vehicle\nnetwork (IVN) to potential cyberattacks. In this paper, we consider a specific\ntype of cyberattack known as the injection attack. As demonstrated by empirical\ndata from real-world cybersecurity adversarial competitions(available at\nhttps://mimic2024.xctf.org.cn/race/qwmimic2024 ), these injection attacks have\nexcitation effect over time, gradually manipulating network traffic and\ndisrupting the vehicle's normal functioning, ultimately compromising both its\nstability and safety. To profile the abnormal behavior of attackers, we propose\na novel injection attack detector to extract long-term features of attack\nbehavior. 
Specifically, we first provide a theoretical analysis of modeling the\ntime-excitation effects of the attack using Multi-Dimensional Hawkes Process\n(MDHP). A gradient descent solver specifically tailored for MDHP, MDHP-GDS, is\ndeveloped to accurately estimate optimal MDHP parameters. We then propose an\ninjection attack detector, MDHP-Net, which integrates optimal MDHP parameters\nwith MDHP-LSTM blocks to enhance temporal feature extraction. By introducing\nMDHP parameters, MDHP-Net captures complex temporal features that standard Long\nShort-Term Memory (LSTM) cannot, enriching temporal dependencies within our\ncustomized structure. Extensive evaluations demonstrate the effectiveness of\nour proposed detection approach.\n","authors":["Qi Liu","Yanchen Liu","Ruifeng Li","Chenhong Cao","Yufeng Li","Xingyu Li","Peng Wang","Runhan Feng"],"pdf_url":"https://arxiv.org/pdf/2411.10258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10257v1","updated":"2024-11-15T15:04:04Z","published":"2024-11-15T15:04:04Z","title":"The Unreasonable Effectiveness of Guidance for Diffusion Models","summary":" Guidance is an error-correcting technique used to improve the perceptual\nquality of images generated by diffusion models. Typically, the correction is\nachieved by linear extrapolation, using an auxiliary diffusion model that has\nlower performance than the primary model. Using a 2D toy example, we show that\nit is highly beneficial when the auxiliary model exhibits similar errors as the\nprimary one but stronger. We verify this finding in higher dimensions, where we\nshow that competitive generative performance to state-of-the-art guidance\nmethods can be achieved when the auxiliary model differs from the primary one\nonly by having stronger weight regularization. As an independent contribution,\nwe investigate whether upweighting long-range spatial dependencies improves\nvisual fidelity. 
The result is a novel guidance method, which we call sliding\nwindow guidance (SWG), that guides the primary model with itself by\nconstraining its receptive field. Intriguingly, SWG aligns better with human\npreferences than state-of-the-art guidance methods while requiring neither\ntraining, architectural modifications, nor class conditioning. The code will be\nreleased.\n","authors":["Tim Kaiser","Nikolas Adaloglou","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2411.10257v1.pdf","comment":"Preprint. 19 pages, 14 figures in total, including references and\n appendix"},{"id":"http://arxiv.org/abs/2411.10254v1","updated":"2024-11-15T15:02:35Z","published":"2024-11-15T15:02:35Z","title":"Uncertainty in Supply Chain Digital Twins: A Quantum-Classical Hybrid\n Approach","summary":" This study investigates uncertainty quantification (UQ) using\nquantum-classical hybrid machine learning (ML) models for applications in\ncomplex and dynamic fields, such as attaining resiliency in supply chain\ndigital twins and financial risk assessment. Although quantum feature\ntransformations have been integrated into ML models for complex data tasks, a\ngap exists in determining their impact on UQ within their hybrid architectures\n(quantum-classical approach). This work applies existing UQ techniques for\ndifferent models within a hybrid framework, examining how quantum feature\ntransformation affects uncertainty propagation. Increasing qubits from 4 to 16\nshows varied model responsiveness to outlier detection (OD) samples, which is a\ncritical factor for resilient decision-making in dynamic environments. 
This\nwork shows how quantum computing techniques can transform data features for UQ,\nparticularly when combined with traditional methods.\n","authors":["Abdullah Abdullah","Fannya Ratana Sandjaja","Ayesha Abdul Majeed","Gyan Wickremasinghe","Karen Rafferty","Vishal Sharma"],"pdf_url":"https://arxiv.org/pdf/2411.10254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10242v1","updated":"2024-11-15T14:55:01Z","published":"2024-11-15T14:55:01Z","title":"Measuring Non-Adversarial Reproduction of Training Data in Large\n Language Models","summary":" Large language models memorize parts of their training data. Memorizing short\nsnippets and facts is required to answer questions about the world and to be\nfluent in any language. But models have also been shown to reproduce long\nverbatim sequences of memorized text when prompted by a motivated adversary. In\nthis work, we investigate an intermediate regime of memorization that we call\nnon-adversarial reproduction, where we quantify the overlap between model\nresponses and pretraining data when responding to natural and benign prompts.\nFor a variety of innocuous prompt categories (e.g., writing a letter or a\ntutorial), we show that up to 15% of the text output by popular conversational\nlanguage models overlaps with snippets from the Internet. In worst cases, we\nfind generations where 100% of the content can be found exactly online. For the\nsame tasks, we find that human-written text has far less overlap with Internet\ndata. We further study whether prompting strategies can close this reproduction\ngap between models and humans. 
While appropriate prompting can reduce\nnon-adversarial reproduction on average, we find that mitigating worst-case\nreproduction of training data requires stronger defenses -- even for benign\ninteractions.\n","authors":["Michael Aerni","Javier Rando","Edoardo Debenedetti","Nicholas Carlini","Daphne Ippolito","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2411.10242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02604v2","updated":"2024-11-15T14:55:00Z","published":"2024-08-05T16:27:38Z","title":"Learning rheological parameters of non-Newtonian fluids from velocimetry\n data","summary":" We solve a Bayesian inverse Navier-Stokes (N-S) problem that assimilates\nvelocimetry data in order to jointly reconstruct the flow field and learn the\nunknown N-S parameters. By incorporating a Carreau shear-thinning viscosity\nmodel into the N-S problem, we devise an algorithm that learns the most likely\nCarreau parameters of a shear-thinning fluid, and estimates their\nuncertainties, from velocimetry data alone. We then conduct a flow-MRI\nexperiment to obtain velocimetry data of an axisymmetric laminar jet through an\nidealised medical device (FDA nozzle) for a blood analogue fluid. We show that\nthe algorithm can successfully reconstruct the flow field by learning the most\nlikely Carreau parameters, and that the learned parameters are in very good\nagreement with rheometry measurements. The algorithm accepts any algebraic\neffective viscosity model, as long as the model is differentiable, and it can\nbe extended to more complicated non-Newtonian fluids (e.g. Oldroyd-B fluid) if\na viscoelastic model is incorporated into the N-S problem.\n","authors":["Alexandros Kontogiannis","Richard Hodgkinson","Emily L. 
Manchester"],"pdf_url":"https://arxiv.org/pdf/2408.02604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10240v1","updated":"2024-11-15T14:53:34Z","published":"2024-11-15T14:53:34Z","title":"Efficient Neural Hybrid System Learning and Transition System\n Abstraction for Dynamical Systems","summary":" This paper proposes a neural network hybrid modeling framework for dynamics\nlearning to promote an interpretable, computationally efficient way of dynamics\nlearning and system identification. First, a low-level model will be trained to\nlearn the system dynamics, which utilizes multiple simple neural networks to\napproximate the local dynamics generated from data-driven partitions. Then,\nbased on the low-level model, a high-level model will be trained to abstract\nthe low-level neural hybrid system model into a transition system that allows\nComputational Tree Logic Verification to promote the model's ability with human\ninteraction and verification efficiency.\n","authors":["Yejiang Yang","Zihao Mo","Weiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2411.10240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14249v3","updated":"2024-11-15T14:49:21Z","published":"2023-12-21T19:06:34Z","title":"GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for\n High-Throughput Omics Data Analysis and Visualization","summary":" The surge in high-throughput omics data has reshaped the landscape of\nbiological research, underlining the need for powerful, user-friendly data\nanalysis and interpretation tools. This paper presents GenoCraft, a web-based\ncomprehensive software solution designed to handle the entire pipeline of omics\ndata processing. GenoCraft offers a unified platform featuring advanced\nbioinformatics tools, covering all aspects of omics data analysis. It\nencompasses a range of functionalities, such as normalization, quality control,\ndifferential analysis, network analysis, pathway analysis, and diverse\nvisualization techniques. 
This software makes state-of-the-art omics data\nanalysis more accessible to a wider range of users. With GenoCraft, researchers\nand data scientists have access to an array of cutting-edge bioinformatics\ntools under a user-friendly interface, making it a valuable resource for\nmanaging and analyzing large-scale omics data. The API with an interactive web\ninterface is publicly available at https://genocraft.stanford. edu/. We also\nrelease all the codes in https://github.com/futianfan/GenoCraft.\n","authors":["Yingzhou Lu","Minjie Shen","Ling Yue","Chenhao Li","Lulu Chen","Fan Meng","Xiao Wang","David Herrington","Yue Wang","Yue Zhao","Tianfan Fu","Capucine Van Rechem"],"pdf_url":"https://arxiv.org/pdf/2312.14249v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10231v1","updated":"2024-11-15T14:43:58Z","published":"2024-11-15T14:43:58Z","title":"A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image\n Super-Resolution with Transformers and TaylorShift","summary":" Transformer-based Super-Resolution (SR) models have recently advanced image\nreconstruction quality, yet challenges remain due to computational complexity\nand an over-reliance on large patch sizes, which constrain fine-grained detail\nenhancement. In this work, we propose TaylorIR to address these limitations by\nutilizing a patch size of 1x1, enabling pixel-level processing in any\ntransformer-based SR model. 
To address the significant computational demands\nunder the traditional self-attention mechanism, we employ the TaylorShift\nattention mechanism, a memory-efficient alternative based on Taylor series\nexpansion, achieving full token-to-token interactions with linear complexity.\nExperimental results demonstrate that our approach achieves new\nstate-of-the-art SR performance while reducing memory consumption by up to 60%\ncompared to traditional self-attention-based transformers.\n","authors":["Sanath Budakegowdanadoddi Nagaraju","Brian Bernhard Moser","Tobias Christian Nauen","Stanislav Frolov","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.10231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17561v5","updated":"2024-11-15T14:30:43Z","published":"2024-03-26T10:10:53Z","title":"A Survey on State-of-the-art Deep Learning Applications and Challenges","summary":" Deep learning, a branch of artificial intelligence, is a data-driven method\nthat uses multiple layers of interconnected units (neurons) to learn intricate\npatterns and representations directly from raw input data. Empowered by this\nlearning capability, it has become a powerful tool for solving complex problems\nand is the core driver of many groundbreaking technologies and innovations.\nBuilding a deep learning model is challenging due to the algorithm's complexity\nand the dynamic nature of real-world problems. Several studies have reviewed\ndeep learning concepts and applications. However, the studies mostly focused on\nthe types of deep learning models and convolutional neural network\narchitectures, offering limited coverage of the state-of-the-art deep learning\nmodels and their applications in solving complex problems across different\ndomains. Therefore, motivated by the limitations, this study aims to\ncomprehensively review the state-of-the-art deep learning models in computer\nvision, natural language processing, time series analysis and pervasive\ncomputing. 
We highlight the key features of the models and their effectiveness\nin solving the problems within each domain. Furthermore, this study presents\nthe fundamentals of deep learning, various deep learning model types and\nprominent convolutional neural network architectures. Finally, challenges and\nfuture directions in deep learning research are discussed to offer a broader\nperspective for future researchers.\n","authors":["Mohd Halim Mohd Noor","Ayokunle Olalekan Ige"],"pdf_url":"https://arxiv.org/pdf/2403.17561v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17986v2","updated":"2024-11-15T14:22:00Z","published":"2024-09-26T15:56:40Z","title":"Supra-Laplacian Encoding for Transformer on Dynamic Graphs","summary":" Fully connected Graph Transformers (GT) have rapidly become prominent in the\nstatic graph community as an alternative to Message-Passing models, which\nsuffer from a lack of expressivity, oversquashing, and under-reaching. However,\nin a dynamic context, by interconnecting all nodes at multiple snapshots with\nself-attention, GT loose both structural and temporal information. In this\nwork, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs\n(SLATE), a new spatio-temporal encoding to leverage the GT architecture while\nkeeping spatio-temporal information. Specifically, we transform Discrete Time\nDynamic Graphs into multi-layer graphs and take advantage of the spectral\nproperties of their associated supra-Laplacian matrix. Our second contribution\nexplicitly model nodes' pairwise relationships with a cross-attention\nmechanism, providing an accurate edge representation for dynamic link\nprediction. SLATE outperforms numerous state-of-the-art methods based on\nMessage-Passing Graph Neural Networks combined with recurrent models (e.g\nLSTM), and Dynamic Graph Transformers, on 9 datasets. 
Code is available at:\ngithub.com/ykrmm/SLATE.\n","authors":["Yannis Karmim","Marc Lafon","Raphael Fournier S'niehotta","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2409.17986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10214v1","updated":"2024-11-15T14:21:32Z","published":"2024-11-15T14:21:32Z","title":"Machine Learning Algorithms to Assess Site Closure Time Frames for Soil\n and Groundwater Contamination","summary":" Monitored Natural Attenuation (MNA) is gaining prominence as an effective\nmethod for managing soil and groundwater contamination due to its\ncost-efficiency and minimal environmental disruption. Despite its benefits, MNA\nnecessitates extensive groundwater monitoring to ensure that contaminant levels\ndecrease to meet safety standards. This study expands the capabilities of\nPyLEnM, a Python package designed for long-term environmental monitoring, by\nincorporating new algorithms to enhance its predictive and analytical\nfunctionalities. We introduce methods to estimate the timeframe required for\ncontaminants like Sr-90 and I-129 to reach regulatory safety standards using\nlinear regression and to forecast future contaminant levels with the\nBidirectional Long Short-Term Memory (Bi-LSTM) networks. Additionally, Random\nForest regression is employed to identify factors influencing the time to reach\nsafety standards. Our methods are illustrated using data from the Savannah\nRiver Site (SRS) F-Area, where preliminary findings reveal a notable downward\ntrend in contaminant levels, with variability linked to initial concentrations\nand groundwater flow dynamics. The Bi-LSTM model effectively predicts\ncontaminant concentrations for the next four years, demonstrating the potential\nof advanced time series analysis to improve MNA strategies and reduce reliance\non manual groundwater sampling. 
The code, along with its usage instructions,\nvalidation, and requirements, is available at:\nhttps://github.com/csplevuanh/pylenm_extension.\n","authors":["Vu-Anh Le","Haruko Murakami Wainwright","Hansell Gonzalez-Raymat","Carol Eddy-Dilek"],"pdf_url":"https://arxiv.org/pdf/2411.10214v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.10212v1","updated":"2024-11-15T14:17:19Z","published":"2024-11-15T14:17:19Z","title":"Embedding Byzantine Fault Tolerance into Federated Learning via Virtual\n Data-Driven Consistency Scoring Plugin","summary":" Given sufficient data from multiple edge devices, federated learning (FL)\nenables training a shared model without transmitting private data to a central\nserver. However, FL is generally vulnerable to Byzantine attacks from\ncompromised edge devices, which can significantly degrade the model\nperformance. In this paper, we propose a intuitive plugin that can be\nintegrated into existing FL techniques to achieve Byzantine-Resilience. Key\nidea is to generate virtual data samples and evaluate model consistency scores\nacross local updates to effectively filter out compromised edge devices. By\nutilizing this scoring mechanism before the aggregation phase, the proposed\nplugin enables existing FL techniques to become robust against Byzantine\nattacks while maintaining their original benefits. 
Numerical results on medical\nimage classification task validate that plugging the proposed approach into\nrepresentative FL algorithms, effectively achieves Byzantine resilience.\nFurthermore, the proposed plugin maintains the original convergence properties\nof the base FL algorithms when no Byzantine attacks are present.\n","authors":["Youngjoon Lee","Jinu Gong","Joonhyuk Kang"],"pdf_url":"https://arxiv.org/pdf/2411.10212v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.10204v1","updated":"2024-11-15T14:10:52Z","published":"2024-11-15T14:10:52Z","title":"Fused Gromov-Wasserstein Variance Decomposition with Linear Optimal\n Transport","summary":" Wasserstein distances form a family of metrics on spaces of probability\nmeasures that have recently seen many applications. However, statistical\nanalysis in these spaces is complex due to the nonlinearity of Wasserstein\nspaces. One potential solution to this problem is Linear Optimal Transport\n(LOT). This method allows one to find a Euclidean embedding, called LOT\nembedding, of measures in some Wasserstein spaces, but some information is lost\nin this embedding. So, to understand whether statistical analysis relying on\nLOT embeddings can make valid inferences about original data, it is helpful to\nquantify how well these embeddings describe that data. To answer this question,\nwe present a decomposition of the Fr\\'echet variance of a set of measures in\nthe 2-Wasserstein space, which allows one to compute the percentage of variance\nexplained by LOT embeddings of those measures. We then extend this\ndecomposition to the Fused Gromov-Wasserstein setting. We also present several\nexperiments that explore the relationship between the dimension of the LOT\nembedding, the percentage of variance explained by the embedding, and the\nclassification accuracy of machine learning classifiers built on the embedded\ndata. 
We use the MNIST handwritten digits dataset, IMDB-50000 dataset, and\nDiffusion Tensor MRI images for these experiments. Our results illustrate the\neffectiveness of low dimensional LOT embeddings in terms of the percentage of\nvariance explained and the classification accuracy of models built on the\nembedded data.\n","authors":["Michael Wilson","Tom Needham","Anuj Srivastava"],"pdf_url":"https://arxiv.org/pdf/2411.10204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10191v1","updated":"2024-11-15T13:44:37Z","published":"2024-11-15T13:44:37Z","title":"FengWu-W2S: A deep learning model for seamless weather-to-subseasonal\n forecast of global atmosphere","summary":" Seamless forecasting that produces warning information at continuum\ntimescales based on only one system is a long-standing pursuit for\nweather-climate service. While the rapid advancement of deep learning has\ninduced revolutionary changes in classical forecasting field, current efforts\nare still focused on building separate AI models for weather and climate\nforecasts. To explore the seamless forecasting ability based on one AI model,\nwe propose FengWu-Weather to Subseasonal (FengWu-W2S), which builds on the\nFengWu global weather forecast model and incorporates an ocean-atmosphere-land\ncoupling structure along with a diverse perturbation strategy. FengWu-W2S can\ngenerate 6-hourly atmosphere forecasts extending up to 42 days through an\nautoregressive and seamless manner. Our hindcast results demonstrate that\nFengWu-W2S reliably predicts atmospheric conditions out to 3-6 weeks ahead,\nenhancing predictive capabilities for global surface air temperature,\nprecipitation, geopotential height and intraseasonal signals such as the\nMadden-Julian Oscillation (MJO) and North Atlantic Oscillation (NAO). 
Moreover,\nour ablation experiments on forecast error growth from daily to seasonal\ntimescales reveal potential pathways for developing AI-based integrated system\nfor seamless weather-climate forecasting in the future.\n","authors":["Fenghua Ling","Kang Chen","Jiye Wu","Tao Han","Jing-Jia Luo","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2411.10191v1.pdf","comment":"23 pages,8 figures"},{"id":"http://arxiv.org/abs/2403.00321v3","updated":"2024-11-15T13:42:28Z","published":"2024-03-01T06:48:58Z","title":"DEEP-IoT: Downlink-Enhanced Efficient-Power Internet of Things","summary":" At the heart of the Internet of Things (IoT) -- a domain witnessing explosive\ngrowth -- the imperative for energy efficiency and the extension of device\nlifespans has never been more pressing. This paper presents DEEP-IoT, an\ninnovative communication paradigm poised to redefine how IoT devices\ncommunicate. Through a pioneering feedback channel coding strategy, DEEP-IoT\nchallenges and transforms the traditional transmitter (IoT devices)-centric\ncommunication model to one where the receiver (the access point) play a pivotal\nrole, thereby cutting down energy use and boosting device longevity. We not\nonly conceptualize DEEP-IoT but also actualize it by integrating deep\nlearning-enhanced feedback channel codes within a narrow-band system.\nSimulation results show a significant enhancement in the operational lifespan\nof IoT cells -- surpassing traditional systems using Turbo and Polar codes by\nup to 52.71%. 
This leap signifies a paradigm shift in IoT communications,\nsetting the stage for a future where IoT devices boast unprecedented efficiency\nand durability.\n","authors":["Yulin Shao"],"pdf_url":"https://arxiv.org/pdf/2403.00321v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10180v1","updated":"2024-11-15T13:29:44Z","published":"2024-11-15T13:29:44Z","title":"CART: Compositional Auto-Regressive Transformer for Image Generation","summary":" In recent years, image synthesis has achieved remarkable advancements,\nenabling diverse applications in content creation, virtual reality, and beyond.\nWe introduce a novel approach to image generation using Auto-Regressive (AR)\nmodeling, which leverages a next-detail prediction strategy for enhanced\nfidelity and scalability. While AR models have achieved transformative success\nin language modeling, replicating this success in vision tasks has presented\nunique challenges due to the inherent spatial dependencies in images. Our\nproposed method addresses these challenges by iteratively adding finer details\nto an image compositionally, constructing it as a hierarchical combination of\nbase and detail image factors. This strategy is shown to be more effective than\nthe conventional next-token prediction and even surpasses the state-of-the-art\nnext-scale prediction approaches. A key advantage of this method is its\nscalability to higher resolutions without requiring full model retraining,\nmaking it a versatile solution for high-resolution image generation.\n","authors":["Siddharth Roheda"],"pdf_url":"https://arxiv.org/pdf/2411.10180v1.pdf","comment":"under review at CVPR 2025"},{"id":"http://arxiv.org/abs/2411.10175v1","updated":"2024-11-15T13:21:26Z","published":"2024-11-15T13:21:26Z","title":"The Surprising Ineffectiveness of Pre-Trained Visual Representations for\n Model-Based Reinforcement Learning","summary":" Visual Reinforcement Learning (RL) methods often require extensive amounts of\ndata. 
As opposed to model-free RL, model-based RL (MBRL) offers a potential\nsolution with efficient data utilization through planning. Additionally, RL\nlacks generalization capabilities for real-world tasks. Prior work has shown\nthat incorporating pre-trained visual representations (PVRs) enhances sample\nefficiency and generalization. While PVRs have been extensively studied in the\ncontext of model-free RL, their potential in MBRL remains largely unexplored.\nIn this paper, we benchmark a set of PVRs on challenging control tasks in a\nmodel-based RL setting. We investigate the data efficiency, generalization\ncapabilities, and the impact of different properties of PVRs on the performance\nof model-based agents. Our results, perhaps surprisingly, reveal that for MBRL\ncurrent PVRs are not more sample efficient than learning representations from\nscratch, and that they do not generalize better to out-of-distribution (OOD)\nsettings. To explain this, we analyze the quality of the trained dynamics\nmodel. Furthermore, we show that data diversity and network architecture are\nthe most important contributors to OOD generalization performance.\n","authors":["Moritz Schneider","Robert Krug","Narunas Vaskevicius","Luigi Palmieri","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2411.10175v1.pdf","comment":"Published at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/"},{"id":"http://arxiv.org/abs/2411.10154v1","updated":"2024-11-15T12:55:05Z","published":"2024-11-15T12:55:05Z","title":"Continuous Bayesian Model Selection for Multivariate Causal Discovery","summary":" Current causal discovery approaches require restrictive model assumptions or\nassume access to interventional data to ensure structure identifiability. These\nassumptions often do not hold in real-world applications leading to a loss of\nguarantees and poor accuracy in practice. 
Recent work has shown that, in the\nbivariate case, Bayesian model selection can greatly improve accuracy by\nexchanging restrictive modelling for more flexible assumptions, at the cost of\na small probability of error. We extend the Bayesian model selection approach\nto the important multivariate setting by making the large discrete selection\nproblem scalable through a continuous relaxation. We demonstrate how for our\nchoice of Bayesian non-parametric model, the Causal Gaussian Process\nConditional Density Estimator (CGP-CDE), an adjacency matrix can be constructed\nfrom the model hyperparameters. This adjacency matrix is then optimised using\nthe marginal likelihood and an acyclicity regulariser, outputting the maximum a\nposteriori causal graph. We demonstrate the competitiveness of our approach on\nboth synthetic and real-world datasets, showing it is possible to perform\nmultivariate causal discovery without infeasible assumptions using Bayesian\nmodel selection.\n","authors":["Anish Dhir","Ruby Sedgwick","Avinash Kori","Ben Glocker","Mark van der Wilk"],"pdf_url":"https://arxiv.org/pdf/2411.10154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10153v1","updated":"2024-11-15T12:52:02Z","published":"2024-11-15T12:52:02Z","title":"BONE: a unifying framework for Bayesian online learning in\n non-stationary environments","summary":" We propose a unifying framework for methods that perform Bayesian online\nlearning in non-stationary environments. We call the framework BONE, which\nstands for (B)ayesian (O)nline learning in (N)on-stationary (E)nvironments.\nBONE provides a common structure to tackle a variety of problems, including\nonline continual learning, prequential forecasting, and contextual bandits. 
The\nframework requires specifying three modelling choices: (i) a model for\nmeasurements (e.g., a neural network), (ii) an auxiliary process to model\nnon-stationarity (e.g., the time since the last changepoint), and (iii) a\nconditional prior over model parameters (e.g., a multivariate Gaussian). The\nframework also requires two algorithmic choices, which we use to carry out\napproximate inference under this framework: (i) an algorithm to estimate\nbeliefs (posterior distribution) about the model parameters given the auxiliary\nvariable, and (ii) an algorithm to estimate beliefs about the auxiliary\nvariable. We show how this modularity allows us to write many different\nexisting methods as instances of BONE; we also use this framework to propose a\nnew method. We then experimentally compare existing methods with our proposed\nnew method on several datasets; we provide insights into the situations that\nmake one method more suitable than another for a given task.\n","authors":["Gerardo Duran-Martin","Leandro Sánchez-Betancourt","Alexander Y. Shestopaloff","Kevin Murphy"],"pdf_url":"https://arxiv.org/pdf/2411.10153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10152v1","updated":"2024-11-15T12:50:57Z","published":"2024-11-15T12:50:57Z","title":"Causal Time-Series Synchronization for Multi-Dimensional Forecasting","summary":" The process industry's high expectations for Digital Twins require modeling\napproaches that can generalize across tasks and diverse domains with\npotentially different data dimensions and distributional shifts i.e.,\nFoundational Models. 
Despite success in natural language processing and\ncomputer vision, transfer learning with (self-) supervised signals for\npre-training general-purpose models is largely unexplored in the context of\nDigital Twins in the process industry due to challenges posed by\nmulti-dimensional time-series data, lagged cause-effect dependencies, complex\ncausal structures, and varying number of (exogenous) variables. We propose a\nnovel channel-dependent pre-training strategy that leverages synchronized\ncause-effect pairs to overcome these challenges by breaking down the\nmulti-dimensional time-series data into pairs of cause-effect variables. Our\napproach focuses on: (i) identifying highly lagged causal relationships using\ndata-driven methods, (ii) synchronizing cause-effect pairs to generate training\nsamples for channel-dependent pre-training, and (iii) evaluating the\neffectiveness of this approach in channel-dependent forecasting. Our\nexperimental results demonstrate significant improvements in forecasting\naccuracy and generalization capability compared to traditional training\nmethods.\n","authors":["Michael Mayr","Georgios C. Chasparis","Josef Küng"],"pdf_url":"https://arxiv.org/pdf/2411.10152v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2410.14979v5","updated":"2024-11-15T12:46:30Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). 
Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10144v1","updated":"2024-11-15T12:36:01Z","published":"2024-11-15T12:36:01Z","title":"DaYu: Data-Driven Model for Geostationary Satellite Observed Cloud\n Images Forecasting","summary":" In the past few years, Artificial Intelligence (AI)-based weather forecasting\nmethods have widely demonstrated strong competitiveness among the weather\nforecasting systems. However, these methods are insufficient for\nhigh-spatial-resolution short-term nowcasting within 6 hours, which is crucial\nfor warning short-duration, mesoscale and small-scale weather events.\nGeostationary satellite remote sensing provides detailed, high spatio-temporal\nand all-day observations, which can address the above limitations of existing\nmethods. 
Therefore, this paper proposed an advanced data-driven thermal\ninfrared cloud images forecasting model, \"DaYu.\" Unlike existing data-driven\nweather forecasting models, DaYu is specifically designed for geostationary\nsatellite observations, with a temporal resolution of 0.5 hours and a spatial\nresolution of ${0.05}^\\circ$ $\\times$ ${0.05}^\\circ$. DaYu is based on a\nlarge-scale transformer architecture, which enables it to capture fine-grained\ncloud structures and learn fast-changing spatio-temporal evolution features\neffectively. Moreover, its attention mechanism design achieves a balance in\ncomputational complexity, making it practical for applications. DaYu not only\nachieves accurate forecasts up to 3 hours with a correlation coefficient higher\nthan 0.9, 6 hours higher than 0.8, and 12 hours higher than 0.7, but also\ndetects short-duration, mesoscale, and small-scale weather events with enhanced\ndetail, effectively addressing the shortcomings of existing methods in\nproviding detailed short-term nowcasting within 6 hours. Furthermore, DaYu has\nsignificant potential in short-term climate disaster prevention and mitigation.\n","authors":["Xujun Wei","Feng Zhang","Renhe Zhang","Wenwen Li","Cuiping Liu","Bin Guo","Jingwei Li","Haoyang Fu","Xu Tang"],"pdf_url":"https://arxiv.org/pdf/2411.10144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04825v8","updated":"2024-11-15T12:29:07Z","published":"2022-09-11T09:53:14Z","title":"Arithmetical Binary Decision Tree Traversals","summary":" This paper introduces a series of methods for traversing binary decision\ntrees using arithmetic operations. We present a suite of binary tree traversal\nalgorithms that leverage novel representation matrices to flatten the full\nbinary tree structure and embed the aggregated internal node Boolean tests into\na single binary vector. 
Our approach, grounded in maximum inner product search,\noffers new insights into decision tree.\n","authors":["Jinxiong Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.04825v8.pdf","comment":"Correct some citation format and typoes"},{"id":"http://arxiv.org/abs/2407.15527v2","updated":"2024-11-15T12:05:27Z","published":"2024-07-22T10:32:48Z","title":"Interpretable Concept-Based Memory Reasoning","summary":" The lack of transparency in the decision-making processes of deep learning\nsystems presents a significant challenge in modern artificial intelligence\n(AI), as it impairs users' ability to rely on and verify these systems. To\naddress this challenge, Concept Bottleneck Models (CBMs) have made significant\nprogress by incorporating human-interpretable concepts into deep learning\narchitectures. This approach allows predictions to be traced back to specific\nconcept patterns that users can understand and potentially intervene on.\nHowever, existing CBMs' task predictors are not fully interpretable, preventing\na thorough analysis and any form of formal verification of their\ndecision-making process prior to deployment, thereby raising significant\nreliability concerns. To bridge this gap, we introduce Concept-based Memory\nReasoner (CMR), a novel CBM designed to provide a human-understandable and\nprovably-verifiable task prediction process. Our approach is to model each task\nprediction as a neural selection mechanism over a memory of learnable logic\nrules, followed by a symbolic evaluation of the selected rule. The presence of\nan explicit memory and the symbolic evaluation allow domain experts to inspect\nand formally verify the validity of certain global properties of interest for\nthe task prediction process. 
Experimental results demonstrate that CMR achieves\nbetter accuracy-interpretability trade-offs to state-of-the-art CBMs, discovers\nlogic rules consistent with ground truths, allows for rule interventions, and\nallows pre-deployment verification.\n","authors":["David Debot","Pietro Barbiero","Francesco Giannini","Gabriele Ciravegna","Michelangelo Diligenti","Giuseppe Marra"],"pdf_url":"https://arxiv.org/pdf/2407.15527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22591v2","updated":"2024-11-15T12:02:15Z","published":"2024-10-29T23:10:01Z","title":"FGCE: Feasible Group Counterfactual Explanations for Auditing Fairness","summary":" This paper introduces the first graph-based framework for generating group\ncounterfactual explanations to audit model fairness, a crucial aspect of\ntrustworthy machine learning. Counterfactual explanations are instrumental in\nunderstanding and mitigating unfairness by revealing how inputs should change\nto achieve a desired outcome. Our framework, named Feasible Group\nCounterfactual Explanations (FGCEs), captures real-world feasibility\nconstraints and constructs subgroups with similar counterfactuals, setting it\napart from existing methods. It also addresses key trade-offs in counterfactual\ngeneration, including the balance between the number of counterfactuals, their\nassociated costs, and the breadth of coverage achieved. To evaluate these\ntrade-offs and assess fairness, we propose measures tailored to group\ncounterfactual generation. 
Our experimental results on benchmark datasets\ndemonstrate the effectiveness of our approach in managing feasibility\nconstraints and trade-offs, as well as the potential of our proposed metrics in\nidentifying and quantifying fairness issues.\n","authors":["Christos Fragkathoulas","Vasiliki Papanikou","Evaggelia Pitoura","Evimaria Terzi"],"pdf_url":"https://arxiv.org/pdf/2410.22591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10129v1","updated":"2024-11-15T12:01:38Z","published":"2024-11-15T12:01:38Z","title":"Prompting and Fine-tuning Large Language Models for Automated Code\n Review Comment Generation","summary":" Generating accurate code review comments remains a significant challenge due\nto the inherently diverse and non-unique nature of the task output. Large\nlanguage models pretrained on both programming and natural language data tend\nto perform well in code-oriented tasks. However, large-scale pretraining is not\nalways feasible due to its environmental impact and project-specific\ngeneralizability issues. In this work, first we fine-tune open-source Large\nlanguage models (LLM) in parameter-efficient, quantized low-rank (QLoRA)\nfashion on consumer-grade hardware to improve review comment generation. Recent\nstudies demonstrate the efficacy of augmenting semantic metadata information\ninto prompts to boost performance in other code-related tasks. To explore this\nin code review activities, we also prompt proprietary, closed-source LLMs\naugmenting the input code patch with function call graphs and code summaries.\nBoth of our strategies improve the review comment generation performance, with\nfunction call graph augmented few-shot prompting on the GPT-3.5 model\nsurpassing the pretrained baseline by around 90% BLEU-4 score on the\nCodeReviewer dataset. Moreover, few-shot prompted Gemini-1.0 Pro, QLoRA\nfine-tuned Code Llama and Llama 3.1 models achieve competitive results (ranging\nfrom 25% to 83% performance improvement) on this task. 
An additional human\nevaluation study further validates our experimental findings, reflecting\nreal-world developers' perceptions of LLM-generated code review comments based\non relevant qualitative metrics.\n","authors":["Md. Asif Haider","Ayesha Binte Mostofa","Sk. Sabit Bin Mosaddek","Anindya Iqbal","Toufique Ahmed"],"pdf_url":"https://arxiv.org/pdf/2411.10129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10128v1","updated":"2024-11-15T12:01:03Z","published":"2024-11-15T12:01:03Z","title":"On the Universal Statistical Consistency of Expansive Hyperbolic Deep\n Convolutional Neural Networks","summary":" The emergence of Deep Convolutional Neural Networks (DCNNs) has been a\npervasive tool for accomplishing widespread applications in computer vision.\nDespite its potential capability to capture intricate patterns inside the data,\nthe underlying embedding space remains Euclidean and primarily pursues\ncontractive convolution. Several instances can serve as a precedent for the\nexacerbating performance of DCNNs. The recent advancement of neural networks in\nthe hyperbolic spaces gained traction, incentivizing the development of\nconvolutional deep neural networks in the hyperbolic space. In this work, we\npropose Hyperbolic DCNN based on the Poincar\\'{e} Disc. The work predominantly\nrevolves around analyzing the nature of expansive convolution in the context of\nthe non-Euclidean domain. We further offer extensive theoretical insights\npertaining to the universal consistency of the expansive convolution in the\nhyperbolic space. Several simulations were performed not only on the synthetic\ndatasets but also on some real-world datasets. 
The experimental results reveal\nthat the hyperbolic convolutional architecture outperforms the Euclidean ones\nby a commendable margin.\n","authors":["Sagar Ghosh","Kushal Bose","Swagatam Das"],"pdf_url":"https://arxiv.org/pdf/2411.10128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03864v2","updated":"2024-11-15T11:51:10Z","published":"2024-07-04T11:53:51Z","title":"Adversarial Robustness of VAEs across Intersectional Subgroups","summary":" Despite advancements in Autoencoders (AEs) for tasks like dimensionality\nreduction, representation learning and data generation, they remain vulnerable\nto adversarial attacks. Variational Autoencoders (VAEs), with their\nprobabilistic approach to disentangling latent spaces, show stronger resistance\nto such perturbations compared to deterministic AEs; however, their resilience\nagainst adversarial inputs is still a concern. This study evaluates the\nrobustness of VAEs against non-targeted adversarial attacks by optimizing\nminimal sample-specific perturbations to cause maximal damage across diverse\ndemographic subgroups (combinations of age and gender). We investigate two\nquestions: whether there are robustness disparities among subgroups, and what\nfactors contribute to these disparities, such as data scarcity and\nrepresentation entanglement. Our findings reveal that robustness disparities\nexist but are not always correlated with the size of the subgroup. 
By using\ndownstream gender and age classifiers and examining latent embeddings, we\nhighlight the vulnerability of subgroups like older women, who are prone to\nmisclassification due to adversarial perturbations pushing their\nrepresentations toward those of other subgroups.\n","authors":["Chethan Krishnamurthy Ramanaik","Arjun Roy","Eirini Ntoutsi"],"pdf_url":"https://arxiv.org/pdf/2407.03864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10125v1","updated":"2024-11-15T11:48:14Z","published":"2024-11-15T11:48:14Z","title":"Energy-GNoME: A Living Database of Selected Materials for Energy\n Applications","summary":" Artificial Intelligence (AI) in materials science is driving significant\nadvancements in the discovery of advanced materials for energy applications.\nThe recent GNoME protocol identifies over 380,000 novel stable crystals. From\nthis, we identify over 33,000 materials with potential as energy materials\nforming the Energy-GNoME database. Leveraging Machine Learning (ML) and Deep\nLearning (DL) tools, our protocol mitigates cross-domain data bias using\nfeature spaces to identify potential candidates for thermoelectric materials,\nnovel battery cathodes, and novel perovskites. Classifiers with both structural\nand compositional features identify domains of applicability, where we expect\nenhanced accuracy of the regressors. Such regressors are trained to predict key\nmaterials properties like, thermoelectric figure of merit (zT), band gap (Eg),\nand cathode voltage ($\\Delta V_c$). 
This method significantly narrows the pool\nof potential candidates, serving as an efficient guide for experimental and\ncomputational chemistry investigations and accelerating the discovery of\nmaterials suited for electricity generation, energy storage and conversion.\n","authors":["Paolo De Angelis","Giovanni Trezza","Giulio Barletta","Pietro Asinari","Eliodoro Chiavazzo"],"pdf_url":"https://arxiv.org/pdf/2411.10125v1.pdf","comment":"60 pages, 16 figures"},{"id":"http://arxiv.org/abs/2406.02269v2","updated":"2024-11-15T11:41:56Z","published":"2024-06-04T12:47:13Z","title":"Graph Neural Networks Do Not Always Oversmooth","summary":" Graph neural networks (GNNs) have emerged as powerful tools for processing\nrelational data in applications. However, GNNs suffer from the problem of\noversmoothing, the property that the features of all nodes exponentially\nconverge to the same vector over layers, prohibiting the design of deep GNNs.\nIn this work we study oversmoothing in graph convolutional networks (GCNs) by\nusing their Gaussian process (GP) equivalence in the limit of infinitely many\nhidden features. By generalizing methods from conventional deep neural networks\n(DNNs), we can describe the distribution of features at the output layer of\ndeep GCNs in terms of a GP: as expected, we find that typical parameter choices\nfrom the literature lead to oversmoothing. The theory, however, allows us to\nidentify a new, non-oversmoothing phase: if the initial weights of the network\nhave sufficiently large variance, GCNs do not oversmooth, and node features\nremain informative even at large depth. We demonstrate the validity of this\nprediction in finite-size GCNs by training a linear classifier on their output.\nMoreover, using the linearization of the GCN GP, we generalize the concept of\npropagation depth of information from DNNs to GCNs. This propagation depth\ndiverges at the transition between the oversmoothing and non-oversmoothing\nphase. 
We test the predictions of our approach and find good agreement with\nfinite-size GCNs. Initializing GCNs near the transition to the\nnon-oversmoothing phase, we obtain networks which are both deep and expressive.\n","authors":["Bastian Epping","Alexandre René","Moritz Helias","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2406.02269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10109v1","updated":"2024-11-15T11:14:34Z","published":"2024-11-15T11:14:34Z","title":"Generative Agent Simulations of 1,000 People","summary":" The promise of human behavioral simulation--general-purpose computational\nagents that replicate human behavior across domains--could enable broad\napplications in policymaking and social science. We present a novel agent\narchitecture that simulates the attitudes and behaviors of 1,052 real\nindividuals--applying large language models to qualitative interviews about\ntheir lives, then measuring how well these agents replicate the attitudes and\nbehaviors of the individuals that they represent. The generative agents\nreplicate participants' responses on the General Social Survey 85% as\naccurately as participants replicate their own answers two weeks later, and\nperform comparably in predicting personality traits and outcomes in\nexperimental replications. Our architecture reduces accuracy biases across\nracial and ideological groups compared to agents given demographic\ndescriptions. This work provides a foundation for new tools that can help\ninvestigate individual and collective behavior.\n","authors":["Joon Sung Park","Carolyn Q. Zou","Aaron Shaw","Benjamin Mako Hill","Carrie Cai","Meredith Ringel Morris","Robb Willer","Percy Liang","Michael S. 
Bernstein"],"pdf_url":"https://arxiv.org/pdf/2411.10109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11626v3","updated":"2024-11-15T11:01:19Z","published":"2024-07-16T11:41:35Z","title":"Dynamic Dimension Wrapping (DDW) Algorithm: A Novel Approach for\n Efficient Cross-Dimensional Search in Dynamic Multidimensional Spaces","summary":" To effectively search for the optimal motion template in dynamic\nmultidimensional space, this paper proposes a novel optimization algorithm,\nDynamic Dimension Wrapping (DDW).The algorithm combines Dynamic Time Warping\n(DTW) and Euclidean distance, and designs a fitness function that adapts to\ndynamic multidimensional space by establishing a time-data chain mapping across\ndimensions. This paper also proposes a novel update mechanism,Optimal Dimension\nCollection (ODC), combined with the search strategy of traditional optimization\nalgorithms, enables DDW to adjust both the dimension values and the number of\ndimensions of the population individuals simultaneously. In this way, DDW\nsignificantly reduces computational complexity and improves search accuracy.\nExperimental results show that DDW performs excellently in dynamic\nmultidimensional space, outperforming 31 traditional optimization algorithms.\nThis algorithm provides a novel approach to solving dynamic multidimensional\noptimization problems and demonstrates broad application potential in fields\nsuch as motion data analysis.\n","authors":["Dongnan Jin","Yali Liu","Qiuzhi Song","Xunju Ma","Yue Liu","Dehao Wu"],"pdf_url":"https://arxiv.org/pdf/2407.11626v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10101v1","updated":"2024-11-15T10:53:40Z","published":"2024-11-15T10:53:40Z","title":"Recent Advances on Machine Learning-aided DSP for Short-reach and\n Long-haul Optical Communications","summary":" In this paper, we highlight recent advances in the use of machine learning\nfor implementing equalizers for optical communications. 
We highlight both\nalgorithmic advances as well as implementation aspects using conventional and\nneuromorphic hardware.\n","authors":["Laurent Schmalen","Vincent Lauinger","Jonas Ney","Norbert Wehn","Patrick Matalla","Sebastian Randel","Alexander von Bank","Eike-Manuel Edelmann"],"pdf_url":"https://arxiv.org/pdf/2411.10101v1.pdf","comment":"paper accompanying an invited presentation at OFC 2025"},{"id":"http://arxiv.org/abs/2411.09510v2","updated":"2024-11-15T10:47:37Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10096v1","updated":"2024-11-15T10:44:29Z","published":"2024-11-15T10:44:29Z","title":"Neural Port-Hamiltonian Models for Nonlinear Distributed Control: An\n Unconstrained Parametrization Approach","summary":" The control of large-scale cyber-physical systems requires optimal\ndistributed policies relying solely on limited communication with neighboring\nagents. However, computing stabilizing controllers for nonlinear systems while\noptimizing complex costs remains a significant challenge. 
Neural Networks\n(NNs), known for their expressivity, can be leveraged to parametrize control\npolicies that yield good performance. However, NNs' sensitivity to small input\nchanges poses a risk of destabilizing the closed-loop system. Many existing\napproaches enforce constraints on the controllers' parameter space to guarantee\nclosed-loop stability, leading to computationally expensive optimization\nprocedures. To address these problems, we leverage the framework of\nport-Hamiltonian systems to design continuous-time distributed control policies\nfor nonlinear systems that guarantee closed-loop stability and finite\n$\\mathcal{L}_2$ or incremental $\\mathcal{L}_2$ gains, independent of the\noptimzation parameters of the controllers. This eliminates the need to\nconstrain parameters during optimization, allowing the use of standard\ntechniques such as gradient-based methods. Additionally, we discuss\ndiscretization schemes that preserve the dissipation properties of these\ncontrollers for implementation on embedded systems. The effectiveness of the\nproposed distributed controllers is demonstrated through consensus control of\nnon-holonomic mobile robots subject to collision avoidance and averaged voltage\nregulation with weighted power sharing in DC microgrids.\n","authors":["Muhammad Zakwan","Giancarlo Ferrari-Trecate"],"pdf_url":"https://arxiv.org/pdf/2411.10096v1.pdf","comment":"The paper has 15 pages, and has been submitted for a possible\n publication. arXiv admin note: text overlap with arXiv:2403.17785"},{"id":"http://arxiv.org/abs/2402.04298v4","updated":"2024-11-15T10:35:03Z","published":"2024-02-06T15:53:49Z","title":"Multi-View Symbolic Regression","summary":" Symbolic regression (SR) searches for analytical expressions representing the\nrelationship between a set of explanatory and response variables. 
Current SR\nmethods assume a single dataset extracted from a single experiment.\nNevertheless, frequently, the researcher is confronted with multiple sets of\nresults obtained from experiments conducted with different setups. Traditional\nSR methods may fail to find the underlying expression since the parameters of\neach experiment can be different. In this work we present Multi-View Symbolic\nRegression (MvSR), which takes into account multiple datasets simultaneously,\nmimicking experimental environments, and outputs a general parametric solution.\nThis approach fits the evaluated expression to each independent dataset and\nreturns a parametric family of functions f(x; theta) simultaneously capable of\naccurately fitting all datasets. We demonstrate the effectiveness of MvSR using\ndata generated from known expressions, as well as real-world data from\nastronomy, chemistry and economy, for which an a priori analytical expression\nis not available. Results show that MvSR obtains the correct expression more\nfrequently and is robust to hyperparameters change. In real-world data, it is\nable to grasp the group behavior, recovering known expressions from the\nliterature as well as promising alternatives, thus enabling the use of SR to a\nlarge range of experimental scenarios.\n","authors":["Etienne Russeil","Fabrício Olivetti de França","Konstantin Malanchev","Bogdan Burlacu","Emille E. O. Ishida","Marion Leroux","Clément Michelin","Guillaume Moinard","Emmanuel Gangler"],"pdf_url":"https://arxiv.org/pdf/2402.04298v4.pdf","comment":"Published in GECCO-2024. 11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.10087v1","updated":"2024-11-15T10:16:38Z","published":"2024-11-15T10:16:38Z","title":"PFML: Self-Supervised Learning of Time-Series Data Without\n Representation Collapse","summary":" Self-supervised learning (SSL) is a data-driven learning approach that\nutilizes the innate structure of the data to guide the learning process. 
In\ncontrast to supervised learning, which depends on external labels, SSL utilizes\nthe inherent characteristics of the data to produce its own supervisory signal.\nHowever, one frequent issue with SSL methods is representation collapse, where\nthe model outputs a constant input-invariant feature representation. This issue\nhinders the potential application of SSL methods to new data modalities, as\ntrying to avoid representation collapse wastes researchers' time and effort.\nThis paper introduces a novel SSL algorithm for time-series data called\nPrediction of Functionals from Masked Latents (PFML). Instead of predicting\nmasked input signals or their latent representations directly, PFML operates by\npredicting statistical functionals of the input signal corresponding to masked\nembeddings, given a sequence of unmasked embeddings. The algorithm is designed\nto avoid representation collapse, rendering it straightforwardly applicable to\ndifferent time-series data domains, such as novel sensor modalities in clinical\ndata. We demonstrate the effectiveness of PFML through complex, real-life\nclassification tasks across three different data modalities: infant posture and\nmovement classification from multi-sensor inertial measurement unit data,\nemotion recognition from speech data, and sleep stage classification from EEG\ndata. 
The results show that PFML is superior to a conceptually similar\npre-existing SSL method and competitive against the current state-of-the-art\nSSL method, while also being conceptually simpler and without suffering from\nrepresentation collapse.\n","authors":["Einari Vaaras","Manu Airaksinen","Okko Räsänen"],"pdf_url":"https://arxiv.org/pdf/2411.10087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09476v2","updated":"2024-11-15T10:09:33Z","published":"2024-11-14T14:31:52Z","title":"Graph Neural Networks and Differential Equations: A hybrid approach for\n data assimilation of fluid flows","summary":" This study presents a novel hybrid approach that combines Graph Neural\nNetworks (GNNs) with Reynolds-Averaged Navier Stokes (RANS) equations to\nenhance the accuracy of mean flow reconstruction across a range of fluid\ndynamics applications. Traditional purely data-driven Neural Networks (NNs)\nmodels, often struggle maintaining physical consistency. Moreover, they\ntypically require large datasets to achieve reliable performances. The GNN\nframework, which naturally handles unstructured data such as complex geometries\nin Computational Fluid Dynamics (CFD), is here integrated with RANS equations\nas a physical baseline model. The methodology leverages the adjoint method,\nenabling the use of RANS-derived gradients as optimization terms in the GNN\ntraining process. This ensures that the learned model adheres to the governing\nphysics, maintaining physical consistency while improving the prediction\naccuracy. We test our approach on multiple CFD scenarios, including cases\ninvolving generalization with respect to the Reynolds number, sparse\nmeasurements, denoising and inpainting of missing portions of the mean flow.\nThe results demonstrate significant improvements in the accuracy of the\nreconstructed mean flow compared to purely data-driven models, using limited\namounts of data in the training dataset. 
The key strengths of this study are\nthe integration of physical laws into the training process of the GNN, and the\nability to achieve high-accuracy predictions with a limited amount of data,\nmaking this approach particularly valuable for applications in fluid dynamics\nwhere data is often scarce.\n","authors":["M. Quattromini","M. A. Bucci","S. Cherubini","O. Semeraro"],"pdf_url":"https://arxiv.org/pdf/2411.09476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15171v4","updated":"2024-11-15T09:41:26Z","published":"2024-02-23T08:07:54Z","title":"Towards Efficient and Optimal Covariance-Adaptive Algorithms for\n Combinatorial Semi-Bandits","summary":" We address the problem of stochastic combinatorial semi-bandits, where a\nplayer selects among P actions from the power set of a set containing d base\nitems. Adaptivity to the problem's structure is essential in order to obtain\noptimal regret upper bounds. As estimating the coefficients of a covariance\nmatrix can be manageable in practice, leveraging them should improve the\nregret. We design \"optimistic\" covariance-adaptive algorithms relying on online\nestimations of the covariance structure, called OLS-UCB-C and COS-V (only the\nvariances for the latter). They both yields improved gap-free regret. Although\nCOS-V can be slightly suboptimal, it improves on computational complexity by\ntaking inspiration from ThompsonSampling approaches. 
It is the first\nsampling-based algorithm satisfying a T^1/2 gap-free regret (up to poly-logs).\nWe also show that in some cases, our approach efficiently leverages the\nsemi-bandit feedback and outperforms bandit feedback approaches, not only in\nexponential regimes where P >> d but also when P <= d, which is not covered by\nexisting analyses.\n","authors":["Julien Zhou","Pierre Gaillard","Thibaud Rahier","Houssam Zenati","Julyan Arbel"],"pdf_url":"https://arxiv.org/pdf/2402.15171v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10071v1","updated":"2024-11-15T09:34:28Z","published":"2024-11-15T09:34:28Z","title":"Evidential Federated Learning for Skin Lesion Image Classification","summary":" We introduce FedEvPrompt, a federated learning approach that integrates\nprinciples of evidential deep learning, prompt tuning, and knowledge\ndistillation for distributed skin lesion classification. FedEvPrompt leverages\ntwo sets of prompts: b-prompts (for low-level basic visual knowledge) and\nt-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision\nTransformer (ViT) models trained in an evidential learning framework to\nmaximize class evidences. Crucially, knowledge sharing across federation\nclients is achieved only through knowledge distillation on attention maps\ngenerated by the local ViT models, ensuring enhanced privacy preservation\ncompared to traditional parameter or synthetic image sharing methodologies.\nFedEvPrompt is optimized within a round-based learning paradigm, where each\nround involves training local models followed by attention maps sharing with\nall federation clients. Experimental validation conducted in a real distributed\nsetting, on the ISIC2019 dataset, demonstrates the superior performance of\nFedEvPrompt against baseline federated learning algorithms and knowledge\ndistillation methods, without sharing model parameters. 
In conclusion,\nFedEvPrompt offers a promising approach for federated learning, effectively\naddressing challenges such as data heterogeneity, imbalance, privacy\npreservation, and knowledge sharing.\n","authors":["Rutger Hendrix","Federica Proietto Salanitri","Concetto Spampinato","Simone Palazzo","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2411.10071v1.pdf","comment":"Published as a conference paper at ICPR 2024"},{"id":"http://arxiv.org/abs/2410.15658v2","updated":"2024-11-15T09:34:23Z","published":"2024-10-21T05:56:31Z","title":"Calibration of ordinal regression networks","summary":" Recent studies have shown that deep neural networks are not well-calibrated\nand often produce over-confident predictions. The miscalibration issue\nprimarily stems from using cross-entropy in classifications, which aims to\nalign predicted softmax probabilities with one-hot labels. In ordinal\nregression tasks, this problem is compounded by an additional challenge: the\nexpectation that softmax probabilities should exhibit unimodal distribution is\nnot met with cross-entropy. The ordinal regression literature has focused on\nlearning orders and overlooked calibration. To address both issues, we propose\na novel loss function that introduces order-aware calibration, ensuring that\nprediction confidence adheres to ordinal relationships between classes. It\nincorporates soft ordinal encoding and order-aware regularization to enforce\nboth calibration and unimodality. 
Extensive experiments across three popular\nordinal regression benchmarks demonstrate that our approach achieves\nstate-of-the-art calibration without compromising accuracy.\n","authors":["Daehwan Kim","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2410.15658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06740v2","updated":"2024-11-15T09:31:52Z","published":"2024-11-11T06:25:13Z","title":"Dockformer: A transformer-based molecular docking paradigm for\n large-scale virtual screening","summary":" Molecular docking enables virtual screening of compound libraries to identify\npotential ligands that target proteins of interest, a crucial step in drug\ndevelopment; however, as the size of the compound library increases, the\ncomputational complexity of traditional docking models increases. Deep learning\nalgorithms can provide data-driven research and development models to increase\nthe speed of the docking process. Unfortunately, few models can achieve\nsuperior screening performance compared to that of traditional models.\nTherefore, a novel deep learning-based docking approach named Dockformer is\nintroduced in this study. Dockformer leverages multimodal information to\ncapture the geometric topology and structural knowledge of molecules and can\ndirectly generate binding conformations with the corresponding confidence\nmeasures in an end-to-end manner. The experimental results show that Dockformer\nachieves success rates of 90.53\\% and 82.71\\% on the PDBbind core set and\nPoseBusters benchmarks, respectively, and more than a 100-fold increase in the\ninference process speed, outperforming almost all state-of-the-art docking\nmethods. In addition, the ability of Dockformer to identify the main protease\ninhibitors of coronaviruses is demonstrated in a real-world virtual screening\nscenario. 
Considering its high docking accuracy and screening efficiency,\nDockformer can be regarded as a powerful and robust tool in the field of drug\ndesign.\n","authors":["Zhangfan Yang","Junkai Ji","Shan He","Jianqiang Li","Ruibin Bai","Zexuan Zhu","Yew Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.06740v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.10064v1","updated":"2024-11-15T09:28:55Z","published":"2024-11-15T09:28:55Z","title":"Adaptive Physics-Guided Neural Network","summary":" This paper introduces an adaptive physics-guided neural network (APGNN)\nframework for predicting quality attributes from image data by integrating\nphysical laws into deep learning models. The APGNN adaptively balances\ndata-driven and physics-informed predictions, enhancing model accuracy and\nrobustness across different environments. Our approach is evaluated on both\nsynthetic and real-world datasets, with comparisons to conventional data-driven\nmodels such as ResNet. For the synthetic data, 2D domains were generated using\nthree distinct governing equations: the diffusion equation, the\nadvection-diffusion equation, and the Poisson equation. Non-linear\ntransformations were applied to these domains to emulate complex physical\nprocesses in image form.\n In real-world experiments, the APGNN consistently demonstrated superior\nperformance in the diverse thermal image dataset. On the cucumber dataset,\ncharacterized by low material diversity and controlled conditions, APGNN and\nPGNN showed similar performance, both outperforming the data-driven ResNet.\nHowever, in the more complex thermal dataset, particularly for outdoor\nmaterials with higher environmental variability, APGNN outperformed both PGNN\nand ResNet by dynamically adjusting its reliance on physics-based versus\ndata-driven insights. This adaptability allowed APGNN to maintain robust\nperformance across structured, low-variability settings and more heterogeneous\nscenarios. 
These findings underscore the potential of adaptive physics-guided\nlearning to integrate physical constraints effectively, even in challenging\nreal-world contexts with diverse environmental conditions.\n","authors":["David Shulman","Itai Dattner"],"pdf_url":"https://arxiv.org/pdf/2411.10064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10063v1","updated":"2024-11-15T09:26:00Z","published":"2024-11-15T09:26:00Z","title":"Federated Domain Generalization via Prompt Learning and Aggregation","summary":" Federated domain generalization (FedDG) aims to improve the global model\ngeneralization in unseen domains by addressing data heterogeneity under\nprivacy-preserving constraints. A common strategy in existing FedDG studies\ninvolves sharing domain-specific knowledge among clients, such as spectrum\ninformation, class prototypes, and data styles. However, this knowledge is\nextracted directly from local client samples, and sharing such sensitive\ninformation poses a potential risk of data leakage, which might not fully meet\nthe requirements of FedDG. In this paper, we introduce prompt learning to adapt\npre-trained vision-language models (VLMs) in the FedDG scenario, and leverage\nlocally learned prompts as a more secure bridge to facilitate knowledge\ntransfer among clients. Specifically, we propose a novel FedDG framework\nthrough Prompt Learning and AggregatioN (PLAN), which comprises two training\nstages to collaboratively generate local prompts and global prompts at each\nfederated round. First, each client performs both text and visual prompt\nlearning using their own data, with local prompts indirectly synchronized by\nregarding the global prompts as a common reference. Second, all domain-specific\nlocal prompts are exchanged among clients and selectively aggregated into the\nglobal prompts using lightweight attention-based aggregators. The global\nprompts are finally applied to adapt VLMs to unseen target domains. 
As our PLAN\nframework requires training only a limited number of prompts and lightweight\naggregators, it offers notable advantages in computational and communication\nefficiency for FedDG. Extensive experiments demonstrate the superior\ngeneralization ability of PLAN across four benchmark datasets.\n","authors":["Shuai Gong","Chaoran Cui","Chunyun Zhang","Wenna Wang","Xiushan Nie","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.10063v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.10058v1","updated":"2024-11-15T09:21:54Z","published":"2024-11-15T09:21:54Z","title":"Unsupervised Congestion Status Identification Using LMP Data","summary":" Having a better understanding of how locational marginal prices (LMPs) change\nhelps in price forecasting and market strategy making. This paper investigates\nthe fundamental distribution of the congestion part of LMPs in high-dimensional\nEuclidean space using an unsupervised approach. LMP models based on the\nlossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the\noverlapping subspace property of the LMP data. The congestion part of LMPs is\nspanned by certain row vectors of the power transfer distribution factor (PTDF)\nmatrix, and the subspace attributes of an LMP vector uniquely are found to\nreflect the instantaneous congestion status of all the transmission lines. The\nproposed method searches for the basis vectors that span the subspaces of\ncongestion LMP data in hierarchical ways. In the bottom-up search, the data\nbelonging to 1-dimensional subspaces are detected, and other data are projected\non the orthogonal subspaces. This procedure is repeated until all the basis\nvectors are found or the basis gap appears. Top-down searching is used to\naddress the basis gap by hyperplane detection with outliers. Once all the basis\nvectors are detected, the congestion status can be identified. 
Numerical\nexperiments based on the IEEE 30-bus system, IEEE 118-bus system, Illinois\n200-bus system, and Southwest Power Pool are conducted to show the performance\nof the proposed method.\n","authors":["Kedi Zheng","Qixin Chen","Yi Wang","Chongqing Kang","Le Xie"],"pdf_url":"https://arxiv.org/pdf/2411.10058v1.pdf","comment":"Paper accepted for IEEE Transactions on Smart Grid. Personal use of\n this material is permitted. Permission from IEEE must be obtained for all\n other uses"},{"id":"http://arxiv.org/abs/2411.10057v1","updated":"2024-11-15T09:20:46Z","published":"2024-11-15T09:20:46Z","title":"KuaiFormer: Transformer-Based Retrieval at Kuaishou","summary":" In large-scale content recommendation systems, retrieval serves as the\ninitial stage in the pipeline, responsible for selecting thousands of candidate\nitems from billions of options to pass on to ranking modules. Traditionally,\nthe dominant retrieval method has been Embedding-Based Retrieval (EBR) using a\nDeep Neural Network (DNN) dual-tower structure. However, applying transformer\nin retrieval tasks has been the focus of recent research, though real-world\nindustrial deployment still presents significant challenges. In this paper, we\nintroduce KuaiFormer, a novel transformer-based retrieval framework deployed in\na large-scale content recommendation system. KuaiFormer fundamentally redefines\nthe retrieval process by shifting from conventional score estimation tasks\n(such as click-through rate estimate) to a transformer-driven Next Action\nPrediction paradigm. This shift enables more effective real-time interest\nacquisition and multi-interest extraction, significantly enhancing retrieval\nperformance. KuaiFormer has been successfully integrated into Kuaishou App's\nshort-video recommendation system since May 2024, serving over 400 million\ndaily active users and resulting in a marked increase in average daily usage\ntime of Kuaishou users. 
We provide insights into both the technical and\nbusiness aspects of deploying transformer in large-scale recommendation\nsystems, addressing practical challenges encountered during industrial\nimplementation. Our findings offer valuable guidance for engineers and\nresearchers aiming to leverage transformer models to optimize large-scale\ncontent recommendation systems.\n","authors":["Chi Liu","Jiangxia Cao","Rui Huang","Kai Zheng","Qiang Luo","Kun Gai","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.10057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10053v1","updated":"2024-11-15T09:11:10Z","published":"2024-11-15T09:11:10Z","title":"That Chip Has Sailed: A Critique of Unfounded Skepticism Around AI for\n Chip Design","summary":" In 2020, we introduced a deep reinforcement learning method capable of\ngenerating superhuman chip layouts, which we then published in Nature and\nopen-sourced on GitHub. AlphaChip has inspired an explosion of work on AI for\nchip design, and has been deployed in state-of-the-art chips across Alphabet\nand extended by external chipmakers. Even so, a non-peer-reviewed invited paper\nat ISPD 2023 questioned its performance claims, despite failing to run our\nmethod as described in Nature. For example, it did not pre-train the RL method\n(removing its ability to learn from prior experience), used substantially fewer\ncompute resources (20x fewer RL experience collectors and half as many GPUs),\ndid not train to convergence (standard practice in machine learning), and\nevaluated on test cases that are not representative of modern chips. Recently,\nIgor Markov published a meta-analysis of three papers: our peer-reviewed Nature\npaper, the non-peer-reviewed ISPD paper, and Markov's own unpublished paper\n(though he does not disclose that he co-authored it). 
Although AlphaChip has\nalready achieved widespread adoption and impact, we publish this response to\nensure that no one is wrongly discouraged from innovating in this impactful\narea.\n","authors":["Anna Goldie","Azalia Mirhoseini","Jeff Dean"],"pdf_url":"https://arxiv.org/pdf/2411.10053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10050v1","updated":"2024-11-15T09:05:03Z","published":"2024-11-15T09:05:03Z","title":"Jal Anveshak: Prediction of fishing zones using fine-tuned LlaMa 2","summary":" In recent years, the global and Indian government efforts in monitoring and\ncollecting data related to the fisheries industry have witnessed significant\nadvancements. Despite this wealth of data, there exists an untapped potential\nfor leveraging artificial intelligence based technological systems to benefit\nIndian fishermen in coastal areas. To fill this void in the Indian technology\necosystem, the authors introduce Jal Anveshak. This is an application framework\nwritten in Dart and Flutter that uses a Llama 2 based Large Language Model\nfine-tuned on pre-processed and augmented government data related to fishing\nyield and availability. 
Its main purpose is to help Indian fishermen safely get\nthe maximum yield of fish from coastal areas and to resolve their fishing\nrelated queries in multilingual and multimodal ways.\n","authors":["Arnav Mejari","Maitreya Vaghulade","Paarshva Chitaliya","Arya Telang","Lynette D'mello"],"pdf_url":"https://arxiv.org/pdf/2411.10050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10048v1","updated":"2024-11-15T08:55:31Z","published":"2024-11-15T08:55:31Z","title":"Physics-informed neural networks need a physicist to be accurate: the\n case of mass and heat transport in Fischer-Tropsch catalyst particles","summary":" Physics-Informed Neural Networks (PINNs) have emerged as an influential\ntechnology, merging the swift and automated capabilities of machine learning\nwith the precision and dependability of simulations grounded in theoretical\nphysics. PINNs are often employed to solve algebraic or differential equations\nto replace some or even all steps of multi-stage computational workflows,\nleading to their significant speed-up. However, wide adoption of PINNs is still\nhindered by reliability issues, particularly at extreme ends of the input\nparameter ranges. In this study, we demonstrate this in the context of a system\nof coupled non-linear differential reaction-diffusion and heat transfer\nequations related to Fischer-Tropsch synthesis, which are solved by a\nfinite-difference method with a PINN used in evaluating their source terms. It\nis shown that the testing strategies traditionally used to assess the accuracy\nof neural networks as function approximators can overlook the peculiarities\nwhich ultimately cause instabilities of the finite-difference solver. We\npropose a domain knowledge-based modifications to the PINN architecture\nensuring its correct asymptotic behavior. 
When combined with an improved\nnumerical scheme employed as an initial guess generator, the proposed\nmodifications are shown to recover the overall stability of the simulations,\nwhile preserving the speed-up brought by PINN as the workflow component. We\ndiscuss the possible applications of the proposed hybrid transport equation\nsolver in context of chemical reactors simulations.\n","authors":["Tymofii Nikolaienko","Harshil Patel","Aniruddha Panda","Subodh Madhav Joshi","Stanislav Jaso","Kaushic Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2411.10048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21745v2","updated":"2024-11-15T08:54:31Z","published":"2024-10-29T05:18:34Z","title":"A Dual Adaptive Assignment Approach for Robust Graph-Based Clustering","summary":" Graph clustering is an essential aspect of network analysis that involves\ngrouping nodes into separate clusters. Recent developments in deep learning\nhave resulted in advanced deep graph clustering techniques, which have proven\neffective in many applications. Nonetheless, these methods often encounter\ndifficulties when dealing with the complexities of real-world graphs,\nparticularly in the presence of noisy edges. Additionally, many denoising graph\nclustering strategies tend to suffer from lower performance compared to their\nnon-denoised counterparts, training instability, and challenges in scaling to\nlarge datasets. To tackle these issues, we introduce a new framework called the\nDual Adaptive Assignment Approach for Robust Graph-Based Clustering (RDSA).\nRDSA consists of three key components: (i) a node embedding module that\neffectively integrates the graph's topological features and node attributes;\n(ii) a structure-based soft assignment module that improves graph modularity by\nutilizing an affinity matrix for node assignments; and (iii) a node-based soft\nassignment module that identifies community landmarks and refines node\nassignments to enhance the model's robustness. 
We assess RDSA on various\nreal-world datasets, demonstrating its superior performance relative to\nexisting state-of-the-art methods. Our findings indicate that RDSA provides\nrobust clustering across different graph types, excelling in clustering\neffectiveness and robustness, including adaptability to noise, stability, and\nscalability.\n","authors":["Yang Xiang","Li Fan","Tulika Saha","Xiaoying Pang","Yushan Pan","Haiyang Zhang","Chengtao Ji"],"pdf_url":"https://arxiv.org/pdf/2410.21745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09312v2","updated":"2024-11-15T08:17:22Z","published":"2024-11-14T09:38:58Z","title":"Approximate Probabilistic Inference for Time-Series Data A Robust Latent\n Gaussian Model With Temporal Awareness","summary":" The development of robust generative models for highly varied non-stationary\ntime series data is a complex yet important problem. Traditional models for\ntime series data prediction, such as Long Short-Term Memory (LSTM), are\ninefficient and generalize poorly as they cannot capture complex temporal\nrelationships. In this paper, we present a probabilistic generative model that\ncan be trained to capture temporal information, and that is robust to data\nerrors. We call it Time Deep Latent Gaussian Model (tDLGM). Its novel\narchitecture is inspired by Deep Latent Gaussian Model (DLGM). Our model is\ntrained to minimize a loss function based on the negative log loss. One\ncontributing factor to Time Deep Latent Gaussian Model (tDLGM) robustness is\nour regularizer, which accounts for data trends. 
Experiments conducted show\nthat tDLGM is able to reconstruct and generate complex time series data, and\nthat it is robust against to noise and faulty data.\n","authors":["Anton Johansson","Arunselvan Ramaswamy"],"pdf_url":"https://arxiv.org/pdf/2411.09312v2.pdf","comment":"New revision added a space between \"for\" and \"Time-Series\" in the\n title"},{"id":"http://arxiv.org/abs/2405.18816v3","updated":"2024-11-15T08:10:51Z","published":"2024-05-29T06:56:12Z","title":"Flow Priors for Linear Inverse Problems via Iterative Corrupted\n Trajectory Matching","summary":" Generative models based on flow matching have attracted significant attention\nfor their simplicity and superior performance in high-resolution image\nsynthesis. By leveraging the instantaneous change-of-variables formula, one can\ndirectly compute image likelihoods from a learned flow, making them enticing\ncandidates as priors for downstream tasks such as inverse problems. In\nparticular, a natural approach would be to incorporate such image probabilities\nin a maximum-a-posteriori (MAP) estimation problem. A major obstacle, however,\nlies in the slow computation of the log-likelihood, as it requires\nbackpropagating through an ODE solver, which can be prohibitively slow for\nhigh-dimensional problems. In this work, we propose an iterative algorithm to\napproximate the MAP estimator efficiently to solve a variety of linear inverse\nproblems. Our algorithm is mathematically justified by the observation that the\nMAP objective can be approximated by a sum of $N$ ``local MAP'' objectives,\nwhere $N$ is the number of function evaluations. By leveraging Tweedie's\nformula, we show that we can perform gradient steps to sequentially optimize\nthese objectives. We validate our approach for various linear inverse problems,\nsuch as super-resolution, deblurring, inpainting, and compressed sensing, and\ndemonstrate that we can outperform other methods based on flow matching. 
Code\nis available at https://github.com/YasminZhang/ICTM.\n","authors":["Yasi Zhang","Peiyu Yu","Yaxuan Zhu","Yingshan Chang","Feng Gao","Ying Nian Wu","Oscar Leong"],"pdf_url":"https://arxiv.org/pdf/2405.18816v3.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10023v1","updated":"2024-11-15T08:09:28Z","published":"2024-11-15T08:09:28Z","title":"Model Inversion Attacks: A Survey of Approaches and Countermeasures","summary":" The success of deep neural networks has driven numerous research studies and\napplications from Euclidean to non-Euclidean data. However, there are\nincreasing concerns about privacy leakage, as these networks rely on processing\nprivate data. Recently, a new type of privacy attack, the model inversion\nattacks (MIAs), aims to extract sensitive features of private data for training\nby abusing access to a well-trained model. The effectiveness of MIAs has been\ndemonstrated in various domains, including images, texts, and graphs. These\nattacks highlight the vulnerability of neural networks and raise awareness\nabout the risk of privacy leakage within the research community. Despite the\nsignificance, there is a lack of systematic studies that provide a\ncomprehensive overview and deeper insights into MIAs across different domains.\nThis survey aims to summarize up-to-date MIA methods in both attacks and\ndefenses, highlighting their contributions and limitations, underlying modeling\nprinciples, optimization challenges, and future directions. We hope this survey\nbridges the gap in the literature and facilitates future research in this\ncritical area. 
Besides, we are maintaining a repository to keep track of\nrelevant research at\nhttps://github.com/AndrewZhou924/Awesome-model-inversion-attack.\n","authors":["Zhanke Zhou","Jianing Zhu","Fengfei Yu","Xuan Li","Xiong Peng","Tongliang Liu","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2411.10023v1.pdf","comment":"40 pages, 17 figures"},{"id":"http://arxiv.org/abs/2405.17484v3","updated":"2024-11-15T08:02:03Z","published":"2024-05-24T16:18:16Z","title":"Bridging The Gap between Low-rank and Orthogonal Adaptation via\n Householder Reflection Adaptation","summary":" While following different technical routes, both low-rank and orthogonal\nadaptation techniques can efficiently adapt large-scale pre-training models in\nspecific tasks or domains based on a small piece of trainable parameters. In\nthis study, we bridge the gap between these two techniques, proposing a simple\nbut effective adaptation method based on Householder reflections. Given a\npre-trained model, our method fine-tunes its layers by multiplying each frozen\nweight matrix with an orthogonal matrix constructed by a chain of learnable\nHouseholder reflections (HRs). This HR-based orthogonal fine-tuning is\nequivalent to an adaptive low-rank adaptation. Moreover, we show that the\northogonality of the reflection planes corresponding to the HRs impacts the\nmodel capacity and regularity. The analysis motivates us to regularize the\northogonality of the HRs, leading to different implementations of the proposed\nHouseholder reflection adaptation (HRA) method. Compared with state-of-the-art\nmethods, HRA achieves superior performance with fewer learnable parameters when\nadapting large language models and conditional image generators. 
The code of\nthe experiments is available at \\url{https://github.com/DaShenZi721/HRA}, and\nthe method has been merged into the\n\\href{https://github.com/huggingface/peft}{PEFT} package.\n","authors":["Shen Yuan","Haotian Liu","Hongteng Xu"],"pdf_url":"https://arxiv.org/pdf/2405.17484v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10019v1","updated":"2024-11-15T07:54:14Z","published":"2024-11-15T07:54:14Z","title":"Towards Utilising a Range of Neural Activations for Comprehending\n Representational Associations","summary":" Recent efforts to understand intermediate representations in deep neural\nnetworks have commonly attempted to label individual neurons and combinations\nof neurons that make up linear directions in the latent space by examining\nextremal neuron activations and the highest direction projections. In this\npaper, we show that this approach, although yielding a good approximation for\nmany purposes, fails to capture valuable information about the behaviour of a\nrepresentation. Neural network activations are generally dense, and so a more\ncomplex, but realistic scenario is that linear directions encode information at\nvarious levels of stimulation. We hypothesise that non-extremal level\nactivations contain complex information worth investigating, such as\nstatistical associations, and thus may be used to locate confounding human\ninterpretable concepts. We explore the value of studying a range of neuron\nactivations by taking the case of mid-level output neuron activations and\ndemonstrate on a synthetic dataset how they can inform us about aspects of\nrepresentations in the penultimate layer not evident through analysing maximal\nactivations alone. We use our findings to develop a method to curate data from\nmid-range logit samples for retraining to mitigate spurious correlations, or\nconfounding concepts in the penultimate layer, on real benchmark datasets. 
The\nsuccess of our method exemplifies the utility of inspecting non-maximal\nactivations to extract complex relationships learned by models.\n","authors":["Laura O'Mahony","Nikola S. Nikolov","David JP O'Sullivan"],"pdf_url":"https://arxiv.org/pdf/2411.10019v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.10015v1","updated":"2024-11-15T07:50:01Z","published":"2024-11-15T07:50:01Z","title":"MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field\n Analysis Using Deep Neural Networks through Feature Visualization","summary":" Micro Crack detection using deep neural networks (DNNs) through an automated\npipeline using wave fields interacting with the damaged areas is highly sought\nafter. These high-dimensional spatio-temporal crack data are limited, and these\ndatasets have large dimensions in the temporal domain. The dataset presents a\nsubstantial class imbalance, with crack pixels constituting an average of only\n5% of the total pixels per sample. This extreme class imbalance poses a\nchallenge for deep learning models with the different micro-scale cracks, as\nthe network can be biased toward predicting the majority class, generally\nleading to poor detection accuracy. This study builds upon the previous\nbenchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack\ndetection. The impact of various activation and loss functions were examined\nthrough feature space visualization using the manifold discovery and analysis\n(MDA) algorithm. 
The optimized architecture and training methodology achieved\nan accuracy of 86.85%.\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17550v2","updated":"2024-11-15T07:48:26Z","published":"2024-09-26T05:39:52Z","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective\n Adaptation of Audio and Video Diffusion Models for Joint Generation","summary":" In this work, we build a simple but strong baseline for sounding video\ngeneration. Given base diffusion models for audio and video, we integrate them\nwith additional modules into a single model and train it to make the model\njointly generate audio and video. To enhance alignment between audio-video\npairs, we introduce two novel mechanisms in our model. The first one is\ntimestep adjustment, which provides different timestep information to each base\nmodel. It is designed to align how samples are generated along with timesteps\nacross modalities. The second one is a new design of the additional modules,\ntermed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE,\ncross-modal information is embedded as if it represents temporal position\ninformation, and the embeddings are fed into the model like positional\nencoding. 
Compared with the popular cross-attention mechanism, CMC-PE provides\na better inductive bias for temporal alignment in the generated data.\nExperimental results validate the effectiveness of the two newly introduced\nmechanisms and also demonstrate that our method outperforms existing methods.\n","authors":["Masato Ishii","Akio Hayakawa","Takashi Shibuya","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.17550v2.pdf","comment":"The source code is available:\n https://github.com/SonyResearch/SVG_baseline"},{"id":"http://arxiv.org/abs/2411.10013v1","updated":"2024-11-15T07:43:45Z","published":"2024-11-15T07:43:45Z","title":"Efficient Depth Estimation for Unstable Stereo Camera Systems on AR\n Glasses","summary":" Stereo depth estimation is a fundamental component in augmented reality (AR)\napplications. Although AR applications require very low latency for their\nreal-time applications, traditional depth estimation models often rely on\ntime-consuming preprocessing steps such as rectification to achieve high\naccuracy. Also, non standard ML operator based algorithms such as cost volume\nalso require significant latency, which is aggravated on compute\nresource-constrained mobile platforms. Therefore, we develop hardware-friendly\nalternatives to the costly cost volume and preprocessing and design two new\nmodels based on them, MultiHeadDepth and HomoDepth. Our approaches for cost\nvolume is replacing it with a new group-pointwise convolution-based operator\nand approximation of consine similarity based on layernorm and dot product. For\nonline stereo rectification (preprocessing), we introduce homograhy matrix\nprediction network with a rectification positional encoding (RPE), which\ndelivers both low latency and robustness to unrectified images, which\neliminates the needs for preprocessing. 
Our MultiHeadDepth, which includes\noptimized cost volume, provides 11.8-30.3% improvements in accuracy and\n22.9-25.2% reduction in latency compared to a state-of-the-art depth estimation\nmodel for AR glasses from industry. Our HomoDepth, which includes optimized\npreprocessing (Homograhpy + RPE) upon MultiHeadDepth, can process unrectified\nimages and reduce the end-to-end latency by 44.5%. We adopt a multi-task\nlearning framework to handle misaligned stereo inputs on HomoDepth, which\nreduces theAbsRel error by 10.0-24.3%. The results demonstrate the efficacy of\nour approaches in achieving both high model performance with low latency, which\nmakes a step forward toward practical depth estimation on future AR devices.\n","authors":["Yongfan Liu","Hyoukjun Kwon"],"pdf_url":"https://arxiv.org/pdf/2411.10013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10010v1","updated":"2024-11-15T07:42:16Z","published":"2024-11-15T07:42:16Z","title":"DeepMedcast: A Deep Learning Method for Generating Intermediate Weather\n Forecasts among Multiple NWP Models","summary":" Numerical weather prediction (NWP) centers around the world operate a variety\nof NWP models, and recent advances in AI-driven NWP models have increased the\navailability of diverse NWP outputs. While this expansion holds the potential\nto improve forecast accuracy, it also raises a critical challenge of\nidentifying the most reliable predictions for specific forecast scenarios.\nTraditional approaches, such as ensemble or weighted averaging, combine\nmultiple NWP outputs but often generate unrealistic atmospheric fields,\ncomplicating the production of reliable and consistent forecasts in operational\nsettings. In this study, we introduce DeepMedcast, a deep learning method that\ngenerates intermediate forecast, or \"medcast\", between two or more NWP outputs.\nUnlike ensemble averaging, DeepMedcast can provide consistent and explainable\nmedcast without distorting meteorological fields. 
This paper details the\nmethodology and case studies of DeepMedcast, discussing its advantages and\npotential contributions to operational forecasting.\n","authors":["Atsushi Kudo"],"pdf_url":"https://arxiv.org/pdf/2411.10010v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.03316v2","updated":"2024-11-15T07:28:15Z","published":"2024-05-06T09:48:47Z","title":"Provably Unlearnable Data Examples","summary":" The exploitation of publicly accessible data has led to escalating concerns\nregarding data privacy and intellectual property (IP) breaches in the age of\nartificial intelligence. To safeguard both data privacy and IP-related domain\nknowledge, efforts have been undertaken to render shared data unlearnable for\nunauthorized models in the wild. Existing methods apply empirically optimized\nperturbations to the data in the hope of disrupting the correlation between the\ninputs and the corresponding labels such that the data samples are converted\ninto Unlearnable Examples (UEs). Nevertheless, the absence of mechanisms to\nverify the robustness of UEs against uncertainty in unauthorized models and\ntheir training procedures engenders several under-explored challenges. First,\nit is hard to quantify the unlearnability of UEs against unauthorized\nadversaries from different runs of training, leaving the soundness of the\ndefense in obscurity. Particularly, as a prevailing evaluation metric,\nempirical test accuracy faces generalization errors and may not plausibly\nrepresent the quality of UEs. This also leaves room for attackers, as there is\nno rigid guarantee of the maximal test accuracy achievable by attackers.\nFurthermore, we find that a simple recovery attack can restore the clean-task\nperformance of the classifiers trained on UEs by slightly perturbing the\nlearned weights. 
To mitigate the aforementioned problems, in this paper, we\npropose a mechanism for certifying the so-called $(q, \\eta)$-Learnability of an\nunlearnable dataset via parametric smoothing. A lower certified $(q,\n\\eta)$-Learnability indicates a more robust and effective protection over the\ndataset. Concretely, we 1) improve the tightness of certified $(q,\n\\eta)$-Learnability and 2) design Provably Unlearnable Examples (PUEs) which\nhave reduced $(q, \\eta)$-Learnability.\n","authors":["Derui Wang","Minhui Xue","Bo Li","Seyit Camtepe","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.03316v2.pdf","comment":"Accepted to Network and Distributed System Security (NDSS) Symposium\n 2025, San Diego, CA, USA. Source code is available at\n https://github.com/NeuralSec/certified-data-learnability"},{"id":"http://arxiv.org/abs/2411.10000v1","updated":"2024-11-15T07:15:05Z","published":"2024-11-15T07:15:05Z","title":"DuSEGO: Dual Second-order Equivariant Graph Ordinary Differential\n Equation","summary":" Graph Neural Networks (GNNs) with equivariant properties have achieved\nsignificant success in modeling complex dynamic systems and molecular\nproperties. However, their expressiveness ability is limited by: (1) Existing\nmethods often overlook the over-smoothing issue caused by traditional GNN\nmodels, as well as the gradient explosion or vanishing problems in deep GNNs.\n(2) Most models operate on first-order information, neglecting that the real\nworld often consists of second-order systems, which further limits the model's\nrepresentation capabilities. To address these issues, we propose the\n\\textbf{Du}al \\textbf{S}econd-order \\textbf{E}quivariant \\textbf{G}raph\n\\textbf{O}rdinary Differential Equation (\\method{}) for equivariant\nrepresentation. Specifically, \\method{} apply the dual second-order equivariant\ngraph ordinary differential equations (Graph ODEs) on graph embeddings and node\ncoordinates, simultaneously. 
Theoretically, we first prove that \\method{}\nmaintains the equivariant property. Furthermore, we provide theoretical\ninsights showing that \\method{} effectively alleviates the over-smoothing\nproblem in both feature representation and coordinate update. Additionally, we\ndemonstrate that the proposed \\method{} mitigates the exploding and vanishing\ngradients problem, facilitating the training of deep multi-layer GNNs.\nExtensive experiments on benchmark datasets validate the superiority of the\nproposed \\method{} compared to baselines.\n","authors":["Yingxu Wang","Nan Yin","Mingyan Xiao","Xinhao Yi","Siwei Liu","Shangsong Liang"],"pdf_url":"https://arxiv.org/pdf/2411.10000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09998v1","updated":"2024-11-15T07:12:18Z","published":"2024-11-15T07:12:18Z","title":"Adaptive Non-Uniform Timestep Sampling for Diffusion Model Training","summary":" As a highly expressive generative model, diffusion models have demonstrated\nexceptional success across various domains, including image generation, natural\nlanguage processing, and combinatorial optimization. However, as data\ndistributions grow more complex, training these models to convergence becomes\nincreasingly computationally intensive. While diffusion models are typically\ntrained using uniform timestep sampling, our research shows that the variance\nin stochastic gradients varies significantly across timesteps, with\nhigh-variance timesteps becoming bottlenecks that hinder faster convergence. To\naddress this issue, we introduce a non-uniform timestep sampling method that\nprioritizes these more critical timesteps. Our method tracks the impact of\ngradient updates on the objective for each timestep, adaptively selecting those\nmost likely to minimize the objective effectively. Experimental results\ndemonstrate that this approach not only accelerates the training process, but\nalso leads to improved performance at convergence. 
Furthermore, our method\nshows robust performance across various datasets, scheduling strategies, and\ndiffusion architectures, outperforming previously proposed timestep sampling\nand weighting heuristics that lack this degree of robustness.\n","authors":["Myunsoo Kim","Donghyeon Ki","Seong-Woong Shim","Byung-Jun Lee"],"pdf_url":"https://arxiv.org/pdf/2411.09998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09979v1","updated":"2024-11-15T06:26:37Z","published":"2024-11-15T06:26:37Z","title":"Fully Dynamic Adversarially Robust Correlation Clustering in\n Polylogarithmic Update Time","summary":" We study the dynamic correlation clustering problem with $\\textit{adaptive}$\nedge label flips. In correlation clustering, we are given a $n$-vertex complete\ngraph whose edges are labeled either $(+)$ or $(-)$, and the goal is to\nminimize the total number of $(+)$ edges between clusters and the number of\n$(-)$ edges within clusters. We consider the dynamic setting with adversarial\nrobustness, in which the $\\textit{adaptive}$ adversary could flip the label of\nan edge based on the current output of the algorithm. Our main result is a\nrandomized algorithm that always maintains an $O(1)$-approximation to the\noptimal correlation clustering with $O(\\log^{2}{n})$ amortized update time.\nPrior to our work, no algorithm with $O(1)$-approximation and\n$\\text{polylog}{(n)}$ update time for the adversarially robust setting was\nknown. We further validate our theoretical results with experiments on\nsynthetic and real-world datasets with competitive empirical performances. 
Our\nmain technical ingredient is an algorithm that maintains $\\textit{sparse-dense\ndecomposition}$ with $\\text{polylog}{(n)}$ update time, which could be of\nindependent interest.\n","authors":["Vladimir Braverman","Prathamesh Dharangutte","Shreyas Pai","Vihan Shah","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08933v2","updated":"2024-11-15T06:13:33Z","published":"2024-11-13T09:13:20Z","title":"Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for\n Certified Robustness","summary":" The remarkable advances in deep learning have led to the emergence of many\noff-the-shelf classifiers, e.g., large pre-trained models. However, since they\nare typically trained on clean data, they remain vulnerable to adversarial\nattacks. Despite this vulnerability, their superior performance and\ntransferability make off-the-shelf classifiers still valuable in practice,\ndemanding further work to provide adversarial robustness for them in a post-hoc\nmanner. A recently proposed method, denoised smoothing, leverages a denoiser\nmodel in front of the classifier to obtain provable robustness without\nadditional training. However, the denoiser often creates hallucination, i.e.,\nimages that have lost the semantics of their originally assigned class, leading\nto a drop in robustness. Furthermore, its noise-and-denoise procedure\nintroduces a significant distribution shift from the original distribution,\ncausing the denoised smoothing framework to achieve sub-optimal robustness. In\nthis paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image\nSelection (FT-CADIS), a novel fine-tuning scheme to enhance the certified\nrobustness of off-the-shelf classifiers. FT-CADIS is inspired by the\nobservation that the confidence of off-the-shelf classifiers can effectively\nidentify hallucinated images during denoised smoothing. 
Based on this, we\ndevelop a confidence-aware training objective to handle such hallucinated\nimages and improve the stability of fine-tuning from denoised images. In this\nway, the classifier can be fine-tuned using only images that are beneficial for\nadversarial robustness. We also find that such a fine-tuning can be done by\nupdating a small fraction of parameters of the classifier. Extensive\nexperiments demonstrate that FT-CADIS has established the state-of-the-art\ncertified robustness among denoised smoothing methods across all\n$\\ell_2$-adversary radius in various benchmarks.\n","authors":["Suhyeok Jang","Seojin Kim","Jinwoo Shin","Jongheon Jeong"],"pdf_url":"https://arxiv.org/pdf/2411.08933v2.pdf","comment":"26 pages; TMLR 2024; Code is available at\n https://github.com/suhyeok24/FT-CADIS"},{"id":"http://arxiv.org/abs/2411.09973v1","updated":"2024-11-15T06:05:52Z","published":"2024-11-15T06:05:52Z","title":"Establishing and Evaluating Trustworthy AI: Overview and Research\n Challenges","summary":" Artificial intelligence (AI) technologies (re-)shape modern life, driving\ninnovation in a wide range of sectors. However, some AI systems have yielded\nunexpected or undesirable outcomes or have been used in questionable manners.\nAs a result, there has been a surge in public and academic discussions about\naspects that AI systems must fulfill to be considered trustworthy. In this\npaper, we synthesize existing conceptualizations of trustworthy AI along six\nrequirements: 1) human agency and oversight, 2) fairness and\nnon-discrimination, 3) transparency and explainability, 4) robustness and\naccuracy, 5) privacy and security, and 6) accountability. For each one, we\nprovide a definition, describe how it can be established and evaluated, and\ndiscuss requirement-specific research challenges. 
Finally, we conclude this\nanalysis by identifying overarching research challenges across the requirements\nwith respect to 1) interdisciplinary research, 2) conceptual clarity, 3)\ncontext-dependency, 4) dynamics in evolving systems, and 5) investigations in\nreal-world contexts. Thus, this paper synthesizes and consolidates a\nwide-ranging and active discussion currently taking place in various academic\nsub-communities and public forums. It aims to serve as a reference for a broad\naudience and as a basis for future research directions.\n","authors":["Dominik Kowald","Sebastian Scher","Viktoria Pammer-Schindler","Peter Müllner","Kerstin Waxnegger","Lea Demelius","Angela Fessl","Maximilian Toller","Inti Gabriel Mendoza Estrada","Ilija Simic","Vedran Sabol","Andreas Truegler","Eduardo Veas","Roman Kern","Tomislav Nad","Simone Kopeinik"],"pdf_url":"https://arxiv.org/pdf/2411.09973v1.pdf","comment":"Accepted in Frontiers in Big Data and AI, Research Topic: Towards\n Fair AI for Trustworthy Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.09961v1","updated":"2024-11-15T05:30:36Z","published":"2024-11-15T05:30:36Z","title":"Dense ReLU Neural Networks for Temporal-spatial Model","summary":" In this paper, we focus on fully connected deep neural networks utilizing the\nRectified Linear Unit (ReLU) activation function for nonparametric estimation.\nWe derive non-asymptotic bounds that lead to convergence rates, addressing both\ntemporal and spatial dependence in the observed measurements. By accounting for\ndependencies across time and space, our models better reflect the complexities\nof real-world data, enhancing both predictive performance and theoretical\nrobustness. We also tackle the curse of dimensionality by modeling the data on\na manifold, exploring the intrinsic dimensionality of high-dimensional data. 
We\nbroaden existing theoretical findings of temporal-spatial analysis by applying\nthem to neural networks in more general contexts and demonstrate that our proof\ntechniques are effective for models with short-range dependence. Our empirical\nsimulations across various synthetic response functions underscore the superior\nperformance of our method, outperforming established approaches in the existing\nliterature. These findings provide valuable insights into the strong\ncapabilities of dense neural networks for temporal-spatial modeling across a\nbroad range of function classes.\n","authors":["Zhi Zhang","Carlos Misael Madrid Padilla","Xiaokai Luo","Oscar Hernan Madrid Padilla","Daren Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09955v1","updated":"2024-11-15T05:18:15Z","published":"2024-11-15T05:18:15Z","title":"Instruction-Guided Editing Controls for Images and Multimedia: A Survey\n in LLM era","summary":" The rapid advancement of large language models (LLMs) and multimodal learning\nhas transformed digital content creation and manipulation. Traditional visual\nediting tools require significant expertise, limiting accessibility. Recent\nstrides in instruction-based editing have enabled intuitive interaction with\nvisual content, using natural language as a bridge between user intent and\ncomplex editing operations. This survey provides an overview of these\ntechniques, focusing on how LLMs and multimodal models empower users to achieve\nprecise visual modifications without deep technical knowledge. By synthesizing\nover 100 publications, we explore methods from generative adversarial networks\nto diffusion models, examining multimodal integration for fine-grained content\ncontrol. We discuss practical applications across domains such as fashion, 3D\nscene manipulation, and video synthesis, highlighting increased accessibility\nand alignment with human intuition. 
Our survey compares existing literature,\nemphasizing LLM-empowered editing, and identifies key challenges to stimulate\nfurther research. We aim to democratize powerful visual editing across various\nindustries, from entertainment to education. Interested readers are encouraged\nto access our repository at\nhttps://github.com/tamlhp/awesome-instruction-editing.\n","authors":["Thanh Tam Nguyen","Zhao Ren","Trinh Pham","Phi Le Nguyen","Hongzhi Yin","Quoc Viet Hung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16842v2","updated":"2024-11-15T05:08:56Z","published":"2024-08-29T18:10:36Z","title":"AdapShare: An RL-Based Dynamic Spectrum Sharing Solution for O-RAN","summary":" The Open Radio Access Network (O-RAN) initiative, characterized by open\ninterfaces and AI/ML-capable RAN Intelligent Controller (RIC), facilitates\neffective spectrum sharing among RANs. In this context, we introduce AdapShare,\nan ORAN-compatible solution leveraging Reinforcement Learning (RL) for\nintent-based spectrum management, with the primary goal of minimizing resource\nsurpluses or deficits in RANs. By employing RL agents, AdapShare intelligently\nlearns network demand patterns and uses them to allocate resources. We\ndemonstrate the efficacy of AdapShare in the spectrum sharing scenario between\nLTE and NR networks, incorporating real-world LTE resource usage data and\nsynthetic NR usage data to demonstrate its practical use. We use the average\nsurplus or deficit and fairness index to measure the system's performance in\nvarious scenarios. AdapShare outperforms a quasi-static resource allocation\nscheme based on long-term network demand statistics, particularly when\navailable resources are scarce or exceed the aggregate demand from the\nnetworks. 
Lastly, we present a high-level O-RAN compatible architecture using\nRL agents, which demonstrates the seamless integration of AdapShare into\nreal-world deployment scenarios.\n","authors":["Sneihil Gopal","David Griffith","Richard A. Rouil","Chunmei Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16842v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.09110"},{"id":"http://arxiv.org/abs/2411.09945v1","updated":"2024-11-15T04:52:11Z","published":"2024-11-15T04:52:11Z","title":"TEESlice: Protecting Sensitive Neural Network Models in Trusted\n Execution Environments When Attackers have Pre-Trained Models","summary":" Trusted Execution Environments (TEE) are used to safeguard on-device models.\nHowever, directly employing TEEs to secure the entire DNN model is challenging\ndue to the limited computational speed. Utilizing GPU can accelerate DNN's\ncomputation speed but commercial widely-available GPUs usually lack security\nprotection. To this end, scholars introduce TSDP, a method that protects\nprivacy-sensitive weights within TEEs and offloads insensitive weights to GPUs.\nNevertheless, current methods do not consider the presence of a knowledgeable\nadversary who can access abundant publicly available pre-trained models and\ndatasets. This paper investigates the security of existing methods against such\na knowledgeable adversary and reveals their inability to fulfill their security\npromises. Consequently, we introduce a novel partition before training\nstrategy, which effectively separates privacy-sensitive weights from other\ncomponents of the model. Our evaluation demonstrates that our approach can\noffer full model protection with a computational cost reduced by a factor of\n10. In addition to traditional CNN models, we also demonstrate the scalability\nto large language models. 
Our approach can compress the private functionalities\nof the large language model to lightweight slices and achieve the same level of\nprotection as the shielding-whole-model baseline.\n","authors":["Ding Li","Ziqi Zhang","Mengyu Yao","Yifeng Cai","Yao Guo","Xiangqun Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09945v1.pdf","comment":"Accepted by TOSEM. Extended version of the S&P24 paper\n (arXiv:2310.07152)"},{"id":"http://arxiv.org/abs/2409.09095v2","updated":"2024-11-15T04:48:06Z","published":"2024-09-12T20:03:26Z","title":"meds_reader: A fast and efficient EHR processing library","summary":" The growing demand for machine learning in healthcare requires processing\nincreasingly large electronic health record (EHR) datasets, but existing\npipelines are not computationally efficient or scalable. In this paper, we\nintroduce meds_reader, an optimized Python package for efficient EHR data\nprocessing that is designed to take advantage of many intrinsic properties of\nEHR data for improved speed. We then demonstrate the benefits of meds_reader by\nreimplementing key components of two major EHR processing pipelines, achieving\n10-100x improvements in memory, speed, and disk usage. The code for meds_reader\ncan be found at https://github.com/som-shahlab/meds_reader.\n","authors":["Ethan Steinberg","Michael Wornow","Suhana Bedi","Jason Alan Fries","Matthew B. A. McDermott","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2409.09095v2.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 8 pages"},{"id":"http://arxiv.org/abs/2411.09943v1","updated":"2024-11-15T04:43:44Z","published":"2024-11-15T04:43:44Z","title":"Zero-shot Voice Conversion with Diffusion Transformers","summary":" Zero-shot voice conversion aims to transform a source speech utterance to\nmatch the timbre of a reference speech from an unseen speaker. 
Traditional\napproaches struggle with timbre leakage, insufficient timbre representation,\nand mismatches between training and inference tasks. We propose Seed-VC, a\nnovel framework that addresses these issues by introducing an external timbre\nshifter during training to perturb the source speech timbre, mitigating leakage\nand aligning training with inference. Additionally, we employ a diffusion\ntransformer that leverages the entire reference speech context, capturing\nfine-grained timbre features through in-context learning. Experiments\ndemonstrate that Seed-VC outperforms strong baselines like OpenVoice and\nCosyVoice, achieving higher speaker similarity and lower word error rates in\nzero-shot voice conversion tasks. We further extend our approach to zero-shot\nsinging voice conversion by incorporating fundamental frequency (F0)\nconditioning, resulting in comparative performance to current state-of-the-art\nmethods. Our findings highlight the effectiveness of Seed-VC in overcoming core\nchallenges, paving the way for more accurate and versatile voice conversion\nsystems.\n","authors":["Songting Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21263v3","updated":"2024-11-15T04:32:55Z","published":"2024-10-28T17:57:06Z","title":"Adaptive Transfer Clustering: A Unified Framework","summary":" We propose a general transfer learning framework for clustering given a main\ndataset and an auxiliary one about the same subjects. The two datasets may\nreflect similar but different latent grouping structures of the subjects. We\npropose an adaptive transfer clustering (ATC) algorithm that automatically\nleverages the commonality in the presence of unknown discrepancy, by optimizing\nan estimated bias-variance decomposition. It applies to a broad class of\nstatistical models including Gaussian mixture models, stochastic block models,\nand latent class models. 
A theoretical analysis proves the optimality of ATC\nunder the Gaussian mixture model and explicitly quantifies the benefit of\ntransfer. Extensive simulations and real data experiments confirm our method's\neffectiveness in various scenarios.\n","authors":["Yuqi Gu","Zhongyuan Lyu","Kaizheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.21263v3.pdf","comment":"55 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.02827v3","updated":"2024-11-15T04:17:35Z","published":"2024-04-03T15:59:42Z","title":"BAdam: A Memory Efficient Full Parameter Optimization Method for Large\n Language Models","summary":" This work presents BAdam, an optimization method that leverages the block\ncoordinate descent (BCD) framework with Adam's update rule. BAdam offers a\nmemory efficient approach to the full parameter finetuning of large language\nmodels. We conduct a theoretical convergence analysis for BAdam in the\ndeterministic case. Experimentally, we apply BAdam to finetune the Llama 3-8B\nand Llama 3-70B models using a single RTX3090-24GB GPU and 4 A100-80GB GPUs,\nrespectively. The results confirm BAdam's efficiency in terms of memory usage,\nrunning time, and optimization capability. Furthermore, the downstream\nperformance evaluation based on MT-bench and math benchmarks shows that BAdam\noutperforms existing memory efficient baselines such as LoRA. It also\ndemonstrates that BAdam can achieve comparable or even superior performance\ncompared to Adam. Finally, the ablation study using SGD's update rule\nillustrates the suitability of BCD for finetuning LLMs. 
Our code can be easily\nintegrated into any PyTorch-based codebase and is available at\nhttps://github.com/Ledzy/BAdam.\n","authors":["Qijun Luo","Hengxu Yu","Xiao Li"],"pdf_url":"https://arxiv.org/pdf/2404.02827v3.pdf","comment":"Accepted for Publication in Conference on Neural Information\n Processing Systems, 2024"},{"id":"http://arxiv.org/abs/2411.09928v1","updated":"2024-11-15T04:00:54Z","published":"2024-11-15T04:00:54Z","title":"Is Precise Recovery Necessary? A Task-Oriented Imputation Approach for\n Time Series Forecasting on Variable Subset","summary":" Variable Subset Forecasting (VSF) refers to a unique scenario in multivariate\ntime series forecasting, where available variables in the inference phase are\nonly a subset of the variables in the training phase. VSF presents significant\nchallenges as the entire time series may be missing, and neither inter- nor\nintra-variable correlations persist. Such conditions impede the effectiveness\nof traditional imputation methods, primarily focusing on filling in individual\nmissing data points. Inspired by the principle of feature engineering that not\nall variables contribute positively to forecasting, we propose Task-Oriented\nImputation for VSF (TOI-VSF), a novel framework shifts the focus from accurate\ndata recovery to directly support the downstream forecasting task. TOI-VSF\nincorporates a self-supervised imputation module, agnostic to the forecasting\nmodel, designed to fill in missing variables while preserving the vital\ncharacteristics and temporal patterns of time series data. Additionally, we\nimplement a joint learning strategy for imputation and forecasting, ensuring\nthat the imputation process is directly aligned with and beneficial to the\nforecasting objective. 
Extensive experiments across four datasets demonstrate\nthe superiority of TOI-VSF, outperforming baseline methods by $15\\%$ on\naverage.\n","authors":["Qi Hao","Runchang Liang","Yue Gao","Hao Dong","Wei Fan","Lu Jiang","Pengyang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10942v3","updated":"2024-11-15T03:48:07Z","published":"2024-06-16T13:44:41Z","title":"Effective Generative AI: The Human-Algorithm Centaur","summary":" Advanced analytics science methods have enabled combining the power of\nartificial and human intelligence, creating \\textit{centaurs} that allow\nsuperior decision-making. Centaurs are hybrid human-algorithm models that\ncombine both formal analytics and human intuition in a symbiotic manner within\ntheir learning and reasoning process. We argue that the future of AI\ndevelopment and use in many domains needs to focus more on centaurs as opposed\nto other AI approaches. This paradigm shift towards centaur-based AI methods\nraises some fundamental questions: How are centaurs different from other\nhuman-in-the-loop methods? What are the most effective methods for creating\ncentaurs? When should centaurs be used, and when should the lead be given to\npure AI models? Doesn't the incorporation of human intuition -- which at times\ncan be misleading -- in centaurs' decision-making process degrade its\nperformance compared to pure AI methods? 
This work aims to address these\nfundamental questions, focusing on recent advancements in generative AI, and\nespecially in Large Language Models (LLMs), as a main case study to illustrate\ncentaurs' critical essentiality to future AI endeavors.\n","authors":["Soroush Saghafian","Lihi Idan"],"pdf_url":"https://arxiv.org/pdf/2406.10942v3.pdf","comment":"To Appear in SI: Future Shock, Harvard Data Science Review\n (https://hdsr.mitpress.mit.edu/specialissue5)"},{"id":"http://arxiv.org/abs/2309.14326v3","updated":"2024-11-15T03:40:41Z","published":"2023-09-25T17:53:12Z","title":"Efficient Pauli channel estimation with logarithmic quantum memory","summary":" Here we revisit one of the prototypical tasks for characterizing the\nstructure of noise in quantum devices: estimating every eigenvalue of an\n$n$-qubit Pauli noise channel to error $\\epsilon$. Prior work [14] proved no-go\ntheorems for this task in the practical regime where one has a limited amount\nof quantum memory, e.g. any protocol with $\\le 0.99n$ ancilla qubits of quantum\nmemory must make exponentially many measurements, provided it is\nnon-concatenating. Such protocols can only interact with the channel by\nrepeatedly preparing a state, passing it through the channel, and measuring\nimmediately afterward.\n This left open a natural question: does the lower bound hold even for general\nprotocols, i.e. ones which chain together many queries to the channel,\ninterleaved with arbitrary data-processing channels, before measuring?\nSurprisingly, in this work we show the opposite: there is a protocol that can\nestimate the eigenvalues of a Pauli channel to error $\\epsilon$ using only\n$O(\\log n/\\epsilon^2)$ ancilla and $\\tilde{O}(n^2/\\epsilon^2)$ measurements. 
In\ncontrast, we show that any protocol with zero ancilla, even a concatenating\none, must make $\\Omega(2^n/\\epsilon^2)$ measurements, which is tight.\n Our results imply, to our knowledge, the first quantum learning task where\nlogarithmically many qubits of quantum memory suffice for an exponential\nstatistical advantage. Our protocol can be naturally extended to a protocol\nthat learns the eigenvalues of Pauli terms within any subset $A$ of a Pauli\nchannel with $O(\\log\\log(|A|)/\\epsilon^2)$ ancilla and\n$\\tilde{O}(n^2/\\epsilon^2)$ measurements.\n","authors":["Sitan Chen","Weiyuan Gong"],"pdf_url":"https://arxiv.org/pdf/2309.14326v3.pdf","comment":"57 pages, 1 figure"},{"id":"http://arxiv.org/abs/2405.17743v3","updated":"2024-11-15T03:25:40Z","published":"2024-05-28T01:55:35Z","title":"ORLM: A Customizable Framework in Training Large Models for Automated\n Optimization Modeling","summary":" Optimization modeling and solving play a critical role in the application of\nOperations Research (OR) tools to address real-world problems, yet they pose\nchallenges and require extensive expertise from OR experts. With the advent of\nlarge language models (LLMs), new opportunities have emerged to streamline and\nautomate these tasks. However, current research predominantly relies on\nclosed-source LLMs such as GPT-4, along with extensive prompt engineering\ntechniques. This reliance stems from the scarcity of high-quality training\ndatasets for optimization modeling, resulting in elevated costs, prolonged\nprocessing times, and privacy concerns. To address these challenges, our work\nis the first to propose a viable path for training open-source LLMs that are\ncapable of optimization modeling as well as developing and executing solver\ncodes, eventually leading to a superior ability for automating optimization\nmodeling and solving. Particularly, we introduce a semi-automated data\nsynthesis framework designed for optimization modeling issues, named\nOR-Instruct. 
This framework merges the training data requirements of large\nmodels with the unique characteristics of optimization modeling problems, and\nallows for customizable enhancements tailored to specific scenarios or modeling\ntypes. To evaluate the performance of our proposed framework, we present the\nIndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in\nsolving practical OR problems. Utilizing data synthesized through OR-Instruct,\nwe train various open-source LLMs with a capacity of 7 billion parameters\n(dubbed ORLMs). The resulting model demonstrates significantly enhanced\noptimization modeling capabilities, achieving state-of-the-art performance\nacross the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are\navailable at \\url{https://github.com/Cardinal-Operations/ORLM}.\n","authors":["Chenyu Huang","Zhengyang Tang","Dongdong Ge","Shixi Hu","Ruoqing Jiang","Benyou Wang","Zizhuo Wang","Xin Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.17743v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.09915v1","updated":"2024-11-15T03:23:26Z","published":"2024-11-15T03:23:26Z","title":"Physics-informed Machine Learning for Battery Pack Thermal Management","summary":" With the popularity of electric vehicles, the demand for lithium-ion\nbatteries is increasing. Temperature significantly influences the performance\nand safety of batteries. Battery thermal management systems can effectively\ncontrol the temperature of batteries; therefore, the performance and safety can\nbe ensured. However, the development process of battery thermal management\nsystems is time-consuming and costly due to the extensive training dataset\nneeded by data-driven models requiring enormous computational costs for finite\nelement analysis. Therefore, a new approach to constructing surrogate models is\nneeded in the era of AI. 
Physics-informed machine learning enforces the\nphysical laws in surrogate models, making it the perfect candidate for\nestimating battery pack temperature distribution. In this study, we first\ndeveloped a 21700 battery pack indirect liquid cooling system with cold plates\non the top and bottom with thermal paste surrounding the battery cells. Then,\nthe simplified finite element model was built based on experiment results. Due\nto the high coolant flow rate, the cold plates can be considered as constant\ntemperature boundaries, while battery cells are the heat sources. The\nphysics-informed convolutional neural network served as a surrogate model to\nestimate the temperature distribution of the battery pack. The loss function\nwas constructed considering the heat conduction equation based on the finite\ndifference method. The physics-informed loss function helped the convergence of\nthe training process with less data. As a result, the physics-informed\nconvolutional neural network showed more than 15 percents improvement in\naccuracy compared to the data-driven method with the same training data.\n","authors":["Zheng Liu","Yuan Jiang","Yumeng Li","Pingfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06967v2","updated":"2024-11-15T03:20:45Z","published":"2024-08-13T15:23:17Z","title":"Stabilizer bootstrapping: A recipe for efficient agnostic tomography and\n magic estimation","summary":" We study the task of agnostic tomography: given copies of an unknown\n$n$-qubit state $\\rho$ which has fidelity $\\tau$ with some state in a given\nclass $C$, find a state which has fidelity $\\ge \\tau - \\epsilon$ with $\\rho$.\nWe give a new framework, stabilizer bootstrapping, for designing\ncomputationally efficient protocols for this task, and use this to get new\nagnostic tomography protocols for the following classes:\n Stabilizer states: We give a protocol that runs in 
time\n$\\mathrm{poly}(n,1/\\epsilon)\\cdot (1/\\tau)^{O(\\log(1/\\tau))}$, answering an\nopen question posed by Grewal, Iyer, Kretschmer, Liang [43] and Anshu and\nArunachalam [6]. Previous protocols ran in time $\\mathrm{exp}(\\Theta(n))$ or\nrequired $\\tau>\\cos^2(\\pi/8)$.\n States with stabilizer dimension $n - t$: We give a protocol that runs in\ntime $n^3\\cdot(2^t/\\tau)^{O(\\log(1/\\epsilon))}$, extending recent work on\nlearning quantum states prepared by circuits with few non-Clifford gates, which\nonly applied in the realizable setting where $\\tau = 1$ [33, 40, 49, 66].\n Discrete product states: If $C = K^{\\otimes n}$ for some $\\mu$-separated\ndiscrete set $K$ of single-qubit states, we give a protocol that runs in time\n$(n/\\mu)^{O((1 + \\log (1/\\tau))/\\mu)}/\\epsilon^2$. This strictly generalizes a\nprior guarantee which applied to stabilizer product states [42]. For stabilizer\nproduct states, we give a further improved protocol that runs in time\n$(n^2/\\epsilon^2)\\cdot (1/\\tau)^{O(\\log(1/\\tau))}$.\n As a corollary, we give the first protocol for estimating stabilizer\nfidelity, a standard measure of magic for quantum states, to error $\\epsilon$\nin $n^3 \\mathrm{quasipoly}(1/\\epsilon)$ time.\n","authors":["Sitan Chen","Weiyuan Gong","Qi Ye","Zhihan Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06967v2.pdf","comment":"68 pages"},{"id":"http://arxiv.org/abs/2411.00461v2","updated":"2024-11-15T03:01:59Z","published":"2024-11-01T09:18:38Z","title":"A Multi-Granularity Supervised Contrastive Framework for Remaining\n Useful Life Prediction of Aero-engines","summary":" Accurate remaining useful life (RUL) predictions are critical to the safe\noperation of aero-engines. Currently, the RUL prediction task is mainly a\nregression paradigm with only mean square error as the loss function and lacks\nresearch on feature space structure, the latter of which has shown excellent\nperformance in a large number of studies. 
This paper develops a\nmulti-granularity supervised contrastive (MGSC) framework from plain intuition\nthat samples with the same RUL label should be aligned in the feature space,\nand address the problems of too large minibatch size and unbalanced samples in\nthe implementation. The RUL prediction with MGSC is implemented on using the\nproposed multi-phase training strategy. This paper also demonstrates a simple\nand scalable basic network structure and validates the proposed MGSC strategy\non the CMPASS dataset using a convolutional long short-term memory network as a\nbaseline, which effectively improves the accuracy of RUL prediction.\n","authors":["Zixuan He","Ziqian Kong","Zhengyu Chen","Yuling Zhan","Zijun Que","Zhengguo Xu"],"pdf_url":"https://arxiv.org/pdf/2411.00461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09904v1","updated":"2024-11-15T02:59:16Z","published":"2024-11-15T02:59:16Z","title":"Self-Supervised Learning of Grasping Arbitrary Objects On-the-Move","summary":" Mobile grasping enhances manipulation efficiency by utilizing robots'\nmobility. This study aims to enable a commercial off-the-shelf robot for mobile\ngrasping, requiring precise timing and pose adjustments. Self-supervised\nlearning can develop a generalizable policy to adjust the robot's velocity and\ndetermine grasp position and orientation based on the target object's shape and\npose. Due to mobile grasping's complexity, action primitivization and\nstep-by-step learning are crucial to avoid data sparsity in learning from trial\nand error. This study simplifies mobile grasping into two grasp action\nprimitives and a moving action primitive, which can be operated with limited\ndegrees of freedom for the manipulator. 
This study introduces three fully\nconvolutional neural network (FCN) models to predict static grasp primitive,\ndynamic grasp primitive, and residual moving velocity error from visual inputs.\nA two-stage grasp learning approach facilitates seamless FCN model learning.\nThe ablation study demonstrated that the proposed method achieved the highest\ngrasping accuracy and pick-and-place efficiency. Furthermore, randomizing\nobject shapes and environments in the simulation effectively achieved\ngeneralizable mobile grasping.\n","authors":["Takuya Kiyokawa","Eiki Nagata","Yoshihisa Tsurumine","Yuhwan Kwon","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2411.09904v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.09900v1","updated":"2024-11-15T02:46:55Z","published":"2024-11-15T02:46:55Z","title":"Statistical Analysis of Policy Space Compression Problem","summary":" Policy search methods are crucial in reinforcement learning, offering a\nframework to address continuous state-action and partially observable problems.\nHowever, the complexity of exploring vast policy spaces can lead to significant\ninefficiencies. Reducing the policy space through policy compression emerges as\na powerful, reward-free approach to accelerate the learning process. This\ntechnique condenses the policy space into a smaller, representative set while\nmaintaining most of the original effectiveness. Our research focuses on\ndetermining the necessary sample size to learn this compressed set accurately.\nWe employ R\\'enyi divergence to measure the similarity between true and\nestimated policy distributions, establishing error bounds for good\napproximations. To simplify the analysis, we employ the $l_1$ norm, determining\nsample size requirements for both model-based and model-free settings. 
Finally,\nwe correlate the error bounds from the $l_1$ norm with those from R\\'enyi\ndivergence, distinguishing between policies near the vertices and those in the\nmiddle of the policy space, to determine the lower and upper bounds for the\nrequired sample sizes.\n","authors":["Majid Molaei","Marcello Restelli","Alberto Maria Metelli","Matteo Papini"],"pdf_url":"https://arxiv.org/pdf/2411.09900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09896v1","updated":"2024-11-15T02:44:32Z","published":"2024-11-15T02:44:32Z","title":"Revealing the Evolution of Order in Materials Microstructures Using\n Multi-Modal Computer Vision","summary":" The development of high-performance materials for microelectronics, energy\nstorage, and extreme environments depends on our ability to describe and direct\nproperty-defining microstructural order. Our present understanding is typically\nderived from laborious manual analysis of imaging and spectroscopy data, which\nis difficult to scale, challenging to reproduce, and lacks the ability to\nreveal latent associations needed for mechanistic models. Here, we demonstrate\na multi-modal machine learning (ML) approach to describe order from electron\nmicroscopy analysis of the complex oxide La$_{1-x}$Sr$_x$FeO$_3$. We construct\na hybrid pipeline based on fully and semi-supervised classification, allowing\nus to evaluate both the characteristics of each data modality and the value\neach modality adds to the ensemble. We observe distinct differences in the\nperformance of uni- and multi-modal models, from which we draw general lessons\nin describing crystal order using computer vision.\n","authors":["Arman Ter-Petrosyan","Michael Holden","Jenna A. Bilbrey","Sarah Akers","Christina Doty","Kayla H. Yano","Le Wang","Rajendra Paudel","Eric Lang","Khalid Hattar","Ryan B. Comes","Yingge Du","Bethany E. Matthews","Steven R. 
Spurgeon"],"pdf_url":"https://arxiv.org/pdf/2411.09896v1.pdf","comment":"30 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.09892v1","updated":"2024-11-15T02:36:36Z","published":"2024-11-15T02:36:36Z","title":"Deep learning robotics using self-supervised spatial differentiation\n drive autonomous contact-based semiconductor characterization","summary":" Integrating autonomous contact-based robotic characterization into\nself-driving laboratories can enhance measurement quality, reliability, and\nthroughput. While deep learning models support robust autonomy, current methods\nlack pixel-precision positioning and require extensive labeled data. To\novercome these challenges, we propose a self-supervised convolutional neural\nnetwork with a spatially differentiable loss function, incorporating shape\npriors to refine predictions of optimal robot contact poses for semiconductor\ncharacterization. This network improves valid pose generation by 20.0%,\nrelative to existing models. We demonstrate our network's performance by\ndriving a 4-degree-of-freedom robot to characterize photoconductivity at 3,025\npredicted poses across a gradient of perovskite compositions, achieving\nthroughputs over 125 measurements per hour. Spatially mapping photoconductivity\nonto each drop-casted film reveals regions of inhomogeneity. With this\nself-supervised deep learning-driven robotic system, we enable high-precision\nand reliable automation of contact-based characterization techniques at high\nthroughputs, thereby allowing the measurement of previously inaccessible yet\nimportant semiconductor properties for self-driving laboratories.\n","authors":["Alexander E. 
Siemenn","Basita Das","Kangyu Ji","Fang Sheng","Tonio Buonassisi"],"pdf_url":"https://arxiv.org/pdf/2411.09892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09891v1","updated":"2024-11-15T02:35:20Z","published":"2024-11-15T02:35:20Z","title":"Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward\n Augmented Imitation","summary":" Training a policy in a source domain for deployment in the target domain\nunder a dynamics shift can be challenging, often resulting in performance\ndegradation. Previous work tackles this challenge by training on the source\ndomain with modified rewards derived by matching distributions between the\nsource and the target optimal trajectories. However, pure modified rewards only\nensure the behavior of the learned policy in the source domain resembles\ntrajectories produced by the target optimal policies, which does not guarantee\noptimal performance when the learned policy is actually deployed to the target\ndomain. In this work, we propose to utilize imitation learning to transfer the\npolicy learned from the reward modification to the target domain so that the\nnew policy can generate the same trajectories in the target domain. 
Our\napproach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL),\nutilizes the reward modification for domain adaptation and follows the general\nframework of generative adversarial imitation learning from observation (GAIfO)\nby applying a reward augmented estimator for the policy optimization step.\nTheoretically, we present an error bound for our method under a mild assumption\nregarding the dynamics shift to justify the motivation of our method.\nEmpirically, our method outperforms the pure modified reward method without\nimitation learning and also outperforms other baselines in benchmark\noff-dynamics environments.\n","authors":["Yihong Guo","Yixuan Wang","Yuanyuan Shi","Pan Xu","Anqi Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09891v1.pdf","comment":"Published at Neurips 2024"},{"id":"http://arxiv.org/abs/2410.09298v2","updated":"2024-11-15T01:09:27Z","published":"2024-10-11T23:07:19Z","title":"DeepOSets: Non-Autoregressive In-Context Learning of Supervised Learning\n Operators","summary":" We introduce DeepSets Operator Networks (DeepOSets), an efficient,\nnon-autoregressive neural network architecture for in-context operator\nlearning. In-context learning allows a trained machine learning model to learn\nfrom a user prompt without further training. DeepOSets adds in-context learning\ncapabilities to Deep Operator Networks (DeepONets) by combining it with the\nDeepSets architecture. As the first non-autoregressive model for in-context\noperator learning, DeepOSets allow the user prompt to be processed in parallel,\nleading to significant computational savings. Here, we present the application\nof DeepOSets in the problem of learning supervised learning algorithms, which\nare operators mapping a finite-dimensional space of labeled data into an\ninfinite-dimensional hypothesis space of prediction functions. 
In an empirical\ncomparison with a popular autoregressive (transformer-based) model for\nin-context learning of linear regression in one and five dimensions, DeepOSets\nreduced the number of model weights by several orders of magnitude and required\na fraction of training and inference time. Furthermore, DeepOSets proved to be\nless sensitive to noise, significantly outperforming the transformer model in\nnoisy settings.\n","authors":["Shao-Ting Chiu","Junyuan Hong","Ulisses Braga-Neto"],"pdf_url":"https://arxiv.org/pdf/2410.09298v2.pdf","comment":"Janossy pooling results were added; Figures 1 and 2 were updated;\n minor edits were made throughout"},{"id":"http://arxiv.org/abs/2410.19715v2","updated":"2024-11-15T01:01:44Z","published":"2024-10-25T17:35:03Z","title":"Adversarial Environment Design via Regret-Guided Diffusion Models","summary":" Training agents that are robust to environmental changes remains a\nsignificant challenge in deep reinforcement learning (RL). Unsupervised\nenvironment design (UED) has recently emerged to address this issue by\ngenerating a set of training environments tailored to the agent's capabilities.\nWhile prior works demonstrate that UED has the potential to learn a robust\npolicy, their performance is constrained by the capabilities of the environment\ngeneration. To this end, we propose a novel UED algorithm, adversarial\nenvironment design via regret-guided diffusion models (ADD). The proposed\nmethod guides the diffusion-based environment generator with the regret of the\nagent to produce environments that the agent finds challenging but conducive to\nfurther improvement. By exploiting the representation power of diffusion\nmodels, ADD can directly generate adversarial environments while maintaining\nthe diversity of training environments, enabling the agent to effectively learn\na robust policy. 
Our experimental results demonstrate that the proposed method\nsuccessfully generates an instructive curriculum of environments, outperforming\nUED baselines in zero-shot generalization across novel, out-of-distribution\nenvironments. Project page: https://rllab-snu.github.io/projects/ADD\n","authors":["Hojun Chung","Junseo Lee","Minsoo Kim","Dohyeong Kim","Songhwai Oh"],"pdf_url":"https://arxiv.org/pdf/2410.19715v2.pdf","comment":"38th Conference on Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2410.21564v3","updated":"2024-11-15T00:32:50Z","published":"2024-10-28T21:54:44Z","title":"Mitigating Gradient Overlap in Deep Residual Networks with Gradient\n Normalization for Improved Non-Convex Optimization","summary":" In deep learning, Residual Networks (ResNets) have proven effective in\naddressing the vanishing gradient problem, allowing for the successful training\nof very deep networks. However, skip connections in ResNets can lead to\ngradient overlap, where gradients from both the learned transformation and the\nskip connection combine, potentially resulting in overestimated gradients. This\noverestimation can cause inefficiencies in optimization, as some updates may\novershoot optimal regions, affecting weight updates. To address this, we\nexamine Z-score Normalization (ZNorm) as a technique to manage gradient\noverlap. ZNorm adjusts the gradient scale, standardizing gradients across\nlayers and reducing the negative impact of overlapping gradients. Our\nexperiments demonstrate that ZNorm improves training process, especially in\nnon-convex optimization scenarios common in deep learning, where finding\noptimal solutions is challenging. 
These findings suggest that ZNorm can affect\nthe gradient flow, enhancing performance in large-scale data processing where\naccuracy is critical.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2410.21564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09856v1","updated":"2024-11-15T00:31:45Z","published":"2024-11-15T00:31:45Z","title":"InvestESG: A multi-agent reinforcement learning benchmark for studying\n climate investment as a social dilemma","summary":" InvestESG is a novel multi-agent reinforcement learning (MARL) benchmark\ndesigned to study the impact of Environmental, Social, and Governance (ESG)\ndisclosure mandates on corporate climate investments. Supported by both PyTorch\nand GPU-accelerated JAX framework, the benchmark models an intertemporal social\ndilemma where companies balance short-term profit losses from climate\nmitigation efforts and long-term benefits from reducing climate risk, while\nESG-conscious investors attempt to influence corporate behavior through their\ninvestment decisions. Companies allocate capital across mitigation,\ngreenwashing, and resilience, with varying strategies influencing climate\noutcomes and investor preferences. Our experiments show that without\nESG-conscious investors with sufficient capital, corporate mitigation efforts\nremain limited under the disclosure mandate. However, when a critical mass of\ninvestors prioritizes ESG, corporate cooperation increases, which in turn\nreduces climate risks and enhances long-term financial stability. Additionally,\nproviding more information about global climate risks encourages companies to\ninvest more in mitigation, even without investor involvement. Our findings\nalign with empirical research using real-world data, highlighting MARL's\npotential to inform policy by providing insights into large-scale\nsocio-economic challenges through efficient testing of alternative policy and\nmarket designs.\n","authors":["Xiaoxuan Hou","Jiayi Yuan","Joel Z. 
Leibo","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2411.09856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14567v3","updated":"2024-11-15T00:24:00Z","published":"2024-05-23T13:43:29Z","title":"EHRMamba: Towards Generalizable and Scalable Foundation Models for\n Electronic Health Records","summary":" Transformers have significantly advanced the modeling of Electronic Health\nRecords (EHR), yet their deployment in real-world healthcare is limited by\nseveral key challenges. Firstly, the quadratic computational cost and\ninsufficient context length of these models hinder hospitals' ability in\nprocessing the extensive medical histories typical in EHR data. Additionally,\nexisting models employ separate finetuning for each clinical task, complicating\nmaintenance in healthcare environments. Moreover, these models focus\nexclusively on either clinical prediction or EHR forecasting, lacking\nproficiency in both tasks. To overcome these limitations, we introduce\nEHRMamba, a robust foundation model built on the Mamba architecture. EHRMamba\ncan process sequences up to 300% longer than previous models due to its linear\ncomputational cost. We also introduce a novel approach to Multitask Prompted\nFinetuning (MPF) for EHR data, which enables EHRMamba to simultaneously learn\nmultiple clinical tasks in a single finetuning phase, significantly enhancing\ndeployment and cross-task generalization. Furthermore, our model leverages the\nHL7 FHIR data standard to simplify integration into existing hospital systems.\nAlongside EHRMamba, we open-source Odyssey, a toolkit designed to support the\ndevelopment and deployment of EHR foundation models, with an emphasis on data\nstandardization and interpretability. 
Our evaluations on the MIMIC-IV dataset\ndemonstrate that EHRMamba advances state-of-the-art performance across 6 major\nclinical tasks and excels in EHR forecasting, marking a significant leap\nforward in the field.\n","authors":["Adibvafa Fallahpour","Mahshid Alinoori","Wenqian Ye","Xu Cao","Arash Afkanpour","Amrit Krishnan"],"pdf_url":"https://arxiv.org/pdf/2405.14567v3.pdf","comment":"17 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2411.09854v1","updated":"2024-11-15T00:23:59Z","published":"2024-11-15T00:23:59Z","title":"Fair Secretaries with Unfair Predictions","summary":" Algorithms with predictions is a recent framework for decision-making under\nuncertainty that leverages the power of machine-learned predictions without\nmaking any assumption about their quality. The goal in this framework is for\nalgorithms to achieve an improved performance when the predictions are accurate\nwhile maintaining acceptable guarantees when the predictions are erroneous. A\nserious concern with algorithms that use predictions is that these predictions\ncan be biased and, as a result, cause the algorithm to make decisions that are\ndeemed unfair. We show that this concern manifests itself in the classical\nsecretary problem in the learning-augmented setting -- the state-of-the-art\nalgorithm can have zero probability of accepting the best candidate, which we\ndeem unfair, despite promising to accept a candidate whose expected value is at\nleast $\\max\\{\\Omega (1) , 1 - O(\\epsilon)\\}$ times the optimal value, where\n$\\epsilon$ is the prediction error. We show how to preserve this promise while\nalso guaranteeing to accept the best candidate with probability $\\Omega(1)$.\nOur algorithm and analysis are based on a new \"pegging\" idea that diverges from\nexisting works and simplifies/unifies some of their results. 
Finally, we extend\nto the $k$-secretary problem and complement our theoretical analysis with\nexperiments.\n","authors":["Eric Balkanski","Will Ma","Andreas Maggiori"],"pdf_url":"https://arxiv.org/pdf/2411.09854v1.pdf","comment":"to appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09853v1","updated":"2024-11-15T00:21:02Z","published":"2024-11-15T00:21:02Z","title":"KULCQ: An Unsupervised Keyword-based Utterance Level Clustering Quality\n Metric","summary":" Intent discovery is crucial for both building new conversational agents and\nimproving existing ones. While several approaches have been proposed for intent\ndiscovery, most rely on clustering to group similar utterances together.\nTraditional evaluation of these utterance clusters requires intent labels for\neach utterance, limiting scalability. Although some clustering quality metrics\nexist that do not require labeled data, they focus solely on cluster geometry\nwhile ignoring the linguistic nuances present in conversational transcripts. In\nthis paper, we introduce Keyword-based Utterance Level Clustering Quality\n(KULCQ), an unsupervised metric that leverages keyword analysis to evaluate\nclustering quality. We demonstrate KULCQ's effectiveness by comparing it with\nexisting unsupervised clustering metrics and validate its performance through\ncomprehensive ablation studies. 
Our results show that KULCQ better captures\nsemantic relationships in conversational data while maintaining consistency\nwith geometric clustering principles.\n","authors":["Pranav Guruprasad","Negar Mokhberian","Nikhil Varghese","Chandra Khatri","Amol Kelkar"],"pdf_url":"https://arxiv.org/pdf/2411.09853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09852v1","updated":"2024-11-15T00:20:36Z","published":"2024-11-15T00:20:36Z","title":"InterFormer: Towards Effective Heterogeneous Interaction Learning for\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction, which predicts the probability of a user\nclicking an ad, is a fundamental task in recommender systems. The emergence of\nheterogeneous information, such as user profile and behavior sequences, depicts\nuser interests from different aspects. A mutually beneficial integration of\nheterogeneous information is the cornerstone towards the success of CTR\nprediction. However, most of the existing methods suffer from two fundamental\nlimitations, including (1) insufficient inter-mode interaction due to the\nunidirectional information flow between modes, and (2) aggressive information\naggregation caused by early summarization, resulting in excessive information\nloss. To address the above limitations, we propose a novel module named\nInterFormer to learn heterogeneous information interaction in an interleaving\nstyle. To achieve better interaction learning, InterFormer enables\nbidirectional information flow for mutually beneficial learning across\ndifferent modes. To avoid aggressive information aggregation, we retain\ncomplete information in each data mode and use a separate bridging arch for\neffective information selection and summarization. 
Our proposed InterFormer\nachieves state-of-the-art performance on three public datasets and a\nlarge-scale industrial dataset.\n","authors":["Zhichen Zeng","Xiaolong Liu","Mengyue Hang","Xiaoyi Liu","Qinghai Zhou","Chaofei Yang","Yiqun Liu","Yichen Ruan","Laming Chen","Yuxin Chen","Yujia Hao","Jiaqi Xu","Jade Nie","Xi Liu","Buyun Zhang","Wei Wen","Siyang Yuan","Kai Wang","Wen-Yen Chen","Yiping Han","Huayu Li","Chunzhi Yang","Bo Long","Philip S. Yu","Hanghang Tong","Jiyan Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09852v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.10930v3","updated":"2024-11-15T00:09:44Z","published":"2024-01-31T17:52:52Z","title":"ConSmax: Hardware-Friendly Alternative Softmax with Learnable Parameters","summary":" The self-attention mechanism distinguishes transformer-based large language\nmodels (LLMs) apart from convolutional and recurrent neural networks. Despite\nthe performance improvement, achieving real-time LLM inference on silicon\nremains challenging due to the extensive use of Softmax in self-attention. In\naddition to the non-linearity, the low arithmetic intensity significantly\nlimits processing parallelism, especially when working with longer contexts. To\naddress this challenge, we propose Constant Softmax (ConSmax), a\nsoftware-hardware co-design that serves as an efficient alternative to Softmax.\nConSmax utilizes differentiable normalization parameters to eliminate the need\nfor maximum searching and denominator summation in Softmax. This approach\nenables extensive parallelization while still executing the essential functions\nof Softmax. Moreover, a scalable ConSmax hardware design with a bitwidth-split\nlook-up table (LUT) can achieve lossless non-linear operations and support\nmixed-precision computing. Experimental results show that ConSmax achieves a\nminuscule power consumption of 0.2mW and an area of 0.0008mm^2 at 1250MHz\nworking frequency in 16nm FinFET technology. 
For open-source contribution, we\nfurther implement our design with the OpenROAD toolchain under SkyWater's 130nm\nCMOS technology. The corresponding power is 2.69mW and the area is 0.007mm^2.\nConSmax achieves 3.35x power savings and 2.75x area savings in 16nm technology,\nand 3.15x power savings and 4.14x area savings with the open-source EDA\ntoolchain. In the meantime, it also maintains comparable accuracy on the GPT-2\nmodel and the WikiText103 dataset. The project is available at\nhttps://github.com/ReaLLMASIC/ConSmax\n","authors":["Shiwei Liu","Guanchen Tao","Yifei Zou","Derek Chow","Zichen Fan","Kauna Lei","Bangfei Pan","Dennis Sylvester","Gregory Kielian","Mehdi Saligane"],"pdf_url":"https://arxiv.org/pdf/2402.10930v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09851v1","updated":"2024-11-15T00:09:37Z","published":"2024-11-15T00:09:37Z","title":"SymbolFit: Automatic Parametric Modeling with Symbolic Regression","summary":" We introduce SymbolFit, a framework that automates parametric modeling by\nusing symbolic regression to perform a machine-search for functions that fit\nthe data, while simultaneously providing uncertainty estimates in a single run.\nTraditionally, constructing a parametric model to accurately describe binned\ndata has been a manual and iterative process, requiring an adequate functional\nform to be determined before the fit can be performed. The main challenge\narises when the appropriate functional forms cannot be derived from first\nprinciples, especially when there is no underlying true closed-form function\nfor the distribution. In this work, we address this problem by utilizing\nsymbolic regression, a machine learning technique that explores a vast space of\ncandidate functions without needing a predefined functional form, treating the\nfunctional form itself as a trainable parameter. Our approach is demonstrated\nin data analysis applications in high-energy physics experiments at the CERN\nLarge Hadron Collider (LHC). 
We demonstrate its effectiveness and efficiency\nusing five real proton-proton collision datasets from new physics searches at\nthe LHC, namely the background modeling in resonance searches for high-mass\ndijet, trijet, paired-dijet, diphoton, and dimuon events. We also validate the\nframework using several toy datasets with one and more variables.\n","authors":["Ho Fung Tsoi","Dylan Rankin","Cecile Caillol","Miles Cranmer","Sridhara Dasu","Javier Duarte","Philip Harris","Elliot Lipeles","Vladimir Loncar"],"pdf_url":"https://arxiv.org/pdf/2411.09851v1.pdf","comment":"53 pages, 35 figures. Under review"},{"id":"http://arxiv.org/abs/2411.09850v1","updated":"2024-11-15T00:06:57Z","published":"2024-11-15T00:06:57Z","title":"Enhancing Diffusion Posterior Sampling for Inverse Problems by\n Integrating Crafted Measurements","summary":" Diffusion models have emerged as a powerful foundation model for visual\ngeneration. With an appropriate sampling process, it can effectively serve as a\ngenerative prior to solve general inverse problems. Current posterior sampling\nbased methods take the measurement (i.e., degraded image sample) into the\nposterior sampling to infer the distribution of the target data (i.e., clean\nimage sample). However, in this manner, we show that high-frequency information\ncan be prematurely introduced during the early stages, which could induce\nlarger posterior estimate errors during the restoration sampling. To address\nthis issue, we first reveal that forming the log posterior gradient with the\nnoisy measurement ( i.e., samples from a diffusion forward process) instead of\nthe clean one can benefit the reverse process. Consequently, we propose a novel\ndiffusion posterior sampling method DPS-CM, which incorporates a Crafted\nMeasurement (i.e., samples generated by a reverse denoising process, compared\nto random sampling with noise in standard methods) to form the posterior\nestimate. 
This integration aims to mitigate the misalignment with the diffusion\nprior caused by cumulative posterior estimate errors. Experimental results\ndemonstrate that our approach significantly improves the overall capacity to\nsolve general and noisy inverse problems, such as Gaussian deblurring,\nsuper-resolution, inpainting, nonlinear deblurring, and tasks with Poisson\nnoise, relative to existing approaches.\n","authors":["Shijie Zhou","Huaisheng Zhu","Rohan Sharma","Ruiyi Zhang","Kaiyi Ji","Changyou Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06683v6","updated":"2024-11-15T00:00:24Z","published":"2023-01-17T03:53:29Z","title":"From Isolation to Collaboration: Federated Class-Heterogeneous Learning\n for Chest X-Ray Classification","summary":" Federated learning (FL) is a promising paradigm to collaboratively train a\nglobal chest x-ray (CXR) classification model using distributed datasets while\npreserving patient privacy. A significant, yet relatively underexplored,\nchallenge in FL is class-heterogeneity, where clients have different sets of\nclasses. We propose surgical aggregation, a FL method that uses selective\naggregation to collaboratively train a global model using distributed,\nclass-heterogeneous datasets. Unlike other methods, our method does not rely on\nthe assumption that clients share the same classes as other clients, know the\nclasses of other clients, or have access to a fully annotated dataset. We\nevaluate surgical aggregation using class-heterogeneous CXR datasets across IID\nand non-IID settings. Our results show that our method outperforms current\nmethods and has better generalizability.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. 
Parekh"],"pdf_url":"https://arxiv.org/pdf/2301.06683v6.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.10446v1","updated":"2024-11-15T18:59:51Z","published":"2024-11-15T18:59:51Z","title":"VeriGraph: Scene Graphs for Execution Verifiable Robot Planning","summary":" Recent advancements in vision-language models (VLMs) offer potential for\nrobot task planning, but challenges remain due to VLMs' tendency to generate\nincorrect action sequences. To address these limitations, we propose VeriGraph,\na novel framework that integrates VLMs for robotic planning while verifying\naction feasibility. VeriGraph employs scene graphs as an intermediate\nrepresentation, capturing key objects and spatial relationships to improve plan\nverification and refinement. The system generates a scene graph from input\nimages and uses it to iteratively check and correct action sequences generated\nby an LLM-based task planner, ensuring constraints are respected and actions\nare executable. Our approach significantly enhances task completion rates\nacross diverse manipulation scenarios, outperforming baseline methods by 58%\nfor language-based tasks and 30% for image-based tasks.\n","authors":["Daniel Ekpo","Mara Levy","Saksham Suri","Chuong Huynh","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2411.10446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08894v2","updated":"2024-11-15T18:58:47Z","published":"2024-10-28T21:15:49Z","title":"Temporal Patterns of Multiple Long-Term Conditions in Individuals with\n Intellectual Disability Living in Wales: An Unsupervised Clustering Approach\n to Disease Trajectories","summary":" Identifying and understanding the co-occurrence of multiple long-term\nconditions (MLTC) in individuals with intellectual disabilities (ID) is vital\nfor effective healthcare management. These individuals often face earlier onset\nand higher prevalence of MLTCs, yet specific co-occurrence patterns remain\nunexplored. 
This study applies an unsupervised approach to characterise MLTC\nclusters based on shared disease trajectories using electronic health records\n(EHRs) from 13069 individuals with ID in Wales (2000-2021). Disease\nassociations and temporal directionality were assessed, followed by spectral\nclustering to group shared trajectories. The population consisted of 52.3%\nmales and 47.7% females, with an average of 4.5 conditions per patient. Males\nunder 45 formed a single cluster dominated by neurological conditions (32.4%),\nwhile males above 45 had three clusters, the largest characterised circulatory\n(51.8%). Females under 45 formed one cluster with digestive conditions (24.6%)\nas most prevalent, while those aged 45 and older showed two clusters: one\ndominated by circulatory (34.1%), and the other by digestive (25.9%) and\nmusculoskeletal (21.9%) system conditions. Mental illness, epilepsy, and reflux\nwere common across groups. These clusters offer insights into disease\nprogression in individuals with ID, informing targeted interventions and\npersonalised healthcare strategies.\n","authors":["Rania Kousovista","Georgina Cosma","Emeka Abakasanga","Ashley Akbari","Francesco Zaccardi","Gyuchan Thomas Jun","Reza Kiani","Satheesh Gangadharan"],"pdf_url":"https://arxiv.org/pdf/2411.08894v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10436v1","updated":"2024-11-15T18:56:01Z","published":"2024-11-15T18:56:01Z","title":"Mitigating Hallucination in Multimodal Large Language Model via\n Hallucination-targeted Direct Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) are known to hallucinate, which\nlimits their practical applications. Recent works have attempted to apply\nDirect Preference Optimization (DPO) to enhance the performance of MLLMs, but\nhave shown inconsistent improvements in mitigating hallucinations. 
To address\nthis issue more effectively, we introduce Hallucination-targeted Direct\nPreference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike\nprevious approaches, our method tackles hallucinations from their diverse forms\nand causes. Specifically, we develop three types of preference pair data\ntargeting the following causes of MLLM hallucinations: (1) insufficient visual\ncapabilities, (2) long context generation, and (3) multimodal conflicts.\nExperimental results demonstrate that our method achieves superior performance\nacross multiple hallucination evaluation datasets, surpassing most\nstate-of-the-art (SOTA) methods and highlighting the potential of our approach.\nAblation studies and in-depth analyses further confirm the effectiveness of our\nmethod and suggest the potential for further improvements through scaling up.\n","authors":["Yuhan Fu","Ruobing Xie","Xingwu Sun","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2411.10436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10431v1","updated":"2024-11-15T18:53:08Z","published":"2024-11-15T18:53:08Z","title":"Mitigating Parameter Degeneracy using Joint Conditional Diffusion Model\n for WECC Composite Load Model in Power Systems","summary":" Data-driven modeling for dynamic systems has gained widespread attention in\nrecent years. Its inverse formulation, parameter estimation, aims to infer the\ninherent model parameters from observations. However, parameter degeneracy,\nwhere different combinations of parameters yield the same observable output,\nposes a critical barrier to accurately and uniquely identifying model\nparameters. In the context of WECC composite load model (CLM) in power systems,\nutility practitioners have observed that CLM parameters carefully selected for\none fault event may not perform satisfactorily in another fault. 
Here, we\ninnovate a joint conditional diffusion model-based inverse problem solver\n(JCDI), that incorporates a joint conditioning architecture with simultaneous\ninputs of multi-event observations to improve parameter generalizability.\nSimulation studies on the WECC CLM show that the proposed JCDI effectively\nreduces uncertainties of degenerate parameters, thus the parameter estimation\nerror is decreased by 42.1% compared to a single-event learning scheme. This\nenables the model to achieve high accuracy in predicting power trajectories\nunder different fault events, including electronic load tripping and motor\nstalling, outperforming standard deep reinforcement learning and supervised\nlearning approaches. We anticipate this work will contribute to mitigating\nparameter degeneracy in system dynamics, providing a general parameter\nestimation framework across various scientific domains.\n","authors":["Feiqin Zhu","Dmitrii Torbunov","Yihui Ren","Zhongjing Jiang","Tianqiao Zhao","Amirthagunaraj Yogarathnam","Meng Yue"],"pdf_url":"https://arxiv.org/pdf/2411.10431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10422v1","updated":"2024-11-15T18:42:48Z","published":"2024-11-15T18:42:48Z","title":"Evaluating Creativity and Deception in Large Language Models: A\n Simulation Framework for Multi-Agent Balderdash","summary":" Large Language Models (LLMs) have shown impressive capabilities in complex\ntasks and interactive environments, yet their creativity remains underexplored.\nThis paper introduces a simulation framework utilizing the game Balderdash to\nevaluate both the creativity and logical reasoning of LLMs. In Balderdash,\nplayers generate fictitious definitions for obscure terms to deceive others\nwhile identifying correct definitions. Our framework enables multiple LLM\nagents to participate in this game, assessing their ability to produce\nplausible definitions and strategize based on game rules and history. 
We\nimplemented a centralized game engine featuring various LLMs as participants\nand a judge LLM to evaluate semantic equivalence. Through a series of\nexperiments, we analyzed the performance of different LLMs, examining metrics\nsuch as True Definition Ratio, Deception Ratio, and Correct Guess Ratio. The\nresults provide insights into the creative and deceptive capabilities of LLMs,\nhighlighting their strengths and areas for improvement. Specifically, the study\nreveals that infrequent vocabulary in LLMs' input leads to poor reasoning on\ngame rules and historical context\n(https://github.com/ParsaHejabi/Simulation-Framework-for-Multi-Agent-Balderdash).\n","authors":["Parsa Hejabi","Elnaz Rahmati","Alireza S. Ziabari","Preni Golazizian","Jesse Thomason","Morteza Dehghani"],"pdf_url":"https://arxiv.org/pdf/2411.10422v1.pdf","comment":"Accepted at Wordplay: When Language Meets Games @ ACL 2024"},{"id":"http://arxiv.org/abs/2411.10416v1","updated":"2024-11-15T18:35:00Z","published":"2024-11-15T18:35:00Z","title":"Towards Automatic Evaluation of Task-Oriented Dialogue Flows","summary":" Task-oriented dialogue systems rely on predefined conversation schemes\n(dialogue flows) often represented as directed acyclic graphs. These flows can\nbe manually designed or automatically generated from previously recorded\nconversations. Due to variations in domain expertise or reliance on different\nsets of prior conversations, these dialogue flows can manifest in significantly\ndifferent graph structures. Despite their importance, there is no standard\nmethod for evaluating the quality of dialogue flows. We introduce FuDGE (Fuzzy\nDialogue-Graph Edit Distance), a novel metric that evaluates dialogue flows by\nassessing their structural complexity and representational coverage of the\nconversation data. FuDGE measures how well individual conversations align with\na flow and, consequently, how well a set of conversations is represented by the\nflow overall. 
Through extensive experiments on manually configured flows and\nflows generated by automated techniques, we demonstrate the effectiveness of\nFuDGE and its evaluation framework. By standardizing and optimizing dialogue\nflows, FuDGE enables conversational designers and automated techniques to\nachieve higher levels of efficiency and automation.\n","authors":["Mehrnoosh Mirtaheri","Nikhil Varghese","Chandra Khatri","Amol Kelkar"],"pdf_url":"https://arxiv.org/pdf/2411.10416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09584v2","updated":"2024-11-15T18:34:42Z","published":"2024-02-14T21:19:33Z","title":"Large Language Model-Based Interpretable Machine Learning Control in\n Building Energy Systems","summary":" The potential of Machine Learning Control (MLC) in HVAC systems is hindered\nby its opaque nature and inference mechanisms, which is challenging for users\nand modelers to fully comprehend, ultimately leading to a lack of trust in\nMLC-based decision-making. To address this challenge, this paper investigates\nand explores Interpretable Machine Learning (IML), a branch of Machine Learning\n(ML) that enhances transparency and understanding of models and their\ninferences, to improve the credibility of MLC and its industrial application in\nHVAC systems. Specifically, we developed an innovative framework that combines\nthe principles of Shapley values and the in-context learning feature of Large\nLanguage Models (LLMs). While the Shapley values are instrumental in dissecting\nthe contributions of various features in ML models, LLM provides an in-depth\nunderstanding of the non-data-driven or rule-based elements in MLC; combining\nthem, LLM further packages these insights into a coherent, human-understandable\nnarrative. The paper presents a case study to demonstrate the feasibility of\nthe developed IML framework for model predictive control-based precooling under\ndemand response events in a virtual testbed. 
The results indicate that the\ndeveloped framework generates and explains the control signals in accordance\nwith the rule-based rationale.\n","authors":["Liang Zhang","Zhelun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.09584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10411v1","updated":"2024-11-15T18:29:59Z","published":"2024-11-15T18:29:59Z","title":"Repurposing Stable Diffusion Attention for Training-Free Unsupervised\n Interactive Segmentation","summary":" Recent progress in interactive point prompt based Image Segmentation allows\nto significantly reduce the manual effort to obtain high quality semantic\nlabels. State-of-the-art unsupervised methods use self-supervised pre-trained\nmodels to obtain pseudo-labels which are used in training a prompt-based\nsegmentation model. In this paper, we propose a novel unsupervised and\ntraining-free approach based solely on the self-attention of Stable Diffusion.\nWe interpret the self-attention tensor as a Markov transition operator, which\nenables us to iteratively construct a Markov chain. Pixel-wise counting of the\nrequired number of iterations along the Markov-chain to reach a relative\nprobability threshold yields a Markov-iteration-map, which we simply call a\nMarkov-map. Compared to the raw attention maps, we show that our proposed\nMarkov-map has less noise, sharper semantic boundaries and more uniform values\nwithin semantically similar regions. We integrate the Markov-map in a simple\nyet effective truncated nearest neighbor framework to obtain interactive point\nprompt based segmentation. 
Despite being training-free, we experimentally show\nthat our approach yields excellent results in terms of Number of Clicks (NoC),\neven outperforming state-of-the-art training based unsupervised methods in most\nof the datasets.\n","authors":["Markus Karmann","Onay Urfalioglu"],"pdf_url":"https://arxiv.org/pdf/2411.10411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09579v2","updated":"2024-11-15T18:20:23Z","published":"2024-02-14T21:02:07Z","title":"Advancing Building Energy Modeling with Large Language Models:\n Exploration and Case Studies","summary":" The rapid progression in artificial intelligence has facilitated the\nemergence of large language models like ChatGPT, offering potential\napplications extending into specialized engineering modeling, especially\nphysics-based building energy modeling. This paper investigates the innovative\nintegration of large language models with building energy modeling software,\nfocusing specifically on the fusion of ChatGPT with EnergyPlus. A literature\nreview is first conducted to reveal a growing trend of incorporating large\nlanguage models in engineering modeling, albeit limited research on their\napplication in building energy modeling. We underscore the potential of large\nlanguage models in addressing building energy modeling challenges and outline\npotential applications including simulation input generation, simulation output\nanalysis and visualization, conducting error analysis, co-simulation,\nsimulation knowledge extraction and training, and simulation optimization.\nThree case studies reveal the transformative potential of large language models\nin automating and optimizing building energy modeling tasks, underscoring the\npivotal role of artificial intelligence in advancing sustainable building\npractices and energy efficiency. The case studies demonstrate that selecting\nthe right large language model techniques is essential to enhance performance\nand reduce engineering efforts. 
The findings advocate a multidisciplinary\napproach in future artificial intelligence research, with implications\nextending beyond building energy modeling to other specialized engineering\nmodeling.\n","authors":["Liang Zhang","Zhelun Chen","Vitaly Ford"],"pdf_url":"https://arxiv.org/pdf/2402.09579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10397v1","updated":"2024-11-15T18:03:52Z","published":"2024-11-15T18:03:52Z","title":"Features that Make a Difference: Leveraging Gradients for Improved\n Dictionary Learning","summary":" Sparse Autoencoders (SAEs) are a promising approach for extracting neural\nnetwork representations by learning a sparse and overcomplete decomposition of\nthe network's internal activations. However, SAEs are traditionally trained\nconsidering only activation values and not the effect those activations have on\ndownstream computations. This limits the information available to learn\nfeatures, and biases the autoencoder towards neglecting features which are\nrepresented with small activation values but strongly influence model outputs.\nTo address this, we introduce Gradient SAEs (g-SAEs), which modify the\n$k$-sparse autoencoder architecture by augmenting the TopK activation function\nto rely on the gradients of the input activation when selecting the $k$\nelements. For a given sparsity level, g-SAEs produce reconstructions that are\nmore faithful to original network performance when propagated through the\nnetwork. Additionally, we find evidence that g-SAEs learn latents that are on\naverage more effective at steering models in arbitrary contexts. By considering\nthe downstream effects of activations, our approach leverages the dual nature\nof neural network features as both $\\textit{representations}$, retrospectively,\nand $\\textit{actions}$, prospectively. 
While previous methods have approached\nthe problem of feature discovery primarily focused on the former aspect, g-SAEs\nrepresent a step towards accounting for the latter as well.\n","authors":["Jeffrey Olmo","Jared Wilson","Max Forsey","Bryce Hepner","Thomas Vin Howe","David Wingate"],"pdf_url":"https://arxiv.org/pdf/2411.10397v1.pdf","comment":"9 pages, 8 figures. Submitted to NAACL 2025"},{"id":"http://arxiv.org/abs/2407.00342v4","updated":"2024-11-15T17:59:10Z","published":"2024-06-29T07:01:51Z","title":"KPC-cF: Aspect-Based Sentiment Analysis via Implicit-Feature Alignment\n with Corpus Filtering","summary":" Investigations into Aspect-Based Sentiment Analysis (ABSA) for Korean\nindustrial reviews are notably lacking in the existing literature. Our research\nproposes an intuitive and effective framework for ABSA in low-resource\nlanguages such as Korean. It optimizes prediction labels by integrating\ntranslated benchmark and unlabeled Korean data. Using a model fine-tuned on\ntranslated data, we pseudo-labeled the actual Korean NLI set. Subsequently, we\napplied LaBSE and \\MSP{}-based filtering to this pseudo-NLI set as implicit\nfeature, enhancing Aspect Category Detection and Polarity determination through\nadditional training. Incorporating dual filtering, this model bridged dataset\ngaps, achieving positive results in Korean ABSA with minimal resources. Through\nadditional data injection pipelines, our approach aims to utilize high-resource\ndata and construct effective models within communities, whether corporate or\nindividual, in low-resource language countries. Compared to English ABSA, our\nframework showed an approximately 3\\% difference in F1 scores and accuracy. 
We\nrelease the dataset and our code for Korean ABSA, at this link.\n","authors":["Kibeom Nam"],"pdf_url":"https://arxiv.org/pdf/2407.00342v4.pdf","comment":"Work in Progress, DMLR@ICML 2024"},{"id":"http://arxiv.org/abs/2408.14090v2","updated":"2024-11-15T17:55:40Z","published":"2024-08-26T08:20:50Z","title":"Exploring GPU-to-GPU Communication: Insights into Supercomputer\n Interconnects","summary":" Multi-GPU nodes are increasingly common in the rapidly evolving landscape of\nexascale supercomputers. On these systems, GPUs on the same node are connected\nthrough dedicated networks, with bandwidths up to a few terabits per second.\nHowever, gauging performance expectations and maximizing system efficiency is\nchallenging due to different technologies, design options, and software layers.\nThis paper comprehensively characterizes three supercomputers - Alps, Leonardo,\nand LUMI - each with a unique architecture and design. We focus on performance\nevaluation of intra-node and inter-node interconnects on up to 4096 GPUs, using\na mix of intra-node and inter-node benchmarks. By analyzing its limitations and\nopportunities, we aim to offer practical guidance to researchers, system\narchitects, and software developers dealing with multi-GPU supercomputing. 
Our\nresults show that there is untapped bandwidth, and there are still many\nopportunities for optimization, ranging from network to software optimization.\n","authors":["Daniele De Sensi","Lorenzo Pichetti","Flavio Vella","Tiziano De Matteis","Zebin Ren","Luigi Fusco","Matteo Turisini","Daniele Cesarini","Kurt Lust","Animesh Trivedi","Duncan Roweth","Filippo Spiga","Salvatore Di Girolamo","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2408.14090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10389v1","updated":"2024-11-15T17:50:46Z","published":"2024-11-15T17:50:46Z","title":"Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets\n Using Key Point Localization","summary":" Internal crack detection has been a subject of focus in structural health\nmonitoring. By focusing on crack detection in structural datasets, it is\ndemonstrated that deep learning (DL) methods can effectively analyze seismic\nwave fields interacting with micro-scale cracks, which are beyond the\nresolution of conventional visual inspection. This work explores a novel\napplication of DL-based key point detection technique, where cracks are\nlocalized by predicting the coordinates of four key points that define a\nbounding region of the crack. The study not only opens new research directions\nfor non-visual applications but also effectively mitigates the impact of\nimbalanced data which poses a challenge for previous DL models, as it can be\nbiased toward predicting the majority class (non-crack regions). Popular DL\ntechniques, such as the Inception blocks, are used and investigated. 
The model\nshows an overall reduction in loss when applied to micro-scale crack detection\nand is reflected in the lower average deviation between the location of actual\nand predicted cracks, with an average Intersection over Union (IoU) being 0.511\nfor all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro\ncracks (greater than 4 micrometers).\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10385v1","updated":"2024-11-15T17:48:06Z","published":"2024-11-15T17:48:06Z","title":"Low-Latency Task-Oriented Communications with Multi-Round, Multi-Task\n Deep Learning","summary":" In this paper, we address task-oriented (or goal-oriented) communications\nwhere an encoder at the transmitter learns compressed latent representations of\ndata, which are then transmitted over a wireless channel. At the receiver, a\ndecoder performs a machine learning task, specifically for classifying the\nreceived signals. The deep neural networks corresponding to the encoder-decoder\npair are jointly trained, taking both channel and data characteristics into\naccount. Our objective is to achieve high accuracy in completing the underlying\ntask while minimizing the number of channel uses determined by the encoder's\noutput size. To this end, we propose a multi-round, multi-task learning (MRMTL)\napproach for the dynamic update of channel uses in multi-round transmissions.\nThe transmitter incrementally sends an increasing number of encoded samples\nover the channel based on the feedback from the receiver, and the receiver\nutilizes the signals from a previous round to enhance the task performance,\nrather than only considering the latest transmission. This approach employs\nmulti-task learning to jointly optimize accuracy across varying number of\nchannel uses, treating each configuration as a distinct task. 
By evaluating the\nconfidence of the receiver in task decisions, MRMTL decides on whether to\nallocate additional channel uses in multiple rounds. We characterize both the\naccuracy and the delay (total number of channel uses) of MRMTL, demonstrating\nthat it achieves the accuracy close to that of conventional methods requiring\nlarge numbers of channel uses, but with reduced delay by incorporating signals\nfrom a prior round. We consider the CIFAR-10 dataset, convolutional neural\nnetwork architectures, and AWGN and Rayleigh channel models for performance\nevaluation. We show that MRMTL significantly improves the efficiency of\ntask-oriented communications, balancing accuracy and latency effectively.\n","authors":["Yalin E. Sagduyu","Tugba Erpek","Aylin Yener","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2411.10385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10371v1","updated":"2024-11-15T17:19:42Z","published":"2024-11-15T17:19:42Z","title":"A Survey of Event Causality Identification: Principles, Taxonomy,\n Challenges, and Assessment","summary":" Event Causality Identification (ECI) has become a crucial task in Natural\nLanguage Processing (NLP), aimed at automatically extracting causalities from\ntextual data. In this survey, we systematically address the foundational\nprinciples, technical frameworks, and challenges of ECI, offering a\ncomprehensive taxonomy to categorize and clarify current research\nmethodologies, as well as a quantitative assessment of existing models. We\nfirst establish a conceptual framework for ECI, outlining key definitions,\nproblem formulations, and evaluation standards. Our taxonomy classifies ECI\nmethods according to the two primary tasks of sentence-level (SECI) and\ndocument-level (DECI) event causality identification. For SECI, we examine\nfeature pattern-based matching, deep semantic encoding, causal knowledge\npre-training and prompt-based fine-tuning, and external knowledge enhancement\nmethods. 
For DECI, we highlight approaches focused on event graph reasoning and\nprompt-based techniques to address the complexity of cross-sentence causal\ninference. Additionally, we analyze the strengths, limitations, and open\nchallenges of each approach. We further conduct an extensive quantitative\nevaluation of various ECI methods on two benchmark datasets. Finally, we\nexplore future research directions, highlighting promising pathways to overcome\ncurrent limitations and broaden ECI applications.\n","authors":["Zefan Zeng","Qing Cheng","Xingchen Hu","Yuehang Si","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.10371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10369v1","updated":"2024-11-15T17:19:18Z","published":"2024-11-15T17:19:18Z","title":"Towards High-Fidelity 3D Portrait Generation with Rich Details by\n Cross-View Prior-Aware Diffusion","summary":" Recent diffusion-based Single-image 3D portrait generation methods typically\nemploy 2D diffusion models to provide multi-view knowledge, which is then\ndistilled into 3D representations. However, these methods usually struggle to\nproduce high-fidelity 3D models, frequently yielding excessively blurred\ntextures. We attribute this issue to the insufficient consideration of\ncross-view consistency during the diffusion process, resulting in significant\ndisparities between different views and ultimately leading to blurred 3D\nrepresentations. In this paper, we address this issue by comprehensively\nexploiting multi-view priors in both the conditioning and diffusion procedures\nto produce consistent, detail-rich portraits. From the conditioning standpoint,\nwe propose a Hybrid Priors Diffsion model, which explicitly and implicitly\nincorporates multi-view priors as conditions to enhance the status consistency\nof the generated multi-view portraits. 
From the diffusion perspective,\nconsidering the significant impact of the diffusion noise distribution on\ndetailed texture generation, we propose a Multi-View Noise Resamplig Strategy\nintegrated within the optimization process leveraging cross-view priors to\nenhance representation consistency. Extensive experiments demonstrate that our\nmethod can produce 3D portraits with accurate geometry and rich details from a\nsingle image. The project page is at\n\\url{https://haoran-wei.github.io/Portrait-Diffusion}.\n","authors":["Haoran Wei","Wencheng Han","Xingping Dong","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.10369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23472v2","updated":"2024-11-15T17:18:57Z","published":"2024-10-30T21:32:56Z","title":"Risk Sources and Risk Management Measures in Support of Standards for\n General-Purpose AI Systems","summary":" There is an urgent need to identify both short and long-term risks from newly\nemerging types of Artificial Intelligence (AI), as well as available risk\nmanagement measures. In response, and to support global efforts in regulating\nAI and writing safety standards, we compile an extensive catalog of risk\nsources and risk management measures for general-purpose AI (GPAI) systems,\ncomplete with descriptions and supporting examples where relevant. This work\ninvolves identifying technical, operational, and societal risks across model\ndevelopment, training, and deployment stages, as well as surveying established\nand experimental methods for managing these risks. To the best of our\nknowledge, this paper is the first of its kind to provide extensive\ndocumentation of both GPAI risk sources and risk management measures that are\ndescriptive, self-contained and neutral with respect to any existing regulatory\nframework. This work intends to help AI providers, standards experts,\nresearchers, policymakers, and regulators in identifying and mitigating\nsystemic risks from GPAI systems. 
For this reason, the catalog is released\nunder a public domain license for ease of direct use by stakeholders in AI\ngovernance and standards.\n","authors":["Rokas Gipiškis","Ayrton San Joaquin","Ze Shen Chin","Adrian Regenfuß","Ariel Gil","Koen Holtman"],"pdf_url":"https://arxiv.org/pdf/2410.23472v2.pdf","comment":"92 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.10368v1","updated":"2024-11-15T17:17:46Z","published":"2024-11-15T17:17:46Z","title":"Mechanisms of Generative Image-to-Image Translation Networks","summary":" Generative Adversarial Networks (GANs) are a class of neural networks that\nhave been widely used in the field of image-to-image translation. In this\npaper, we propose a streamlined image-to-image translation network with a\nsimpler architecture compared to existing models. We investigate the\nrelationship between GANs and autoencoders and provide an explanation for the\nefficacy of employing only the GAN component for tasks involving image\ntranslation. We show that adversarial for GAN models yields results comparable\nto those of existing methods without additional complex loss penalties.\nSubsequently, we elucidate the rationale behind this phenomenon. We also\nincorporate experimental results to demonstrate the validity of our findings.\n","authors":["Guangzong Chen","Mingui Sun","Zhi-Hong Mao","Kangni Liu","Wenyan Jia"],"pdf_url":"https://arxiv.org/pdf/2411.10368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10367v1","updated":"2024-11-15T17:17:06Z","published":"2024-11-15T17:17:06Z","title":"Continual Adversarial Reinforcement Learning (CARL) of False Data\n Injection detection: forgetting and explainability","summary":" False data injection attacks (FDIAs) on smart inverters are a growing concern\nlinked to increased renewable energy production. 
While data-based FDIA\ndetection methods are also actively developed, we show that they remain\nvulnerable to impactful and stealthy adversarial examples that can be crafted\nusing Reinforcement Learning (RL). We propose to include such adversarial\nexamples in data-based detection training procedure via a continual adversarial\nRL (CARL) approach. This way, one can pinpoint the deficiencies of data-based\ndetection, thereby offering explainability during their incremental\nimprovement. We show that a continual learning implementation is subject to\ncatastrophic forgetting, and additionally show that forgetting can be addressed\nby employing a joint training strategy on all generated FDIA scenarios.\n","authors":["Pooja Aslami","Kejun Chen","Timothy M. Hansen","Malik Hassanaly"],"pdf_url":"https://arxiv.org/pdf/2411.10367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10364v1","updated":"2024-11-15T17:14:18Z","published":"2024-11-15T17:14:18Z","title":"Forming Auxiliary High-confident Instance-level Loss to Promote Learning\n from Label Proportions","summary":" Learning from label proportions (LLP), i.e., a challenging weakly-supervised\nlearning task, aims to train a classifier by using bags of instances and the\nproportions of classes within bags, rather than annotated labels for each\ninstance. Beyond the traditional bag-level loss, the mainstream methodology of\nLLP is to incorporate an auxiliary instance-level loss with pseudo-labels\nformed by predictions. Unfortunately, we empirically observed that the\npseudo-labels are are often inaccurate due to over-smoothing, especially for\nthe scenarios with large bag sizes, hurting the classifier induction. To\nalleviate this problem, we suggest a novel LLP method, namely Learning from\nLabel Proportions with Auxiliary High-confident Instance-level Loss\n(L^2P-AHIL). Specifically, we propose a dual entropy-based weight (DEW) method\nto adaptively measure the confidences of pseudo-labels. 
It simultaneously\nemphasizes accurate predictions at the bag level and avoids overly smoothed\npredictions. We then form high-confident instance-level loss with DEW, and\njointly optimize it with the bag-level loss in a self-training manner. The\nexperimental results on benchmark datasets show that L^2P-AHIL can surpass the\nexisting baseline methods, and the performance gain can be more significant as\nthe bag size increases.\n","authors":["Tianhao Ma","Han Chen","Juncheng Hu","Yungang Zhu","Ximing Li"],"pdf_url":"https://arxiv.org/pdf/2411.10364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14279v3","updated":"2024-11-15T17:11:08Z","published":"2024-02-22T04:41:52Z","title":"Mitigating the Linguistic Gap with Phonemic Representations for Robust\n Cross-lingual Transfer","summary":" Approaches to improving multilingual language understanding often struggle\nwith significant performance gaps between high-resource and low-resource\nlanguages. While there are efforts to align the languages in a single latent\nspace to mitigate such gaps, how different input-level representations\ninfluence such gaps has not been investigated, particularly with phonemic\ninputs. We hypothesize that the performance gaps are affected by representation\ndiscrepancies between these languages, and revisit the use of phonemic\nrepresentations as a means to mitigate these discrepancies. To demonstrate the\neffectiveness of phonemic representations, we present experiments on three\nrepresentative cross-lingual tasks on 12 languages in total. 
The results show\nthat phonemic representations exhibit higher similarities between languages\ncompared to orthographic representations, and it consistently outperforms\ngrapheme-based baseline model on languages that are relatively low-resourced.\nWe present quantitative evidence from three cross-lingual tasks that\ndemonstrate the effectiveness of phonemic representations, and it is further\njustified by a theoretical analysis of the cross-lingual performance gap.\n","authors":["Haeji Jung","Changdae Oh","Jooeon Kang","Jimin Sohn","Kyungwoo Song","Jinkyu Kim","David R. Mortensen"],"pdf_url":"https://arxiv.org/pdf/2402.14279v3.pdf","comment":"Accepted to the 4th Multilingual Representation Learning (MRL)\n Workshop (co-located with EMNLP 2024)"},{"id":"http://arxiv.org/abs/2411.09102v2","updated":"2024-11-15T17:09:40Z","published":"2024-11-14T00:18:25Z","title":"Provocation: Who benefits from \"inclusion\" in Generative AI?","summary":" The demands for accurate and representative generative AI systems means there\nis an increased demand on participatory evaluation structures. While these\nparticipatory structures are paramount to to ensure non-dominant values,\nknowledge and material culture are also reflected in AI models and the media\nthey generate, we argue that dominant structures of community participation in\nAI development and evaluation are not explicit enough about the benefits and\nharms that members of socially marginalized groups may experience as a result\nof their participation. Without explicit interrogation of these benefits by AI\ndevelopers, as a community we may remain blind to the immensity of systemic\nchange that is needed as well. To support this provocation, we present a\nspeculative case study, developed from our own collective experiences as AI\nresearchers. 
We use this speculative context to itemize the barriers that need\nto be overcome in order for the proposed benefits to marginalized communities\nto be realized, and harms mitigated.\n","authors":["Samantha Dalal","Siobhan Mackenzie Hall","Nari Johnson"],"pdf_url":"https://arxiv.org/pdf/2411.09102v2.pdf","comment":"3 pages, 1 figure. Published as a Short Paper in the NeurIPS 2024\n Workshop on Evaluating Evaluations: Examining Best Practices for Measuring\n Broader Impacts of Generative AI"},{"id":"http://arxiv.org/abs/2411.10340v1","updated":"2024-11-15T16:40:43Z","published":"2024-11-15T16:40:43Z","title":"Domain Adaptation-based Edge Computing for Cross-Conditions Fault\n Diagnosis","summary":" Fault diagnosis technology supports the healthy operation of mechanical\nequipment. However, the variations conditions during the operation of\nmechanical equipment lead to significant disparities in data distribution,\nposing challenges to fault diagnosis. Furthermore, when deploying applications,\ntraditional methods often encounter issues such as latency and data security.\nTherefore, conducting fault diagnosis and deploying application methods under\ncross-operating conditions holds significant value. This paper proposes a\ndomain adaptation-based lightweight fault diagnosis framework for edge\ncomputing scenarios. Incorporating the local maximum mean discrepancy into\nknowledge transfer aligns the feature distributions of different domains in a\nhigh-dimensional feature space, to discover a common feature space across\ndomains. The acquired fault diagnosis expertise from the cloud-model is\ntransferred to the lightweight edge-model using adaptation knowledge transfer\nmethods. While ensuring real-time diagnostic capabilities, accurate fault\ndiagnosis is achieved across working conditions. We conducted validation\nexperiments on the NVIDIA Jetson Xavier NX kit. 
In terms of diagnostic\nperformance, the proposed method significantly improved diagnostic accuracy,\nwith average increases of 34.44% and 17.33% compared to the comparison method,\nrespectively. Regarding lightweight effectiveness, proposed method achieved an\naverage inference speed increase of 80.47%. Additionally, compared to the\ncloud-model, the parameter count of the edge-model decreased by 96.37%, while\nthe Flops decreased by 83.08%.\n","authors":["Yanzhi Wang","Chu Wang","Jinhong Wu","Ziyang Yu","Qi Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.10340v1.pdf","comment":"28 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.10329v1","updated":"2024-11-15T16:29:02Z","published":"2024-11-15T16:29:02Z","title":"Safe Text-to-Image Generation: Simply Sanitize the Prompt Embedding","summary":" In recent years, text-to-image (T2I) generation models have made significant\nprogress in generating high-quality images that align with text descriptions.\nHowever, these models also face the risk of unsafe generation, potentially\nproducing harmful content that violates usage policies, such as explicit\nmaterial. Existing safe generation methods typically focus on suppressing\ninappropriate content by erasing undesired concepts from visual\nrepresentations, while neglecting to sanitize the textual representation.\nAlthough these methods help mitigate the risk of misuse to certain extent,\ntheir robustness remains insufficient when dealing with adversarial attacks.\n Given that semantic consistency between input text and output image is a\nfundamental requirement for T2I models, we identify that textual\nrepresentations (i.e., prompt embeddings) are likely the primary source of\nunsafe generation. To this end, we propose a vision-agnostic safe generation\nframework, Embedding Sanitizer (ES), which focuses on erasing inappropriate\nconcepts from prompt embeddings and uses the sanitized embeddings to guide the\nmodel for safe generation. 
ES is applied to the output of the text encoder as a\nplug-and-play module, enabling seamless integration with different T2I models\nas well as other safeguards. In addition, ES's unique scoring mechanism assigns\na score to each token in the prompt to indicate its potential harmfulness, and\ndynamically adjusts the sanitization intensity to balance defensive performance\nand generation quality. Through extensive evaluation on five prompt benchmarks,\nour approach achieves state-of-the-art robustness by sanitizing the source\n(prompt embedding) of unsafe generation compared to nine baseline methods. It\nsignificantly outperforms existing safeguards in terms of interpretability and\ncontrollability while maintaining generation quality.\n","authors":["Huming Qiu","Guanxu Chen","Mi Zhang","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10323v1","updated":"2024-11-15T16:23:52Z","published":"2024-11-15T16:23:52Z","title":"The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer\n Use","summary":" The recently released model, Claude 3.5 Computer Use, stands out as the first\nfrontier AI model to offer computer use in public beta as a graphical user\ninterface (GUI) agent. As an early beta, its capability in the real-world\ncomplex environment remains unknown. In this case study to explore Claude 3.5\nComputer Use, we curate and organize a collection of carefully designed tasks\nspanning a variety of domains and software. Observations from these cases\ndemonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end\nlanguage to desktop actions. Along with this study, we provide an\nout-of-the-box agent framework for deploying API-based GUI automation models\nwith easy implementation. 
Our case studies aim to showcase a groundwork of\ncapabilities and limitations of Claude 3.5 Computer Use with detailed analyses\nand bring to the fore questions about planning, action, and critic, which must\nbe considered for future improvement. We hope this preliminary exploration will\ninspire future research into the GUI agent community. All the test cases in the\npaper can be tried through the project:\nhttps://github.com/showlab/computer_use_ootb.\n","authors":["Siyuan Hu","Mingyu Ouyang","Difei Gao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2411.10323v1.pdf","comment":"40 pages, 21 figures, preprint"},{"id":"http://arxiv.org/abs/2406.14377v2","updated":"2024-11-15T16:23:15Z","published":"2024-06-20T14:45:13Z","title":"CE-SSL: Computation-Efficient Semi-Supervised Learning for ECG-based\n Cardiovascular Diseases Detection","summary":" The label scarcity problem is the main challenge that hinders the wide\napplication of deep learning systems in automatic cardiovascular diseases\n(CVDs) detection using electrocardiography (ECG). Tuning pre-trained models\nalleviates this problem by transferring knowledge learned from large datasets\nto downstream small datasets. However, bottlenecks in computational efficiency\nand detection performance limit its clinical applications. It is difficult to\nimprove the detection performance without significantly sacrificing the\ncomputational efficiency during model training. Here, we propose a\ncomputation-efficient semi-supervised learning paradigm (CE-SSL) for robust and\ncomputation-efficient CVDs detection using ECG. It enables a robust adaptation\nof pre-trained models on downstream datasets with limited supervision and high\ncomputational efficiency. 
First, a random-deactivation technique is developed\nto achieve robust and fast low-rank adaptation of pre-trained weights.\nSubsequently, we propose a one-shot rank allocation module to determine the\noptimal ranks for the update matrices of the pre-trained weights. Finally, a\nlightweight semi-supervised learning pipeline is introduced to enhance model\nperformance by leveraging labeled and unlabeled data with high computational\nefficiency. Extensive experiments on four downstream datasets demonstrate that\nCE-SSL not only outperforms the state-of-the-art methods in multi-label CVDs\ndetection but also consumes fewer GPU footprints, training time, and parameter\nstorage space. As such, this paradigm provides an effective solution for\nachieving high computational efficiency and robust detection performance in the\nclinical applications of pre-trained models under limited supervision. Code and\nSupplementary Materials are available at https://github.com/KAZABANA/CE-SSL\n","authors":["Rushuang Zhou","Lei Clifton","Zijun Liu","Kannie W. Y. Chan","David A. Clifton","Yuan-Ting Zhang","Yining Dong"],"pdf_url":"https://arxiv.org/pdf/2406.14377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08954v2","updated":"2024-11-15T16:06:23Z","published":"2024-11-13T19:00:02Z","title":"Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply\n Better Samples","summary":" Although diffusion models can generate remarkably high-quality samples, they\nare intrinsically bottlenecked by their expensive iterative sampling procedure.\nConsistency models (CMs) have recently emerged as a promising diffusion model\ndistillation method, reducing the cost of sampling by generating high-fidelity\nsamples in just a few iterations. Consistency model distillation aims to solve\nthe probability flow ordinary differential equation (ODE) defined by an\nexisting diffusion model. 
CMs are not directly trained to minimize error\nagainst an ODE solver, rather they use a more computationally tractable\nobjective. As a way to study how effectively CMs solve the probability flow\nODE, and the effect that any induced error has on the quality of generated\nsamples, we introduce Direct CMs, which \\textit{directly} minimize this error.\nIntriguingly, we find that Direct CMs reduce the ODE solving error compared to\nCMs but also result in significantly worse sample quality, calling into\nquestion why exactly CMs work well in the first place. Full code is available\nat: https://github.com/layer6ai-labs/direct-cms.\n","authors":["Noël Vouitsis","Rasa Hosseinzadeh","Brendan Leigh Ross","Valentin Villecroze","Satya Krishna Gorti","Jesse C. Cresswell","Gabriel Loaiza-Ganem"],"pdf_url":"https://arxiv.org/pdf/2411.08954v2.pdf","comment":"NeurIPS 2024 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2411.10308v1","updated":"2024-11-15T16:04:01Z","published":"2024-11-15T16:04:01Z","title":"A Realistic Collimated X-Ray Image Simulation Pipeline","summary":" Collimator detection remains a challenging task in X-ray systems with\nunreliable or non-available information about the detectors position relative\nto the source. This paper presents a physically motivated image processing\npipeline for simulating the characteristics of collimator shadows in X-ray\nimages. By generating randomized labels for collimator shapes and locations,\nincorporating scattered radiation simulation, and including Poisson noise, the\npipeline enables the expansion of limited datasets for training deep neural\nnetworks. We validate the proposed pipeline by a qualitative and quantitative\ncomparison against real collimator shadows. 
Furthermore, it is demonstrated\nthat utilizing simulated data within our deep learning framework not only\nserves as a suitable substitute for actual collimators but also enhances the\ngeneralization performance when applied to real-world data.\n","authors":["Benjamin El-Zein","Dominik Eckert","Thomas Weber","Maximilian Rohleder","Ludwig Ritschl","Steffen Kappler","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2411.10308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09871v4","updated":"2024-11-15T16:01:39Z","published":"2024-03-14T21:01:06Z","title":"ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric\n Thermal Images","summary":" Designing egocentric 3D hand pose estimation systems that can perform\nreliably in complex, real-world scenarios is crucial for downstream\napplications. Previous approaches using RGB or NIR imagery struggle in\nchallenging conditions: RGB methods are susceptible to lighting variations and\nobstructions like handwear, while NIR techniques can be disrupted by sunlight\nor interference from other NIR-equipped devices. To address these limitations,\nwe present ThermoHands, the first benchmark focused on thermal image-based\negocentric 3D hand pose estimation, demonstrating the potential of thermal\nimaging to achieve robust performance under these conditions. The benchmark\nincludes a multi-view and multi-spectral dataset collected from 28 subjects\nperforming hand-object and hand-virtual interactions under diverse scenarios,\naccurately annotated with 3D hand poses through an automated process. We\nintroduce a new baseline method, TherFormer, utilizing dual transformer modules\nfor effective egocentric 3D hand pose estimation in thermal imagery. 
Our\nexperimental results highlight TherFormer's leading performance and affirm\nthermal imaging's effectiveness in enabling robust 3D hand pose estimation in\nadverse conditions.\n","authors":["Fangqiang Ding","Yunzhou Zhu","Xiangyu Wen","Gaowen Liu","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.09871v4.pdf","comment":"15 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.10293v1","updated":"2024-11-15T15:51:25Z","published":"2024-11-15T15:51:25Z","title":"RETR: Multi-View Radar Detection Transformer for Indoor Perception","summary":" Indoor radar perception has seen rising interest due to affordable costs\ndriven by emerging automotive imaging radar developments and the benefits of\nreduced privacy concerns and reliability under hazardous conditions (e.g., fire\nand smoke). However, existing radar perception pipelines fail to account for\ndistinctive characteristics of the multi-view radar setting. In this paper, we\npropose Radar dEtection TRansformer (RETR), an extension of the popular DETR\narchitecture, tailored for multi-view radar perception. RETR inherits the\nadvantages of DETR, eliminating the need for hand-crafted components for object\ndetection and segmentation in the image plane. More importantly, RETR\nincorporates carefully designed modifications such as 1) depth-prioritized\nfeature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss\nfrom both radar and camera coordinates; and 3) a learnable radar-to-camera\ntransformation via reparameterization, to account for the unique multi-view\nradar setting. 
Evaluated on two indoor radar perception datasets, our approach\noutperforms existing state-of-the-art methods by a margin of 15.38+ AP for\nobject detection and 11.77+ IoU for instance segmentation, respectively.\n","authors":["Ryoma Yataka","Adriano Cardace","Pu Perry Wang","Petros Boufounos","Ryuhei Takahashi"],"pdf_url":"https://arxiv.org/pdf/2411.10293v1.pdf","comment":"24 pages, Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10290v1","updated":"2024-11-15T15:47:32Z","published":"2024-11-15T15:47:32Z","title":"The ParClusterers Benchmark Suite (PCBS): A Fine-Grained Analysis of\n Scalable Graph Clustering","summary":" We introduce the ParClusterers Benchmark Suite (PCBS) -- a collection of\nhighly scalable parallel graph clustering algorithms and benchmarking tools\nthat streamline comparing different graph clustering algorithms and\nimplementations.\n The benchmark includes clustering algorithms that target a wide range of\nmodern clustering use cases, including community detection, classification, and\ndense subgraph mining.\n The benchmark toolkit makes it easy to run and evaluate multiple instances of\ndifferent clustering algorithms, which can be useful for fine-tuning the\nperformance of clustering on a given task, and for comparing different\nclustering algorithms based on different metrics of interest, including\nclustering quality and running time.\n Using PCBS, we evaluate a broad collection of real-world graph clustering\ndatasets. Somewhat surprisingly, we find that the best quality results are\nobtained by algorithms that not included in many popular graph clustering\ntoolkits. The PCBS provides a standardized way to evaluate and judge the\nquality-performance tradeoffs of the active research area of scalable graph\nclustering algorithms. 
We believe it will help enable fair, accurate, and\nnuanced evaluation of graph clustering algorithms in the future.\n","authors":["Shangdi Yu","Jessica Shi","Jamison Meindl","David Eisenstat","Xiaoen Ju","Sasan Tavakkol","Laxman Dhulipala","Jakub Łącki","Vahab Mirrokni","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2411.10290v1.pdf","comment":"This is a preliminary version of a paper that will appear at VLDB'25"},{"id":"http://arxiv.org/abs/2410.07364v2","updated":"2024-11-15T15:46:00Z","published":"2024-10-09T18:24:23Z","title":"Unlocking Real-Time Fluorescence Lifetime Imaging: Multi-Pixel\n Parallelism for FPGA-Accelerated Processing","summary":" Fluorescence lifetime imaging (FLI) is a widely used technique in the\nbiomedical field for measuring the decay times of fluorescent molecules,\nproviding insights into metabolic states, protein interactions, and\nligand-receptor bindings. However, its broader application in fast biological\nprocesses, such as dynamic activity monitoring, and clinical use, such as in\nguided surgery, is limited by long data acquisition times and computationally\ndemanding data processing. While deep learning has reduced post-processing\ntimes, time-resolved data acquisition remains a bottleneck for real-time\napplications. To address this, we propose a method to achieve real-time FLI\nusing an FPGA-based hardware accelerator. Specifically, we implemented a\nGRU-based sequence-to-sequence (Seq2Seq) model on an FPGA board compatible with\ntime-resolved cameras. The GRU model balances accurate processing with the\nresource constraints of FPGAs, which have limited DSP units and BRAM. The\nlimited memory and computational resources on the FPGA require efficient\nscheduling of operations and memory allocation to deploy deep learning models\nfor low-latency applications. We address these challenges by using STOMP, a\nqueue-based discrete-event simulator that automates and optimizes task\nscheduling and memory management on hardware. 
By integrating a GRU-based\nSeq2Seq model and its compressed version, called Seq2SeqLite, generated through\nknowledge distillation, we were able to process multiple pixels in parallel,\nreducing latency compared to sequential processing. We explore various levels\nof parallelism to achieve an optimal balance between performance and resource\nutilization. Our results indicate that the proposed techniques achieved a 17.7x\nand 52.0x speedup over manual scheduling for the Seq2Seq model and the\nSeq2SeqLite model, respectively.\n","authors":["Ismail Erbas","Aporva Amarnath","Vikas Pandey","Karthik Swaminathan","Naigang Wang","Xavier Intes"],"pdf_url":"https://arxiv.org/pdf/2410.07364v2.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.03500v2","updated":"2024-11-15T15:42:46Z","published":"2024-09-05T13:12:16Z","title":"Disclosure of AI-Generated News Increases Engagement but Does Not Reduce\n Aversion, Despite Positive Quality Ratings","summary":" The advancement of artificial intelligence (AI) has led to its application in\nmany areas, including news media. The integration of AI in journalism presents\nboth opportunities and risks for democracy, making it crucial to understand\npublic reception of and engagement with AI-generated news, as it may directly\ninfluence political knowledge and trust. This preregistered study investigates\n(i) the perceived quality of AI-assisted and AI-generated versus\nhuman-generated news articles, (ii) whether disclosure of AI's involvement in\ngenerating these news articles influences engagement with them, and (iii)\nwhether such awareness affects the willingness to read AI-generated articles in\nthe future. We employed a between-subjects survey experiment with 599\nparticipants from the German-speaking part of Switzerland, who evaluated the\ncredibility, readability, and expertise of news articles. 
These articles were\neither written by journalists (control group), rewritten by AI (AI-assisted\ngroup), or entirely generated by AI (AI-generated group). Our results indicate\nthat all news articles, regardless of whether they were written by journalists\nor AI, were perceived to be of equal quality. When participants in the\ntreatment groups were subsequently made aware of AI's involvement in generating\nthe articles, they expressed a higher willingness to engage with (i.e.,\ncontinue reading) the articles than participants in the control group. However,\nthey were not more willing to read AI-generated news in the future. These\nresults suggest that aversion to AI usage in news media is not primarily rooted\nin a perceived lack of quality, and that by disclosing using AI, journalists\ncould attract more immediate engagement with their content, at least in the\nshort term.\n","authors":["Fabrizio Gilardi","Sabrina Di Lorenzo","Juri Ezzaini","Beryl Santa","Benjamin Streiff","Eric Zurfluh","Emma Hoes"],"pdf_url":"https://arxiv.org/pdf/2409.03500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10285v1","updated":"2024-11-15T15:40:49Z","published":"2024-11-15T15:40:49Z","title":"Systolic Arrays and Structured Pruning Co-design for Efficient\n Transformers in Edge Systems","summary":" Efficient deployment of resource-intensive transformers on edge devices\nnecessitates cross-stack optimization. We thus study the interrelation between\nstructured pruning and systolic acceleration, matching the size of pruned\nblocks with the systolic array dimensions. In this setting, computations of\npruned weight blocks can be skipped, reducing run-time and energy consumption,\nbut potentially impacting quality of service (QoS). To evaluate the trade-offs\nbetween systolic array size and sparsity opportunities, we present a novel\nco-design framework that integrates algorithmic optimization, system\nsimulation, and hardware design. 
Targeting speech recognition using\ntransformers as a case study, we analyze how configuration choices across the\nstack affect performance metrics. Results demonstrate that structured pruning\non systems featuring systolic array acceleration can effectively increase\nperformance, while maintaining high QoS levels. Up to 26% system-wide speedups\ndue to structured pruning were measured, with only 1.4% word error rate\ndegradation on the standard Librispeech dataset.\n","authors":["Pedro Palacios","Rafael Medina","Jean-Luc Rouas","Giovanni Ansaloni","David Atienza"],"pdf_url":"https://arxiv.org/pdf/2411.10285v1.pdf","comment":"7 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.10279v1","updated":"2024-11-15T15:35:56Z","published":"2024-11-15T15:35:56Z","title":"Lateral Movement Detection via Time-aware Subgraph Classification on\n Authentication Logs","summary":" Lateral movement is a crucial component of advanced persistent threat (APT)\nattacks in networks. Attackers exploit security vulnerabilities in internal\nnetworks or IoT devices, expanding their control after initial infiltration to\nsteal sensitive data or carry out other malicious activities, posing a serious\nthreat to system security. Existing research suggests that attackers generally\nemploy seemingly unrelated operations to mask their malicious intentions,\nthereby evading existing lateral movement detection methods and hiding their\nintrusion traces. In this regard, we analyze host authentication log data from\na graph perspective and propose a multi-scale lateral movement detection\nframework called LMDetect. 
The main workflow of this framework proceeds as\nfollows: 1) Construct a heterogeneous multigraph from host authentication log\ndata to strengthen the correlations among internal system entities; 2) Design a\ntime-aware subgraph generator to extract subgraphs centered on authentication\nevents from the heterogeneous authentication multigraph; 3) Design a\nmulti-scale attention encoder that leverages both local and global attention to\ncapture hidden anomalous behavior patterns in the authentication subgraphs,\nthereby achieving lateral movement detection. Extensive experiments on two\nreal-world authentication log datasets demonstrate the effectiveness and\nsuperiority of our framework in detecting lateral movement behaviors.\n","authors":["Jiajun Zhou","Jiacheng Yao","Xuanze Chen","Shanqing Yu","Qi Xuan","Xiaoniu Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10272v1","updated":"2024-11-15T15:28:42Z","published":"2024-11-15T15:28:42Z","title":"Scaling Law for Post-training after Model Pruning","summary":" Large language models (LLMs) based on the Transformer architecture are widely\nemployed across various domains and tasks. However, their increasing size\nimposes significant hardware demands, limiting practical deployment. To\nmitigate this, model pruning techniques have been developed to create more\nefficient models while maintaining high performance. Despite this,\npost-training after pruning is crucial for performance recovery and can be\nresource-intensive. This paper investigates the post-training requirements of\npruned LLMs and introduces a scaling law to determine the optimal amount of\npost-training data. 
Post-training experiments with the Llama-3 and Qwen-2.5\nseries models, pruned using depth pruning, width pruning, and 2:4\nsemi-structured pruning, show that higher pruning ratios necessitate more\npost-training data for performance recovery, whereas larger LLMs require less.\nThe proposed scaling law predicts a model's loss based on its parameter counts\nbefore and after pruning, as well as the post-training token counts.\nFurthermore, we find that the scaling law established from smaller LLMs can be\nreliably extrapolated to larger LLMs. This work provides valuable insights into\nthe post-training of pruned LLMs and offers a practical scaling law for\noptimizing post-training data usage.\n","authors":["Xiaodong Chen","Yuxuan Hu","Jing Zhang","Xiaokang Zhang","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14551v2","updated":"2024-11-15T15:16:56Z","published":"2024-02-22T13:45:01Z","title":"CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for\n Optimized Learning Fusion","summary":" State-of-the-art pre-trained image models predominantly adopt a two-stage\napproach: initial unsupervised pre-training on large-scale datasets followed by\ntask-specific fine-tuning using Cross-Entropy loss~(CE). However, it has been\ndemonstrated that CE can compromise model generalization and stability. While\nrecent works employing contrastive learning address some of these limitations\nby enhancing the quality of embeddings and producing better decision\nboundaries, they often overlook the importance of hard negative mining and rely\non resource intensive and slow training using large sample batches. To counter\nthese issues, we introduce a novel approach named CLCE, which integrates\nLabel-Aware Contrastive Learning with CE. Our approach not only maintains the\nstrengths of both loss functions but also leverages hard negative mining in a\nsynergistic way to enhance performance. 
Experimental results demonstrate that\nCLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks,\nachieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in\ntransfer learning settings with the BEiT-3 model. Importantly, our proposed\nCLCE approach effectively mitigates the dependency of contrastive learning on\nlarge batch sizes such as 4096 samples per batch, a limitation that has\npreviously constrained the application of contrastive learning in\nbudget-limited hardware environments.\n","authors":["Zijun Long","George Killick","Lipeng Zhuang","Gerardo Aragon-Camarasa","Zaiqiao Meng","Richard Mccreadie"],"pdf_url":"https://arxiv.org/pdf/2402.14551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10257v1","updated":"2024-11-15T15:04:04Z","published":"2024-11-15T15:04:04Z","title":"The Unreasonable Effectiveness of Guidance for Diffusion Models","summary":" Guidance is an error-correcting technique used to improve the perceptual\nquality of images generated by diffusion models. Typically, the correction is\nachieved by linear extrapolation, using an auxiliary diffusion model that has\nlower performance than the primary model. Using a 2D toy example, we show that\nit is highly beneficial when the auxiliary model exhibits similar errors as the\nprimary one but stronger. We verify this finding in higher dimensions, where we\nshow that competitive generative performance to state-of-the-art guidance\nmethods can be achieved when the auxiliary model differs from the primary one\nonly by having stronger weight regularization. As an independent contribution,\nwe investigate whether upweighting long-range spatial dependencies improves\nvisual fidelity. The result is a novel guidance method, which we call sliding\nwindow guidance (SWG), that guides the primary model with itself by\nconstraining its receptive field. 
Intriguingly, SWG aligns better with human\npreferences than state-of-the-art guidance methods while requiring neither\ntraining, architectural modifications, nor class conditioning. The code will be\nreleased.\n","authors":["Tim Kaiser","Nikolas Adaloglou","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2411.10257v1.pdf","comment":"Preprint. 19 pages, 14 figures in total, including references and\n appendix"},{"id":"http://arxiv.org/abs/2411.10255v1","updated":"2024-11-15T15:03:34Z","published":"2024-11-15T15:03:34Z","title":"Artificial Intelligence in Pediatric Echocardiography: Exploring\n Challenges, Opportunities, and Clinical Applications with Explainable AI and\n Federated Learning","summary":" Pediatric heart diseases present a broad spectrum of congenital and acquired\ndiseases. More complex congenital malformations require a differentiated and\nmultimodal decision-making process, usually including echocardiography as a\ncentral imaging method. Artificial intelligence (AI) offers considerable\npromise for clinicians by facilitating automated interpretation of pediatric\nechocardiography data. However, adapting AI technologies for pediatric\nechocardiography analysis has challenges such as limited public data\navailability, data privacy, and AI model transparency. Recently, researchers\nhave focused on disruptive technologies, such as federated learning (FL) and\nexplainable AI (XAI), to improve automatic diagnostic and decision support\nworkflows. This study offers a comprehensive overview of the limitations and\nopportunities of AI in pediatric echocardiography, emphasizing the synergistic\nworkflow and role of XAI and FL, identifying research gaps, and exploring\npotential future developments. 
Additionally, three relevant clinical use cases\ndemonstrate the functionality of XAI and FL with a focus on (i) view\nrecognition, (ii) disease classification, (iii) segmentation of cardiac\nstructures, and (iv) quantitative assessment of cardiac function.\n","authors":["Mohammed Yaseen Jabarulla","Theodor Uden","Thomas Jack","Philipp Beerbaum","Steffen Oeltze-Jafra"],"pdf_url":"https://arxiv.org/pdf/2411.10255v1.pdf","comment":"This article is planned for submission to Frontiers Journal"},{"id":"http://arxiv.org/abs/2403.17710v3","updated":"2024-11-15T14:57:28Z","published":"2024-03-26T13:58:00Z","title":"Optimization-based Prompt Injection Attack to LLM-as-a-Judge","summary":" LLM-as-a-Judge uses a large language model (LLM) to select the best response\nfrom a set of candidates for a given question. LLM-as-a-Judge has many\napplications such as LLM-powered search, reinforcement learning with AI\nfeedback (RLAIF), and tool selection. In this work, we propose JudgeDeceiver,\nan optimization-based prompt injection attack to LLM-as-a-Judge. JudgeDeceiver\ninjects a carefully crafted sequence into an attacker-controlled candidate\nresponse such that LLM-as-a-Judge selects the candidate response for an\nattacker-chosen question no matter what other candidate responses are.\nSpecifically, we formulate finding such sequence as an optimization problem and\npropose a gradient based method to approximately solve it. Our extensive\nevaluation shows that JudgeDeceive is highly effective, and is much more\neffective than existing prompt injection attacks that manually craft the\ninjected sequences and jailbreak attacks when extended to our problem. We also\nshow the effectiveness of JudgeDeceiver in three case studies, i.e.,\nLLM-powered search, RLAIF, and tool selection. Moreover, we consider defenses\nincluding known-answer detection, perplexity detection, and perplexity windowed\ndetection. 
Our results show these defenses are insufficient, highlighting the\nurgent need for developing new defense strategies. Our implementation is\navailable at this repository: https://github.com/ShiJiawenwen/JudgeDeceiver.\n","authors":["Jiawen Shi","Zenghui Yuan","Yinuo Liu","Yue Huang","Pan Zhou","Lichao Sun","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2403.17710v3.pdf","comment":"To appear in the Proceedings of The ACM Conference on Computer and\n Communications Security (CCS), 2024"},{"id":"http://arxiv.org/abs/2407.11802v3","updated":"2024-11-15T14:54:58Z","published":"2024-07-16T14:53:35Z","title":"DCD: Discriminative and Consistent Representation Distillation","summary":" Knowledge Distillation (KD) aims to transfer knowledge from a large teacher\nmodel to a smaller student model. While contrastive learning has shown promise\nin self-supervised learning by creating discriminative representations, its\napplication in knowledge distillation remains limited and focuses primarily on\ndiscrimination, neglecting the structural relationships captured by the teacher\nmodel. To address this limitation, we propose Discriminative and Consistent\nDistillation (DCD), which employs a contrastive loss along with a consistency\nregularization to minimize the discrepancy between the distributions of teacher\nand student representations. Our method introduces learnable temperature and\nbias parameters that adapt during training to balance these complementary\nobjectives, replacing the fixed hyperparameters commonly used in contrastive\nlearning approaches. Through extensive experiments on CIFAR-100 and ImageNet\nILSVRC-2012, we demonstrate that DCD achieves state-of-the-art performance,\nwith the student model sometimes surpassing the teacher's accuracy.\nFurthermore, we show that DCD's learned representations exhibit superior\ncross-dataset generalization when transferred to Tiny ImageNet and STL-10. 
Code\nis available at https://github.com/giakoumoglou/distillers.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2407.11802v3.pdf","comment":"11 pages, 3 figures, 6 tables. The paper's title has been changed,\n again"},{"id":"http://arxiv.org/abs/2411.10234v1","updated":"2024-11-15T14:49:58Z","published":"2024-11-15T14:49:58Z","title":"Generative AI in Multimodal User Interfaces: Trends, Challenges, and\n Cross-Platform Adaptability","summary":" As the boundaries of human computer interaction expand, Generative AI emerges\nas a key driver in reshaping user interfaces, introducing new possibilities for\npersonalized, multimodal and cross-platform interactions. This integration\nreflects a growing demand for more adaptive and intuitive user interfaces that\ncan accommodate diverse input types such as text, voice and video, and deliver\nseamless experiences across devices. This paper explores the integration of\ngenerative AI in modern user interfaces, examining historical developments and\nfocusing on multimodal interaction, cross-platform adaptability and dynamic\npersonalization. A central theme is the interface dilemma, which addresses the\nchallenge of designing effective interactions for multimodal large language\nmodels, assessing the trade-offs between graphical, voice-based and immersive\ninterfaces. The paper further evaluates lightweight frameworks tailored for\nmobile platforms, spotlighting the role of mobile hardware in enabling scalable\nmultimodal AI. Technical and ethical challenges, including context retention,\nprivacy concerns and balancing cloud and on-device processing are thoroughly\nexamined. Finally, the paper outlines future directions such as emotionally\nadaptive interfaces, predictive AI driven user interfaces and real-time\ncollaborative systems, underscoring generative AI's potential to redefine\nadaptive user-centric interfaces across platforms.\n","authors":["J. Bieniek","M. Rahouti","D. C. 
Verma"],"pdf_url":"https://arxiv.org/pdf/2411.10234v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.10232v1","updated":"2024-11-15T14:45:58Z","published":"2024-11-15T14:45:58Z","title":"ColorEdit: Training-free Image-Guided Color editing with diffusion model","summary":" Text-to-image (T2I) diffusion models, with their impressive generative\ncapabilities, have been adopted for image editing tasks, demonstrating\nremarkable efficacy. However, due to attention leakage and collision between\nthe cross-attention map of the object and the new color attribute from the text\nprompt, text-guided image editing methods may fail to change the color of an\nobject, resulting in a misalignment between the resulting image and the text\nprompt. In this paper, we conduct an in-depth analysis on the process of\ntext-guided image synthesizing and what semantic information different\ncross-attention blocks have learned. We observe that the visual representation\nof an object is determined in the up-block of the diffusion model in the early\nstage of the denoising process, and color adjustment can be achieved through\nvalue matrices alignment in the cross-attention layer. Based on our findings,\nwe propose a straightforward, yet stable, and effective image-guided method to\nmodify the color of an object without requiring any additional fine-tuning or\ntraining. Lastly, we present a benchmark dataset called COLORBENCH, the first\nbenchmark to evaluate the performance of color change methods. 
Extensive\nexperiments validate the effectiveness of our method in object-level color\nediting and surpass the performance of popular text-guided image editing\napproaches in both synthesized and real images.\n","authors":["Xingxi Yin","Zhi Li","Jingfeng Zhang","Chenglin Li","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10231v1","updated":"2024-11-15T14:43:58Z","published":"2024-11-15T14:43:58Z","title":"A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image\n Super-Resolution with Transformers and TaylorShift","summary":" Transformer-based Super-Resolution (SR) models have recently advanced image\nreconstruction quality, yet challenges remain due to computational complexity\nand an over-reliance on large patch sizes, which constrain fine-grained detail\nenhancement. In this work, we propose TaylorIR to address these limitations by\nutilizing a patch size of 1x1, enabling pixel-level processing in any\ntransformer-based SR model. 
To address the significant computational demands\nunder the traditional self-attention mechanism, we employ the TaylorShift\nattention mechanism, a memory-efficient alternative based on Taylor series\nexpansion, achieving full token-to-token interactions with linear complexity.\nExperimental results demonstrate that our approach achieves new\nstate-of-the-art SR performance while reducing memory consumption by up to 60%\ncompared to traditional self-attention-based transformers.\n","authors":["Sanath Budakegowdanadoddi Nagaraju","Brian Bernhard Moser","Tobias Christian Nauen","Stanislav Frolov","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.10231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10224v1","updated":"2024-11-15T14:38:13Z","published":"2024-11-15T14:38:13Z","title":"MCL: Multi-view Enhanced Contrastive Learning for Chest X-ray Report\n Generation","summary":" Radiology reports are crucial for planning treatment strategies and enhancing\ndoctor-patient communication, yet manually writing these reports is burdensome\nfor radiologists. While automatic report generation offers a solution, existing\nmethods often rely on single-view radiographs, limiting diagnostic accuracy. To\naddress this problem, we propose MCL, a Multi-view enhanced Contrastive\nLearning method for chest X-ray report generation. Specifically, we first\nintroduce multi-view enhanced contrastive learning for visual representation by\nmaximizing agreements between multi-view radiographs and their corresponding\nreport. Subsequently, to fully exploit patient-specific indications (e.g.,\npatient's symptoms) for report generation, we add a transitional ``bridge\" for\nmissing indications to reduce embedding space discrepancies caused by their\npresence or absence. Additionally, we construct Multi-view CXR and Two-view CXR\ndatasets from public sources to support research on multi-view report\ngeneration. 
Our proposed MCL surpasses recent state-of-the-art methods across\nmultiple datasets, achieving a 5.0% F1 RadGraph improvement on MIMIC-CXR, a\n7.3% BLEU-1 improvement on MIMIC-ABN, a 3.1% BLEU-4 improvement on Multi-view\nCXR, and an 8.2% F1 CheXbert improvement on Two-view CXR.\n","authors":["Kang Liu","Zhuoqi Ma","Kun Xie","Zhicheng Jiao","Qiguang Miao"],"pdf_url":"https://arxiv.org/pdf/2411.10224v1.pdf","comment":"https://github.com/mk-runner/MCL"},{"id":"http://arxiv.org/abs/2308.16703v2","updated":"2024-11-15T14:20:32Z","published":"2023-08-31T13:09:33Z","title":"Fault Injection and Safe-Error Attack for Extraction of Embedded Neural\n Network Models","summary":" Model extraction emerges as a critical security threat with attack vectors\nexploiting both algorithmic and implementation-based approaches. The main goal\nof an attacker is to steal as much information as possible about a protected\nvictim model, so that he can mimic it with a substitute model, even with a\nlimited access to similar training data. Recently, physical attacks such as\nfault injection have shown worrying efficiency against the integrity and\nconfidentiality of embedded models. We focus on embedded deep neural network\nmodels on 32-bit microcontrollers, a widespread family of hardware platforms in\nIoT, and the use of a standard fault injection strategy - Safe Error Attack\n(SEA) - to perform a model extraction attack with an adversary having a limited\naccess to training data. Since the attack strongly depends on the input\nqueries, we propose a black-box approach to craft a successful attack set. For\na classical convolutional neural network, we successfully recover at least 90%\nof the most significant bits with about 1500 crafted inputs. 
These information\nenable to efficiently train a substitute model, with only 8% of the training\ndataset, that reaches high fidelity and near identical accuracy level than the\nvictim model.\n","authors":["Kevin Hector","Pierre-Alain Moellic","Mathieu Dumont","Jean-Max Dutertre"],"pdf_url":"https://arxiv.org/pdf/2308.16703v2.pdf","comment":"Accepted at SECAI Workshop, ESORICS 2023 (v2. Fix notations)"},{"id":"http://arxiv.org/abs/2411.10213v1","updated":"2024-11-15T14:19:15Z","published":"2024-11-15T14:19:15Z","title":"An Empirical Study on LLM-based Agents for Automated Bug Fixing","summary":" Large language models (LLMs) and LLM-based Agents have been applied to fix\nbugs automatically, demonstrating the capability in addressing software defects\nby engaging in development environment interaction, iterative validation and\ncode modification. However, systematic analysis of these agent and non-agent\nsystems remain limited, particularly regarding performance variations among\ntop-performing ones. In this paper, we examine seven proprietary and\nopen-source systems on the SWE-bench Lite benchmark for automated bug fixing.\nWe first assess each system's overall performance, noting instances solvable by\nall or none of these sytems, and explore why some instances are uniquely solved\nby specific system types. We also compare fault localization accuracy at file\nand line levels and evaluate bug reproduction capabilities, identifying\ninstances solvable only through dynamic reproduction. 
Through analysis, we\nconcluded that further optimization is needed in both the LLM itself and the\ndesign of Agentic flow to improve the effectiveness of the Agent in bug fixing.\n","authors":["Xiangxin Meng","Zexiong Ma","Pengfei Gao","Chao Peng"],"pdf_url":"https://arxiv.org/pdf/2411.10213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06607v3","updated":"2024-11-15T14:09:05Z","published":"2024-09-10T16:00:22Z","title":"An Ontology-based Approach Towards Traceable Behavior Specifications in\n Automated Driving","summary":" Vehicles in public traffic that are equipped with Automated Driving Systems\nare subject to a number of expectations: Among other aspects, their behavior\nshould be safe, conforming to the rules of the road and provide mobility to\ntheir users. This poses challenges for the developers of such systems:\nDevelopers are responsible for specifying this behavior, for example, in terms\nof requirements at system design time. As we will discuss in the article, this\nspecification always involves the need for assumptions and trade-offs. As a\nresult, insufficiencies in such a behavior specification can occur that can\npotentially lead to unsafe system behavior. In order to support the\nidentification of specification insufficiencies, requirements and respective\nassumptions need to be made explicit. In this article, we propose the Semantic\nNorm Behavior Analysis as an ontology-based approach to specify the behavior\nfor an Automated Driving System equipped vehicle. We use ontologies to formally\nrepresent specified behavior for a targeted operational environment, and to\nestablish traceability between specified behavior and the addressed stakeholder\nneeds. Furthermore, we illustrate the application of the Semantic Norm Behavior\nAnalysis in a German legal context with two example scenarios and evaluate our\nresults. 
Our evaluation shows that the explicit documentation of assumptions in\nthe behavior specification supports both the identification of specification\ninsufficiencies and their treatment. Therefore, this article provides\nrequirements, terminology and an according methodology to facilitate\nontology-based behavior specifications in automated driving.\n","authors":["Nayel Fabian Salem","Marcus Nolte","Veronica Haber","Till Menzel","Hans Steege","Robert Graubohm","Markus Maurer"],"pdf_url":"https://arxiv.org/pdf/2409.06607v3.pdf","comment":"24 pages, 12 figures, submitted for publication"},{"id":"http://arxiv.org/abs/2411.10197v1","updated":"2024-11-15T13:53:05Z","published":"2024-11-15T13:53:05Z","title":"A logic for reasoning with inconsistent knowledge -- A reformulation\n using nowadays terminology (2024)","summary":" In many situations humans have to reason with inconsistent knowledge. These\ninconsistencies may occur due to not fully reliable sources of information. In\norder to reason with inconsistent knowledge, it is not possible to view a set\nof premisses as absolute truths as is done in predicate logic. Viewing the set\nof premisses as a set of assumptions, however, it is possible to deduce useful\nconclusions from an inconsistent set of premisses. In this paper a logic for\nreasoning with inconsistent knowledge is described. This logic is a\ngeneralization of the work of N. Rescher [15]. In the logic a reliability\nrelation is used to choose between incompatible assumptions. These choices are\nonly made when a contradiction is derived. As long as no contradiction is\nderived, the knowledge is assumed to be consistent. This makes it possible to\ndefine an argumentation-based deduction process for the logic. For the logic a\nsemantics based on the ideas of Y. Shoham [22, 23], is defined. It turns out\nthat the semantics for the logic is a preferential semantics according to the\ndefinition S. Kraus, D. Lehmann and M. Magidor [12]. 
Therefore the logic is a\nlogic of system P and possesses all the properties of an ideal non-monotonic\nlogic.\n","authors":["Nico Roos"],"pdf_url":"https://arxiv.org/pdf/2411.10197v1.pdf","comment":"The original version was published in the Artificial Intelligence\n journal. This original version uses 'justifications' in the proof system,\n which we would call nowadays 'arguments'. The current version presents the\n same results but now using the terminology of an assumption-based\n argumentation system"},{"id":"http://arxiv.org/abs/2411.10191v1","updated":"2024-11-15T13:44:37Z","published":"2024-11-15T13:44:37Z","title":"FengWu-W2S: A deep learning model for seamless weather-to-subseasonal\n forecast of global atmosphere","summary":" Seamless forecasting that produces warning information at continuum\ntimescales based on only one system is a long-standing pursuit for\nweather-climate service. While the rapid advancement of deep learning has\ninduced revolutionary changes in classical forecasting field, current efforts\nare still focused on building separate AI models for weather and climate\nforecasts. To explore the seamless forecasting ability based on one AI model,\nwe propose FengWu-Weather to Subseasonal (FengWu-W2S), which builds on the\nFengWu global weather forecast model and incorporates an ocean-atmosphere-land\ncoupling structure along with a diverse perturbation strategy. FengWu-W2S can\ngenerate 6-hourly atmosphere forecasts extending up to 42 days through an\nautoregressive and seamless manner. Our hindcast results demonstrate that\nFengWu-W2S reliably predicts atmospheric conditions out to 3-6 weeks ahead,\nenhancing predictive capabilities for global surface air temperature,\nprecipitation, geopotential height and intraseasonal signals such as the\nMadden-Julian Oscillation (MJO) and North Atlantic Oscillation (NAO). 
Moreover,\nour ablation experiments on forecast error growth from daily to seasonal\ntimescales reveal potential pathways for developing AI-based integrated system\nfor seamless weather-climate forecasting in the future.\n","authors":["Fenghua Ling","Kang Chen","Jiye Wu","Tao Han","Jing-Jia Luo","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2411.10191v1.pdf","comment":"23 pages,8 figures"},{"id":"http://arxiv.org/abs/2411.10184v1","updated":"2024-11-15T13:33:10Z","published":"2024-11-15T13:33:10Z","title":"Agentic LLMs in the Supply Chain: Towards Autonomous Multi-Agent\n Consensus-Seeking","summary":" This paper explores how Large Language Models (LLMs) can automate\nconsensus-seeking in supply chain management (SCM), where frequent decisions on\nproblems such as inventory levels and delivery times require coordination among\ncompanies. Traditional SCM relies on human consensus in decision-making to\navoid emergent problems like the bullwhip effect. Some routine consensus\nprocesses, especially those that are time-intensive and costly, can be\nautomated. Existing solutions for automated coordination have faced challenges\ndue to high entry barriers locking out SMEs, limited capabilities, and limited\nadaptability in complex scenarios. However, recent advances in Generative AI,\nparticularly LLMs, show promise in overcoming these barriers. LLMs, trained on\nvast datasets can negotiate, reason, and plan, facilitating near-human-level\nconsensus at scale with minimal entry barriers. In this work, we identify key\nlimitations in existing approaches and propose autonomous LLM agents to address\nthese gaps. We introduce a series of novel, supply chain-specific\nconsensus-seeking frameworks tailored for LLM agents and validate the\neffectiveness of our approach through a case study in inventory management. 
To\naccelerate progress within the SCM community, we open-source our code,\nproviding a foundation for further advancements in LLM-powered autonomous\nsupply chain solutions.\n","authors":["Valeria Jannelli","Stefan Schoepf","Matthias Bickel","Torbjørn Netland","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2411.10184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10176v1","updated":"2024-11-15T13:22:04Z","published":"2024-11-15T13:22:04Z","title":"Let people fail! Exploring the influence of explainable virtual and\n robotic agents in learning-by-doing tasks","summary":" Collaborative decision-making with artificial intelligence (AI) agents\npresents opportunities and challenges. While human-AI performance often\nsurpasses that of individuals, the impact of such technology on human behavior\nremains insufficiently understood, primarily when AI agents can provide\njustifiable explanations for their suggestions. This study compares the effects\nof classic vs. partner-aware explanations on human behavior and performance\nduring a learning-by-doing task. Three participant groups were involved: one\ninteracting with a computer, another with a humanoid robot, and a third one\nwithout assistance. Results indicated that partner-aware explanations\ninfluenced participants differently based on the type of artificial agents\ninvolved. With the computer, participants enhanced their task completion times.\nAt the same time, those interacting with the humanoid robot were more inclined\nto follow its suggestions, although they did not reduce their timing.\nInterestingly, participants autonomously performing the learning-by-doing task\ndemonstrated superior knowledge acquisition than those assisted by explainable\nAI (XAI). These findings raise profound questions and have significant\nimplications for automated tutoring and human-AI collaboration.\n","authors":["Marco Matarese","Francesco Rea","Katharina J. 
Rohlfing","Alessandra Sciutti"],"pdf_url":"https://arxiv.org/pdf/2411.10176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10175v1","updated":"2024-11-15T13:21:26Z","published":"2024-11-15T13:21:26Z","title":"The Surprising Ineffectiveness of Pre-Trained Visual Representations for\n Model-Based Reinforcement Learning","summary":" Visual Reinforcement Learning (RL) methods often require extensive amounts of\ndata. As opposed to model-free RL, model-based RL (MBRL) offers a potential\nsolution with efficient data utilization through planning. Additionally, RL\nlacks generalization capabilities for real-world tasks. Prior work has shown\nthat incorporating pre-trained visual representations (PVRs) enhances sample\nefficiency and generalization. While PVRs have been extensively studied in the\ncontext of model-free RL, their potential in MBRL remains largely unexplored.\nIn this paper, we benchmark a set of PVRs on challenging control tasks in a\nmodel-based RL setting. We investigate the data efficiency, generalization\ncapabilities, and the impact of different properties of PVRs on the performance\nof model-based agents. Our results, perhaps surprisingly, reveal that for MBRL\ncurrent PVRs are not more sample efficient than learning representations from\nscratch, and that they do not generalize better to out-of-distribution (OOD)\nsettings. To explain this, we analyze the quality of the trained dynamics\nmodel. Furthermore, we show that data diversity and network architecture are\nthe most important contributors to OOD generalization performance.\n","authors":["Moritz Schneider","Robert Krug","Narunas Vaskevicius","Luigi Palmieri","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2411.10175v1.pdf","comment":"Published at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024). 
Project page: https://schneimo.com/pvr4mbrl/"},{"id":"http://arxiv.org/abs/2411.10174v1","updated":"2024-11-15T13:19:59Z","published":"2024-11-15T13:19:59Z","title":"A Hard-Label Cryptanalytic Extraction of Non-Fully Connected Deep Neural\n Networks using Side-Channel Attacks","summary":" During the past decade, Deep Neural Networks (DNNs) proved their value on a\nlarge variety of subjects. However despite their high value and public\naccessibility, the protection of the intellectual property of DNNs is still an\nissue and an emerging research field. Recent works have successfully extracted\nfully-connected DNNs using cryptanalytic methods in hard-label settings,\nproving that it was possible to copy a DNN with high fidelity, i.e., high\nsimilitude in the output predictions. However, the current cryptanalytic\nattacks cannot target complex, i.e., not fully connected, DNNs and are limited\nto special cases of neurons present in deep networks.\n In this work, we introduce a new end-to-end attack framework designed for\nmodel extraction of embedded DNNs with high fidelity. We describe a new\nblack-box side-channel attack which splits the DNN in several linear parts for\nwhich we can perform cryptanalytic extraction and retrieve the weights in\nhard-label settings. With this method, we are able to adapt cryptanalytic\nextraction, for the first time, to non-fully connected DNNs, while maintaining\na high fidelity. We validate our contributions by targeting several\narchitectures implemented on a microcontroller unit, including a Multi-Layer\nPerceptron (MLP) of 1.7 million parameters and a shortened MobileNetv1. Our\nframework successfully extracts all of these DNNs with high fidelity (88.4% for\nthe MobileNetv1 and 93.2% for the MLP). 
Furthermore, we use the stolen model to\ngenerate adversarial examples and achieve close to white-box performance on the\nvictim's model (95.8% and 96.7% transfer rate).\n","authors":["Benoit Coqueret","Mathieu Carbone","Olivier Sentieys","Gabriel Zaid"],"pdf_url":"https://arxiv.org/pdf/2411.10174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10173v1","updated":"2024-11-15T13:19:27Z","published":"2024-11-15T13:19:27Z","title":"Semantics and Spatiality of Emergent Communication","summary":" When artificial agents are jointly trained to perform collaborative tasks\nusing a communication channel, they develop opaque goal-oriented communication\nprotocols. Good task performance is often considered sufficient evidence that\nmeaningful communication is taking place, but existing empirical results show\nthat communication strategies induced by common objectives can be\ncounterintuitive whilst solving the task nearly perfectly. In this work, we\nidentify a goal-agnostic prerequisite to meaningful communication, which we\nterm semantic consistency, based on the idea that messages should have similar\nmeanings across instances. We provide a formal definition for this idea, and\nuse it to compare the two most common objectives in the field of emergent\ncommunication: discrimination and reconstruction. We prove, under mild\nassumptions, that semantically inconsistent communication protocols can be\noptimal solutions to the discrimination task, but not to reconstruction. 
We\nfurther show that the reconstruction objective encourages a stricter property,\nspatial meaningfulness, which also accounts for the distance between messages.\nExperiments with emergent communication games validate our theoretical results.\nThese findings demonstrate an inherent advantage of distance-based\ncommunication goals, and contextualize previous empirical discoveries.\n","authors":["Rotem Ben Zion","Boaz Carmeli","Orr Paradise","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2411.10173v1.pdf","comment":"34 pages, to be published in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10172v1","updated":"2024-11-15T13:18:18Z","published":"2024-11-15T13:18:18Z","title":"Increasing the Accessibility of Causal Domain Knowledge via Causal\n Information Extraction Methods: A Case Study in the Semiconductor\n Manufacturing Industry","summary":" The extraction of causal information from textual data is crucial in the\nindustry for identifying and mitigating potential failures, enhancing process\nefficiency, prompting quality improvements, and addressing various operational\nchallenges. This paper presents a study on the development of automated methods\nfor causal information extraction from actual industrial documents in the\nsemiconductor manufacturing industry. The study proposes two types of causal\ninformation extraction methods, single-stage sequence tagging (SST) and\nmulti-stage sequence tagging (MST), and evaluates their performance using\nexisting documents from a semiconductor manufacturing company, including\npresentation slides and FMEA (Failure Mode and Effects Analysis) documents. The\nstudy also investigates the effect of representation learning on downstream\ntasks. The presented case study showcases that the proposed MST methods for\nextracting causal information from industrial documents are suitable for\npractical applications, especially for semi structured documents such as FMEAs,\nwith a 93\\% F1 score. 
Additionally, MST achieves a 73\\% F1 score on texts\nextracted from presentation slides. Finally, the study highlights the\nimportance of choosing a language model that is more aligned with the domain\nand in-domain fine-tuning.\n","authors":["Houssam Razouk","Leonie Benischke","Daniel Garber","Roman Kern"],"pdf_url":"https://arxiv.org/pdf/2411.10172v1.pdf","comment":"17 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.10171v1","updated":"2024-11-15T13:17:54Z","published":"2024-11-15T13:17:54Z","title":"Imagine-2-Drive: High-Fidelity World Modeling in CARLA for Autonomous\n Vehicles","summary":" In autonomous driving with image based state space, accurate prediction of\nfuture events and modeling diverse behavioral modes are essential for safety\nand effective decision-making. World model-based Reinforcement Learning (WMRL)\napproaches offers a promising solution by simulating future states from current\nstate and actions. However, utility of world models is often limited by typical\nRL policies being limited to deterministic or single gaussian distribution. By\nfailing to capture the full spectrum of possible actions, reduces their\nadaptability in complex, dynamic environments. In this work, we introduce\nImagine-2-Drive, a framework that consists of two components, VISTAPlan, a\nhigh-fidelity world model for accurate future prediction and Diffusion Policy\nActor (DPA), a diffusion based policy to model multi-modal behaviors for\ntrajectory prediction. We use VISTAPlan to simulate and evaluate trajectories\nfrom DPA and use Denoising Diffusion Policy Optimization (DDPO) to train DPA to\nmaximize the cumulative sum of rewards over the trajectories. We analyze the\nbenefits of each component and the framework as a whole in CARLA with standard\ndriving metrics. 
As a consequence of our twin novelties- VISTAPlan and DPA, we\nsignificantly outperform the state of the art (SOTA) world models on standard\ndriving metrics by 15% and 20% on Route Completion and Success Rate\nrespectively.\n","authors":["Anant Garg","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.10171v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2411.10168v1","updated":"2024-11-15T13:16:11Z","published":"2024-11-15T13:16:11Z","title":"Evaluating the role of `Constitutions' for learning from AI feedback","summary":" The growing capabilities of large language models (LLMs) have led to their\nuse as substitutes for human feedback for training and assessing other LLMs.\nThese methods often rely on `constitutions', written guidelines which a critic\nmodel uses to provide feedback and improve generations. We investigate how the\nchoice of constitution affects feedback quality by using four different\nconstitutions to improve patient-centered communication in medical interviews.\nIn pairwise comparisons conducted by 215 human raters, we found that detailed\nconstitutions led to better results regarding emotive qualities. However, none\nof the constitutions outperformed the baseline in learning more\npractically-oriented skills related to information gathering and provision. Our\nfindings indicate that while detailed constitutions should be prioritised,\nthere are possible limitations to the effectiveness of AI feedback as a reward\nsignal in certain areas.\n","authors":["Saskia Redgate","Andrew M. Bean","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.10168v1.pdf","comment":"4 pages, 2 figures. 
In NeurIPS 2024 Workshop on Language Gamification"},{"id":"http://arxiv.org/abs/2411.10156v1","updated":"2024-11-15T12:59:46Z","published":"2024-11-15T12:59:46Z","title":"Mitigating Sycophancy in Decoder-Only Transformer Architectures:\n Synthetic Data Intervention","summary":" To address the sycophancy problem caused by reinforcement learning from human\nfeedback in large language models, this research applies synthetic data\nintervention technology to the decoder-only transformer architecture. Based on\nthe research gaps in the existing literature, the researcher designed an\nexperimental process to reduce the tendency of models to cater by generating\ndiversified data, and used GPT4o as an experimental tool for verification. The\nexperiment used 100 true and false questions, and compared the performance of\nthe model trained with synthetic data intervention and the original untrained\nmodel on multiple indicators. The results show that the SDI training model\nsupports the technology in terms of accuracy rate and sycophancy rate and has\nsignificant effectiveness in reducing sycophancy phenomena. Notably, the data\nset, experimental process, code and data results have been uploaded to Github,\nthe link is https://github.com/brucewang123456789/GeniusTrail.git.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.10156v1.pdf","comment":"This research is also submitted to OpenReview. The main text is 9\n pages (excluding citations), 7 figures, and 1 table"},{"id":"http://arxiv.org/abs/2411.10152v1","updated":"2024-11-15T12:50:57Z","published":"2024-11-15T12:50:57Z","title":"Causal Time-Series Synchronization for Multi-Dimensional Forecasting","summary":" The process industry's high expectations for Digital Twins require modeling\napproaches that can generalize across tasks and diverse domains with\npotentially different data dimensions and distributional shifts i.e.,\nFoundational Models. 
Despite success in natural language processing and\ncomputer vision, transfer learning with (self-) supervised signals for\npre-training general-purpose models is largely unexplored in the context of\nDigital Twins in the process industry due to challenges posed by\nmulti-dimensional time-series data, lagged cause-effect dependencies, complex\ncausal structures, and varying number of (exogenous) variables. We propose a\nnovel channel-dependent pre-training strategy that leverages synchronized\ncause-effect pairs to overcome these challenges by breaking down the\nmulti-dimensional time-series data into pairs of cause-effect variables. Our\napproach focuses on: (i) identifying highly lagged causal relationships using\ndata-driven methods, (ii) synchronizing cause-effect pairs to generate training\nsamples for channel-dependent pre-training, and (iii) evaluating the\neffectiveness of this approach in channel-dependent forecasting. Our\nexperimental results demonstrate significant improvements in forecasting\naccuracy and generalization capability compared to traditional training\nmethods.\n","authors":["Michael Mayr","Georgios C. Chasparis","Josef Küng"],"pdf_url":"https://arxiv.org/pdf/2411.10152v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2410.14979v5","updated":"2024-11-15T12:46:30Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). 
Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10137v1","updated":"2024-11-15T12:23:12Z","published":"2024-11-15T12:23:12Z","title":"Legal Evalutions and Challenges of Large Language Models","summary":" In this paper, we review legal testing methods based on Large Language Models\n(LLMs), using the OPENAI o1 model as a case study to evaluate the performance\nof large models in applying legal provisions. We compare current\nstate-of-the-art LLMs, including open-source, closed-source, and legal-specific\nmodels trained specifically for the legal domain. Systematic tests are\nconducted on English and Chinese legal cases, and the results are analyzed in\ndepth. Through systematic testing of legal cases from common law systems and\nChina, this paper explores the strengths and weaknesses of LLMs in\nunderstanding and applying legal texts, reasoning through legal issues, and\npredicting judgments. 
The experimental results highlight both the potential and\nlimitations of LLMs in legal applications, particularly in terms of challenges\nrelated to the interpretation of legal language and the accuracy of legal\nreasoning. Finally, the paper provides a comprehensive analysis of the\nadvantages and disadvantages of various types of models, offering valuable\ninsights and references for the future application of AI in the legal field.\n","authors":["Jiaqi Wang","Huan Zhao","Zhenyuan Yang","Peng Shu","Junhao Chen","Haobo Sun","Ruixi Liang","Shixin Li","Pengcheng Shi","Longjun Ma","Zongjia Liu","Zhengliang Liu","Tianyang Zhong","Yutong Zhang","Chong Ma","Xin Zhang","Tuo Zhang","Tianli Ding","Yudan Ren","Tianming Liu","Xi Jiang","Shu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15527v2","updated":"2024-11-15T12:05:27Z","published":"2024-07-22T10:32:48Z","title":"Interpretable Concept-Based Memory Reasoning","summary":" The lack of transparency in the decision-making processes of deep learning\nsystems presents a significant challenge in modern artificial intelligence\n(AI), as it impairs users' ability to rely on and verify these systems. To\naddress this challenge, Concept Bottleneck Models (CBMs) have made significant\nprogress by incorporating human-interpretable concepts into deep learning\narchitectures. This approach allows predictions to be traced back to specific\nconcept patterns that users can understand and potentially intervene on.\nHowever, existing CBMs' task predictors are not fully interpretable, preventing\na thorough analysis and any form of formal verification of their\ndecision-making process prior to deployment, thereby raising significant\nreliability concerns. To bridge this gap, we introduce Concept-based Memory\nReasoner (CMR), a novel CBM designed to provide a human-understandable and\nprovably-verifiable task prediction process. 
Our approach is to model each task\nprediction as a neural selection mechanism over a memory of learnable logic\nrules, followed by a symbolic evaluation of the selected rule. The presence of\nan explicit memory and the symbolic evaluation allow domain experts to inspect\nand formally verify the validity of certain global properties of interest for\nthe task prediction process. Experimental results demonstrate that CMR achieves\nbetter accuracy-interpretability trade-offs to state-of-the-art CBMs, discovers\nlogic rules consistent with ground truths, allows for rule interventions, and\nallows pre-deployment verification.\n","authors":["David Debot","Pietro Barbiero","Francesco Giannini","Gabriele Ciravegna","Michelangelo Diligenti","Giuseppe Marra"],"pdf_url":"https://arxiv.org/pdf/2407.15527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22591v2","updated":"2024-11-15T12:02:15Z","published":"2024-10-29T23:10:01Z","title":"FGCE: Feasible Group Counterfactual Explanations for Auditing Fairness","summary":" This paper introduces the first graph-based framework for generating group\ncounterfactual explanations to audit model fairness, a crucial aspect of\ntrustworthy machine learning. Counterfactual explanations are instrumental in\nunderstanding and mitigating unfairness by revealing how inputs should change\nto achieve a desired outcome. Our framework, named Feasible Group\nCounterfactual Explanations (FGCEs), captures real-world feasibility\nconstraints and constructs subgroups with similar counterfactuals, setting it\napart from existing methods. It also addresses key trade-offs in counterfactual\ngeneration, including the balance between the number of counterfactuals, their\nassociated costs, and the breadth of coverage achieved. To evaluate these\ntrade-offs and assess fairness, we propose measures tailored to group\ncounterfactual generation. 
Our experimental results on benchmark datasets\ndemonstrate the effectiveness of our approach in managing feasibility\nconstraints and trade-offs, as well as the potential of our proposed metrics in\nidentifying and quantifying fairness issues.\n","authors":["Christos Fragkathoulas","Vasiliki Papanikou","Evaggelia Pitoura","Evimaria Terzi"],"pdf_url":"https://arxiv.org/pdf/2410.22591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03864v2","updated":"2024-11-15T11:51:10Z","published":"2024-07-04T11:53:51Z","title":"Adversarial Robustness of VAEs across Intersectional Subgroups","summary":" Despite advancements in Autoencoders (AEs) for tasks like dimensionality\nreduction, representation learning and data generation, they remain vulnerable\nto adversarial attacks. Variational Autoencoders (VAEs), with their\nprobabilistic approach to disentangling latent spaces, show stronger resistance\nto such perturbations compared to deterministic AEs; however, their resilience\nagainst adversarial inputs is still a concern. This study evaluates the\nrobustness of VAEs against non-targeted adversarial attacks by optimizing\nminimal sample-specific perturbations to cause maximal damage across diverse\ndemographic subgroups (combinations of age and gender). We investigate two\nquestions: whether there are robustness disparities among subgroups, and what\nfactors contribute to these disparities, such as data scarcity and\nrepresentation entanglement. Our findings reveal that robustness disparities\nexist but are not always correlated with the size of the subgroup. 
By using\ndownstream gender and age classifiers and examining latent embeddings, we\nhighlight the vulnerability of subgroups like older women, who are prone to\nmisclassification due to adversarial perturbations pushing their\nrepresentations toward those of other subgroups.\n","authors":["Chethan Krishnamurthy Ramanaik","Arjun Roy","Eirini Ntoutsi"],"pdf_url":"https://arxiv.org/pdf/2407.03864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10115v1","updated":"2024-11-15T11:29:31Z","published":"2024-11-15T11:29:31Z","title":"Memorization in Attention-only Transformers","summary":" Recent research has explored the memorization capacity of multi-head\nattention, but these findings are constrained by unrealistic limitations on the\ncontext size. We present a novel proof for language-based Transformers that\nextends the current hypothesis to any context size. Our approach improves upon\nthe state-of-the-art by achieving more effective exact memorization with an\nattention layer, while also introducing the concept of approximate memorization\nof distributions. Through experimental validation, we demonstrate that our\nproposed bounds more accurately reflect the true memorization capacity of\nlanguage models, and provide a precise comparison with prior work.\n","authors":["Léo Dana","Muni Sreenivas Pydi","Yann Chevaleyre"],"pdf_url":"https://arxiv.org/pdf/2411.10115v1.pdf","comment":"16 pages, 6 figures, submitted to AISTATS 2025,"},{"id":"http://arxiv.org/abs/2411.10109v1","updated":"2024-11-15T11:14:34Z","published":"2024-11-15T11:14:34Z","title":"Generative Agent Simulations of 1,000 People","summary":" The promise of human behavioral simulation--general-purpose computational\nagents that replicate human behavior across domains--could enable broad\napplications in policymaking and social science. 
We present a novel agent\narchitecture that simulates the attitudes and behaviors of 1,052 real\nindividuals--applying large language models to qualitative interviews about\ntheir lives, then measuring how well these agents replicate the attitudes and\nbehaviors of the individuals that they represent. The generative agents\nreplicate participants' responses on the General Social Survey 85% as\naccurately as participants replicate their own answers two weeks later, and\nperform comparably in predicting personality traits and outcomes in\nexperimental replications. Our architecture reduces accuracy biases across\nracial and ideological groups compared to agents given demographic\ndescriptions. This work provides a foundation for new tools that can help\ninvestigate individual and collective behavior.\n","authors":["Joon Sung Park","Carolyn Q. Zou","Aaron Shaw","Benjamin Mako Hill","Carrie Cai","Meredith Ringel Morris","Robb Willer","Percy Liang","Michael S. Bernstein"],"pdf_url":"https://arxiv.org/pdf/2411.10109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10108v1","updated":"2024-11-15T11:09:34Z","published":"2024-11-15T11:09:34Z","title":"Identifying Key Drivers of Heatwaves: A Novel Spatio-Temporal Framework\n for Extreme Event Detection","summary":" Heatwaves (HWs) are extreme atmospheric events that produce significant\nsocietal and environmental impacts. Predicting these extreme events remains\nchallenging, as their complex interactions with large-scale atmospheric and\nclimatic variables are difficult to capture with traditional statistical and\ndynamical models. This work presents a general method for driver identification\nin extreme climate events. A novel framework (STCO-FS) is proposed to identify\nkey immediate (short-term) HW drivers by combining clustering algorithms with\nan ensemble evolutionary algorithm. 
The framework analyzes spatio-temporal\ndata, reduces dimensionality by grouping similar geographical nodes for each\nvariable, and develops driver selection in spatial and temporal domains,\nidentifying the best time lags between predictive variables and HW occurrences.\nThe proposed method has been applied to analyze HWs in the Adda river basin in\nItaly. The approach effectively identifies significant variables influencing\nHWs in this region. This research can potentially enhance our understanding of\nHW drivers and predictability.\n","authors":["J. Pérez-Aracil","C. Peláez-Rodríguez","Ronan McAdam","Antonello Squintu","Cosmin M. Marina","Eugenio Lorente-Ramos","Niklas Luther","Veronica Torralba","Enrico Scoccimarro","Leone Cavicchia","Matteo Giuliani","Eduardo Zorita","Felicitas Hansen","David Barriopedro","Ricardo Garcia-Herrera","Pedro A. Gutiérrez","Jürg Luterbacher","Elena Xoplaki","Andrea Castelletti","S. Salcedo-Sanz"],"pdf_url":"https://arxiv.org/pdf/2411.10108v1.pdf","comment":"28 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.10100v1","updated":"2024-11-15T10:50:36Z","published":"2024-11-15T10:50:36Z","title":"Multi-Task Adversarial Variational Autoencoder for Estimating Biological\n Brain Age with Multimodal Neuroimaging","summary":" Despite advances in deep learning for estimating brain age from structural\nMRI data, incorporating functional MRI data is challenging due to its complex\nstructure and the noisy nature of functional connectivity measurements. To\naddress this, we present the Multitask Adversarial Variational Autoencoder, a\ncustom deep learning framework designed to improve brain age predictions\nthrough multimodal MRI data integration. This model separates latent variables\ninto generic and unique codes, isolating shared and modality-specific features.\nBy integrating multitask learning with sex classification as an additional\ntask, the model captures sex-specific aging patterns. 
Evaluated on the OpenBHB\ndataset, a large multisite brain MRI collection, the model achieves a mean\nabsolute error of 2.77 years, outperforming traditional methods. This success\npositions M-AVAE as a powerful tool for metaverse-based healthcare applications\nin brain age estimation.\n","authors":["Muhammad Usman","Azka Rehman","Abdullah Shahid","Abd Ur Rehman","Sung-Min Gho","Aleum Lee","Tariq M. Khan","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2411.10100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09510v2","updated":"2024-11-15T10:47:37Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10091v1","updated":"2024-11-15T10:34:59Z","published":"2024-11-15T10:34:59Z","title":"AI and the Future of Work in Africa White Paper","summary":" This white paper is the output of a multidisciplinary workshop in Nairobi\n(Nov 2023). Led by a cross-organisational team including Microsoft Research,\nNEPAD, Lelapa AI, and University of Oxford. 
The workshop brought together\ndiverse thought-leaders from various sectors and backgrounds to discuss the\nimplications of Generative AI for the future of work in Africa. Discussions\ncentred around four key themes: Macroeconomic Impacts; Jobs, Skills and Labour\nMarkets; Workers' Perspectives and Africa-Centris AI Platforms. The white paper\nprovides an overview of the current state and trends of generative AI and its\napplications in different domains, as well as the challenges and risks\nassociated with its adoption and regulation. It represents a diverse set of\nperspectives to create a set of insights and recommendations which aim to\nencourage debate and collaborative action towards creating a dignified future\nof work for everyone across Africa.\n","authors":["Jacki O'Neill","Vukosi Marivate","Barbara Glover","Winnie Karanu","Girmaw Abebe Tadesse","Akua Gyekye","Anne Makena","Wesley Rosslyn-Smith","Matthew Grollnek","Charity Wayua","Rehema Baguma","Angel Maduke","Sarah Spencer","Daniel Kandie","Dennis Ndege Maari","Natasha Mutangana","Maxamed Axmed","Nyambura Kamau","Muhammad Adamu","Frank Swaniker","Brian Gatuguti","Jonathan Donner","Mark Graham","Janet Mumo","Caroline Mbindyo","Charlette N'Guessan","Irene Githinji","Lesego Makhafola","Sean Kruger","Olivia Etyang","Mulang Onando","Joe Sevilla","Nanjira Sambuli","Martin Mbaya","Paul Breloff","Gideon M. Anapey","Tebogo L. Mogaleemang","Tiyani Nghonyama","Muthoni Wanyoike","Bhekani Mbuli","Lawrence Nderu","Wambui Nyabero","Uzma Alam","Kayode Olaleye","Caroline Njenga","Abigail Sellen","David Kairo","Rutendo Chabikwa","Najeeb G. Abdulhamid","Ketry Kubasu","Chinasa T. 
Okolo","Eugenia Akpo","Joel Budu","Issa Karambal","Joseph Berkoh","William Wasswa","Muchai Njagwi","Rob Burnet","Loise Ochanda","Hanlie de Bod","Elizabeth Ankrah","Selemani Kinyunyu","Mutembei Kariuki","Angel Maduke","Kizito Kiyimba","Farida Eleshin","Lillian Secelela Madeje","Catherine Muraga","Ida Nganga","Judy Gichoya","Tabbz Maina","Samuel Maina","Muchai Mercy","Millicent Ochieng","Stephanie Nyairo"],"pdf_url":"https://arxiv.org/pdf/2411.10091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10087v1","updated":"2024-11-15T10:16:38Z","published":"2024-11-15T10:16:38Z","title":"PFML: Self-Supervised Learning of Time-Series Data Without\n Representation Collapse","summary":" Self-supervised learning (SSL) is a data-driven learning approach that\nutilizes the innate structure of the data to guide the learning process. In\ncontrast to supervised learning, which depends on external labels, SSL utilizes\nthe inherent characteristics of the data to produce its own supervisory signal.\nHowever, one frequent issue with SSL methods is representation collapse, where\nthe model outputs a constant input-invariant feature representation. This issue\nhinders the potential application of SSL methods to new data modalities, as\ntrying to avoid representation collapse wastes researchers' time and effort.\nThis paper introduces a novel SSL algorithm for time-series data called\nPrediction of Functionals from Masked Latents (PFML). Instead of predicting\nmasked input signals or their latent representations directly, PFML operates by\npredicting statistical functionals of the input signal corresponding to masked\nembeddings, given a sequence of unmasked embeddings. The algorithm is designed\nto avoid representation collapse, rendering it straightforwardly applicable to\ndifferent time-series data domains, such as novel sensor modalities in clinical\ndata. 
We demonstrate the effectiveness of PFML through complex, real-life\nclassification tasks across three different data modalities: infant posture and\nmovement classification from multi-sensor inertial measurement unit data,\nemotion recognition from speech data, and sleep stage classification from EEG\ndata. The results show that PFML is superior to a conceptually similar\npre-existing SSL method and competitive against the current state-of-the-art\nSSL method, while also being conceptually simpler and without suffering from\nrepresentation collapse.\n","authors":["Einari Vaaras","Manu Airaksinen","Okko Räsänen"],"pdf_url":"https://arxiv.org/pdf/2411.10087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10084v1","updated":"2024-11-15T10:02:48Z","published":"2024-11-15T10:02:48Z","title":"Adapting the Biological SSVEP Response to Artificial Neural Networks","summary":" Neuron importance assessment is crucial for understanding the inner workings\nof artificial neural networks (ANNs) and improving their interpretability and\nefficiency. This paper introduces a novel approach to neuron significance\nassessment inspired by frequency tagging, a technique from neuroscience. By\napplying sinusoidal contrast modulation to image inputs and analyzing resulting\nneuron activations, this method enables fine-grained analysis of a network's\ndecision-making processes. Experiments conducted with a convolutional neural\nnetwork for image classification reveal notable harmonics and intermodulations\nin neuron-specific responses under part-based frequency tagging. These findings\nsuggest that ANNs exhibit behavior akin to biological brains in tuning to\nflickering frequencies, thereby opening avenues for neuron/filter importance\nassessment through frequency tagging. 
The proposed method holds promise for\napplications in network pruning, and model interpretability, contributing to\nthe advancement of explainable artificial intelligence and addressing the lack\nof transparency in neural networks. Future research directions include\ndeveloping novel loss functions to encourage biologically plausible behavior in\nANNs.\n","authors":["Emirhan Böge","Yasemin Gunindi","Erchan Aptoula","Nihan Alp","Huseyin Ozkan"],"pdf_url":"https://arxiv.org/pdf/2411.10084v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.09402v2","updated":"2024-11-15T09:52:20Z","published":"2024-11-14T12:27:31Z","title":"Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast\n Computed Tomography Images for Enhanced Treatment and Prognosis","summary":" Stroke is the second leading cause of death worldwide, and is increasingly\nprevalent in low- and middle-income countries (LMICs). Timely interventions can\nsignificantly influence stroke survivability and the quality of life after\ntreatment. However, the standard and most widely available imaging method for\nconfirming strokes and their sub-types, the NCCT, is more challenging and\ntime-consuming to employ in cases of ischemic stroke. For this reason, we\ndeveloped an automated method for ischemic stroke lesion segmentation in NCCTs\nusing the nnU-Net frame work, aimed at enhancing early treatment and improving\nthe prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and\nIntersection over Union (IoU) scores of 0.501 on the sampled dataset. After\nadjusting for outliers, these scores improved to 0.752 for the Dice score and\n0.643 for the IoU. 
Proper delineation of the region of infarction can help\nclinicians better assess the potential impact of the infarction, and guide\ntreatment procedures.\n","authors":["Toufiq Musah","Prince Ebenezer Adjei","Kojo Obed Otoo"],"pdf_url":"https://arxiv.org/pdf/2411.09402v2.pdf","comment":"7 pages, 3 figures, MICCAI Meets Africa Workshop"},{"id":"http://arxiv.org/abs/2411.10072v1","updated":"2024-11-15T09:37:49Z","published":"2024-11-15T09:37:49Z","title":"Real-Time AI-Driven People Tracking and Counting Using Overhead Cameras","summary":" Accurate people counting in smart buildings and intelligent transportation\nsystems is crucial for energy management, safety protocols, and resource\nallocation. This is especially critical during emergencies, where precise\noccupant counts are vital for safe evacuation. Existing methods struggle with\nlarge crowds, often losing accuracy with even a few additional people. To\naddress this limitation, this study proposes a novel approach combining a new\nobject tracking algorithm, a novel counting algorithm, and a fine-tuned object\ndetection model. This method achieves 97% accuracy in real-time people counting\nwith a frame rate of 20-27 FPS on a low-power edge computer.\n","authors":["Ishrath Ahamed","Chamith Dilshan Ranathunga","Dinuka Sandun Udayantha","Benny Kai Kiat Ng","Chau Yuen"],"pdf_url":"https://arxiv.org/pdf/2411.10072v1.pdf","comment":"This paper is accepted to IEEE Region 10 conference (TENCON) 2024"},{"id":"http://arxiv.org/abs/2411.10071v1","updated":"2024-11-15T09:34:28Z","published":"2024-11-15T09:34:28Z","title":"Evidential Federated Learning for Skin Lesion Image Classification","summary":" We introduce FedEvPrompt, a federated learning approach that integrates\nprinciples of evidential deep learning, prompt tuning, and knowledge\ndistillation for distributed skin lesion classification. 
FedEvPrompt leverages\ntwo sets of prompts: b-prompts (for low-level basic visual knowledge) and\nt-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision\nTransformer (ViT) models trained in an evidential learning framework to\nmaximize class evidences. Crucially, knowledge sharing across federation\nclients is achieved only through knowledge distillation on attention maps\ngenerated by the local ViT models, ensuring enhanced privacy preservation\ncompared to traditional parameter or synthetic image sharing methodologies.\nFedEvPrompt is optimized within a round-based learning paradigm, where each\nround involves training local models followed by attention maps sharing with\nall federation clients. Experimental validation conducted in a real distributed\nsetting, on the ISIC2019 dataset, demonstrates the superior performance of\nFedEvPrompt against baseline federated learning algorithms and knowledge\ndistillation methods, without sharing model parameters. In conclusion,\nFedEvPrompt offers a promising approach for federated learning, effectively\naddressing challenges such as data heterogeneity, imbalance, privacy\npreservation, and knowledge sharing.\n","authors":["Rutger Hendrix","Federica Proietto Salanitri","Concetto Spampinato","Simone Palazzo","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2411.10071v1.pdf","comment":"Published as a conference paper at ICPR 2024"},{"id":"http://arxiv.org/abs/2411.06740v2","updated":"2024-11-15T09:31:52Z","published":"2024-11-11T06:25:13Z","title":"Dockformer: A transformer-based molecular docking paradigm for\n large-scale virtual screening","summary":" Molecular docking enables virtual screening of compound libraries to identify\npotential ligands that target proteins of interest, a crucial step in drug\ndevelopment; however, as the size of the compound library increases, the\ncomputational complexity of traditional docking models increases. 
Deep learning\nalgorithms can provide data-driven research and development models to increase\nthe speed of the docking process. Unfortunately, few models can achieve\nsuperior screening performance compared to that of traditional models.\nTherefore, a novel deep learning-based docking approach named Dockformer is\nintroduced in this study. Dockformer leverages multimodal information to\ncapture the geometric topology and structural knowledge of molecules and can\ndirectly generate binding conformations with the corresponding confidence\nmeasures in an end-to-end manner. The experimental results show that Dockformer\nachieves success rates of 90.53\\% and 82.71\\% on the PDBbind core set and\nPoseBusters benchmarks, respectively, and more than a 100-fold increase in the\ninference process speed, outperforming almost all state-of-the-art docking\nmethods. In addition, the ability of Dockformer to identify the main protease\ninhibitors of coronaviruses is demonstrated in a real-world virtual screening\nscenario. Considering its high docking accuracy and screening efficiency,\nDockformer can be regarded as a powerful and robust tool in the field of drug\ndesign.\n","authors":["Zhangfan Yang","Junkai Ji","Shan He","Jianqiang Li","Ruibin Bai","Zexuan Zhu","Yew Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.06740v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.10063v1","updated":"2024-11-15T09:26:00Z","published":"2024-11-15T09:26:00Z","title":"Federated Domain Generalization via Prompt Learning and Aggregation","summary":" Federated domain generalization (FedDG) aims to improve the global model\ngeneralization in unseen domains by addressing data heterogeneity under\nprivacy-preserving constraints. A common strategy in existing FedDG studies\ninvolves sharing domain-specific knowledge among clients, such as spectrum\ninformation, class prototypes, and data styles. 
However, this knowledge is\nextracted directly from local client samples, and sharing such sensitive\ninformation poses a potential risk of data leakage, which might not fully meet\nthe requirements of FedDG. In this paper, we introduce prompt learning to adapt\npre-trained vision-language models (VLMs) in the FedDG scenario, and leverage\nlocally learned prompts as a more secure bridge to facilitate knowledge\ntransfer among clients. Specifically, we propose a novel FedDG framework\nthrough Prompt Learning and AggregatioN (PLAN), which comprises two training\nstages to collaboratively generate local prompts and global prompts at each\nfederated round. First, each client performs both text and visual prompt\nlearning using their own data, with local prompts indirectly synchronized by\nregarding the global prompts as a common reference. Second, all domain-specific\nlocal prompts are exchanged among clients and selectively aggregated into the\nglobal prompts using lightweight attention-based aggregators. The global\nprompts are finally applied to adapt VLMs to unseen target domains. As our PLAN\nframework requires training only a limited number of prompts and lightweight\naggregators, it offers notable advantages in computational and communication\nefficiency for FedDG. 
Extensive experiments demonstrate the superior\ngeneralization ability of PLAN across four benchmark datasets.\n","authors":["Shuai Gong","Chaoran Cui","Chunyun Zhang","Wenna Wang","Xiushan Nie","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.10063v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.10057v1","updated":"2024-11-15T09:20:46Z","published":"2024-11-15T09:20:46Z","title":"KuaiFormer: Transformer-Based Retrieval at Kuaishou","summary":" In large-scale content recommendation systems, retrieval serves as the\ninitial stage in the pipeline, responsible for selecting thousands of candidate\nitems from billions of options to pass on to ranking modules. Traditionally,\nthe dominant retrieval method has been Embedding-Based Retrieval (EBR) using a\nDeep Neural Network (DNN) dual-tower structure. However, applying transformer\nin retrieval tasks has been the focus of recent research, though real-world\nindustrial deployment still presents significant challenges. In this paper, we\nintroduce KuaiFormer, a novel transformer-based retrieval framework deployed in\na large-scale content recommendation system. KuaiFormer fundamentally redefines\nthe retrieval process by shifting from conventional score estimation tasks\n(such as click-through rate estimate) to a transformer-driven Next Action\nPrediction paradigm. This shift enables more effective real-time interest\nacquisition and multi-interest extraction, significantly enhancing retrieval\nperformance. KuaiFormer has been successfully integrated into Kuaishou App's\nshort-video recommendation system since May 2024, serving over 400 million\ndaily active users and resulting in a marked increase in average daily usage\ntime of Kuaishou users. We provide insights into both the technical and\nbusiness aspects of deploying transformer in large-scale recommendation\nsystems, addressing practical challenges encountered during industrial\nimplementation. 
Our findings offer valuable guidance for engineers and\nresearchers aiming to leverage transformer models to optimize large-scale\ncontent recommendation systems.\n","authors":["Chi Liu","Jiangxia Cao","Rui Huang","Kai Zheng","Qiang Luo","Kun Gai","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.10057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10055v1","updated":"2024-11-15T09:17:40Z","published":"2024-11-15T09:17:40Z","title":"Towards unearthing neglected climate innovations from scientific\n literature using Large Language Models","summary":" Climate change poses an urgent global threat, needing the rapid\nidentification and deployment of innovative solutions. We hypothesise that many\nof these solutions already exist within scientific literature but remain\nunderutilised. To address this gap, this study employs a curated dataset\nsourced from OpenAlex, a comprehensive repository of scientific papers.\nUtilising Large Language Models (LLMs), such as GPT4-o from OpenAI, we evaluate\ntitle-abstract pairs from scientific papers on seven dimensions, covering\nclimate change mitigation potential, stage of technological development, and\nreadiness for deployment. The outputs of the language models are then compared\nwith human evaluations to assess their effectiveness in identifying promising\nyet overlooked climate innovations. Our findings suggest that these LLM-based\nmodels can effectively augment human expertise, uncovering climate solutions\nthat are potentially impactful but with far greater speed, throughput and\nconsistency. Here, we focused on UK-based solutions, but the workflow is\nregion-agnostic. 
This work contributes to the discovery of neglected\ninnovations in scientific literature and demonstrates the potential of AI in\nenhancing climate action strategies.\n","authors":["César Quilodrán-Casas","Christopher Waite","Nicole Alhadeff","Diyona Dsouza","Cathal Hughes","Larissa Kunstel-Tabet","Alyssa Gilbert"],"pdf_url":"https://arxiv.org/pdf/2411.10055v1.pdf","comment":"10 pages. Accepted in the LatinX in AI workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10053v1","updated":"2024-11-15T09:11:10Z","published":"2024-11-15T09:11:10Z","title":"That Chip Has Sailed: A Critique of Unfounded Skepticism Around AI for\n Chip Design","summary":" In 2020, we introduced a deep reinforcement learning method capable of\ngenerating superhuman chip layouts, which we then published in Nature and\nopen-sourced on GitHub. AlphaChip has inspired an explosion of work on AI for\nchip design, and has been deployed in state-of-the-art chips across Alphabet\nand extended by external chipmakers. Even so, a non-peer-reviewed invited paper\nat ISPD 2023 questioned its performance claims, despite failing to run our\nmethod as described in Nature. For example, it did not pre-train the RL method\n(removing its ability to learn from prior experience), used substantially fewer\ncompute resources (20x fewer RL experience collectors and half as many GPUs),\ndid not train to convergence (standard practice in machine learning), and\nevaluated on test cases that are not representative of modern chips. Recently,\nIgor Markov published a meta-analysis of three papers: our peer-reviewed Nature\npaper, the non-peer-reviewed ISPD paper, and Markov's own unpublished paper\n(though he does not disclose that he co-authored it). 
Although AlphaChip has\nalready achieved widespread adoption and impact, we publish this response to\nensure that no one is wrongly discouraged from innovating in this impactful\narea.\n","authors":["Anna Goldie","Azalia Mirhoseini","Jeff Dean"],"pdf_url":"https://arxiv.org/pdf/2411.10053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10050v1","updated":"2024-11-15T09:05:03Z","published":"2024-11-15T09:05:03Z","title":"Jal Anveshak: Prediction of fishing zones using fine-tuned LlaMa 2","summary":" In recent years, the global and Indian government efforts in monitoring and\ncollecting data related to the fisheries industry have witnessed significant\nadvancements. Despite this wealth of data, there exists an untapped potential\nfor leveraging artificial intelligence based technological systems to benefit\nIndian fishermen in coastal areas. To fill this void in the Indian technology\necosystem, the authors introduce Jal Anveshak. This is an application framework\nwritten in Dart and Flutter that uses a Llama 2 based Large Language Model\nfine-tuned on pre-processed and augmented government data related to fishing\nyield and availability. 
Its main purpose is to help Indian fishermen safely get\nthe maximum yield of fish from coastal areas and to resolve their fishing\nrelated queries in multilingual and multimodal ways.\n","authors":["Arnav Mejari","Maitreya Vaghulade","Paarshva Chitaliya","Arya Telang","Lynette D'mello"],"pdf_url":"https://arxiv.org/pdf/2411.10050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10048v1","updated":"2024-11-15T08:55:31Z","published":"2024-11-15T08:55:31Z","title":"Physics-informed neural networks need a physicist to be accurate: the\n case of mass and heat transport in Fischer-Tropsch catalyst particles","summary":" Physics-Informed Neural Networks (PINNs) have emerged as an influential\ntechnology, merging the swift and automated capabilities of machine learning\nwith the precision and dependability of simulations grounded in theoretical\nphysics. PINNs are often employed to solve algebraic or differential equations\nto replace some or even all steps of multi-stage computational workflows,\nleading to their significant speed-up. However, wide adoption of PINNs is still\nhindered by reliability issues, particularly at extreme ends of the input\nparameter ranges. In this study, we demonstrate this in the context of a system\nof coupled non-linear differential reaction-diffusion and heat transfer\nequations related to Fischer-Tropsch synthesis, which are solved by a\nfinite-difference method with a PINN used in evaluating their source terms. It\nis shown that the testing strategies traditionally used to assess the accuracy\nof neural networks as function approximators can overlook the peculiarities\nwhich ultimately cause instabilities of the finite-difference solver. We\npropose a domain knowledge-based modifications to the PINN architecture\nensuring its correct asymptotic behavior. 
When combined with an improved\nnumerical scheme employed as an initial guess generator, the proposed\nmodifications are shown to recover the overall stability of the simulations,\nwhile preserving the speed-up brought by PINN as the workflow component. We\ndiscuss the possible applications of the proposed hybrid transport equation\nsolver in context of chemical reactors simulations.\n","authors":["Tymofii Nikolaienko","Harshil Patel","Aniruddha Panda","Subodh Madhav Joshi","Stanislav Jaso","Kaushic Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2411.10048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06740v3","updated":"2024-11-15T08:36:51Z","published":"2024-08-13T09:00:35Z","title":"DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with\n Diffusion","summary":" Personalized text-to-image generation has gained significant attention for\nits capability to generate high-fidelity portraits of specific identities\nconditioned on user-defined prompts. Existing methods typically involve\ntest-time fine-tuning or incorporating an additional pre-trained branch.\nHowever, these approaches struggle to simultaneously address efficiency,\nidentity fidelity, and the preservation of the model's original generative\ncapabilities. In this paper, we propose DiffLoRA, an efficient method that\nleverages the diffusion model as a hypernetwork to predict personalized\nLow-Rank Adaptation (LoRA) weights based on the reference images. By\nincorporating these LoRA weights into the off-the-shelf text-to-image model,\nDiffLoRA enables zero-shot personalization during inference, eliminating the\nneed for post-processing optimization. Moreover, we introduce a novel\nidentity-oriented LoRA weights construction pipeline to facilitate the training\nprocess of DiffLoRA. The dataset generated through this pipeline enables\nDiffLoRA to produce consistently high-quality LoRA weights. 
Notably, the\ndistinctive properties of the diffusion model enhance the generation of\nsuperior weights by employing probabilistic modeling to capture intricate\nstructural patterns and thoroughly explore the weight space. Comprehensive\nexperimental results demonstrate that DiffLoRA outperforms existing\npersonalization approaches across multiple benchmarks, achieving both time\nefficiency and maintaining identity fidelity throughout the personalization\nprocess.\n","authors":["Yujia Wu","Yiming Shi","Jiwei Wei","Chengwei Sun","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2408.06740v3.pdf","comment":"9 pages,8 figures"},{"id":"http://arxiv.org/abs/2411.10036v1","updated":"2024-11-15T08:36:24Z","published":"2024-11-15T08:36:24Z","title":"Rethinking Normalization Strategies and Convolutional Kernels for\n Multimodal Image Fusion","summary":" Multimodal image fusion (MMIF) aims to integrate information from different\nmodalities to obtain a comprehensive image, aiding downstream tasks. However,\nexisting methods tend to prioritize natural image fusion and focus on\ninformation complementary and network training strategies. They ignore the\nessential distinction between natural and medical image fusion and the\ninfluence of underlying components. This paper dissects the significant\ndifferences between the two tasks regarding fusion goals, statistical\nproperties, and data distribution. Based on this, we rethink the suitability of\nthe normalization strategy and convolutional kernels for end-to-end\nMMIF.Specifically, this paper proposes a mixture of instance normalization and\ngroup normalization to preserve sample independence and reinforce intrinsic\nfeature correlation.This strategy promotes the potential of enriching feature\nmaps, thus boosting fusion performance. To this end, we further introduce the\nlarge kernel convolution, effectively expanding receptive fields and enhancing\nthe preservation of image detail. 
Moreover, the proposed multipath adaptive\nfusion module recalibrates the decoder input with features of various scales\nand receptive fields, ensuring the transmission of crucial information.\nExtensive experiments demonstrate that our method exhibits state-of-the-art\nperformance in multiple fusion tasks and significantly improves downstream\napplications. The code is available at https://github.com/HeDan-11/LKC-FUNet.\n","authors":["Dan He","Guofen Wang","Weisheng Li","Yucheng Shu","Wenbo Li","Lijian Yang","Yuping Huang","Feiyan Li"],"pdf_url":"https://arxiv.org/pdf/2411.10036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10032v1","updated":"2024-11-15T08:20:26Z","published":"2024-11-15T08:20:26Z","title":"VMID: A Multimodal Fusion LLM Framework for Detecting and Identifying\n Misinformation of Short Videos","summary":" Short video platforms have become important channels for news dissemination,\noffering a highly engaging and immediate way for users to access current events\nand share information. However, these platforms have also emerged as\nsignificant conduits for the rapid spread of misinformation, as fake news and\nrumors can leverage the visual appeal and wide reach of short videos to\ncirculate extensively among audiences. Existing fake news detection methods\nmainly rely on single-modal information, such as text or images, or apply only\nbasic fusion techniques, limiting their ability to handle the complex,\nmulti-layered information inherent in short videos. To address these\nlimitations, this paper presents a novel fake news detection method based on\nmultimodal information, designed to identify misinformation through a\nmulti-level analysis of video content. This approach effectively utilizes\ndifferent modal representations to generate a unified textual description,\nwhich is then fed into a large language model for comprehensive evaluation. 
The\nproposed framework successfully integrates multimodal features within videos,\nsignificantly enhancing the accuracy and reliability of fake news detection.\nExperimental results demonstrate that the proposed approach outperforms\nexisting models in terms of accuracy, robustness, and utilization of multimodal\ninformation, achieving an accuracy of 90.93%, which is significantly higher\nthan the best baseline model (SV-FEND) at 81.05%. Furthermore, case studies\nprovide additional evidence of the effectiveness of the approach in accurately\ndistinguishing between fake news, debunking content, and real incidents,\nhighlighting its reliability and robustness in real-world applications.\n","authors":["Weihao Zhong","Yinhao Xiao","Minghui Xu","Xiuzhen Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.10032v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.10973 by other authors"},{"id":"http://arxiv.org/abs/2411.10028v1","updated":"2024-11-15T08:17:05Z","published":"2024-11-15T08:17:05Z","title":"MOT\\_FCG++: Enhanced Representation of Motion and Appearance Features","summary":" The goal of multi-object tracking (MOT) is to detect and track all objects in\na scene across frames, while maintaining a unique identity for each object.\nMost existing methods rely on the spatial motion features and appearance\nembedding features of the detected objects in consecutive frames. Effectively\nand robustly representing the spatial and appearance features of long\ntrajectories has become a critical factor affecting the performance of MOT. We\npropose a novel approach for appearance and spatial feature representation,\nimproving upon the clustering association method MOT\\_FCG. For spatial motion\nfeatures, we propose Diagonal Modulated GIoU, which more accurately represents\nthe relationship between the position and shape of the objects. 
For appearance\nfeatures, we utilize a dynamic appearance representation that incorporates\nconfidence information, enabling the trajectory appearance features to be more\nrobust and global. Based on the baseline model MOT\\_FCG, we achieved 76.1 HOTA,\n80.4 MOTA and 81.3 IDF1 on the MOT17 validation set, and also achieved\ncompetitive performance on the MOT20 and DanceTrack validation sets.\n","authors":["Yanzhao Fang"],"pdf_url":"https://arxiv.org/pdf/2411.10028v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.07728v5","updated":"2024-11-15T07:51:08Z","published":"2024-07-10T15:00:08Z","title":"SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature\n Disentanglement and Enhancement","summary":" Singing voice conversion (SVC) aims to convert a singer's voice to another\nsinger's from a reference audio while keeping the original semantics. However,\nexisting SVC methods can hardly perform zero-shot due to incomplete feature\ndisentanglement or dependence on the speaker look-up table. We propose the\nfirst open-source high-quality zero-shot SVC model SaMoye that can convert\nsinging to human and non-human timbre. SaMoye disentangles the singing voice's\nfeatures into content, timbre, and pitch features, where we combine multiple\nASR models and compress the content features to reduce timbre leaks. Besides,\nwe enhance the timbre features by unfreezing the speaker encoder and mixing the\nspeaker embedding with top-3 similar speakers. We also establish an\nunparalleled large-scale dataset to guarantee zero-shot performance, which\ncomprises more than 1,815 hours of pure singing voice and 6,367 speakers. We\nconduct objective and subjective experiments to find that SaMoye outperforms\nother models in zero-shot SVC tasks even under extreme conditions like\nconverting singing to animals' timbre. The code and weight of SaMoye are\navailable on https://github.com/CarlWangChina/SaMoye-SVC. 
The weights, code,\ndataset, and documents of SaMoye are publicly available on\n\\url{https://github.com/CarlWangChina/SaMoye-SVC}.\n","authors":["Zihao Wang","Le Ma","Yongsheng Feng","Xin Pan","Yuhang Jin","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07728v5.pdf","comment":"This paper needs major changes for resubmit"},{"id":"http://arxiv.org/abs/2411.10015v1","updated":"2024-11-15T07:50:01Z","published":"2024-11-15T07:50:01Z","title":"MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field\n Analysis Using Deep Neural Networks through Feature Visualization","summary":" Micro Crack detection using deep neural networks (DNNs) through an automated\npipeline using wave fields interacting with the damaged areas is highly sought\nafter. These high-dimensional spatio-temporal crack data are limited, and these\ndatasets have large dimensions in the temporal domain. The dataset presents a\nsubstantial class imbalance, with crack pixels constituting an average of only\n5% of the total pixels per sample. This extreme class imbalance poses a\nchallenge for deep learning models with the different micro-scale cracks, as\nthe network can be biased toward predicting the majority class, generally\nleading to poor detection accuracy. This study builds upon the previous\nbenchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack\ndetection. The impact of various activation and loss functions were examined\nthrough feature space visualization using the manifold discovery and analysis\n(MDA) algorithm. 
The optimized architecture and training methodology achieved\nan accuracy of 86.85%.\n","authors":["Fatahlla Moreh","Yusuf Hasan","Bilal Zahid Hussain","Mohammad Ammar","Sven Tomforde"],"pdf_url":"https://arxiv.org/pdf/2411.10015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10010v1","updated":"2024-11-15T07:42:16Z","published":"2024-11-15T07:42:16Z","title":"DeepMedcast: A Deep Learning Method for Generating Intermediate Weather\n Forecasts among Multiple NWP Models","summary":" Numerical weather prediction (NWP) centers around the world operate a variety\nof NWP models, and recent advances in AI-driven NWP models have increased the\navailability of diverse NWP outputs. While this expansion holds the potential\nto improve forecast accuracy, it also raises a critical challenge of\nidentifying the most reliable predictions for specific forecast scenarios.\nTraditional approaches, such as ensemble or weighted averaging, combine\nmultiple NWP outputs but often generate unrealistic atmospheric fields,\ncomplicating the production of reliable and consistent forecasts in operational\nsettings. In this study, we introduce DeepMedcast, a deep learning method that\ngenerates intermediate forecast, or \"medcast\", between two or more NWP outputs.\nUnlike ensemble averaging, DeepMedcast can provide consistent and explainable\nmedcast without distorting meteorological fields. This paper details the\nmethodology and case studies of DeepMedcast, discussing its advantages and\npotential contributions to operational forecasting.\n","authors":["Atsushi Kudo"],"pdf_url":"https://arxiv.org/pdf/2411.10010v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.10008v1","updated":"2024-11-15T07:42:01Z","published":"2024-11-15T07:42:01Z","title":"Graph-based Complexity for Causal Effect by Empirical Plug-in","summary":" This paper focuses on the computational complexity of computing empirical\nplug-in estimates for causal effect queries. 
Given a causal graph and\nobservational data, any identifiable causal query can be estimated from an\nexpression over the observed variables, called the estimand. The estimand can\nthen be evaluated by plugging in probabilities computed empirically from data.\nIn contrast to conventional wisdom, which assumes that high dimensional\nprobabilistic functions will lead to exponential evaluation time of the\nestimand. We show that computation can be done efficiently, potentially in time\nlinear in the data size, depending on the estimand's hypergraph.\n In particular, we show that both the treewidth and hypertree width of the\nestimand's structure bound the evaluation complexity of the plug-in estimands,\nanalogous to their role in the complexity of probabilistic inference in\ngraphical models. Often, the hypertree width provides a more effective bound,\nsince the empirical distributions are sparse.\n","authors":["Rina Dechter","Annie Raichev","Alexander Ihler","Jin Tian"],"pdf_url":"https://arxiv.org/pdf/2411.10008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10006v1","updated":"2024-11-15T07:35:47Z","published":"2024-11-15T07:35:47Z","title":"Orca: Enhancing Role-Playing Abilities of Large Language Models by\n Integrating Personality Traits","summary":" Large language models has catalyzed the development of personalized dialogue\nsystems, numerous role-playing conversational agents have emerged. While\nprevious research predominantly focused on enhancing the model's capability to\nfollow instructions by designing character profiles, neglecting the\npsychological factors that drive human conversations. In this paper, we propose\nOrca, a framework for data processing and training LLMs of custom characters by\nintegrating personality traits. Orca comprises four stages: (1) Personality\ntraits inferring, leverage LLMs to infer user's BigFive personality trait\nreports and scores. 
(2) Data Augment, simulate user's profile, background\nstory, and psychological activities. (3) Dataset construction,\npersonality-conditioned instruction prompting (PCIP) to stimulate LLMs. (4)\nModeling and Training, personality-conditioned instruction tuning (PTIT and\nPSIT), using the generated data to enhance existing open-source LLMs. We\nintroduce OrcaBench, the first benchmark for evaluating the quality of content\ngenerated by LLMs on social platforms across multiple scales. Our experiments\ndemonstrate that our proposed model achieves superior performance on this\nbenchmark, demonstrating its excellence and effectiveness in perceiving\npersonality traits that significantly improve role-playing abilities. Our Code\nis available at https://github.com/Aipura/Orca.\n","authors":["Yuxuan Huang"],"pdf_url":"https://arxiv.org/pdf/2411.10006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10004v1","updated":"2024-11-15T07:30:53Z","published":"2024-11-15T07:30:53Z","title":"EyeDiff: text-to-image diffusion model improves rare eye disease\n diagnosis","summary":" The rising prevalence of vision-threatening retinal diseases poses a\nsignificant burden on the global healthcare systems. Deep learning (DL) offers\na promising solution for automatic disease screening but demands substantial\ndata. Collecting and labeling large volumes of ophthalmic images across various\nmodalities encounters several real-world challenges, especially for rare\ndiseases. Here, we introduce EyeDiff, a text-to-image model designed to\ngenerate multimodal ophthalmic images from natural language prompts and\nevaluate its applicability in diagnosing common and rare diseases. EyeDiff is\ntrained on eight large-scale datasets using the advanced latent diffusion\nmodel, covering 14 ophthalmic image modalities and over 80 ocular diseases, and\nis adapted to ten multi-country external datasets. 
The generated images\naccurately capture essential lesional characteristics, achieving high alignment\nwith text prompts as evaluated by objective metrics and human experts.\nFurthermore, integrating generated images significantly enhances the accuracy\nof detecting minority classes and rare eye diseases, surpassing traditional\noversampling methods in addressing data imbalance. EyeDiff effectively tackles\nthe issue of data imbalance and insufficiency typically encountered in rare\ndiseases and addresses the challenges of collecting large-scale annotated\nimages, offering a transformative solution to enhance the development of\nexpert-level diseases diagnosis models in ophthalmic field.\n","authors":["Ruoyu Chen","Weiyi Zhang","Bowen Liu","Xiaolan Chen","Pusheng Xu","Shunming Liu","Mingguang He","Danli Shi"],"pdf_url":"https://arxiv.org/pdf/2411.10004v1.pdf","comment":"28 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.14704v2","updated":"2024-11-15T07:19:03Z","published":"2024-09-23T04:50:36Z","title":"VLEU: a Method for Automatic Evaluation for Generalizability of\n Text-to-Image Models","summary":" Progress in Text-to-Image (T2I) models has significantly improved the\ngeneration of images from textual descriptions. However, existing evaluation\nmetrics do not adequately assess the models' ability to handle a diverse range\nof textual prompts, which is crucial for their generalizability. To address\nthis, we introduce a new metric called Visual Language Evaluation Understudy\n(VLEU). VLEU uses large language models to sample from the visual text domain,\nthe set of all possible input texts for T2I models, to generate a wide variety\nof prompts. 
The images generated from these prompts are evaluated based on\ntheir alignment with the input text using the CLIP model.VLEU quantifies a\nmodel's generalizability by computing the Kullback-Leibler divergence between\nthe marginal distribution of the visual text and the conditional distribution\nof the images generated by the model. This metric provides a quantitative way\nto compare different T2I models and track improvements during model finetuning.\nOur experiments demonstrate the effectiveness of VLEU in evaluating the\ngeneralization capability of various T2I models, positioning it as an essential\nmetric for future research in text-to-image synthesis.\n","authors":["Jingtao Cao","Zheng Zhang","Hongru Wang","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2409.14704v2.pdf","comment":"accepted by EMNLP2024(long paper,main conference)"},{"id":"http://arxiv.org/abs/2411.10000v1","updated":"2024-11-15T07:15:05Z","published":"2024-11-15T07:15:05Z","title":"DuSEGO: Dual Second-order Equivariant Graph Ordinary Differential\n Equation","summary":" Graph Neural Networks (GNNs) with equivariant properties have achieved\nsignificant success in modeling complex dynamic systems and molecular\nproperties. However, their expressiveness ability is limited by: (1) Existing\nmethods often overlook the over-smoothing issue caused by traditional GNN\nmodels, as well as the gradient explosion or vanishing problems in deep GNNs.\n(2) Most models operate on first-order information, neglecting that the real\nworld often consists of second-order systems, which further limits the model's\nrepresentation capabilities. To address these issues, we propose the\n\\textbf{Du}al \\textbf{S}econd-order \\textbf{E}quivariant \\textbf{G}raph\n\\textbf{O}rdinary Differential Equation (\\method{}) for equivariant\nrepresentation. 
Specifically, \\method{} apply the dual second-order equivariant\ngraph ordinary differential equations (Graph ODEs) on graph embeddings and node\ncoordinates, simultaneously. Theoretically, we first prove that \\method{}\nmaintains the equivariant property. Furthermore, we provide theoretical\ninsights showing that \\method{} effectively alleviates the over-smoothing\nproblem in both feature representation and coordinate update. Additionally, we\ndemonstrate that the proposed \\method{} mitigates the exploding and vanishing\ngradients problem, facilitating the training of deep multi-layer GNNs.\nExtensive experiments on benchmark datasets validate the superiority of the\nproposed \\method{} compared to baselines.\n","authors":["Yingxu Wang","Nan Yin","Mingyan Xiao","Xinhao Yi","Siwei Liu","Shangsong Liang"],"pdf_url":"https://arxiv.org/pdf/2411.10000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09996v1","updated":"2024-11-15T07:01:44Z","published":"2024-11-15T07:01:44Z","title":"Building 6G Radio Foundation Models with Transformer Architectures","summary":" Foundation deep learning (DL) models are general models, designed to learn\ngeneral, robust and adaptable representations of their target modality,\nenabling finetuning across a range of downstream tasks. These models are\npretrained on large, unlabeled datasets using self-supervised learning (SSL).\nFoundation models have demonstrated better generalization than traditional\nsupervised approaches, a critical requirement for wireless communications where\nthe dynamic environment demands model adaptability. In this work, we propose\nand demonstrate the effectiveness of a Vision Transformer (ViT) as a radio\nfoundation model for spectrogram learning. We introduce a Masked Spectrogram\nModeling (MSM) approach to pretrain the ViT in a self-supervised fashion. 
We\nevaluate the ViT-based foundation model on two downstream tasks: Channel State\nInformation (CSI)-based Human Activity sensing and Spectrogram Segmentation.\nExperimental results demonstrate competitive performance to supervised training\nwhile generalizing across diverse domains. Notably, the pretrained ViT model\noutperforms a four-times larger model that is trained from scratch on the\nspectrogram segmentation task, while requiring significantly less training\ntime, and achieves competitive performance on the CSI-based human activity\nsensing task. This work demonstrates the effectiveness of ViT with MSM for\npretraining as a promising technique for scalable foundation model development\nin future 6G networks.\n","authors":["Ahmed Aboulfotouh","Ashkan Eshaghbeigi","Hatem Abou-Zeid"],"pdf_url":"https://arxiv.org/pdf/2411.09996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11282v3","updated":"2024-11-15T06:48:58Z","published":"2023-12-18T15:23:06Z","title":"Evaluating and Enhancing Large Language Models for Conversational\n Reasoning on Knowledge Graphs","summary":" The development of large language models (LLMs) has been catalyzed by\nadvancements in pre-training techniques. These models have demonstrated robust\nreasoning capabilities through manually designed prompts. In this work, we\nevaluate the conversational reasoning capabilities of the current\nstate-of-the-art LLM (GPT-4) on knowledge graphs (KGs). However, the\nperformance of LLMs is constrained due to a lack of KG environment awareness\nand the difficulties in developing effective optimization mechanisms for\nintermediary reasoning stages. We further introduce LLM-ARK, a LLM grounded KG\nreasoning agent designed to deliver precise and adaptable predictions on KG\npaths. LLM-ARK leverages Full Textual Environment (FTE) prompt to assimilate\nstate information within each reasoning step. We reframe the challenge of\nmulti-hop reasoning on the KG as a sequential decision-making task. 
Utilizing\nthe Proximal Policy Optimization (PPO) online policy gradient reinforcement\nlearning algorithm, our model is optimized to learn from rich reward signals.\nAdditionally, we conduct an evaluation of our model and GPT-4 on the OpenDialKG\ndataset. The experimental results reveal that LLaMA-2-7B-ARK outperforms the\ncurrent state-of-the-art model by 5.28 percentage points, with a performance\nrate of 36.39% on the target@1 evaluation metric. Meanwhile, GPT-4 scored\n14.91%, further demonstrating the effectiveness of our method. Our code is\navailable on GitHub (https://github.com/Aipura/LLM-ARK) for further access.\n","authors":["Yuxuan Huang"],"pdf_url":"https://arxiv.org/pdf/2312.11282v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09986v1","updated":"2024-11-15T06:43:49Z","published":"2024-11-15T06:43:49Z","title":"Unlocking Transfer Learning for Open-World Few-Shot Recognition","summary":" Few-Shot Open-Set Recognition (FSOSR) targets a critical real-world\nchallenge, aiming to categorize inputs into known categories, termed closed-set\nclasses, while identifying open-set inputs that fall outside these classes.\nAlthough transfer learning where a model is tuned to a given few-shot task has\nbecome a prominent paradigm in closed-world, we observe that it fails to expand\nto open-world. To unlock this challenge, we propose a two-stage method which\nconsists of open-set aware meta-learning with open-set free transfer learning.\nIn the open-set aware meta-learning stage, a model is trained to establish a\nmetric space that serves as a beneficial starting point for the subsequent\nstage. During the open-set free transfer learning stage, the model is further\nadapted to a specific target task through transfer learning. Additionally, we\nintroduce a strategy to simulate open-set examples by modifying the training\ndataset or generating pseudo open-set examples. 
The proposed method achieves\nstate-of-the-art performance on two widely recognized benchmarks, miniImageNet\nand tieredImageNet, with only a 1.5\\% increase in training effort. Our work\ndemonstrates the effectiveness of transfer learning in FSOSR.\n","authors":["Byeonggeun Kim","Juntae Lee","Kyuhong Shim","Simyung Chang"],"pdf_url":"https://arxiv.org/pdf/2411.09986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01483v3","updated":"2024-11-15T06:31:44Z","published":"2024-05-02T17:14:57Z","title":"MANTIS: Interleaved Multi-Image Instruction Tuning","summary":" Large multimodal models (LMMs) have shown great results in single-image\nvision language tasks. However, their abilities to solve multi-image visual\nlanguage tasks is yet to be improved. The existing LMMs like OpenFlamingo,\nEmu2, and Idefics gain their multi-image ability through pre-training on\nhundreds of millions of noisy interleaved image-text data from the web, which\nis neither efficient nor effective. In this paper, we aim to build strong\nmulti-image LMMs via instruction tuning with academic-level resources.\nTherefore, we meticulously construct Mantis-Instruct containing 721K\nmulti-image instruction data to train a family of Mantis models. The\ninstruction tuning empowers Mantis with different multi-image skills like\nco-reference, comparison, reasoning, and temporal understanding. We evaluate\nMantis on 8 multi-image benchmarks and 6 single-image benchmarks.\nMantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and\nbeat the strongest multi-image baseline, Idefics2-8B by an average of 13\nabsolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved\nmulti-image data, which is 200x larger than Mantis-Instruct. We observe that\nMantis performs equivalently well on the held-in and held-out benchmarks, which\nshows its generalization ability. 
We further evaluate Mantis on single-image\nbenchmarks and demonstrate that Mantis also maintains a strong single-image\nperformance on par with CogVLM and Emu2. Our results show that multi-image\nabilities are not necessarily gained through massive pre-training, instead,\nthey can be gained by low-cost instruction tuning. The training and evaluation\nof Mantis has paved the road for future work to improve LMMs' multi-image\nabilities.\n","authors":["Dongfu Jiang","Xuan He","Huaye Zeng","Cong Wei","Max Ku","Qian Liu","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01483v3.pdf","comment":"13 pages, 3 figures, 13 tables"},{"id":"http://arxiv.org/abs/2411.08933v2","updated":"2024-11-15T06:13:33Z","published":"2024-11-13T09:13:20Z","title":"Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for\n Certified Robustness","summary":" The remarkable advances in deep learning have led to the emergence of many\noff-the-shelf classifiers, e.g., large pre-trained models. However, since they\nare typically trained on clean data, they remain vulnerable to adversarial\nattacks. Despite this vulnerability, their superior performance and\ntransferability make off-the-shelf classifiers still valuable in practice,\ndemanding further work to provide adversarial robustness for them in a post-hoc\nmanner. A recently proposed method, denoised smoothing, leverages a denoiser\nmodel in front of the classifier to obtain provable robustness without\nadditional training. However, the denoiser often creates hallucination, i.e.,\nimages that have lost the semantics of their originally assigned class, leading\nto a drop in robustness. Furthermore, its noise-and-denoise procedure\nintroduces a significant distribution shift from the original distribution,\ncausing the denoised smoothing framework to achieve sub-optimal robustness. 
In\nthis paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image\nSelection (FT-CADIS), a novel fine-tuning scheme to enhance the certified\nrobustness of off-the-shelf classifiers. FT-CADIS is inspired by the\nobservation that the confidence of off-the-shelf classifiers can effectively\nidentify hallucinated images during denoised smoothing. Based on this, we\ndevelop a confidence-aware training objective to handle such hallucinated\nimages and improve the stability of fine-tuning from denoised images. In this\nway, the classifier can be fine-tuned using only images that are beneficial for\nadversarial robustness. We also find that such a fine-tuning can be done by\nupdating a small fraction of parameters of the classifier. Extensive\nexperiments demonstrate that FT-CADIS has established the state-of-the-art\ncertified robustness among denoised smoothing methods across all\n$\\ell_2$-adversary radius in various benchmarks.\n","authors":["Suhyeok Jang","Seojin Kim","Jinwoo Shin","Jongheon Jeong"],"pdf_url":"https://arxiv.org/pdf/2411.08933v2.pdf","comment":"26 pages; TMLR 2024; Code is available at\n https://github.com/suhyeok24/FT-CADIS"},{"id":"http://arxiv.org/abs/2411.09972v1","updated":"2024-11-15T06:05:45Z","published":"2024-11-15T06:05:45Z","title":"Large Language Models as User-Agents for Evaluating\n Task-Oriented-Dialogue Systems","summary":" Traditionally, offline datasets have been used to evaluate task-oriented\ndialogue (TOD) models. These datasets lack context awareness, making them\nsuboptimal benchmarks for conversational systems. In contrast, user-agents,\nwhich are context-aware, can simulate the variability and unpredictability of\nhuman conversations, making them better alternatives as evaluators. Prior\nresearch has utilized large language models (LLMs) to develop user-agents. Our\nwork builds upon this by using LLMs to create user-agents for the evaluation of\nTOD systems. 
This involves prompting an LLM, using in-context examples as\nguidance, and tracking the user-goal state. Our evaluation of diversity and\ntask completion metrics for the user-agents shows improved performance with the\nuse of better prompts. Additionally, we propose methodologies for the automatic\nevaluation of TOD models within this dynamic framework.\n","authors":["Taaha Kazi","Ruiliang Lyu","Sizhe Zhou","Dilek Hakkani-Tur","Gokhan Tur"],"pdf_url":"https://arxiv.org/pdf/2411.09972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09969v1","updated":"2024-11-15T05:55:23Z","published":"2024-11-15T05:55:23Z","title":"Steering AI-Driven Personalization of Scientific Text for General\n Audiences","summary":" Digital media platforms (e.g., social media, science blogs) offer\nopportunities to communicate scientific content to general audiences at scale.\nHowever, these audiences vary in their scientific expertise, literacy levels,\nand personal backgrounds, making effective science communication challenging.\nTo address this challenge, we designed TranSlider, an AI-powered tool that\ngenerates personalized translations of scientific text based on individual user\nprofiles (e.g., hobbies, location, and education). Our tool features an\ninteractive slider that allows users to steer the degree of personalization\nfrom 0 (weakly relatable) to 100 (strongly relatable), leveraging LLMs to\ngenerate the translations with given degrees. Through an exploratory study with\n15 participants, we investigated both the utility of these AI-personalized\ntranslations and how interactive reading features influenced users'\nunderstanding and reading experiences. We found that participants who preferred\nhigher degrees of personalization appreciated the relatable and contextual\ntranslations, while those who preferred lower degrees valued concise\ntranslations with subtle contextualization. 
Furthermore, participants reported\nthe compounding effect of multiple translations on their understanding of\nscientific content. Given these findings, we discuss several implications of\nAI-personalized translation tools in facilitating communication in\ncollaborative contexts.\n","authors":["Taewook Kim","Dhruv Agarwal","Jordan Ackerman","Manaswi Saha"],"pdf_url":"https://arxiv.org/pdf/2411.09969v1.pdf","comment":"23 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.09968v1","updated":"2024-11-15T05:51:29Z","published":"2024-11-15T05:51:29Z","title":"Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate\n Hallucination in LVLMs","summary":" The hallucination problem in multimodal large language models (MLLMs) remains\na common issue. Although image tokens occupy a majority of the input sequence\nof MLLMs, there is limited research to explore the relationship between image\ntokens and hallucinations. In this paper, we analyze the distribution of\nattention scores for image tokens across each layer and head of the model,\nrevealing an intriguing and common phenomenon: most hallucinations are closely\nlinked to the pattern of attention sinks in the self-attention matrix of image\ntokens, where shallow layers exhibit dense attention sinks and deeper layers\nshow sparse attention sinks. We further analyze the attention heads of\ndifferent layers and find that heads with high-density attention sink in the\nimage part play a positive role in alleviating hallucinations. In this paper,\nwe propose a training-free method named \\textcolor{red}{\\textbf{E}}nhancing\n\\textcolor{red}{\\textbf{A}}ttention \\textcolor{red}{\\textbf{H}}eads (EAH), an\napproach designed to enhance the convergence of image tokens attention sinks in\nthe shallow layers. EAH identifies the attention head that shows the vision\nsink in a shallow layer and extracts its attention matrix. 
This attention map\nis then broadcast to other heads in the layer, thereby strengthening the layer\nto pay more attention to the image itself. With extensive experiments, EAH\nshows significant hallucination-mitigating performance on different MLLMs and\nmetrics, proving its effectiveness and generality.\n","authors":["Xiaofeng Zhang","Yihao Quan","Chaochen Gu","Chen Shen","Xiaosong Yuan","Shaotian Yan","Hao Cheng","Kaijie Wu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2411.09968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05531v3","updated":"2024-11-15T05:49:30Z","published":"2024-09-09T11:43:35Z","title":"HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion\n Field Alignment","summary":" Optical flow estimation is a fundamental and long-standing visual task. In\nthis work, we present a novel method, dubbed HMAFlow, to improve optical flow\nestimation in challenging scenes, particularly those involving small objects.\nThe proposed model mainly consists of two core components: a Hierarchical\nMotion Field Alignment (HMA) module and a Correlation Self-Attention (CSA)\nmodule. In addition, we rebuild 4D cost volumes by employing a Multi-Scale\nCorrelation Search (MCS) layer and replacing average pooling in common cost\nvolumes with a search strategy utilizing multiple search ranges. Experimental\nresults demonstrate that our model achieves the best generalization performance\ncompared to other state-of-the-art methods. Specifically, compared with RAFT,\nour method achieves relative error reductions of 14.2% and 3.4% on the clean\npass and final pass of the Sintel online benchmark, respectively. On the KITTI\ntest benchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by relative\nmargins of 6.8% and 7.7%, respectively. 
To facilitate future research, our code\nwill be made available at https://github.com/BooTurbo/HMAFlow.\n","authors":["Dianbo Ma","Kousuke Imamura","Ziyan Gao","Xiangjie Wang","Satoshi Yamane"],"pdf_url":"https://arxiv.org/pdf/2409.05531v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.09955v1","updated":"2024-11-15T05:18:15Z","published":"2024-11-15T05:18:15Z","title":"Instruction-Guided Editing Controls for Images and Multimedia: A Survey\n in LLM era","summary":" The rapid advancement of large language models (LLMs) and multimodal learning\nhas transformed digital content creation and manipulation. Traditional visual\nediting tools require significant expertise, limiting accessibility. Recent\nstrides in instruction-based editing have enabled intuitive interaction with\nvisual content, using natural language as a bridge between user intent and\ncomplex editing operations. This survey provides an overview of these\ntechniques, focusing on how LLMs and multimodal models empower users to achieve\nprecise visual modifications without deep technical knowledge. By synthesizing\nover 100 publications, we explore methods from generative adversarial networks\nto diffusion models, examining multimodal integration for fine-grained content\ncontrol. We discuss practical applications across domains such as fashion, 3D\nscene manipulation, and video synthesis, highlighting increased accessibility\nand alignment with human intuition. Our survey compares existing literature,\nemphasizing LLM-empowered editing, and identifies key challenges to stimulate\nfurther research. We aim to democratize powerful visual editing across various\nindustries, from entertainment to education. 
Interested readers are encouraged\nto access our repository at\nhttps://github.com/tamlhp/awesome-instruction-editing.\n","authors":["Thanh Tam Nguyen","Zhao Ren","Trinh Pham","Phi Le Nguyen","Hongzhi Yin","Quoc Viet Hung Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.09955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09952v1","updated":"2024-11-15T05:09:20Z","published":"2024-11-15T05:09:20Z","title":"GGAvatar: Reconstructing Garment-Separated 3D Gaussian Splatting Avatars\n from Monocular Video","summary":" Avatar modelling has broad applications in human animation and virtual\ntry-ons. Recent advancements in this field have focused on high-quality and\ncomprehensive human reconstruction but often overlook the separation of\nclothing from the body. To bridge this gap, this paper introduces GGAvatar\n(Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular\nvideos. Through advanced parameterized templates and unique phased training,\nthis model effectively achieves decoupled, editable, and realistic\nreconstruction of clothed humans. Comparative evaluations with other costly\nmodels confirm GGAvatar's superior quality and efficiency in modelling both\nclothed humans and separable garments. The paper also showcases applications in\nclothing editing, as illustrated in Figure 1, highlighting the model's benefits\nand the advantages of effective disentanglement. The code is available at\nhttps://github.com/J-X-Chen/GGAvatar/.\n","authors":["Jingxuan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09952v1.pdf","comment":"MMAsia'24 Accepted"},{"id":"http://arxiv.org/abs/2408.08092v3","updated":"2024-11-15T05:01:34Z","published":"2024-08-15T11:34:53Z","title":"SC3D: Label-Efficient Outdoor 3D Object Detection via Single Click\n Annotation","summary":" LiDAR-based outdoor 3D object detection has received widespread attention.\nHowever, training 3D detectors from the LiDAR point cloud typically relies on\nexpensive bounding box annotations. 
This paper presents SC3D, an innovative\nlabel-efficient method requiring only a single coarse click on the bird's eye\nview of the 3D point cloud for each frame. A key challenge here is the absence\nof complete geometric descriptions of the target objects from such simple click\nannotations. To address this issue, our proposed SC3D adopts a progressive\npipeline. Initially, we design a mixed pseudo-label generation module that\nexpands limited click annotations into a mixture of bounding box and semantic\nmask supervision. Next, we propose a mix-supervised teacher model, enabling the\ndetector to learn mixed supervision information. Finally, we introduce a\nmixed-supervised student network that leverages the teacher model's\ngeneralization ability to learn unclicked instances.Experimental results on the\nwidely used nuScenes and KITTI datasets demonstrate that our SC3D with only\ncoarse clicks, which requires only 0.2% annotation cost, achieves\nstate-of-the-art performance compared to weakly-supervised 3D detection\nmethods.The code will be made publicly available.\n","authors":["Qiming Xia","Hongwei Lin","Wei Ye","Hai Wu","Yadan Luo","Cheng Wang","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09945v1","updated":"2024-11-15T04:52:11Z","published":"2024-11-15T04:52:11Z","title":"TEESlice: Protecting Sensitive Neural Network Models in Trusted\n Execution Environments When Attackers have Pre-Trained Models","summary":" Trusted Execution Environments (TEE) are used to safeguard on-device models.\nHowever, directly employing TEEs to secure the entire DNN model is challenging\ndue to the limited computational speed. Utilizing GPU can accelerate DNN's\ncomputation speed but commercial widely-available GPUs usually lack security\nprotection. 
To this end, scholars introduce TSDP, a method that protects\nprivacy-sensitive weights within TEEs and offloads insensitive weights to GPUs.\nNevertheless, current methods do not consider the presence of a knowledgeable\nadversary who can access abundant publicly available pre-trained models and\ndatasets. This paper investigates the security of existing methods against such\na knowledgeable adversary and reveals their inability to fulfill their security\npromises. Consequently, we introduce a novel partition before training\nstrategy, which effectively separates privacy-sensitive weights from other\ncomponents of the model. Our evaluation demonstrates that our approach can\noffer full model protection with a computational cost reduced by a factor of\n10. In addition to traditional CNN models, we also demonstrate the scalability\nto large language models. Our approach can compress the private functionalities\nof the large language model to lightweight slices and achieve the same level of\nprotection as the shielding-whole-model baseline.\n","authors":["Ding Li","Ziqi Zhang","Mengyu Yao","Yifeng Cai","Yao Guo","Xiangqun Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09945v1.pdf","comment":"Accepted by TOSEM. Extended version of the S&P24 paper\n (arXiv:2310.07152)"},{"id":"http://arxiv.org/abs/2405.10347v2","updated":"2024-11-15T04:44:40Z","published":"2024-05-16T02:00:44Z","title":"Networking Systems for Video Anomaly Detection: A Tutorial and Survey","summary":" The increasing utilization of surveillance cameras in smart cities, coupled\nwith the surge of online video applications, has heightened concerns regarding\npublic security and privacy protection, which propelled automated Video Anomaly\nDetection (VAD) into a fundamental research task within the Artificial\nIntelligence (AI) community. 
With the advancements in deep learning and edge\ncomputing, VAD has made significant progress and advances synergized with\nemerging applications in smart cities and video internet, which has moved\nbeyond the conventional research scope of algorithm engineering to deployable\nNetworking Systems for VAD (NSVAD), a practical hotspot for intersection\nexploration in the AI, IoVT, and computing fields. In this article, we\ndelineate the foundational assumptions, learning frameworks, and applicable\nscenarios of various deep learning-driven VAD routes, offering an exhaustive\ntutorial for novices in NSVAD. This article elucidates core concepts by\nreviewing recent advances and typical solutions and aggregating available\nresearch resources accessible at https://github.com/fdjingliu/NSVAD.\nAdditionally, we showcase our latest NSVAD research in industrial IoT and smart\ncities, along with an end-cloud collaborative architecture for deployable\nNSVAD. Lastly, this article projects future development trends and discusses\nhow the integration of AI and computing technologies can address existing\nresearch challenges and promote open opportunities, serving as an insightful\nguide for prospective researchers and engineers.\n","authors":["Jing Liu","Yang Liu","Jieyu Lin","Jielin Li","Liang Cao","Peng Sun","Bo Hu","Liang Song","Azzedine Boukerche","Victor C. M. Leung"],"pdf_url":"https://arxiv.org/pdf/2405.10347v2.pdf","comment":"Revised to ACM Computing Surveys, under review, for more information\n and supplementary material, please see https://github.com/fdjingliu/NSVAD"},{"id":"http://arxiv.org/abs/2310.02170v2","updated":"2024-11-15T04:30:04Z","published":"2023-10-03T16:05:48Z","title":"A Dynamic LLM-Powered Agent Network for Task-Oriented Agent\n Collaboration","summary":" Recent studies show that collaborating multiple large language model (LLM)\npowered agents is a promising way for task solving. 
However, current approaches\nare constrained by using a fixed number of agents and static communication\nstructures. In this work, we propose automatically selecting a team of agents\nfrom candidates to collaborate in a dynamic communication structure toward\ndifferent tasks and domains. Specifically, we build a framework named Dynamic\nLLM-Powered Agent Network ($\\textbf{DyLAN}$) for LLM-powered agent\ncollaboration, operating a two-stage paradigm: (1) Team Optimization and (2)\nTask Solving. During the first stage, we utilize an $\\textit{agent selection}$\nalgorithm, based on an unsupervised metric called $\\textit{Agent Importance\nScore}$, enabling the selection of best agents according to their contributions\nin a preliminary trial, oriented to the given task. Then, in the second stage,\nthe selected agents collaborate dynamically according to the query.\nEmpirically, we demonstrate that DyLAN outperforms strong baselines in code\ngeneration, decision-making, general reasoning, and arithmetic reasoning tasks\nwith moderate computational cost. On specific subjects in MMLU, selecting a\nteam of agents in the team optimization stage improves accuracy by up to 25.0%\nin DyLAN.\n","authors":["Zijun Liu","Yanzhe Zhang","Peng Li","Yang Liu","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02170v2.pdf","comment":"Published in COLM2024. Code Repo: https://github.com/SALT-NLP/DyLAN"},{"id":"http://arxiv.org/abs/2411.09933v1","updated":"2024-11-15T04:16:50Z","published":"2024-11-15T04:16:50Z","title":"JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by\n Evolutionary Optimization of Model Merging","summary":" With the rapid advancement of large language models (LLMs), foundational\nmodels (FMs) have seen significant advancements. Healthcare is one of the most\ncrucial application areas for these FMs, given the significant time and effort\nrequired for physicians to analyze large volumes of patient data. 
Recent\nefforts have focused on adapting multimodal FMs to the medical domain through\ntechniques like instruction-tuning, leading to the development of medical\nfoundation models (MFMs). However, these approaches typically require large\namounts of training data to effectively adapt models to the medical field.\nMoreover, most existing models are trained on English datasets, limiting their\npracticality in non-English-speaking regions where healthcare professionals and\npatients are not always fluent in English. The need for translation introduces\nadditional costs and inefficiencies. To address these challenges, we propose a\n\\textbf{J}apanese \\textbf{Radi}ology report generation model enhanced by\n\\textbf{Evo}lutionary optimization of model merging (JRadiEvo). This is the\nfirst attempt to extend a non-medical vision-language foundation model to the\nmedical domain through evolutionary optimization of model merging. We\nsuccessfully created a model that generates accurate Japanese reports from\nX-ray images using only 50 translated samples from publicly available data.\nThis model, developed with highly efficient use of limited data, outperformed\nleading models from recent research trained on much larger datasets.\nAdditionally, with only 8 billion parameters, this relatively compact\nfoundation model can be deployed locally within hospitals, making it a\npractical solution for environments where APIs and other external services\ncannot be used due to strict privacy and security requirements.\n","authors":["Kaito Baba","Ryota Yagi","Junichiro Takahashi","Risa Kishikawa","Satoshi Kodera"],"pdf_url":"https://arxiv.org/pdf/2411.09933v1.pdf","comment":"Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical\n Foundation Models: Explainability, Robustness, Security, and Beyond"},{"id":"http://arxiv.org/abs/2406.10942v3","updated":"2024-11-15T03:48:07Z","published":"2024-06-16T13:44:41Z","title":"Effective Generative AI: The Human-Algorithm 
Centaur","summary":" Advanced analytics science methods have enabled combining the power of\nartificial and human intelligence, creating \\textit{centaurs} that allow\nsuperior decision-making. Centaurs are hybrid human-algorithm models that\ncombine both formal analytics and human intuition in a symbiotic manner within\ntheir learning and reasoning process. We argue that the future of AI\ndevelopment and use in many domains needs to focus more on centaurs as opposed\nto other AI approaches. This paradigm shift towards centaur-based AI methods\nraises some fundamental questions: How are centaurs different from other\nhuman-in-the-loop methods? What are the most effective methods for creating\ncentaurs? When should centaurs be used, and when should the lead be given to\npure AI models? Doesn't the incorporation of human intuition -- which at times\ncan be misleading -- in centaurs' decision-making process degrade its\nperformance compared to pure AI methods? This work aims to address these\nfundamental questions, focusing on recent advancements in generative AI, and\nespecially in Large Language Models (LLMs), as a main case study to illustrate\ncentaurs' critical essentiality to future AI endeavors.\n","authors":["Soroush Saghafian","Lihi Idan"],"pdf_url":"https://arxiv.org/pdf/2406.10942v3.pdf","comment":"To Appear in SI: Future Shock, Harvard Data Science Review\n (https://hdsr.mitpress.mit.edu/specialissue5)"},{"id":"http://arxiv.org/abs/2411.09921v1","updated":"2024-11-15T03:45:09Z","published":"2024-11-15T03:45:09Z","title":"Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at\n Pixel Level","summary":" In this paper, we introduce Motion-Grounded Video Reasoning, a new motion\nunderstanding task that requires generating visual answers (video segmentation\nmasks) according to the input question, and hence needs implicit spatiotemporal\nreasoning and grounding. 
This task extends existing spatiotemporal grounding\nwork focusing on explicit action/motion grounding, to a more general format by\nenabling implicit reasoning via questions. To facilitate the development of the\nnew task, we collect a large-scale dataset called GROUNDMORE, which comprises\n1,715 video clips, 249K object masks that are deliberately designed with 4\nquestion types (Causal, Sequential, Counterfactual, and Descriptive) for\nbenchmarking deep and comprehensive motion reasoning abilities. GROUNDMORE\nuniquely requires models to generate visual answers, providing a more concrete\nand visually interpretable response than plain texts. It evaluates models on\nboth spatiotemporal grounding and reasoning, fostering to address complex\nchallenges in motion-related video reasoning, temporal perception, and\npixel-level understanding. Furthermore, we introduce a novel baseline model\nnamed Motion-Grounded Video Reasoning Assistant (MORA). MORA incorporates the\nmultimodal reasoning ability from the Multimodal LLM, the pixel-level\nperception capability from the grounding model (SAM), and the temporal\nperception ability from a lightweight localization head. MORA achieves\nrespectable performance on GROUNDMORE outperforming the best existing visual\ngrounding baseline model by an average of 21.5% relatively. 
We hope this novel\nand challenging task will pave the way for future advancements in robust and\ngeneral motion understanding via video reasoning segmentation\n","authors":["Andong Deng","Tongjia Chen","Shoubin Yu","Taojiannan Yang","Lincoln Spencer","Yapeng Tian","Ajmal Saeed Mian","Mohit Bansal","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17743v3","updated":"2024-11-15T03:25:40Z","published":"2024-05-28T01:55:35Z","title":"ORLM: A Customizable Framework in Training Large Models for Automated\n Optimization Modeling","summary":" Optimization modeling and solving play a critical role in the application of\nOperations Research (OR) tools to address real-world problems, yet they pose\nchallenges and require extensive expertise from OR experts. With the advent of\nlarge language models (LLMs), new opportunities have emerged to streamline and\nautomate these tasks. However, current research predominantly relies on\nclosed-source LLMs such as GPT-4, along with extensive prompt engineering\ntechniques. This reliance stems from the scarcity of high-quality training\ndatasets for optimization modeling, resulting in elevated costs, prolonged\nprocessing times, and privacy concerns. To address these challenges, our work\nis the first to propose a viable path for training open-source LLMs that are\ncapable of optimization modeling as well as developing and executing solver\ncodes, eventually leading to a superior ability for automating optimization\nmodeling and solving. Particularly, we introduce a semi-automated data\nsynthesis framework designed for optimization modeling issues, named\nOR-Instruct. This framework merges the training data requirements of large\nmodels with the unique characteristics of optimization modeling problems, and\nallows for customizable enhancements tailored to specific scenarios or modeling\ntypes. 
To evaluate the performance of our proposed framework, we present the\nIndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in\nsolving practical OR problems. Utilizing data synthesized through OR-Instruct,\nwe train various open-source LLMs with a capacity of 7 billion parameters\n(dubbed ORLMs). The resulting model demonstrates significantly enhanced\noptimization modeling capabilities, achieving state-of-the-art performance\nacross the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are\navailable at \\url{https://github.com/Cardinal-Operations/ORLM}.\n","authors":["Chenyu Huang","Zhengyang Tang","Dongdong Ge","Shixi Hu","Ruoqing Jiang","Benyou Wang","Zizhuo Wang","Xin Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.17743v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.09909v1","updated":"2024-11-15T03:11:19Z","published":"2024-11-15T03:11:19Z","title":"AMXFP4: Taming Activation Outliers with Asymmetric Microscaling\n Floating-Point for 4-bit LLM Inference","summary":" Scaling Large Language Models (LLMs) with extended context lengths has\nincreased the need for efficient low-bit quantization to manage their\nsubstantial computational demands. However, reducing precision to 4 bits\nfrequently degrades performance due to activation outliers. To address this, we\npropose Asymmetric Microscaling 4-bit Floating-Point (AMXFP4) for efficient LLM\ninference. This novel data format leverages asymmetric shared scales to\nmitigate outliers while naturally capturing the asymmetry introduced by\ngroup-wise quantization. Unlike conventional 4-bit quantization methods that\nrely on data rotation and costly calibration, AMXFP4 uses asymmetric shared\nscales for direct 4-bit casting, achieving near-ideal quantization accuracy\nacross various LLM tasks, including multi-turn conversations, long-context\nreasoning, and visual question answering. 
Our AMXFP4 format significantly\noutperforms MXFP4 and other leading quantization techniques, enabling robust,\ncalibration-free 4-bit inference.\n","authors":["Janghwan Lee","Jiwoong Park","Jinseok Kim","Yongjik Kim","Jungju Oh","Jinwook Oh","Jungwook Choi"],"pdf_url":"https://arxiv.org/pdf/2411.09909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00461v2","updated":"2024-11-15T03:01:59Z","published":"2024-11-01T09:18:38Z","title":"A Multi-Granularity Supervised Contrastive Framework for Remaining\n Useful Life Prediction of Aero-engines","summary":" Accurate remaining useful life (RUL) predictions are critical to the safe\noperation of aero-engines. Currently, the RUL prediction task is mainly a\nregression paradigm with only mean square error as the loss function and lacks\nresearch on feature space structure, the latter of which has shown excellent\nperformance in a large number of studies. This paper develops a\nmulti-granularity supervised contrastive (MGSC) framework from plain intuition\nthat samples with the same RUL label should be aligned in the feature space,\nand address the problems of too large minibatch size and unbalanced samples in\nthe implementation. The RUL prediction with MGSC is implemented on using the\nproposed multi-phase training strategy. 
This paper also demonstrates a simple\nand scalable basic network structure and validates the proposed MGSC strategy\non the CMPASS dataset using a convolutional long short-term memory network as a\nbaseline, which effectively improves the accuracy of RUL prediction.\n","authors":["Zixuan He","Ziqian Kong","Zhengyu Chen","Yuling Zhan","Zijun Que","Zhengguo Xu"],"pdf_url":"https://arxiv.org/pdf/2411.00461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17601v3","updated":"2024-11-15T02:56:58Z","published":"2024-09-26T07:35:23Z","title":"CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for\n Backdoor Defense in Contrastive Learning","summary":" Pre-trained large models for multimodal contrastive learning, such as CLIP,\nhave been widely recognized in the industry as highly susceptible to\ndata-poisoned backdoor attacks. This poses significant risks to downstream\nmodel training. In response to such potential threats, finetuning offers a\nsimpler and more efficient defense choice compared to retraining large models\nwith augmented data. In the supervised learning domain, fine-tuning defense\nstrategies can achieve excellent defense performance. However, in the\nunsupervised and semi-supervised domain, we find that when CLIP faces some\ncomplex attack techniques, the existing fine-tuning defense strategy,\nCleanCLIP, has some limitations on defense performance. The synonym\nsubstitution of its text-augmentation is insufficient to enhance the text\nfeature space. To compensate for this weakness, we improve it by proposing a\nfine-grained \\textbf{T}ext \\textbf{A}lignment \\textbf{C}leaner (TA-Cleaner) to\ncut off feature connections of backdoor triggers. We randomly select a few\nsamples for positive and negative subtext generation at each epoch of\nCleanCLIP, and align the subtexts to the images to strengthen the text\nself-supervision. 
We evaluate the effectiveness of our TA-Cleaner against six\nattack algorithms and conduct comprehensive zero-shot classification tests on\nImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves\nstate-of-the-art defensiveness among finetuning-based defense techniques. Even\nwhen faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms\nCleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\\% and 63.88\\%,\nrespectively.\n","authors":["Yuan Xun","Siyuan Liang","Xiaojun Jia","Xinwei Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.17601v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09900v1","updated":"2024-11-15T02:46:55Z","published":"2024-11-15T02:46:55Z","title":"Statistical Analysis of Policy Space Compression Problem","summary":" Policy search methods are crucial in reinforcement learning, offering a\nframework to address continuous state-action and partially observable problems.\nHowever, the complexity of exploring vast policy spaces can lead to significant\ninefficiencies. Reducing the policy space through policy compression emerges as\na powerful, reward-free approach to accelerate the learning process. This\ntechnique condenses the policy space into a smaller, representative set while\nmaintaining most of the original effectiveness. Our research focuses on\ndetermining the necessary sample size to learn this compressed set accurately.\nWe employ R\\'enyi divergence to measure the similarity between true and\nestimated policy distributions, establishing error bounds for good\napproximations. To simplify the analysis, we employ the $l_1$ norm, determining\nsample size requirements for both model-based and model-free settings. 
Finally,\nwe correlate the error bounds from the $l_1$ norm with those from R\\'enyi\ndivergence, distinguishing between policies near the vertices and those in the\nmiddle of the policy space, to determine the lower and upper bounds for the\nrequired sample sizes.\n","authors":["Majid Molaei","Marcello Restelli","Alberto Maria Metelli","Matteo Papini"],"pdf_url":"https://arxiv.org/pdf/2411.09900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09891v1","updated":"2024-11-15T02:35:20Z","published":"2024-11-15T02:35:20Z","title":"Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward\n Augmented Imitation","summary":" Training a policy in a source domain for deployment in the target domain\nunder a dynamics shift can be challenging, often resulting in performance\ndegradation. Previous work tackles this challenge by training on the source\ndomain with modified rewards derived by matching distributions between the\nsource and the target optimal trajectories. However, pure modified rewards only\nensure the behavior of the learned policy in the source domain resembles\ntrajectories produced by the target optimal policies, which does not guarantee\noptimal performance when the learned policy is actually deployed to the target\ndomain. In this work, we propose to utilize imitation learning to transfer the\npolicy learned from the reward modification to the target domain so that the\nnew policy can generate the same trajectories in the target domain. 
Our\napproach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL),\nutilizes the reward modification for domain adaptation and follows the general\nframework of generative adversarial imitation learning from observation (GAIfO)\nby applying a reward augmented estimator for the policy optimization step.\nTheoretically, we present an error bound for our method under a mild assumption\nregarding the dynamics shift to justify the motivation of our method.\nEmpirically, our method outperforms the pure modified reward method without\nimitation learning and also outperforms other baselines in benchmark\noff-dynamics environments.\n","authors":["Yihong Guo","Yixuan Wang","Yuanyuan Shi","Pan Xu","Anqi Liu"],"pdf_url":"https://arxiv.org/pdf/2411.09891v1.pdf","comment":"Published at Neurips 2024"},{"id":"http://arxiv.org/abs/2406.18027v2","updated":"2024-11-15T02:07:34Z","published":"2024-06-26T02:49:28Z","title":"Automated Clinical Data Extraction with Knowledge Conditioned LLMs","summary":" The extraction of lung lesion information from clinical and medical imaging\nreports is crucial for research on and clinical care of lung-related diseases.\nLarge language models (LLMs) can be effective at interpreting unstructured text\nin reports, but they often hallucinate due to a lack of domain-specific\nknowledge, leading to reduced accuracy and posing challenges for use in\nclinical settings. To address this, we propose a novel framework that aligns\ngenerated internal knowledge with external knowledge through in-context\nlearning (ICL). Our framework employs a retriever to identify relevant units of\ninternal or external knowledge and a grader to evaluate the truthfulness and\nhelpfulness of the retrieved internal-knowledge rules, to align and update the\nknowledge bases. 
Experiments with expert-curated test datasets demonstrate that\nthis ICL approach can increase the F1 score for key fields (lesion size, margin\nand solidity) by an average of 12.9% over existing ICL methods.\n","authors":["Diya Li","Asim Kadav","Aijing Gao","Rui Li","Richard Bourgon"],"pdf_url":"https://arxiv.org/pdf/2406.18027v2.pdf","comment":"COLING25 Industry Track"},{"id":"http://arxiv.org/abs/2411.09874v1","updated":"2024-11-15T01:49:17Z","published":"2024-11-15T01:49:17Z","title":"A Hybrid Artificial Intelligence System for Automated EEG Background\n Analysis and Report Generation","summary":" Electroencephalography (EEG) plays a crucial role in the diagnosis of various\nneurological disorders. However, small hospitals and clinics often lack\nadvanced EEG signal analysis systems and are prone to misinterpretation in\nmanual EEG reading. This study proposes an innovative hybrid artificial\nintelligence (AI) system for automatic interpretation of EEG background\nactivity and report generation. The system combines deep learning models for\nposterior dominant rhythm (PDR) prediction, unsupervised artifact removal, and\nexpert-designed algorithms for abnormality detection. For PDR prediction, 1530\nlabeled EEGs were used, and the best ensemble model achieved a mean absolute\nerror (MAE) of 0.237, a root mean square error (RMSE) of 0.359, an accuracy of\n91.8% within a 0.6Hz error, and an accuracy of 99% within a 1.2Hz error. The AI\nsystem significantly outperformed neurologists in detecting generalized\nbackground slowing (p = 0.02; F1: AI 0.93, neurologists 0.82) and demonstrated\nimproved focal abnormality detection, although not statistically significant (p\n= 0.79; F1: AI 0.71, neurologists 0.55). 
Validation on both an internal dataset\nand the Temple University Abnormal EEG Corpus showed consistent performance\n(F1: 0.884 and 0.835, respectively; p = 0.66), demonstrating generalizability.\nThe use of large language models (LLMs) for report generation demonstrated 100%\naccuracy, verified by three other independent LLMs. This hybrid AI system\nprovides an easily scalable and accurate solution for EEG interpretation in\nresource-limited settings, assisting neurologists in improving diagnostic\naccuracy and reducing misdiagnosis rates.\n","authors":["Chin-Sung Tung","Sheng-Fu Liang","Shu-Feng Chang","Chung-Ping Young"],"pdf_url":"https://arxiv.org/pdf/2411.09874v1.pdf","comment":"Example code available at https://github.com/tcs211/AI_EEEG_REPORT"},{"id":"http://arxiv.org/abs/2305.15608v2","updated":"2024-11-15T01:43:35Z","published":"2023-05-24T22:51:52Z","title":"Semantic Segmentation by Semantic Proportions","summary":" Semantic segmentation is a critical task in computer vision aiming to\nidentify and classify individual pixels in an image, with numerous applications\nin for example autonomous driving and medical image analysis. However, semantic\nsegmentation can be highly challenging particularly due to the need for large\namounts of annotated data. Annotating images is a time-consuming and costly\nprocess, often requiring expert knowledge and significant effort; moreover,\nsaving the annotated images could dramatically increase the storage space. In\nthis paper, we propose a novel approach for semantic segmentation, requiring\nthe rough information of individual semantic class proportions, shortened as\nsemantic proportions, rather than the necessity of ground-truth segmentation\nmaps. 
This greatly simplifies the data annotation process and thus will\nsignificantly reduce the annotation time, cost and storage space, opening up\nnew possibilities for semantic segmentation tasks where obtaining the full\nground-truth segmentation maps may not be feasible or practical. Our proposed\nmethod of utilising semantic proportions can (i) further be utilised as a\nbooster in the presence of ground-truth segmentation maps to gain performance\nwithout extra data and model complexity, and (ii) also be seen as a\nparameter-free plug-and-play module, which can be attached to existing deep\nneural networks designed for semantic segmentation. Extensive experimental\nresults demonstrate the good performance of our method compared to benchmark\nmethods that rely on ground-truth segmentation maps. Utilising semantic\nproportions suggested in this work offers a promising direction for future\nsemantic segmentation research.\n","authors":["Halil Ibrahim Aysel","Xiaohao Cai","Adam Prügel-Bennett"],"pdf_url":"https://arxiv.org/pdf/2305.15608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19715v2","updated":"2024-11-15T01:01:44Z","published":"2024-10-25T17:35:03Z","title":"Adversarial Environment Design via Regret-Guided Diffusion Models","summary":" Training agents that are robust to environmental changes remains a\nsignificant challenge in deep reinforcement learning (RL). Unsupervised\nenvironment design (UED) has recently emerged to address this issue by\ngenerating a set of training environments tailored to the agent's capabilities.\nWhile prior works demonstrate that UED has the potential to learn a robust\npolicy, their performance is constrained by the capabilities of the environment\ngeneration. To this end, we propose a novel UED algorithm, adversarial\nenvironment design via regret-guided diffusion models (ADD). 
The proposed\nmethod guides the diffusion-based environment generator with the regret of the\nagent to produce environments that the agent finds challenging but conducive to\nfurther improvement. By exploiting the representation power of diffusion\nmodels, ADD can directly generate adversarial environments while maintaining\nthe diversity of training environments, enabling the agent to effectively learn\na robust policy. Our experimental results demonstrate that the proposed method\nsuccessfully generates an instructive curriculum of environments, outperforming\nUED baselines in zero-shot generalization across novel, out-of-distribution\nenvironments. Project page: https://rllab-snu.github.io/projects/ADD\n","authors":["Hojun Chung","Junseo Lee","Minsoo Kim","Dohyeong Kim","Songhwai Oh"],"pdf_url":"https://arxiv.org/pdf/2410.19715v2.pdf","comment":"38th Conference on Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2410.21564v3","updated":"2024-11-15T00:32:50Z","published":"2024-10-28T21:54:44Z","title":"Mitigating Gradient Overlap in Deep Residual Networks with Gradient\n Normalization for Improved Non-Convex Optimization","summary":" In deep learning, Residual Networks (ResNets) have proven effective in\naddressing the vanishing gradient problem, allowing for the successful training\nof very deep networks. However, skip connections in ResNets can lead to\ngradient overlap, where gradients from both the learned transformation and the\nskip connection combine, potentially resulting in overestimated gradients. This\noverestimation can cause inefficiencies in optimization, as some updates may\novershoot optimal regions, affecting weight updates. To address this, we\nexamine Z-score Normalization (ZNorm) as a technique to manage gradient\noverlap. ZNorm adjusts the gradient scale, standardizing gradients across\nlayers and reducing the negative impact of overlapping gradients. 
Our\nexperiments demonstrate that ZNorm improves training process, especially in\nnon-convex optimization scenarios common in deep learning, where finding\noptimal solutions is challenging. These findings suggest that ZNorm can affect\nthe gradient flow, enhancing performance in large-scale data processing where\naccuracy is critical.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2410.21564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09852v1","updated":"2024-11-15T00:20:36Z","published":"2024-11-15T00:20:36Z","title":"InterFormer: Towards Effective Heterogeneous Interaction Learning for\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction, which predicts the probability of a user\nclicking an ad, is a fundamental task in recommender systems. The emergence of\nheterogeneous information, such as user profile and behavior sequences, depicts\nuser interests from different aspects. A mutually beneficial integration of\nheterogeneous information is the cornerstone towards the success of CTR\nprediction. However, most of the existing methods suffer from two fundamental\nlimitations, including (1) insufficient inter-mode interaction due to the\nunidirectional information flow between modes, and (2) aggressive information\naggregation caused by early summarization, resulting in excessive information\nloss. To address the above limitations, we propose a novel module named\nInterFormer to learn heterogeneous information interaction in an interleaving\nstyle. To achieve better interaction learning, InterFormer enables\nbidirectional information flow for mutually beneficial learning across\ndifferent modes. To avoid aggressive information aggregation, we retain\ncomplete information in each data mode and use a separate bridging arch for\neffective information selection and summarization. 
Our proposed InterFormer\nachieves state-of-the-art performance on three public datasets and a\nlarge-scale industrial dataset.\n","authors":["Zhichen Zeng","Xiaolong Liu","Mengyue Hang","Xiaoyi Liu","Qinghai Zhou","Chaofei Yang","Yiqun Liu","Yichen Ruan","Laming Chen","Yuxin Chen","Yujia Hao","Jiaqi Xu","Jade Nie","Xi Liu","Buyun Zhang","Wei Wen","Siyang Yuan","Kai Wang","Wen-Yen Chen","Yiping Han","Huayu Li","Chunzhi Yang","Bo Long","Philip S. Yu","Hanghang Tong","Jiyan Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09852v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.18856v2","updated":"2024-11-15T00:15:18Z","published":"2024-10-24T15:41:56Z","title":"Demystifying Large Language Models for Medicine: A Primer","summary":" Large language models (LLMs) represent a transformative class of AI tools\ncapable of revolutionizing various aspects of healthcare by generating\nhuman-like responses across diverse contexts and adapting to novel tasks\nfollowing human instructions. Their potential application spans a broad range\nof medical tasks, such as clinical documentation, matching patients to clinical\ntrials, and answering medical questions. In this primer paper, we propose an\nactionable guideline to help healthcare professionals more efficiently utilize\nLLMs in their work, along with a set of best practices. This approach consists\nof several main phases, including formulating the task, choosing LLMs, prompt\nengineering, fine-tuning, and deployment. We start with the discussion of\ncritical considerations in identifying healthcare tasks that align with the\ncore capabilities of LLMs and selecting models based on the selected task and\ndata, performance requirements, and model interface. We then review the\nstrategies, such as prompt engineering and fine-tuning, to adapt standard LLMs\nto specialized medical tasks. 
Deployment considerations, including regulatory\ncompliance, ethical guidelines, and continuous monitoring for fairness and\nbias, are also discussed. By providing a structured step-by-step methodology,\nthis tutorial aims to equip healthcare professionals with the tools necessary\nto effectively integrate LLMs into clinical practice, ensuring that these\npowerful technologies are applied in a safe, reliable, and impactful manner.\n","authors":["Qiao Jin","Nicholas Wan","Robert Leaman","Shubo Tian","Zhizheng Wang","Yifan Yang","Zifeng Wang","Guangzhi Xiong","Po-Ting Lai","Qingqing Zhu","Benjamin Hou","Maame Sarfo-Gyamfi","Gongbo Zhang","Aidan Gilson","Balu Bhasuran","Zhe He","Aidong Zhang","Jimeng Sun","Chunhua Weng","Ronald M. Summers","Qingyu Chen","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2410.18856v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.10930v3","updated":"2024-11-15T00:09:44Z","published":"2024-01-31T17:52:52Z","title":"ConSmax: Hardware-Friendly Alternative Softmax with Learnable Parameters","summary":" The self-attention mechanism distinguishes transformer-based large language\nmodels (LLMs) apart from convolutional and recurrent neural networks. Despite\nthe performance improvement, achieving real-time LLM inference on silicon\nremains challenging due to the extensive use of Softmax in self-attention. In\naddition to the non-linearity, the low arithmetic intensity significantly\nlimits processing parallelism, especially when working with longer contexts. To\naddress this challenge, we propose Constant Softmax (ConSmax), a\nsoftware-hardware co-design that serves as an efficient alternative to Softmax.\nConSmax utilizes differentiable normalization parameters to eliminate the need\nfor maximum searching and denominator summation in Softmax. This approach\nenables extensive parallelization while still executing the essential functions\nof Softmax. 
Moreover, a scalable ConSmax hardware design with a bitwidth-split\nlook-up table (LUT) can achieve lossless non-linear operations and support\nmixed-precision computing. Experimental results show that ConSmax achieves a\nminuscule power consumption of 0.2mW and an area of 0.0008mm^2 at 1250MHz\nworking frequency in 16nm FinFET technology. For open-source contribution, we\nfurther implement our design with the OpenROAD toolchain under SkyWater's 130nm\nCMOS technology. The corresponding power is 2.69mW and the area is 0.007mm^2.\nConSmax achieves 3.35x power savings and 2.75x area savings in 16nm technology,\nand 3.15x power savings and 4.14x area savings with the open-source EDA\ntoolchain. In the meantime, it also maintains comparable accuracy on the GPT-2\nmodel and the WikiText103 dataset. The project is available at\nhttps://github.com/ReaLLMASIC/ConSmax\n","authors":["Shiwei Liu","Guanchen Tao","Yifei Zou","Derek Chow","Zichen Fan","Kauna Lei","Bangfei Pan","Dennis Sylvester","Gregory Kielian","Mehdi Saligane"],"pdf_url":"https://arxiv.org/pdf/2402.10930v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09850v1","updated":"2024-11-15T00:06:57Z","published":"2024-11-15T00:06:57Z","title":"Enhancing Diffusion Posterior Sampling for Inverse Problems by\n Integrating Crafted Measurements","summary":" Diffusion models have emerged as a powerful foundation model for visual\ngeneration. With an appropriate sampling process, it can effectively serve as a\ngenerative prior to solve general inverse problems. Current posterior sampling\nbased methods take the measurement (i.e., degraded image sample) into the\nposterior sampling to infer the distribution of the target data (i.e., clean\nimage sample). However, in this manner, we show that high-frequency information\ncan be prematurely introduced during the early stages, which could induce\nlarger posterior estimate errors during the restoration sampling. 
To address\nthis issue, we first reveal that forming the log posterior gradient with the\nnoisy measurement ( i.e., samples from a diffusion forward process) instead of\nthe clean one can benefit the reverse process. Consequently, we propose a novel\ndiffusion posterior sampling method DPS-CM, which incorporates a Crafted\nMeasurement (i.e., samples generated by a reverse denoising process, compared\nto random sampling with noise in standard methods) to form the posterior\nestimate. This integration aims to mitigate the misalignment with the diffusion\nprior caused by cumulative posterior estimate errors. Experimental results\ndemonstrate that our approach significantly improves the overall capacity to\nsolve general and noisy inverse problems, such as Gaussian deblurring,\nsuper-resolution, inpainting, nonlinear deblurring, and tasks with Poisson\nnoise, relative to existing approaches.\n","authors":["Shijie Zhou","Huaisheng Zhu","Rohan Sharma","Ruiyi Zhang","Kaiyi Ji","Changyou Chen"],"pdf_url":"https://arxiv.org/pdf/2411.09850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06683v6","updated":"2024-11-15T00:00:24Z","published":"2023-01-17T03:53:29Z","title":"From Isolation to Collaboration: Federated Class-Heterogeneous Learning\n for Chest X-Ray Classification","summary":" Federated learning (FL) is a promising paradigm to collaboratively train a\nglobal chest x-ray (CXR) classification model using distributed datasets while\npreserving patient privacy. A significant, yet relatively underexplored,\nchallenge in FL is class-heterogeneity, where clients have different sets of\nclasses. We propose surgical aggregation, a FL method that uses selective\naggregation to collaboratively train a global model using distributed,\nclass-heterogeneous datasets. Unlike other methods, our method does not rely on\nthe assumption that clients share the same classes as other clients, know the\nclasses of other clients, or have access to a fully annotated dataset. 
We\nevaluate surgical aggregation using class-heterogeneous CXR datasets across IID\nand non-IID settings. Our results show that our method outperforms current\nmethods and has better generalizability.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2301.06683v6.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.10442v1","updated":"2024-11-15T18:59:27Z","published":"2024-11-15T18:59:27Z","title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via\n Mixed Preference Optimization","summary":" Existing open-source multimodal large language models (MLLMs) generally\nfollow a training process involving pre-training and supervised fine-tuning.\nHowever, these models suffer from distribution shifts, which limit their\nmultimodal reasoning, particularly in the Chain-of-Thought (CoT) performance.\nTo address this, we introduce a preference optimization (PO) process to enhance\nthe multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data\nside, we design an automated preference data construction pipeline to create\nMMPR, a high-quality, large-scale multimodal reasoning preference dataset. and\n(2) on the model side, we explore integrating PO with MLLMs, developing a\nsimple yet effective method, termed Mixed Preference Optimization (MPO), which\nboosts multimodal CoT performance. Our approach demonstrates improved\nperformance across multiple benchmarks, particularly in multimodal reasoning\ntasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on\nMathVista, outperforming InternVL2-8B by 8.7 points and achieving performance\ncomparable to the 10x larger InternVL2-76B. We hope this study could inspire\nfurther advancements in MLLMs. 
Code, data, and model shall be publicly\nreleased.\n","authors":["Weiyun Wang","Zhe Chen","Wenhai Wang","Yue Cao","Yangzhou Liu","Zhangwei Gao","Jinguo Zhu","Xizhou Zhu","Lewei Lu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2411.10442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10436v1","updated":"2024-11-15T18:56:01Z","published":"2024-11-15T18:56:01Z","title":"Mitigating Hallucination in Multimodal Large Language Model via\n Hallucination-targeted Direct Preference Optimization","summary":" Multimodal Large Language Models (MLLMs) are known to hallucinate, which\nlimits their practical applications. Recent works have attempted to apply\nDirect Preference Optimization (DPO) to enhance the performance of MLLMs, but\nhave shown inconsistent improvements in mitigating hallucinations. To address\nthis issue more effectively, we introduce Hallucination-targeted Direct\nPreference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike\nprevious approaches, our method tackles hallucinations from their diverse forms\nand causes. 
Specifically, we develop three types of preference pair data\ntargeting the following causes of MLLM hallucinations: (1) insufficient visual\ncapabilities, (2) long context generation, and (3) multimodal conflicts.\nExperimental results demonstrate that our method achieves superior performance\nacross multiple hallucination evaluation datasets, surpassing most\nstate-of-the-art (SOTA) methods and highlighting the potential of our approach.\nAblation studies and in-depth analyses further confirm the effectiveness of our\nmethod and suggest the potential for further improvements through scaling up.\n","authors":["Yuhan Fu","Ruobing Xie","Xingwu Sun","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2411.10436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10416v1","updated":"2024-11-15T18:35:00Z","published":"2024-11-15T18:35:00Z","title":"Towards Automatic Evaluation of Task-Oriented Dialogue Flows","summary":" Task-oriented dialogue systems rely on predefined conversation schemes\n(dialogue flows) often represented as directed acyclic graphs. These flows can\nbe manually designed or automatically generated from previously recorded\nconversations. Due to variations in domain expertise or reliance on different\nsets of prior conversations, these dialogue flows can manifest in significantly\ndifferent graph structures. Despite their importance, there is no standard\nmethod for evaluating the quality of dialogue flows. We introduce FuDGE (Fuzzy\nDialogue-Graph Edit Distance), a novel metric that evaluates dialogue flows by\nassessing their structural complexity and representational coverage of the\nconversation data. FuDGE measures how well individual conversations align with\na flow and, consequently, how well a set of conversations is represented by the\nflow overall. Through extensive experiments on manually configured flows and\nflows generated by automated techniques, we demonstrate the effectiveness of\nFuDGE and its evaluation framework. 
By standardizing and optimizing dialogue\nflows, FuDGE enables conversational designers and automated techniques to\nachieve higher levels of efficiency and automation.\n","authors":["Mehrnoosh Mirtaheri","Nikhil Varghese","Chandra Khatri","Amol Kelkar"],"pdf_url":"https://arxiv.org/pdf/2411.10416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10414v1","updated":"2024-11-15T18:34:07Z","published":"2024-11-15T18:34:07Z","title":"Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding\n Conversations","summary":" We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for\nhuman-AI conversations that involves image understanding: it can be used to\nsafeguard content for both multimodal LLM inputs (prompt classification) and\noutputs (response classification). Unlike the previous text-only Llama Guard\nversions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed\nto support image reasoning use cases and is optimized to detect harmful\nmultimodal (text and image) prompts and text responses to these prompts. Llama\nGuard 3 Vision is fine-tuned on Llama 3.2-Vision and demonstrates strong\nperformance on the internal benchmarks using the MLCommons taxonomy. We also\ntest its robustness against adversarial attacks. 
We believe that Llama Guard 3\nVision serves as a good starting point to build more capable and robust content\nmoderation tools for human-AI conversation with multimodal capabilities.\n","authors":["Jianfeng Chi","Ujjwal Karn","Hongyuan Zhan","Eric Smith","Javier Rando","Yiming Zhang","Kate Plawiak","Zacharie Delpierre Coudert","Kartikeya Upasani","Mahesh Pasupuleti"],"pdf_url":"https://arxiv.org/pdf/2411.10414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10397v1","updated":"2024-11-15T18:03:52Z","published":"2024-11-15T18:03:52Z","title":"Features that Make a Difference: Leveraging Gradients for Improved\n Dictionary Learning","summary":" Sparse Autoencoders (SAEs) are a promising approach for extracting neural\nnetwork representations by learning a sparse and overcomplete decomposition of\nthe network's internal activations. However, SAEs are traditionally trained\nconsidering only activation values and not the effect those activations have on\ndownstream computations. This limits the information available to learn\nfeatures, and biases the autoencoder towards neglecting features which are\nrepresented with small activation values but strongly influence model outputs.\nTo address this, we introduce Gradient SAEs (g-SAEs), which modify the\n$k$-sparse autoencoder architecture by augmenting the TopK activation function\nto rely on the gradients of the input activation when selecting the $k$\nelements. For a given sparsity level, g-SAEs produce reconstructions that are\nmore faithful to original network performance when propagated through the\nnetwork. Additionally, we find evidence that g-SAEs learn latents that are on\naverage more effective at steering models in arbitrary contexts. By considering\nthe downstream effects of activations, our approach leverages the dual nature\nof neural network features as both $\\textit{representations}$, retrospectively,\nand $\\textit{actions}$, prospectively. 
While previous methods have approached\nthe problem of feature discovery primarily focused on the former aspect, g-SAEs\nrepresent a step towards accounting for the latter as well.\n","authors":["Jeffrey Olmo","Jared Wilson","Max Forsey","Bryce Hepner","Thomas Vin Howe","David Wingate"],"pdf_url":"https://arxiv.org/pdf/2411.10397v1.pdf","comment":"9 pages, 8 figures. Submitted to NAACL 2025"},{"id":"http://arxiv.org/abs/2407.00342v4","updated":"2024-11-15T17:59:10Z","published":"2024-06-29T07:01:51Z","title":"KPC-cF: Aspect-Based Sentiment Analysis via Implicit-Feature Alignment\n with Corpus Filtering","summary":" Investigations into Aspect-Based Sentiment Analysis (ABSA) for Korean\nindustrial reviews are notably lacking in the existing literature. Our research\nproposes an intuitive and effective framework for ABSA in low-resource\nlanguages such as Korean. It optimizes prediction labels by integrating\ntranslated benchmark and unlabeled Korean data. Using a model fine-tuned on\ntranslated data, we pseudo-labeled the actual Korean NLI set. Subsequently, we\napplied LaBSE and \\MSP{}-based filtering to this pseudo-NLI set as implicit\nfeature, enhancing Aspect Category Detection and Polarity determination through\nadditional training. Incorporating dual filtering, this model bridged dataset\ngaps, achieving positive results in Korean ABSA with minimal resources. Through\nadditional data injection pipelines, our approach aims to utilize high-resource\ndata and construct effective models within communities, whether corporate or\nindividual, in low-resource language countries. Compared to English ABSA, our\nframework showed an approximately 3\\% difference in F1 scores and accuracy. 
We\nrelease the dataset and our code for Korean ABSA, at this link.\n","authors":["Kibeom Nam"],"pdf_url":"https://arxiv.org/pdf/2407.00342v4.pdf","comment":"Work in Progress, DMLR@ICML 2024"},{"id":"http://arxiv.org/abs/2411.10371v1","updated":"2024-11-15T17:19:42Z","published":"2024-11-15T17:19:42Z","title":"A Survey of Event Causality Identification: Principles, Taxonomy,\n Challenges, and Assessment","summary":" Event Causality Identification (ECI) has become a crucial task in Natural\nLanguage Processing (NLP), aimed at automatically extracting causalities from\ntextual data. In this survey, we systematically address the foundational\nprinciples, technical frameworks, and challenges of ECI, offering a\ncomprehensive taxonomy to categorize and clarify current research\nmethodologies, as well as a quantitative assessment of existing models. We\nfirst establish a conceptual framework for ECI, outlining key definitions,\nproblem formulations, and evaluation standards. Our taxonomy classifies ECI\nmethods according to the two primary tasks of sentence-level (SECI) and\ndocument-level (DECI) event causality identification. For SECI, we examine\nfeature pattern-based matching, deep semantic encoding, causal knowledge\npre-training and prompt-based fine-tuning, and external knowledge enhancement\nmethods. For DECI, we highlight approaches focused on event graph reasoning and\nprompt-based techniques to address the complexity of cross-sentence causal\ninference. Additionally, we analyze the strengths, limitations, and open\nchallenges of each approach. We further conduct an extensive quantitative\nevaluation of various ECI methods on two benchmark datasets. 
Finally, we\nexplore future research directions, highlighting promising pathways to overcome\ncurrent limitations and broaden ECI applications.\n","authors":["Zefan Zeng","Qing Cheng","Xingchen Hu","Yuehang Si","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.10371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14279v3","updated":"2024-11-15T17:11:08Z","published":"2024-02-22T04:41:52Z","title":"Mitigating the Linguistic Gap with Phonemic Representations for Robust\n Cross-lingual Transfer","summary":" Approaches to improving multilingual language understanding often struggle\nwith significant performance gaps between high-resource and low-resource\nlanguages. While there are efforts to align the languages in a single latent\nspace to mitigate such gaps, how different input-level representations\ninfluence such gaps has not been investigated, particularly with phonemic\ninputs. We hypothesize that the performance gaps are affected by representation\ndiscrepancies between these languages, and revisit the use of phonemic\nrepresentations as a means to mitigate these discrepancies. To demonstrate the\neffectiveness of phonemic representations, we present experiments on three\nrepresentative cross-lingual tasks on 12 languages in total. The results show\nthat phonemic representations exhibit higher similarities between languages\ncompared to orthographic representations, and it consistently outperforms\ngrapheme-based baseline model on languages that are relatively low-resourced.\nWe present quantitative evidence from three cross-lingual tasks that\ndemonstrate the effectiveness of phonemic representations, and it is further\njustified by a theoretical analysis of the cross-lingual performance gap.\n","authors":["Haeji Jung","Changdae Oh","Jooeon Kang","Jimin Sohn","Kyungwoo Song","Jinkyu Kim","David R. 
Mortensen"],"pdf_url":"https://arxiv.org/pdf/2402.14279v3.pdf","comment":"Accepted to the 4th Multilingual Representation Learning (MRL)\n Workshop (co-located with EMNLP 2024)"},{"id":"http://arxiv.org/abs/2405.04412v3","updated":"2024-11-15T16:53:18Z","published":"2024-05-07T15:39:45Z","title":"The Silicon Ceiling: Auditing GPT's Race and Gender Biases in Hiring","summary":" Large language models (LLMs) are increasingly being introduced in workplace\nsettings, with the goals of improving efficiency and fairness. However,\nconcerns have arisen regarding these models' potential to reflect or exacerbate\nsocial biases and stereotypes. This study explores the potential impact of LLMs\non hiring practices. To do so, we conduct an AI audit of race and gender biases\nin one commonly-used LLM, OpenAI's GPT-3.5, taking inspiration from the history\nof traditional offline resume audits. We conduct two studies using names with\nvaried race and gender connotations: resume assessment (Study 1) and resume\ngeneration (Study 2). In Study 1, we ask GPT to score resumes with 32 different\nnames (4 names for each combination of the 2 gender and 4 racial groups) and\ntwo anonymous options across 10 occupations and 3 evaluation tasks (overall\nrating, willingness to interview, and hireability). We find that the model\nreflects some biases based on stereotypes. In Study 2, we prompt GPT to create\nresumes (10 for each name) for fictitious job candidates. When generating\nresumes, GPT reveals underlying biases; women's resumes had occupations with\nless experience, while Asian and Hispanic resumes had immigrant markers, such\nas non-native English and non-U.S. education and work experiences. 
Our findings\ncontribute to a growing body of literature on LLM biases, particularly in\nworkplace contexts.\n","authors":["Lena Armstrong","Abbey Liu","Stephen MacNeil","Danaë Metaxa"],"pdf_url":"https://arxiv.org/pdf/2405.04412v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10329v1","updated":"2024-11-15T16:29:02Z","published":"2024-11-15T16:29:02Z","title":"Safe Text-to-Image Generation: Simply Sanitize the Prompt Embedding","summary":" In recent years, text-to-image (T2I) generation models have made significant\nprogress in generating high-quality images that align with text descriptions.\nHowever, these models also face the risk of unsafe generation, potentially\nproducing harmful content that violates usage policies, such as explicit\nmaterial. Existing safe generation methods typically focus on suppressing\ninappropriate content by erasing undesired concepts from visual\nrepresentations, while neglecting to sanitize the textual representation.\nAlthough these methods help mitigate the risk of misuse to certain extent,\ntheir robustness remains insufficient when dealing with adversarial attacks.\n Given that semantic consistency between input text and output image is a\nfundamental requirement for T2I models, we identify that textual\nrepresentations (i.e., prompt embeddings) are likely the primary source of\nunsafe generation. To this end, we propose a vision-agnostic safe generation\nframework, Embedding Sanitizer (ES), which focuses on erasing inappropriate\nconcepts from prompt embeddings and uses the sanitized embeddings to guide the\nmodel for safe generation. ES is applied to the output of the text encoder as a\nplug-and-play module, enabling seamless integration with different T2I models\nas well as other safeguards. 
In addition, ES's unique scoring mechanism assigns\na score to each token in the prompt to indicate its potential harmfulness, and\ndynamically adjusts the sanitization intensity to balance defensive performance\nand generation quality. Through extensive evaluation on five prompt benchmarks,\nour approach achieves state-of-the-art robustness by sanitizing the source\n(prompt embedding) of unsafe generation compared to nine baseline methods. It\nsignificantly outperforms existing safeguards in terms of interpretability and\ncontrollability while maintaining generation quality.\n","authors":["Huming Qiu","Guanxu Chen","Mi Zhang","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2411.10329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10328v1","updated":"2024-11-15T16:28:25Z","published":"2024-11-15T16:28:25Z","title":"Emotion Detection in Reddit: Comparative Study of Machine Learning and\n Deep Learning Techniques","summary":" Emotion detection is pivotal in human communication, as it significantly\ninfluences behavior, relationships, and decision-making processes. This study\nconcentrates on text-based emotion detection by leveraging the GoEmotions\ndataset, which annotates Reddit comments with 27 distinct emotions. These\nemotions are subsequently mapped to Ekman's six basic categories: joy, anger,\nfear, sadness, disgust, and surprise. We employed a range of models for this\ntask, including six machine learning models, three ensemble models, and a Long\nShort-Term Memory (LSTM) model to determine the optimal model for emotion\ndetection. Results indicate that the Stacking classifier outperforms other\nmodels in accuracy and performance. We also benchmark our models against\nEmoBERTa, a pre-trained emotion detection model, with our Stacking classifier\nproving more effective. 
Finally, the Stacking classifier is deployed via a\nStreamlit web application, underscoring its potential for real-world\napplications in text-based emotion analysis.\n","authors":["Maliheh Alaeddini"],"pdf_url":"https://arxiv.org/pdf/2411.10328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10323v1","updated":"2024-11-15T16:23:52Z","published":"2024-11-15T16:23:52Z","title":"The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer\n Use","summary":" The recently released model, Claude 3.5 Computer Use, stands out as the first\nfrontier AI model to offer computer use in public beta as a graphical user\ninterface (GUI) agent. As an early beta, its capability in the real-world\ncomplex environment remains unknown. In this case study to explore Claude 3.5\nComputer Use, we curate and organize a collection of carefully designed tasks\nspanning a variety of domains and software. Observations from these cases\ndemonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end\nlanguage to desktop actions. Along with this study, we provide an\nout-of-the-box agent framework for deploying API-based GUI automation models\nwith easy implementation. Our case studies aim to showcase a groundwork of\ncapabilities and limitations of Claude 3.5 Computer Use with detailed analyses\nand bring to the fore questions about planning, action, and critic, which must\nbe considered for future improvement. We hope this preliminary exploration will\ninspire future research into the GUI agent community. 
All the test cases in the\npaper can be tried through the project:\nhttps://github.com/showlab/computer_use_ootb.\n","authors":["Siyuan Hu","Mingyu Ouyang","Difei Gao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2411.10323v1.pdf","comment":"40 pages, 21 figures, preprint"},{"id":"http://arxiv.org/abs/2411.10298v1","updated":"2024-11-15T15:55:05Z","published":"2024-11-15T15:55:05Z","title":"Unveiling Topological Structures in Text: A Comprehensive Survey of\n Topological Data Analysis Applications in NLP","summary":" The surge of data available on the internet has led to the adoption of\nvarious computational methods to analyze and extract valuable insights from\nthis wealth of information. Among these, the field of Machine Learning (ML) has\nthrived by leveraging data to extract meaningful insights. However, ML\ntechniques face notable challenges when dealing with real-world data, often due\nto issues of imbalance, noise, insufficient labeling, and high dimensionality.\nTo address these limitations, some researchers advocate for the adoption of\nTopological Data Analysis (TDA), a statistical approach that discerningly\ncaptures the intrinsic shape of data despite noise. Despite its potential, TDA\nhas not gained as much traction within the Natural Language Processing (NLP)\ndomain compared to structurally distinct areas like computer vision.\nNevertheless, a dedicated community of researchers has been exploring the\napplication of TDA in NLP, yielding 85 papers we comprehensively survey in this\npaper. Our findings categorize these efforts into theoretical and\nnontheoretical approaches. Theoretical approaches aim to explain linguistic\nphenomena from a topological viewpoint, while non-theoretical approaches merge\nTDA with ML features, utilizing diverse numerical representation techniques. We\nconclude by exploring the challenges and unresolved questions that persist in\nthis niche field. 
Resources and a list of papers on this topic can be found at:\nhttps://github.com/AdaUchendu/AwesomeTDA4NLP.\n","authors":["Adaku Uchendu","Thai Le"],"pdf_url":"https://arxiv.org/pdf/2411.10298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10272v1","updated":"2024-11-15T15:28:42Z","published":"2024-11-15T15:28:42Z","title":"Scaling Law for Post-training after Model Pruning","summary":" Large language models (LLMs) based on the Transformer architecture are widely\nemployed across various domains and tasks. However, their increasing size\nimposes significant hardware demands, limiting practical deployment. To\nmitigate this, model pruning techniques have been developed to create more\nefficient models while maintaining high performance. Despite this,\npost-training after pruning is crucial for performance recovery and can be\nresource-intensive. This paper investigates the post-training requirements of\npruned LLMs and introduces a scaling law to determine the optimal amount of\npost-training data. Post-training experiments with the Llama-3 and Qwen-2.5\nseries models, pruned using depth pruning, width pruning, and 2:4\nsemi-structured pruning, show that higher pruning ratios necessitate more\npost-training data for performance recovery, whereas larger LLMs require less.\nThe proposed scaling law predicts a model's loss based on its parameter counts\nbefore and after pruning, as well as the post-training token counts.\nFurthermore, we find that the scaling law established from smaller LLMs can be\nreliably extrapolated to larger LLMs. 
This work provides valuable insights into\nthe post-training of pruned LLMs and offers a practical scaling law for\noptimizing post-training data usage.\n","authors":["Xiaodong Chen","Yuxuan Hu","Jing Zhang","Xiaokang Zhang","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10246v1","updated":"2024-11-15T14:57:39Z","published":"2024-11-15T14:57:39Z","title":"Scaling up the Evaluation of Collaborative Problem Solving: Promises and\n Challenges of Coding Chat Data with ChatGPT","summary":" Collaborative problem solving (CPS) is widely recognized as a critical 21st\ncentury skill. Efficiently coding communication data is a big challenge in\nscaling up research on assessing CPS. This paper reports the findings on using\nChatGPT to directly code CPS chat data by benchmarking performance across\nmultiple datasets and coding frameworks. We found that ChatGPT-based coding\noutperformed human coding in tasks where the discussions were characterized by\ncolloquial languages but fell short in tasks where the discussions dealt with\nspecialized scientific terminology and contexts. The findings offer practical\nguidelines for researchers to develop strategies for efficient and scalable\nanalysis of communication data from CPS tasks.\n","authors":["Jiangang Hao","Wenju Cui","Patrick Kyllonen","Emily Kerzabi","Lei Liu","Michael Flor"],"pdf_url":"https://arxiv.org/pdf/2411.10246v1.pdf","comment":"21 pages, 3 figures, 5 tables. Initially report in the edArXiv:xw6kz"},{"id":"http://arxiv.org/abs/2411.10242v1","updated":"2024-11-15T14:55:01Z","published":"2024-11-15T14:55:01Z","title":"Measuring Non-Adversarial Reproduction of Training Data in Large\n Language Models","summary":" Large language models memorize parts of their training data. Memorizing short\nsnippets and facts is required to answer questions about the world and to be\nfluent in any language. 
But models have also been shown to reproduce long\nverbatim sequences of memorized text when prompted by a motivated adversary. In\nthis work, we investigate an intermediate regime of memorization that we call\nnon-adversarial reproduction, where we quantify the overlap between model\nresponses and pretraining data when responding to natural and benign prompts.\nFor a variety of innocuous prompt categories (e.g., writing a letter or a\ntutorial), we show that up to 15% of the text output by popular conversational\nlanguage models overlaps with snippets from the Internet. In worst cases, we\nfind generations where 100% of the content can be found exactly online. For the\nsame tasks, we find that human-written text has far less overlap with Internet\ndata. We further study whether prompting strategies can close this reproduction\ngap between models and humans. While appropriate prompting can reduce\nnon-adversarial reproduction on average, we find that mitigating worst-case\nreproduction of training data requires stronger defenses -- even for benign\ninteractions.\n","authors":["Michael Aerni","Javier Rando","Edoardo Debenedetti","Nicholas Carlini","Daphne Ippolito","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2411.10242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10227v1","updated":"2024-11-15T14:40:59Z","published":"2024-11-15T14:40:59Z","title":"Entropy and type-token ratio in gigaword corpora","summary":" Lexical diversity measures the vocabulary variation in texts. While its\nutility is evident for analyses in language change and applied linguistics, it\nis not yet clear how to operationalize this concept in a unique way. We here\ninvestigate entropy and text-token ratio, two widely employed metrics for\nlexical diversities, in six massive linguistic datasets in English, Spanish,\nand Turkish, consisting of books, news articles, and tweets. 
These gigaword\ncorpora correspond to languages with distinct morphological features and differ\nin registers and genres, thus constituting a diverse testbed for a quantitative\napproach to lexical diversity. Strikingly, we find a functional relation\nbetween entropy and text-token ratio that holds across the corpora under\nconsideration. Further, in the limit of large vocabularies we find an\nanalytical expression that sheds light on the origin of this relation and its\nconnection with both Zipf and Heaps laws. Our results then contribute to the\ntheoretical understanding of text structure and offer practical implications\nfor fields like natural language processing.\n","authors":["Pablo Rosillo-Rodes","Maxi San Miguel","David Sanchez"],"pdf_url":"https://arxiv.org/pdf/2411.10227v1.pdf","comment":"12 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.10172v1","updated":"2024-11-15T13:18:18Z","published":"2024-11-15T13:18:18Z","title":"Increasing the Accessibility of Causal Domain Knowledge via Causal\n Information Extraction Methods: A Case Study in the Semiconductor\n Manufacturing Industry","summary":" The extraction of causal information from textual data is crucial in the\nindustry for identifying and mitigating potential failures, enhancing process\nefficiency, prompting quality improvements, and addressing various operational\nchallenges. This paper presents a study on the development of automated methods\nfor causal information extraction from actual industrial documents in the\nsemiconductor manufacturing industry. The study proposes two types of causal\ninformation extraction methods, single-stage sequence tagging (SST) and\nmulti-stage sequence tagging (MST), and evaluates their performance using\nexisting documents from a semiconductor manufacturing company, including\npresentation slides and FMEA (Failure Mode and Effects Analysis) documents. The\nstudy also investigates the effect of representation learning on downstream\ntasks. 
The presented case study showcases that the proposed MST methods for\nextracting causal information from industrial documents are suitable for\npractical applications, especially for semi structured documents such as FMEAs,\nwith a 93\\% F1 score. Additionally, MST achieves a 73\\% F1 score on texts\nextracted from presentation slides. Finally, the study highlights the\nimportance of choosing a language model that is more aligned with the domain\nand in-domain fine-tuning.\n","authors":["Houssam Razouk","Leonie Benischke","Daniel Garber","Roman Kern"],"pdf_url":"https://arxiv.org/pdf/2411.10172v1.pdf","comment":"17 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.10168v1","updated":"2024-11-15T13:16:11Z","published":"2024-11-15T13:16:11Z","title":"Evaluating the role of `Constitutions' for learning from AI feedback","summary":" The growing capabilities of large language models (LLMs) have led to their\nuse as substitutes for human feedback for training and assessing other LLMs.\nThese methods often rely on `constitutions', written guidelines which a critic\nmodel uses to provide feedback and improve generations. We investigate how the\nchoice of constitution affects feedback quality by using four different\nconstitutions to improve patient-centered communication in medical interviews.\nIn pairwise comparisons conducted by 215 human raters, we found that detailed\nconstitutions led to better results regarding emotive qualities. However, none\nof the constitutions outperformed the baseline in learning more\npractically-oriented skills related to information gathering and provision. Our\nfindings indicate that while detailed constitutions should be prioritised,\nthere are possible limitations to the effectiveness of AI feedback as a reward\nsignal in certain areas.\n","authors":["Saskia Redgate","Andrew M. Bean","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.10168v1.pdf","comment":"4 pages, 2 figures. 
In NeurIPS 2024 Workshop on Language Gamification"},{"id":"http://arxiv.org/abs/2411.10163v1","updated":"2024-11-15T13:12:29Z","published":"2024-11-15T13:12:29Z","title":"Compound-QA: A Benchmark for Evaluating LLMs on Compound Questions","summary":" Large language models (LLMs) demonstrate remarkable performance across\nvarious tasks, prompting researchers to develop diverse evaluation benchmarks.\nHowever, existing benchmarks typically measure the ability of LLMs to respond\nto individual questions, neglecting the complex interactions in real-world\napplications. In this paper, we introduce Compound Question Synthesis (CQ-Syn)\nto create the Compound-QA benchmark, focusing on compound questions with\nmultiple sub-questions. This benchmark is derived from existing QA datasets,\nannotated with proprietary LLMs and verified by humans for accuracy. It\nencompasses five categories: Factual-Statement, Cause-and-Effect,\nHypothetical-Analysis, Comparison-and-Selection, and Evaluation-and-Suggestion.\nIt evaluates the LLM capability in terms of three dimensions including\nunderstanding, reasoning, and knowledge. Our assessment of eight open-source\nLLMs using Compound-QA reveals distinct patterns in their responses to compound\nquestions, which are significantly poorer than those to non-compound questions.\nAdditionally, we investigate various methods to enhance LLMs performance on\ncompound questions. The results indicate that these approaches significantly\nimprove the models' comprehension and reasoning abilities on compound\nquestions.\n","authors":["Yutao Hou","Yajing Luo","Zhiwen Ruan","Hongru Wang","Weifeng Ge","Yun Chen","Guanhua Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14979v5","updated":"2024-11-15T12:46:30Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? 
An Empirical\n Exploration From Cognitive Psychology","summary":" The cognitive mechanism by which Large Language Models (LLMs) solve\nmathematical problems remains a widely debated and unresolved issue. Currently,\nthere is little interpretable experimental evidence that connects LLMs'\nproblem-solving with human cognitive psychology.To determine if LLMs possess\nhuman-like mathematical reasoning, we modified the problems used in the human\nCognitive Reflection Test (CRT). Our results show that, even with the use of\nChains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model\n(noted for its reasoning capabilities), have a high error rate when solving\nthese modified CRT problems. Specifically, the average accuracy rate dropped by\nup to 50% compared to the original questions.Further analysis of LLMs'\nincorrect answers suggests that they primarily rely on pattern matching from\ntheir training data, which aligns more with human intuition (System 1 thinking)\nrather than with human-like reasoning (System 2 thinking). This finding\nchallenges the belief that LLMs have genuine mathematical reasoning abilities\ncomparable to humans. As a result, this work may adjust overly optimistic views\non LLMs' progress towards artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10145v1","updated":"2024-11-15T12:39:02Z","published":"2024-11-15T12:39:02Z","title":"An Effective Framework to Help Large Language Models Handle\n Numeric-involved Long-context Tasks","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nhandling long texts and have almost perfect performance in traditional\nretrieval tasks. However, their performance significantly degrades when it\ncomes to numerical calculations in the long-context. 
Numeric-involved\nlong-context tasks typically cannot be addressed by current LLMs in normal\nsettings due to their inherent limitations in simultaneously handling complex\nand massive information. Some CoT like prompting methods can improve accuracy\nbut demands massive output tokens, which is costly and slow. To address this\nissue, we propose a workflow, which decompose a numeric-involved long-context\ntask into 4 low-level subtasks: judging, extracting and processing with code\nand conclusion. The former 2 subtasks is relatively simple, which allows us to\nuse smaller models for efficiently processing long context. When numerical\ncalculations are required, we use code generated by LLMs to avoid the\ndisadvantage of LLM not being good at calculations. The results in 2\nnumeric-involved long-context benchmarks demonstrate our workflow can not only\nimprove accuracy, but also significantly reduce the cost of API calls.\n","authors":["Yijiong Yu"],"pdf_url":"https://arxiv.org/pdf/2411.10145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10137v1","updated":"2024-11-15T12:23:12Z","published":"2024-11-15T12:23:12Z","title":"Legal Evalutions and Challenges of Large Language Models","summary":" In this paper, we review legal testing methods based on Large Language Models\n(LLMs), using the OPENAI o1 model as a case study to evaluate the performance\nof large models in applying legal provisions. We compare current\nstate-of-the-art LLMs, including open-source, closed-source, and legal-specific\nmodels trained specifically for the legal domain. Systematic tests are\nconducted on English and Chinese legal cases, and the results are analyzed in\ndepth. Through systematic testing of legal cases from common law systems and\nChina, this paper explores the strengths and weaknesses of LLMs in\nunderstanding and applying legal texts, reasoning through legal issues, and\npredicting judgments. 
The experimental results highlight both the potential and\nlimitations of LLMs in legal applications, particularly in terms of challenges\nrelated to the interpretation of legal language and the accuracy of legal\nreasoning. Finally, the paper provides a comprehensive analysis of the\nadvantages and disadvantages of various types of models, offering valuable\ninsights and references for the future application of AI in the legal field.\n","authors":["Jiaqi Wang","Huan Zhao","Zhenyuan Yang","Peng Shu","Junhao Chen","Haobo Sun","Ruixi Liang","Shixin Li","Pengcheng Shi","Longjun Ma","Zongjia Liu","Zhengliang Liu","Tianyang Zhong","Yutong Zhang","Chong Ma","Xin Zhang","Tuo Zhang","Tianli Ding","Yudan Ren","Tianming Liu","Xi Jiang","Shu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10129v1","updated":"2024-11-15T12:01:38Z","published":"2024-11-15T12:01:38Z","title":"Prompting and Fine-tuning Large Language Models for Automated Code\n Review Comment Generation","summary":" Generating accurate code review comments remains a significant challenge due\nto the inherently diverse and non-unique nature of the task output. Large\nlanguage models pretrained on both programming and natural language data tend\nto perform well in code-oriented tasks. However, large-scale pretraining is not\nalways feasible due to its environmental impact and project-specific\ngeneralizability issues. In this work, first we fine-tune open-source Large\nlanguage models (LLM) in parameter-efficient, quantized low-rank (QLoRA)\nfashion on consumer-grade hardware to improve review comment generation. Recent\nstudies demonstrate the efficacy of augmenting semantic metadata information\ninto prompts to boost performance in other code-related tasks. 
To explore this\nin code review activities, we also prompt proprietary, closed-source LLMs\naugmenting the input code patch with function call graphs and code summaries.\nBoth of our strategies improve the review comment generation performance, with\nfunction call graph augmented few-shot prompting on the GPT-3.5 model\nsurpassing the pretrained baseline by around 90% BLEU-4 score on the\nCodeReviewer dataset. Moreover, few-shot prompted Gemini-1.0 Pro, QLoRA\nfine-tuned Code Llama and Llama 3.1 models achieve competitive results (ranging\nfrom 25% to 83% performance improvement) on this task. An additional human\nevaluation study further validates our experimental findings, reflecting\nreal-world developers' perceptions of LLM-generated code review comments based\non relevant qualitative metrics.\n","authors":["Md. Asif Haider","Ayesha Binte Mostofa","Sk. Sabit Bin Mosaddek","Anindya Iqbal","Toufique Ahmed"],"pdf_url":"https://arxiv.org/pdf/2411.10129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10115v1","updated":"2024-11-15T11:29:31Z","published":"2024-11-15T11:29:31Z","title":"Memorization in Attention-only Transformers","summary":" Recent research has explored the memorization capacity of multi-head\nattention, but these findings are constrained by unrealistic limitations on the\ncontext size. We present a novel proof for language-based Transformers that\nextends the current hypothesis to any context size. Our approach improves upon\nthe state-of-the-art by achieving more effective exact memorization with an\nattention layer, while also introducing the concept of approximate memorization\nof distributions. 
Through experimental validation, we demonstrate that our\nproposed bounds more accurately reflect the true memorization capacity of\nlanguage models, and provide a precise comparison with prior work.\n","authors":["Léo Dana","Muni Sreenivas Pydi","Yann Chevaleyre"],"pdf_url":"https://arxiv.org/pdf/2411.10115v1.pdf","comment":"16 pages, 6 figures, submitted to AISTATS 2025,"},{"id":"http://arxiv.org/abs/2411.09510v2","updated":"2024-11-15T10:47:37Z","published":"2024-11-14T15:19:01Z","title":"Communication Compression for Tensor Parallel LLM Inference","summary":" Large Language Models (LLMs) have pushed the frontier of artificial\nintelligence but are comprised of hundreds of billions of parameters and\noperations. For faster inference latency, LLMs are deployed on multiple\nhardware accelerators through various Model Parallelism strategies. Our paper\nlooks into the details on one such strategy - Tensor Parallel - and proposes to\nreduce latency by compressing inter-accelerator communication. We leverage fine\ngrained quantization techniques to compress selected activations by 3.5 - 4.5x.\nOur proposed method leads up to 2x reduction of time-to-first-token (TTFT) with\nnegligible model performance degradation.\n","authors":["Jan Hansen-Palmus","Michael Truong Le","Oliver Hausdörfer","Alok Verma"],"pdf_url":"https://arxiv.org/pdf/2411.09510v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10083v1","updated":"2024-11-15T10:01:52Z","published":"2024-11-15T10:01:52Z","title":"Xmodel-1.5: An 1B-scale Multilingual LLM","summary":" We introduce Xmodel-1.5, a novel 1-billion-parameter multilingual large model\npretrained on approximately 2 trillion tokens. 
The model demonstrates strong\nperformance across several languages, with particularly notable results in\nThai, Arabic, and French, alongside its effectiveness in Chinese and English.\nIn addition, we contribute to the research community by releasing a Thai\nevaluation dataset, which includes hundreds of questions annotated by students\nfrom Chulalongkorn University's School of Integrated Innovation. While the\nresults are promising, we acknowledge that there is still room for improvement.\nWe hope this work advances ongoing efforts in multilingual AI research and\npromotes better cross-linguistic understanding in various natural language\nprocessing tasks. Our models and code are publicly available on GitHub at\nhttps://github.com/XiaoduoAILab/XmodelLM.\n","authors":["Wang Qun","Liu Yang","Lin Qingquan","Jiang Ling"],"pdf_url":"https://arxiv.org/pdf/2411.10083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10080v1","updated":"2024-11-15T09:50:27Z","published":"2024-11-15T09:50:27Z","title":"Understanding The Effect Of Temperature On Alignment With Human Opinions","summary":" With the increasing capabilities of LLMs, recent studies focus on\nunderstanding whose opinions are represented by them and how to effectively\nextract aligned opinion distributions. We conducted an empirical analysis of\nthree straightforward methods for obtaining distributions and evaluated the\nresults across a variety of metrics. Our findings suggest that sampling and\nlog-probability approaches with simple parameter adjustments can return better\naligned outputs in subjective tasks compared to direct prompting. 
Yet, assuming\nmodels reflect human opinions may be limiting, highlighting the need for\nfurther research on how human subjectivity affects model uncertainty.\n","authors":["Maja Pavlovic","Massimo Poesio"],"pdf_url":"https://arxiv.org/pdf/2411.10080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10069v1","updated":"2024-11-15T09:33:47Z","published":"2024-11-15T09:33:47Z","title":"Layer Importance and Hallucination Analysis in Large Language Models via\n Enhanced Activation Variance-Sparsity","summary":" Evaluating the importance of different layers in large language models (LLMs)\nis crucial for optimizing model performance and interpretability. This paper\nfirst explores layer importance using the Activation Variance-Sparsity Score\n(AVSS), which combines normalized activation variance and sparsity to quantify\neach layer's contribution to overall model performance. By ranking layers based\non AVSS and pruning the least impactful 25\\%, our experiments on tasks such as\nquestion answering, language modeling, and sentiment classification show that\nover 90\\% of the original performance is retained, highlighting potential\nredundancies in LLM architectures. Building on AVSS, we propose an enhanced\nversion tailored to assess hallucination propensity across layers (EAVSS). This\nimproved approach introduces Hallucination-Specific Activation Variance (HSAV)\nand Hallucination-Specific Sparsity (HSS) metrics, allowing precise\nidentification of hallucination-prone layers. By incorporating contrastive\nlearning on these layers, we effectively mitigate hallucination generation,\ncontributing to more robust and efficient LLMs(The maximum performance\nimprovement is 12\\%). 
Our results on the NQ, SciQ, TriviaQA, TruthfulQA, and\nWikiQA datasets demonstrate the efficacy of this method, offering a\ncomprehensive framework for both layer importance evaluation and hallucination\nmitigation in LLMs.\n","authors":["Zichen Song","Sitan Huang","Yuxin Wu","Zhongfeng Kang"],"pdf_url":"https://arxiv.org/pdf/2411.10069v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.10060v1","updated":"2024-11-15T09:23:02Z","published":"2024-11-15T09:23:02Z","title":"CMATH: Cross-Modality Augmented Transformer with Hierarchical\n Variational Distillation for Multimodal Emotion Recognition in Conversation","summary":" Multimodal emotion recognition in conversation (MER) aims to accurately\nidentify emotions in conversational utterances by integrating multimodal\ninformation. Previous methods usually treat multimodal information as equal\nquality and employ symmetric architectures to conduct multimodal fusion.\nHowever, in reality, the quality of different modalities usually varies\nconsiderably, and utilizing a symmetric architecture is difficult to accurately\nrecognize conversational emotions when dealing with uneven modal information.\nFurthermore, fusing multi-modality information in a single granularity may fail\nto adequately integrate modal information, exacerbating the inaccuracy in\nemotion recognition. In this paper, we propose a novel Cross-Modality Augmented\nTransformer with Hierarchical Variational Distillation, called CMATH, which\nconsists of two major components, i.e., Multimodal Interaction Fusion and\nHierarchical Variational Distillation. 
The former is comprised of two\nsubmodules, including Modality Reconstruction and Cross-Modality Augmented\nTransformer (CMA-Transformer), where Modality Reconstruction focuses on\nobtaining high-quality compressed representation of each modality, and\nCMA-Transformer adopts an asymmetric fusion strategy which treats one modality\nas the central modality and takes others as auxiliary modalities. The latter\nfirst designs a variational fusion network to fuse the fine-grained\nrepresentations learned by CMA- Transformer into a coarse-grained\nrepresentations. Then, it introduces a hierarchical distillation framework to\nmaintain the consistency between modality representations with different\ngranularities. Experiments on the IEMOCAP and MELD datasets demonstrate that\nour proposed model outperforms previous state-of-the-art baselines.\nImplementation codes can be available at https://github.com/ cjw-MER/CMATH.\n","authors":["Xiaofei Zhu","Jiawei Cheng","Zhou Yang","Zhuo Chen","Qingyang Wang","Jianfeng Yao"],"pdf_url":"https://arxiv.org/pdf/2411.10060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10055v1","updated":"2024-11-15T09:17:40Z","published":"2024-11-15T09:17:40Z","title":"Towards unearthing neglected climate innovations from scientific\n literature using Large Language Models","summary":" Climate change poses an urgent global threat, needing the rapid\nidentification and deployment of innovative solutions. We hypothesise that many\nof these solutions already exist within scientific literature but remain\nunderutilised. To address this gap, this study employs a curated dataset\nsourced from OpenAlex, a comprehensive repository of scientific papers.\nUtilising Large Language Models (LLMs), such as GPT4-o from OpenAI, we evaluate\ntitle-abstract pairs from scientific papers on seven dimensions, covering\nclimate change mitigation potential, stage of technological development, and\nreadiness for deployment. 
The outputs of the language models are then compared\nwith human evaluations to assess their effectiveness in identifying promising\nyet overlooked climate innovations. Our findings suggest that these LLM-based\nmodels can effectively augment human expertise, uncovering climate solutions\nthat are potentially impactful but with far greater speed, throughput and\nconsistency. Here, we focused on UK-based solutions, but the workflow is\nregion-agnostic. This work contributes to the discovery of neglected\ninnovations in scientific literature and demonstrates the potential of AI in\nenhancing climate action strategies.\n","authors":["César Quilodrán-Casas","Christopher Waite","Nicole Alhadeff","Diyona Dsouza","Cathal Hughes","Larissa Kunstel-Tabet","Alyssa Gilbert"],"pdf_url":"https://arxiv.org/pdf/2411.10055v1.pdf","comment":"10 pages. Accepted in the LatinX in AI workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.03988v3","updated":"2024-11-15T08:38:26Z","published":"2024-02-06T13:26:19Z","title":"REBORN: Reinforcement-Learned Boundary Segmentation with Iterative\n Training for Unsupervised ASR","summary":" Unsupervised automatic speech recognition (ASR) aims to learn the mapping\nbetween the speech signal and its corresponding textual transcription without\nthe supervision of paired speech-text data. A word/phoneme in the speech signal\nis represented by a segment of speech signal with variable length and unknown\nboundary, and this segmental structure makes learning the mapping between\nspeech and text challenging, especially without paired data. In this paper, we\npropose REBORN,Reinforcement-Learned Boundary Segmentation with Iterative\nTraining for Unsupervised ASR. 
REBORN alternates between (1) training a\nsegmentation model that predicts the boundaries of the segmental structures in\nspeech signals and (2) training the phoneme prediction model, whose input is\nthe speech feature segmented by the segmentation model, to predict a phoneme\ntranscription. Since supervised data for training the segmentation model is not\navailable, we use reinforcement learning to train the segmentation model to\nfavor segmentations that yield phoneme sequence predictions with a lower\nperplexity. We conduct extensive experiments and find that under the same\nsetting, REBORN outperforms all prior unsupervised ASR models on LibriSpeech,\nTIMIT, and five non-English languages in Multilingual LibriSpeech. We\ncomprehensively analyze why the boundaries learned by REBORN improve the\nunsupervised ASR performance.\n","authors":["Liang-Hsuan Tseng","En-Pei Hu","Cheng-Han Chiang","Yuan Tseng","Hung-yi Lee","Lin-shan Lee","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2402.03988v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10020v1","updated":"2024-11-15T07:54:19Z","published":"2024-11-15T07:54:19Z","title":"Information Extraction from Clinical Notes: Are We Ready to Switch to\n Large Language Models?","summary":" Backgrounds: Information extraction (IE) is critical in clinical natural\nlanguage processing (NLP). While large language models (LLMs) excel on\ngenerative tasks, their performance on extractive tasks remains debated.\nMethods: We investigated Named Entity Recognition (NER) and Relation Extraction\n(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples,\nMIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical\nentities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3\nagainst BiomedBERT in terms of performance, generalizability, computational\nresources, and throughput to BiomedBERT. Results: LLaMA models outperformed\nBiomedBERT across datasets. 
With sufficient training data, LLaMA showed modest\nimprovements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited\ntraining data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7%\n(F1) on NER and 4% on RE. However, LLaMA models required more computing\nresources and ran up to 28 times slower. We implemented \"Kiwi,\" a clinical IE\npackage featuring both models, available at https://kiwi.clinicalnlp.org/.\nConclusion: This study is among the first to develop and evaluate a\ncomprehensive clinical IE system using open-source LLMs. Results indicate that\nLLaMA models outperform BiomedBERT for clinical NER and RE but with higher\ncomputational costs and lower throughputs. These findings highlight that\nchoosing between LLMs and traditional deep learning methods for clinical IE\napplications should remain task-specific, taking into account both performance\nmetrics and practical considerations such as available computing resources and\nthe intended use case scenarios.\n","authors":["Yan Hu","Xu Zuo","Yujia Zhou","Xueqing Peng","Jimin Huang","Vipina K. Keloth","Vincent J. Zhang","Ruey-Ling Weng","Qingyu Chen","Xiaoqian Jiang","Kirk E. Roberts","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10018v1","updated":"2024-11-15T07:53:02Z","published":"2024-11-15T07:53:02Z","title":"Once More, With Feeling: Measuring Emotion of Acting Performances in\n Contemporary American Film","summary":" Narrative film is a composition of writing, cinematography, editing, and\nperformance. While much computational work has focused on the writing or visual\nstyle in film, we conduct in this paper a computational exploration of acting\nperformance. 
Applying speech emotion recognition models and a variationist\nsociolinguistic analytical framework to a corpus of popular, contemporary\nAmerican film, we find narrative structure, diachronic shifts, and genre- and\ndialogue-based constraints located in spoken performances.\n","authors":["Naitian Zhou","David Bamman"],"pdf_url":"https://arxiv.org/pdf/2411.10018v1.pdf","comment":"Accepted CHR 2024"},{"id":"http://arxiv.org/abs/2411.10006v1","updated":"2024-11-15T07:35:47Z","published":"2024-11-15T07:35:47Z","title":"Orca: Enhancing Role-Playing Abilities of Large Language Models by\n Integrating Personality Traits","summary":" Large language models has catalyzed the development of personalized dialogue\nsystems, numerous role-playing conversational agents have emerged. While\nprevious research predominantly focused on enhancing the model's capability to\nfollow instructions by designing character profiles, neglecting the\npsychological factors that drive human conversations. In this paper, we propose\nOrca, a framework for data processing and training LLMs of custom characters by\nintegrating personality traits. Orca comprises four stages: (1) Personality\ntraits inferring, leverage LLMs to infer user's BigFive personality trait\nreports and scores. (2) Data Augment, simulate user's profile, background\nstory, and psychological activities. (3) Dataset construction,\npersonality-conditioned instruction prompting (PCIP) to stimulate LLMs. (4)\nModeling and Training, personality-conditioned instruction tuning (PTIT and\nPSIT), using the generated data to enhance existing open-source LLMs. We\nintroduce OrcaBench, the first benchmark for evaluating the quality of content\ngenerated by LLMs on social platforms across multiple scales. Our experiments\ndemonstrate that our proposed model achieves superior performance on this\nbenchmark, demonstrating its excellence and effectiveness in perceiving\npersonality traits that significantly improve role-playing abilities. 
Our Code\nis available at https://github.com/Aipura/Orca.\n","authors":["Yuxuan Huang"],"pdf_url":"https://arxiv.org/pdf/2411.10006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14704v2","updated":"2024-11-15T07:19:03Z","published":"2024-09-23T04:50:36Z","title":"VLEU: a Method for Automatic Evaluation for Generalizability of\n Text-to-Image Models","summary":" Progress in Text-to-Image (T2I) models has significantly improved the\ngeneration of images from textual descriptions. However, existing evaluation\nmetrics do not adequately assess the models' ability to handle a diverse range\nof textual prompts, which is crucial for their generalizability. To address\nthis, we introduce a new metric called Visual Language Evaluation Understudy\n(VLEU). VLEU uses large language models to sample from the visual text domain,\nthe set of all possible input texts for T2I models, to generate a wide variety\nof prompts. The images generated from these prompts are evaluated based on\ntheir alignment with the input text using the CLIP model.VLEU quantifies a\nmodel's generalizability by computing the Kullback-Leibler divergence between\nthe marginal distribution of the visual text and the conditional distribution\nof the images generated by the model. 
This metric provides a quantitative way\nto compare different T2I models and track improvements during model finetuning.\nOur experiments demonstrate the effectiveness of VLEU in evaluating the\ngeneralization capability of various T2I models, positioning it as an essential\nmetric for future research in text-to-image synthesis.\n","authors":["Jingtao Cao","Zheng Zhang","Hongru Wang","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2409.14704v2.pdf","comment":"accepted by EMNLP2024(long paper,main conference)"},{"id":"http://arxiv.org/abs/2410.04422v5","updated":"2024-11-15T07:07:38Z","published":"2024-10-06T09:29:19Z","title":"Hyper-multi-step: The Truth Behind Difficult Long-context Tasks","summary":" Long-context language models (LCLM), characterized by their extensive context\nwindow, is becoming increasingly popular. Meanwhile, many long-context\nbenchmarks present challenging tasks that even the most advanced LCLMs struggle\nto complete. However, the underlying sources of various challenging\nlong-context tasks have seldom been studied. To bridge this gap, we conduct\nexperiments to indicate their difficulty stems primarily from two basic issues:\n\"multi-matching retrieval,\" which requires the simultaneous retrieval of\nmultiple items, and \"logic-based retrieval,\" which necessitates logical\njudgment within retrieval criteria. 
These two problems, while seemingly\nstraightforward, actually exceed the capabilities of LCLMs because they are\nproven to be hyper-multi-step (demanding numerous steps to solve) in nature.\nThis finding could explain why LLMs struggle with more advanced long-context\ntasks, providing a more accurate perspective for rethinking solutions for them.\n","authors":["Yijiong Yu","Ma Xiufa","Fang Jianwei","Zhi Xu","Su Guangyao","Wang Jiancheng","Yongfeng Huang","Zhixiao Qi","Wei Wang","Weifeng Liu","Ran Chen","Ji Pei"],"pdf_url":"https://arxiv.org/pdf/2410.04422v5.pdf","comment":"Our code is publicly available at\n https://github.com/yuyijiong/hard_retrieval_for_llm and the datasets is at\n https://huggingface.co/datasets/yuyijiong/difficult_retrieval"},{"id":"http://arxiv.org/abs/2312.11282v3","updated":"2024-11-15T06:48:58Z","published":"2023-12-18T15:23:06Z","title":"Evaluating and Enhancing Large Language Models for Conversational\n Reasoning on Knowledge Graphs","summary":" The development of large language models (LLMs) has been catalyzed by\nadvancements in pre-training techniques. These models have demonstrated robust\nreasoning capabilities through manually designed prompts. In this work, we\nevaluate the conversational reasoning capabilities of the current\nstate-of-the-art LLM (GPT-4) on knowledge graphs (KGs). However, the\nperformance of LLMs is constrained due to a lack of KG environment awareness\nand the difficulties in developing effective optimization mechanisms for\nintermediary reasoning stages. We further introduce LLM-ARK, a LLM grounded KG\nreasoning agent designed to deliver precise and adaptable predictions on KG\npaths. LLM-ARK leverages Full Textual Environment (FTE) prompt to assimilate\nstate information within each reasoning step. We reframe the challenge of\nmulti-hop reasoning on the KG as a sequential decision-making task. 
Utilizing\nthe Proximal Policy Optimization (PPO) online policy gradient reinforcement\nlearning algorithm, our model is optimized to learn from rich reward signals.\nAdditionally, we conduct an evaluation of our model and GPT-4 on the OpenDialKG\ndataset. The experimental results reveal that LLaMA-2-7B-ARK outperforms the\ncurrent state-of-the-art model by 5.28 percentage points, with a performance\nrate of 36.39% on the target@1 evaluation metric. Meanwhile, GPT-4 scored\n14.91%, further demonstrating the effectiveness of our method. Our code is\navailable on GitHub (https://github.com/Aipura/LLM-ARK) for further access.\n","authors":["Yuxuan Huang"],"pdf_url":"https://arxiv.org/pdf/2312.11282v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01483v3","updated":"2024-11-15T06:31:44Z","published":"2024-05-02T17:14:57Z","title":"MANTIS: Interleaved Multi-Image Instruction Tuning","summary":" Large multimodal models (LMMs) have shown great results in single-image\nvision language tasks. However, their abilities to solve multi-image visual\nlanguage tasks is yet to be improved. The existing LMMs like OpenFlamingo,\nEmu2, and Idefics gain their multi-image ability through pre-training on\nhundreds of millions of noisy interleaved image-text data from the web, which\nis neither efficient nor effective. In this paper, we aim to build strong\nmulti-image LMMs via instruction tuning with academic-level resources.\nTherefore, we meticulously construct Mantis-Instruct containing 721K\nmulti-image instruction data to train a family of Mantis models. The\ninstruction tuning empowers Mantis with different multi-image skills like\nco-reference, comparison, reasoning, and temporal understanding. We evaluate\nMantis on 8 multi-image benchmarks and 6 single-image benchmarks.\nMantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and\nbeat the strongest multi-image baseline, Idefics2-8B by an average of 13\nabsolute points. 
Notably, Idefics2-8B was pre-trained on 140M interleaved\nmulti-image data, which is 200x larger than Mantis-Instruct. We observe that\nMantis performs equivalently well on the held-in and held-out benchmarks, which\nshows its generalization ability. We further evaluate Mantis on single-image\nbenchmarks and demonstrate that Mantis also maintains a strong single-image\nperformance on par with CogVLM and Emu2. Our results show that multi-image\nabilities are not necessarily gained through massive pre-training, instead,\nthey can be gained by low-cost instruction tuning. The training and evaluation\nof Mantis has paved the road for future work to improve LMMs' multi-image\nabilities.\n","authors":["Dongfu Jiang","Xuan He","Huaye Zeng","Cong Wei","Max Ku","Qian Liu","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.01483v3.pdf","comment":"13 pages, 3 figures, 13 tables"},{"id":"http://arxiv.org/abs/2411.09978v1","updated":"2024-11-15T06:21:13Z","published":"2024-11-15T06:21:13Z","title":"HistoLens: An LLM-Powered Framework for Multi-Layered Analysis of\n Historical Texts -- A Case Application of Yantie Lun","summary":" This paper proposes HistoLens, a multi-layered analysis framework for\nhistorical texts based on Large Language Models (LLMs). Using the important\nWestern Han dynasty text \"Yantie Lun\" as a case study, we demonstrate the\nframework's potential applications in historical research and education.\nHistoLens integrates NLP technology (especially LLMs), including named entity\nrecognition, knowledge graph construction, and geographic information\nvisualization. The paper showcases how HistoLens explores Western Han culture\nin \"Yantie Lun\" through multi-dimensional, visual, and quantitative methods,\nfocusing particularly on the influence of Confucian and Legalist thoughts on\npolitical, economic, military, and ethnic. 
We also demonstrate how HistoLens\nconstructs a machine teaching scenario using LLMs for explainable analysis,\nbased on a dataset of Confucian and Legalist ideas extracted with LLM\nassistance. This approach offers novel and diverse perspectives for studying\nhistorical texts like \"Yantie Lun\" and provides new auxiliary tools for history\neducation. The framework aims to equip historians and learners with\nLLM-assisted tools to facilitate in-depth, multi-layered analysis of historical\ntexts and foster innovation in historical education.\n","authors":["Yifan Zeng"],"pdf_url":"https://arxiv.org/pdf/2411.09978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09972v1","updated":"2024-11-15T06:05:45Z","published":"2024-11-15T06:05:45Z","title":"Large Language Models as User-Agents for Evaluating\n Task-Oriented-Dialogue Systems","summary":" Traditionally, offline datasets have been used to evaluate task-oriented\ndialogue (TOD) models. These datasets lack context awareness, making them\nsuboptimal benchmarks for conversational systems. In contrast, user-agents,\nwhich are context-aware, can simulate the variability and unpredictability of\nhuman conversations, making them better alternatives as evaluators. Prior\nresearch has utilized large language models (LLMs) to develop user-agents. Our\nwork builds upon this by using LLMs to create user-agents for the evaluation of\nTOD systems. This involves prompting an LLM, using in-context examples as\nguidance, and tracking the user-goal state. Our evaluation of diversity and\ntask completion metrics for the user-agents shows improved performance with the\nuse of better prompts. 
Additionally, we propose methodologies for the automatic\nevaluation of TOD models within this dynamic framework.\n","authors":["Taaha Kazi","Ruiliang Lyu","Sizhe Zhou","Dilek Hakkani-Tur","Gokhan Tur"],"pdf_url":"https://arxiv.org/pdf/2411.09972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09947v1","updated":"2024-11-15T04:57:13Z","published":"2024-11-15T04:57:13Z","title":"LoRA-LiteE: A Computationally Efficient Framework for Chatbot\n Preference-Tuning","summary":" Effective preference tuning is pivotal in aligning chatbot responses with\nhuman expectations, enhancing user satisfaction and engagement. Traditional\napproaches, notably Reinforcement Learning from Human Feedback (RLHF) as\nemployed in advanced models like GPT-4, have demonstrated considerable success\nin this domain. However, RLHF methods are often computationally intensive and\nresource-demanding, limiting their scalability and accessibility for broader\napplications. To address these challenges, this study introduces LoRA-Lite\nEnsemble (LoRA-LiteE), an innovative framework that combines Supervised\nFine-tuning (SFT) with Low-Rank Adaptation (LoRA) and Ensemble Learning\ntechniques to effectively aggregate predictions of lightweight models, which\naim to achieve a balance between the performance and computational cost.\nUtilizing the Chatbot Arena benchmark dataset, we conduct a comprehensive\ncomparative analysis among our LoRA-LiteE model, corresponding base models at\ndifferent scales, and GPT-4 trained with RLHF. Our empirical results\ndemonstrate that the proposed LoRA-LiteE model achieves comparable performance\nto un-finetuned GPT-4 and outperforms the single larger-scale models under\nlimited resource constraints. 
These findings highlight that our LoRA-LiteE\nprovides a feasible and efficient methodology for human preference prediction\nin chatbot systems, enhancing scalability and accessibility, and thereby\nbroadening the applicability of preference-tuned chatbots in\nresource-constrained environments.\n","authors":["Yahe Yang","Chunliang Tao","Xiaojing Fan"],"pdf_url":"https://arxiv.org/pdf/2411.09947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09944v1","updated":"2024-11-15T04:44:34Z","published":"2024-11-15T04:44:34Z","title":"SlimLM: An Efficient Small Language Model for On-Device Document\n Assistance","summary":" While small language models (SLMs) show promises for mobile deployment, their\nreal-world performance and applications on smartphones remains underexplored.\nWe present SlimLM, a series of SLMs optimized for document assistance tasks on\nmobile devices. Through extensive experiments on a Samsung Galaxy S24, we\nidentify the optimal trade-offs between model size (ranging from 125M to 7B\nparameters), context length, and inference time for efficient on-device\nprocessing. SlimLM is pre-trained on SlimPajama-627B and fine-tuned on\nDocAssist, our constructed dataset for summarization, question answering and\nsuggestion tasks. Our smallest model demonstrates efficient performance on S24,\nwhile larger variants offer enhanced capabilities within mobile constraints. We\nevaluate SlimLM against existing SLMs, showing comparable or superior\nperformance and offering a benchmark for future research in on-device language\nmodels. We also provide an Android application, offering practical insights\ninto SLM deployment. Our findings provide valuable insights and illuminate the\ncapabilities of running advanced language models on high-end smartphones,\npotentially reducing server costs and enhancing privacy through on-device\nprocessing.\n","authors":["Thang M. Pham","Phat T. 
Nguyen","Seunghyun Yoon","Viet Dac Lai","Franck Dernoncourt","Trung Bui"],"pdf_url":"https://arxiv.org/pdf/2411.09944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02170v2","updated":"2024-11-15T04:30:04Z","published":"2023-10-03T16:05:48Z","title":"A Dynamic LLM-Powered Agent Network for Task-Oriented Agent\n Collaboration","summary":" Recent studies show that collaborating multiple large language model (LLM)\npowered agents is a promising way for task solving. However, current approaches\nare constrained by using a fixed number of agents and static communication\nstructures. In this work, we propose automatically selecting a team of agents\nfrom candidates to collaborate in a dynamic communication structure toward\ndifferent tasks and domains. Specifically, we build a framework named Dynamic\nLLM-Powered Agent Network ($\\textbf{DyLAN}$) for LLM-powered agent\ncollaboration, operating a two-stage paradigm: (1) Team Optimization and (2)\nTask Solving. During the first stage, we utilize an $\\textit{agent selection}$\nalgorithm, based on an unsupervised metric called $\\textit{Agent Importance\nScore}$, enabling the selection of best agents according to their contributions\nin a preliminary trial, oriented to the given task. Then, in the second stage,\nthe selected agents collaborate dynamically according to the query.\nEmpirically, we demonstrate that DyLAN outperforms strong baselines in code\ngeneration, decision-making, general reasoning, and arithmetic reasoning tasks\nwith moderate computational cost. On specific subjects in MMLU, selecting a\nteam of agents in the team optimization stage improves accuracy by up to 25.0%\nin DyLAN.\n","authors":["Zijun Liu","Yanzhe Zhang","Peng Li","Yang Liu","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02170v2.pdf","comment":"Published in COLM2024. 
Code Repo: https://github.com/SALT-NLP/DyLAN"},{"id":"http://arxiv.org/abs/2411.09937v1","updated":"2024-11-15T04:22:21Z","published":"2024-11-15T04:22:21Z","title":"Refined and Segmented Price Sentiment Indices from Survey Comments","summary":" We aim to enhance a price sentiment index and to more precisely understand\nprice trends from the perspective of not only consumers but also businesses. We\nextract comments related to prices from the Economy Watchers Survey conducted\nby the Cabinet Office of Japan and classify price trends using a large language\nmodel (LLM). We classify whether the survey sample reflects the perspective of\nconsumers or businesses, and whether the comments pertain to goods or services\nby utilizing information on the fields of comments and the industries of\nrespondents included in the Economy Watchers Survey. From these classified\nprice-related comments, we construct price sentiment indices not only for a\ngeneral purpose but also for more specific objectives by combining perspectives\non consumers and prices, as well as goods and services. It becomes possible to\nachieve a more accurate classification of price directions by employing a LLM\nfor classification. Furthermore, integrating the outputs of multiple LLMs\nsuggests the potential for the better performance of the classification. The\nuse of more accurately classified comments allows for the construction of an\nindex with a higher correlation to existing indices than previous studies. We\ndemonstrate that the correlation of the price index for consumers, which has a\nlarger sample size, is further enhanced by selecting comments for aggregation\nbased on the industry of the survey respondents.\n","authors":["Masahiro Suzuki","Hiroki Sakaji"],"pdf_url":"https://arxiv.org/pdf/2411.09937v1.pdf","comment":"Accepted to IEEE BigData 2024. 
9 pages, 11 tables, 1 figure"},{"id":"http://arxiv.org/abs/2411.09933v1","updated":"2024-11-15T04:16:50Z","published":"2024-11-15T04:16:50Z","title":"JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by\n Evolutionary Optimization of Model Merging","summary":" With the rapid advancement of large language models (LLMs), foundational\nmodels (FMs) have seen significant advancements. Healthcare is one of the most\ncrucial application areas for these FMs, given the significant time and effort\nrequired for physicians to analyze large volumes of patient data. Recent\nefforts have focused on adapting multimodal FMs to the medical domain through\ntechniques like instruction-tuning, leading to the development of medical\nfoundation models (MFMs). However, these approaches typically require large\namounts of training data to effectively adapt models to the medical field.\nMoreover, most existing models are trained on English datasets, limiting their\npracticality in non-English-speaking regions where healthcare professionals and\npatients are not always fluent in English. The need for translation introduces\nadditional costs and inefficiencies. To address these challenges, we propose a\n\\textbf{J}apanese \\textbf{Radi}ology report generation model enhanced by\n\\textbf{Evo}lutionary optimization of model merging (JRadiEvo). This is the\nfirst attempt to extend a non-medical vision-language foundation model to the\nmedical domain through evolutionary optimization of model merging. 
We\nsuccessfully created a model that generates accurate Japanese reports from\nX-ray images using only 50 translated samples from publicly available data.\nThis model, developed with highly efficient use of limited data, outperformed\nleading models from recent research trained on much larger datasets.\nAdditionally, with only 8 billion parameters, this relatively compact\nfoundation model can be deployed locally within hospitals, making it a\npractical solution for environments where APIs and other external services\ncannot be used due to strict privacy and security requirements.\n","authors":["Kaito Baba","Ryota Yagi","Junichiro Takahashi","Risa Kishikawa","Satoshi Kodera"],"pdf_url":"https://arxiv.org/pdf/2411.09933v1.pdf","comment":"Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical\n Foundation Models: Explainability, Robustness, Security, and Beyond"},{"id":"http://arxiv.org/abs/2405.17743v3","updated":"2024-11-15T03:25:40Z","published":"2024-05-28T01:55:35Z","title":"ORLM: A Customizable Framework in Training Large Models for Automated\n Optimization Modeling","summary":" Optimization modeling and solving play a critical role in the application of\nOperations Research (OR) tools to address real-world problems, yet they pose\nchallenges and require extensive expertise from OR experts. With the advent of\nlarge language models (LLMs), new opportunities have emerged to streamline and\nautomate these tasks. However, current research predominantly relies on\nclosed-source LLMs such as GPT-4, along with extensive prompt engineering\ntechniques. This reliance stems from the scarcity of high-quality training\ndatasets for optimization modeling, resulting in elevated costs, prolonged\nprocessing times, and privacy concerns. 
To address these challenges, our work\nis the first to propose a viable path for training open-source LLMs that are\ncapable of optimization modeling as well as developing and executing solver\ncodes, eventually leading to a superior ability for automating optimization\nmodeling and solving. Particularly, we introduce a semi-automated data\nsynthesis framework designed for optimization modeling issues, named\nOR-Instruct. This framework merges the training data requirements of large\nmodels with the unique characteristics of optimization modeling problems, and\nallows for customizable enhancements tailored to specific scenarios or modeling\ntypes. To evaluate the performance of our proposed framework, we present the\nIndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in\nsolving practical OR problems. Utilizing data synthesized through OR-Instruct,\nwe train various open-source LLMs with a capacity of 7 billion parameters\n(dubbed ORLMs). The resulting model demonstrates significantly enhanced\noptimization modeling capabilities, achieving state-of-the-art performance\nacross the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are\navailable at \\url{https://github.com/Cardinal-Operations/ORLM}.\n","authors":["Chenyu Huang","Zhengyang Tang","Dongdong Ge","Shixi Hu","Ruoqing Jiang","Benyou Wang","Zizhuo Wang","Xin Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.17743v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.07571v2","updated":"2024-11-15T03:20:57Z","published":"2024-10-10T03:12:03Z","title":"How Does Vision-Language Adaptation Impact the Safety of Vision Language\n Models?","summary":" Vision-Language adaptation (VL adaptation) transforms Large Language Models\n(LLMs) into Large Vision-Language Models (LVLMs) for multimodal tasks, but this\nprocess often compromises the inherent safety capabilities embedded in the\noriginal LLMs. 
Despite potential harmfulness due to weakened safety measures,\nin-depth analysis on the effects of VL adaptation on safety remains\nunder-explored. This study examines how VL adaptation influences safety and\nevaluates the impact of safety fine-tuning methods. Our analysis reveals that\nsafety degradation occurs during VL adaptation, even when the training data is\nsafe. While safety tuning techniques like supervised fine-tuning with safety\ndatasets or reinforcement learning from human feedback mitigate some risks,\nthey still lead to safety degradation and a reduction in helpfulness due to\nover-rejection issues. Further analysis of internal model weights suggests that\nVL adaptation may impact certain safety-related layers, potentially lowering\noverall safety levels. Additionally, our findings demonstrate that the\nobjectives of VL adaptation and safety tuning are divergent, which often\nresults in their simultaneous application being suboptimal. To address this, we\nsuggest the weight merging approach as an optimal solution effectively reducing\nsafety degradation while maintaining helpfulness. These insights help guide the\ndevelopment of more reliable and secure LVLMs for real-world applications.\n","authors":["Seongyun Lee","Geewook Kim","Jiyeon Kim","Hyunji Lee","Hoyeon Chang","Sue Hyun Park","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2410.07571v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2411.09318v2","updated":"2024-11-15T02:42:59Z","published":"2024-11-14T10:00:33Z","title":"DriveThru: a Document Extraction Platform and Benchmark Datasets for\n Indonesian Local Language Archives","summary":" Indonesia is one of the most diverse countries linguistically. However,\ndespite this linguistic diversity, Indonesian languages remain underrepresented\nin Natural Language Processing (NLP) research and technologies. In the past two\nyears, several efforts have been conducted to construct NLP resources for\nIndonesian languages. 
However, most of these efforts have been focused on\ncreating manual resources thus difficult to scale to more languages. Although\nmany Indonesian languages do not have a web presence, locally there are\nresources that document these languages well in printed forms such as books,\nmagazines, and newspapers. Digitizing these existing resources will enable\nscaling of Indonesian language resource construction to many more languages. In\nthis paper, we propose an alternative method of creating datasets by digitizing\ndocuments, which have not previously been used to build digital language\nresources in Indonesia. DriveThru is a platform for extracting document content\nutilizing Optical Character Recognition (OCR) techniques in its system to\nprovide language resource building with less manual effort and cost. This paper\nalso studies the utility of current state-of-the-art LLM for post-OCR\ncorrection to show the capability of increasing the character accuracy rate\n(CAR) and word accuracy rate (WAR) compared to off-the-shelf OCR.\n","authors":["Mohammad Rifqi Farhansyah","Muhammad Zuhdi Fikri Johari","Afinzaki Amiral","Ayu Purwarianti","Kumara Ari Yuana","Derry Tanti Wijaya"],"pdf_url":"https://arxiv.org/pdf/2411.09318v2.pdf","comment":"12 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.02591v2","updated":"2024-11-15T02:33:29Z","published":"2024-11-04T20:31:22Z","title":"Geometry of orofacial neuromuscular signals: speech articulation\n decoding using surface electromyography","summary":" Each year, millions of individuals lose the ability to speak intelligibly due\nto causes such as neuromuscular disease, stroke, trauma, and head/neck cancer\nsurgery (e.g. laryngectomy) or treatment (e.g. radiotherapy toxicity to the\nspeech articulators). Effective communication is crucial for daily activities,\nand losing the ability to speak leads to isolation, depression, anxiety, and a\nhost of detrimental sequelae. 
Noninvasive surface electromyography (sEMG) has\nshown promise to restore speech output in these individuals. The goal is to\ncollect sEMG signals from multiple articulatory sites as people silently\nproduce speech and then decode the signals to enable fluent and natural\ncommunication. Currently, many fundamental properties of orofacial\nneuromuscular signals relating to speech articulation remain unanswered. They\ninclude questions relating to 1) the data structure of the orofacial sEMG\nsignals, 2)the signal distribution shift of sEMG across individuals, 3) ability\nof sEMG signals to span the entire English language phonetic space during\nsilent speech articulations, and 4) the generalization capability of\nnon-invasive sEMG based silent speech interfaces. We address these questions\nthrough a series of experiments involving healthy human subjects. We show that\nsEMG signals evince graph data structure and that the signal distribution shift\nis given by a change of basis. Furthermore, we show that silently voiced\narticulations spanning the entire English language phonetic space can be\ndecoded using small neural networks which can be trained with little data and\nthat such architectures work well across individuals. To ensure transparency\nand reproducibility, we open-source all the data and codes used in this study.\n","authors":["Harshavardhana T. Gowda","Zachary D. McNaughton","Lee M. Miller"],"pdf_url":"https://arxiv.org/pdf/2411.02591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09884v1","updated":"2024-11-15T02:08:58Z","published":"2024-11-15T02:08:58Z","title":"Research on Domain-Specific Chinese Spelling Correction Method Based on\n Plugin Extension Modules","summary":" This paper proposes a Chinese spelling correction method based on plugin\nextension modules, aimed at addressing the limitations of existing models in\nhandling domain-specific texts. 
Traditional Chinese spelling correction models\nare typically trained on general-domain datasets, resulting in poor performance\nwhen encountering specialized terminology in domain-specific texts. To address\nthis issue, we design an extension module that learns the features of\ndomain-specific terminology, thereby enhancing the model's correction\ncapabilities within specific domains. This extension module can provide domain\nknowledge to the model without compromising its general spelling correction\nperformance, thus improving its accuracy in specialized fields. Experimental\nresults demonstrate that after integrating extension modules for medical,\nlegal, and official document domains, the model's correction performance is\nsignificantly improved compared to the baseline model without any extension\nmodules.\n","authors":["Xiaowu Zhang","Hongfei Zhao","Xuan Chang"],"pdf_url":"https://arxiv.org/pdf/2411.09884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18027v2","updated":"2024-11-15T02:07:34Z","published":"2024-06-26T02:49:28Z","title":"Automated Clinical Data Extraction with Knowledge Conditioned LLMs","summary":" The extraction of lung lesion information from clinical and medical imaging\nreports is crucial for research on and clinical care of lung-related diseases.\nLarge language models (LLMs) can be effective at interpreting unstructured text\nin reports, but they often hallucinate due to a lack of domain-specific\nknowledge, leading to reduced accuracy and posing challenges for use in\nclinical settings. To address this, we propose a novel framework that aligns\ngenerated internal knowledge with external knowledge through in-context\nlearning (ICL). Our framework employs a retriever to identify relevant units of\ninternal or external knowledge and a grader to evaluate the truthfulness and\nhelpfulness of the retrieved internal-knowledge rules, to align and update the\nknowledge bases. 
Experiments with expert-curated test datasets demonstrate that\nthis ICL approach can increase the F1 score for key fields (lesion size, margin\nand solidity) by an average of 12.9% over existing ICL methods.\n","authors":["Diya Li","Asim Kadav","Aijing Gao","Rui Li","Richard Bourgon"],"pdf_url":"https://arxiv.org/pdf/2406.18027v2.pdf","comment":"COLING25 Industry Track"},{"id":"http://arxiv.org/abs/2411.09853v1","updated":"2024-11-15T00:21:02Z","published":"2024-11-15T00:21:02Z","title":"KULCQ: An Unsupervised Keyword-based Utterance Level Clustering Quality\n Metric","summary":" Intent discovery is crucial for both building new conversational agents and\nimproving existing ones. While several approaches have been proposed for intent\ndiscovery, most rely on clustering to group similar utterances together.\nTraditional evaluation of these utterance clusters requires intent labels for\neach utterance, limiting scalability. Although some clustering quality metrics\nexist that do not require labeled data, they focus solely on cluster geometry\nwhile ignoring the linguistic nuances present in conversational transcripts. In\nthis paper, we introduce Keyword-based Utterance Level Clustering Quality\n(KULCQ), an unsupervised metric that leverages keyword analysis to evaluate\nclustering quality. We demonstrate KULCQ's effectiveness by comparing it with\nexisting unsupervised clustering metrics and validate its performance through\ncomprehensive ablation studies. 
Our results show that KULCQ better captures\nsemantic relationships in conversational data while maintaining consistency\nwith geometric clustering principles.\n","authors":["Pranav Guruprasad","Negar Mokhberian","Nikhil Varghese","Chandra Khatri","Amol Kelkar"],"pdf_url":"https://arxiv.org/pdf/2411.09853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18856v2","updated":"2024-11-15T00:15:18Z","published":"2024-10-24T15:41:56Z","title":"Demystifying Large Language Models for Medicine: A Primer","summary":" Large language models (LLMs) represent a transformative class of AI tools\ncapable of revolutionizing various aspects of healthcare by generating\nhuman-like responses across diverse contexts and adapting to novel tasks\nfollowing human instructions. Their potential application spans a broad range\nof medical tasks, such as clinical documentation, matching patients to clinical\ntrials, and answering medical questions. In this primer paper, we propose an\nactionable guideline to help healthcare professionals more efficiently utilize\nLLMs in their work, along with a set of best practices. This approach consists\nof several main phases, including formulating the task, choosing LLMs, prompt\nengineering, fine-tuning, and deployment. We start with the discussion of\ncritical considerations in identifying healthcare tasks that align with the\ncore capabilities of LLMs and selecting models based on the selected task and\ndata, performance requirements, and model interface. We then review the\nstrategies, such as prompt engineering and fine-tuning, to adapt standard LLMs\nto specialized medical tasks. Deployment considerations, including regulatory\ncompliance, ethical guidelines, and continuous monitoring for fairness and\nbias, are also discussed. 
By providing a structured step-by-step methodology,\nthis tutorial aims to equip healthcare professionals with the tools necessary\nto effectively integrate LLMs into clinical practice, ensuring that these\npowerful technologies are applied in a safe, reliable, and impactful manner.\n","authors":["Qiao Jin","Nicholas Wan","Robert Leaman","Shubo Tian","Zhizheng Wang","Yifan Yang","Zifeng Wang","Guangzhi Xiong","Po-Ting Lai","Qingqing Zhu","Benjamin Hou","Maame Sarfo-Gyamfi","Gongbo Zhang","Aidan Gilson","Balu Bhasuran","Zhe He","Aidong Zhang","Jimeng Sun","Chunhua Weng","Ronald M. Summers","Qingyu Chen","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2410.18856v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.10629v1","updated":"2024-11-15T23:24:07Z","published":"2024-11-15T23:24:07Z","title":"Leveraging large language models for efficient representation learning\n for entity resolution","summary":" In this paper, the authors propose TriBERTa, a supervised entity resolution\nsystem that utilizes a pre-trained large language model and a triplet loss\nfunction to learn representations for entity matching. The system consists of\ntwo steps: first, name entity records are fed into a Sentence Bidirectional\nEncoder Representations from Transformers (SBERT) model to generate vector\nrepresentations, which are then fine-tuned using contrastive learning based on\na triplet loss function. Fine-tuned representations are used as input for\nentity matching tasks, and the results show that the proposed approach\noutperforms state-of-the-art representations, including SBERT without\nfine-tuning and conventional Term Frequency-Inverse Document Frequency\n(TF-IDF), by a margin of 3 - 19%. Additionally, the representations generated\nby TriBERTa demonstrated increased robustness, maintaining consistently higher\nperformance across a range of datasets. 
The authors also discussed the\nimportance of entity resolution in today's data-driven landscape and the\nchallenges that arise when identifying and reconciling duplicate data across\ndifferent sources. They also described the ER process, which involves several\ncrucial steps, including blocking, entity matching, and clustering.\n","authors":["Xiaowei Xu","Bi T. Foua","Xingqiao Wang","Vivek Gunasekaran","John R. Talburt"],"pdf_url":"https://arxiv.org/pdf/2411.10629v1.pdf","comment":"22 pages and 12 figures"},{"id":"http://arxiv.org/abs/2410.08900v2","updated":"2024-11-15T23:18:53Z","published":"2024-10-11T15:20:11Z","title":"A Benchmark for Cross-Domain Argumentative Stance Classification on\n Social Media","summary":" Argumentative stance classification plays a key role in identifying authors'\nviewpoints on specific topics. However, generating diverse pairs of\nargumentative sentences across various domains is challenging. Existing\nbenchmarks often come from a single domain or focus on a limited set of topics.\nAdditionally, manual annotation for accurate labeling is time-consuming and\nlabor-intensive. To address these challenges, we propose leveraging platform\nrules, readily available expert-curated content, and large language models to\nbypass the need for human annotation. Our approach produces a multidomain\nbenchmark comprising 4,498 topical claims and 30,961 arguments from three\nsources, spanning 21 domains. We benchmark the dataset in fully supervised,\nzero-shot, and few-shot settings, shedding light on the strengths and\nlimitations of different methodologies. We release the dataset and code in this\nstudy at hidden for anonymity.\n","authors":["Jiaqing Yuan","Ruijie Xi","Munindar P. Singh"],"pdf_url":"https://arxiv.org/pdf/2410.08900v2.pdf","comment":"Accepted by AAAI ICWSM 2025"},{"id":"http://arxiv.org/abs/2410.04579v4","updated":"2024-11-15T21:33:18Z","published":"2024-10-06T18:29:46Z","title":"Upsample or Upweight? 
Balanced Training on Heavily Imbalanced Datasets","summary":" Data availability across domains often follows a long-tail distribution: a\nfew domains have abundant data, while most face dat . a scarcity. This\nimbalance poses challenges in training language models uniformly across all\ndomains. In our study, we focus on multilingual settings, where data sizes vary\nsignificantly between high- and low-resource languages. Common strategies to\naddress this include upsampling low-resource languages (Temperature Sampling)\nor upweighting their loss (Scalarization). Although often considered\nequivalent, this assumption has not been proven, which motivates our study.\nThrough both theoretical and empirical analysis, we identify the conditions\nunder which these approaches are equivalent and when they diverge.\nSpecifically, we demonstrate that these two methods are equivalent under full\ngradient descent, but this equivalence breaks down with stochastic gradient\ndescent. Empirically, we observe that Temperature Sampling converges more\nquickly but is prone to overfitting. We argue that this faster convergence is\nlikely due to the lower variance in gradient estimations, as shown\ntheoretically. Based on these insights, we propose Cooldown, a strategy that\nreduces sampling temperature during training, accelerating convergence without\noverfitting to low-resource languages. Our method is competitive with existing\ndata re-weighting and offers computational efficiency.\n","authors":["Tianjian Li","Haoran Xu","Weiting Tan","Kenton Murray","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2410.04579v4.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2411.10588v1","updated":"2024-11-15T21:19:04Z","published":"2024-11-15T21:19:04Z","title":"A dataset of questions on decision-theoretic reasoning in Newcomb-like\n problems","summary":" We introduce a dataset of natural-language questions in the decision theory\nof so-called Newcomb-like problems. 
Newcomb-like problems include, for\ninstance, decision problems in which an agent interacts with a similar other\nagent, and thus has to reason about the fact that the other agent will likely\nreason in similar ways. Evaluating LLM reasoning about Newcomb-like problems is\nimportant because interactions between foundation-model-based agents will often\nbe Newcomb-like. Some ways of reasoning about Newcomb-like problems may allow\nfor greater cooperation between models.\n Our dataset contains both capabilities questions (i.e., questions with a\nunique, uncontroversially correct answer) and attitude questions (i.e.,\nquestions about which decision theorists would disagree). We use our dataset\nfor an investigation of decision-theoretical capabilities and expressed\nattitudes and their interplay in existing models (different models by OpenAI,\nAnthropic, Meta, GDM, Reka, etc.), as well as models under simple prompt-based\ninterventions. We find, among other things, that attitudes vary significantly\nbetween existing models; that high capabilities are associated with attitudes\nmore favorable toward so-called evidential decision theory; and that attitudes\nare consistent across different types of questions.\n","authors":["Caspar Oesterheld","Emery Cooper","Miles Kodama","Linh Chi Nguyen","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2411.10588v1.pdf","comment":"48 pages, 15 figures; code and data at\n https://github.com/casparoe/newcomblike_questions_dataset"},{"id":"http://arxiv.org/abs/2411.10581v1","updated":"2024-11-15T21:09:36Z","published":"2024-11-15T21:09:36Z","title":"On the Shortcut Learning in Multilingual Neural Machine Translation","summary":" In this study, we revisit the commonly-cited off-target issue in multilingual\nneural machine translation (MNMT). 
By carefully designing experiments on\ndifferent MNMT scenarios and models, we attribute the off-target issue to the\noverfitting of the shortcuts of (non-centric, centric) language mappings.\nSpecifically, the learned shortcuts biases MNMT to mistakenly translate\nnon-centric languages into the centric language instead of the expected\nnon-centric language for zero-shot translation. Analyses on learning dynamics\nshow that the shortcut learning generally occurs in the later stage of model\ntraining, and multilingual pretraining accelerates and aggravates the shortcut\nlearning. Based on these observations, we propose a simple and effective\ntraining strategy to eliminate the shortcuts in MNMT models by leveraging the\nforgetting nature of model training. The only difference from the standard\ntraining is that we remove the training instances that may induce the shortcut\nlearning in the later stage of model training. Without introducing any\nadditional data and computational costs, our approach can consistently and\nsignificantly improve the zero-shot translation performance by alleviating the\nshortcut learning for different MNMT models and benchmarks.\n","authors":["Wenxuan Wang","Wenxiang Jiao","Jen-tse Huang","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2411.10581v1.pdf","comment":"Accepted by Neurocomputing 2024"},{"id":"http://arxiv.org/abs/2407.14971v2","updated":"2024-11-15T21:09:28Z","published":"2024-07-20T19:53:52Z","title":"Sim-CLIP: Unsupervised Siamese Adversarial Fine-Tuning for Robust and\n Semantically-Rich Vision-Language Models","summary":" Vision-language models (VLMs) have achieved significant strides in recent\ntimes specially in multimodal tasks, yet they remain susceptible to adversarial\nattacks on their vision components. 
To address this, we propose Sim-CLIP, an\nunsupervised adversarial fine-tuning method that enhances the robustness of the\nwidely-used CLIP vision encoder against such attacks while maintaining semantic\nrichness and specificity. By employing a Siamese architecture with cosine\nsimilarity loss, Sim-CLIP learns semantically meaningful and attack-resilient\nvisual representations without requiring large batch sizes or momentum\nencoders. Our results demonstrate that VLMs enhanced with Sim-CLIP's fine-tuned\nCLIP encoder exhibit significantly enhanced robustness against adversarial\nattacks, while preserving semantic meaning of the perturbed images. Notably,\nSim-CLIP does not require additional training or fine-tuning of the VLM itself;\nreplacing the original vision encoder with our fine-tuned Sim-CLIP suffices to\nprovide robustness. This work underscores the significance of reinforcing\nfoundational models like CLIP to safeguard the reliability of downstream VLM\napplications, paving the way for more secure and effective multimodal systems.\n","authors":["Md Zarif Hossain","Ahmed Imteaj"],"pdf_url":"https://arxiv.org/pdf/2407.14971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10573v1","updated":"2024-11-15T20:46:58Z","published":"2024-11-15T20:46:58Z","title":"Hysteresis Activation Function for Efficient Inference","summary":" The widely used ReLU is favored for its hardware efficiency, {as the\nimplementation at inference is a one bit sign case,} yet suffers from issues\nsuch as the ``dying ReLU'' problem, where during training, neurons fail to\nactivate and constantly remain at zero, as highlighted by Lu et al. Traditional\napproaches to mitigate this issue often introduce more complex and less\nhardware-friendly activation functions. In this work, we propose a Hysteresis\nRectified Linear Unit (HeLU), an efficient activation function designed to\naddress the ``dying ReLU'' problem with minimal complexity. 
Unlike traditional\nactivation functions with fixed thresholds for training and inference, HeLU\nemploys a variable threshold that refines the backpropagation. This refined\nmechanism allows simpler activation functions to achieve competitive\nperformance comparable to their more complex counterparts without introducing\nunnecessary complexity or requiring inductive biases. Empirical evaluations\ndemonstrate that HeLU enhances model generalization across diverse datasets,\noffering a promising solution for efficient and effective inference suitable\nfor a wide range of neural network architectures.\n","authors":["Moshe Kimhi","Idan Kashani","Avi Mendelson","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2411.10573v1.pdf","comment":"Accepted to 4th NeurIPS Efficient Natural Language and Speech\n Processing Workshop (ENLSP-IV 2024)"},{"id":"http://arxiv.org/abs/2408.00884v2","updated":"2024-11-15T20:31:00Z","published":"2024-08-01T19:29:18Z","title":"Hybrid Querying Over Relational Databases and Large Language Models","summary":" Database queries traditionally operate under the closed-world assumption,\nproviding no answers to questions that require information beyond the data\nstored in the database. Hybrid querying using SQL offers an alternative by\nintegrating relational databases with large language models (LLMs) to answer\nbeyond-database questions. In this paper, we present the first cross-domain\nbenchmark, SWAN, containing 120 beyond-database questions over four real-world\ndatabases. To leverage state-of-the-art language models in addressing these\ncomplex questions in SWAN, we present two solutions: one based on schema\nexpansion and the other based on user defined functions. We also discuss\noptimization opportunities and potential future directions. Our evaluation\ndemonstrates that using GPT-4 Turbo with few-shot prompts, one can achieves up\nto 40.0\\% in execution accuracy and 48.2\\% in data factuality. 
These results\nhighlights both the potential and challenges for hybrid querying. We believe\nthat our work will inspire further research in creating more efficient and\naccurate data systems that seamlessly integrate relational databases and large\nlanguage models to address beyond-database questions.\n","authors":["Fuheng Zhao","Divyakant Agrawal","Amr El Abbadi"],"pdf_url":"https://arxiv.org/pdf/2408.00884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10557v1","updated":"2024-11-15T20:09:59Z","published":"2024-11-15T20:09:59Z","title":"mlan: language-based instruction tuning improves zero-shot\n generalization of multimodal large language models","summary":" We present a novel instruction tuning recipe to improve the zero-shot task\ngeneralization of multimodal large language models. In contrast to existing\ninstruction tuning mechanisms that heavily rely on visual instructions, our\napproach focuses on language-based instruction tuning, offering a distinct and\nmore training efficient path for multimodal instruction tuning. We evaluate the\nperformance of the proposed approach on 9 unseen datasets across both language\nand vision modalities. Our results show that our language-only instruction\ntuning is able to significantly improve the performance of two pretrained\nmultimodal models based on Llama 2 and Vicuna on those unseen datasets.\nInterestingly, the language instruction following ability also helps unlock the\nmodels to follow vision instructions without explicit training. Compared to the\nstate of the art multimodal instruction tuning approaches that are mainly based\non visual instructions, our language-based method not only achieves superior\nperformance but also significantly enhances training efficiency. 
For instance,\nthe language-only instruction tuning produces competitive average performance\nacross the evaluated datasets (with even better performance on language\ndatasets) with significant training efficiency improvements (on average 4x),\nthanks to the striking reduction in the need for vision data. With a small\nnumber of visual instructions, this emerging language instruction following\nability transfers well to the unseen vision datasets, outperforming the state\nof the art with greater training efficiency.\n","authors":["Jianhong Tu","Zhuohao Ni","Nicholas Crispino","Zihao Yu","Michael Bendersky","Beliz Gunel","Ruoxi Jia","Xin Liu","Lingjuan Lyu","Dawn Song","Chenguang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.10557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10545v1","updated":"2024-11-15T19:36:15Z","published":"2024-11-15T19:36:15Z","title":"Efficient Alignment of Large Language Models via Data Sampling","summary":" LLM alignment ensures that large language models behave safely and\neffectively by aligning their outputs with human values, goals, and intentions.\nAligning LLMs employ huge amounts of data, computation, and time. Moreover,\ncurating data with human feedback is expensive and takes time. Recent research\ndepicts the benefit of data engineering in the fine-tuning and pre-training\nparadigms to bring down such costs. However, alignment differs from the\nafore-mentioned paradigms and it is unclear if data efficient alignment is\nfeasible. In this work, we first aim to understand how the performance of LLM\nalignment scales with data. We find out that LLM alignment performance follows\nan exponential plateau pattern which tapers off post a rapid initial increase.\nBased on this, we identify data subsampling as a viable method to reduce\nresources required for alignment. 
Further, we propose an information\ntheory-based methodology for efficient alignment by identifying a small high\nquality subset thereby reducing the computation and time required by alignment.\nWe evaluate the proposed methodology over multiple datasets and compare the\nresults. We find that the model aligned using our proposed methodology\noutperforms other sampling methods and performs comparable to the model aligned\nwith the full dataset while using less than 10% data, leading to greater than\n90% savings in costs, resources, and faster LLM alignment.\n","authors":["Amrit Khera","Rajat Ghosh","Debojyoti Dutta"],"pdf_url":"https://arxiv.org/pdf/2411.10545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10543v1","updated":"2024-11-15T19:29:51Z","published":"2024-11-15T19:29:51Z","title":"SoftLMs: Efficient Adaptive Low-Rank Approximation of Language Models\n using Soft-Thresholding Mechanism","summary":" Extensive efforts have been made to boost the performance in the domain of\nlanguage models by introducing various attention-based transformers. However,\nthe inclusion of linear layers with large dimensions contributes to significant\ncomputational and memory overheads. The escalating computational demands of\nthese models necessitate the development of various compression techniques to\nensure their deployment on devices, particularly in resource-constrained\nenvironments. In this paper, we propose a novel compression methodology that\ndynamically determines the rank of each layer using a soft thresholding\nmechanism, which clips the singular values with a small magnitude in a\ndifferentiable form. This approach automates the decision-making process to\nidentify the optimal degree of compression for each layer. 
We have successfully\napplied the proposed technique to attention-based architectures, including BERT\nfor discriminative tasks and GPT2 and TinyLlama for generative tasks.\nAdditionally, we have validated our method on Mamba, a recently proposed\nstate-space model. Our experiments demonstrate that the proposed technique\nachieves a speed-up of 1.33X to 1.72X in the encoder/ decoder with a 50%\nreduction in total parameters.\n","authors":["Priyansh Bhatnagar","Linfeng Wen","Mingu Kang"],"pdf_url":"https://arxiv.org/pdf/2411.10543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10541v1","updated":"2024-11-15T19:26:38Z","published":"2024-11-15T19:26:38Z","title":"Does Prompt Formatting Have Any Impact on LLM Performance?","summary":" In the realm of Large Language Models (LLMs), prompt optimization is crucial\nfor model performance. Although previous research has explored aspects like\nrephrasing prompt contexts, using various prompting techniques (like in-context\nlearning and chain-of-thought), and ordering few-shot examples, our\nunderstanding of LLM sensitivity to prompt templates remains limited.\nTherefore, this paper examines the impact of different prompt templates on LLM\nperformance. We formatted the same contexts into various human-readable\ntemplates, including plain text, Markdown, JSON, and YAML, and evaluated their\nimpact across tasks like natural language reasoning, code generation, and\ntranslation using OpenAI's GPT models. Experiments show that GPT-3.5-turbo's\nperformance varies by up to 40\\% in a code translation task depending on the\nprompt template, while larger models like GPT-4 are more robust to these\nvariations. 
Our analysis highlights the need to reconsider the use of fixed\nprompt templates, as different formats can significantly affect model\nperformance.\n","authors":["Jia He","Mukund Rungta","David Koleczek","Arshdeep Sekhon","Franklin X Wang","Sadid Hasan"],"pdf_url":"https://arxiv.org/pdf/2411.10541v1.pdf","comment":"Submitted to NAACL 2025"}]},"2024-11-18T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.11839v1","updated":"2024-11-18T18:58:03Z","published":"2024-11-18T18:58:03Z","title":"RoboGSim: A Real2Sim2Real Robotic Gaussian Splatting Simulator","summary":" Efficient acquisition of real-world embodied data has been increasingly\ncritical. However, large-scale demonstrations captured by remote operation tend\nto take extremely high costs and fail to scale up the data size in an efficient\nmanner. Sampling the episodes under a simulated environment is a promising way\nfor large-scale collection while existing simulators fail to high-fidelity\nmodeling on texture and physics. To address these limitations, we introduce the\nRoboGSim, a real2sim2real robotic simulator, powered by 3D Gaussian Splatting\nand the physics engine. RoboGSim mainly includes four parts: Gaussian\nReconstructor, Digital Twins Builder, Scene Composer, and Interactive Engine.\nIt can synthesize the simulated data with novel views, objects, trajectories,\nand scenes. RoboGSim also provides an online, reproducible, and safe evaluation\nfor different manipulation policies. The real2sim and sim2real transfer\nexperiments show a high consistency in the texture and physics. Moreover, the\neffectiveness of synthetic data is validated under the real-world manipulated\ntasks. We hope RoboGSim serves as a closed-loop simulator for fair comparison\non policy learning. 
More information can be found on our project page\nhttps://robogsim.github.io/ .\n","authors":["Xinhai Li","Jialin Li","Ziheng Zhang","Rui Zhang","Fan Jia","Tiancai Wang","Haoqiang Fan","Kuo-Kun Tseng","Ruiping Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11833v1","updated":"2024-11-18T18:51:57Z","published":"2024-11-18T18:51:57Z","title":"Differentiable GPU-Parallelized Task and Motion Planning","summary":" We present a differentiable optimization-based framework for Task and Motion\nPlanning (TAMP) that is massively parallelizable on GPUs, enabling thousands of\nsampled seeds to be optimized simultaneously. Existing sampling-based\napproaches inherently disconnect the parameters by generating samples for each\nindependently and combining them through composition and rejection, while\noptimization-based methods struggle with highly non-convex constraints and\nlocal optima. Our method treats TAMP constraint satisfaction as optimizing a\nbatch of particles, each representing an assignment to a plan skeleton's\ncontinuous parameters. We represent the plan skeleton's constraints using\ndifferentiable cost functions, enabling us to compute the gradient of each\nparticle and update it toward satisfying solutions. Our use of GPU parallelism\nbetter covers the parameter space through scale, increasing the likelihood of\nfinding the global optima by exploring multiple basins through global sampling.\nWe demonstrate that our algorithm can effectively solve a highly constrained\nTetris packing problem using a Franka arm in simulation and deploy our planner\non a real robot arm. 
Website: https://williamshen-nz.github.io/gpu-tamp\n","authors":["William Shen","Caelan Garrett","Ankit Goyal","Tucker Hermans","Fabio Ramos"],"pdf_url":"https://arxiv.org/pdf/2411.11833v1.pdf","comment":"2-page paper presented at the CoRL 2024 Workshop on Differentiable\n Optimization Everywhere"},{"id":"http://arxiv.org/abs/2411.11812v1","updated":"2024-11-18T18:27:37Z","published":"2024-11-18T18:27:37Z","title":"cHyRRT and cHySST: Two Motion Planning Tools for Hybrid Dynamical\n Systems","summary":" This paper describes two C++/Open Motion Planning Library implementations of\nthe recently developed motion planning algorithms HyRRT arXiv:2210.15082v1\n[cs.RO] and HySST arXiv:2305.18649v1 [cs.RO]. Specifically, cHyRRT, an\nimplementation of the HyRRT algorithm, is capable of generating a solution to a\nmotion planning problem for hybrid systems with probabilistically completeness,\nwhile cHySST, an implementation of the asymptotically near-optimal HySST\nalgorithm, is capable of computing a trajectory to solve the optimal motion\nplanning problem for hybrid systems. cHyRRT is suitable for motion planning\nproblems where an optimal solution is not required, whereas cHySST is suitable\nfor such problems that prefer optimal solutions, within all feasible solutions.\nThe structure, components, and usage of the two tools are described. 
Examples\nare included to illustrate the main capabilities of the toolbox.\n","authors":["Beverly Xu","Nan Wang","Ricardo Sanfelice"],"pdf_url":"https://arxiv.org/pdf/2411.11812v1.pdf","comment":"This paper has 26 pages and has been submitted to 28th ACM\n International Conference on Hybrid Systems: Computation and Control"},{"id":"http://arxiv.org/abs/2407.05478v5","updated":"2024-11-18T18:23:40Z","published":"2024-07-07T19:33:30Z","title":"Sequential Gaussian Variational Inference for Nonlinear State Estimation\n and Its Application in Robot Navigation","summary":" Probabilistic state estimation is essential for robots navigating uncertain\nenvironments. Accurately and efficiently managing uncertainty in estimated\nstates is key to robust robotic operation. However, nonlinearities in robotic\nplatforms pose significant challenges that require advanced estimation\ntechniques. Gaussian variational inference (GVI) offers an optimization\nperspective on the estimation problem, providing analytically tractable\nsolutions and efficiencies derived from the geometry of Gaussian space. We\npropose a Sequential Gaussian Variational Inference (S-GVI) method to address\nnonlinearity and provide efficient sequential inference processes. Our approach\nintegrates sequential Bayesian principles into the GVI framework, which are\naddressed using statistical approximations and gradient updates on the\ninformation geometry. Validations through simulations and real-world\nexperiments demonstrate significant improvements in state estimation over the\nMaximum A Posteriori (MAP) estimation method.\n","authors":["Min-Won Seo","Solmaz S. 
Kia"],"pdf_url":"https://arxiv.org/pdf/2407.05478v5.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2411.11788v1","updated":"2024-11-18T18:03:49Z","published":"2024-11-18T18:03:49Z","title":"Enabling steep slope walking on Husky using reduced order modeling and\n quadratic programming","summary":" Wing-assisted inclined running (WAIR) observed in some young birds, is an\nattractive maneuver that can be extended to legged aerial systems. This study\nproposes a control method using a modified Variable Length Inverted Pendulum\n(VLIP) by assuming a fixed zero moment point and thruster forces collocated at\nthe center of mass of the pendulum. A QP MPC is used to find the optimal ground\nreaction forces and thruster forces to track a reference position and velocity\ntrajectory. Simulation results of this VLIP model on a slope of 40 degrees is\nmaintained and shows thruster forces that can be obtained through posture\nmanipulation. The simulation also provides insight to how the combined efforts\nof the thrusters and the tractive forces from the legs make WAIR possible in\nthruster-assisted legged systems.\n","authors":["Kaushik Venkatesh Krishnamurthy","Eric Sihite","Chenghao Wang","Shreyansh Pitroda","Adarsh Salagame","Alireza Ramezani","Morteza Gharib"],"pdf_url":"https://arxiv.org/pdf/2411.11788v1.pdf","comment":"6 pages, 8 figures, submitted to the Humanoids 2025 conference"},{"id":"http://arxiv.org/abs/2411.11777v1","updated":"2024-11-18T17:54:35Z","published":"2024-11-18T17:54:35Z","title":"Assistive Control of Knee Exoskeletons for Human Walking on Granular\n Terrains","summary":" Human walkers traverse diverse environments and demonstrate different gait\nlocomotion and energy cost on granular terrains compared to solid ground. We\npresent a stiffness-based model predictive control approach of knee exoskeleton\nassistance on sand. The gait and locomotion comparison is first discussed for\nhuman walkers on sand and solid ground. 
A machine learning-based estimation\nscheme is then presented to predict the ground reaction forces (GRFs) for human\nwalkers on different terrains in real time. Built on the estimated GRFs and\nhuman joint torques, a knee exoskeleton controller is designed to provide\nassistive torque through a model predictive stiffness control scheme. We\nconduct indoor and outdoor experiments to validate the modeling and control\ndesign and their performance. The experiments demonstrate the major muscle\nactivation and metabolic reductions by respectively 15% and 3.7% under the\nassistive exoskeleton control of human walking on sand.\n","authors":["Chunchu Zhu","Xunjie Chen","Jingang Yi"],"pdf_url":"https://arxiv.org/pdf/2411.11777v1.pdf","comment":"Eight pages, eleven figures, submitted to IEEE Robotics and\n Automation Letters"},{"id":"http://arxiv.org/abs/2411.07534v2","updated":"2024-11-18T17:49:46Z","published":"2024-11-12T04:19:25Z","title":"Effective Virtual Reality Teleoperation of an Upper-body Humanoid with\n Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision\n Avoidance","summary":" We present an approach for retartgeting off-the-shelf Virtual Reality (VR)\ntrackers to effectively teleoperate an upper-body humanoid while ensuring\nself-collision-free motions. Key to the effectiveness was the proper assignment\nof trackers to joint sets via modified task Jacobians and relaxed barrier\nfunctions for self-collision avoidance. 
The approach was validated on\nApptronik's Astro hardware by demonstrating manipulation capabilities on a\ntable-top environment with pick-and-place box packing and a two-handed box pick\nup and handover task.\n","authors":["Steven Jens Jorgensen","Ravi Bhadeshiya"],"pdf_url":"https://arxiv.org/pdf/2411.07534v2.pdf","comment":"First Prize Winner of Horizons of an extended robotics reality\n Workshop at International Conference on Intelligent Robots and Systems, 2022"},{"id":"http://arxiv.org/abs/2411.11762v1","updated":"2024-11-18T17:40:43Z","published":"2024-11-18T17:40:43Z","title":"High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous\n Electric Vehicles","summary":" Executing drift maneuvers during high-speed cornering presents significant\nchallenges for autonomous vehicles, yet offers the potential to minimize\nturning time and enhance driving dynamics. While reinforcement learning (RL)\nhas shown promising results in simulated environments, discrepancies between\nsimulations and real-world conditions have limited its practical deployment.\nThis study introduces an innovative control framework that integrates\ntrajectory optimization with drift maneuvers, aiming to improve the algorithm's\nadaptability for real-vehicle implementation. We leveraged Bezier-based\npre-trajectory optimization to enhance rewards and optimize the controller\nthrough Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated\nenvironment. For real-world deployment, we implement a hybrid RL-MPC fusion\nmechanism, , where TD3-derived maneuvers serve as primary inputs for a Model\nPredictive Controller (MPC). This integration enables precise real-time\ntracking of the optimal trajectory, with MPC providing corrective inputs to\nbridge the gap between simulation and reality. The efficacy of this method is\nvalidated through real-vehicle tests on consumer-grade electric vehicles,\nfocusing on drift U-turns and drift right-angle turns. 
The control outcomes of\nthese real-vehicle tests are thoroughly documented in the paper, supported by\nsupplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this\nstudy is the first to deploy and apply an RL-based transient drift cornering\nalgorithm on consumer-grade electric vehicles.\n","authors":["Shiyue Zhao","Junzhi Zhang","Neda Masoud","Yuhong Jiang","Heye Huang","Tao Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11762v1.pdf","comment":"In the process of being submitted to the Journal of IEEE Transactions\n on Industrial Electronics"},{"id":"http://arxiv.org/abs/2411.11734v1","updated":"2024-11-18T17:02:15Z","published":"2024-11-18T17:02:15Z","title":"Joint-Space Control of a Structurally Elastic Humanoid Robot","summary":" In this work, the joint-control strategy is presented for the humanoid robot,\nPANDORA, whose structural components are designed to be compliant. As opposed\nto contemporary approaches which design the elasticity internal to the actuator\nhousing, PANDORA's structural components are designed to be compliant under\nload or, in other words, structurally elastic. To maintain the rapid design\nbenefit of additive manufacturing, this joint control strategy employs a\ndisturbance observer (DOB) modeled from an ideal elastic actuator. This robust\ncontroller treats the model variation from the structurally elastic components\nas a disturbance and eliminates the need for system identification of the 3D\nprinted parts. This enables mechanical design engineers to iterate on the 3D\nprinted linkages without requiring consistent tuning from the joint controller.\nTwo sets of hardware results are presented for validating the controller. The\nfirst set of results are conducted on an ideal elastic actuator testbed that\ndrives an unmodeled, 1 DoF weighted pendulum with a 10 kg mass. The results\nsupport the claim that the DOB can handle significant model variation. 
The\nsecond set of results is from a robust balancing experiment conducted on the 12\nDoF lower body of PANDORA. The robot maintains balance while an operator\napplies 50 N pushes to the pelvis, where the actuator tracking results are\npresented for the left leg.\n","authors":["Connor W. Herron","Christian Runyon","Isaac Pressgrove","Benjamin C. Beiter","Bhaben Kalita","Alexander Leonessa"],"pdf_url":"https://arxiv.org/pdf/2411.11734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11733v1","updated":"2024-11-18T17:00:52Z","published":"2024-11-18T17:00:52Z","title":"Integrating Active Sensing and Rearrangement Planning for Efficient\n Object Retrieval from Unknown, Confined, Cluttered Environments","summary":" Retrieving target objects from unknown, confined spaces remains a challenging\ntask that requires integrated, task-driven active sensing and rearrangement\nplanning. Previous approaches have independently addressed active sensing and\nrearrangement planning, limiting their practicality in real-world scenarios.\nThis paper presents a new, integrated heuristic-based active sensing and\nMonte-Carlo Tree Search (MCTS)-based retrieval planning approach. These\ncomponents provide feedback to one another to actively sense critical,\nunobserved areas suitable for the retrieval planner to plan a sequence for\nrelocating path-blocking obstacles and a collision-free trajectory for\nretrieving the target object. We demonstrate the effectiveness of our approach\nusing a robot arm equipped with an in-hand camera in both simulated and\nreal-world confined, cluttered scenarios. Our framework is compared against\nvarious state-of-the-art methods. The results indicate that our proposed\napproach outperforms baseline methods by a significant margin in terms of the\nsuccess rate, the object rearrangement planning time consumption and the number\nof planning trials before successfully retrieving the target. 
Videos can be\nfound at https://youtu.be/tea7I-3RtV0.\n","authors":["Junyong Kim","Hanwen Ren","Ahmed H. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2411.11733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11714v1","updated":"2024-11-18T16:42:07Z","published":"2024-11-18T16:42:07Z","title":"Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via\n Skill Library and Tactile Representation","summary":" Deploying robots in open-world environments involves complex tasks\ncharacterized by long sequences and rich interactions, necessitating efficient\ntransfer of robotic skills across diverse and complex scenarios. To address\nthis challenge, we propose a skill library framework based on knowledge graphs,\nwhich endows robots with high-level skill awareness and spatial semantic\nunderstanding. The framework hierarchically organizes operational knowledge by\nconstructing a \"task graph\" and a \"scene graph\" to represent task and scene\nsemantic information, respectively. We introduce a \"state graph\" to facilitate\ninteraction between high-level task planning and low-level scene information.\nFurthermore, we propose a hierarchical transfer framework for operational\nskills. At the task level, the framework integrates contextual learning and\nchain-of-thought prompting within a four-stage prompt paradigm, leveraging\nlarge language models' (LLMs) reasoning and generalization capabilities to\nachieve task-level subtask sequence transfer. At the motion level, an adaptive\ntrajectory transfer method is developed using the A* algorithm and the skill\nlibrary, enabling motion-level adaptive trajectory transfer. At the physical\nlevel, we introduce an adaptive contour extraction and posture perception\nmethod based on tactile perception. 
This method dynamically obtains\nhigh-precision contour and posture information from visual-tactile texture data\nand adjusts transferred skills, such as contact positions and postures, to\nensure effectiveness in new environments. Experimental results validate the\neffectiveness of the proposed methods. Project\nwebsite:https://github.com/MingchaoQi/skill_transfer\n","authors":["Mingchao Qi","Yuanjin Li","Xing Liu","Zhengxiong Liu","Panfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.11714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11683v1","updated":"2024-11-18T16:09:26Z","published":"2024-11-18T16:09:26Z","title":"TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the\n Physical World","summary":" Robotic manipulation refers to the autonomous handling and interaction of\nrobots with objects using advanced techniques in robotics and artificial\nintelligence. The advent of powerful tools such as large language models (LLMs)\nand large vision-language models (LVLMs) has significantly enhanced the\ncapabilities of these robots in environmental perception and decision-making.\nHowever, the introduction of these intelligent agents has led to security\nthreats such as jailbreak attacks and adversarial attacks.\n In this research, we take a further step by proposing a backdoor attack\nspecifically targeting robotic manipulation and, for the first time,\nimplementing backdoor attack in the physical world. By embedding a backdoor\nvisual language model into the visual perception module within the robotic\nsystem, we successfully mislead the robotic arm's operation in the physical\nworld, given the presence of common items as triggers. 
Experimental evaluations\nin the physical world demonstrate the effectiveness of the proposed backdoor\nattack.\n","authors":["Xianlong Wang","Hewen Pan","Hangtao Zhang","Minghui Li","Shengshan Hu","Ziqi Zhou","Lulu Xue","Peijin Guo","Yichen Wang","Wei Wan","Aishan Liu","Leo Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11683v1.pdf","comment":"Initial version with preliminary results. We welcome any feedback or\n suggestions"},{"id":"http://arxiv.org/abs/2411.11637v1","updated":"2024-11-18T15:15:24Z","published":"2024-11-18T15:15:24Z","title":"The ethical landscape of robot-assisted surgery. A systematic review","summary":" Background: Robot-assisted surgery has been widely adopted in recent years.\nHowever, compared to other health technologies operating in close proximity to\npatients in a vulnerable state, ethical issues of robot-assisted surgery have\nreceived less attention. Against the background of increasing automation that\nare expected to raise new ethical issues, this systematic review aims to map\nthe state of the ethical debate in this field.\n Methods: A protocol was registered in the international prospective register\nof systematic reviews (PROSPERO CRD42023397951). Medline via PubMed, EMBASE,\nCINHAL, Philosophers' Index, IEEE Xplorer, Web of Science (Core Collection),\nScopus and Google Scholar were searched in January 2023. Screening, extraction,\nand analysis were conducted independently by two authors. A qualitative\nnarrative synthesis was performed.\n Results: Out of 1,723 records, 66 records were included in the final dataset.\nSeven major strands of the ethical debate emerged during analysis. 
These\ninclude questions of harms and benefits, responsibility and control,\nprofessional-patient relationship, ethical issues in surgical training and\nlearning, justice, translational questions, and economic considerations.\n Discussion: The identified themes testify to a broad range of different and\ndiffering ethical issues requiring careful deliberation and integration into\nthe surgical ethos. Looking forward, we argue that a different perspective in\naddressing robotic surgical devices might be helpful to consider upcoming\nchallenges of automation.\n","authors":["Joschka Haltaufderheide","Stefanie Pfisterer-Heise","Dawid Pieper","Robert Ranisch"],"pdf_url":"https://arxiv.org/pdf/2411.11637v1.pdf","comment":"25 pages, 3 tables, 2 figures"},{"id":"http://arxiv.org/abs/2411.11616v1","updated":"2024-11-18T14:42:15Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. 
The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v1.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2411.11609v1","updated":"2024-11-18T14:30:46Z","published":"2024-11-18T14:30:46Z","title":"VLN-Game: Vision-Language Equilibrium Search for Zero-Shot Semantic\n Navigation","summary":" Following human instructions to explore and search for a specified target in\nan unfamiliar environment is a crucial skill for mobile service robots. Most of\nthe previous works on object goal navigation have typically focused on a single\ninput modality as the target, which may lead to limited consideration of\nlanguage descriptions containing detailed attributes and spatial relationships.\nTo address this limitation, we propose VLN-Game, a novel zero-shot framework\nfor visual target navigation that can process object names and descriptive\nlanguage targets effectively. To be more precise, our approach constructs a 3D\nobject-centric spatial map by integrating pre-trained visual-language features\nwith a 3D reconstruction of the physical environment. Then, the framework\nidentifies the most promising areas to explore in search of potential target\ncandidates. A game-theoretic vision language model is employed to determine\nwhich target best matches the given language description. 
Experiments conducted\non the Habitat-Matterport 3D (HM3D) dataset demonstrate that the proposed\nframework achieves state-of-the-art performance in both object goal navigation\nand language-based navigation tasks. Moreover, we show that VLN-Game can be\neasily deployed on real-world robots. The success of VLN-Game highlights the\npromising potential of using game-theoretic methods with compact\nvision-language models to advance decision-making capabilities in robotic\nsystems. The supplementary video and code can be accessed via the following\nlink: https://sites.google.com/view/vln-game.\n","authors":["Bangguo Yu","Yuzhen Liu","Lei Han","Hamidreza Kasaei","Tingguang Li","Ming Cao"],"pdf_url":"https://arxiv.org/pdf/2411.11609v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.11607v1","updated":"2024-11-18T14:29:22Z","published":"2024-11-18T14:29:22Z","title":"Performance evaluation of a ROS2 based Automated Driving System","summary":" Automated driving is currently a prominent area of scientific work. In the\nfuture, highly automated driving and new Advanced Driver Assistance Systems\nwill become reality. While Advanced Driver Assistance Systems and automated\ndriving functions for certain domains are already commercially available,\nubiquitous automated driving in complex scenarios remains a subject of ongoing\nresearch. Contrarily to single-purpose Electronic Control Units, the software\nfor automated driving is often executed on high performance PCs. The Robot\nOperating System 2 (ROS2) is commonly used to connect components in an\nautomated driving system. Due to the time critical nature of automated driving\nsystems, the performance of the framework is especially important. In this\npaper, a thorough performance evaluation of ROS2 is conducted, both in terms of\ntimeliness and error rate. 
The results show that ROS2 is a suitable framework\nfor automated driving systems.\n","authors":["Jorin Kouril","Bernd Schäufele","Ilja Radusch","Bettina Schnor"],"pdf_url":"https://arxiv.org/pdf/2411.11607v1.pdf","comment":"Published and presented at VEHITS 2024, Proceedings of the 10th\n International Conference on Vehicle Technology and Intelligent Transport\n Systems - VEHITS; 2024"},{"id":"http://arxiv.org/abs/2408.11048v2","updated":"2024-11-18T14:14:22Z","published":"2024-08-20T17:56:52Z","title":"RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual\n Dexterous Robot Hands","summary":" It has been a long-standing research goal to endow robot hands with\nhuman-level dexterity. Bi-manual robot piano playing constitutes a task that\ncombines challenges from dynamic tasks, such as generating fast while precise\nmotions, with slower but contact-rich manipulation problems. Although\nreinforcement learning based approaches have shown promising results in\nsingle-task performance, these methods struggle in a multi-song setting. Our\nwork aims to close this gap and, thereby, enable imitation learning approaches\nfor robot piano playing at scale. To this end, we introduce the Robot Piano 1\nMillion (RP1M) dataset, containing bi-manual robot piano playing motion data of\nmore than one million trajectories. We formulate finger placements as an\noptimal transport problem, thus, enabling automatic annotation of vast amounts\nof unlabeled songs. Benchmarking existing imitation learning approaches shows\nthat such approaches reach state-of-the-art robot piano playing performance by\nleveraging RP1M.\n","authors":["Yi Zhao","Le Chen","Jan Schneider","Quankai Gao","Juho Kannala","Bernhard Schölkopf","Joni Pajarinen","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2408.11048v2.pdf","comment":"Accepted by Conference on Robot Learning (CoRL) 2024. 
Project\n Website: https://rp1m.github.io/"},{"id":"http://arxiv.org/abs/2411.11510v1","updated":"2024-11-18T12:15:16Z","published":"2024-11-18T12:15:16Z","title":"Closed-loop multi-step planning with innate physics knowledge","summary":" We present a hierarchical framework to solve robot planning as an input\ncontrol problem. At the lowest level are temporary closed control loops,\n(\"tasks\"), each representing a behaviour, contingent on a specific sensory\ninput and therefore temporary. At the highest level, a supervising\n\"Configurator\" directs task creation and termination. Here resides \"core\"\nknowledge as a physics engine, where sequences of tasks can be simulated. The\nConfigurator encodes and interprets simulation results,based on which it can\nchoose a sequence of tasks as a plan. We implement this framework on a real\nrobot and test it in an overtaking scenario as proof-of-concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2411.11510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11497v1","updated":"2024-11-18T11:58:20Z","published":"2024-11-18T11:58:20Z","title":"Physics Encoded Blocks in Residual Neural Network Architectures for\n Digital Twin Models","summary":" Physics Informed Machine Learning has emerged as a popular approach in\nmodelling and simulation for digital twins to generate accurate models of\nprocesses and behaviours of real-world systems. However, despite their success\nin generating accurate and reliable models, the existing methods either use\nsimple regularizations in loss functions to offer limited physics integration\nor are too specific in architectural definitions to be generalized to a wide\nvariety of physical systems. 
This paper presents a generic approach based on a\nnovel physics-encoded residual neural network architecture to combine\ndata-driven and physics-based analytical models to address these limitations.\nOur method combines physics blocks as mathematical operators from physics-based\nmodels with learning blocks comprising feed-forward layers. Intermediate\nresidual blocks are incorporated for stable gradient flow as they train on\nphysical system observation data. This way, the model learns to comply with the\ngeometric and kinematic aspects of the physical system. Compared to\nconventional neural network-based methods, our method improves generalizability\nwith substantially low data requirements and model complexity in terms of\nparameters, especially in scenarios where prior physics knowledge is either\nelementary or incomplete. We investigate our approach in two application\ndomains. The first is a basic robotic motion model using Euler Lagrangian\nequations of motion as physics prior. The second application is a complex\nscenario of a steering model for a self-driving vehicle in a simulation. In\nboth applications, our method outperforms both conventional neural network\nbased approaches as-well as state-of-the-art Physics Informed Machine Learning\nmethods.\n","authors":["Muhammad Saad Zia","Ashiq Anjum","Lu Liu","Anthony Conway","Anasol Pena Rios"],"pdf_url":"https://arxiv.org/pdf/2411.11497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11483v1","updated":"2024-11-18T11:42:20Z","published":"2024-11-18T11:42:20Z","title":"Robust State Estimation for Legged Robots with Dual Beta Kalman Filter","summary":" Existing state estimation algorithms for legged robots that rely on\nproprioceptive sensors often overlook foot slippage and leg deformation in the\nphysical world, leading to large estimation errors. 
To address this limitation,\nwe propose a comprehensive measurement model that accounts for both foot\nslippage and variable leg length by analyzing the relative motion between foot\ncontact points and the robot's body center. We show that leg length is an\nobservable quantity, meaning that its value can be explicitly inferred by\ndesigning an auxiliary filter. To this end, we introduce a dual estimation\nframework that iteratively employs a parameter filter to estimate the leg\nlength parameters and a state filter to estimate the robot's state. To prevent\nerror accumulation in this iterative framework, we construct a partial\nmeasurement model for the parameter filter using the leg static equation. This\napproach ensures that leg length estimation relies solely on joint torques and\nfoot contact forces, avoiding the influence of state estimation errors on the\nparameter estimation. Unlike leg length which can be directly estimated, foot\nslippage cannot be measured directly with the current sensor configuration.\nHowever, since foot slippage occurs at a low frequency, it can be treated as\noutliers in the measurement data. To mitigate the impact of these outliers, we\npropose the beta Kalman filter (beta KF), which redefines the estimation loss\nin canonical Kalman filtering using beta divergence. This divergence can assign\nlow weights to outliers in an adaptive manner, thereby enhancing the robustness\nof the estimation algorithm. These techniques together form the dual\nbeta-Kalman filter (Dual beta KF), a novel algorithm for robust state\nestimation in legged robots. 
Experimental results on the Unitree GO2 robot\ndemonstrate that the Dual beta KF significantly outperforms state-of-the-art\nmethods.\n","authors":["Tianyi Zhang","Wenhan Cao","Chang Liu","Tao Zhang","Jiangtao Li","Shengbo Eben Li"],"pdf_url":"https://arxiv.org/pdf/2411.11483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11481v1","updated":"2024-11-18T11:36:17Z","published":"2024-11-18T11:36:17Z","title":"Exploring Emerging Trends and Research Opportunities in Visual Place\n Recognition","summary":" Visual-based recognition, e.g., image classification, object detection, etc.,\nis a long-standing challenge in computer vision and robotics communities.\nConcerning the roboticists, since the knowledge of the environment is a\nprerequisite for complex navigation tasks, visual place recognition is vital\nfor most localization implementations or re-localization and loop closure\ndetection pipelines within simultaneous localization and mapping (SLAM). More\nspecifically, it corresponds to the system's ability to identify and match a\npreviously visited location using computer vision tools. Towards developing\nnovel techniques with enhanced accuracy and robustness, while motivated by the\nsuccess presented in natural language processing methods, researchers have\nrecently turned their attention to vision-language models, which integrate\nvisual and textual data.\n","authors":["Antonios Gasteratos","Konstantinos A. Tsintotas","Tobias Fischer","Yiannis Aloimonos","Michael Milford"],"pdf_url":"https://arxiv.org/pdf/2411.11481v1.pdf","comment":"2 pages, 1 figure. 
40th Anniversary of the IEEE Conference on\n Robotics and Automation (ICRA@40), Rotterdam, Netherlands, September 23-26,\n 2024"},{"id":"http://arxiv.org/abs/2410.08848v2","updated":"2024-11-18T09:34:30Z","published":"2024-10-11T14:25:23Z","title":"Learning Spatial Bimanual Action Models Based on Affordance Regions and\n Human Demonstrations","summary":" In this paper, we present a novel approach for learning bimanual manipulation\nactions from human demonstration by extracting spatial constraints between\naffordance regions, termed affordance constraints, of the objects involved.\nAffordance regions are defined as object parts that provide interaction\npossibilities to an agent. For example, the bottom of a bottle affords the\nobject to be placed on a surface, while its spout affords the contained liquid\nto be poured. We propose a novel approach to learn changes of affordance\nconstraints in human demonstration to construct spatial bimanual action models\nrepresenting object interactions. To exploit the information encoded in these\nspatial bimanual action models, we formulate an optimization problem to\ndetermine optimal object configurations across multiple execution keypoints\nwhile taking into account the initial scene, the learned affordance\nconstraints, and the robot's kinematics. We evaluate the approach in simulation\nwith two example tasks (pouring drinks and rolling dough) and compare three\ndifferent definitions of affordance constraints: (i) component-wise distances\nbetween affordance regions in Cartesian space, (ii) component-wise distances\nbetween affordance regions in cylindrical space, and (iii) degrees of\nsatisfaction of manually defined symbolic spatial affordance constraints.\n","authors":["Björn S. 
Plonka","Christian Dreher","Andre Meixner","Rainer Kartmann","Tamim Asfour"],"pdf_url":"https://arxiv.org/pdf/2410.08848v2.pdf","comment":"8 pages, accepted for publication at Humanoids 2024 - Copyright IEEE"},{"id":"http://arxiv.org/abs/2411.11409v1","updated":"2024-11-18T09:30:05Z","published":"2024-11-18T09:30:05Z","title":"IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet\n Videos","summary":" Shape assembly is a ubiquitous task in daily life, integral for constructing\ncomplex 3D structures like IKEA furniture. While significant progress has been\nmade in developing autonomous agents for shape assembly, existing datasets have\nnot yet tackled the 4D grounding of assembly instructions in videos, essential\nfor a holistic understanding of assembly in 3D space over time. We introduce\nIKEA Video Manuals, a dataset that features 3D models of furniture parts,\ninstructional manuals, assembly videos from the Internet, and most importantly,\nannotations of dense spatio-temporal alignments between these data modalities.\nTo demonstrate the utility of IKEA Video Manuals, we present five applications\nessential for shape assembly: assembly plan generation, part-conditioned\nsegmentation, part-conditioned pose estimation, video object segmentation, and\nfurniture assembly based on instructional video manuals. For each application,\nwe provide evaluation metrics and baseline methods. 
Through experiments on our\nannotated data, we highlight many challenges in grounding assembly instructions\nin videos to improve shape assembly, including handling occlusions, varying\nviewpoints, and extended assembly sequences.\n","authors":["Yunong Liu","Cristobal Eyzaguirre","Manling Li","Shubh Khanna","Juan Carlos Niebles","Vineeth Ravi","Saumitra Mishra","Weiyu Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11409v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.11406v1","updated":"2024-11-18T09:28:11Z","published":"2024-11-18T09:28:11Z","title":"Bridging the Resource Gap: Deploying Advanced Imitation Learning Models\n onto Affordable Embedded Platforms","summary":" Advanced imitation learning with structures like the transformer is\nincreasingly demonstrating its advantages in robotics. However, deploying these\nlarge-scale models on embedded platforms remains a major challenge. In this\npaper, we propose a pipeline that facilitates the migration of advanced\nimitation learning algorithms to edge devices. The process is achieved via an\nefficient model compression method and a practical asynchronous parallel method\nTemporal Ensemble with Dropped Actions (TEDA) that enhances the smoothness of\noperations. 
To show the efficiency of the proposed pipeline, large-scale\nimitation learning models are trained on a server and deployed on an edge\ndevice to complete various manipulation tasks.\n","authors":["Haizhou Ge","Ruixiang Wang","Zhu-ang Xu","Hongrui Zhu","Ruichen Deng","Yuhang Dong","Zeyu Pang","Guyue Zhou","Junyu Zhang","Lu Shi"],"pdf_url":"https://arxiv.org/pdf/2411.11406v1.pdf","comment":"Accepted by the 2024 IEEE International Conference on Robotics and\n Biomimetics (IEEE ROBIO 2024)"},{"id":"http://arxiv.org/abs/2411.11405v1","updated":"2024-11-18T09:27:49Z","published":"2024-11-18T09:27:49Z","title":"Extended Neural Contractive Dynamical Systems: On Multiple Tasks and\n Riemannian Safety Regions","summary":" Stability guarantees are crucial when ensuring that a fully autonomous robot\ndoes not take undesirable or potentially harmful actions. We recently proposed\nthe Neural Contractive Dynamical Systems (NCDS), which is a neural network\narchitecture that guarantees contractive stability. With this,\nlearning-from-demonstrations approaches can trivially provide stability\nguarantees. However, our early work left several unanswered questions, which we\nhere address. Beyond providing an in-depth explanation of NCDS, this paper\nextends the framework with more careful regularization, a conditional variant\nof the framework for handling multiple tasks, and an uncertainty-driven\napproach to latent obstacle avoidance. 
Experiments verify that the developed\nsystem has the flexibility of ordinary neural networks while providing the\nstability guarantees needed for autonomous robotics.\n","authors":["Hadi Beik Mohammadi","Søren Hauberg","Georgios Arvanitidis","Gerhard Neumann","Leonel Rozo"],"pdf_url":"https://arxiv.org/pdf/2411.11405v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.09352"},{"id":"http://arxiv.org/abs/2411.11394v1","updated":"2024-11-18T09:11:48Z","published":"2024-11-18T09:11:48Z","title":"InstruGen: Automatic Instruction Generation for Vision-and-Language\n Navigation Via Large Multimodal Models","summary":" Recent research on Vision-and-Language Navigation (VLN) indicates that agents\nsuffer from poor generalization in unseen environments due to the lack of\nrealistic training environments and high-quality path-instruction pairs. Most\nexisting methods for constructing realistic navigation scenes have high costs,\nand the extension of instructions mainly relies on predefined templates or\nrules, lacking adaptability. To alleviate the issue, we propose InstruGen, a\nVLN path-instruction pairs generation paradigm. Specifically, we use YouTube\nhouse tour videos as realistic navigation scenes and leverage the powerful\nvisual understanding and generation abilities of large multimodal models (LMMs)\nto automatically generate diverse and high-quality VLN path-instruction pairs.\nOur method generates navigation instructions with different granularities and\nachieves fine-grained alignment between instructions and visual observations,\nwhich was difficult to achieve with previous methods. Additionally, we design a\nmulti-stage verification mechanism to reduce hallucinations and inconsistency\nof LMMs. 
Experimental results demonstrate that agents trained with\npath-instruction pairs generated by InstruGen achieves state-of-the-art\nperformance on the R2R and RxR benchmarks, particularly in unseen environments.\nCode is available at https://github.com/yanyu0526/InstruGen.\n","authors":["Yu Yan","Rongtao Xu","Jiazhao Zhang","Peiyang Li","Xiaodan Liang","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2411.11394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10541v2","updated":"2024-11-18T07:06:09Z","published":"2024-04-16T13:09:48Z","title":"Robotic Sensor Network: Achieving Mutual Communication Control\n Assistance With Fast Cross-Layer Optimization","summary":" Robotic sensor network (RSN) is an emerging paradigm that harvests data from\nremote sensors adopting mobile robots. However, communication and control\nfunctionalities in RSNs are interdependent, for which existing approaches\nbecome inefficient, as they plan robot trajectories merely based on\nunidirectional impact between communication and control. This paper proposes\nthe concept of mutual communication control assistance (MCCA), which leverages\na model predictive communication and control (MPC2) design for intertwined\noptimization of motion-assisted communication and communication-assisted\ncollision avoidance. The MPC2 problem jointly optimizes the cross-layer\nvariables of sensor powers and robot actions, and is solved by alternating\noptimization, strong duality, and cross-horizon minorization maximization in\nreal time. 
This approach contrasts with conventional communication control\nco-design methods that calculate an offline non-executable trajectory.\nExperiments in a high-fidelity RSN simulator demonstrate that the proposed MCCA\noutperforms various benchmarks in terms of communication efficiency and\nnavigation time.\n","authors":["Zhiyou Ji","Yujie Wan","Guoliang Li","Shuai Wang","Kejiang Ye","Derrick Wing Kwan Ng","Chengzhong Xu"],"pdf_url":"https://arxiv.org/pdf/2404.10541v2.pdf","comment":"5 pages, 6 figures, to appear in IEEE Wireless Communications Letters"},{"id":"http://arxiv.org/abs/2405.09822v2","updated":"2024-11-18T07:05:33Z","published":"2024-05-16T05:39:08Z","title":"SEEK: Semantic Reasoning for Object Goal Navigation in Real World\n Inspection Tasks","summary":" This paper addresses the problem of object-goal navigation in autonomous\ninspections in real-world environments. Object-goal navigation is crucial to\nenable effective inspections in various settings, often requiring the robot to\nidentify the target object within a large search space. Current object\ninspection methods fall short of human efficiency because they typically cannot\nbootstrap prior and common sense knowledge as humans do. In this paper, we\nintroduce a framework that enables robots to use semantic knowledge from prior\nspatial configurations of the environment and semantic common sense knowledge.\nWe propose SEEK (Semantic Reasoning for Object Inspection Tasks) that combines\nsemantic prior knowledge with the robot's observations to search for and\nnavigate toward target objects more efficiently. SEEK maintains two\nrepresentations: a Dynamic Scene Graph (DSG) and a Relational Semantic Network\n(RSN). The RSN is a compact and practical model that estimates the probability\nof finding the target object across spatial elements in the DSG. We propose a\nnovel probabilistic planning framework to search for the object using\nrelational semantic knowledge. 
Our simulation analyses demonstrate that SEEK\noutperforms the classical planning and Large Language Models (LLMs)-based\nmethods that are examined in this study in terms of efficiency for object-goal\ninspection tasks. We validated our approach on a physical legged robot in urban\nenvironments, showcasing its practicality and effectiveness in real-world\ninspection scenarios.\n","authors":["Muhammad Fadhil Ginting","Sung-Kyun Kim","David D. Fan","Matteo Palieri","Mykel J. Kochenderfer","Ali-akbar Agha-Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2405.09822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11323v1","updated":"2024-11-18T06:33:05Z","published":"2024-11-18T06:33:05Z","title":"SayComply: Grounding Field Robotic Tasks in Operational Compliance\n through Retrieval-Based Language Models","summary":" This paper addresses the problem of task planning for robots that must comply\nwith operational manuals in real-world settings. Task planning under these\nconstraints is essential for enabling autonomous robot operation in domains\nthat require adherence to domain-specific knowledge. Current methods for\ngenerating robot goals and plans rely on common sense knowledge encoded in\nlarge language models. However, these models lack grounding of robot plans to\ndomain-specific knowledge and are not easily transferable between multiple\nsites or customers with different compliance needs. In this work, we present\nSayComply, which enables grounding robotic task planning with operational\ncompliance using retrieval-based language models. We design a hierarchical\ndatabase of operational, environment, and robot embodiment manuals and\nprocedures to enable efficient retrieval of the relevant context under the\nlimited context length of the LLMs. 
We then design a task planner using a\ntree-based retrieval augmented generation (RAG) technique to generate robot\ntasks that follow user instructions while simultaneously complying with the\ndomain knowledge in the database. We demonstrate the benefits of our approach\nthrough simulations and hardware experiments in real-world scenarios that\nrequire precise context retrieval across various types of context,\noutperforming the standard RAG method. Our approach bridges the gap in\ndeploying robots that consistently adhere to operational protocols, offering a\nscalable and edge-deployable solution for ensuring compliance across varied and\ncomplex real-world environments. Project website: saycomply.github.io.\n","authors":["Muhammad Fadhil Ginting","Dong-Ki Kim","Sung-Kyun Kim","Bandi Jai Krishna","Mykel J. Kochenderfer","Shayegan Omidshafiei","Ali-akbar Agha-mohammadi"],"pdf_url":"https://arxiv.org/pdf/2411.11323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11306v1","updated":"2024-11-18T06:03:04Z","published":"2024-11-18T06:03:04Z","title":"Design a New Pulling Gear for the Automated Pant Bottom Hem Sewing\n Machine","summary":" Automated machinery design for garment manufacturing is essential for\nimproving productivity, consistency, and quality. This paper focuses on the\ndevelopment of new pulling gear for automated pant bottom hem sewing machines.\nTraditionally, these machines require manual intervention to guide the bottom\nhem sewing process, which often leads to inconsistent stitch quality and\nalignment. While twin-needle sewing machines can create twin lines for the\nbottom hem, they typically lack sufficient pulling force to adequately handle\nthe fabric of the pants' bottom hem. The innovative design of the pulling gear\naims to address this issue by providing the necessary pulling force for the\nbottom hem of eyelet pants. 
The research and design discussed in this article\nseek to solve technical challenges, eliminate the need for skilled manual\noperators, and enhance overall productivity. This improvement ensures smooth\nand precise feeding of fabric pieces in the automated twin needle sewing\nmachine, ultimately improving the consistency and quality of the stitching. By\nintegrating this innovation, garment manufacturers can boost productivity,\nreduce reliance on manual skilful labour, and optimize the output of the\nproduction process, thereby reaping the benefits of automation in the garment\nmanufacturing industry.\n","authors":["Ray Wai Man Kong","Theodore Ho Tin Kong","Miao Yi","Zerui Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11306v1.pdf","comment":"9 pages,11 figures, preprint to International Research Journal of\n Modernization in Engineering Technology and Science"},{"id":"http://arxiv.org/abs/2411.11252v1","updated":"2024-11-18T03:00:33Z","published":"2024-11-18T03:00:33Z","title":"DrivingSphere: Building a High-fidelity 4D World for Closed-loop\n Simulation","summary":" Autonomous driving evaluation requires simulation environments that closely\nreplicate actual road conditions, including real-world sensory data and\nresponsive feedback loops. However, many existing simulations need to predict\nwaypoints along fixed routes on public datasets or synthetic photorealistic\ndata, \\ie, open-loop simulation usually lacks the ability to assess dynamic\ndecision-making. While the recent efforts of closed-loop simulation offer\nfeedback-driven environments, they cannot process visual sensor inputs or\nproduce outputs that differ from real-world data. To address these challenges,\nwe propose DrivingSphere, a realistic and closed-loop simulation framework. Its\ncore idea is to build 4D world representation and generate real-life and\ncontrollable driving scenarios. 
In specific, our framework includes a Dynamic\nEnvironment Composition module that constructs a detailed 4D driving world with\na format of occupancy equipping with static backgrounds and dynamic objects,\nand a Visual Scene Synthesis module that transforms this data into\nhigh-fidelity, multi-view video outputs, ensuring spatial and temporal\nconsistency. By providing a dynamic and realistic simulation environment,\nDrivingSphere enables comprehensive testing and validation of autonomous\ndriving algorithms, ultimately advancing the development of more reliable\nautonomous cars. The benchmark will be publicly released.\n","authors":["Tianyi Yan","Dongming Wu","Wencheng Han","Junpeng Jiang","Xia Zhou","Kun Zhan","Cheng-zhong Xu","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.11252v1.pdf","comment":"https://yanty123.github.io/DrivingSphere/"},{"id":"http://arxiv.org/abs/2409.02389v2","updated":"2024-11-18T02:32:22Z","published":"2024-09-04T02:37:38Z","title":"Multi-modal Situated Reasoning in 3D Scenes","summary":" Situation awareness is essential for understanding and reasoning about 3D\nscenes in embodied AI agents. However, existing datasets and benchmarks for\nsituated understanding are limited in data modality, diversity, scale, and task\nscope. To address these limitations, we propose Multi-modal Situated Question\nAnswering (MSQA), a large-scale multi-modal situated reasoning dataset,\nscalably collected leveraging 3D scene graphs and vision-language models (VLMs)\nacross a diverse range of real-world 3D scenes. MSQA includes 251K situated\nquestion-answering pairs across 9 distinct question categories, covering\ncomplex scenarios within 3D scenes. We introduce a novel interleaved\nmulti-modal input setting in our benchmark to provide text, image, and point\ncloud for situation and question description, resolving ambiguity in previous\nsingle-modality convention (e.g., text). 
Additionally, we devise the\nMulti-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models'\nsituated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN\nhighlight the limitations of existing vision-language models and underscore the\nimportance of handling multi-modal interleaved inputs and situation modeling.\nExperiments on data scaling and cross-domain transfer further demonstrate the\nefficacy of leveraging MSQA as a pre-training dataset for developing more\npowerful situated reasoning models.\n","authors":["Xiongkun Linghu","Jiangyong Huang","Xuesong Niu","Xiaojian Ma","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02389v2.pdf","comment":"Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page:\n https://msr3d.github.io/"},{"id":"http://arxiv.org/abs/2408.00642v3","updated":"2024-11-18T02:28:06Z","published":"2024-08-01T15:33:16Z","title":"Coverage Path Planning For Minimizing Expected Time to Search For an\n Object With Continuous Sensing","summary":" In this paper, we present several results of both theoretical as well as\npractical interests. First, we propose the quota lawn mowing problem, an\nextension of the classic lawn mowing problem in computational geometry, as\nfollows: given a quota of coverage, compute the shortest lawn mowing route to\nachieve said quota. We give constant-factor approximations for the quota lawn\nmowing problem.\n Second, we investigate the expected detection time minimization problem in\ngeometric coverage path planning with local, continuous sensory information. We\nprovide the first approximation algorithm with provable error bounds with\npseudopolynomial running time. Our ideas also extend to another search\nmechanism, namely visibility-based search, which is related to the watchman\nroute problem. 
We complement our theoretical analysis with some simple but\neffective heuristics for finding an object in minimum expected time, on which\nwe provide simulation results.\n","authors":["Linh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.00642v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03132v3","updated":"2024-11-18T02:06:46Z","published":"2024-10-04T04:07:15Z","title":"Autoregressive Action Sequence Learning for Robotic Manipulation","summary":" Designing a universal policy architecture that performs well across diverse\nrobots and task configurations remains a key challenge. In this work, we\naddress this by representing robot actions as sequential data and generating\nactions through autoregressive sequence modeling. Existing autoregressive\narchitectures generate end-effector waypoints sequentially as word tokens in\nlanguage modeling, which are limited to low-frequency control tasks. Unlike\nlanguage, robot actions are heterogeneous and often include continuous values\n-- such as joint positions, 2D pixel coordinates, and end-effector poses --\nwhich are not easily suited for language-based modeling. Based on this insight,\nwe introduce a straightforward enhancement: we extend causal transformers'\nsingle-token prediction to support predicting a variable number of tokens in a\nsingle step through our Chunking Causal Transformer (CCT). This enhancement\nenables robust performance across diverse tasks of various control frequencies,\ngreater efficiency by having fewer autoregression steps, and lead to a hybrid\naction sequence design by mixing different types of actions and using a\ndifferent chunk size for each action type. Based on CCT, we propose the\nAutoregressive Policy (ARP) architecture, which solves manipulation tasks by\ngenerating hybrid action sequences. 
We evaluate ARP across diverse robotic\nmanipulation environments, including Push-T, ALOHA, and RLBench, and show that\nARP, as a universal architecture, outperforms the environment-specific\nstate-of-the-art in all tested benchmarks, while being more efficient in\ncomputation and parameter sizes. Videos of our real robot demonstrations, all\nsource code and the pretrained models of ARP can be found at\nhttp://github.com/mlzxy/arp.\n","authors":["Xinyu Zhang","Yuhan Liu","Haonan Chang","Liam Schramm","Abdeslam Boularias"],"pdf_url":"https://arxiv.org/pdf/2410.03132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11218v1","updated":"2024-11-18T01:06:59Z","published":"2024-11-18T01:06:59Z","title":"Conjugate Momentum-Based Estimation of External Forces for Bio-Inspired\n Morphing Wing Flight","summary":" Dynamic morphing wing flights present significant challenges in accurately\nestimating external forces due to complex interactions between aerodynamics,\nrapid wing movements, and external disturbances. Traditional force estimation\nmethods often struggle with unpredictable disturbances like wind gusts or\nunmodeled impacts that can destabilize flight in real-world scenarios. This\npaper addresses these challenges by implementing a Conjugate Momentum-based\nObserver, which effectively estimates and manages unknown external forces\nacting on the Aerobat, a bio-inspired robotic platform with dynamically\nmorphing wings. Through simulations, the observer demonstrates its capability\nto accurately detect and quantify external forces, even in the presence of\nGaussian noise and abrupt impulse inputs. The results validate the robustness\nof the method, showing improved stability and control of the Aerobat in dynamic\nenvironments. 
This research contributes to advancements in bio-inspired\nrobotics by enhancing force estimation for flapping-wing systems, with\npotential applications in autonomous aerial navigation and robust flight\ncontrol.\n","authors":["Bibek Gupta","Eric Sihite","Alireza Ramezani"],"pdf_url":"https://arxiv.org/pdf/2411.11218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11216v1","updated":"2024-11-18T01:01:34Z","published":"2024-11-18T01:01:34Z","title":"Optimization free control and ground force estimation with momentum\n observer for a multimodal legged aerial robot","summary":" Legged-aerial multimodal robots can make the most of both legged and aerial\nsystems. In this paper, we propose a control framework that bypasses heavy\nonboard computers by using an optimization-free Explicit Reference Governor\nthat incorporates external thruster forces from an attitude controller. Ground\nreaction forces are maintained within friction cone constraints using costly\noptimization solvers, but the ERG framework filters applied velocity references\nthat ensure no slippage at the foot end. 
We also propose a Conjugate momentum\nobserver, that is widely used in Disturbance Observation to estimate ground\nreaction forces and compare its efficacy against a constrained model in\nestimating ground reaction forces in a reduced-order simulation of Husky.\n","authors":["Kaushik Venkatesh Krishnamurthy","Chenghao Wang","Shreyansh Pitroda","Eric Sihite","Alireza Ramezani","Morteza Gharib"],"pdf_url":"https://arxiv.org/pdf/2411.11216v1.pdf","comment":"6 pages, 10 figures, submitted to American Control Conference 2025"},{"id":"http://arxiv.org/abs/2411.11211v1","updated":"2024-11-18T00:18:46Z","published":"2024-11-18T00:18:46Z","title":"Operator Splitting Covariance Steering for Safe Stochastic Nonlinear\n Control","summary":" Most robotics applications are typically accompanied with safety restrictions\nthat need to be satisfied with a high degree of confidence even in environments\nunder uncertainty. Controlling the state distribution of a system and enforcing\nsuch specifications as distribution constraints is a promising approach for\nmeeting such requirements. In this direction, covariance steering (CS) is an\nincreasingly popular stochastic optimal control (SOC) framework for designing\nsafe controllers via explicit constraints on the system covariance.\nNevertheless, a major challenge in applying CS methods to systems with the\nnonlinear dynamics and chance constraints common in robotics is that the\napproximations needed are conservative and highly sensitive to the point of\napproximation. This can cause sequential convex programming methods to converge\nto poor local minima or incorrectly report problems as infeasible due to\nshifting constraints. This paper presents a novel algorithm for solving\nchance-constrained nonlinear CS problems that directly addresses this\nchallenge. Specifically, we propose an operator-splitting approach that\ntemporarily separates the main problem into subproblems that can be solved in\nparallel. 
The benefit of this relaxation lies in the fact that it does not\nrequire all iterates to satisfy all constraints simultaneously prior to\nconvergence, thus enhancing the exploration capabilities of the algorithm for\nfinding better solutions. Simulation results verify the ability of the proposed\nmethod to find higher quality solutions under stricter safety constraints than\nstandard methods on a variety of robotic systems. Finally, the applicability of\nthe algorithm on real systems is confirmed through hardware demonstrations.\n","authors":["Akash Ratheesh","Vincent Pacelli","Augustinos D. Saravanos","Evangelos A. Theodorou"],"pdf_url":"https://arxiv.org/pdf/2411.11211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08850v2","updated":"2024-11-18T21:51:16Z","published":"2023-07-17T21:22:17Z","title":"LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception\n Network for Autonomous Driving","summary":" LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR\nperception has the largest body of literature after camera perception. However,\nmulti-task learning across tasks like detection, segmentation, and motion\nestimation using LiDAR remains relatively unexplored, especially on\nautomotive-grade embedded platforms. We present a real-time multi-task\nconvolutional neural network for LiDAR-based object detection, semantics, and\nmotion segmentation. The unified architecture comprises a shared encoder and\ntask-specific decoders, enabling joint representation learning. We propose a\nnovel Semantic Weighting and Guidance (SWAG) module to transfer semantic\nfeatures for improved object detection selectively. Our heterogeneous training\nscheme combines diverse datasets and exploits complementary cues between tasks.\nThe work provides the first embedded implementation unifying these key\nperception tasks from LiDAR point clouds achieving 3ms latency on the embedded\nNVIDIA Xavier platform. 
We achieve state-of-the-art results for two tasks,\nsemantic and motion segmentation, and close to state-of-the-art performance for\n3D object detection. By maximizing hardware efficiency and leveraging\nmulti-task synergies, our method delivers an accurate and efficient solution\ntailored for real-world automated driving deployment. Qualitative results can\nbe seen at https://youtu.be/H-hWRzv2lIY.\n","authors":["Sambit Mohapatra","Senthil Yogamani","Varun Ravi Kumar","Stefan Milz","Heinrich Gotzig","Patrick Mäder"],"pdf_url":"https://arxiv.org/pdf/2307.08850v2.pdf","comment":"Accepted for publication at IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2402.15384v2","updated":"2024-11-18T20:54:06Z","published":"2024-02-23T15:30:57Z","title":"Homeostatic motion planning with innate physics knowledge","summary":" Living organisms interact with their surroundings in a closed-loop fashion,\nwhere sensory inputs dictate the initiation and termination of behaviours. Even\nsimple animals are able to develop and execute complex plans, which has not yet\nbeen replicated in robotics using pure closed-loop input control. We propose a\nsolution to this problem by defining a set of discrete and temporary\nclosed-loop controllers, called \"tasks\", each representing a closed-loop\nbehaviour. We further introduce a supervisory module which has an innate\nunderstanding of physics and causality, through which it can simulate the\nexecution of task sequences over time and store the results in a model of the\nenvironment. On the basis of this model, plans can be made by chaining\ntemporary closed-loop controllers. 
The proposed framework was implemented for a\nreal robot and tested in two scenarios as proof of concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2402.15384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12047v1","updated":"2024-11-18T20:34:32Z","published":"2024-11-18T20:34:32Z","title":"Simultaneous Ground Reaction Force and State Estimation via Constrained\n Moving Horizon Estimation","summary":" Accurate ground reaction force (GRF) estimation can significantly improve the\nadaptability of legged robots in various real-world applications. For instance,\nwith estimated GRF and contact kinematics, the locomotion control and planning\nassist the robot in overcoming uncertain terrains. The canonical momentum-based\nmethods, formulated as nonlinear observers, do not fully address the noisy\nmeasurements and the dependence between floating base states and the\ngeneralized momentum dynamics. In this paper, we present a simultaneous ground\nreaction force and state estimation framework for legged robots, which\nsystematically addresses the sensor noise and the coupling between states and\ndynamics. With the floating base orientation estimated separately, a\ndecentralized Moving Horizon Estimation (MHE) method is implemented to fuse the\nrobot dynamics, proprioceptive sensors, exteroceptive sensors, and\ndeterministic contact complementarity constraints in a convex windowed\noptimization. 
The proposed method is shown to be capable of providing accurate\nGRF and state estimation on several legged robots, including the open-source\neducational planar bipedal robot STRIDE and quadrupedal robot Unitree Go1, with\na frequency of 200Hz and a past time window of 0.04s.\n","authors":["Jiarong Kang","Xiaobin Xiong"],"pdf_url":"https://arxiv.org/pdf/2411.12047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03908v2","updated":"2024-11-18T20:32:39Z","published":"2023-12-06T21:05:39Z","title":"Irrotational Contact Fields","summary":" We present a framework for generating convex approximations of complex\ncontact models, incorporating experimentally validated models like Hunt &\nCrossley coupled with Coulomb's law of friction alongside the principle of\nmaximum dissipation. Our approach is robust across a wide range of stiffness\nvalues, making it suitable for both compliant surfaces and rigid\napproximations. We evaluate these approximations across a wide variety of test\ncases, detailing properties and limitations. We implement a fully\ndifferentiable solution in the open-source robotics toolkit, Drake. Our novel\nhybrid approach enables computation of gradients for complex geometric models\nwhile reusing factorizations from contact resolution. We demonstrate robust\nsimulation of robotic tasks at interactive rates, with accurately resolved\nstiction and contact transitions, supporting effective sim-to-real transfer.\n","authors":["Alejandro Castro","Xuchen Han","Joseph Masterjohn"],"pdf_url":"https://arxiv.org/pdf/2312.03908v2.pdf","comment":"16 pages, 26 figures. 
The supplemental video is available publicly at\n https://youtu.be/FTUPYZ_8Xbk?si=MWndCUCGWMJsFnsO"},{"id":"http://arxiv.org/abs/2411.12042v1","updated":"2024-11-18T20:27:13Z","published":"2024-11-18T20:27:13Z","title":"Fast Convergence of Softmax Policy Mirror Ascent","summary":" Natural policy gradient (NPG) is a common policy optimization algorithm and\ncan be viewed as mirror ascent in the space of probabilities. Recently, Vaswani\net al. [2021] introduced a policy gradient method that corresponds to mirror\nascent in the dual space of logits. We refine this algorithm, removing its need\nfor a normalization across actions and analyze the resulting method (referred\nto as SPMA). For tabular MDPs, we prove that SPMA with a constant step-size\nmatches the linear convergence of NPG and achieves a faster convergence than\nconstant step-size (accelerated) softmax policy gradient. To handle large\nstate-action spaces, we extend SPMA to use a log-linear policy\nparameterization. Unlike that for NPG, generalizing SPMA to the linear function\napproximation (FA) setting does not require compatible function approximation.\nUnlike MDPO, a practical generalization of NPG, SPMA with linear FA only\nrequires solving convex softmax classification problems. We prove that SPMA\nachieves linear convergence to the neighbourhood of the optimal value function.\nWe extend SPMA to handle non-linear FA and evaluate its empirical performance\non the MuJoCo and Atari benchmarks. 
Our results demonstrate that SPMA\nconsistently achieves similar or better performance compared to MDPO, PPO and\nTRPO.\n","authors":["Reza Asad","Reza Babanezhad","Issam Laradji","Nicolas Le Roux","Sharan Vaswani"],"pdf_url":"https://arxiv.org/pdf/2411.12042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12014v1","updated":"2024-11-18T19:55:43Z","published":"2024-11-18T19:55:43Z","title":"On-the-Go Path Planning and Repair in Static and Dynamic Scenarios","summary":" Autonomous systems, including robots and drones, face significant challenges\nwhen navigating through dynamic environments, particularly within urban\nsettings where obstacles, fluctuating traffic, and pedestrian activity are\nconstantly shifting. Although, traditional motion planning algorithms like the\nwavefront planner and gradient descent planner, which use potential functions,\nwork well in static environments, they fall short in situations where the\nenvironment is continuously changing. This work proposes a dynamic, real-time\npath planning approach specifically designed for autonomous systems, allowing\nthem to effectively avoid static and dynamic obstacles, thereby enhancing their\noverall adaptability. The approach integrates the efficiency of conventional\nplanners with the ability to make rapid adjustments in response to moving\nobstacles and environmental changes. 
The simulation results discussed in this\narticle demonstrate the effectiveness of the proposed method, demonstrating its\nsuitability for robotic path planning in both known and unknown environments,\nincluding those involving mobile objects, agents, or potential threats.\n","authors":["Daniel Ajeleye"],"pdf_url":"https://arxiv.org/pdf/2411.12014v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2406.13700v2","updated":"2024-11-18T19:28:07Z","published":"2024-06-19T16:53:24Z","title":"Reinforcement Learning-Based Model Matching to Reduce the Sim-Real Gap\n in COBRA","summary":" This paper employs a reinforcement learning-based model identification method\naimed at enhancing the accuracy of the dynamics for our snake robot, called\nCOBRA. Leveraging gradient information and iterative optimization, the proposed\napproach refines the parameters of COBRA's dynamical model such as coefficient\nof friction and actuator parameters using experimental and simulated data.\nExperimental validation on the hardware platform demonstrates the efficacy of\nthe proposed approach, highlighting its potential to address sim-to-real gap in\nrobot implementation.\n","authors":["Adarsh Salagame","Harin Kumar Nallaguntla","Bardia Ardakanian","Eric Sihite","Gunar Schirner","Alireza Ramezani"],"pdf_url":"https://arxiv.org/pdf/2406.13700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11982v1","updated":"2024-11-18T19:13:06Z","published":"2024-11-18T19:13:06Z","title":"HPA-MPC: Hybrid Perception-Aware Nonlinear Model Predictive Control for\n Quadrotors with Suspended Loads","summary":" Quadrotors equipped with cable-suspended loads represent a versatile,\nlow-cost, and energy efficient solution for aerial transportation,\nconstruction, and manipulation tasks. However, their real-world deployment is\nhindered by several challenges. 
The system is difficult to control because it\nis nonlinear, underactuated, involves hybrid dynamics due to slack-taut cable\nmodes, and evolves on complex configuration spaces. Additionally, it is crucial\nto estimate the full state and the cable's mode transitions in real-time using\non-board sensors and computation. To address these challenges, we present a\nnovel Hybrid Perception-Aware Nonlinear Model Predictive Control (HPA-MPC)\ncontrol approach for quadrotors with suspended loads. Our method considers the\ncomplete hybrid system dynamics and includes a perception-aware cost to ensure\nthe payload remains visible in the robot's camera during navigation.\nFurthermore, the full state and hybrid dynamics' transitions are estimated\nusing onboard sensors. Experimental results demonstrate that our approach\nenables stable load tracking control, even during slack-taut transitions, and\noperates entirely onboard. The experiments also show that the perception-aware\nterm effectively keeps the payload in the robot's camera field of view when a\nhuman operator interacts with the load.\n","authors":["Mrunal Sarvaiya","Guanrui Li","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2411.11982v1.pdf","comment":"Accepted to IEEE Robotics and Automation Letters"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.11845v1","updated":"2024-11-18T18:59:58Z","published":"2024-11-18T18:59:58Z","title":"UniHands: Unifying Various Wild-Collected Keypoints for Personalized\n Hand Reconstruction","summary":" Accurate hand motion capture and standardized 3D representation are essential\nfor various hand-related tasks. Collecting keypoints-only data, while efficient\nand cost-effective, results in low-fidelity representations and lacks surface\ninformation. Furthermore, data inconsistencies across sources challenge their\nintegration and use. 
We present UniHands, a novel method for creating\nstandardized yet personalized hand models from wild-collected keypoints from\ndiverse sources. Unlike existing neural implicit representation methods,\nUniHands uses the widely-adopted parametric models MANO and NIMBLE, providing a\nmore scalable and versatile solution. It also derives unified hand joints from\nthe meshes, which facilitates seamless integration into various hand-related\ntasks. Experiments on the FreiHAND and InterHand2.6M datasets demonstrate its\nability to precisely reconstruct hand mesh vertices and keypoints, effectively\ncapturing high-degree articulation motions. Empirical studies involving nine\nparticipants show a clear preference for our unified joints over existing\nconfigurations for accuracy and naturalism (p-value 0.016).\n","authors":["Menghe Zhang","Joonyeoup Kim","Yangwen Liang","Shuangquan Wang","Kee-Bong Song"],"pdf_url":"https://arxiv.org/pdf/2411.11845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11844v1","updated":"2024-11-18T18:59:31Z","published":"2024-11-18T18:59:31Z","title":"Generative World Explorer","summary":" Planning with partial observation is a central challenge in embodied AI. A\nmajority of prior works have tackled this challenge by developing agents that\nphysically explore their environment to update their beliefs about the world\nstate.In contrast, humans can $\\textit{imagine}$ unseen parts of the world\nthrough a mental exploration and $\\textit{revise}$ their beliefs with imagined\nobservations. Such updated beliefs can allow them to make more informed\ndecisions, without necessitating the physical exploration of the world at all\ntimes. To achieve this human-like ability, we introduce the $\\textit{Generative\nWorld Explorer (Genex)}$, an egocentric world exploration framework that allows\nan agent to mentally explore a large-scale 3D world (e.g., urban scenes) and\nacquire imagined observations to update its belief. 
This updated belief will\nthen help the agent to make a more informed decision at the current step. To\ntrain $\\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB.\nOur experimental results demonstrate that (1) $\\textit{Genex}$ can generate\nhigh-quality and consistent observations during long-horizon exploration of a\nlarge virtual physical world and (2) the beliefs updated with the generated\nobservations can inform an existing decision-making model (e.g., an LLM agent)\nto make better plans.\n","authors":["Taiming Lu","Tianmin Shu","Alan Yuille","Daniel Khashabi","Jieneng Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11844v1.pdf","comment":"Website: generative-world-explorer.github.io"},{"id":"http://arxiv.org/abs/2411.11839v1","updated":"2024-11-18T18:58:03Z","published":"2024-11-18T18:58:03Z","title":"RoboGSim: A Real2Sim2Real Robotic Gaussian Splatting Simulator","summary":" Efficient acquisition of real-world embodied data has been increasingly\ncritical. However, large-scale demonstrations captured by remote operation tend\nto take extremely high costs and fail to scale up the data size in an efficient\nmanner. Sampling the episodes under a simulated environment is a promising way\nfor large-scale collection while existing simulators fail to high-fidelity\nmodeling on texture and physics. To address these limitations, we introduce the\nRoboGSim, a real2sim2real robotic simulator, powered by 3D Gaussian Splatting\nand the physics engine. RoboGSim mainly includes four parts: Gaussian\nReconstructor, Digital Twins Builder, Scene Composer, and Interactive Engine.\nIt can synthesize the simulated data with novel views, objects, trajectories,\nand scenes. RoboGSim also provides an online, reproducible, and safe evaluation\nfor different manipulation policies. The real2sim and sim2real transfer\nexperiments show a high consistency in the texture and physics. 
Moreover, the\neffectiveness of synthetic data is validated under the real-world manipulated\ntasks. We hope RoboGSim serves as a closed-loop simulator for fair comparison\non policy learning. More information can be found on our project page\nhttps://robogsim.github.io/ .\n","authors":["Xinhai Li","Jialin Li","Ziheng Zhang","Rui Zhang","Fan Jia","Tiancai Wang","Haoqiang Fan","Kuo-Kun Tseng","Ruiping Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11826v1","updated":"2024-11-18T18:44:10Z","published":"2024-11-18T18:44:10Z","title":"LightFFDNets: Lightweight Convolutional Neural Networks for Rapid Facial\n Forgery Detection","summary":" Accurate and fast recognition of forgeries is an issue of great importance in\nthe fields of artificial intelligence, image processing and object detection.\nRecognition of forgeries of facial imagery is the process of classifying and\ndefining the faces in it by analyzing real-world facial images. This process is\nusually accomplished by extracting features from an image, using classifier\nalgorithms, and correctly interpreting the results. Recognizing forgeries of\nfacial imagery correctly can encounter many different challenges. For example,\nfactors such as changing lighting conditions, viewing faces from different\nangles can affect recognition performance, and background complexity and\nperspective changes in facial images can make accurate recognition difficult.\nDespite these difficulties, significant progress has been made in the field of\nforgery detection. 
Deep learning algorithms, especially Convolutional Neural\nNetworks (CNNs), have significantly improved forgery detection performance.\n This study focuses on image processing-based forgery detection using\nFake-Vs-Real-Faces (Hard) [10] and 140k Real and Fake Faces [61] data sets.\nBoth data sets consist of two classes containing real and fake facial images.\nIn our study, two lightweight deep learning models are proposed to conduct\nforgery detection using these images. Additionally, 8 different pretrained CNN\narchitectures were tested on both data sets and the results were compared with\nnewly developed lightweight CNN models. It's shown that the proposed\nlightweight deep learning models have minimum number of layers. It's also shown\nthat the proposed lightweight deep learning models detect forgeries of facial\nimagery accurately, and computationally efficiently. Although the data set\nconsists only of face images, the developed models can also be used in other\ntwo-class object recognition problems.\n","authors":["Günel Jabbarlı","Murat Kurt"],"pdf_url":"https://arxiv.org/pdf/2411.11826v1.pdf","comment":"13 pages, 6 figures, 10 tables"},{"id":"http://arxiv.org/abs/2411.11819v1","updated":"2024-11-18T18:37:39Z","published":"2024-11-18T18:37:39Z","title":"Equivariant spatio-hemispherical networks for diffusion MRI\n deconvolution","summary":" Each voxel in a diffusion MRI (dMRI) image contains a spherical signal\ncorresponding to the direction and strength of water diffusion in the brain.\nThis paper advances the analysis of such spatio-spherical data by developing\nconvolutional network layers that are equivariant to the $\\mathbf{E(3) \\times\nSO(3)}$ group and account for the physical symmetries of dMRI including\nrotations, translations, and reflections of space alongside voxel-wise\nrotations. 
Further, neuronal fibers are typically antipodally symmetric, a fact\nwe leverage to construct highly efficient spatio-hemispherical graph\nconvolutions to accelerate the analysis of high-dimensional dMRI data. In the\ncontext of sparse spherical fiber deconvolution to recover white matter\nmicrostructure, our proposed equivariant network layers yield substantial\nperformance and efficiency gains, leading to better and more practical\nresolution of crossing neuronal fibers and fiber tractography. These gains are\nexperimentally consistent across both simulation and in vivo human datasets.\n","authors":["Axel Elaldi","Guido Gerig","Neel Dey"],"pdf_url":"https://arxiv.org/pdf/2411.11819v1.pdf","comment":"Accepted to NeurIPS 2024. 24 pages with 13 figures. Code available at\n https://github.com/AxelElaldi/fast-equivariant-deconv"},{"id":"http://arxiv.org/abs/2402.04507v2","updated":"2024-11-18T18:37:19Z","published":"2024-02-07T01:26:14Z","title":"A Review of Digital Pixel Sensors","summary":" Digital pixel sensor (DPS) has evolved as a pivotal component in modern\nimaging systems and has the potential to revolutionize various fields such as\nmedical imaging, astronomy, surveillance, IoT devices, etc. Compared to analog\npixel sensors, the DPS offers high speed and good image quality. However, the\nintroduced intrinsic complexity within each pixel, primarily attributed to the\naccommodation of the ADC circuit, engenders a substantial increase in the pixel\npitch. Unfortunately, such a pronounced escalation in pixel pitch drastically\nundermines the feasibility of achieving high-density integration, which is an\nobstacle that significantly narrows down the field of potential applications.\nNonetheless, designing compact conversion circuits along with strategic\nintegration of 3D architectural paradigms can be a potential remedy to the\nprevailing situation. This review article presents a comprehensive overview of\nthe vast area of DPS technology. 
The operating principles, advantages, and\nchallenges of different types of DPS circuits have been analyzed. We categorize\nthe schemes into several categories based on ADC operation. A comparative study\nbased on different performance metrics has also been showcased for a\nwell-rounded understanding.\n","authors":["Md Rahatul Islam Udoy","Shamiul Alam","Md Mazharul Islam","Akhilesh Jaiswal","Ahmedullah Aziz"],"pdf_url":"https://arxiv.org/pdf/2402.04507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v2","updated":"2024-11-18T18:35:06Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies have deployed watermark-based detection to identify\nAI-generated content. However, attribution--the ability to trace back to the\nuser of a generative AI (GenAI) service who created a given piece of\nAI-generated content--remains largely unexplored despite its growing\nimportance. In this work, we aim to bridge this gap by conducting the first\nsystematic study on watermark-based, user-level attribution of AI-generated\ncontent. Our key idea is to assign a unique watermark to each user of the GenAI\nservice and embed this watermark into the AI-generated content created by that\nuser. Attribution is then performed by identifying the user whose watermark\nbest matches the one extracted from the given content. This approach, however,\nfaces a key challenge: How should watermarks be selected for users to maximize\nattribution performance? To address the challenge, we first theoretically\nderive lower bounds on detection and attribution performance through rigorous\nprobabilistic analysis for any given set of user watermarks. Then, we select\nwatermarks for users to maximize these lower bounds, thereby optimizing\ndetection and attribution performance. 
Our theoretical and empirical results\nshow that watermark-based attribution inherits both the accuracy and\n(non-)robustness properties of the underlying watermark. Specifically,\nattribution remains highly accurate when the watermarked AI-generated content\nis either not post-processed or subjected to common post-processing such as\nJPEG compression, as well as black-box adversarial post-processing with limited\nquery budgets.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11799v1","updated":"2024-11-18T18:11:53Z","published":"2024-11-18T18:11:53Z","title":"Edge-Enhanced Dilated Residual Attention Network for Multimodal Medical\n Image Fusion","summary":" Multimodal medical image fusion is a crucial task that combines complementary\ninformation from different imaging modalities into a unified representation,\nthereby enhancing diagnostic accuracy and treatment planning. While deep\nlearning methods, particularly Convolutional Neural Networks (CNNs) and\nTransformers, have significantly advanced fusion performance, some of the\nexisting CNN-based methods fall short in capturing fine-grained multiscale and\nedge features, leading to suboptimal feature integration. Transformer-based\nmodels, on the other hand, are computationally intensive in both the training\nand fusion stages, making them impractical for real-time clinical use.\nMoreover, the clinical application of fused images remains unexplored. In this\npaper, we propose a novel CNN-based architecture that addresses these\nlimitations by introducing a Dilated Residual Attention Network Module for\neffective multiscale feature extraction, coupled with a gradient operator to\nenhance edge detail learning. 
To ensure fast and efficient fusion, we present a\nparameter-free fusion strategy based on the weighted nuclear norm of softmax,\nwhich requires no additional computations during training or inference.\nExtensive experiments, including a downstream brain tumor classification task,\ndemonstrate that our approach outperforms various baseline methods in terms of\nvisual quality, texture preservation, and fusion speed, making it a possible\npractical solution for real-world clinical applications. The code will be\nreleased at https://github.com/simonZhou86/en_dran.\n","authors":["Meng Zhou","Yuxuan Zhang","Xiaolan Xu","Jiayi Wang","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2411.11799v1.pdf","comment":"An extended version of the paper accepted at IEEE BIBM 2024"},{"id":"http://arxiv.org/abs/2411.11795v1","updated":"2024-11-18T18:08:52Z","published":"2024-11-18T18:08:52Z","title":"Exploring adversarial robustness of JPEG AI: methodology, comparison and\n new methods","summary":" Adversarial robustness of neural networks is an increasingly important area\nof research, combining studies on computer vision models, large language models\n(LLMs), and others. With the release of JPEG AI - the first standard for\nend-to-end neural image compression (NIC) methods - the question of its\nrobustness has become critically significant. JPEG AI is among the first\ninternational, real-world applications of neural-network-based models to be\nembedded in consumer devices. However, research on NIC robustness has been\nlimited to open-source codecs and a narrow range of attacks. This paper\nproposes a new methodology for measuring NIC robustness to adversarial attacks.\nWe present the first large-scale evaluation of JPEG AI's robustness, comparing\nit with other NIC models. 
Our evaluation results and code are publicly\navailable online (link is hidden for a blind review).\n","authors":["Egor Kovalev","Georgii Bychkov","Khaled Abud","Aleksandr Gushchin","Anna Chistyakova","Sergey Lavrushkin","Dmitriy Vatolin","Anastasia Antsiferova"],"pdf_url":"https://arxiv.org/pdf/2411.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21343v2","updated":"2024-11-18T17:59:10Z","published":"2024-07-31T05:17:31Z","title":"MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation\n Framework","summary":" Medical imaging segmentation is a highly active area of research, with deep\nlearning-based methods achieving state-of-the-art results in several\nbenchmarks. However, the lack of standardized tools for training, testing, and\nevaluating new methods makes the comparison of methods difficult. To address\nthis, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple,\nmodular, and end-to-end medical imaging segmentation framework designed to\nfacilitate consistent training, testing, and evaluation of deep learning-based\nmedical imaging segmentation methods. MIST standardizes data analysis,\npreprocessing, and evaluation pipelines, accommodating multiple architectures\nand loss functions. This standardization ensures reproducible and fair\ncomparisons across different methods. We detail MIST's data format\nrequirements, pipelines, and auxiliary features and demonstrate its efficacy\nusing the BraTS Adult Glioma Post-Treatment Challenge dataset. 
Our results\nhighlight MIST's ability to produce accurate segmentation masks and its\nscalability across multiple GPUs, showcasing its potential as a powerful tool\nfor future medical imaging research and development.\n","authors":["Adrian Celaya","Evan Lim","Rachel Glenn","Brayden Mi","Alex Balsells","Dawid Schellingerhout","Tucker Netherton","Caroline Chung","Beatrice Riviere","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2407.21343v2.pdf","comment":"Submitted to BraTS 2024"},{"id":"http://arxiv.org/abs/2411.11758v1","updated":"2024-11-18T17:37:10Z","published":"2024-11-18T17:37:10Z","title":"The Power of Many: Multi-Agent Multimodal Models for Cultural Image\n Captioning","summary":" Large Multimodal Models (LMMs) exhibit impressive performance across various\nmultimodal tasks. However, their effectiveness in cross-cultural contexts\nremains limited due to the predominantly Western-centric nature of most data\nand models. Conversely, multi-agent models have shown significant capability in\nsolving complex tasks. Our study evaluates the collective performance of LMMs\nin a multi-agent interaction setting for the novel task of cultural image\ncaptioning. Our contributions are as follows: (1) We introduce MosAIC, a\nMulti-Agent framework to enhance cross-cultural Image Captioning using LMMs\nwith distinct cultural personas; (2) We provide a dataset of culturally\nenriched image captions in English for images from China, India, and Romania\nacross three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable\nmetric for evaluating cultural information within image captions; and (4) We\nshow that the multi-agent interaction outperforms single-agent models across\ndifferent metrics, and offer valuable insights for future research. 
Our dataset\nand models can be accessed at https://github.com/MichiganNLP/MosAIC.\n","authors":["Longju Bai","Angana Borah","Oana Ignat","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2411.11758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10853v3","updated":"2024-11-18T17:28:57Z","published":"2024-06-16T08:54:38Z","title":"MV2Cyl: Reconstructing 3D Extrusion Cylinders from Multi-View Images","summary":" We present MV2Cyl, a novel method for reconstructing 3D from 2D multi-view\nimages, not merely as a field or raw geometry but as a sketch-extrude CAD\nmodel. Extracting extrusion cylinders from raw 3D geometry has been extensively\nresearched in computer vision, while the processing of 3D data through neural\nnetworks has remained a bottleneck. Since 3D scans are generally accompanied by\nmulti-view images, leveraging 2D convolutional neural networks allows these\nimages to be exploited as a rich source for extracting extrusion cylinder\ninformation. However, we observe that extracting only the surface information\nof the extrudes and utilizing it results in suboptimal outcomes due to the\nchallenges in the occlusion and surface segmentation. By synergizing with the\nextracted base curve information, we achieve the optimal reconstruction result\nwith the best accuracy in 2D sketch and extrude parameter estimation. Our\nexperiments, comparing our method with previous work that takes a raw 3D point\ncloud as input, demonstrate the effectiveness of our approach by taking\nadvantage of multi-view images. Our project page can be found at\nhttp://mv2cyl.github.io .\n","authors":["Eunji Hong","Minh Hieu Nguyen","Mikaela Angelina Uy","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2406.10853v3.pdf","comment":"NeurIPS 2024. 
Project page: http://mv2cyl.github.io"},{"id":"http://arxiv.org/abs/2212.14309v2","updated":"2024-11-18T17:14:42Z","published":"2022-12-29T13:55:28Z","title":"Learning to mask: Towards generalized face forgery detection","summary":" Generalizability to unseen forgery types is crucial for face forgery\ndetectors. Recent works have made significant progress in terms of\ngeneralization by synthetic forgery data augmentation. In this work, we explore\nanother path for improving the generalization. Our goal is to reduce the\nfeatures that are easy to learn in the training phase, so as to reduce the risk\nof overfitting on specific forgery types. Specifically, in our method, a\nteacher network takes as input the face images and generates an attention map\nof the deep features by a diverse multihead attention ViT. The attention map is\nused to guide a student network to focus on the low-attended features by\nreducing the highly-attended deep features. A deep feature mixup strategy is\nalso proposed to synthesize forgeries in the feature domain. Experiments\ndemonstrate that, without data augmentation, our method is able to achieve\npromising performances on unseen forgeries and highly compressed data.\n","authors":["Jianwei Fei","Yunshu Dai","Huaming Wang","Zhihua Xia"],"pdf_url":"https://arxiv.org/pdf/2212.14309v2.pdf","comment":"Incorrect experimental setting"},{"id":"http://arxiv.org/abs/2411.11740v1","updated":"2024-11-18T17:10:14Z","published":"2024-11-18T17:10:14Z","title":"Revitalizing Electoral Trust: Enhancing Transparency and Efficiency\n through Automated Voter Counting with Machine Learning","summary":" In order to address issues with manual vote counting during election\nprocedures, this study intends to examine the viability of using advanced image\nprocessing techniques for automated voter counting. 
The study aims to shed\nlight on how automated systems that utilize cutting-edge technologies like\nOpenCV, CVZone, and the MOG2 algorithm could greatly increase the effectiveness\nand openness of electoral operations. The empirical findings demonstrate how\nautomated voter counting can enhance voting processes and rebuild public\nconfidence in election outcomes, particularly in places where trust is low. The\nstudy also emphasizes how rigorous metrics, such as the F1 score, should be\nused to systematically compare the accuracy of automated systems against manual\ncounting methods. This methodology enables a detailed comprehension of the\ndifferences in performance between automated and human counting techniques by\nproviding a nuanced assessment. The incorporation of said measures serves to\nreinforce an extensive assessment structure, guaranteeing the legitimacy and\ndependability of automated voting systems inside the electoral sphere.\n","authors":["Mir Faris","Syeda Aynul Karim","Md. Juniadul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.11740v1.pdf","comment":"13 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2411.11738v1","updated":"2024-11-18T17:07:37Z","published":"2024-11-18T17:07:37Z","title":"WoodYOLO: A Novel Object Detector for Wood Species Detection in\n Microscopic Images","summary":" Wood species identification plays a crucial role in various industries, from\nensuring the legality of timber products to advancing ecological conservation\nefforts. This paper introduces WoodYOLO, a novel object detection algorithm\nspecifically designed for microscopic wood fiber analysis. Our approach adapts\nthe YOLO architecture to address the challenges posed by large, high-resolution\nmicroscopy images and the need for high recall in localization of the cell type\nof interest (vessel elements). 
Our results show that WoodYOLO significantly\noutperforms state-of-the-art models, achieving performance gains of 12.9% and\n6.5% in F2 score over YOLOv10 and YOLOv7, respectively. This improvement in\nautomated wood cell type localization capabilities contributes to enhancing\nregulatory compliance, supporting sustainable forestry practices, and promoting\nbiodiversity conservation efforts globally.\n","authors":["Lars Nieradzik","Henrike Stephani","Jördis Sieburg-Rockel","Stephanie Helmling","Andrea Olbrich","Stephanie Wrage","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2411.11738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24060v3","updated":"2024-11-18T17:04:09Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. 
We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14210v2","updated":"2024-11-18T17:00:25Z","published":"2024-05-23T06:09:08Z","title":"Eidos: Efficient, Imperceptible Adversarial 3D Point Clouds","summary":" Classification of 3D point clouds is a challenging machine learning (ML) task\nwith important real-world applications in a spectrum from autonomous driving\nand robot-assisted surgery to earth observation from low orbit. As with other\nML tasks, classification models are notoriously brittle in the presence of\nadversarial attacks. These are rooted in imperceptible changes to inputs with\nthe effect that a seemingly well-trained model ends up misclassifying the\ninput. This paper adds to the understanding of adversarial attacks by\npresenting Eidos, a framework providing Efficient Imperceptible aDversarial\nattacks on 3D pOint cloudS. Eidos supports a diverse set of imperceptibility\nmetrics. It employs an iterative, two-step procedure to identify optimal\nadversarial examples, thereby enabling a runtime-imperceptibility trade-off. 
We\nprovide empirical evidence relative to several popular 3D point cloud\nclassification models and several established 3D attack methods, showing Eidos'\nsuperiority with respect to efficiency as well as imperceptibility.\n","authors":["Hanwei Zhang","Luo Cheng","Qisong He","Wei Huang","Renjue Li","Ronan Sicre","Xiaowei Huang","Holger Hermanns","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.14210v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.11727v1","updated":"2024-11-18T16:57:41Z","published":"2024-11-18T16:57:41Z","title":"Aligning Few-Step Diffusion Models with Dense Reward Difference Learning","summary":" Aligning diffusion models with downstream objectives is essential for their\npractical applications. However, standard alignment methods often struggle with\nstep generalization when directly applied to few-step diffusion models, leading\nto inconsistent performance across different denoising step scenarios. To\naddress this, we introduce Stepwise Diffusion Policy Optimization (SDPO), a\nnovel alignment method tailored for few-step diffusion models. Unlike prior\napproaches that rely on a single sparse reward from only the final step of each\ndenoising trajectory for trajectory-level optimization, SDPO incorporates dense\nreward feedback at every intermediate step. By learning the differences in\ndense rewards between paired samples, SDPO facilitates stepwise optimization of\nfew-step diffusion models, ensuring consistent alignment across all denoising\nsteps. To promote stable and efficient training, SDPO introduces an online\nreinforcement learning framework featuring several novel strategies designed to\neffectively exploit the stepwise granularity of dense rewards. Experimental\nresults demonstrate that SDPO consistently outperforms prior methods in\nreward-based alignment across diverse step configurations, underscoring its\nrobust step generalization capabilities. 
Code is avaliable at\nhttps://github.com/ZiyiZhang27/sdpo.\n","authors":["Ziyi Zhang","Li Shen","Sen Zhang","Deheng Ye","Yong Luo","Miaojing Shi","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.11727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11116v3","updated":"2024-11-18T16:57:14Z","published":"2024-03-17T06:53:44Z","title":"PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset","summary":" Multimodal Large Language Models (MLLMs) hallucinate, resulting in an\nemerging topic of visual hallucination evaluation (VHE). This paper contributes\na ChatGPT-Prompted visual hallucination evaluation Dataset (PhD) for objective\nVHE at a large scale. The essence of VHE is to ask an MLLM questions about\nspecific images to assess its susceptibility to hallucination. Depending on\nwhat to ask (objects, attributes, sentiment, etc.) and how the questions are\nasked, we structure PhD along two dimensions, i.e., task and mode. Five visual\nrecognition tasks, ranging from low-level (object / attribute recognition) to\nmiddle-level (sentiment / position recognition and counting), are considered.\nBesides a normal visual QA mode, which we term PhD-base, PhD also asks\nquestions with inaccurate context (PhD-iac) or with incorrect context\n(PhD-icc), or with AI-generated counter common sense images (PhD-ccs). We\nconstruct PhD by a ChatGPT-assisted semi-automated pipeline, encompassing four\npivotal modules: task-specific hallucinatory item (hitem) selection,\nhitem-embedded question generation, inaccurate / incorrect context generation,\nand counter-common-sense (CCS) image generation. With over 14k daily images,\n750 CCS images and 102k VQA triplets in total, PhD reveals considerable\nvariability in MLLMs' performance across various modes and tasks, offering\nvaluable insights into the nature of hallucination. 
As such, PhD stands as a\npotent tool not only for VHE but may also play a significant role in the\nrefinement of MLLMs.\n","authors":["Jiazhen Liu","Yuhan Fu","Ruobing Xie","Runquan Xie","Xingwu Sun","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2403.11116v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08402v2","updated":"2024-11-18T16:54:54Z","published":"2024-11-13T07:41:47Z","title":"V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with\n Denoising Diffusion","summary":" Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D\nobject detection using LiDAR and camera data. However, these methods suffer\nfrom performance degradation in adverse weather conditions. The weatherrobust\n4D radar provides Doppler and additional geometric information, raising the\npossibility of addressing this challenge. To this end, we present V2X-R, the\nfirst simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R\ncontains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point\nclouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes.\nSubsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for\n3D object detection and implement it with various fusion strategies. To achieve\nweather-robust detection, we additionally propose a Multi-modal Denoising\nDiffusion (MDD) module in our fusion pipeline. MDD utilizes weather-robust 4D\nradar feature as a condition to prompt the diffusion model to denoise noisy\nLiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline\ndemonstrates superior performance in the V2X-R dataset. Over and above this,\nour MDD module further improved the performance of basic fusion model by up to\n5.73%/6.70% in foggy/snowy conditions with barely disrupting normal\nperformance. 
The dataset and code will be publicly available at:\nhttps://github.com/ylwhxht/V2X-R.\n","authors":["Xun Huang","Jinlong Wang","Qiming Xia","Siheng Chen","Bisheng Yang","Xin Li","Cheng Wang","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2411.08402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11717v1","updated":"2024-11-18T16:45:44Z","published":"2024-11-18T16:45:44Z","title":"RAWMamba: Unified sRGB-to-RAW De-rendering With State Space Model","summary":" Recent advancements in sRGB-to-RAW de-rendering have increasingly emphasized\nmetadata-driven approaches to reconstruct RAW data from sRGB images,\nsupplemented by partial RAW information. In image-based de-rendering, metadata\nis commonly obtained through sampling, whereas in video tasks, it is typically\nderived from the initial frame. The distinct metadata requirements necessitate\nspecialized network architectures, leading to architectural incompatibilities\nthat increase deployment complexity. In this paper, we propose RAWMamba, a\nMamba-based unified framework developed for sRGB-to-RAW de-rendering across\nboth image and video domains. The core of RAWMamba is the Unified Metadata\nEmbedding (UME) module, which harmonizes diverse metadata types into a unified\nrepresentation. In detail, a multi-perspective affinity modeling method is\nproposed to promote the extraction of reference information. 
In addition, we\nintroduce the Local Tone-Aware Mamba (LTA-Mamba) module, which captures\nlong-range dependencies to enable effective global propagation of metadata.\nExperimental results demonstrate that the proposed RAWMamba achieves\nstate-of-the-art performance, yielding high-quality RAW data reconstruction.\n","authors":["Hongjun Chen","Wencheng Han","Huan Zheng","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.11717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02588v2","updated":"2024-11-18T16:45:26Z","published":"2023-08-03T18:23:37Z","title":"Unmasking Parkinson's Disease with Smile: An AI-enabled Screening\n Framework","summary":" We present an efficient and accessible PD screening method by leveraging\nAI-driven models enabled by the largest video dataset of facial expressions\nfrom 1,059 unique participants. This dataset includes 256 individuals with PD,\n165 clinically diagnosed, and 91 self-reported. Participants used webcams to\nrecord themselves mimicking three facial expressions (smile, disgust, and\nsurprise) from diverse sources encompassing their homes across multiple\ncountries, a US clinic, and a PD wellness center in the US. Facial landmarks\nare automatically tracked from the recordings to extract features related to\nhypomimia, a prominent PD symptom characterized by reduced facial expressions.\nMachine learning algorithms are trained on these features to distinguish\nbetween individuals with and without PD. The model was tested for\ngeneralizability on external (unseen during training) test videos collected\nfrom a US clinic and Bangladesh. An ensemble of machine learning models trained\non smile videos achieved an accuracy of 87.9+-0.1% (95% Confidence Interval)\nwith an AUROC of 89.3+-0.3% as evaluated on held-out data (using k-fold\ncross-validation). 
In external test settings, the ensemble model achieved\n79.8+-0.6% accuracy with 81.9+-0.3% AUROC on the clinical test set and\n84.9+-0.4% accuracy with 81.2+-0.6% AUROC on participants from Bangladesh. In\nevery setting, the model was free from detectable bias across sex and ethnic\nsubgroups, except in the cohorts from Bangladesh, where the model performed\nsignificantly better for female participants than males. Smiling videos can\neffectively differentiate between individuals with and without PD, offering a\npotentially easy, accessible, and cost-efficient way to screen for PD,\nespecially when a clinical diagnosis is difficult to access.\n","authors":["Tariq Adnan","Md Saiful Islam","Wasifur Rahman","Sangwu Lee","Sutapa Dey Tithi","Kazi Noshin","Imran Sarker","M Saifur Rahman","Ehsan Hoque"],"pdf_url":"https://arxiv.org/pdf/2308.02588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11706v1","updated":"2024-11-18T16:33:52Z","published":"2024-11-18T16:33:52Z","title":"MC-LLaVA: Multi-Concept Personalized Vision-Language Model","summary":" Current vision-language models (VLMs) show exceptional abilities across\ndiverse tasks including visual question answering. To enhance user experience\nin practical applications, recent studies investigate VLM personalization to\nunderstand user-provided concepts. However, existing studies mainly focus on\nsingle-concept personalization, neglecting the existence and interplay of\nmultiple concepts, which limits the real-world applicability of personalized\nVLMs. In this paper, we propose the first multi-concept personalization method\nnamed MC-LLaVA along with a high-quality multi-concept personalization dataset.\nSpecifically, MC-LLaVA uses a joint training strategy incorporating multiple\nconcepts in a single training step, allowing VLMs to perform accurately in\nmulti-concept personalization. 
To reduce the cost of joint training, MC-LLaVA\nleverages visual token information for concept token initialization, yielding\nimproved concept representation and accelerating joint training. To advance\nmulti-concept personalization research, we further contribute a high-quality\ndataset. We carefully collect images from various movies that contain multiple\ncharacters and manually generate the multi-concept question-answer samples. Our\ndataset features diverse movie types and question-answer types. We conduct\ncomprehensive qualitative and quantitative experiments to demonstrate that\nMC-LLaVA can achieve impressive multi-concept personalized responses, paving\nthe way for VLMs to become better user-specific assistants. The code and\ndataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA.\n","authors":["Ruichuan An","Sihan Yang","Ming Lu","Kai Zeng","Yulin Luo","Ying Chen","Jiajun Cao","Hao Liang","Qi She","Shanghang Zhang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11693v1","updated":"2024-11-18T16:15:00Z","published":"2024-11-18T16:15:00Z","title":"From Spectra to Geography: Intelligent Mapping of RRUFF Mineral Data","summary":" Accurately determining the geographic origin of mineral samples is pivotal\nfor applications in geology, mineralogy, and material science. Leveraging the\ncomprehensive Raman spectral data from the RRUFF database, this study\nintroduces a novel machine learning framework aimed at geolocating mineral\nspecimens at the country level. We employ a one-dimensional ConvNeXt1D neural\nnetwork architecture to classify mineral spectra based solely on their spectral\nsignatures. The processed dataset comprises over 32,900 mineral samples,\npredominantly natural, spanning 101 countries. 
Through five-fold\ncross-validation, the ConvNeXt1D model achieved an impressive average\nclassification accuracy of 93%, demonstrating its efficacy in capturing\ngeospatial patterns inherent in Raman spectra.\n","authors":["Francesco Pappone","Federico Califano","Marco Tafani"],"pdf_url":"https://arxiv.org/pdf/2411.11693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11691v1","updated":"2024-11-18T16:13:47Z","published":"2024-11-18T16:13:47Z","title":"Towards Degradation-Robust Reconstruction in Generalizable NeRF","summary":" Generalizable Neural Radiance Field (GNeRF) across scenes has been proven to\nbe an effective way to avoid per-scene optimization by representing a scene\nwith deep image features of source images. However, despite its potential for\nreal-world applications, there has been limited research on the robustness of\nGNeRFs to different types of degradation present in the source images. The lack\nof such research is primarily attributed to the absence of a large-scale\ndataset fit for training a degradation-robust generalizable NeRF model. To\naddress this gap and facilitate investigations into the degradation robustness\nof 3D reconstruction tasks, we construct the Objaverse Blur Dataset, comprising\n50,000 images from over 1000 settings featuring multiple levels of blur\ndegradation. In addition, we design a simple and model-agnostic module for\nenhancing the degradation robustness of GNeRFs. Specifically, by extracting\n3D-aware features through a lightweight depth estimator and denoiser, the\nproposed module shows improvement on different popular methods in GNeRFs in\nterms of both quantitative and visual quality over varying degradation types\nand levels. 
Our dataset and code will be made publicly available.\n","authors":["Chan Ho Park","Ka Leong Cheng","Zhicheng Wang","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11667v1","updated":"2024-11-18T15:45:41Z","published":"2024-11-18T15:45:41Z","title":"Dissecting Misalignment of Multimodal Large Language Models via\n Influence Function","summary":" Multi-modal Large Language models (MLLMs) are always trained on data from\ndiverse and unreliable sources, which may contain misaligned or mislabeled\ntext-image pairs. This frequently causes robustness issues and hallucinations,\nleading to performance degradation. Data valuation is an efficient way to\ndetect and trace these misalignments. Nevertheless, existing methods are\ncomputationally expensive for MLLMs. While computationally efficient, the\nclassical influence functions are inadequate for contrastive learning models\nbecause they were originally designed for pointwise loss. Additionally,\ncontrastive learning involves minimizing the distance between the modalities of\npositive samples and maximizing the distance between the modalities of negative\nsamples. This requires us to evaluate the influence of samples from both\nperspectives. To tackle these challenges, we introduce the Extended Influence\nFunction for Contrastive Loss (ECIF), an influence function crafted for\ncontrastive loss. ECIF considers both positive and negative samples and\nprovides a closed-form approximation of contrastive learning models,\neliminating the need for retraining. Building upon ECIF, we develop a series of\nalgorithms for data evaluation in MLLM, misalignment detection, and\nmisprediction trace-back tasks. 
Experimental results demonstrate our ECIF\nadvances the transparency and interpretability of MLLMs by offering a more\naccurate assessment of data impact and model alignment compared to traditional\nbaseline methods.\n","authors":["Lijie Hu","Chenyang Ren","Huanyi Xie","Khouloud Saadi","Shu Yang","Jingfeng Zhang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11667v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2402.18068v3","updated":"2024-11-18T15:43:58Z","published":"2024-02-28T05:54:02Z","title":"SynArtifact: Classifying and Alleviating Artifacts in Synthetic Images\n via Vision-Language Model","summary":" In the rapidly evolving area of image synthesis, a serious challenge is the\npresence of complex artifacts that compromise perceptual realism of synthetic\nimages. To alleviate artifacts and improve quality of synthetic images, we\nfine-tune Vision-Language Model (VLM) as artifact classifier to automatically\nidentify and classify a wide range of artifacts and provide supervision for\nfurther optimizing generative models. Specifically, we develop a comprehensive\nartifact taxonomy and construct a dataset of synthetic images with artifact\nannotations for fine-tuning VLM, named SynArtifact-1K. The fine-tuned VLM\nexhibits superior ability of identifying artifacts and outperforms the baseline\nby 25.66%. To our knowledge, this is the first time such end-to-end artifact\nclassification task and solution have been proposed. Finally, we leverage the\noutput of VLM as feedback to refine the generative model for alleviating\nartifacts. 
Visualization results and user study demonstrate that the quality of\nimages synthesized by the refined diffusion model has been obviously improved.\n","authors":["Bin Cao","Jianhao Yuan","Yexin Liu","Jian Li","Shuyang Sun","Jing Liu","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.18068v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13147v6","updated":"2024-11-18T15:41:01Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with domain\n feedback for molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule through chemical modification. Despite\nLarge Language Models (LLMs) holding the potential to efficiently simulate this\ntask by using natural language to direct the optimization, straightforwardly\nutilizing them shows limited performance. In this work, we facilitate utilizing\nLLMs in an iterative paradigm by proposing a simple yet highly effective domain\nfeedback provider, namely $\\text{Re}^3$DF. In detail, $\\text{Re}^3$DF harnesses\nan external toolkit, RDKit, to handle the molecule hallucination, if the\nmodified molecule is chemically invalid. Otherwise, its desired properties are\ncomputed and compared to the original one, establishing reliable domain\nfeedback with correct direction and distance towards the objective, followed by\na retrieved example, to guide the LLM to refine the modified molecule. We\nconduct experiments across both single- and multi-property objectives with 2\nthresholds, where $\\text{Re}^3$DF shows significant improvements. Particularly,\nfor 20 single-property objectives, $\\text{Re}^3$DF enhances Hit ratio by 16.95%\nand 20.76% under loose (\\texttt{l}) and strict (\\texttt{s}) thresholds,\nrespectively. For 32 multi-property objectives, $\\text{Re}^3$DF enhances Hit\nratio by 6.04% and 5.25%.\n","authors":["Khiem Le","Nitesh V. 
Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03047v2","updated":"2024-11-18T15:39:00Z","published":"2023-12-05T17:58:06Z","title":"MagicStick: Controllable Video Editing via Control Handle\n Transformations","summary":" Text-based video editing has recently attracted considerable interest in\nchanging the style or replacing the objects with a similar structure. Beyond\nthis, we demonstrate that properties such as shape, size, location, motion,\netc., can also be edited in videos. Our key insight is that the keyframe\ntransformations of the specific internal feature (e.g., edge maps of objects or\nhuman pose), can easily propagate to other frames to provide generation\nguidance. We thus propose MagicStick, a controllable video editing method that\nedits the video properties by utilizing the transformation on the extracted\ninternal control signals. In detail, to keep the appearance, we inflate both\nthe pretrained image diffusion model and ControlNet to the temporal dimension\nand train low-rank adaptions (LORA) layers to fit the specific scenes. Then, in\nediting, we perform an inversion and editing framework. Differently, finetuned\nControlNet is introduced in both inversion and generation for attention\nguidance with the proposed attention remix between the spatial attention maps\nof inversion and editing. Yet succinct, our method is the first method to show\nthe ability of video property editing from the pre-trained text-to-image model.\nWe present experiments on numerous examples within our unified framework. We\nalso compare with shape-aware text-based editing and handcrafted motion video\ngeneration, demonstrating our superior temporal consistency and editing\ncapability than previous works. 
The code and models are available on\nhttps://github.com/mayuelala/MagicStick.\n","authors":["Yue Ma","Xiaodong Cun","Sen Liang","Jinbo Xing","Yingqing He","Chenyang Qi","Siran Chen","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03047v2.pdf","comment":"Accepted by WACV 2025, Project page:\n https://magic-stick-edit.github.io/ Github repository:\n https://github.com/mayuelala/MagicStick"},{"id":"http://arxiv.org/abs/2411.11636v1","updated":"2024-11-18T15:14:36Z","published":"2024-11-18T15:14:36Z","title":"SP${ }^3$ : Superpixel-propagated pseudo-label learning for weakly\n semi-supervised medical image segmentation","summary":" Deep learning-based medical image segmentation helps assist diagnosis and\naccelerate the treatment process while the model training usually requires\nlarge-scale dense annotation datasets. Weakly semi-supervised medical image\nsegmentation is an essential application because it only requires a small\namount of scribbles and a large number of unlabeled data to train the model,\nwhich greatly reduces the clinician's effort to fully annotate images. To\nhandle the inadequate supervisory information challenge in weakly\nsemi-supervised segmentation (WSSS), a SuperPixel-Propagated Pseudo-label\n(SP${}^3$) learning method is proposed, using the structural information\ncontained in superpixel for supplemental information. Specifically, the\nannotation of scribbles is propagated to superpixels and thus obtains a dense\nannotation for supervised training. Since the quality of pseudo-labels is\nlimited by the low-quality annotation, the beneficial superpixels selected by\ndynamic thresholding are used to refine pseudo-labels. Furthermore, aiming to\nalleviate the negative impact of noise in pseudo-label, superpixel-level\nuncertainty is incorporated to guide the pseudo-label supervision for stable\nlearning. 
Our method achieves state-of-the-art performance on both tumor and\norgan segmentation datasets under the WSSS setting, using only 3\\% of the\nannotation workload compared to fully supervised methods and attaining\napproximately 80\\% Dice score. Additionally, our method outperforms eight\nweakly and semi-supervised methods under both weakly supervised and\nsemi-supervised settings. Results of extensive experiments validate the\neffectiveness and annotation efficiency of our weakly semi-supervised\nsegmentation, which can assist clinicians in achieving automated segmentation\nfor organs or tumors quickly and ultimately benefit patients.\n","authors":["Shiman Li","Jiayue Zhao","Shaolei Liu","Xiaokun Dai","Chenxi Zhang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2411.11636v1.pdf","comment":"10 pages, 7 figures. Under Review"},{"id":"http://arxiv.org/abs/2404.09826v2","updated":"2024-11-18T14:52:09Z","published":"2024-04-15T14:23:39Z","title":"A Recipe for CAC: Mosaic-based Generalized Loss for Improved\n Class-Agnostic Counting","summary":" Class agnostic counting (CAC) is a vision task that can be used to count the\ntotal occurrence number of any given reference objects in the query image. The\ntask is usually formulated as a density map estimation problem through\nsimilarity computation among a few image samples of the reference object and\nthe query image. In this paper, we point out a severe issue of the existing CAC\nframework: Given a multi-class setting, models don't consider reference images\nand instead blindly match all dominant objects in the query image. Moreover,\nthe current evaluation metrics and dataset cannot be used to faithfully assess\nthe model's generalization performance and robustness. To this end, we discover\nthat the combination of mosaic augmentation with generalized loss is essential\nfor addressing the aforementioned issue of CAC models to count objects of\nmajority (i.e. dominant objects) regardless of the references. 
Furthermore, we\nintroduce a new evaluation protocol and metrics for resolving the problem\nbehind the existing CAC evaluation scheme and better benchmarking CAC models in\na more fair manner. Besides, extensive evaluation results demonstrate that our\nproposed recipe can consistently improve the performance of different CAC\nmodels. The code is available at https://github.com/littlepenguin89106/MGCAC.\n","authors":["Tsung-Han Chou","Brian Wang","Wei-Chen Chiu","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09826v2.pdf","comment":"Accepted by ACCV 2024"},{"id":"http://arxiv.org/abs/2411.11619v1","updated":"2024-11-18T14:48:06Z","published":"2024-11-18T14:48:06Z","title":"FERT: Real-Time Facial Expression Recognition with Short-Range FMCW\n Radar","summary":" This study proposes a novel approach for real-time facial expression\nrecognition utilizing short-range Frequency-Modulated Continuous-Wave (FMCW)\nradar equipped with one transmit (Tx), and three receive (Rx) antennas. The\nsystem leverages four distinct modalities simultaneously: Range-Doppler images\n(RDIs), micro range-Doppler Images (micro-RDIs), range azimuth images (RAIs),\nand range elevation images (REIs). Our innovative architecture integrates\nfeature extractor blocks, intermediate feature extractor blocks, and a ResNet\nblock to accurately classify facial expressions into smile, anger, neutral, and\nno-face classes. Our model achieves an average classification accuracy of\n98.91% on the dataset collected using a 60 GHz short-range FMCW radar. 
The\nproposed solution operates in real-time in a person-independent manner, which\nshows the potential use of low-cost FMCW radars for effective facial expression\nrecognition in various applications.\n","authors":["Sabri Mustafa Kahya","Muhammet Sami Yavuz","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2411.11619v1.pdf","comment":"Accepted at IEEE SENSORS 2024"},{"id":"http://arxiv.org/abs/2407.11211v3","updated":"2024-11-18T14:43:38Z","published":"2024-07-15T19:53:02Z","title":"Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer\n from Text to Image via CLIP Inversion","summary":" We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary\nImage Classifier that uses an autoregressive transformer to generatively output\nclassification labels as language. Leveraging the extensive knowledge of CLIP\nmodels, NOVIC harnesses the embedding space to enable zero-shot transfer from\npure text to images. Traditional CLIP models, despite their ability for open\nvocabulary classification, require an exhaustive prompt of potential class\nlabels, restricting their application to images of known content or context. To\naddress this, we propose an \"object decoder\" model that is trained on a\nlarge-scale 92M-target dataset of templated object noun sets and LLM-generated\ncaptions to always output the object noun in question. This effectively inverts\nthe CLIP text encoder and allows textual object labels from essentially the\nentire English language to be generated directly from image-derived embedding\nvectors, without requiring any a priori knowledge of the potential content of\nan image, and without any label biases. 
The trained decoders are tested on a\nmix of manually and web-curated datasets, as well as standard image\nclassification benchmarks, and achieve fine-grained prompt-free prediction\nscores of up to 87.5%, a strong result considering the model must work for any\nconceivable image and without any contextual clues.\n","authors":["Philipp Allgeuer","Kyra Ahrens","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2407.11211v3.pdf","comment":"Published at WACV 2025"},{"id":"http://arxiv.org/abs/2411.10261v2","updated":"2024-11-18T14:43:25Z","published":"2024-11-15T15:08:04Z","title":"Partial Scene Text Retrieval","summary":" The task of partial scene text retrieval involves localizing and searching\nfor text instances that are the same or similar to a given query text from an\nimage gallery. However, existing methods can only handle text-line instances,\nleaving the problem of searching for partial patches within these text-line\ninstances unsolved due to a lack of patch annotations in the training data. To\naddress this issue, we propose a network that can simultaneously retrieve both\ntext-line instances and their partial patches. Our method embeds the two types\nof data (query text and scene text instances) into a shared feature space and\nmeasures their cross-modal similarities. To handle partial patches, our\nproposed approach adopts a Multiple Instance Learning (MIL) approach to learn\ntheir similarities with query text, without requiring extra annotations.\nHowever, constructing bags, which is a standard step of conventional MIL\napproaches, can introduce numerous noisy samples for training, and lower\ninference speed. To address this issue, we propose a Ranking MIL (RankMIL)\napproach to adaptively filter those noisy samples. Additionally, we present a\nDynamic Partial Match Algorithm (DPMA) that can directly search for the target\npartial patch from a text-line instance during the inference stage, without\nrequiring bags. 
This greatly improves the search efficiency and the performance\nof retrieving partial patches. The source code and dataset are available at\nhttps://github.com/lanfeng4659/PSTR.\n","authors":["Hao Wang","Minghui Liao","Zhouyi Xie","Wenyu Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2411.10261v2.pdf","comment":"Accepted on TPAMI"},{"id":"http://arxiv.org/abs/2411.11613v1","updated":"2024-11-18T14:35:01Z","published":"2024-11-18T14:35:01Z","title":"Leveraging Computational Pathology AI for Noninvasive Optical Imaging\n Analysis Without Retraining","summary":" Noninvasive optical imaging modalities can probe patient's tissue in 3D and\nover time generate gigabytes of clinically relevant data per sample. There is a\nneed for AI models to analyze this data and assist clinical workflow. The lack\nof expert labelers and the large dataset required (>100,000 images) for model\ntraining and tuning are the main hurdles in creating foundation models. In this\npaper we introduce FoundationShift, a method to apply any AI model from\ncomputational pathology without retraining. We show our method is more accurate\nthan state of the art models (SAM, MedSAM, SAM-Med2D, CellProfiler, Hover-Net,\nPLIP, UNI and ChatGPT), with multiple imaging modalities (OCT and RCM). This is\nachieved without the need for model retraining or fine-tuning. Applying our\nmethod to noninvasive in vivo images could enable physicians to readily\nincorporate optical imaging modalities into their clinical practice, providing\nreal time tissue analysis and improving patient care.\n","authors":["Danny Barash","Emilie Manning","Aidan Van Vleck","Omri Hirsch","Kyi Lei Aye","Jingxi Li","Philip O. Scumpia","Aydogan Ozcan","Sumaira Aasi","Kerri E. Rieger","Kavita Y. 
Sarin","Oren Freifeld","Yonatan Winetraub"],"pdf_url":"https://arxiv.org/pdf/2411.11613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11273v2","updated":"2024-11-18T14:19:52Z","published":"2024-03-17T17:04:45Z","title":"BrightDreamer: Generic 3D Gaussian Generative Framework for Fast\n Text-to-3D Synthesis","summary":" Text-to-3D synthesis has recently seen intriguing advances by combining the\ntext-to-image priors with 3D representation methods, e.g., 3D Gaussian\nSplatting (3D GS), via Score Distillation Sampling (SDS). However, a hurdle of\nexisting methods is the low efficiency, per-prompt optimization for a single 3D\nobject. Therefore, it is imperative for a paradigm shift from per-prompt\noptimization to feed-forward generation for any unseen text prompts, which yet\nremains challenging. An obstacle is how to directly generate a set of millions\nof 3D Gaussians to represent a 3D object. This paper presents BrightDreamer, an\nend-to-end feed-forward approach that can achieve generalizable and fast (77\nms) text-to-3D generation. Our key idea is to formulate the generation process\nas estimating the 3D deformation from an anchor shape with predefined\npositions. For this, we first propose a Text-guided Shape Deformation (TSD)\nnetwork to predict the deformed shape and its new positions, used as the\ncenters (one attribute) of 3D Gaussians. To estimate the other four attributes\n(i.e., scaling, rotation, opacity, and SH), we then design a novel Text-guided\nTriplane Generator (TTG) to generate a triplane representation for a 3D object.\nThe center of each Gaussian enables us to transform the spatial feature into\nthe four attributes. The generated 3D Gaussians can be finally rendered at 705\nframes per second. Extensive experiments demonstrate the superiority of our\nmethod over existing methods. Also, BrightDreamer possesses a strong semantic\nunderstanding capability even for complex text prompts. 
The code is available\nin the project page.\n","authors":["Lutao Jiang","Xu Zheng","Yuanhuiyi Lyu","Jiazhou Zhou","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08931v3","updated":"2024-11-18T14:16:43Z","published":"2023-11-15T13:04:57Z","title":"Structural-Based Uncertainty in Deep Learning Across Anatomical Scales:\n Analysis in White Matter Lesion Segmentation","summary":" This paper explores uncertainty quantification (UQ) as an indicator of the\ntrustworthiness of automated deep-learning (DL) tools in the context of white\nmatter lesion (WML) segmentation from magnetic resonance imaging (MRI) scans of\nmultiple sclerosis (MS) patients. Our study focuses on two principal aspects of\nuncertainty in structured output segmentation tasks. First, we postulate that a\nreliable uncertainty measure should indicate predictions likely to be incorrect\nwith high uncertainty values. Second, we investigate the merit of quantifying\nuncertainty at different anatomical scales (voxel, lesion, or patient). We\nhypothesize that uncertainty at each scale is related to specific types of\nerrors. Our study aims to confirm this relationship by conducting separate\nanalyses for in-domain and out-of-domain settings. Our primary methodological\ncontributions are (i) the development of novel measures for quantifying\nuncertainty at lesion and patient scales, derived from structural prediction\ndiscrepancies, and (ii) the extension of an error retention curve analysis\nframework to facilitate the evaluation of UQ performance at both lesion and\npatient scales. The results from a multi-centric MRI dataset of 444 patients\ndemonstrate that our proposed measures more effectively capture model errors at\nthe lesion and patient scales compared to measures that average voxel-scale\nuncertainty values. 
We provide the UQ protocols code at\nhttps://github.com/Medical-Image-Analysis-Laboratory/MS_WML_uncs.\n","authors":["Nataliia Molchanova","Vatsal Raina","Andrey Malinin","Francesco La Rosa","Adrien Depeursinge","Mark Gales","Cristina Granziera","Henning Muller","Mara Graziani","Meritxell Bach Cuadra"],"pdf_url":"https://arxiv.org/pdf/2311.08931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11562v1","updated":"2024-11-18T13:32:59Z","published":"2024-11-18T13:32:59Z","title":"MSSIDD: A Benchmark for Multi-Sensor Denoising","summary":" The cameras equipped on mobile terminals employ different sensors in\ndifferent photograph modes, and the transferability of raw domain denoising\nmodels between these sensors is significant but remains sufficient exploration.\nIndustrial solutions either develop distinct training strategies and models for\ndifferent sensors or ignore the differences between sensors and simply extend\nexisting models to new sensors, which leads to tedious training or\nunsatisfactory performance. In this paper, we introduce a new benchmark, the\nMulti-Sensor SIDD (MSSIDD) dataset, which is the first raw-domain dataset\ndesigned to evaluate the sensor transferability of denoising models. The MSSIDD\ndataset consists of 60,000 raw images of six distinct sensors, derived through\nthe degeneration of sRGB images via different camera sensor parameters.\nFurthermore, we propose a sensor consistency training framework that enables\ndenoising models to learn the sensor-invariant features, thereby facilitating\nthe generalization of the consistent model to unseen sensors. We evaluate\nprevious arts on the newly proposed MSSIDD dataset, and the experimental\nresults validate the effectiveness of our proposed method. 
Our dataset is\navailable at https://www.kaggle.com/datasets/sjtuwh/mssidd.\n","authors":["Shibin Mei","Hang Wang","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2411.11562v1.pdf","comment":"15 pages,7 figures"},{"id":"http://arxiv.org/abs/2307.09218v3","updated":"2024-11-18T13:26:41Z","published":"2023-07-16T16:27:58Z","title":"A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual\n Learning","summary":" Forgetting refers to the loss or deterioration of previously acquired\nknowledge. While existing surveys on forgetting have primarily focused on\ncontinual learning, forgetting is a prevalent phenomenon observed in various\nother research domains within deep learning. Forgetting manifests in research\nfields such as generative models due to generator shifts, and federated\nlearning due to heterogeneous data distributions across clients. Addressing\nforgetting encompasses several challenges, including balancing the retention of\nold task knowledge with fast learning of new task, managing task interference\nwith conflicting goals, and preventing privacy leakage, etc. Moreover, most\nexisting surveys on continual learning implicitly assume that forgetting is\nalways harmful. In contrast, our survey argues that forgetting is a\ndouble-edged sword and can be beneficial and desirable in certain cases, such\nas privacy-preserving scenarios. By exploring forgetting in a broader context,\nwe present a more nuanced understanding of this phenomenon and highlight its\npotential advantages. Through this comprehensive survey, we aspire to uncover\npotential solutions by drawing upon ideas and approaches from various fields\nthat have dealt with forgetting. By examining forgetting beyond its\nconventional boundaries, we hope to encourage the development of novel\nstrategies for mitigating, harnessing, or even embracing forgetting in real\napplications. 
A comprehensive list of papers about forgetting in various\nresearch fields is available at\n\\url{https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning}.\n","authors":["Zhenyi Wang","Enneng Yang","Li Shen","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2307.09218v3.pdf","comment":"accepted at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2411.11548v1","updated":"2024-11-18T13:06:29Z","published":"2024-11-18T13:06:29Z","title":"Real-Time Fitness Exercise Classification and Counting from Video Frames","summary":" This paper introduces a novel method for real-time exercise classification\nusing a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing\nexercise recognition approaches often rely on synthetic datasets, raw\ncoordinate inputs sensitive to user and camera variations, and fail to fully\nexploit the temporal dependencies in exercise movements. These issues limit\ntheir generalizability and robustness in real-world conditions, where lighting,\ncamera angles, and user body types vary.\n To address these challenges, we propose a BiLSTM-based model that leverages\ninvariant features, such as joint angles, alongside raw coordinates. By using\nboth angles and (x, y, z) coordinates, the model adapts to changes in\nperspective, user positioning, and body differences, improving generalization.\nTraining on 30-frame sequences enables the BiLSTM to capture the temporal\ncontext of exercises and recognize patterns evolving over time.\n We compiled a dataset combining synthetic data from the InfiniteRep dataset\nand real-world videos from Kaggle and other sources. This dataset includes four\ncommon exercises: squat, push-up, shoulder press, and bicep curl. The model was\ntrained and validated on these diverse datasets, achieving an accuracy of over\n99% on the test set. To assess generalizability, the model was tested on 2\nseparate test sets representative of typical usage conditions. 
Comparisons with\nthe previous approach from the literature are present in the result section\nshowing that the proposed model is the best-performing one.\n The classifier is integrated into a web application providing real-time\nexercise classification and repetition counting without manual exercise\nselection.\n Demo and datasets are available at the following GitHub Repository:\nhttps://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting.\n","authors":["Riccardo Riccio"],"pdf_url":"https://arxiv.org/pdf/2411.11548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18459v2","updated":"2024-11-18T13:03:19Z","published":"2024-04-29T06:35:34Z","title":"Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in\n the Wild","summary":" Large language models have evolved data-efficient generalists, benefiting\nfrom the universal language interface and large-scale pre-training. However,\nconstructing a data-efficient generalist for dense visual prediction presents a\ndistinct challenge due to the variation in label structures across different\ntasks. Consequently, generalization to unseen dense prediction tasks in the\nlow-data regime is not straightforward and has received less attention from\nprevious vision generalists. In this study, we explore a universal model that\ncan flexibly adapt to unseen dense label structures with a few examples,\nenabling it to serve as a data-efficient vision generalist in diverse\nreal-world scenarios. To this end, we base our method on a powerful\nmeta-learning framework and explore several axes to improve its performance and\nversatility for real-world problems, such as flexible adaptation mechanisms and\nscalability. We evaluate our model across a spectrum of unseen real-world\nscenarios where low-shot learning is desirable, including video, 3D, medical,\nbiological, and user-interactive tasks. 
Equipped with a generic architecture\nand an effective adaptation mechanism, our model flexibly adapts to all of\nthese tasks with at most 50 labeled images, showcasing a significant\nadvancement over existing data-efficient generalist approaches. Codes are\navailable at https://github.com/GitGyun/chameleon.\n","authors":["Donggyun Kim","Seongwoong Cho","Semin Kim","Chong Luo","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.18459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11543v1","updated":"2024-11-18T13:01:57Z","published":"2024-11-18T13:01:57Z","title":"Enhancing Vision-Language Model Safety through Progressive\n Concept-Bottleneck-Driven Alignment","summary":" Benefiting from the powerful capabilities of Large Language Models (LLMs),\npre-trained visual encoder models connected to LLMs form Vision Language Models\n(VLMs). However, recent research shows that the visual modality in VLMs is\nhighly vulnerable, allowing attackers to bypass safety alignment in LLMs\nthrough visually transmitted content, launching harmful attacks. To address\nthis challenge, we propose a progressive concept-based alignment strategy,\nPSA-VLM, which incorporates safety modules as concept bottlenecks to enhance\nvisual modality safety alignment. By aligning model predictions with specific\nsafety concepts, we improve defenses against risky images, enhancing\nexplainability and controllability while minimally impacting general\nperformance. Our method is obtained through two-stage training. The low\ncomputational cost of the first stage brings very effective performance\nimprovement, and the fine-tuning of the language model in the second stage\nfurther improves the safety performance. 
Our method achieves state-of-the-art\nresults on popular VLM safety benchmark.\n","authors":["Zhendong Liu","Yuanbi Nie","Yingshui Tan","Xiangyu Yue","Qiushi Cui","Chongjun Wang","Xiaoyong Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.11543v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13581"},{"id":"http://arxiv.org/abs/2411.11525v1","updated":"2024-11-18T12:35:08Z","published":"2024-11-18T12:35:08Z","title":"Reliable Poisoned Sample Detection against Backdoor Attacks Enhanced by\n Sharpness Aware Minimization","summary":" Backdoor attack has been considered as a serious security threat to deep\nneural networks (DNNs). Poisoned sample detection (PSD) that aims at filtering\nout poisoned samples from an untrustworthy training dataset has shown very\npromising performance for defending against data poisoning based backdoor\nattacks. However, we observe that the detection performance of many advanced\nmethods is likely to be unstable when facing weak backdoor attacks, such as low\npoisoning ratio or weak trigger strength. To further verify this observation,\nwe make a statistical investigation among various backdoor attacks and poisoned\nsample detections, showing a positive correlation between backdoor effect and\ndetection performance. It inspires us to strengthen the backdoor effect to\nenhance detection performance. Since we cannot achieve that goal via directly\nmanipulating poisoning ratio or trigger strength, we propose to train one model\nusing the Sharpness-Aware Minimization (SAM) algorithm, rather than the vanilla\ntraining algorithm. We also provide both empirical and theoretical analysis\nabout how SAM training strengthens the backdoor effect. Then, this SAM trained\nmodel can be seamlessly integrated with any off-the-shelf PSD method that\nextracts discriminative features from the trained model for detection, called\nSAM-enhanced PSD. 
Extensive experiments on several benchmark datasets show the\nreliable detection performance of the proposed method against both weak and\nstrong backdoor attacks, with significant improvements against various attacks\n($+34.38\\%$ TPR on average), over the conventional PSD methods (i.e., without\nSAM enhancement). Overall, this work provides new insights about PSD and\nproposes a novel approach that can complement existing detection methods, which\nmay inspire more in-depth explorations in this field.\n","authors":["Mingda Zhang","Mingli Zhu","Zihao Zhu","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11515v1","updated":"2024-11-18T12:22:37Z","published":"2024-11-18T12:22:37Z","title":"Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to\n Enhance Cell Segmentation","summary":" Automated cell segmentation in microscopy images is essential for biomedical\nresearch, yet conventional methods are labor-intensive and prone to error.\nWhile deep learning-based approaches have proven effective, they often require\nlarge annotated datasets, which are scarce due to the challenges of manual\nannotation. To overcome this, we propose a novel framework for synthesizing\ndensely annotated 2D and 3D cell microscopy images using cascaded diffusion\nmodels. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations\nusing multi-level diffusion models and NeuS, a 3D surface reconstruction\napproach. Following that, a pretrained 2D Stable Diffusion model is finetuned\nto generate realistic cell textures and the final outputs are combined to form\ncell populations. We show that training a segmentation model with a combination\nof our synthetic data and real data improves cell segmentation performance by\nup to 9\\% across multiple datasets. Additionally, the FID scores indicate that\nthe synthetic data closely resembles real data. 
The code for our proposed\napproach will be available at\nhttps://github.com/ruveydayilmaz0/cascaded\\_diffusion.\n","authors":["Rüveyda Yilmaz","Kaan Keven","Yuli Wu","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2411.11515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11514v1","updated":"2024-11-18T12:22:29Z","published":"2024-11-18T12:22:29Z","title":"Learning a Neural Association Network for Self-supervised Multi-Object\n Tracking","summary":" This paper introduces a novel framework to learn data association for\nmulti-object tracking in a self-supervised manner. Fully-supervised learning\nmethods are known to achieve excellent tracking performances, but acquiring\nidentity-level annotations is tedious and time-consuming. Motivated by the fact\nthat in real-world scenarios object motion can be usually represented by a\nMarkov process, we present a novel expectation maximization (EM) algorithm that\ntrains a neural network to associate detections for tracking, without requiring\nprior knowledge of their temporal correspondences. At the core of our method\nlies a neural Kalman filter, with an observation model conditioned on\nassociations of detections parameterized by a neural network. Given a batch of\nframes as input, data associations between detections from adjacent frames are\npredicted by a neural network followed by a Sinkhorn normalization that\ndetermines the assignment probabilities of detections to states. Kalman\nsmoothing is then used to obtain the marginal probability of observations given\nthe inferred states, producing a training objective to maximize this marginal\nprobability using gradient descent. The proposed framework is fully\ndifferentiable, allowing the underlying neural model to be trained end-to-end.\nWe evaluate our approach on the challenging MOT17 and MOT20 datasets and\nachieve state-of-the-art results in comparison to self-supervised trackers\nusing public detections. 
We furthermore demonstrate the capability of the\nlearned model to generalize across datasets.\n","authors":["Shuai Li","Michael Burke","Subramanian Ramamoorthy","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2411.11514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11507v1","updated":"2024-11-18T12:12:33Z","published":"2024-11-18T12:12:33Z","title":"SignEye: Traffic Sign Interpretation from Vehicle First-Person View","summary":" Traffic signs play a key role in assisting autonomous driving systems (ADS)\nby enabling the assessment of vehicle behavior in compliance with traffic\nregulations and providing navigation instructions. However, current works are\nlimited to basic sign understanding without considering the egocentric\nvehicle's spatial position, which fails to support further regulation\nassessment and direction navigation. Following the above issues, we introduce a\nnew task: traffic sign interpretation from the vehicle's first-person view,\nreferred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant\n(TGA) scenario application to re-explore the role of traffic signs in ADS as a\ncomplement to popular autonomous technologies (such as obstacle perception).\nNotably, TGA is not a replacement for electronic map navigation; rather, TGA\ncan be an automatic tool for updating it and complementing it in situations\nsuch as offline conditions or temporary sign adjustments. Lastly, a spatial and\nsemantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to\nachieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN)\nis built. Experiments show that TSI-FPV and TGA are achievable via our SignEye\ntrained on Traffic-CN. 
The results also demonstrate that the TGA can provide\ncomplementary information to ADS beyond existing popular autonomous\ntechnologies.\n","authors":["Chuang Yang","Xu Han","Tao Han","Yuejiao SU","Junyu Gao","Hongyuan Zhang","Yi Wang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2411.11507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11505v1","updated":"2024-11-18T12:05:27Z","published":"2024-11-18T12:05:27Z","title":"LaVin-DiT: Large Vision Diffusion Transformer","summary":" This paper presents the Large Vision Diffusion Transformer (LaVin-DiT), a\nscalable and unified foundation model designed to tackle over 20 computer\nvision tasks in a generative framework. Unlike existing large vision models\ndirectly adapted from natural language processing architectures, which rely on\nless efficient autoregressive techniques and disrupt spatial relationships\nessential for vision data, LaVin-DiT introduces key innovations to optimize\ngenerative performance for vision tasks. First, to address the high\ndimensionality of visual data, we incorporate a spatial-temporal variational\nautoencoder that encodes data into a continuous latent space. Second, for\ngenerative modeling, we develop a joint diffusion transformer that\nprogressively produces vision outputs. Third, for unified multi-task training,\nin-context learning is implemented. Input-target pairs serve as task context,\nwhich guides the diffusion transformer to align outputs with specific tasks\nwithin the latent space. During inference, a task-specific context set and test\ndata as queries allow LaVin-DiT to generalize across tasks without fine-tuning.\nTrained on extensive vision datasets, the model is scaled from 0.1B to 3.4B\nparameters, demonstrating substantial scalability and state-of-the-art\nperformance across diverse vision tasks. This work introduces a novel pathway\nfor large vision foundation models, underscoring the promising potential of\ndiffusion transformers. 
The code and models will be open-sourced.\n","authors":["Zhaoqing Wang","Xiaobo Xia","Runnan Chen","Dongdong Yu","Changhu Wang","Mingming Gong","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11505v1.pdf","comment":"11 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2410.14148v2","updated":"2024-11-18T11:58:16Z","published":"2024-10-18T03:34:32Z","title":"Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in\n Vision-Language Alignment","summary":" The recent advancements in large language models (LLMs) and pre-trained\nvision models have accelerated the development of vision-language large models\n(VLLMs), enhancing the interaction between visual and linguistic modalities.\nDespite their notable success across various domains, VLLMs face challenges in\nmodality alignment, which can lead to issues like hallucinations and unsafe\ncontent generation. Current alignment techniques often rely on coarse feedback\nand external datasets, limiting scalability and performance. In this paper, we\npropose FiSAO (Fine-Grained Self-Alignment Optimization), a novel\nself-alignment method that utilizes the model's own visual encoder as a\nfine-grained verifier to improve vision-language alignment without the need for\nadditional data. By leveraging token-level feedback from the vision encoder,\nFiSAO significantly improves vision-language alignment, even surpassing\ntraditional preference tuning methods that require additional data. 
Through\nboth theoretical analysis and experimental validation, we demonstrate that\nFiSAO effectively addresses the misalignment problem in VLLMs, marking the\nfirst instance of token-level rewards being applied to such models.\n","authors":["Chenhang Cui","An Zhang","Yiyang Zhou","Zhaorun Chen","Gelei Deng","Huaxiu Yao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.14148v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2411.11487v1","updated":"2024-11-18T11:48:45Z","published":"2024-11-18T11:48:45Z","title":"Look a Group at Once: Multi-Slide Modeling for Survival Prediction","summary":" Survival prediction is a critical task in pathology. In clinical practice,\npathologists often examine multiple cases, leveraging a broader spectrum of\ncancer phenotypes to enhance pathological assessment. Despite significant\nadvancements in deep learning, current solutions typically model each slide as\na sample, struggling to effectively capture comparable and slide-agnostic\npathological features. In this paper, we introduce GroupMIL, a novel framework\ninspired by the clinical practice of collective analysis, which models multiple\nslides as a single sample and organizes groups of patches and slides\nsequentially to capture cross-slide prognostic features. We also present\nGPAMamba, a model designed to facilitate intra- and inter-slide feature\ninteractions, effectively capturing local micro-environmental characteristics\nwithin slide-level graphs while uncovering essential prognostic patterns across\nan extended patch sequence within the group framework. Furthermore, we develop\na dual-head predictor that delivers comprehensive survival risk and probability\nassessments for each patient. 
Extensive empirical evaluations demonstrate that\nour model significantly outperforms state-of-the-art approaches across five\ndatasets from The Cancer Genome Atlas.\n","authors":["Xinyang Li","Yi Zhang","Yi Xie","Jianfei Yang","Xi Wang","Hao Chen","Haixian Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11481v1","updated":"2024-11-18T11:36:17Z","published":"2024-11-18T11:36:17Z","title":"Exploring Emerging Trends and Research Opportunities in Visual Place\n Recognition","summary":" Visual-based recognition, e.g., image classification, object detection, etc.,\nis a long-standing challenge in computer vision and robotics communities.\nConcerning the roboticists, since the knowledge of the environment is a\nprerequisite for complex navigation tasks, visual place recognition is vital\nfor most localization implementations or re-localization and loop closure\ndetection pipelines within simultaneous localization and mapping (SLAM). More\nspecifically, it corresponds to the system's ability to identify and match a\npreviously visited location using computer vision tools. Towards developing\nnovel techniques with enhanced accuracy and robustness, while motivated by the\nsuccess presented in natural language processing methods, researchers have\nrecently turned their attention to vision-language models, which integrate\nvisual and textual data.\n","authors":["Antonios Gasteratos","Konstantinos A. Tsintotas","Tobias Fischer","Yiannis Aloimonos","Michael Milford"],"pdf_url":"https://arxiv.org/pdf/2411.11481v1.pdf","comment":"2 pages, 1 figure. 
40th Anniversary of the IEEE Conference on\n Robotics and Automation (ICRA@40), Rotterdam, Netherlands, September 23-26,\n 2024"},{"id":"http://arxiv.org/abs/2411.11477v1","updated":"2024-11-18T11:26:11Z","published":"2024-11-18T11:26:11Z","title":"SL-YOLO: A Stronger and Lighter Drone Target Detection Model","summary":" Detecting small objects in complex scenes, such as those captured by drones,\nis a daunting challenge due to the difficulty in capturing the complex features\nof small targets. While the YOLO family has achieved great success in large\ntarget detection, its performance is less than satisfactory when faced with\nsmall targets. Because of this, this paper proposes a revolutionary model\nSL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small\ntarget detection. We propose the Hierarchical Extended Path Aggregation Network\n(HEPAN), a pioneering cross-scale feature fusion method that can ensure\nunparalleled detection accuracy even in the most challenging environments. At\nthe same time, without sacrificing detection capabilities, we design the C2fDCB\nlightweight module and add the SCDown downsampling module to greatly reduce the\nmodel's parameters and computational complexity. Our experimental results on\nthe VisDrone2019 dataset reveal a significant improvement in performance, with\nmAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to\n28.9%. 
At the same time, the model parameters are reduced from 11.1M to 9.6M,\nand the FPS can reach 132, making it an ideal solution for real-time small\nobject detection in resource-constrained environments.\n","authors":["Defan Chen","Luchan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11475v1","updated":"2024-11-18T11:22:04Z","published":"2024-11-18T11:22:04Z","title":"MVLight: Relightable Text-to-3D Generation via Light-conditioned\n Multi-View Diffusion","summary":" Recent advancements in text-to-3D generation, building on the success of\nhigh-performance text-to-image generative models, have made it possible to\ncreate imaginative and richly textured 3D objects from textual descriptions.\nHowever, a key challenge remains in effectively decoupling light-independent\nand lighting-dependent components to enhance the quality of generated 3D models\nand their relighting performance. In this paper, we present MVLight, a novel\nlight-conditioned multi-view diffusion model that explicitly integrates\nlighting conditions directly into the generation process. This enables the\nmodel to synthesize high-quality images that faithfully reflect the specified\nlighting environment across multiple camera views. By leveraging this\ncapability to Score Distillation Sampling (SDS), we can effectively synthesize\n3D models with improved geometric precision and relighting capabilities. We\nvalidate the effectiveness of MVLight through extensive experiments and a user\nstudy.\n","authors":["Dongseok Shim","Yichun Shi","Kejie Li","H. 
Jin Kim","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11471v1","updated":"2024-11-18T11:13:30Z","published":"2024-11-18T11:13:30Z","title":"Generalizable Person Re-identification via Balancing Alignment and\n Uniformity","summary":" Domain generalizable person re-identification (DG re-ID) aims to learn\ndiscriminative representations that are robust to distributional shifts. While\ndata augmentation is a straightforward solution to improve generalization,\ncertain augmentations exhibit a polarized effect in this task, enhancing\nin-distribution performance while deteriorating out-of-distribution\nperformance. In this paper, we investigate this phenomenon and reveal that it\nleads to sparse representation spaces with reduced uniformity. To address this\nissue, we propose a novel framework, Balancing Alignment and Uniformity (BAU),\nwhich effectively mitigates this effect by maintaining a balance between\nalignment and uniformity. Specifically, BAU incorporates alignment and\nuniformity losses applied to both original and augmented images and integrates\na weighting strategy to assess the reliability of augmented samples, further\nimproving the alignment loss. Additionally, we introduce a domain-specific\nuniformity loss that promotes uniformity within each source domain, thereby\nenhancing the learning of domain-invariant features. Extensive experimental\nresults demonstrate that BAU effectively exploits the advantages of data\naugmentation, which previous studies could not fully utilize, and achieves\nstate-of-the-art performance without requiring complex training procedures. 
The\ncode is available at \\url{https://github.com/yoonkicho/BAU}.\n","authors":["Yoonki Cho","Jaeyoon Kim","Woo Jae Kim","Junsik Jung","Sung-eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2411.11471v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11466v1","updated":"2024-11-18T11:01:25Z","published":"2024-11-18T11:01:25Z","title":"MGNiceNet: Unified Monocular Geometric Scene Understanding","summary":" Monocular geometric scene understanding combines panoptic segmentation and\nself-supervised depth estimation, focusing on real-time application in\nautonomous vehicles. We introduce MGNiceNet, a unified approach that uses a\nlinked kernel formulation for panoptic segmentation and self-supervised depth\nestimation. MGNiceNet is based on the state-of-the-art real-time panoptic\nsegmentation method RT-K-Net and extends the architecture to cover both\npanoptic segmentation and self-supervised monocular depth estimation. To this\nend, we introduce a tightly coupled self-supervised depth estimation predictor\nthat explicitly uses information from the panoptic path for depth prediction.\nFurthermore, we introduce a panoptic-guided motion masking method to improve\ndepth estimation without relying on video panoptic segmentation annotations. We\nevaluate our method on two popular autonomous driving datasets, Cityscapes and\nKITTI. Our model shows state-of-the-art results compared to other real-time\nmethods and closes the gap to computationally more demanding methods. 
Source\ncode and trained models are available at\nhttps://github.com/markusschoen/MGNiceNet.\n","authors":["Markus Schön","Michael Buchholz","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2411.11466v1.pdf","comment":"Accepted for ACCV 2024"},{"id":"http://arxiv.org/abs/2411.06397v2","updated":"2024-11-18T10:53:24Z","published":"2024-11-10T09:09:41Z","title":"A Hybrid Approach for COVID-19 Detection: Combining Wasserstein GAN with\n Transfer Learning","summary":" COVID-19 is extremely contagious and its rapid growth has drawn attention\ntowards its early diagnosis. Early diagnosis of COVID-19 enables healthcare\nprofessionals and government authorities to break the chain of transition and\nflatten the epidemic curve. With the number of cases accelerating across the\ndeveloped world, COVID-19 induced Viral Pneumonia cases is a big challenge.\nOverlapping of COVID-19 cases with Viral Pneumonia and other lung infections\nwith limited dataset and long training hours is a serious problem to cater.\nLimited amount of data often results in over-fitting models and due to this\nreason, model does not predict generalized results. To fill this gap, we\nproposed GAN-based approach to synthesize images which later fed into the deep\nlearning models to classify images of COVID-19, Normal, and Viral Pneumonia.\nSpecifically, customized Wasserstein GAN is proposed to generate 19% more Chest\nX-ray images as compare to the real images. This expanded dataset is then used\nto train four proposed deep learning models: VGG-16, ResNet-50, GoogLeNet and\nMNAST. The result showed that expanded dataset utilized deep learning models to\ndeliver high classification accuracies. In particular, VGG-16 achieved highest\naccuracy of 99.17% among all four proposed schemes. Rest of the models like\nResNet-50, GoogLeNet and MNAST delivered 93.9%, 94.49% and 97.75% testing\naccuracies respectively. 
Later, the efficiency of these models is compared with\nthe state of art models on the basis of accuracy. Further, our proposed models\ncan be applied to address the issue of scant datasets for any problem of image\nanalysis.\n","authors":["Sumera Rounaq","Shahid Munir Shah","Mahmoud Aljawarneh"],"pdf_url":"https://arxiv.org/pdf/2411.06397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11458v1","updated":"2024-11-18T10:46:05Z","published":"2024-11-18T10:46:05Z","title":"HistoEncoder: a digital pathology foundation model for prostate cancer","summary":" Foundation models are trained on massive amounts of data to distinguish\ncomplex patterns and can be adapted to a wide range of downstream tasks with\nminimal computational resources. Here, we develop a foundation model for\nprostate cancer digital pathology called HistoEncoder by pre-training on 48\nmillion prostate tissue tile images. We demonstrate that HistoEncoder features\nextracted from tile images with similar histological patterns map closely\ntogether in the feature space. HistoEncoder outperforms models pre-trained with\nnatural images, even without fine-tuning or with 1000 times less training data.\nWe describe two use cases that leverage the capabilities of HistoEncoder by\nfine-tuning the model with a limited amount of data and computational\nresources. First, we show how HistoEncoder can be used to automatically\nannotate large-scale datasets with high accuracy. Second, we combine histomics\nwith commonly used clinical nomograms, significantly improving prostate\ncancer-specific death survival models. 
Foundation models such as HistoEncoder\ncan allow organizations with limited resources to build effective clinical\nsoftware tools without needing extensive datasets or significant amounts of\ncomputing.\n","authors":["Joona Pohjonen","Abderrahim-Oussama Batouche","Antti Rannikko","Kevin Sandeman","Andrew Erickson","Esa Pitkanen","Tuomas Mirtti"],"pdf_url":"https://arxiv.org/pdf/2411.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11455v1","updated":"2024-11-18T10:42:53Z","published":"2024-11-18T10:42:53Z","title":"The ADUULM-360 Dataset -- A Multi-Modal Dataset for Depth Estimation in\n Adverse Weather","summary":" Depth estimation is an essential task toward full scene understanding since\nit allows the projection of rich semantic information captured by cameras into\n3D space. While the field has gained much attention recently, datasets for\ndepth estimation lack scene diversity or sensor modalities. This work presents\nthe ADUULM-360 dataset, a novel multi-modal dataset for depth estimation. The\nADUULM-360 dataset covers all established autonomous driving sensor modalities,\ncameras, lidars, and radars. It covers a frontal-facing stereo setup, six\nsurround cameras covering the full 360-degree, two high-resolution long-range\nlidar sensors, and five long-range radar sensors. It is also the first depth\nestimation dataset that contains diverse scenes in good and adverse weather\nconditions. We conduct extensive experiments using state-of-the-art\nself-supervised depth estimation methods under different training tasks, such\nas monocular training, stereo training, and full surround training. Discussing\nthese results, we demonstrate common limitations of state-of-the-art methods,\nespecially in adverse weather conditions, which hopefully will inspire future\nresearch in this area. 
Our dataset, development kit, and trained baselines are\navailable at https://github.com/uulm-mrm/aduulm_360_dataset.\n","authors":["Markus Schön","Jona Ruof","Thomas Wodtko","Michael Buchholz","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2411.11455v1.pdf","comment":"2024 IEEE International Conference on Intelligent Transportation\n Systems (ITSC)"},{"id":"http://arxiv.org/abs/2411.11454v1","updated":"2024-11-18T10:42:27Z","published":"2024-11-18T10:42:27Z","title":"Relevance-guided Audio Visual Fusion for Video Saliency Prediction","summary":" Audio data, often synchronized with video frames, plays a crucial role in\nguiding the audience's visual attention. Incorporating audio information into\nvideo saliency prediction tasks can enhance the prediction of human visual\nbehavior. However, existing audio-visual saliency prediction methods often\ndirectly fuse audio and visual features, which ignore the possibility of\ninconsistency between the two modalities, such as when the audio serves as\nbackground music. To address this issue, we propose a novel relevance-guided\naudio-visual saliency prediction network dubbed AVRSP. Specifically, the\nRelevance-guided Audio-Visual feature Fusion module (RAVF) dynamically adjusts\nthe retention of audio features based on the semantic relevance between audio\nand visual elements, thereby refining the integration process with visual\nfeatures. Furthermore, the Multi-scale feature Synergy (MS) module integrates\nvisual features from different encoding stages, enhancing the network's ability\nto represent objects at various scales. The Multi-scale Regulator Gate (MRG)\ncould transfer crucial fusion information to visual features, thus optimizing\nthe utilization of multi-scale visual features. 
Extensive experiments on six\naudio-visual eye movement datasets have demonstrated that our AVRSP network\nachieves competitive performance in audio-visual saliency prediction.\n","authors":["Li Yu","Xuanzhe Sun","Pan Gao","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2411.11454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11435v1","updated":"2024-11-18T10:04:10Z","published":"2024-11-18T10:04:10Z","title":"GLDesigner: Leveraging Multi-Modal LLMs as Designer for Enhanced\n Aesthetic Text Glyph Layouts","summary":" Text logo design heavily relies on the creativity and expertise of\nprofessional designers, in which arranging element layouts is one of the most\nimportant procedures. However, few attention has been paid to this specific\ntask which needs to take precise textural details and user constraints into\nconsideration, but only on the broader tasks such as document/poster layout\ngeneration. In this paper, we propose a VLM-based framework that generates\ncontent-aware text logo layouts by integrating multi-modal inputs with user\nconstraints, supporting a more flexible and stable layout design in real-world\napplications. We introduce two model techniques to reduce the computation for\nprocessing multiple glyph images simultaneously, while does not face\nperformance degradation. To support instruction-tuning of out model, we\nconstruct two extensive text logo datasets, which are 5x more larger than the\nexisting public dataset. Except for the geometric annotations (e.g. text masks\nand character recognition), we also compliment with comprehensive layout\ndescriptions in natural language format, for more effective training to have\nreasoning ability when dealing with complex layouts and custom user\nconstraints. Experimental studies demonstrate the effectiveness of our proposed\nmodel and datasets, when comparing with previous methods in various benchmarks\nto evaluate geometric aesthetics and human preferences. 
The code and datasets\nwill be publicly available.\n","authors":["Junwen He","Yifan Wang","Lijun Wang","Huchuan Lu","Jun-Yan He","Chenyang Li","Hanyuan Chen","Jin-Peng Lan","Bin Luo","Yifeng Geng"],"pdf_url":"https://arxiv.org/pdf/2411.11435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01295v5","updated":"2024-11-18T09:54:05Z","published":"2024-07-01T13:47:54Z","title":"Formal Verification of Deep Neural Networks for Object Detection","summary":" Deep neural networks (DNNs) are widely used in real-world applications, yet\nthey remain vulnerable to errors and adversarial attacks. Formal verification\noffers a systematic approach to identify and mitigate these vulnerabilities,\nenhancing model robustness and reliability. While most existing verification\nmethods focus on image classification models, this work extends formal\nverification to the more complex domain of emph{object detection} models. We\npropose a formulation for verifying the robustness of such models and\ndemonstrate how state-of-the-art verification tools, originally developed for\nclassification, can be adapted for this purpose. Our experiments, conducted on\nvarious datasets and networks, highlight the ability of formal verification to\nuncover vulnerabilities in object detection models, underscoring the need to\nextend verification efforts to this domain. This work lays the foundation for\nfurther research into formal verification across a broader range of computer\nvision applications.\n","authors":["Yizhak Y. Elboher","Avraham Raviv","Yael Leibovich Weiss","Omer Cohen","Roy Assa","Guy Katz","Hillel Kugler"],"pdf_url":"https://arxiv.org/pdf/2407.01295v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11421v1","updated":"2024-11-18T09:46:45Z","published":"2024-11-18T09:46:45Z","title":"Towards fast DBSCAN via Spectrum-Preserving Data Compression","summary":" This paper introduces a novel method to significantly accelerate DBSCAN by\nemploying spectral data compression. 
The proposed approach reduces the size of\nthe data set by a factor of five while preserving the essential clustering\ncharacteristics through an innovative spectral compression technique. This\nenables DBSCAN to run substantially faster without any loss of accuracy.\nExperiments on real-world data sets, such as USPS, demonstrate the method's\ncapability to achieve this dramatic reduction in data size while maintaining\nclustering performance.\n","authors":["Yongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15287v2","updated":"2024-11-18T09:35:46Z","published":"2024-05-24T07:19:40Z","title":"ArtWeaver: Advanced Dynamic Style Integration via Diffusion Model","summary":" Stylized Text-to-Image Generation (STIG) aims to generate images from text\nprompts and style reference images. In this paper, we present ArtWeaver, a\nnovel framework that leverages pretrained Stable Diffusion (SD) to address\nchallenges such as misinterpreted styles and inconsistent semantics. Our\napproach introduces two innovative modules: the mixed style descriptor and the\ndynamic attention adapter. The mixed style descriptor enhances SD by combining\ncontent-aware and frequency-disentangled embeddings from CLIP with additional\nsources that capture global statistics and textual information, thus providing\na richer blend of style-related and semantic-related knowledge. To achieve a\nbetter balance between adapter capacity and semantic control, the dynamic\nattention adapter is integrated into the diffusion UNet, dynamically\ncalculating adaptation weights based on the style descriptors. Additionally, we\nintroduce two objective functions to optimize the model alongside the denoising\nloss, further enhancing semantic and style consistency. 
Extensive experiments\ndemonstrate the superiority of ArtWeaver over existing methods, producing\nimages with diverse target styles while maintaining the semantic integrity of\nthe text prompts.\n","authors":["Chengming Xu","Kai Hu","Qilin Wang","Donghao Luo","Jiangning Zhang","Xiaobin Hu","Yanwei Fu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2405.15287v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11409v1","updated":"2024-11-18T09:30:05Z","published":"2024-11-18T09:30:05Z","title":"IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet\n Videos","summary":" Shape assembly is a ubiquitous task in daily life, integral for constructing\ncomplex 3D structures like IKEA furniture. While significant progress has been\nmade in developing autonomous agents for shape assembly, existing datasets have\nnot yet tackled the 4D grounding of assembly instructions in videos, essential\nfor a holistic understanding of assembly in 3D space over time. We introduce\nIKEA Video Manuals, a dataset that features 3D models of furniture parts,\ninstructional manuals, assembly videos from the Internet, and most importantly,\nannotations of dense spatio-temporal alignments between these data modalities.\nTo demonstrate the utility of IKEA Video Manuals, we present five applications\nessential for shape assembly: assembly plan generation, part-conditioned\nsegmentation, part-conditioned pose estimation, video object segmentation, and\nfurniture assembly based on instructional video manuals. For each application,\nwe provide evaluation metrics and baseline methods. 
Through experiments on our\nannotated data, we highlight many challenges in grounding assembly instructions\nin videos to improve shape assembly, including handling occlusions, varying\nviewpoints, and extended assembly sequences.\n","authors":["Yunong Liu","Cristobal Eyzaguirre","Manling Li","Shubh Khanna","Juan Carlos Niebles","Vineeth Ravi","Saumitra Mishra","Weiyu Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11409v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.11396v1","updated":"2024-11-18T09:18:36Z","published":"2024-11-18T09:18:36Z","title":"Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face\n Forgery Detection","summary":" The rapid advancement of face forgery techniques has introduced a growing\nvariety of forgeries. Incremental Face Forgery Detection (IFFD), involving\ngradually adding new forgery data to fine-tune the previously trained model,\nhas been introduced as a promising strategy to deal with evolving forgery\nmethods. However, a naively trained IFFD model is prone to catastrophic\nforgetting when new forgeries are integrated, as treating all forgeries as a\nsingle ''Fake\" class in the Real/Fake classification can cause different\nforgery types overriding one another, thereby resulting in the forgetting of\nunique characteristics from earlier tasks and limiting the model's\neffectiveness in learning forgery specificity and generality. In this paper, we\npropose to stack the latent feature distributions of previous and new tasks\nbrick by brick, $\\textit{i.e.}$, achieving $\\textbf{aligned feature\nisolation}$. In this manner, we aim to preserve learned forgery information and\naccumulate new knowledge by minimizing distribution overriding, thereby\nmitigating catastrophic forgetting. 
To achieve this, we first introduce Sparse\nUniform Replay (SUR) to obtain the representative subsets that could be treated\nas the uniformly sparse versions of the previous global distributions. We then\npropose a Latent-space Incremental Detector (LID) that leverages SUR data to\nisolate and align distributions. For evaluation, we construct a more advanced\nand comprehensive benchmark tailored for IFFD. The leading experimental results\nvalidate the superiority of our method.\n","authors":["Jikang Cheng","Zhiyuan Yan","Ying Zhang","Li Hao","Jiaxin Ai","Qin Zou","Chen Li","Zhongyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05953v6","updated":"2024-11-18T08:53:41Z","published":"2024-05-09T17:46:22Z","title":"Frame Interpolation with Consecutive Brownian Bridge Diffusion","summary":" Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a\ndiffusion-based conditional image generation problem, synthesizing the\nintermediate frame given a random noise and neighboring frames. Due to the\nrelatively high resolution of videos, Latent Diffusion Models (LDMs) are\nemployed as the conditional generation model, where the autoencoder compresses\nimages into latent representations for diffusion and then reconstructs images\nfrom these latent representations. Such a formulation poses a crucial\nchallenge: VFI expects that the output is deterministically equal to the ground\ntruth intermediate frame, but LDMs randomly generate a diverse set of different\nimages when the model runs multiple times. The reason for the diverse\ngeneration is that the cumulative variance (variance accumulated at each step\nof generation) of generated latent representations in LDMs is large. This makes\nthe sampling trajectory random, resulting in diverse rather than deterministic\ngenerations. To address this problem, we propose our unique solution: Frame\nInterpolation with Consecutive Brownian Bridge Diffusion. 
Specifically, we\npropose consecutive Brownian Bridge diffusion that takes a deterministic\ninitial value as input, resulting in a much smaller cumulative variance of\ngenerated latent representations. Our experiments suggest that our method can\nimprove together with the improvement of the autoencoder and achieve\nstate-of-the-art performance in VFI, leaving strong potential for further\nenhancement.\n","authors":["Zonglin Lyu","Ming Li","Jianbo Jiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.05953v6.pdf","comment":"Formatting"},{"id":"http://arxiv.org/abs/2411.11376v1","updated":"2024-11-18T08:40:25Z","published":"2024-11-18T08:40:25Z","title":"Lung Disease Detection with Vision Transformers: A Comparative Study of\n Machine Learning Methods","summary":" Recent advancements in medical image analysis have predominantly relied on\nConvolutional Neural Networks (CNNs), achieving impressive performance in chest\nX-ray classification tasks, such as the 92% AUC reported by AutoThorax-Net and\nthe 88% AUC achieved by ChexNet in classifcation tasks. However, in the medical\nfield, even small improvements in accuracy can have significant clinical\nimplications. This study explores the application of Vision Transformers (ViT),\na state-of-the-art architecture in machine learning, to chest X-ray analysis,\naiming to push the boundaries of diagnostic accuracy. I present a comparative\nanalysis of two ViT-based approaches: one utilizing full chest X-ray images and\nanother focusing on segmented lung regions. Experiments demonstrate that both\nmethods surpass the performance of traditional CNN-based models, with the\nfull-image ViT achieving up to 97.83% accuracy and the lung-segmented ViT\nreaching 96.58% accuracy in classifcation of diseases on three label and AUC of\n94.54% when label numbers are increased to eight. Notably, the full-image\napproach showed superior performance across all metrics, including precision,\nrecall, F1 score, and AUC-ROC. 
These findings suggest that Vision Transformers\ncan effectively capture relevant features from chest X-rays without the need\nfor explicit lung segmentation, potentially simplifying the preprocessing\npipeline while maintaining high accuracy. This research contributes to the\ngrowing body of evidence supporting the efficacy of transformer-based\narchitectures in medical image analysis and highlights their potential to\nenhance diagnostic precision in clinical settings.\n","authors":["Baljinnyam Dayan"],"pdf_url":"https://arxiv.org/pdf/2411.11376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11374v1","updated":"2024-11-18T08:37:48Z","published":"2024-11-18T08:37:48Z","title":"LeC$^2$O-NeRF: Learning Continuous and Compact Large-Scale Occupancy for\n Urban Scenes","summary":" In NeRF, a critical problem is to effectively estimate the occupancy to guide\nempty-space skipping and point sampling. Grid-based methods work well for\nsmall-scale scenes. However, on large-scale scenes, they are limited by\npredefined bounding boxes, grid resolutions, and high memory usage for grid\nupdates, and thus struggle to speed up training for large-scale, irregularly\nbounded and complex urban scenes without sacrificing accuracy. In this paper,\nwe propose to learn a continuous and compact large-scale occupancy network,\nwhich can classify 3D points as occupied or unoccupied points. We train this\noccupancy network end-to-end together with the radiance field in a\nself-supervised manner by three designs. First, we propose a novel imbalanced\noccupancy loss to regularize the occupancy network. It makes the occupancy\nnetwork effectively control the ratio of unoccupied and occupied points,\nmotivated by the prior that most of 3D scene points are unoccupied. Second, we\ndesign an imbalanced architecture containing a large scene network and a small\nempty space network to separately encode occupied and unoccupied points\nclassified by the occupancy network. 
This imbalanced structure can effectively\nmodel the imbalanced nature of occupied and unoccupied regions. Third, we\ndesign an explicit density loss to guide the occupancy network, making the\ndensity of unoccupied points smaller. As far as we know, we are the first to\nlearn a continuous and compact occupancy of large-scale NeRF by a network. In\nour experiments, our occupancy network can quickly learn more compact, accurate\nand smooth occupancy compared to the occupancy grid. With our learned occupancy\nas guidance for empty space skipping on challenging large-scale benchmarks, our\nmethod consistently obtains higher accuracy compared to the occupancy grid, and\nour method can speed up state-of-the-art NeRF methods without sacrificing\naccuracy.\n","authors":["Zhenxing Mi","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2411.11374v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2411.11370v1","updated":"2024-11-18T08:32:51Z","published":"2024-11-18T08:32:51Z","title":"TL-CLIP: A Power-specific Multimodal Pre-trained Visual Foundation Model\n for Transmission Line Defect Recognition","summary":" Transmission line defect recognition models have traditionally used general\npre-trained weights as the initial basis for their training. These models often\nsuffer weak generalization capability due to the lack of domain knowledge in\nthe pre-training dataset. To address this issue, we propose a two-stage\ntransmission-line-oriented contrastive language-image pre-training (TL-CLIP)\nframework, which lays a more effective foundation for transmission line defect\nrecognition. 
The pre-training process employs a novel power-specific multimodal\nalgorithm assisted with two power-specific pre-training tasks for better\nmodeling the power-related semantic knowledge contained in the inspection data.\nTo fine-tune the pre-trained model, we develop a transfer learning strategy,\nnamely fine-tuning with pre-training objective (FTP), to alleviate the\noverfitting problem caused by limited inspection data. Experimental results\ndemonstrate that the proposed method significantly improves the performance of\ntransmission line defect recognition in both classification and detection\ntasks, indicating clear advantages over traditional pre-trained models in the\nscene of transmission line inspection.\n","authors":["Ke Zhang","Zhaoye Zheng","Yurong Guo","Jiacun Wang","Jiyuan Yang","Yangjie Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.11370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10370v2","updated":"2024-11-18T08:29:08Z","published":"2024-05-16T18:03:41Z","title":"Grounded 3D-LLM with Referent Tokens","summary":" Prior studies on 3D scene understanding have primarily developed specialized\nmodels for specific tasks or required task-specific fine-tuning. In this study,\nwe propose Grounded 3D-LLM, which explores the potential of 3D large\nmulti-modal models (3D LMMs) to consolidate various 3D vision tasks within a\nunified generative framework. The model uses scene referent tokens as special\nnoun phrases to reference 3D scenes, enabling it to handle sequences that\ninterleave 3D and textual data. Per-task instruction-following templates are\nemployed to ensure natural and diversity in translating 3D vision tasks into\nlanguage formats. 
To facilitate the use of referent tokens in subsequent\nlanguage modeling, we provide a large-scale, automatically curated grounded\nscene-text dataset with over 1 million phrase-to-region correspondences and\nintroduce Contrastive Language-Scene Pre-training (CLASP) to perform\nphrase-level scene-text alignment using this data. Our comprehensive evaluation\ncovers open-ended tasks like dense captioning and 3D question answering,\nalongside close-ended tasks such as object detection and language grounding.\nExperiments across multiple 3D benchmarks reveal the leading performance and\nthe broad applicability of Grounded 3D-LLM. Code and datasets are available at\nthe https://groundedscenellm.github.io/grounded_3d-llm.github.io.\n","authors":["Yilun Chen","Shuai Yang","Haifeng Huang","Tai Wang","Runsen Xu","Ruiyuan Lyu","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2405.10370v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.11363v1","updated":"2024-11-18T08:18:44Z","published":"2024-11-18T08:18:44Z","title":"GPS-Gaussian+: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-Time Human-Scene Rendering from Sparse Views","summary":" Differentiable rendering techniques have recently shown promising results for\nfree-viewpoint video synthesis of characters. However, such methods, either\nGaussian Splatting or neural implicit rendering, typically necessitate\nper-subject optimization which does not meet the requirement of real-time\nrendering in an interactive application. We propose a generalizable Gaussian\nSplatting approach for high-resolution image rendering under a sparse-view\ncamera setting. To this end, we introduce Gaussian parameter maps defined on\nthe source views and directly regress Gaussian properties for instant novel\nview synthesis without any fine-tuning or optimization. 
We train our Gaussian\nparameter regression module on human-only data or human-scene data, jointly\nwith a depth estimation module to lift 2D parameter maps to 3D space. The\nproposed framework is fully differentiable with both depth and rendering\nsupervision or with only rendering supervision. We further introduce a\nregularization term and an epipolar attention mechanism to preserve geometry\nconsistency between two source views, especially when neglecting depth\nsupervision. Experiments on several datasets demonstrate that our method\noutperforms state-of-the-art methods while achieving an exceeding rendering\nspeed.\n","authors":["Boyao Zhou","Shunyuan Zheng","Hanzhang Tu","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11363v1.pdf","comment":"Journal extension of CVPR 2024,Project\n page:https://yaourtb.github.io/GPS-Gaussian+"},{"id":"http://arxiv.org/abs/2411.01916v2","updated":"2024-11-18T08:16:36Z","published":"2024-11-04T09:28:18Z","title":"Masked Autoencoders are Parameter-Efficient Federated Continual Learners","summary":" Federated learning is a specific distributed learning paradigm in which a\ncentral server aggregates updates from multiple clients' local models, thereby\nenabling the server to learn without requiring clients to upload their private\ndata, maintaining data privacy. While existing federated learning methods are\nprimarily designed for static data, real-world applications often require\nclients to learn new categories over time. This challenge necessitates the\nintegration of continual learning techniques, leading to federated continual\nlearning (FCL). To address both catastrophic forgetting and non-IID issues, we\npropose to use masked autoencoders (MAEs) as parameter-efficient federated\ncontinual learners, called pMAE. pMAE learns reconstructive prompt on the\nclient side through image reconstruction using MAE. 
On the server side, it\nreconstructs the uploaded restore information to capture the data distribution\nacross previous tasks and different clients, using these reconstructed images\nto finetune discriminative prompt and classifier parameters tailored for\nclassification, thereby alleviating catastrophic forgetting and non-IID issues\non a global scale. Experimental results demonstrate that pMAE achieves\nperformance comparable to existing prompt-based methods and can enhance their\neffectiveness, particularly when using self-supervised pre-trained transformers\nas the backbone. Code is available at: https://github.com/ycheoo/pMAE.\n","authors":["Yuchen He","Xiangfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11362v1","updated":"2024-11-18T08:13:22Z","published":"2024-11-18T08:13:22Z","title":"MAIRA-Seg: Enhancing Radiology Report Generation with Segmentation-Aware\n Multimodal Large Language Models","summary":" There is growing interest in applying AI to radiology report generation,\nparticularly for chest X-rays (CXRs). This paper investigates whether\nincorporating pixel-level information through segmentation masks can improve\nfine-grained image interpretation of multimodal large language models (MLLMs)\nfor radiology report generation. We introduce MAIRA-Seg, a segmentation-aware\nMLLM framework designed to utilize semantic segmentation masks alongside CXRs\nfor generating radiology reports. We train expert segmentation models to obtain\nmask pseudolabels for radiology-specific structures in CXRs. Subsequently,\nbuilding on the architectures of MAIRA, a CXR-specialised model for report\ngeneration, we integrate a trainable segmentation tokens extractor that\nleverages these mask pseudolabels, and employ mask-aware prompting to generate\ndraft radiology reports. Our experiments on the publicly available MIMIC-CXR\ndataset show that MAIRA-Seg outperforms non-segmentation baselines. 
We also\ninvestigate set-of-marks prompting with MAIRA and find that MAIRA-Seg\nconsistently demonstrates comparable or superior performance. The results\nconfirm that using segmentation masks enhances the nuanced reasoning of MLLMs,\npotentially contributing to better clinical outcomes.\n","authors":["Harshita Sharma","Valentina Salvatelli","Shaury Srivastav","Kenza Bouzid","Shruthi Bannur","Daniel C. Castro","Maximilian Ilse","Sam Bond-Taylor","Mercy Prasanna Ranjit","Fabian Falck","Fernando Pérez-García","Anton Schwaighofer","Hannah Richardson","Maria Teodora Wetscherek","Stephanie L. Hyland","Javier Alvarez-Valle"],"pdf_url":"https://arxiv.org/pdf/2411.11362v1.pdf","comment":"Accepted as Proceedings Paper at ML4H 2024"},{"id":"http://arxiv.org/abs/2411.11361v1","updated":"2024-11-18T08:12:54Z","published":"2024-11-18T08:12:54Z","title":"Scalable Autoregressive Monocular Depth Estimation","summary":" This paper proposes a new autoregressive model as an effective and scalable\nmonocular depth estimator. Our idea is simple: We tackle the monocular depth\nestimation (MDE) task with an autoregressive prediction paradigm, based on two\ncore designs. First, our depth autoregressive model (DAR) treats the depth map\nof different resolutions as a set of tokens, and conducts the low-to-high\nresolution autoregressive objective with a patch-wise casual mask. Second, our\nDAR recursively discretizes the entire depth range into more compact intervals,\nand attains the coarse-to-fine granularity autoregressive objective in an\nordinal-regression manner. By coupling these two autoregressive objectives, our\nDAR establishes new state-of-the-art (SOTA) on KITTI and NYU Depth v2 by clear\nmargins. Further, our scalable approach allows us to scale the model up to 2.0B\nand achieve the best RMSE of 1.799 on the KITTI dataset (5% improvement)\ncompared to 1.896 by the current SOTA (Depth Anything). DAR further showcases\nzero-shot generalization ability on unseen datasets. 
These results suggest that\nDAR yields superior performance with an autoregressive prediction paradigm,\nproviding a promising approach to equip modern autoregressive large models\n(e.g., GPT-4o) with depth estimation capabilities.\n","authors":["Jinhong Wang","Jian Liu","Dongqi Tang","Weiqiang Wang","Wentong Li","Danny Chen","J intai Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11360v1","updated":"2024-11-18T08:10:49Z","published":"2024-11-18T08:10:49Z","title":"CCExpert: Advancing MLLM Capability in Remote Sensing Change Captioning\n with Difference-Aware Integration and a Foundational Dataset","summary":" Remote Sensing Image Change Captioning (RSICC) aims to generate natural\nlanguage descriptions of surface changes between multi-temporal remote sensing\nimages, detailing the categories, locations, and dynamics of changed objects\n(e.g., additions or disappearances). Many current methods attempt to leverage\nthe long-sequence understanding and reasoning capabilities of multimodal large\nlanguage models (MLLMs) for this task. However, without comprehensive data\nsupport, these approaches often alter the essential feature transmission\npathways of MLLMs, disrupting the intrinsic knowledge within the models and\nlimiting their potential in RSICC. In this paper, we propose a novel model,\nCCExpert, based on a new, advanced multimodal large model framework. Firstly,\nwe design a difference-aware integration module to capture multi-scale\ndifferences between bi-temporal images and incorporate them into the original\nimage context, thereby enhancing the signal-to-noise ratio of differential\nfeatures. 
Secondly, we constructed a high-quality, diversified dataset called\nCC-Foundation, containing 200,000 image pairs and 1.2 million captions, to\nprovide substantial data support for continue pretraining in this domain.\nLastly, we employed a three-stage progressive training process to ensure the\ndeep integration of the difference-aware integration module with the pretrained\nMLLM. CCExpert achieved a notable performance of $S^*_m=81.80$ on the LEVIR-CC\nbenchmark, significantly surpassing previous state-of-the-art methods. The code\nand part of the dataset will soon be open-sourced at\nhttps://github.com/Meize0729/CCExpert.\n","authors":["Zhiming Wang","Mingze Wang","Sheng Xu","Yanjing Li","Baochang Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11357v1","updated":"2024-11-18T08:03:11Z","published":"2024-11-18T08:03:11Z","title":"Text-guided Zero-Shot Object Localization","summary":" Object localization is a hot issue in computer vision area, which aims to\nidentify and determine the precise location of specific objects from image or\nvideo. Most existing object localization methods heavily rely on extensive\nlabeled data, which are costly to annotate and constrain their applicability.\nTherefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for\naddressing the aforementioned challenges. In the proposed framework, we\nintroduce the Contrastive Language Image Pre-training (CLIP) module which could\nintegrate visual and linguistic information effectively. Furthermore, we design\na Text Self-Similarity Matching (TSSM) module, which could improve the\nlocalization accuracy by enhancing the representation of text features\nextracted by CLIP module. Hence, the proposed framework can be guided by prompt\nwords to identify and locate specific objects in an image in the absence of\nlabeled samples. 
The results of extensive experiments demonstrate that the\nproposed method could improve the localization performance significantly and\nestablishes an effective benchmark for further research.\n","authors":["Jingjing Wang","Xinglin Piao","Zongzhi Gao","Bo Li","Yong Zhang","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2411.11357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02506v2","updated":"2024-11-18T07:59:55Z","published":"2024-06-04T17:24:19Z","title":"An Open-Source Tool for Mapping War Destruction at Scale in Ukraine\n using Sentinel-1 Time Series","summary":" Access to detailed war impact assessments is crucial for humanitarian\norganizations to effectively assist populations most affected by armed\nconflicts. However, maintaining a comprehensive understanding of the situation\non the ground is challenging, especially in conflicts that cover vast\nterritories and extend over long periods. This study presents a scalable and\ntransferable method for estimating war-induced damage to buildings. We first\ntrain a machine learning model to output pixel-wise probability of destruction\nfrom Synthetic Aperture Radar (SAR) satellite image time series, leveraging\nexisting, manual damage assessments as ground truth and cloud-based geospatial\nanalysis tools for large-scale inference. We further post-process these\nassessments using open building footprints to obtain a final damage estimate\nper building. We introduce an accessible, open-source tool that allows users to\nadjust the confidence interval based on their specific requirements and use\ncases. Our approach enables humanitarian organizations and other actors to\nrapidly screen large geographic regions for war impacts. 
We provide two\npublicly accessible dashboards: a Ukraine Damage Explorer to dynamically view\nour pre-computed estimates, and a Rapid Damage Mapping Tool to easily run our\nmethod and produce custom maps.\n","authors":["Olivier Dietrich","Torben Peters","Vivien Sainte Fare Garnot","Valerie Sticher","Thao Ton-That Whelan","Konrad Schindler","Jan Dirk Wegner"],"pdf_url":"https://arxiv.org/pdf/2406.02506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11356v1","updated":"2024-11-18T07:57:59Z","published":"2024-11-18T07:57:59Z","title":"Superpixel-informed Implicit Neural Representation for Multi-Dimensional\n Data","summary":" Recently, implicit neural representations (INRs) have attracted increasing\nattention for multi-dimensional data recovery. However, INRs simply map\ncoordinates via a multi-layer perception (MLP) to corresponding values,\nignoring the inherent semantic information of the data. To leverage semantic\npriors from the data, we propose a novel Superpixel-informed INR (S-INR).\nSpecifically, we suggest utilizing generalized superpixel instead of pixel as\nan alternative basic unit of INR for multi-dimensional data (e.g., images and\nweather data). The coordinates of generalized superpixels are first fed into\nexclusive attention-based MLPs, and then the intermediate results interact with\na shared dictionary matrix. The elaborately designed modules in S-INR allow us\nto ingenuously exploit the semantic information within and across generalized\nsuperpixels. 
Extensive experiments on various applications validate the\neffectiveness and efficacy of our S-INR compared to state-of-the-art INR\nmethods.\n","authors":["Jiayi Li","Xile Zhao","Jianli Wang","Chao Wang","Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11356v1.pdf","comment":"Accepted at ECCV 2024, 18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.11354v1","updated":"2024-11-18T07:50:22Z","published":"2024-11-18T07:50:22Z","title":"A comprehensive survey of oracle character recognition: challenges,\n benchmarks, and beyond","summary":" Oracle character recognition-an analysis of ancient Chinese inscriptions\nfound on oracle bones-has become a pivotal field intersecting archaeology,\npaleography, and historical cultural studies. Traditional methods of oracle\ncharacter recognition have relied heavily on manual interpretation by experts,\nwhich is not only labor-intensive but also limits broader accessibility to the\ngeneral public. With recent breakthroughs in pattern recognition and deep\nlearning, there is a growing movement towards the automation of oracle\ncharacter recognition (OrCR), showing considerable promise in tackling the\nchallenges inherent to these ancient scripts. However, a comprehensive\nunderstanding of OrCR still remains elusive. Therefore, this paper presents a\nsystematic and structured survey of the current landscape of OrCR research. We\ncommence by identifying and analyzing the key challenges of OrCR. Then, we\nprovide an overview of the primary benchmark datasets and digital resources\navailable for OrCR. 
A review of contemporary research methodologies follows, in\nwhich their respective efficacies, limitations, and applicability to the\ncomplex nature of oracle characters are critically highlighted and examined.\nAdditionally, our review extends to ancillary tasks associated with OrCR across\ndiverse disciplines, providing a broad-spectrum analysis of its applications.\nWe conclude with a forward-looking perspective, proposing potential avenues for\nfuture investigations that could yield significant advancements in the field.\n","authors":["Jing Li","Xueke Chi","Qiufeng Wang","Dahan Wang","Kaizhu Huang","Yongge Liu","Cheng-lin Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10679v2","updated":"2024-11-18T07:46:35Z","published":"2024-08-20T09:31:03Z","title":"DemMamba: Alignment-free Raw Video Demoireing with Frequency-assisted\n Spatio-Temporal Mamba","summary":" Moire patterns, resulting from the interference of two similar repetitive\npatterns, are frequently observed during the capture of images or videos on\nscreens. These patterns vary in color, shape, and location across video frames,\nposing challenges in extracting information from adjacent frames and preserving\ntemporal consistency throughout the restoration process. Existing deep learning\nmethods often depend on well-designed alignment modules, such as optical flow\nestimation, deformable convolution, and cross-frame self-attention layers,\nincurring high computational costs. Recent studies indicate that utilizing raw\ndata as input can significantly improve the effectiveness of video demoireing\nby providing the pristine degradation information and more detailed content.\nHowever, previous works fail to design both efficient and effective raw video\ndemoireing methods that can maintain temporal consistency and prevent\ndegradation of color and spatial details. 
This paper introduces a novel\nalignment-free raw video demoireing network with frequency-assisted\nspatio-temporal Mamba (DemMamba). It features sequentially arranged Spatial\nMamba Blocks (SMB) and Temporal Mamba Blocks (TMB) to effectively model the\ninter- and intra-relationships in raw videos affected by moire patterns. An\nAdaptive Frequency Block (AFB) within the SMB facilitates demoireing in the\nfrequency domain, while a Channel Attention Block (CAB) in the TMB enhances the\ntemporal information interactions by leveraging inter-channel relationships\namong features. Extensive experiments demonstrate that our proposed DemMamba\nsurpasses state-of-the-art methods by 1.3 dB in PSNR, and also provides a\nsatisfactory visual experience.\n","authors":["Shuning Xu","Xina Liu","Binbin Song","Xiangyu Chen","Qiubo Chen","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11351v1","updated":"2024-11-18T07:43:12Z","published":"2024-11-18T07:43:12Z","title":"Visual-Semantic Graph Matching Net for Zero-Shot Learning","summary":" Zero-shot learning (ZSL) aims to leverage additional semantic information to\nrecognize unseen classes. To transfer knowledge from seen to unseen classes,\nmost ZSL methods often learn a shared embedding space by simply aligning visual\nembeddings with semantic prototypes. However, methods trained under this\nparadigm often struggle to learn robust embedding space because they align the\ntwo modalities in an isolated manner among classes, which ignore the crucial\nclass relationship during the alignment process. To address the aforementioned\nchallenges, this paper proposes a Visual-Semantic Graph Matching Net, termed as\nVSGMN, which leverages semantic relationships among classes to aid in\nvisual-semantic embedding. 
VSGMN employs a Graph Build Network (GBN) and a\nGraph Matching Network (GMN) to achieve two-stage visual-semantic alignment.\nSpecifically, GBN first utilizes an embedding-based approach to build visual\nand semantic graphs in the semantic space and align the embedding with its\nprototype for first-stage alignment. Additionally, to supplement unseen class\nrelations in these graphs, GBN also build the unseen class nodes based on\nsemantic relationships. In the second stage, GMN continuously integrates\nneighbor and cross-graph information into the constructed graph nodes, and\naligns the node relationships between the two graphs under the class\nrelationship constraint. Extensive experiments on three benchmark datasets\ndemonstrate that VSGMN achieves superior performance in both conventional and\ngeneralized ZSL scenarios. The implementation of our VSGMN and experimental\nresults are available at github: https://github.com/dbwfd/VSGMN\n","authors":["Bowen Duan","Shiming Chen","Yufei Guo","Guo-Sen Xie","Weiping Ding","Yisong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11351v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.11343v1","updated":"2024-11-18T07:26:09Z","published":"2024-11-18T07:26:09Z","title":"Teaching Video Diffusion Model with Latent Physical Phenomenon Knowledge","summary":" Video diffusion models have exhibited tremendous progress in various video\ngeneration tasks. However, existing models struggle to capture latent physical\nknowledge, failing to infer physical phenomena that are challenging to\narticulate with natural language. Generating videos following the fundamental\nphysical laws is still an opening challenge. To address this challenge, we\npropose a novel method to teach video diffusion models with latent physical\nphenomenon knowledge, enabling the accurate generation of physically informed\nphenomena. 
Specifically, we first pretrain Masked Autoencoders (MAE) to\nreconstruct the physical phenomena, resulting in output embeddings that\nencapsulate latent physical phenomenon knowledge. Leveraging these embeddings,\nwe could generate the pseudo-language prompt features based on the aligned\nspatial relationships between CLIP vision and language encoders. Particularly,\ngiven that diffusion models typically use CLIP's language encoder for text\nprompt embeddings, our approach integrates the CLIP visual features informed by\nlatent physical knowledge into a quaternion hidden space. This enables the\nmodeling of spatial relationships to produce physical knowledge-informed\npseudo-language prompts. By incorporating these prompt features and fine-tuning\nthe video diffusion model in a parameter-efficient manner, the physical\nknowledge-informed videos are successfully generated. We validate our method\nextensively through both numerical simulations and real-world observations of\nphysical phenomena, demonstrating its remarkable performance across diverse\nscenarios.\n","authors":["Qinglong Cao","Ding Wang","Xirui Li","Yuntian Chen","Chao Ma","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11343v1.pdf","comment":"7 figures, 14 pages"},{"id":"http://arxiv.org/abs/2312.09063v3","updated":"2024-11-18T07:20:10Z","published":"2023-12-14T16:00:28Z","title":"Image Demoireing in RAW and sRGB Domains","summary":" Moire patterns frequently appear when capturing screens with smartphones or\ncameras, potentially compromising image quality. Previous studies suggest that\nmoire pattern elimination in the RAW domain offers greater effectiveness\ncompared to demoireing in the sRGB domain. Nevertheless, relying solely on RAW\ndata for image demoireing is insufficient in mitigating the color cast due to\nthe absence of essential information required for the color correction by the\nimage signal processor (ISP). 
In this paper, we propose to jointly utilize both\nRAW and sRGB data for image demoireing (RRID), which are readily accessible in\nmodern smartphones and DSLR cameras. We develop Skip-Connection-based\nDemoireing Module (SCDM) with Gated Feedback Module (GFM) and Frequency\nSelection Module (FSM) embedded in skip-connections for the efficient and\neffective demoireing of RAW and sRGB features, respectively. Subsequently, we\ndesign a RGB Guided ISP (RGISP) to learn a device-dependent ISP, assisting the\nprocess of color recovery. Extensive experiments demonstrate that our RRID\noutperforms state-of-the-art approaches, in terms of the performance in moire\npattern removal and color cast correction by 0.62dB in PSNR and 0.003 in SSIM.\n","authors":["Shuning Xu","Binbin Song","Xiangyu Chen","Xina Liu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.09063v3.pdf","comment":"Accepted in ECCV'24"},{"id":"http://arxiv.org/abs/2411.07899v2","updated":"2024-11-18T07:13:24Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. 
In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Hou","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11335v1","updated":"2024-11-18T07:01:59Z","published":"2024-11-18T07:01:59Z","title":"Video-to-Task Learning via Motion-Guided Attention for Few-Shot Action\n Recognition","summary":" In recent years, few-shot action recognition has achieved remarkable\nperformance through spatio-temporal relation modeling. Although a wide range of\nspatial and temporal alignment modules have been proposed, they primarily\naddress spatial or temporal misalignments at the video level, while the\nspatio-temporal relationships across different videos at the task level remain\nunderexplored. Recent studies utilize class prototypes to learn task-specific\nfeatures but overlook the spatio-temporal relationships across different videos\nat the task level, especially in the spatial dimension, where these\nrelationships provide rich information. 
In this paper, we propose a novel Dual\nMotion-Guided Attention Learning method (called DMGAL) for few-shot action\nrecognition, aiming to learn the spatio-temporal relationships from the\nvideo-specific to the task-specific level. To achieve this, we propose a\ncarefully designed Motion-Guided Attention (MGA) method to identify and\ncorrelate motion-related region features from the video level to the task\nlevel. Specifically, the Self Motion-Guided Attention module (S-MGA) achieves\nspatio-temporal relation modeling at the video level by identifying and\ncorrelating motion-related region features between different frames within a\nvideo. The Cross Motion-Guided Attention module (C-MGA) identifies and\ncorrelates motion-related region features between frames of different videos\nwithin a specific task to achieve spatio-temporal relationships at the task\nlevel. This approach enables the model to construct class prototypes that fully\nincorporate spatio-temporal relationships from the video-specific level to the\ntask-specific level. We validate the effectiveness of our DMGAL method by\nemploying both fully fine-tuning and adapter-tuning paradigms. The models\ndeveloped using these paradigms are termed DMGAL-FT and DMGAL-Adapter,\nrespectively.\n","authors":["Hanyu Guo","Wanchuan Yu","Suzhou Que","Kaiwen Du","Yan Yan","Hanzi Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11329v1","updated":"2024-11-18T06:48:11Z","published":"2024-11-18T06:48:11Z","title":"Color-Oriented Redundancy Reduction in Dataset Distillation","summary":" Dataset Distillation (DD) is designed to generate condensed representations\nof extensive image datasets, enhancing training efficiency. 
Despite recent\nadvances, there remains considerable potential for improvement, particularly in\naddressing the notable redundancy within the color space of distilled images.\nIn this paper, we propose AutoPalette, a framework that minimizes color\nredundancy at the individual image and overall dataset levels, respectively. At\nthe image level, we employ a palette network, a specialized neural network, to\ndynamically allocate colors from a reduced color space to each pixel. The\npalette network identifies essential areas in synthetic images for model\ntraining and consequently assigns more unique colors to them. At the dataset\nlevel, we develop a color-guided initialization strategy to minimize redundancy\namong images. Representative images with the least replicated color patterns\nare selected based on the information gain. A comprehensive performance study\ninvolving various datasets and evaluation scenarios is conducted, demonstrating\nthe superior performance of our proposed color-aware DD compared to existing DD\nmethods. The code is available at\n\\url{https://github.com/KeViNYuAn0314/AutoPalette}.\n","authors":["Bowen Yuan","Zijian Wang","Yadan Luo","Mahsa Baktashmotlagh","Yadan Luo","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2411.11329v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2410.09797v2","updated":"2024-11-18T06:44:30Z","published":"2024-10-13T10:56:09Z","title":"Task Adaptive Feature Distribution Based Network for Few-shot\n Fine-grained Target Classification","summary":" Metric-based few-shot fine-grained classification has shown promise due to\nits simplicity and efficiency. However, existing methods often overlook\ntask-level special cases and struggle with accurate category description and\nirrelevant sample information. To tackle these, we propose TAFD-Net: a task\nadaptive feature distribution network. 
It features a task-adaptive component\nfor embedding to capture task-level nuances, an asymmetric metric for\ncalculating feature distribution similarities between query samples and support\ncategories, and a contrastive measure strategy to boost performance. Extensive\nexperiments have been conducted on three datasets and the experimental results\nshow that our proposed algorithm outperforms recent incremental learning\nalgorithms.\n","authors":["Ping Li","Hongbo Wang","Lei Lu"],"pdf_url":"https://arxiv.org/pdf/2410.09797v2.pdf","comment":"The presentation logic of the algorithm section in the paper is\n unclear, and there are errors in the experimental part that need to be\n corrected, along with additional experiments to be conducted"},{"id":"http://arxiv.org/abs/2408.11001v3","updated":"2024-11-18T06:40:24Z","published":"2024-08-20T16:53:34Z","title":"MegaFusion: Extend Diffusion Models towards Higher-resolution Image\n Generation without Further Tuning","summary":" Diffusion models have emerged as frontrunners in text-to-image generation,\nbut their fixed image resolution during training often leads to challenges in\nhigh-resolution image generation, such as semantic deviations and object\nreplication. This paper introduces MegaFusion, a novel approach that extends\nexisting diffusion-based text-to-image models towards efficient\nhigher-resolution generation without additional fine-tuning or adaptation.\nSpecifically, we employ an innovative truncate and relay strategy to bridge the\ndenoising processes across different resolutions, allowing for high-resolution\nimage generation in a coarse-to-fine manner. Moreover, by integrating dilated\nconvolutions and noise re-scheduling, we further adapt the model's priors for\nhigher resolution. The versatility and efficacy of MegaFusion make it\nuniversally applicable to both latent-space and pixel-space diffusion models,\nalong with other derivative models. 
Extensive experiments confirm that\nMegaFusion significantly boosts the capability of existing models to produce\nimages of megapixels and various aspect ratios, while only requiring about 40%\nof the original computational cost.\n","authors":["Haoning Wu","Shaocheng Shen","Qiang Hu","Xiaoyun Zhang","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11001v3.pdf","comment":"Accepted by WACV 2025. Project Page:\n https://haoningwu3639.github.io/MegaFusion/"},{"id":"http://arxiv.org/abs/2404.14653v3","updated":"2024-11-18T06:03:47Z","published":"2024-04-23T01:19:19Z","title":"Machine Vision-Based Assessment of Fall Color Changes and its\n Relationship with Leaf Nitrogen Concentration","summary":" Apple(\\textit{Malus domestica} Borkh.) trees are deciduous, shedding leaves\neach year. This process is preceded by a gradual change in leaf color from\ngreen to yellow as chlorophyll is degraded prior to abscission. The initiation\nand rate of this color change are affected by many factors including leaf\nnitrogen (N) concentration. We predict that leaf color during this transition\nmay be indicative of the nitrogen status of apple trees. This study assesses a\nmachine vision-based system for quantifying the change in leaf color and its\ncorrelation with leaf nitrogen content. An image dataset was collected in color\nand 3D over five weeks in the fall of 2021 and 2023 at a commercial orchard\nusing a ground vehicle-based stereovision sensor. Trees in the foreground were\nsegmented from the point cloud using color and depth thresholding methods.\nThen, to estimate the proportion of yellow leaves per canopy, the color\ninformation of the segmented canopy area was quantified using a custom-defined\nmetric, \\textit{yellowness index} (a normalized ratio of yellow to green\nfoliage in the tree) that varied from -1 to +1 (-1 being completely green and\n+1 being completely yellow). 
Both K-means-based methods and gradient boosting\nmethods were used to estimate the \\textit{yellowness index}. The gradient\nboosting based method proposed in this study was better than the K-means-based\nmethod (both in terms of computational time and accuracy), achieving an $R^2$\nof 0.72 in estimating the \\textit{yellowness index}. The metric was able to\ncapture the gradual color transition from green to yellow over the study\nduration. Trees with lower leaf nitrogen showed the color transition to yellow\nearlier than the trees with higher nitrogen.\n Keywords: Fruit Tree Nitrogen Management, Machine Vision, Point Cloud\nSegmentation, Precision Nitrogen Management\n","authors":["Achyut Paudel","Jostan Brown","Priyanka Upadhyaya","Atif Bilal Asad","Safal Kshetri","Joseph R. Davidson","Cindy Grimm","Ashley Thompson","Bernardita Sallato","Matthew D. Whiting","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2404.14653v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11305v1","updated":"2024-11-18T06:01:00Z","published":"2024-11-18T06:01:00Z","title":"TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation","summary":" The advancement of medical image segmentation techniques has been propelled\nby the adoption of deep learning techniques, particularly UNet-based\napproaches, which exploit semantic information to improve the accuracy of\nsegmentations. However, the order of organs in scanned images has been\ndisregarded by current medical image segmentation approaches based on UNet.\nFurthermore, the inherent network structure of UNet does not provide direct\ncapabilities for integrating temporal information. To efficiently integrate\ntemporal information, we propose TP-UNet that utilizes temporal prompts,\nencompassing organ-construction relationships, to guide the segmentation UNet\nmodel. 
Specifically, our framework is featured with cross-attention and\nsemantic alignment based on unsupervised contrastive learning to combine\ntemporal prompts and image features effectively. Extensive evaluations on two\nmedical image segmentation datasets demonstrate the state-of-the-art\nperformance of TP-UNet. Our implementation will be open-sourced after\nacceptance.\n","authors":["Ranmin Wang","Limin Zhuang","Hongkun Chen","Boyan Xu","Ruichu Cai"],"pdf_url":"https://arxiv.org/pdf/2411.11305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11291v1","updated":"2024-11-18T05:34:31Z","published":"2024-11-18T05:34:31Z","title":"Performance Evaluation of Geospatial Images based on Zarr and Tiff","summary":" This evaluate the performance of geospatial image processing using two\ndistinct data storage formats: Zarr and TIFF. Geospatial images, converted to\nnumerous applications like environmental monitoring, urban planning, and\ndisaster management. Traditional Tagged Image File Format is mostly used\nbecause it is simple and compatible but may lack by performance limitations\nwhile working on large datasets. Zarr is a new format designed for the cloud\nsystems,that offers scalability and efficient storage with data chunking and\ncompression techniques. This study compares the two formats in terms of storage\nefficiency, access speed, and computational performance during typical\ngeospatial processing tasks. 
Through analysis on a range of geospatial\ndatasets, this provides details about the practical advantages and limitations\nof each format,helping users to select the appropriate format based on their\nspecific needs and constraints.\n","authors":["Jaheer Khan","Swarup E","Rakshit Ramesh"],"pdf_url":"https://arxiv.org/pdf/2411.11291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11288v1","updated":"2024-11-18T05:16:11Z","published":"2024-11-18T05:16:11Z","title":"Neuron: Learning Context-Aware Evolving Representations for Zero-Shot\n Skeleton Action Recognition","summary":" Zero-shot skeleton action recognition is a non-trivial task that requires\nrobust unseen generalization with prior knowledge from only seen classes and\nshared semantics. Existing methods typically build the skeleton-semantics\ninteractions by uncontrollable mappings and conspicuous representations,\nthereby can hardly capture the intricate and fine-grained relationship for\neffective cross-modal transferability. To address these issues, we propose a\nnovel dyNamically Evolving dUal skeleton-semantic syneRgistic framework with\nthe guidance of cOntext-aware side informatioN (dubbed Neuron), to explore more\nfine-grained cross-modal correspondence from micro to macro perspectives at\nboth spatial and temporal levels, respectively. Concretely, 1) we first\nconstruct the spatial-temporal evolving micro-prototypes and integrate dynamic\ncontext-aware side information to capture the intricate and synergistic\nskeleton-semantic correlations step-by-step, progressively refining cross-model\nalignment; and 2) we introduce the spatial compression and temporal memory\nmechanisms to guide the growth of spatial-temporal micro-prototypes, enabling\nthem to absorb structure-related spatial representations and\nregularity-dependent temporal patterns. 
Notably, such processes are analogous\nto the learning and growth of neurons, equipping the framework with the\ncapacity to generalize to novel unseen action categories. Extensive experiments\non various benchmark datasets demonstrated the superiority of the proposed\nmethod.\n","authors":["Yang Chen","Jingcai Guo","Song Guo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.11288v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.11287v1","updated":"2024-11-18T05:16:09Z","published":"2024-11-18T05:16:09Z","title":"Reducing Label Dependency for Underwater Scene Understanding: A Survey\n of Datasets, Techniques and Applications","summary":" Underwater surveys provide long-term data for informing management\nstrategies, monitoring coral reef health, and estimating blue carbon stocks.\nAdvances in broad-scale survey methods, such as robotic underwater vehicles,\nhave increased the range of marine surveys but generate large volumes of\nimagery requiring analysis. Computer vision methods such as semantic\nsegmentation aid automated image analysis, but typically rely on fully\nsupervised training with extensive labelled data. While ground truth label\nmasks for tasks like street scene segmentation can be quickly and affordably\ngenerated by non-experts through crowdsourcing services like Amazon Mechanical\nTurk, ecology presents greater challenges. The complexity of underwater images,\ncoupled with the specialist expertise needed to accurately identify species at\nthe pixel level, makes this process costly, time-consuming, and heavily\ndependent on domain experts. In recent years, some works have performed\nautomated analysis of underwater imagery, and a smaller number of studies have\nfocused on weakly supervised approaches which aim to reduce the expert-provided\nlabelled data required. 
This survey focuses on approaches which reduce\ndependency on human expert input, while reviewing the prior and related\napproaches to position these works in the wider field of underwater perception.\nFurther, we offer an overview of coastal ecosystems and the challenges of\nunderwater imagery. We provide background on weakly and self-supervised deep\nlearning and integrate these elements into a taxonomy that centres on the\nintersection of underwater monitoring, computer vision, and deep learning,\nwhile motivating approaches for weakly supervised deep learning with reduced\ndependency on domain expert data annotations. Lastly, the survey examines\navailable datasets and platforms, and identifies gaps, barriers, and\nopportunities for automating underwater surveys.\n","authors":["Scarlett Raine","Frederic Maire","Niko Suenderhauf","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2411.11287v1.pdf","comment":"70 pages, 20 figures"},{"id":"http://arxiv.org/abs/2411.11285v1","updated":"2024-11-18T05:11:29Z","published":"2024-11-18T05:11:29Z","title":"Zero-Shot Automatic Annotation and Instance Segmentation using\n LLM-Generated Datasets: Eliminating Field Imaging and Manual Annotation for\n Deep Learning Model Development","summary":" Currently, deep learning-based instance segmentation for various applications\n(e.g., Agriculture) is predominantly performed using a labor-intensive process\ninvolving extensive field data collection using sophisticated sensors, followed\nby careful manual annotation of images, presenting significant logistical and\nfinancial challenges to researchers and organizations. The process also slows\ndown the model development and training process. In this study, we presented a\nnovel method for deep learning-based instance segmentation of apples in\ncommercial orchards that eliminates the need for labor-intensive field data\ncollection and manual annotation. 
Utilizing a Large Language Model (LLM), we\nsynthetically generated orchard images and automatically annotated them using\nthe Segment Anything Model (SAM) integrated with a YOLO11 base model. This\nmethod significantly reduces reliance on physical sensors and manual data\nprocessing, presenting a major advancement in \"Agricultural AI\". The synthetic,\nauto-annotated dataset was used to train the YOLO11 model for Apple instance\nsegmentation, which was then validated on real orchard images. The results\nshowed that the automatically generated annotations achieved a Dice Coefficient\nof 0.9513 and an IoU of 0.9303, validating the accuracy and overlap of the mask\nannotations. All YOLO11 configurations, trained solely on these synthetic\ndatasets with automated annotations, accurately recognized and delineated\napples, highlighting the method's efficacy. Specifically, the YOLO11m-seg\nconfiguration achieved a mask precision of 0.902 and a mask mAP@50 of 0.833 on\ntest images collected from a commercial orchard. Additionally, the YOLO11l-seg\nconfiguration outperformed other models in validation on 40 LLM-generated\nimages, achieving the highest mask precision and mAP@50 metrics.\n Keywords: YOLO, SAM, SAMv2, YOLO11, YOLOv11, Segment Anything, YOLO-SAM\n","authors":["Ranjan Sapkota","Achyut Paudel","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2411.11285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20064v2","updated":"2024-11-18T05:09:42Z","published":"2023-10-30T22:29:07Z","title":"A Scalable Training Strategy for Blind Multi-Distribution Noise Removal","summary":" Despite recent advances, developing general-purpose universal denoising and\nartifact-removal networks remains largely an open problem: Given fixed network\nweights, one inherently trades-off specialization at one task (e.g.,~removing\nPoisson noise) for performance at another (e.g.,~removing speckle noise). 
In\naddition, training such a network is challenging due to the curse of\ndimensionality: As one increases the dimensions of the specification-space\n(i.e.,~the number of parameters needed to describe the noise distribution) the\nnumber of unique specifications one needs to train for grows exponentially.\nUniformly sampling this space will result in a network that does well at very\nchallenging problem specifications but poorly at easy problem specifications,\nwhere even large errors will have a small effect on the overall mean squared\nerror.\n In this work we propose training denoising networks using an\nadaptive-sampling/active-learning strategy. Our work improves upon a recently\nproposed universal denoiser training strategy by extending these results to\nhigher dimensions and by incorporating a polynomial approximation of the true\nspecification-loss landscape. This approximation allows us to reduce training\ntimes by almost two orders of magnitude. We test our method on simulated joint\nPoisson-Gaussian-Speckle noise and demonstrate that with our proposed training\nstrategy, a single blind, generalist denoiser network can achieve peak\nsignal-to-noise ratios within a uniform bound of specialized denoiser networks\nacross a large range of operating conditions. 
We also capture a small dataset\nof images with varying amounts of joint Poisson-Gaussian-Speckle noise and\ndemonstrate that a universal denoiser trained using our adaptive-sampling\nstrategy outperforms uniformly trained baselines.\n","authors":["Kevin Zhang","Sakshum Kulshrestha","Christopher Metzler"],"pdf_url":"https://arxiv.org/pdf/2310.20064v2.pdf","comment":"IEEE TIP 2024"},{"id":"http://arxiv.org/abs/2409.06220v2","updated":"2024-11-18T05:00:58Z","published":"2024-09-10T05:08:26Z","title":"CerviXpert: A Multi-Structural Convolutional Neural Network for\n Predicting Cervix Type and Cervical Cell Abnormalities","summary":" Cervical cancer is a major cause of cancer-related mortality among women\nworldwide, and its survival rate improves significantly with early detection.\nTraditional diagnostic methods such as Pap smears and cervical biopsies rely\nheavily on cytologist expertise, making the process prone to human error. This\nstudy introduces CerviXpert, a multi-structural convolutional neural network\nmodel designed to efficiently classify cervix types and detect cervical cell\nabnormalities. CerviXpert is built as a computationally efficient model that\nclassifies cervical cancer using images from the publicly available SiPaKMeD\ndataset. The model architecture emphasizes simplicity, using a limited number\nof convolutional layers followed by max pooling and dense layers, trained from\nscratch.\n We assessed the performance of CerviXpert against other state of the art\nconvolutional neural network models including ResNet50, VGG16, MobileNetV2, and\nInceptionV3, evaluating them on accuracy, computational efficiency, and\nrobustness using five fold cross validation. CerviXpert achieved an accuracy of\n98.04 percent in classifying cervical cell abnormalities into three classes and\n98.60 percent for five class cervix type classification, outperforming\nMobileNetV2 and InceptionV3 in both accuracy and computational requirements. 
It\nshowed comparable results to ResNet50 and VGG16 while reducing computational\ncomplexity and resource needs.\n CerviXpert provides an effective solution for cervical cancer screening and\ndiagnosis, balancing accuracy with computational efficiency. Its streamlined\ndesign enables deployment in resource constrained environments, potentially\nenhancing early detection and management of cervical cancer.\n","authors":["Rashik Shahriar Akash","Radiful Islam","S. M. Saiful Islam Badhon","K. S. M. Tozammel Hossain"],"pdf_url":"https://arxiv.org/pdf/2409.06220v2.pdf","comment":"11 figures, 9 tables"},{"id":"http://arxiv.org/abs/2411.11282v1","updated":"2024-11-18T04:54:04Z","published":"2024-11-18T04:54:04Z","title":"Continuous K-space Recovery Network with Image Guidance for Fast MRI\n Reconstruction","summary":" Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis\nwhile facing the challenge of long scanning time. To reduce the acquisition\ntime, fast MRI reconstruction aims to restore high-quality images from the\nundersampled k-space. Existing methods typically train deep learning models to\nmap the undersampled data to artifact-free MRI images. However, these studies\noften overlook the unique properties of k-space and directly apply general\nnetworks designed for image processing to k-space recovery, leaving the precise\nlearning of k-space largely underexplored. In this work, we propose a\ncontinuous k-space recovery network from a new perspective of implicit neural\nrepresentation with image domain guidance, which boosts the performance of MRI\nreconstruction. Specifically, (1) an implicit neural representation based\nencoder-decoder structure is customized to continuously query unsampled\nk-values. (2) an image guidance module is designed to mine the semantic\ninformation from the low-quality MRI images to further guide the k-space\nrecovery. (3) a multi-stage training strategy is proposed to recover dense\nk-space progressively. 
Extensive experiments conducted on CC359, fastMRI, and\nIXI datasets demonstrate the effectiveness of our method and its superiority\nover other competitors.\n","authors":["Yucong Meng","Zhiwei Yang","Minghong Duan","Yonghong Shi","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2411.11282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18530v2","updated":"2024-11-18T04:45:19Z","published":"2024-06-26T17:57:25Z","title":"MatchTime: Towards Automatic Soccer Game Commentary Generation","summary":" Soccer is a globally popular sport with a vast audience, in this paper, we\nconsider constructing an automatic soccer game commentary model to improve the\naudiences' viewing experience. In general, we make the following contributions:\nFirst, observing the prevalent video-text misalignment in existing datasets, we\nmanually annotate timestamps for 49 matches, establishing a more robust\nbenchmark for soccer game commentary generation, termed as\nSN-Caption-test-align; Second, we propose a multi-modal temporal alignment\npipeline to automatically correct and filter the existing dataset at scale,\ncreating a higher-quality soccer game commentary dataset for training, denoted\nas MatchTime; Third, based on our curated dataset, we train an automatic\ncommentary generation model, named MatchVoice. 
Extensive experiments and\nablation studies have demonstrated the effectiveness of our alignment pipeline,\nand training model on the curated dataset achieves state-of-the-art performance\nfor commentary generation, showcasing that better alignment can lead to\nsignificant performance improvements in downstream tasks.\n","authors":["Jiayuan Rao","Haoning Wu","Chang Liu","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2406.18530v2.pdf","comment":"Accepted by EMNLP 2024 (Oral Presentation); Project Page:\n https://haoningwu3639.github.io/MatchTime/"},{"id":"http://arxiv.org/abs/2411.11278v1","updated":"2024-11-18T04:35:20Z","published":"2024-11-18T04:35:20Z","title":"Towards Open-Vocabulary Audio-Visual Event Localization","summary":" The Audio-Visual Event Localization (AVEL) task aims to temporally locate and\nclassify video events that are both audible and visible. Most research in this\nfield assumes a closed-set setting, which restricts these models' ability to\nhandle test data containing event categories absent (unseen) during training.\nRecently, a few studies have explored AVEL in an open-set setting, enabling the\nrecognition of unseen events as ``unknown'', but without providing\ncategory-specific semantics. In this paper, we advance the field by introducing\nthe Open-Vocabulary Audio-Visual Event Localization (OV-AVEL) problem, which\nrequires localizing audio-visual events and predicting explicit categories for\nboth seen and unseen data at inference. To address this new task, we propose\nthe OV-AVEBench dataset, comprising 24,800 videos across 67 real-life\naudio-visual scenes (seen:unseen = 46:21), each with manual segment-level\nannotation. We also establish three evaluation metrics for this task. Moreover,\nwe investigate two baseline approaches, one training-free and one using a\nfurther fine-tuning paradigm. 
Specifically, we utilize the unified multimodal\nspace from the pretrained ImageBind model to extract audio, visual, and textual\n(event classes) features. The training-free baseline then determines\npredictions by comparing the consistency of audio-text and visual-text feature\nsimilarities. The fine-tuning baseline incorporates lightweight temporal layers\nto encode temporal relations within the audio and visual modalities, using\nOV-AVEBench training data for model fine-tuning. We evaluate these baselines on\nthe proposed OV-AVEBench dataset and discuss potential directions for future\nwork in this new field.\n","authors":["Jinxing Zhou","Dan Guo","Ruohao Guo","Yuxin Mao","Jingjing Hu","Yiran Zhong","Xiaojun Chang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11278v1.pdf","comment":"Project page: https://github.com/jasongief/OV-AVEL"},{"id":"http://arxiv.org/abs/2405.14701v3","updated":"2024-11-18T03:52:26Z","published":"2024-05-23T15:35:48Z","title":"DreamText: High Fidelity Scene Text Synthesis","summary":" Scene text synthesis involves rendering specified texts onto arbitrary\nimages. Current methods typically formulate this task in an end-to-end manner\nbut lack effective character-level guidance during training. Besides, their\ntext encoders, pre-trained on a single font type, struggle to adapt to the\ndiverse font styles encountered in practical applications. Consequently, these\nmethods suffer from character distortion, repetition, and absence, particularly\nin polystylistic scenarios. To this end, this paper proposes DreamText for\nhigh-fidelity scene text synthesis. Our key idea is to reconstruct the\ndiffusion training process, introducing more refined guidance tailored to this\ntask, to expose and rectify the model's attention at the character level and\nstrengthen its learning of text regions. This transformation poses a hybrid\noptimization challenge, involving both discrete and continuous variables. 
To\neffectively tackle this challenge, we employ a heuristic alternate optimization\nstrategy. Meanwhile, we jointly train the text encoder and generator to\ncomprehensively learn and utilize the diverse font present in the training\ndataset. This joint training is seamlessly integrated into the alternate\noptimization process, fostering a synergistic relationship between learning\ncharacter embedding and re-estimating character attention. Specifically, in\neach step, we first encode potential character-generated position information\nfrom cross-attention maps into latent character masks. These masks are then\nutilized to update the representation of specific characters in the current\nstep, which, in turn, enables the generator to correct the character's\nattention in the subsequent steps. Both qualitative and quantitative results\ndemonstrate the superiority of our method to the state of the art.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2405.14701v3.pdf","comment":"Code: https://github.com/CodeGoat24/DreamText, Project page:\n https://codegoat24.github.io/DreamText/"},{"id":"http://arxiv.org/abs/2411.11262v1","updated":"2024-11-18T03:35:34Z","published":"2024-11-18T03:35:34Z","title":"Cross-Patient Pseudo Bags Generation and Curriculum Contrastive Learning\n for Imbalanced Multiclassification of Whole Slide Image","summary":" Pathology computing has dramatically improved pathologists' workflow and\ndiagnostic decision-making processes. Although computer-aided diagnostic\nsystems have shown considerable value in whole slide image (WSI) analysis, the\nproblem of multi-classification under sample imbalance remains an intractable\nchallenge. 
To address this, we propose learning fine-grained information by\ngenerating sub-bags with feature distributions similar to the original WSIs.\nAdditionally, we utilize a pseudo-bag generation algorithm to further leverage\nthe abundant and redundant information in WSIs, allowing efficient training in\nunbalanced-sample multi-classification tasks. Furthermore, we introduce an\naffinity-based sample selection and curriculum contrastive learning strategy to\nenhance the stability of model representation learning. Unlike previous\napproaches, our framework transitions from learning bag-level representations\nto understanding and exploiting the feature distribution of multi-instance\nbags. Our method demonstrates significant performance improvements on three\ndatasets, including tumor classification and lymph node metastasis. On average,\nit achieves a 4.39-point improvement in F1 score compared to the second-best\nmethod across the three tasks, underscoring its superior performance.\n","authors":["Yonghuang Wu","Xuan Xie","Xinyuan Niu","Chengqian Zhao","Jinhua Yu"],"pdf_url":"https://arxiv.org/pdf/2411.11262v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.07433v5","updated":"2024-11-18T03:14:37Z","published":"2024-08-14T10:08:46Z","title":"MagicFace: Training-free Universal-Style Human Image Customized\n Synthesis","summary":" Current human image customization methods leverage Stable Diffusion (SD) for\nits rich semantic prior. However, since SD is not specifically designed for\nhuman-oriented generation, these methods often require extensive fine-tuning on\nlarge-scale datasets, which renders them susceptible to overfitting and hinders\ntheir ability to personalize individuals with previously unseen styles.\nMoreover, these methods extensively focus on single-concept human image\nsynthesis and lack the flexibility to customize individuals using multiple\ngiven concepts, thereby impeding their broader practical application. 
This\npaper proposes MagicFace, a novel training-free method for multi-concept\nuniversal-style human image personalized synthesis. Our core idea is to\nsimulate how humans create images given specific concepts, i.e., first\nestablish a semantic layout considering factors such as concepts' shape and\nposture, then optimize details by comparing with concepts at the pixel level.\nTo implement this process, we introduce a coarse-to-fine generation pipeline,\ninvolving two sequential stages: semantic layout construction and concept\nfeature injection. This is achieved by our Reference-aware Self-Attention (RSA)\nand Region-grouped Blend Attention (RBA) mechanisms. In the first stage, RSA\nenables the latent image to query features from all reference concepts\nsimultaneously, extracting the overall semantic understanding to facilitate the\ninitial semantic layout establishment. In the second stage, we employ an\nattention-based semantic segmentation method to pinpoint the latent generated\nregions of all concepts at each step. Following this, RBA divides the pixels of\nthe latent image into semantic groups, with each group querying fine-grained\nfeatures from the corresponding reference concept. Extensive experiments\ndemonstrate the superiority of our MagicFace.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07433v5.pdf","comment":"project page: https://codegoat24.github.io/MagicFace"},{"id":"http://arxiv.org/abs/2411.11254v1","updated":"2024-11-18T03:09:39Z","published":"2024-11-18T03:09:39Z","title":"Semantic or Covariate? A Study on the Intractable Case of\n Out-of-Distribution Detection","summary":" The primary goal of out-of-distribution (OOD) detection tasks is to identify\ninputs with semantic shifts, i.e., if samples from novel classes are absent in\nthe in-distribution (ID) dataset used for training, we should reject these OOD\nsamples rather than misclassifying them into existing ID classes. 
However, we\nfind the current definition of \"semantic shift\" is ambiguous, which renders\ncertain OOD testing protocols intractable for the post-hoc OOD detection\nmethods based on a classifier trained on the ID dataset. In this paper, we\noffer a more precise definition of the Semantic Space and the Covariate Space\nfor the ID distribution, allowing us to theoretically analyze which types of\nOOD distributions make the detection task intractable. To avoid the flaw in the\nexisting OOD settings, we further define the \"Tractable OOD\" setting which\nensures the distinguishability of OOD and ID distributions for the post-hoc OOD\ndetection methods. Finally, we conduct several experiments to demonstrate the\nnecessity of our definitions and validate the correctness of our theorems.\n","authors":["Xingming Long","Jie Zhang","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11254v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2410.10496v2","updated":"2024-11-18T03:09:19Z","published":"2024-10-14T13:41:37Z","title":"Vision-guided and Mask-enhanced Adaptive Denoising for Prompt-based\n Image Editing","summary":" Text-to-image diffusion models have demonstrated remarkable progress in\nsynthesizing high-quality images from text prompts, which boosts researches on\nprompt-based image editing that edits a source image according to a target\nprompt. Despite their advances, existing methods still encounter three key\nissues: 1) limited capacity of the text prompt in guiding target image\ngeneration, 2) insufficient mining of word-to-patch and patch-to-patch\nrelationships for grounding editing areas, and 3) unified editing strength for\nall regions during each denoising step. To address these issues, we present a\nVision-guided and Mask-enhanced Adaptive Editing (ViMAEdit) method with three\nkey novel designs. 
First, we propose to leverage image embeddings as explicit\nguidance to enhance the conventional textual prompt-based denoising process,\nwhere a CLIP-based target image embedding estimation strategy is introduced.\nSecond, we devise a self-attention-guided iterative editing area grounding\nstrategy, which iteratively exploits patch-to-patch relationships conveyed by\nself-attention maps to refine those word-to-patch relationships contained in\ncross-attention maps. Last, we present a spatially adaptive variance-guided\nsampling, which highlights sampling variances for critical image regions to\npromote the editing capability. Experimental results demonstrate the superior\nediting capacity of ViMAEdit over all existing methods.\n","authors":["Kejie Wang","Xuemeng Song","Meng Liu","Jin Yuan","Weili Guan"],"pdf_url":"https://arxiv.org/pdf/2410.10496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23091v4","updated":"2024-11-18T03:06:52Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to be cheated by subtle manipulations, since we make\njudgments only based on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a casual diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof casual factors by learning towards a novel casual information bottleneck\nobjective. 
Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark). The code is available at\n\\href{https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff}{https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff}\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v4.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.15778v2","updated":"2024-11-18T03:02:17Z","published":"2024-06-22T07:57:58Z","title":"ObjectNLQ @ Ego4D Episodic Memory Challenge 2024","summary":" In this report, we present our approach for the Natural Language Query track\nand Goal Step track of the Ego4D Episodic Memory Benchmark at CVPR 2024. Both\nchallenges require the localization of actions within long video sequences\nusing textual queries. To enhance localization accuracy, our method not only\nprocesses the temporal information of videos but also identifies fine-grained\nobjects spatially within the frames. To this end, we introduce a novel\napproach, termed ObjectNLQ, which incorporates an object branch to augment the\nvideo representation with detailed object information, thereby improving\ngrounding efficiency. ObjectNLQ achieves a mean R@1 of 23.15, ranking 2nd in\nthe Natural Language Queries Challenge, and gains 33.00 in terms of the metric\nR@1, IoU=0.3, ranking 3rd in the Goal Step Challenge. 
Our code will be released\nat https://github.com/Yisen-Feng/ObjectNLQ.\n","authors":["Yisen Feng","Haoyu Zhang","Yuquan Xie","Zaijing Li","Meng Liu","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2406.15778v2.pdf","comment":"The solution for the Natural Language Query track and Goal Step track\n at CVPR EgoVis Workshop 2024"},{"id":"http://arxiv.org/abs/2411.11252v1","updated":"2024-11-18T03:00:33Z","published":"2024-11-18T03:00:33Z","title":"DrivingSphere: Building a High-fidelity 4D World for Closed-loop\n Simulation","summary":" Autonomous driving evaluation requires simulation environments that closely\nreplicate actual road conditions, including real-world sensory data and\nresponsive feedback loops. However, many existing simulations need to predict\nwaypoints along fixed routes on public datasets or synthetic photorealistic\ndata, \\ie, open-loop simulation usually lacks the ability to assess dynamic\ndecision-making. While the recent efforts of closed-loop simulation offer\nfeedback-driven environments, they cannot process visual sensor inputs or\nproduce outputs that differ from real-world data. To address these challenges,\nwe propose DrivingSphere, a realistic and closed-loop simulation framework. Its\ncore idea is to build 4D world representation and generate real-life and\ncontrollable driving scenarios. In specific, our framework includes a Dynamic\nEnvironment Composition module that constructs a detailed 4D driving world with\na format of occupancy equipping with static backgrounds and dynamic objects,\nand a Visual Scene Synthesis module that transforms this data into\nhigh-fidelity, multi-view video outputs, ensuring spatial and temporal\nconsistency. By providing a dynamic and realistic simulation environment,\nDrivingSphere enables comprehensive testing and validation of autonomous\ndriving algorithms, ultimately advancing the development of more reliable\nautonomous cars. 
The benchmark will be publicly released.\n","authors":["Tianyi Yan","Dongming Wu","Wencheng Han","Junpeng Jiang","Xia Zhou","Kun Zhan","Cheng-zhong Xu","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2411.11252v1.pdf","comment":"https://yanty123.github.io/DrivingSphere/"},{"id":"http://arxiv.org/abs/2204.01645v3","updated":"2024-11-18T02:56:06Z","published":"2022-04-04T16:50:03Z","title":"3D microstructural generation from 2D images of cement paste using\n generative adversarial networks","summary":" Establishing a realistic three-dimensional (3D) microstructure is a crucial\nstep for studying microstructure development of hardened cement pastes.\nHowever, acquiring 3D microstructural images for cement often involves high\ncosts and quality compromises. This paper proposes a generative adversarial\nnetworks-based method for generating 3D microstructures from a single\ntwo-dimensional (2D) image, capable of producing high-quality and realistic 3D\nimages at low cost. In the method, a framework (CEM3DMG) is designed to\nsynthesize 3D images by learning microstructural information from a 2D\ncross-sectional image. Experimental results show that CEM3DMG can generate\nrealistic 3D images of large size. Visual observation confirms that the\ngenerated 3D images exhibit similar microstructural features to the 2D images,\nincluding similar pore distribution and particle morphology. Furthermore,\nquantitative analysis reveals that reconstructed 3D microstructures closely\nmatch the real 2D microstructure in terms of gray level histogram, phase\nproportions, and pore size distribution. 
The source code for CEM3DMG is\navailable in the GitHub repository at: https://github.com/NBICLAB/CEM3DMG.\n","authors":["Xin Zhao","Lin Wang","Qinfei Li","Heng Chen","Shuangrong Liu","Pengkun Hou","Jiayuan Ye","Yan Pei","Xu Wu","Jianfeng Yuan","Haozhong Gao","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2204.01645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01443v2","updated":"2024-11-18T02:45:17Z","published":"2024-11-03T06:00:36Z","title":"Activating Self-Attention for Multi-Scene Absolute Pose Regression","summary":" Multi-scene absolute pose regression addresses the demand for fast and\nmemory-efficient camera pose estimation across various real-world environments.\nNowadays, transformer-based model has been devised to regress the camera pose\ndirectly in multi-scenes. Despite its potential, transformer encoders are\nunderutilized due to the collapsed self-attention map, having low\nrepresentation capacity. This work highlights the problem and investigates it\nfrom a new perspective: distortion of query-key embedding space. Based on the\nstatistical analysis, we reveal that queries and keys are mapped in completely\ndifferent spaces while only a few keys are blended into the query region. This\nleads to the collapse of the self-attention map as all queries are considered\nsimilar to those few keys. Therefore, we propose simple but effective solutions\nto activate self-attention. Concretely, we present an auxiliary loss that\naligns queries and keys, preventing the distortion of query-key space and\nencouraging the model to find global relations by self-attention. In addition,\nthe fixed sinusoidal positional encoding is adopted instead of undertrained\nlearnable one to reflect appropriate positional clues into the inputs of\nself-attention. 
As a result, our approach resolves the aforementioned problem\neffectively, thus outperforming existing methods in both outdoor and indoor\nscenes.\n","authors":["Miso Lee","Jihwan Kim","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2411.01443v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.02389v2","updated":"2024-11-18T02:32:22Z","published":"2024-09-04T02:37:38Z","title":"Multi-modal Situated Reasoning in 3D Scenes","summary":" Situation awareness is essential for understanding and reasoning about 3D\nscenes in embodied AI agents. However, existing datasets and benchmarks for\nsituated understanding are limited in data modality, diversity, scale, and task\nscope. To address these limitations, we propose Multi-modal Situated Question\nAnswering (MSQA), a large-scale multi-modal situated reasoning dataset,\nscalably collected leveraging 3D scene graphs and vision-language models (VLMs)\nacross a diverse range of real-world 3D scenes. MSQA includes 251K situated\nquestion-answering pairs across 9 distinct question categories, covering\ncomplex scenarios within 3D scenes. We introduce a novel interleaved\nmulti-modal input setting in our benchmark to provide text, image, and point\ncloud for situation and question description, resolving ambiguity in previous\nsingle-modality convention (e.g., text). Additionally, we devise the\nMulti-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models'\nsituated reasoning for navigation. 
Comprehensive evaluations on MSQA and MSNN\nhighlight the limitations of existing vision-language models and underscore the\nimportance of handling multi-modal interleaved inputs and situation modeling.\nExperiments on data scaling and cross-domain transfer further demonstrate the\nefficacy of leveraging MSQA as a pre-training dataset for developing more\npowerful situated reasoning models.\n","authors":["Xiongkun Linghu","Jiangyong Huang","Xuesong Niu","Xiaojian Ma","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02389v2.pdf","comment":"Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page:\n https://msr3d.github.io/"},{"id":"http://arxiv.org/abs/2310.07259v3","updated":"2024-11-18T02:18:14Z","published":"2023-10-11T07:37:13Z","title":"Uncovering Hidden Connections: Iterative Search and Reasoning for\n Video-grounded Dialog","summary":" In contrast to conventional visual question answering, video-grounded dialog\nnecessitates a profound understanding of both dialog history and video content\nfor accurate response generation. Despite commendable progress made by existing\napproaches, they still face the challenges of incrementally understanding\ncomplex dialog history and assimilating video information. In response to these\nchallenges, we present an iterative search and reasoning framework, which\nconsists of a textual encoder, a visual encoder, and a generator. Specifically,\nwe devise a path search and aggregation strategy in the textual encoder, mining\ncore cues from dialog history that are pivotal to understanding the posed\nquestions. Concurrently, our visual encoder harnesses an iterative reasoning\nnetwork to extract and emphasize critical visual markers from videos, enhancing\nthe depth of visual comprehension. Finally, we utilize the pre-trained GPT-2\nmodel as our answer generator to decode the mined hidden clues into coherent\nand contextualized answers. 
Extensive experiments on three public datasets\ndemonstrate the effectiveness and generalizability of our proposed framework.\n","authors":["Haoyu Zhang","Meng Liu","Yaowei Wang","Da Cao","Weili Guan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2310.07259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11233v1","updated":"2024-11-18T02:02:24Z","published":"2024-11-18T02:02:24Z","title":"Noise Filtering Benchmark for Neuromorphic Satellites Observations","summary":" Event cameras capture sparse, asynchronous brightness changes which offer\nhigh temporal resolution, high dynamic range, low power consumption, and sparse\ndata output. These advantages make them ideal for Space Situational Awareness,\nparticularly in detecting resident space objects moving within a telescope's\nfield of view. However, the output from event cameras often includes\nsubstantial background activity noise, which is known to be more prevalent in\nlow-light conditions. This noise can overwhelm the sparse events generated by\nsatellite signals, making detection and tracking more challenging. Existing\nnoise-filtering algorithms struggle in these scenarios because they are\ntypically designed for denser scenes, where losing some signal is acceptable.\nThis limitation hinders the application of event cameras in complex, real-world\nenvironments where signals are extremely sparse. In this paper, we propose new\nevent-driven noise-filtering algorithms specifically designed for very sparse\nscenes. We categorise the algorithms into logical-based and learning-based\napproaches and benchmark their performance against 11 state-of-the-art\nnoise-filtering algorithms, evaluating how effectively they remove noise and\nhot pixels while preserving the signal. Their performance was quantified by\nmeasuring signal retention and noise removal accuracy, with results reported\nusing ROC curves across the parameter space. 
Additionally, we introduce a new\nhigh-resolution satellite dataset with ground truth from a real-world platform\nunder various noise conditions, which we have made publicly available. Code,\ndataset, and trained weights are available at\n\\url{https://github.com/samiarja/dvs_sparse_filter}.\n","authors":["Sami Arja","Alexandre Marcireau","Nicholas Owen Ralph","Saeed Afshar","Gregory Cohen"],"pdf_url":"https://arxiv.org/pdf/2411.11233v1.pdf","comment":"17 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.11231v1","updated":"2024-11-18T01:52:31Z","published":"2024-11-18T01:52:31Z","title":"BeautyBank: Encoding Facial Makeup in Latent Space","summary":" The advancement of makeup transfer, editing, and image encoding has\ndemonstrated their effectiveness and superior quality. However, existing makeup\nworks primarily focus on low-dimensional features such as color distributions\nand patterns, limiting their versatillity across a wide range of makeup\napplications. Futhermore, existing high-dimensional latent encoding methods\nmainly target global features such as structure and style, and are less\neffective for tasks that require detailed attention to local color and pattern\nfeatures of makeup. To overcome these limitations, we propose BeautyBank, a\nnovel makeup encoder that disentangles pattern features of bare and makeup\nfaces. Our method encodes makeup features into a high-dimensional space,\npreserving essential details necessary for makeup reconstruction and broadening\nthe scope of potential makeup research applications. We also propose a\nProgressive Makeup Tuning (PMT) strategy, specifically designed to enhance the\npreservation of detailed makeup features while preventing the inclusion of\nirrelevant attributes. 
We further explore novel makeup applications, including\nfacial image generation with makeup injection and makeup similarity measure.\nExtensive empirical experiments validate that our method offers superior task\nadaptability and holds significant potential for widespread application in\nvarious makeup-related fields. Furthermore, to address the lack of large-scale,\nhigh-quality paired makeup datasets in the field, we constructed the\nBare-Makeup Synthesis Dataset (BMS), comprising 324,000 pairs of 512x512 pixel\nimages of bare and makeup-enhanced faces.\n","authors":["Qianwen Lu","Xingchao Yang","Takafumi Taketomi"],"pdf_url":"https://arxiv.org/pdf/2411.11231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20605v2","updated":"2024-11-18T01:47:56Z","published":"2024-05-31T03:39:26Z","title":"Searching for internal symbols underlying deep learning","summary":" Deep learning (DL) enables deep neural networks (DNNs) to automatically learn\ncomplex tasks or rules from given examples without instructions or guiding\nprinciples. As we do not engineer DNNs' functions, it is extremely difficult to\ndiagnose their decisions, and multiple lines of studies proposed to explain the\nprinciples of their operations. Notably, one line of studies suggests that DNNs\nmay learn concepts, the high level features that are recognizable to humans. In\nthis study, we extend this line of studies and hypothesize that DNNs can\ndevelop abstract codes that can be used to augment DNNs' decision-making. To\naddress this hypothesis, we combine foundation segmentation models and\nunsupervised learning to extract internal codes and identify potential use of\nabstract codes to make DL's decision-making more reliable and safer.\n","authors":["Jung H. 
Lee","Sujith Vijayan"],"pdf_url":"https://arxiv.org/pdf/2405.20605v2.pdf","comment":"16 pages, 10 figures, 5 tables and 1 supplementary table"},{"id":"http://arxiv.org/abs/2411.11223v1","updated":"2024-11-18T01:25:58Z","published":"2024-11-18T01:25:58Z","title":"Efficient Transfer Learning for Video-language Foundation Models","summary":" Pre-trained vision-language models provide a robust foundation for efficient\ntransfer learning across various downstream tasks. In the field of video action\nrecognition, mainstream approaches often introduce additional parameter modules\nto capture temporal information. While the increased model capacity brought by\nthese additional parameters helps better fit the video-specific inductive\nbiases, existing methods require learning a large number of parameters and are\nprone to catastrophic forgetting of the original generalizable knowledge. In\nthis paper, we propose a simple yet effective Multi-modal Spatio-Temporal\nAdapter (MSTA) to improve the alignment between representations in the text and\nvision branches, achieving a balance between general knowledge and\ntask-specific knowledge. Furthermore, to mitigate over-fitting and enhance\ngeneralizability, we introduce a spatio-temporal description-guided consistency\nconstraint. This constraint involves feeding template inputs (i.e., ``a video\nof $\\{\\textbf{cls}\\}$'') into the trainable language branch, while\nLLM-generated spatio-temporal descriptions are input into the pre-trained\nlanguage branch, enforcing consistency between the outputs of the two branches.\nThis mechanism prevents over-fitting to downstream tasks and improves the\ndistinguishability of the trainable branch within the spatio-temporal semantic\nspace. We evaluate the effectiveness of our approach across four tasks:\nzero-shot transfer, few-shot learning, base-to-novel generalization, and\nfully-supervised learning. 
Compared to many state-of-the-art methods, our MSTA\nachieves outstanding performance across all evaluations, while using only 2-7\\%\nof the trainable parameters in the original model. Code will be avaliable at\nhttps://github.com/chenhaoxing/ETL4Video.\n","authors":["Haoxing Chen","Zizheng Huang","Yan Hong","Yanshuo Wang","Zhongcai Lyu","Zhuoer Xu","Jun Lan","Zhangxuan Gu"],"pdf_url":"https://arxiv.org/pdf/2411.11223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11222v1","updated":"2024-11-18T01:19:37Z","published":"2024-11-18T01:19:37Z","title":"The Sound of Water: Inferring Physical Properties from Pouring Liquids","summary":" We study the connection between audio-visual observations and the underlying\nphysics of a mundane yet intriguing everyday activity: pouring liquids. Given\nonly the sound of liquid pouring into a container, our objective is to\nautomatically infer physical properties such as the liquid level, the shape and\nsize of the container, the pouring rate and the time to fill. To this end, we:\n(i) show in theory that these properties can be determined from the fundamental\nfrequency (pitch); (ii) train a pitch detection model with supervision from\nsimulated data and visual data with a physics-inspired objective; (iii)\nintroduce a new large dataset of real pouring videos for a systematic study;\n(iv) show that the trained model can indeed infer these physical properties for\nreal data; and finally, (v) we demonstrate strong generalization to various\ncontainer shapes, other datasets, and in-the-wild YouTube videos. Our work\npresents a keen understanding of a narrow yet rich problem at the intersection\nof acoustics, physics, and learning. It opens up applications to enhance\nmultisensory perception in robotic pouring.\n","authors":["Piyush Bagad","Makarand Tapaswi","Cees G. M. Snoek","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.11222v1.pdf","comment":"25 pages, 17 figures. 
Project page at\n https://bpiyush.github.io/pouring-water-website"},{"id":"http://arxiv.org/abs/2409.19483v3","updated":"2024-11-18T01:14:03Z","published":"2024-09-28T23:10:37Z","title":"MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation","summary":" Segmentation of anatomical structures and pathological regions in medical\nimages is essential for modern clinical diagnosis, disease research, and\ntreatment planning. While significant advancements have been made in deep\nlearning-based segmentation techniques, many of these methods still suffer from\nlimitations in data efficiency, generalizability, and interactivity. As a\nresult, developing precise segmentation methods that require fewer labeled\ndatasets remains a critical challenge in medical image analysis. Recently, the\nintroduction of foundation models like CLIP and Segment-Anything-Model (SAM),\nwith robust cross-domain representations, has paved the way for interactive and\nuniversal image segmentation. However, further exploration of these models for\ndata-efficient segmentation in medical imaging is still needed and highly\nrelevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that\nintegrates the CLIP and SAM models to perform segmentation on clinical scans\nusing text prompts, in both zero-shot and weakly supervised settings. Our\napproach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard\nNegative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the\nMulti-modal Information Bottleneck (M2IB) to create visual prompts for\ngenerating segmentation masks from SAM in the zero-shot setting. We also\ninvestigate using zero-shot segmentation labels within a weakly supervised\nparadigm to enhance segmentation quality further. Extensive testing across four\ndiverse segmentation tasks and medical imaging modalities (breast tumor\nultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high\naccuracy of our proposed framework. 
Our code is available at\nhttps://github.com/HealthX-Lab/MedCLIP-SAMv2.\n","authors":["Taha Koleilat","Hojat Asgariandehkordi","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.19483v3.pdf","comment":"10 pages, 2 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.11219v1","updated":"2024-11-18T01:11:47Z","published":"2024-11-18T01:11:47Z","title":"Relational Contrastive Learning and Masked Image Modeling for Scene Text\n Recognition","summary":" Context-aware methods have achieved remarkable advancements in supervised\nscene text recognition by leveraging semantic priors from words. Considering\nthe heterogeneity of text and background in STR, we propose that such\ncontextual priors can be reinterpreted as the relations between textual\nelements, serving as effective self-supervised labels for representation\nlearning. However, textual relations are restricted to the finite size of the\ndataset due to lexical dependencies, which causes over-fitting problem, thus\ncompromising the representation quality. To address this, our work introduces a\nunified framework of Relational Contrastive Learning and Masked Image Modeling\nfor STR (RCMSTR), which explicitly models the enriched textual relations. For\nthe RCL branch, we first introduce the relational rearrangement module to\ncultivate new relations on the fly. Based on this, we further conduct\nrelational contrastive learning to model the intra- and inter-hierarchical\nrelations for frames, sub-words and words.On the other hand, MIM can naturally\nboost the context information via masking, where we find that the block masking\nstrategy is more effective for STR. For the effective integration of RCL and\nMIM, we also introduce a novel decoupling design aimed at mitigating the impact\nof masked images on contrastive learning. 
Additionally, to enhance the\ncompatibility of MIM with CNNs, we propose the adoption of sparse convolutions\nand directly sharing the weights with dense convolutions in training. The\nproposed RCMSTR demonstrates superior performance in various evaluation\nprotocols for different STR-related downstream tasks, outperforming the\nexisting state-of-the-art self-supervised STR techniques. Ablation studies and\nqualitative experimental results further validate the effectiveness of our\nmethod.The code and pre-trained models will be available at\nhttps://github.com/ThunderVVV/RCMSTR .\n","authors":["Tiancheng Lin","Jinglei Zhang","Yi Xu","Kai Chen","Rui Zhang","Chang-Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11214v1","updated":"2024-11-18T00:46:59Z","published":"2024-11-18T00:46:59Z","title":"DeforHMR: Vision Transformer with Deformable Cross-Attention for 3D\n Human Mesh Recovery","summary":" Human Mesh Recovery (HMR) is an important yet challenging problem with\napplications across various domains including motion capture, augmented\nreality, and biomechanics. Accurately predicting human pose parameters from a\nsingle image remains a challenging 3D computer vision task. In this work, we\nintroduce DeforHMR, a novel regression-based monocular HMR framework designed\nto enhance the prediction of human pose parameters using deformable attention\ntransformers. DeforHMR leverages a novel query-agnostic deformable\ncross-attention mechanism within the transformer decoder to effectively regress\nthe visual features extracted from a frozen pretrained vision transformer (ViT)\nencoder. The proposed deformable cross-attention mechanism allows the model to\nattend to relevant spatial features more flexibly and in a data-dependent\nmanner. 
Equipped with a transformer decoder capable of spatially-nuanced\nattention, DeforHMR achieves state-of-the-art performance for single-frame\nregression-based methods on the widely used 3D HMR benchmarks 3DPW and RICH. By\npushing the boundary on the field of 3D human mesh recovery through deformable\nattention, we introduce an new, effective paradigm for decoding local spatial\ninformation from large pretrained vision encoders in computer vision.\n","authors":["Jaewoo Heo","George Hu","Zeyu Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2411.11214v1.pdf","comment":"11 pages, 5 figures, 3DV2025"},{"id":"http://arxiv.org/abs/2411.04335v2","updated":"2024-11-18T00:31:33Z","published":"2024-11-07T00:22:38Z","title":"GazeGen: Gaze-Driven User Interaction for Visual Content Generation","summary":" We present GazeGen, a user interaction system that generates visual content\n(images and videos) for locations indicated by the user's eye gaze. GazeGen\nallows intuitive manipulation of visual content by targeting regions of\ninterest with gaze. Using advanced techniques in object detection and\ngenerative AI, GazeGen performs gaze-controlled image adding/deleting,\nrepositioning, and surface style changes of image objects, and converts static\nimages into videos. Central to GazeGen is the DFT Gaze (Distilled and\nFine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters,\nperforming accurate real-time gaze predictions tailored to individual users'\neyes on small edge devices. GazeGen is the first system to combine visual\ncontent generation with real-time gaze estimation, made possible exclusively by\nDFT Gaze. This real-time gaze estimation enables various visual content\ngeneration tasks, all controlled by the user's gaze. The input for DFT Gaze is\nthe user's eye images, while the inputs for visual content generation are the\nuser's view and the predicted gaze point from DFT Gaze. 
To achieve efficient\ngaze predictions, we derive the small model from a large model (10x larger) via\nnovel knowledge distillation and personal adaptation techniques. We integrate\nknowledge distillation with a masked autoencoder, developing a compact yet\npowerful gaze estimation model. This model is further fine-tuned with Adapters,\nenabling highly accurate and personalized gaze predictions with minimal user\ninput. DFT Gaze ensures low-latency and precise gaze tracking, supporting a\nwide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA\nand OpenEDS2020 benchmarks, demonstrating low angular gaze error and low\nlatency on the edge device (Raspberry Pi 4). Furthermore, we describe\napplications of GazeGen, illustrating its versatility and effectiveness in\nvarious usage scenarios.\n","authors":["He-Yen Hsieh","Ziyun Li","Sai Qian Zhang","Wei-Te Mark Ting","Kao-Den Chang","Barbara De Salvo","Chiao Liu","H. T. Kung"],"pdf_url":"https://arxiv.org/pdf/2411.04335v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.07987v3","updated":"2024-11-18T00:21:40Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. 
Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions. All\nthe code, models, demo and organized data have been open sourced on our Github\nRepo.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v3.pdf","comment":"Camera Ready Version. Project Page:\n https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data:\n https://github.com/liming-ai/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2403.06269v2","updated":"2024-11-18T23:08:21Z","published":"2024-03-10T17:12:01Z","title":"FastVideoEdit: Leveraging Consistency Models for Efficient Text-to-Video\n Editing","summary":" Diffusion models have demonstrated remarkable capabilities in text-to-image\nand text-to-video generation, opening up possibilities for video editing based\non textual input. 
However, the computational cost associated with sequential\nsampling in diffusion models poses challenges for efficient video editing.\nExisting approaches relying on image generation models for video editing suffer\nfrom time-consuming one-shot fine-tuning, additional condition extraction, or\nDDIM inversion, making real-time applications impractical. In this work, we\npropose FastVideoEdit, an efficient zero-shot video editing approach inspired\nby Consistency Models (CMs). By leveraging the self-consistency property of\nCMs, we eliminate the need for time-consuming inversion or additional condition\nextraction, reducing editing time. Our method enables direct mapping from\nsource video to target video with strong preservation ability utilizing a\nspecial variance schedule. This results in improved speed advantages, as fewer\nsampling steps can be used while maintaining comparable generation quality.\nExperimental results validate the state-of-the-art performance and speed\nadvantages of FastVideoEdit across evaluation metrics encompassing editing\nspeed, temporal consistency, and text-video alignment.\n","authors":["Youyuan Zhang","Xuan Ju","James J. Clark"],"pdf_url":"https://arxiv.org/pdf/2403.06269v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2411.12115v1","updated":"2024-11-18T22:51:44Z","published":"2024-11-18T22:51:44Z","title":"Distill the Best, Ignore the Rest: Improving Dataset Distillation with\n Loss-Value-Based Pruning","summary":" Dataset distillation has gained significant interest in recent years, yet\nexisting approaches typically distill from the entire dataset, potentially\nincluding non-beneficial samples. We introduce a novel \"Prune First, Distill\nAfter\" framework that systematically prunes datasets via loss-based sampling\nprior to distillation. 
By leveraging pruning before classical distillation\ntechniques and generative priors, we create a representative core-set that\nleads to enhanced generalization for unseen architectures - a significant\nchallenge of current distillation methods. More specifically, our proposed\nframework significantly boosts distilled quality, achieving up to a 5.2\npercentage points accuracy increase even with substantial dataset pruning,\ni.e., removing 80% of the original dataset prior to distillation. Overall, our\nexperimental results highlight the advantages of our easy-sample prioritization\nand cross-architecture robustness, paving the way for more effective and\nhigh-quality dataset distillation.\n","authors":["Brian B. Moser","Federico Raue","Tobias C. Nauen","Stanislav Frolov","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08426v6","updated":"2024-11-18T22:26:15Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper critically examines the fundamental distinctions between gradient\nmethods applied to non-differentiable functions (NGDMs) and classical gradient\ndescents (GDs) for differentiable functions, revealing significant gaps in\ncurrent deep learning optimization theory. We demonstrate that NGDMs exhibit\nmarkedly different convergence properties compared to GDs, strongly challenging\nthe applicability of extensive neural network convergence literature based on\n$L-smoothness$ to non-smooth neural networks. Our analysis reveals paradoxical\nbehavior of NDGM solutions for $L_{1}$-regularized problems, where increasing\nregularization counterintuitively leads to larger $L_{1}$ norms of optimal\nsolutions. This finding calls into question widely adopted $L_{1}$ penalization\ntechniques for network pruning. 
We further challenge the common assumption that\noptimization algorithms like RMSProp behave similarly in differentiable and\nnon-differentiable contexts. Expanding on the Edge of Stability phenomenon, we\ndemonstrate its occurrence in a broader class of functions, including Lipschitz\ncontinuous convex differentiable functions. This finding raises important\nquestions about its relevance and interpretation in non-convex,\nnon-differentiable neural networks, particularly those using ReLU activations.\nOur work identifies critical misunderstandings of NDGMs in influential\nliterature, stemming from an overreliance on strong smoothness assumptions.\nThese findings necessitate a reevaluation of optimization dynamics in deep\nlearning, emphasizing the crucial need for more nuanced theoretical foundations\nin analyzing these complex systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12089v1","updated":"2024-11-18T22:00:19Z","published":"2024-11-18T22:00:19Z","title":"FruitNinja: 3D Object Interior Texture Generation with Gaussian\n Splatting","summary":" In the real world, objects reveal internal textures when sliced or cut, yet\nthis behavior is not well-studied in 3D generation tasks today. For example,\nslicing a virtual 3D watermelon should reveal flesh and seeds. Given that no\navailable dataset captures an object's full internal structure and collecting\ndata from all slices is impractical, generative methods become the obvious\napproach. However, current 3D generation and inpainting methods often focus on\nvisible appearance and overlook internal textures. To bridge this gap, we\nintroduce FruitNinja, the first method to generate internal textures for 3D\nobjects undergoing geometric and topological changes. 
Our approach produces\nobjects via 3D Gaussian Splatting (3DGS) with both surface and interior\ntextures synthesized, enabling real-time slicing and rendering without\nadditional optimization. FruitNinja leverages a pre-trained diffusion model to\nprogressively inpaint cross-sectional views and applies voxel-grid-based\nsmoothing to achieve cohesive textures throughout the object. Our OpaqueAtom GS\nstrategy overcomes 3DGS limitations by employing densely distributed opaque\nGaussians, avoiding biases toward larger particles that destabilize training\nand sharp color transitions for fine-grained textures. Experimental results\nshow that FruitNinja substantially outperforms existing approaches, showcasing\nunmatched visual quality in real-time rendered internal views across arbitrary\ngeometry manipulations.\n","authors":["Fangyu Wu","Yuhao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10225v2","updated":"2024-11-18T21:52:03Z","published":"2024-06-14T17:58:28Z","title":"SatDiffMoE: A Mixture of Estimation Method for Satellite Image\n Super-resolution with Latent Diffusion Models","summary":" During the acquisition of satellite images, there is generally a trade-off\nbetween spatial resolution and temporal resolution (acquisition frequency) due\nto the onboard sensors of satellite imaging systems. High-resolution satellite\nimages are very important for land crop monitoring, urban planning, wildfire\nmanagement and a variety of applications. It is a significant yet challenging\ntask to achieve high spatial-temporal resolution in satellite imaging. With the\nadvent of diffusion models, we can now learn strong generative priors to\ngenerate realistic satellite images with high resolution, which can be utilized\nto promote the super-resolution task as well. 
In this work, we propose a novel\ndiffusion-based fusion algorithm called \\textbf{SatDiffMoE} that can take an\narbitrary number of sequential low-resolution satellite images at the same\nlocation as inputs, and fuse them into one high-resolution reconstructed image\nwith more fine details, by leveraging and fusing the complementary information\nfrom different time points. Our algorithm is highly flexible and allows\ntraining and inference on arbitrary number of low-resolution images.\nExperimental results show that our proposed SatDiffMoE method not only achieves\nsuperior performance for the satellite image super-resolution tasks on a\nvariety of datasets, but also gets an improved computational efficiency with\nreduced model parameters, compared with previous methods.\n","authors":["Zhaoxu Luo","Bowen Song","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2406.10225v2.pdf","comment":"Accepted by ICML 2024 Workshop on Advancing Neural Network Training\n (WANT): Computational Efficiency, Scalability, and Resource Optimization"},{"id":"http://arxiv.org/abs/2307.08850v2","updated":"2024-11-18T21:51:16Z","published":"2023-07-17T21:22:17Z","title":"LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception\n Network for Autonomous Driving","summary":" LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR\nperception has the largest body of literature after camera perception. However,\nmulti-task learning across tasks like detection, segmentation, and motion\nestimation using LiDAR remains relatively unexplored, especially on\nautomotive-grade embedded platforms. We present a real-time multi-task\nconvolutional neural network for LiDAR-based object detection, semantics, and\nmotion segmentation. The unified architecture comprises a shared encoder and\ntask-specific decoders, enabling joint representation learning. 
We propose a\nnovel Semantic Weighting and Guidance (SWAG) module to transfer semantic\nfeatures for improved object detection selectively. Our heterogeneous training\nscheme combines diverse datasets and exploits complementary cues between tasks.\nThe work provides the first embedded implementation unifying these key\nperception tasks from LiDAR point clouds achieving 3ms latency on the embedded\nNVIDIA Xavier platform. We achieve state-of-the-art results for two tasks,\nsemantic and motion segmentation, and close to state-of-the-art performance for\n3D object detection. By maximizing hardware efficiency and leveraging\nmulti-task synergies, our method delivers an accurate and efficient solution\ntailored for real-world automated driving deployment. Qualitative results can\nbe seen at https://youtu.be/H-hWRzv2lIY.\n","authors":["Sambit Mohapatra","Senthil Yogamani","Varun Ravi Kumar","Stefan Milz","Heinrich Gotzig","Patrick Mäder"],"pdf_url":"https://arxiv.org/pdf/2307.08850v2.pdf","comment":"Accepted for publication at IEEE Transactions on Intelligent\n Transportation Systems"},{"id":"http://arxiv.org/abs/2406.13358v2","updated":"2024-11-18T21:36:53Z","published":"2024-06-19T09:05:05Z","title":"Multi-scale Restoration of Missing Data in Optical Time-series Images\n with Masked Spatial-Temporal Attention Network","summary":" Remote sensing images often suffer from substantial data loss due to factors\nsuch as thick cloud cover and sensor limitations. Existing methods for imputing\nmissing values in remote sensing images fail to fully exploit spatiotemporal\nauxiliary information, which restricts the accuracy of their reconstructions.\nTo address this issue, this paper proposes a novel deep learning-based approach\ncalled MS2TAN (Multi-Scale Masked Spatial-Temporal Attention Network) for\nreconstructing time-series remote sensing images. 
First, we introduce an\nefficient spatiotemporal feature extractor based on Masked Spatial-Temporal\nAttention (MSTA) to capture high-quality representations of spatiotemporal\nneighborhood features surrounding missing regions while significantly reducing\nthe computational complexity of the attention mechanism. Second, a Multi-Scale\nRestoration Network composed of MSTA-based Feature Extractors is designed to\nprogressively refine missing values by exploring spatiotemporal neighborhood\nfeatures at different scales. Third, we propose a \"Pixel-Structure-Perception\"\nMulti-Objective Joint Optimization method to enhance the visual quality of the\nreconstructed results from multiple perspectives and to preserve more texture\nstructures. Finally, quantitative experimental results under multi-temporal\ninputs on two public datasets demonstrate that the proposed method outperforms\ncompetitive approaches, achieving a 9.76%/9.30% reduction in Mean Absolute\nError (MAE) and a 0.56 dB/0.62 dB increase in Peak Signal-to-Noise Ratio\n(PSNR), along with stronger texture and structural consistency. Ablation\nexperiments further validate the contribution of the core innovations to\nimputation accuracy.\n","authors":["Zaiyan Zhang","Jining Yan","Yuanqi Liang","Jiaxin Feng","Haixu He","Li Cao"],"pdf_url":"https://arxiv.org/pdf/2406.13358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12073v1","updated":"2024-11-18T21:34:05Z","published":"2024-11-18T21:34:05Z","title":"Just Leaf It: Accelerating Diffusion Classifiers with Hierarchical Class\n Pruning","summary":" Diffusion models, known for their generative capabilities, have recently\nshown unexpected potential in image classification tasks by using Bayes'\ntheorem. However, most diffusion classifiers require evaluating all class\nlabels for a single classification, leading to significant computational costs\nthat can hinder their application in large-scale scenarios. 
To address this, we\npresent a Hierarchical Diffusion Classifier (HDC) that exploits the inherent\nhierarchical label structure of a dataset. By progressively pruning irrelevant\nhigh-level categories and refining predictions only within relevant\nsubcategories, i.e., leaf nodes, HDC reduces the total number of class\nevaluations. As a result, HDC can accelerate inference by up to 60% while\nmaintaining and, in some cases, improving classification accuracy. Our work\nenables a new control mechanism of the trade-off between speed and precision,\nmaking diffusion-based classification more viable for real-world applications,\nparticularly in large-scale image classification tasks.\n","authors":["Arundhati S. Shanbhag","Brian B. Moser","Tobias C. Nauen","Stanislav Frolov","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12072v1","updated":"2024-11-18T21:32:49Z","published":"2024-11-18T21:32:49Z","title":"Zoomed In, Diffused Out: Towards Local Degradation-Aware Multi-Diffusion\n for Extreme Image Super-Resolution","summary":" Large-scale, pre-trained Text-to-Image (T2I) diffusion models have gained\nsignificant popularity in image generation tasks and have shown unexpected\npotential in image Super-Resolution (SR). However, most existing T2I diffusion\nmodels are trained with a resolution limit of 512x512, making scaling beyond\nthis resolution an unresolved but necessary challenge for image SR. In this\nwork, we introduce a novel approach that, for the first time, enables these\nmodels to generate 2K, 4K, and even 8K images without any additional training.\nOur method leverages MultiDiffusion, which distributes the generation across\nmultiple diffusion paths to ensure global coherence at larger scales, and local\ndegradation-aware prompt extraction, which guides the T2I model to reconstruct\nfine local structures according to its low-resolution input. 
These innovations\nunlock higher resolutions, allowing T2I diffusion models to be applied to image\nSR tasks without limitation on resolution.\n","authors":["Brian B. Moser","Stanislav Frolov","Tobias C. Nauen","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12070v1","updated":"2024-11-18T21:29:50Z","published":"2024-11-18T21:29:50Z","title":"Autoassociative Learning of Structural Representations for Modeling and\n Classification in Medical Imaging","summary":" Deep learning architectures based on convolutional neural networks tend to\nrely on continuous, smooth features. While this characteristics provides\nsignificant robustness and proves useful in many real-world tasks, it is\nstrikingly incompatible with the physical characteristic of the world, which,\nat the scale in which humans operate, comprises crisp objects, typically\nrepresenting well-defined categories. This study proposes a class of\nneurosymbolic systems that learn by reconstructing the observed images in terms\nof visual primitives and are thus forced to form high-level, structural\nexplanations of them. 
When applied to the task of diagnosing abnormalities in\nhistological imaging, the method proved superior to a conventional deep\nlearning architecture in terms of classification accuracy, while being more\ntransparent.\n","authors":["Zuzanna Buchnajzer","Kacper Dobek","Stanisław Hapke","Daniel Jankowski","Krzysztof Krawiec"],"pdf_url":"https://arxiv.org/pdf/2411.12070v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.03766v2","updated":"2024-11-18T21:08:21Z","published":"2024-08-20T15:57:40Z","title":"OpenCap markerless motion capture estimation of lower extremity\n kinematics and dynamics in cycling","summary":" Markerless motion capture offers several benefits over traditional\nmarker-based systems by eliminating the need for physical markers, which are\nprone to misplacement and artifacts. Utilizing computer vision and deep\nlearning algorithms, markerless systems can directly detect human body\nlandmarks, reducing manual processing and errors associated with marker\nplacement. These systems are adaptable, able to track user-defined features,\nand practical for real-world applications using consumer-grade devices such as\nsmartphone cameras. This study compares the performance of OpenCap, a\nmarkerless motion capture system, with traditional marker-based systems in\nassessing cycling biomechanics. Ten healthy adults participated in experiments\nto capture sagittal hip, knee, and ankle kinematics and dynamics using both\nmethods. OpenCap used videos from smartphones and integrated computer vision\nand musculoskeletal simulations to estimate 3D kinematics. Results showed high\nagreement between the two systems, with no significant differences in kinematic\nand kinetic measurements for the hip, knee, and ankle. The correlation\ncoefficients exceeded 0.98, indicating very strong consistency. 
Errors were\nminimal, with kinematic errors under 4 degrees and kinetic errors below 5 Nm.\nThis study concludes that OpenCap is a viable alternative to marker-based\nmotion capture, offering comparable precision without extensive setup for hip\n(flexion/extension), knee (flexion/extension), and ankle\n(dorsiflexion/plantarflexion) joints. Future work should aim to enhance the\naccuracy of ankle joint measurements and extend analyses to 3D kinematics and\nkinetics for comprehensive biomechanical assessments.\n","authors":["Reza Kakavand","Reza Ahmadi","Atousa Parsaei","W. Brent Edwards","Amin Komeili"],"pdf_url":"https://arxiv.org/pdf/2409.03766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08982v2","updated":"2024-11-18T21:08:16Z","published":"2024-08-16T19:17:02Z","title":"Deep Generative Classification of Blood Cell Morphology","summary":" Accurate classification of haematological cells is critical for diagnosing\nblood disorders, but presents significant challenges for machine automation\nowing to the complexity of cell morphology, heterogeneities of biological,\npathological, and imaging characteristics, and the imbalance of cell type\nfrequencies. We introduce CytoDiffusion, a diffusion-based classifier that\neffectively models blood cell morphology, combining accurate classification\nwith robust anomaly detection, resistance to distributional shifts,\ninterpretability, data efficiency, and superhuman uncertainty quantification.\nOur approach outperforms state-of-the-art discriminative models in anomaly\ndetection (AUC 0.990 vs. 0.918), resistance to domain shifts (85.85% vs. 74.38%\nbalanced accuracy), and performance in low-data regimes (95.88% vs. 94.95%\nbalanced accuracy). 
Notably, our model generates synthetic blood cell images\nthat are nearly indistinguishable from real images, as demonstrated by an\nauthenticity test in which expert haematologists achieved only 52.3% accuracy\n(95% CI: [50.5%, 54.2%]) in distinguishing real from generated images.\nFurthermore, we enhance model explainability through the generation of directly\ninterpretable counterfactual heatmaps. Our comprehensive evaluation framework,\nencompassing these multiple performance dimensions, establishes a new benchmark\nfor medical image analysis in haematology, ultimately enabling improved\ndiagnostic accuracy in clinical settings. Our code is available at\nhttps://github.com/CambridgeCIA/CytoDiffusion.\n","authors":["Simon Deltadahl","Julian Gilbey","Christine Van Laer","Nancy Boeckx","Mathie Leers","Tanya Freeman","Laura Aiken","Timothy Farren","Matthew Smith","Mohamad Zeina","BloodCounts consortium","James HF Rudd","Concetta Piazzese","Joseph Taylor","Nicholas Gleadall","Carola-Bibiane Schönlieb","Suthesh Sivapalaratnam","Michael Roberts","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2408.08982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12044v1","updated":"2024-11-18T20:31:38Z","published":"2024-11-18T20:31:38Z","title":"ITACLIP: Boosting Training-Free Semantic Segmentation with Image, Text,\n and Architectural Enhancements","summary":" Recent advances in foundational Vision Language Models (VLMs) have reshaped\nthe evaluation paradigm in computer vision tasks. These foundational models,\nespecially CLIP, have accelerated research in open-vocabulary computer vision\ntasks, including Open-Vocabulary Semantic Segmentation (OVSS). Although the\ninitial results are promising, the dense prediction capabilities of VLMs still\nrequire further improvement. 
In this study, we enhance the semantic\nsegmentation performance of CLIP by introducing new modules and modifications:\n1) architectural changes in the last layer of ViT and the incorporation of\nattention maps from the middle layers with the last layer, 2) Image\nEngineering: applying data augmentations to enrich input image representations,\nand 3) using Large Language Models (LLMs) to generate definitions and synonyms\nfor each class name to leverage CLIP's open-vocabulary capabilities. Our\ntraining-free method, ITACLIP, outperforms current state-of-the-art approaches\non segmentation benchmarks such as COCO-Stuff, COCO-Object, Pascal Context, and\nPascal VOC. Our code is available at https://github.com/m-arda-aydn/ITACLIP.\n","authors":["M. Arda Aydın","Efe Mert Çırpar","Elvin Abdinli","Gozde Unal","Yusuf H. Sahin"],"pdf_url":"https://arxiv.org/pdf/2411.12044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13379v2","updated":"2024-11-18T20:30:20Z","published":"2024-08-23T21:25:16Z","title":"N-DriverMotion: Driver motion learning and prediction using an\n event-based camera and directly trained spiking neural networks on Loihi 2","summary":" Driver motion recognition is a principal factor in ensuring the safety of\ndriving systems. This paper presents a novel system for learning and predicting\ndriver motions and an event-based high-resolution (1280x720) dataset,\nN-DriverMotion, newly collected to train on a neuromorphic vision system. The\nsystem comprises an event-based camera that generates the first high-resolution\ndriver motion dataset representing spike inputs and efficient spiking neural\nnetworks (SNNs) that are effective in training and predicting the driver's\ngestures. The event dataset consists of 13 driver motion categories classified\nby direction (front, side), illumination (bright, moderate, dark), and\nparticipant. 
A novel simplified four-layer convolutional spiking neural network\n(CSNN) that we proposed was directly trained using the high-resolution dataset\nwithout any time-consuming preprocessing. This enables efficient adaptation to\non-device SNNs for real-time inference on high-resolution event-based streams.\nCompared with recent gesture recognition systems adopting neural networks for\nvision processing, the proposed neuromorphic vision system achieves comparable\naccuracy, 94.04\\%, in recognizing driver motions with the CSNN architecture.\nOur proposed CSNN and the dataset can be used to develop safer and more\nefficient driver monitoring systems for autonomous vehicles or edge devices\nrequiring an efficient neural network architecture.\n","authors":["Hyo Jong Chung","Byungkon Kang","Yoonseok Yang"],"pdf_url":"https://arxiv.org/pdf/2408.13379v2.pdf","comment":"Accepted for publication in IEEE Open Journal of Vehicular Technology\n (OJVT) on 18 November 2024"},{"id":"http://arxiv.org/abs/2403.12977v3","updated":"2024-11-18T20:09:57Z","published":"2024-02-10T01:16:21Z","title":"SportsNGEN: Sustained Generation of Realistic Multi-player Sports\n Gameplay","summary":" We present a transformer decoder based sports simulation engine, SportsNGEN,\ntrained on sports player and ball tracking sequences, that is capable of\ngenerating sustained gameplay and accurately mimicking the decision making of\nreal players. By training on a large database of professional tennis tracking\ndata, we demonstrate that simulations produced by SportsNGEN can be used to\npredict the outcomes of rallies, determine the best shot choices at any point,\nand evaluate counterfactual or what if scenarios to inform coaching decisions\nand elevate broadcast coverage. By combining the generated simulations with a\nshot classifier and logic to start and end rallies, the system is capable of\nsimulating an entire tennis match. 
We evaluate SportsNGEN by comparing\nstatistics of the simulations with those of real matches between the same\nplayers. We show that the model output sampling parameters are crucial to\nsimulation realism and that SportsNGEN is probabilistically well-calibrated to\nreal data. In addition, a generic version of SportsNGEN can be customized to a\nspecific player by fine-tuning on the subset of match data that includes that\nplayer. Finally, we show qualitative results indicating the same approach works\nfor football.\n","authors":["Lachlan Thorpe","Lewis Bawden","Karanjot Vendal","John Bronskill","Richard E. Turner"],"pdf_url":"https://arxiv.org/pdf/2403.12977v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12028v1","updated":"2024-11-18T20:03:49Z","published":"2024-11-18T20:03:49Z","title":"In-Situ Melt Pool Characterization via Thermal Imaging for Defect\n Detection in Directed Energy Deposition Using Vision Transformers","summary":" Directed Energy Deposition (DED) offers significant potential for\nmanufacturing complex and multi-material parts. However, internal defects such\nas porosity and cracks can compromise mechanical properties and overall\nperformance. This study focuses on in-situ monitoring and characterization of\nmelt pools associated with porosity, aiming to improve defect detection and\nquality control in DED-printed parts. Traditional machine learning approaches\nfor defect identification rely on extensive labeled datasets, often scarce and\nexpensive to generate in real-world manufacturing. To address this, our\nframework employs self-supervised learning on unlabeled melt pool data using a\nVision Transformer-based Masked Autoencoder (MAE) to produce highly\nrepresentative embeddings. These fine-tuned embeddings are leveraged via\ntransfer learning to train classifiers on a limited labeled dataset, enabling\nthe effective identification of melt pool anomalies. 
We evaluate two\nclassifiers: (1) a Vision Transformer (ViT) classifier utilizing the fine-tuned\nMAE Encoder's parameters and (2) the fine-tuned MAE Encoder combined with an\nMLP classifier head. Our framework achieves overall accuracy ranging from\n95.44% to 99.17% and an average F1 score exceeding 80%, with the ViT Classifier\nslightly outperforming the MAE Encoder Classifier. This demonstrates the\nscalability and cost-effectiveness of our approach for automated quality\ncontrol in DED, effectively detecting defects with minimal labeled data.\n","authors":["Israt Zarin Era","Fan Zhou","Ahmed Shoyeb Raihan","Imtiaz Ahmed","Alan Abul-Haj","James Craig","Srinjoy Das","Zhichao Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12028v1.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.11803v1","updated":"2024-11-18T18:17:36Z","published":"2024-11-18T18:17:36Z","title":"Scalable control synthesis for stochastic systems via structural IMDP\n abstractions","summary":" This paper introduces a novel abstraction-based framework for controller\nsynthesis of nonlinear discrete-time stochastic systems. The focus is on\nprobabilistic reach-avoid specifications. The framework is based on abstracting\na stochastic system into a new class of robust Markov models, called\northogonally decoupled Interval Markov Decision Processes (odIMDPs).\nSpecifically, an odIMDPs is a class of robust Markov processes, where the\ntransition probabilities between each pair of states are uncertain and have the\nproduct form. We show that such a specific form in the transition probabilities\nallows one to build compositional abstractions of stochastic systems that, for\neach state, are only required to store the marginal probability bounds of the\noriginal system. This leads to improved memory complexity for our approach\ncompared to commonly employed abstraction-based approaches. 
Furthermore, we\nshow that an optimal control strategy for a odIMDPs can be computed by solving\na set of linear problems. When the resulting strategy is mapped back to the\noriginal system, it is guaranteed to lead to reduced conservatism compared to\nexisting approaches. To test our theoretical framework, we perform an extensive\nempirical comparison of our methods against Interval Markov Decision Process-\nand Markov Decision Process-based approaches on various benchmarks including 7D\nsystems. Our empirical analysis shows that our approach substantially\noutperforms state-of-the-art approaches in terms of both memory requirements\nand the conservatism of the results.\n","authors":["Frederik Baymler Mathiesen","Sofie Haesaert","Luca Laurenti"],"pdf_url":"https://arxiv.org/pdf/2411.11803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11791v1","updated":"2024-11-18T18:05:27Z","published":"2024-11-18T18:05:27Z","title":"Machine Learning-Assisted Distribution System Network Reconfiguration\n Problem","summary":" High penetration from volatile renewable energy resources in the grid and the\nvarying nature of loads raise the need for frequent line switching to ensure\nthe efficient operation of electrical distribution networks. Operators must\nensure maximum load delivery, reduced losses, and the operation between voltage\nlimits. However, computations to decide the optimal feeder configuration are\noften computationally expensive and intractable, making it unfavorable for\nreal-time operations. This is mainly due to the existence of binary variables\nin the network reconfiguration optimization problem. To tackle this issue, we\nhave devised an approach that leverages machine learning techniques to reshape\ndistribution networks featuring multiple substations. This involves predicting\nthe substation responsible for serving each part of the network. Hence, it\nleaves simple and more tractable Optimal Power Flow problems to be solved. 
This\nmethod can produce accurate results in a significantly faster time, as\ndemonstrated using the IEEE 37-bus distribution feeder. Compared to the\ntraditional optimization-based approaches, a feasible solution is achieved\napproximately ten times faster for all the tested scenarios.\n","authors":["Richard Asiamah","Yuqi Zhou","Ahmed S. Zamzam"],"pdf_url":"https://arxiv.org/pdf/2411.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11788v1","updated":"2024-11-18T18:03:49Z","published":"2024-11-18T18:03:49Z","title":"Enabling steep slope walking on Husky using reduced order modeling and\n quadratic programming","summary":" Wing-assisted inclined running (WAIR) observed in some young birds, is an\nattractive maneuver that can be extended to legged aerial systems. This study\nproposes a control method using a modified Variable Length Inverted Pendulum\n(VLIP) by assuming a fixed zero moment point and thruster forces collocated at\nthe center of mass of the pendulum. A QP MPC is used to find the optimal ground\nreaction forces and thruster forces to track a reference position and velocity\ntrajectory. Simulation results of this VLIP model on a slope of 40 degrees is\nmaintained and shows thruster forces that can be obtained through posture\nmanipulation. 
The simulation also provides insight to how the combined efforts\nof the thrusters and the tractive forces from the legs make WAIR possible in\nthruster-assisted legged systems.\n","authors":["Kaushik Venkatesh Krishnamurthy","Eric Sihite","Chenghao Wang","Shreyansh Pitroda","Adarsh Salagame","Alireza Ramezani","Morteza Gharib"],"pdf_url":"https://arxiv.org/pdf/2411.11788v1.pdf","comment":"6 pages, 8 figures, submitted to the Humanoids 2025 conference"},{"id":"http://arxiv.org/abs/2411.11778v1","updated":"2024-11-18T17:55:02Z","published":"2024-11-18T17:55:02Z","title":"Design And Optimization Of Multi-rendezvous Manoeuvres Based On\n Reinforcement Learning And Convex Optimization","summary":" Optimizing space vehicle routing is crucial for critical applications such as\non-orbit servicing, constellation deployment, and space debris de-orbiting.\nMulti-target Rendezvous presents a significant challenge in this domain. This\nproblem involves determining the optimal sequence in which to visit a set of\ntargets, and the corresponding optimal trajectories: this results in a\ndemanding NP-hard problem. We introduce a framework for the design and\nrefinement of multi-rendezvous trajectories based on heuristic combinatorial\noptimization and Sequential Convex Programming. Our framework is both highly\nmodular and capable of leveraging candidate solutions obtained with advanced\napproaches and handcrafted heuristics. We demonstrate this flexibility by\nintegrating an Attention-based routing policy trained with Reinforcement\nLearning to improve the performance of the combinatorial optimization process.\nWe show that Reinforcement Learning approaches for combinatorial optimization\ncan be effectively applied to spacecraft routing problems. 
We apply the\nproposed framework to the UARX Space OSSIE mission: we are able to thoroughly\nexplore the mission design space, finding optimal tours and trajectories for a\nwide variety of mission scenarios.\n","authors":["Antonio López Rivera","Lucrezia Marcovaldi","Jesús Ramírez","Alex Cuenca","David Bermejo"],"pdf_url":"https://arxiv.org/pdf/2411.11778v1.pdf","comment":"18 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2411.11762v1","updated":"2024-11-18T17:40:43Z","published":"2024-11-18T17:40:43Z","title":"High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous\n Electric Vehicles","summary":" Executing drift maneuvers during high-speed cornering presents significant\nchallenges for autonomous vehicles, yet offers the potential to minimize\nturning time and enhance driving dynamics. While reinforcement learning (RL)\nhas shown promising results in simulated environments, discrepancies between\nsimulations and real-world conditions have limited its practical deployment.\nThis study introduces an innovative control framework that integrates\ntrajectory optimization with drift maneuvers, aiming to improve the algorithm's\nadaptability for real-vehicle implementation. We leveraged Bezier-based\npre-trajectory optimization to enhance rewards and optimize the controller\nthrough Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated\nenvironment. For real-world deployment, we implement a hybrid RL-MPC fusion\nmechanism, , where TD3-derived maneuvers serve as primary inputs for a Model\nPredictive Controller (MPC). This integration enables precise real-time\ntracking of the optimal trajectory, with MPC providing corrective inputs to\nbridge the gap between simulation and reality. The efficacy of this method is\nvalidated through real-vehicle tests on consumer-grade electric vehicles,\nfocusing on drift U-turns and drift right-angle turns. 
The control outcomes of\nthese real-vehicle tests are thoroughly documented in the paper, supported by\nsupplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this\nstudy is the first to deploy and apply an RL-based transient drift cornering\nalgorithm on consumer-grade electric vehicles.\n","authors":["Shiyue Zhao","Junzhi Zhang","Neda Masoud","Yuhong Jiang","Heye Huang","Tao Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11762v1.pdf","comment":"In the process of being submitted to the Journal of IEEE Transactions\n on Industrial Electronics"},{"id":"http://arxiv.org/abs/2311.05744v2","updated":"2024-11-18T17:09:17Z","published":"2023-11-09T21:03:39Z","title":"Flexibility of Integrated Power and Gas Systems: Gas Flow Modeling and\n Solution Choices Matter","summary":" Due to their slow gas flow dynamics, natural gas pipelines function as\nshort-term storage, the so-called linepack. By efficiently utilizing linepack,\nthe natural gas system can provide flexibility to the power system through the\nflexible operation of gas-fired power plants. This requires accurately\nrepresenting the gas flow physics governed by partial differential equations.\nAlthough several modeling and solution choices have been proposed in the\nliterature, their impact on the flexibility provision of gas networks to power\nsystems has not been thoroughly analyzed and compared. This paper bridges this\ngap by first developing a unified framework. We harmonize existing approaches\nand demonstrate their derivation from and application to the partial\ndifferential equations. Secondly, based on the proposed framework, we\nnumerically analyze the implications of various modeling and solution choices\non the flexibility provision from gas networks to power systems. 
One key\nconclusion is that relaxation-based approaches allow charging and discharging\nthe linepack at physically infeasible high rates, ultimately overestimating the\nflexibility.\n","authors":["Enrica Raheli","Yannick Werner","Jalal Kazempour"],"pdf_url":"https://arxiv.org/pdf/2311.05744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11698v1","updated":"2024-11-18T16:20:21Z","published":"2024-11-18T16:20:21Z","title":"A New Finite-Horizon Dynamic Programming Analysis of Nonanticipative\n Rate-Distortion Function for Markov Sources","summary":" This paper deals with the computation of a non-asymptotic lower bound by\nmeans of the nonanticipative rate-distortion function (NRDF) on the\ndiscrete-time zero-delay variable-rate lossy compression problem for discrete\nMarkov sources with per-stage, single-letter distortion. First, we derive a new\ninformation structure of the NRDF for Markov sources and single-letter\ndistortions. Second, we derive new convexity results on the NRDF, which\nfacilitate the use of Lagrange duality theorem to cast the problem as an\nunconstrained partially observable finite-time horizon stochastic dynamic\nprogramming (DP) algorithm subject to a probabilistic state (belief state) that\nsummarizes the past information about the reproduction symbols and takes values\nin a continuous state space. Instead of approximating the DP algorithm\ndirectly, we use Karush-Kuhn-Tucker (KKT) conditions to find an implicit\nclosed-form expression of the optimal control policy of the stochastic DP\n(i.e., the minimizing distribution of the NRDF) and approximate the control\npolicy and the cost-to-go function (a function of the rate) stage-wise, via a\nnovel dynamic alternating minimization (AM) approach, that is realized by an\noffline algorithm operating using backward recursions, with provable\nconvergence guarantees. 
We obtain the clean values of the aforementioned\nquantities using an online (forward) algorithm operating for any finite-time\nhorizon. Our methodology provides an approximate solution to the exact NRDF\nsolution, which becomes near-optimal as the search space of the belief state\nbecomes sufficiently large at each time stage. We corroborate our theoretical\nfindings with simulation studies where we apply our algorithms assuming\ntime-varying and time-invariant binary Markov processes.\n","authors":["Zixuan He","Charalambos D. Charalambous","Photios A. Stavrou"],"pdf_url":"https://arxiv.org/pdf/2411.11698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11687v1","updated":"2024-11-18T16:11:07Z","published":"2024-11-18T16:11:07Z","title":"Coevolution of Opinion Dynamics and Recommendation System: Modeling\n Analysis and Reinforcement Learning Based Manipulation","summary":" In this work, we develop an analytical framework that integrates opinion\ndynamics with a recommendation system. By incorporating elements such as\ncollaborative filtering, we provide a precise characterization of how\nrecommendation systems shape interpersonal interactions and influence opinion\nformation. Moreover, the property of the coevolution of both opinion dynamics\nand recommendation systems is also shown. Specifically, the convergence of this\ncoevolutionary system is theoretically proved, and the mechanisms behind filter\nbubble formation are elucidated. Our analysis of the maximum number of opinion\nclusters shows how recommendation system parameters affect opinion grouping and\npolarization. Additionally, we incorporate the influence of propagators into\nour model and propose a reinforcement learning-based solution. 
The analysis and\nthe propagation solution are demonstrated in simulation using the Yelp data\nset.\n","authors":["Yuhong Chen","Xiaobing Dai","Martin Buss","Fangzhou Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11652v1","updated":"2024-11-18T15:29:00Z","published":"2024-11-18T15:29:00Z","title":"On the Incorporation of Stability Constraints into Sequential\n Operational Scheduling","summary":" With the increasing penetration of Inverter-Based Resources (IBRs), power\nsystem stability constraints must be incorporated into the operational\nframework, transforming it into stability-constrained optimization. Currently,\nthere exist parallel research efforts on developing the stability constraints\nwithin DC power flow-based unit commitment (UC) and AC Optimal Power Flow\n(OPF). However, few studies discuss how including such constraints can interact\nwith each other and eventually impact grid stability. In this context, this\nwork simulates a realistic power system decision making framework and provides\na thorough analysis on the necessity of incorporating frequency nadir and small\nsignal stability constraints into these sequentially connected two operation\nstages. 
The simulation results demonstrate that including both stability\nconstraints in the UC is essential to maintain power system stability, while\nthe inclusion in AC OPF can further improve the stability index.\n","authors":["Wangkun Xu","Zhongda Chu","Florin Capitanescu","Fei Teng"],"pdf_url":"https://arxiv.org/pdf/2411.11652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17221v2","updated":"2024-11-18T15:21:40Z","published":"2024-10-22T17:45:45Z","title":"Scalable spectral representations for multi-agent reinforcement learning\n in network MDPs","summary":" Network Markov Decision Processes (MDPs), a popular model for multi-agent\ncontrol, pose a significant challenge to efficient learning due to the\nexponential growth of the global state-action space with the number of agents.\nIn this work, utilizing the exponential decay property of network dynamics, we\nfirst derive scalable spectral local representations for network MDPs, which\ninduces a network linear subspace for the local $Q$-function of each agent.\nBuilding on these local spectral representations, we design a scalable\nalgorithmic framework for continuous state-action network MDPs, and provide\nend-to-end guarantees for the convergence of our algorithm. 
Empirically, we\nvalidate the effectiveness of our scalable representation-based approach on two\nbenchmark problems, and demonstrate the advantages of our approach over generic\nfunction approximation approaches to representing the local $Q$-functions.\n","authors":["Zhaolin Ren","Runyu Zhang","Bo Dai","Na Li"],"pdf_url":"https://arxiv.org/pdf/2410.17221v2.pdf","comment":"Updated title, corrected an issue with an author's name"},{"id":"http://arxiv.org/abs/2409.07242v2","updated":"2024-11-18T14:33:04Z","published":"2024-09-11T12:54:35Z","title":"Orthogonal Mode Decomposition for Finite Discrete Signals","summary":" In this paper, an orthogonal mode decomposition method is proposed to\ndecompose ffnite length real signals on both the real and imaginary axes of the\ncomplex plane. The interpolation function space of ffnite length discrete\nsignal is constructed, and the relationship between the dimensionality of the\ninterpolation function space and its subspaces and the band width of the\ninterpolation function is analyzed. It is proved that the intrinsic mode is\nactually the narrow band signal whose intrinsic instantaneous frequency is\nalways positive (or always negative). Thus, the eigenmode decomposition problem\nis transformed into the orthogonal projection problem of interpolation function\nspace to its low frequency subspace or narrow band subspace. Different from the\nexisting mode decomposition methods, the orthogonal modal decomposition is a\nlocal time-frequency domain algorithm. Each operation extracts a speciffc mode.\nThe global decomposition results obtained under the precise deffnition of\neigenmodes have uniqueness and orthogonality. 
The computational complexity of\nthe orthogonal mode decomposition method is also much smaller than that of the\nexisting mode decomposition methods.\n","authors":["Ning Li","Lezhi Li"],"pdf_url":"https://arxiv.org/pdf/2409.07242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11610v1","updated":"2024-11-18T14:30:54Z","published":"2024-11-18T14:30:54Z","title":"Approximate predictive control barrier function for discrete-time\n systems","summary":" We propose integrating an explicit approximation of a predictive control\nbarrier function (PCBF) in a safety filter framework. The approximated PCBF is\nimplicitly defined through an optimal control problem and allows guaranteeing\ninvariance of an implicitly defined safe set as well as stability of this safe\nset within a larger domain of attraction. By extending existing theoretical\nanalysis of the PCBF, we establish inherent robustness of the original\nalgorithm and translate the guarantees to input-to-state stability of the\nproposed algorithm with respect to possible approximation errors, recovering\nthe same guarantees in the absence of approximation errors. The proposed\nalgorithm allows certifying inputs with respect to state constraint\nsatisfaction through a single function evaluation and filtering unsafe inputs\nthrough a control barrier function based safety filter, which is independent of\nthe time horizon of the original predictive optimisation problem, resulting in\nsignificant online computational benefits. We demonstrate the stability\nproperties of the proposed algorithm on a linear system example as well as its\nuse a fast safety filter for miniature race cars in simulation.\n","authors":["Alexandre Didier","Melanie N. 
Zeilinger"],"pdf_url":"https://arxiv.org/pdf/2411.11610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11598v1","updated":"2024-11-18T14:20:44Z","published":"2024-11-18T14:20:44Z","title":"Carleman-Fourier Linearization of Complex Dynamical Systems: Convergence\n and Explicit Error Bounds","summary":" This paper presents a Carleman-Fourier linearization method for nonlinear\ndynamical systems with periodic vector fields involving multiple fundamental\nfrequencies. By employing Fourier basis functions, the nonlinear dynamical\nsystem is transformed into a linear model on an infinite-dimensional space. The\nproposed approach yields accurate approximations over extended regions around\nequilibria and for longer time horizons, compared to traditional Carleman\nlinearization with monomials. Additionally, we develop a finite-section\napproximation for the resulting infinite-dimensional system and provide\nexplicit error bounds that demonstrate exponential convergence to the original\nsystem's solution as the truncation length increases. For specific classes of\ndynamical systems, exponential convergence is achieved across the entire time\nhorizon. The practical significance of these results lies in guiding the\nselection of suitable truncation lengths for applications such as model\npredictive control, safety verification through reachability analysis, and\nefficient quantum computing algorithms. 
The theoretical findings are validated\nthrough illustrative simulations.\n","authors":["Panpan Chen","Nader Motee","Qiyu Sun"],"pdf_url":"https://arxiv.org/pdf/2411.11598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11596v1","updated":"2024-11-18T14:18:18Z","published":"2024-11-18T14:18:18Z","title":"Integrating and Comparing Radiality Constraints for Optimized\n Distribution System Reconfiguration","summary":" The reconfiguration of electrical power distribution systems is a crucial\noptimization problem aimed at minimizing power losses by altering the system\ntopology through the operation of interconnection switches. This problem,\ntypically modelled as a mixed integer nonlinear program demands high\ncomputational resources for large scale networks and requires specialized\nradiality constraints for maintaining the tree like structure of distribution\nnetworks. This paper presents a comprehensive analysis that integrates and\ncompares the computational burden associated with different radiality\nconstraint formulations proposed in the specialized literature for the\nreconfiguration of distribution systems. By using consistent hardware and\nsoftware setups, we evaluate the performance of these constraints across\nseveral well known test cases. 
Our findings reveal significant differences in\ncomputational efficiency depending on the chosen set of radiality constraints,\nproviding valuable insights for optimizing reconfiguration strategies in\npractical distribution networks.\n","authors":["Pablo Cortes","Alejandra Tabares","Fredy Franco"],"pdf_url":"https://arxiv.org/pdf/2411.11596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11587v1","updated":"2024-11-18T14:04:34Z","published":"2024-11-18T14:04:34Z","title":"A Linear Differential Inclusion for Contraction Analysis to Known\n Trajectories","summary":" Infinitesimal contraction analysis provides exponential convergence rates\nbetween arbitrary pairs of trajectories of a system by studying the system's\nlinearization. An essentially equivalent viewpoint arises through stability\nanalysis of a linear differential inclusion (LDI) encompassing the incremental\nbehavior of the system. In this note, we study contraction of a system to a\nparticular known trajectory, deriving a new LDI characterizing the error\nbetween arbitrary trajectories and this known trajectory. As with classical\ncontraction analysis, this new inclusion is constructed via first partial\nderivatives of the system's vector field, and contraction rates are obtained\nwith familiar tools: uniform bounding of the logarithmic norm and LMI-based\nLyapunov conditions. Our LDI is guaranteed to outperform a usual contraction\nanalysis in two special circumstances: i) when the bound on the logarithmic\nnorm arises from an interval overapproximation of the Jacobian matrix, and ii)\nwhen the norm considered is the $\\ell_1$ norm. 
Finally, we demonstrate how the\nproposed approach strictly improves an existing framework for ellipsoidal\nreachable set computation.\n","authors":["Akash Harapanahalli","Samuel Coogan"],"pdf_url":"https://arxiv.org/pdf/2411.11587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11582v1","updated":"2024-11-18T13:59:29Z","published":"2024-11-18T13:59:29Z","title":"Exploring LLMs for Verifying Technical System Specifications Against\n Requirements","summary":" Requirements engineering is a knowledge intensive process and crucial for the\nsuccess of engineering projects. The field of knowledge-based requirements\nengineering (KBRE) aims to support engineers by providing knowledge to assist\nin the elicitation, validation, and management of system requirements. The\nadvent of large language models (LLMs) opens new opportunities in the field of\nKBRE. This work experimentally investigates the potential of LLMs in\nrequirements verification. Therein, LLMs are provided with a set of\nrequirements and a textual system specification and are prompted to assess\nwhich requirements are fulfilled by the system specification. Different\nexperimental variables such as system specification complexity, the number of\nrequirements, and prompting strategies were analyzed. Formal rule-based systems\nserve as a benchmark to compare LLM performance to. Requirements and system\nspecifications are derived from the smart-grid domain. Results show that\nadvanced LLMs, like GPT-4o and Claude 3.5 Sonnet, achieved f1-scores between 79\n% and 94 % in identifying non-fulfilled requirements, indicating potential for\nLLMs to be leveraged for requirements verification.\n","authors":["Lasse M. Reinpold","Marvin Schieseck","Lukas P. 
Wagner","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2411.11582v1.pdf","comment":"Submitted to 3rd IEEE Industrial Electronics Society Annual Online\n Conference (ONCON)"},{"id":"http://arxiv.org/abs/2411.11574v1","updated":"2024-11-18T13:50:31Z","published":"2024-11-18T13:50:31Z","title":"Reduced Network Cumulative Constraint Violation for Distributed Bandit\n Convex Optimization under Slater Condition","summary":" This paper studies the distributed bandit convex optimization problem with\ntime-varying inequality constraints, where the goal is to minimize network\nregret and cumulative constraint violation. To calculate network cumulative\nconstraint violation, existing distributed bandit online algorithms solving\nthis problem directly use the clipped constraint function to replace its\noriginal constraint function. However, the use of the clipping operation\nrenders Slater condition (i.e, there exists a point that strictly satisfies the\ninequality constraints at all iterations) ineffective to achieve reduced\nnetwork cumulative constraint violation. To tackle this challenge, we propose a\nnew distributed bandit online primal-dual algorithm. If local loss functions\nare convex, we show that the proposed algorithm establishes sublinear network\nregret and cumulative constraint violation bounds. When Slater condition holds,\nthe network cumulative constraint violation bound is reduced. In addition, if\nlocal loss functions are strongly convex, for the case where strongly convex\nparameters are unknown, the network regret bound is reduced. For the case where\nstrongly convex parameters are known, the network regret and cumulative\nconstraint violation bounds are further reduced. To the best of our knowledge,\nthis paper is among the first to establish reduced (network) cumulative\nconstraint violation bounds for (distributed) bandit convex optimization with\ntime-varying constraints under Slater condition. 
Finally, a numerical example\nis provided to verify the theoretical results.\n","authors":["Kunpeng Zhang","Xinlei Yi","Jinliang Ding","Ming Cao","Karl H. Johansson","Tao Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11574v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.14060,\n arXiv:2306.00149"},{"id":"http://arxiv.org/abs/2409.07666v2","updated":"2024-11-18T13:08:26Z","published":"2024-09-11T23:47:11Z","title":"Design of Distributed Controller for Discrete-Time Systems Via the\n Integration of Extended LMI and Clique-Wise Decomposition","summary":" This study addresses a design of distributed controllers for discrete-time\nsystems using linear matrix inequalities (LMIs). Sparsity constraints on\ncontrol gains of distributed controllers result in conservatism via the\nconvexification of the existing methods such as the extended LMI method. In\norder to mitigate the conservatism, we introduce a novel LMI formulation for\nthis problem, utilizing the clique-wise decomposition method from our previous\nwork on continuous-time systems. By reformulating the sparsity constraint on\nthe gain matrix within cliques, this method achieves a broader solution set.\nAlso, the analytical superiority of our method is confirmed through numerical\nexamples.\n","authors":["Sotaro Fushimi","Yuto Watanabe","Kazunori Sakurama"],"pdf_url":"https://arxiv.org/pdf/2409.07666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11549v1","updated":"2024-11-18T13:06:32Z","published":"2024-11-18T13:06:32Z","title":"Sound Value Iteration for Simple Stochastic Games","summary":" Algorithmic analysis of Markov decision processes (MDP) and stochastic games\n(SG) in practice relies on value-iteration (VI) algorithms. Since the basic\nversion of VI does not provide guarantees on the precision of the result,\nvariants of VI have been proposed that offer such guarantees. 
In particular,\nsound value iteration (SVI) not only provides precise lower and upper bounds on\nthe result, but also converges faster in the presence of probabilistic cycles.\nUnfortunately, it is neither applicable to SG, nor to MDP with end components.\nIn this paper, we extend SVI and cover both cases. The technical challenge\nconsists mainly in proper treatment of end components, which require different\nhandling than in the literature. Moreover, we provide several optimizations of\nSVI. Finally, we also evaluate our prototype implementation experimentally to\nconfirm its advantages on systems with probabilistic cycles.\n","authors":["Muqsit Azeem","Jan Kretinsky","Maximilian Weininger"],"pdf_url":"https://arxiv.org/pdf/2411.11549v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2411.11542v1","updated":"2024-11-18T13:00:20Z","published":"2024-11-18T13:00:20Z","title":"Data-Driven Structured Robust Control of Linear Systems","summary":" Static structured control refers to the task of designing a state-feedback\ncontroller such that the control gain satisfies a subspace constraint.\nStructured control has applications in control of communication-inhibited\ndynamical systems, such as systems in networked environments. This work\nperforms $H_2$-suboptimal regulation under a common structured state-feedback\ncontroller for a class of data-consistent plants. The certification of\n$H_2$-performance is attained through a combination of standard $H_2$ LMIs,\nconvex sufficient conditions for structured control, and a matrix S-lemma for\nset-membership. The resulting convex optimization problems are linear matrix\ninequalities whose size scales independently of the number of data samples\ncollected. Data-driven structured $H_2$-regulation control is demonstrated on\nexample systems.\n","authors":["Jared Miller","Jaap Eising","Florian Dörfler","Roy S. 
Smith"],"pdf_url":"https://arxiv.org/pdf/2411.11542v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2411.11510v1","updated":"2024-11-18T12:15:16Z","published":"2024-11-18T12:15:16Z","title":"Closed-loop multi-step planning with innate physics knowledge","summary":" We present a hierarchical framework to solve robot planning as an input\ncontrol problem. At the lowest level are temporary closed control loops,\n(\"tasks\"), each representing a behaviour, contingent on a specific sensory\ninput and therefore temporary. At the highest level, a supervising\n\"Configurator\" directs task creation and termination. Here resides \"core\"\nknowledge as a physics engine, where sequences of tasks can be simulated. The\nConfigurator encodes and interprets simulation results,based on which it can\nchoose a sequence of tasks as a plan. We implement this framework on a real\nrobot and test it in an overtaking scenario as proof-of-concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2411.11510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03810v2","updated":"2024-11-18T10:39:10Z","published":"2024-08-07T14:37:03Z","title":"Improved Tangential Interpolation-based Multi-input Multi-output Modal\n Analysis of a Full Aircraft","summary":" In the field of Structural Dynamics, modal analysis is the foundation of\nSystem Identification and vibration-based inspection. However, despite their\nwidespread use, current state-of-the-art methods for extracting modal\nparameters from multi-input multi-output (MIMO) frequency domain data are still\naffected by many technical limitations. Mainly, they can be computationally\ncumbersome and/or negatively affected by close-in-frequency modes. The Loewner\nFramework (LF) was recently proposed to alleviate these problems with the\nlimitation of working with single-input data only. 
This work proposes a\ncomputationally improved version of the LF, or iLF, to extract modal parameters\nmore efficiently. Also, the proposed implementation is extended in order to\nhandle MIMO data in the frequency domain. This new implementation is compared\nto state-of-the-art methods such as the frequency domain implementations of the\nLeast Square Complex Exponential method and the Numerical Algorithm for\nSubspace State Space System Identification on numerical and experimental\ndatasets. More specifically, a finite element model of a 3D Euler-Bernoulli\nbeam is used for the baseline comparison and the noise robustness verification\nof the proposed MIMO iLF algorithm. Then, an experimental dataset from MIMO\nground vibration tests of a trainer jet aircraft with over 91 accelerometer\nchannels is chosen for the algorithm validation on a real-life application. Its\nvalidation is carried out with known results from a single-input multi-output\ndataset of the starboard wing of the same aircraft. Excellent results are\nachieved in terms of accuracy, robustness to noise, and computational\nperformance by the proposed improved MIMO method, both on the numerical and the\nexperimental datasets. The MIMO iLF MATLAB implementation is shared in the work\nsupplementary material.\n","authors":["Gabriele Dessena","Marco Civera"],"pdf_url":"https://arxiv.org/pdf/2408.03810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11411v1","updated":"2024-11-18T09:30:58Z","published":"2024-11-18T09:30:58Z","title":"Distributed Learning with Partial Information Sharing","summary":" This work studies the distributed learning process on a network of agents.\nAgents make partial observation about an unknown hypothesis and iteratively\nshare their beliefs over a set of possible hypotheses with their neighbors to\nlearn the true hypothesis. We present and analyze a distributed learning\nalgorithm in which agents share belief on only one randomly chosen hypothesis\nat a time. 
Agents estimate the beliefs on missed hypotheses using previously\nshared beliefs. We show that agents learn the true hypothesis almost surely\nunder standard network connectivity and observation model assumptions if belief\non each hypothesis is shared with positive probability at every time. We also\npresent a memory-efficient variant of the learning algorithm with partial\nbelief sharing and present simulation results to compare rate of convergence of\nfull and partial information sharing algorithms.\n","authors":["P Raghavendra Rao","Pooja Vyavahare"],"pdf_url":"https://arxiv.org/pdf/2411.11411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11310v1","updated":"2024-11-18T06:10:53Z","published":"2024-11-18T06:10:53Z","title":"Towards Mitigating Sim2Real Gaps: A Formal Quantitative Approach","summary":" In this paper, we introduce the notion of simulation-gap functions to\nformally quantify the potential gap between an approximate nominal mathematical\nmodel and the high-fidelity simulator representation of a real system. Given a\nnominal mathematical model alongside a quantified simulation gap, the system\ncan be conceptualized as one characterized by bounded states and\ninput-dependent disturbances. This allows us to leverage the existing powerful\nmodel-based control algorithms effectively, ensuring the enforcement of desired\nspecifications while guaranteeing a seamless transition from simulation to\nreal-world application. To provide a formal guarantee for quantifying the\nsimulation gap, we develop a data-driven approach. In particular, we collect\ndata using high-fidelity simulators, leveraging recent advancements in\nReal-to-Sim transfer to ensure close alignment with reality. 
We demonstrate the\neffectiveness of the proposed method through experiments conducted on a\nnonlinear pendulum system and a nonlinear Turtlebot model in simulators.\n","authors":["P Sangeerth","Abolfazl Lavaei","Pushpak Jagtap"],"pdf_url":"https://arxiv.org/pdf/2411.11310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11230v1","updated":"2024-11-18T01:48:23Z","published":"2024-11-18T01:48:23Z","title":"Network-Security Informed Offer-Making of Aggregator with Utility-Owned\n Storage Lease Opportunity: Stochastic Stackelberg Game and Distributed\n Solution Methods","summary":" Aggregators of distributed energy resources are increasingly encouraged to\nparticipate in wholesale market bidding. However, the delivery of the power\nthey are awarded can result in over-voltage or congestion issues within the\ndistribution network (DN). The opportunity to lease energy storage from the\nutility that manages the DN provides the aggregator with a means to mitigate\nthese issues, while also benefiting the utility in terms of additional lease\nrevenue. Nevertheless, this leasing opportunity considerably complicates the\naggregator's offer-making process, as it requires the consideration of market\nuncertainties, uncertain power injection at DN buses, and the strategic\ninteractions between the aggregator and the utility. This paper presents a\nstochastic Stackelberg game model that effectively captures the interactions\nbetween the aggregator and the utility, ensuring DN security across all\npotential uncertainty scenarios. Furthermore, in light of the privacy concerns\nof both the aggregator and the utility, two distributed solution methods are\nproposed. The first method follows a traditional predict-then-optimize\nframework and has been validated to achieve the game equilibrium. The second\nmethod employs an end-to-end framework, which has been empirically shown to\nyield superior economic results. 
Case studies conducted on 69 and 533-bus DNs\nillustrate the efficacy of the proposed methods.\n","authors":["Congcong Liu","Zhengshuo Li"],"pdf_url":"https://arxiv.org/pdf/2411.11230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11221v1","updated":"2024-11-18T01:18:18Z","published":"2024-11-18T01:18:18Z","title":"Data Driven Automatic Electrical Machine Preliminary Design with\n Artificial Intelligence Expert Guidance","summary":" This paper presents a data-driven electrical machine design (EMD) framework\nusing wound-rotor synchronous generator (WRSG) as a design example. Unlike\ntraditional preliminary EMD processes that heavily rely on expertise, this\nframework leverages an artificial-intelligence based expert database, to\nprovide preliminary designs directly from user specifications. Initial data is\ngenerated using 2D finite element (FE) machine models by sweeping fundamental\ndesign variables including machine length and diameter, enabling scalable\nmachine geometry with machine performance for each design is recorded. This\ndata trains a Metamodel of Optimal Prognosis (MOP)-based surrogate model, which\nmaps design variables to key performance indicators (KPIs). Once trained,\nguided by metaheuristic algorithms, the surrogate model can generate thousands\nof geometric scalable designs, covering a wide power range, forming an AI\nexpert database to guide future preliminary design. The framework is validated\nwith a 30kVA WRSG design case. A prebuilt WRSG database, covering power from 10\nto 60kVA, is validated by FE simulation. Design No.1138 is selected from\ndatabase and compared with conventional design. Results show No.1138 achieves a\nhigher power density of 2.21 kVA/kg in just 5 seconds, compared to 2.02 kVA/kg\nobtained using traditional method, which take several days. 
The developed AI\nexpert database also serves as a high-quality data source for further\ndeveloping AI models for automatic electrical machine design.\n","authors":["Yiwei Wang","Tao Yang","Hailin Huang","Tianjie Zou","Jincai Li","Nuo Chen","Zhuoran Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11218v1","updated":"2024-11-18T01:06:59Z","published":"2024-11-18T01:06:59Z","title":"Conjugate Momentum-Based Estimation of External Forces for Bio-Inspired\n Morphing Wing Flight","summary":" Dynamic morphing wing flights present significant challenges in accurately\nestimating external forces due to complex interactions between aerodynamics,\nrapid wing movements, and external disturbances. Traditional force estimation\nmethods often struggle with unpredictable disturbances like wind gusts or\nunmodeled impacts that can destabilize flight in real-world scenarios. This\npaper addresses these challenges by implementing a Conjugate Momentum-based\nObserver, which effectively estimates and manages unknown external forces\nacting on the Aerobat, a bio-inspired robotic platform with dynamically\nmorphing wings. Through simulations, the observer demonstrates its capability\nto accurately detect and quantify external forces, even in the presence of\nGaussian noise and abrupt impulse inputs. The results validate the robustness\nof the method, showing improved stability and control of the Aerobat in dynamic\nenvironments. 
This research contributes to advancements in bio-inspired\nrobotics by enhancing force estimation for flapping-wing systems, with\npotential applications in autonomous aerial navigation and robust flight\ncontrol.\n","authors":["Bibek Gupta","Eric Sihite","Alireza Ramezani"],"pdf_url":"https://arxiv.org/pdf/2411.11218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11216v1","updated":"2024-11-18T01:01:34Z","published":"2024-11-18T01:01:34Z","title":"Optimization free control and ground force estimation with momentum\n observer for a multimodal legged aerial robot","summary":" Legged-aerial multimodal robots can make the most of both legged and aerial\nsystems. In this paper, we propose a control framework that bypasses heavy\nonboard computers by using an optimization-free Explicit Reference Governor\nthat incorporates external thruster forces from an attitude controller. Ground\nreaction forces are maintained within friction cone constraints using costly\noptimization solvers, but the ERG framework filters applied velocity references\nthat ensure no slippage at the foot end. 
We also propose a Conjugate momentum\nobserver, that is widely used in Disturbance Observation to estimate ground\nreaction forces and compare its efficacy against a constrained model in\nestimating ground reaction forces in a reduced-order simulation of Husky.\n","authors":["Kaushik Venkatesh Krishnamurthy","Chenghao Wang","Shreyansh Pitroda","Eric Sihite","Alireza Ramezani","Morteza Gharib"],"pdf_url":"https://arxiv.org/pdf/2411.11216v1.pdf","comment":"6 pages, 10 figures, submitted to American Control Conference 2025"},{"id":"http://arxiv.org/abs/2411.12104v1","updated":"2024-11-18T22:32:06Z","published":"2024-11-18T22:32:06Z","title":"Is Locational Marginal Price All You Need for Locational Marginal\n Emission?","summary":" Growing concerns over climate change call for improved techniques for\nestimating and quantifying the greenhouse gas emissions associated with\nelectricity generation and transmission. Among the emission metrics designated\nfor power grids, locational marginal emission (LME) can provide system\noperators and electricity market participants with valuable information on the\nemissions associated with electricity usage at various locations in the power\nnetwork. In this paper, by investigating the operating patterns and physical\ninterpretations of marginal emissions and costs in the security-constrained\neconomic dispatch (SCED) problem, we identify and draw the exact connection\nbetween locational marginal price (LMP) and LME. Such interpretation helps\ninstantly derive LME given nodal demand vectors or LMP, and also reveals the\ninterplay between network congestion and nodal emission pattern. Our proposed\napproach helps reduce the computation time of LME by an order of magnitude\ncompared to analytical approaches, while it can also serve as a plug-and-play\nmodule accompanied by an off-the-shelf market clearing and LMP calculation\nprocess.\n","authors":["Xuan He","Danny H. K. 
Tsang","Yize Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12104v1.pdf","comment":"8 pages, 5 figures, in submission"},{"id":"http://arxiv.org/abs/2411.12100v1","updated":"2024-11-18T22:12:26Z","published":"2024-11-18T22:12:26Z","title":"Stability and Performance Analysis on Self-dual Cones","summary":" In this paper, we consider nonsymmetric solutions to certain Lyapunov and\nRiccati equations and inequalities with coefficient matrices corresponding to\ncone-preserving dynamical systems. Most results presented here appear to be\nnovel even in the special case of positive systems. First, we provide a simple\neigenvalue criterion for a Sylvester equation to admit a cone-preserving\nsolution. For a single system preserving a self-dual cone, this reduces to\nstability. Further, we provide a set of conditions equivalent to testing a\ngiven H-infinity norm bound, as in the bounded real lemma. These feature the\nstability of a coefficient matrix similar to the Hamiltonian, a solution to two\nconic inequalities, and a stabilizing cone-preserving solution to a\nnonsymmetric Riccati equation. Finally, we show that the H-infinity norm is\nattained at zero frequency.\n","authors":["Emil Vladu"],"pdf_url":"https://arxiv.org/pdf/2411.12100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15384v2","updated":"2024-11-18T20:54:06Z","published":"2024-02-23T15:30:57Z","title":"Homeostatic motion planning with innate physics knowledge","summary":" Living organisms interact with their surroundings in a closed-loop fashion,\nwhere sensory inputs dictate the initiation and termination of behaviours. Even\nsimple animals are able to develop and execute complex plans, which has not yet\nbeen replicated in robotics using pure closed-loop input control. We propose a\nsolution to this problem by defining a set of discrete and temporary\nclosed-loop controllers, called \"tasks\", each representing a closed-loop\nbehaviour. 
We further introduce a supervisory module which has an innate\nunderstanding of physics and causality, through which it can simulate the\nexecution of task sequences over time and store the results in a model of the\nenvironment. On the basis of this model, plans can be made by chaining\ntemporary closed-loop controllers. The proposed framework was implemented for a\nreal robot and tested in two scenarios as proof of concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2402.15384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00157v2","updated":"2024-11-18T19:58:46Z","published":"2024-04-30T19:13:04Z","title":"Information-Theoretic Opacity-Enforcement in Markov Decision Processes","summary":" The paper studies information-theoretic opacity, an information-flow privacy\nproperty, in a setting involving two agents: A planning agent who controls a\nstochastic system and an observer who partially observes the system states. The\ngoal of the observer is to infer some secret, represented by a random variable,\nfrom its partial observations, while the goal of the planning agent is to make\nthe secret maximally opaque to the observer while achieving a satisfactory\ntotal return. Modeling the stochastic system using a Markov decision process,\ntwo classes of opacity properties are considered -- Last-state opacity is to\nensure that the observer is uncertain if the last state is in a specific set\nand initial-state opacity is to ensure that the observer is unsure of the\nrealization of the initial state. As the measure of opacity, we employ the\nShannon conditional entropy capturing the information about the secret revealed\nby the observable. Then, we develop primal-dual policy gradient methods for\nopacity-enforcement planning subject to constraints on total returns. 
We\npropose novel algorithms to compute the policy gradient of entropy for each\nobservation, leveraging message passing within the hidden Markov models. This\ngradient computation enables us to have stable and fast convergence. We\ndemonstrate our solution of opacity-enforcement control through a grid world\nexample.\n","authors":["Chongyang Shi","Yuheng Bu","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2405.00157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12014v1","updated":"2024-11-18T19:55:43Z","published":"2024-11-18T19:55:43Z","title":"On-the-Go Path Planning and Repair in Static and Dynamic Scenarios","summary":" Autonomous systems, including robots and drones, face significant challenges\nwhen navigating through dynamic environments, particularly within urban\nsettings where obstacles, fluctuating traffic, and pedestrian activity are\nconstantly shifting. Although, traditional motion planning algorithms like the\nwavefront planner and gradient descent planner, which use potential functions,\nwork well in static environments, they fall short in situations where the\nenvironment is continuously changing. This work proposes a dynamic, real-time\npath planning approach specifically designed for autonomous systems, allowing\nthem to effectively avoid static and dynamic obstacles, thereby enhancing their\noverall adaptability. The approach integrates the efficiency of conventional\nplanners with the ability to make rapid adjustments in response to moving\nobstacles and environmental changes. 
The simulation results discussed in this\narticle demonstrate the effectiveness of the proposed method, demonstrating its\nsuitability for robotic path planning in both known and unknown environments,\nincluding those involving mobile objects, agents, or potential threats.\n","authors":["Daniel Ajeleye"],"pdf_url":"https://arxiv.org/pdf/2411.12014v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2411.12006v1","updated":"2024-11-18T19:42:26Z","published":"2024-11-18T19:42:26Z","title":"A Robust Solver for Phasor-Domain Short-Circuit Analysis with\n Inverter-Based Resources","summary":" The integration of Inverter-Based Resource (IBR) model into phasor-domain\nshort circuit (SC) solvers challenges their numerical stability. To address the\nchallenge, this paper proposes a solver that improves numerical stability by\nemploying the Newton-Raphson iterative method. The solver can integrate the\nlatest implementation of IBR SC model in industry-standard fault analysis\nprograms including the voltage controlled current source tabular model as well\nas vendor-specific black-box and white-box equation-based models. The superior\nnumerical stability of the proposed solver has been mathematically\ndemonstrated, with identified convergence conditions. An algorithm for the\nimplementation of the proposed solver in fault analysis programs has been\ndeveloped. 
The objective is to improve the capability of the industry to\naccurately represent IBRs in SC studies and ensure system protection\nreliability in an IBR-dominated future.\n","authors":["Aboutaleb Haddadi","Evangelos Farantatos","Ilhan Kocar"],"pdf_url":"https://arxiv.org/pdf/2411.12006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11998v1","updated":"2024-11-18T19:32:28Z","published":"2024-11-18T19:32:28Z","title":"Uncertainty Propagation and Minimization for Channel Estimation in\n UAV-mounted RIS Systems","summary":" Reconfigurable Intelligent Surfaces (RIS) are emerging as a key technology\nfor sixth-generation (6G) wireless networks, leveraging adjustable reflecting\nelements to dynamically control electromagnetic wave propagation and optimize\nwireless connectivity. By positioning the RIS on an unmanned aerial vehicle\n(UAV), it can maintain line-of-sight and proximity to both the transmitter and\nreceiver, critical factors that mitigate path loss and enhance signal strength.\nThe lightweight, power-efficient nature of RIS makes UAV integration feasible,\nyet the setup faces significant disturbances from UAV motion, which can degrade\nRIS alignment and link performance. In this study, we address these challenges\nusing both experimental measurements and analytical methods. Using an extended\nKalman filter (EKF), we estimate the UAV's orientation in real time during\nexperimental flights to capture real disturbance effects. The resulting\norientation uncertainty is then propagated to the RIS's channel estimates by\napplying the Guide to the Expression of Uncertainty in Measurement (GUM)\nframework as well as complex-valued propagation techniques to accurately assess\nand minimize the impact of UAV orientation uncertainties on RIS performance.\nThis method enables us to systematically trace and quantify how orientation\nuncertainties affect channel gain and phase stability in real-time. 
Through\nnumerical simulations, we find that the uncertainty of the RIS channel link is\ninfluenced by the RIS's configuration. Furthermore, our results demonstrate\nthat the uncertainty area is most accurately represented by an annular section,\nenabling a 58% reduction in the uncertainty area while maintaining a 95%\ncoverage probability.\n","authors":["Kevin Weinberger","David Müller","Martin Mönnigmann","Aydin Sezgin"],"pdf_url":"https://arxiv.org/pdf/2411.11998v1.pdf","comment":"6 pages, 3 figures, submitted to IEEE International Conference on\n Communications 2025"},{"id":"http://arxiv.org/abs/2411.11980v1","updated":"2024-11-18T19:10:49Z","published":"2024-11-18T19:10:49Z","title":"Transmission Line Outage Probability Prediction Under Extreme Events\n Using Peter-Clark Bayesian Structural Learning","summary":" Recent years have seen a notable increase in the frequency and intensity of\nextreme weather events. With a rising number of power outages caused by these\nevents, accurate prediction of power line outages is essential for safe and\nreliable operation of power grids. The Bayesian network is a probabilistic\nmodel that is very effective for predicting line outages under weather-related\nuncertainties. However, most existing studies in this area offer general risk\nassessments, but fall short of providing specific outage probabilities. In this\nwork, we introduce a novel approach for predicting transmission line outage\nprobabilities using a Bayesian network combined with Peter-Clark (PC)\nstructural learning. Our approach not only enables precise outage probability\ncalculations, but also demonstrates better scalability and robust performance,\neven with limited data. 
Case studies using data from BPA and NOAA show the\neffectiveness of this approach, while comparisons with several existing methods\nfurther highlight its advantages.\n","authors":["Xiaolin Chen","Qiuhua Huang","Yuqi Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.11980v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.11838v1","updated":"2024-11-18T18:56:46Z","published":"2024-11-18T18:56:46Z","title":"Pairwise Markov Chains for Volatility Forecasting","summary":" The Pairwise Markov Chain (PMC) is a probabilistic graphical model extending\nthe well-known Hidden Markov Model. This model, although highly effective for\nmany tasks, has been scarcely utilized for continuous value prediction. This is\nmainly due to the issue of modeling observations inherent in generative\nprobabilistic models. In this paper, we introduce a new algorithm for\nprediction with the PMC. On the one hand, this algorithm allows circumventing\nthe feature problem, thus fully exploiting the capabilities of the PMC. On the\nother hand, it enables the PMC to extend any predictive model by introducing\nhidden states, updated at each time step, and allowing the introduction of\nnon-stationarity for any model. We apply the PMC with its new algorithm for\nvolatility forecasting, which we compare to the highly popular GARCH(1,1) and\nfeedforward neural models across numerous pairs. This is particularly relevant\ngiven the regime changes that we can observe in volatility. 
For each scenario,\nour algorithm enhances the performance of the extended model, demonstrating the\nvalue of our approach.\n","authors":["Elie Azeraf"],"pdf_url":"https://arxiv.org/pdf/2411.11838v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.07681v2","updated":"2024-11-18T18:49:59Z","published":"2024-11-12T09:52:40Z","title":"What Do Learning Dynamics Reveal About Generalization in LLM Reasoning?","summary":" Despite the remarkable capabilities of modern large language models (LLMs),\nthe mechanisms behind their problem-solving abilities remain elusive. In this\nwork, we aim to better understand how the learning dynamics of LLM finetuning\nshapes downstream generalization. Our analysis focuses on reasoning tasks,\nwhose problem structure allows us to distinguish between memorization (the\nexact replication of reasoning steps from the training data) and performance\n(the correctness of the final solution). We find that a model's generalization\nbehavior can be effectively characterized by a training metric we call\npre-memorization train accuracy: the accuracy of model samples on training\nqueries before they begin to copy the exact reasoning steps from the training\nset. On the dataset level, this metric is able to reliably predict test\naccuracy, achieving $R^2$ of around or exceeding 0.9 across various models\n(Llama3 8, Gemma2 9B), datasets (GSM8k, MATH), and training configurations. On\na per-example level, this metric is also indicative of whether individual model\npredictions are robust to perturbations in the training query. By connecting a\nmodel's learning behavior to its generalization, pre-memorization train\naccuracy can guide targeted improvements to training strategies. We focus on\ndata curation as an example, and show that prioritizing examples with low\npre-memorization accuracy leads to 1.5-2x improvements in data efficiency\ncompared to i.i.d. 
data scaling, and outperforms other standard data curation\ntechniques.\n","authors":["Katie Kang","Amrith Setlur","Dibya Ghosh","Jacob Steinhardt","Claire Tomlin","Sergey Levine","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11829v1","updated":"2024-11-18T18:48:13Z","published":"2024-11-18T18:48:13Z","title":"Tackling prediction tasks in relational databases with LLMs","summary":" Though large language models (LLMs) have demonstrated exceptional performance\nacross numerous problems, their application to predictive tasks in relational\ndatabases remains largely unexplored. In this work, we address the notion that\nLLMs cannot yield satisfactory results on relational databases due to their\ninterconnected tables, complex relationships, and heterogeneous data types.\nUsing the recently introduced RelBench benchmark, we demonstrate that even a\nstraightforward application of LLMs achieves competitive performance on these\ntasks. These findings establish LLMs as a promising new baseline for ML on\nrelational databases and encourage further research in this direction.\n","authors":["Marek Wydmuch","Łukasz Borchmann","Filip Graliński"],"pdf_url":"https://arxiv.org/pdf/2411.11829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v2","updated":"2024-11-18T18:35:06Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies have deployed watermark-based detection to identify\nAI-generated content. However, attribution--the ability to trace back to the\nuser of a generative AI (GenAI) service who created a given piece of\nAI-generated content--remains largely unexplored despite its growing\nimportance. In this work, we aim to bridge this gap by conducting the first\nsystematic study on watermark-based, user-level attribution of AI-generated\ncontent. 
Our key idea is to assign a unique watermark to each user of the GenAI\nservice and embed this watermark into the AI-generated content created by that\nuser. Attribution is then performed by identifying the user whose watermark\nbest matches the one extracted from the given content. This approach, however,\nfaces a key challenge: How should watermarks be selected for users to maximize\nattribution performance? To address the challenge, we first theoretically\nderive lower bounds on detection and attribution performance through rigorous\nprobabilistic analysis for any given set of user watermarks. Then, we select\nwatermarks for users to maximize these lower bounds, thereby optimizing\ndetection and attribution performance. Our theoretical and empirical results\nshow that watermark-based attribution inherits both the accuracy and\n(non-)robustness properties of the underlying watermark. Specifically,\nattribution remains highly accurate when the watermarked AI-generated content\nis either not post-processed or subjected to common post-processing such as\nJPEG compression, as well as black-box adversarial post-processing with limited\nquery budgets.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11801v1","updated":"2024-11-18T18:14:51Z","published":"2024-11-18T18:14:51Z","title":"KAN/MultKAN with Physics-Informed Spline fitting (KAN-PISF) for\n ordinary/partial differential equation discovery of nonlinear dynamic systems","summary":" Machine learning for scientific discovery is increasingly becoming popular\nbecause of its ability to extract and recognize the nonlinear characteristics\nfrom the data. The black-box nature of deep learning methods poses difficulties\nin interpreting the identified model. 
There is a dire need to interpret the\nmachine learning models to develop a physical understanding of dynamic systems.\nAn interpretable form of neural network called Kolmogorov-Arnold networks (KAN)\nor Multiplicative KAN (MultKAN) offers critical features that help recognize\nthe nonlinearities in the governing ordinary/partial differential equations\n(ODE/PDE) of various dynamic systems and find their equation structures. In\nthis study, an equation discovery framework is proposed that includes i)\nsequentially regularized derivatives for denoising (SRDD) algorithm to denoise\nthe measure data to obtain accurate derivatives, ii) KAN to identify the\nequation structure and suggest relevant nonlinear functions that are used to\ncreate a small overcomplete library of functions, and iii) physics-informed\nspline fitting (PISF) algorithm to filter the excess functions from the library\nand converge to the correct equation. The framework was tested on the forced\nDuffing oscillator, Van der Pol oscillator (stiff ODE), Burger's equation, and\nBouc-Wen model (coupled ODE). The proposed method converged to the true\nequation for the first three systems. It provided an approximate model for the\nBouc-Wen model that could acceptably capture the hysteresis response. Using KAN\nmaintains low complexity, which helps the user interpret the results throughout\nthe process and avoid the black-box-type nature of machine learning methods.\n","authors":["Ashish Pal","Satish Nagarajaiah"],"pdf_url":"https://arxiv.org/pdf/2411.11801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11794v1","updated":"2024-11-18T18:08:05Z","published":"2024-11-18T18:08:05Z","title":"Competing Bandits in Decentralized Large Contextual Matching Markets","summary":" Sequential learning in a multi-agent resource constrained matching market has\nreceived significant interest in the past few years. 
We study decentralized\nlearning in two-sided matching markets where the demand side (aka players or\nagents) competes for a `large' supply side (aka arms) with potentially\ntime-varying preferences, to obtain a stable match. Despite a long line of work\nin the recent past, existing learning algorithms such as Explore-Then-Commit or\nUpper-Confidence-Bound remain inefficient for this problem. In particular, the\nper-agent regret achieved by these algorithms scales linearly with the number\nof arms, $K$. Motivated by the linear contextual bandit framework, we assume\nthat for each agent an arm-mean can be represented by a linear function of a\nknown feature vector and an unknown (agent-specific) parameter.\n Moreover, our setup captures the essence of a dynamic (non-stationary)\nmatching market where the preferences over arms change over time. Our proposed\nalgorithms achieve instance-dependent logarithmic regret, scaling independently\nof the number of arms, $K$.\n","authors":["Satush Parikh","Soumya Basu","Avishek Ghosh","Abishek Sankararaman"],"pdf_url":"https://arxiv.org/pdf/2411.11794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11793v1","updated":"2024-11-18T18:06:44Z","published":"2024-11-18T18:06:44Z","title":"A Potential Game Perspective in Federated Learning","summary":" Federated learning (FL) is an emerging paradigm for training machine learning\nmodels across distributed clients. Traditionally, in FL settings, a central\nserver assigns training efforts (or strategies) to clients. However, from a\nmarket-oriented perspective, clients may independently choose their training\nefforts based on rational self-interest. To explore this, we propose a\npotential game framework where each client's payoff is determined by their\nindividual efforts and the rewards provided by the server. The rewards are\ninfluenced by the collective efforts of all clients and can be modulated\nthrough a reward factor. 
Our study begins by establishing the existence of Nash\nequilibria (NEs), followed by an investigation of uniqueness in homogeneous\nsettings. We demonstrate a significant improvement in clients' training efforts\nat a critical reward factor, identifying it as the optimal choice for the\nserver. Furthermore, we prove the convergence of the best-response algorithm to\ncompute NEs for our FL game. Finally, we apply the training efforts derived\nfrom specific NEs to a real-world FL scenario, validating the effectiveness of\nthe identified optimal reward factor.\n","authors":["Kang Liu","Ziqi Wang","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2411.11793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11786v1","updated":"2024-11-18T18:01:13Z","published":"2024-11-18T18:01:13Z","title":"Parallelly Tempered Generative Adversarial Networks","summary":" A generative adversarial network (GAN) has been a representative backbone\nmodel in generative artificial intelligence (AI) because of its powerful\nperformance in capturing intricate data-generating processes. However, the GAN\ntraining is well-known for its notorious training instability, usually\ncharacterized by the occurrence of mode collapse. Through the lens of\ngradients' variance, this work particularly analyzes the training instability\nand inefficiency in the presence of mode collapse by linking it to\nmultimodality in the target distribution. To ease the raised training issues\nfrom severe multimodality, we introduce a novel GAN training framework that\nleverages a series of tempered distributions produced via convex interpolation.\nWith our newly developed GAN objective function, the generator can learn all\nthe tempered distributions simultaneously, conceptually resonating with the\nparallel tempering in Statistics. Our simulation studies demonstrate the\nsuperiority of our approach over existing popular training strategies in both\nimage and tabular data synthesis. 
We theoretically analyze that such\nsignificant improvement can arise from reducing the variance of gradient\nestimates by using the tempered distributions. Finally, we further develop a\nvariant of the proposed framework aimed at generating fair synthetic data which\nis one of the growing interests in the field of trustworthy AI.\n","authors":["Jinwon Sohn","Qifan Song"],"pdf_url":"https://arxiv.org/pdf/2411.11786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21343v2","updated":"2024-11-18T17:59:10Z","published":"2024-07-31T05:17:31Z","title":"MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation\n Framework","summary":" Medical imaging segmentation is a highly active area of research, with deep\nlearning-based methods achieving state-of-the-art results in several\nbenchmarks. However, the lack of standardized tools for training, testing, and\nevaluating new methods makes the comparison of methods difficult. To address\nthis, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple,\nmodular, and end-to-end medical imaging segmentation framework designed to\nfacilitate consistent training, testing, and evaluation of deep learning-based\nmedical imaging segmentation methods. MIST standardizes data analysis,\npreprocessing, and evaluation pipelines, accommodating multiple architectures\nand loss functions. This standardization ensures reproducible and fair\ncomparisons across different methods. We detail MIST's data format\nrequirements, pipelines, and auxiliary features and demonstrate its efficacy\nusing the BraTS Adult Glioma Post-Treatment Challenge dataset. 
Our results\nhighlight MIST's ability to produce accurate segmentation masks and its\nscalability across multiple GPUs, showcasing its potential as a powerful tool\nfor future medical imaging research and development.\n","authors":["Adrian Celaya","Evan Lim","Rachel Glenn","Brayden Mi","Alex Balsells","Dawid Schellingerhout","Tucker Netherton","Caroline Chung","Beatrice Riviere","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2407.21343v2.pdf","comment":"Submitted to BraTS 2024"},{"id":"http://arxiv.org/abs/2403.11093v2","updated":"2024-11-18T17:58:35Z","published":"2024-03-17T05:07:04Z","title":"Learning-Based Pricing and Matching for Two-Sided Queues","summary":" We consider a dynamic system with multiple types of customers and servers.\nEach type of waiting customer or server joins a separate queue, forming a\nbipartite graph with customer-side queues and server-side queues. The platform\ncan match the servers and customers if their types are compatible. The matched\npairs then leave the system. The platform will charge a customer a price\naccording to their type when they arrive and will pay a server a price\naccording to their type. The arrival rate of each queue is determined by the\nprice according to some unknown demand or supply functions. Our goal is to\ndesign pricing and matching algorithms to maximize the profit of the platform\nwith unknown demand and supply functions, while keeping queue lengths of both\ncustomers and servers below a predetermined threshold. This system can be used\nto model two-sided markets such as ride-sharing markets with passengers and\ndrivers. The difficulties of the problem include simultaneous learning and\ndecision making, and the tradeoff between maximizing profit and minimizing\nqueue length. We use a longest-queue-first matching algorithm and propose a\nlearning-based pricing algorithm, which combines gradient-free stochastic\nprojected gradient ascent with bisection search. 
We prove that our proposed\nalgorithm yields a sublinear regret $\\tilde{O}(T^{5/6})$ and anytime\nqueue-length bound $\\tilde{O}(T^{1/6})$, where $T$ is the time horizon. We\nfurther establish a tradeoff between the regret bound and the queue-length\nbound: $\\tilde{O}(T^{1-\\gamma})$ versus $\\tilde{O}(T^{\\gamma})$ for $\\gamma \\in\n(0, 1/6].$\n","authors":["Zixian Yang","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2403.11093v2.pdf","comment":"60 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.11779v1","updated":"2024-11-18T17:56:13Z","published":"2024-11-18T17:56:13Z","title":"LLM-IE: A Python Package for Generative Information Extraction with\n Large Language Models","summary":" Objectives: Despite the recent adoption of large language models (LLMs) for\nbiomedical information extraction, challenges in prompt engineering and\nalgorithms persist, with no dedicated software available. To address this, we\ndeveloped LLM-IE: a Python package for building complete information extraction\npipelines. Our key innovation is an interactive LLM agent to support schema\ndefinition and prompt design.\n Materials and Methods: The LLM-IE supports named entity recognition, entity\nattribute extraction, and relation extraction tasks. We benchmarked on the i2b2\ndatasets and conducted a system evaluation.\n Results: The sentence-based prompting algorithm resulted in the best\nperformance while requiring a longer inference time. System evaluation provided\nintuitive visualization.\n Discussion: LLM-IE was designed from practical NLP experience in healthcare\nand has been adopted in internal projects. 
It should hold great value to the\nbiomedical NLP community.\n Conclusion: We developed a Python package, LLM-IE, that provides building\nblocks for robust information extraction pipeline construction.\n","authors":["Enshuo Hsu","Kirk Roberts"],"pdf_url":"https://arxiv.org/pdf/2411.11779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07534v2","updated":"2024-11-18T17:49:46Z","published":"2024-11-12T04:19:25Z","title":"Effective Virtual Reality Teleoperation of an Upper-body Humanoid with\n Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision\n Avoidance","summary":" We present an approach for retartgeting off-the-shelf Virtual Reality (VR)\ntrackers to effectively teleoperate an upper-body humanoid while ensuring\nself-collision-free motions. Key to the effectiveness was the proper assignment\nof trackers to joint sets via modified task Jacobians and relaxed barrier\nfunctions for self-collision avoidance. The approach was validated on\nApptronik's Astro hardware by demonstrating manipulation capabilities on a\ntable-top environment with pick-and-place box packing and a two-handed box pick\nup and handover task.\n","authors":["Steven Jens Jorgensen","Ravi Bhadeshiya"],"pdf_url":"https://arxiv.org/pdf/2411.07534v2.pdf","comment":"First Prize Winner of Horizons of an extended robotics reality\n Workshop at International Conference on Intelligent Robots and Systems, 2022"},{"id":"http://arxiv.org/abs/2409.03077v2","updated":"2024-11-18T17:48:59Z","published":"2024-09-04T21:05:42Z","title":"Backdoor defense, learnability and obfuscation","summary":" We introduce a formal notion of defendability against backdoors using a game\nbetween an attacker and a defender. In this game, the attacker modifies a\nfunction to behave differently on a particular input known as the \"trigger\",\nwhile behaving the same almost everywhere else. The defender then attempts to\ndetect the trigger at evaluation time. 
If the defender succeeds with high\nenough probability, then the function class is said to be defendable. The key\nconstraint on the attacker that makes defense possible is that the attacker's\nstrategy must work for a randomly-chosen trigger.\n Our definition is simple and does not explicitly mention learning, yet we\ndemonstrate that it is closely connected to learnability. In the\ncomputationally unbounded setting, we use a voting algorithm of Hanneke et al.\n(2022) to show that defendability is essentially determined by the VC dimension\nof the function class, in much the same way as PAC learnability. In the\ncomputationally bounded setting, we use a similar argument to show that\nefficient PAC learnability implies efficient defendability, but not conversely.\nOn the other hand, we use indistinguishability obfuscation to show that the\nclass of polynomial size circuits is not efficiently defendable. Finally, we\npresent polynomial size decision trees as a natural example for which defense\nis strictly easier than learning. Thus, we identify efficient defendability as\na notable intermediate concept in between efficient learnability and\nobfuscation.\n","authors":["Paul Christiano","Jacob Hilton","Victor Lecomte","Mark Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03077v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2411.11767v1","updated":"2024-11-18T17:46:32Z","published":"2024-11-18T17:46:32Z","title":"Drowning in Documents: Consequences of Scaling Reranker Inference","summary":" Rerankers, typically cross-encoders, are often used to re-score the documents\nretrieved by cheaper initial IR systems. This is because, though expensive,\nrerankers are assumed to be more effective. We challenge this assumption by\nmeasuring reranker performance for full retrieval, not just re-scoring\nfirst-stage retrieval. 
Our experiments reveal a surprising trend: the best\nexisting rerankers provide diminishing returns when scoring progressively more\ndocuments and actually degrade quality beyond a certain limit. In fact, in this\nsetting, rerankers can frequently assign high scores to documents with no\nlexical or semantic overlap with the query. We hope that our findings will spur\nfuture research to improve reranking.\n","authors":["Mathew Jacob","Erik Lindgren","Matei Zaharia","Michael Carbin","Omar Khattab","Andrew Drozdov"],"pdf_url":"https://arxiv.org/pdf/2411.11767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11764v1","updated":"2024-11-18T17:43:43Z","published":"2024-11-18T17:43:43Z","title":"Freezing of Gait Detection Using Gramian Angular Fields and Federated\n Learning from Wearable Sensors","summary":" Freezing of gait (FOG) is a debilitating symptom of Parkinson's disease (PD)\nthat impairs mobility and safety. Traditional detection methods face challenges\ndue to intra and inter-patient variability, and most systems are tested in\ncontrolled settings, limiting their real-world applicability. Addressing these\ngaps, we present FOGSense, a novel FOG detection system designed for\nuncontrolled, free-living conditions. It uses Gramian Angular Field (GAF)\ntransformations and federated deep learning to capture temporal and spatial\ngait patterns missed by traditional methods. We evaluated our FOGSense system\nusing a public PD dataset, 'tdcsfog'. FOGSense improves accuracy by 10.4% over\na single-axis accelerometer, reduces failure points compared to multi-sensor\nsystems, and demonstrates robustness to missing values. The federated\narchitecture allows personalized model adaptation and efficient smartphone\nsynchronization during off-peak hours, making it effective for long-term\nmonitoring as symptoms evolve. 
Overall, FOGSense achieves a 22.2% improvement\nin F1-score compared to state-of-the-art methods, along with enhanced\nsensitivity for FOG episode detection. Code is available:\nhttps://github.com/shovito66/FOGSense.\n","authors":["Shovito Barua Soumma","S M Raihanul Alam","Rudmila Rahman","Umme Niraj Mahi","Sayyed Mostafa Mostafavi","Hassan Ghasemzadeh"],"pdf_url":"https://arxiv.org/pdf/2411.11764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09901v2","updated":"2024-11-18T17:43:31Z","published":"2024-03-14T22:25:37Z","title":"Robust Subgraph Learning by Monitoring Early Training Representations","summary":" Graph neural networks (GNNs) have attracted significant attention for their\noutstanding performance in graph learning and node classification tasks.\nHowever, their vulnerability to adversarial attacks, particularly through\nsusceptible nodes, poses a challenge in decision-making. The need for robust\ngraph summarization is evident in adversarial challenges resulting from the\npropagation of attacks throughout the entire graph. In this paper, we address\nboth performance and adversarial robustness in graph input by introducing the\nnovel technique SHERD (Subgraph Learning Hale through Early Training\nRepresentation Distances). SHERD leverages information from layers of a\npartially trained graph convolutional network (GCN) to detect susceptible nodes\nduring adversarial attacks using standard distance metrics. The method\nidentifies \"vulnerable (bad)\" nodes and removes such nodes to form a robust\nsubgraph while maintaining node classification performance. Through our\nexperiments, we demonstrate the increased performance of SHERD in enhancing\nrobustness by comparing the network's performance on original and subgraph\ninputs against various baselines alongside existing adversarial attacks. 
Our\nexperiments across multiple datasets, including citation datasets such as Cora,\nCiteseer, and Pubmed, as well as microanatomical tissue structures of cell\ngraphs in the placenta, highlight that SHERD not only achieves substantial\nimprovement in robust performance but also outperforms several baselines in\nterms of node classification accuracy and computational complexity.\n","authors":["Sepideh Neshatfar","Salimeh Yasaei Sekeh"],"pdf_url":"https://arxiv.org/pdf/2403.09901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11761v1","updated":"2024-11-18T17:40:42Z","published":"2024-11-18T17:40:42Z","title":"Mapping out the Space of Human Feedback for Reinforcement Learning: A\n Conceptual Framework","summary":" Reinforcement Learning from Human feedback (RLHF) has become a powerful tool\nto fine-tune or train agentic machine learning models. Similar to how humans\ninteract in social contexts, we can use many types of feedback to communicate\nour preferences, intentions, and knowledge to an RL agent. However,\napplications of human feedback in RL are often limited in scope and disregard\nhuman factors. In this work, we bridge the gap between machine learning and\nhuman-computer interaction efforts by developing a shared understanding of\nhuman feedback in interactive learning scenarios. We first introduce a taxonomy\nof feedback types for reward-based learning from human feedback based on nine\nkey dimensions. Our taxonomy allows for unifying human-centered,\ninterface-centered, and model-centered aspects. In addition, we identify seven\nquality metrics of human feedback influencing both the human ability to express\nfeedback and the agent's ability to learn from the feedback. Based on the\nfeedback taxonomy and quality criteria, we derive requirements and design\nchoices for systems learning from human feedback. We relate these requirements\nand design choices to existing work in interactive machine learning. 
In the\nprocess, we identify gaps in existing work and future research opportunities.\nWe call for interdisciplinary collaboration to harness the full potential of\nreinforcement learning with data-driven co-adaptive modeling and varied\ninteraction mechanics.\n","authors":["Yannick Metz","David Lindner","Raphaël Baur","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2411.11761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11748v1","updated":"2024-11-18T17:25:06Z","published":"2024-11-18T17:25:06Z","title":"Debiased Regression for Root-N-Consistent Conditional Mean Estimation","summary":" This study introduces a debiasing method for regression estimators, including\nhigh-dimensional and nonparametric regression estimators. For example,\nnonparametric regression methods allow for the estimation of regression\nfunctions in a data-driven manner with minimal assumptions; however, these\nmethods typically fail to achieve $\\sqrt{n}$-consistency in their convergence\nrates, and many, including those in machine learning, lack guarantees that\ntheir estimators asymptotically follow a normal distribution. To address these\nchallenges, we propose a debiasing technique for nonparametric estimators by\nadding a bias-correction term to the original estimators, extending the\nconventional one-step estimator used in semiparametric analysis. Specifically,\nfor each data point, we estimate the conditional expected residual of the\noriginal nonparametric estimator, which can, for instance, be computed using\nkernel (Nadaraya-Watson) regression, and incorporate it as a bias-reduction\nterm. Our theoretical analysis demonstrates that the proposed estimator\nachieves $\\sqrt{n}$-consistency and asymptotic normality under a mild\nconvergence rate condition for both the original nonparametric estimator and\nthe conditional expected residual estimator. 
Notably, this approach remains\nmodel-free as long as the original estimator and the conditional expected\nresidual estimator satisfy the convergence rate condition. The proposed method\noffers several advantages, including improved estimation accuracy and\nsimplified construction of confidence intervals.\n","authors":["Masahiro Kato"],"pdf_url":"https://arxiv.org/pdf/2411.11748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11745v1","updated":"2024-11-18T17:16:58Z","published":"2024-11-18T17:16:58Z","title":"BitMoD: Bit-serial Mixture-of-Datatype LLM Acceleration","summary":" Large language models (LLMs) have demonstrated remarkable performance across\nvarious machine learning tasks. Yet the substantial memory footprint of LLMs\nsignificantly hinders their deployment. In this paper, we improve the\naccessibility of LLMs through BitMoD, an algorithm-hardware co-design solution\nthat enables efficient LLM acceleration at low weight precision. On the\nalgorithm side, BitMoD introduces fine-grained data type adaptation that uses a\ndifferent numerical data type to quantize a group of (e.g., 128) weights.\nThrough the careful design of these new data types, BitMoD is able to quantize\nLLM weights to very low precision (e.g., 4 bits and 3 bits) while maintaining\nhigh accuracy. On the hardware side, BitMoD employs a bit-serial processing\nelement to easily support multiple numerical precisions and data types; our\nhardware design includes two key innovations: First, it employs a unified\nrepresentation to process different weight data types, thus reducing the\nhardware cost. Second, it adopts a bit-serial dequantization unit to rescale\nthe per-group partial sum with minimal hardware overhead. Our evaluation on six\nrepresentative LLMs demonstrates that BitMoD significantly outperforms\nstate-of-the-art LLM quantization and acceleration methods. For discriminative\ntasks, BitMoD can quantize LLM weights to 4-bit with $<\\!0.5\\%$ accuracy loss\non average. 
For generative tasks, BitMoD is able to quantize LLM weights to\n3-bit while achieving better perplexity than prior LLM quantization scheme.\nCombining the superior model performance with an efficient accelerator design,\nBitMoD achieves an average of $1.69\\times$ and $1.48\\times$ speedups compared\nto prior LLM accelerators ANT and OliVe, respectively.\n","authors":["Yuzong Chen","Ahmed F. AbouElhamayed","Xilai Dai","Yang Wang","Marta Andronic","George A. Constantinides","Mohamed S. Abdelfattah"],"pdf_url":"https://arxiv.org/pdf/2411.11745v1.pdf","comment":"HPCA 2025"},{"id":"http://arxiv.org/abs/2411.11740v1","updated":"2024-11-18T17:10:14Z","published":"2024-11-18T17:10:14Z","title":"Revitalizing Electoral Trust: Enhancing Transparency and Efficiency\n through Automated Voter Counting with Machine Learning","summary":" In order to address issues with manual vote counting during election\nprocedures, this study intends to examine the viability of using advanced image\nprocessing techniques for automated voter counting. The study aims to shed\nlight on how automated systems that utilize cutting-edge technologies like\nOpenCV, CVZone, and the MOG2 algorithm could greatly increase the effectiveness\nand openness of electoral operations. The empirical findings demonstrate how\nautomated voter counting can enhance voting processes and rebuild public\nconfidence in election outcomes, particularly in places where trust is low. The\nstudy also emphasizes how rigorous metrics, such as the F1 score, should be\nused to systematically compare the accuracy of automated systems against manual\ncounting methods. This methodology enables a detailed comprehension of the\ndifferences in performance between automated and human counting techniques by\nproviding a nuanced assessment. 
The incorporation of said measures serves to\nreinforce an extensive assessment structure, guaranteeing the legitimacy and\ndependability of automated voting systems inside the electoral sphere.\n","authors":["Mir Faris","Syeda Aynul Karim","Md. Juniadul Islam"],"pdf_url":"https://arxiv.org/pdf/2411.11740v1.pdf","comment":"13 Pages, 4 Figures"},{"id":"http://arxiv.org/abs/2410.24060v3","updated":"2024-11-18T17:04:09Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. 
Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15367v2","updated":"2024-11-18T17:00:32Z","published":"2024-09-18T18:36:18Z","title":"Fine-Tuning a Time Series Foundation Model with Wasserstein Loss","summary":" Inspired by recent advancements in large language models (LLMs) for Natural\nLanguage Processing (NLP), there has been a surge in research focused on\ndeveloping foundational models for time series forecasting. One approach\ninvolves training LLM architectures on tokenized time series data using\ncross-entropy loss. Although this method has demonstrated promising results,\ncross-entropy loss is primarily designed for classification tasks and does not\naccount for the distance between classes. To address this limitation, we\npropose using the Wasserstein loss for such architectures. To validate our\napproach, we fine-tuned a foundational time series model on $22$ zero-shot\ndatasets, comparing the performance of cross-entropy loss with that of\nWasserstein loss. Our results demonstrate that replacing cross-entropy loss\nwith Wasserstein loss significantly improves point estimation.\n","authors":["Andrei Chernov"],"pdf_url":"https://arxiv.org/pdf/2409.15367v2.pdf","comment":"4 main pages; 2 figures"},{"id":"http://arxiv.org/abs/2411.11730v1","updated":"2024-11-18T16:59:44Z","published":"2024-11-18T16:59:44Z","title":"Lifted Model Construction without Normalisation: A Vectorised Approach\n to Exploit Symmetries in Factor Graphs","summary":" Lifted probabilistic inference exploits symmetries in a probabilistic model\nto allow for tractable probabilistic inference with respect to domain sizes of\nlogical variables. 
We found that the current state-of-the-art algorithm to\nconstruct a lifted representation in form of a parametric factor graph misses\nsymmetries between factors that are exchangeable but scaled differently,\nthereby leading to a less compact representation. In this paper, we propose a\ngeneralisation of the advanced colour passing (ACP) algorithm, which is the\nstate of the art to construct a parametric factor graph. Our proposed algorithm\nallows for potentials of factors to be scaled arbitrarily and efficiently\ndetects more symmetries than the original ACP algorithm. By detecting strictly\nmore symmetries than ACP, our algorithm significantly reduces online query\ntimes for probabilistic inference when the resulting model is applied, which we\nalso confirm in our experiments.\n","authors":["Malte Luttermann","Ralf Möller","Marcel Gehrke"],"pdf_url":"https://arxiv.org/pdf/2411.11730v1.pdf","comment":"Accepted to the Proceedings of the 3rd Learning on Graphs Conference\n (LoG 2024)"},{"id":"http://arxiv.org/abs/2411.11727v1","updated":"2024-11-18T16:57:41Z","published":"2024-11-18T16:57:41Z","title":"Aligning Few-Step Diffusion Models with Dense Reward Difference Learning","summary":" Aligning diffusion models with downstream objectives is essential for their\npractical applications. However, standard alignment methods often struggle with\nstep generalization when directly applied to few-step diffusion models, leading\nto inconsistent performance across different denoising step scenarios. To\naddress this, we introduce Stepwise Diffusion Policy Optimization (SDPO), a\nnovel alignment method tailored for few-step diffusion models. Unlike prior\napproaches that rely on a single sparse reward from only the final step of each\ndenoising trajectory for trajectory-level optimization, SDPO incorporates dense\nreward feedback at every intermediate step. 
By learning the differences in\ndense rewards between paired samples, SDPO facilitates stepwise optimization of\nfew-step diffusion models, ensuring consistent alignment across all denoising\nsteps. To promote stable and efficient training, SDPO introduces an online\nreinforcement learning framework featuring several novel strategies designed to\neffectively exploit the stepwise granularity of dense rewards. Experimental\nresults demonstrate that SDPO consistently outperforms prior methods in\nreward-based alignment across diverse step configurations, underscoring its\nrobust step generalization capabilities. Code is avaliable at\nhttps://github.com/ZiyiZhang27/sdpo.\n","authors":["Ziyi Zhang","Li Shen","Sen Zhang","Deheng Ye","Yong Luo","Miaojing Shi","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.11727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02588v2","updated":"2024-11-18T16:45:26Z","published":"2023-08-03T18:23:37Z","title":"Unmasking Parkinson's Disease with Smile: An AI-enabled Screening\n Framework","summary":" We present an efficient and accessible PD screening method by leveraging\nAI-driven models enabled by the largest video dataset of facial expressions\nfrom 1,059 unique participants. This dataset includes 256 individuals with PD,\n165 clinically diagnosed, and 91 self-reported. Participants used webcams to\nrecord themselves mimicking three facial expressions (smile, disgust, and\nsurprise) from diverse sources encompassing their homes across multiple\ncountries, a US clinic, and a PD wellness center in the US. Facial landmarks\nare automatically tracked from the recordings to extract features related to\nhypomimia, a prominent PD symptom characterized by reduced facial expressions.\nMachine learning algorithms are trained on these features to distinguish\nbetween individuals with and without PD. 
The model was tested for\ngeneralizability on external (unseen during training) test videos collected\nfrom a US clinic and Bangladesh. An ensemble of machine learning models trained\non smile videos achieved an accuracy of 87.9+-0.1% (95% Confidence Interval)\nwith an AUROC of 89.3+-0.3% as evaluated on held-out data (using k-fold\ncross-validation). In external test settings, the ensemble model achieved\n79.8+-0.6% accuracy with 81.9+-0.3% AUROC on the clinical test set and\n84.9+-0.4% accuracy with 81.2+-0.6% AUROC on participants from Bangladesh. In\nevery setting, the model was free from detectable bias across sex and ethnic\nsubgroups, except in the cohorts from Bangladesh, where the model performed\nsignificantly better for female participants than males. Smiling videos can\neffectively differentiate between individuals with and without PD, offering a\npotentially easy, accessible, and cost-efficient way to screen for PD,\nespecially when a clinical diagnosis is difficult to access.\n","authors":["Tariq Adnan","Md Saiful Islam","Wasifur Rahman","Sangwu Lee","Sutapa Dey Tithi","Kazi Noshin","Imran Sarker","M Saifur Rahman","Ehsan Hoque"],"pdf_url":"https://arxiv.org/pdf/2308.02588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11713v1","updated":"2024-11-18T16:37:41Z","published":"2024-11-18T16:37:41Z","title":"FLMarket: Enabling Privacy-preserved Pre-training Data Pricing for\n Federated Learning","summary":" Federated Learning (FL), as a mainstream privacy-preserving machine learning\nparadigm, offers promising solutions for privacy-critical domains such as\nhealthcare and finance. Although extensive efforts have been dedicated from\nboth academia and industry to improve the vanilla FL, little work focuses on\nthe data pricing mechanism. In contrast to the straightforward in/post-training\npricing techniques, we study a more difficult problem of pre-training pricing\nwithout direct information from the learning process. 
We propose FLMarket that\nintegrates a two-stage, auction-based pricing mechanism with a security\nprotocol to address the utility-privacy conflict. Through comprehensive\nexperiments, we show that the client selection according to FLMarket can\nachieve more than 10% higher accuracy in subsequent FL training compared to\nstate-of-the-art methods. In addition, it outperforms the in-training baseline\nwith more than 2% accuracy increase and 3x run-time speedup.\n","authors":["Zhenyu Wen","Wanglei Feng","Di Wu","Haozhen Hu","Chang Xu","Bin Qian","Zhen Hong","Cong Wang","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2411.11713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11181v2","updated":"2024-11-18T16:25:53Z","published":"2024-10-15T01:51:29Z","title":"DARNet: Dual Attention Refinement Network with Spatiotemporal\n Construction for Auditory Attention Detection","summary":" At a cocktail party, humans exhibit an impressive ability to direct their\nattention. The auditory attention detection (AAD) approach seeks to identify\nthe attended speaker by analyzing brain signals, such as EEG signals. However,\ncurrent AAD algorithms overlook the spatial distribution information within EEG\nsignals and lack the ability to capture long-range latent dependencies,\nlimiting the model's ability to decode brain activity. To address these issues,\nthis paper proposes a dual attention refinement network with spatiotemporal\nconstruction for AAD, named DARNet, which consists of the spatiotemporal\nconstruction module, dual attention refinement module, and feature fusion \\&\nclassifier module. Specifically, the spatiotemporal construction module aims to\nconstruct more expressive spatiotemporal feature representations, by capturing\nthe spatial distribution characteristics of EEG signals. The dual attention\nrefinement module aims to extract different levels of temporal patterns in EEG\nsignals and enhance the model's ability to capture long-range latent\ndependencies. 
The feature fusion \\& classifier module aims to aggregate\ntemporal patterns and dependencies from different levels and obtain the final\nclassification results. The experimental results indicate that compared to the\nstate-of-the-art models, DARNet achieves an average classification accuracy\nimprovement of 5.9\\% for 0.1s, 4.6\\% for 1s, and 3.9\\% for 2s on the DTU\ndataset. While maintaining excellent classification performance, DARNet\nsignificantly reduces the number of required parameters. Compared to the\nstate-of-the-art models, DARNet reduces the parameter count by 91\\%. Code is\navailable at: https://github.com/fchest/DARNet.git.\n","authors":["Sheng Yan","Cunhang fan","Hongyu Zhang","Xiaoke Yang","Jianhua Tao","Zhao Lv"],"pdf_url":"https://arxiv.org/pdf/2410.11181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19212v3","updated":"2024-11-18T16:22:41Z","published":"2024-05-29T15:54:03Z","title":"Partial Information Decomposition for Data Interpretability and Feature\n Selection","summary":" In this paper, we introduce Partial Information Decomposition of Features\n(PIDF), a new paradigm for simultaneous data interpretability and feature\nselection. Contrary to traditional methods that assign a single importance\nvalue, our approach is based on three metrics per feature: the mutual\ninformation shared with the target variable, the feature's contribution to\nsynergistic information, and the amount of this information that is redundant.\nIn particular, we develop a novel procedure based on these three metrics, which\nreveals not only how features are correlated with the target but also the\nadditional and overlapping information provided by considering them in\ncombination with other features. 
We extensively evaluate PIDF using both\nsynthetic and real-world data, demonstrating its potential applications and\neffectiveness, by considering case studies from genetics and neuroscience.\n","authors":["Charles Westphal","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2405.19212v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11697v1","updated":"2024-11-18T16:17:34Z","published":"2024-11-18T16:17:34Z","title":"Robust Reinforcement Learning under Diffusion Models for Data with Jumps","summary":" Reinforcement Learning (RL) has proven effective in solving complex\ndecision-making tasks across various domains, but challenges remain in\ncontinuous-time settings, particularly when state dynamics are governed by\nstochastic differential equations (SDEs) with jump components. In this paper,\nwe address this challenge by introducing the Mean-Square Bipower Variation\nError (MSBVE) algorithm, which enhances robustness and convergence in scenarios\ninvolving significant stochastic noise and jumps. We first revisit the\nMean-Square TD Error (MSTDE) algorithm, commonly used in continuous-time RL,\nand highlight its limitations in handling jumps in state dynamics. The proposed\nMSBVE algorithm minimizes the mean-square quadratic variation error, offering\nimproved performance over MSTDE in environments characterized by SDEs with\njumps. Simulations and formal proofs demonstrate that the MSBVE algorithm\nreliably estimates the value function in complex settings, surpassing MSTDE's\nperformance when faced with jump processes. 
These findings underscore the\nimportance of alternative error metrics to improve the resilience and\neffectiveness of RL algorithms in continuous-time frameworks.\n","authors":["Chenyang Jiang","Donggyu Kim","Alejandra Quintos","Yazhen Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11682v1","updated":"2024-11-18T16:07:47Z","published":"2024-11-18T16:07:47Z","title":"Learning Differentiable Surrogate Losses for Structured Prediction","summary":" Structured prediction involves learning to predict complex structures rather\nthan simple scalar values. The main challenge arises from the non-Euclidean\nnature of the output space, which generally requires relaxing the problem\nformulation. Surrogate methods build on kernel-induced losses or more\ngenerally, loss functions admitting an Implicit Loss Embedding, and convert the\noriginal problem into a regression task followed by a decoding step. However,\ndesigning effective losses for objects with complex structures presents\nsignificant challenges and often requires domain-specific expertise. In this\nwork, we introduce a novel framework in which a structured loss function,\nparameterized by neural networks, is learned directly from output training data\nthrough Contrastive Learning, prior to addressing the supervised surrogate\nregression problem. As a result, the differentiable loss not only enables the\nlearning of neural networks due to the finite dimension of the surrogate space\nbut also allows for the prediction of new structures of the output data via a\ndecoding strategy based on gradient descent. 
Numerical experiments on\nsupervised graph prediction problems show that our approach achieves similar or\neven better performance than methods based on a pre-defined kernel.\n","authors":["Junjie Yang","Matthieu Labeau","Florence d'Alché-Buc"],"pdf_url":"https://arxiv.org/pdf/2411.11682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11681v1","updated":"2024-11-18T16:03:51Z","published":"2024-11-18T16:03:51Z","title":"PSPO*: An Effective Process-supervised Policy Optimization for Reasoning\n Alignment","summary":" Process supervision enhances the performance of large language models in\nreasoning tasks by providing feedback at each step of chain-of-thought\nreasoning. However, due to the lack of effective process supervision methods,\neven advanced large language models are prone to logical errors and redundant\nreasoning. We claim that the effectiveness of process supervision significantly\ndepends on both the accuracy and the length of reasoning chains. Moreover, we\nidentify that these factors exhibit a nonlinear relationship with the overall\nreward score of the reasoning process. Inspired by these insights, we propose a\nnovel process supervision paradigm, PSPO*, which systematically outlines the\nworkflow from reward model training to policy optimization, and highlights the\nimportance of nonlinear rewards in process supervision. Based on PSPO*, we\ndevelop the PSPO-WRS, which considers the number of reasoning steps in\ndetermining reward scores and utilizes an adjusted Weibull distribution for\nnonlinear reward shaping. 
Experimental results on six mathematical reasoning\ndatasets demonstrate that PSPO-WRS consistently outperforms current mainstream\nmodels.\n","authors":["Jiawei Li","Xinyue Liang","Yizhe Yang","Chong Feng","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.11681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11678v1","updated":"2024-11-18T15:59:30Z","published":"2024-11-18T15:59:30Z","title":"Analysis of Hardware Synthesis Strategies for Machine Learning in\n Collider Trigger and Data Acquisition","summary":" To fully exploit the physics potential of current and future high energy\nparticle colliders, machine learning (ML) can be implemented in detector\nelectronics for intelligent data processing and acquisition. The implementation\nof ML in real-time at colliders requires very low latencies that are\nunachievable with a software-based approach, requiring optimization and\nsynthesis of ML algorithms for deployment on hardware. An analysis of neural\nnetwork inference efficiency is presented, focusing on the application of\ncollider trigger algorithms in field programmable gate arrays (FPGAs).\nTrade-offs are evaluated between two frameworks, the SLAC Neural Network\nLibrary (SNL) and hls4ml, in terms of resources and latency for different model\nsizes. Results highlight the strengths and limitations of each approach,\noffering valuable insights for optimizing real-time neural network deployments\nat colliders. 
This work aims to guide researchers and engineers in selecting\nthe most suitable hardware and software configurations for real-time,\nresource-constrained environments.\n","authors":["Haoyi Jia","Abhilasha Dave","Julia Gonski","Ryan Herbst"],"pdf_url":"https://arxiv.org/pdf/2411.11678v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.11677v1","updated":"2024-11-18T15:57:14Z","published":"2024-11-18T15:57:14Z","title":"Few-shot Model Extraction Attacks against Sequential Recommender Systems","summary":" Among adversarial attacks against sequential recommender systems, model\nextraction attacks represent a method to attack sequential recommendation\nmodels without prior knowledge. Existing research has primarily concentrated on\nthe adversary's execution of black-box attacks through data-free model\nextraction. However, a significant gap remains in the literature concerning the\ndevelopment of surrogate models by adversaries with access to few-shot raw data\n(10\\% even less). That is, the challenge of how to construct a surrogate model\nwith high functional similarity within the context of few-shot data scenarios\nremains an issue that requires resolution.This study addresses this gap by\nintroducing a novel few-shot model extraction framework against sequential\nrecommenders, which is designed to construct a superior surrogate model with\nthe utilization of few-shot data. The proposed few-shot model extraction\nframework is comprised of two components: an autoregressive augmentation\ngeneration strategy and a bidirectional repair loss-facilitated model\ndistillation procedure. Specifically, to generate synthetic data that closely\napproximate the distribution of raw data, autoregressive augmentation\ngeneration strategy integrates a probabilistic interaction sampler to extract\ninherent dependencies and a synthesis determinant signal module to characterize\nuser behavioral patterns. 
Subsequently, bidirectional repair loss, which target\nthe discrepancies between the recommendation lists, is designed as auxiliary\nloss to rectify erroneous predictions from surrogate models, transferring\nknowledge from the victim model to the surrogate model effectively. Experiments\non three datasets show that the proposed few-shot model extraction framework\nyields superior surrogate models.\n","authors":["Hui Zhang","Fu Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11672v1","updated":"2024-11-18T15:51:45Z","published":"2024-11-18T15:51:45Z","title":"Artificial Scientific Discovery","summary":" Rooted in the explosion of deep learning over the past decade, this thesis\nspans from AlphaGo to ChatGPT to empirically examine the fundamental concepts\nneeded to realize the vision of an artificial scientist: a machine with the\ncapacity to autonomously generate original research and contribute to the\nexpansion of human knowledge. The investigation begins with {\\sc Olivaw}, an\nAlphaGo Zero-like agent that discovers Othello knowledge from scratch but is\nunable to communicate it. This realization leads to the development of the\nExplanatory Learning (EL) framework, a formalization of the problem faced by a\nscientist when trying to explain a new phenomenon to their peers. The effective\nEL prescriptions allow us to crack Zendo, a board game simulating the\nscientific endeavor. This success comes with a fundamental insight: an\nartificial scientist must develop its own interpretation of the language used\nto explain its findings. This perspective then leads us to see modern\nmultimodal models as interpreters, and to devise a new way to build\ninterpretable and cost-effective CLIP-like models: by coupling two unimodal\nmodels using little multimodal data and no further training. 
Finally, we\ndiscuss what ChatGPT and its siblings are still missing to become artificial\nscientists, and introduce Odeen, a benchmark about interpreting explanations\nthat sees LLMs going no further than random chance while being instead fully\nsolved by humans.\n","authors":["Antonio Norelli"],"pdf_url":"https://arxiv.org/pdf/2411.11672v1.pdf","comment":"PhD thesis, 123 pages"},{"id":"http://arxiv.org/abs/2411.11668v1","updated":"2024-11-18T15:47:37Z","published":"2024-11-18T15:47:37Z","title":"Efficient and Robust Continual Graph Learning for Graph Classification\n in Biology","summary":" Graph classification is essential for understanding complex biological\nsystems, where molecular structures and interactions are naturally represented\nas graphs. Traditional graph neural networks (GNNs) perform well on static\ntasks but struggle in dynamic settings due to catastrophic forgetting. We\npresent Perturbed and Sparsified Continual Graph Learning (PSCGL), a robust and\nefficient continual graph learning framework for graph data classification,\nspecifically targeting biological datasets. We introduce a perturbed sampling\nstrategy to identify critical data points that contribute to model learning and\na motif-based graph sparsification technique to reduce storage needs while\nmaintaining performance. Additionally, our PSCGL framework inherently defends\nagainst graph backdoor attacks, which is crucial for applications in sensitive\nbiological contexts. 
Extensive experiments on biological datasets demonstrate\nthat PSCGL not only retains knowledge across tasks but also enhances the\nefficiency and robustness of graph classification models in biology.\n","authors":["Ding Zhang","Jane Downer","Can Chen","Ren Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11667v1","updated":"2024-11-18T15:45:41Z","published":"2024-11-18T15:45:41Z","title":"Dissecting Misalignment of Multimodal Large Language Models via\n Influence Function","summary":" Multi-modal Large Language models (MLLMs) are always trained on data from\ndiverse and unreliable sources, which may contain misaligned or mislabeled\ntext-image pairs. This frequently causes robustness issues and hallucinations,\nleading to performance degradation. Data valuation is an efficient way to\ndetect and trace these misalignments. Nevertheless, existing methods are\ncomputationally expensive for MLLMs. While computationally efficient, the\nclassical influence functions are inadequate for contrastive learning models\nbecause they were originally designed for pointwise loss. Additionally,\ncontrastive learning involves minimizing the distance between the modalities of\npositive samples and maximizing the distance between the modalities of negative\nsamples. This requires us to evaluate the influence of samples from both\nperspectives. To tackle these challenges, we introduce the Extended Influence\nFunction for Contrastive Loss (ECIF), an influence function crafted for\ncontrastive loss. ECIF considers both positive and negative samples and\nprovides a closed-form approximation of contrastive learning models,\neliminating the need for retraining. Building upon ECIF, we develop a series of\nalgorithms for data evaluation in MLLM, misalignment detection, and\nmisprediction trace-back tasks. 
Experimental results demonstrate our ECIF\nadvances the transparency and interpretability of MLLMs by offering a more\naccurate assessment of data impact and model alignment compared to traditional\nbaseline methods.\n","authors":["Lijie Hu","Chenyang Ren","Huanyi Xie","Khouloud Saadi","Shu Yang","Jingfeng Zhang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11667v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2407.12804v2","updated":"2024-11-18T15:41:24Z","published":"2024-06-24T16:31:11Z","title":"Modulating Language Model Experiences through Frictions","summary":" Language models are transforming the ways that their users engage with the\nworld. Despite impressive capabilities, over-consumption of language model\noutputs risks propagating unchecked errors in the short-term and damaging human\ncapabilities for critical thinking in the long-term. How can we develop\nscaffolding around language models to curate more appropriate use? We propose\nselective frictions for language model experiences, inspired by behavioral\nscience interventions, to dampen misuse. Frictions involve small modifications\nto a user's experience, e.g., the addition of a button impeding model access\nand reminding a user of their expertise relative to the model. Through a user\nstudy with real humans, we observe shifts in user behavior from the imposition\nof a friction over LLMs in the context of a multi-topic question-answering task\nas a representative task that people may use LLMs for, e.g., in education and\ninformation retrieval. We find that frictions modulate over-reliance by driving\ndown users' click rates while minimally affecting accuracy for those topics.\nYet, frictions may have unintended effects. We find marked differences in\nusers' click behaviors even on topics where frictions were not provisioned. Our\ncontributions motivate further study of human-AI behavioral interaction to\ninform more effective and appropriate LLM use.\n","authors":["Katherine M. 
Collins","Valerie Chen","Ilia Sucholutsky","Hannah Rose Kirk","Malak Sadek","Holli Sargeant","Ameet Talwalkar","Adrian Weller","Umang Bhatt"],"pdf_url":"https://arxiv.org/pdf/2407.12804v2.pdf","comment":"NeurIPS Workshop on Behavioral ML; non-archival"},{"id":"http://arxiv.org/abs/2410.13147v6","updated":"2024-11-18T15:41:01Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with domain\n feedback for molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule through chemical modification. Despite\nLarge Language Models (LLMs) holding the potential to efficiently simulate this\ntask by using natural language to direct the optimization, straightforwardly\nutilizing them shows limited performance. In this work, we facilitate utilizing\nLLMs in an iterative paradigm by proposing a simple yet highly effective domain\nfeedback provider, namely $\\text{Re}^3$DF. In detail, $\\text{Re}^3$DF harnesses\nan external toolkit, RDKit, to handle the molecule hallucination, if the\nmodified molecule is chemically invalid. Otherwise, its desired properties are\ncomputed and compared to the original one, establishing reliable domain\nfeedback with correct direction and distance towards the objective, followed by\na retrieved example, to guide the LLM to refine the modified molecule. We\nconduct experiments across both single- and multi-property objectives with 2\nthresholds, where $\\text{Re}^3$DF shows significant improvements. Particularly,\nfor 20 single-property objectives, $\\text{Re}^3$DF enhances Hit ratio by 16.95%\nand 20.76% under loose (\\texttt{l}) and strict (\\texttt{s}) thresholds,\nrespectively. For 32 multi-property objectives, $\\text{Re}^3$DF enhances Hit\nratio by 6.04% and 5.25%.\n","authors":["Khiem Le","Nitesh V. 
Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14949v2","updated":"2024-11-18T15:35:52Z","published":"2024-10-19T02:36:11Z","title":"Straightness of Rectified Flow: A Theoretical Insight into Wasserstein\n Convergence","summary":" Diffusion models have emerged as a powerful tool for image generation and\ndenoising. Typically, generative models learn a trajectory between the starting\nnoise distribution and the target data distribution. Recently Liu et al.\n(2023b) designed a novel alternative generative model Rectified Flow (RF),\nwhich aims to learn straight flow trajectories from noise to data using a\nsequence of convex optimization problems with close ties to optimal transport.\nIf the trajectory is curved, one must use many Euler discretization steps or\nnovel strategies, such as exponential integrators, to achieve a satisfactory\ngeneration quality. In contrast, RF has been shown to theoretically straighten\nthe trajectory through successive rectifications, reducing the number of\nfunction evaluations (NFEs) while sampling. It has also been shown empirically\nthat RF may improve the straightness in two rectifications if one can solve the\nunderlying optimization problem within a sufficiently small error. In this\npaper, we make two key theoretical contributions: 1) we provide the first\ntheoretical analysis of the Wasserstein distance between the sampling\ndistribution of RF and the target distribution. Our error rate is characterized\nby the number of discretization steps and a new formulation of straightness\nstronger than that in the original work. 2) under a mild regularity assumption,\nwe show that for a rectified flow from a Gaussian to any general target\ndistribution with finite first moment (e.g. mixture of Gaussians), two\nrectifications are sufficient to achieve a straight flow, which is in line with\nthe previous empirical findings. 
Additionally, we also present empirical\nresults on both simulated and real datasets to validate our theoretical\nfindings.\n","authors":["Vansh Bansal","Saptarshi Roy","Purnamrita Sarkar","Alessandro Rinaldo"],"pdf_url":"https://arxiv.org/pdf/2410.14949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04154v7","updated":"2024-11-18T15:31:52Z","published":"2024-02-06T17:09:25Z","title":"Read to Play (R2-Play): Decision Transformer with Multimodal Game\n Instruction","summary":" Developing a generalist agent is a longstanding objective in artificial\nintelligence. Previous efforts utilizing extensive offline datasets from\nvarious tasks demonstrate remarkable performance in multitasking scenarios\nwithin Reinforcement Learning. However, these works encounter challenges in\nextending their capabilities to new tasks. Recent approaches integrate textual\nguidance or visual trajectory into decision networks to provide task-specific\ncontextual cues, representing a promising direction. However, it is observed\nthat relying solely on textual guidance or visual trajectory is insufficient\nfor accurately conveying the contextual information of tasks. This paper\nexplores enhanced forms of task guidance for agents, enabling them to\ncomprehend gameplay instructions, thereby facilitating a \"read-to-play\"\ncapability. Drawing inspiration from the success of multimodal instruction\ntuning in visual tasks, we treat the visual-based RL task as a long-horizon\nvision task and construct a set of multimodal game instructions to incorporate\ninstruction tuning into a decision transformer. Experimental results\ndemonstrate that incorporating multimodal game instructions significantly\nenhances the decision transformer's multitasking and generalization\ncapabilities.\n","authors":["Yonggang Jin","Ge Zhang","Hao Zhao","Tianyu Zheng","Jarvi Guo","Liuyu Xiang","Shawn Yue","Stephen W. 
Huang","Zhaofeng He","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2402.04154v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13565v2","updated":"2024-11-18T15:30:16Z","published":"2024-03-20T12:58:46Z","title":"Feature-wise and Sample-wise Adaptive Transfer Learning for\n High-dimensional Linear Regression","summary":" We consider the transfer learning problem in the high dimensional linear\nregression setting, where the feature dimension is larger than the sample size.\nTo learn transferable information, which may vary across features or the source\nsamples, we propose an adaptive transfer learning method that can detect and\naggregate the feature-wise (F-AdaTrans) or sample-wise (S-AdaTrans)\ntransferable structures. We achieve this by employing a fused-penalty, coupled\nwith weights that can adapt according to the transferable structure. To choose\nthe weight, we propose a theoretically informed, data-driven procedure,\nenabling F-AdaTrans to selectively fuse the transferable signals with the\ntarget while filtering out non-transferable signals, and S-AdaTrans to obtain\nthe optimal combination of information transferred from each source sample. 
We\nshow that, with appropriately chosen weights, F-AdaTrans achieves a convergence\nrate close to that of an oracle estimator with a known transferable structure,\nand S-AdaTrans recovers existing near-minimax optimal rates as a special case.\nThe effectiveness of the proposed method is validated using both simulation and\nreal data, demonstrating favorable performance compared to the existing\nmethods.\n","authors":["Zelin He","Ying Sun","Jingyuan Liu","Runze Li"],"pdf_url":"https://arxiv.org/pdf/2403.13565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11647v1","updated":"2024-11-18T15:24:11Z","published":"2024-11-18T15:24:11Z","title":"No-regret Exploration in Shuffle Private Reinforcement Learning","summary":" Differential privacy (DP) has recently been introduced into episodic\nreinforcement learning (RL) to formally address user privacy concerns in\npersonalized services. Previous work mainly focuses on two trust models of DP:\nthe central model, where a central agent is responsible for protecting users'\nsensitive data, and the (stronger) local model, where the protection occurs\ndirectly on the user side. However, they either require a trusted central agent\nor incur a significantly higher privacy cost, making it unsuitable for many\nscenarios. This work introduces a trust model stronger than the central model\nbut with a lower privacy cost than the local model, leveraging the emerging\n\\emph{shuffle} model of privacy. We present the first generic algorithm for\nepisodic RL under the shuffle model, where a trusted shuffler randomly permutes\na batch of users' data before sending it to the central agent. We then\ninstantiate the algorithm using our proposed shuffle Privatizer, relying on a\nshuffle private binary summation mechanism. 
Our analysis shows that the\nalgorithm achieves a near-optimal regret bound comparable to that of the\ncentralized model and significantly outperforms the local model in terms of\nprivacy cost.\n","authors":["Shaojie Bai","Mohammad Sadegh Talebi","Chengcheng Zhao","Peng Cheng","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17221v2","updated":"2024-11-18T15:21:40Z","published":"2024-10-22T17:45:45Z","title":"Scalable spectral representations for multi-agent reinforcement learning\n in network MDPs","summary":" Network Markov Decision Processes (MDPs), a popular model for multi-agent\ncontrol, pose a significant challenge to efficient learning due to the\nexponential growth of the global state-action space with the number of agents.\nIn this work, utilizing the exponential decay property of network dynamics, we\nfirst derive scalable spectral local representations for network MDPs, which\ninduces a network linear subspace for the local $Q$-function of each agent.\nBuilding on these local spectral representations, we design a scalable\nalgorithmic framework for continuous state-action network MDPs, and provide\nend-to-end guarantees for the convergence of our algorithm. 
Empirically, we\nvalidate the effectiveness of our scalable representation-based approach on two\nbenchmark problems, and demonstrate the advantages of our approach over generic\nfunction approximation approaches to representing the local $Q$-functions.\n","authors":["Zhaolin Ren","Runyu Zhang","Bo Dai","Na Li"],"pdf_url":"https://arxiv.org/pdf/2410.17221v2.pdf","comment":"Updated title, corrected an issue with an author's name"},{"id":"http://arxiv.org/abs/2406.12112v2","updated":"2024-11-18T15:21:31Z","published":"2024-06-17T21:44:05Z","title":"Thermodynamic Transferability in Coarse-Grained Force Fields using Graph\n Neural Networks","summary":" Coarse-graining is a molecular modeling technique in which an atomistic\nsystem is represented in a simplified fashion that retains the most significant\nsystem features that contribute to a target output, while removing the degrees\nof freedom that are less relevant. This reduction in model complexity allows\ncoarse-grained molecular simulations to reach increased spatial and temporal\nscales compared to corresponding all-atom models. A core challenge in\ncoarse-graining is to construct a force field that represents the interactions\nin the new representation in a way that preserves the atomistic-level\nproperties. Many approaches to building coarse-grained force fields have\nlimited transferability between different thermodynamic conditions as a result\nof averaging over internal fluctuations at a specific thermodynamic state\npoint. Here, we use a graph-convolutional neural network architecture, the\nHierarchically Interacting Particle Neural Network with Tensor Sensitivity\n(HIP-NN-TS), to develop a highly automated training pipeline for coarse grained\nforce fields which allows for studying the transferability of coarse-grained\nmodels based on the force-matching approach. 
We show that this approach not\nonly yields highly accurate force fields, but also that these force fields are\nmore transferable through a variety of thermodynamic conditions. These results\nillustrate the potential of machine learning techniques such as graph neural\nnetworks to improve the construction of transferable coarse-grained force\nfields.\n","authors":["Emily Shinkle","Aleksandra Pachalieva","Riti Bahl","Sakib Matin","Brendan Gifford","Galen T. Craven","Nicholas Lubbers"],"pdf_url":"https://arxiv.org/pdf/2406.12112v2.pdf","comment":"Post-referee revisions. Accepted by Journal of Chemical Theory and\n Computation (JCTC). 46 pages, 10 figures + TOC figure + SI (19 pages, 6\n figures)"},{"id":"http://arxiv.org/abs/2411.11641v1","updated":"2024-11-18T15:19:54Z","published":"2024-11-18T15:19:54Z","title":"TSINR: Capturing Temporal Continuity via Implicit Neural Representations\n for Time Series Anomaly Detection","summary":" Time series anomaly detection aims to identify unusual patterns in data or\ndeviations from systems' expected behavior. The reconstruction-based methods\nare the mainstream in this task, which learn point-wise representation via\nunsupervised learning. However, the unlabeled anomaly points in training data\nmay cause these reconstruction-based methods to learn and reconstruct anomalous\ndata, resulting in the challenge of capturing normal patterns. In this paper,\nwe propose a time series anomaly detection method based on implicit neural\nrepresentation (INR) reconstruction, named TSINR, to address this challenge.\nDue to the property of spectral bias, TSINR enables prioritizing low-frequency\nsignals and exhibiting poorer performance on high-frequency abnormal data.\nSpecifically, we adopt INR to parameterize time series data as a continuous\nfunction and employ a transformer-based architecture to predict the INR of\ngiven data. 
As a result, the proposed TSINR method achieves the advantage of\ncapturing the temporal continuity and thus is more sensitive to discontinuous\nanomaly data. In addition, we further design a novel form of INR continuous\nfunction to learn inter- and intra-channel information, and leverage a\npre-trained large language model to amplify the intense fluctuations in\nanomalies. Extensive experiments demonstrate that TSINR achieves superior\noverall performance on both univariate and multivariate time series anomaly\ndetection benchmarks compared to other state-of-the-art reconstruction-based\nmethods. Our codes are available.\n","authors":["Mengxuan Li","Ke Liu","Hongyang Chen","Jiajun Bu","Hongwei Wang","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11641v1.pdf","comment":"Accepted by SIGKDD 2025"},{"id":"http://arxiv.org/abs/2411.04394v2","updated":"2024-11-18T15:18:54Z","published":"2024-11-07T03:11:53Z","title":"Statistical-Computational Trade-offs for Recursive Adaptive Partitioning\n Estimators","summary":" Models based on recursive adaptive partitioning such as decision trees and\ntheir ensembles are popular for high-dimensional regression as they can\npotentially avoid the curse of dimensionality. Because empirical risk\nminimization (ERM) is computationally infeasible, these models are typically\ntrained using greedy algorithms. Although effective in many cases, these\nalgorithms have been empirically observed to get stuck at local optima. We\nexplore this phenomenon in the context of learning sparse regression functions\nover $d$ binary features, showing that when the true regression function $f^*$\ndoes not satisfy Abbe et al. (2022)'s Merged Staircase Property (MSP), greedy\ntraining requires $\\exp(\\Omega(d))$ to achieve low estimation error.\nConversely, when $f^*$ does satisfy MSP, greedy training can attain small\nestimation error with only $O(\\log d)$ samples. 
This dichotomy mirrors that of\ntwo-layer neural networks trained with stochastic gradient descent (SGD) in the\nmean-field regime, thereby establishing a head-to-head comparison between\nSGD-trained neural networks and greedy recursive partitioning estimators.\nFurthermore, ERM-trained recursive partitioning estimators achieve low\nestimation error with $O(\\log d)$ samples irrespective of whether $f^*$\nsatisfies MSP, thereby demonstrating a statistical-computational trade-off for\ngreedy training. Our proofs are based on a novel interpretation of greedy\nrecursive partitioning using stochastic process theory and a coupling technique\nthat may be of independent interest.\n","authors":["Yan Shuo Tan","Jason M. Klusowski","Krishnakumar Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2411.04394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01781v3","updated":"2024-11-18T15:11:11Z","published":"2024-06-03T20:52:34Z","title":"DEFT: Efficient Fine-Tuning of Diffusion Models by Learning the\n Generalised $h$-transform","summary":" Generative modelling paradigms based on denoising diffusion processes have\nemerged as a leading candidate for conditional sampling in inverse problems. In\nmany real-world applications, we often have access to large, expensively\ntrained unconditional diffusion models, which we aim to exploit for improving\nconditional sampling. Most recent approaches are motivated heuristically and\nlack a unifying framework, obscuring connections between them. Further, they\noften suffer from issues such as being very sensitive to hyperparameters, being\nexpensive to train or needing access to weights hidden behind a closed API. In\nthis work, we unify conditional training and sampling using the mathematically\nwell-understood Doob's h-transform. This new perspective allows us to unify\nmany existing methods under a common umbrella. 
Under this framework, we propose\nDEFT (Doob's h-transform Efficient FineTuning), a new approach for conditional\ngeneration that simply fine-tunes a very small network to quickly learn the\nconditional $h$-transform, while keeping the larger unconditional network\nunchanged. DEFT is much faster than existing baselines while achieving\nstate-of-the-art performance across a variety of linear and non-linear\nbenchmarks. On image reconstruction tasks, we achieve speedups of up to\n1.6$\\times$, while having the best perceptual quality on natural images and\nreconstruction performance on medical images. Further, we also provide initial\nexperiments on protein motif scaffolding and outperform reconstruction guidance\nmethods.\n","authors":["Alexander Denker","Francisco Vargas","Shreyas Padhy","Kieran Didi","Simon Mathis","Vincent Dutordoir","Riccardo Barbano","Emile Mathieu","Urszula Julia Komorowska","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2406.01781v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.09236"},{"id":"http://arxiv.org/abs/2411.11620v1","updated":"2024-11-18T14:49:12Z","published":"2024-11-18T14:49:12Z","title":"ST-Tree with Interpretability for Multivariate Time Series\n Classification","summary":" Multivariate time series classification is of great importance in practical\napplications and is a challenging task. However, deep neural network models\nsuch as Transformers exhibit high accuracy in multivariate time series\nclassification but lack interpretability and fail to provide insights into the\ndecision-making process. On the other hand, traditional approaches based on\ndecision tree classifiers offer clear decision processes but relatively lower\naccuracy. Swin Transformer (ST) addresses these issues by leveraging\nself-attention mechanisms to capture both fine-grained local patterns and\nglobal patterns. 
It can also model multi-scale feature representation learning,\nthereby providing a more comprehensive representation of time series features.\nTo tackle the aforementioned challenges, we propose ST-Tree with\ninterpretability for multivariate time series classification. Specifically, the\nST-Tree model combines ST as the backbone network with an additional neural\ntree model. This integration allows us to fully leverage the advantages of ST\nin learning time series context while providing interpretable decision\nprocesses through the neural tree. This enables researchers to gain clear\ninsights into the model's decision-making process and extract meaningful\ninterpretations. Through experimental evaluations on 10 UEA datasets, we\ndemonstrate that the ST-Tree model improves accuracy in multivariate time\nseries classification tasks and provides interpretability through visualizing\nthe decision-making process across different datasets.\n","authors":["Mingsen Du","Yanxuan Wei","Yingxia Tang","Xiangwei Zheng","Shoushui Wei","Cun Ji"],"pdf_url":"https://arxiv.org/pdf/2411.11620v1.pdf","comment":"Submitted on May 15, 2024, major revisions on Aug 31, 2024"},{"id":"http://arxiv.org/abs/2411.11619v1","updated":"2024-11-18T14:48:06Z","published":"2024-11-18T14:48:06Z","title":"FERT: Real-Time Facial Expression Recognition with Short-Range FMCW\n Radar","summary":" This study proposes a novel approach for real-time facial expression\nrecognition utilizing short-range Frequency-Modulated Continuous-Wave (FMCW)\nradar equipped with one transmit (Tx), and three receive (Rx) antennas. The\nsystem leverages four distinct modalities simultaneously: Range-Doppler images\n(RDIs), micro range-Doppler Images (micro-RDIs), range azimuth images (RAIs),\nand range elevation images (REIs). 
Our innovative architecture integrates\nfeature extractor blocks, intermediate feature extractor blocks, and a ResNet\nblock to accurately classify facial expressions into smile, anger, neutral, and\nno-face classes. Our model achieves an average classification accuracy of\n98.91% on the dataset collected using a 60 GHz short-range FMCW radar. The\nproposed solution operates in real-time in a person-independent manner, which\nshows the potential use of low-cost FMCW radars for effective facial expression\nrecognition in various applications.\n","authors":["Sabri Mustafa Kahya","Muhammet Sami Yavuz","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2411.11619v1.pdf","comment":"Accepted at IEEE SENSORS 2024"},{"id":"http://arxiv.org/abs/2411.11616v1","updated":"2024-11-18T14:42:15Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. 
The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v1.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2411.11614v1","updated":"2024-11-18T14:40:58Z","published":"2024-11-18T14:40:58Z","title":"On the physics of nested Markov models: a generalized probabilistic\n theory perspective","summary":" Determining potential probability distributions with a given causal graph is\nvital for causality studies. To bypass the difficulty in characterizing latent\nvariables in a Bayesian network, the nested Markov model provides an elegant\nalgebraic approach by listing exactly all the equality constraints on the\nobserved variables. However, this algebraically motivated causal model\ncomprises distributions outside Bayesian networks, and its physical\ninterpretation remains vague. In this work, we inspect the nested Markov model\nthrough the lens of generalized probabilistic theory, an axiomatic framework to\ndescribe general physical theories. We prove that all the equality constraints\ndefining the nested Markov model hold valid theory-independently. Yet, we show\nthis model generally contains distributions not implementable even within such\nrelaxed physical theories subjected to merely the relativity principles and\nmild probabilistic rules. To interpret the origin of such a gap, we establish a\nnew causal model that defines valid distributions as projected from a\nhigh-dimensional Bell-type causal structure. 
The new model unveils inequality\nconstraints induced by relativity principles, or equivalently high-dimensional\nconditional independences, which are absent in the nested Markov model.\nNevertheless, we also notice that the restrictions on states and measurements\nintroduced by the generalized probabilistic theory framework can pose\nadditional inequality constraints beyond the new causal model. As a by-product,\nwe discover a new causal structure exhibiting strict gaps between the\ndistribution sets of a Bayesian network, generalized probabilistic theories,\nand the nested Markov model. We anticipate our results will enlighten further\nexplorations on the unification of algebraic and physical perspectives of\ncausality.\n","authors":["Xingjian Zhang","Yuhao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11614v1.pdf","comment":"21 pages, 5 figures, 5 tables; Comments are welcome!"},{"id":"http://arxiv.org/abs/2406.07302v2","updated":"2024-11-18T14:40:54Z","published":"2024-06-11T14:30:34Z","title":"BertaQA: How Much Do Language Models Know About Local Culture?","summary":" Large Language Models (LLMs) exhibit extensive knowledge about the world, but\nmost evaluations have been limited to global or anglocentric subjects. This\nraises the question of how well these models perform on topics relevant to\nother cultures, whose presence on the web is not that prominent. To address\nthis gap, we introduce BertaQA, a multiple-choice trivia dataset that is\nparallel in English and Basque. The dataset consists of a local subset with\nquestions pertinent to the Basque culture, and a global subset with questions\nof broader interest. We find that state-of-the-art LLMs struggle with local\ncultural knowledge, even as they excel on global topics. However, we show that\ncontinued pre-training in Basque significantly improves the models' performance\non Basque culture, even when queried in English. 
To our knowledge, this is the\nfirst solid evidence of knowledge transfer from a low-resource to a\nhigh-resource language. Our analysis sheds light on the complex interplay\nbetween language and knowledge, and reveals that some prior findings do not\nfully hold when reassessed on local topics. Our dataset and evaluation code are\navailable under open licenses at https://github.com/juletx/BertaQA.\n","authors":["Julen Etxaniz","Gorka Azkune","Aitor Soroa","Oier Lopez de Lacalle","Mikel Artetxe"],"pdf_url":"https://arxiv.org/pdf/2406.07302v2.pdf","comment":"NEURIPS Datasets & Benchmarks 2024"},{"id":"http://arxiv.org/abs/2411.11603v1","updated":"2024-11-18T14:25:55Z","published":"2024-11-18T14:25:55Z","title":"Feature Selection for Network Intrusion Detection","summary":" Network Intrusion Detection (NID) remains a key area of research within the\ninformation security community, while also being relevant to Machine Learning\n(ML) practitioners. The latter generally aim to detect attacks using network\nfeatures, which have been extracted from raw network data typically using\ndimensionality reduction methods, such as principal component analysis (PCA).\nHowever, PCA is not able to assess the relevance of features for the task at\nhand. Consequently, the features available are of varying quality, with some\nbeing entirely non-informative. From this, two major drawbacks arise. Firstly,\ntrained and deployed models have to process large amounts of unnecessary data,\ntherefore draining potentially costly resources. Secondly, the noise caused by\nthe presence of irrelevant features can, in some cases, impede a model's\nability to detect an attack. In order to deal with these challenges, we present\nFeature Selection for Network Intrusion Detection (FSNID) a novel\ninformation-theoretic method that facilitates the exclusion of non-informative\nfeatures when detecting network intrusions. 
The proposed method is based on\nfunction approximation using a neural network, which enables a version of our\napproach that incorporates a recurrent layer. Consequently, this version\nuniquely enables the integration of temporal dependencies. Through an extensive\nset of experiments, we demonstrate that the proposed method selects a\nsignificantly reduced feature set, while maintaining NID performance. Code will\nbe made available upon publication.\n","authors":["Charles Westphal","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2411.11603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11048v2","updated":"2024-11-18T14:14:22Z","published":"2024-08-20T17:56:52Z","title":"RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual\n Dexterous Robot Hands","summary":" It has been a long-standing research goal to endow robot hands with\nhuman-level dexterity. Bi-manual robot piano playing constitutes a task that\ncombines challenges from dynamic tasks, such as generating fast while precise\nmotions, with slower but contact-rich manipulation problems. Although\nreinforcement learning based approaches have shown promising results in\nsingle-task performance, these methods struggle in a multi-song setting. Our\nwork aims to close this gap and, thereby, enable imitation learning approaches\nfor robot piano playing at scale. To this end, we introduce the Robot Piano 1\nMillion (RP1M) dataset, containing bi-manual robot piano playing motion data of\nmore than one million trajectories. We formulate finger placements as an\noptimal transport problem, thus, enabling automatic annotation of vast amounts\nof unlabeled songs. 
Benchmarking existing imitation learning approaches shows\nthat such approaches reach state-of-the-art robot piano playing performance by\nleveraging RP1M.\n","authors":["Yi Zhao","Le Chen","Jan Schneider","Quankai Gao","Juho Kannala","Bernhard Schölkopf","Joni Pajarinen","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2408.11048v2.pdf","comment":"Accepted by Conference on Robot Learning (CoRL) 2024. Project\n Website: https://rp1m.github.io/"},{"id":"http://arxiv.org/abs/2411.11592v1","updated":"2024-11-18T14:10:20Z","published":"2024-11-18T14:10:20Z","title":"Generative Spatio-temporal GraphNet for Transonic Wing Pressure\n Distribution Forecasting","summary":" This study presents a framework for predicting unsteady transonic wing\npressure distributions, integrating an autoencoder architecture with graph\nconvolutional networks and graph-based temporal layers to model time\ndependencies. The framework compresses high-dimensional pressure distribution\ndata into a lower-dimensional latent space using an autoencoder, ensuring\nefficient data representation while preserving essential features. Within this\nlatent space, graph-based temporal layers are employed to predict future wing\npressures based on past data, effectively capturing temporal dependencies and\nimproving predictive accuracy. This combined approach leverages the strengths\nof autoencoders for dimensionality reduction, graph convolutional networks for\nhandling unstructured grid data, and temporal layers for modeling time-based\nsequences. The effectiveness of the proposed framework is validated through its\napplication to the Benchmark Super Critical Wing test case, achieving accuracy\ncomparable to computational fluid dynamics, while significantly reducing\nprediction time. 
This framework offers a scalable, computationally efficient\nsolution for the aerodynamic analysis of unsteady phenomena.\n","authors":["Gabriele Immordino","Andrea Vaiuso","Andrea Da Ronch","Marcello Righi"],"pdf_url":"https://arxiv.org/pdf/2411.11592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11590v1","updated":"2024-11-18T14:09:01Z","published":"2024-11-18T14:09:01Z","title":"Robust Causal Analysis of Linear Cyclic Systems With Hidden Confounders","summary":" We live in a world full of complex systems which we need to improve our\nunderstanding of. To accomplish this, purely probabilistic investigations are\noften not enough. They are only the first step and must be followed by learning\nthe system's underlying mechanisms. This is what the discipline of causality is\nconcerned with. Many of those complex systems contain feedback loops which\nmeans that our methods have to allow for cyclic causal relations. Furthermore,\nsystems are rarely sufficiently isolated, which means that there are usually\nhidden confounders, i.e., unmeasured variables that each causally affects more\nthan one measured variable. Finally, data is often distorted by contaminating\nprocesses, and we need to apply methods that are robust against such\ndistortions. That's why we consider the robustness of LLC, see \\cite{llc}, one\nof the few causal analysis methods that can deal with cyclic models with hidden\nconfounders. Following a theoretical analysis of LLC's robustness properties,\nwe also provide robust extensions of LLC. 
To facilitate reproducibility and\nfurther research in this field, we make the source code publicly available.\n","authors":["Boris Lorbeer"],"pdf_url":"https://arxiv.org/pdf/2411.11590v1.pdf","comment":"18 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.14073v2","updated":"2024-11-18T14:06:10Z","published":"2024-05-23T00:35:23Z","title":"PEAC: Unsupervised Pre-training for Cross-Embodiment Reinforcement\n Learning","summary":" Designing generalizable agents capable of adapting to diverse embodiments has\nachieved significant attention in Reinforcement Learning (RL), which is\ncritical for deploying RL agents in various real-world applications. Previous\nCross-Embodiment RL approaches have focused on transferring knowledge across\nembodiments within specific tasks. These methods often result in knowledge\ntightly coupled with those tasks and fail to adequately capture the distinct\ncharacteristics of different embodiments. To address this limitation, we\nintroduce the notion of Cross-Embodiment Unsupervised RL (CEURL), which\nleverages unsupervised learning to enable agents to acquire embodiment-aware\nand task-agnostic knowledge through online interactions within reward-free\nenvironments. We formulate CEURL as a novel Controlled Embodiment Markov\nDecision Process (CE-MDP) and systematically analyze CEURL's pre-training\nobjectives under CE-MDP. Based on these analyses, we develop a novel algorithm\nPre-trained Embodiment-Aware Control (PEAC) for handling CEURL, incorporating\nan intrinsic reward function specifically designed for cross-embodiment\npre-training. PEAC not only provides an intuitive optimization strategy for\ncross-embodiment pre-training but also can integrate flexibly with existing\nunsupervised RL methods, facilitating cross-embodiment exploration and skill\ndiscovery. 
Extensive experiments in both simulated (e.g., DMC and Robosuite)\nand real-world environments (e.g., legged locomotion) demonstrate that PEAC\nsignificantly improves adaptation performance and cross-embodiment\ngeneralization, demonstrating its effectiveness in overcoming the unique\nchallenges of CEURL. The project page and code are in\nhttps://yingchengyang.github.io/ceurl.\n","authors":["Chengyang Ying","Zhongkai Hao","Xinning Zhou","Xuezhou Xu","Hang Su","Xingxing Zhang","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.14073v2.pdf","comment":"NeurIPS24"},{"id":"http://arxiv.org/abs/2402.16731v6","updated":"2024-11-18T14:05:29Z","published":"2024-02-26T16:52:35Z","title":"PyGim: An Efficient Graph Neural Network Library for Real\n Processing-In-Memory Architectures","summary":" Graph Neural Networks (GNNs) are emerging ML models to analyze\ngraph-structure data. Graph Neural Network (GNN) execution involves both\ncompute-intensive and memory-intensive kernels, the latter dominates the total\ntime, being significantly bottlenecked by data movement between memory and\nprocessors. Processing-In-Memory (PIM) systems can alleviate this data movement\nbottleneck by placing simple processors near or inside to memory arrays. In\nthis work, we introduce PyGim, an efficient ML library that accelerates GNNs on\nreal PIM systems. We propose intelligent parallelization techniques for\nmemory-intensive kernels of GNNs tailored for real PIM systems, and develop\nhandy Python API for them. We provide hybrid GNN execution, in which the\ncompute-intensive and memory-intensive kernels are executed in\nprocessor-centric and memory-centric computing systems, respectively. We\nextensively evaluate PyGim on a real-world PIM system with 1992 PIM cores using\nemerging GNN models, and demonstrate that it outperforms its state-of-the-art\nCPU counterpart on Intel Xeon by on average 3.04x, and achieves higher resource\nutilization than CPU and GPU systems. 
Our work provides useful recommendations\nfor software, system and hardware designers. PyGim is publicly available at\nhttps://github.com/CMU-SAFARI/PyGim.\n","authors":["Christina Giannoula","Peiming Yang","Ivan Fernandez","Jiacheng Yang","Sankeerth Durvasula","Yu Xin Li","Mohammad Sadrosadati","Juan Gomez Luna","Onur Mutlu","Gennady Pekhimenko"],"pdf_url":"https://arxiv.org/pdf/2402.16731v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11576v1","updated":"2024-11-18T13:54:44Z","published":"2024-11-18T13:54:44Z","title":"Hybrid Data-Driven SSM for Interpretable and Label-Free mmWave Channel\n Prediction","summary":" Accurate prediction of mmWave time-varying channels is essential for\nmitigating the issue of channel aging in complex scenarios owing to high user\nmobility. Existing channel prediction methods have limitations: classical\nmodel-based methods often struggle to track highly nonlinear channel dynamics\ndue to limited expert knowledge, while emerging data-driven methods typically\nrequire substantial labeled data for effective training and often lack\ninterpretability. To address these issues, this paper proposes a novel hybrid\nmethod that integrates a data-driven neural network into a conventional\nmodel-based workflow based on a state-space model (SSM), implicitly tracking\ncomplex channel dynamics from data without requiring precise expert knowledge.\nAdditionally, a novel unsupervised learning strategy is developed to train the\nembedded neural network solely with unlabeled data. Theoretical analyses and\nablation studies are conducted to interpret the enhanced benefits gained from\nthe hybrid integration. Numerical simulations based on the 3GPP mmWave channel\nmodel corroborate the superior prediction accuracy of the proposed method,\ncompared to state-of-the-art methods that are either purely model-based or\ndata-driven. 
Furthermore, extensive experiments validate its robustness against\nvarious challenging factors, including among others severe channel variations\nand high noise levels.\n","authors":["Yiyong Sun","Jiajun He","Zhidi Lin","Wenqiang Pu","Feng Yin","Hing Cheung So"],"pdf_url":"https://arxiv.org/pdf/2411.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11567v1","updated":"2024-11-18T13:40:03Z","published":"2024-11-18T13:40:03Z","title":"GNN-Based Code Annotation Logic for Establishing Security Boundaries in\n C Code","summary":" Securing sensitive operations in today's interconnected software landscape is\ncrucial yet challenging. Modern platforms rely on Trusted Execution\nEnvironments (TEEs), such as Intel SGX and ARM TrustZone, to isolate security\nsensitive code from the main system, reducing the Trusted Computing Base (TCB)\nand providing stronger assurances. However, identifying which code should\nreside in TEEs is complex and requires specialized expertise, which is not\nsupported by current automated tools. Existing solutions often migrate entire\napplications to TEEs, leading to suboptimal use and an increased TCB. To\naddress this gap, we propose Code Annotation Logic (CAL), a pioneering tool\nthat automatically identifies security sensitive components for TEE isolation.\nCAL analyzes codebases, leveraging a graph-based approach with novel feature\nconstruction and employing a custom graph neural network model to accurately\ndetermine which parts of the code should be isolated. CAL effectively optimizes\nTCB, reducing the burden of manual analysis and enhancing overall security. Our\ncontributions include the definition of security sensitive code, the\nconstruction and labeling of a comprehensive dataset of source files, a feature\nrich graph based data preparation pipeline, and the CAL model for TEE\nintegration. 
Evaluation results demonstrate CAL's efficacy in identifying\nsensitive code with a recall of 86.05%, an F1 score of 81.56%, and an\nidentification rate of 91.59% for security sensitive functions. By enabling\nefficient code isolation, CAL advances the secure development of applications\nusing TEEs, offering a practical solution for developers to reduce attack\nvectors.\n","authors":["Varun Gadey","Raphael Goetz","Christoph Sendner","Sampo Sovio","Alexandra Dmitrienko"],"pdf_url":"https://arxiv.org/pdf/2411.11567v1.pdf","comment":"Submitted to the IEEE Symposium on Security and Privacy 2025"},{"id":"http://arxiv.org/abs/2307.09218v3","updated":"2024-11-18T13:26:41Z","published":"2023-07-16T16:27:58Z","title":"A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual\n Learning","summary":" Forgetting refers to the loss or deterioration of previously acquired\nknowledge. While existing surveys on forgetting have primarily focused on\ncontinual learning, forgetting is a prevalent phenomenon observed in various\nother research domains within deep learning. Forgetting manifests in research\nfields such as generative models due to generator shifts, and federated\nlearning due to heterogeneous data distributions across clients. Addressing\nforgetting encompasses several challenges, including balancing the retention of\nold task knowledge with fast learning of new task, managing task interference\nwith conflicting goals, and preventing privacy leakage, etc. Moreover, most\nexisting surveys on continual learning implicitly assume that forgetting is\nalways harmful. In contrast, our survey argues that forgetting is a\ndouble-edged sword and can be beneficial and desirable in certain cases, such\nas privacy-preserving scenarios. By exploring forgetting in a broader context,\nwe present a more nuanced understanding of this phenomenon and highlight its\npotential advantages. 
Through this comprehensive survey, we aspire to uncover\npotential solutions by drawing upon ideas and approaches from various fields\nthat have dealt with forgetting. By examining forgetting beyond its\nconventional boundaries, we hope to encourage the development of novel\nstrategies for mitigating, harnessing, or even embracing forgetting in real\napplications. A comprehensive list of papers about forgetting in various\nresearch fields is available at\n\\url{https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning}.\n","authors":["Zhenyi Wang","Enneng Yang","Li Shen","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2307.09218v3.pdf","comment":"accepted at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2411.11556v1","updated":"2024-11-18T13:17:10Z","published":"2024-11-18T13:17:10Z","title":"Data-driven model reconstruction for nonlinear wave dynamics","summary":" The use of machine learning to predict wave dynamics is a topic of growing\ninterest, but commonly-used deep learning approaches suffer from a lack of\ninterpretability of the trained models. Here we present an interpretable\nmachine learning framework for analyzing the nonlinear evolution dynamics of\noptical wavepackets in complex wave media. We use sparse regression to reduce\nmicroscopic discrete lattice models to simpler effective continuum models which\ncan accurately describe the dynamics of the wavepacket envelope. We apply our\napproach to valley-Hall domain walls in honeycomb photonic lattices of\nlaser-written waveguides with Kerr-type nonlinearity and different boundary\nshapes. The reconstructed equations accurately reproduce the linear dispersion\nand nonlinear effects including self-steepening and self-focusing. This scheme\nis proven free of the a priori limitations imposed by the underlying hierarchy\nof scales traditionally employed in asymptotic analytical methods. 
It\nrepresents a powerful interpretable machine learning technique of interest for\nadvancing design capabilities in photonics and framing the complex\ninteraction-driven dynamics in various topological materials.\n","authors":["Ekaterina Smolina","Lev Smirnov","Daniel Leykam","Franco Nori","Daria Smirnova"],"pdf_url":"https://arxiv.org/pdf/2411.11556v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.11548v1","updated":"2024-11-18T13:06:29Z","published":"2024-11-18T13:06:29Z","title":"Real-Time Fitness Exercise Classification and Counting from Video Frames","summary":" This paper introduces a novel method for real-time exercise classification\nusing a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing\nexercise recognition approaches often rely on synthetic datasets, raw\ncoordinate inputs sensitive to user and camera variations, and fail to fully\nexploit the temporal dependencies in exercise movements. These issues limit\ntheir generalizability and robustness in real-world conditions, where lighting,\ncamera angles, and user body types vary.\n To address these challenges, we propose a BiLSTM-based model that leverages\ninvariant features, such as joint angles, alongside raw coordinates. By using\nboth angles and (x, y, z) coordinates, the model adapts to changes in\nperspective, user positioning, and body differences, improving generalization.\nTraining on 30-frame sequences enables the BiLSTM to capture the temporal\ncontext of exercises and recognize patterns evolving over time.\n We compiled a dataset combining synthetic data from the InfiniteRep dataset\nand real-world videos from Kaggle and other sources. This dataset includes four\ncommon exercises: squat, push-up, shoulder press, and bicep curl. The model was\ntrained and validated on these diverse datasets, achieving an accuracy of over\n99% on the test set. 
To assess generalizability, the model was tested on 2\nseparate test sets representative of typical usage conditions. Comparisons with\nthe previous approach from the literature are present in the result section\nshowing that the proposed model is the best-performing one.\n The classifier is integrated into a web application providing real-time\nexercise classification and repetition counting without manual exercise\nselection.\n Demo and datasets are available at the following GitHub Repository:\nhttps://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting.\n","authors":["Riccardo Riccio"],"pdf_url":"https://arxiv.org/pdf/2411.11548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07929v2","updated":"2024-11-18T12:55:45Z","published":"2023-11-14T06:15:16Z","title":"Variational Graph Autoencoder for Heterogeneous Information Networks\n with Missing and Inaccurate Attributes","summary":" Heterogeneous Information Networks (HINs), which consist of various types of\nnodes and edges, have recently demonstrated excellent performance in graph\nmining. However, most existing heterogeneous graph neural networks (HGNNs)\nignore the problems of missing attributes, inaccurate attributes and scarce\nlabels for nodes, which limits their expressiveness. In this paper, we propose\na generative self-supervised model GraMI to address these issues\nsimultaneously. Specifically, GraMI first initializes all the nodes in the\ngraph with a low-dimensional representation matrix. After that, based on the\nvariational graph autoencoder framework, GraMI learns both node-level and\nattribute-level embeddings in the encoder, which can provide fine-grained\nsemantic information to construct node attributes. In the decoder, GraMI\nreconstructs both links and attributes. 
Instead of directly reconstructing raw\nfeatures for attributed nodes, GraMI generates the initial low-dimensional\nrepresentation matrix for all the nodes, based on which raw features of\nattributed nodes are further reconstructed to leverage accurate attributes. In\nthis way, GraMI can not only complete informative features for non-attributed\nnodes, but rectify inaccurate ones for attributed nodes. Finally, we conduct\nextensive experiments to show the superiority of GraMI in tackling HINs with\nmissing and inaccurate attributes.\n","authors":["Yige Zhao","Jianxiang Yu","Yao Cheng","Chengcheng Yu","Yiding Liu","Xiang Li","Shuaiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07929v2.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2411.11536v1","updated":"2024-11-18T12:48:15Z","published":"2024-11-18T12:48:15Z","title":"Hierarchical-Graph-Structured Edge Partition Models for Learning\n Evolving Community Structure","summary":" We propose a novel dynamic network model to capture evolving latent\ncommunities within temporal networks. To achieve this, we decompose each\nobserved dynamic edge between vertices using a Poisson-gamma edge partition\nmodel, assigning each vertex to one or more latent communities through\n\\emph{nonnegative} vertex-community memberships. Specifically, hierarchical\ntransition kernels are employed to model the interactions between these latent\ncommunities in the observed temporal network. A hierarchical graph prior is\nplaced on the transition structure of the latent communities, allowing us to\nmodel how they evolve and interact over time. 
Consequently, our dynamic network\nenables the inferred community structure to merge, split, and interact with one\nanother, providing a comprehensive understanding of complex network dynamics.\nExperiments on various real-world network datasets demonstrate that the\nproposed model not only effectively uncovers interpretable latent structures\nbut also surpasses other state-of-the art dynamic network models in the tasks\nof link prediction and community detection.\n","authors":["Xincan Yu","Sikun Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11530v1","updated":"2024-11-18T12:40:39Z","published":"2024-11-18T12:40:39Z","title":"SeqProFT: Applying LoRA Finetuning for Sequence-only Protein Property\n Predictions","summary":" Protein language models (PLMs) are capable of learning the relationships\nbetween protein sequences and functions by treating amino acid sequences as\ntextual data in a self-supervised manner. However, fine-tuning these models\ntypically demands substantial computational resources and time, with results\nthat may not always be optimized for specific tasks. To overcome these\nchallenges, this study employs the LoRA method to perform end-to-end\nfine-tuning of the ESM-2 model specifically for protein property prediction\ntasks, utilizing only sequence information. Additionally, a multi-head\nattention mechanism is integrated into the downstream network to combine\nsequence features with contact map information, thereby enhancing the model's\ncomprehension of protein sequences. Experimental results of extensive\nclassification and regression tasks demonstrate that the fine-tuned model\nachieves strong performance and faster convergence across multiple regression\nand classification tasks.\n","authors":["Shuo Zhang","Jian K. 
Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22559v2","updated":"2024-11-18T12:36:04Z","published":"2024-10-29T21:54:18Z","title":"Unpicking Data at the Seams: VAEs, Disentanglement and Independent\n Components","summary":" Disentanglement, or identifying salient statistically independent factors of\nthe data, is of interest in many areas of machine learning and statistics, with\nrelevance to synthetic data generation with controlled properties, robust\nclassification of features, parsimonious encoding, and a greater understanding\nof the generative process underlying the data. Disentanglement arises in\nseveral generative paradigms, including Variational Autoencoders (VAEs),\nGenerative Adversarial Networks and diffusion models. Particular progress has\nrecently been made in understanding disentanglement in VAEs, where the choice\nof diagonal posterior covariance matrices is suggested to promote mutual\northogonality between columns of the decoder's Jacobian. We continue this\nthread to show how this linear independence translates to statistical\nindependence, completing the chain in understanding how the VAE's objective\nidentifies independent components of, or disentangles, the data.\n","authors":["Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2410.22559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11521v1","updated":"2024-11-18T12:31:22Z","published":"2024-11-18T12:31:22Z","title":"Preempting Text Sanitization Utility in Resource-Constrained\n Privacy-Preserving LLM Interactions","summary":" Individuals have been increasingly interacting with online Large Language\nModels (LLMs), both in their work and personal lives. These interactions raise\nprivacy issues as the LLMs are typically hosted by third-parties who can gather\na variety of sensitive information about users and their companies. 
Text\nSanitization techniques have been proposed in the literature and can be used to\nsanitize user prompts before sending them to the LLM. However, sanitization has\nan impact on the downstream task performed by the LLM, and often to such an\nextent that it leads to unacceptable results for the user. This is not just a\nminor annoyance, with clear monetary consequences as LLM services charge on a\nper use basis as well as great amount of computing resources wasted. We propose\nan architecture leveraging a Small Language Model (SLM) at the user-side to\nhelp estimate the impact of sanitization on a prompt before it is sent to the\nLLM, thus preventing resource losses.\n Our evaluation of this architecture revealed a significant problem with text\nsanitization based on Differential Privacy, on which we want to draw the\nattention of the community for further investigation.\n","authors":["Robin Carpentier","Benjamin Zi Hao Zhao","Hassan Jameel Asghar","Dali Kaafar"],"pdf_url":"https://arxiv.org/pdf/2411.11521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11520v1","updated":"2024-11-18T12:29:06Z","published":"2024-11-18T12:29:06Z","title":"A Pre-Trained Graph-Based Model for Adaptive Sequencing of Educational\n Documents","summary":" Massive Open Online Courses (MOOCs) have greatly contributed to making\neducation more accessible.However, many MOOCs maintain a rigid,\none-size-fits-all structure that fails to address the diverse needs and\nbackgrounds of individual learners.Learning path personalization aims to\naddress this limitation, by tailoring sequences of educational content to\noptimize individual student learning outcomes.Existing approaches, however,\noften require either massive student interaction data or extensive expert\nannotation, limiting their broad application.In this study, we introduce a\nnovel data-efficient framework for learning path personalization that operates\nwithout expert annotation.Our method employs a flexible recommender 
system\npre-trained with reinforcement learning on a dataset of raw course\nmaterials.Through experiments on semi-synthetic data, we show that this\npre-training stage substantially improves data-efficiency in a range of\nadaptive learning scenarios featuring new educational materials.This opens up\nnew perspectives for the design of foundation models for adaptive learning.\n","authors":["Jean Vassoyan","Anan Schütt","Jill-Jênn Vie","Arun-Balajiee Lekshmi-Narayanan","Elisabeth André","Nicolas Vayatis"],"pdf_url":"https://arxiv.org/pdf/2411.11520v1.pdf","comment":"NeurIPS 2024 Workshop on Large Foundation Models for Educational\n Assessment (FM-Assess), Dec 2024, Vancouver, Canada"},{"id":"http://arxiv.org/abs/2411.11516v1","updated":"2024-11-18T12:25:34Z","published":"2024-11-18T12:25:34Z","title":"Efficient Sample-optimal Learning of Gaussian Tree Models via\n Sample-optimal Testing of Gaussian Mutual Information","summary":" Learning high-dimensional distributions is a significant challenge in machine\nlearning and statistics. Classical research has mostly concentrated on\nasymptotic analysis of such data under suitable assumptions. While existing\nworks [Bhattacharyya et al.: SICOMP 2023, Daskalakis et al.: STOC 2021, Choo et\nal.: ALT 2024] focus on discrete distributions, the current work addresses the\ntree structure learning problem for Gaussian distributions, providing efficient\nalgorithms with solid theoretical guarantees. This is crucial as real-world\ndistributions are often continuous and differ from the discrete scenarios\nstudied in prior works.\n In this work, we design a conditional mutual information tester for Gaussian\nrandom variables that can test whether two Gaussian random variables are\nindependent, or their conditional mutual information is at least $\\varepsilon$,\nfor some parameter $\\varepsilon \\in (0,1)$ using\n$\\mathcal{O}(\\varepsilon^{-1})$ samples which we show to be near-optimal. 
In\ncontrast, an additive estimation would require $\\Omega(\\varepsilon^{-2})$\nsamples. Our upper bound technique uses linear regression on a pair of suitably\ntransformed random variables. Importantly, we show that the chain rule of\nconditional mutual information continues to hold for the estimated\n(conditional) mutual information. As an application of such a mutual\ninformation tester, we give an efficient $\\varepsilon$-approximate\nstructure-learning algorithm for an $n$-variate Gaussian tree model that takes\n$\\widetilde{\\Theta}(n\\varepsilon^{-1})$ samples which we again show to be\nnear-optimal. In contrast, when the underlying Gaussian model is not known to\nbe tree-structured, we show that $\\widetilde{{{\\Theta}}}(n^2\\varepsilon^{-2})$\nsamples are necessary and sufficient to output an $\\varepsilon$-approximate\ntree structure. We perform extensive experiments that corroborate our\ntheoretical convergence bounds.\n","authors":["Sutanu Gayen","Sanket Kale","Sayantan Sen"],"pdf_url":"https://arxiv.org/pdf/2411.11516v1.pdf","comment":"47 pages, 16 figures, abstract shortened as per arXiv criteria"},{"id":"http://arxiv.org/abs/2411.11515v1","updated":"2024-11-18T12:22:37Z","published":"2024-11-18T12:22:37Z","title":"Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to\n Enhance Cell Segmentation","summary":" Automated cell segmentation in microscopy images is essential for biomedical\nresearch, yet conventional methods are labor-intensive and prone to error.\nWhile deep learning-based approaches have proven effective, they often require\nlarge annotated datasets, which are scarce due to the challenges of manual\nannotation. To overcome this, we propose a novel framework for synthesizing\ndensely annotated 2D and 3D cell microscopy images using cascaded diffusion\nmodels. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations\nusing multi-level diffusion models and NeuS, a 3D surface reconstruction\napproach. 
Following that, a pretrained 2D Stable Diffusion model is finetuned\nto generate realistic cell textures and the final outputs are combined to form\ncell populations. We show that training a segmentation model with a combination\nof our synthetic data and real data improves cell segmentation performance by\nup to 9\\% across multiple datasets. Additionally, the FID scores indicate that\nthe synthetic data closely resembles real data. The code for our proposed\napproach will be available at\nhttps://github.com/ruveydayilmaz0/cascaded\\_diffusion.\n","authors":["Rüveyda Yilmaz","Kaan Keven","Yuli Wu","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2411.11515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11513v1","updated":"2024-11-18T12:21:48Z","published":"2024-11-18T12:21:48Z","title":"A Modular Open Source Framework for Genomic Variant Calling","summary":" Variant calling is a fundamental task in genomic research, essential for\ndetecting genetic variations such as single nucleotide polymorphisms (SNPs) and\ninsertions or deletions (indels). This paper presents an enhancement to\nDeepChem, a widely used open-source drug discovery framework, through the\nintegration of DeepVariant. In particular, we introduce a variant calling\npipeline that leverages DeepVariant's convolutional neural network (CNN)\narchitecture to improve the accuracy and reliability of variant detection. The\nimplemented pipeline includes stages for realignment of sequencing reads,\ncandidate variant detection, and pileup image generation, followed by variant\nclassification using a modified Inception v3 model. 
Our work adds a modular and\nextensible variant calling framework to the DeepChem framework and enables\nfuture work integrating DeepChem's drug discovery infrastructure more tightly\nwith bioinformatics pipelines.\n","authors":["Ankita Vaishnobi Bisoi","Bharath Ramsundar"],"pdf_url":"https://arxiv.org/pdf/2411.11513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11511v1","updated":"2024-11-18T12:16:03Z","published":"2024-11-18T12:16:03Z","title":"Structure learning with Temporal Gaussian Mixture for model-based\n Reinforcement Learning","summary":" Model-based reinforcement learning refers to a set of approaches capable of\nsample-efficient decision making, which create an explicit model of the\nenvironment. This model can subsequently be used for learning optimal policies.\nIn this paper, we propose a temporal Gaussian Mixture Model composed of a\nperception model and a transition model. The perception model extracts discrete\n(latent) states from continuous observations using a variational Gaussian\nmixture likelihood. Importantly, our model constantly monitors the collected\ndata searching for new Gaussian components, i.e., the perception model performs\na form of structure learning (Smith et al., 2020; Friston et al., 2018; Neacsu\net al., 2022) as it learns the number of Gaussian components in the mixture.\nAdditionally, the transition model learns the temporal transition between\nconsecutive time steps by taking advantage of the Dirichlet-categorical\nconjugacy. Both the perception and transition models are able to forget part of\nthe data points, while integrating the information they provide within the\nprior, which ensure fast variational inference. Finally, decision making is\nperformed with a variant of Q-learning which is able to learn Q-values from\nbeliefs over states. 
Empirically, we have demonstrated the model's ability to\nlearn the structure of several mazes: the model discovered the number of states\nand the transition probabilities between these states. Moreover, using its\nlearned Q-values, the agent was able to successfully navigate from the starting\nposition to the maze's exit.\n","authors":["Théophile Champion","Marek Grześ","Howard Bowman"],"pdf_url":"https://arxiv.org/pdf/2411.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05579v2","updated":"2024-11-18T12:01:29Z","published":"2023-01-13T14:38:24Z","title":"A survey and taxonomy of loss functions in machine learning","summary":" Most state-of-the-art machine learning techniques revolve around the\noptimisation of loss functions. Defining appropriate loss functions is\ntherefore critical to successfully solving problems in this field. In this\nsurvey, we present a comprehensive overview of the most widely used loss\nfunctions across key applications, including regression, classification,\ngenerative modeling, ranking, and energy-based modeling. We introduce 43\ndistinct loss functions, structured within an intuitive taxonomy that clarifies\ntheir theoretical foundations, properties, and optimal application contexts.\nThis survey is intended as a resource for undergraduate, graduate, and Ph.D.\nstudents, as well as researchers seeking a deeper understanding of loss\nfunctions.\n","authors":["Lorenzo Ciampiconi","Adam Elwood","Marco Leonardi","Ashraf Mohamed","Alessandro Rozza"],"pdf_url":"https://arxiv.org/pdf/2301.05579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11497v1","updated":"2024-11-18T11:58:20Z","published":"2024-11-18T11:58:20Z","title":"Physics Encoded Blocks in Residual Neural Network Architectures for\n Digital Twin Models","summary":" Physics Informed Machine Learning has emerged as a popular approach in\nmodelling and simulation for digital twins to generate accurate models of\nprocesses and behaviours of real-world systems. 
However, despite their success\nin generating accurate and reliable models, the existing methods either use\nsimple regularizations in loss functions to offer limited physics integration\nor are too specific in architectural definitions to be generalized to a wide\nvariety of physical systems. This paper presents a generic approach based on a\nnovel physics-encoded residual neural network architecture to combine\ndata-driven and physics-based analytical models to address these limitations.\nOur method combines physics blocks as mathematical operators from physics-based\nmodels with learning blocks comprising feed-forward layers. Intermediate\nresidual blocks are incorporated for stable gradient flow as they train on\nphysical system observation data. This way, the model learns to comply with the\ngeometric and kinematic aspects of the physical system. Compared to\nconventional neural network-based methods, our method improves generalizability\nwith substantially low data requirements and model complexity in terms of\nparameters, especially in scenarios where prior physics knowledge is either\nelementary or incomplete. We investigate our approach in two application\ndomains. The first is a basic robotic motion model using Euler Lagrangian\nequations of motion as physics prior. The second application is a complex\nscenario of a steering model for a self-driving vehicle in a simulation. 
In\nboth applications, our method outperforms both conventional neural network\nbased approaches as-well as state-of-the-art Physics Informed Machine Learning\nmethods.\n","authors":["Muhammad Saad Zia","Ashiq Anjum","Lu Liu","Anthony Conway","Anasol Pena Rios"],"pdf_url":"https://arxiv.org/pdf/2411.11497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11494v1","updated":"2024-11-18T11:55:38Z","published":"2024-11-18T11:55:38Z","title":"Alien Recombination: Exploring Concept Blends Beyond Human Cognitive\n Availability in Visual Art","summary":" While AI models have demonstrated remarkable capabilities in constrained\ndomains like game strategy, their potential for genuine creativity in\nopen-ended domains like art remains debated. We explore this question by\nexamining how AI can transcend human cognitive limitations in visual art\ncreation. Our research hypothesizes that visual art contains a vast unexplored\nspace of conceptual combinations, constrained not by inherent incompatibility,\nbut by cognitive limitations imposed by artists' cultural, temporal,\ngeographical and social contexts.\n To test this hypothesis, we present the Alien Recombination method, a novel\napproach utilizing fine-tuned large language models to identify and generate\nconcept combinations that lie beyond human cognitive availability. The system\nmodels and deliberately counteracts human availability bias, the tendency to\nrely on immediately accessible examples, to discover novel artistic\ncombinations.\n This system not only produces combinations that have never been attempted\nbefore within our dataset but also identifies and generates combinations that\nare cognitively unavailable to all artists in the domain. Furthermore, we\ntranslate these combinations into visual representations, enabling the\nexploration of subjective perceptions of novelty. 
Our findings suggest that\ncognitive unavailability is a promising metric for optimizing artistic novelty,\noutperforming merely temperature scaling without additional evaluation\ncriteria. This approach uses generative models to connect previously\nunconnected ideas, providing new insight into the potential of framing\nAI-driven creativity as a combinatorial problem.\n","authors":["Alejandro Hernandez","Levin Brinkmann","Ignacio Serna","Nasim Rahaman","Hassan Abu Alhaija","Hiromu Yakura","Mar Canet Sola","Bernhard Schölkopf","Iyad Rahwan"],"pdf_url":"https://arxiv.org/pdf/2411.11494v1.pdf","comment":"NeurIPS 2024 Workshop on Creativity & Generative AI, 13 pages, 11\n figures"},{"id":"http://arxiv.org/abs/2406.12060v3","updated":"2024-11-18T11:51:38Z","published":"2024-06-17T20:00:04Z","title":"Not Eliminate but Aggregate: Post-Hoc Control over Mixture-of-Experts to\n Address Shortcut Shifts in Natural Language Understanding","summary":" Recent models for natural language understanding are inclined to exploit\nsimple patterns in datasets, commonly known as shortcuts. These shortcuts hinge\non spurious correlations between labels and latent features existing in the\ntraining data. At inference time, shortcut-dependent models are likely to\ngenerate erroneous predictions under distribution shifts, particularly when\nsome latent features are no longer correlated with the labels. To avoid this,\nprevious studies have trained models to eliminate the reliance on shortcuts. In\nthis study, we explore a different direction: pessimistically aggregating the\npredictions of a mixture-of-experts, assuming each expert captures relatively\ndifferent latent features. The experimental results demonstrate that our\npost-hoc control over the experts significantly enhances the model's robustness\nto the distribution shift in shortcuts. Besides, we show that our approach has\nsome practical advantages. 
We also analyze our model and provide results to\nsupport the assumption.\n","authors":["Ukyo Honda","Tatsushi Oka","Peinan Zhang","Masato Mita"],"pdf_url":"https://arxiv.org/pdf/2406.12060v3.pdf","comment":"21 pages, 5 figures (the layout differs from the MIT Press\n publication version)"},{"id":"http://arxiv.org/abs/2411.11474v1","updated":"2024-11-18T11:16:13Z","published":"2024-11-18T11:16:13Z","title":"Graph Artificial Intelligence for Quantifying Compatibility Mechanisms\n in Traditional Chinese Medicine","summary":" Traditional Chinese Medicine (TCM) involves complex compatibility mechanisms\ncharacterized by multi-component and multi-target interactions, which are\nchallenging to quantify. To address this challenge, we applied graph artificial\nintelligence to develop a TCM multi-dimensional knowledge graph that bridges\ntraditional TCM theory and modern biomedical science\n(https://zenodo.org/records/13763953 ). Using feature engineering and\nembedding, we processed key TCM terminology and Chinese herbal pieces (CHP),\nintroducing medicinal properties as virtual nodes and employing graph neural\nnetworks with attention mechanisms to model and analyze 6,080 Chinese herbal\nformulas (CHF). Our method quantitatively assessed the roles of CHP within CHF\nand was validated using 215 CHF designed for COVID-19 management. With\ninterpretable models, open-source data, and code\n(https://github.com/ZENGJingqi/GraphAI-for-TCM ), this study provides robust\ntools for advancing TCM theory and drug discovery.\n","authors":["Jingqi Zeng","Xiaobin Jia"],"pdf_url":"https://arxiv.org/pdf/2411.11474v1.pdf","comment":"10 pages, 5 figures. 
Includes open-source dataset and code for\n reproducibility"},{"id":"http://arxiv.org/abs/2405.18009v2","updated":"2024-11-18T11:15:56Z","published":"2024-05-28T09:50:46Z","title":"Exploring Context Window of Large Language Models via Decomposed\n Positional Vectors","summary":" Transformer-based large language models (LLMs) typically have a limited\ncontext window, resulting in significant performance degradation when\nprocessing text beyond the length of the context window. Extensive studies have\nbeen proposed to extend the context window and achieve length extrapolation of\nLLMs, but there is still a lack of in-depth interpretation of these approaches.\nIn this study, we explore the positional information within and beyond the\ncontext window for deciphering the underlying mechanism of LLMs. By using a\nmean-based decomposition method, we disentangle positional vectors from hidden\nstates of LLMs and analyze their formation and effect on attention.\nFurthermore, when texts exceed the context window, we analyze the change of\npositional vectors in two settings, i.e., direct extrapolation and context\nwindow extension. Based on our findings, we design two training-free context\nwindow extension methods, positional vector replacement and attention window\nextension. Experimental results show that our methods can effectively extend\nthe context window length.\n","authors":["Zican Dong","Junyi Li","Xin Men","Wayne Xin Zhao","Bingbing Wang","Zhen Tian","Weipeng Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.18009v2.pdf","comment":"Accepted by Neurips 2024 as a spotlight"},{"id":"http://arxiv.org/abs/2405.20821v2","updated":"2024-11-18T11:06:59Z","published":"2024-05-31T14:15:44Z","title":"Pursuing Overall Welfare in Federated Learning through Sequential\n Decision Making","summary":" In traditional federated learning, a single global model cannot perform\nequally well for all clients. 
Therefore, the need to achieve the client-level\nfairness in federated system has been emphasized, which can be realized by\nmodifying the static aggregation scheme for updating the global model to an\nadaptive one, in response to the local signals of the participating clients.\nOur work reveals that existing fairness-aware aggregation strategies can be\nunified into an online convex optimization framework, in other words, a central\nserver's sequential decision making process. To enhance the decision making\ncapability, we propose simple and intuitive improvements for suboptimal designs\nwithin existing methods, presenting AAggFF. Considering practical requirements,\nwe further subdivide our method tailored for the cross-device and the\ncross-silo settings, respectively. Theoretical analyses guarantee sublinear\nregret upper bounds for both settings: $\\mathcal{O}(\\sqrt{T \\log{K}})$ for the\ncross-device setting, and $\\mathcal{O}(K \\log{T})$ for the cross-silo setting,\nwith $K$ clients and $T$ federation rounds. Extensive experiments demonstrate\nthat the federated system equipped with AAggFF achieves better degree of\nclient-level fairness than existing methods in both practical settings. Code is\navailable at https://github.com/vaseline555/AAggFF\n","authors":["Seok-Ju Hahn","Gi-Soo Kim","Junghye Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20821v2.pdf","comment":"Accepted at ICML 2024; added missing but important references, fixed\n typos"},{"id":"http://arxiv.org/abs/2411.11467v1","updated":"2024-11-18T11:03:15Z","published":"2024-11-18T11:03:15Z","title":"Physics meets Topology: Physics-informed topological neural networks for\n learning rigid body dynamics","summary":" Rigid body interactions are fundamental to numerous scientific disciplines,\nbut remain challenging to simulate due to their abrupt nonlinear nature and\nsensitivity to complex, often unknown environmental factors. 
These challenges\ncall for adaptable learning-based methods capable of capturing complex\ninteractions beyond explicit physical models and simulations. While graph\nneural networks can handle simple scenarios, they struggle with complex scenes\nand long-term predictions. We introduce a novel framework for modeling rigid\nbody dynamics and learning collision interactions, addressing key limitations\nof existing graph-based methods. Our approach extends the traditional\nrepresentation of meshes by incorporating higher-order topology complexes,\noffering a physically consistent representation. Additionally, we propose a\nphysics-informed message-passing neural architecture, embedding physical laws\ndirectly in the model. Our method demonstrates superior accuracy, even during\nlong rollouts, and exhibits strong generalization to unseen scenarios.\nImportantly, this work addresses the challenge of multi-entity dynamic\ninteractions, with applications spanning diverse scientific and engineering\ndomains.\n","authors":["Amaury Wei","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2411.11467v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.11465v1","updated":"2024-11-18T10:58:46Z","published":"2024-11-18T10:58:46Z","title":"Re-examining learning linear functions in context","summary":" In context learning (ICL) is an attractive method of solving a wide range of\nproblems. Inspired by Garg et al. (2022), we look closely at ICL in a variety\nof train and test settings for several transformer models of different sizes\ntrained from scratch. Our study complements prior work by pointing out several\nsystematic failures of these models to generalize to data not in the training\ndistribution, thereby showing some limitations of ICL. 
We find that models\nadopt a strategy for this task that is very different from standard solutions.\n","authors":["Omar Naim","Guilhem Fouilhé","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2411.11465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11464v1","updated":"2024-11-18T10:58:16Z","published":"2024-11-18T10:58:16Z","title":"PALMS: Parallel Adaptive Lasso with Multi-directional Signals for Latent\n Networks Reconstruction","summary":" Large-scale networks exist in many field and play an important role in\nreal-world dynamics. However, the networks are usually latent and expensive to\ndetect, which becomes the main challenging for many applications and empirical\nanalysis. Several statistical methods were proposed to infer the edges, but the\ncomplexity of algorithms make them hard to be applied for large-scale networks.\nIn this paper, we proposed a general distributed and parallel computing\nframework for network reconstruction methods via compressive sensing technical,\nto make them feasible for inferring the super large networks in practice.\nCombining with the CALMS, we proposed for those estimators enjoy additional\ntheoretical properties, such as the consistency and asymptotic normality, we\nprove that the approximate estimation utilizing the distributed algorithm can\nkeep the theoretical results.\n","authors":["Zhaoyu Xing","Wei Zhong"],"pdf_url":"https://arxiv.org/pdf/2411.11464v1.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2403.03276v2","updated":"2024-11-18T10:46:04Z","published":"2024-03-05T19:15:17Z","title":"ARNN: Attentive Recurrent Neural Network for Multi-channel EEG Signals\n to Identify Epileptic Seizures","summary":" Electroencephalography (EEG) is a widely used tool for diagnosing brain\ndisorders due to its high temporal resolution, non-invasive nature, and\naffordability. 
Manual analysis of EEG is labor-intensive and requires\nexpertise, making automatic EEG interpretation crucial for reducing workload\nand accurately assessing seizures. In epilepsy diagnosis, prolonged EEG\nmonitoring generates extensive data, often spanning hours, days, or even weeks.\nWhile machine learning techniques for automatic EEG interpretation have\nadvanced significantly in recent decades, there remains a gap in its ability to\nefficiently analyze large datasets with a balance of accuracy and computational\nefficiency. To address the challenges mentioned above, an Attention Recurrent\nNeural Network (ARNN) is proposed that can process a large amount of data\nefficiently and accurately. This ARNN cell recurrently applies attention layers\nalong a sequence and has linear complexity with the sequence length and\nleverages parallel computation by processing multi-channel EEG signals rather\nthan single-channel signals. In this architecture, the attention layer is a\ncomputational unit that efficiently applies self-attention and cross-attention\nmechanisms to compute a recurrent function over a wide number of state vectors\nand input signals. This framework is inspired in part by the attention layer\nand long short-term memory (LSTM) cells, but it scales this typical cell up by\nseveral orders to parallelize for multi-channel EEG signals. It inherits the\nadvantages of attention layers and LSTM gate while avoiding their respective\ndrawbacks. 
The model's effectiveness is evaluated through extensive experiments\nwith heterogeneous datasets, including the CHB-MIT and UPenn and Mayo's Clinic\ndatasets.\n","authors":["Salim Rukhsar","Anil Kumar Tiwari"],"pdf_url":"https://arxiv.org/pdf/2403.03276v2.pdf","comment":"11 pages, 7 figures, Journal Paper"},{"id":"http://arxiv.org/abs/2411.11457v1","updated":"2024-11-18T10:44:20Z","published":"2024-11-18T10:44:20Z","title":"Upside-Down Reinforcement Learning for More Interpretable Optimal\n Control","summary":" Model-Free Reinforcement Learning (RL) algorithms either learn how to map\nstates to expected rewards or search for policies that can maximize a certain\nperformance function. Model-Based algorithms instead, aim to learn an\napproximation of the underlying model of the RL environment and then use it in\ncombination with planning algorithms. Upside-Down Reinforcement Learning (UDRL)\nis a novel learning paradigm that aims to learn how to predict actions from\nstates and desired commands. This task is formulated as a Supervised Learning\nproblem and has successfully been tackled by Neural Networks (NNs). In this\npaper, we investigate whether function approximation algorithms other than NNs\ncan also be used within a UDRL framework. 
Our experiments, performed over\nseveral popular optimal control benchmarks, show that tree-based methods like\nRandom Forests and Extremely Randomized Trees can perform just as well as NNs\nwith the significant benefit of resulting in policies that are inherently more\ninterpretable than NNs, therefore paving the way for more transparent, safe,\nand robust RL.\n","authors":["Juan Cardenas-Cartagena","Massimiliano Falzari","Marco Zullich","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2411.11457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15019v2","updated":"2024-11-18T10:35:37Z","published":"2024-09-23T13:46:38Z","title":"Evaluating Synthetic Activations composed of SAE Latents in GPT-2","summary":" Sparse Auto-Encoders (SAEs) are commonly employed in mechanistic\ninterpretability to decompose the residual stream into monosemantic SAE\nlatents. Recent work demonstrates that perturbing a model's activations at an\nearly layer results in a step-function-like change in the model's final layer\nactivations. Furthermore, the model's sensitivity to this perturbation differs\nbetween model-generated (real) activations and random activations. In our\nstudy, we assess model sensitivity in order to compare real activations to\nsynthetic activations composed of SAE latents. Our findings indicate that\nsynthetic activations closely resemble real activations when we control for the\nsparsity and cosine similarity of the constituent SAE latents. This suggests\nthat real activations cannot be explained by a simple \"bag of SAE latents\"\nlacking internal structure, and instead suggests that SAE latents possess\nsignificant geometric and statistical properties. 
Notably, we observe that our\nsynthetic activations exhibit less pronounced activation plateaus compared to\nthose typically surrounding real activations.\n","authors":["Giorgi Giglemiani","Nora Petrova","Chatrik Singh Mangat","Jett Janiak","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.15019v2.pdf","comment":"Presented at the Attributing Model Behavior at Scale (ATTRIB)\n workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.17113v4","updated":"2024-11-18T10:32:32Z","published":"2024-09-25T17:27:02Z","title":"Characterizing stable regions in the residual stream of LLMs","summary":" We identify stable regions in the residual stream of Transformers, where the\nmodel's output remains insensitive to small activation changes, but exhibits\nhigh sensitivity at region boundaries. These regions emerge during training and\nbecome more defined as training progresses or model size increases. The regions\nappear to be much larger than previously studied polytopes. Our analysis\nsuggests that these stable regions align with semantic distinctions, where\nsimilar prompts cluster within regions, and activations from the same region\nlead to similar next token predictions. 
This work provides a promising research\ndirection for understanding the complexity of neural networks, shedding light\non training dynamics, and advancing interpretability.\n","authors":["Jett Janiak","Jacek Karwowski","Chatrik Singh Mangat","Giorgi Giglemiani","Nora Petrova","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.17113v4.pdf","comment":"Presented at the Scientific Methods for Understanding Deep Learning\n (SciForDL) workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11448v1","updated":"2024-11-18T10:30:34Z","published":"2024-11-18T10:30:34Z","title":"Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting","summary":" Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown\nsignificant promise in traffic forecasting by effectively modeling temporal and\nspatial correlations. However, rapid urbanization in recent years has led to\ndynamic shifts in traffic patterns and travel demand, posing major challenges\nfor accurate long-term traffic prediction. The generalization capability of\nST-GNNs in extended temporal scenarios and cross-city applications remains\nlargely unexplored. In this study, we evaluate state-of-the-art models on an\nextended traffic benchmark and observe substantial performance degradation in\nexisting ST-GNNs over time, which we attribute to their limited inductive\ncapabilities. Our analysis reveals that this degradation stems from an\ninability to adapt to evolving spatial relationships within urban environments.\nTo address this limitation, we reconsider the design of adaptive embeddings and\npropose a Principal Component Analysis (PCA) embedding approach that enables\nmodels to adapt to new scenarios without retraining. We incorporate PCA\nembeddings into existing ST-GNN and Transformer architectures, achieving marked\nimprovements in performance. 
Notably, PCA embeddings allow for flexibility in\ngraph structures between training and testing, enabling models trained on one\ncity to perform zero-shot predictions on other cities. This adaptability\ndemonstrates the potential of PCA embeddings in enhancing the robustness and\ngeneralization of spatiotemporal models.\n","authors":["Hongjun Wang","Jiyuan Chen","Lingyu Zhang","Renhe Jiang","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2411.11448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12555v2","updated":"2024-11-18T10:20:35Z","published":"2024-10-16T13:32:35Z","title":"Investigating Sensitive Directions in GPT-2: An Improved Baseline and\n Comparative Analysis of SAEs","summary":" Sensitive directions experiments attempt to understand the computational\nfeatures of Language Models (LMs) by measuring how much the next token\nprediction probabilities change by perturbing activations along specific\ndirections. We extend the sensitive directions work by introducing an improved\nbaseline for perturbation directions. We demonstrate that KL divergence for\nSparse Autoencoder (SAE) reconstruction errors are no longer pathologically\nhigh compared to the improved baseline. We also show that feature directions\nuncovered by SAEs have varying impacts on model outputs depending on the SAE's\nsparsity, with lower L0 SAE feature directions exerting a greater influence.\nAdditionally, we find that end-to-end SAE features do not exhibit stronger\neffects on model outputs compared to traditional SAEs.\n","authors":["Daniel J. 
Lee","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2410.12555v2.pdf","comment":"Presented at the Attributing Model Behavior at Scale (ATTRIB) and\n Scientific Methods for Understanding Deep Learning (SciForDL) workshops at\n NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.05472v2","updated":"2024-11-18T10:19:17Z","published":"2024-08-10T07:42:01Z","title":"FuXi Weather: A data-to-forecast machine learning system for global\n weather","summary":" Weather forecasting traditionally relies on numerical weather prediction\n(NWP) systems that integrates global observational systems, data assimilation\n(DA), and forecasting models. Despite steady improvements in forecast accuracy\nover recent decades, further advances are increasingly constrained by high\ncomputational costs, the underutilization of vast observational datasets, and\nthe challenges of obtaining finer resolution. These limitations, alongside the\nuneven distribution of observational networks, result in global disparities in\nforecast accuracy, leaving some regions vulnerable to extreme weather. Recent\nadvances in machine learning present a promising alternative, providing more\nefficient and accurate forecasts using the same initial conditions as NWP.\nHowever, current machine learning models still depend on the initial conditions\ngenerated by NWP systems, which require extensive computational resources and\nexpertise. Here we introduce FuXi Weather, a machine learning weather\nforecasting system that assimilates data from multiple satellites. Operating on\na 6-hourly DA and forecast cycle, FuXi Weather generates reliable and accurate\n10-day global weather forecasts at a spatial resolution of $0.25^\\circ$. 
FuXi\nWeather is the first system to achieve all-grid, all-surface, all-channel, and\nall-sky DA and forecasting, extending skillful forecast lead times beyond those\nof the European Centre for Medium-range Weather Forecasts (ECMWF)\nhigh-resolution forecasts (HRES) while using significantly fewer observations.\nFuXi Weather consistently outperforms ECMWF HRES in observation-sparse regions,\nsuch as central Africa, demonstrating its potential to improve forecasts where\nobservational infrastructure is limited.\n","authors":["Xiuyu Sun","Xiaohui Zhong","Xiaoze Xu","Yuanqing Huang","Hao Li","J. David Neelin","Deliang Chen","Jie Feng","Wei Han","Libo Wu","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2408.05472v2.pdf","comment":"73 pages"},{"id":"http://arxiv.org/abs/2408.15590v2","updated":"2024-11-18T10:17:25Z","published":"2024-08-28T07:26:30Z","title":"Bayesian optimization of atomic structures with prior probabilities from\n universal interatomic potentials","summary":" The optimization of atomic structures plays a pivotal role in understanding\nand designing materials with desired properties. However, conventional\ncomputational methods often struggle with the formidable task of navigating the\nvast potential energy surface, especially in high-dimensional spaces with\nnumerous local minima. Recent advancements in machine learning-driven surrogate\nmodels offer a promising avenue for alleviating this computational burden. In\nthis study, we propose a novel approach that combines the strengths of\nuniversal machine learning potentials with a Bayesian approach using Gaussian\nprocesses. By using the machine learning potentials as priors for the Gaussian\nprocess, the Gaussian process has to learn only the difference between the\nmachine learning potential and the target energy surface calculated for example\nby density functional theory. 
This turns out to improve the speed by which the\nglobal optimal structure is identified across diverse systems for a\nwell-behaved machine learning potential. The approach is tested on periodic\nbulk materials, surface structures, and a cluster.\n","authors":["Peder Lyngby","Casper Larsen","Karsten Wedel Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2408.15590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10153v2","updated":"2024-11-18T10:16:14Z","published":"2024-11-15T12:52:02Z","title":"BONE: a unifying framework for Bayesian online learning in\n non-stationary environments","summary":" We propose a unifying framework for methods that perform Bayesian online\nlearning in non-stationary environments. We call the framework BONE, which\nstands for (B)ayesian (O)nline learning in (N)on-stationary (E)nvironments.\nBONE provides a common structure to tackle a variety of problems, including\nonline continual learning, prequential forecasting, and contextual bandits. The\nframework requires specifying three modelling choices: (i) a model for\nmeasurements (e.g., a neural network), (ii) an auxiliary process to model\nnon-stationarity (e.g., the time since the last changepoint), and (iii) a\nconditional prior over model parameters (e.g., a multivariate Gaussian). The\nframework also requires two algorithmic choices, which we use to carry out\napproximate inference under this framework: (i) an algorithm to estimate\nbeliefs (posterior distribution) about the model parameters given the auxiliary\nvariable, and (ii) an algorithm to estimate beliefs about the auxiliary\nvariable. We show how this modularity allows us to write many different\nexisting methods as instances of BONE; we also use this framework to propose a\nnew method. 
We then experimentally compare existing methods with our proposed\nnew method on several datasets; we provide insights into the situations that\nmake one method more suitable than another for a given task.\n","authors":["Gerardo Duran-Martin","Leandro Sánchez-Betancourt","Alexander Y. Shestopaloff","Kevin Murphy"],"pdf_url":"https://arxiv.org/pdf/2411.10153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11436v1","updated":"2024-11-18T10:08:05Z","published":"2024-11-18T10:08:05Z","title":"Implicit Regularization for Multi-label Feature Selection","summary":" In this paper, we address the problem of feature selection in the context of\nmulti-label learning, by using a new estimator based on implicit regularization\nand label embedding. Unlike the sparse feature selection methods that use a\npenalized estimator with explicit regularization terms such as $l_{2,1}$-norm,\nMCP or SCAD, we propose a simple alternative method via Hadamard product\nparameterization. In order to guide the feature selection process, a latent\nsemantic of multi-label information method is adopted, as a label embedding.\nExperimental results on some known benchmark datasets suggest that the proposed\nestimator suffers much less from extra bias, and may lead to benign\noverfitting.\n","authors":["Dou El Kefel Mansouri","Khalid Benabdeslem","Seif-Eddine Benkabou"],"pdf_url":"https://arxiv.org/pdf/2411.11436v1.pdf","comment":"11 pages, 7 figures, My paper is currently under review at TPAMI\n journal"},{"id":"http://arxiv.org/abs/2403.10250v2","updated":"2024-11-18T10:06:01Z","published":"2024-03-15T12:38:00Z","title":"Interpretable Machine Learning for Survival Analysis","summary":" With the spread and rapid advancement of black box machine learning models,\nthe field of interpretable machine learning (IML) or explainable artificial\nintelligence (XAI) has become increasingly important over the last decade. 
This\nis particularly relevant for survival analysis, where the adoption of IML\ntechniques promotes transparency, accountability and fairness in sensitive\nareas, such as clinical decision making processes, the development of targeted\ntherapies, interventions or in other medical or healthcare related contexts.\nMore specifically, explainability can uncover a survival model's potential\nbiases and limitations and provide more mathematically sound ways to understand\nhow and which features are influential for prediction or constitute risk\nfactors. However, the lack of readily available IML methods may have deterred\nmedical practitioners and policy makers in public health from leveraging the\nfull potential of machine learning for predicting time-to-event data. We\npresent a comprehensive review of the limited existing amount of work on IML\nmethods for survival analysis within the context of the general IML taxonomy.\nIn addition, we formally detail how commonly used IML methods, such as such as\nindividual conditional expectation (ICE), partial dependence plots (PDP),\naccumulated local effects (ALE), different feature importance measures or\nFriedman's H-interaction statistics can be adapted to survival outcomes. An\napplication of several IML methods to real data on data on under-5 year\nmortality of Ghanaian children from the Demographic and Health Surveys (DHS)\nProgram serves as a tutorial or guide for researchers, on how to utilize the\ntechniques in practice to facilitate understanding of model decisions or\npredictions.\n","authors":["Sophie Hanna Langbein","Mateusz Krzyziński","Mikołaj Spytek","Hubert Baniecki","Przemysław Biecek","Marvin N. 
Wright"],"pdf_url":"https://arxiv.org/pdf/2403.10250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02967v3","updated":"2024-11-18T09:44:08Z","published":"2024-03-05T13:43:58Z","title":"Non-convex Stochastic Composite Optimization with Polyak Momentum","summary":" The stochastic proximal gradient method is a powerful generalization of the\nwidely used stochastic gradient descent (SGD) method and has found numerous\napplications in Machine Learning. However, it is notoriously known that this\nmethod fails to converge in non-convex settings where the stochastic noise is\nsignificant (i.e. when only small or bounded batch sizes are used). In this\npaper, we focus on the stochastic proximal gradient method with Polyak\nmomentum. We prove this method attains an optimal convergence rate for\nnon-convex composite optimization problems, regardless of batch size.\nAdditionally, we rigorously analyze the variance reduction effect of the Polyak\nmomentum in the composite optimization setting and we show the method also\nconverges when the proximal step can only be solved inexactly. Finally, we\nprovide numerical experiments to validate our theoretical results.\n","authors":["Yuan Gao","Anton Rodomanov","Sebastian U. Stich"],"pdf_url":"https://arxiv.org/pdf/2403.02967v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20268v2","updated":"2024-11-18T09:39:18Z","published":"2024-10-26T20:39:41Z","title":"Centaur: a foundation model of human cognition","summary":" Establishing a unified theory of cognition has been a major goal of\npsychology. While there have been previous attempts to instantiate such\ntheories by building computational models, we currently do not have one model\nthat captures the human mind in its entirety. Here we introduce Centaur, a\ncomputational model that can predict and simulate human behavior in any\nexperiment expressible in natural language. 
We derived Centaur by finetuning a\nstate-of-the-art language model on a novel, large-scale data set called\nPsych-101. Psych-101 reaches an unprecedented scale, covering trial-by-trial\ndata from over 60,000 participants performing over 10,000,000 choices in 160\nexperiments. Centaur not only captures the behavior of held-out participants\nbetter than existing cognitive models, but also generalizes to new cover\nstories, structural task modifications, and entirely new domains. Furthermore,\nwe find that the model's internal representations become more aligned with\nhuman neural activity after finetuning. Taken together, Centaur is the first\nreal candidate for a unified model of human cognition. We anticipate that it\nwill have a disruptive impact on the cognitive sciences, challenging the\nexisting paradigm for developing computational models.\n","authors":["Marcel Binz","Elif Akata","Matthias Bethge","Franziska Brändle","Fred Callaway","Julian Coda-Forno","Peter Dayan","Can Demircan","Maria K. Eckstein","Noémi Éltető","Thomas L. Griffiths","Susanne Haridi","Akshay K. Jagadish","Li Ji-An","Alexander Kipnis","Sreejan Kumar","Tobias Ludwig","Marvin Mathony","Marcelo Mattar","Alireza Modirshanechi","Surabhi S. Nath","Joshua C. Peterson","Milena Rmus","Evan M. Russek","Tankred Saanum","Natalia Scharfenberg","Johannes A. Schubert","Luca M. 
Schulze Buschoff","Nishad Singhi","Xin Sui","Mirko Thalmann","Fabian Theis","Vuong Truong","Vishaal Udandarao","Konstantinos Voudouris","Robert Wilson","Kristin Witte","Shuchen Wu","Dirk Wulff","Huadong Xiong","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2410.20268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11414v1","updated":"2024-11-18T09:35:22Z","published":"2024-11-18T09:35:22Z","title":"Temporal and Spatial Reservoir Ensembling Techniques for Liquid State\n Machines","summary":" Reservoir computing (RC), is a class of computational methods such as Echo\nState Networks (ESN) and Liquid State Machines (LSM) describe a generic method\nto perform pattern recognition and temporal analysis with any non-linear\nsystem. This is enabled by Reservoir Computing being a shallow network model\nwith only Input, Reservoir, and Readout layers where input and reservoir\nweights are not learned (only the readout layer is trained). LSM is a special\ncase of Reservoir computing inspired by the organization of neurons in the\nbrain and generally refers to spike-based Reservoir computing approaches. LSMs\nhave been successfully used to showcase decent performance on some neuromorphic\nvision and speech datasets but a common problem associated with LSMs is that\nsince the model is more-or-less fixed, the main way to improve the performance\nis by scaling up the Reservoir size, but that only gives diminishing rewards\ndespite a tremendous increase in model size and computation. In this paper, we\npropose two approaches for effectively ensembling LSM models - Multi-Length\nScale Reservoir Ensemble (MuLRE) and Temporal Excitation Partitioned Reservoir\nEnsemble (TEPRE) and benchmark them on Neuromorphic-MNIST (N-MNIST), Spiking\nHeidelberg Digits (SHD), and DVSGesture datasets, which are standard\nneuromorphic benchmarks. 
We achieve 98.1% test accuracy on N-MNIST with a\n3600-neuron LSM model which is higher than any prior LSM-based approach and\n77.8% test accuracy on the SHD dataset which is on par with a standard\nRecurrent Spiking Neural Network trained by Backprop Through Time (BPTT). We\nalso propose receptive field-based input weights to the Reservoir to work\nalongside the Multi-Length Scale Reservoir ensemble model for vision tasks.\nThus, we introduce effective means of scaling up the performance of LSM models\nand evaluate them against relevant neuromorphic benchmarks\n","authors":["Anmol Biswas","Sharvari Ashok Medhe","Raghav Singhal","Udayan Ganguly"],"pdf_url":"https://arxiv.org/pdf/2411.11414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11409v1","updated":"2024-11-18T09:30:05Z","published":"2024-11-18T09:30:05Z","title":"IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet\n Videos","summary":" Shape assembly is a ubiquitous task in daily life, integral for constructing\ncomplex 3D structures like IKEA furniture. While significant progress has been\nmade in developing autonomous agents for shape assembly, existing datasets have\nnot yet tackled the 4D grounding of assembly instructions in videos, essential\nfor a holistic understanding of assembly in 3D space over time. We introduce\nIKEA Video Manuals, a dataset that features 3D models of furniture parts,\ninstructional manuals, assembly videos from the Internet, and most importantly,\nannotations of dense spatio-temporal alignments between these data modalities.\nTo demonstrate the utility of IKEA Video Manuals, we present five applications\nessential for shape assembly: assembly plan generation, part-conditioned\nsegmentation, part-conditioned pose estimation, video object segmentation, and\nfurniture assembly based on instructional video manuals. For each application,\nwe provide evaluation metrics and baseline methods. 
Through experiments on our\nannotated data, we highlight many challenges in grounding assembly instructions\nin videos to improve shape assembly, including handling occlusions, varying\nviewpoints, and extended assembly sequences.\n","authors":["Yunong Liu","Cristobal Eyzaguirre","Manling Li","Shubh Khanna","Juan Carlos Niebles","Vineeth Ravi","Saumitra Mishra","Weiyu Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11409v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.11407v1","updated":"2024-11-18T09:28:58Z","published":"2024-11-18T09:28:58Z","title":"The Dark Side of Trust: Authority Citation-Driven Jailbreak Attacks on\n Large Language Models","summary":" The widespread deployment of large language models (LLMs) across various\ndomains has showcased their immense potential while exposing significant safety\nvulnerabilities. A major concern is ensuring that LLM-generated content aligns\nwith human values. Existing jailbreak techniques reveal how this alignment can\nbe compromised through specific prompts or adversarial suffixes. In this study,\nwe introduce a new threat: LLMs' bias toward authority. While this inherent\nbias can improve the quality of outputs generated by LLMs, it also introduces a\npotential vulnerability, increasing the risk of producing harmful content.\nNotably, the biases in LLMs is the varying levels of trust given to different\ntypes of authoritative information in harmful queries. For example, malware\ndevelopment often favors trust GitHub. To better reveal the risks with LLM, we\npropose DarkCite, an adaptive authority citation matcher and generator designed\nfor a black-box setting. 
DarkCite matches optimal citation types to specific\nrisk types and generates authoritative citations relevant to harmful\ninstructions, enabling more effective jailbreak attacks on aligned LLMs.Our\nexperiments show that DarkCite achieves a higher attack success rate (e.g.,\nLLama-2 at 76% versus 68%) than previous methods. To counter this risk, we\npropose an authenticity and harm verification defense strategy, raising the\naverage defense pass rate (DPR) from 11% to 74%. More importantly, the ability\nto link citations to the content they encompass has become a foundational\nfunction in LLMs, amplifying the influence of LLMs' bias toward authority.\n","authors":["Xikang Yang","Xuehai Tang","Jizhong Han","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2411.11407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16503v2","updated":"2024-11-18T09:28:57Z","published":"2024-05-26T09:47:17Z","title":"Integrating GNN and Neural ODEs for Estimating Non-Reciprocal Two-Body\n Interactions in Mixed-Species Collective Motion","summary":" Analyzing the motion of multiple biological agents, be it cells or individual\nanimals, is pivotal for the understanding of complex collective behaviors. With\nthe advent of advanced microscopy, detailed images of complex tissue formations\ninvolving multiple cell types have become more accessible in recent years.\nHowever, deciphering the underlying rules that govern cell movements is far\nfrom trivial. Here, we present a novel deep learning framework for estimating\nthe underlying equations of motion from observed trajectories, a pivotal step\nin decoding such complex dynamics. Our framework integrates graph neural\nnetworks with neural differential equations, enabling effective prediction of\ntwo-body interactions based on the states of the interacting entities. 
We\ndemonstrate the efficacy of our approach through two numerical experiments.\nFirst, we used simulated data from a toy model to tune the hyperparameters.\nBased on the obtained hyperparameters, we then applied this approach to a more\ncomplex model with non-reciprocal forces that mimic the collective dynamics of\nthe cells of slime molds. Our results show that the proposed method can\naccurately estimate the functional forms of two-body interactions -- even when\nthey are nonreciprocal -- thereby precisely replicating both individual and\ncollective behaviors within these systems.\n","authors":["Masahito Uwamichi","Simon K. Schnyder","Tetsuya J. Kobayashi","Satoshi Sawai"],"pdf_url":"https://arxiv.org/pdf/2405.16503v2.pdf","comment":"Accepted at NeurIPS 2024. Some contents are omitted due to arXiv's\n storage limit. Please refer to the full paper at OpenReview (NeurIPS 2024) or\n https://github.com/MasahitoUWAMICHI/collectiveMotionNN"},{"id":"http://arxiv.org/abs/2411.11406v1","updated":"2024-11-18T09:28:11Z","published":"2024-11-18T09:28:11Z","title":"Bridging the Resource Gap: Deploying Advanced Imitation Learning Models\n onto Affordable Embedded Platforms","summary":" Advanced imitation learning with structures like the transformer is\nincreasingly demonstrating its advantages in robotics. However, deploying these\nlarge-scale models on embedded platforms remains a major challenge. In this\npaper, we propose a pipeline that facilitates the migration of advanced\nimitation learning algorithms to edge devices. The process is achieved via an\nefficient model compression method and a practical asynchronous parallel method\nTemporal Ensemble with Dropped Actions (TEDA) that enhances the smoothness of\noperations. 
To show the efficiency of the proposed pipeline, large-scale\nimitation learning models are trained on a server and deployed on an edge\ndevice to complete various manipulation tasks.\n","authors":["Haizhou Ge","Ruixiang Wang","Zhu-ang Xu","Hongrui Zhu","Ruichen Deng","Yuhang Dong","Zeyu Pang","Guyue Zhou","Junyu Zhang","Lu Shi"],"pdf_url":"https://arxiv.org/pdf/2411.11406v1.pdf","comment":"Accepted by the 2024 IEEE International Conference on Robotics and\n Biomimetics (IEEE ROBIO 2024)"},{"id":"http://arxiv.org/abs/2411.11405v1","updated":"2024-11-18T09:27:49Z","published":"2024-11-18T09:27:49Z","title":"Extended Neural Contractive Dynamical Systems: On Multiple Tasks and\n Riemannian Safety Regions","summary":" Stability guarantees are crucial when ensuring that a fully autonomous robot\ndoes not take undesirable or potentially harmful actions. We recently proposed\nthe Neural Contractive Dynamical Systems (NCDS), which is a neural network\narchitecture that guarantees contractive stability. With this,\nlearning-from-demonstrations approaches can trivially provide stability\nguarantees. However, our early work left several unanswered questions, which we\nhere address. Beyond providing an in-depth explanation of NCDS, this paper\nextends the framework with more careful regularization, a conditional variant\nof the framework for handling multiple tasks, and an uncertainty-driven\napproach to latent obstacle avoidance. 
Experiments verify that the developed\nsystem has the flexibility of ordinary neural networks while providing the\nstability guarantees needed for autonomous robotics.\n","authors":["Hadi Beik Mohammadi","Søren Hauberg","Georgios Arvanitidis","Gerhard Neumann","Leonel Rozo"],"pdf_url":"https://arxiv.org/pdf/2411.11405v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.09352"},{"id":"http://arxiv.org/abs/2402.14259v2","updated":"2024-11-18T09:19:25Z","published":"2024-02-22T03:46:08Z","title":"Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form\n Medical Question Answering Applications and Beyond","summary":" Uncertainty estimation is crucial for the reliability of safety-critical\nhuman and artificial intelligence (AI) interaction systems, particularly in the\ndomain of healthcare engineering. However, a robust and general uncertainty\nmeasure for free-form answers has not been well-established in open-ended\nmedical question-answering (QA) tasks, where generative inequality introduces a\nlarge number of irrelevant words and sequences within the generated set for\nuncertainty quantification (UQ), which can lead to biases. This paper\nintroduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at\nboth the word and sequence levels, considering semantic relevance. WSE\nquantifies uncertainty in a way that is more closely aligned with the\nreliability of LLMs during uncertainty quantification (UQ). We compare WSE with\nsix baseline methods on five free-form medical QA datasets, utilizing seven\npopular large language models (LLMs). Experimental results demonstrate that WSE\nexhibits superior performance in UQ under two standard criteria for correctness\nevaluation. 
Additionally, in terms of real-world medical QA applications, the\nperformance of LLMs is significantly enhanced (e.g., a 6.36% improvement in\nmodel accuracy on the COVID-QA dataset) by employing responses with lower\nuncertainty that are identified by WSE as final answers, without any additional\ntask-specific fine-tuning or architectural modifications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Chenxi Yuan","Qingyu Chen","Tianlong Chen","Yue Zhang","Ren Wang","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2402.14259v2.pdf","comment":"Accepted by Engineering Applications of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.11391v1","updated":"2024-11-18T09:08:30Z","published":"2024-11-18T09:08:30Z","title":"The GECo algorithm for Graph Neural Networks Explanation","summary":" Graph Neural Networks (GNNs) are powerful models that can manage complex data\nsources and their interconnection links. One of GNNs' main drawbacks is their\nlack of interpretability, which limits their application in sensitive fields.\nIn this paper, we introduce a new methodology involving graph communities to\naddress the interpretability of graph classification problems. The proposed\nmethod, called GECo, exploits the idea that if a community is a subset of graph\nnodes densely connected, this property should play a role in graph\nclassification. This is reasonable, especially if we consider the\nmessage-passing mechanism, which is the basic mechanism of GNNs. GECo analyzes\nthe contribution to the classification result of the communities in the graph,\nbuilding a mask that highlights graph-relevant structures. GECo is tested for\nGraph Convolutional Networks on six artificial and four real-world graph\ndatasets and is compared to the main explainability methods such as\nPGMExplainer, PGExplainer, GNNExplainer, and SubgraphX using four different\nmetrics. 
The obtained results outperform the other methods for artificial graph\ndatasets and most real-world datasets.\n","authors":["Salvatore Calderaro","Domenico Amato","Giosuè Lo Bosco","Riccardo Rizzo","Filippo Vella"],"pdf_url":"https://arxiv.org/pdf/2411.11391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11375v1","updated":"2024-11-18T08:39:24Z","published":"2024-11-18T08:39:24Z","title":"Graph Neural Networks on Graph Databases","summary":" Training graph neural networks on large datasets has long been a challenge.\nTraditional approaches include efficiently representing the whole graph\nin-memory, designing parameter efficient and sampling-based models, and graph\npartitioning in a distributed setup. Separately, graph databases with native\ngraph storage and query engines have been developed, which enable time and\nresource efficient graph analytics workloads. We show how to directly train a\nGNN on a graph DB, by retrieving minimal data into memory and sampling using\nthe query engine. Our experiments show resource advantages for single-machine\nand distributed training. Our approach opens up a new way of scaling GNNs as\nwell as a new application area for graph DBs.\n","authors":["Dmytro Lopushanskyy","Borun Shi"],"pdf_url":"https://arxiv.org/pdf/2411.11375v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.11371v1","updated":"2024-11-18T08:34:38Z","published":"2024-11-18T08:34:38Z","title":"Rethinking Thinking Tokens: Understanding Why They Underperform in\n Practice","summary":" Thinking Tokens (TT) have been proposed as an unsupervised method to\nfacilitate reasoning in language models. However, despite their conceptual\nappeal, our findings show that TTs marginally improves performance and\nconsistently underperforms compared to Chain-of-Thought (CoT) reasoning across\nmultiple benchmarks. 
We hypothesize that this underperformance stems from the\nreliance on a single embedding for TTs, which results in inconsistent learning\nsignals and introduces noisy gradients. This paper provides a comprehensive\nempirical analysis to validate this hypothesis and discusses the implications\nfor future research on unsupervised reasoning in LLMs.\n","authors":["Sreeram Vennam","David Valente","David Herel","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2411.11371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00499v3","updated":"2024-11-18T08:33:35Z","published":"2024-06-29T17:33:07Z","title":"ConU: Conformal Uncertainty in Large Language Models with Correctness\n Coverage Guarantees","summary":" Uncertainty quantification (UQ) in natural language generation (NLG) tasks\nremains an open challenge, exacerbated by the closed-source nature of the\nlatest large language models (LLMs). This study investigates applying conformal\nprediction (CP), which can transform any heuristic uncertainty notion into\nrigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We\nintroduce a novel uncertainty measure based on self-consistency theory, and\nthen develop a conformal uncertainty criterion by integrating the uncertainty\ncondition aligned with correctness into the CP algorithm. Empirical evaluations\nindicate that our uncertainty measure outperforms prior state-of-the-art\nmethods. Furthermore, we achieve strict control over the correctness coverage\nrate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning\ngeneral-purpose and medical scenarios. 
Additionally, the calibrated prediction\nsets with small size further highlights the efficiency of our method in\nproviding trustworthy guarantees for practical open-ended NLG applications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Lu Cheng","Yue Zhang","Qingni Wang","Xiaoshuang Shi","Kaidi Xu","Hengtao Shen","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.00499v3.pdf","comment":"Accepted by EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2411.11364v1","updated":"2024-11-18T08:20:21Z","published":"2024-11-18T08:20:21Z","title":"Continual Task Learning through Adaptive Policy Self-Composition","summary":" Training a generalizable agent to continually learn a sequence of tasks from\noffline trajectories is a natural requirement for long-lived agents, yet\nremains a significant challenge for current offline reinforcement learning (RL)\nalgorithms. Specifically, an agent must be able to rapidly adapt to new tasks\nusing newly collected trajectories (plasticity), while retaining knowledge from\npreviously learned tasks (stability). However, systematic analyses of this\nsetting are scarce, and it remains unclear whether conventional continual\nlearning (CL) methods are effective in continual offline RL (CORL) scenarios.\nIn this study, we develop the Offline Continual World benchmark and demonstrate\nthat traditional CL methods struggle with catastrophic forgetting, primarily\ndue to the unique distribution shifts inherent to CORL scenarios. To address\nthis challenge, we introduce CompoFormer, a structure-based continual\ntransformer model that adaptively composes previous policies via a meta-policy\nnetwork. Upon encountering a new task, CompoFormer leverages semantic\ncorrelations to selectively integrate relevant prior policies alongside newly\ntrained parameters, thereby enhancing knowledge sharing and accelerating the\nlearning process. 
Our experiments reveal that CompoFormer outperforms\nconventional CL methods, particularly in longer task sequences, showcasing a\npromising balance between plasticity and stability.\n","authors":["Shengchao Hu","Yuhang Zhou","Ziqing Fan","Jifeng Hu","Li Shen","Ya Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.11364v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.09726v2","updated":"2024-11-18T07:50:45Z","published":"2024-11-14T15:36:19Z","title":"Spatio-Temporal Jump Model for Urban Thermal Comfort Monitoring","summary":" Thermal comfort is essential for well-being in urban spaces, especially as\ncities face increasing heat from urbanization and climate change. Existing\nthermal comfort models usually overlook temporal dynamics alongside spatial\ndependencies. We address this problem by introducing a spatio-temporal jump\nmodel that clusters data with persistence across both spatial and temporal\ndimensions. This framework enhances interpretability, minimizes abrupt state\nchanges, and easily handles missing data. We validate our approach through\nextensive simulations, demonstrating its accuracy in recovering the true\nunderlying partition. When applied to hourly environmental data gathered from a\nset of weather stations located across the city of Singapore, our proposal\nidentifies meaningful thermal comfort regimes, demonstrating its effectiveness\nin dynamic urban settings and suitability for real-world monitoring. The\ncomparison of these regimes with feedback on thermal preference indicates the\npotential of an unsupervised approach to avoid extensive surveys.\n","authors":["Federico P. 
Cortese","Antonio Pievatolo"],"pdf_url":"https://arxiv.org/pdf/2411.09726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11350v1","updated":"2024-11-18T07:39:46Z","published":"2024-11-18T07:39:46Z","title":"Zero-Shot Load Forecasting with Large Language Models","summary":" Deep learning models have shown strong performance in load forecasting, but\nthey generally require large amounts of data for model training before being\napplied to new scenarios, which limits their effectiveness in data-scarce\nscenarios. Inspired by the great success of pre-trained language models (LLMs)\nin natural language processing, this paper proposes a zero-shot load\nforecasting approach using an advanced LLM framework denoted as the Chronos\nmodel. By utilizing its extensive pre-trained knowledge, the Chronos model\nenables accurate load forecasting in data-scarce scenarios without the need for\nextensive data-specific training. Simulation results across five real-world\ndatasets demonstrate that the Chronos model significantly outperforms nine\npopular baseline models for both deterministic and probabilistic load\nforecasting with various forecast horizons (e.g., 1 to 48 hours), even though\nthe Chronos model is neither tailored nor fine-tuned to these specific load\ndatasets. Notably, Chronos reduces root mean squared error (RMSE), continuous\nranked probability score (CRPS), and quantile score (QS) by approximately\n7.34%-84.30%, 19.63%-60.06%, and 22.83%-54.49%, respectively, compared to\nbaseline models. 
These results highlight the superiority and flexibility of the\nChronos model, positioning it as an effective solution in data-scarce\nscenarios.\n","authors":["Wenlong Liao","Zhe Yang","Mengshuo Jia","Christian Rehtanz","Jiannong Fang","Fernando Porté-Agel"],"pdf_url":"https://arxiv.org/pdf/2411.11350v1.pdf","comment":"21 pages,5 figures"},{"id":"http://arxiv.org/abs/2411.11348v1","updated":"2024-11-18T07:38:25Z","published":"2024-11-18T07:38:25Z","title":"Modeling Multivariable High-resolution 3D Urban Microclimate Using\n Localized Fourier Neural Operator","summary":" Accurate urban microclimate analysis with wind velocity and temperature is\nvital for energy-efficient urban planning, supporting carbon reduction,\nenhancing public health and comfort, and advancing the low-altitude economy.\nHowever, traditional computational fluid dynamics (CFD) simulations that couple\nvelocity and temperature are computationally expensive. Recent machine learning\nadvancements offer promising alternatives for accelerating urban microclimate\nsimulations. The Fourier neural operator (FNO) has shown efficiency and\naccuracy in predicting single-variable velocity magnitudes in urban wind\nfields. Yet, for multivariable high-resolution 3D urban microclimate\nprediction, FNO faces three key limitations: blurry output quality, high GPU\nmemory demand, and substantial data requirements. To address these issues, we\npropose a novel localized Fourier neural operator (Local-FNO) model that\nemploys local training, geometry encoding, and patch overlapping. Local-FNO\nprovides accurate predictions for rapidly changing turbulence in urban\nmicroclimate over 60 seconds, four times the average turbulence integral time\nscale, with an average error of 0.35 m/s in velocity and 0.30 {\\deg}C in\ntemperature. It also accurately captures turbulent heat flux represented by the\nvelocity-temperature correlation. 
In a 2 km by 2 km domain, Local-FNO resolves\nturbulence patterns down to a 10 m resolution. It provides high-resolution\npredictions with 150 million feature dimensions on a single 32 GB GPU at nearly\n50 times the speed of a CFD solver. Compared to FNO, Local-FNO achieves a 23.9%\nreduction in prediction error and a 47.3% improvement in turbulent fluctuation\ncorrelation.\n","authors":["Shaoxiang Qin","Dongxue Zhan","Dingyang Geng","Wenhui Peng","Geng Tian","Yurong Shi","Naiping Gao","Xue Liu","Liangzhu Leon Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09273v4","updated":"2024-11-18T07:34:51Z","published":"2024-05-15T11:42:41Z","title":"Fair Generalized Linear Mixed Models","summary":" When using machine learning for automated prediction, it is important to\naccount for fairness in the prediction. Fairness in machine learning aims to\nensure that biases in the data and model inaccuracies do not lead to\ndiscriminatory decisions. E.g., predictions from fair machine learning models\nshould not discriminate against sensitive variables such as sexual orientation\nand ethnicity. The training data often in obtained from social surveys. In\nsocial surveys, oftentimes the data collection process is a strata sampling,\ne.g. due to cost restrictions. In strata samples, the assumption of\nindependence between the observation is not fulfilled. Hence, if the machine\nlearning models do not account for the strata correlations, the results may be\nbiased. Especially high is the bias in cases where the strata assignment is\ncorrelated to the variable of interest. 
We present in this paper an algorithm\nthat can handle both problems simultaneously, and we demonstrate the impact of\nstratified sampling on the quality of fair machine learning predictions in a\nreproducible simulation study.\n","authors":["Jan Pablo Burgard","João Vitor Pamplona"],"pdf_url":"https://arxiv.org/pdf/2405.09273v4.pdf","comment":"25 pages, 12 figures. arXiv admin note: text overlap with\n arXiv:2405.06433"},{"id":"http://arxiv.org/abs/2411.08521v2","updated":"2024-11-18T07:29:38Z","published":"2024-11-13T11:08:28Z","title":"SAD-TIME: a Spatiotemporal-fused network for depression detection with\n Automated multi-scale Depth-wise and TIME-interval-related common feature\n extractor","summary":" Background and Objective: Depression is a severe mental disorder, and\naccurate diagnosis is pivotal to the cure and rehabilitation of people with\ndepression. However, the current questionnaire-based diagnostic methods could\nbring subjective biases and may be denied by subjects. In search of a more\nobjective means of diagnosis, researchers have begun to experiment with deep\nlearning-based methods for identifying depressive disorders in recent years.\nMethods: In this study, a novel Spatiotemporal-fused network with Automated\nmulti-scale Depth-wise and TIME-interval-related common feature extractor\n(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common\nfeatures extractor (CFE), a spatial sector (SpS), a modified temporal sector\n(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale\ndepth-wise 1D-convolutional neural network and a time-interval embedding\ngenerator, where the unique information of each channel is preserved. The SpS\nfuses the functional connectivity with the distance-based connectivity\ncontaining spatial position of EEG electrodes. A multi-head-attention graph\nconvolutional network is also applied in the SpS to fuse the features from\ndifferent EEG channels. 
The TeS is based on long short-term memory and graph\ntransformer networks, where the temporal information of different time-windows\nis fused. Moreover, the DAL is used after the SpS to obtain the\ndomain-invariant feature. Results: Experimental results under tenfold\ncross-validation show that the proposed SAD-TIME method achieves 92.00% and\n94.00% depression classification accuracies on two datasets, respectively, in\ncross-subject mode. Conclusion: SAD-TIME is a robust depression detection\nmodel, where the automatedly-generated features, the SpS and the TeS assist the\nclassification performance with the fusion of the innate spatiotemporal\ninformation in the EEG signals.\n","authors":["Han-Guang Wang","Hui-Rang Hou","Li-Cheng Jin","Chen-Yang Xu","Zhong-Yi Zhang","Qing-Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08521v2.pdf","comment":"21pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.03741v4","updated":"2024-11-18T07:26:43Z","published":"2023-05-18T03:08:18Z","title":"Pre-training Tensor-Train Networks Facilitates Machine Learning with\n Variational Quantum Circuits","summary":" Variational quantum circuits (VQCs) hold promise for quantum machine learning\non noisy intermediate-scale quantum (NISQ) devices. While tensor-train networks\n(TTNs) can enhance VQC representation and generalization, the resulting hybrid\nmodel, TTN-VQC, faces optimization challenges due to the Polyak-Lojasiewicz\n(PL) condition. To mitigate this challenge, we introduce Pre+TTN-VQC, a\npre-trained TTN model combined with a VQC. Our theoretical analysis, grounded\nin two-stage empirical risk minimization, provides an upper bound on the\ntransfer learning risk. It demonstrates the approach's advantages in overcoming\nthe optimization challenge while maintaining TTN-VQC's generalization\ncapability. 
We validate our findings through experiments on quantum dot and\nhandwritten digit classification using simulated and actual NISQ environments.\n","authors":["Jun Qi","Chao-Han Huck Yang","Pin-Yu Chen","Min-Hsiu Hsieh"],"pdf_url":"https://arxiv.org/pdf/2306.03741v4.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2405.03911v3","updated":"2024-11-18T07:17:56Z","published":"2024-05-07T00:08:15Z","title":"Federated Graph Condensation with Information Bottleneck Principles","summary":" Graph condensation, which reduces the size of a large-scale graph by\nsynthesizing a small-scale condensed graph as its substitution, has immediately\nbenefited various graph learning tasks. However, existing graph condensation\nmethods rely on centralized data storage, which is unfeasible for real-world\ndecentralized data distribution, and overlook data holders' privacy-preserving\nrequirements. To bridge the gap, we propose and study the novel problem of\nfederated graph condensation for graph neural networks (GNNs). Specifically, we\nfirst propose a general framework for federated graph condensation, in which we\ndecouple the typical gradient matching process for graph condensation into\nclient-side gradient calculation and server-side gradient matching. In this\nway, the burdensome computation cost in client-side is largely alleviated.\nBesides, our empirical studies show that under the federated setting, the\ncondensed graph will consistently leak data membership privacy, i.e., the\ncondensed graph during the federated training can be utilized to steal the\ntraining data under the membership inference attacks (MIA). To tackle this\nissue, we innovatively incorporate information bottleneck principles into the\nfederated graph condensation, which only needs to extract partial node features\nin one local pre-training step and utilize the features during federated\ntraining. 
Extensive experiments on real-world datasets demonstrate that our\nframework can consistently protect membership privacy during training.\nMeanwhile, it also achieves comparable and even superior performance against\nexisting centralized graph condensation and federated graph learning methods.\n","authors":["Bo Yan","Sihao He","Cheng Yang","Shang Liu","Yang Cao","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2405.03911v3.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.11340v1","updated":"2024-11-18T07:15:23Z","published":"2024-11-18T07:15:23Z","title":"A Hybrid Loss Framework for Decomposition-based Time Series Forecasting\n Methods: Balancing Global and Component Errors","summary":" Accurate time series forecasting, predicting future values based on past\ndata, is crucial for diverse industries. Many current time series methods\ndecompose time series into multiple sub-series, applying different model\narchitectures and training with an end-to-end overall loss for forecasting.\nHowever, this raises a question: does this overall loss prioritize the\nimportance of critical sub-series within the decomposition for the better\nperformance? To investigate this, we conduct a study on the impact of overall\nloss on existing time series methods with sequence decomposition. Our findings\nreveal that overall loss may introduce bias in model learning, hindering the\nlearning of the prioritization of more significant sub-series and limiting the\nforecasting performance. To address this, we propose a hybrid loss framework\ncombining the global and component losses. This framework introduces component\nlosses for each sub-series alongside the original overall loss. It employs a\ndual min-max algorithm to dynamically adjust weights between the overall loss\nand component losses, and within component losses. 
This enables the model to\nachieve better performance of current time series methods by focusing on more\ncritical sub-series while still maintaining a low overall loss. We integrate\nour loss framework into several time series methods and evaluate the\nperformance on multiple datasets. Results show an average improvement of 0.5-2%\nover existing methods without any modifications to the model architectures.\n","authors":["Ronghui Han","Duanyu Feng","Hongyu Du","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10885v2","updated":"2024-11-18T06:57:23Z","published":"2024-10-10T15:00:52Z","title":"Adaptive AI-Driven Material Synthesis: Towards Autonomous 2D Materials\n Growth","summary":" Two-dimensional (2D) materials are poised to revolutionize current\nsolid-state technology with their extraordinary properties. Yet, the primary\nchallenge remains their scalable production. While there have been significant\nadvancements, much of the scientific progress has depended on the exfoliation\nof materials, a method that poses severe challenges for large-scale\napplications. With the advent of artificial intelligence (AI) in materials\nscience, innovative synthesis methodologies are now on the horizon. This study\nexplores the forefront of autonomous materials synthesis using an artificial\nneural network (ANN) trained by evolutionary methods, focusing on the efficient\nproduction of graphene. Our approach demonstrates that a neural network can\niteratively and autonomously learn a time-dependent protocol for the efficient\ngrowth of graphene, without requiring pretraining on what constitutes an\neffective recipe. Evaluation criteria are based on the proximity of the Raman\nsignature to that of monolayer graphene: higher scores are granted to outcomes\nwhose spectrum more closely resembles that of an ideal continuous monolayer\nstructure. 
This feedback mechanism allows for iterative refinement of the ANN's\ntime-dependent synthesis protocols, progressively improving sample quality.\nThrough the advancement and application of AI methodologies, this work makes a\nsubstantial contribution to the field of materials engineering, fostering a new\nera of innovation and efficiency in the synthesis process.\n","authors":["Leonardo Sabattini","Annalisa Coriolano","Corneel Casert","Stiven Forti","Edward S. Barnard","Fabio Beltram","Massimiliano Pontil","Stephen Whitelam","Camilla Coletti","Antonio Rossi"],"pdf_url":"https://arxiv.org/pdf/2410.10885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10918v2","updated":"2024-11-18T06:50:30Z","published":"2024-05-17T17:09:45Z","title":"A Framework for Leveraging Partially-Labeled Data for Product\n Attribute-Value Identification","summary":" In the e-commerce domain, the accurate extraction of attribute-value pairs\n(e.g., Brand: Apple) from product titles and user search queries is crucial for\nenhancing search and recommendation systems. A major challenge with neural\nmodels for this task is the lack of high-quality training data, as the\nannotations for attribute-value pairs in the available datasets are often\nincomplete. To address this, we introduce GenToC, a model designed for training\ndirectly with partially-labeled data, eliminating the necessity for a fully\nannotated dataset. GenToC employs a marker-augmented generative model to\nidentify potential attributes, followed by a token classification model that\ndetermines the associated values for each attribute. GenToC outperforms\nexisting state-of-the-art models, exhibiting upto 56.3% increase in the number\nof accurate extractions. Furthermore, we utilize GenToC to regenerate the\ntraining dataset to expand attribute-value annotations. 
This bootstrapping\nsubstantially improves the data quality for training other standard NER models,\nwhich are typically faster but less capable in handling partially-labeled data,\nenabling them to achieve comparable performance to GenToC. Our results\ndemonstrate GenToC's unique ability to learn from a limited set of\npartially-labeled data and improve the training of more efficient models,\nadvancing the automated extraction of attribute-value pairs. Finally, our model\nhas been successfully integrated into IndiaMART, India's largest B2B e-commerce\nplatform, achieving a significant increase of 20.2% in the number of correctly\nidentified attribute-value pairs over the existing deployed system while\nachieving a high precision of 89.5%.\n","authors":["D. Subhalingam","Keshav Kolluru"," Mausam","Saurabh Singal"],"pdf_url":"https://arxiv.org/pdf/2405.10918v2.pdf","comment":"Accepted to KDD 2025 ADS Track"},{"id":"http://arxiv.org/abs/2411.11327v1","updated":"2024-11-18T06:44:14Z","published":"2024-11-18T06:44:14Z","title":"Enhancing Decision Transformer with Diffusion-Based Trajectory Branch\n Generation","summary":" Decision Transformer (DT) can learn effective policy from offline datasets by\nconverting the offline reinforcement learning (RL) into a supervised sequence\nmodeling task, where the trajectory elements are generated auto-regressively\nconditioned on the return-to-go (RTG).However, the sequence modeling learning\napproach tends to learn policies that converge on the sub-optimal trajectories\nwithin the dataset, for lack of bridging data to move to better trajectories,\neven if the condition is set to the highest RTG.To address this issue, we\nintroduce Diffusion-Based Trajectory Branch Generation (BG), which expands the\ntrajectories of the dataset with branches generated by a diffusion model.The\ntrajectory branch is generated based on the segment of the trajectory within\nthe dataset, and leads to trajectories with higher returns.We concatenate 
the\ngenerated branch with the trajectory segment as an expansion of the\ntrajectory.After expanding, DT has more opportunities to learn policies to move\nto better trajectories, preventing it from converging to the sub-optimal\ntrajectories.Empirically, after processing with BG, DT outperforms\nstate-of-the-art sequence modeling methods on D4RL benchmark, demonstrating the\neffectiveness of adding branches to the dataset without further modifications.\n","authors":["Zhihong Liu","Long Qian","Zeyang Liu","Lipeng Wan","Xingyu Chen","Xuguang Lan"],"pdf_url":"https://arxiv.org/pdf/2411.11327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11324v1","updated":"2024-11-18T06:33:40Z","published":"2024-11-18T06:33:40Z","title":"Cuvis.Ai: An Open-Source, Low-Code Software Ecosystem for Hyperspectral\n Processing and Classification","summary":" Machine learning is an important tool for analyzing high-dimension\nhyperspectral data; however, existing software solutions are either\nclosed-source or inextensible research products. In this paper, we present\ncuvis.ai, an open-source and low-code software ecosystem for data acquisition,\npreprocessing, and model training. The package is written in Python and\nprovides wrappers around common machine learning libraries, allowing both\nclassical and deep learning models to be trained on hyperspectral data. The\ncodebase abstracts processing interconnections and data dependencies between\noperations to minimize code complexity for users. This software package\ninstantiates nodes in a directed acyclic graph to handle all stages of a\nmachine learning ecosystem, from data acquisition, including live or static\ndata sources, to final class assignment or property prediction. User-created\nmodels contain convenient serialization methods to ensure portability and\nincrease sharing within the research community. 
All code and data are available\nonline: https://github.com/cubert-hyperspectral/cuvis.ai\n","authors":["Nathaniel Hanson","Philip Manke","Simon Birkholz","Maximilian Mühlbauer","Rene Heine","Arnd Brandes"],"pdf_url":"https://arxiv.org/pdf/2411.11324v1.pdf","comment":"5 pages, 2024 14th Workshop on Hyperspectral Imaging and Signal\n Processing: Evolution in Remote Sensing (WHISPERS)"},{"id":"http://arxiv.org/abs/2411.11315v1","updated":"2024-11-18T06:18:13Z","published":"2024-11-18T06:18:13Z","title":"A Review on Machine Unlearning","summary":" Recently, an increasing number of laws have governed the useability of users'\nprivacy. For example, Article 17 of the General Data Protection Regulation\n(GDPR), the right to be forgotten, requires machine learning applications to\nremove a portion of data from a dataset and retrain it if the user makes such a\nrequest. Furthermore, from the security perspective, training data for machine\nlearning models, i.e., data that may contain user privacy, should be\neffectively protected, including appropriate erasure. Therefore, researchers\npropose various privacy-preserving methods to deal with such issues as machine\nunlearning. This paper provides an in-depth review of the security and privacy\nconcerns in machine learning models. First, we present how machine learning can\nuse users' private data in daily life and the role that the GDPR plays in this\nproblem. Then, we introduce the concept of machine unlearning by describing the\nsecurity threats in machine learning models and how to protect users' privacy\nfrom being violated using machine learning platforms. As the core content of\nthe paper, we introduce and analyze current machine unlearning approaches and\nseveral representative research results and discuss them in the context of the\ndata lineage. 
Furthermore, we also discuss the future research challenges in\nthis field.\n","authors":["Haibo Zhang","Toru Nakamura","Takamasa Isohara","Kouichi Sakurai"],"pdf_url":"https://arxiv.org/pdf/2411.11315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14653v3","updated":"2024-11-18T06:03:47Z","published":"2024-04-23T01:19:19Z","title":"Machine Vision-Based Assessment of Fall Color Changes and its\n Relationship with Leaf Nitrogen Concentration","summary":" Apple(\\textit{Malus domestica} Borkh.) trees are deciduous, shedding leaves\neach year. This process is preceded by a gradual change in leaf color from\ngreen to yellow as chlorophyll is degraded prior to abscission. The initiation\nand rate of this color change are affected by many factors including leaf\nnitrogen (N) concentration. We predict that leaf color during this transition\nmay be indicative of the nitrogen status of apple trees. This study assesses a\nmachine vision-based system for quantifying the change in leaf color and its\ncorrelation with leaf nitrogen content. An image dataset was collected in color\nand 3D over five weeks in the fall of 2021 and 2023 at a commercial orchard\nusing a ground vehicle-based stereovision sensor. Trees in the foreground were\nsegmented from the point cloud using color and depth thresholding methods.\nThen, to estimate the proportion of yellow leaves per canopy, the color\ninformation of the segmented canopy area was quantified using a custom-defined\nmetric, \\textit{yellowness index} (a normalized ratio of yellow to green\nfoliage in the tree) that varied from -1 to +1 (-1 being completely green and\n+1 being completely yellow). Both K-means-based methods and gradient boosting\nmethods were used to estimate the \\textit{yellowness index}. 
The gradient\nboosting based method proposed in this study was better than the K-means-based\nmethod (both in terms of computational time and accuracy), achieving an $R^2$\nof 0.72 in estimating the \\textit{yellowness index}. The metric was able to\ncapture the gradual color transition from green to yellow over the study\nduration. Trees with lower leaf nitrogen showed the color transition to yellow\nearlier than the trees with higher nitrogen.\n Keywords: Fruit Tree Nitrogen Management, Machine Vision, Point Cloud\nSegmentation, Precision Nitrogen Management\n","authors":["Achyut Paudel","Jostan Brown","Priyanka Upadhyaya","Atif Bilal Asad","Safal Kshetri","Joseph R. Davidson","Cindy Grimm","Ashley Thompson","Bernardita Sallato","Matthew D. Whiting","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2404.14653v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11304v1","updated":"2024-11-18T05:59:29Z","published":"2024-11-18T05:59:29Z","title":"Toward Personalized Federated Node Classification in One-shot\n Communication","summary":" Federated Graph Learning (FGL) has become a promising paradigm for\ncollaborative training with distributed and private graph data. One-shot\nFederated Learning (OFL) enables collaboration in a single communication round\nto largely reduce communication costs and potential security concerns. However,\nexisting OFL methods are not designed for graph data and existing FGL methods\nare ineffective within one communication round under both data and model\nheterogeneity. To mitigate this gap, we are the first to propose a one-shot\npersonalized federated graph learning method for node classification, which is\nalso compatible with the Secure Aggregation scheme. 
We estimate and aggregate\nthe statistics of class-wise feature distribution to generate a global\npseudo-graph on the server, which could be used to train a global graph model.\nFurthermore, We reveal the under-explored problem of existing personalized FGL\nmethods that their personalized models are biased and neglect the ability to\ngeneralize to minorities. To achieve better personalization and generalization\nsimultaneously, we propose a two-stage personalized training to adaptively\nutilize the personal information from local data and global information from\nthe global pseudo-graph. Comprehensive experiments on 8 multi-scale graph\ndatasets under different partitions with various settings demonstrate our\nsuperior performance over state-of-the-art baselines.\n","authors":["Guochen Yan","Xunkai Li","Luyuan Xie","Wentao Zhang","Qingni Shen","Yuejian Fang","Zhonghai Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11304v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.11303v1","updated":"2024-11-18T05:58:47Z","published":"2024-11-18T05:58:47Z","title":"Recurrent Stochastic Configuration Networks with Incremental Blocks","summary":" Recurrent stochastic configuration networks (RSCNs) have shown promise in\nmodelling nonlinear dynamic systems with order uncertainty due to their\nadvantages of easy implementation, less human intervention, and strong\napproximation capability. This paper develops the original RSCNs with block\nincrements, termed block RSCNs (BRSCNs), to further enhance the learning\ncapacity and efficiency of the network. BRSCNs can simultaneously add multiple\nreservoir nodes (subreservoirs) during the construction. Each subreservoir is\nconfigured with a unique structure in the light of a supervisory mechanism,\nensuring the universal approximation property. 
The reservoir feedback matrix is\nappropriately scaled to guarantee the echo state property of the network.\nFurthermore, the output weights are updated online using a projection\nalgorithm, and the persistent excitation conditions that facilitate parameter\nconvergence are also established. Numerical results over a time series\nprediction, a nonlinear system identification task, and two industrial data\npredictive analyses demonstrate that the proposed BRSCN performs favourably in\nterms of modelling efficiency, learning, and generalization performance,\nhighlighting their significant potential for coping with complex dynamics.\n","authors":["Gang Dang","Dainhui Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11300v1","updated":"2024-11-18T05:50:58Z","published":"2024-11-18T05:50:58Z","title":"Accelerating spherical K-means clustering for large-scale sparse\n document data","summary":" This paper presents an accelerated spherical K-means clustering algorithm for\nlarge-scale and high-dimensional sparse document data sets. We design an\nalgorithm working in an architecture-friendly manner (AFM), which is a\nprocedure of suppressing performance-degradation factors such as the numbers of\ninstructions, branch mispredictions, and cache misses in CPUs of a modern\ncomputer system. For the AFM operation, we leverage unique universal\ncharacteristics (UCs) of a data-object and a cluster's mean set, which are\nskewed distributions on data relationships such as Zipf's law and a\nfeature-value concentration phenomenon. The UCs indicate that the most part of\nthe number of multiplications for similarity calculations is executed regarding\nterms with high document frequencies (df) and the most part of a similarity\nbetween an object- and a mean-feature vector is obtained by the multiplications\nregarding a few high mean-feature values. 
Our proposed algorithm applies an\ninverted-index data structure to a mean set, extracts the specific region with\nhigh-df terms and high mean-feature values in the mean-inverted index by newly\nintroduced two structural parameters, and exploits the index divided into three\nparts for efficient pruning. The algorithm determines the two structural\nparameters by minimizing the approximate number of multiplications related to\nthat of instructions, reduces the branch mispredictions by sharing the index\nstructure including the two parameters with all the objects, and suppressing\nthe cache misses by keeping in the caches the frequently used data in the\nforegoing specific region, resulting in working in the AFM. We experimentally\ndemonstrate that our algorithm efficiently achieves superior speed performance\nin large-scale documents compared with algorithms using the state-of-the-art\ntechniques.\n","authors":["Kazuo Aoyama","Kazumi Saito"],"pdf_url":"https://arxiv.org/pdf/2411.11300v1.pdf","comment":"28 pages, 23 figures"},{"id":"http://arxiv.org/abs/2406.08527v2","updated":"2024-11-18T05:47:10Z","published":"2024-06-12T08:31:34Z","title":"Optimized Feature Generation for Tabular Data via LLMs with Decision\n Tree Reasoning","summary":" In tabular prediction tasks, tree-based models combined with automated\nfeature engineering methods often outperform deep learning approaches that rely\non learned representations. While these feature engineering techniques are\neffective, they typically depend on a pre-defined search space and primarily\nuse validation scores for feature selection, thereby missing valuable insights\nfrom previous experiments. To address these limitations, we propose a novel\ntabular learning framework that utilizes large language models (LLMs), termed\nOptimizing Column feature generator with decision Tree reasoning (OCTree). 
Our\nkey idea is to leverage the reasoning capabilities of LLMs to identify\neffective feature generation rules without manually specifying the search space\nand provide language-based reasoning information highlighting past experiments\nas feedback for iterative rule improvements. We use decision trees to convey\nthis reasoning information, as they can be easily represented in natural\nlanguage, effectively providing knowledge from prior experiments (i.e., the\nimpact of the generated features on performance) to the LLMs. Our empirical\nresults demonstrate that OCTree consistently enhances the performance of\nvarious prediction models across diverse benchmarks, outperforming competing\nautomated feature engineering methods. Code is available at\nhttps://github.com/jaehyun513/OCTree.\n","authors":["Jaehyun Nam","Kyuyoung Kim","Seunghyuk Oh","Jihoon Tack","Jaehyung Kim","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2406.08527v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11296v1","updated":"2024-11-18T05:47:02Z","published":"2024-11-18T05:47:02Z","title":"Steering Language Model Refusal with Sparse Autoencoders","summary":" Responsible practices for deploying language models include guiding models to\nrecognize and refuse answering prompts that are considered unsafe, while\ncomplying with safe prompts. Achieving such behavior typically requires\nupdating model weights, which is costly and inflexible. We explore\nopportunities to steering model activations at inference time, which does not\nrequire updating weights. Using sparse autoencoders, we identify and steer\nfeatures in Phi-3 Mini that mediate refusal behavior. We find that feature\nsteering can improve Phi-3 Minis robustness to jailbreak attempts across\nvarious harms, including challenging multi-turn attacks. 
However, we discover\nthat feature steering can adversely affect overall performance on benchmarks.\nThese results suggest that identifying steerable mechanisms for refusal via\nsparse autoencoders is a promising approach for enhancing language model\nsafety, but that more research is needed to mitigate feature steerings adverse\neffects on performance.\n","authors":["Kyle O'Brien","David Majercak","Xavier Fernandes","Richard Edgar","Jingya Chen","Harsha Nori","Dean Carignan","Eric Horvitz","Forough Poursabzi-Sangde"],"pdf_url":"https://arxiv.org/pdf/2411.11296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11293v1","updated":"2024-11-18T05:39:00Z","published":"2024-11-18T05:39:00Z","title":"SADDE: Semi-supervised Anomaly Detection with Dependable Explanations","summary":" Semi-supervised learning holds a pivotal position in anomaly detection\napplications, yet identifying anomaly patterns with a limited number of labeled\nsamples poses a significant challenge. Furthermore, the absence of\ninterpretability poses major obstacles to the practical adoption of\nsemi-supervised frameworks. The majority of existing interpretation techniques\nare tailored for supervised/unsupervised frameworks or non-security domains,\nfalling short in providing dependable interpretations. In this research paper,\nwe introduce SADDE, a general framework designed to accomplish two primary\nobjectives: (1) to render the anomaly detection process interpretable and\nenhance the credibility of interpretation outcomes, and (2) to assign\nhigh-confidence pseudo labels to unlabeled samples, thereby boosting the\nperformance of anomaly detection systems when supervised data is scarce. To\nachieve the first objective, we devise a cutting-edge interpretation method\nthat utilizes both global and local interpreters to furnish trustworthy\nexplanations. 
For the second objective, we conceptualize a novel two-stage\nsemi-supervised learning framework tailored for network anomaly detection,\nensuring that the model predictions of both stages align with specific\nconstraints. We apply SADDE to two illustrative network anomaly detection tasks\nand conduct extensive evaluations in comparison with notable prior works. The\nexperimental findings underscore that SADDE is capable of delivering precise\ndetection results alongside dependable interpretations for semi-supervised\nnetwork anomaly detection systems. The source code for SADDE is accessible at:\nhttps://github.com/M-Code-Space/SADDE.\n","authors":["Yachao Yuan","Yu Huang","Yali Yuan","Jin Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14782v2","updated":"2024-11-18T05:27:38Z","published":"2023-11-24T16:32:47Z","title":"Understanding the Role of Textual Prompts in LLM for Time Series\n Forecasting: an Adapter View","summary":" In the burgeoning domain of Large Language Models (LLMs), there is a growing\ninterest in applying LLM to time series forecasting, with multiple studies\nfocused on leveraging textual prompts to further enhance the predictive\nprowess. This study aims to understand how and why the integration of textual\nprompts into LLM can effectively improve the prediction accuracy of time\nseries, which is not obvious at the glance, given the significant domain gap\nbetween texts and time series. Our extensive examination leads us to believe\nthat (a) adding text prompts is roughly equivalent to introducing additional\nadapters, and (b) It is the introduction of learnable parameters rather than\ntextual information that aligns the LLM with the time series forecasting task,\nultimately enhancing prediction accuracy. Inspired by this discovery, we\ndeveloped four adapters that explicitly address the gap between LLM and time\nseries, and further improve the prediction accuracy. 
Overall,our work\nhighlights how textual prompts enhance LLM accuracy in time series forecasting\nand suggests new avenues for continually improving LLM-based time series\nanalysis.\n","authors":["Peisong Niu","Tian Zhou","Xue Wang","Liang Sun","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2311.14782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20064v2","updated":"2024-11-18T05:09:42Z","published":"2023-10-30T22:29:07Z","title":"A Scalable Training Strategy for Blind Multi-Distribution Noise Removal","summary":" Despite recent advances, developing general-purpose universal denoising and\nartifact-removal networks remains largely an open problem: Given fixed network\nweights, one inherently trades-off specialization at one task (e.g.,~removing\nPoisson noise) for performance at another (e.g.,~removing speckle noise). In\naddition, training such a network is challenging due to the curse of\ndimensionality: As one increases the dimensions of the specification-space\n(i.e.,~the number of parameters needed to describe the noise distribution) the\nnumber of unique specifications one needs to train for grows exponentially.\nUniformly sampling this space will result in a network that does well at very\nchallenging problem specifications but poorly at easy problem specifications,\nwhere even large errors will have a small effect on the overall mean squared\nerror.\n In this work we propose training denoising networks using an\nadaptive-sampling/active-learning strategy. Our work improves upon a recently\nproposed universal denoiser training strategy by extending these results to\nhigher dimensions and by incorporating a polynomial approximation of the true\nspecification-loss landscape. This approximation allows us to reduce training\ntimes by almost two orders of magnitude. 
We test our method on simulated joint\nPoisson-Gaussian-Speckle noise and demonstrate that with our proposed training\nstrategy, a single blind, generalist denoiser network can achieve peak\nsignal-to-noise ratios within a uniform bound of specialized denoiser networks\nacross a large range of operating conditions. We also capture a small dataset\nof images with varying amounts of joint Poisson-Gaussian-Speckle noise and\ndemonstrate that a universal denoiser trained using our adaptive-sampling\nstrategy outperforms uniformly trained baselines.\n","authors":["Kevin Zhang","Sakshum Kulshrestha","Christopher Metzler"],"pdf_url":"https://arxiv.org/pdf/2310.20064v2.pdf","comment":"IEEE TIP 2024"},{"id":"http://arxiv.org/abs/2411.11284v1","updated":"2024-11-18T04:57:05Z","published":"2024-11-18T04:57:05Z","title":"Dual-Frequency Filtering Self-aware Graph Neural Networks for Homophilic\n and Heterophilic Graphs","summary":" Graph Neural Networks (GNNs) have excelled in handling graph-structured data,\nattracting significant research interest. However, two primary challenges have\nemerged: interference between topology and attributes distorting node\nrepresentations, and the low-pass filtering nature of most GNNs leading to the\noversight of valuable high-frequency information in graph signals. These issues\nare particularly pronounced in heterophilic graphs. To address these\nchallenges, we propose Dual-Frequency Filtering Self-aware Graph Neural\nNetworks (DFGNN). DFGNN integrates low-pass and high-pass filters to extract\nsmooth and detailed topological features, using frequency-specific constraints\nto minimize noise and redundancy in the respective frequency bands. The model\ndynamically adjusts filtering ratios to accommodate both homophilic and\nheterophilic graphs. 
Furthermore, DFGNN mitigates interference by aligning\ntopological and attribute representations through dynamic correspondences\nbetween their respective frequency bands, enhancing overall model performance\nand expressiveness. Extensive experiments conducted on benchmark datasets\ndemonstrate that DFGNN outperforms state-of-the-art methods in classification\nperformance, highlighting its effectiveness in handling both homophilic and\nheterophilic graphs.\n","authors":["Yachao Yang","Yanfeng Sun","Jipeng Guo","Junbin Gao","Shaofan Wang","Fujiao Ju","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2411.11284v1.pdf","comment":"11pages,17figures"},{"id":"http://arxiv.org/abs/2411.11283v1","updated":"2024-11-18T04:55:26Z","published":"2024-11-18T04:55:26Z","title":"Multi-Hyperbolic Space-based Heterogeneous Graph Attention Network","summary":" To leverage the complex structures within heterogeneous graphs, recent\nstudies on heterogeneous graph embedding use a hyperbolic space, characterized\nby a constant negative curvature and exponentially increasing space, which\naligns with the structural properties of heterogeneous graphs. However, despite\nheterogeneous graphs inherently possessing diverse power-law structures, most\nhyperbolic heterogeneous graph embedding models use a single hyperbolic space\nfor the entire heterogeneous graph, which may not effectively capture the\ndiverse power-law structures within the heterogeneous graph. To address this\nlimitation, we propose Multi-hyperbolic Space-based heterogeneous Graph\nAttention Network (MSGAT), which uses multiple hyperbolic spaces to effectively\ncapture diverse power-law structures within heterogeneous graphs. We conduct\ncomprehensive experiments to evaluate the effectiveness of MSGAT. 
The\nexperimental results demonstrate that MSGAT outperforms state-of-the-art\nbaselines in various graph machine learning tasks, effectively capturing the\ncomplex structures of heterogeneous graphs.\n","authors":["Jongmin Park","Seunghoon Han","Jong-Ryul Lee","Sungsu Lim"],"pdf_url":"https://arxiv.org/pdf/2411.11283v1.pdf","comment":"Accepted in IEEE ICDM 2024"},{"id":"http://arxiv.org/abs/2411.11276v1","updated":"2024-11-18T04:32:42Z","published":"2024-11-18T04:32:42Z","title":"Coupled Integral PINN for conservation law","summary":" The Physics-Informed Neural Network (PINN) is an innovative approach to solve\na diverse array of partial differential equations (PDEs) leveraging the power\nof neural networks. This is achieved by minimizing the residual loss associated\nwith the explicit physical information, usually coupled with data derived from\ninitial and boundary conditions. However, a challenge arises in the context of\nnonlinear conservation laws where derivatives are undefined at shocks, leading\nto solutions that deviate from the true physical phenomena. To solve this\nissue, the physical solution must be extracted from the weak formulation of the\nPDE and is typically further bounded by entropy conditions. Within the\nnumerical framework, finite volume methods (FVM) are employed to address\nconservation laws. These methods resolve the integral form of conservation laws\nand delineate the shock characteristics. Inspired by the principles underlying\nFVM, this paper introduces a novel Coupled Integrated PINN methodology that\ninvolves fitting the integral solutions of equations using additional neural\nnetworks. This technique not only augments the conventional PINN's capability\nin modeling shock waves, but also eliminates the need for spatial and temporal\ndiscretization. As such, it bypasses the complexities of numerical integration\nand reconstruction associated with non-convex fluxes. 
Finally, we show that the\nproposed new Integrated PINN performs well in conservative law and outperforms\nthe vanilla PINN when tackle the challenging shock problems using examples of\nBurger's equation, Buckley-Leverett Equation and Euler System.\n","authors":["Yeping Wang","Shihao Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00435v2","updated":"2024-11-18T04:28:09Z","published":"2024-10-01T06:34:58Z","title":"Incorporating Arbitrary Matrix Group Equivariance into KANs","summary":" Kolmogorov-Arnold Networks (KANs) have seen great success in scientific\ndomains thanks to spline activation functions, becoming an alternative to\nMulti-Layer Perceptrons (MLPs). However, spline functions may not respect\nsymmetry in tasks, which is crucial prior knowledge in machine learning.\nPreviously, equivariant networks embed symmetry into their architectures,\nachieving better performance in specific applications. Among these, Equivariant\nMulti-Layer Perceptrons (EMLP) introduce arbitrary matrix group equivariance\ninto MLPs, providing a general framework for constructing equivariant networks\nlayer by layer. In this paper, we propose Equivariant Kolmogorov-Arnold\nNetworks (EKAN), a method for incorporating matrix group equivariance into\nKANs, aiming to broaden their applicability to more fields. First, we construct\ngated spline basis functions, which form the EKAN layer together with\nequivariant linear weights. We then define a lift layer to align the input\nspace of EKAN with the feature space of the dataset, thereby building the\nentire EKAN architecture. Compared with baseline models, EKAN achieves higher\naccuracy with smaller datasets or fewer parameters on symmetry-related tasks,\nsuch as particle scattering and the three-body problem, often reducing test MSE\nby several orders of magnitude. 
Even in non-symbolic formula scenarios, such as\ntop quark tagging with three jet constituents, EKAN achieves comparable results\nwith EMLP using only $26\\%$ of the parameters, while KANs do not outperform\nMLPs as expected.\n","authors":["Lexiang Hu","Yisen Wang","Zhouchen Lin"],"pdf_url":"https://arxiv.org/pdf/2410.00435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11275v1","updated":"2024-11-18T04:23:20Z","published":"2024-11-18T04:23:20Z","title":"Effective Predictive Modeling for Emergency Department Visits and\n Evaluating Exogenous Variables Impact: Using Explainable Meta-learning\n Gradient Boosting","summary":" Over an extensive duration, administrators and clinicians have endeavoured to\npredict Emergency Department (ED) visits with precision, aiming to optimise\nresource distribution. Despite the proliferation of diverse AI-driven models\ntailored for precise prognostication, this task persists as a formidable\nchallenge, besieged by constraints such as restrained generalisability,\nsusceptibility to overfitting and underfitting, scalability issues, and complex\nfine-tuning hyper-parameters. In this study, we introduce a novel Meta-learning\nGradient Booster (Meta-ED) approach for precisely forecasting daily ED visits\nand leveraging a comprehensive dataset of exogenous variables, including\nsocio-demographic characteristics, healthcare service use, chronic diseases,\ndiagnosis, and climate parameters spanning 23 years from Canberra Hospital in\nACT, Australia. The proposed Meta-ED consists of four foundational\nlearners-Catboost, Random Forest, Extra Tree, and lightGBoost-alongside a\ndependable top-level learner, Multi-Layer Perceptron (MLP), by combining the\nunique capabilities of varied base models (sub-learners). Our study assesses\nthe efficacy of the Meta-ED model through an extensive comparative analysis\ninvolving 23 models. 
The evaluation outcomes reveal a notable superiority of\nMeta-ED over the other models in accuracy at 85.7% (95% CI ;85.4%, 86.0%) and\nacross a spectrum of 10 evaluation metrics. Notably, when compared with\nprominent techniques, XGBoost, Random Forest (RF), AdaBoost, LightGBoost, and\nExtra Tree (ExT), Meta-ED showcases substantial accuracy enhancements of 58.6%,\n106.3%, 22.3%, 7.0%, and 15.7%, respectively. Furthermore, incorporating\nweather-related features demonstrates a 3.25% improvement in the prediction\naccuracy of visitors' numbers. The encouraging outcomes of our study underscore\nMeta-ED as a foundation model for the precise prediction of daily ED visitors.\n","authors":["Mehdi Neshat","Michael Phipps","Nikhil Jha","Danial Khojasteh","Michael Tong","Amir Gandomi"],"pdf_url":"https://arxiv.org/pdf/2411.11275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11268v1","updated":"2024-11-18T03:57:07Z","published":"2024-11-18T03:57:07Z","title":"ACE2: Accurately learning subseasonal to decadal atmospheric variability\n and forced responses","summary":" Existing machine learning models of weather variability are not formulated to\nenable assessment of their response to varying external boundary conditions\nsuch as sea surface temperature and greenhouse gases. Here we present ACE2 (Ai2\nClimate Emulator version 2) and its application to reproducing atmospheric\nvariability over the past 80 years on timescales from days to decades. ACE2 is\na 450M-parameter autoregressive machine learning emulator, operating with\n6-hour temporal resolution, 1{\\deg} horizontal resolution and eight vertical\nlayers. It exactly conserves global dry air mass and moisture and can be\nstepped forward stably for arbitrarily many steps with a throughput of about\n1500 simulated years per wall clock day. ACE2 generates emergent phenomena such\nas tropical cyclones, the Madden Julian Oscillation, and sudden stratospheric\nwarmings. 
Furthermore, it accurately reproduces the atmospheric response to El\nNi\\~no variability and global trends of temperature over the past 80 years.\nHowever, its sensitivities to separately changing sea surface temperature and\ncarbon dioxide are not entirely realistic.\n","authors":["Oliver Watt-Meyer","Brian Henn","Jeremy McGibbon","Spencer K. Clark","Anna Kwa","W. Andre Perkins","Elynn Wu","Lucas Harris","Christopher S. Bretherton"],"pdf_url":"https://arxiv.org/pdf/2411.11268v1.pdf","comment":"31 pages, 23 figures"},{"id":"http://arxiv.org/abs/2309.07176v3","updated":"2024-11-18T03:40:52Z","published":"2023-09-12T20:45:30Z","title":"Optimal and Fair Encouragement Policy Evaluation and Learning","summary":" In consequential domains, it is often impossible to compel individuals to\ntake treatment, so that optimal policy rules are merely suggestions in the\npresence of human non-adherence to treatment recommendations. Under\nheterogeneity, covariates may predict take-up of treatment and final outcome,\nbut differently. While optimal treatment rules optimize causal outcomes across\nthe population, access parity constraints or other fairness considerations on\nwho receives treatment can be important. For example, in social services, a\npersistent puzzle is the gap in take-up of beneficial services among those who\nmay benefit from them the most. We study causal identification and robust\nestimation of optimal treatment rules, including under potential violations of\npositivity. We consider fairness constraints such as demographic parity in\ntreatment take-up, and other constraints, via constrained optimization. Our\nframework can be extended to handle algorithmic recommendations under an\noften-reasonable covariate-conditional exclusion restriction, using our\nrobustness checks for lack of positivity in the recommendation. 
We develop a\ntwo-stage algorithm for solving over parametrized policy classes under general\nconstraints to obtain variance-sensitive regret bounds. We illustrate the\nmethods in three case studies based on data from reminders of SNAP benefits\nrecertification, randomized encouragement to enroll in insurance, and from\npretrial supervised release with electronic monitoring. While the specific\nremedy to inequities in algorithmic allocation is context-specific, it requires\nstudying both take-up of decisions and downstream outcomes of them.\n","authors":["Angela Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.07176v3.pdf","comment":"Updated with major new case study on SNAP recertification benefits"},{"id":"http://arxiv.org/abs/2411.11265v1","updated":"2024-11-18T03:38:42Z","published":"2024-11-18T03:38:42Z","title":"GROOT: Effective Design of Biological Sequences with Limited\n Experimental Data","summary":" Latent space optimization (LSO) is a powerful method for designing discrete,\nhigh-dimensional biological sequences that maximize expensive black-box\nfunctions, such as wet lab experiments. This is accomplished by learning a\nlatent space from available data and using a surrogate model to guide\noptimization algorithms toward optimal outputs. However, existing methods\nstruggle when labeled data is limited, as training the surrogate model with few\nlabeled data points can lead to subpar outputs, offering no advantage over the\ntraining data itself. We address this challenge by introducing GROOT, a\nGraph-based Latent Smoothing for Biological Sequence Optimization. In\nparticular, GROOT generates pseudo-labels for neighbors sampled around the\ntraining latent embeddings. These pseudo-labels are then refined and smoothed\nby Label Propagation. 
Additionally, we theoretically and empirically justify\nour approach, demonstrate GROOT's ability to extrapolate to regions beyond the\ntraining set while maintaining reliability within an upper bound of their\nexpected distances from the training regions. We evaluate GROOT on various\nbiological sequence design tasks, including protein optimization (GFP and AAV)\nand three tasks with exact oracles from Design-Bench. The results demonstrate\nthat GROOT equalizes and surpasses existing methods without requiring access to\nblack-box oracles or vast amounts of labeled data, highlighting its\npracticality and effectiveness. We release our code at\nhttps://anonymous.4open.science/r/GROOT-D554\n","authors":["Thanh V. T. Tran","Nhat Khang Ngo","Viet Anh Nguyen","Truong Son Hy"],"pdf_url":"https://arxiv.org/pdf/2411.11265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11259v1","updated":"2024-11-18T03:28:11Z","published":"2024-11-18T03:28:11Z","title":"Graph Retention Networks for Dynamic Graphs","summary":" In this work, we propose Graph Retention Network as a unified architecture\nfor deep learning on dynamic graphs. The GRN extends the core computational\nmanner of retention to dynamic graph data as graph retention, which empowers\nthe model with three key computational paradigms that enable training\nparallelism, $O(1)$ low-cost inference, and long-term batch training. This\narchitecture achieves an optimal balance of effectiveness, efficiency, and\nscalability. Extensive experiments conducted on benchmark datasets present the\nsuperior performance of the GRN in both edge-level prediction and node-level\nclassification tasks. Our architecture achieves cutting-edge results while\nmaintaining lower training latency, reduced GPU memory consumption, and up to\nan 86.7x improvement in inference throughput compared to baseline models. The\nGRNs have demonstrated strong potential to become a widely adopted architecture\nfor dynamic graph learning tasks. 
Code will be available at\nhttps://github.com/Chandler-Q/GraphRetentionNet.\n","authors":["Qian Chang","Xia Li","Xiufeng Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.11259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11256v1","updated":"2024-11-18T03:17:40Z","published":"2024-11-18T03:17:40Z","title":"Progressive Generalization Risk Reduction for Data-Efficient Causal\n Effect Estimation","summary":" Causal effect estimation (CEE) provides a crucial tool for predicting the\nunobserved counterfactual outcome for an entity. As CEE relaxes the requirement\nfor ``perfect'' counterfactual samples (e.g., patients with identical\nattributes and only differ in treatments received) that are impractical to\nobtain and can instead operate on observational data, it is usually used in\nhigh-stake domains like medical treatment effect prediction. Nevertheless, in\nthose high-stake domains, gathering a decently sized, fully labelled\nobservational dataset remains challenging due to hurdles associated with costs,\nethics, expertise and time needed, etc., of which medical treatment surveys are\na typical example. Consequently, if the training dataset is small in scale, low\ngeneralization risks can hardly be achieved on any CEE algorithms.\n Unlike existing CEE methods that assume the constant availability of a\ndataset with abundant samples, in this paper, we study a more realistic CEE\nsetting where the labelled data samples are scarce at the beginning, while more\ncan be gradually acquired over the course of training -- assuredly under a\nlimited budget considering their expensive nature. Then, the problem naturally\ncomes down to actively selecting the best possible samples to be labelled,\ne.g., identifying the next subset of patients to conduct the treatment survey.\nHowever, acquiring quality data for reducing the CEE risk under limited\nlabelling budgets remains under-explored until now. 
To fill the gap, we\ntheoretically analyse the generalization risk from an intriguing perspective of\nprogressively shrinking its upper bound, and develop a principled label\nacquisition pipeline exclusively for CEE tasks. With our analysis, we propose\nthe Model Agnostic Causal Active Learning (MACAL) algorithm for batch-wise\nlabel acquisition, which aims to reduce both the CEE model's uncertainty and\nthe post-acquisition ...\n","authors":["Hechuan Wen","Tong Chen","Guanhua Ye","Li Kheng Chai","Shazia Sadiq","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2411.11256v1.pdf","comment":"Accepted by KDD'25"},{"id":"http://arxiv.org/abs/2406.07005v2","updated":"2024-11-18T03:02:13Z","published":"2024-06-11T06:59:17Z","title":"DecoR: Deconfounding Time Series with Robust Regression","summary":" Causal inference on time series data is a challenging problem, especially in\nthe presence of unobserved confounders. This work focuses on estimating the\ncausal effect between two time series that are confounded by a third,\nunobserved time series. Assuming spectral sparsity of the confounder, we show\nhow in the frequency domain this problem can be framed as an adversarial\noutlier problem. We introduce Deconfounding by Robust regression (DecoR), a\nnovel approach that estimates the causal effect using robust linear regression\nin the frequency domain. Considering two different robust regression\ntechniques, we first improve existing bounds on the estimation error for such\ntechniques. Crucially, our results do not require distributional assumptions on\nthe covariates. We can therefore use them in time series settings. Applying\nthese results to DecoR, we prove, under suitable assumptions, upper bounds for\nthe estimation error of DecoR that imply consistency. We demonstrate DecoR's\neffectiveness through experiments on both synthetic and real-world data from\nEarth system science. 
The simulation experiments furthermore suggest that DecoR\nis robust with respect to model misspecification.\n","authors":["Felix Schur","Jonas Peters"],"pdf_url":"https://arxiv.org/pdf/2406.07005v2.pdf","comment":"27 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.16726v3","updated":"2024-11-18T02:56:27Z","published":"2024-02-26T16:48:12Z","title":"Towards Empirical Interpretation of Internal Circuits and Properties in\n Grokked Transformers on Modular Polynomials","summary":" Grokking has been actively explored to reveal the mystery of delayed\ngeneralization and identifying interpretable representations and algorithms\ninside the grokked models is a suggestive hint to understanding its mechanism.\nGrokking on modular addition has been known to implement Fourier representation\nand its calculation circuits with trigonometric identities in Transformers.\nConsidering the periodicity in modular arithmetic, the natural question is to\nwhat extent these explanations and interpretations hold for the grokking on\nother modular operations beyond addition. For a closer look, we first\nhypothesize that any modular operations can be characterized with distinctive\nFourier representation or internal circuits, grokked models obtain common\nfeatures transferable among similar operations, and mixing datasets with\nsimilar operations promotes grokking. Then, we extensively examine them by\nlearning Transformers on complex modular arithmetic tasks, including\npolynomials. Our Fourier analysis and novel progress measure for modular\narithmetic, Fourier Frequency Density and Fourier Coefficient Ratio,\ncharacterize distinctive internal representations of grokked models per modular\noperation; for instance, polynomials often result in the superposition of the\nFourier components seen in elementary arithmetic, but clear patterns do not\nemerge in challenging non-factorizable polynomials. 
In contrast, our ablation\nstudy on the pre-grokked models reveals that the transferability among the\nmodels grokked with each operation can be only limited to specific\ncombinations, such as from elementary arithmetic to linear expressions.\nMoreover, some multi-task mixtures may lead to co-grokking -- where grokking\nsimultaneously happens for all the tasks -- and accelerate generalization,\nwhile others may not find optimal solutions. We provide empirical steps towards\nthe interpretability of internal circuits.\n","authors":["Hiroki Furuta","Gouki Minegishi","Yusuke Iwasawa","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2402.16726v3.pdf","comment":"Published at Transactions on Machine Learning Research (TMLR), Code:\n https://github.com/frt03/grok_mod_poly"},{"id":"http://arxiv.org/abs/2211.10285v2","updated":"2024-11-18T02:50:46Z","published":"2022-11-18T15:17:28Z","title":"A Fair Loss Function for Network Pruning","summary":" Model pruning can enable the deployment of neural networks in environments\nwith resource constraints. While pruning may have a small effect on the overall\nperformance of the model, it can exacerbate existing biases into the model such\nthat subsets of samples see significantly degraded performance. In this paper,\nwe introduce the performance weighted loss function, a simple modified\ncross-entropy loss function that can be used to limit the introduction of\nbiases during pruning. Experiments using the CelebA, Fitzpatrick17k and\nCIFAR-10 datasets demonstrate that the proposed method is a simple and\neffective tool that can enable existing pruning methods to be used in fairness\nsensitive contexts. 
Code used to produce all experiments contained in this\npaper can be found at https://github.com/robbiemeyer/pw_loss_pruning.\n","authors":["Robbie Meyer","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2211.10285v2.pdf","comment":"[v1] Trustworthy and Socially Responsible Machine Learning (TSRML\n 2022) workshop co-located with NeurIPS 2022"},{"id":"http://arxiv.org/abs/2405.18634v2","updated":"2024-11-18T02:42:23Z","published":"2024-05-28T22:33:02Z","title":"A Theoretical Understanding of Self-Correction through In-context\n Alignment","summary":" Going beyond mimicking limited human experiences, recent studies show initial\nevidence that, like humans, large language models (LLMs) are capable of\nimproving their abilities purely by self-correction, i.e., correcting previous\nresponses through self-examination, in certain circumstances. Nevertheless,\nlittle is known about how such capabilities arise. In this work, based on a\nsimplified setup akin to an alignment task, we theoretically analyze\nself-correction from an in-context learning perspective, showing that when LLMs\ngive relatively accurate self-examinations as rewards, they are capable of\nrefining responses in an in-context way. Notably, going beyond previous\ntheories on over-simplified linear transformers, our theoretical construction\nunderpins the roles of several key designs of realistic transformers for\nself-correction: softmax attention, multi-head attention, and the MLP block. We\nvalidate these findings extensively on synthetic datasets. Inspired by these\nfindings, we also illustrate novel applications of self-correction, such as\ndefending against LLM jailbreaks, where a simple self-correction step does make\na large difference. 
We believe that these findings will inspire further\nresearch on understanding, exploiting, and enhancing self-correction for\nbuilding better foundation models.\n","authors":["Yifei Wang","Yuyang Wu","Zeming Wei","Stefanie Jegelka","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2405.18634v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.14197v3","updated":"2024-11-18T02:41:59Z","published":"2024-04-22T14:06:35Z","title":"SOFTS: Efficient Multivariate Time Series Forecasting with Series-Core\n Fusion","summary":" Multivariate time series forecasting plays a crucial role in various fields\nsuch as finance, traffic management, energy, and healthcare. Recent studies\nhave highlighted the advantages of channel independence to resist distribution\ndrift but neglect channel correlations, limiting further enhancements. Several\nmethods utilize mechanisms like attention or mixer to address this by capturing\nchannel correlations, but they either introduce excessive complexity or rely\ntoo heavily on the correlation to achieve satisfactory results under\ndistribution drifts, particularly with a large number of channels. Addressing\nthis gap, this paper presents an efficient MLP-based model, the Series-cOre\nFused Time Series forecaster (SOFTS), which incorporates a novel STar\nAggregate-Redistribute (STAR) module. Unlike traditional approaches that manage\nchannel interactions through distributed structures, \\textit{e.g.}, attention,\nSTAR employs a centralized strategy to improve efficiency and reduce reliance\non the quality of each channel. It aggregates all series to form a global core\nrepresentation, which is then dispatched and fused with individual series\nrepresentations to facilitate channel interactions effectively.SOFTS achieves\nsuperior performance over existing state-of-the-art methods with only linear\ncomplexity. The broad applicability of the STAR module across different\nforecasting models is also demonstrated empirically. 
For further research and\ndevelopment, we have made our code publicly available at\nhttps://github.com/Secilia-Cxy/SOFTS.\n","authors":["Lu Han","Xu-Yang Chen","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2404.14197v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11249v1","updated":"2024-11-18T02:36:19Z","published":"2024-11-18T02:36:19Z","title":"EXCON: Extreme Instance-based Contrastive Representation Learning of\n Severely Imbalanced Multivariate Time Series for Solar Flare Prediction","summary":" In heliophysics research, predicting solar flares is crucial due to their\npotential to impact both space-based systems and Earth's infrastructure\nsubstantially. Magnetic field data from solar active regions, recorded by solar\nimaging observatories, are transformed into multivariate time series to enable\nsolar flare prediction using temporal window-based analysis. In the realm of\nmultivariate time series-driven solar flare prediction, addressing severe class\nimbalance with effective strategies for multivariate time series representation\nlearning is key to developing robust predictive models. Traditional methods\noften struggle with overfitting to the majority class in prediction tasks where\nmajor solar flares are infrequent. This work presents EXCON, a contrastive\nrepresentation learning framework designed to enhance classification\nperformance amidst such imbalances. EXCON operates through four stages:\nobtaining core features from multivariate time series data; selecting\ndistinctive contrastive representations for each class to maximize inter-class\nseparation; training a temporal feature embedding module with a custom extreme\nreconstruction loss to minimize intra-class variation; and applying a\nclassifier to the learned embeddings for robust classification. 
The proposed\nmethod leverages contrastive learning principles to map similar instances\ncloser in the feature space while distancing dissimilar ones, a strategy not\nextensively explored in solar flare prediction tasks. This approach not only\naddresses class imbalance but also offers a versatile solution applicable to\nunivariate and multivariate time series across binary and multiclass\nclassification problems. Experimental results, including evaluations on the\nbenchmark solar flare dataset and multiple time series archive datasets with\nbinary and multiclass labels, demonstrate EXCON's efficacy in enhancing\nclassification performance.\n","authors":["Onur Vural","Shah Muhammad Hamdi","Soukaina Filali Boubrahimi"],"pdf_url":"https://arxiv.org/pdf/2411.11249v1.pdf","comment":"This work has been accepted at the 2024 IEEE International Conference\n on Big Data (IEEE BigData 2024) on October 27, 2024, as a main conference\n paper"},{"id":"http://arxiv.org/abs/2411.09915v2","updated":"2024-11-18T02:27:04Z","published":"2024-11-15T03:23:26Z","title":"Physics-informed Machine Learning for Battery Pack Thermal Management","summary":" With the popularity of electric vehicles, the demand for lithium-ion\nbatteries is increasing. Temperature significantly influences the performance\nand safety of batteries. Battery thermal management systems can effectively\ncontrol the temperature of batteries; therefore, the performance and safety can\nbe ensured. However, the development process of battery thermal management\nsystems is time-consuming and costly due to the extensive training dataset\nneeded by data-driven models requiring enormous computational costs for finite\nelement analysis. Therefore, a new approach to constructing surrogate models is\nneeded in the era of AI. Physics-informed machine learning enforces the\nphysical laws in surrogate models, making it the perfect candidate for\nestimating battery pack temperature distribution. 
In this study, we first\ndeveloped a 21700 battery pack indirect liquid cooling system with cold plates\non the top and bottom with thermal paste surrounding the battery cells. Then,\nthe simplified finite element model was built based on experiment results. Due\nto the high coolant flow rate, the cold plates can be considered as constant\ntemperature boundaries, while battery cells are the heat sources. The\nphysics-informed convolutional neural network served as a surrogate model to\nestimate the temperature distribution of the battery pack. The loss function\nwas constructed considering the heat conduction equation based on the finite\ndifference method. The physics-informed loss function helped the convergence of\nthe training process with less data. As a result, the physics-informed\nconvolutional neural network showed more than 15 percents improvement in\naccuracy compared to the data-driven method with the same training data.\n","authors":["Zheng Liu","Yuan Jiang","Yumeng Li","Pingfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11242v1","updated":"2024-11-18T02:18:32Z","published":"2024-11-18T02:18:32Z","title":"Mirror Descent on Reproducing Kernel Banach Spaces","summary":" Recent advances in machine learning have led to increased interest in\nreproducing kernel Banach spaces (RKBS) as a more general framework that\nextends beyond reproducing kernel Hilbert spaces (RKHS). These works have\nresulted in the formulation of representer theorems under several regularized\nlearning schemes. However, little is known about an optimization method that\nencompasses these results in this setting. This paper addresses a learning\nproblem on Banach spaces endowed with a reproducing kernel, focusing on\nefficient optimization within RKBS. To tackle this challenge, we propose an\nalgorithm based on mirror descent (MDA). 
Our approach involves an iterative\nmethod that employs gradient steps in the dual space of the Banach space using\nthe reproducing kernel.\n We analyze the convergence properties of our algorithm under various\nassumptions and establish two types of results: first, we identify conditions\nunder which a linear convergence rate is achievable, akin to optimization in\nthe Euclidean setting, and provide a proof of the linear rate; second, we\ndemonstrate a standard convergence rate in a constrained setting. Moreover, to\ninstantiate this algorithm in practice, we introduce a novel family of RKBSs\nwith $p$-norm ($p \\neq 2$), characterized by both an explicit dual map and a\nkernel.\n","authors":["Akash Kumar","Mikhail Belkin","Parthe Pandit"],"pdf_url":"https://arxiv.org/pdf/2411.11242v1.pdf","comment":"42 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.11238v1","updated":"2024-11-18T02:13:11Z","published":"2024-11-18T02:13:11Z","title":"Reliable Learning of Halfspaces under Gaussian Marginals","summary":" We study the problem of PAC learning halfspaces in the reliable agnostic\nmodel of Kalai et al. (2012). The reliable PAC model captures learning\nscenarios where one type of error is costlier than the others. Our main\npositive result is a new algorithm for reliable learning of Gaussian halfspaces\non $\\mathbb{R}^d$ with sample and computational complexity $$d^{O(\\log\n(\\min\\{1/\\alpha, 1/\\epsilon\\}))}\\min (2^{\\log(1/\\epsilon)^{O(\\log\n(1/\\alpha))}},2^{\\mathrm{poly}(1/\\epsilon)})\\;,$$ where $\\epsilon$ is the\nexcess error and $\\alpha$ is the bias of the optimal halfspace. We complement\nour upper bound with a Statistical Query lower bound suggesting that the\n$d^{\\Omega(\\log (1/\\alpha))}$ dependence is best possible. 
Conceptually, our\nresults imply a strong computational separation between reliable agnostic\nlearning and standard agnostic learning of halfspaces in the Gaussian setting.\n","authors":["Ilias Diakonikolas","Lisheng Ren","Nikos Zarifis"],"pdf_url":"https://arxiv.org/pdf/2411.11238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03132v3","updated":"2024-11-18T02:06:46Z","published":"2024-10-04T04:07:15Z","title":"Autoregressive Action Sequence Learning for Robotic Manipulation","summary":" Designing a universal policy architecture that performs well across diverse\nrobots and task configurations remains a key challenge. In this work, we\naddress this by representing robot actions as sequential data and generating\nactions through autoregressive sequence modeling. Existing autoregressive\narchitectures generate end-effector waypoints sequentially as word tokens in\nlanguage modeling, which are limited to low-frequency control tasks. Unlike\nlanguage, robot actions are heterogeneous and often include continuous values\n-- such as joint positions, 2D pixel coordinates, and end-effector poses --\nwhich are not easily suited for language-based modeling. Based on this insight,\nwe introduce a straightforward enhancement: we extend causal transformers'\nsingle-token prediction to support predicting a variable number of tokens in a\nsingle step through our Chunking Causal Transformer (CCT). This enhancement\nenables robust performance across diverse tasks of various control frequencies,\ngreater efficiency by having fewer autoregression steps, and lead to a hybrid\naction sequence design by mixing different types of actions and using a\ndifferent chunk size for each action type. Based on CCT, we propose the\nAutoregressive Policy (ARP) architecture, which solves manipulation tasks by\ngenerating hybrid action sequences. 
We evaluate ARP across diverse robotic\nmanipulation environments, including Push-T, ALOHA, and RLBench, and show that\nARP, as a universal architecture, outperforms the environment-specific\nstate-of-the-art in all tested benchmarks, while being more efficient in\ncomputation and parameter sizes. Videos of our real robot demonstrations, all\nsource code and the pretrained models of ARP can be found at\nhttp://github.com/mlzxy/arp.\n","authors":["Xinyu Zhang","Yuhan Liu","Haonan Chang","Liam Schramm","Abdeslam Boularias"],"pdf_url":"https://arxiv.org/pdf/2410.03132v3.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.11843v1","updated":"2024-11-18T18:59:15Z","published":"2024-11-18T18:59:15Z","title":"Bi-Mamba: Towards Accurate 1-Bit State Space Models","summary":" The typical selective state-space model (SSM) of Mamba addresses several\nlimitations of Transformers, such as quadratic computational complexity with\nsequence length and significant inference-time memory requirements due to the\nkey-value cache. However, the growing size of Mamba models continues to pose\ntraining and deployment challenges and raises environmental concerns due to\nconsiderable energy consumption. In this work, we introduce Bi-Mamba, a\nscalable and powerful 1-bit Mamba architecture designed for more efficient\nlarge language models with multiple sizes across 780M, 1.3B, and 2.7B. Bi-Mamba\nmodels are trained from scratch on data volume as regular LLM pertaining using\nan autoregressive distillation loss. 
Extensive experimental results on language\nmodeling demonstrate that Bi-Mamba achieves performance comparable to its\nfull-precision counterparts (e.g., FP16 or BF16) and much better accuracy than\npost-training-binarization (PTB) Mamba baselines, while significantly reducing\nmemory footprint and energy consumption compared to the original Mamba model.\nOur study pioneers a new linear computational complexity LLM framework under\nlow-bit representation and facilitates the future design of specialized\nhardware tailored for efficient 1-bit Mamba-based LLMs.\n","authors":["Shengkun Tang","Liqun Ma","Haonan Li","Mingjie Sun","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2411.11843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11826v1","updated":"2024-11-18T18:44:10Z","published":"2024-11-18T18:44:10Z","title":"LightFFDNets: Lightweight Convolutional Neural Networks for Rapid Facial\n Forgery Detection","summary":" Accurate and fast recognition of forgeries is an issue of great importance in\nthe fields of artificial intelligence, image processing and object detection.\nRecognition of forgeries of facial imagery is the process of classifying and\ndefining the faces in it by analyzing real-world facial images. This process is\nusually accomplished by extracting features from an image, using classifier\nalgorithms, and correctly interpreting the results. Recognizing forgeries of\nfacial imagery correctly can encounter many different challenges. For example,\nfactors such as changing lighting conditions, viewing faces from different\nangles can affect recognition performance, and background complexity and\nperspective changes in facial images can make accurate recognition difficult.\nDespite these difficulties, significant progress has been made in the field of\nforgery detection. 
Deep learning algorithms, especially Convolutional Neural\nNetworks (CNNs), have significantly improved forgery detection performance.\n This study focuses on image processing-based forgery detection using\nFake-Vs-Real-Faces (Hard) [10] and 140k Real and Fake Faces [61] data sets.\nBoth data sets consist of two classes containing real and fake facial images.\nIn our study, two lightweight deep learning models are proposed to conduct\nforgery detection using these images. Additionally, 8 different pretrained CNN\narchitectures were tested on both data sets and the results were compared with\nnewly developed lightweight CNN models. It's shown that the proposed\nlightweight deep learning models have minimum number of layers. It's also shown\nthat the proposed lightweight deep learning models detect forgeries of facial\nimagery accurately, and computationally efficiently. Although the data set\nconsists only of face images, the developed models can also be used in other\ntwo-class object recognition problems.\n","authors":["Günel Jabbarlı","Murat Kurt"],"pdf_url":"https://arxiv.org/pdf/2411.11826v1.pdf","comment":"13 pages, 6 figures, 10 tables"},{"id":"http://arxiv.org/abs/2411.00024v2","updated":"2024-11-18T18:41:08Z","published":"2024-10-28T22:30:06Z","title":"A Perspective for Adapting Generalist AI to Specialized Medical AI\n Applications and Their Challenges","summary":" The integration of Large Language Models (LLMs) into medical applications has\nsparked widespread interest across the healthcare industry, from drug discovery\nand development to clinical decision support, assisting telemedicine, medical\ndevices, and healthcare insurance applications. This perspective paper aims to\ndiscuss the inner workings of building LLM-powered medical AI applications and\nintroduces a comprehensive framework for their development. We review existing\nliterature and outline the unique challenges of applying LLMs in specialized\nmedical contexts. 
Additionally, we introduce a three-step framework to organize\nmedical LLM research activities: 1) Modeling: breaking down complex medical\nworkflows into manageable steps for developing medical-specific models; 2)\nOptimization: optimizing the model performance with crafted prompts and\nintegrating external knowledge and tools, and 3) System engineering:\ndecomposing complex tasks into subtasks and leveraging human expertise for\nbuilding medical AI applications. Furthermore, we offer a detailed use case\nplaybook that describes various LLM-powered medical AI applications, such as\noptimizing clinical trial design, enhancing clinical decision support, and\nadvancing medical imaging analysis. Finally, we discuss various challenges and\nconsiderations for building medical AI applications with LLMs, such as handling\nhallucination issues, data ownership and compliance, privacy, intellectual\nproperty considerations, compute cost, sustainability issues, and responsible\nAI requirements.\n","authors":["Zifeng Wang","Hanyin Wang","Benjamin Danek","Ying Li","Christina Mack","Hoifung Poon","Yajuan Wang","Pranav Rajpurkar","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.00024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v2","updated":"2024-11-18T18:35:06Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies have deployed watermark-based detection to identify\nAI-generated content. However, attribution--the ability to trace back to the\nuser of a generative AI (GenAI) service who created a given piece of\nAI-generated content--remains largely unexplored despite its growing\nimportance. In this work, we aim to bridge this gap by conducting the first\nsystematic study on watermark-based, user-level attribution of AI-generated\ncontent. 
Our key idea is to assign a unique watermark to each user of the GenAI\nservice and embed this watermark into the AI-generated content created by that\nuser. Attribution is then performed by identifying the user whose watermark\nbest matches the one extracted from the given content. This approach, however,\nfaces a key challenge: How should watermarks be selected for users to maximize\nattribution performance? To address the challenge, we first theoretically\nderive lower bounds on detection and attribution performance through rigorous\nprobabilistic analysis for any given set of user watermarks. Then, we select\nwatermarks for users to maximize these lower bounds, thereby optimizing\ndetection and attribution performance. Our theoretical and empirical results\nshow that watermark-based attribution inherits both the accuracy and\n(non-)robustness properties of the underlying watermark. Specifically,\nattribution remains highly accurate when the watermarked AI-generated content\nis either not post-processed or subjected to common post-processing such as\nJPEG compression, as well as black-box adversarial post-processing with limited\nquery budgets.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06097v2","updated":"2024-11-18T18:19:34Z","published":"2024-11-09T07:19:19Z","title":"A Multimodal Adaptive Graph-based Intelligent Classification Model for\n Fake News","summary":" Numerous studies have been proposed to detect fake news focusing on\nmulti-modalities based on machine and/or deep learning. However, studies\nfocusing on graph-based structures using geometric deep learning are lacking.\nTo address this challenge, we introduce the Multimodal Adaptive Graph-based\nIntelligent Classification (aptly referred to as MAGIC) for fake news\ndetection. 
Specifically, the Encoder Representations from Transformers was used\nfor text vectorization whilst ResNet50 was used for images. A comprehensive\ninformation interaction graph was built using the adaptive Graph Attention\nNetwork before classifying the multimodal input through the Softmax function.\nMAGIC was trained and tested on two fake news datasets, that is, Fakeddit\n(English) and Multimodal Fake News Detection (Chinese), with the model\nachieving an accuracy of 98.8\\% and 86.3\\%, respectively. Ablation experiments\nalso revealed MAGIC to yield superior performance across both the datasets.\nFindings show that a graph-based deep learning adaptive model is effective in\ndetecting multimodal fake news, surpassing state-of-the-art methods.\n","authors":[" Jun-hao"," Xu"],"pdf_url":"https://arxiv.org/pdf/2411.06097v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2411.11799v1","updated":"2024-11-18T18:11:53Z","published":"2024-11-18T18:11:53Z","title":"Edge-Enhanced Dilated Residual Attention Network for Multimodal Medical\n Image Fusion","summary":" Multimodal medical image fusion is a crucial task that combines complementary\ninformation from different imaging modalities into a unified representation,\nthereby enhancing diagnostic accuracy and treatment planning. While deep\nlearning methods, particularly Convolutional Neural Networks (CNNs) and\nTransformers, have significantly advanced fusion performance, some of the\nexisting CNN-based methods fall short in capturing fine-grained multiscale and\nedge features, leading to suboptimal feature integration. Transformer-based\nmodels, on the other hand, are computationally intensive in both the training\nand fusion stages, making them impractical for real-time clinical use.\nMoreover, the clinical application of fused images remains unexplored. 
In this\npaper, we propose a novel CNN-based architecture that addresses these\nlimitations by introducing a Dilated Residual Attention Network Module for\neffective multiscale feature extraction, coupled with a gradient operator to\nenhance edge detail learning. To ensure fast and efficient fusion, we present a\nparameter-free fusion strategy based on the weighted nuclear norm of softmax,\nwhich requires no additional computations during training or inference.\nExtensive experiments, including a downstream brain tumor classification task,\ndemonstrate that our approach outperforms various baseline methods in terms of\nvisual quality, texture preservation, and fusion speed, making it a possible\npractical solution for real-world clinical applications. The code will be\nreleased at https://github.com/simonZhou86/en_dran.\n","authors":["Meng Zhou","Yuxuan Zhang","Xiaolan Xu","Jiayi Wang","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2411.11799v1.pdf","comment":"An extended version of the paper accepted at IEEE BIBM 2024"},{"id":"http://arxiv.org/abs/2411.11795v1","updated":"2024-11-18T18:08:52Z","published":"2024-11-18T18:08:52Z","title":"Exploring adversarial robustness of JPEG AI: methodology, comparison and\n new methods","summary":" Adversarial robustness of neural networks is an increasingly important area\nof research, combining studies on computer vision models, large language models\n(LLMs), and others. With the release of JPEG AI - the first standard for\nend-to-end neural image compression (NIC) methods - the question of its\nrobustness has become critically significant. JPEG AI is among the first\ninternational, real-world applications of neural-network-based models to be\nembedded in consumer devices. However, research on NIC robustness has been\nlimited to open-source codecs and a narrow range of attacks. 
This paper\nproposes a new methodology for measuring NIC robustness to adversarial attacks.\nWe present the first large-scale evaluation of JPEG AI's robustness, comparing\nit with other NIC models. Our evaluation results and code are publicly\navailable online (link is hidden for a blind review).\n","authors":["Egor Kovalev","Georgii Bychkov","Khaled Abud","Aleksandr Gushchin","Anna Chistyakova","Sergey Lavrushkin","Dmitriy Vatolin","Anastasia Antsiferova"],"pdf_url":"https://arxiv.org/pdf/2411.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17994v3","updated":"2024-11-18T18:00:47Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge the generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges. To address the intra-user generalization\nchallenge, this work introduces CRoP, a novel static personalization approach.\nCRoP leverages off-the-shelf pre-trained models as generic starting points and\ncaptures user-specific traits through adaptive pruning on a minimal sub-network\nwhile preserving generic knowledge in the remaining parameters. 
CRoP\ndemonstrates superior personalization effectiveness and intra-user robustness\nacross four human-sensing datasets, including two from real-world health\ndomains, underscoring its practical and social impact. Additionally, to support\nCRoP's generalization ability and design choices, we provide empirical\njustification through gradient inner product analysis, ablation studies, and\ncomparisons against state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v3.pdf","comment":"33 pages, 6 figues and 12 tables"},{"id":"http://arxiv.org/abs/2407.21343v2","updated":"2024-11-18T17:59:10Z","published":"2024-07-31T05:17:31Z","title":"MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation\n Framework","summary":" Medical imaging segmentation is a highly active area of research, with deep\nlearning-based methods achieving state-of-the-art results in several\nbenchmarks. However, the lack of standardized tools for training, testing, and\nevaluating new methods makes the comparison of methods difficult. To address\nthis, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple,\nmodular, and end-to-end medical imaging segmentation framework designed to\nfacilitate consistent training, testing, and evaluation of deep learning-based\nmedical imaging segmentation methods. MIST standardizes data analysis,\npreprocessing, and evaluation pipelines, accommodating multiple architectures\nand loss functions. This standardization ensures reproducible and fair\ncomparisons across different methods. We detail MIST's data format\nrequirements, pipelines, and auxiliary features and demonstrate its efficacy\nusing the BraTS Adult Glioma Post-Treatment Challenge dataset. 
Our results\nhighlight MIST's ability to produce accurate segmentation masks and its\nscalability across multiple GPUs, showcasing its potential as a powerful tool\nfor future medical imaging research and development.\n","authors":["Adrian Celaya","Evan Lim","Rachel Glenn","Brayden Mi","Alex Balsells","Dawid Schellingerhout","Tucker Netherton","Caroline Chung","Beatrice Riviere","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2407.21343v2.pdf","comment":"Submitted to BraTS 2024"},{"id":"http://arxiv.org/abs/2411.11774v1","updated":"2024-11-18T17:53:07Z","published":"2024-11-18T17:53:07Z","title":"Exploring the Requirements of Clinicians for Explainable AI Decision\n Support Systems in Intensive Care","summary":" There is a growing need to understand how digital systems can support\nclinical decision-making, particularly as artificial intelligence (AI) models\nbecome increasingly complex and less human-interpretable. This complexity\nraises concerns about trustworthiness, impacting safe and effective adoption of\nsuch technologies. Improved understanding of decision-making processes and\nrequirements for explanations coming from decision support tools is a vital\ncomponent in providing effective explainable solutions. This is particularly\nrelevant in the data-intensive, fast-paced environments of intensive care units\n(ICUs). To explore these issues, group interviews were conducted with seven ICU\nclinicians, representing various roles and experience levels. Thematic analysis\nrevealed three core themes: (T1) ICU decision-making relies on a wide range of\nfactors, (T2) the complexity of patient state is challenging for shared\ndecision-making, and (T3) requirements and capabilities of AI decision support\nsystems. We include design recommendations from clinical input, providing\ninsights to inform future AI systems for intensive care.\n","authors":["Jeffrey N. 
Clark","Matthew Wragg","Emily Nielsen","Miquel Perello-Nieto","Nawid Keshtmand","Michael Ambler","Shiv Sharma","Christopher P. Bourdeaux","Amberly Brigden","Raul Santos-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2411.11774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11770v1","updated":"2024-11-18T17:50:34Z","published":"2024-11-18T17:50:34Z","title":"CNMBert: A Model For Hanyu Pinyin Abbreviation to Character Conversion\n Task","summary":" The task of converting Hanyu Pinyin abbreviations to Chinese characters\nrepresents a significant branch within the domain of Chinese Spelling\nCorrection (CSC). This task is typically one of text-length alignment, however,\ndue to the limited informational content in pinyin abbreviations, achieving\naccurate conversion is challenging. In this paper, we propose CNMBert which\nstands for zh-CN Pinyin Multi-mask Bert Model as a solution to this issue.\nCNMBert surpasses few-shot GPT models, achieving a 59.63% MRR on a\n10,424-sample Hanyu Pinyin abbreviation test dataset.\n","authors":["Zishuo Feng","Feng Cao"],"pdf_url":"https://arxiv.org/pdf/2411.11770v1.pdf","comment":"9 pages, 2figures"},{"id":"http://arxiv.org/abs/2409.03077v2","updated":"2024-11-18T17:48:59Z","published":"2024-09-04T21:05:42Z","title":"Backdoor defense, learnability and obfuscation","summary":" We introduce a formal notion of defendability against backdoors using a game\nbetween an attacker and a defender. In this game, the attacker modifies a\nfunction to behave differently on a particular input known as the \"trigger\",\nwhile behaving the same almost everywhere else. The defender then attempts to\ndetect the trigger at evaluation time. If the defender succeeds with high\nenough probability, then the function class is said to be defendable. 
The key\nconstraint on the attacker that makes defense possible is that the attacker's\nstrategy must work for a randomly-chosen trigger.\n Our definition is simple and does not explicitly mention learning, yet we\ndemonstrate that it is closely connected to learnability. In the\ncomputationally unbounded setting, we use a voting algorithm of Hanneke et al.\n(2022) to show that defendability is essentially determined by the VC dimension\nof the function class, in much the same way as PAC learnability. In the\ncomputationally bounded setting, we use a similar argument to show that\nefficient PAC learnability implies efficient defendability, but not conversely.\nOn the other hand, we use indistinguishability obfuscation to show that the\nclass of polynomial size circuits is not efficiently defendable. Finally, we\npresent polynomial size decision trees as a natural example for which defense\nis strictly easier than learning. Thus, we identify efficient defendability as\na notable intermediate concept in between efficient learnability and\nobfuscation.\n","authors":["Paul Christiano","Jacob Hilton","Victor Lecomte","Mark Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03077v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2411.11768v1","updated":"2024-11-18T17:47:54Z","published":"2024-11-18T17:47:54Z","title":"AdaptLIL: A Gaze-Adaptive Visualization for Ontology Mapping","summary":" This paper showcases AdaptLIL, a real-time adaptive link-indented list\nontology mapping visualization that uses eye gaze as the primary input source.\nThrough a multimodal combination of real-time systems, deep learning, and web\ndevelopment applications, this system uniquely curtails graphical overlays\n(adaptations) to pairwise mappings of link-indented list ontology\nvisualizations for individual users based solely on their eye gaze.\n","authors":["Nicholas Chow","Bo 
Fu"],"pdf_url":"https://arxiv.org/pdf/2411.11768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07096v5","updated":"2024-11-18T17:40:57Z","published":"2024-10-09T17:35:25Z","title":"Identifying and Addressing Delusions for Target-Directed Decision-Making","summary":" Target-directed agents utilize self-generated targets, to guide their\nbehaviors for better generalization. These agents are prone to blindly chasing\nproblematic targets, resulting in worse generalization and safety catastrophes.\nWe show that these behaviors can be results of delusions, stemming from\nimproper designs around training: the agent may naturally come to hold false\nbeliefs about certain targets. We identify delusions via intuitive examples in\ncontrolled environments, and investigate their causes and mitigations. With the\ninsights, we demonstrate how we can make agents address delusions preemptively\nand autonomously. We validate empirically the effectiveness of the proposed\nstrategies in correcting delusional behaviors and improving out-of-distribution\ngeneralization.\n","authors":["Mingde Zhao","Tristan Sylvain","Doina Precup","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2410.07096v5.pdf","comment":"20241118 12h40: incorporated changes of rebuttal"},{"id":"http://arxiv.org/abs/2411.11758v1","updated":"2024-11-18T17:37:10Z","published":"2024-11-18T17:37:10Z","title":"The Power of Many: Multi-Agent Multimodal Models for Cultural Image\n Captioning","summary":" Large Multimodal Models (LMMs) exhibit impressive performance across various\nmultimodal tasks. However, their effectiveness in cross-cultural contexts\nremains limited due to the predominantly Western-centric nature of most data\nand models. Conversely, multi-agent models have shown significant capability in\nsolving complex tasks. Our study evaluates the collective performance of LMMs\nin a multi-agent interaction setting for the novel task of cultural image\ncaptioning. 
Our contributions are as follows: (1) We introduce MosAIC, a\nMulti-Agent framework to enhance cross-cultural Image Captioning using LMMs\nwith distinct cultural personas; (2) We provide a dataset of culturally\nenriched image captions in English for images from China, India, and Romania\nacross three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable\nmetric for evaluating cultural information within image captions; and (4) We\nshow that the multi-agent interaction outperforms single-agent models across\ndifferent metrics, and offer valuable insights for future research. Our dataset\nand models can be accessed at https://github.com/MichiganNLP/MosAIC.\n","authors":["Longju Bai","Angana Borah","Oana Ignat","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2411.11758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22339v2","updated":"2024-11-18T17:30:47Z","published":"2024-10-11T18:47:04Z","title":"DAWN: Designing Distributed Agents in a Worldwide Network","summary":" The rapid evolution of Large Language Models (LLMs) has transformed them from\nbasic conversational tools into sophisticated entities capable of complex\nreasoning and decision-making. These advancements have led to the development\nof specialized LLM-based agents designed for diverse tasks such as coding and\nweb browsing. As these agents become more capable, the need for a robust\nframework that facilitates global communication and collaboration among them\ntowards advanced objectives has become increasingly critical. Distributed\nAgents in a Worldwide Network (DAWN) addresses this need by offering a\nversatile framework that integrates LLM-based agents with traditional software\nsystems, enabling the creation of agentic applications suited for a wide range\nof use cases. DAWN enables distributed agents worldwide to register and be\neasily discovered through Gateway Agents. 
Collaborations among these agents are\ncoordinated by a Principal Agent equipped with reasoning strategies. DAWN\noffers three operational modes: No-LLM Mode for deterministic tasks, Copilot\nfor augmented decision-making, and LLM Agent for autonomous operations.\nAdditionally, DAWN ensures the safety and security of agent collaborations\nglobally through a dedicated safety, security, and compliance layer, protecting\nthe network against attackers and adhering to stringent security and compliance\nstandards. These features make DAWN a robust network for deploying agent-based\napplications across various industries.\n","authors":["Zahra Aminiranjbar","Jianan Tang","Qiudan Wang","Shubha Pant","Mahesh Viswanathan"],"pdf_url":"https://arxiv.org/pdf/2410.22339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11739v1","updated":"2024-11-18T17:08:35Z","published":"2024-11-18T17:08:35Z","title":"QARM: Quantitative Alignment Multi-Modal Recommendation at Kuaishou","summary":" In recent years, with the significant evolution of multi-modal large models,\nmany recommender researchers realized the potential of multi-modal information\nfor user interest modeling. In industry, a wide-used modeling architecture is a\ncascading paradigm: (1) first pre-training a multi-modal model to provide\nomnipotent representations for downstream services; (2) The downstream\nrecommendation model takes the multi-modal representation as additional input\nto fit real user-item behaviours. Although such paradigm achieves remarkable\nimprovements, however, there still exist two problems that limit model\nperformance: (1) Representation Unmatching: The pre-trained multi-modal model\nis always supervised by the classic NLP/CV tasks, while the recommendation\nmodels are supervised by real user-item interaction. 
As a result, the two\nfundamentally different tasks' goals were relatively separate, and there was a\nlack of consistent objective on their representations; (2) Representation\nUnlearning: The generated multi-modal representations are always stored in\ncache store and serve as extra fixed input of recommendation model, thus could\nnot be updated by recommendation model gradient, further unfriendly for\ndownstream training. Inspired by the two difficulties challenges in downstream\ntasks usage, we introduce a quantitative multi-modal framework to customize the\nspecialized and trainable multi-modal information for different downstream\nmodels.\n","authors":["Xinchen Luo","Jiangxia Cao","Tianyu Sun","Jinkai Yu","Rui Huang","Wei Yuan","Hezheng Lin","Yichen Zheng","Shiyao Wang","Qigen Hu","Changqing Qiu","Jiaqi Zhang","Xu Zhang","Zhiheng Yan","Jingming Zhang","Simin Zhang","Mingxing Wen","Zhaojie Liu","Kun Gai","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.11739v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.11738v1","updated":"2024-11-18T17:07:37Z","published":"2024-11-18T17:07:37Z","title":"WoodYOLO: A Novel Object Detector for Wood Species Detection in\n Microscopic Images","summary":" Wood species identification plays a crucial role in various industries, from\nensuring the legality of timber products to advancing ecological conservation\nefforts. This paper introduces WoodYOLO, a novel object detection algorithm\nspecifically designed for microscopic wood fiber analysis. Our approach adapts\nthe YOLO architecture to address the challenges posed by large, high-resolution\nmicroscopy images and the need for high recall in localization of the cell type\nof interest (vessel elements). Our results show that WoodYOLO significantly\noutperforms state-of-the-art models, achieving performance gains of 12.9% and\n6.5% in F2 score over YOLOv10 and YOLOv7, respectively. 
This improvement in\nautomated wood cell type localization capabilities contributes to enhancing\nregulatory compliance, supporting sustainable forestry practices, and promoting\nbiodiversity conservation efforts globally.\n","authors":["Lars Nieradzik","Henrike Stephani","Jördis Sieburg-Rockel","Stephanie Helmling","Andrea Olbrich","Stephanie Wrage","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2411.11738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15367v2","updated":"2024-11-18T17:00:32Z","published":"2024-09-18T18:36:18Z","title":"Fine-Tuning a Time Series Foundation Model with Wasserstein Loss","summary":" Inspired by recent advancements in large language models (LLMs) for Natural\nLanguage Processing (NLP), there has been a surge in research focused on\ndeveloping foundational models for time series forecasting. One approach\ninvolves training LLM architectures on tokenized time series data using\ncross-entropy loss. Although this method has demonstrated promising results,\ncross-entropy loss is primarily designed for classification tasks and does not\naccount for the distance between classes. To address this limitation, we\npropose using the Wasserstein loss for such architectures. To validate our\napproach, we fine-tuned a foundational time series model on $22$ zero-shot\ndatasets, comparing the performance of cross-entropy loss with that of\nWasserstein loss. 
Our results demonstrate that replacing cross-entropy loss\nwith Wasserstein loss significantly improves point estimation.\n","authors":["Andrei Chernov"],"pdf_url":"https://arxiv.org/pdf/2409.15367v2.pdf","comment":"4 main pages; 2 figures"},{"id":"http://arxiv.org/abs/2411.11731v1","updated":"2024-11-18T16:59:59Z","published":"2024-11-18T16:59:59Z","title":"Moral Persuasion in Large Language Models: Evaluating Susceptibility and\n Ethical Alignment","summary":" We explore how large language models (LLMs) can be influenced by prompting\nthem to alter their initial decisions and align them with established ethical\nframeworks. Our study is based on two experiments designed to assess the\nsusceptibility of LLMs to moral persuasion. In the first experiment, we examine\nthe susceptibility to moral ambiguity by evaluating a Base Agent LLM on morally\nambiguous scenarios and observing how a Persuader Agent attempts to modify the\nBase Agent's initial decisions. The second experiment evaluates the\nsusceptibility of LLMs to align with predefined ethical frameworks by prompting\nthem to adopt specific value alignments rooted in established philosophical\ntheories. 
The results demonstrate that LLMs can indeed be persuaded in morally\ncharged scenarios, with the success of persuasion depending on factors such as\nthe model used, the complexity of the scenario, and the conversation length.\nNotably, LLMs of distinct sizes but from the same company produced markedly\ndifferent outcomes, highlighting the variability in their susceptibility to\nethical persuasion.\n","authors":["Allison Huang","Yulu Niki Pi","Carlos Mougan"],"pdf_url":"https://arxiv.org/pdf/2411.11731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11730v1","updated":"2024-11-18T16:59:44Z","published":"2024-11-18T16:59:44Z","title":"Lifted Model Construction without Normalisation: A Vectorised Approach\n to Exploit Symmetries in Factor Graphs","summary":" Lifted probabilistic inference exploits symmetries in a probabilistic model\nto allow for tractable probabilistic inference with respect to domain sizes of\nlogical variables. We found that the current state-of-the-art algorithm to\nconstruct a lifted representation in form of a parametric factor graph misses\nsymmetries between factors that are exchangeable but scaled differently,\nthereby leading to a less compact representation. In this paper, we propose a\ngeneralisation of the advanced colour passing (ACP) algorithm, which is the\nstate of the art to construct a parametric factor graph. Our proposed algorithm\nallows for potentials of factors to be scaled arbitrarily and efficiently\ndetects more symmetries than the original ACP algorithm. 
By detecting strictly\nmore symmetries than ACP, our algorithm significantly reduces online query\ntimes for probabilistic inference when the resulting model is applied, which we\nalso confirm in our experiments.\n","authors":["Malte Luttermann","Ralf Möller","Marcel Gehrke"],"pdf_url":"https://arxiv.org/pdf/2411.11730v1.pdf","comment":"Accepted to the Proceedings of the 3rd Learning on Graphs Conference\n (LoG 2024)"},{"id":"http://arxiv.org/abs/2403.11116v3","updated":"2024-11-18T16:57:14Z","published":"2024-03-17T06:53:44Z","title":"PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset","summary":" Multimodal Large Language Models (MLLMs) hallucinate, resulting in an\nemerging topic of visual hallucination evaluation (VHE). This paper contributes\na ChatGPT-Prompted visual hallucination evaluation Dataset (PhD) for objective\nVHE at a large scale. The essence of VHE is to ask an MLLM questions about\nspecific images to assess its susceptibility to hallucination. Depending on\nwhat to ask (objects, attributes, sentiment, etc.) and how the questions are\nasked, we structure PhD along two dimensions, i.e., task and mode. Five visual\nrecognition tasks, ranging from low-level (object / attribute recognition) to\nmiddle-level (sentiment / position recognition and counting), are considered.\nBesides a normal visual QA mode, which we term PhD-base, PhD also asks\nquestions with inaccurate context (PhD-iac) or with incorrect context\n(PhD-icc), or with AI-generated counter common sense images (PhD-ccs). We\nconstruct PhD by a ChatGPT-assisted semi-automated pipeline, encompassing four\npivotal modules: task-specific hallucinatory item (hitem) selection,\nhitem-embedded question generation, inaccurate / incorrect context generation,\nand counter-common-sense (CCS) image generation. 
With over 14k daily images,\n750 CCS images and 102k VQA triplets in total, PhD reveals considerable\nvariability in MLLMs' performance across various modes and tasks, offering\nvaluable insights into the nature of hallucination. As such, PhD stands as a\npotent tool not only for VHE but may also play a significant role in the\nrefinement of MLLMs.\n","authors":["Jiazhen Liu","Yuhan Fu","Ruobing Xie","Runquan Xie","Xingwu Sun","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2403.11116v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11714v1","updated":"2024-11-18T16:42:07Z","published":"2024-11-18T16:42:07Z","title":"Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via\n Skill Library and Tactile Representation","summary":" Deploying robots in open-world environments involves complex tasks\ncharacterized by long sequences and rich interactions, necessitating efficient\ntransfer of robotic skills across diverse and complex scenarios. To address\nthis challenge, we propose a skill library framework based on knowledge graphs,\nwhich endows robots with high-level skill awareness and spatial semantic\nunderstanding. The framework hierarchically organizes operational knowledge by\nconstructing a \"task graph\" and a \"scene graph\" to represent task and scene\nsemantic information, respectively. We introduce a \"state graph\" to facilitate\ninteraction between high-level task planning and low-level scene information.\nFurthermore, we propose a hierarchical transfer framework for operational\nskills. At the task level, the framework integrates contextual learning and\nchain-of-thought prompting within a four-stage prompt paradigm, leveraging\nlarge language models' (LLMs) reasoning and generalization capabilities to\nachieve task-level subtask sequence transfer. 
At the motion level, an adaptive\ntrajectory transfer method is developed using the A* algorithm and the skill\nlibrary, enabling motion-level adaptive trajectory transfer. At the physical\nlevel, we introduce an adaptive contour extraction and posture perception\nmethod based on tactile perception. This method dynamically obtains\nhigh-precision contour and posture information from visual-tactile texture data\nand adjusts transferred skills, such as contact positions and postures, to\nensure effectiveness in new environments. Experimental results validate the\neffectiveness of the proposed methods. Project\nwebsite:https://github.com/MingchaoQi/skill_transfer\n","authors":["Mingchao Qi","Yuanjin Li","Xing Liu","Zhengxiong Liu","Panfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.11714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11707v1","updated":"2024-11-18T16:34:58Z","published":"2024-11-18T16:34:58Z","title":"FedCoLLM: A Parameter-Efficient Federated Co-tuning Framework for Large\n and Small Language Models","summary":" By adapting Large Language Models (LLMs) to domain-specific tasks or\nenriching them with domain-specific knowledge, we can fully harness the\ncapabilities of LLMs. Nonetheless, a gap persists in achieving simultaneous\nmutual enhancement between the server's LLM and the downstream clients' Small\nLanguage Models (SLMs). To address this, we propose FedCoLLM, a novel and\nparameter-efficient federated framework designed for co-tuning LLMs and SLMs.\nThis approach is aimed at adaptively transferring server-side LLMs knowledge to\nclients' SLMs while simultaneously enriching the LLMs with domain insights from\nthe clients. To accomplish this, FedCoLLM utilizes lightweight adapters in\nconjunction with SLMs, facilitating knowledge exchange between server and\nclients in a manner that respects data privacy while also minimizing\ncomputational and communication overhead. 
Our evaluation of FedCoLLM, utilizing\nvarious public LLMs and SLMs across a range of NLP text generation tasks,\nreveals that the performance of clients' SLMs experiences notable improvements\nwith the assistance of the LLMs. Simultaneously, the LLMs enhanced via FedCoLLM\nachieves comparable performance to that obtained through direct fine-tuning on\nclients' data.\n","authors":["Tao Fan","Yan Kang","Guoqiang Ma","Lixin Fan","Kai Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11706v1","updated":"2024-11-18T16:33:52Z","published":"2024-11-18T16:33:52Z","title":"MC-LLaVA: Multi-Concept Personalized Vision-Language Model","summary":" Current vision-language models (VLMs) show exceptional abilities across\ndiverse tasks including visual question answering. To enhance user experience\nin practical applications, recent studies investigate VLM personalization to\nunderstand user-provided concepts. However, existing studies mainly focus on\nsingle-concept personalization, neglecting the existence and interplay of\nmultiple concepts, which limits the real-world applicability of personalized\nVLMs. In this paper, we propose the first multi-concept personalization method\nnamed MC-LLaVA along with a high-quality multi-concept personalization dataset.\nSpecifically, MC-LLaVA uses a joint training strategy incorporating multiple\nconcepts in a single training step, allowing VLMs to perform accurately in\nmulti-concept personalization. To reduce the cost of joint training, MC-LLaVA\nleverages visual token information for concept token initialization, yielding\nimproved concept representation and accelerating joint training. To advance\nmulti-concept personalization research, we further contribute a high-quality\ndataset. We carefully collect images from various movies that contain multiple\ncharacters and manually generate the multi-concept question-answer samples. 
Our\ndataset features diverse movie types and question-answer types. We conduct\ncomprehensive qualitative and quantitative experiments to demonstrate that\nMC-LLaVA can achieve impressive multi-concept personalized responses, paving\nthe way for VLMs to become better user-specific assistants. The code and\ndataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA.\n","authors":["Ruichuan An","Sihan Yang","Ming Lu","Kai Zeng","Yulin Luo","Ying Chen","Jiajun Cao","Hao Liang","Qi She","Shanghang Zhang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11181v2","updated":"2024-11-18T16:25:53Z","published":"2024-10-15T01:51:29Z","title":"DARNet: Dual Attention Refinement Network with Spatiotemporal\n Construction for Auditory Attention Detection","summary":" At a cocktail party, humans exhibit an impressive ability to direct their\nattention. The auditory attention detection (AAD) approach seeks to identify\nthe attended speaker by analyzing brain signals, such as EEG signals. However,\ncurrent AAD algorithms overlook the spatial distribution information within EEG\nsignals and lack the ability to capture long-range latent dependencies,\nlimiting the model's ability to decode brain activity. To address these issues,\nthis paper proposes a dual attention refinement network with spatiotemporal\nconstruction for AAD, named DARNet, which consists of the spatiotemporal\nconstruction module, dual attention refinement module, and feature fusion \\&\nclassifier module. Specifically, the spatiotemporal construction module aims to\nconstruct more expressive spatiotemporal feature representations, by capturing\nthe spatial distribution characteristics of EEG signals. The dual attention\nrefinement module aims to extract different levels of temporal patterns in EEG\nsignals and enhance the model's ability to capture long-range latent\ndependencies. 
The feature fusion \\& classifier module aims to aggregate\ntemporal patterns and dependencies from different levels and obtain the final\nclassification results. The experimental results indicate that compared to the\nstate-of-the-art models, DARNet achieves an average classification accuracy\nimprovement of 5.9\\% for 0.1s, 4.6\\% for 1s, and 3.9\\% for 2s on the DTU\ndataset. While maintaining excellent classification performance, DARNet\nsignificantly reduces the number of required parameters. Compared to the\nstate-of-the-art models, DARNet reduces the parameter count by 91\\%. Code is\navailable at: https://github.com/fchest/DARNet.git.\n","authors":["Sheng Yan","Cunhang fan","Hongyu Zhang","Xiaoke Yang","Jianhua Tao","Zhao Lv"],"pdf_url":"https://arxiv.org/pdf/2410.11181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19212v3","updated":"2024-11-18T16:22:41Z","published":"2024-05-29T15:54:03Z","title":"Partial Information Decomposition for Data Interpretability and Feature\n Selection","summary":" In this paper, we introduce Partial Information Decomposition of Features\n(PIDF), a new paradigm for simultaneous data interpretability and feature\nselection. Contrary to traditional methods that assign a single importance\nvalue, our approach is based on three metrics per feature: the mutual\ninformation shared with the target variable, the feature's contribution to\nsynergistic information, and the amount of this information that is redundant.\nIn particular, we develop a novel procedure based on these three metrics, which\nreveals not only how features are correlated with the target but also the\nadditional and overlapping information provided by considering them in\ncombination with other features. 
We extensively evaluate PIDF using both\nsynthetic and real-world data, demonstrating its potential applications and\neffectiveness, by considering case studies from genetics and neuroscience.\n","authors":["Charles Westphal","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2405.19212v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11694v1","updated":"2024-11-18T16:15:17Z","published":"2024-11-18T16:15:17Z","title":"Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search","summary":" Recently, test-time scaling has garnered significant attention from the\nresearch community, largely due to the substantial advancements of the o1 model\nreleased by OpenAI. By allocating more computational resources during the\ninference phase, large language models~(LLMs) can extensively explore the\nsolution space by generating more thought tokens or diverse solutions, thereby\nproducing more accurate responses. However, developing an o1-like reasoning\napproach is challenging, and researchers have been making various attempts to\nadvance this open area of research. In this paper, we present a preliminary\nexploration into enhancing the reasoning abilities of LLMs through\nreward-guided tree search algorithms. This framework is implemented by\nintegrating the policy model, reward model, and search algorithm. It is\nprimarily constructed around a tree search algorithm, where the policy model\nnavigates a dynamically expanding tree guided by a specially trained reward\nmodel. We thoroughly explore various design considerations necessary for\nimplementing this framework and provide a detailed report of the technical\naspects. 
To assess the effectiveness of our approach, we focus on mathematical\nreasoning tasks and conduct extensive evaluations on four challenging datasets,\nsignificantly enhancing the reasoning abilities of LLMs.\n","authors":["Jinhao Jiang","Zhipeng Chen","Yingqian Min","Jie Chen","Xiaoxue Cheng","Jiapeng Wang","Yiru Tang","Haoxiang Sun","Jia Deng","Wayne Xin Zhao","Zheng Liu","Dong Yan","Jian Xie","Zhongyuan Wang","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2411.11694v1.pdf","comment":"LLM;Complex Reasoning;Math"},{"id":"http://arxiv.org/abs/2411.11688v1","updated":"2024-11-18T16:11:25Z","published":"2024-11-18T16:11:25Z","title":"Conceptwm: A Diffusion Model Watermark for Concept Protection","summary":" The personalization techniques of diffusion models succeed in generating\nspecific concepts but also pose threats to copyright protection and illegal\nuse. Model Watermarking is an effective method to prevent the unauthorized use\nof subject-driven or style-driven image generation, safeguarding concept\ncopyrights. However, under the goal of concept-oriented protection, current\nwatermarking schemes typically add watermarks to all images rather than\napplying them in a refined manner targeted at specific concepts. Additionally,\nthe personalization techniques of diffusion models can easily remove\nwatermarks. Existing watermarking methods struggle to achieve fine-grained\nwatermark embedding with a few images of specific concept and prevent removal\nof watermarks through personalized fine-tuning. Therefore, we introduce a novel\nconcept-oriented watermarking framework that seamlessly embeds imperceptible\nwatermarks into the concept of diffusion models. We conduct extensive\nexperiments and ablation studies to verify our framework. 
Our code is available\nat https://anonymous.4open.science/r/Conceptwm-4EB3/.\n","authors":["Liangqi Lei","Keke Gai","Jing Yu","Liehuang Zhu","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11683v1","updated":"2024-11-18T16:09:26Z","published":"2024-11-18T16:09:26Z","title":"TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the\n Physical World","summary":" Robotic manipulation refers to the autonomous handling and interaction of\nrobots with objects using advanced techniques in robotics and artificial\nintelligence. The advent of powerful tools such as large language models (LLMs)\nand large vision-language models (LVLMs) has significantly enhanced the\ncapabilities of these robots in environmental perception and decision-making.\nHowever, the introduction of these intelligent agents has led to security\nthreats such as jailbreak attacks and adversarial attacks.\n In this research, we take a further step by proposing a backdoor attack\nspecifically targeting robotic manipulation and, for the first time,\nimplementing backdoor attack in the physical world. By embedding a backdoor\nvisual language model into the visual perception module within the robotic\nsystem, we successfully mislead the robotic arm's operation in the physical\nworld, given the presence of common items as triggers. Experimental evaluations\nin the physical world demonstrate the effectiveness of the proposed backdoor\nattack.\n","authors":["Xianlong Wang","Hewen Pan","Hangtao Zhang","Minghui Li","Shengshan Hu","Ziqi Zhou","Lulu Xue","Peijin Guo","Yichen Wang","Wei Wan","Aishan Liu","Leo Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11683v1.pdf","comment":"Initial version with preliminary results. 
We welcome any feedback or\n suggestions"},{"id":"http://arxiv.org/abs/2411.11681v1","updated":"2024-11-18T16:03:51Z","published":"2024-11-18T16:03:51Z","title":"PSPO*: An Effective Process-supervised Policy Optimization for Reasoning\n Alignment","summary":" Process supervision enhances the performance of large language models in\nreasoning tasks by providing feedback at each step of chain-of-thought\nreasoning. However, due to the lack of effective process supervision methods,\neven advanced large language models are prone to logical errors and redundant\nreasoning. We claim that the effectiveness of process supervision significantly\ndepends on both the accuracy and the length of reasoning chains. Moreover, we\nidentify that these factors exhibit a nonlinear relationship with the overall\nreward score of the reasoning process. Inspired by these insights, we propose a\nnovel process supervision paradigm, PSPO*, which systematically outlines the\nworkflow from reward model training to policy optimization, and highlights the\nimportance of nonlinear rewards in process supervision. Based on PSPO*, we\ndevelop the PSPO-WRS, which considers the number of reasoning steps in\ndetermining reward scores and utilizes an adjusted Weibull distribution for\nnonlinear reward shaping. Experimental results on six mathematical reasoning\ndatasets demonstrate that PSPO-WRS consistently outperforms current mainstream\nmodels.\n","authors":["Jiawei Li","Xinyue Liang","Yizhe Yang","Chong Feng","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.11681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02164v4","updated":"2024-11-18T16:01:29Z","published":"2024-02-03T14:24:21Z","title":"Hierarchical Structure Enhances the Convergence and Generalizability of\n Linear Molecular Representation","summary":" Language models demonstrate fundamental abilities in syntax, semantics, and\nreasoning, though their performance often depends significantly on the inputs\nthey process. 
This study introduces TSIS (Simplified TSID) and its\nvariants:TSISD (TSIS with Depth-First Search), TSISO (TSIS in Order), and TSISR\n(TSIS in Random), as integral components of the t-SMILES framework. These\nadditions complete the framework's design, providing diverse approaches to\nmolecular representation. Through comprehensive analysis and experiments\nemploying deep generative models, including GPT, diffusion models, and\nreinforcement learning, the findings reveal that the hierarchical structure of\nt-SMILES is more straightforward to parse than initially anticipated.\nFurthermore, t-SMILES consistently outperforms other linear representations\nsuch as SMILES, SELFIES, and SAFE, demonstrating superior convergence speed and\nenhanced generalization capabilities.\n","authors":["Juan-Ni Wu","Tong Wang","Li-Juan Tang","Hai-Long Wu","Ru-Qin Yu"],"pdf_url":"https://arxiv.org/pdf/2402.02164v4.pdf","comment":"26pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.11672v1","updated":"2024-11-18T15:51:45Z","published":"2024-11-18T15:51:45Z","title":"Artificial Scientific Discovery","summary":" Rooted in the explosion of deep learning over the past decade, this thesis\nspans from AlphaGo to ChatGPT to empirically examine the fundamental concepts\nneeded to realize the vision of an artificial scientist: a machine with the\ncapacity to autonomously generate original research and contribute to the\nexpansion of human knowledge. The investigation begins with {\\sc Olivaw}, an\nAlphaGo Zero-like agent that discovers Othello knowledge from scratch but is\nunable to communicate it. This realization leads to the development of the\nExplanatory Learning (EL) framework, a formalization of the problem faced by a\nscientist when trying to explain a new phenomenon to their peers. The effective\nEL prescriptions allow us to crack Zendo, a board game simulating the\nscientific endeavor. 
This success comes with a fundamental insight: an\nartificial scientist must develop its own interpretation of the language used\nto explain its findings. This perspective then leads us to see modern\nmultimodal models as interpreters, and to devise a new way to build\ninterpretable and cost-effective CLIP-like models: by coupling two unimodal\nmodels using little multimodal data and no further training. Finally, we\ndiscuss what ChatGPT and its siblings are still missing to become artificial\nscientists, and introduce Odeen, a benchmark about interpreting explanations\nthat sees LLMs going no further than random chance while being instead fully\nsolved by humans.\n","authors":["Antonio Norelli"],"pdf_url":"https://arxiv.org/pdf/2411.11672v1.pdf","comment":"PhD thesis, 123 pages"},{"id":"http://arxiv.org/abs/2411.11667v1","updated":"2024-11-18T15:45:41Z","published":"2024-11-18T15:45:41Z","title":"Dissecting Misalignment of Multimodal Large Language Models via\n Influence Function","summary":" Multi-modal Large Language models (MLLMs) are always trained on data from\ndiverse and unreliable sources, which may contain misaligned or mislabeled\ntext-image pairs. This frequently causes robustness issues and hallucinations,\nleading to performance degradation. Data valuation is an efficient way to\ndetect and trace these misalignments. Nevertheless, existing methods are\ncomputationally expensive for MLLMs. While computationally efficient, the\nclassical influence functions are inadequate for contrastive learning models\nbecause they were originally designed for pointwise loss. Additionally,\ncontrastive learning involves minimizing the distance between the modalities of\npositive samples and maximizing the distance between the modalities of negative\nsamples. This requires us to evaluate the influence of samples from both\nperspectives. 
To tackle these challenges, we introduce the Extended Influence\nFunction for Contrastive Loss (ECIF), an influence function crafted for\ncontrastive loss. ECIF considers both positive and negative samples and\nprovides a closed-form approximation of contrastive learning models,\neliminating the need for retraining. Building upon ECIF, we develop a series of\nalgorithms for data evaluation in MLLM, misalignment detection, and\nmisprediction trace-back tasks. Experimental results demonstrate our ECIF\nadvances the transparency and interpretability of MLLMs by offering a more\naccurate assessment of data impact and model alignment compared to traditional\nbaseline methods.\n","authors":["Lijie Hu","Chenyang Ren","Huanyi Xie","Khouloud Saadi","Shu Yang","Jingfeng Zhang","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11667v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2407.12804v2","updated":"2024-11-18T15:41:24Z","published":"2024-06-24T16:31:11Z","title":"Modulating Language Model Experiences through Frictions","summary":" Language models are transforming the ways that their users engage with the\nworld. Despite impressive capabilities, over-consumption of language model\noutputs risks propagating unchecked errors in the short-term and damaging human\ncapabilities for critical thinking in the long-term. How can we develop\nscaffolding around language models to curate more appropriate use? We propose\nselective frictions for language model experiences, inspired by behavioral\nscience interventions, to dampen misuse. Frictions involve small modifications\nto a user's experience, e.g., the addition of a button impeding model access\nand reminding a user of their expertise relative to the model. 
Through a user\nstudy with real humans, we observe shifts in user behavior from the imposition\nof a friction over LLMs in the context of a multi-topic question-answering task\nas a representative task that people may use LLMs for, e.g., in education and\ninformation retrieval. We find that frictions modulate over-reliance by driving\ndown users' click rates while minimally affecting accuracy for those topics.\nYet, frictions may have unintended effects. We find marked differences in\nusers' click behaviors even on topics where frictions were not provisioned. Our\ncontributions motivate further study of human-AI behavioral interaction to\ninform more effective and appropriate LLM use.\n","authors":["Katherine M. Collins","Valerie Chen","Ilia Sucholutsky","Hannah Rose Kirk","Malak Sadek","Holli Sargeant","Ameet Talwalkar","Adrian Weller","Umang Bhatt"],"pdf_url":"https://arxiv.org/pdf/2407.12804v2.pdf","comment":"NeurIPS Workshop on Behavioral ML; non-archival"},{"id":"http://arxiv.org/abs/2410.13147v6","updated":"2024-11-18T15:41:01Z","published":"2024-10-17T02:04:57Z","title":"Utilizing Large Language Models in an iterative paradigm with domain\n feedback for molecule optimization","summary":" Molecule optimization is a critical task in drug discovery to optimize\ndesired properties of a given molecule through chemical modification. Despite\nLarge Language Models (LLMs) holding the potential to efficiently simulate this\ntask by using natural language to direct the optimization, straightforwardly\nutilizing them shows limited performance. In this work, we facilitate utilizing\nLLMs in an iterative paradigm by proposing a simple yet highly effective domain\nfeedback provider, namely $\\text{Re}^3$DF. In detail, $\\text{Re}^3$DF harnesses\nan external toolkit, RDKit, to handle the molecule hallucination, if the\nmodified molecule is chemically invalid. 
Otherwise, its desired properties are\ncomputed and compared to the original one, establishing reliable domain\nfeedback with correct direction and distance towards the objective, followed by\na retrieved example, to guide the LLM to refine the modified molecule. We\nconduct experiments across both single- and multi-property objectives with 2\nthresholds, where $\\text{Re}^3$DF shows significant improvements. Particularly,\nfor 20 single-property objectives, $\\text{Re}^3$DF enhances Hit ratio by 16.95%\nand 20.76% under loose (\\texttt{l}) and strict (\\texttt{s}) thresholds,\nrespectively. For 32 multi-property objectives, $\\text{Re}^3$DF enhances Hit\nratio by 6.04% and 5.25%.\n","authors":["Khiem Le","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2410.13147v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04154v7","updated":"2024-11-18T15:31:52Z","published":"2024-02-06T17:09:25Z","title":"Read to Play (R2-Play): Decision Transformer with Multimodal Game\n Instruction","summary":" Developing a generalist agent is a longstanding objective in artificial\nintelligence. Previous efforts utilizing extensive offline datasets from\nvarious tasks demonstrate remarkable performance in multitasking scenarios\nwithin Reinforcement Learning. However, these works encounter challenges in\nextending their capabilities to new tasks. Recent approaches integrate textual\nguidance or visual trajectory into decision networks to provide task-specific\ncontextual cues, representing a promising direction. However, it is observed\nthat relying solely on textual guidance or visual trajectory is insufficient\nfor accurately conveying the contextual information of tasks. This paper\nexplores enhanced forms of task guidance for agents, enabling them to\ncomprehend gameplay instructions, thereby facilitating a \"read-to-play\"\ncapability. 
Drawing inspiration from the success of multimodal instruction\ntuning in visual tasks, we treat the visual-based RL task as a long-horizon\nvision task and construct a set of multimodal game instructions to incorporate\ninstruction tuning into a decision transformer. Experimental results\ndemonstrate that incorporating multimodal game instructions significantly\nenhances the decision transformer's multitasking and generalization\ncapabilities.\n","authors":["Yonggang Jin","Ge Zhang","Hao Zhao","Tianyu Zheng","Jarvi Guo","Liuyu Xiang","Shawn Yue","Stephen W. Huang","Zhaofeng He","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2402.04154v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11064v2","updated":"2024-11-18T15:29:05Z","published":"2024-10-14T20:21:11Z","title":"Parsing altered brain connectivity in neurodevelopmental disorders by\n integrating graph-based normative modeling and deep generative networks","summary":" Divergent brain connectivity is thought to underlie the behavioral and\ncognitive symptoms observed in many neurodevelopmental disorders. Quantifying\ndivergence from neurotypical connectivity patterns offers a promising pathway\nto inform diagnosis and therapeutic interventions. While advanced neuroimaging\ntechniques, such as diffusion MRI (dMRI), have facilitated the mapping of\nbrain's structural connectome, the challenge lies in accurately modeling\ndevelopmental trajectories within these complex networked structures to create\nrobust neurodivergence markers. In this work, we present the Brain\nRepresentation via Individualized Deep Generative Embedding (BRIDGE) framework,\nwhich integrates normative modeling with a bio-inspired deep generative model\nto create a reference trajectory of connectivity transformation as part of\nneurotypical development. This will enable the assessment of neurodivergence by\ncomparing individuals to the established neurotypical trajectory. 
BRIDGE\nprovides a global neurodivergence score based on the difference between\nconnectivity-based brain age and chronological age, along with region-wise\nneurodivergence maps that highlight localized connectivity differences.\nApplication of BRIDGE to a large cohort of children with autism spectrum\ndisorder demonstrates that the global neurodivergence score correlates with\nclinical assessments in autism, and the regional map offers insights into the\nheterogeneity at the individual level in neurodevelopmental disorders.\nTogether, the neurodivergence score and map form powerful tools for quantifying\ndevelopmental divergence in connectivity patterns, advancing the development of\nimaging markers for personalized diagnosis and intervention in various clinical\ncontexts.\n","authors":["Rui Sherry Shen","Yusuf Osmanlıoğlu","Drew Parker","Darien Aunapu","Benjamin E. Yerys","Birkan Tunç","Ragini Verma"],"pdf_url":"https://arxiv.org/pdf/2410.11064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11647v1","updated":"2024-11-18T15:24:11Z","published":"2024-11-18T15:24:11Z","title":"No-regret Exploration in Shuffle Private Reinforcement Learning","summary":" Differential privacy (DP) has recently been introduced into episodic\nreinforcement learning (RL) to formally address user privacy concerns in\npersonalized services. Previous work mainly focuses on two trust models of DP:\nthe central model, where a central agent is responsible for protecting users'\nsensitive data, and the (stronger) local model, where the protection occurs\ndirectly on the user side. However, they either require a trusted central agent\nor incur a significantly higher privacy cost, making it unsuitable for many\nscenarios. This work introduces a trust model stronger than the central model\nbut with a lower privacy cost than the local model, leveraging the emerging\n\\emph{shuffle} model of privacy. 
We present the first generic algorithm for\nepisodic RL under the shuffle model, where a trusted shuffler randomly permutes\na batch of users' data before sending it to the central agent. We then\ninstantiate the algorithm using our proposed shuffle Privatizer, relying on a\nshuffle private binary summation mechanism. Our analysis shows that the\nalgorithm achieves a near-optimal regret bound comparable to that of the\ncentralized model and significantly outperforms the local model in terms of\nprivacy cost.\n","authors":["Shaojie Bai","Mohammad Sadegh Talebi","Chengcheng Zhao","Peng Cheng","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16934v3","updated":"2024-11-18T15:22:32Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. 
Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11641v1","updated":"2024-11-18T15:19:54Z","published":"2024-11-18T15:19:54Z","title":"TSINR: Capturing Temporal Continuity via Implicit Neural Representations\n for Time Series Anomaly Detection","summary":" Time series anomaly detection aims to identify unusual patterns in data or\ndeviations from systems' expected behavior. The reconstruction-based methods\nare the mainstream in this task, which learn point-wise representation via\nunsupervised learning. However, the unlabeled anomaly points in training data\nmay cause these reconstruction-based methods to learn and reconstruct anomalous\ndata, resulting in the challenge of capturing normal patterns. In this paper,\nwe propose a time series anomaly detection method based on implicit neural\nrepresentation (INR) reconstruction, named TSINR, to address this challenge.\nDue to the property of spectral bias, TSINR enables prioritizing low-frequency\nsignals and exhibiting poorer performance on high-frequency abnormal data.\nSpecifically, we adopt INR to parameterize time series data as a continuous\nfunction and employ a transformer-based architecture to predict the INR of\ngiven data. As a result, the proposed TSINR method achieves the advantage of\ncapturing the temporal continuity and thus is more sensitive to discontinuous\nanomaly data. 
In addition, we further design a novel form of INR continuous\nfunction to learn inter- and intra-channel information, and leverage a\npre-trained large language model to amplify the intense fluctuations in\nanomalies. Extensive experiments demonstrate that TSINR achieves superior\noverall performance on both univariate and multivariate time series anomaly\ndetection benchmarks compared to other state-of-the-art reconstruction-based\nmethods. Our codes are available.\n","authors":["Mengxuan Li","Ke Liu","Hongyang Chen","Jiajun Bu","Hongwei Wang","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11641v1.pdf","comment":"Accepted by SIGKDD 2025"},{"id":"http://arxiv.org/abs/2411.11636v1","updated":"2024-11-18T15:14:36Z","published":"2024-11-18T15:14:36Z","title":"SP${ }^3$ : Superpixel-propagated pseudo-label learning for weakly\n semi-supervised medical image segmentation","summary":" Deep learning-based medical image segmentation helps assist diagnosis and\naccelerate the treatment process while the model training usually requires\nlarge-scale dense annotation datasets. Weakly semi-supervised medical image\nsegmentation is an essential application because it only requires a small\namount of scribbles and a large number of unlabeled data to train the model,\nwhich greatly reduces the clinician's effort to fully annotate images. To\nhandle the inadequate supervisory information challenge in weakly\nsemi-supervised segmentation (WSSS), a SuperPixel-Propagated Pseudo-label\n(SP${}^3$) learning method is proposed, using the structural information\ncontained in superpixel for supplemental information. Specifically, the\nannotation of scribbles is propagated to superpixels and thus obtains a dense\nannotation for supervised training. Since the quality of pseudo-labels is\nlimited by the low-quality annotation, the beneficial superpixels selected by\ndynamic thresholding are used to refine pseudo-labels. 
Furthermore, aiming to\nalleviate the negative impact of noise in pseudo-label, superpixel-level\nuncertainty is incorporated to guide the pseudo-label supervision for stable\nlearning. Our method achieves state-of-the-art performance on both tumor and\norgan segmentation datasets under the WSSS setting, using only 3\\% of the\nannotation workload compared to fully supervised methods and attaining\napproximately 80\\% Dice score. Additionally, our method outperforms eight\nweakly and semi-supervised methods under both weakly supervised and\nsemi-supervised settings. Results of extensive experiments validate the\neffectiveness and annotation efficiency of our weakly semi-supervised\nsegmentation, which can assist clinicians in achieving automated segmentation\nfor organs or tumors quickly and ultimately benefit patients.\n","authors":["Shiman Li","Jiayue Zhao","Shaolei Liu","Xiaokun Dai","Chenxi Zhang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2411.11636v1.pdf","comment":"10 pages, 7 figures. Under Review"},{"id":"http://arxiv.org/abs/2411.11635v1","updated":"2024-11-18T15:13:47Z","published":"2024-11-18T15:13:47Z","title":"Chapter 7 Review of Data-Driven Generative AI Models for Knowledge\n Extraction from Scientific Literature in Healthcare","summary":" This review examines the development of abstractive NLP-based text\nsummarization approaches and compares them to existing techniques for\nextractive summarization. A brief history of text summarization from the 1950s\nto the introduction of pre-trained language models such as Bidirectional\nEncoder Representations from Transformer (BERT) and Generative Pre-training\nTransformers (GPT) are presented. In total, 60 studies were identified in\nPubMed and Web of Science, of which 29 were excluded and 24 were read and\nevaluated for eligibility, resulting in the use of seven studies for further\nanalysis. 
This chapter also includes a section with examples including an\nexample of a comparison between GPT-3 and state-of-the-art GPT-4 solutions in\nscientific text summarisation. Natural language processing has not yet reached\nits full potential in the generation of brief textual summaries. As there are\nacknowledged concerns that must be addressed, we can expect gradual\nintroduction of such models in practise.\n","authors":["Leon Kopitar","Primoz Kocbek","Lucija Gosak","Gregor Stiglic"],"pdf_url":"https://arxiv.org/pdf/2411.11635v1.pdf","comment":"16 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.03685v6","updated":"2024-11-18T14:58:00Z","published":"2024-04-01T13:12:27Z","title":"Cooperative Evolutionary Pressure and Diminishing Returns Might Explain\n the Fermi Paradox: On What Super-AIs Are Like","summary":" With an evolutionary approach, the basis of morality can be explained as\nadaptations to problems of cooperation. With 'evolution' taken in a broad\nsense, AIs that satisfy the conditions for evolution to apply will be subject\nto the same cooperative evolutionary pressure as biological entities. Here the\nadaptiveness of increased cooperation as material safety and wealth increase is\ndiscussed -- for humans, for other societies, and for AIs. Diminishing\nbeneficial returns from increased access to material resources also suggests\nthe possibility that, on the whole, there will be no incentive to for instance\ncolonize entire galaxies, thus providing a possible explanation of the Fermi\nparadox, wondering where everybody is. It is further argued that old societies\ncould engender, give way to, super-AIs, since it is likely that super-AIs are\nfeasible, and fitter. 
Closing is an aside on effective ways for morals and\ngoals to affect life and society, emphasizing environments, cultures, and laws,\nand exemplified by how to eat.\n Appended are an algorithm for colonizing for example a galaxy quickly, models\nof the evolution of cooperation and fairness under diminishing returns, and\nsoftware for simulating signaling development. It is also noted that there can\nbe no exponential colonization or reproduction, for mathematical reasons, as\neach entity takes up a certain amount of space. 'Diminishing returns' is\ndefined, as less than roots.\n","authors":["Daniel Vallstrom"],"pdf_url":"https://arxiv.org/pdf/2404.03685v6.pdf","comment":"32 pages, 3 figures. Added definition, clarifications, expansions,\n references"},{"id":"http://arxiv.org/abs/2411.11620v1","updated":"2024-11-18T14:49:12Z","published":"2024-11-18T14:49:12Z","title":"ST-Tree with Interpretability for Multivariate Time Series\n Classification","summary":" Multivariate time series classification is of great importance in practical\napplications and is a challenging task. However, deep neural network models\nsuch as Transformers exhibit high accuracy in multivariate time series\nclassification but lack interpretability and fail to provide insights into the\ndecision-making process. On the other hand, traditional approaches based on\ndecision tree classifiers offer clear decision processes but relatively lower\naccuracy. Swin Transformer (ST) addresses these issues by leveraging\nself-attention mechanisms to capture both fine-grained local patterns and\nglobal patterns. It can also model multi-scale feature representation learning,\nthereby providing a more comprehensive representation of time series features.\nTo tackle the aforementioned challenges, we propose ST-Tree with\ninterpretability for multivariate time series classification. Specifically, the\nST-Tree model combines ST as the backbone network with an additional neural\ntree model. 
This integration allows us to fully leverage the advantages of ST\nin learning time series context while providing interpretable decision\nprocesses through the neural tree. This enables researchers to gain clear\ninsights into the model's decision-making process and extract meaningful\ninterpretations. Through experimental evaluations on 10 UEA datasets, we\ndemonstrate that the ST-Tree model improves accuracy in multivariate time\nseries classification tasks and provides interpretability through visualizing\nthe decision-making process across different datasets.\n","authors":["Mingsen Du","Yanxuan Wei","Yingxia Tang","Xiangwei Zheng","Shoushui Wei","Cun Ji"],"pdf_url":"https://arxiv.org/pdf/2411.11620v1.pdf","comment":"Submitted on May 15, 2024, major revisions on Aug 31, 2024"},{"id":"http://arxiv.org/abs/2407.11211v3","updated":"2024-11-18T14:43:38Z","published":"2024-07-15T19:53:02Z","title":"Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer\n from Text to Image via CLIP Inversion","summary":" We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary\nImage Classifier that uses an autoregressive transformer to generatively output\nclassification labels as language. Leveraging the extensive knowledge of CLIP\nmodels, NOVIC harnesses the embedding space to enable zero-shot transfer from\npure text to images. Traditional CLIP models, despite their ability for open\nvocabulary classification, require an exhaustive prompt of potential class\nlabels, restricting their application to images of known content or context. To\naddress this, we propose an \"object decoder\" model that is trained on a\nlarge-scale 92M-target dataset of templated object noun sets and LLM-generated\ncaptions to always output the object noun in question. 
This effectively inverts\nthe CLIP text encoder and allows textual object labels from essentially the\nentire English language to be generated directly from image-derived embedding\nvectors, without requiring any a priori knowledge of the potential content of\nan image, and without any label biases. The trained decoders are tested on a\nmix of manually and web-curated datasets, as well as standard image\nclassification benchmarks, and achieve fine-grained prompt-free prediction\nscores of up to 87.5%, a strong result considering the model must work for any\nconceivable image and without any contextual clues.\n","authors":["Philipp Allgeuer","Kyra Ahrens","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2407.11211v3.pdf","comment":"Published at WACV 2025"},{"id":"http://arxiv.org/abs/2411.11616v1","updated":"2024-11-18T14:42:15Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. 
The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v1.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2411.08745v2","updated":"2024-11-18T14:41:38Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. 
Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v2.pdf","comment":"12 pages, 10 figures, previous version published under the title \"How\n Do Llamas Process Multilingual Text? A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop at\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2406.07302v2","updated":"2024-11-18T14:40:54Z","published":"2024-06-11T14:30:34Z","title":"BertaQA: How Much Do Language Models Know About Local Culture?","summary":" Large Language Models (LLMs) exhibit extensive knowledge about the world, but\nmost evaluations have been limited to global or anglocentric subjects. This\nraises the question of how well these models perform on topics relevant to\nother cultures, whose presence on the web is not that prominent. To address\nthis gap, we introduce BertaQA, a multiple-choice trivia dataset that is\nparallel in English and Basque. The dataset consists of a local subset with\nquestions pertinent to the Basque culture, and a global subset with questions\nof broader interest. We find that state-of-the-art LLMs struggle with local\ncultural knowledge, even as they excel on global topics. However, we show that\ncontinued pre-training in Basque significantly improves the models' performance\non Basque culture, even when queried in English. To our knowledge, this is the\nfirst solid evidence of knowledge transfer from a low-resource to a\nhigh-resource language. Our analysis sheds light on the complex interplay\nbetween language and knowledge, and reveals that some prior findings do not\nfully hold when reassessed on local topics. 
Our dataset and evaluation code are\navailable under open licenses at https://github.com/juletx/BertaQA.\n","authors":["Julen Etxaniz","Gorka Azkune","Aitor Soroa","Oier Lopez de Lacalle","Mikel Artetxe"],"pdf_url":"https://arxiv.org/pdf/2406.07302v2.pdf","comment":"NEURIPS Datasets & Benchmarks 2024"},{"id":"http://arxiv.org/abs/2403.08425v2","updated":"2024-11-18T14:21:53Z","published":"2024-03-13T11:20:34Z","title":"Specification Overfitting in Artificial Intelligence","summary":" Machine learning (ML) and artificial intelligence (AI) approaches are often\ncriticized for their inherent bias and for their lack of control,\naccountability, and transparency. Consequently, regulatory bodies struggle with\ncontaining this technology's potential negative side effects. High-level\nrequirements such as fairness and robustness need to be formalized into\nconcrete specification metrics, imperfect proxies that capture isolated aspects\nof the underlying requirements. Given possible trade-offs between different\nmetrics and their vulnerability to over-optimization, integrating specification\nmetrics in system development processes is not trivial. This paper defines\nspecification overfitting, a scenario where systems focus excessively on\nspecified metrics to the detriment of high-level requirements and task\nperformance. We present an extensive literature survey to categorize how\nresearchers propose, measure, and optimize specification metrics in several AI\nfields (e.g., natural language processing, computer vision, reinforcement\nlearning). Using a keyword-based search on papers from major AI conferences and\njournals between 2018 and mid-2023, we identify and analyze 74 papers that\npropose or optimize specification metrics. 
We find that although most papers\nimplicitly address specification overfitting (e.g., by reporting more than one\nspecification metric), they rarely discuss which role specification metrics\nshould play in system development or explicitly define the scope and\nassumptions behind metric formulations.\n","authors":["Benjamin Roth","Pedro Henrique Luz de Araujo","Yuxi Xia","Saskia Kaltenbrunner","Christoph Korab"],"pdf_url":"https://arxiv.org/pdf/2403.08425v2.pdf","comment":"41 pages, 2 figures. Accepted at Artificial Intelligence Review"},{"id":"http://arxiv.org/abs/2408.11048v2","updated":"2024-11-18T14:14:22Z","published":"2024-08-20T17:56:52Z","title":"RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual\n Dexterous Robot Hands","summary":" It has been a long-standing research goal to endow robot hands with\nhuman-level dexterity. Bi-manual robot piano playing constitutes a task that\ncombines challenges from dynamic tasks, such as generating fast while precise\nmotions, with slower but contact-rich manipulation problems. Although\nreinforcement learning based approaches have shown promising results in\nsingle-task performance, these methods struggle in a multi-song setting. Our\nwork aims to close this gap and, thereby, enable imitation learning approaches\nfor robot piano playing at scale. To this end, we introduce the Robot Piano 1\nMillion (RP1M) dataset, containing bi-manual robot piano playing motion data of\nmore than one million trajectories. We formulate finger placements as an\noptimal transport problem, thus, enabling automatic annotation of vast amounts\nof unlabeled songs. 
Benchmarking existing imitation learning approaches shows\nthat such approaches reach state-of-the-art robot piano playing performance by\nleveraging RP1M.\n","authors":["Yi Zhao","Le Chen","Jan Schneider","Quankai Gao","Juho Kannala","Bernhard Schölkopf","Joni Pajarinen","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2408.11048v2.pdf","comment":"Accepted by Conference on Robot Learning (CoRL) 2024. Project\n Website: https://rp1m.github.io/"},{"id":"http://arxiv.org/abs/2411.11576v1","updated":"2024-11-18T13:54:44Z","published":"2024-11-18T13:54:44Z","title":"Hybrid Data-Driven SSM for Interpretable and Label-Free mmWave Channel\n Prediction","summary":" Accurate prediction of mmWave time-varying channels is essential for\nmitigating the issue of channel aging in complex scenarios owing to high user\nmobility. Existing channel prediction methods have limitations: classical\nmodel-based methods often struggle to track highly nonlinear channel dynamics\ndue to limited expert knowledge, while emerging data-driven methods typically\nrequire substantial labeled data for effective training and often lack\ninterpretability. To address these issues, this paper proposes a novel hybrid\nmethod that integrates a data-driven neural network into a conventional\nmodel-based workflow based on a state-space model (SSM), implicitly tracking\ncomplex channel dynamics from data without requiring precise expert knowledge.\nAdditionally, a novel unsupervised learning strategy is developed to train the\nembedded neural network solely with unlabeled data. Theoretical analyses and\nablation studies are conducted to interpret the enhanced benefits gained from\nthe hybrid integration. Numerical simulations based on the 3GPP mmWave channel\nmodel corroborate the superior prediction accuracy of the proposed method,\ncompared to state-of-the-art methods that are either purely model-based or\ndata-driven. 
Furthermore, extensive experiments validate its robustness against\nvarious challenging factors, including among others severe channel variations\nand high noise levels.\n","authors":["Yiyong Sun","Jiajun He","Zhidi Lin","Wenqiang Pu","Feng Yin","Hing Cheung So"],"pdf_url":"https://arxiv.org/pdf/2411.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09218v3","updated":"2024-11-18T13:26:41Z","published":"2023-07-16T16:27:58Z","title":"A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual\n Learning","summary":" Forgetting refers to the loss or deterioration of previously acquired\nknowledge. While existing surveys on forgetting have primarily focused on\ncontinual learning, forgetting is a prevalent phenomenon observed in various\nother research domains within deep learning. Forgetting manifests in research\nfields such as generative models due to generator shifts, and federated\nlearning due to heterogeneous data distributions across clients. Addressing\nforgetting encompasses several challenges, including balancing the retention of\nold task knowledge with fast learning of new task, managing task interference\nwith conflicting goals, and preventing privacy leakage, etc. Moreover, most\nexisting surveys on continual learning implicitly assume that forgetting is\nalways harmful. In contrast, our survey argues that forgetting is a\ndouble-edged sword and can be beneficial and desirable in certain cases, such\nas privacy-preserving scenarios. By exploring forgetting in a broader context,\nwe present a more nuanced understanding of this phenomenon and highlight its\npotential advantages. Through this comprehensive survey, we aspire to uncover\npotential solutions by drawing upon ideas and approaches from various fields\nthat have dealt with forgetting. 
By examining forgetting beyond its\nconventional boundaries, we hope to encourage the development of novel\nstrategies for mitigating, harnessing, or even embracing forgetting in real\napplications. A comprehensive list of papers about forgetting in various\nresearch fields is available at\n\\url{https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning}.\n","authors":["Zhenyi Wang","Enneng Yang","Li Shen","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2307.09218v3.pdf","comment":"accepted at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2411.11560v1","updated":"2024-11-18T13:26:09Z","published":"2024-11-18T13:26:09Z","title":"Topology-aware Preemptive Scheduling for Co-located LLM Workloads","summary":" Hosting diverse large language model workloads in a unified resource pool\nthrough co-location is cost-effective. For example, long-running chat services\ngenerally follow diurnal traffic patterns, which inspire co-location of batch\njobs to fulfill resource valleys between successive peaks, and thus to saturate\nresource allocation in cluster-wide scope. These heterogeneous workloads often\nhave different business priorities, and therefore preemption can be leveraged\nfor resource elasticity. However, workloads often have distinct topology\npreferences as well. The resources released by lower-priority instances may\nfail to meet the requirements of high-priority online services which are\nusually latency-sensitive. The root cause behind such mis-match is a lack of\ntopology awareness of resource scheduler, especially during preemption. To\nbridge this gap, we develop a fine-grained topology-aware method for preemptive\nscheduling of hybrid workloads. The method ensures that the resources freed by\npreempted tasks adhere to the topological affinity needs of high-priority\npreemptors in a guaranteed or best-effort manner. 
This dynamic alignment\nsignificantly increases the efficiency of preemption and improves overall\nscheduled performance for LLM workloads by $55\\%$.\n","authors":["Ping Zhang","Lei Su","Jinjie Yang","Xin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11560v1.pdf","comment":"17 Pages, 11 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2411.11548v1","updated":"2024-11-18T13:06:29Z","published":"2024-11-18T13:06:29Z","title":"Real-Time Fitness Exercise Classification and Counting from Video Frames","summary":" This paper introduces a novel method for real-time exercise classification\nusing a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing\nexercise recognition approaches often rely on synthetic datasets, raw\ncoordinate inputs sensitive to user and camera variations, and fail to fully\nexploit the temporal dependencies in exercise movements. These issues limit\ntheir generalizability and robustness in real-world conditions, where lighting,\ncamera angles, and user body types vary.\n To address these challenges, we propose a BiLSTM-based model that leverages\ninvariant features, such as joint angles, alongside raw coordinates. By using\nboth angles and (x, y, z) coordinates, the model adapts to changes in\nperspective, user positioning, and body differences, improving generalization.\nTraining on 30-frame sequences enables the BiLSTM to capture the temporal\ncontext of exercises and recognize patterns evolving over time.\n We compiled a dataset combining synthetic data from the InfiniteRep dataset\nand real-world videos from Kaggle and other sources. This dataset includes four\ncommon exercises: squat, push-up, shoulder press, and bicep curl. The model was\ntrained and validated on these diverse datasets, achieving an accuracy of over\n99% on the test set. To assess generalizability, the model was tested on 2\nseparate test sets representative of typical usage conditions. 
Comparisons with\nthe previous approach from the literature are present in the result section\nshowing that the proposed model is the best-performing one.\n The classifier is integrated into a web application providing real-time\nexercise classification and repetition counting without manual exercise\nselection.\n Demo and datasets are available at the following GitHub Repository:\nhttps://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting.\n","authors":["Riccardo Riccio"],"pdf_url":"https://arxiv.org/pdf/2411.11548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11543v1","updated":"2024-11-18T13:01:57Z","published":"2024-11-18T13:01:57Z","title":"Enhancing Vision-Language Model Safety through Progressive\n Concept-Bottleneck-Driven Alignment","summary":" Benefiting from the powerful capabilities of Large Language Models (LLMs),\npre-trained visual encoder models connected to LLMs form Vision Language Models\n(VLMs). However, recent research shows that the visual modality in VLMs is\nhighly vulnerable, allowing attackers to bypass safety alignment in LLMs\nthrough visually transmitted content, launching harmful attacks. To address\nthis challenge, we propose a progressive concept-based alignment strategy,\nPSA-VLM, which incorporates safety modules as concept bottlenecks to enhance\nvisual modality safety alignment. By aligning model predictions with specific\nsafety concepts, we improve defenses against risky images, enhancing\nexplainability and controllability while minimally impacting general\nperformance. Our method is obtained through two-stage training. The low\ncomputational cost of the first stage brings very effective performance\nimprovement, and the fine-tuning of the language model in the second stage\nfurther improves the safety performance. 
Our method achieves state-of-the-art\nresults on popular VLM safety benchmark.\n","authors":["Zhendong Liu","Yuanbi Nie","Yingshui Tan","Xiangyu Yue","Qiushi Cui","Chongjun Wang","Xiaoyong Zhu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.11543v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.13581"},{"id":"http://arxiv.org/abs/2411.11531v1","updated":"2024-11-18T12:40:51Z","published":"2024-11-18T12:40:51Z","title":"Addressing Hallucinations in Language Models with Knowledge Graph\n Embeddings as an Additional Modality","summary":" In this paper we present an approach to reduce hallucinations in Large\nLanguage Models (LLMs) by incorporating Knowledge Graphs (KGs) as an additional\nmodality. Our method involves transforming input text into a set of KG\nembeddings and using an adapter to integrate these embeddings into the language\nmodel space, without relying on external retrieval processes.\n To facilitate this, we created WikiEntities, a dataset containing over 3\nmillion Wikipedia texts annotated with entities from Wikidata and their\ncorresponding embeddings from PyTorch-BigGraph. This dataset serves as a\nvaluable resource for training Entity Linking models and adapting the described\nmethod to various LLMs using specialized adapters.\n Our method does not require fine-tuning of the language models themselves;\ninstead, we only train the adapter. This ensures that the model's performance\non other tasks is not affected. We trained an adapter for the Mistral 7B, LLaMA\n2-7B (chat), and LLaMA 3-8B (instruct) models using this dataset and\ndemonstrated that our approach improves performance on the HaluEval, True-False\nbenchmarks and FEVER dataset. 
The results indicate that incorporating KGs as a\nnew modality can effectively reduce hallucinations and improve the factual\naccuracy of language models, all without the need for external retrieval.\n","authors":["Viktoriia Chekalina","Anton Razzigaev","Elizaveta Goncharova","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2411.11531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16937v2","updated":"2024-11-18T12:36:13Z","published":"2024-06-17T09:39:34Z","title":"A Complete Survey on LLM-based AI Chatbots","summary":" The past few decades have witnessed an upsurge in data, forming the\nfoundation for data-hungry, learning-based AI technology. Conversational\nagents, often referred to as AI chatbots, rely heavily on such data to train\nlarge language models (LLMs) and generate new content (knowledge) in response\nto user prompts. With the advent of OpenAI's ChatGPT, LLM-based chatbots have\nset new standards in the AI community. This paper presents a complete survey of\nthe evolution and deployment of LLM-based chatbots in various sectors. We first\nsummarize the development of foundational chatbots, followed by the evolution\nof LLMs, and then provide an overview of LLM-based chatbots currently in use\nand those in the development phase. Recognizing AI chatbots as tools for\ngenerating new knowledge, we explore their diverse applications across various\nindustries. We then discuss the open challenges, considering how the data used\nto train the LLMs and the misuse of the generated knowledge can cause several\nissues. Finally, we explore the future outlook to augment their efficiency and\nreliability in numerous applications. 
By addressing key milestones and the\npresent-day context of LLM-based chatbots, our survey invites readers to delve\ndeeper into this realm, reflecting on how their next generation will reshape\nconversational AI.\n","authors":["Sumit Kumar Dam","Choong Seon Hong","Yu Qiao","Chaoning Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16937v2.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.22559v2","updated":"2024-11-18T12:36:04Z","published":"2024-10-29T21:54:18Z","title":"Unpicking Data at the Seams: VAEs, Disentanglement and Independent\n Components","summary":" Disentanglement, or identifying salient statistically independent factors of\nthe data, is of interest in many areas of machine learning and statistics, with\nrelevance to synthetic data generation with controlled properties, robust\nclassification of features, parsimonious encoding, and a greater understanding\nof the generative process underlying the data. Disentanglement arises in\nseveral generative paradigms, including Variational Autoencoders (VAEs),\nGenerative Adversarial Networks and diffusion models. Particular progress has\nrecently been made in understanding disentanglement in VAEs, where the choice\nof diagonal posterior covariance matrices is suggested to promote mutual\northogonality between columns of the decoder's Jacobian. 
We continue this\nthread to show how this linear independence translates to statistical\nindependence, completing the chain in understanding how the VAE's objective\nidentifies independent components of, or disentangles, the data.\n","authors":["Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2410.22559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11520v1","updated":"2024-11-18T12:29:06Z","published":"2024-11-18T12:29:06Z","title":"A Pre-Trained Graph-Based Model for Adaptive Sequencing of Educational\n Documents","summary":" Massive Open Online Courses (MOOCs) have greatly contributed to making\neducation more accessible.However, many MOOCs maintain a rigid,\none-size-fits-all structure that fails to address the diverse needs and\nbackgrounds of individual learners.Learning path personalization aims to\naddress this limitation, by tailoring sequences of educational content to\noptimize individual student learning outcomes.Existing approaches, however,\noften require either massive student interaction data or extensive expert\nannotation, limiting their broad application.In this study, we introduce a\nnovel data-efficient framework for learning path personalization that operates\nwithout expert annotation.Our method employs a flexible recommender system\npre-trained with reinforcement learning on a dataset of raw course\nmaterials.Through experiments on semi-synthetic data, we show that this\npre-training stage substantially improves data-efficiency in a range of\nadaptive learning scenarios featuring new educational materials.This opens up\nnew perspectives for the design of foundation models for adaptive learning.\n","authors":["Jean Vassoyan","Anan Schütt","Jill-Jênn Vie","Arun-Balajiee Lekshmi-Narayanan","Elisabeth André","Nicolas Vayatis"],"pdf_url":"https://arxiv.org/pdf/2411.11520v1.pdf","comment":"NeurIPS 2024 Workshop on Large Foundation Models for Educational\n Assessment (FM-Assess), Dec 2024, Vancouver, 
Canada"},{"id":"http://arxiv.org/abs/2411.11511v1","updated":"2024-11-18T12:16:03Z","published":"2024-11-18T12:16:03Z","title":"Structure learning with Temporal Gaussian Mixture for model-based\n Reinforcement Learning","summary":" Model-based reinforcement learning refers to a set of approaches capable of\nsample-efficient decision making, which create an explicit model of the\nenvironment. This model can subsequently be used for learning optimal policies.\nIn this paper, we propose a temporal Gaussian Mixture Model composed of a\nperception model and a transition model. The perception model extracts discrete\n(latent) states from continuous observations using a variational Gaussian\nmixture likelihood. Importantly, our model constantly monitors the collected\ndata searching for new Gaussian components, i.e., the perception model performs\na form of structure learning (Smith et al., 2020; Friston et al., 2018; Neacsu\net al., 2022) as it learns the number of Gaussian components in the mixture.\nAdditionally, the transition model learns the temporal transition between\nconsecutive time steps by taking advantage of the Dirichlet-categorical\nconjugacy. Both the perception and transition models are able to forget part of\nthe data points, while integrating the information they provide within the\nprior, which ensure fast variational inference. Finally, decision making is\nperformed with a variant of Q-learning which is able to learn Q-values from\nbeliefs over states. Empirically, we have demonstrated the model's ability to\nlearn the structure of several mazes: the model discovered the number of states\nand the transition probabilities between these states. 
Moreover, using its\nlearned Q-values, the agent was able to successfully navigate from the starting\nposition to the maze's exit.\n","authors":["Théophile Champion","Marek Grześ","Howard Bowman"],"pdf_url":"https://arxiv.org/pdf/2411.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11510v1","updated":"2024-11-18T12:15:16Z","published":"2024-11-18T12:15:16Z","title":"Closed-loop multi-step planning with innate physics knowledge","summary":" We present a hierarchical framework to solve robot planning as an input\ncontrol problem. At the lowest level are temporary closed control loops,\n(\"tasks\"), each representing a behaviour, contingent on a specific sensory\ninput and therefore temporary. At the highest level, a supervising\n\"Configurator\" directs task creation and termination. Here resides \"core\"\nknowledge as a physics engine, where sequences of tasks can be simulated. The\nConfigurator encodes and interprets simulation results,based on which it can\nchoose a sequence of tasks as a plan. We implement this framework on a real\nrobot and test it in an overtaking scenario as proof-of-concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2411.11510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11504v1","updated":"2024-11-18T12:04:52Z","published":"2024-11-18T12:04:52Z","title":"Search, Verify and Feedback: Towards Next Generation Post-training\n Paradigm of Foundation Models via Verifier Engineering","summary":" The evolution of machine learning has increasingly prioritized the\ndevelopment of powerful models and more scalable supervision signals. However,\nthe emergence of foundation models presents significant challenges in providing\neffective supervision signals necessary for further enhancing their\ncapabilities. Consequently, there is an urgent need to explore novel\nsupervision signals and technical approaches. 
In this paper, we propose\nverifier engineering, a novel post-training paradigm specifically designed for\nthe era of foundation models. The core of verifier engineering involves\nleveraging a suite of automated verifiers to perform verification tasks and\ndeliver meaningful feedback to foundation models. We systematically categorize\nthe verifier engineering process into three essential stages: search, verify,\nand feedback, and provide a comprehensive review of state-of-the-art research\ndevelopments within each stage. We believe that verifier engineering\nconstitutes a fundamental pathway toward achieving Artificial General\nIntelligence.\n","authors":["Xinyan Guan","Yanjiang Liu","Xinyu Lu","Boxi Cao","Ben He","Xianpei Han","Le Sun","Jie Lou","Bowen Yu","Yaojie Lu","Hongyu Lin"],"pdf_url":"https://arxiv.org/pdf/2411.11504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11494v1","updated":"2024-11-18T11:55:38Z","published":"2024-11-18T11:55:38Z","title":"Alien Recombination: Exploring Concept Blends Beyond Human Cognitive\n Availability in Visual Art","summary":" While AI models have demonstrated remarkable capabilities in constrained\ndomains like game strategy, their potential for genuine creativity in\nopen-ended domains like art remains debated. We explore this question by\nexamining how AI can transcend human cognitive limitations in visual art\ncreation. Our research hypothesizes that visual art contains a vast unexplored\nspace of conceptual combinations, constrained not by inherent incompatibility,\nbut by cognitive limitations imposed by artists' cultural, temporal,\ngeographical and social contexts.\n To test this hypothesis, we present the Alien Recombination method, a novel\napproach utilizing fine-tuned large language models to identify and generate\nconcept combinations that lie beyond human cognitive availability. 
The system\nmodels and deliberately counteracts human availability bias, the tendency to\nrely on immediately accessible examples, to discover novel artistic\ncombinations.\n This system not only produces combinations that have never been attempted\nbefore within our dataset but also identifies and generates combinations that\nare cognitively unavailable to all artists in the domain. Furthermore, we\ntranslate these combinations into visual representations, enabling the\nexploration of subjective perceptions of novelty. Our findings suggest that\ncognitive unavailability is a promising metric for optimizing artistic novelty,\noutperforming merely temperature scaling without additional evaluation\ncriteria. This approach uses generative models to connect previously\nunconnected ideas, providing new insight into the potential of framing\nAI-driven creativity as a combinatorial problem.\n","authors":["Alejandro Hernandez","Levin Brinkmann","Ignacio Serna","Nasim Rahaman","Hassan Abu Alhaija","Hiromu Yakura","Mar Canet Sola","Bernhard Schölkopf","Iyad Rahwan"],"pdf_url":"https://arxiv.org/pdf/2411.11494v1.pdf","comment":"NeurIPS 2024 Workshop on Creativity & Generative AI, 13 pages, 11\n figures"},{"id":"http://arxiv.org/abs/2404.12138v2","updated":"2024-11-18T11:29:47Z","published":"2024-04-18T12:40:59Z","title":"Character is Destiny: Can Role-Playing Language Agents Make\n Persona-Driven Decisions?","summary":" Can Large Language Models (LLMs) simulate humans in making important\ndecisions? Recent research has unveiled the potential of using LLMs to develop\nrole-playing language agents (RPLAs), mimicking mainly the knowledge and tones\nof various characters. However, imitative decision-making necessitates a more\nnuanced understanding of personas. In this paper, we benchmark the ability of\nLLMs in persona-driven decision-making. Specifically, we investigate whether\nLLMs can predict characters' decisions provided by the preceding stories in\nhigh-quality novels. 
Leveraging character analyses written by literary experts,\nwe construct a dataset LIFECHOICE comprising 1,462 characters' decision points\nfrom 388 books. Then, we conduct comprehensive experiments on LIFECHOICE, with\nvarious LLMs and RPLA methodologies. The results demonstrate that\nstate-of-the-art LLMs exhibit promising capabilities in this task, yet\nsubstantial room for improvement remains. Hence, we further propose the CHARMAP\nmethod, which adopts persona-based memory retrieval and significantly advances\nRPLAs on this task, achieving 5.03% increase in accuracy.\n","authors":["Rui Xu","Xintao Wang","Jiangjie Chen","Siyu Yuan","Xinfeng Yuan","Jiaqing Liang","Zulong Chen","Xiaoqing Dong","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.12138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11458v1","updated":"2024-11-18T10:46:05Z","published":"2024-11-18T10:46:05Z","title":"HistoEncoder: a digital pathology foundation model for prostate cancer","summary":" Foundation models are trained on massive amounts of data to distinguish\ncomplex patterns and can be adapted to a wide range of downstream tasks with\nminimal computational resources. Here, we develop a foundation model for\nprostate cancer digital pathology called HistoEncoder by pre-training on 48\nmillion prostate tissue tile images. We demonstrate that HistoEncoder features\nextracted from tile images with similar histological patterns map closely\ntogether in the feature space. HistoEncoder outperforms models pre-trained with\nnatural images, even without fine-tuning or with 1000 times less training data.\nWe describe two use cases that leverage the capabilities of HistoEncoder by\nfine-tuning the model with a limited amount of data and computational\nresources. First, we show how HistoEncoder can be used to automatically\nannotate large-scale datasets with high accuracy. 
Second, we combine histomics\nwith commonly used clinical nomograms, significantly improving prostate\ncancer-specific death survival models. Foundation models such as HistoEncoder\ncan allow organizations with limited resources to build effective clinical\nsoftware tools without needing extensive datasets or significant amounts of\ncomputing.\n","authors":["Joona Pohjonen","Abderrahim-Oussama Batouche","Antti Rannikko","Kevin Sandeman","Andrew Erickson","Esa Pitkanen","Tuomas Mirtti"],"pdf_url":"https://arxiv.org/pdf/2411.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03276v2","updated":"2024-11-18T10:46:04Z","published":"2024-03-05T19:15:17Z","title":"ARNN: Attentive Recurrent Neural Network for Multi-channel EEG Signals\n to Identify Epileptic Seizures","summary":" Electroencephalography (EEG) is a widely used tool for diagnosing brain\ndisorders due to its high temporal resolution, non-invasive nature, and\naffordability. Manual analysis of EEG is labor-intensive and requires\nexpertise, making automatic EEG interpretation crucial for reducing workload\nand accurately assessing seizures. In epilepsy diagnosis, prolonged EEG\nmonitoring generates extensive data, often spanning hours, days, or even weeks.\nWhile machine learning techniques for automatic EEG interpretation have\nadvanced significantly in recent decades, there remains a gap in its ability to\nefficiently analyze large datasets with a balance of accuracy and computational\nefficiency. To address the challenges mentioned above, an Attention Recurrent\nNeural Network (ARNN) is proposed that can process a large amount of data\nefficiently and accurately. This ARNN cell recurrently applies attention layers\nalong a sequence and has linear complexity with the sequence length and\nleverages parallel computation by processing multi-channel EEG signals rather\nthan single-channel signals. 
In this architecture, the attention layer is a\ncomputational unit that efficiently applies self-attention and cross-attention\nmechanisms to compute a recurrent function over a wide number of state vectors\nand input signals. This framework is inspired in part by the attention layer\nand long short-term memory (LSTM) cells, but it scales this typical cell up by\nseveral orders to parallelize for multi-channel EEG signals. It inherits the\nadvantages of attention layers and LSTM gate while avoiding their respective\ndrawbacks. The model's effectiveness is evaluated through extensive experiments\nwith heterogeneous datasets, including the CHB-MIT and UPenn and Mayo's Clinic\ndatasets.\n","authors":["Salim Rukhsar","Anil Kumar Tiwari"],"pdf_url":"https://arxiv.org/pdf/2403.03276v2.pdf","comment":"11 pages, 7 figures, Journal Paper"},{"id":"http://arxiv.org/abs/2411.11451v1","updated":"2024-11-18T10:34:14Z","published":"2024-11-18T10:34:14Z","title":"Robust Markov Decision Processes: A Place Where AI and Formal Methods\n Meet","summary":" Markov decision processes (MDPs) are a standard model for sequential\ndecision-making problems and are widely used across many scientific areas,\nincluding formal methods and artificial intelligence (AI). MDPs do, however,\ncome with the restrictive assumption that the transition probabilities need to\nbe precisely known. Robust MDPs (RMDPs) overcome this assumption by instead\ndefining the transition probabilities to belong to some uncertainty set. We\npresent a gentle survey on RMDPs, providing a tutorial covering their\nfundamentals. In particular, we discuss RMDP semantics and how to solve them by\nextending standard MDP methods such as value iteration and policy iteration. We\nalso discuss how RMDPs relate to other models and how they are used in several\ncontexts, including reinforcement learning and abstraction techniques. 
We\nconclude with some challenges for future work on RMDPs.\n","authors":["Marnix Suilen","Thom Badings","Eline M. Bovy","David Parker","Nils Jansen"],"pdf_url":"https://arxiv.org/pdf/2411.11451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11448v1","updated":"2024-11-18T10:30:34Z","published":"2024-11-18T10:30:34Z","title":"Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting","summary":" Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown\nsignificant promise in traffic forecasting by effectively modeling temporal and\nspatial correlations. However, rapid urbanization in recent years has led to\ndynamic shifts in traffic patterns and travel demand, posing major challenges\nfor accurate long-term traffic prediction. The generalization capability of\nST-GNNs in extended temporal scenarios and cross-city applications remains\nlargely unexplored. In this study, we evaluate state-of-the-art models on an\nextended traffic benchmark and observe substantial performance degradation in\nexisting ST-GNNs over time, which we attribute to their limited inductive\ncapabilities. Our analysis reveals that this degradation stems from an\ninability to adapt to evolving spatial relationships within urban environments.\nTo address this limitation, we reconsider the design of adaptive embeddings and\npropose a Principal Component Analysis (PCA) embedding approach that enables\nmodels to adapt to new scenarios without retraining. We incorporate PCA\nembeddings into existing ST-GNN and Transformer architectures, achieving marked\nimprovements in performance. Notably, PCA embeddings allow for flexibility in\ngraph structures between training and testing, enabling models trained on one\ncity to perform zero-shot predictions on other cities. 
This adaptability\ndemonstrates the potential of PCA embeddings in enhancing the robustness and\ngeneralization of spatiotemporal models.\n","authors":["Hongjun Wang","Jiyuan Chen","Lingyu Zhang","Renhe Jiang","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2411.11448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11436v1","updated":"2024-11-18T10:08:05Z","published":"2024-11-18T10:08:05Z","title":"Implicit Regularization for Multi-label Feature Selection","summary":" In this paper, we address the problem of feature selection in the context of\nmulti-label learning, by using a new estimator based on implicit regularization\nand label embedding. Unlike the sparse feature selection methods that use a\npenalized estimator with explicit regularization terms such as $l_{2,1}$-norm,\nMCP or SCAD, we propose a simple alternative method via Hadamard product\nparameterization. In order to guide the feature selection process, a latent\nsemantic of multi-label information method is adopted, as a label embedding.\nExperimental results on some known benchmark datasets suggest that the proposed\nestimator suffers much less from extra bias, and may lead to benign\noverfitting.\n","authors":["Dou El Kefel Mansouri","Khalid Benabdeslem","Seif-Eddine Benkabou"],"pdf_url":"https://arxiv.org/pdf/2411.11436v1.pdf","comment":"11 pages, 7 figures, My paper is currently under review at TPAMI\n journal"},{"id":"http://arxiv.org/abs/2405.18492v3","updated":"2024-11-18T09:44:26Z","published":"2024-05-28T18:01:52Z","title":"LLMs and Memorization: On Quality and Specificity of Copyright\n Compliance","summary":" Memorization in large language models (LLMs) is a growing concern. LLMs have\nbeen shown to easily reproduce parts of their training data, including\ncopyrighted work. This is an important problem to solve, as it may violate\nexisting copyright laws as well as the European AI Act. 
In this work, we\npropose a systematic analysis to quantify the extent of potential copyright\ninfringements in LLMs using European law as an example. Unlike previous work,\nwe evaluate instruction-finetuned models in a realistic end-user scenario. Our\nanalysis builds on a proposed threshold of 160 characters, which we borrow from\nthe German Copyright Service Provider Act and a fuzzy text matching algorithm\nto identify potentially copyright-infringing textual reproductions. The\nspecificity of countermeasures against copyright infringement is analyzed by\ncomparing model behavior on copyrighted and public domain data. We investigate\nwhat behaviors models show instead of producing protected text (such as refusal\nor hallucination) and provide a first legal assessment of these behaviors. We\nfind that there are huge differences in copyright compliance, specificity, and\nappropriate refusal among popular LLMs. Alpaca, GPT 4, GPT 3.5, and Luminous\nperform best in our comparison, with OpenGPT-X, Alpaca, and Luminous producing\na particularly low absolute number of potential copyright violations. Code can\nbe found at https://github.com/felixbmuller/llms-memorization-copyright.\n","authors":["Felix B Mueller","Rebekka Görge","Anna K Bernzen","Janna C Pirk","Maximilian Poretschkin"],"pdf_url":"https://arxiv.org/pdf/2405.18492v3.pdf","comment":"10 pages, 3 figures, AIES 2024 conference"},{"id":"http://arxiv.org/abs/2411.11409v1","updated":"2024-11-18T09:30:05Z","published":"2024-11-18T09:30:05Z","title":"IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet\n Videos","summary":" Shape assembly is a ubiquitous task in daily life, integral for constructing\ncomplex 3D structures like IKEA furniture. 
While significant progress has been\nmade in developing autonomous agents for shape assembly, existing datasets have\nnot yet tackled the 4D grounding of assembly instructions in videos, essential\nfor a holistic understanding of assembly in 3D space over time. We introduce\nIKEA Video Manuals, a dataset that features 3D models of furniture parts,\ninstructional manuals, assembly videos from the Internet, and most importantly,\nannotations of dense spatio-temporal alignments between these data modalities.\nTo demonstrate the utility of IKEA Video Manuals, we present five applications\nessential for shape assembly: assembly plan generation, part-conditioned\nsegmentation, part-conditioned pose estimation, video object segmentation, and\nfurniture assembly based on instructional video manuals. For each application,\nwe provide evaluation metrics and baseline methods. Through experiments on our\nannotated data, we highlight many challenges in grounding assembly instructions\nin videos to improve shape assembly, including handling occlusions, varying\nviewpoints, and extended assembly sequences.\n","authors":["Yunong Liu","Cristobal Eyzaguirre","Manling Li","Shubh Khanna","Juan Carlos Niebles","Vineeth Ravi","Saumitra Mishra","Weiyu Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11409v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2402.14259v2","updated":"2024-11-18T09:19:25Z","published":"2024-02-22T03:46:08Z","title":"Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form\n Medical Question Answering Applications and Beyond","summary":" Uncertainty estimation is crucial for the reliability of safety-critical\nhuman and artificial intelligence (AI) interaction systems, particularly in the\ndomain of healthcare engineering. 
However, a robust and general uncertainty\nmeasure for free-form answers has not been well-established in open-ended\nmedical question-answering (QA) tasks, where generative inequality introduces a\nlarge number of irrelevant words and sequences within the generated set for\nuncertainty quantification (UQ), which can lead to biases. This paper\nintroduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at\nboth the word and sequence levels, considering semantic relevance. WSE\nquantifies uncertainty in a way that is more closely aligned with the\nreliability of LLMs during uncertainty quantification (UQ). We compare WSE with\nsix baseline methods on five free-form medical QA datasets, utilizing seven\npopular large language models (LLMs). Experimental results demonstrate that WSE\nexhibits superior performance in UQ under two standard criteria for correctness\nevaluation. Additionally, in terms of real-world medical QA applications, the\nperformance of LLMs is significantly enhanced (e.g., a 6.36% improvement in\nmodel accuracy on the COVID-QA dataset) by employing responses with lower\nuncertainty that are identified by WSE as final answers, without any additional\ntask-specific fine-tuning or architectural modifications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Chenxi Yuan","Qingyu Chen","Tianlong Chen","Yue Zhang","Ren Wang","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2402.14259v2.pdf","comment":"Accepted by Engineering Applications of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2410.20911v2","updated":"2024-11-18T09:15:46Z","published":"2024-10-28T10:43:34Z","title":"Hacking Back the AI-Hacker: Prompt Injection as a Defense Against\n LLM-driven Cyberattacks","summary":" Large language models (LLMs) are increasingly being harnessed to automate\ncyberattacks, making sophisticated exploits more accessible and scalable. In\nresponse, we propose a new defense strategy tailored to counter LLM-driven\ncyberattacks. 
We introduce Mantis, a defensive framework that exploits LLMs'\nsusceptibility to adversarial inputs to undermine malicious operations. Upon\ndetecting an automated cyberattack, Mantis plants carefully crafted inputs into\nsystem responses, leading the attacker's LLM to disrupt their own operations\n(passive defense) or even compromise the attacker's machine (active defense).\nBy deploying purposefully vulnerable decoy services to attract the attacker and\nusing dynamic prompt injections for the attacker's LLM, Mantis can autonomously\nhack back the attacker. In our experiments, Mantis consistently achieved over\n95% effectiveness against automated LLM-driven attacks. To foster further\nresearch and collaboration, Mantis is available as an open-source tool:\nhttps://github.com/pasquini-dario/project_mantis\n","authors":["Dario Pasquini","Evgenios M. Kornaropoulos","Giuseppe Ateniese"],"pdf_url":"https://arxiv.org/pdf/2410.20911v2.pdf","comment":"v0.2 (evaluated on more agents)"},{"id":"http://arxiv.org/abs/2411.11391v1","updated":"2024-11-18T09:08:30Z","published":"2024-11-18T09:08:30Z","title":"The GECo algorithm for Graph Neural Networks Explanation","summary":" Graph Neural Networks (GNNs) are powerful models that can manage complex data\nsources and their interconnection links. One of GNNs' main drawbacks is their\nlack of interpretability, which limits their application in sensitive fields.\nIn this paper, we introduce a new methodology involving graph communities to\naddress the interpretability of graph classification problems. The proposed\nmethod, called GECo, exploits the idea that if a community is a subset of graph\nnodes densely connected, this property should play a role in graph\nclassification. This is reasonable, especially if we consider the\nmessage-passing mechanism, which is the basic mechanism of GNNs. 
GECo analyzes\nthe contribution to the classification result of the communities in the graph,\nbuilding a mask that highlights graph-relevant structures. GECo is tested for\nGraph Convolutional Networks on six artificial and four real-world graph\ndatasets and is compared to the main explainability methods such as\nPGMExplainer, PGExplainer, GNNExplainer, and SubgraphX using four different\nmetrics. The obtained results outperform the other methods for artificial graph\ndatasets and most real-world datasets.\n","authors":["Salvatore Calderaro","Domenico Amato","Giosuè Lo Bosco","Riccardo Rizzo","Filippo Vella"],"pdf_url":"https://arxiv.org/pdf/2411.11391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00499v3","updated":"2024-11-18T08:33:35Z","published":"2024-06-29T17:33:07Z","title":"ConU: Conformal Uncertainty in Large Language Models with Correctness\n Coverage Guarantees","summary":" Uncertainty quantification (UQ) in natural language generation (NLG) tasks\nremains an open challenge, exacerbated by the closed-source nature of the\nlatest large language models (LLMs). This study investigates applying conformal\nprediction (CP), which can transform any heuristic uncertainty notion into\nrigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We\nintroduce a novel uncertainty measure based on self-consistency theory, and\nthen develop a conformal uncertainty criterion by integrating the uncertainty\ncondition aligned with correctness into the CP algorithm. Empirical evaluations\nindicate that our uncertainty measure outperforms prior state-of-the-art\nmethods. Furthermore, we achieve strict control over the correctness coverage\nrate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning\ngeneral-purpose and medical scenarios. 
Additionally, the calibrated prediction\nsets with small size further highlights the efficiency of our method in\nproviding trustworthy guarantees for practical open-ended NLG applications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Lu Cheng","Yue Zhang","Qingni Wang","Xiaoshuang Shi","Kaidi Xu","Hengtao Shen","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.00499v3.pdf","comment":"Accepted by EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2411.11364v1","updated":"2024-11-18T08:20:21Z","published":"2024-11-18T08:20:21Z","title":"Continual Task Learning through Adaptive Policy Self-Composition","summary":" Training a generalizable agent to continually learn a sequence of tasks from\noffline trajectories is a natural requirement for long-lived agents, yet\nremains a significant challenge for current offline reinforcement learning (RL)\nalgorithms. Specifically, an agent must be able to rapidly adapt to new tasks\nusing newly collected trajectories (plasticity), while retaining knowledge from\npreviously learned tasks (stability). However, systematic analyses of this\nsetting are scarce, and it remains unclear whether conventional continual\nlearning (CL) methods are effective in continual offline RL (CORL) scenarios.\nIn this study, we develop the Offline Continual World benchmark and demonstrate\nthat traditional CL methods struggle with catastrophic forgetting, primarily\ndue to the unique distribution shifts inherent to CORL scenarios. To address\nthis challenge, we introduce CompoFormer, a structure-based continual\ntransformer model that adaptively composes previous policies via a meta-policy\nnetwork. Upon encountering a new task, CompoFormer leverages semantic\ncorrelations to selectively integrate relevant prior policies alongside newly\ntrained parameters, thereby enhancing knowledge sharing and accelerating the\nlearning process. 
Our experiments reveal that CompoFormer outperforms\nconventional CL methods, particularly in longer task sequences, showcasing a\npromising balance between plasticity and stability.\n","authors":["Shengchao Hu","Yuhang Zhou","Ziqing Fan","Jifeng Hu","Li Shen","Ya Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2411.11364v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.06269v2","updated":"2024-11-18T08:10:38Z","published":"2024-11-09T19:53:15Z","title":"AI's Spatial Intelligence: Evaluating AI's Understanding of Spatial\n Transformations in PSVT:R and Augmented Reality","summary":" Spatial intelligence is important in Architecture, Construction, Science,\nTechnology, Engineering, and Mathematics (STEM), and Medicine. Understanding\nthree-dimensional (3D) spatial rotations can involve verbal descriptions and\nvisual or interactive examples, illustrating how objects change orientation in\n3D space. Recent studies show Artificial Intelligence (AI) with language and\nvision capabilities still face limitations in spatial reasoning. In this paper,\nwe have studied generative AI's spatial capabilities of understanding rotations\nof objects utilizing its image and language processing features. We examined\nthe spatial intelligence of the GPT-4 model with vision in understanding\nspatial rotation process with diagrams based on the Revised Purdue Spatial\nVisualization Test: Visualization of Rotations (Revised PSVT:R). Next, we\nincorporated a layer of coordinate system axes on Revised PSVT:R to study the\nvariations in GPT-4's performance. We also examined GPT-4's understanding of 3D\nrotations in Augmented Reality (AR) scenes that visualize spatial rotations of\nan object in 3D space and observed increased accuracy of GPT-4's understanding\nof the rotations by adding supplementary textual information depicting the\nrotation process or mathematical representations of the rotation (e.g.,\nmatrices). 
The results indicate that while GPT-4 as a major current Generative\nAI model lacks the understanding of a spatial rotation process, it has the\npotential to understand the rotation process with additional information that\ncan be provided by methods such as AR. By combining the potentials in spatial\nintelligence of AI with AR's interactive visualization abilities, we expect to\noffer enhanced guidance for students' spatial learning activities. Such spatial\nguidance can benefit understanding spatial transformations and additionally\nsupport processes like assembly, fabrication, and manufacturing.\n","authors":["Uttamasha Monjoree","Wei Yan"],"pdf_url":"https://arxiv.org/pdf/2411.06269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11418v2","updated":"2024-11-18T08:01:24Z","published":"2024-07-16T06:19:14Z","title":"Semantic Operators: A Declarative Model for Rich, AI-based Analytics\n Over Text Data","summary":" The semantic capabilities of language models (LMs) have the potential to\nenable rich analytics and reasoning over vast knowledge corpora. Unfortunately,\nexisting systems lack high-level abstractions to perform bulk semantic queries\nacross large corpora. We introduce semantic operators, a declarative\nprogramming interface that extends the relational model with composable\nAI-based operations for bulk semantic queries (e.g., filtering, sorting,\njoining or aggregating records using natural language criteria). Each operator\ncan be implemented and optimized in multiple ways, opening a rich space for\nexecution plans similar to relational operators. We implement our operators in\nLOTUS, an open source query engine with a DataFrame API. Furthermore, we\ndevelop several novel optimizations that take advantage of the declarative\nnature of semantic operators to accelerate semantic filtering, clustering and\njoin operators by up to $400\\times$ while offering statistical accuracy\nguarantees. 
We demonstrate LOTUS' effectiveness on real AI applications\nincluding fact-checking, extreme multi-label classification, and search. We\nshow that the semantic operator model is expressive, capturing state-of-the-art\nAI pipelines in a few operator calls, and making it easy to express new\npipelines that achieve up to $180\\%$ higher quality. Overall, LOTUS queries\nmatch or exceed the accuracy of state-of-the-art AI pipelines for each task\nwhile running up to 28$\\times$ faster. LOTUS is publicly available at\nhttps://github.com/stanford-futuredata/lotus.\n","authors":["Liana Patel","Siddharth Jha","Parth Asawa","Melissa Pan","Carlos Guestrin","Matei Zaharia"],"pdf_url":"https://arxiv.org/pdf/2407.11418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11354v1","updated":"2024-11-18T07:50:22Z","published":"2024-11-18T07:50:22Z","title":"A comprehensive survey of oracle character recognition: challenges,\n benchmarks, and beyond","summary":" Oracle character recognition-an analysis of ancient Chinese inscriptions\nfound on oracle bones-has become a pivotal field intersecting archaeology,\npaleography, and historical cultural studies. Traditional methods of oracle\ncharacter recognition have relied heavily on manual interpretation by experts,\nwhich is not only labor-intensive but also limits broader accessibility to the\ngeneral public. With recent breakthroughs in pattern recognition and deep\nlearning, there is a growing movement towards the automation of oracle\ncharacter recognition (OrCR), showing considerable promise in tackling the\nchallenges inherent to these ancient scripts. However, a comprehensive\nunderstanding of OrCR still remains elusive. Therefore, this paper presents a\nsystematic and structured survey of the current landscape of OrCR research. We\ncommence by identifying and analyzing the key challenges of OrCR. Then, we\nprovide an overview of the primary benchmark datasets and digital resources\navailable for OrCR. 
A review of contemporary research methodologies follows, in\nwhich their respective efficacies, limitations, and applicability to the\ncomplex nature of oracle characters are critically highlighted and examined.\nAdditionally, our review extends to ancillary tasks associated with OrCR across\ndiverse disciplines, providing a broad-spectrum analysis of its applications.\nWe conclude with a forward-looking perspective, proposing potential avenues for\nfuture investigations that could yield significant advancements in the field.\n","authors":["Jing Li","Xueke Chi","Qiufeng Wang","Dahan Wang","Kaizhu Huang","Yongge Liu","Cheng-lin Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02156v2","updated":"2024-11-18T07:36:36Z","published":"2024-10-03T02:36:30Z","title":"The why, what, and how of AI-based coding in scientific research","summary":" Computer programming (coding) is indispensable for researchers across\ndisciplines, yet it remains challenging to learn and time-consuming to carry\nout. Generative AI, particularly large language models (LLMs), has the\npotential to transform coding into intuitive conversations, but best practices\nand effective workflows are only emerging. We dissect AI-based coding through\nthree key lenses: the nature and role of LLMs in coding (why), six types of\ncoding assistance they provide (what), and a five-step workflow in action with\npractical implementation strategies (how). Additionally, we address the\nlimitations and future outlook of AI in coding. 
By offering actionable\ninsights, this framework helps to guide researchers in effectively leveraging\nAI to enhance coding practices and education, accelerating scientific progress.\n","authors":["Tonghe Zhuang","Zhicheng Lin"],"pdf_url":"https://arxiv.org/pdf/2410.02156v2.pdf","comment":"23 pages, 7 figure, 3 boxes"},{"id":"http://arxiv.org/abs/2411.11344v1","updated":"2024-11-18T07:33:10Z","published":"2024-11-18T07:33:10Z","title":"Mitigating Knowledge Conflicts in Language Model-Driven Question\n Answering","summary":" Knowledge-aware sequence to sequence generation tasks such as document\nquestion answering and abstract summarization typically requires two types of\nknowledge: encoded parametric knowledge and retrieved contextual information.\nPrevious work show improper correlation between parametric knowledge and\nanswers in the training set could cause the model ignore input information at\ntest time, resulting in un-desirable model behaviour such as over-stability and\nhallucination. In this work, we argue that hallucination could be mitigated via\nexplicit correlation between input source and generated content. We focus on a\ntypical example of hallucination, entity-based knowledge conflicts in question\nanswering, where correlation of entities and their description at training time\nhinders model behaviour during inference.\n","authors":["Han Cao","Zhaoyang Zhang","Xiangtian Li","Chufan Wu","Hansong Zhang","Wenqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08521v2","updated":"2024-11-18T07:29:38Z","published":"2024-11-13T11:08:28Z","title":"SAD-TIME: a Spatiotemporal-fused network for depression detection with\n Automated multi-scale Depth-wise and TIME-interval-related common feature\n extractor","summary":" Background and Objective: Depression is a severe mental disorder, and\naccurate diagnosis is pivotal to the cure and rehabilitation of people with\ndepression. 
However, the current questionnaire-based diagnostic methods could\nbring subjective biases and may be denied by subjects. In search of a more\nobjective means of diagnosis, researchers have begun to experiment with deep\nlearning-based methods for identifying depressive disorders in recent years.\nMethods: In this study, a novel Spatiotemporal-fused network with Automated\nmulti-scale Depth-wise and TIME-interval-related common feature extractor\n(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common\nfeatures extractor (CFE), a spatial sector (SpS), a modified temporal sector\n(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale\ndepth-wise 1D-convolutional neural network and a time-interval embedding\ngenerator, where the unique information of each channel is preserved. The SpS\nfuses the functional connectivity with the distance-based connectivity\ncontaining spatial position of EEG electrodes. A multi-head-attention graph\nconvolutional network is also applied in the SpS to fuse the features from\ndifferent EEG channels. The TeS is based on long short-term memory and graph\ntransformer networks, where the temporal information of different time-windows\nis fused. Moreover, the DAL is used after the SpS to obtain the\ndomain-invariant feature. Results: Experimental results under tenfold\ncross-validation show that the proposed SAD-TIME method achieves 92.00% and\n94.00% depression classification accuracies on two datasets, respectively, in\ncross-subject mode. 
Conclusion: SAD-TIME is a robust depression detection\nmodel, where the automatedly-generated features, the SpS and the TeS assist the\nclassification performance with the fusion of the innate spatiotemporal\ninformation in the EEG signals.\n","authors":["Han-Guang Wang","Hui-Rang Hou","Li-Cheng Jin","Chen-Yang Xu","Zhong-Yi Zhang","Qing-Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08521v2.pdf","comment":"21pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.03911v3","updated":"2024-11-18T07:17:56Z","published":"2024-05-07T00:08:15Z","title":"Federated Graph Condensation with Information Bottleneck Principles","summary":" Graph condensation, which reduces the size of a large-scale graph by\nsynthesizing a small-scale condensed graph as its substitution, has immediately\nbenefited various graph learning tasks. However, existing graph condensation\nmethods rely on centralized data storage, which is unfeasible for real-world\ndecentralized data distribution, and overlook data holders' privacy-preserving\nrequirements. To bridge the gap, we propose and study the novel problem of\nfederated graph condensation for graph neural networks (GNNs). Specifically, we\nfirst propose a general framework for federated graph condensation, in which we\ndecouple the typical gradient matching process for graph condensation into\nclient-side gradient calculation and server-side gradient matching. In this\nway, the burdensome computation cost in client-side is largely alleviated.\nBesides, our empirical studies show that under the federated setting, the\ncondensed graph will consistently leak data membership privacy, i.e., the\ncondensed graph during the federated training can be utilized to steal the\ntraining data under the membership inference attacks (MIA). 
To tackle this\nissue, we innovatively incorporate information bottleneck principles into the\nfederated graph condensation, which only needs to extract partial node features\nin one local pre-training step and utilize the features during federated\ntraining. Extensive experiments on real-world datasets demonstrate that our\nframework can consistently protect membership privacy during training.\nMeanwhile, it also achieves comparable and even superior performance against\nexisting centralized graph condensation and federated graph learning methods.\n","authors":["Bo Yan","Sihao He","Cheng Yang","Shang Liu","Yang Cao","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2405.03911v3.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.10918v2","updated":"2024-11-18T06:50:30Z","published":"2024-05-17T17:09:45Z","title":"A Framework for Leveraging Partially-Labeled Data for Product\n Attribute-Value Identification","summary":" In the e-commerce domain, the accurate extraction of attribute-value pairs\n(e.g., Brand: Apple) from product titles and user search queries is crucial for\nenhancing search and recommendation systems. A major challenge with neural\nmodels for this task is the lack of high-quality training data, as the\nannotations for attribute-value pairs in the available datasets are often\nincomplete. To address this, we introduce GenToC, a model designed for training\ndirectly with partially-labeled data, eliminating the necessity for a fully\nannotated dataset. GenToC employs a marker-augmented generative model to\nidentify potential attributes, followed by a token classification model that\ndetermines the associated values for each attribute. GenToC outperforms\nexisting state-of-the-art models, exhibiting upto 56.3% increase in the number\nof accurate extractions. Furthermore, we utilize GenToC to regenerate the\ntraining dataset to expand attribute-value annotations. 
This bootstrapping\nsubstantially improves the data quality for training other standard NER models,\nwhich are typically faster but less capable in handling partially-labeled data,\nenabling them to achieve comparable performance to GenToC. Our results\ndemonstrate GenToC's unique ability to learn from a limited set of\npartially-labeled data and improve the training of more efficient models,\nadvancing the automated extraction of attribute-value pairs. Finally, our model\nhas been successfully integrated into IndiaMART, India's largest B2B e-commerce\nplatform, achieving a significant increase of 20.2% in the number of correctly\nidentified attribute-value pairs over the existing deployed system while\nachieving a high precision of 89.5%.\n","authors":["D. Subhalingam","Keshav Kolluru"," Mausam","Saurabh Singal"],"pdf_url":"https://arxiv.org/pdf/2405.10918v2.pdf","comment":"Accepted to KDD 2025 ADS Track"},{"id":"http://arxiv.org/abs/2411.09261v2","updated":"2024-11-18T06:41:26Z","published":"2024-11-14T07:58:44Z","title":"Automating Autograding: Large Language Models as Test Suite Generators\n for Introductory Programming","summary":" Automatically graded programming assignments provide instant feedback to\nstudents and significantly reduce manual grading time for instructors. However,\ncreating comprehensive suites of test cases for programming problems within\nautomatic graders can be time-consuming and complex. The effort needed to\ndefine test suites may deter some instructors from creating additional problems\nor lead to inadequate test coverage, potentially resulting in misleading\nfeedback on student solutions. Such limitations may reduce student access to\nthe well-documented benefits of timely feedback when learning programming.\n In this work, we evaluate the effectiveness of using Large Language Models\n(LLMs), as part of a larger workflow, to automatically generate test suites for\nCS1-level programming problems. 
Each problem's statement and reference solution\nare provided to GPT-4 to produce a test suite that can be used by an\nautograder. We evaluate our proposed approach using a sample of 26 problems,\nand more than 25,000 attempted solutions to those problems, submitted by\nstudents in an introductory programming course. We compare the performance of\nthe LLM-generated test suites against the instructor-created test suites for\neach problem. Our findings reveal that LLM-generated test suites can correctly\nidentify most valid solutions, and for most problems are at least as\ncomprehensive as the instructor test suites. Additionally, the LLM-generated\ntest suites exposed ambiguities in some problem statements, underscoring their\npotential to improve both autograding and instructional design.\n","authors":["Umar Alkafaween","Ibrahim Albluwi","Paul Denny"],"pdf_url":"https://arxiv.org/pdf/2411.09261v2.pdf","comment":"Submitted to Journal of Computer Assisted Learning; updated table\n refs"},{"id":"http://arxiv.org/abs/2411.11318v1","updated":"2024-11-18T06:22:30Z","published":"2024-11-18T06:22:30Z","title":"Syllabus: Portable Curricula for Reinforcement Learning Agents","summary":" Curriculum learning has been a quiet yet crucial component of many of the\nhigh-profile successes of reinforcement learning. Despite this, none of the\nmajor reinforcement learning libraries directly support curriculum learning or\ninclude curriculum learning implementations. These methods can improve the\ncapabilities and robustness of RL agents, but often require significant,\ncomplex changes to agent training code. We introduce Syllabus, a library for\ntraining RL agents with curriculum learning, as a solution to this problem.\nSyllabus provides a universal API for curriculum learning algorithms,\nimplementations of popular curriculum learning methods, and infrastructure for\neasily integrating them with distributed training code written in nearly any RL\nlibrary. 
Syllabus provides a minimal API for each of the core components of\ncurriculum learning, dramatically simplifying the process of designing new\nalgorithms and applying existing algorithms to new environments. We demonstrate\nthat the same Syllabus code can be used to train agents written in multiple\ndifferent RL libraries on numerous domains. In doing so, we present the first\nexamples of curriculum learning in NetHack and Neural MMO, two of the premier\nchallenges for single-agent and multi-agent RL respectively, achieving strong\nresults compared to state of the art baselines.\n","authors":["Ryan Sullivan","Ryan Pégoud","Ameen Ur Rahmen","Xinchen Yang","Junyun Huang","Aayush Verma","Nistha Mitra","John P. Dickerson"],"pdf_url":"https://arxiv.org/pdf/2411.11318v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.11312v1","updated":"2024-11-18T06:13:51Z","published":"2024-11-18T06:13:51Z","title":"Study of the Performance of CEEMDAN in Underdetermined Speech Separation","summary":" The CEEMDAN algorithm is one of the modern methods used in the analysis of\nnon-stationary signals. This research presents a study of the effectiveness of\nthis method in audio source separation to know the limits of its work. It\nconcluded two conditions related to frequencies and amplitudes of mixed signals\nto be separated by CEEMDAN. The performance of the algorithm in separating\nnoise from speech and separating speech signals from each other is studied. The\nresearch reached a conclusion that CEEMDAN can remove some types of noise from\nspeech (speech improvement), and it cannot separate speech signals from each\nother (cocktail party). 
Simulation is done using Matlab environment and Noizeus\ndatabase.\n","authors":["Rawad Melhem","Riad Hamadeh","Assef Jafar"],"pdf_url":"https://arxiv.org/pdf/2411.11312v1.pdf","comment":"in Arabic language"},{"id":"http://arxiv.org/abs/2411.11305v1","updated":"2024-11-18T06:01:00Z","published":"2024-11-18T06:01:00Z","title":"TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation","summary":" The advancement of medical image segmentation techniques has been propelled\nby the adoption of deep learning techniques, particularly UNet-based\napproaches, which exploit semantic information to improve the accuracy of\nsegmentations. However, the order of organs in scanned images has been\ndisregarded by current medical image segmentation approaches based on UNet.\nFurthermore, the inherent network structure of UNet does not provide direct\ncapabilities for integrating temporal information. To efficiently integrate\ntemporal information, we propose TP-UNet that utilizes temporal prompts,\nencompassing organ-construction relationships, to guide the segmentation UNet\nmodel. Specifically, our framework is featured with cross-attention and\nsemantic alignment based on unsupervised contrastive learning to combine\ntemporal prompts and image features effectively. Extensive evaluations on two\nmedical image segmentation datasets demonstrate the state-of-the-art\nperformance of TP-UNet. 
Our implementation will be open-sourced after\nacceptance.\n","authors":["Ranmin Wang","Limin Zhuang","Hongkun Chen","Boyan Xu","Ruichu Cai"],"pdf_url":"https://arxiv.org/pdf/2411.11305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11303v1","updated":"2024-11-18T05:58:47Z","published":"2024-11-18T05:58:47Z","title":"Recurrent Stochastic Configuration Networks with Incremental Blocks","summary":" Recurrent stochastic configuration networks (RSCNs) have shown promise in\nmodelling nonlinear dynamic systems with order uncertainty due to their\nadvantages of easy implementation, less human intervention, and strong\napproximation capability. This paper develops the original RSCNs with block\nincrements, termed block RSCNs (BRSCNs), to further enhance the learning\ncapacity and efficiency of the network. BRSCNs can simultaneously add multiple\nreservoir nodes (subreservoirs) during the construction. Each subreservoir is\nconfigured with a unique structure in the light of a supervisory mechanism,\nensuring the universal approximation property. The reservoir feedback matrix is\nappropriately scaled to guarantee the echo state property of the network.\nFurthermore, the output weights are updated online using a projection\nalgorithm, and the persistent excitation conditions that facilitate parameter\nconvergence are also established. 
Numerical results over a time series\nprediction, a nonlinear system identification task, and two industrial data\npredictive analyses demonstrate that the proposed BRSCN performs favourably in\nterms of modelling efficiency, learning, and generalization performance,\nhighlighting their significant potential for coping with complex dynamics.\n","authors":["Gang Dang","Dainhui Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11302v1","updated":"2024-11-18T05:58:41Z","published":"2024-11-18T05:58:41Z","title":"Towards Personalized Brain-Computer Interface Application Based on\n Endogenous EEG Paradigms","summary":" In this paper, we propose a conceptual framework for personalized\nbrain-computer interface (BCI) applications, which can offer an enhanced user\nexperience by customizing services to individual preferences and needs, based\non endogenous electroencephalography (EEG) paradigms including motor imagery\n(MI), speech imagery (SI), and visual imagery. The framework includes two\nessential components: user identification and intention classification, which\nenable personalized services by identifying individual users and recognizing\ntheir intended actions through EEG signals. We validate the feasibility of our\nframework using a private EEG dataset collected from eight subjects, employing\nthe ShallowConvNet architecture to decode EEG features. The experimental\nresults demonstrate that user identification achieved an average classification\naccuracy of 0.995, while intention classification achieved 0.47 accuracy across\nall paradigms, with MI demonstrating the best performance. 
These findings\nindicate that EEG signals can effectively support personalized BCI\napplications, offering robust identification and reliable intention decoding,\nespecially for MI and SI.\n","authors":["Heon-Gyu Kwak","Gi-Hwan Shin","Yeon-Woo Choi","Dong-Hoon Lee","Yoo-In Jeon","Jun-Su Kang","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.11302v1.pdf","comment":"Submissoion version for IEEE International BCI Winter Conference 2025"},{"id":"http://arxiv.org/abs/2406.08527v2","updated":"2024-11-18T05:47:10Z","published":"2024-06-12T08:31:34Z","title":"Optimized Feature Generation for Tabular Data via LLMs with Decision\n Tree Reasoning","summary":" In tabular prediction tasks, tree-based models combined with automated\nfeature engineering methods often outperform deep learning approaches that rely\non learned representations. While these feature engineering techniques are\neffective, they typically depend on a pre-defined search space and primarily\nuse validation scores for feature selection, thereby missing valuable insights\nfrom previous experiments. To address these limitations, we propose a novel\ntabular learning framework that utilizes large language models (LLMs), termed\nOptimizing Column feature generator with decision Tree reasoning (OCTree). Our\nkey idea is to leverage the reasoning capabilities of LLMs to identify\neffective feature generation rules without manually specifying the search space\nand provide language-based reasoning information highlighting past experiments\nas feedback for iterative rule improvements. We use decision trees to convey\nthis reasoning information, as they can be easily represented in natural\nlanguage, effectively providing knowledge from prior experiments (i.e., the\nimpact of the generated features on performance) to the LLMs. 
Our empirical\nresults demonstrate that OCTree consistently enhances the performance of\nvarious prediction models across diverse benchmarks, outperforming competing\nautomated feature engineering methods. Code is available at\nhttps://github.com/jaehyun513/OCTree.\n","authors":["Jaehyun Nam","Kyuyoung Kim","Seunghyuk Oh","Jihoon Tack","Jaehyung Kim","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2406.08527v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11295v1","updated":"2024-11-18T05:41:27Z","published":"2024-11-18T05:41:27Z","title":"Transcending Language Boundaries: Harnessing LLMs for Low-Resource\n Language Translation","summary":" Large Language Models (LLMs) have demonstrated remarkable success across a\nwide range of tasks and domains. However, their performance in low-resource\nlanguage translation, particularly when translating into these languages,\nremains underexplored. This gap poses significant challenges, as linguistic\nbarriers hinder the cultural preservation and development of minority\ncommunities. To address this issue, this paper introduces a novel\nretrieval-based method that enhances translation quality for low-resource\nlanguages by focusing on key terms, which involves translating keywords and\nretrieving corresponding examples from existing data. To evaluate the\neffectiveness of this method, we conducted experiments translating from English\ninto three low-resource languages: Cherokee, a critically endangered indigenous\nlanguage of North America; Tibetan, a historically and culturally significant\nlanguage in Asia; and Manchu, a language with few remaining speakers. Our\ncomparison with the zero-shot performance of GPT-4o and LLaMA 3.1 405B,\nhighlights the significant challenges these models face when translating into\nlow-resource languages. 
In contrast, our retrieval-based method shows promise\nin improving both word-level accuracy and overall semantic understanding by\nleveraging existing resources more effectively.\n","authors":["Peng Shu","Junhao Chen","Zhengliang Liu","Hui Wang","Zihao Wu","Tianyang Zhong","Yiwei Li","Huaqin Zhao","Hanqi Jiang","Yi Pan","Yifan Zhou","Constance Owl","Xiaoming Zhai","Ninghao Liu","Claudio Saunt","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12311v3","updated":"2024-11-18T05:23:42Z","published":"2024-10-16T07:24:28Z","title":"Open Domain Question Answering with Conflicting Contexts","summary":" Open domain question answering systems frequently rely on information\nretrieved from large collections of text (such as the Web) to answer questions.\nHowever, such collections of text often contain conflicting information, and\nindiscriminately depending on this information may result in untruthful and\ninaccurate answers. To understand the gravity of this problem, we collect a\nhuman-annotated dataset, Question Answering with Conflicting Contexts (QACC),\nand find that as much as 25% of unambiguous, open domain questions can lead to\nconflicting contexts when retrieved using Google Search. We evaluate and\nbenchmark three powerful Large Language Models (LLMs) with our dataset QACC and\ndemonstrate their limitations in effectively addressing questions with\nconflicting information. To explore how humans reason through conflicting\ncontexts, we request our annotators to provide explanations for their\nselections of correct answers. 
We demonstrate that by finetuning LLMs to\nexplain their answers, we can introduce richer information into their training\nthat guide them through the process of reasoning with conflicting contexts.\n","authors":["Siyi Liu","Qiang Ning","Kishaloy Halder","Wei Xiao","Zheng Qi","Phu Mon Htut","Yi Zhang","Neha Anna John","Bonan Min","Yassine Benajiba","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2410.12311v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01585v3","updated":"2024-11-18T05:18:51Z","published":"2024-08-02T21:54:13Z","title":"LibreLog: Accurate and Efficient Unsupervised Log Parsing Using\n Open-Source Large Language Models","summary":" Log parsing is a critical step that transforms unstructured log data into\nstructured formats, facilitating subsequent log-based analysis. Traditional\nsyntax-based log parsers are efficient and effective, but they often experience\ndecreased accuracy when processing logs that deviate from the predefined rules.\nRecently, large language models (LLM) based log parsers have shown superior\nparsing accuracy. However, existing LLM-based parsers face three main\nchallenges: 1)time-consuming and labor-intensive manual labeling for\nfine-tuning or in-context learning, 2)increased parsing costs due to the vast\nvolume of log data and limited context size of LLMs, and 3)privacy risks from\nusing commercial models like ChatGPT with sensitive log information. To\novercome these limitations, this paper introduces LibreLog, an unsupervised log\nparsing approach that leverages open-source LLMs (i.e., Llama3-8B) to enhance\nprivacy and reduce operational costs while achieving state-of-the-art parsing\naccuracy. LibreLog first groups logs with similar static text but varying\ndynamic variables using a fixed-depth grouping tree. 
It then parses logs within\nthese groups using three components: i)similarity scoring-based retrieval\naugmented generation: selects diverse logs within each group based on Jaccard\nsimilarity, helping the LLM distinguish between static text and dynamic\nvariables; ii)self-reflection: iteratively query LLMs to refine log templates\nto improve parsing accuracy; and iii) log template memory: stores parsed\ntemplates to reduce LLM queries for improved parsing efficiency. Our evaluation\non LogHub-2.0 shows that LibreLog achieves 25% higher parsing accuracy and\nprocesses logs 2.7 times faster compared to state-of-the-art LLM-based parsers.\nIn short, LibreLog addresses privacy and cost concerns of using commercial LLMs\nwhile achieving state-of-the-arts parsing efficiency and accuracy.\n","authors":["Zeyang Ma","Dong Jae Kim","Tse-Hsun Chen"],"pdf_url":"https://arxiv.org/pdf/2408.01585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11289v1","updated":"2024-11-18T05:17:27Z","published":"2024-11-18T05:17:27Z","title":"LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large\n Language Models","summary":" Creating high-quality, large-scale datasets for large language models (LLMs)\noften relies on resource-intensive, GPU-accelerated models for quality\nfiltering, making the process time-consuming and costly. This dependence on\nGPUs limits accessibility for organizations lacking significant computational\ninfrastructure. To address this issue, we introduce the Lightweight,\nPurpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs\nto streamline the processes of dataset extraction, filtering, and curation.\nBased on our four core principles, the LP Data Pipeline significantly reduces\npreparation time and cost while maintaining high data quality. Importantly, our\npipeline enables the creation of purpose-driven datasets tailored to specific\ndomains and languages, enhancing the applicability of LLMs in specialized\ncontexts. 
We anticipate that our pipeline will lower the barriers to LLM\ndevelopment, enabling a wide range of organizations to access LLMs more easily.\n","authors":["Yungi Kim","Hyunsoo Ha","Seonghoon Yang","Sukyung Lee","Jihoo Kim","Chanjun Park"],"pdf_url":"https://arxiv.org/pdf/2411.11289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11285v1","updated":"2024-11-18T05:11:29Z","published":"2024-11-18T05:11:29Z","title":"Zero-Shot Automatic Annotation and Instance Segmentation using\n LLM-Generated Datasets: Eliminating Field Imaging and Manual Annotation for\n Deep Learning Model Development","summary":" Currently, deep learning-based instance segmentation for various applications\n(e.g., Agriculture) is predominantly performed using a labor-intensive process\ninvolving extensive field data collection using sophisticated sensors, followed\nby careful manual annotation of images, presenting significant logistical and\nfinancial challenges to researchers and organizations. The process also slows\ndown the model development and training process. In this study, we presented a\nnovel method for deep learning-based instance segmentation of apples in\ncommercial orchards that eliminates the need for labor-intensive field data\ncollection and manual annotation. Utilizing a Large Language Model (LLM), we\nsynthetically generated orchard images and automatically annotated them using\nthe Segment Anything Model (SAM) integrated with a YOLO11 base model. This\nmethod significantly reduces reliance on physical sensors and manual data\nprocessing, presenting a major advancement in \"Agricultural AI\". The synthetic,\nauto-annotated dataset was used to train the YOLO11 model for Apple instance\nsegmentation, which was then validated on real orchard images. The results\nshowed that the automatically generated annotations achieved a Dice Coefficient\nof 0.9513 and an IoU of 0.9303, validating the accuracy and overlap of the mask\nannotations. 
All YOLO11 configurations, trained solely on these synthetic\ndatasets with automated annotations, accurately recognized and delineated\napples, highlighting the method's efficacy. Specifically, the YOLO11m-seg\nconfiguration achieved a mask precision of 0.902 and a mask mAP@50 of 0.833 on\ntest images collected from a commercial orchard. Additionally, the YOLO11l-seg\nconfiguration outperformed other models in validation on 40 LLM-generated\nimages, achieving the highest mask precision and mAP@50 metrics.\n Keywords: YOLO, SAM, SAMv2, YOLO11, YOLOv11, Segment Anything, YOLO-SAM\n","authors":["Ranjan Sapkota","Achyut Paudel","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2411.11285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06220v2","updated":"2024-11-18T05:00:58Z","published":"2024-09-10T05:08:26Z","title":"CerviXpert: A Multi-Structural Convolutional Neural Network for\n Predicting Cervix Type and Cervical Cell Abnormalities","summary":" Cervical cancer is a major cause of cancer-related mortality among women\nworldwide, and its survival rate improves significantly with early detection.\nTraditional diagnostic methods such as Pap smears and cervical biopsies rely\nheavily on cytologist expertise, making the process prone to human error. This\nstudy introduces CerviXpert, a multi-structural convolutional neural network\nmodel designed to efficiently classify cervix types and detect cervical cell\nabnormalities. CerviXpert is built as a computationally efficient model that\nclassifies cervical cancer using images from the publicly available SiPaKMeD\ndataset. 
The model architecture emphasizes simplicity, using a limited number\nof convolutional layers followed by max pooling and dense layers, trained from\nscratch.\n We assessed the performance of CerviXpert against other state of the art\nconvolutional neural network models including ResNet50, VGG16, MobileNetV2, and\nInceptionV3, evaluating them on accuracy, computational efficiency, and\nrobustness using five fold cross validation. CerviXpert achieved an accuracy of\n98.04 percent in classifying cervical cell abnormalities into three classes and\n98.60 percent for five class cervix type classification, outperforming\nMobileNetV2 and InceptionV3 in both accuracy and computational requirements. It\nshowed comparable results to ResNet50 and VGG16 while reducing computational\ncomplexity and resource needs.\n CerviXpert provides an effective solution for cervical cancer screening and\ndiagnosis, balancing accuracy with computational efficiency. Its streamlined\ndesign enables deployment in resource constrained environments, potentially\nenhancing early detection and management of cervical cancer.\n","authors":["Rashik Shahriar Akash","Radiful Islam","S. M. Saiful Islam Badhon","K. S. M. Tozammel Hossain"],"pdf_url":"https://arxiv.org/pdf/2409.06220v2.pdf","comment":"11 figures, 9 tables"},{"id":"http://arxiv.org/abs/2411.11283v1","updated":"2024-11-18T04:55:26Z","published":"2024-11-18T04:55:26Z","title":"Multi-Hyperbolic Space-based Heterogeneous Graph Attention Network","summary":" To leverage the complex structures within heterogeneous graphs, recent\nstudies on heterogeneous graph embedding use a hyperbolic space, characterized\nby a constant negative curvature and exponentially increasing space, which\naligns with the structural properties of heterogeneous graphs. 
However, despite\nheterogeneous graphs inherently possessing diverse power-law structures, most\nhyperbolic heterogeneous graph embedding models use a single hyperbolic space\nfor the entire heterogeneous graph, which may not effectively capture the\ndiverse power-law structures within the heterogeneous graph. To address this\nlimitation, we propose Multi-hyperbolic Space-based heterogeneous Graph\nAttention Network (MSGAT), which uses multiple hyperbolic spaces to effectively\ncapture diverse power-law structures within heterogeneous graphs. We conduct\ncomprehensive experiments to evaluate the effectiveness of MSGAT. The\nexperimental results demonstrate that MSGAT outperforms state-of-the-art\nbaselines in various graph machine learning tasks, effectively capturing the\ncomplex structures of heterogeneous graphs.\n","authors":["Jongmin Park","Seunghoon Han","Jong-Ryul Lee","Sungsu Lim"],"pdf_url":"https://arxiv.org/pdf/2411.11283v1.pdf","comment":"Accepted in IEEE ICDM 2024"},{"id":"http://arxiv.org/abs/2411.11282v1","updated":"2024-11-18T04:54:04Z","published":"2024-11-18T04:54:04Z","title":"Continuous K-space Recovery Network with Image Guidance for Fast MRI\n Reconstruction","summary":" Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis\nwhile facing the challenge of long scanning time. To reduce the acquisition\ntime, fast MRI reconstruction aims to restore high-quality images from the\nundersampled k-space. Existing methods typically train deep learning models to\nmap the undersampled data to artifact-free MRI images. However, these studies\noften overlook the unique properties of k-space and directly apply general\nnetworks designed for image processing to k-space recovery, leaving the precise\nlearning of k-space largely underexplored. 
In this work, we propose a\ncontinuous k-space recovery network from a new perspective of implicit neural\nrepresentation with image domain guidance, which boosts the performance of MRI\nreconstruction. Specifically, (1) an implicit neural representation based\nencoder-decoder structure is customized to continuously query unsampled\nk-values. (2) an image guidance module is designed to mine the semantic\ninformation from the low-quality MRI images to further guide the k-space\nrecovery. (3) a multi-stage training strategy is proposed to recover dense\nk-space progressively. Extensive experiments conducted on CC359, fastMRI, and\nIXI datasets demonstrate the effectiveness of our method and its superiority\nover other competitors.\n","authors":["Yucong Meng","Zhiwei Yang","Minghong Duan","Yonghong Shi","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2411.11282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15051v5","updated":"2024-11-18T03:55:02Z","published":"2023-07-27T17:56:56Z","title":"Matching Patients to Clinical Trials with Large Language Models","summary":" Patient recruitment is challenging for clinical trials. We introduce\nTrialGPT, an end-to-end framework for zero-shot patient-to-trial matching with\nlarge language models. TrialGPT comprises three modules: it first performs\nlarge-scale filtering to retrieve candidate trials (TrialGPT-Retrieval); then\npredicts criterion-level patient eligibility (TrialGPT-Matching); and finally\ngenerates trial-level scores (TrialGPT-Ranking). We evaluate TrialGPT on three\ncohorts of 183 synthetic patients with over 75,000 trial annotations.\nTrialGPT-Retrieval can recall over 90% of relevant trials using less than 6% of\nthe initial collection. Manual evaluations on 1,015 patient-criterion pairs\nshow that TrialGPT-Matching achieves an accuracy of 87.3% with faithful\nexplanations, close to the expert performance. 
The TrialGPT-Ranking scores are\nhighly correlated with human judgments and outperform the best-competing models\nby 43.8% in ranking and excluding trials. Furthermore, our user study reveals\nthat TrialGPT can reduce the screening time by 42.6% in patient recruitment.\nOverall, these results have demonstrated promising opportunities for\npatient-to-trial matching with TrialGPT.\n","authors":["Qiao Jin","Zifeng Wang","Charalampos S. Floudas","Fangyuan Chen","Changlin Gong","Dara Bracken-Clarke","Elisabetta Xue","Yifan Yang","Jimeng Sun","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.15051v5.pdf","comment":"Nature Communications"},{"id":"http://arxiv.org/abs/2407.05550v4","updated":"2024-11-18T03:55:00Z","published":"2024-07-08T01:58:48Z","title":"MEEG and AT-DGNN: Improving EEG Emotion Recognition with Music\n Introducing and Graph-based Learning","summary":" We present the MEEG dataset, a multi-modal collection of music-induced\nelectroencephalogram (EEG) recordings designed to capture emotional responses\nto various musical stimuli across different valence and arousal levels. This\npublic dataset facilitates an in-depth examination of brainwave patterns within\nmusical contexts, providing a robust foundation for studying brain network\ntopology during emotional processing. Leveraging the MEEG dataset, we introduce\nthe Attention-based Temporal Learner with Dynamic Graph Neural Network\n(AT-DGNN), a novel framework for EEG-based emotion recognition. This model\ncombines an attention mechanism with a dynamic graph neural network (DGNN) to\ncapture intricate EEG dynamics. The AT-DGNN achieves state-of-the-art (SOTA)\nperformance with an accuracy of 83.74% in arousal recognition and 86.01% in\nvalence recognition, outperforming existing SOTA methods. 
Comparative analysis\nwith traditional datasets, such as DEAP, further validates the model's\neffectiveness and underscores the potency of music as an emotional stimulus.\nThis study advances graph-based learning methodology in brain-computer\ninterfaces (BCI), significantly improving the accuracy of EEG-based emotion\nrecognition. The MEEG dataset and source code are publicly available at\nhttps://github.com/xmh1011/AT-DGNN.\n","authors":["Minghao Xiao","Zhengxi Zhu","Kang Xie","Bin Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.05550v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14701v3","updated":"2024-11-18T03:52:26Z","published":"2024-05-23T15:35:48Z","title":"DreamText: High Fidelity Scene Text Synthesis","summary":" Scene text synthesis involves rendering specified texts onto arbitrary\nimages. Current methods typically formulate this task in an end-to-end manner\nbut lack effective character-level guidance during training. Besides, their\ntext encoders, pre-trained on a single font type, struggle to adapt to the\ndiverse font styles encountered in practical applications. Consequently, these\nmethods suffer from character distortion, repetition, and absence, particularly\nin polystylistic scenarios. To this end, this paper proposes DreamText for\nhigh-fidelity scene text synthesis. Our key idea is to reconstruct the\ndiffusion training process, introducing more refined guidance tailored to this\ntask, to expose and rectify the model's attention at the character level and\nstrengthen its learning of text regions. This transformation poses a hybrid\noptimization challenge, involving both discrete and continuous variables. To\neffectively tackle this challenge, we employ a heuristic alternate optimization\nstrategy. Meanwhile, we jointly train the text encoder and generator to\ncomprehensively learn and utilize the diverse font present in the training\ndataset. 
This joint training is seamlessly integrated into the alternate\noptimization process, fostering a synergistic relationship between learning\ncharacter embedding and re-estimating character attention. Specifically, in\neach step, we first encode potential character-generated position information\nfrom cross-attention maps into latent character masks. These masks are then\nutilized to update the representation of specific characters in the current\nstep, which, in turn, enables the generator to correct the character's\nattention in the subsequent steps. Both qualitative and quantitative results\ndemonstrate the superiority of our method to the state of the art.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2405.14701v3.pdf","comment":"Code: https://github.com/CodeGoat24/DreamText, Project page:\n https://codegoat24.github.io/DreamText/"},{"id":"http://arxiv.org/abs/2411.11262v1","updated":"2024-11-18T03:35:34Z","published":"2024-11-18T03:35:34Z","title":"Cross-Patient Pseudo Bags Generation and Curriculum Contrastive Learning\n for Imbalanced Multiclassification of Whole Slide Image","summary":" Pathology computing has dramatically improved pathologists' workflow and\ndiagnostic decision-making processes. Although computer-aided diagnostic\nsystems have shown considerable value in whole slide image (WSI) analysis, the\nproblem of multi-classification under sample imbalance remains an intractable\nchallenge. To address this, we propose learning fine-grained information by\ngenerating sub-bags with feature distributions similar to the original WSIs.\nAdditionally, we utilize a pseudo-bag generation algorithm to further leverage\nthe abundant and redundant information in WSIs, allowing efficient training in\nunbalanced-sample multi-classification tasks. Furthermore, we introduce an\naffinity-based sample selection and curriculum contrastive learning strategy to\nenhance the stability of model representation learning. 
Unlike previous\napproaches, our framework transitions from learning bag-level representations\nto understanding and exploiting the feature distribution of multi-instance\nbags. Our method demonstrates significant performance improvements on three\ndatasets, including tumor classification and lymph node metastasis. On average,\nit achieves a 4.39-point improvement in F1 score compared to the second-best\nmethod across the three tasks, underscoring its superior performance.\n","authors":["Yonghuang Wu","Xuan Xie","Xinyuan Niu","Chengqian Zhao","Jinhua Yu"],"pdf_url":"https://arxiv.org/pdf/2411.11262v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.02387v5","updated":"2024-11-18T03:17:32Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. 
This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen","Ming Li","Lawrence KQ Yan","Yichao Zhang","Caitlyn Heqi Yin","Cheng Fei","Tianyang Wang","Yunze Wang","Silin Chen"],"pdf_url":"https://arxiv.org/pdf/2409.02387v5.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.07433v5","updated":"2024-11-18T03:14:37Z","published":"2024-08-14T10:08:46Z","title":"MagicFace: Training-free Universal-Style Human Image Customized\n Synthesis","summary":" Current human image customization methods leverage Stable Diffusion (SD) for\nits rich semantic prior. However, since SD is not specifically designed for\nhuman-oriented generation, these methods often require extensive fine-tuning on\nlarge-scale datasets, which renders them susceptible to overfitting and hinders\ntheir ability to personalize individuals with previously unseen styles.\nMoreover, these methods extensively focus on single-concept human image\nsynthesis and lack the flexibility to customize individuals using multiple\ngiven concepts, thereby impeding their broader practical application. This\npaper proposes MagicFace, a novel training-free method for multi-concept\nuniversal-style human image personalized synthesis. Our core idea is to\nsimulate how humans create images given specific concepts, i.e., first\nestablish a semantic layout considering factors such as concepts' shape and\nposture, then optimize details by comparing with concepts at the pixel level.\nTo implement this process, we introduce a coarse-to-fine generation pipeline,\ninvolving two sequential stages: semantic layout construction and concept\nfeature injection. This is achieved by our Reference-aware Self-Attention (RSA)\nand Region-grouped Blend Attention (RBA) mechanisms. 
In the first stage, RSA\nenables the latent image to query features from all reference concepts\nsimultaneously, extracting the overall semantic understanding to facilitate the\ninitial semantic layout establishment. In the second stage, we employ an\nattention-based semantic segmentation method to pinpoint the latent generated\nregions of all concepts at each step. Following this, RBA divides the pixels of\nthe latent image into semantic groups, with each group querying fine-grained\nfeatures from the corresponding reference concept. Extensive experiments\ndemonstrate the superiority of our MagicFace.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07433v5.pdf","comment":"project page: https://codegoat24.github.io/MagicFace"},{"id":"http://arxiv.org/abs/2406.15778v2","updated":"2024-11-18T03:02:17Z","published":"2024-06-22T07:57:58Z","title":"ObjectNLQ @ Ego4D Episodic Memory Challenge 2024","summary":" In this report, we present our approach for the Natural Language Query track\nand Goal Step track of the Ego4D Episodic Memory Benchmark at CVPR 2024. Both\nchallenges require the localization of actions within long video sequences\nusing textual queries. To enhance localization accuracy, our method not only\nprocesses the temporal information of videos but also identifies fine-grained\nobjects spatially within the frames. To this end, we introduce a novel\napproach, termed ObjectNLQ, which incorporates an object branch to augment the\nvideo representation with detailed object information, thereby improving\ngrounding efficiency. ObjectNLQ achieves a mean R@1 of 23.15, ranking 2nd in\nthe Natural Language Queries Challenge, and gains 33.00 in terms of the metric\nR@1, IoU=0.3, ranking 3rd in the Goal Step Challenge. 
Our code will be released\nat https://github.com/Yisen-Feng/ObjectNLQ.\n","authors":["Yisen Feng","Haoyu Zhang","Yuquan Xie","Zaijing Li","Meng Liu","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2406.15778v2.pdf","comment":"The solution for the Natural Language Query track and Goal Step track\n at CVPR EgoVis Workshop 2024"},{"id":"http://arxiv.org/abs/2402.16726v3","updated":"2024-11-18T02:56:27Z","published":"2024-02-26T16:48:12Z","title":"Towards Empirical Interpretation of Internal Circuits and Properties in\n Grokked Transformers on Modular Polynomials","summary":" Grokking has been actively explored to reveal the mystery of delayed\ngeneralization and identifying interpretable representations and algorithms\ninside the grokked models is a suggestive hint to understanding its mechanism.\nGrokking on modular addition has been known to implement Fourier representation\nand its calculation circuits with trigonometric identities in Transformers.\nConsidering the periodicity in modular arithmetic, the natural question is to\nwhat extent these explanations and interpretations hold for the grokking on\nother modular operations beyond addition. For a closer look, we first\nhypothesize that any modular operations can be characterized with distinctive\nFourier representation or internal circuits, grokked models obtain common\nfeatures transferable among similar operations, and mixing datasets with\nsimilar operations promotes grokking. Then, we extensively examine them by\nlearning Transformers on complex modular arithmetic tasks, including\npolynomials. Our Fourier analysis and novel progress measure for modular\narithmetic, Fourier Frequency Density and Fourier Coefficient Ratio,\ncharacterize distinctive internal representations of grokked models per modular\noperation; for instance, polynomials often result in the superposition of the\nFourier components seen in elementary arithmetic, but clear patterns do not\nemerge in challenging non-factorizable polynomials. 
In contrast, our ablation\nstudy on the pre-grokked models reveals that the transferability among the\nmodels grokked with each operation can be only limited to specific\ncombinations, such as from elementary arithmetic to linear expressions.\nMoreover, some multi-task mixtures may lead to co-grokking -- where grokking\nsimultaneously happens for all the tasks -- and accelerate generalization,\nwhile others may not find optimal solutions. We provide empirical steps towards\nthe interpretability of internal circuits.\n","authors":["Hiroki Furuta","Gouki Minegishi","Yusuke Iwasawa","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2402.16726v3.pdf","comment":"Published at Transactions on Machine Learning Research (TMLR), Code:\n https://github.com/frt03/grok_mod_poly"},{"id":"http://arxiv.org/abs/2411.11249v1","updated":"2024-11-18T02:36:19Z","published":"2024-11-18T02:36:19Z","title":"EXCON: Extreme Instance-based Contrastive Representation Learning of\n Severely Imbalanced Multivariate Time Series for Solar Flare Prediction","summary":" In heliophysics research, predicting solar flares is crucial due to their\npotential to impact both space-based systems and Earth's infrastructure\nsubstantially. Magnetic field data from solar active regions, recorded by solar\nimaging observatories, are transformed into multivariate time series to enable\nsolar flare prediction using temporal window-based analysis. In the realm of\nmultivariate time series-driven solar flare prediction, addressing severe class\nimbalance with effective strategies for multivariate time series representation\nlearning is key to developing robust predictive models. Traditional methods\noften struggle with overfitting to the majority class in prediction tasks where\nmajor solar flares are infrequent. This work presents EXCON, a contrastive\nrepresentation learning framework designed to enhance classification\nperformance amidst such imbalances. 
EXCON operates through four stages:\nobtaining core features from multivariate time series data; selecting\ndistinctive contrastive representations for each class to maximize inter-class\nseparation; training a temporal feature embedding module with a custom extreme\nreconstruction loss to minimize intra-class variation; and applying a\nclassifier to the learned embeddings for robust classification. The proposed\nmethod leverages contrastive learning principles to map similar instances\ncloser in the feature space while distancing dissimilar ones, a strategy not\nextensively explored in solar flare prediction tasks. This approach not only\naddresses class imbalance but also offers a versatile solution applicable to\nunivariate and multivariate time series across binary and multiclass\nclassification problems. Experimental results, including evaluations on the\nbenchmark solar flare dataset and multiple time series archive datasets with\nbinary and multiclass labels, demonstrate EXCON's efficacy in enhancing\nclassification performance.\n","authors":["Onur Vural","Shah Muhammad Hamdi","Soukaina Filali Boubrahimi"],"pdf_url":"https://arxiv.org/pdf/2411.11249v1.pdf","comment":"This work has been accepted at the 2024 IEEE International Conference\n on Big Data (IEEE BigData 2024) on October 27, 2024, as a main conference\n paper"},{"id":"http://arxiv.org/abs/2411.11247v1","updated":"2024-11-18T02:35:15Z","published":"2024-11-18T02:35:15Z","title":"ZeFaV: Boosting Large Language Models for Zero-shot Fact Verification","summary":" In this paper, we propose ZeFaV - a zero-shot based fact-checking\nverification framework to enhance the performance on fact verification task of\nlarge language models by leveraging the in-context learning ability of large\nlanguage models to extract the relations among the entities within a claim,\nre-organized the information from the evidence in a relationally logical form,\nand combine the above information with the original evidence to 
generate the\ncontext from which our fact-checking model provide verdicts for the input\nclaims. We conducted empirical experiments to evaluate our approach on two\nmulti-hop fact-checking datasets including HoVer and FEVEROUS, and achieved\npotential results results comparable to other state-of-the-art fact\nverification task methods.\n","authors":["Son T. Luu","Hiep Nguyen","Trung Vo","Le-Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.11247v1.pdf","comment":"This pre-print has been published in PRICAI 2024: Trends in\n Artificial Intelligence. The published version is available at\n https://doi.org/10.1007/978-981-96-0119-6_28"},{"id":"http://arxiv.org/abs/2409.02389v2","updated":"2024-11-18T02:32:22Z","published":"2024-09-04T02:37:38Z","title":"Multi-modal Situated Reasoning in 3D Scenes","summary":" Situation awareness is essential for understanding and reasoning about 3D\nscenes in embodied AI agents. However, existing datasets and benchmarks for\nsituated understanding are limited in data modality, diversity, scale, and task\nscope. To address these limitations, we propose Multi-modal Situated Question\nAnswering (MSQA), a large-scale multi-modal situated reasoning dataset,\nscalably collected leveraging 3D scene graphs and vision-language models (VLMs)\nacross a diverse range of real-world 3D scenes. MSQA includes 251K situated\nquestion-answering pairs across 9 distinct question categories, covering\ncomplex scenarios within 3D scenes. We introduce a novel interleaved\nmulti-modal input setting in our benchmark to provide text, image, and point\ncloud for situation and question description, resolving ambiguity in previous\nsingle-modality convention (e.g., text). Additionally, we devise the\nMulti-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models'\nsituated reasoning for navigation. 
Comprehensive evaluations on MSQA and MSNN\nhighlight the limitations of existing vision-language models and underscore the\nimportance of handling multi-modal interleaved inputs and situation modeling.\nExperiments on data scaling and cross-domain transfer further demonstrate the\nefficacy of leveraging MSQA as a pre-training dataset for developing more\npowerful situated reasoning models.\n","authors":["Xiongkun Linghu","Jiangyong Huang","Xuesong Niu","Xiaojian Ma","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02389v2.pdf","comment":"Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page:\n https://msr3d.github.io/"},{"id":"http://arxiv.org/abs/2406.10806v2","updated":"2024-11-18T02:19:02Z","published":"2024-06-16T05:17:56Z","title":"ptt5-v2: A Closer Look at Continued Pretraining of T5 Models for the\n Portuguese Language","summary":" Despite advancements in Natural Language Processing (NLP) and the growing\navailability of pretrained models, the English language remains the primary\nfocus of model development. Continued pretraining on language-specific corpora\nprovides a practical solution for adapting models to other languages. However,\nthe impact of different pretraining settings on downstream tasks remains\nunderexplored. This work introduces $\\texttt{ptt5-v2}$, investigating the\ncontinued pretraining of T5 models for Portuguese. We first develop a baseline\nset of settings and pretrain models with sizes up to 3B parameters. Finetuning\non three Portuguese downstream tasks (assin2 STS, assin2 RTE, and TweetSentBR)\nyields SOTA results on the latter two. We then explore the effects of different\npretraining configurations, including pretraining data quality, optimization\nstrategies, and multi-epoch pretraining. Perhaps surprisingly, their impact\nremains subtle compared to our baseline. 
We release $\\texttt{ptt5-v2}$\npretrained checkpoints and their MonoT5-based finetuned $\\texttt{MonoPTT5}$\nrerankers on HuggingFace in their respective collections at\n\\url{https://huggingface.co/unicamp-dl}.\n","authors":["Marcos Piau","Roberto Lotufo","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2406.10806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07259v3","updated":"2024-11-18T02:18:14Z","published":"2023-10-11T07:37:13Z","title":"Uncovering Hidden Connections: Iterative Search and Reasoning for\n Video-grounded Dialog","summary":" In contrast to conventional visual question answering, video-grounded dialog\nnecessitates a profound understanding of both dialog history and video content\nfor accurate response generation. Despite commendable progress made by existing\napproaches, they still face the challenges of incrementally understanding\ncomplex dialog history and assimilating video information. In response to these\nchallenges, we present an iterative search and reasoning framework, which\nconsists of a textual encoder, a visual encoder, and a generator. Specifically,\nwe devise a path search and aggregation strategy in the textual encoder, mining\ncore cues from dialog history that are pivotal to understanding the posed\nquestions. Concurrently, our visual encoder harnesses an iterative reasoning\nnetwork to extract and emphasize critical visual markers from videos, enhancing\nthe depth of visual comprehension. Finally, we utilize the pre-trained GPT-2\nmodel as our answer generator to decode the mined hidden clues into coherent\nand contextualized answers. 
Extensive experiments on three public datasets\ndemonstrate the effectiveness and generalizability of our proposed framework.\n","authors":["Haoyu Zhang","Meng Liu","Yaowei Wang","Da Cao","Weili Guan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2310.07259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15297v2","updated":"2024-11-18T02:13:31Z","published":"2024-10-20T05:57:10Z","title":"Redefining Proactivity for Information Seeking Dialogue","summary":" Information-Seeking Dialogue (ISD) agents aim to provide accurate responses\nto user queries. While proficient in directly addressing user queries, these\nagents, as well as LLMs in general, predominantly exhibit reactive behavior,\nlacking the ability to generate proactive responses that actively engage users\nin sustained conversations. However, existing definitions of proactive dialogue\nin this context do not focus on how each response actively engages the user and\nsustains the conversation. Hence, we present a new definition of proactivity\nthat focuses on enhancing the `proactiveness' of each generated response via\nthe introduction of new information related to the initial query. 
To this end,\nwe construct a proactive dialogue dataset comprising 2,000 single-turn\nconversations, and introduce several automatic metrics to evaluate response\n`proactiveness' which achieved high correlation with human annotation.\nAdditionally, we introduce two innovative Chain-of-Thought (CoT) prompts, the\n3-step CoT and the 3-in-1 CoT prompts, which consistently outperform standard\nprompts by up to 90% in the zero-shot setting.\n","authors":["Jing Yang Lee","Seokhwan Kim","Kartik Mehta","Jiun-Yu Kao","Yu-Hsiang Lin","Arpit Gupta"],"pdf_url":"https://arxiv.org/pdf/2410.15297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11235v1","updated":"2024-11-18T02:09:48Z","published":"2024-11-18T02:09:48Z","title":"MEMO-Bench: A Multiple Benchmark for Text-to-Image and Multimodal Large\n Language Models on Human Emotion Analysis","summary":" Artificial Intelligence (AI) has demonstrated significant capabilities in\nvarious fields, and in areas such as human-computer interaction (HCI), embodied\nintelligence, and the design and animation of virtual digital humans, both\npractitioners and users are increasingly concerned with AI's ability to\nunderstand and express emotion. Consequently, the question of whether AI can\naccurately interpret human emotions remains a critical challenge. To date, two\nprimary classes of AI models have been involved in human emotion analysis:\ngenerative models and Multimodal Large Language Models (MLLMs). To assess the\nemotional capabilities of these two classes of models, this study introduces\nMEMO-Bench, a comprehensive benchmark consisting of 7,145 portraits, each\ndepicting one of six different emotions, generated by 12 Text-to-Image (T2I)\nmodels. Unlike previous works, MEMO-Bench provides a framework for evaluating\nboth T2I models and MLLMs in the context of sentiment analysis. 
Additionally, a\nprogressive evaluation approach is employed, moving from coarse-grained to\nfine-grained metrics, to offer a more detailed and comprehensive assessment of\nthe sentiment analysis capabilities of MLLMs. The experimental results\ndemonstrate that existing T2I models are more effective at generating positive\nemotions than negative ones. Meanwhile, although MLLMs show a certain degree of\neffectiveness in distinguishing and recognizing human emotions, they fall short\nof human-level accuracy, particularly in fine-grained emotion analysis. The\nMEMO-Bench will be made publicly available to support further research in this\narea.\n","authors":["Yingjie Zhou","Zicheng Zhang","Jiezhang Cao","Jun Jia","Yanwei Jiang","Farong Wen","Xiaohong Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2411.11235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03132v3","updated":"2024-11-18T02:06:46Z","published":"2024-10-04T04:07:15Z","title":"Autoregressive Action Sequence Learning for Robotic Manipulation","summary":" Designing a universal policy architecture that performs well across diverse\nrobots and task configurations remains a key challenge. In this work, we\naddress this by representing robot actions as sequential data and generating\nactions through autoregressive sequence modeling. Existing autoregressive\narchitectures generate end-effector waypoints sequentially as word tokens in\nlanguage modeling, which are limited to low-frequency control tasks. Unlike\nlanguage, robot actions are heterogeneous and often include continuous values\n-- such as joint positions, 2D pixel coordinates, and end-effector poses --\nwhich are not easily suited for language-based modeling. Based on this insight,\nwe introduce a straightforward enhancement: we extend causal transformers'\nsingle-token prediction to support predicting a variable number of tokens in a\nsingle step through our Chunking Causal Transformer (CCT). 
This enhancement\nenables robust performance across diverse tasks of various control frequencies,\ngreater efficiency by having fewer autoregression steps, and lead to a hybrid\naction sequence design by mixing different types of actions and using a\ndifferent chunk size for each action type. Based on CCT, we propose the\nAutoregressive Policy (ARP) architecture, which solves manipulation tasks by\ngenerating hybrid action sequences. We evaluate ARP across diverse robotic\nmanipulation environments, including Push-T, ALOHA, and RLBench, and show that\nARP, as a universal architecture, outperforms the environment-specific\nstate-of-the-art in all tested benchmarks, while being more efficient in\ncomputation and parameter sizes. Videos of our real robot demonstrations, all\nsource code and the pretrained models of ARP can be found at\nhttp://github.com/mlzxy/arp.\n","authors":["Xinyu Zhang","Yuhan Liu","Haonan Chang","Liam Schramm","Abdeslam Boularias"],"pdf_url":"https://arxiv.org/pdf/2410.03132v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20605v2","updated":"2024-11-18T01:47:56Z","published":"2024-05-31T03:39:26Z","title":"Searching for internal symbols underlying deep learning","summary":" Deep learning (DL) enables deep neural networks (DNNs) to automatically learn\ncomplex tasks or rules from given examples without instructions or guiding\nprinciples. As we do not engineer DNNs' functions, it is extremely difficult to\ndiagnose their decisions, and multiple lines of studies proposed to explain the\nprinciples of their operations. Notably, one line of studies suggests that DNNs\nmay learn concepts, the high level features that are recognizable to humans. In\nthis study, we extend this line of studies and hypothesize that DNNs can\ndevelop abstract codes that can be used to augment DNNs' decision-making. 
To\naddress this hypothesis, we combine foundation segmentation models and\nunsupervised learning to extract internal codes and identify potential use of\nabstract codes to make DL's decision-making more reliable and safer.\n","authors":["Jung H. Lee","Sujith Vijayan"],"pdf_url":"https://arxiv.org/pdf/2405.20605v2.pdf","comment":"16 pages, 10 figures, 5 tables and 1 supplementary table"},{"id":"http://arxiv.org/abs/2406.05339v3","updated":"2024-11-18T01:46:11Z","published":"2024-06-08T03:44:39Z","title":"To what extent can ASV systems naturally defend against spoofing\n attacks?","summary":" The current automatic speaker verification (ASV) task involves making binary\ndecisions on two types of trials: target and non-target. However, emerging\nadvancements in speech generation technology pose significant threats to the\nreliability of ASV systems. This study investigates whether ASV effortlessly\nacquires robustness against spoofing attacks (i.e., zero-shot capability) by\nsystematically exploring diverse ASV systems and spoofing attacks, ranging from\ntraditional to cutting-edge techniques. Through extensive analyses conducted on\neight distinct ASV systems and 29 spoofing attack systems, we demonstrate that\nthe evolution of ASV inherently incorporates defense mechanisms against\nspoofing attacks. 
Nevertheless, our findings also underscore that the\nadvancement of spoofing attacks far outpaces that of ASV systems, hence\nnecessitating further research on spoofing-robust ASV methodologies.\n","authors":["Jee-weon Jung","Xin Wang","Nicholas Evans","Shinji Watanabe","Hye-jin Shim","Hemlata Tak","Sidhhant Arora","Junichi Yamagishi","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2406.05339v3.pdf","comment":"5 pages, 3 figures, 3 tables, Interspeech 2024"},{"id":"http://arxiv.org/abs/2410.20772v2","updated":"2024-11-18T01:20:49Z","published":"2024-10-28T06:17:20Z","title":"Introducing Spectral Attention for Long-Range Dependency in Time Series\n Forecasting","summary":" Sequence modeling faces challenges in capturing long-range dependencies\nacross diverse tasks. Recent linear and transformer-based forecasters have\nshown superior performance in time series forecasting. However, they are\nconstrained by their inherent inability to effectively address long-range\ndependencies in time series data, primarily due to using fixed-size inputs for\nprediction. Furthermore, they typically sacrifice essential temporal\ncorrelation among consecutive training samples by shuffling them into\nmini-batches. To overcome these limitations, we introduce a fast and effective\nSpectral Attention mechanism, which preserves temporal correlations among\nsamples and facilitates the handling of long-range information while\nmaintaining the base model structure. Spectral Attention preserves long-period\ntrends through a low-pass filter and facilitates gradient to flow between\nsamples. Spectral Attention can be seamlessly integrated into most sequence\nmodels, allowing models with fixed-sized look-back windows to capture\nlong-range dependencies over thousands of steps. 
Through extensive experiments\non 11 real-world time series datasets using 7 recent forecasting models, we\nconsistently demonstrate the efficacy of our Spectral Attention mechanism,\nachieving state-of-the-art results.\n","authors":["Bong Gyun Kang","Dongjun Lee","HyunGi Kim","DoHyun Chung","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2410.20772v2.pdf","comment":"Co-first Author: Bong Gyun Kang, Dongjun Lee. Accepted to NeurIPS\n 2024"},{"id":"http://arxiv.org/abs/2411.11217v1","updated":"2024-11-18T01:06:12Z","published":"2024-11-18T01:06:12Z","title":"MoE-Lightning: High-Throughput MoE Inference on Memory-constrained GPUs","summary":" Efficient deployment of large language models, particularly Mixture of\nExperts (MoE), on resource-constrained platforms presents significant\nchallenges, especially in terms of computational efficiency and memory\nutilization. The MoE architecture, renowned for its ability to increase model\ncapacity without a proportional increase in inference cost, greatly reduces the\ntoken generation latency compared with dense models. However, the large model\nsize makes MoE models inaccessible to individuals without high-end GPUs. In\nthis paper, we propose a high-throughput MoE batch inference system, that\nsignificantly outperforms past work. MoE-Lightning introduces a novel\nCPU-GPU-I/O pipelining schedule, CGOPipe, with paged weights to achieve high\nresource utilization, and a performance model, HRM, based on a Hierarchical\nRoofline Model we introduce to help find policies with higher throughput than\nexisting systems. MoE-Lightning can achieve up to 10.3x higher throughput than\nstate-of-the-art offloading-enabled LLM inference systems for Mixtral 8x7B on a\nsingle T4 GPU (16GB). When the theoretical system throughput is bounded by the\nGPU memory, MoE-Lightning can reach the throughput upper bound with 2-3x less\nCPU memory, significantly increasing resource utilization. 
MoE-Lightning also\nsupports efficient batch inference for much larger MoEs (e.g., Mixtral 8x22B\nand DBRX) on multiple low-cost GPUs (e.g., 2-4 T4).\n","authors":["Shiyi Cao","Shu Liu","Tyler Griggs","Peter Schafhalter","Xiaoxuan Liu","Ying Sheng","Joseph E. Gonzalez","Matei Zaharia","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2411.11217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11213v1","updated":"2024-11-18T00:46:38Z","published":"2024-11-18T00:46:38Z","title":"Making Sigmoid-MSE Great Again: Output Reset Challenges Softmax\n Cross-Entropy in Neural Network Classification","summary":" This study presents a comparative analysis of two objective functions, Mean\nSquared Error (MSE) and Softmax Cross-Entropy (SCE) for neural network\nclassification tasks. While SCE combined with softmax activation is the\nconventional choice for transforming network outputs into class probabilities,\nwe explore an alternative approach using MSE with sigmoid activation. We\nintroduce the Output Reset algorithm, which reduces inconsistent errors and\nenhances classifier robustness. Through extensive experiments on benchmark\ndatasets (MNIST, CIFAR-10, and Fashion-MNIST), we demonstrate that MSE with\nsigmoid activation achieves comparable accuracy and convergence rates to SCE,\nwhile exhibiting superior performance in scenarios with noisy data. 
Our\nfindings indicate that MSE, despite its traditional association with regression\ntasks, serves as a viable alternative for classification problems, challenging\nconventional wisdom about neural network training strategies.\n","authors":["Kanishka Tyagi","Chinmay Rane","Ketaki Vaidya","Jeshwanth Challgundla","Soumitro Swapan Auddy","Michael Manry"],"pdf_url":"https://arxiv.org/pdf/2411.11213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07987v3","updated":"2024-11-18T00:21:40Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. 
Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions. All\nthe code, models, demo and organized data have been open sourced on our Github\nRepo.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v3.pdf","comment":"Camera Ready Version. Project Page:\n https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data:\n https://github.com/liming-ai/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2411.12128v1","updated":"2024-11-18T23:58:24Z","published":"2024-11-18T23:58:24Z","title":"The Role of Accuracy and Validation Effectiveness in Conversational\n Business Analytics","summary":" This study examines conversational business analytics, an approach that\nutilizes AI to address the technical competency gaps that hindered end users\nfrom effectively using traditional self-service analytics. By facilitating\nnatural language interactions, conversational business analytics aims to enable\nend users to independently retrieve data and generate insights. The analysis\nfocuses on Text-to-SQL as a representative technology for translating natural\nlanguage requests into SQL statements. Using models grounded in expected\nutility theory, the study identifies conditions under which conversational\nbusiness analytics, through partial or full support, can outperform delegation\nto human experts. The results indicate that partial support, which focuses\nsolely on information generation by AI, is viable when the accuracy of\nAI-generated SQL queries exceeds a defined threshold. 
In contrast, full support\nincludes not only information generation but also validation through\nexplanations provided by the AI, and requires sufficiently high validation\neffectiveness to be reliable. However, user-based validation presents\nchallenges, such as misjudgment and rejection of valid SQL queries, which may\nlimit the effectiveness of conversational business analytics. These challenges\nunderscore the need for robust validation mechanisms, including improved user\nsupport, automated processes, and methods for assessing quality independently\nof end users' technical competencies.\n","authors":["Adem Alparslan"],"pdf_url":"https://arxiv.org/pdf/2411.12128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05589v4","updated":"2024-11-18T23:21:50Z","published":"2024-03-04T17:44:18Z","title":"Ergonomic Design of Computer Laboratory Furniture: Mismatch Analysis\n Utilizing Anthropometric Data of University Students","summary":" Many studies have shown how ergonomically designed furniture improves\nproductivity and well-being. As computers have become a part of students'\nacademic lives, they will grow further in the future. We propose\nanthropometric-based furniture dimensions suitable for university students to\nimprove computer laboratory ergonomics. We collected data from 380 participants\nand analyzed 11 anthropometric measurements, correlating them to 11 furniture\ndimensions. Two types of furniture were studied: a non-adjustable chair with a\nnon-adjustable table and an adjustable chair with a non-adjustable table. The\nmismatch calculation showed a significant difference between furniture\ndimensions and anthropometric measurements. The one-way ANOVA test with a\nsignificance level of 5% also showed a significant difference between proposed\nand existing furniture dimensions. The proposed dimensions were found to be\nmore compatible and reduced mismatch percentages for both males and females\ncompared to existing furniture. 
The proposed dimensions of the furniture set\nwith adjustable seat height showed slightly improved results compared to the\nnon-adjustable furniture set. This suggests that the proposed dimensions can\nimprove comfort levels and reduce the risk of musculoskeletal disorders among\nstudents. Further studies on the implementation and long-term effects of these\nproposed dimensions in real-world computer laboratory settings are recommended.\n","authors":["Anik Kumar Saha","Md Abrar Jahin","Md. Rafiquzzaman","M. F. Mridha"],"pdf_url":"https://arxiv.org/pdf/2403.05589v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04685v3","updated":"2024-11-18T23:15:36Z","published":"2024-11-07T13:14:52Z","title":"Solving Generalized Grouping Problems in Cellular Manufacturing Systems\n Using a Network Flow Model","summary":" This paper focuses on the generalized grouping problem in the context of\ncellular manufacturing systems (CMS), where parts may have more than one\nprocess route. A process route lists the machines corresponding to each part of\nthe operation. Inspired by the extensive and widespread use of network flow\nalgorithms, this research formulates the process route family formation for\ngeneralized grouping as a unit capacity minimum cost network flow model. The\nobjective is to minimize dissimilarity (based on the machines required) among\nthe process routes within a family. The proposed model optimally solves the\nprocess route family formation problem without pre-specifying the number of\npart families to be formed. The process route of family formation is the first\nstage in a hierarchical procedure. For the second stage (machine cell\nformation), two procedures, a quadratic assignment programming (QAP)\nformulation, and a heuristic procedure, are proposed. The QAP simultaneously\nassigns process route families and machines to a pre-specified number of cells\nin such a way that total machine utilization is maximized. 
The heuristic\nprocedure for machine cell formation is hierarchical in nature. Computational\nresults for some test problems show that the QAP and the heuristic procedure\nyield the same results.\n","authors":["Md. Kutub Uddin","Md. Saiful Islam","Md Abrar Jahin","Md. Saiful Islam Seam","M. F. Mridha"],"pdf_url":"https://arxiv.org/pdf/2411.04685v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01601v3","updated":"2024-11-18T23:06:46Z","published":"2024-05-21T14:42:18Z","title":"Backpropagation-Free Multi-modal On-Device Model Adaptation via\n Cloud-Device Collaboration","summary":" In our increasingly interconnected world, where intelligent devices\ncontinually amass copious personalized multi-modal data, a pressing need arises\nto deliver high-quality, personalized device-aware services. However, this\nendeavor presents a multifaceted challenge to prevailing artificial\nintelligence (AI) systems primarily rooted in the cloud. As these systems\ngrapple with shifting data distributions between the cloud and devices, the\ntraditional approach of fine-tuning-based adaptation (FTA) exists the following\nissues: the costly and time-consuming data annotation required by FTA and the\nlooming risk of model overfitting. To surmount these challenges, we introduce a\nUniversal On-Device Multi-modal Model Adaptation Framework, revolutionizing\non-device model adaptation by striking a balance between efficiency and\neffectiveness. The framework features the Fast Domain Adaptor (FDA) hosted in\nthe cloud, providing tailored parameters for the Lightweight Multi-modal Model\non devices. To enhance adaptability across multi-modal tasks, the AnchorFrame\nDistribution Reasoner (ADR) minimizes communication costs. Our contributions,\nencapsulated in the Cloud-Device Collaboration Multi-modal Parameter Generation\n(CDC-MMPG) framework, represent a pioneering solution for on-Device Multi-modal\nModel Adaptation (DMMA). 
Extensive experiments validate the efficiency and\neffectiveness of our method, particularly in video question answering and\nretrieval tasks, driving forward the integration of intelligent devices into\nour daily lives.\n","authors":["Wei Ji","Li Li","Zheqi Lv","Wenqiao Zhang","Mengze Li","Zhen Wan","Wenqiang Lei","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2406.01601v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12115v1","updated":"2024-11-18T22:51:44Z","published":"2024-11-18T22:51:44Z","title":"Distill the Best, Ignore the Rest: Improving Dataset Distillation with\n Loss-Value-Based Pruning","summary":" Dataset distillation has gained significant interest in recent years, yet\nexisting approaches typically distill from the entire dataset, potentially\nincluding non-beneficial samples. We introduce a novel \"Prune First, Distill\nAfter\" framework that systematically prunes datasets via loss-based sampling\nprior to distillation. By leveraging pruning before classical distillation\ntechniques and generative priors, we create a representative core-set that\nleads to enhanced generalization for unseen architectures - a significant\nchallenge of current distillation methods. More specifically, our proposed\nframework significantly boosts distilled quality, achieving up to a 5.2\npercentage points accuracy increase even with substantial dataset pruning,\ni.e., removing 80% of the original dataset prior to distillation. Overall, our\nexperimental results highlight the advantages of our easy-sample prioritization\nand cross-architecture robustness, paving the way for more effective and\nhigh-quality dataset distillation.\n","authors":["Brian B. Moser","Federico Raue","Tobias C. 
Nauen","Stanislav Frolov","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03272v4","updated":"2024-11-18T22:05:04Z","published":"2023-10-05T02:58:29Z","title":"T-GAE: Transferable Graph Autoencoder for Network Alignment","summary":" Network alignment is the task of establishing one-to-one correspondences\nbetween the nodes of different graphs. Although finding a plethora of\napplications in high-impact domains, this task is known to be NP-hard in its\ngeneral form. Existing optimization algorithms do not scale up as the size of\nthe graphs increases. While being able to reduce the matching complexity,\ncurrent GNN approaches fit a deep neural network on each graph and requires\nre-train on unseen samples, which is time and memory inefficient. To tackle\nboth challenges we propose T-GAE, a transferable graph autoencoder framework\nthat leverages transferability and stability of GNNs to achieve efficient\nnetwork alignment on out-of-distribution graphs without retraining. We prove\nthat GNN-generated embeddings can achieve more accurate alignment compared to\nclassical spectral methods. Our experiments on real-world benchmarks\ndemonstrate that T-GAE outperforms the state-of-the-art optimization method and\nthe best GNN approach by up to 38.7% and 50.8%, respectively, while being able\nto reduce 90% of the training time when matching out-of-distribution large\nscale networks. We conduct ablation studies to highlight the effectiveness of\nthe proposed encoder architecture and training objective in enhancing the\nexpressiveness of GNNs to match perturbed graphs. T-GAE is also proved to be\nflexible to utilize matching algorithms of different complexities. Our code is\navailable at https://github.com/Jason-Tree/T-GAE.\n","authors":["Jiashu He","Charilaos I. 
Kanatsoulis","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2310.03272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12073v1","updated":"2024-11-18T21:34:05Z","published":"2024-11-18T21:34:05Z","title":"Just Leaf It: Accelerating Diffusion Classifiers with Hierarchical Class\n Pruning","summary":" Diffusion models, known for their generative capabilities, have recently\nshown unexpected potential in image classification tasks by using Bayes'\ntheorem. However, most diffusion classifiers require evaluating all class\nlabels for a single classification, leading to significant computational costs\nthat can hinder their application in large-scale scenarios. To address this, we\npresent a Hierarchical Diffusion Classifier (HDC) that exploits the inherent\nhierarchical label structure of a dataset. By progressively pruning irrelevant\nhigh-level categories and refining predictions only within relevant\nsubcategories, i.e., leaf nodes, HDC reduces the total number of class\nevaluations. As a result, HDC can accelerate inference by up to 60% while\nmaintaining and, in some cases, improving classification accuracy. Our work\nenables a new control mechanism of the trade-off between speed and precision,\nmaking diffusion-based classification more viable for real-world applications,\nparticularly in large-scale image classification tasks.\n","authors":["Arundhati S. Shanbhag","Brian B. Moser","Tobias C. Nauen","Stanislav Frolov","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12072v1","updated":"2024-11-18T21:32:49Z","published":"2024-11-18T21:32:49Z","title":"Zoomed In, Diffused Out: Towards Local Degradation-Aware Multi-Diffusion\n for Extreme Image Super-Resolution","summary":" Large-scale, pre-trained Text-to-Image (T2I) diffusion models have gained\nsignificant popularity in image generation tasks and have shown unexpected\npotential in image Super-Resolution (SR). 
However, most existing T2I diffusion\nmodels are trained with a resolution limit of 512x512, making scaling beyond\nthis resolution an unresolved but necessary challenge for image SR. In this\nwork, we introduce a novel approach that, for the first time, enables these\nmodels to generate 2K, 4K, and even 8K images without any additional training.\nOur method leverages MultiDiffusion, which distributes the generation across\nmultiple diffusion paths to ensure global coherence at larger scales, and local\ndegradation-aware prompt extraction, which guides the T2I model to reconstruct\nfine local structures according to its low-resolution input. These innovations\nunlock higher resolutions, allowing T2I diffusion models to be applied to image\nSR tasks without limitation on resolution.\n","authors":["Brian B. Moser","Stanislav Frolov","Tobias C. Nauen","Federico Raue","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2411.12072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17110v3","updated":"2024-11-18T21:26:10Z","published":"2023-03-30T02:51:00Z","title":"Contextual Combinatorial Bandits with Probabilistically Triggered Arms","summary":" We study contextual combinatorial bandits with probabilistically triggered\narms (C$^2$MAB-T) under a variety of smoothness conditions that capture a wide\nrange of applications, such as contextual cascading bandits and contextual\ninfluence maximization bandits. Under the triggering probability modulated\n(TPM) condition, we devise the C$^2$-UCB-T algorithm and propose a novel\nanalysis that achieves an $\\tilde{O}(d\\sqrt{KT})$ regret bound, removing a\npotentially exponentially large factor $O(1/p_{\\min})$, where $d$ is the\ndimension of contexts, $p_{\\min}$ is the minimum positive probability that any\narm can be triggered, and batch-size $K$ is the maximum number of arms that can\nbe triggered per round. 
Under the variance modulated (VM) or triggering\nprobability and variance modulated (TPVM) conditions, we propose a new\nvariance-adaptive algorithm VAC$^2$-UCB and derive a regret bound\n$\\tilde{O}(d\\sqrt{T})$, which is independent of the batch-size $K$. As a\nvaluable by-product, our analysis technique and variance-adaptive algorithm can\nbe applied to the CMAB-T and C$^2$MAB setting, improving existing results there\nas well. We also include experiments that demonstrate the improved performance\nof our algorithms compared with benchmark algorithms on synthetic and\nreal-world datasets.\n","authors":["Xutong Liu","Jinhang Zuo","Siwei Wang","John C. S. Lui","Mohammad Hajiesmaili","Adam Wierman","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2303.17110v3.pdf","comment":"The 40th International Conference on Machine Learning (ICML), 2023"},{"id":"http://arxiv.org/abs/2411.12064v1","updated":"2024-11-18T21:10:14Z","published":"2024-11-18T21:10:14Z","title":"TSPRank: Bridging Pairwise and Listwise Methods with a Bilinear\n Travelling Salesman Model","summary":" Traditional Learning-To-Rank (LETOR) approaches, including pairwise methods\nlike RankNet and LambdaMART, often fall short by solely focusing on pairwise\ncomparisons, leading to sub-optimal global rankings. Conversely, deep learning\nbased listwise methods, while aiming to optimise entire lists, require complex\ntuning and yield only marginal improvements over robust pairwise models. To\novercome these limitations, we introduce Travelling Salesman Problem Rank\n(TSPRank), a hybrid pairwise-listwise ranking method. TSPRank reframes the\nranking problem as a Travelling Salesman Problem (TSP), a well-known\ncombinatorial optimisation challenge that has been extensively studied for its\nnumerous solution algorithms and applications. This approach enables the\nmodelling of pairwise relationships and leverages combinatorial optimisation to\ndetermine the listwise ranking. 
This approach can be directly integrated as an\nadditional component into embeddings generated by existing backbone models to\nenhance ranking performance. Our extensive experiments across three backbone\nmodels on diverse tasks, including stock ranking, information retrieval, and\nhistorical events ordering, demonstrate that TSPRank significantly outperforms\nboth pure pairwise and listwise methods. Our qualitative analysis reveals that\nTSPRank's main advantage over existing methods is its ability to harness global\ninformation better while ranking. TSPRank's robustness and superior performance\nacross different domains highlight its potential as a versatile and effective\nLETOR solution. The code and preprocessed data are available at\nhttps://github.com/waylonli/TSPRank-KDD2025.\n","authors":["Weixian Waylon Li","Yftah Ziser","Yifei Xie","Shay B. Cohen","Tiejun Ma"],"pdf_url":"https://arxiv.org/pdf/2411.12064v1.pdf","comment":"Accepted to ACM SIGKDD 2025 Research Track"},{"id":"http://arxiv.org/abs/2411.12056v1","updated":"2024-11-18T20:54:17Z","published":"2024-11-18T20:54:17Z","title":"Benchmarking pre-trained text embedding models in aligning built asset\n information","summary":" Accurate mapping of the built asset information to established data\nclassification systems and taxonomies is crucial for effective asset\nmanagement, whether for compliance at project handover or ad-hoc data\nintegration scenarios. Due to the complex nature of built asset data, which\npredominantly comprises technical text elements, this process remains largely\nmanual and reliant on domain expert input. Recent breakthroughs in contextual\ntext representation learning (text embedding), particularly through pre-trained\nlarge language models, offer promising approaches that can facilitate the\nautomation of cross-mapping of the built asset data. 
However, no comprehensive\nevaluation has yet been conducted to assess these models' ability to\neffectively represent the complex semantics specific to built asset technical\nterminology. This study presents a comparative benchmark of state-of-the-art\ntext embedding models to evaluate their effectiveness in aligning built asset\ninformation with domain-specific technical concepts. Our proposed datasets are\nderived from two renowned built asset data classification dictionaries. The\nresults of our benchmarking across six proposed datasets, covering three tasks\nof clustering, retrieval, and reranking, highlight the need for future research\non domain adaptation techniques. The benchmarking resources are published as an\nopen-source library, which will be maintained and extended to support future\nevaluations in this field.\n","authors":["Mehrzad Shahinmoghadam","Ali Motamedi"],"pdf_url":"https://arxiv.org/pdf/2411.12056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15384v2","updated":"2024-11-18T20:54:06Z","published":"2024-02-23T15:30:57Z","title":"Homeostatic motion planning with innate physics knowledge","summary":" Living organisms interact with their surroundings in a closed-loop fashion,\nwhere sensory inputs dictate the initiation and termination of behaviours. Even\nsimple animals are able to develop and execute complex plans, which has not yet\nbeen replicated in robotics using pure closed-loop input control. We propose a\nsolution to this problem by defining a set of discrete and temporary\nclosed-loop controllers, called \"tasks\", each representing a closed-loop\nbehaviour. We further introduce a supervisory module which has an innate\nunderstanding of physics and causality, through which it can simulate the\nexecution of task sequences over time and store the results in a model of the\nenvironment. On the basis of this model, plans can be made by chaining\ntemporary closed-loop controllers. 
The proposed framework was implemented for a\nreal robot and tested in two scenarios as proof of concept.\n","authors":["Giulia Lafratta","Bernd Porr","Christopher Chandler","Alice Miller"],"pdf_url":"https://arxiv.org/pdf/2402.15384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12045v1","updated":"2024-11-18T20:32:31Z","published":"2024-11-18T20:32:31Z","title":"Fingerprinting and Tracing Shadows: The Development and Impact of\n Browser Fingerprinting on Digital Privacy","summary":" Browser fingerprinting is a growing technique for identifying and tracking\nusers online without traditional methods like cookies. This paper gives an\noverview by examining the various fingerprinting techniques and analyzes the\nentropy and uniqueness of the collected data. The analysis highlights that\nbrowser fingerprinting poses a complex challenge from both technical and\nprivacy perspectives, as users often have no control over the collection and\nuse of their data. In addition, it raises significant privacy concerns as users\nare often tracked without their knowledge or consent.\n","authors":["Alexander Lawall"],"pdf_url":"https://arxiv.org/pdf/2411.12045v1.pdf","comment":"SECURWARE 2024, France, Nice"},{"id":"http://arxiv.org/abs/2408.13379v2","updated":"2024-11-18T20:30:20Z","published":"2024-08-23T21:25:16Z","title":"N-DriverMotion: Driver motion learning and prediction using an\n event-based camera and directly trained spiking neural networks on Loihi 2","summary":" Driver motion recognition is a principal factor in ensuring the safety of\ndriving systems. This paper presents a novel system for learning and predicting\ndriver motions and an event-based high-resolution (1280x720) dataset,\nN-DriverMotion, newly collected to train on a neuromorphic vision system. 
The\nsystem comprises an event-based camera that generates the first high-resolution\ndriver motion dataset representing spike inputs and efficient spiking neural\nnetworks (SNNs) that are effective in training and predicting the driver's\ngestures. The event dataset consists of 13 driver motion categories classified\nby direction (front, side), illumination (bright, moderate, dark), and\nparticipant. A novel simplified four-layer convolutional spiking neural network\n(CSNN) that we proposed was directly trained using the high-resolution dataset\nwithout any time-consuming preprocessing. This enables efficient adaptation to\non-device SNNs for real-time inference on high-resolution event-based streams.\nCompared with recent gesture recognition systems adopting neural networks for\nvision processing, the proposed neuromorphic vision system achieves comparable\naccuracy, 94.04\\%, in recognizing driver motions with the CSNN architecture.\nOur proposed CSNN and the dataset can be used to develop safer and more\nefficient driver monitoring systems for autonomous vehicles or edge devices\nrequiring an efficient neural network architecture.\n","authors":["Hyo Jong Chung","Byungkon Kang","Yoonseok Yang"],"pdf_url":"https://arxiv.org/pdf/2408.13379v2.pdf","comment":"Accepted for publication in IEEE Open Journal of Vehicular Technology\n (OJVT) on 18 November 2024"},{"id":"http://arxiv.org/abs/2411.12042v1","updated":"2024-11-18T20:27:13Z","published":"2024-11-18T20:27:13Z","title":"Fast Convergence of Softmax Policy Mirror Ascent","summary":" Natural policy gradient (NPG) is a common policy optimization algorithm and\ncan be viewed as mirror ascent in the space of probabilities. Recently, Vaswani\net al. [2021] introduced a policy gradient method that corresponds to mirror\nascent in the dual space of logits. We refine this algorithm, removing its need\nfor a normalization across actions and analyze the resulting method (referred\nto as SPMA). 
For tabular MDPs, we prove that SPMA with a constant step-size\nmatches the linear convergence of NPG and achieves a faster convergence than\nconstant step-size (accelerated) softmax policy gradient. To handle large\nstate-action spaces, we extend SPMA to use a log-linear policy\nparameterization. Unlike that for NPG, generalizing SPMA to the linear function\napproximation (FA) setting does not require compatible function approximation.\nUnlike MDPO, a practical generalization of NPG, SPMA with linear FA only\nrequires solving convex softmax classification problems. We prove that SPMA\nachieves linear convergence to the neighbourhood of the optimal value function.\nWe extend SPMA to handle non-linear FA and evaluate its empirical performance\non the MuJoCo and Atari benchmarks. Our results demonstrate that SPMA\nconsistently achieves similar or better performance compared to MDPO, PPO and\nTRPO.\n","authors":["Reza Asad","Reza Babanezhad","Issam Laradji","Nicolas Le Roux","Sharan Vaswani"],"pdf_url":"https://arxiv.org/pdf/2411.12042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14634v2","updated":"2024-11-18T20:25:38Z","published":"2024-09-23T00:09:34Z","title":"Scideator: Human-LLM Scientific Idea Generation Grounded in\n Research-Paper Facet Recombination","summary":" The scientific ideation process often involves blending salient aspects of\nexisting papers to create new ideas. To see if large language models (LLMs) can\nassist this process, we contribute Scideator, a novel mixed-initiative tool for\nscientific ideation. Starting from a user-provided set of papers, Scideator\nextracts key facets (purposes, mechanisms, and evaluations) from these and\nrelevant papers, allowing users to explore the idea space by interactively\nrecombining facets to synthesize inventive ideas. Scideator also helps users to\ngauge idea novelty by searching the literature for potential overlaps and\nshowing automated novelty assessments and explanations. 
To support these tasks,\nScideator introduces four LLM-powered retrieval-augmented generation (RAG)\nmodules: Analogous Paper Facet Finder, Faceted Idea Generator, Idea Novelty\nChecker, and Idea Novelty Iterator. In a within-subjects user study, 19\ncomputer-science researchers identified significantly more interesting ideas\nusing Scideator compared to a strong baseline combining a scientific search\nengine with LLM interaction.\n","authors":["Marissa Radensky","Simra Shahid","Raymond Fok","Pao Siangliulue","Tom Hope","Daniel S. Weld"],"pdf_url":"https://arxiv.org/pdf/2409.14634v2.pdf","comment":"Revised TextGRAD results after noting inaccuracies in their reporting"},{"id":"http://arxiv.org/abs/2411.12038v1","updated":"2024-11-18T20:19:49Z","published":"2024-11-18T20:19:49Z","title":"Scaling Deep Learning Research with Kubernetes on the NRP Nautilus\n HyperCluster","summary":" Throughout the scientific computing space, deep learning algorithms have\nshown excellent performance in a wide range of applications. As these deep\nneural networks (DNNs) continue to mature, the necessary compute required to\ntrain them has continued to grow. Today, modern DNNs require millions of FLOPs\nand days to weeks of training to generate a well-trained model. The training\ntimes required for DNNs are oftentimes a bottleneck in DNN research for a\nvariety of deep learning applications, and as such, accelerating and scaling\nDNN training enables more robust and accelerated research. To that end, in this\nwork, we explore utilizing the NRP Nautilus HyperCluster to automate and scale\ndeep learning model training for three separate applications of DNNs, including\noverhead object detection, burned area segmentation, and deforestation\ndetection. In total, 234 deep neural models are trained on Nautilus, for a\ntotal time of 4,040 hours\n","authors":["J. Alex Hurt","Anes Ouadou","Mariam Alshehri","Grant J. 
Scott"],"pdf_url":"https://arxiv.org/pdf/2411.12038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05981v4","updated":"2024-11-18T20:18:32Z","published":"2024-06-10T02:47:55Z","title":"ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training\n Multiplication-Less Reparameterization","summary":" Large language models (LLMs) have shown impressive performance on language\ntasks but face challenges when deployed on resource-constrained devices due to\ntheir extensive parameters and reliance on dense multiplications, resulting in\nhigh memory demands and latency bottlenecks. Shift-and-add reparameterization\noffers a promising solution by replacing costly multiplications with\nhardware-friendly primitives in both the attention and multi-layer perceptron\n(MLP) layers of an LLM. However, current reparameterization techniques require\ntraining from scratch or full parameter fine-tuning to restore accuracy, which\nis resource-intensive for LLMs. To address this, we propose accelerating\npretrained LLMs through post-training shift-and-add reparameterization,\ncreating efficient multiplication-free models, dubbed ShiftAddLLM.\nSpecifically, we quantize each weight matrix into binary matrices paired with\ngroup-wise scaling factors. The associated multiplications are reparameterized\ninto (1) shifts between activations and scaling factors and (2) queries and\nadds according to the binary matrices. To reduce accuracy loss, we present a\nmulti-objective optimization method to minimize both weight and output\nactivation reparameterization errors. Additionally, based on varying\nsensitivity across layers to reparameterization, we develop an automated bit\nallocation strategy to further reduce memory usage and latency. 
Experiments on\nfive LLM families and eight tasks consistently validate the effectiveness of\nShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points\nat comparable or lower latency compared to the most competitive quantized LLMs\nat 3 and 2 bits, respectively, and more than 80% memory and energy reductions\nover the original LLMs. Codes and models are available at\nhttps://github.com/GATECH-EIC/ShiftAddLLM.\n","authors":["Haoran You","Yipin Guo","Yichao Fu","Wei Zhou","Huihong Shi","Xiaofan Zhang","Souvik Kundu","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.05981v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.12019v1","updated":"2024-11-18T20:01:45Z","published":"2024-11-18T20:01:45Z","title":"Regret-Free Reinforcement Learning for LTL Specifications","summary":" Reinforcement learning (RL) is a promising method to learn optimal control\npolicies for systems with unknown dynamics. In particular, synthesizing\ncontrollers for safety-critical systems based on high-level specifications,\nsuch as those expressed in temporal languages like linear temporal logic (LTL),\npresents a significant challenge in control systems research. Current RL-based\nmethods designed for LTL tasks typically offer only asymptotic guarantees,\nwhich provide no insight into the transient performance during the learning\nphase. While running an RL algorithm, it is crucial to assess how close we are\nto achieving optimal behavior if we stop learning.\n In this paper, we present the first regret-free online algorithm for learning\na controller that addresses the general class of LTL specifications over Markov\ndecision processes (MDPs) with a finite set of states and actions. We begin by\nproposing a regret-free learning algorithm to solve infinite-horizon\nreach-avoid problems. 
For general LTL specifications, we show that the\nsynthesis problem can be reduced to a reach-avoid problem when the graph\nstructure is known. Additionally, we provide an algorithm for learning the\ngraph structure, assuming knowledge of a minimum transition probability, which\noperates independently of the main regret-free algorithm.\n","authors":["Rupak Majumdar","Mahmoud Salamati","Sadegh Soudjani"],"pdf_url":"https://arxiv.org/pdf/2411.12019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12000v1","updated":"2024-11-18T19:36:26Z","published":"2024-11-18T19:36:26Z","title":"ByteScience: Bridging Unstructured Scientific Literature and Structured\n Data with Auto Fine-tuned Large Language Model in Token Granularity","summary":" Natural Language Processing (NLP) is widely used to supply summarization\nability from long context to structured information. However, extracting\nstructured knowledge from scientific text by NLP models remains a challenge\nbecause of its domain-specific nature to complex data preprocessing and the\ngranularity of multi-layered device-level information. To address this, we\nintroduce ByteScience, a non-profit cloud-based auto fine-tuned Large Language\nModel (LLM) platform, which is designed to extract structured scientific data\nand synthesize new scientific knowledge from vast scientific corpora. The\nplatform capitalizes on DARWIN, an open-source, fine-tuned LLM dedicated to\nnatural science. The platform was built on Amazon Web Services (AWS) and\nprovides an automated, user-friendly workflow for custom model development and\ndata extraction. The platform achieves remarkable accuracy with only a small\namount of well-annotated articles. 
This innovative tool streamlines the\ntransition from the science literature to structured knowledge and data and\nbenefits the advancements in natural informatics.\n","authors":["Tong Xie","Hanzhi Zhang","Shaozhou Wang","Yuwei Wan","Imran Razzak","Chunyu Kit","Wenjie Zhangand Bram Hoex"],"pdf_url":"https://arxiv.org/pdf/2411.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11984v1","updated":"2024-11-18T19:14:36Z","published":"2024-11-18T19:14:36Z","title":"Understanding Chain-of-Thought in LLMs through Information Theory","summary":" Large Language Models (LLMs) have shown impressive performance in complex\nreasoning tasks through Chain-of-Thought (CoT) reasoning, allowing models to\nbreak down problems into manageable sub-tasks. However, existing CoT evaluation\ntechniques either require annotated CoT data or fall short in accurately\nassessing intermediate reasoning steps, leading to high rates of false\npositives. In this paper, we formalize CoT reasoning in LLMs through an\ninformation-theoretic lens. Specifically, our framework quantifies the\n`information gain' at each reasoning step, enabling the identification of\nfailure modes in LLMs without the need for expensive annotated datasets. 
We\ndemonstrate the efficacy of our approach through extensive experiments on toy\nand GSM-8K data, where it significantly outperforms existing outcome-based\nmethods by providing more accurate insights into model performance on\nindividual tasks.\n","authors":["Jean-Francois Ton","Muhammad Faaiz Taufiq","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11943v1","updated":"2024-11-18T18:37:09Z","published":"2024-11-18T18:37:09Z","title":"Medical Video Generation for Disease Progression Simulation","summary":" Modeling disease progression is crucial for improving the quality and\nefficacy of clinical diagnosis and prognosis, but it is often hindered by a\nlack of longitudinal medical image monitoring for individual patients. To\naddress this challenge, we propose the first Medical Video Generation (MVG)\nframework that enables controlled manipulation of disease-related image and\nvideo features, allowing precise, realistic, and personalized simulations of\ndisease progression. Our approach begins by leveraging large language models\n(LLMs) to recaption prompt for disease trajectory. Next, a controllable\nmulti-round diffusion model simulates the disease progression state for each\npatient, creating realistic intermediate disease state sequence. Finally, a\ndiffusion-based video transition generation model interpolates disease\nprogression between these states. We validate our framework across three\nmedical imaging domains: chest X-ray, fundus photography, and skin image. Our\nresults demonstrate that MVG significantly outperforms baseline models in\ngenerating coherent and clinically plausible disease trajectories. Two user\nstudies by veteran physicians, provide further validation and insights into the\nclinical utility of the generated sequences. 
MVG has the potential to assist\nhealthcare providers in modeling disease trajectories, interpolating missing\nmedical image data, and enhancing medical education through realistic, dynamic\nvisualizations of disease progression.\n","authors":["Xu Cao","Kaizhao Liang","Kuei-Da Liao","Tianren Gao","Wenqian Ye","Jintai Chen","Zhiguang Ding","Jianguo Cao","James M. Rehg","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.11943v1.pdf","comment":"Tech Report. The appendix will release soon. arXiv admin note: text\n overlap with arXiv:2309.11745"},{"id":"http://arxiv.org/abs/2208.14837v3","updated":"2024-11-18T17:30:05Z","published":"2022-08-31T13:09:39Z","title":"Batch-Size Independent Regret Bounds for Combinatorial Semi-Bandits with\n Probabilistically Triggered Arms or Independent Arms","summary":" In this paper, we study the combinatorial semi-bandits (CMAB) and focus on\nreducing the dependency of the batch-size $K$ in the regret bound, where $K$ is\nthe total number of arms that can be pulled or triggered in each round. First,\nfor the setting of CMAB with probabilistically triggered arms (CMAB-T), we\ndiscover a novel (directional) triggering probability and variance modulated\n(TPVM) condition that can replace the previously-used smoothness condition for\nvarious applications, such as cascading bandits, online network exploration and\nonline influence maximization. Under this new condition, we propose a BCUCB-T\nalgorithm with variance-aware confidence intervals and conduct regret analysis\nwhich reduces the $O(K)$ factor to $O(\\log K)$ or $O(\\log^2 K)$ in the regret\nbound, significantly improving the regret bounds for the above applications.\nSecond, for the setting of non-triggering CMAB with independent arms, we\npropose a SESCB algorithm which leverages on the non-triggering version of the\nTPVM condition and completely removes the dependency on $K$ in the leading\nregret. 
As a valuable by-product, the regret analysis used in this paper can\nimprove several existing results by a factor of $O(\\log K)$. Finally,\nexperimental evaluations show our superior performance compared with benchmark\nalgorithms in different applications.\n","authors":["Xutong Liu","Jinhang Zuo","Siwei Wang","Carlee Joe-Wong","John C. S. Lui","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2208.14837v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11942v1","updated":"2024-11-18T17:15:35Z","published":"2024-11-18T17:15:35Z","title":"Variable Rate Neural Compression for Sparse Detector Data","summary":" High-energy large-scale particle colliders generate data at extraordinary\nrates. Developing real-time high-throughput data compression algorithms to\nreduce data volume and meet the bandwidth requirement for storage has become\nincreasingly critical. Deep learning is a promising technology that can address\nthis challenging topic. At the newly constructed sPHENIX experiment at the\nRelativistic Heavy Ion Collider, a Time Projection Chamber (TPC) serves as the\nmain tracking detector, which records three-dimensional particle trajectories\nin a volume of a gas-filled cylinder. In terms of occupancy, the resulting data\nflow can be very sparse reaching $10^{-3}$ for proton-proton collisions. Such\nsparsity presents a challenge to conventional learning-free lossy compression\nalgorithms, such as SZ, ZFP, and MGARD. In contrast, emerging deep\nlearning-based models, particularly those utilizing convolutional neural\nnetworks for compression, have outperformed these conventional methods in terms\nof compression ratios and reconstruction accuracy. However, research on the\nefficacy of these deep learning models in handling sparse datasets, like those\nproduced in particle colliders, remains limited. Furthermore, most deep\nlearning models do not adapt their processing speeds to data sparsity, which\naffects efficiency. 
To address this issue, we propose a novel approach for TPC\ndata compression via key-point identification facilitated by sparse\nconvolution. Our proposed algorithm, BCAE-VS, achieves a $75\\%$ improvement in\nreconstruction accuracy with a $10\\%$ increase in compression ratio over the\nprevious state-of-the-art model. Additionally, BCAE-VS manages to achieve these\nresults with a model size over two orders of magnitude smaller. Lastly, we have\nexperimentally verified that as sparsity increases, so does the model's\nthroughput.\n","authors":["Yi Huang","Yeonju Go","Jin Huang","Shuhang Li","Xihaier Luo","Thomas Marshall","Joseph Osborn","Christopher Pinkenburg","Yihui Ren","Evgeny Shulga","Shinjae Yoo","Byung-Jun Yoon"],"pdf_url":"https://arxiv.org/pdf/2411.11942v1.pdf","comment":"37 pages, 12 figures, submitted to Journal of Computational Physics"}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.11843v1","updated":"2024-11-18T18:59:15Z","published":"2024-11-18T18:59:15Z","title":"Bi-Mamba: Towards Accurate 1-Bit State Space Models","summary":" The typical selective state-space model (SSM) of Mamba addresses several\nlimitations of Transformers, such as quadratic computational complexity with\nsequence length and significant inference-time memory requirements due to the\nkey-value cache. However, the growing size of Mamba models continues to pose\ntraining and deployment challenges and raises environmental concerns due to\nconsiderable energy consumption. In this work, we introduce Bi-Mamba, a\nscalable and powerful 1-bit Mamba architecture designed for more efficient\nlarge language models with multiple sizes across 780M, 1.3B, and 2.7B. Bi-Mamba\nmodels are trained from scratch on data volume as regular LLM pertaining using\nan autoregressive distillation loss. 
Extensive experimental results on language\nmodeling demonstrate that Bi-Mamba achieves performance comparable to its\nfull-precision counterparts (e.g., FP16 or BF16) and much better accuracy than\npost-training-binarization (PTB) Mamba baselines, while significantly reducing\nmemory footprint and energy consumption compared to the original Mamba model.\nOur study pioneers a new linear computational complexity LLM framework under\nlow-bit representation and facilitates the future design of specialized\nhardware tailored for efficient 1-bit Mamba-based LLMs.\n","authors":["Shengkun Tang","Liqun Ma","Haonan Li","Mingjie Sun","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2411.11843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11829v1","updated":"2024-11-18T18:48:13Z","published":"2024-11-18T18:48:13Z","title":"Tackling prediction tasks in relational databases with LLMs","summary":" Though large language models (LLMs) have demonstrated exceptional performance\nacross numerous problems, their application to predictive tasks in relational\ndatabases remains largely unexplored. In this work, we address the notion that\nLLMs cannot yield satisfactory results on relational databases due to their\ninterconnected tables, complex relationships, and heterogeneous data types.\nUsing the recently introduced RelBench benchmark, we demonstrate that even a\nstraightforward application of LLMs achieves competitive performance on these\ntasks. 
These findings establish LLMs as a promising new baseline for ML on\nrelational databases and encourage further research in this direction.\n","authors":["Marek Wydmuch","Łukasz Borchmann","Filip Graliński"],"pdf_url":"https://arxiv.org/pdf/2411.11829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22587v2","updated":"2024-11-18T18:42:44Z","published":"2024-10-29T23:00:05Z","title":"Toxicity of the Commons: Curating Open-Source Pre-Training Data","summary":" Open-source large language models are becoming increasingly available and\npopular among researchers and practitioners. While significant progress has\nbeen made on open-weight models, open training data is a practice yet to be\nadopted by the leading open-weight models creators. At the same time, there\nresearchers are working to make language models safer. We propose a data\ncuration pipeline to reduce harmful outputs by models trained on public domain\ndata. There are unique challenges to working with public domain data, as these\nsources differ from web text in both form and content. Many sources are\nhistorical documents and are the result of Optical Character Recognition (OCR).\nConsequently, current state-of-the-art approaches to toxicity filtering are\noften infeasible or inappropriate for open data models. In this paper, we\nintroduce a new fully open-source pipeline for open-data toxicity filtering.\nOur contributions are threefold. We create a custom training dataset,\nToxicCommons, which is composed of texts which have been classified across five\ndifferent dimensions (racial/origin-based, gender/sex-based, religious,\nability-based discrimination, and violence). We use this dataset to train a\ncustom classifier, Celadon, that can be used to detect toxic content in open\ndata more efficiently at a larger scale. 
Finally, we describe the balanced\napproach to content filtration that optimizes safety filtering with respect to\nthe filtered data available for training.\n","authors":["Catherine Arnett","Eliot Jones","Ivan P. Yamshchikov","Pierre-Carl Langlais"],"pdf_url":"https://arxiv.org/pdf/2410.22587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00024v2","updated":"2024-11-18T18:41:08Z","published":"2024-10-28T22:30:06Z","title":"A Perspective for Adapting Generalist AI to Specialized Medical AI\n Applications and Their Challenges","summary":" The integration of Large Language Models (LLMs) into medical applications has\nsparked widespread interest across the healthcare industry, from drug discovery\nand development to clinical decision support, assisting telemedicine, medical\ndevices, and healthcare insurance applications. This perspective paper aims to\ndiscuss the inner workings of building LLM-powered medical AI applications and\nintroduces a comprehensive framework for their development. We review existing\nliterature and outline the unique challenges of applying LLMs in specialized\nmedical contexts. Additionally, we introduce a three-step framework to organize\nmedical LLM research activities: 1) Modeling: breaking down complex medical\nworkflows into manageable steps for developing medical-specific models; 2)\nOptimization: optimizing the model performance with crafted prompts and\nintegrating external knowledge and tools, and 3) System engineering:\ndecomposing complex tasks into subtasks and leveraging human expertise for\nbuilding medical AI applications. Furthermore, we offer a detailed use case\nplaybook that describes various LLM-powered medical AI applications, such as\noptimizing clinical trial design, enhancing clinical decision support, and\nadvancing medical imaging analysis. 
Finally, we discuss various challenges and\nconsiderations for building medical AI applications with LLMs, such as handling\nhallucination issues, data ownership and compliance, privacy, intellectual\nproperty considerations, compute cost, sustainability issues, and responsible\nAI requirements.\n","authors":["Zifeng Wang","Hanyin Wang","Benjamin Danek","Ying Li","Christina Mack","Hoifung Poon","Yajuan Wang","Pranav Rajpurkar","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.00024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04254v2","updated":"2024-11-18T18:35:06Z","published":"2024-04-05T17:58:52Z","title":"Watermark-based Detection and Attribution of AI-Generated Content","summary":" Several companies have deployed watermark-based detection to identify\nAI-generated content. However, attribution--the ability to trace back to the\nuser of a generative AI (GenAI) service who created a given piece of\nAI-generated content--remains largely unexplored despite its growing\nimportance. In this work, we aim to bridge this gap by conducting the first\nsystematic study on watermark-based, user-level attribution of AI-generated\ncontent. Our key idea is to assign a unique watermark to each user of the GenAI\nservice and embed this watermark into the AI-generated content created by that\nuser. Attribution is then performed by identifying the user whose watermark\nbest matches the one extracted from the given content. This approach, however,\nfaces a key challenge: How should watermarks be selected for users to maximize\nattribution performance? To address the challenge, we first theoretically\nderive lower bounds on detection and attribution performance through rigorous\nprobabilistic analysis for any given set of user watermarks. Then, we select\nwatermarks for users to maximize these lower bounds, thereby optimizing\ndetection and attribution performance. 
Our theoretical and empirical results\nshow that watermark-based attribution inherits both the accuracy and\n(non-)robustness properties of the underlying watermark. Specifically,\nattribution remains highly accurate when the watermarked AI-generated content\nis either not post-processed or subjected to common post-processing such as\nJPEG compression, as well as black-box adversarial post-processing with limited\nquery budgets.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2404.04254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11770v1","updated":"2024-11-18T17:50:34Z","published":"2024-11-18T17:50:34Z","title":"CNMBert: A Model For Hanyu Pinyin Abbreviation to Character Conversion\n Task","summary":" The task of converting Hanyu Pinyin abbreviations to Chinese characters\nrepresents a significant branch within the domain of Chinese Spelling\nCorrection (CSC). This task is typically one of text-length alignment, however,\ndue to the limited informational content in pinyin abbreviations, achieving\naccurate conversion is challenging. In this paper, we propose CNMBert which\nstands for zh-CN Pinyin Multi-mask Bert Model as a solution to this issue.\nCNMBert surpasses few-shot GPT models, achieving a 59.63% MRR on a\n10,424-sample Hanyu Pinyin abbreviation test dataset.\n","authors":["Zishuo Feng","Feng Cao"],"pdf_url":"https://arxiv.org/pdf/2411.11770v1.pdf","comment":"9 pages, 2figures"},{"id":"http://arxiv.org/abs/2411.11767v1","updated":"2024-11-18T17:46:32Z","published":"2024-11-18T17:46:32Z","title":"Drowning in Documents: Consequences of Scaling Reranker Inference","summary":" Rerankers, typically cross-encoders, are often used to re-score the documents\nretrieved by cheaper initial IR systems. This is because, though expensive,\nrerankers are assumed to be more effective. 
We challenge this assumption by\nmeasuring reranker performance for full retrieval, not just re-scoring\nfirst-stage retrieval. Our experiments reveal a surprising trend: the best\nexisting rerankers provide diminishing returns when scoring progressively more\ndocuments and actually degrade quality beyond a certain limit. In fact, in this\nsetting, rerankers can frequently assign high scores to documents with no\nlexical or semantic overlap with the query. We hope that our findings will spur\nfuture research to improve reranking.\n","authors":["Mathew Jacob","Erik Lindgren","Matei Zaharia","Michael Carbin","Omar Khattab","Andrew Drozdov"],"pdf_url":"https://arxiv.org/pdf/2411.11767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11758v1","updated":"2024-11-18T17:37:10Z","published":"2024-11-18T17:37:10Z","title":"The Power of Many: Multi-Agent Multimodal Models for Cultural Image\n Captioning","summary":" Large Multimodal Models (LMMs) exhibit impressive performance across various\nmultimodal tasks. However, their effectiveness in cross-cultural contexts\nremains limited due to the predominantly Western-centric nature of most data\nand models. Conversely, multi-agent models have shown significant capability in\nsolving complex tasks. Our study evaluates the collective performance of LMMs\nin a multi-agent interaction setting for the novel task of cultural image\ncaptioning. Our contributions are as follows: (1) We introduce MosAIC, a\nMulti-Agent framework to enhance cross-cultural Image Captioning using LMMs\nwith distinct cultural personas; (2) We provide a dataset of culturally\nenriched image captions in English for images from China, India, and Romania\nacross three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable\nmetric for evaluating cultural information within image captions; and (4) We\nshow that the multi-agent interaction outperforms single-agent models across\ndifferent metrics, and offer valuable insights for future research. 
Our dataset\nand models can be accessed at https://github.com/MichiganNLP/MosAIC.\n","authors":["Longju Bai","Angana Borah","Oana Ignat","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2411.11758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06153v2","updated":"2024-11-18T17:25:15Z","published":"2024-10-08T15:52:42Z","title":"AgentSquare: Automatic LLM Agent Search in Modular Design Space","summary":" Recent advancements in Large Language Models (LLMs) have led to a rapid\ngrowth of agentic systems capable of handling a wide range of complex tasks.\nHowever, current research largely relies on manual, task-specific design,\nlimiting their adaptability to novel tasks. In this paper, we introduce a new\nresearch problem: Modularized LLM Agent Search (MoLAS). We propose a modular\ndesign space that abstracts existing LLM agent designs into four fundamental\nmodules with uniform IO interface: Planning, Reasoning, Tool Use, and Memory.\nBuilding on this design space, we present a novel LLM agent search framework\ncalled AgentSquare, which introduces two core mechanisms, i.e., module\nevolution and recombination, to efficiently search for optimized LLM agents. To\nfurther accelerate the process, we design a performance predictor that uses\nin-context surrogate models to skip unpromising agent designs. Extensive\nexperiments across six benchmarks, covering the diverse scenarios of web,\nembodied, tool use and game applications, show that AgentSquare substantially\noutperforms hand-crafted agents, achieving an average performance gain of 17.2%\nagainst best-known human designs. Moreover, AgentSquare can generate\ninterpretable design insights, enabling a deeper understanding of agentic\narchitecture and its impact on task performance. We believe that the modular\ndesign space and AgentSquare search framework offer a platform for fully\nexploiting the potential of prior successful designs and consolidating the\ncollective efforts of research community. 
Code repo is available at\nhttps://github.com/tsinghua-fib-lab/AgentSquare.\n","authors":["Yu Shang","Yu Li","Keyu Zhao","Likai Ma","Jiahe Liu","Fengli Xu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2410.06153v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2411.11736v1","updated":"2024-11-18T17:03:30Z","published":"2024-11-18T17:03:30Z","title":"Advacheck at GenAI Detection Task 1: AI Detection Powered by\n Domain-Aware Multi-Tasking","summary":" The paper describes a system designed by Advacheck team to recognise\nmachine-generated and human-written texts in the monolingual subtask of GenAI\nDetection Task 1 competition. Our developed system is a multi-task architecture\nwith shared Transformer Encoder between several classification heads. One head\nis responsible for binary classification between human-written and\nmachine-generated texts, while the other heads are auxiliary multiclass\nclassifiers for texts of different domains from particular datasets. As\nmulticlass heads were trained to distinguish the domains presented in the data,\nthey provide a better understanding of the samples. This approach led us to\nachieve the first place in the official ranking with 83.07% macro F1-score on\nthe test set and bypass the baseline by 10%. 
We further study obtained system\nthrough ablation, error and representation analyses, finding that multi-task\nlearning outperforms single-task mode and simultaneous tasks form a cluster\nstructure in embeddings space.\n","authors":["German Gritsai","Anastasia Voznyuk","Ildar Khabutdinov","Andrey Grabovoy"],"pdf_url":"https://arxiv.org/pdf/2411.11736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15367v2","updated":"2024-11-18T17:00:32Z","published":"2024-09-18T18:36:18Z","title":"Fine-Tuning a Time Series Foundation Model with Wasserstein Loss","summary":" Inspired by recent advancements in large language models (LLMs) for Natural\nLanguage Processing (NLP), there has been a surge in research focused on\ndeveloping foundational models for time series forecasting. One approach\ninvolves training LLM architectures on tokenized time series data using\ncross-entropy loss. Although this method has demonstrated promising results,\ncross-entropy loss is primarily designed for classification tasks and does not\naccount for the distance between classes. To address this limitation, we\npropose using the Wasserstein loss for such architectures. To validate our\napproach, we fine-tuned a foundational time series model on $22$ zero-shot\ndatasets, comparing the performance of cross-entropy loss with that of\nWasserstein loss. Our results demonstrate that replacing cross-entropy loss\nwith Wasserstein loss significantly improves point estimation.\n","authors":["Andrei Chernov"],"pdf_url":"https://arxiv.org/pdf/2409.15367v2.pdf","comment":"4 main pages; 2 figures"},{"id":"http://arxiv.org/abs/2411.11731v1","updated":"2024-11-18T16:59:59Z","published":"2024-11-18T16:59:59Z","title":"Moral Persuasion in Large Language Models: Evaluating Susceptibility and\n Ethical Alignment","summary":" We explore how large language models (LLMs) can be influenced by prompting\nthem to alter their initial decisions and align them with established ethical\nframeworks. 
Our study is based on two experiments designed to assess the\nsusceptibility of LLMs to moral persuasion. In the first experiment, we examine\nthe susceptibility to moral ambiguity by evaluating a Base Agent LLM on morally\nambiguous scenarios and observing how a Persuader Agent attempts to modify the\nBase Agent's initial decisions. The second experiment evaluates the\nsusceptibility of LLMs to align with predefined ethical frameworks by prompting\nthem to adopt specific value alignments rooted in established philosophical\ntheories. The results demonstrate that LLMs can indeed be persuaded in morally\ncharged scenarios, with the success of persuasion depending on factors such as\nthe model used, the complexity of the scenario, and the conversation length.\nNotably, LLMs of distinct sizes but from the same company produced markedly\ndifferent outcomes, highlighting the variability in their susceptibility to\nethical persuasion.\n","authors":["Allison Huang","Yulu Niki Pi","Carlos Mougan"],"pdf_url":"https://arxiv.org/pdf/2411.11731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11707v1","updated":"2024-11-18T16:34:58Z","published":"2024-11-18T16:34:58Z","title":"FedCoLLM: A Parameter-Efficient Federated Co-tuning Framework for Large\n and Small Language Models","summary":" By adapting Large Language Models (LLMs) to domain-specific tasks or\nenriching them with domain-specific knowledge, we can fully harness the\ncapabilities of LLMs. Nonetheless, a gap persists in achieving simultaneous\nmutual enhancement between the server's LLM and the downstream clients' Small\nLanguage Models (SLMs). To address this, we propose FedCoLLM, a novel and\nparameter-efficient federated framework designed for co-tuning LLMs and SLMs.\nThis approach is aimed at adaptively transferring server-side LLMs knowledge to\nclients' SLMs while simultaneously enriching the LLMs with domain insights from\nthe clients. 
To accomplish this, FedCoLLM utilizes lightweight adapters in\nconjunction with SLMs, facilitating knowledge exchange between server and\nclients in a manner that respects data privacy while also minimizing\ncomputational and communication overhead. Our evaluation of FedCoLLM, utilizing\nvarious public LLMs and SLMs across a range of NLP text generation tasks,\nreveals that the performance of clients' SLMs experiences notable improvements\nwith the assistance of the LLMs. Simultaneously, the LLMs enhanced via FedCoLLM\nachieves comparable performance to that obtained through direct fine-tuning on\nclients' data.\n","authors":["Tao Fan","Yan Kang","Guoqiang Ma","Lixin Fan","Kai Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.11707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11694v1","updated":"2024-11-18T16:15:17Z","published":"2024-11-18T16:15:17Z","title":"Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search","summary":" Recently, test-time scaling has garnered significant attention from the\nresearch community, largely due to the substantial advancements of the o1 model\nreleased by OpenAI. By allocating more computational resources during the\ninference phase, large language models~(LLMs) can extensively explore the\nsolution space by generating more thought tokens or diverse solutions, thereby\nproducing more accurate responses. However, developing an o1-like reasoning\napproach is challenging, and researchers have been making various attempts to\nadvance this open area of research. In this paper, we present a preliminary\nexploration into enhancing the reasoning abilities of LLMs through\nreward-guided tree search algorithms. This framework is implemented by\nintegrating the policy model, reward model, and search algorithm. It is\nprimarily constructed around a tree search algorithm, where the policy model\nnavigates a dynamically expanding tree guided by a specially trained reward\nmodel. 
We thoroughly explore various design considerations necessary for\nimplementing this framework and provide a detailed report of the technical\naspects. To assess the effectiveness of our approach, we focus on mathematical\nreasoning tasks and conduct extensive evaluations on four challenging datasets,\nsignificantly enhancing the reasoning abilities of LLMs.\n","authors":["Jinhao Jiang","Zhipeng Chen","Yingqian Min","Jie Chen","Xiaoxue Cheng","Jiapeng Wang","Yiru Tang","Haoxiang Sun","Jia Deng","Wayne Xin Zhao","Zheng Liu","Dong Yan","Jian Xie","Zhongyuan Wang","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2411.11694v1.pdf","comment":"LLM;Complex Reasoning;Math"},{"id":"http://arxiv.org/abs/2409.16934v3","updated":"2024-11-18T15:22:32Z","published":"2024-09-25T13:45:23Z","title":"Investigating OCR-Sensitive Neurons to Improve Entity Recognition in\n Historical Documents","summary":" This paper investigates the presence of OCR-sensitive neurons within the\nTransformer architecture and their influence on named entity recognition (NER)\nperformance on historical documents. By analysing neuron activation patterns in\nresponse to clean and noisy text inputs, we identify and then neutralise\nOCR-sensitive neurons to improve model performance. 
Based on two open access\nlarge language models (Llama2 and Mistral), experiments demonstrate the\nexistence of OCR-sensitive regions and show improvements in NER performance on\nhistorical newspapers and classical commentaries, highlighting the potential of\ntargeted neuron modulation to improve models' performance on noisy text.\n","authors":["Emanuela Boros","Maud Ehrmann"],"pdf_url":"https://arxiv.org/pdf/2409.16934v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11635v1","updated":"2024-11-18T15:13:47Z","published":"2024-11-18T15:13:47Z","title":"Chapter 7 Review of Data-Driven Generative AI Models for Knowledge\n Extraction from Scientific Literature in Healthcare","summary":" This review examines the development of abstractive NLP-based text\nsummarization approaches and compares them to existing techniques for\nextractive summarization. A brief history of text summarization from the 1950s\nto the introduction of pre-trained language models such as Bidirectional\nEncoder Representations from Transformer (BERT) and Generative Pre-training\nTransformers (GPT) are presented. In total, 60 studies were identified in\nPubMed and Web of Science, of which 29 were excluded and 24 were read and\nevaluated for eligibility, resulting in the use of seven studies for further\nanalysis. This chapter also includes a section with examples including an\nexample of a comparison between GPT-3 and state-of-the-art GPT-4 solutions in\nscientific text summarisation. Natural language processing has not yet reached\nits full potential in the generation of brief textual summaries. 
As there are\nacknowledged concerns that must be addressed, we can expect gradual\nintroduction of such models in practise.\n","authors":["Leon Kopitar","Primoz Kocbek","Lucija Gosak","Gregor Stiglic"],"pdf_url":"https://arxiv.org/pdf/2411.11635v1.pdf","comment":"16 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.11623v1","updated":"2024-11-18T14:53:53Z","published":"2024-11-18T14:53:53Z","title":"Federated Incremental Named Entity Recognition","summary":" Federated Named Entity Recognition (FNER) boosts model training within each\nlocal client by aggregating the model updates of decentralized local clients,\nwithout sharing their private data. However, existing FNER methods assume fixed\nentity types and local clients in advance, leading to their ineffectiveness in\npractical applications. In a more realistic scenario, local clients receive new\nentity types continuously, while new local clients collecting novel data may\nirregularly join the global FNER training. This challenging setup, referred to\nhere as Federated Incremental NER, renders the global model suffering from\nheterogeneous forgetting of old entity types from both intra-client and\ninter-client perspectives. To overcome these challenges, we propose a\nLocal-Global Forgetting Defense (LGFD) model. Specifically, to address\nintra-client forgetting, we develop a structural knowledge distillation loss to\nretain the latent space's feature structure and a pseudo-label-guided\ninter-type contrastive loss to enhance discriminative capability over different\nentity types, effectively preserving previously learned knowledge within local\nclients. To tackle inter-client forgetting, we propose a task switching monitor\nthat can automatically identify new entity types under privacy protection and\nstore the latest old global model for knowledge distillation and\npseudo-labeling. 
Experiments demonstrate significant improvement of our LGFD\nmodel over comparison methods.\n","authors":["Duzhen Zhang","Yahan Yu","Chenxing Li","Jiahua Dong","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2411.11623v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2411.07381v2","updated":"2024-11-18T14:51:17Z","published":"2024-11-11T21:32:06Z","title":"BeeManc at the PLABA Track of TAC-2024: RoBERTa for task 1 -- LLaMA3.1\n and GPT-4o for task 2","summary":" This report is the system description of the BeeManc team for shared task\nPlain Language Adaptation of Biomedical Abstracts (PLABA) 2024. This report\ncontains two sections corresponding to the two sub-tasks in PLABA 2024. In task\none, we applied fine-tuned ReBERTa-Base models to identify and classify the\ndifficult terms, jargon and acronyms in the biomedical abstracts and reported\nthe F1 score. Due to time constraints, we didn't finish the replacement task.\nIn task two, we leveraged Llamma3.1-70B-Instruct and GPT-4o with the one-shot\nprompts to complete the abstract adaptation and reported the scores in BLEU,\nSARI, BERTScore, LENS, and SALSA. From the official Evaluation from PLABA-2024\non Task 1A and 1B, our \\textbf{much smaller fine-tuned RoBERTa-Base} model\nranked 3rd and 2nd respectively on the two sub-task, and the \\textbf{1st on\naveraged F1 scores across the two tasks} from 9 evaluated systems. Our\nLLaMA-3.1-70B-instructed model achieved the \\textbf{highest Completeness} score\nfor Task-2. 
We share our fine-tuned models and related resources at\n\\url{https://github.com/HECTA-UoM/PLABA2024}\n","authors":["Zhidong Ling","Zihao Li","Pablo Romero","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2411.07381v2.pdf","comment":"ongoing work - system report"},{"id":"http://arxiv.org/abs/2407.11211v3","updated":"2024-11-18T14:43:38Z","published":"2024-07-15T19:53:02Z","title":"Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer\n from Text to Image via CLIP Inversion","summary":" We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary\nImage Classifier that uses an autoregressive transformer to generatively output\nclassification labels as language. Leveraging the extensive knowledge of CLIP\nmodels, NOVIC harnesses the embedding space to enable zero-shot transfer from\npure text to images. Traditional CLIP models, despite their ability for open\nvocabulary classification, require an exhaustive prompt of potential class\nlabels, restricting their application to images of known content or context. To\naddress this, we propose an \"object decoder\" model that is trained on a\nlarge-scale 92M-target dataset of templated object noun sets and LLM-generated\ncaptions to always output the object noun in question. This effectively inverts\nthe CLIP text encoder and allows textual object labels from essentially the\nentire English language to be generated directly from image-derived embedding\nvectors, without requiring any a priori knowledge of the potential content of\nan image, and without any label biases. 
The trained decoders are tested on a\nmix of manually and web-curated datasets, as well as standard image\nclassification benchmarks, and achieve fine-grained prompt-free prediction\nscores of up to 87.5%, a strong result considering the model must work for any\nconceivable image and without any contextual clues.\n","authors":["Philipp Allgeuer","Kyra Ahrens","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2407.11211v3.pdf","comment":"Published at WACV 2025"},{"id":"http://arxiv.org/abs/2411.08745v2","updated":"2024-11-18T14:41:38Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. 
Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v2.pdf","comment":"12 pages, 10 figures, previous version published under the title \"How\n Do Llamas Process Multilingual Text? A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop at\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2406.07302v2","updated":"2024-11-18T14:40:54Z","published":"2024-06-11T14:30:34Z","title":"BertaQA: How Much Do Language Models Know About Local Culture?","summary":" Large Language Models (LLMs) exhibit extensive knowledge about the world, but\nmost evaluations have been limited to global or anglocentric subjects. This\nraises the question of how well these models perform on topics relevant to\nother cultures, whose presence on the web is not that prominent. To address\nthis gap, we introduce BertaQA, a multiple-choice trivia dataset that is\nparallel in English and Basque. The dataset consists of a local subset with\nquestions pertinent to the Basque culture, and a global subset with questions\nof broader interest. We find that state-of-the-art LLMs struggle with local\ncultural knowledge, even as they excel on global topics. However, we show that\ncontinued pre-training in Basque significantly improves the models' performance\non Basque culture, even when queried in English. To our knowledge, this is the\nfirst solid evidence of knowledge transfer from a low-resource to a\nhigh-resource language. Our analysis sheds light on the complex interplay\nbetween language and knowledge, and reveals that some prior findings do not\nfully hold when reassessed on local topics. 
Our dataset and evaluation code are\navailable under open licenses at https://github.com/juletx/BertaQA.\n","authors":["Julen Etxaniz","Gorka Azkune","Aitor Soroa","Oier Lopez de Lacalle","Mikel Artetxe"],"pdf_url":"https://arxiv.org/pdf/2406.07302v2.pdf","comment":"NEURIPS Datasets & Benchmarks 2024"},{"id":"http://arxiv.org/abs/2411.11581v1","updated":"2024-11-18T13:57:35Z","published":"2024-11-18T13:57:35Z","title":"OASIS: Open Agents Social Interaction Simulations on One Million Agents","summary":" There has been a growing interest in enhancing rule-based agent-based models\n(ABMs) for social media platforms (\\emph{i.e.}, X, Reddit) with more realistic\nlarge language model (LLM) agents, thereby allowing for a more nuanced study of\ncomplex systems. As a result, several LLM-based ABMs have been proposed in the\npast year. While they hold promise, each simulator is specifically designed to\nstudy a particular scenario, making it time-consuming and resource-intensive to\nexplore other phenomena using the same ABM. Additionally, these models simulate\nonly a limited number of agents, whereas real-world social media platforms\ninvolve millions of users. To this end, we propose OASIS, a generalizable and\nscalable social media simulator. OASIS is designed based on real-world social\nmedia platforms, incorporating dynamically updated environments (\\emph{i.e.},\ndynamic social networks and post information), diverse action spaces\n(\\emph{i.e.}, following, commenting), and recommendation systems (\\emph{i.e.},\ninterest-based and hot-score-based). Additionally, OASIS supports large-scale\nuser simulations, capable of modeling up to one million users. With these\nfeatures, OASIS can be easily extended to different social media platforms to\nstudy large-scale group phenomena and behaviors. We replicate various social\nphenomena, including information spreading, group polarization, and herd\neffects across X and Reddit platforms. 
Moreover, we provide observations of\nsocial phenomena at different agent group scales. We observe that the larger\nagent group scale leads to more enhanced group dynamics and more diverse and\nhelpful agents' opinions. These findings demonstrate OASIS's potential as a\npowerful tool for studying complex systems in digital environments.\n","authors":["Ziyi Yang","Zaibin Zhang","Zirui Zheng","Yuxian Jiang","Ziyue Gan","Zhiyu Wang","Zijian Ling","Jinsong Chen","Martz Ma","Bowen Dong","Prateek Gupta","Shuyue Hu","Zhenfei Yin","Guohao Li","Xu Jia","Lijun Wang","Bernard Ghanem","Huchuan Lu","Wanli Ouyang","Yu Qiao","Philip Torr","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2411.11581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04950v3","updated":"2024-11-18T13:15:59Z","published":"2024-11-07T18:28:40Z","title":"Estimating the Influence of Sequentially Correlated Literary Properties\n in Textual Classification: A Data-Centric Hypothesis-Testing Approach","summary":" Stylometry aims to distinguish authors by analyzing literary traits assumed\nto reflect semi-conscious choices distinct from elements like genre or theme.\nHowever, these components often overlap, complicating text classification based\nsolely on feature distributions. While some literary properties, such as\nthematic content, are likely to manifest as correlations between adjacent text\nunits, others, like authorial style, may be independent thereof. We introduce a\nhypothesis-testing approach to evaluate the influence of sequentially\ncorrelated literary properties on text classification, aiming to determine when\nthese correlations drive classification. Using a multivariate binary\ndistribution, our method models sequential correlations between text units as a\nstochastic process, assessing the likelihood of clustering across varying\nadjacency scales. This enables us to examine whether classification is\ndominated by sequentially correlated properties or remains independent. 
In\nexperiments on a diverse English prose corpus, our analysis integrates\ntraditional and neural embeddings within supervised and unsupervised\nframeworks. Results demonstrate that our approach effectively identifies when\ntextual classification is not primarily influenced by sequentially correlated\nliterary properties, particularly in cases where texts differ in authorial\nstyle or genre rather than by a single author within a similar genre.\n","authors":["Gideon Yoffe","Nachum Dershowitz","Ariel Vishne","Barak Sober"],"pdf_url":"https://arxiv.org/pdf/2411.04950v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06913v2","updated":"2024-11-18T13:15:41Z","published":"2024-10-09T14:12:51Z","title":"Utilize the Flow before Stepping into the Same River Twice: Certainty\n Represented Knowledge Flow for Refusal-Aware Instruction Tuning","summary":" Refusal-Aware Instruction Tuning (RAIT) enables Large Language Models (LLMs)\nto refuse to answer unknown questions. By modifying responses of unknown\nquestions in the training data to refusal responses such as \"I don't know\",\nRAIT enhances the reliability of LLMs and reduces their hallucination.\nGenerally, RAIT modifies training samples based on the correctness of the\ninitial LLM's response. However, this crude approach can cause LLMs to\nexcessively refuse answering questions they could have correctly answered, the\nproblem we call over-refusal. In this paper, we explore two primary causes of\nover-refusal: Static conflict occurs when similar samples within the LLM's\nfeature space receive differing supervision signals (original vs. modified \"I\ndon't know\"). Dynamic conflict, on the other hand, emerges as the LLM's\nknowledge evolves during SFT, allowing it to answer questions that were\npreviously unanswerable. Yet, these now-answerable training samples still\nretain the original \"I don't know\" supervision signals based on the initial LLM\nstate, resulting in inconsistencies. 
These conflicts cause the trained LLM to\nmisclassify known questions as unknown, resulting in over-refusal. To address\nthis issue, we introduce Certainty Represented Knowledge Flow for Refusal-Aware\nInstructions Tuning (CRaFT). CRaFT centers on two main contributions: First, we\nadditionally incorporate response certainty to selectively filter and modify\ndata, reducing static conflicts. Second, we implement preliminary rehearsal\ntraining to characterize changes in the LLM's knowledge state, which helps\nmitigate dynamic conflicts during the fine-tuning process. We conducted\nextensive experiments on open-ended question answering and multiple-choice\nquestion task. Experiment results show that CRaFT can improve LLM's overall\nperformance during the RAIT process. Source code and training data will be\nreleased at Github.\n","authors":["Runchuan Zhu","Zhipeng Ma","Jiang Wu","Junyuan Gao","Jiaqi Wang","Dahua Lin","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2410.06913v2.pdf","comment":"Equal contribution: Runchuan Zhu, Zhipeng Ma, Jiang Wu; Corresponding\n author: Conghui He"},{"id":"http://arxiv.org/abs/2411.11531v1","updated":"2024-11-18T12:40:51Z","published":"2024-11-18T12:40:51Z","title":"Addressing Hallucinations in Language Models with Knowledge Graph\n Embeddings as an Additional Modality","summary":" In this paper we present an approach to reduce hallucinations in Large\nLanguage Models (LLMs) by incorporating Knowledge Graphs (KGs) as an additional\nmodality. Our method involves transforming input text into a set of KG\nembeddings and using an adapter to integrate these embeddings into the language\nmodel space, without relying on external retrieval processes.\n To facilitate this, we created WikiEntities, a dataset containing over 3\nmillion Wikipedia texts annotated with entities from Wikidata and their\ncorresponding embeddings from PyTorch-BigGraph. 
This dataset serves as a\nvaluable resource for training Entity Linking models and adapting the described\nmethod to various LLMs using specialized adapters.\n Our method does not require fine-tuning of the language models themselves;\ninstead, we only train the adapter. This ensures that the model's performance\non other tasks is not affected. We trained an adapter for the Mistral 7B, LLaMA\n2-7B (chat), and LLaMA 3-8B (instruct) models using this dataset and\ndemonstrated that our approach improves performance on the HaluEval, True-False\nbenchmarks and FEVER dataset. The results indicate that incorporating KGs as a\nnew modality can effectively reduce hallucinations and improve the factual\naccuracy of language models, all without the need for external retrieval.\n","authors":["Viktoriia Chekalina","Anton Razzigaev","Elizaveta Goncharova","Andrey Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2411.11531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16937v2","updated":"2024-11-18T12:36:13Z","published":"2024-06-17T09:39:34Z","title":"A Complete Survey on LLM-based AI Chatbots","summary":" The past few decades have witnessed an upsurge in data, forming the\nfoundation for data-hungry, learning-based AI technology. Conversational\nagents, often referred to as AI chatbots, rely heavily on such data to train\nlarge language models (LLMs) and generate new content (knowledge) in response\nto user prompts. With the advent of OpenAI's ChatGPT, LLM-based chatbots have\nset new standards in the AI community. This paper presents a complete survey of\nthe evolution and deployment of LLM-based chatbots in various sectors. We first\nsummarize the development of foundational chatbots, followed by the evolution\nof LLMs, and then provide an overview of LLM-based chatbots currently in use\nand those in the development phase. Recognizing AI chatbots as tools for\ngenerating new knowledge, we explore their diverse applications across various\nindustries. 
We then discuss the open challenges, considering how the data used\nto train the LLMs and the misuse of the generated knowledge can cause several\nissues. Finally, we explore the future outlook to augment their efficiency and\nreliability in numerous applications. By addressing key milestones and the\npresent-day context of LLM-based chatbots, our survey invites readers to delve\ndeeper into this realm, reflecting on how their next generation will reshape\nconversational AI.\n","authors":["Sumit Kumar Dam","Choong Seon Hong","Yu Qiao","Chaoning Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16937v2.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.11504v1","updated":"2024-11-18T12:04:52Z","published":"2024-11-18T12:04:52Z","title":"Search, Verify and Feedback: Towards Next Generation Post-training\n Paradigm of Foundation Models via Verifier Engineering","summary":" The evolution of machine learning has increasingly prioritized the\ndevelopment of powerful models and more scalable supervision signals. However,\nthe emergence of foundation models presents significant challenges in providing\neffective supervision signals necessary for further enhancing their\ncapabilities. Consequently, there is an urgent need to explore novel\nsupervision signals and technical approaches. In this paper, we propose\nverifier engineering, a novel post-training paradigm specifically designed for\nthe era of foundation models. The core of verifier engineering involves\nleveraging a suite of automated verifiers to perform verification tasks and\ndeliver meaningful feedback to foundation models. We systematically categorize\nthe verifier engineering process into three essential stages: search, verify,\nand feedback, and provide a comprehensive review of state-of-the-art research\ndevelopments within each stage. 
We believe that verifier engineering\nconstitutes a fundamental pathway toward achieving Artificial General\nIntelligence.\n","authors":["Xinyan Guan","Yanjiang Liu","Xinyu Lu","Boxi Cao","Ben He","Xianpei Han","Le Sun","Jie Lou","Bowen Yu","Yaojie Lu","Hongyu Lin"],"pdf_url":"https://arxiv.org/pdf/2411.11504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14148v2","updated":"2024-11-18T11:58:16Z","published":"2024-10-18T03:34:32Z","title":"Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in\n Vision-Language Alignment","summary":" The recent advancements in large language models (LLMs) and pre-trained\nvision models have accelerated the development of vision-language large models\n(VLLMs), enhancing the interaction between visual and linguistic modalities.\nDespite their notable success across various domains, VLLMs face challenges in\nmodality alignment, which can lead to issues like hallucinations and unsafe\ncontent generation. Current alignment techniques often rely on coarse feedback\nand external datasets, limiting scalability and performance. In this paper, we\npropose FiSAO (Fine-Grained Self-Alignment Optimization), a novel\nself-alignment method that utilizes the model's own visual encoder as a\nfine-grained verifier to improve vision-language alignment without the need for\nadditional data. By leveraging token-level feedback from the vision encoder,\nFiSAO significantly improves vision-language alignment, even surpassing\ntraditional preference tuning methods that require additional data. 
Through\nboth theoretical analysis and experimental validation, we demonstrate that\nFiSAO effectively addresses the misalignment problem in VLLMs, marking the\nfirst instance of token-level rewards being applied to such models.\n","authors":["Chenhang Cui","An Zhang","Yiyang Zhou","Zhaorun Chen","Gelei Deng","Huaxiu Yao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.14148v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2411.11496v1","updated":"2024-11-18T11:58:07Z","published":"2024-11-18T11:58:07Z","title":"Safe + Safe = Unsafe? Exploring How Safe Images Can Be Exploited to\n Jailbreak Large Vision-Language Models","summary":" Recent advances in Large Vision-Language Models (LVLMs) have showcased strong\nreasoning abilities across multiple modalities, achieving significant\nbreakthroughs in various real-world applications. Despite this great success,\nthe safety guardrail of LVLMs may not cover the unforeseen domains introduced\nby the visual modality. Existing studies primarily focus on eliciting LVLMs to\ngenerate harmful responses via carefully crafted image-based jailbreaks\ndesigned to bypass alignment defenses. In this study, we reveal that a safe\nimage can be exploited to achieve the same jailbreak consequence when combined\nwith additional safe images and prompts. This stems from two fundamental\nproperties of LVLMs: universal reasoning capabilities and safety snowball\neffect. Building on these insights, we propose Safety Snowball Agent (SSA), a\nnovel agent-based framework leveraging agents' autonomous and tool-using\nabilities to jailbreak LVLMs. SSA operates through two principal stages: (1)\ninitial response generation, where tools generate or retrieve jailbreak images\nbased on potential harmful intents, and (2) harmful snowballing, where refined\nsubsequent prompts induce progressively harmful outputs. 
Our experiments\ndemonstrate that \\ours can use nearly any image to induce LVLMs to produce\nunsafe content, achieving high success jailbreaking rates against the latest\nLVLMs. Unlike prior works that exploit alignment flaws, \\ours leverages the\ninherent properties of LVLMs, presenting a profound challenge for enforcing\nsafety in generative multimodal systems. Our code is avaliable at\n\\url{https://github.com/gzcch/Safety_Snowball_Agent}.\n","authors":["Chenhang Cui","Gelei Deng","An Zhang","Jingnan Zheng","Yicong Li","Lianli Gao","Tianwei Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2411.11496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12060v3","updated":"2024-11-18T11:51:38Z","published":"2024-06-17T20:00:04Z","title":"Not Eliminate but Aggregate: Post-Hoc Control over Mixture-of-Experts to\n Address Shortcut Shifts in Natural Language Understanding","summary":" Recent models for natural language understanding are inclined to exploit\nsimple patterns in datasets, commonly known as shortcuts. These shortcuts hinge\non spurious correlations between labels and latent features existing in the\ntraining data. At inference time, shortcut-dependent models are likely to\ngenerate erroneous predictions under distribution shifts, particularly when\nsome latent features are no longer correlated with the labels. To avoid this,\nprevious studies have trained models to eliminate the reliance on shortcuts. In\nthis study, we explore a different direction: pessimistically aggregating the\npredictions of a mixture-of-experts, assuming each expert captures relatively\ndifferent latent features. The experimental results demonstrate that our\npost-hoc control over the experts significantly enhances the model's robustness\nto the distribution shift in shortcuts. Besides, we show that our approach has\nsome practical advantages. 
We also analyze our model and provide results to\nsupport the assumption.\n","authors":["Ukyo Honda","Tatsushi Oka","Peinan Zhang","Masato Mita"],"pdf_url":"https://arxiv.org/pdf/2406.12060v3.pdf","comment":"21 pages, 5 figures (the layout differs from the MIT Press\n publication version)"},{"id":"http://arxiv.org/abs/2411.11479v1","updated":"2024-11-18T11:31:10Z","published":"2024-11-18T11:31:10Z","title":"Quantifying Preferences of Vision-Language Models via Value\n Decomposition in Social Media Contexts","summary":" The rapid advancement of Vision-Language Models (VLMs) has expanded\nmultimodal applications, yet evaluations often focus on basic tasks like object\nrecognition, overlooking abstract aspects such as personalities and values. To\naddress this gap, we introduce Value-Spectrum, a visual question-answering\nbenchmark aimed at assessing VLMs based on Schwartz's value dimensions, which\ncapture core values guiding people's beliefs and actions across cultures. We\nconstructed a vectorized database of over 50,000 short videos sourced from\nTikTok, YouTube Shorts, and Instagram Reels, covering multiple months and a\nwide array of topics such as family, health, hobbies, society, and technology.\nWe also developed a VLM agent pipeline to automate video browsing and analysis.\nBenchmarking representative VLMs on Value-Spectrum reveals significant\ndifferences in their responses to value-oriented content, with most models\nexhibiting a preference for hedonistic topics. Beyond identifying natural\npreferences, we explored the ability of VLM agents to adopt specific personas\nwhen explicitly prompted, revealing insights into the models' adaptability in\nrole-playing scenarios. 
These findings highlight the potential of\nValue-Spectrum as a comprehensive evaluation set for tracking VLM advancements\nin value-based tasks and for developing more sophisticated role-playing AI\nagents.\n","authors":["Jingxuan Li","Yuning Yang","Shengqi Yang","Yizhou Zhao","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18009v2","updated":"2024-11-18T11:15:56Z","published":"2024-05-28T09:50:46Z","title":"Exploring Context Window of Large Language Models via Decomposed\n Positional Vectors","summary":" Transformer-based large language models (LLMs) typically have a limited\ncontext window, resulting in significant performance degradation when\nprocessing text beyond the length of the context window. Extensive studies have\nbeen proposed to extend the context window and achieve length extrapolation of\nLLMs, but there is still a lack of in-depth interpretation of these approaches.\nIn this study, we explore the positional information within and beyond the\ncontext window for deciphering the underlying mechanism of LLMs. By using a\nmean-based decomposition method, we disentangle positional vectors from hidden\nstates of LLMs and analyze their formation and effect on attention.\nFurthermore, when texts exceed the context window, we analyze the change of\npositional vectors in two settings, i.e., direct extrapolation and context\nwindow extension. Based on our findings, we design two training-free context\nwindow extension methods, positional vector replacement and attention window\nextension. 
Experimental results show that our methods can effectively extend\nthe context window length.\n","authors":["Zican Dong","Junyi Li","Xin Men","Wayne Xin Zhao","Bingbing Wang","Zhen Tian","Weipeng Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.18009v2.pdf","comment":"Accepted by Neurips 2024 as a spotlight"},{"id":"http://arxiv.org/abs/2411.11465v1","updated":"2024-11-18T10:58:46Z","published":"2024-11-18T10:58:46Z","title":"Re-examining learning linear functions in context","summary":" In context learning (ICL) is an attractive method of solving a wide range of\nproblems. Inspired by Garg et al. (2022), we look closely at ICL in a variety\nof train and test settings for several transformer models of different sizes\ntrained from scratch. Our study complements prior work by pointing out several\nsystematic failures of these models to generalize to data not in the training\ndistribution, thereby showing some limitations of ICL. We find that models\nadopt a strategy for this task that is very different from standard solutions.\n","authors":["Omar Naim","Guilhem Fouilhé","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2411.11465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11437v1","updated":"2024-11-18T10:08:10Z","published":"2024-11-18T10:08:10Z","title":"Causal Effect of Group Diversity on Redundancy and Coverage in\n Peer-Reviewing","summary":" A large host of scientific journals and conferences solicit peer reviews from\nmultiple reviewers for the same submission, aiming to gather a broader range of\nperspectives and mitigate individual biases. In this work, we reflect on the\nrole of diversity in the slate of reviewers assigned to evaluate a submitted\npaper as a factor in diversifying perspectives and improving the utility of the\npeer-review process. 
We propose two measures for assessing review utility:\nreview coverage -- reviews should cover most contents of the paper -- and\nreview redundancy -- reviews should add information not already present in\nother reviews. We hypothesize that reviews from diverse reviewers will exhibit\nhigh coverage and low redundancy. We conduct a causal study of different\nmeasures of reviewer diversity on review coverage and redundancy using\nobservational data from a peer-reviewed conference with approximately 5,000\nsubmitted papers. Our study reveals disparate effects of different diversity\nmeasures on review coverage and redundancy. Our study finds that assigning a\ngroup of reviewers that are topically diverse, have different seniority levels,\nor have distinct publication networks leads to broader coverage of the paper or\nreview criteria, but we find no evidence of an increase in coverage for\nreviewer slates with reviewers from diverse organizations or geographical\nlocations. Reviewers from different organizations, seniority levels, topics, or\npublications networks (all except geographical diversity) lead to a decrease in\nredundancy in reviews. Furthermore, publication network-based diversity alone\nalso helps bring in varying perspectives (that is, low redundancy), even within\nspecific review criteria. 
Our study adopts a group decision-making perspective\nfor reviewer assignments in peer review and suggests dimensions of diversity\nthat can help guide the reviewer assignment process.\n","authors":["Navita Goyal","Ivan Stelmakh","Nihar Shah","Hal Daumé III"],"pdf_url":"https://arxiv.org/pdf/2411.11437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08449v2","updated":"2024-11-18T09:57:04Z","published":"2024-11-13T09:11:56Z","title":"Towards Evaluating Large Language Models for Graph Query Generation","summary":" Large Language Models (LLMs) are revolutionizing the landscape of Generative\nArtificial Intelligence (GenAI), with innovative LLM-backed solutions emerging\nrapidly. However, when applied to database technologies, specifically query\ngeneration for graph databases and Knowledge Graphs (KGs), LLMs still face\nsignificant challenges. While research on LLM-driven query generation for\nStructured Query Language (SQL) exists, similar systems for graph databases\nremain underdeveloped. This paper presents a comparative study addressing the\nchallenge of generating Cypher queries a powerful language for interacting with\ngraph databases using open-access LLMs. We rigorously evaluate several LLM\nagents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a\nlocally deployed Llama 3.1 8B) using a designed few-shot learning prompt and\nRetrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT)\nreasoning. 
Our empirical analysis of query generation accuracy reveals that\nClaude Sonnet 3.5 outperforms its counterparts in this specific domain.\nFurther, we highlight promising future research directions to address the\nidentified limitations and advance LLM-driven query generation for graph\ndatabases.\n","authors":["Siraj Munir","Alessandro Aldini"],"pdf_url":"https://arxiv.org/pdf/2411.08449v2.pdf","comment":"Paper accepted and will be presented at CSCI2024 in December 2024,\n Later will be published at Springer LNCS"},{"id":"http://arxiv.org/abs/2402.10691v4","updated":"2024-11-18T09:53:03Z","published":"2024-02-16T13:48:06Z","title":"Python is Not Always the Best Choice: Embracing Multilingual Program of\n Thoughts","summary":" Program of Thoughts (PoT) is an approach characterized by its executable\nintermediate steps, which ensure the accuracy of the logical calculations in\nthe reasoning process. Currently, PoT primarily uses Python. However, relying\nsolely on a single language may result in suboptimal solutions and overlook the\npotential benefits of other programming languages. In this paper, we conduct\ncomprehensive experiments on the programming languages used in PoT and find\nthat no single language consistently delivers optimal performance across all\ntasks and models. The effectiveness of each language varies depending on the\nspecific scenarios. Inspired by this, we propose a task and model agnostic\napproach called MultiPoT, which harnesses strength and diversity from various\nlanguages. Experimental results reveal that it significantly outperforms Python\nSelf-Consistency. Furthermore, it achieves comparable or superior performance\ncompared to the best monolingual PoT in almost all tasks across all models. 
In\nparticular, MultiPoT achieves more than 4.6% improvement on average on ChatGPT\n(gpt-3.5-turbo-0701).\n","authors":["Xianzhen Luo","Qingfu Zhu","Zhiming Zhang","Libo Qin","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2402.10691v4.pdf","comment":"Accepted by EMNLP 2024. Code and data are released at\n https://github.com/Luowaterbi/MultiPoT"},{"id":"http://arxiv.org/abs/2411.11424v1","updated":"2024-11-18T09:50:54Z","published":"2024-11-18T09:50:54Z","title":"Membership Inference Attack against Long-Context Large Language Models","summary":" Recent advances in Large Language Models (LLMs) have enabled them to overcome\ntheir context window limitations, and demonstrate exceptional retrieval and\nreasoning capacities on longer context. Quesion-answering systems augmented\nwith Long-Context Language Models (LCLMs) can automatically search massive\nexternal data and incorporate it into their contexts, enabling faithful\npredictions and reducing issues such as hallucinations and knowledge staleness.\nExisting studies targeting LCLMs mainly concentrate on addressing the so-called\nlost-in-the-middle problem or improving the inference effiencicy, leaving their\nprivacy risks largely unexplored. In this paper, we aim to bridge this gap and\nargue that integrating all information into the long context makes it a\nrepository of sensitive information, which often contains private data such as\nmedical records or personal identities. We further investigate the membership\nprivacy within LCLMs external context, with the aim of determining whether a\ngiven document or sequence is included in the LCLMs context. Our basic idea is\nthat if a document lies in the context, it will exhibit a low generation loss\nor a high degree of semantic similarity to the contents generated by LCLMs. 
We\nfor the first time propose six membership inference attack (MIA) strategies\ntailored for LCLMs and conduct extensive experiments on various popular models.\nEmpirical results demonstrate that our attacks can accurately infer membership\nstatus in most cases, e.g., 90.66% attack F1-score on Multi-document QA\ndatasets with LongChat-7b-v1.5-32k, highlighting significant risks of\nmembership leakage within LCLMs input contexts. Furthermore, we examine the\nunderlying reasons why LCLMs are susceptible to revealing such membership\ninformation.\n","authors":["Zixiong Wang","Gaoyang Liu","Yang Yang","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18492v3","updated":"2024-11-18T09:44:26Z","published":"2024-05-28T18:01:52Z","title":"LLMs and Memorization: On Quality and Specificity of Copyright\n Compliance","summary":" Memorization in large language models (LLMs) is a growing concern. LLMs have\nbeen shown to easily reproduce parts of their training data, including\ncopyrighted work. This is an important problem to solve, as it may violate\nexisting copyright laws as well as the European AI Act. In this work, we\npropose a systematic analysis to quantify the extent of potential copyright\ninfringements in LLMs using European law as an example. Unlike previous work,\nwe evaluate instruction-finetuned models in a realistic end-user scenario. Our\nanalysis builds on a proposed threshold of 160 characters, which we borrow from\nthe German Copyright Service Provider Act and a fuzzy text matching algorithm\nto identify potentially copyright-infringing textual reproductions. The\nspecificity of countermeasures against copyright infringement is analyzed by\ncomparing model behavior on copyrighted and public domain data. We investigate\nwhat behaviors models show instead of producing protected text (such as refusal\nor hallucination) and provide a first legal assessment of these behaviors. 
We\nfind that there are huge differences in copyright compliance, specificity, and\nappropriate refusal among popular LLMs. Alpaca, GPT 4, GPT 3.5, and Luminous\nperform best in our comparison, with OpenGPT-X, Alpaca, and Luminous producing\na particularly low absolute number of potential copyright violations. Code can\nbe found at https://github.com/felixbmuller/llms-memorization-copyright.\n","authors":["Felix B Mueller","Rebekka Görge","Anna K Bernzen","Janna C Pirk","Maximilian Poretschkin"],"pdf_url":"https://arxiv.org/pdf/2405.18492v3.pdf","comment":"10 pages, 3 figures, AIES 2024 conference"},{"id":"http://arxiv.org/abs/2402.18191v3","updated":"2024-11-18T09:26:51Z","published":"2024-02-28T09:27:29Z","title":"Clustering and Ranking: Diversity-preserved Instruction Selection\n through Expert-aligned Quality Estimation","summary":" With contributions from the open-source community, a vast amount of\ninstruction tuning (IT) data has emerged. Given the significant resource\nallocation required for training and evaluating models, it is advantageous to\nhave an efficient method for selecting high-quality IT data. However, existing\nmethods for instruction data selection have limitations such as relying on\nfragile external APIs, being affected by biases in GPT models, or reducing the\ndiversity of the selected instruction dataset. In this paper, we propose an\nindustrial-friendly, expert-aligned and diversity-preserved instruction data\nselection method: Clustering and Ranking (CaR). CaR employs a two-step process:\nfirst, it ranks instruction pairs using a high-accuracy (84.25%) scoring model\naligned with expert preferences; second, it preserves dataset diversity through\nclustering. In our experiment, CaR efficiently selected a mere 1.96% of\nAlpaca's IT data, yet the resulting AlpaCaR model surpassed Alpaca's\nperformance by an average of 32.1% in GPT-4 evaluations. 
Moreover, we find that\ndata selecting is a consistent paradigm whether the pre-trained model is more\ncapable or the model parameters scaling up. Our approach employs compact models\nwith 550M parameters and incurs just 11.2% of the financial outlay of current\nmethods, enhancing its industrial deployability.\n","authors":["Yuan Ge","Yilun Liu","Chi Hu","Weibin Meng","Shimin Tao","Xiaofeng Zhao","Hongxia Ma","Li Zhang","Boxing Chen","Hao Yang","Bei Li","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.18191v3.pdf","comment":"Accepted by EMNLP2024"},{"id":"http://arxiv.org/abs/2402.14259v2","updated":"2024-11-18T09:19:25Z","published":"2024-02-22T03:46:08Z","title":"Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form\n Medical Question Answering Applications and Beyond","summary":" Uncertainty estimation is crucial for the reliability of safety-critical\nhuman and artificial intelligence (AI) interaction systems, particularly in the\ndomain of healthcare engineering. However, a robust and general uncertainty\nmeasure for free-form answers has not been well-established in open-ended\nmedical question-answering (QA) tasks, where generative inequality introduces a\nlarge number of irrelevant words and sequences within the generated set for\nuncertainty quantification (UQ), which can lead to biases. This paper\nintroduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at\nboth the word and sequence levels, considering semantic relevance. WSE\nquantifies uncertainty in a way that is more closely aligned with the\nreliability of LLMs during uncertainty quantification (UQ). We compare WSE with\nsix baseline methods on five free-form medical QA datasets, utilizing seven\npopular large language models (LLMs). Experimental results demonstrate that WSE\nexhibits superior performance in UQ under two standard criteria for correctness\nevaluation. 
Additionally, in terms of real-world medical QA applications, the\nperformance of LLMs is significantly enhanced (e.g., a 6.36% improvement in\nmodel accuracy on the COVID-QA dataset) by employing responses with lower\nuncertainty that are identified by WSE as final answers, without any additional\ntask-specific fine-tuning or architectural modifications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Chenxi Yuan","Qingyu Chen","Tianlong Chen","Yue Zhang","Ren Wang","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2402.14259v2.pdf","comment":"Accepted by Engineering Applications of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.11371v1","updated":"2024-11-18T08:34:38Z","published":"2024-11-18T08:34:38Z","title":"Rethinking Thinking Tokens: Understanding Why They Underperform in\n Practice","summary":" Thinking Tokens (TT) have been proposed as an unsupervised method to\nfacilitate reasoning in language models. However, despite their conceptual\nappeal, our findings show that TTs marginally improves performance and\nconsistently underperforms compared to Chain-of-Thought (CoT) reasoning across\nmultiple benchmarks. We hypothesize that this underperformance stems from the\nreliance on a single embedding for TTs, which results in inconsistent learning\nsignals and introduces noisy gradients. 
This paper provides a comprehensive\nempirical analysis to validate this hypothesis and discusses the implications\nfor future research on unsupervised reasoning in LLMs.\n","authors":["Sreeram Vennam","David Valente","David Herel","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2411.11371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00499v3","updated":"2024-11-18T08:33:35Z","published":"2024-06-29T17:33:07Z","title":"ConU: Conformal Uncertainty in Large Language Models with Correctness\n Coverage Guarantees","summary":" Uncertainty quantification (UQ) in natural language generation (NLG) tasks\nremains an open challenge, exacerbated by the closed-source nature of the\nlatest large language models (LLMs). This study investigates applying conformal\nprediction (CP), which can transform any heuristic uncertainty notion into\nrigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We\nintroduce a novel uncertainty measure based on self-consistency theory, and\nthen develop a conformal uncertainty criterion by integrating the uncertainty\ncondition aligned with correctness into the CP algorithm. Empirical evaluations\nindicate that our uncertainty measure outperforms prior state-of-the-art\nmethods. Furthermore, we achieve strict control over the correctness coverage\nrate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning\ngeneral-purpose and medical scenarios. 
Additionally, the calibrated prediction\nsets with small size further highlights the efficiency of our method in\nproviding trustworthy guarantees for practical open-ended NLG applications.\n","authors":["Zhiyuan Wang","Jinhao Duan","Lu Cheng","Yue Zhang","Qingni Wang","Xiaoshuang Shi","Kaidi Xu","Hengtao Shen","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.00499v3.pdf","comment":"Accepted by EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2411.11362v1","updated":"2024-11-18T08:13:22Z","published":"2024-11-18T08:13:22Z","title":"MAIRA-Seg: Enhancing Radiology Report Generation with Segmentation-Aware\n Multimodal Large Language Models","summary":" There is growing interest in applying AI to radiology report generation,\nparticularly for chest X-rays (CXRs). This paper investigates whether\nincorporating pixel-level information through segmentation masks can improve\nfine-grained image interpretation of multimodal large language models (MLLMs)\nfor radiology report generation. We introduce MAIRA-Seg, a segmentation-aware\nMLLM framework designed to utilize semantic segmentation masks alongside CXRs\nfor generating radiology reports. We train expert segmentation models to obtain\nmask pseudolabels for radiology-specific structures in CXRs. Subsequently,\nbuilding on the architectures of MAIRA, a CXR-specialised model for report\ngeneration, we integrate a trainable segmentation tokens extractor that\nleverages these mask pseudolabels, and employ mask-aware prompting to generate\ndraft radiology reports. Our experiments on the publicly available MIMIC-CXR\ndataset show that MAIRA-Seg outperforms non-segmentation baselines. We also\ninvestigate set-of-marks prompting with MAIRA and find that MAIRA-Seg\nconsistently demonstrates comparable or superior performance. 
The results\nconfirm that using segmentation masks enhances the nuanced reasoning of MLLMs,\npotentially contributing to better clinical outcomes.\n","authors":["Harshita Sharma","Valentina Salvatelli","Shaury Srivastav","Kenza Bouzid","Shruthi Bannur","Daniel C. Castro","Maximilian Ilse","Sam Bond-Taylor","Mercy Prasanna Ranjit","Fabian Falck","Fernando Pérez-García","Anton Schwaighofer","Hannah Richardson","Maria Teodora Wetscherek","Stephanie L. Hyland","Javier Alvarez-Valle"],"pdf_url":"https://arxiv.org/pdf/2411.11362v1.pdf","comment":"Accepted as Proceedings Paper at ML4H 2024"},{"id":"http://arxiv.org/abs/2407.11418v2","updated":"2024-11-18T08:01:24Z","published":"2024-07-16T06:19:14Z","title":"Semantic Operators: A Declarative Model for Rich, AI-based Analytics\n Over Text Data","summary":" The semantic capabilities of language models (LMs) have the potential to\nenable rich analytics and reasoning over vast knowledge corpora. Unfortunately,\nexisting systems lack high-level abstractions to perform bulk semantic queries\nacross large corpora. We introduce semantic operators, a declarative\nprogramming interface that extends the relational model with composable\nAI-based operations for bulk semantic queries (e.g., filtering, sorting,\njoining or aggregating records using natural language criteria). Each operator\ncan be implemented and optimized in multiple ways, opening a rich space for\nexecution plans similar to relational operators. We implement our operators in\nLOTUS, an open source query engine with a DataFrame API. Furthermore, we\ndevelop several novel optimizations that take advantage of the declarative\nnature of semantic operators to accelerate semantic filtering, clustering and\njoin operators by up to $400\\times$ while offering statistical accuracy\nguarantees. We demonstrate LOTUS' effectiveness on real AI applications\nincluding fact-checking, extreme multi-label classification, and search. 
We\nshow that the semantic operator model is expressive, capturing state-of-the-art\nAI pipelines in a few operator calls, and making it easy to express new\npipelines that achieve up to $180\\%$ higher quality. Overall, LOTUS queries\nmatch or exceed the accuracy of state-of-the-art AI pipelines for each task\nwhile running up to 28$\\times$ faster. LOTUS is publicly available at\nhttps://github.com/stanford-futuredata/lotus.\n","authors":["Liana Patel","Siddharth Jha","Parth Asawa","Melissa Pan","Carlos Guestrin","Matei Zaharia"],"pdf_url":"https://arxiv.org/pdf/2407.11418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02156v2","updated":"2024-11-18T07:36:36Z","published":"2024-10-03T02:36:30Z","title":"The why, what, and how of AI-based coding in scientific research","summary":" Computer programming (coding) is indispensable for researchers across\ndisciplines, yet it remains challenging to learn and time-consuming to carry\nout. Generative AI, particularly large language models (LLMs), has the\npotential to transform coding into intuitive conversations, but best practices\nand effective workflows are only emerging. We dissect AI-based coding through\nthree key lenses: the nature and role of LLMs in coding (why), six types of\ncoding assistance they provide (what), and a five-step workflow in action with\npractical implementation strategies (how). Additionally, we address the\nlimitations and future outlook of AI in coding. 
By offering actionable\ninsights, this framework helps to guide researchers in effectively leveraging\nAI to enhance coding practices and education, accelerating scientific progress.\n","authors":["Tonghe Zhuang","Zhicheng Lin"],"pdf_url":"https://arxiv.org/pdf/2410.02156v2.pdf","comment":"23 pages, 7 figure, 3 boxes"},{"id":"http://arxiv.org/abs/2411.11344v1","updated":"2024-11-18T07:33:10Z","published":"2024-11-18T07:33:10Z","title":"Mitigating Knowledge Conflicts in Language Model-Driven Question\n Answering","summary":" Knowledge-aware sequence to sequence generation tasks such as document\nquestion answering and abstract summarization typically requires two types of\nknowledge: encoded parametric knowledge and retrieved contextual information.\nPrevious work show improper correlation between parametric knowledge and\nanswers in the training set could cause the model ignore input information at\ntest time, resulting in un-desirable model behaviour such as over-stability and\nhallucination. In this work, we argue that hallucination could be mitigated via\nexplicit correlation between input source and generated content. We focus on a\ntypical example of hallucination, entity-based knowledge conflicts in question\nanswering, where correlation of entities and their description at training time\nhinders model behaviour during inference.\n","authors":["Han Cao","Zhaoyang Zhang","Xiangtian Li","Chufan Wu","Hansong Zhang","Wenqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08484v2","updated":"2024-11-18T07:32:16Z","published":"2024-03-13T12:50:23Z","title":"Targeted Efficient Fine-tuning: Optimizing Parameter Updates with\n Data-Driven Sample Selection","summary":" Fine-tuning all parameters of Large Language Models (LLMs) is computationally\nexpensive. Parameter-Efficient Fine-Tuning (PEFT) methods address this by\nselectively fine-tuning specific parameters. 
Most of the parameter efficient\nfine-tuning (PEFT) methods center on selecting or introducing a set of\nparameters to be fine-tuned. However, there are few methods that consider the\nimpact of data samples on parameter selecting. Representative data driven\nmethods include FISH Mask based method, which randomly selects a portion of\ndata samples as a basis when selecting parameters. However, this random data\nsample selection method cannot select optimal parameters for unstable data\ndistribution. In this work, we introduce a data-centric approach and propose\nthe Iterative Range Decreasing (IRD) algorithm to optimize the sample-parameter\npair selection in FISH Mask. IRD iteratively refines the selection by\nidentifying subsets of samples and parameters exhibiting higher Fisher\ninformation. We demonstrate the effectiveness and rationality of proposed\nstrategy by conducting experiments on GLUE benchmark. Experimental results show\nour strategy optimizes the parameter selection and achieves preferable\nperformance over some typical baseline methods.\n","authors":["Ming Dong","Kang Xue","Bolong Zheng","Tingting He"],"pdf_url":"https://arxiv.org/pdf/2403.08484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10918v2","updated":"2024-11-18T06:50:30Z","published":"2024-05-17T17:09:45Z","title":"A Framework for Leveraging Partially-Labeled Data for Product\n Attribute-Value Identification","summary":" In the e-commerce domain, the accurate extraction of attribute-value pairs\n(e.g., Brand: Apple) from product titles and user search queries is crucial for\nenhancing search and recommendation systems. A major challenge with neural\nmodels for this task is the lack of high-quality training data, as the\nannotations for attribute-value pairs in the available datasets are often\nincomplete. To address this, we introduce GenToC, a model designed for training\ndirectly with partially-labeled data, eliminating the necessity for a fully\nannotated dataset. 
GenToC employs a marker-augmented generative model to\nidentify potential attributes, followed by a token classification model that\ndetermines the associated values for each attribute. GenToC outperforms\nexisting state-of-the-art models, exhibiting upto 56.3% increase in the number\nof accurate extractions. Furthermore, we utilize GenToC to regenerate the\ntraining dataset to expand attribute-value annotations. This bootstrapping\nsubstantially improves the data quality for training other standard NER models,\nwhich are typically faster but less capable in handling partially-labeled data,\nenabling them to achieve comparable performance to GenToC. Our results\ndemonstrate GenToC's unique ability to learn from a limited set of\npartially-labeled data and improve the training of more efficient models,\nadvancing the automated extraction of attribute-value pairs. Finally, our model\nhas been successfully integrated into IndiaMART, India's largest B2B e-commerce\nplatform, achieving a significant increase of 20.2% in the number of correctly\nidentified attribute-value pairs over the existing deployed system while\nachieving a high precision of 89.5%.\n","authors":["D. Subhalingam","Keshav Kolluru"," Mausam","Saurabh Singal"],"pdf_url":"https://arxiv.org/pdf/2405.10918v2.pdf","comment":"Accepted to KDD 2025 ADS Track"},{"id":"http://arxiv.org/abs/2409.19979v3","updated":"2024-11-18T06:28:01Z","published":"2024-09-30T06:07:12Z","title":"Enhancing High-order Interaction Awareness in LLM-based Recommender\n Model","summary":" Large language models (LLMs) have demonstrated prominent reasoning\ncapabilities in recommendation tasks by transforming them into text-generation\ntasks. However, existing approaches either disregard or ineffectively model the\nuser-item high-order interactions. To this end, this paper presents an enhanced\nLLM-based recommender (ELMRec). 
We enhance whole-word embeddings to\nsubstantially enhance LLMs' interpretation of graph-constructed interactions\nfor recommendations, without requiring graph pre-training. This finding may\ninspire endeavors to incorporate rich knowledge graphs into LLM-based\nrecommenders via whole-word embedding. We also found that LLMs often recommend\nitems based on users' earlier interactions rather than recent ones, and present\na reranking solution. Our ELMRec outperforms state-of-the-art (SOTA) methods in\nboth direct and sequential recommendations.\n","authors":["Xinfeng Wang","Jin Cui","Fumiyo Fukumoto","Yoshimi Suzuki"],"pdf_url":"https://arxiv.org/pdf/2409.19979v3.pdf","comment":"Long paper accepted to EMNLP 2024 Main. 16 pages"},{"id":"http://arxiv.org/abs/2411.10020v2","updated":"2024-11-18T06:14:51Z","published":"2024-11-15T07:54:19Z","title":"Information Extraction from Clinical Notes: Are We Ready to Switch to\n Large Language Models?","summary":" Backgrounds: Information extraction (IE) is critical in clinical natural\nlanguage processing (NLP). While large language models (LLMs) excel on\ngenerative tasks, their performance on extractive tasks remains debated.\nMethods: We investigated Named Entity Recognition (NER) and Relation Extraction\n(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples,\nMIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical\nentities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3\nagainst BiomedBERT in terms of performance, generalizability, computational\nresources, and throughput to BiomedBERT. Results: LLaMA models outperformed\nBiomedBERT across datasets. With sufficient training data, LLaMA showed modest\nimprovements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited\ntraining data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7%\n(F1) on NER and 4% on RE. 
However, LLaMA models required more computing\nresources and ran up to 28 times slower. We implemented \"Kiwi,\" a clinical IE\npackage featuring both models, available at https://kiwi.clinicalnlp.org/.\nConclusion: This study is among the first to develop and evaluate a\ncomprehensive clinical IE system using open-source LLMs. Results indicate that\nLLaMA models outperform BiomedBERT for clinical NER and RE but with higher\ncomputational costs and lower throughputs. These findings highlight that\nchoosing between LLMs and traditional deep learning methods for clinical IE\napplications should remain task-specific, taking into account both performance\nmetrics and practical considerations such as available computing resources and\nthe intended use case scenarios.\n","authors":["Yan Hu","Xu Zuo","Yujia Zhou","Xueqing Peng","Jimin Huang","Vipina K. Keloth","Vincent J. Zhang","Ruey-Ling Weng","Qingyu Chen","Xiaoqian Jiang","Kirk E. Roberts","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11295v1","updated":"2024-11-18T05:41:27Z","published":"2024-11-18T05:41:27Z","title":"Transcending Language Boundaries: Harnessing LLMs for Low-Resource\n Language Translation","summary":" Large Language Models (LLMs) have demonstrated remarkable success across a\nwide range of tasks and domains. However, their performance in low-resource\nlanguage translation, particularly when translating into these languages,\nremains underexplored. This gap poses significant challenges, as linguistic\nbarriers hinder the cultural preservation and development of minority\ncommunities. To address this issue, this paper introduces a novel\nretrieval-based method that enhances translation quality for low-resource\nlanguages by focusing on key terms, which involves translating keywords and\nretrieving corresponding examples from existing data. 
To evaluate the\neffectiveness of this method, we conducted experiments translating from English\ninto three low-resource languages: Cherokee, a critically endangered indigenous\nlanguage of North America; Tibetan, a historically and culturally significant\nlanguage in Asia; and Manchu, a language with few remaining speakers. Our\ncomparison with the zero-shot performance of GPT-4o and LLaMA 3.1 405B,\nhighlights the significant challenges these models face when translating into\nlow-resource languages. In contrast, our retrieval-based method shows promise\nin improving both word-level accuracy and overall semantic understanding by\nleveraging existing resources more effectively.\n","authors":["Peng Shu","Junhao Chen","Zhengliang Liu","Hui Wang","Zihao Wu","Tianyang Zhong","Yiwei Li","Huaqin Zhao","Hanqi Jiang","Yi Pan","Yifan Zhou","Constance Owl","Xiaoming Zhai","Ninghao Liu","Claudio Saunt","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03816v3","updated":"2024-11-18T05:36:16Z","published":"2024-06-06T07:40:00Z","title":"ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search","summary":" Recent methodologies in LLM self-training mostly rely on LLM generating\nresponses and filtering those with correct output answers as training data.\nThis approach often yields a low-quality fine-tuning training set (e.g.,\nincorrect plans or intermediate reasoning). 
In this paper, we develop a\nreinforced self-training approach, called ReST-MCTS*, based on integrating\nprocess reward guidance with tree search MCTS* for collecting higher-quality\nreasoning traces as well as per-step value to train policy and reward models.\nReST-MCTS* circumvents the per-step manual annotation typically used to train\nprocess rewards by tree-search-based reinforcement learning: Given oracle final\ncorrect answers, ReST-MCTS* is able to infer the correct process rewards by\nestimating the probability this step can help lead to the correct answer. These\ninferred rewards serve dual purposes: they act as value targets for further\nrefining the process reward model and also facilitate the selection of\nhigh-quality traces for policy model self-training. We first show that the\ntree-search policy in ReST-MCTS* achieves higher accuracy compared with prior\nLLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same\nsearch budget. We then show that by using traces searched by this tree-search\npolicy as training data, we can continuously enhance the three language models\nfor multiple iterations, and outperform other self-training algorithms such as\nReST$^\\text{EM}$ and Self-Rewarding LM. We release all code at\nhttps://github.com/THUDM/ReST-MCTS.\n","authors":["Dan Zhang","Sining Zhoubian","Ziniu Hu","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03816v3.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.07950v3","updated":"2024-11-18T05:30:50Z","published":"2024-01-15T20:22:21Z","title":"SciInstruct: a Self-Reflective Instruction Annotated Dataset for\n Training Scientific Language Models","summary":" Large Language Models (LLMs) have shown promise in assisting scientific\ndiscovery. 
However, such applications are currently limited by LLMs'\ndeficiencies in understanding intricate scientific concepts, deriving symbolic\nequations, and solving advanced numerical calculations. To bridge these gaps,\nwe introduce SciInstruct, a suite of scientific instructions for training\nscientific language models capable of college-level scientific reasoning.\nCentral to our approach is a novel self-reflective instruction annotation\nframework to address the data scarcity challenge in the science domain. This\nframework leverages existing LLMs to generate step-by-step reasoning for\nunlabelled scientific questions, followed by a process of self-reflective\ncritic-and-revise. Applying this framework, we curated a diverse and\nhigh-quality dataset encompassing physics, chemistry, math, and formal proofs.\nWe analyze the curated SciInstruct from multiple interesting perspectives\n(e.g., domain, scale, source, question type, answer length, etc.). To verify\nthe effectiveness of SciInstruct, we fine-tuned different language models with\nSciInstruct, i.e., ChatGLM3 (6B and 32B), Llama3-8B-Instruct, and Mistral-7B:\nMetaMath, enhancing their scientific and mathematical reasoning capabilities,\nwithout sacrificing the language understanding capabilities of the base model.\nWe release all codes and SciInstruct at https://github.com/THUDM/SciGLM.\n","authors":["Dan Zhang","Ziniu Hu","Sining Zhoubian","Zhengxiao Du","Kaiyu Yang","Zihan Wang","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2401.07950v3.pdf","comment":"Accepted to NeurIPS D&B Track 2024"},{"id":"http://arxiv.org/abs/2410.12311v3","updated":"2024-11-18T05:23:42Z","published":"2024-10-16T07:24:28Z","title":"Open Domain Question Answering with Conflicting Contexts","summary":" Open domain question answering systems frequently rely on information\nretrieved from large collections of text (such as the Web) to answer questions.\nHowever, such collections of text often contain conflicting 
information, and\nindiscriminately depending on this information may result in untruthful and\ninaccurate answers. To understand the gravity of this problem, we collect a\nhuman-annotated dataset, Question Answering with Conflicting Contexts (QACC),\nand find that as much as 25% of unambiguous, open domain questions can lead to\nconflicting contexts when retrieved using Google Search. We evaluate and\nbenchmark three powerful Large Language Models (LLMs) with our dataset QACC and\ndemonstrate their limitations in effectively addressing questions with\nconflicting information. To explore how humans reason through conflicting\ncontexts, we request our annotators to provide explanations for their\nselections of correct answers. We demonstrate that by finetuning LLMs to\nexplain their answers, we can introduce richer information into their training\nthat guide them through the process of reasoning with conflicting contexts.\n","authors":["Siyi Liu","Qiang Ning","Kishaloy Halder","Wei Xiao","Zheng Qi","Phu Mon Htut","Yi Zhang","Neha Anna John","Bonan Min","Yassine Benajiba","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2410.12311v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11289v1","updated":"2024-11-18T05:17:27Z","published":"2024-11-18T05:17:27Z","title":"LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large\n Language Models","summary":" Creating high-quality, large-scale datasets for large language models (LLMs)\noften relies on resource-intensive, GPU-accelerated models for quality\nfiltering, making the process time-consuming and costly. This dependence on\nGPUs limits accessibility for organizations lacking significant computational\ninfrastructure. 
To address this issue, we introduce the Lightweight,\nPurpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs\nto streamline the processes of dataset extraction, filtering, and curation.\nBased on our four core principles, the LP Data Pipeline significantly reduces\npreparation time and cost while maintaining high data quality. Importantly, our\npipeline enables the creation of purpose-driven datasets tailored to specific\ndomains and languages, enhancing the applicability of LLMs in specialized\ncontexts. We anticipate that our pipeline will lower the barriers to LLM\ndevelopment, enabling a wide range of organizations to access LLMs more easily.\n","authors":["Yungi Kim","Hyunsoo Ha","Seonghoon Yang","Sukyung Lee","Jihoo Kim","Chanjun Park"],"pdf_url":"https://arxiv.org/pdf/2411.11289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15051v5","updated":"2024-11-18T03:55:02Z","published":"2023-07-27T17:56:56Z","title":"Matching Patients to Clinical Trials with Large Language Models","summary":" Patient recruitment is challenging for clinical trials. We introduce\nTrialGPT, an end-to-end framework for zero-shot patient-to-trial matching with\nlarge language models. TrialGPT comprises three modules: it first performs\nlarge-scale filtering to retrieve candidate trials (TrialGPT-Retrieval); then\npredicts criterion-level patient eligibility (TrialGPT-Matching); and finally\ngenerates trial-level scores (TrialGPT-Ranking). We evaluate TrialGPT on three\ncohorts of 183 synthetic patients with over 75,000 trial annotations.\nTrialGPT-Retrieval can recall over 90% of relevant trials using less than 6% of\nthe initial collection. Manual evaluations on 1,015 patient-criterion pairs\nshow that TrialGPT-Matching achieves an accuracy of 87.3% with faithful\nexplanations, close to the expert performance. 
The TrialGPT-Ranking scores are\nhighly correlated with human judgments and outperform the best-competing models\nby 43.8% in ranking and excluding trials. Furthermore, our user study reveals\nthat TrialGPT can reduce the screening time by 42.6% in patient recruitment.\nOverall, these results have demonstrated promising opportunities for\npatient-to-trial matching with TrialGPT.\n","authors":["Qiao Jin","Zifeng Wang","Charalampos S. Floudas","Fangyuan Chen","Changlin Gong","Dara Bracken-Clarke","Elisabetta Xue","Yifan Yang","Jimeng Sun","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.15051v5.pdf","comment":"Nature Communications"},{"id":"http://arxiv.org/abs/2411.11266v1","updated":"2024-11-18T03:45:34Z","published":"2024-11-18T03:45:34Z","title":"VersaTune: Fine-Tuning Multi-Ability LLMs Efficiently","summary":" Large Language Models (LLMs) exhibit remarkable capabilities in handling\nmultiple tasks across domains due to their emergent properties. These\ncapabilities are further augmented during the Supervised Fine-Tuning (SFT)\nphase. Despite their potential, existing work mainly focuses on domain-specific\nenhancements during fine-tuning, the challenge of which lies in catastrophic\nforgetting of knowledge across other domains. In this study, we introduce\nVersaTune, a novel data composition framework designed for enhancing LLMs'\noverall multi-ability performances during fine-tuning. We categorize knowledge\ninto distinct domains including law, medicine, finance, science, code. We begin\nwith detecting the distribution of domain-specific knowledge within the base\nmodel, followed by the composition of training data that aligns with the\nmodel's existing knowledge distribution. During the fine-tuning process,\nweights of different domains are dynamically adjusted based on their learnable\npotential and forgetting degree. 
Experimental results demonstrate that\nVersaTune achieves significant improvements in multi-domain performance, with a\n35.21% enhancement in comprehensive multi-domain tasks. Additionally, in\nscenarios where specific domain optimization is required, VersaTune reduces the\ndegradation of performance in other domains by 38.77%, without compromising the\ntarget domain's training efficacy.\n","authors":["Keer Lu","Keshi Zhao","Zheng Liang","Da Pan","Shusen Zhang","Xin Wu","Weipeng Chen","Zenan Zhou","Guosheng Dong","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11260v1","updated":"2024-11-18T03:29:48Z","published":"2024-11-18T03:29:48Z","title":"Large corpora and large language models: a replicable method for\n automating grammatical annotation","summary":" Much linguistic research relies on annotated datasets of features extracted\nfrom text corpora, but the rapid quantitative growth of these corpora has\ncreated practical difficulties for linguists to manually annotate large data\nsamples. In this paper, we present a replicable, supervised method that\nleverages large language models for assisting the linguist in grammatical\nannotation through prompt engineering, training, and evaluation. We introduce a\nmethodological pipeline applied to the case study of formal variation in the\nEnglish evaluative verb construction 'consider X (as) (to be) Y', based on the\nlarge language model Claude 3.5 Sonnet and corpus data from Davies' NOW and\nEnTenTen21 (SketchEngine). Overall, we reach a model accuracy of over 90% on\nour held-out test samples with only a small amount of training data, validating\nthe method for the annotation of very large quantities of tokens of the\nconstruction in the future. 
We discuss the generalisability of our results for\na wider range of case studies of grammatical constructions and grammatical\nvariation and change, underlining the value of AI copilots as tools for future\nlinguistic research.\n","authors":["Cameron Morin","Matti Marttinen Larsson"],"pdf_url":"https://arxiv.org/pdf/2411.11260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02387v5","updated":"2024-11-18T03:17:32Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. 
This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen","Ming Li","Lawrence KQ Yan","Yichao Zhang","Caitlyn Heqi Yin","Cheng Fei","Tianyang Wang","Yunze Wang","Silin Chen"],"pdf_url":"https://arxiv.org/pdf/2409.02387v5.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2405.18634v2","updated":"2024-11-18T02:42:23Z","published":"2024-05-28T22:33:02Z","title":"A Theoretical Understanding of Self-Correction through In-context\n Alignment","summary":" Going beyond mimicking limited human experiences, recent studies show initial\nevidence that, like humans, large language models (LLMs) are capable of\nimproving their abilities purely by self-correction, i.e., correcting previous\nresponses through self-examination, in certain circumstances. Nevertheless,\nlittle is known about how such capabilities arise. In this work, based on a\nsimplified setup akin to an alignment task, we theoretically analyze\nself-correction from an in-context learning perspective, showing that when LLMs\ngive relatively accurate self-examinations as rewards, they are capable of\nrefining responses in an in-context way. Notably, going beyond previous\ntheories on over-simplified linear transformers, our theoretical construction\nunderpins the roles of several key designs of realistic transformers for\nself-correction: softmax attention, multi-head attention, and the MLP block. We\nvalidate these findings extensively on synthetic datasets. Inspired by these\nfindings, we also illustrate novel applications of self-correction, such as\ndefending against LLM jailbreaks, where a simple self-correction step does make\na large difference. 
We believe that these findings will inspire further\nresearch on understanding, exploiting, and enhancing self-correction for\nbuilding better foundation models.\n","authors":["Yifei Wang","Yuyang Wu","Zeming Wei","Stefanie Jegelka","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2405.18634v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11247v1","updated":"2024-11-18T02:35:15Z","published":"2024-11-18T02:35:15Z","title":"ZeFaV: Boosting Large Language Models for Zero-shot Fact Verification","summary":" In this paper, we propose ZeFaV - a zero-shot based fact-checking\nverification framework to enhance the performance on fact verification task of\nlarge language models by leveraging the in-context learning ability of large\nlanguage models to extract the relations among the entities within a claim,\nre-organized the information from the evidence in a relationally logical form,\nand combine the above information with the original evidence to generate the\ncontext from which our fact-checking model provide verdicts for the input\nclaims. We conducted empirical experiments to evaluate our approach on two\nmulti-hop fact-checking datasets including HoVer and FEVEROUS, and achieved\npotential results results comparable to other state-of-the-art fact\nverification task methods.\n","authors":["Son T. Luu","Hiep Nguyen","Trung Vo","Le-Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.11247v1.pdf","comment":"This pre-print has been published in PRICAI 2024: Trends in\n Artificial Intelligence. 
The published version is available at\n https://doi.org/10.1007/978-981-96-0119-6_28"},{"id":"http://arxiv.org/abs/2406.10806v2","updated":"2024-11-18T02:19:02Z","published":"2024-06-16T05:17:56Z","title":"ptt5-v2: A Closer Look at Continued Pretraining of T5 Models for the\n Portuguese Language","summary":" Despite advancements in Natural Language Processing (NLP) and the growing\navailability of pretrained models, the English language remains the primary\nfocus of model development. Continued pretraining on language-specific corpora\nprovides a practical solution for adapting models to other languages. However,\nthe impact of different pretraining settings on downstream tasks remains\nunderexplored. This work introduces $\\texttt{ptt5-v2}$, investigating the\ncontinued pretraining of T5 models for Portuguese. We first develop a baseline\nset of settings and pretrain models with sizes up to 3B parameters. Finetuning\non three Portuguese downstream tasks (assin2 STS, assin2 RTE, and TweetSentBR)\nyields SOTA results on the latter two. We then explore the effects of different\npretraining configurations, including pretraining data quality, optimization\nstrategies, and multi-epoch pretraining. Perhaps surprisingly, their impact\nremains subtle compared to our baseline. We release $\\texttt{ptt5-v2}$\npretrained checkpoints and their MonoT5-based finetuned $\\texttt{MonoPTT5}$\nrerankers on HuggingFace in their respective collections at\n\\url{https://huggingface.co/unicamp-dl}.\n","authors":["Marcos Piau","Roberto Lotufo","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2406.10806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15297v2","updated":"2024-11-18T02:13:31Z","published":"2024-10-20T05:57:10Z","title":"Redefining Proactivity for Information Seeking Dialogue","summary":" Information-Seeking Dialogue (ISD) agents aim to provide accurate responses\nto user queries. 
While proficient in directly addressing user queries, these\nagents, as well as LLMs in general, predominantly exhibit reactive behavior,\nlacking the ability to generate proactive responses that actively engage users\nin sustained conversations. However, existing definitions of proactive dialogue\nin this context do not focus on how each response actively engages the user and\nsustains the conversation. Hence, we present a new definition of proactivity\nthat focuses on enhancing the `proactiveness' of each generated response via\nthe introduction of new information related to the initial query. To this end,\nwe construct a proactive dialogue dataset comprising 2,000 single-turn\nconversations, and introduce several automatic metrics to evaluate response\n`proactiveness' which achieved high correlation with human annotation.\nAdditionally, we introduce two innovative Chain-of-Thought (CoT) prompts, the\n3-step CoT and the 3-in-1 CoT prompts, which consistently outperform standard\nprompts by up to 90% in the zero-shot setting.\n","authors":["Jing Yang Lee","Seokhwan Kim","Kartik Mehta","Jiun-Yu Kao","Yu-Hsiang Lin","Arpit Gupta"],"pdf_url":"https://arxiv.org/pdf/2410.15297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11235v1","updated":"2024-11-18T02:09:48Z","published":"2024-11-18T02:09:48Z","title":"MEMO-Bench: A Multiple Benchmark for Text-to-Image and Multimodal Large\n Language Models on Human Emotion Analysis","summary":" Artificial Intelligence (AI) has demonstrated significant capabilities in\nvarious fields, and in areas such as human-computer interaction (HCI), embodied\nintelligence, and the design and animation of virtual digital humans, both\npractitioners and users are increasingly concerned with AI's ability to\nunderstand and express emotion. Consequently, the question of whether AI can\naccurately interpret human emotions remains a critical challenge. 
To date, two\nprimary classes of AI models have been involved in human emotion analysis:\ngenerative models and Multimodal Large Language Models (MLLMs). To assess the\nemotional capabilities of these two classes of models, this study introduces\nMEMO-Bench, a comprehensive benchmark consisting of 7,145 portraits, each\ndepicting one of six different emotions, generated by 12 Text-to-Image (T2I)\nmodels. Unlike previous works, MEMO-Bench provides a framework for evaluating\nboth T2I models and MLLMs in the context of sentiment analysis. Additionally, a\nprogressive evaluation approach is employed, moving from coarse-grained to\nfine-grained metrics, to offer a more detailed and comprehensive assessment of\nthe sentiment analysis capabilities of MLLMs. The experimental results\ndemonstrate that existing T2I models are more effective at generating positive\nemotions than negative ones. Meanwhile, although MLLMs show a certain degree of\neffectiveness in distinguishing and recognizing human emotions, they fall short\nof human-level accuracy, particularly in fine-grained emotion analysis. The\nMEMO-Bench will be made publicly available to support further research in this\narea.\n","authors":["Yingjie Zhou","Zicheng Zhang","Jiezhang Cao","Jun Jia","Yanwei Jiang","Farong Wen","Xiaohong Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2411.11235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19483v3","updated":"2024-11-18T01:14:03Z","published":"2024-09-28T23:10:37Z","title":"MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation","summary":" Segmentation of anatomical structures and pathological regions in medical\nimages is essential for modern clinical diagnosis, disease research, and\ntreatment planning. While significant advancements have been made in deep\nlearning-based segmentation techniques, many of these methods still suffer from\nlimitations in data efficiency, generalizability, and interactivity. 
As a\nresult, developing precise segmentation methods that require fewer labeled\ndatasets remains a critical challenge in medical image analysis. Recently, the\nintroduction of foundation models like CLIP and Segment-Anything-Model (SAM),\nwith robust cross-domain representations, has paved the way for interactive and\nuniversal image segmentation. However, further exploration of these models for\ndata-efficient segmentation in medical imaging is still needed and highly\nrelevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that\nintegrates the CLIP and SAM models to perform segmentation on clinical scans\nusing text prompts, in both zero-shot and weakly supervised settings. Our\napproach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard\nNegative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the\nMulti-modal Information Bottleneck (M2IB) to create visual prompts for\ngenerating segmentation masks from SAM in the zero-shot setting. We also\ninvestigate using zero-shot segmentation labels within a weakly supervised\nparadigm to enhance segmentation quality further. Extensive testing across four\ndiverse segmentation tasks and medical imaging modalities (breast tumor\nultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high\naccuracy of our proposed framework. Our code is available at\nhttps://github.com/HealthX-Lab/MedCLIP-SAMv2.\n","authors":["Taha Koleilat","Hojat Asgariandehkordi","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.19483v3.pdf","comment":"10 pages, 2 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.12103v1","updated":"2024-11-18T22:31:17Z","published":"2024-11-18T22:31:17Z","title":"Does Unlearning Truly Unlearn? A Black Box Evaluation of LLM Unlearning\n Methods","summary":" Large language model unlearning aims to remove harmful information that LLMs\nhave learnt to prevent their use for malicious purposes. 
LLMU and RMU have been\nproposed as two methods for LLM unlearning, achieving impressive results on\nunlearning benchmarks. We study in detail the efficacy of these methods by\nevaluating their impact on general model capabilities on the WMDP benchmark as\nwell as a biology benchmark we create. Our experiments show that RMU generally\nleads to better preservation of model capabilities, for similar or better\nunlearning. We further test the robustness of these methods and find that doing\n5-shot prompting or rephrasing the question in simple ways can lead to an over\nten-fold increase in accuracy on unlearning benchmarks. Finally, we show that\ntraining on unrelated data can almost completely recover pre-unlearning\nperformance, demonstrating that these methods fail at truly unlearning. The\ncode is available at\n$\\href{https://github.com/JaiDoshi/Knowledge-Erasure}{this\\, https\\, URL}$.\n","authors":["Jai Doshi","Asa Cooper Stickland"],"pdf_url":"https://arxiv.org/pdf/2411.12103v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.12074v1","updated":"2024-11-18T21:36:44Z","published":"2024-11-18T21:36:44Z","title":"Mitigating Gender Bias in Contextual Word Embeddings","summary":" Word embeddings have been shown to produce remarkable results in tackling a\nvast majority of NLP related tasks. Unfortunately, word embeddings also capture\nthe stereotypical biases that are prevalent in society, affecting the\npredictive performance of the embeddings when used in downstream tasks. While\nvarious techniques have been proposed \\cite{bolukbasi2016man, zhao2018learning}\nand criticized\\cite{gonen2019lipstick} for static embeddings, very little work\nhas focused on mitigating bias in contextual embeddings. In this paper, we\npropose a novel objective function for MLM(Masked-Language Modeling) which\nlargely mitigates the gender bias in contextual embeddings and also preserves\nthe performance for downstream tasks. 
Since previous works on measuring bias in\ncontextual embeddings lack in normative reasoning, we also propose novel\nevaluation metrics that are straight-forward and aligned with our motivations\nin debiasing. We also propose new methods for debiasing static embeddings and\nprovide empirical proof via extensive analysis and experiments, as to why the\nmain source of bias in static embeddings stems from the presence of\nstereotypical names rather than gendered words themselves. All experiments and\nembeddings studied are in English, unless otherwise\nspecified.\\citep{bender2011achieving}.\n","authors":["Navya Yarrabelly","Vinay Damodaran","Feng-Guang Su"],"pdf_url":"https://arxiv.org/pdf/2411.12074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12056v1","updated":"2024-11-18T20:54:17Z","published":"2024-11-18T20:54:17Z","title":"Benchmarking pre-trained text embedding models in aligning built asset\n information","summary":" Accurate mapping of the built asset information to established data\nclassification systems and taxonomies is crucial for effective asset\nmanagement, whether for compliance at project handover or ad-hoc data\nintegration scenarios. Due to the complex nature of built asset data, which\npredominantly comprises technical text elements, this process remains largely\nmanual and reliant on domain expert input. Recent breakthroughs in contextual\ntext representation learning (text embedding), particularly through pre-trained\nlarge language models, offer promising approaches that can facilitate the\nautomation of cross-mapping of the built asset data. However, no comprehensive\nevaluation has yet been conducted to assess these models' ability to\neffectively represent the complex semantics specific to built asset technical\nterminology. This study presents a comparative benchmark of state-of-the-art\ntext embedding models to evaluate their effectiveness in aligning built asset\ninformation with domain-specific technical concepts. 
Our proposed datasets are\nderived from two renowned built asset data classification dictionaries. The\nresults of our benchmarking across six proposed datasets, covering three tasks\nof clustering, retrieval, and reranking, highlight the need for future research\non domain adaptation techniques. The benchmarking resources are published as an\nopen-source library, which will be maintained and extended to support future\nevaluations in this field.\n","authors":["Mehrzad Shahinmoghadam","Ali Motamedi"],"pdf_url":"https://arxiv.org/pdf/2411.12056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05981v4","updated":"2024-11-18T20:18:32Z","published":"2024-06-10T02:47:55Z","title":"ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training\n Multiplication-Less Reparameterization","summary":" Large language models (LLMs) have shown impressive performance on language\ntasks but face challenges when deployed on resource-constrained devices due to\ntheir extensive parameters and reliance on dense multiplications, resulting in\nhigh memory demands and latency bottlenecks. Shift-and-add reparameterization\noffers a promising solution by replacing costly multiplications with\nhardware-friendly primitives in both the attention and multi-layer perceptron\n(MLP) layers of an LLM. However, current reparameterization techniques require\ntraining from scratch or full parameter fine-tuning to restore accuracy, which\nis resource-intensive for LLMs. To address this, we propose accelerating\npretrained LLMs through post-training shift-and-add reparameterization,\ncreating efficient multiplication-free models, dubbed ShiftAddLLM.\nSpecifically, we quantize each weight matrix into binary matrices paired with\ngroup-wise scaling factors. The associated multiplications are reparameterized\ninto (1) shifts between activations and scaling factors and (2) queries and\nadds according to the binary matrices. 
To reduce accuracy loss, we present a\nmulti-objective optimization method to minimize both weight and output\nactivation reparameterization errors. Additionally, based on varying\nsensitivity across layers to reparameterization, we develop an automated bit\nallocation strategy to further reduce memory usage and latency. Experiments on\nfive LLM families and eight tasks consistently validate the effectiveness of\nShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points\nat comparable or lower latency compared to the most competitive quantized LLMs\nat 3 and 2 bits, respectively, and more than 80% memory and energy reductions\nover the original LLMs. Codes and models are available at\nhttps://github.com/GATECH-EIC/ShiftAddLLM.\n","authors":["Haoran You","Yipin Guo","Yichao Fu","Wei Zhou","Huihong Shi","Xiaofan Zhang","Souvik Kundu","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.05981v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.12000v1","updated":"2024-11-18T19:36:26Z","published":"2024-11-18T19:36:26Z","title":"ByteScience: Bridging Unstructured Scientific Literature and Structured\n Data with Auto Fine-tuned Large Language Model in Token Granularity","summary":" Natural Language Processing (NLP) is widely used to supply summarization\nability from long context to structured information. However, extracting\nstructured knowledge from scientific text by NLP models remains a challenge\nbecause of its domain-specific nature to complex data preprocessing and the\ngranularity of multi-layered device-level information. To address this, we\nintroduce ByteScience, a non-profit cloud-based auto fine-tuned Large Language\nModel (LLM) platform, which is designed to extract structured scientific data\nand synthesize new scientific knowledge from vast scientific corpora. The\nplatform capitalizes on DARWIN, an open-source, fine-tuned LLM dedicated to\nnatural science. 
The platform was built on Amazon Web Services (AWS) and\nprovides an automated, user-friendly workflow for custom model development and\ndata extraction. The platform achieves remarkable accuracy with only a small\namount of well-annotated articles. This innovative tool streamlines the\ntransition from the science literature to structured knowledge and data and\nbenefits the advancements in natural informatics.\n","authors":["Tong Xie","Hanzhi Zhang","Shaozhou Wang","Yuwei Wan","Imran Razzak","Chunyu Kit","Wenjie Zhangand Bram Hoex"],"pdf_url":"https://arxiv.org/pdf/2411.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11984v1","updated":"2024-11-18T19:14:36Z","published":"2024-11-18T19:14:36Z","title":"Understanding Chain-of-Thought in LLMs through Information Theory","summary":" Large Language Models (LLMs) have shown impressive performance in complex\nreasoning tasks through Chain-of-Thought (CoT) reasoning, allowing models to\nbreak down problems into manageable sub-tasks. However, existing CoT evaluation\ntechniques either require annotated CoT data or fall short in accurately\nassessing intermediate reasoning steps, leading to high rates of false\npositives. In this paper, we formalize CoT reasoning in LLMs through an\ninformation-theoretic lens. Specifically, our framework quantifies the\n`information gain' at each reasoning step, enabling the identification of\nfailure modes in LLMs without the need for expensive annotated datasets. 
We\ndemonstrate the efficacy of our approach through extensive experiments on toy\nand GSM-8K data, where it significantly outperforms existing outcome-based\nmethods by providing more accurate insights into model performance on\nindividual tasks.\n","authors":["Jean-Francois Ton","Muhammad Faaiz Taufiq","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.11984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13360v2","updated":"2024-11-18T15:35:14Z","published":"2024-10-17T09:10:26Z","title":"Retrieval-Augmented Personalization for Multimodal Large Language Models","summary":" The development of large language models (LLMs) has significantly enhanced\nthe capabilities of multimodal LLMs (MLLMs) as general assistants. However,\nlack of user-specific knowledge still restricts their application in human's\ndaily life. In this paper, we introduce the Retrieval Augmented Personalization\n(RAP) framework for MLLMs' personalization. Starting from a general MLLM, we\nturn it into a personalized assistant in three steps. (a) Remember: We design a\nkey-value database to store user-related information, e.g., user's name, avatar\nand other attributes. (b) Retrieve: When the user initiates a conversation, RAP\nwill retrieve relevant information from the database using a multimodal\nretriever. (c) Generate: The input query and retrieved concepts' information\nare fed into MLLMs to generate personalized, knowledge-augmented responses.\nUnlike previous methods, RAP allows real-time concept editing via updating the\nexternal database. To further improve generation quality and alignment with\nuser-specific information, we design a pipeline for data collection and create\na specialized dataset for personalized training of MLLMs. Based on the dataset,\nwe train a series of MLLMs as personalized multimodal assistants. By\npretraining on large-scale dataset, RAP-MLLMs can generalize to infinite visual\nconcepts without additional finetuning. 
Our models demonstrate outstanding\nflexibility and generation quality across a variety of tasks, such as\npersonalized image captioning, question answering and visual recognition. The\ncode, data and models are available at https://github.com/Hoar012/RAP-MLLM.\n","authors":["Haoran Hao","Jiaming Han","Changsheng Li","Yu-Feng Li","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2410.13360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11932v1","updated":"2024-11-18T14:28:04Z","published":"2024-11-18T14:28:04Z","title":"Reviving Dormant Memories: Investigating Catastrophic Forgetting in\n Language Models through Rationale-Guidance Difficulty","summary":" Although substantial efforts have been made to mitigate catastrophic\nforgetting in continual learning, the intrinsic mechanisms are not well\nunderstood. In this paper, we discover that when a forgetting model passively\nreceives an externally provided partial appropriate rationale, its performance\non the forgotten task can be restored. Furthermore, by simply adding a\ntask-agnostic prefix to the original instruction, the forgetting model can\nactively generate an appropriate rationale to reach the correct answer. These\nfindings suggest that the model does not actually ``forget'' the task\nknowledge; instead, the degraded performance can be attributed to the failure\nof the original instructions in guiding the model to generate the appropriate\nrationales. Based on this insight, we propose the Rationale-Guidance Difficulty\nmetric to evaluate how effectively a given instruction guides the model in\ngenerating appropriate rationales. 
We apply this metric to optimize the\nallocation of replay data in replay-based continual learning algorithm.\nExperimental results demonstrate that our data allocation method effectively\nmitigates catastrophic forgetting and maintains better model plasticity\nsimultaneously across models.\n","authors":["Huashan Sun","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2411.11932v1.pdf","comment":"Working in progress"}]},"2024-11-17T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.11196v1","updated":"2024-11-17T23:09:08Z","published":"2024-11-17T23:09:08Z","title":"PickScan: Object discovery and reconstruction from handheld interactions","summary":" Reconstructing compositional 3D representations of scenes, where each object\nis represented with its own 3D model, is a highly desirable capability in\nrobotics and augmented reality. However, most existing methods rely heavily on\nstrong appearance priors for object discovery, therefore only working on those\nclasses of objects on which the method has been trained, or do not allow for\nobject manipulation, which is necessary to scan objects fully and to guide\nobject discovery in challenging scenarios. We address these limitations with a\nnovel interaction-guided and class-agnostic method based on object\ndisplacements that allows a user to move around a scene with an RGB-D camera,\nhold up objects, and finally outputs one 3D model per held-up object. Our main\ncontribution to this end is a novel approach to detecting user-object\ninteractions and extracting the masks of manipulated objects. On a\ncustom-captured dataset, our pipeline discovers manipulated objects with 78.3%\nprecision at 100% recall and reconstructs them with a mean chamfer distance of\n0.90cm. 
Compared to Co-Fusion, the only comparable interaction-based and\nclass-agnostic baseline, this corresponds to a reduction in chamfer distance of\n73% while detecting 99% fewer false positives.\n","authors":["Vincent van der Brugge","Marc Pollefeys","Joshua B. Tenenbaum","Ayush Tewari","Krishna Murthy Jatavallabhula"],"pdf_url":"https://arxiv.org/pdf/2411.11196v1.pdf","comment":"7 pages, 8 figures, published in the 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2411.11192v1","updated":"2024-11-17T22:54:53Z","published":"2024-11-17T22:54:53Z","title":"Robot Metabolism: Towards machines that can grow by consuming other\n machines","summary":" Biological lifeforms can heal, grow, adapt, and reproduce -- abilities\nessential for sustained survival and development. In contrast, robots today are\nprimarily monolithic machines with limited ability to self-repair, physically\ndevelop, or incorporate material from their environments. A key challenge to\nsuch physical adaptation has been that while robot minds are rapidly evolving\nnew behaviors through AI, their bodies remain closed systems, unable to\nsystematically integrate new material to grow or heal. We argue that open-ended\nphysical adaptation is only possible when robots are designed using only a\nsmall repertoire of simple modules. This allows machines to mechanically adapt\nby consuming parts from other machines or their surroundings and shedding\nbroken components. We demonstrate this principle using a truss modular robot\nplatform composed of one-dimensional actuated bars. We show how robots in this\nspace can grow bigger, faster, and more capable by consuming materials from\ntheir environment and from other robots. We suggest that machine metabolic\nprocesses akin to the one demonstrated here will be an essential part of any\nsustained future robot ecology.\n","authors":["Philippe Martin Wyder","Riyaan Bakhda","Meiqi Zhao","Quinn A. 
Booth","Matthew E. Modi","Andrew Song","Simon Kang","Jiahao Wu","Priya Patel","Robert T. Kasumi","David Yi","Nihar Niraj Garg","Pranav Jhunjhunwala","Siddharth Bhutoria","Evan H. Tong","Yuhang Hu","Judah Goldfeder","Omer Mustel","Donghan Kim","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2411.11192v1.pdf","comment":"Manuscript combined with Supplementary Materials File for arXiv\n submission. Submitting to Journal and will update external DOI once available"},{"id":"http://arxiv.org/abs/2411.11182v1","updated":"2024-11-17T21:52:58Z","published":"2024-11-17T21:52:58Z","title":"Improving User Experience in Preference-Based Optimization of Reward\n Functions for Assistive Robots","summary":" Assistive robots interact with humans and must adapt to different users'\npreferences to be effective. An easy and effective technique to learn\nnon-expert users' preferences is through rankings of robot behaviors, for\nexample, robot movement trajectories or gestures. Existing techniques focus on\ngenerating trajectories for users to rank that maximize the outcome of the\npreference learning process. However, the generated trajectories do not appear\nto reflect the user's preference over repeated interactions. In this work, we\ndesign an algorithm to generate trajectories for users to rank that we call\nCovariance Matrix Adaptation Evolution Strategies with Information Gain\n(CMA-ES-IG). CMA-ES-IG prioritizes the user's experience of the preference\nlearning process. We show that users find our algorithm more intuitive and\neasier to use than previous approaches across both physical and social robot\ntasks. 
This project's code is hosted at github.com/interaction-lab/CMA-ES-IG\n","authors":["Nathaniel Dennler","Zhonghao Shi","Stefanos Nikolaidis","Maja Matarić"],"pdf_url":"https://arxiv.org/pdf/2411.11182v1.pdf","comment":"Accepted to ISRR"},{"id":"http://arxiv.org/abs/2411.11151v1","updated":"2024-11-17T18:53:20Z","published":"2024-11-17T18:53:20Z","title":"Person Segmentation and Action Classification for Multi-Channel\n Hemisphere Field of View LiDAR Sensors","summary":" Robots need to perceive persons in their surroundings for safety and to\ninteract with them. In this paper, we present a person segmentation and action\nclassification approach that operates on 3D scans of hemisphere field of view\nLiDAR sensors. We recorded a data set with an Ouster OSDome-64 sensor\nconsisting of scenes where persons perform three different actions and\nannotated it. We propose a method based on a MaskDINO model to detect and\nsegment persons and to recognize their actions from combined spherical\nprojected multi-channel representations of the LiDAR data with an additional\npositional encoding. Our approach demonstrates good performance for the person\nsegmentation task and further performs well for the estimation of the person\naction states walking, waving, and sitting. 
An ablation study provides insights\nabout the individual channel contributions for the person segmentation task.\nThe trained models, code and dataset are made publicly available.\n","authors":["Svetlana Seliunina","Artem Otelepko","Raphael Memmesheimer","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2411.11151v1.pdf","comment":"6 pages, 9 figures, 4 tables, accepted for publication at IEEE/SICE\n International Symposium on System Integration (SII), Munich, Germany, January\n 2025"},{"id":"http://arxiv.org/abs/2411.09658v2","updated":"2024-11-17T18:30:27Z","published":"2024-11-14T18:29:31Z","title":"Motion Before Action: Diffusing Object Motion as Manipulation Condition","summary":" Inferring object motion representations from observations enhances the\nperformance of robotic manipulation tasks. This paper introduces a new paradigm\nfor robot imitation learning that generates action sequences by reasoning about\nobject motion from visual observations. We propose MBA (Motion Before Action),\na novel module that employs two cascaded diffusion processes for object motion\ngeneration and robot action generation under object motion guidance. MBA first\npredicts the future pose sequence of the object based on observations, then\nuses this sequence as a condition to guide robot action generation. Designed as\na plug-and-play component, MBA can be flexibly integrated into existing robotic\nmanipulation policies with diffusion action heads. Extensive experiments in\nboth simulated and real-world environments demonstrate that our approach\nsubstantially improves the performance of existing policies across a wide range\nof manipulation tasks. 
Project page: https://selen-suyue.github.io/MBApage/\n","authors":["Yue Su","Xinyu Zhan","Hongjie Fang","Yong-Lu Li","Cewu Lu","Lixin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.09658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11142v1","updated":"2024-11-17T18:13:46Z","published":"2024-11-17T18:13:46Z","title":"Emergent Structure in Multi-agent Systems Using Geometric Embeddings","summary":" This work investigates the self-organization of multi-agent systems into\nclosed trajectories, a common requirement in unmanned aerial vehicle (UAV)\nsurveillance tasks. In such scenarios, smooth, unbiased control signals save\nenergy and mitigate mechanical strain. We propose a decentralized control\nsystem architecture that produces a globally stable emergent structure from\nlocal observations only; there is no requirement for agents to share a global\nplan or follow prescribed trajectories. Central to our approach is the\nformulation of an injective virtual embedding induced by rotations from the\nactual agent positions. This embedding serves as a structure-preserving map\naround which all agent stabilize their relative positions and permits the use\nof well-established linear control techniques. We construct the embedding such\nthat it is topologically equivalent to the desired trajectory (i.e., a\nhomeomorphism), thereby preserving the stability characteristics. We\ndemonstrate the versatility of this approach through implementation on a swarm\nof Quanser QDrone quadcopters. 
Results demonstrate the quadcopters\nself-organize into the desired trajectory while maintaining even separation.\n","authors":["Dimitria Silveria","Kleber Cabral","Peter Jardine","Sidney Givigi"],"pdf_url":"https://arxiv.org/pdf/2411.11142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07105v3","updated":"2024-11-17T15:07:34Z","published":"2023-11-13T06:40:31Z","title":"Collaborative Goal Tracking of Multiple Mobile Robots Based on Geometric\n Graph Neural Network","summary":" Multiple mobile robots play a significant role in various spatially\ndistributed tasks.In unfamiliar and non-repetitive scenarios, reconstructing\nthe global map is time-inefficient and sometimes unrealistic. Hence, research\nhas focused on achieving real-time collaborative planning by utilizing sensor\ndata from multiple robots located at different positions, all without relying\non a global map.This paper introduces a Multi-Robot collaborative Path Planning\nmethod based on Geometric Graph Neural Network (MRPP-GeoGNN). We extract the\nfeatures of each neighboring robot's sensory data and integrate the relative\npositions of neighboring robots into each interaction layer to incorporate\nobstacle information along with location details using geometric feature\nencoders. After that, a MLP layer is used to map the amalgamated local features\nto multiple forward directions for the robot's actual movement. We generated\nexpert data in ROS to train the network and carried out both simulations and\nphysical experiments to validate the effectiveness of the proposed method.\nSimulation results demonstrate an approximate 5% improvement in accuracy\ncompared to the model based solely on CNN on expert datasets. The success rate\nis enhanced by about 4% compared to CNN, and the flowtime increase is reduced\nby approximately 18% in the ROS test, surpassing other GNN models. 
Besides, the\nproposed method is able to leverage neighbor's information and greatly improves\npath efficiency in real-world scenarios.\n","authors":["Weining Lu","Qingquan Lin","Litong Meng","Chenxi Li","Bin Liang"],"pdf_url":"https://arxiv.org/pdf/2311.07105v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2005.07917v3","updated":"2024-11-17T14:41:01Z","published":"2020-05-16T09:12:39Z","title":"Gathering on a Circle with Limited Visibility by Anonymous Oblivious\n Robots","summary":" A swarm of anonymous oblivious mobile robots, operating in deterministic\nLook-Compute-Move cycles, is confined within a circular track. All robots agree\non the clockwise direction (chirality), they are activated by an adversarial\nsemi-synchronous scheduler (SSYNCH), and an active robot always reaches the\ndestination point it computes (rigidity). Robots have limited visibility: each\nrobot can see only the points on the circle that have an angular distance\nstrictly smaller than a constant $\\vartheta$ from the robot's current location,\nwhere $0<\\vartheta\\leq\\pi$ (angles are expressed in radians).\n We study the Gathering problem for such a swarm of robots: that is, all\nrobots are initially in distinct locations on the circle, and their task is to\nreach the same point on the circle in a finite number of turns, regardless of\nthe way they are activated by the scheduler. Note that, due to the anonymity of\nthe robots, this task is impossible if the initial configuration is\nrotationally symmetric; hence, we have to make the assumption that the initial\nconfiguration be rotationally asymmetric.\n We prove that, if $\\vartheta=\\pi$ (i.e., each robot can see the entire circle\nexcept its antipodal point), there is a distributed algorithm that solves the\nGathering problem for swarms of any size. 
By contrast, we also prove that, if\n$\\vartheta\\leq \\pi/2$, no distributed algorithm solves the Gathering problem,\nregardless of the size of the swarm, even under the assumption that the initial\nconfiguration is rotationally asymmetric and the visibility graph of the robots\nis connected.\n The latter impossibility result relies on a probabilistic technique based on\nrandom perturbations, which is novel in the context of anonymous mobile robots.\nSuch a technique is of independent interest, and immediately applies to other\nPattern-Formation problems.\n","authors":["Giuseppe A. Di Luna","Ryuhei Uehara","Giovanni Viglietta","Yukiko Yamauchi"],"pdf_url":"https://arxiv.org/pdf/2005.07917v3.pdf","comment":"34 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.14737v2","updated":"2024-11-17T12:50:29Z","published":"2024-09-23T06:33:52Z","title":"Adverse Weather-Immune Semantic Segmentation with Unfolded\n Regularization and Foundation Model Knowledge Distillation for Autonomous\n Driving","summary":" Various adverse weather conditions pose a significant challenge to autonomous\ndriving (AD) street scene semantic understanding (segmentation). A common\nstrategy is to minimize the disparity between images captured in clear and\nadverse weather conditions. However, this technique typically relies on\nutilizing clear image as a reference, which is challenging to obtain in\npractice. Furthermore, this method typically targets a single adverse\ncondition, and thus perform poorly when confronting a mixture of multiple\nadverse weather conditions. To address these issues, we introduce a\nreference-free and Adverse weather-Immune scheme (called AdvImmu) that\nleverages the invariance of weather conditions over short periods (seconds).\nSpecifically, AdvImmu includes three components: Locally Sequential Mechanism\n(LSM), Globally Shuffled Mechanism (GSM), and Unfolded Regularizers (URs). 
LSM\nleverages temporal correlations between adjacent frames to enhance model\nperformance. GSM is proposed to shuffle LSM segments to prevent overfitting of\ntemporal patterns. URs are the deep unfolding implementation of two proposed\nregularizers to penalize the model complexity to enhance across-weather\ngeneralization. In addition, to overcome the over-reliance on consecutive\nframe-wise annotations in the training of AdvImmu (typically unavailable in AD\nscenarios), we incorporate a foundation model named Segment Anything Model\n(SAM) to assist to annotate frames, and additionally propose a cluster\nalgorithm (denoted as SBICAC) to surmount SAM's category-agnostic issue to\ngenerate pseudo-labels. Extensive experiments demonstrate that the proposed\nAdvImmu outperforms existing state-of-the-art methods by 88.56% in mean\nIntersection over Union (mIoU).\n","authors":["Wei-Bin Kou","Guangxu Zhu","Rongguang Ye","Qingfeng Lin","Zeyi Ren","Ming Tang","Yik-Chung Wu"],"pdf_url":"https://arxiv.org/pdf/2409.14737v2.pdf","comment":"16 Pages"},{"id":"http://arxiv.org/abs/2411.11004v1","updated":"2024-11-17T08:50:47Z","published":"2024-11-17T08:50:47Z","title":"EROAM: Event-based Camera Rotational Odometry and Mapping in Real-time","summary":" This paper presents EROAM, a novel event-based rotational odometry and\nmapping system that achieves real-time, accurate camera rotation estimation.\nUnlike existing approaches that rely on event generation models or contrast\nmaximization, EROAM employs a spherical event representation by projecting\nevents onto a unit sphere and introduces Event Spherical Iterative Closest\nPoint (ES-ICP), a novel geometric optimization framework designed specifically\nfor event camera data. The spherical representation simplifies rotational\nmotion formulation while enabling continuous mapping for enhanced spatial\nresolution. Combined with parallel point-to-line optimization, EROAM achieves\nefficient computation without compromising accuracy. 
Extensive experiments on\nboth synthetic and real-world datasets show that EROAM significantly\noutperforms state-of-the-art methods in terms of accuracy, robustness, and\ncomputational efficiency. Our method maintains consistent performance under\nchallenging conditions, including high angular velocities and extended\nsequences, where other methods often fail or show significant drift.\nAdditionally, EROAM produces high-quality panoramic reconstructions with\npreserved fine structural details.\n","authors":["Wanli Xing","Shijie Lin","Linhan Yang","Zeqing Zhang","Yanjun Du","Maolin Lei","Yipeng Pan","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2411.11004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10991v1","updated":"2024-11-17T07:25:54Z","published":"2024-11-17T07:25:54Z","title":"Modulating Reservoir Dynamics via Reinforcement Learning for Efficient\n Robot Skill Synthesis","summary":" A random recurrent neural network, called a reservoir, can be used to learn\nrobot movements conditioned on context inputs that encode task goals. The\nLearning is achieved by mapping the random dynamics of the reservoir modulated\nby context to desired trajectories via linear regression. This makes the\nreservoir computing (RC) approach computationally efficient as no iterative\ngradient descent learning is needed. In this work, we propose a novel RC-based\nLearning from Demonstration (LfD) framework that not only learns to generate\nthe demonstrated movements but also allows online modulation of the reservoir\ndynamics to generate movement trajectories that are not covered by the initial\ndemonstration set. This is made possible by using a Reinforcement Learning (RL)\nmodule that learns a policy to output context as its actions based on the robot\nstate. Considering that the context dimension is typically low, learning with\nthe RL module is very efficient. 
We show the validity of the proposed model\nwith systematic experiments on a 2 degrees-of-freedom (DOF) simulated robot\nthat is taught to reach targets, encoded as context, with and without obstacle\navoidance constraint. The initial data set includes a set of reaching\ndemonstrations which are learned by the reservoir system. To enable reaching\nout-of-distribution targets, the RL module is engaged in learning a policy to\ngenerate dynamic contexts so that the generated trajectory achieves the desired\ngoal without any learning in the reservoir system. Overall, the proposed model\nuses an initial learned motor primitive set to efficiently generate diverse\nmotor behaviors guided by the designed reward function. Thus the model can be\nused as a flexible and effective LfD system where the action repertoire can be\nextended without new data collection.\n","authors":["Zahra Koulaeizadeh","Erhan Oztop"],"pdf_url":"https://arxiv.org/pdf/2411.10991v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.10974v1","updated":"2024-11-17T05:46:30Z","published":"2024-11-17T05:46:30Z","title":"CropNav: a Framework for Autonomous Navigation in Real Farms","summary":" Small robots that can operate under the plant canopy can enable new\npossibilities in agriculture. However, unlike larger autonomous tractors,\nautonomous navigation for such under canopy robots remains an open challenge\nbecause Global Navigation Satellite System (GNSS) is unreliable under the plant\ncanopy. We present a hybrid navigation system that autonomously switches\nbetween different sets of sensing modalities to enable full field navigation,\nboth inside and outside of crop. By choosing the appropriate path reference\nsource, the robot can accommodate for loss of GNSS signal quality and leverage\nrow-crop structure to autonomously navigate. However, such switching can be\ntricky and difficult to execute over scale. 
Our system provides a solution by\nautomatically switching between an exteroceptive sensing based system, such as\nLight Detection And Ranging (LiDAR) row-following navigation and waypoints path\ntracking. In addition, we show how our system can detect when the navigate\nfails and recover automatically extending the autonomous time and mitigating\nthe necessity of human intervention. Our system shows an improvement of about\n750 m per intervention over GNSS-based navigation and 500 m over row following\nnavigation.\n","authors":["Mateus Valverde Gasparino","Vitor Akihiro Hisano Higuti","Arun Narenthiran Sivakumar","Andres Eduardo Baquero Velasquez","Marcelo Becker","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2411.10974v1.pdf","comment":"Presented in the 2023 IEEE International Conference on Robotics and\n Automation (ICRA)"},{"id":"http://arxiv.org/abs/2411.10966v1","updated":"2024-11-17T05:11:44Z","published":"2024-11-17T05:11:44Z","title":"Avian-Inspired High-Precision Tracking Control for Aerial Manipulators","summary":" Aerial manipulators, composed of multirotors and robotic arms, have a\nstructure and function highly reminiscent of avian species. This paper studies\nthe tracking control problem for aerial manipulators. This paper studies the\ntracking control problem for aerial manipulators. We propose an avian-inspired\naerial manipulation system, which includes an avian-inspired robotic arm\ndesign, a Recursive Newton-Euler (RNE) method-based nonlinear flight\ncontroller, and a coordinated controller with two modes. Compared to existing\nmethods, our proposed approach offers several attractive features. 
First, the\nmorphological characteristics of avian species are used to determine the size\nproportion of the multirotor and the robotic arm in the aerial manipulator.\nSecond, the dynamic coupling of the aerial manipulator is addressed by the\nRNE-based flight controller and a dual-mode coordinated controller.\nSpecifically, under our proposed algorithm, the aerial manipulator can\nstabilize the end-effector's pose, similar to avian head stabilization. The\nproposed approach is verified through three numerical experiments. The results\nshow that even when the quadcopter is disturbed by different forces, the\nposition error of the end-effector achieves millimeter-level accuracy, and the\nattitude error remains within 1 degree. The limitation of this work is not\nconsidering aggressive manipulation like that seen in birds. Addressing this\nthrough future studies that explore real-world experiments will be a key\ndirection for research.\n","authors":["Mengyu Ji","Jiahao Shen","Huazi Cao","Shiyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.10966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10941v1","updated":"2024-11-17T02:39:58Z","published":"2024-11-17T02:39:58Z","title":"Efficient Estimation of Relaxed Model Parameters for Robust UAV\n Trajectory Optimization","summary":" Online trajectory optimization and optimal control methods are crucial for\nenabling sustainable unmanned aerial vehicle (UAV) services, such as\nagriculture, environmental monitoring, and transportation, where available\nactuation and energy are limited. However, optimal controllers are highly\nsensitive to model mismatch, which can occur due to loaded equipment, packages\nto be delivered, or pre-existing variability in fundamental structural and\nthrust-related parameters. To circumvent this problem, optimal controllers can\nbe paired with parameter estimators to improve their trajectory planning\nperformance and perform adaptive control. 
However, UAV platforms are limited in\nterms of onboard processing power, oftentimes making nonlinear parameter\nestimation too computationally expensive to consider. To address these issues,\nwe propose a relaxed, affine-in-parameters multirotor model along with an\nefficient optimal parameter estimator. We convexify the nominal Moving Horizon\nParameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via\nan affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast\nquadratic programs (QPs) that facilitate adaptive Model Predictve Control (MPC)\nin real time. We compare this approach to the equivalent nonlinear estimator in\nMonte Carlo simulations, demonstrating a decrease in average solve time and\ntrajectory optimality cost by 98.2% and 23.9-56.2%, respectively.\n","authors":["D. Fan","D. A. Copp"],"pdf_url":"https://arxiv.org/pdf/2411.10941v1.pdf","comment":"8 pages, 5 figures, submitted to IEEE Sustech 2025"},{"id":"http://arxiv.org/abs/2411.10935v1","updated":"2024-11-17T02:18:21Z","published":"2024-11-17T02:18:21Z","title":"Exciting Contact Modes in Differentiable Simulations for Robot Learning","summary":" In this paper, we explore an approach to actively plan and excite contact\nmodes in differentiable simulators as a means to tighten the sim-to-real gap.\nWe propose an optimal experimental design approach derived from\ninformation-theoretic methods to identify and search for information-rich\ncontact modes through the use of contact-implicit optimization. We demonstrate\nour approach on a robot parameter estimation problem with unknown inertial and\nkinematic parameters which actively seeks contacts with a nearby surface. 
We\nshow that our approach improves the identification of unknown parameter\nestimates over experimental runs by an estimate error reduction of at least\n$\\sim 84\\%$ when compared to a random sampling baseline, with significantly\nhigher information gains.\n","authors":["Hrishikesh Sathyanarayan","Ian Abraham"],"pdf_url":"https://arxiv.org/pdf/2411.10935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05003v2","updated":"2024-11-17T01:28:37Z","published":"2024-06-07T15:17:06Z","title":"Designs for Enabling Collaboration in Human-Machine Teaming via\n Interactive and Explainable Systems","summary":" Collaborative robots and machine learning-based virtual agents are\nincreasingly entering the human workspace with the aim of increasing\nproductivity and enhancing safety. Despite this, we show in a ubiquitous\nexperimental domain, Overcooked-AI, that state-of-the-art techniques for\nhuman-machine teaming (HMT), which rely on imitation or reinforcement learning,\nare brittle and result in a machine agent that aims to decouple the machine and\nhuman's actions to act independently rather than in a synergistic fashion. To\nremedy this deficiency, we develop HMT approaches that enable iterative,\nmixed-initiative team development allowing end-users to interactively reprogram\ninterpretable AI teammates. Our 50-subject study provides several findings that\nwe summarize into guidelines. While all approaches underperform a simple\ncollaborative heuristic (a critical, negative result for learning-based\nmethods), we find that white-box approaches supported by interactive\nmodification can lead to significant team development, outperforming white-box\napproaches alone, and that black-box approaches are easier to train and result\nin better HMT performance highlighting a tradeoff between explainability and\ninteractivity versus ease-of-training. 
Together, these findings present three\nimportant future research directions: 1) Improving the ability to generate\ncollaborative agents with white-box models, 2) Better learning methods to\nfacilitate collaboration rather than individualized coordination, and 3)\nMixed-initiative interfaces that enable users, who may vary in ability, to\nimprove collaboration.\n","authors":["Rohan Paleja","Michael Munje","Kimberlee Chang","Reed Jensen","Matthew Gombolay"],"pdf_url":"https://arxiv.org/pdf/2406.05003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06828v2","updated":"2024-11-17T00:07:21Z","published":"2024-03-11T15:44:38Z","title":"NeuPAN: Direct Point Robot Navigation with End-to-End Model-based\n Learning","summary":" Navigating a nonholonomic robot in a cluttered environment requires extremely\naccurate perception and locomotion for collision avoidance. This paper presents\nNeuPAN: a real-time, highly-accurate, map-free, robot-agnostic, and\nenvironment-invariant robot navigation solution. Leveraging a tightly-coupled\nperception-locomotion framework, NeuPAN has two key innovations compared to\nexisting approaches: 1) it directly maps raw points to a learned multi-frame\ndistance space, avoiding error propagation from perception to control; 2) it is\ninterpretable from an end-to-end model-based learning perspective, enabling\nprovable convergence. The crux of NeuPAN is to solve a high-dimensional\nend-to-end mathematical model with various point-level constraints using the\nplug-and-play (PnP) proximal alternating-minimization network (PAN) with\nneurons in the loop. This allows NeuPAN to generate real-time, end-to-end,\nphysically-interpretable motions directly from point clouds, which seamlessly\nintegrates data- and knowledge-engines, where its network parameters are\nadjusted via back propagation. We evaluate NeuPAN on car-like robot,\nwheel-legged robot, and passenger autonomous vehicle, in both simulated and\nreal-world environments. 
Experiments demonstrate that NeuPAN outperforms\nvarious benchmarks, in terms of accuracy, efficiency, robustness, and\ngeneralization capability across various environments, including the cluttered\nsandbox, office, corridor, and parking lot. We show that NeuPAN works well in\nunstructured environments with arbitrary-shape undetectable objects, making\nimpassable ways passable.\n","authors":["Ruihua Han","Shuai Wang","Shuaijun Wang","Zeqing Zhang","Jianjun Chen","Shijie Lin","Chengyang Li","Chengzhong Xu","Yonina C. Eldar","Qi Hao","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2403.06828v2.pdf","comment":"revision in TRO; project website:\n https://hanruihua.github.io/neupan_project/"},{"id":"http://arxiv.org/abs/2411.11913v1","updated":"2024-11-17T23:20:37Z","published":"2024-11-17T23:20:37Z","title":"On-Board Vision-Language Models for Personalized Autonomous Vehicle\n Motion Control: System Design and Real-World Validation","summary":" Personalized driving refers to an autonomous vehicle's ability to adapt its\ndriving behavior or control strategies to match individual users' preferences\nand driving styles while maintaining safety and comfort standards. However,\nexisting works either fail to capture every individual preference precisely or\nbecome computationally inefficient as the user base expands. Vision-Language\nModels (VLMs) offer promising solutions to this front through their natural\nlanguage understanding and scene reasoning capabilities. In this work, we\npropose a lightweight yet effective on-board VLM framework that provides\nlow-latency personalized driving performance while maintaining strong reasoning\ncapabilities. Our solution incorporates a Retrieval-Augmented Generation\n(RAG)-based memory module that enables continuous learning of individual\ndriving preferences through human feedback. 
Through comprehensive real-world\nvehicle deployment and experiments, our system has demonstrated the ability to\nprovide safe, comfortable, and personalized driving experiences across various\nscenarios and significantly reduce takeover rates by up to 76.9%. To the best\nof our knowledge, this work represents the first end-to-end VLM-based motion\ncontrol system in real-world autonomous vehicles.\n","authors":["Can Cui","Zichong Yang","Yupeng Zhou","Juntong Peng","Sung-Yeon Park","Cong Zhang","Yunsheng Ma","Xu Cao","Wenqian Ye","Yiheng Feng","Jitesh Panchal","Lingxi Li","Yaobin Chen","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11911v1","updated":"2024-11-17T16:36:09Z","published":"2024-11-17T16:36:09Z","title":"ModeSeq: Taming Sparse Multimodal Motion Prediction with Sequential Mode\n Modeling","summary":" Anticipating the multimodality of future events lays the foundation for safe\nautonomous driving. However, multimodal motion prediction for traffic agents\nhas been clouded by the lack of multimodal ground truth. Existing works\npredominantly adopt the winner-take-all training strategy to tackle this\nchallenge, yet still suffer from limited trajectory diversity and misaligned\nmode confidence. While some approaches address these limitations by generating\nexcessive trajectory candidates, they necessitate a post-processing stage to\nidentify the most representative modes, a process lacking universal principles\nand compromising trajectory accuracy. We are thus motivated to introduce\nModeSeq, a new multimodal prediction paradigm that models modes as sequences.\nUnlike the common practice of decoding multiple plausible trajectories in one\nshot, ModeSeq requires motion decoders to infer the next mode step by step,\nthereby more explicitly capturing the correlation between modes and\nsignificantly enhancing the ability to reason about multimodality. 
Leveraging\nthe inductive bias of sequential mode prediction, we also propose the\nEarly-Match-Take-All (EMTA) training strategy to diversify the trajectories\nfurther. Without relying on dense mode prediction or rule-based trajectory\nselection, ModeSeq considerably improves the diversity of multimodal output\nwhile attaining satisfactory trajectory accuracy, resulting in balanced\nperformance on motion prediction benchmarks. Moreover, ModeSeq naturally\nemerges with the capability of mode extrapolation, which supports forecasting\nmore behavior modes when the future is highly uncertain.\n","authors":["Zikang Zhou","Hengjian Zhou","Haibo Hu","Zihao Wen","Jianping Wang","Yung-Hui Li","Yu-Kai Huang"],"pdf_url":"https://arxiv.org/pdf/2411.11911v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.08978v2","updated":"2024-11-17T23:51:14Z","published":"2023-07-18T05:22:25Z","title":"Learned Scalable Video Coding For Humans and Machines","summary":" Video coding has traditionally been developed to support services such as\nvideo streaming, videoconferencing, digital TV, and so on. The main intent was\nto enable human viewing of the encoded content. However, with the advances in\ndeep neural networks (DNNs), encoded video is increasingly being used for\nautomatic video analytics performed by machines. In applications such as\nautomatic traffic monitoring, analytics such as vehicle detection, tracking and\ncounting, would run continuously, while human viewing could be required\noccasionally to review potential incidents. To support such applications, a new\nparadigm for video coding is needed that will facilitate efficient\nrepresentation and compression of video for both machine and human use in a\nscalable manner. 
In this manuscript, we introduce an end-to-end learnable video\ncodec that supports a machine vision task in its base layer, while its\nenhancement layer, together with the base layer, supports input reconstruction\nfor human viewing. The proposed system is constructed based on the concept of\nconditional coding to achieve better compression gains. Comprehensive\nexperimental evaluations conducted on four standard video datasets demonstrate\nthat our framework outperforms both state-of-the-art learned and conventional\nvideo codecs in its base layer, while maintaining comparable performance on the\nhuman vision task in its enhancement layer. Implementation of the proposed\nsystem is available at https://github.com/hadipardis/svc\n","authors":["Hadi Hadizadeh","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2307.08978v2.pdf","comment":"18 pages, 18 figures"},{"id":"http://arxiv.org/abs/2411.11199v1","updated":"2024-11-17T23:22:48Z","published":"2024-11-17T23:22:48Z","title":"BVI-CR: A Multi-View Human Dataset for Volumetric Video Compression","summary":" The advances in immersive technologies and 3D reconstruction have enabled the\ncreation of digital replicas of real-world objects and environments with fine\ndetails. These processes generate vast amounts of 3D data, requiring more\nefficient compression methods to satisfy the memory and bandwidth constraints\nassociated with data storage and transmission. However, the development and\nvalidation of efficient 3D data compression methods are constrained by the lack\nof comprehensive and high-quality volumetric video datasets, which typically\nrequire much more effort to acquire and consume increased resources compared to\n2D image and video databases. To bridge this gap, we present an open multi-view\nvolumetric human dataset, denoted BVI-CR, which contains 18 multi-view RGB-D\ncaptures and their corresponding textured polygonal meshes, depicting a range\nof diverse human actions. 
Each video sequence contains 10 views in 1080p\nresolution with durations between 10-15 seconds at 30FPS. Using BVI-CR, we\nbenchmarked three conventional and neural coordinate-based multi-view video\ncompression methods, following the MPEG MIV Common Test Conditions, and\nreported their rate quality performance based on various quality metrics. The\nresults show the great potential of neural representation based methods in\nvolumetric video compression compared to conventional video coding methods\n(with an up to 38\\% average coding gain in PSNR). This dataset provides a\ndevelopment and validation platform for a variety of tasks including volumetric\nreconstruction, compression, and quality assessment. The database will be\nshared publicly at \\url{https://github.com/fan-aaron-zhang/bvi-cr}.\n","authors":["Ge Gao","Adrian Azzarelli","Ho Man Kwan","Nantheera Anantrasirichai","Fan Zhang","Oliver Moolan-Feroze","David Bull"],"pdf_url":"https://arxiv.org/pdf/2411.11199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11196v1","updated":"2024-11-17T23:09:08Z","published":"2024-11-17T23:09:08Z","title":"PickScan: Object discovery and reconstruction from handheld interactions","summary":" Reconstructing compositional 3D representations of scenes, where each object\nis represented with its own 3D model, is a highly desirable capability in\nrobotics and augmented reality. However, most existing methods rely heavily on\nstrong appearance priors for object discovery, therefore only working on those\nclasses of objects on which the method has been trained, or do not allow for\nobject manipulation, which is necessary to scan objects fully and to guide\nobject discovery in challenging scenarios. We address these limitations with a\nnovel interaction-guided and class-agnostic method based on object\ndisplacements that allows a user to move around a scene with an RGB-D camera,\nhold up objects, and finally outputs one 3D model per held-up object. 
Our main\ncontribution to this end is a novel approach to detecting user-object\ninteractions and extracting the masks of manipulated objects. On a\ncustom-captured dataset, our pipeline discovers manipulated objects with 78.3%\nprecision at 100% recall and reconstructs them with a mean chamfer distance of\n0.90cm. Compared to Co-Fusion, the only comparable interaction-based and\nclass-agnostic baseline, this corresponds to a reduction in chamfer distance of\n73% while detecting 99% fewer false positives.\n","authors":["Vincent van der Brugge","Marc Pollefeys","Joshua B. Tenenbaum","Ayush Tewari","Krishna Murthy Jatavallabhula"],"pdf_url":"https://arxiv.org/pdf/2411.11196v1.pdf","comment":"7 pages, 8 figures, published in the 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2411.09553v2","updated":"2024-11-17T22:53:09Z","published":"2024-11-14T16:06:30Z","title":"OOD-SEG: Out-Of-Distribution detection for image SEGmentation with\n sparse multi-class positive-only annotations","summary":" Despite significant advancements, segmentation based on deep neural networks\nin medical and surgical imaging faces several challenges, two of which we aim\nto address in this work. First, acquiring complete pixel-level segmentation\nlabels for medical images is time-consuming and requires domain expertise.\nSecond, typical segmentation pipelines cannot detect out-of-distribution (OOD)\npixels, leaving them prone to spurious outputs during deployment. In this work,\nwe propose a novel segmentation approach exploiting OOD detection that learns\nonly from sparsely annotated pixels from multiple positive-only classes. These\nmulti-class positive annotations naturally fall within the in-distribution (ID)\nset. Unlabelled pixels may contain positive classes but also negative ones,\nincluding what is typically referred to as \\emph{background} in standard\nsegmentation formulations. 
Here, we forgo the need for background annotation\nand consider these together with any other unseen classes as part of the OOD\nset. Our framework can integrate, at a pixel-level, any OOD detection\napproaches designed for classification tasks. To address the lack of existing\nOOD datasets and established evaluation metric for medical image segmentation,\nwe propose a cross-validation strategy that treats held-out labelled classes as\nOOD. Extensive experiments on both multi-class hyperspectral and RGB surgical\nimaging datasets demonstrate the robustness and generalisation capability of\nour proposed framework.\n","authors":["Junwen Wang","Zhonghao Wang","Oscar MacCormac","Jonathan Shapey","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2411.09553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11190v1","updated":"2024-11-17T22:43:07Z","published":"2024-11-17T22:43:07Z","title":"DeepSPV: An Interpretable Deep Learning Pipeline for 3D Spleen Volume\n Estimation from 2D Ultrasound Images","summary":" Splenomegaly, the enlargement of the spleen, is an important clinical\nindicator for various associated medical conditions, such as sickle cell\ndisease (SCD). Spleen length measured from 2D ultrasound is the most widely\nused metric for characterising spleen size. However, it is still considered a\nsurrogate measure, and spleen volume remains the gold standard for assessing\nspleen size. Accurate spleen volume measurement typically requires 3D imaging\nmodalities, such as computed tomography or magnetic resonance imaging, but\nthese are not widely available, especially in the Global South which has a high\nprevalence of SCD. In this work, we introduce a deep learning pipeline,\nDeepSPV, for precise spleen volume estimation from single or dual 2D ultrasound\nimages. The pipeline involves a segmentation network and a variational\nautoencoder for learning low-dimensional representations from the estimated\nsegmentations. 
We investigate three approaches for spleen volume estimation and\nour best model achieves 86.62%/92.5% mean relative volume accuracy (MRVA) under\nsingle-view/dual-view settings, surpassing the performance of human experts. In\naddition, the pipeline can provide confidence intervals for the volume\nestimates as well as offering benefits in terms of interpretability, which\nfurther support clinicians in decision-making when identifying splenomegaly. We\nevaluate the full pipeline using a highly realistic synthetic dataset generated\nby a diffusion model, achieving an overall MRVA of 83.0% from a single 2D\nultrasound image. Our proposed DeepSPV is the first work to use deep learning\nto estimate 3D spleen volume from 2D ultrasound images and can be seamlessly\nintegrated into the current clinical workflow for spleen assessment.\n","authors":["Zhen Yuan","David Stojanovski","Lei Li","Alberto Gomez","Haran Jogeesvaran","Esther Puyol-Antón","Baba Inusa","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2411.11190v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.08038"},{"id":"http://arxiv.org/abs/2411.11189v1","updated":"2024-11-17T22:38:39Z","published":"2024-11-17T22:38:39Z","title":"Freqformer: Frequency-Domain Transformer for 3-D Visualization and\n Quantification of Human Retinal Circulation","summary":" We introduce Freqformer, a novel Transformer-based architecture designed for\n3-D, high-definition visualization of human retinal circulation from a single\nscan in commercial optical coherence tomography angiography (OCTA). Freqformer\naddresses the challenge of limited signal-to-noise ratio in OCTA volume by\nutilizing a complex-valued frequency-domain module (CFDM) and a simplified\nmulti-head attention (Sim-MHA) mechanism. 
Using merged volumes as ground truth,\nFreqformer enables accurate reconstruction of retinal vasculature across the\ndepth planes, allowing for 3-D quantification of capillary segments (count,\ndensity, and length). Our method outperforms state-of-the-art convolutional\nneural networks (CNNs) and several Transformer-based models, with superior\nperformance in peak signal-to-noise ratio (PSNR), structural similarity index\nmeasure (SSIM), and learned perceptual image patch similarity (LPIPS).\nFurthermore, Freqformer demonstrates excellent generalizability across lower\nscanning density, effectively enhancing OCTA scans with larger fields of view\n(from 3$\\times$3 $mm^{2}$ to 6$\\times$6 $mm^{2}$ and 12$\\times$12 $mm^{2}$).\nThese results suggest that Freqformer can significantly improve the\nunderstanding and characterization of retinal circulation, offering potential\nclinical applications in diagnosing and managing retinal vascular diseases.\n","authors":["Lingyun Wang","Bingjie Wang","Jay Chhablani","Jose Alain Sahel","Shaohua Pi"],"pdf_url":"https://arxiv.org/pdf/2411.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09579v2","updated":"2024-11-17T22:26:11Z","published":"2023-02-19T14:08:01Z","title":"Evaluating Representations with Readout Model Switching","summary":" Although much of the success of Deep Learning builds on learning good\nrepresentations, a rigorous method to evaluate their quality is lacking. In\nthis paper, we treat the evaluation of representations as a model selection\nproblem and propose to use the Minimum Description Length (MDL) principle to\ndevise an evaluation metric. Contrary to the established practice of limiting\nthe capacity of the readout model, we design a hybrid discrete and\ncontinuous-valued model space for the readout models and employ a switching\nstrategy to combine their predictions. The MDL score takes model complexity, as\nwell as data efficiency into account. 
As a result, the most appropriate model\nfor the specific task and representation will be chosen, making it a unified\nmeasure for comparison. The proposed metric can be efficiently computed with an\nonline method and we present results for pre-trained vision encoders of various\narchitectures (ResNet and ViT) and objective functions (supervised and\nself-supervised) on a range of downstream tasks. We compare our methods with\naccuracy-based approaches and show that the latter are inconsistent when\nmultiple readout models are used. Finally, we discuss important properties\nrevealed by our evaluations such as model scaling, preferred readout model, and\ndata efficiency.\n","authors":["Yazhe Li","Jorg Bornschein","Marcus Hutter"],"pdf_url":"https://arxiv.org/pdf/2302.09579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03607v2","updated":"2024-11-17T21:40:50Z","published":"2024-02-06T00:51:27Z","title":"Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success\n using Knowledge-infused Learning","summary":" The digital landscape continually evolves with multimodality, enriching the\nonline experience for users. Creators and marketers aim to weave subtle\ncontextual cues from various modalities into congruent content to engage users\nwith a harmonious message. This interplay of multimodal cues is often a crucial\nfactor in attracting users' attention. However, this richness of multimodality\npresents a challenge to computational modeling, as the semantic contextual cues\nspanning across modalities need to be unified to capture the true holistic\nmeaning of the multimodal content. This contextual meaning is critical in\nattracting user engagement as it conveys the intended message of the brand or\nthe organization. 
In this work, we incorporate external commonsense knowledge\nfrom knowledge graphs to enhance the representation of multimodal data using\ncompact Visual Language Models (VLMs) and predict the success of multi-modal\ncrowdfunding campaigns. Our results show that external knowledge commonsense\nbridges the semantic gap between text and image modalities, and the enhanced\nknowledge-infused representations improve the predictive performance of models\nfor campaign success upon the baselines without knowledge. Our findings\nhighlight the significance of contextual congruence in online multimodal\ncontent for engaging and successful crowdfunding campaigns.\n","authors":["Trilok Padhi","Ugur Kursuncu","Yaman Kumar","Valerie L. Shalin","Lane Peterson Fronczek"],"pdf_url":"https://arxiv.org/pdf/2402.03607v2.pdf","comment":"Accepted at IEEE International Conference on Big Data 2024 (IEEE\n BigData 2024)"},{"id":"http://arxiv.org/abs/2411.11179v1","updated":"2024-11-17T21:25:24Z","published":"2024-11-17T21:25:24Z","title":"Enhanced Anime Image Generation Using USE-CMHSA-GAN","summary":" With the growing popularity of ACG (Anime, Comics, and Games) culture,\ngenerating high-quality anime character images has become an important research\ntopic. This paper introduces a novel Generative Adversarial Network model,\nUSE-CMHSA-GAN, designed to produce high-quality anime character images. The\nmodel builds upon the traditional DCGAN framework, incorporating USE and CMHSA\nmodules to enhance feature extraction capabilities for anime character images.\nExperiments were conducted on the anime-face-dataset, and the results\ndemonstrate that USE-CMHSA-GAN outperforms other benchmark models, including\nDCGAN, VAE-GAN, and WGAN, in terms of FID and IS scores, indicating superior\nimage quality. 
These findings suggest that USE-CMHSA-GAN is highly effective\nfor anime character image generation and provides new insights for further\nimproving the quality of generative models.\n","authors":["J. Lu"],"pdf_url":"https://arxiv.org/pdf/2411.11179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11162v1","updated":"2024-11-17T19:45:26Z","published":"2024-11-17T19:45:26Z","title":"RPN 2: On Interdependence Function Learning Towards Unifying and\n Advancing CNN, RNN, GNN, and Transformer","summary":" This paper builds upon our previous work on the Reconciled Polynomial Network\n(RPN). The original RPN model was designed under the assumption of input data\nindependence, presuming the independence among both individual instances within\ndata batches and attributes in each data instance. However, this assumption\noften proves invalid for function learning tasks involving complex,\ninterdependent data such as language, images, time series, and graphs. Ignoring\nsuch data interdependence may inevitably lead to significant performance\ndegradation.\n To overcome these limitations, we introduce the new Reconciled Polynomial\nNetwork (version 2), namely RPN 2, in this paper. By incorporating data and\nstructural interdependence functions, RPN 2 explicitly models data\ninterdependence via new component functions in its architecture.\n This enhancement not only significantly improves RPN 2's learning performance\nbut also substantially expands its unifying potential, enabling it to encompass\na broader range of contemporary dominant backbone models within its canonical\nrepresentation. These backbones include, but are not limited to, convolutional\nneural networks (CNNs), recurrent neural networks (RNNs), graph neural networks\n(GNNs), and Transformers. Our analysis reveals that the fundamental\ndistinctions among these backbone models primarily stem from their diverse\napproaches to defining the interdependence functions. 
Furthermore, this unified\nrepresentation opens up new opportunities for designing innovative\narchitectures with the potential to surpass the performance of these dominant\nbackbones.\n","authors":["Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11162v1.pdf","comment":"105 pages, 37 figures, 6 tables, preprint version"},{"id":"http://arxiv.org/abs/2411.07118v2","updated":"2024-11-17T18:58:41Z","published":"2024-11-11T16:45:18Z","title":"ConvMixFormer- A Resource-efficient Convolution Mixer for\n Transformer-based Dynamic Hand Gesture Recognition","summary":" Transformer models have demonstrated remarkable success in many domains such\nas natural language processing (NLP) and computer vision. With the growing\ninterest in transformer-based architectures, they are now utilized for gesture\nrecognition. So, we also explore and devise a novel ConvMixFormer architecture\nfor dynamic hand gestures. The transformers use quadratic scaling of the\nattention features with the sequential data, due to which these models are\ncomputationally complex and heavy. We have considered this drawback of the\ntransformer and designed a resource-efficient model that replaces the\nself-attention in the transformer with the simple convolutional layer-based\ntoken mixer. The computational cost and the parameters used for the\nconvolution-based mixer are comparatively less than the quadratic\nself-attention. Convolution-mixer helps the model capture the local spatial\nfeatures that self-attention struggles to capture due to their sequential\nprocessing nature. Further, an efficient gate mechanism is employed instead of\na conventional feed-forward network in the transformer to help the model\ncontrol the flow of features within different stages of the proposed model.\nThis design uses fewer learnable parameters which is nearly half the vanilla\ntransformer that helps in fast and efficient training. 
The proposed method is\nevaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has\nachieved state-of-the-art results on single and multimodal inputs. We have also\nshown the parameter efficiency of the proposed ConvMixFormer model compared to\nother methods. The source code is available at\nhttps://github.com/mallikagarg/ConvMixFormer.\n","authors":["Mallika Garg","Debashis Ghosh","Pyari Mohan Pradhan"],"pdf_url":"https://arxiv.org/pdf/2411.07118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11151v1","updated":"2024-11-17T18:53:20Z","published":"2024-11-17T18:53:20Z","title":"Person Segmentation and Action Classification for Multi-Channel\n Hemisphere Field of View LiDAR Sensors","summary":" Robots need to perceive persons in their surroundings for safety and to\ninteract with them. In this paper, we present a person segmentation and action\nclassification approach that operates on 3D scans of hemisphere field of view\nLiDAR sensors. We recorded a data set with an Ouster OSDome-64 sensor\nconsisting of scenes where persons perform three different actions and\nannotated it. We propose a method based on a MaskDINO model to detect and\nsegment persons and to recognize their actions from combined spherical\nprojected multi-channel representations of the LiDAR data with an additional\npositional encoding. Our approach demonstrates good performance for the person\nsegmentation task and further performs well for the estimation of the person\naction states walking, waving, and sitting. 
An ablation study provides insights\nabout the individual channel contributions for the person segmentation task.\nThe trained models, code and dataset are made publicly available.\n","authors":["Svetlana Seliunina","Artem Otelepko","Raphael Memmesheimer","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2411.11151v1.pdf","comment":"6 pages, 9 figures, 4 tables, accepted for publication at IEEE/SICE\n International Symposium on System Integration (SII), Munich, Germany, January\n 2025"},{"id":"http://arxiv.org/abs/2411.11150v1","updated":"2024-11-17T18:52:06Z","published":"2024-11-17T18:52:06Z","title":"A Comprehensive Survey on Visual Question Answering Datasets and\n Algorithms","summary":" Visual question answering (VQA) refers to the problem where, given an image\nand a natural language question about the image, a correct natural language\nanswer has to be generated. A VQA model has to demonstrate both the visual\nunderstanding of the image and the semantic understanding of the question,\ndemonstrating reasoning capability. Since the inception of this field, a\nplethora of VQA datasets and models have been published. In this article, we\nmeticulously analyze the current state of VQA datasets and models, while\ncleanly dividing them into distinct categories and then summarizing the\nmethodologies and characteristics of each category. We divide VQA datasets into\nfour categories: (1) available datasets that contain a rich collection of\nauthentic images, (2) synthetic datasets that contain only synthetic images\nproduced through artificial means, (3) diagnostic datasets that are specially\ndesigned to test model performance in a particular area, e.g., understanding\nthe scene text, and (4) KB (Knowledge-Based) datasets that are designed to\nmeasure a model's ability to utilize outside knowledge. 
Concurrently, we\nexplore six main paradigms of VQA models: fusion, where we discuss different\nmethods of fusing information between visual and textual modalities; attention,\nthe technique of using information from one modality to filter information from\nanother; external knowledge base, where we discuss different models utilizing\noutside information; composition or reasoning, where we analyze techniques to\nanswer advanced questions that require complex reasoning steps; explanation,\nwhich is the process of generating visual and textual descriptions to verify\nsound reasoning; and graph models, which encode and manipulate relationships\nthrough nodes in a graph. We also discuss some miscellaneous topics, such as\nscene text understanding, counting, and bias reduction.\n","authors":["Raihan Kabir","Naznin Haque","Md Saiful Islam"," Marium-E-Jannat"],"pdf_url":"https://arxiv.org/pdf/2411.11150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10921v3","updated":"2024-11-17T17:55:19Z","published":"2024-07-15T17:22:16Z","title":"Leveraging Bi-Focal Perspectives and Granular Feature Integration for\n Accurate Reliable Early Alzheimer's Detection","summary":" Alzheimer's disease (AD) is the most common neurodegeneration, annually\ndiagnosed in millions of patients. The present medicine scenario still finds\nchallenges in the exact diagnosis and classification of AD through neuroimaging\ndata. Traditional CNNs can extract a good amount of low-level information in an\nimage but fail to extract high-level minuscule particles, which is a\nsignificant challenge in detecting AD from MRI scans. To overcome this, we\npropose a novel Granular Feature Integration method to combine information\nextraction at different scales combined with an efficient information flow,\nenabling the model to capture both broad and fine-grained features\nsimultaneously. 
We also propose a Bi-Focal Perspective mechanism to highlight\nthe subtle neurofibrillary tangles and amyloid plaques in the MRI scans,\nensuring that critical pathological markers are accurately identified. Our\nmodel achieved an F1-Score of 99.31%, precision of 99.24%, and recall of\n99.51%. These scores prove that our model is significantly better than the\nstate-of-the-art (SOTA) CNNs in existence.\n","authors":["Pandiyaraju V","Shravan Venkatraman","Abeshek A","Pavan Kumar S","Aravintakshan S A","Kannan A"],"pdf_url":"https://arxiv.org/pdf/2407.10921v3.pdf","comment":"14 pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.11135v1","updated":"2024-11-17T17:45:37Z","published":"2024-11-17T17:45:37Z","title":"Oscillation Inversion: Understand the structure of Large Flow Model\n through the Lens of Inversion Method","summary":" We explore the oscillatory behavior observed in inversion methods applied to\nlarge-scale text-to-image diffusion models, with a focus on the \"Flux\" model.\nBy employing a fixed-point-inspired iterative approach to invert real-world\nimages, we observe that the solution does not achieve convergence, instead\noscillating between distinct clusters. Through both toy experiments and\nreal-world diffusion models, we demonstrate that these oscillating clusters\nexhibit notable semantic coherence. We offer theoretical insights, showing that\nthis behavior arises from oscillatory dynamics in rectified flow models.\nBuilding on this understanding, we introduce a simple and fast distribution\ntransfer technique that facilitates image enhancement, stroke-based recoloring,\nas well as visual prompt-guided image editing. Furthermore, we provide\nquantitative results demonstrating the effectiveness of our method for tasks\nsuch as image enhancement, makeup transfer, reconstruction quality, and guided\nsampling quality. 
Higher-quality examples of videos and images are available at\n\\href{https://yanyanzheng96.github.io/oscillation_inversion/}{this link}.\n","authors":["Yan Zheng","Zhenxiao Liang","Xiaoyan Cong","Lanqing guo","Yuehao Wang","Peihao Wang","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20092v2","updated":"2024-11-17T17:05:03Z","published":"2024-06-28T17:57:14Z","title":"Efficient Large Multi-modal Models via Visual Context Compression","summary":" While significant advancements have been made in compressed representations\nfor text embeddings in large language models (LLMs), the compression of visual\ntokens in multi-modal LLMs (MLLMs) has remained a largely overlooked area. In\nthis work, we present the study on the analysis of redundancy concerning visual\ntokens and efficient training within these models. Our initial experiments show\nthat eliminating up to 70% of visual tokens at the testing stage by simply\naverage pooling only leads to a minimal 3% reduction in visual question\nanswering accuracy on the GQA benchmark, indicating significant redundancy in\nvisual context. Addressing this, we introduce Visual Context Compressor, which\nreduces the number of visual tokens to enhance training and inference\nefficiency without sacrificing performance. To minimize information loss caused\nby the compression on visual tokens while maintaining training efficiency, we\ndevelop LLaVolta as a light and staged training scheme that incorporates\nstage-wise visual context compression to progressively compress the visual\ntokens from heavily to lightly compression during training, yielding no loss of\ninformation when testing. 
Extensive experiments demonstrate that our approach\nenhances the performance of MLLMs in both image-language and video-language\nunderstanding, while also significantly cutting training costs and improving\ninference efficiency.\n","authors":["Jieneng Chen","Luoxin Ye","Ju He","Zhao-Yang Wang","Daniel Khashabi","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2406.20092v2.pdf","comment":"NeurIPS 2024 Camera Ready; Code is available at\n https://github.com/Beckschen/LLaVolta"},{"id":"http://arxiv.org/abs/2411.11116v1","updated":"2024-11-17T16:14:00Z","published":"2024-11-17T16:14:00Z","title":"DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image\n Segmentation","summary":" Accurately segmenting lesions in ultrasound images is challenging due to the\ndifficulty in distinguishing boundaries between lesions and surrounding\ntissues. While deep learning has improved segmentation accuracy, there is\nlimited focus on boundary quality and its relationship with body structures. To\naddress this, we introduce UBBS-Net, a dual-branch deep neural network that\nlearns the relationship between body and boundary for improved segmentation. We\nalso propose a feature fusion module to integrate body and boundary\ninformation. Evaluated on three public datasets, UBBS-Net outperforms existing\nmethods, achieving Dice Similarity Coefficients of 81.05% for breast cancer,\n76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma\nsegmentation. Our results demonstrate the effectiveness of UBBS-Net for\nultrasound image segmentation. 
The code is available at\nhttps://github.com/apple1986/DBF-Net.\n","authors":["Guoping Xu","Ximing Wu","Wentao Liao","Xinglong Wu","Qing Huang","Chang Li"],"pdf_url":"https://arxiv.org/pdf/2411.11116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20098v2","updated":"2024-11-17T16:11:00Z","published":"2024-06-28T17:59:46Z","title":"Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework\n for Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown impressive success across\nmodalities such as image, video, and audio in a variety of understanding and\ngeneration tasks. However, current MLLMs are surprisingly poor at understanding\nwebpage screenshots and generating their corresponding HTML code. To address\nthis problem, we propose $\\texttt{Web2Code}$, a benchmark consisting of a new\nlarge-scale webpage-to-code dataset for instruction tuning and an evaluation\nframework for the webpage understanding and HTML code translation abilities of\nMLLMs. For dataset construction, we leverage pretrained LLMs to enhance\nexisting webpage-to-code datasets as well as generate a diverse pool of new\nwebpages rendered into images. Specifically, the inputs are webpage images and\ninstructions, while the responses are the webpage's HTML code. We further\ninclude diverse natural language QA pairs about the webpage content in the\nresponses to enable a more comprehensive understanding of the web content. To\nevaluate model performance in these tasks, we develop an evaluation framework\nfor testing MLLMs' abilities in webpage understanding and web-to-code\ngeneration. Extensive experiments show that our proposed dataset is beneficial\nnot only to our proposed tasks but also in the general visual domain. We hope\nour work will contribute to the development of general MLLMs suitable for\nweb-based content generation and task automation. 
Our data and code are\navailable at https://github.com/MBZUAI-LLM/web2code.\n","authors":["Sukmin Yun","Haokun Lin","Rusiru Thushara","Mohammad Qazim Bhat","Yongxin Wang","Zutao Jiang","Mingkai Deng","Jinhong Wang","Tianhua Tao","Junbo Li","Haonan Li","Preslav Nakov","Timothy Baldwin","Zhengzhong Liu","Eric P. Xing","Xiaodan Liang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2406.20098v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at\n https://mbzuai-llm.github.io/webpage2code/"},{"id":"http://arxiv.org/abs/2411.11110v1","updated":"2024-11-17T16:03:30Z","published":"2024-11-17T16:03:30Z","title":"Retinal Vessel Segmentation via Neuron Programming","summary":" The accurate segmentation of retinal blood vessels plays a crucial role in\nthe early diagnosis and treatment of various ophthalmic diseases. Designing a\nnetwork model for this task requires meticulous tuning and extensive\nexperimentation to handle the tiny and intertwined morphology of retinal blood\nvessels. To tackle this challenge, Neural Architecture Search (NAS) methods are\ndeveloped to fully explore the space of potential network architectures and go\nafter the most powerful one. 
Inspired by neuronal diversity which is the\nbiological foundation of all kinds of intelligent behaviors in our brain, this\npaper introduces a novel and foundational approach to neural network design,\ntermed ``neuron programming'', to automatically search neuronal types into a\nnetwork to enhance a network's representation ability at the neuronal level,\nwhich is complementary to architecture-level enhancement done by NAS.\nAdditionally, to mitigate the time and computational intensity of neuron\nprogramming, we develop a hypernetwork that leverages the search-derived\narchitectural information to predict optimal neuronal configurations.\nComprehensive experiments validate that neuron programming can achieve\ncompetitive performance in retinal blood segmentation, demonstrating the strong\npotential of neuronal diversity in medical image analysis.\n","authors":["Tingting Wu","Ruyi Min","Peixuan Song","Hengtao Guo","Tieyong Zeng","Feng-Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2411.11110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11105v1","updated":"2024-11-17T15:50:25Z","published":"2024-11-17T15:50:25Z","title":"Label Sharing Incremental Learning Framework for Independent Multi-Label\n Segmentation Tasks","summary":" In a setting where segmentation models have to be built for multiple\ndatasets, each with its own corresponding label set, a straightforward way is\nto learn one model for every dataset and its labels. Alternatively, multi-task\narchitectures with shared encoders and multiple segmentation heads or shared\nweights with compound labels can also be made use of. This work proposes a\nnovel label sharing framework where a shared common label space is constructed\nand each of the individual label sets are systematically mapped to the common\nlabels. This transforms multiple datasets with disparate label sets into a\nsingle large dataset with shared labels, and therefore all the segmentation\ntasks can be addressed by learning a single model. 
This eliminates the need for\ntask specific adaptations in network architectures and also results in\nparameter and data efficient models. Furthermore, label sharing framework is\nnaturally amenable for incremental learning where segmentations for new\ndatasets can be easily learnt. We experimentally validate our method on various\nmedical image segmentation datasets, each involving multi-label segmentation.\nFurthermore, we demonstrate the efficacy of the proposed method in terms of\nperformance and incremental learning ability vis-a-vis alternative methods.\n","authors":["Deepa Anand","Bipul Das","Vyshnav Dangeti","Antony Jerald","Rakesh Mullick","Uday Patil","Pakhi Sharma","Prasad Sudhakar"],"pdf_url":"https://arxiv.org/pdf/2411.11105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11098v1","updated":"2024-11-17T15:00:09Z","published":"2024-11-17T15:00:09Z","title":"MolParser: End-to-end Visual Recognition of Molecule Structures in the\n Wild","summary":" In recent decades, chemistry publications and patents have increased rapidly.\nA significant portion of key information is embedded in molecular structure\nfigures, complicating large-scale literature searches and limiting the\napplication of large language models in fields such as biology, chemistry, and\npharmaceuticals. The automatic extraction of precise chemical structures is of\ncritical importance. However, the presence of numerous Markush structures in\nreal-world documents, along with variations in molecular image quality, drawing\nstyles, and noise, significantly limits the performance of existing optical\nchemical structure recognition (OCSR) methods. We present MolParser, a novel\nend-to-end OCSR method that efficiently and accurately recognizes chemical\nstructures from real-world documents, including difficult Markush structure. We\nuse a extended SMILES encoding rule to annotate our training dataset. 
Under\nthis rule, we build MolParser-7M, the largest annotated molecular image dataset\nto our knowledge. While utilizing a large amount of synthetic data, we employed\nactive learning methods to incorporate substantial in-the-wild data,\nspecifically samples cropped from real patents and scientific literature, into\nthe training process. We trained an end-to-end molecular image captioning\nmodel, MolParser, using a curriculum learning approach. MolParser significantly\noutperforms classical and learning-based methods across most scenarios, with\npotential for broader downstream applications. The dataset is publicly\navailable.\n","authors":["Xi Fang","Jiankun Wang","Xiaochen Cai","Shangqian Chen","Shuwen Yang","Lin Yao","Linfeng Zhang","Guolin Ke"],"pdf_url":"https://arxiv.org/pdf/2411.11098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05100v2","updated":"2024-11-17T14:42:51Z","published":"2024-03-08T07:03:18Z","title":"Exploring the Adversarial Frontier: Quantifying Robustness via\n Adversarial Hypervolume","summary":" The escalating threat of adversarial attacks on deep learning models,\nparticularly in security-critical fields, has underscored the need for robust\ndeep learning systems. Conventional robustness evaluations have relied on\nadversarial accuracy, which measures a model's performance under a specific\nperturbation intensity. However, this singular metric does not fully\nencapsulate the overall resilience of a model against varying degrees of\nperturbation. To address this gap, we propose a new metric termed adversarial\nhypervolume, assessing the robustness of deep learning models comprehensively\nover a range of perturbation intensities from a multi-objective optimization\nstandpoint. This metric allows for an in-depth comparison of defense mechanisms\nand recognizes the trivial improvements in robustness afforded by less potent\ndefensive strategies. 
Additionally, we adopt a novel training algorithm that\nenhances adversarial robustness uniformly across various perturbation\nintensities, in contrast to methods narrowly focused on optimizing adversarial\naccuracy. Our extensive empirical studies validate the effectiveness of the\nadversarial hypervolume metric, demonstrating its ability to reveal subtle\ndifferences in robustness that adversarial accuracy overlooks. This research\ncontributes a new measure of robustness and establishes a standard for\nassessing and benchmarking the resilience of current and future defensive\nmodels against adversarial threats.\n","authors":["Ping Guo","Cheng Gong","Xi Lin","Zhiyuan Yang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17920v2","updated":"2024-11-17T14:41:36Z","published":"2024-10-23T14:38:57Z","title":"Gaze-Assisted Medical Image Segmentation","summary":" The annotation of patient organs is a crucial part of various diagnostic and\ntreatment procedures, such as radiotherapy planning. Manual annotation is\nextremely time-consuming, while its automation using modern image analysis\ntechniques has not yet reached levels sufficient for clinical adoption. This\npaper investigates the idea of semi-supervised medical image segmentation using\nhuman gaze as interactive input for segmentation correction. In particular, we\nfine-tuned the Segment Anything Model in Medical Images (MedSAM), a public\nsolution that uses various prompt types as additional input for semi-automated\nsegmentation correction. We used human gaze data from reading abdominal images\nas a prompt for fine-tuning MedSAM. The model was validated on a public WORD\ndatabase, which consists of 120 CT scans of 16 abdominal organs. The results of\nthe gaze-assisted MedSAM were shown to be superior to the results of the\nstate-of-the-art segmentation models. 
In particular, the average Dice\ncoefficient for 16 abdominal organs was 85.8%, 86.7%, 81.7%, and 90.5% for\nnnUNetV2, ResUNet, original MedSAM, and our gaze-assisted MedSAM model,\nrespectively.\n","authors":["Leila Khaertdinova","Ilya Pershin","Tatiana Shmykova","Bulat Ibragimov"],"pdf_url":"https://arxiv.org/pdf/2410.17920v2.pdf","comment":"16 pages, 4 figures, Accepted to AIM-FM Workshop @ NeurIPS'24"},{"id":"http://arxiv.org/abs/2411.11087v1","updated":"2024-11-17T14:30:50Z","published":"2024-11-17T14:30:50Z","title":"D-Cube: Exploiting Hyper-Features of Diffusion Model for Robust Medical\n Classification","summary":" The integration of deep learning technologies in medical imaging aims to\nenhance the efficiency and accuracy of cancer diagnosis, particularly for\npancreatic and breast cancers, which present significant diagnostic challenges\ndue to their high mortality rates and complex imaging characteristics. This\npaper introduces Diffusion-Driven Diagnosis (D-Cube), a novel approach that\nleverages hyper-features from a diffusion model combined with contrastive\nlearning to improve cancer diagnosis. D-Cube employs advanced feature selection\ntechniques that utilize the robust representational capabilities of diffusion\nmodels, enhancing classification performance on medical datasets under\nchallenging conditions such as data imbalance and limited sample availability.\nThe feature selection process optimizes the extraction of clinically relevant\nfeatures, significantly improving classification accuracy and demonstrating\nresilience in imbalanced and limited data scenarios. Experimental results\nvalidate the effectiveness of D-Cube across multiple medical imaging\nmodalities, including CT, MRI, and X-ray, showing superior performance compared\nto existing baseline models. 
D-Cube represents a new strategy in cancer\ndetection, employing advanced deep learning techniques to achieve\nstate-of-the-art diagnostic accuracy and efficiency.\n","authors":["Minhee Jang","Juheon Son","Thanaporn Viriyasaranon","Junho Kim","Jang-Hwan Choi"],"pdf_url":"https://arxiv.org/pdf/2411.11087v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.03992v5","updated":"2024-11-17T14:27:58Z","published":"2023-07-08T14:59:41Z","title":"Stimulating Diffusion Model for Image Denoising via Adaptive Embedding\n and Ensembling","summary":" Image denoising is a fundamental problem in computational photography, where\nachieving high perception with low distortion is highly demanding. Current\nmethods either struggle with perceptual quality or suffer from significant\ndistortion. Recently, the emerging diffusion model has achieved\nstate-of-the-art performance in various tasks and demonstrates great potential\nfor image denoising. However, stimulating diffusion models for image denoising\nis not straightforward and requires solving several critical problems. For one\nthing, the input inconsistency hinders the connection between diffusion models\nand image denoising. For another, the content inconsistency between the\ngenerated image and the desired denoised image introduces distortion. To tackle\nthese problems, we present a novel strategy called the Diffusion Model for\nImage Denoising (DMID) by understanding and rethinking the diffusion model from\na denoising perspective. Our DMID strategy includes an adaptive embedding\nmethod that embeds the noisy image into a pre-trained unconditional diffusion\nmodel and an adaptive ensembling method that reduces distortion in the denoised\nimage. 
Our DMID strategy achieves state-of-the-art performance on both\ndistortion-based and perception-based metrics, for both Gaussian and real-world\nimage denoising. The code is available at https://github.com/Li-Tong-621/DMID.\n","authors":["Tong Li","Hansen Feng","Lizhi Wang","Zhiwei Xiong","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2307.03992v5.pdf","comment":"18 pages,15 figures"},{"id":"http://arxiv.org/abs/2411.11082v1","updated":"2024-11-17T14:15:54Z","published":"2024-11-17T14:15:54Z","title":"STOP: Spatiotemporal Orthogonal Propagation for Weight-Threshold-Leakage\n Synergistic Training of Deep Spiking Neural Networks","summary":" The prevailing of artificial intelligence-of-things calls for higher\nenergy-efficient edge computing paradigms, such as neuromorphic agents\nleveraging brain-inspired spiking neural network (SNN) models based on\nspatiotemporally sparse binary activations. However, the lack of efficient and\nhigh-accuracy deep SNN learning algorithms prevents them from practical edge\ndeployments with a strictly bounded cost. In this paper, we propose a\nspatiotemporal orthogonal propagation (STOP) algorithm to tackle this challenge.\nOur algorithm enables fully synergistic learning of synaptic weights as well as\nfiring thresholds and leakage factors in spiking neurons to improve SNN\naccuracy, while under a unified temporally-forward trace-based framework to\nmitigate the huge memory requirement for storing neural states of all\ntime-steps in the forward pass. Characteristically, the spatially-backward\nneuronal errors and temporally-forward traces propagate orthogonally to and\nindependently of each other, substantially reducing computational overhead. Our\nSTOP algorithm obtained high recognition accuracies of 99.53%, 94.84%, 74.92%,\n98.26% and 77.10% on the MNIST, CIFAR-10, CIFAR-100, DVS-Gesture and\nDVS-CIFAR10 datasets with adequate SNNs of intermediate scales from LeNet-5 to\nResNet-18. 
Compared with other deep SNN training works, our method is more\nplausible for edge intelligent scenarios where resources are limited but\nhigh-accuracy in-situ learning is desired.\n","authors":["Haoran Gao","Xichuan Zhou","Yingcheng Lin","Min Tian","Liyuan Liu","Cong Shi"],"pdf_url":"https://arxiv.org/pdf/2411.11082v1.pdf","comment":"13 pages (exclude supplementary), 5 figures"},{"id":"http://arxiv.org/abs/2406.10569v3","updated":"2024-11-17T14:08:23Z","published":"2024-06-15T09:08:58Z","title":"MDA: An Interpretable and Scalable Multi-Modal Fusion under Missing\n Modalities and Intrinsic Noise Conditions","summary":" Multi-modal learning has shown exceptional performance in various tasks,\nespecially in medical applications, where it integrates diverse medical\ninformation for comprehensive diagnostic evidence. However, there still are\nseveral challenges in multi-modal learning, 1. Heterogeneity between\nmodalities, 2. uncertainty in missing modalities, 3. influence of intrinsic\nnoise, and 4. interpretability for fusion result. This paper introduces the\nModal-Domain Attention (MDA) model to address the above challenges. MDA\nconstructs linear relationships between modalities through continuous\nattention, due to its ability to adaptively allocate dynamic attention to\ndifferent modalities, MDA can reduce attention to low-correlation data, missing\nmodalities, or modalities with inherent noise, thereby maintaining SOTA\nperformance across various tasks on multiple public datasets. Furthermore, our\nobservations on the contribution of different modalities indicate that MDA\naligns with established clinical diagnostic imaging gold standards and holds\npromise as a reference for pathologies where these standards are not yet\nclearly defined. 
The code and dataset will be available.\n","authors":["Lin Fan","Yafei Ou","Cenyang Zheng","Pengyu Dai","Tamotsu Kamishima","Masayuki Ikebe","Kenji Suzuki","Xun Gong"],"pdf_url":"https://arxiv.org/pdf/2406.10569v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11079v1","updated":"2024-11-17T13:55:35Z","published":"2024-11-17T13:55:35Z","title":"Electrostatic Force Regularization for Neural Structured Pruning","summary":" The demand for deploying deep convolutional neural networks (DCNNs) on\nresource-constrained devices for real-time applications remains substantial.\nHowever, existing state-of-the-art structured pruning methods often involve\nintricate implementations, require modifications to the original network\narchitectures, and necessitate an extensive fine-tuning phase. To overcome\nthese challenges, we propose a novel method that, for the first time,\nincorporates the concepts of charge and electrostatic force from physics into\nthe training process of DCNNs. The magnitude of this force is directly\nproportional to the product of the charges of the convolution filter and the\nsource filter, and inversely proportional to the square of the distance between\nthem. We applied this electrostatic-like force to the convolution filters,\neither attracting filters with opposite charges toward non-zero weights or\nrepelling filters with like charges toward zero weights. Consequently, filters\nsubject to repulsive forces have their weights reduced to zero, enabling their\nremoval, while the attractive forces preserve filters with significant weights\nthat retain information. Unlike conventional methods, our approach is\nstraightforward to implement, does not require any architectural modifications,\nand simultaneously optimizes weights and ranks filter importance, all without\nthe need for extensive fine-tuning. 
We validated the efficacy of our method on\nmodern DCNN architectures using the MNIST, CIFAR, and ImageNet datasets,\nachieving competitive performance compared to existing structured pruning\napproaches.\n","authors":["Abdesselam Ferdi","Abdelmalik Taleb-Ahmed","Amir Nakib","Youcef Ferdi"],"pdf_url":"https://arxiv.org/pdf/2411.11079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11069v1","updated":"2024-11-17T13:18:05Z","published":"2024-11-17T13:18:05Z","title":"Skeleton-Guided Spatial-Temporal Feature Learning for Video-Based\n Visible-Infrared Person Re-Identification","summary":" Video-based visible-infrared person re-identification (VVI-ReID) is\nchallenging due to significant modality feature discrepancies. Spatial-temporal\ninformation in videos is crucial, but the accuracy of spatial-temporal\ninformation is often influenced by issues like low quality and occlusions in\nvideos. Existing methods mainly focus on reducing modality differences, but pay\nlimited attention to improving spatial-temporal features, particularly for\ninfrared videos. To address this, we propose a novel Skeleton-guided\nspatial-Temporal feAture leaRning (STAR) method for VVI-ReID. By using skeleton\ninformation, which is robust to issues such as poor image quality and\nocclusions, STAR improves the accuracy of spatial-temporal features in videos\nof both modalities. Specifically, STAR employs two levels of skeleton-guided\nstrategies: frame level and sequence level. At the frame level, the robust\nstructured skeleton information is used to refine the visual features of\nindividual frames. At the sequence level, we design a feature aggregation\nmechanism based on skeleton key points graph, which learns the contribution of\ndifferent body parts to spatial-temporal features, further enhancing the\naccuracy of global features. Experiments on benchmark datasets demonstrate that\nSTAR outperforms state-of-the-art methods. 
Code will be open source soon.\n","authors":["Wenjia Jiang","Xiaoke Zhu","Jiakang Gao","Di Liao"],"pdf_url":"https://arxiv.org/pdf/2411.11069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11066v1","updated":"2024-11-17T13:08:29Z","published":"2024-11-17T13:08:29Z","title":"TS-LLaVA: Constructing Visual Tokens through Thumbnail-and-Sampling for\n Training-Free Video Large Language Models","summary":" Recent advances in multimodal Large Language Models (LLMs) have shown great\nsuccess in understanding multi-modal contents. For video understanding tasks,\ntraining-based video LLMs are difficult to build due to the scarcity of\nhigh-quality, curated video-text paired data. In contrast, paired image-text\ndata are much easier to obtain, and there is substantial similarity between\nimages and videos. Consequently, extending image LLMs for video understanding\ntasks presents an appealing alternative. Developing effective strategies for\ncompressing visual tokens from multiple frames is a promising way to leverage\nthe powerful pre-trained image LLM. In this work, we explore the limitations of\nthe existing compression strategies for building a training-free video LLM. The\nfindings lead to our method TS-LLaVA, which constructs visual tokens through a\nThumbnail-and-Sampling strategy. Given a video, we select few equidistant\nframes from all input frames to construct a Thumbnail image as a detailed\nvisual cue, complemented by Sampled visual tokens from all input frames. Our\nmethod establishes the new state-of-the-art performance among training-free\nvideo LLMs on various benchmarks. Notably, our 34B model outperforms GPT-4V on\nthe MVBench benchmark, and achieves performance comparable to the 72B\ntraining-based video LLM, Video-LLaMA2, on the challenging MLVU benchmark. 
Code\nis available at https://github.com/tingyu215/TS-LLaVA.\n","authors":["Tingyu Qu","Mingxiao Li","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2411.11066v1.pdf","comment":"work in progress"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.11192v1","updated":"2024-11-17T22:54:53Z","published":"2024-11-17T22:54:53Z","title":"Robot Metabolism: Towards machines that can grow by consuming other\n machines","summary":" Biological lifeforms can heal, grow, adapt, and reproduce -- abilities\nessential for sustained survival and development. In contrast, robots today are\nprimarily monolithic machines with limited ability to self-repair, physically\ndevelop, or incorporate material from their environments. A key challenge to\nsuch physical adaptation has been that while robot minds are rapidly evolving\nnew behaviors through AI, their bodies remain closed systems, unable to\nsystematically integrate new material to grow or heal. We argue that open-ended\nphysical adaptation is only possible when robots are designed using only a\nsmall repertoire of simple modules. This allows machines to mechanically adapt\nby consuming parts from other machines or their surroundings and shedding\nbroken components. We demonstrate this principle using a truss modular robot\nplatform composed of one-dimensional actuated bars. We show how robots in this\nspace can grow bigger, faster, and more capable by consuming materials from\ntheir environment and from other robots. We suggest that machine metabolic\nprocesses akin to the one demonstrated here will be an essential part of any\nsustained future robot ecology.\n","authors":["Philippe Martin Wyder","Riyaan Bakhda","Meiqi Zhao","Quinn A. Booth","Matthew E. Modi","Andrew Song","Simon Kang","Jiahao Wu","Priya Patel","Robert T. Kasumi","David Yi","Nihar Niraj Garg","Pranav Jhunjhunwala","Siddharth Bhutoria","Evan H. 
Tong","Yuhang Hu","Judah Goldfeder","Omer Mustel","Donghan Kim","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2411.11192v1.pdf","comment":"Manuscript combined with Supplementary Materials File for arXiv\n submission. Submitting to Journal and will update external DOI once available"},{"id":"http://arxiv.org/abs/2411.11180v1","updated":"2024-11-17T21:30:48Z","published":"2024-11-17T21:30:48Z","title":"Robust Defense Against Extreme Grid Events Using Dual-Policy\n Reinforcement Learning Agents","summary":" Reinforcement learning (RL) agents are powerful tools for managing power\ngrids. They use large amounts of data to inform their actions and receive\nrewards or penalties as feedback to learn favorable responses for the system.\nOnce trained, these agents can efficiently make decisions that would be too\ncomputationally complex for a human operator. This ability is especially\nvaluable in decarbonizing power networks, where the demand for RL agents is\nincreasing. These agents are well suited to control grid actions since the\naction space is constantly growing due to uncertainties in renewable\ngeneration, microgrid integration, and cybersecurity threats. To assess the\nefficacy of RL agents in response to an adverse grid event, we use the Grid2Op\nplatform for agent training. We employ a proximal policy optimization (PPO)\nalgorithm in conjunction with graph neural networks (GNNs). By simulating\nagents' responses to grid events, we assess their performance in avoiding grid\nfailure for as long as possible. The performance of an agent is expressed\nconcisely through its reward function, which helps the agent learn the most\noptimal ways to reconfigure a grid's topology amidst certain events. To model\nmulti-actor scenarios that threaten modern power networks, particularly those\nresulting from cyberattacks, we integrate an opponent that acts iteratively\nagainst a given agent. 
This interplay between the RL agent and opponent is\nutilized in N-k contingency screening, providing a novel alternative to the\ntraditional security assessment.\n","authors":["Benjamin M. Peter","Mert Korkali"],"pdf_url":"https://arxiv.org/pdf/2411.11180v1.pdf","comment":"6 pages, 5 figures, submitted to the 2025 Texas Power and Energy\n Conference (TPEC)"},{"id":"http://arxiv.org/abs/2411.02605v2","updated":"2024-11-17T20:09:19Z","published":"2024-11-04T20:58:39Z","title":"Tunable Sub-THz and THz lasing effect using FETs at room temperature","summary":" I report on the first observed self-amplification by stimulated emission of\n0.2THz and 1.63THz radiation using InGaAs/GaAs HEMT operating in the deep\nsaturation regime at room temperature. I demonstrate both theoretically and\nexperimentally that the Sub-THz and THz FETs response is due to rectification\nof the nonlinear dependence of the device current-voltage characteristics. FETs\ndo operate as a nonlinear THz mixers and rectifiers and its open-drain\nresponsivity is given by a similar expression to that of zero-bias Schottky\ndiode detector. However, operating FETs deep in the saturation regime does\nallow the accurate tuning of the device to the resonance condition or the\nnegative resistance mode at room temperature, hence FETs can be tuned in the\ndeep saturation regime to enable sub-THz and THz lasing effect. 
This observed\nsub-THz and THz laser phenomena using FETs will revolutionize human technology\nin all fields of life in the near future.\n","authors":["tamer Elkhatib"],"pdf_url":"https://arxiv.org/pdf/2411.02605v2.pdf","comment":"5 pages, 5 figures, to be submitted in Journal"},{"id":"http://arxiv.org/abs/2409.08411v2","updated":"2024-11-17T19:01:35Z","published":"2024-09-12T21:53:48Z","title":"Social Equity Based Optimal Power Flow Framework to Hedge Against Price\n Events","summary":" With the increasing frequency of high impact low probability events,\nelectricity markets are experiencing significant price spikes more often. This\npaper proposes a novel social equity driven optimal power flow framework to\nmitigate the adverse effects of price events that lead to such price spikes.\nThe framework integrates social welfare optimization with socioeconomic\nconsiderations by including a socioeconomic score that quantifies the energy\nburden and socioeconomic status of consumers. By incorporating both supply cost\nand consumer satisfaction, the model aims to achieve a balanced and fair\ndistribution of resources during price events, while considering resource\nscarcity and possible load curtailment. 
The proposed framework is tested for\nconvergence on modified versions of the PJM 5-bus system and IEEE 24-bus\nreliability test system, discussing its potential effectiveness in enhancing\nsocial equity and optimizing power flow under system security constraints.\nSensitivity analysis further highlights the impact of socioeconomic score on\nsocial welfare, providing insights for future improvements.\n","authors":["Sachinth Viththarachchige","Demy Alexander","Sarangan Rajendran","Visvakumar Aravinthan"],"pdf_url":"https://arxiv.org/pdf/2409.08411v2.pdf","comment":"Published in proceedings of the 2024 56th North American Power\n Symposium (NAPS)"},{"id":"http://arxiv.org/abs/2411.04510v2","updated":"2024-11-17T18:14:40Z","published":"2024-11-07T08:06:37Z","title":"Sliding Mode Roll Control of Active Suspension Electric Vehicles","summary":" Vehicle roll control has been a well studied problem. One of the ubiquitous\nmethods to mitigate vehicle rollover in the automobile industry is via a\nmechanical anti-roll bar. However with the advent of electric vehicles,\nrollover mitigation can be pursued using electric actuation. In this work, we\nstudy a roll control algorithm using sliding mode control for active suspension\nvehicles, where the actuation for the roll control signal is generated by\nelectric motors independently at the four corners of the vehicle. This\ntechnology precludes the need for any mechanical actuation which is often\nslower as well as any anti-roll bar to mitigate vehicle rollover situations. We\nprovide an implementation of the proposed algorithm and conduct numerical\nexperiments to validate the functionality and effectiveness. 
Specifically, we\nperform Slalom and J-turn maneuvering tests on an active suspension electric\nvehicle with sliding mode roll control and it is shown to mitigate rollover by\nat least 50% compared to passive suspension vehicles, while simultaneously\nmaintaining rider comfort.\n","authors":["Mruganka Kashyap"],"pdf_url":"https://arxiv.org/pdf/2411.04510v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11142v1","updated":"2024-11-17T18:13:46Z","published":"2024-11-17T18:13:46Z","title":"Emergent Structure in Multi-agent Systems Using Geometric Embeddings","summary":" This work investigates the self-organization of multi-agent systems into\nclosed trajectories, a common requirement in unmanned aerial vehicle (UAV)\nsurveillance tasks. In such scenarios, smooth, unbiased control signals save\nenergy and mitigate mechanical strain. We propose a decentralized control\nsystem architecture that produces a globally stable emergent structure from\nlocal observations only; there is no requirement for agents to share a global\nplan or follow prescribed trajectories. Central to our approach is the\nformulation of an injective virtual embedding induced by rotations from the\nactual agent positions. This embedding serves as a structure-preserving map\naround which all agents stabilize their relative positions and permits the use\nof well-established linear control techniques. We construct the embedding such\nthat it is topologically equivalent to the desired trajectory (i.e., a\nhomeomorphism), thereby preserving the stability characteristics. We\ndemonstrate the versatility of this approach through implementation on a swarm\nof Quanser QDrone quadcopters. 
Results demonstrate the quadcopters\nself-organize into the desired trajectory while maintaining even separation.\n","authors":["Dimitria Silveria","Kleber Cabral","Peter Jardine","Sidney Givigi"],"pdf_url":"https://arxiv.org/pdf/2411.11142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11119v1","updated":"2024-11-17T16:26:04Z","published":"2024-11-17T16:26:04Z","title":"Leveraging Bitcoin Mining Machines in Demand-Response Mechanisms to\n Mitigate Ramping-Induced Transients","summary":" We propose an extended demand response program, based on ancillary service\nfor supplying flexible electricity demand. In our proposed scheme, we suggest a\nbroader management model to control the scheduling and power consumption of\nBitcoin mining machines. The main aspect that we focus on is suppressing the\npower ramping and related transient effects. We extend previous works on the\nsubject, that study the impact of incorporating cryptocurrency mining machines\ninto existing power grid, and explore the potential profit of exploiting this\nflexible load in the Israeli electricity market. We analyze a trend based on\nhistorical data, of increasing electricity prices and ramping costs due to the\nincreasing penetration of renewable energy sources. We suggest an extension to\nthe unit commitment problem from which we obtain the scheduling scheme of the\nBitcoin mining machines. We use simulation and the real-world data acquired\nfrom the \"Noga\" grid operator to verify the proposed ancillary service and test\nits practical limits for reducing the ramping costs, under changing ratio of\nenergy production from renewable sources. 
Our results suggest that the machine\nprice and ratio of production from renewable sources play a significant role\nin determining the profitability of the proposed demand-response program.\n","authors":["Elinor Ginzburg-Ganz","Ittay Eyal","Ram Machlev","Dmitry Baimel","Leena Santosh","Juri Belikov","Yoash Levron"],"pdf_url":"https://arxiv.org/pdf/2411.11119v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.11108v1","updated":"2024-11-17T15:54:08Z","published":"2024-11-17T15:54:08Z","title":"Iterative Learning Control for Ramp Metering on Service Station On-ramps","summary":" Congestion on highways has become a significant social problem due to the\nincreasing number of vehicles, leading to considerable waste of time and\npollution. Regulating the outflow from the Service Station can help alleviate\nthis congestion. Notably, traffic flows follow recurring patterns over days and\nweeks, allowing for the application of Iterative Learning Control (ILC).\nBuilding on these insights, we propose an ILC approach based on the Cell\nTransmission Model with service stations (CTM-s). It is shown that ILC can\neffectively compensate for potential inaccuracies in model parameter estimates\nby leveraging historical data.\n","authors":["Hongxi Xiang","Carlo Cenedese","Efe C. Balta","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2411.11108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03997v4","updated":"2024-11-17T15:29:32Z","published":"2024-01-08T16:20:05Z","title":"Low-Complexity Control for a Class of Uncertain MIMO Nonlinear Systems\n under Generalized Time-Varying Output Constraints","summary":" This paper introduces a novel control framework to address the satisfaction\nof multiple time-varying output constraints in uncertain high-order MIMO\nnonlinear control systems. 
Unlike existing methods, which often assume that the\nconstraints are always decoupled and feasible, our approach can handle coupled\ntime-varying constraints even in the presence of potential infeasibilities.\nFirst, it is shown that satisfying multiple constraints essentially boils down\nto ensuring the positivity of a scalar variable, representing the signed\ndistance from the boundary of the time-varying output-constrained set. To\nachieve this, a single consolidating constraint is designed that, when\nsatisfied, guarantees convergence to and invariance of the time-varying\noutput-constrained set within a user-defined finite time. Next, a novel robust\nand low-complexity feedback controller is proposed to ensure the satisfaction\nof the consolidating constraint. Additionally, we provide a mechanism for\nonline modification of the consolidating constraint to find a least violating\nsolution when the constraints become mutually infeasible for some time.\nFinally, simulation examples of trajectory and region tracking for a mobile\nrobot validate the proposed approach.\n","authors":["Farhad Mehdifar","Lars Lindemann","Charalampos P. Bechlioulis","Dimos V. Dimarogonas"],"pdf_url":"https://arxiv.org/pdf/2401.03997v4.pdf","comment":"extended version, 21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.11093v1","updated":"2024-11-17T14:48:48Z","published":"2024-11-17T14:48:48Z","title":"Dynamic Dimensioning of Frequency Containment Reserves: The Case of the\n Nordic Grid","summary":" One of the main responsibilities of a Transmission System Operator (TSO)\noperating an electric grid is to maintain a designated frequency (e.g., 50 Hz\nin Europe). To achieve this, TSOs have created several products called\nfrequency-supporting ancillary services. The Frequency Containment Reserve\n(FCR) is one of these ancillary service products. This article focuses on the\nTSO problem of determining the volume procured for FCR. 
Specifically, we\ninvestigate the potential benefits and impact on grid security when\ntransitioning from a traditionally static procurement method to a dynamic\nstrategy for FCR volume. We take the Nordic synchronous area in Europe as a\ncase study and use a diffusion model to capture its frequency development. We\nintroduce a controlled mean reversal parameter to assess changes in FCR\nobligations, in particular for the Nordic FCR-N ancillary service product. We\nestablish closed-form expressions for exceedance probabilities and use\nhistorical frequency data as input to calibrate the model. We show that a\ndynamic dimensioning approach for FCR has the potential to significantly reduce\nthe exceedance probabilities (up to 37%) while keeping the total yearly\nprocured FCR volume the same as compared to the current static approach.\n","authors":["Jöbke Janssen","Alessandro Zocca","Bert Zwart","Jalal Kazempour"],"pdf_url":"https://arxiv.org/pdf/2411.11093v1.pdf","comment":"10 pages, 10 figures, submitted to IEEE Transactions on Power Systems"},{"id":"http://arxiv.org/abs/2206.09945v4","updated":"2024-11-17T14:08:16Z","published":"2022-06-20T18:05:07Z","title":"Sparse Representations of Dynamical Networks: A Coprime Factorization\n Approach","summary":" We study a class of dynamical networks modeled by linear and time-invariant\nsystems which are described by state-space realizations. For these networks, we\ninvestigate the relations between various types of factorizations which\npreserve the structure of their component subsystems' interconnection. In doing\nso, we provide tractable means of shifting between different types of\nsparsity-preserving representations and we show how to employ these\nfactorizations to obtain distributed implementations for stabilizing and\npossibly stable controllers. 
By formulating all these results for both\ndiscrete- and continuous-time systems, we develop specialized distributed\nimplementations that, up to this point, were only available for networks\nmodeled as discrete-time systems.\n","authors":["Şerban Sabău","Andrei Sperilă","Cristian Oară","Ali Jadbabaie"],"pdf_url":"https://arxiv.org/pdf/2206.09945v4.pdf","comment":"35 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.04721v2","updated":"2024-11-17T06:35:43Z","published":"2024-09-07T05:55:51Z","title":"Optimal decentralized wavelength control in light sources for\n lithography","summary":" Pulsed light sources are a critical component of modern lithography, with\nfine light beam wavelength control paramount for wafer etching accuracy. We\nstudy optimal wavelength control by casting it as a decentralized linear\nquadratic Gaussian (LQG) problem in presence of time-delays. In particular, we\nconsider the multi-optics module (optics and actuators) used for generating the\nrequisite wavelength in light sources as cooperatively interacting systems\ndefined over a directed acyclic graph (DAG). We show that any measurement and\nother continuous time-delays can be exactly compensated, and the resulting\noptimal controller implementation at the individual optics-level outperforms\nany existing wavelength control techniques.\n","authors":["Mruganka Kashyap"],"pdf_url":"https://arxiv.org/pdf/2409.04721v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10965v1","updated":"2024-11-17T05:03:40Z","published":"2024-11-17T05:03:40Z","title":"Immersion of General Nonlinear Systems Into State-Affine Ones for the\n Design of Generalized Parameter Estimation-Based Observers: A Simple\n Algebraic Procedure","summary":" Generalized parameter estimation-based observers have proven very successful\nto deal with systems described in state-affine form. 
In this paper, we enlarge\nthe domain of applicability of this method proposing an algebraic procedure to\nimmerse an $n$-dimensional general nonlinear system into an $n_z$-dimensional\nsystem in state affine form, with $n_z>n$. First, we recall the necessary and\nsufficient condition for the solution of the general problem, which requires\nthe solution of a partial differential equation that, moreover, has to satisfy\na restrictive injectivity condition. Given the complexity of this task we\npropose an alternative simple algebraic method to identify the required dynamic\nextension and coordinate transformation, a procedure that, as shown in the\npaper, is rather natural for physical systems. We illustrate the method with\nsome academic benchmark examples from observer theory literature -- that, in\nspite of their apparent simplicity, are difficult to solve with the existing\nmethods -- as well as several practically relevant physical examples.\n","authors":["Romeo Ortega","Alexey Bobtsov","Jose Guadalupe Romero","Leyan Fang"],"pdf_url":"https://arxiv.org/pdf/2411.10965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10941v1","updated":"2024-11-17T02:39:58Z","published":"2024-11-17T02:39:58Z","title":"Efficient Estimation of Relaxed Model Parameters for Robust UAV\n Trajectory Optimization","summary":" Online trajectory optimization and optimal control methods are crucial for\nenabling sustainable unmanned aerial vehicle (UAV) services, such as\nagriculture, environmental monitoring, and transportation, where available\nactuation and energy are limited. However, optimal controllers are highly\nsensitive to model mismatch, which can occur due to loaded equipment, packages\nto be delivered, or pre-existing variability in fundamental structural and\nthrust-related parameters. To circumvent this problem, optimal controllers can\nbe paired with parameter estimators to improve their trajectory planning\nperformance and perform adaptive control. 
However, UAV platforms are limited in\nterms of onboard processing power, oftentimes making nonlinear parameter\nestimation too computationally expensive to consider. To address these issues,\nwe propose a relaxed, affine-in-parameters multirotor model along with an\nefficient optimal parameter estimator. We convexify the nominal Moving Horizon\nParameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via\nan affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast\nquadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC)\nin real time. We compare this approach to the equivalent nonlinear estimator in\nMonte Carlo simulations, demonstrating a decrease in average solve time and\ntrajectory optimality cost by 98.2% and 23.9-56.2%, respectively.\n","authors":["D. Fan","D. A. Copp"],"pdf_url":"https://arxiv.org/pdf/2411.10941v1.pdf","comment":"8 pages, 5 figures, submitted to IEEE Sustech 2025"},{"id":"http://arxiv.org/abs/2411.10929v1","updated":"2024-11-17T01:22:51Z","published":"2024-11-17T01:22:51Z","title":"Wildfire Risk Metric Impact on Public Safety Power Shut-off Cost Savings","summary":" Public Safety Power Shutoffs (PSPS) are a proactive strategy to mitigate fire\nhazards from power system infrastructure failures. System operators employ PSPS\nto deactivate portions of the electric grid with heightened wildfire risks to\nprevent wildfire ignition and redispatch generators to minimize load shedding.\nA measure of vegetation flammability, called the Wildland Fire Potential Index\n(WFPI), has been widely used to evaluate the risk of nearby wildfires to power\nsystem operation. 
However, the WFPI does not correlate as strongly to\nhistorically observed wildfire ignition probabilities (OWIP) as the WFPI-based\nLarge Fire Probability (WLFP). Prior work chose not to incorporate\nwildfire-driven failure probabilities, such as the WLFP, because constraints\nwith Bernoulli random variables to represent wildfire ignitions could require\nnon-linear or non-convex constraints. This paper uses a deterministic\nequivalent of an otherwise complicating line de-energization constraint by\nquantifying the wildfire risk of operating transmission line as a sum of each\nenergized line's wildfire ignition log probability (log(WIP)) rather than as a\nsum of each energized line's WFPI. A day-ahead unit commitment and line\nde-energization PSPS framework is used to assess the cost differences driven by\nthe choice between the WFPI and WLFP risk metrics. Training the optimization on\nscenarios developed by mapping WLFP to log(WIP) rather than mapping the WFPI to\nlog(WIP) leads to reductions in the total real-time costs. For the IEEE RTS\n24-bus test system, mapping transmission line WLFP values to log(WIP) resulted\nin a 14.8 % (on average) decrease in expected real-time costs.\n","authors":["Ryan Greenough","Kohei Murakami","Jan Kleissl","Adil Khurram"],"pdf_url":"https://arxiv.org/pdf/2411.10929v1.pdf","comment":"10 pages, 9 figures, 2 tables"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.11206v1","updated":"2024-11-17T23:40:00Z","published":"2024-11-17T23:40:00Z","title":"Capturing Sparks of Abstraction for the ARC Challenge","summary":" Excellent progress has been made recently in solving ARC Challenge problems.\nHowever, it seems that new techniques may be required to push beyond 60%\naccuracy. 
Even commercial Large Language Models (LLMs) struggle to 'understand'\nmany of the problems (when given the input and output grids), which makes\ndiscovering solutions by LLM-lead program search somewhat futile.\n In this work, LLM 'understanding' is attempted from a stronger starting\nposition : An LLM is given complete solutions to tasks in code, and then asked\nto explain how the task is being solved at various levels of abstraction.\nSpecifically, the LLM was given code solutions implemented in arc-dsl-llm (an\nLLM-legible version of Hodel's arc-dsl to obtain: (a) commented code; (b) code\nrefactored into reusable functional chunks; (c) problem solution steps; and (d)\nhigh-level problem-solving tactics.\n We demonstrate that 'Sparks of Abstraction' can be extracted from the LLM\noutput - in a form that could be used in downstream tasks with Local LLMs\neligible to enter the ARC Prize.\n Both the arc-dsl-llm DSL framework (with the re-engineered solutions) and the\nGemini LLM-generated data (along with the generation code) are made Open\nSource.\n","authors":["Martin Andrews"],"pdf_url":"https://arxiv.org/pdf/2411.11206v1.pdf","comment":"Submitted as a paper entry for the 2024 ARC Prize"},{"id":"http://arxiv.org/abs/2411.11196v1","updated":"2024-11-17T23:09:08Z","published":"2024-11-17T23:09:08Z","title":"PickScan: Object discovery and reconstruction from handheld interactions","summary":" Reconstructing compositional 3D representations of scenes, where each object\nis represented with its own 3D model, is a highly desirable capability in\nrobotics and augmented reality. However, most existing methods rely heavily on\nstrong appearance priors for object discovery, therefore only working on those\nclasses of objects on which the method has been trained, or do not allow for\nobject manipulation, which is necessary to scan objects fully and to guide\nobject discovery in challenging scenarios. 
We address these limitations with a\nnovel interaction-guided and class-agnostic method based on object\ndisplacements that allows a user to move around a scene with an RGB-D camera,\nhold up objects, and finally outputs one 3D model per held-up object. Our main\ncontribution to this end is a novel approach to detecting user-object\ninteractions and extracting the masks of manipulated objects. On a\ncustom-captured dataset, our pipeline discovers manipulated objects with 78.3%\nprecision at 100% recall and reconstructs them with a mean chamfer distance of\n0.90cm. Compared to Co-Fusion, the only comparable interaction-based and\nclass-agnostic baseline, this corresponds to a reduction in chamfer distance of\n73% while detecting 99% fewer false positives.\n","authors":["Vincent van der Brugge","Marc Pollefeys","Joshua B. Tenenbaum","Ayush Tewari","Krishna Murthy Jatavallabhula"],"pdf_url":"https://arxiv.org/pdf/2411.11196v1.pdf","comment":"7 pages, 8 figures, published in the 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2407.20181v2","updated":"2024-11-17T22:23:45Z","published":"2024-07-26T15:24:01Z","title":"Blockchain for Large Language Model Security and Safety: A Holistic\n Survey","summary":" With the growing development and deployment of large language models (LLMs)\nin both industrial and academic fields, their security and safety concerns have\nbecome increasingly critical. However, recent studies indicate that LLMs face\nnumerous vulnerabilities, including data poisoning, prompt injections, and\nunauthorized data exposure, which conventional methods have struggled to\naddress fully. In parallel, blockchain technology, known for its data\nimmutability and decentralized structure, offers a promising foundation for\nsafeguarding LLMs. In this survey, we aim to comprehensively assess how to\nleverage blockchain technology to enhance LLMs' security and safety. 
Besides,\nwe propose a new taxonomy of blockchain for large language models (BC4LLMs) to\nsystematically categorize related works in this emerging field. Our analysis\nincludes novel frameworks and definitions to delineate security and safety in\nthe context of BC4LLMs, highlighting potential research directions and\nchallenges at this intersection. Through this study, we aim to stimulate\ntargeted advancements in blockchain-integrated LLM security.\n","authors":["Caleb Geren","Amanda Board","Gaby G. Dagher","Tim Andersen","Jun Zhuang"],"pdf_url":"https://arxiv.org/pdf/2407.20181v2.pdf","comment":"Accepted to SIGKDD Explorations, to appear Dec 2024"},{"id":"http://arxiv.org/abs/2402.05271v4","updated":"2024-11-17T22:18:40Z","published":"2024-02-07T21:31:53Z","title":"Feature learning as alignment: a structural property of gradient descent\n in non-linear neural networks","summary":" Understanding the mechanisms through which neural networks extract statistics\nfrom input-label pairs through feature learning is one of the most important\nunsolved problems in supervised learning. Prior works demonstrated that the\ngram matrices of the weights (the neural feature matrices, NFM) and the average\ngradient outer products (AGOP) become correlated during training, in a\nstatement known as the neural feature ansatz (NFA). Through the NFA, the\nauthors introduce mapping with the AGOP as a general mechanism for neural\nfeature learning. However, these works do not provide a theoretical explanation\nfor this correlation or its origins. In this work, we further clarify the\nnature of this correlation, and explain its emergence. We show that this\ncorrelation is equivalent to alignment between the left singular structure of\nthe weight matrices and the newly defined pre-activation tangent features at\neach layer. 
We further establish that the alignment is driven by the\ninteraction of weight changes induced by SGD with the pre-activation features,\nand analyze the resulting dynamics analytically at early times in terms of\nsimple statistics of the inputs and labels. We prove the derivative alignment\noccurs almost surely in specific high dimensional settings. Finally, we\nintroduce a simple optimization rule motivated by our analysis of the centered\ncorrelation which dramatically increases the NFA correlations at any given\nlayer and improves the quality of features learned.\n","authors":["Daniel Beaglehole","Ioannis Mitliagkas","Atish Agarwala"],"pdf_url":"https://arxiv.org/pdf/2402.05271v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11182v1","updated":"2024-11-17T21:52:58Z","published":"2024-11-17T21:52:58Z","title":"Improving User Experience in Preference-Based Optimization of Reward\n Functions for Assistive Robots","summary":" Assistive robots interact with humans and must adapt to different users'\npreferences to be effective. An easy and effective technique to learn\nnon-expert users' preferences is through rankings of robot behaviors, for\nexample, robot movement trajectories or gestures. Existing techniques focus on\ngenerating trajectories for users to rank that maximize the outcome of the\npreference learning process. However, the generated trajectories do not appear\nto reflect the user's preference over repeated interactions. In this work, we\ndesign an algorithm to generate trajectories for users to rank that we call\nCovariance Matrix Adaptation Evolution Strategies with Information Gain\n(CMA-ES-IG). CMA-ES-IG prioritizes the user's experience of the preference\nlearning process. We show that users find our algorithm more intuitive and\neasier to use than previous approaches across both physical and social robot\ntasks. 
This project's code is hosted at github.com/interaction-lab/CMA-ES-IG\n","authors":["Nathaniel Dennler","Zhonghao Shi","Stefanos Nikolaidis","Maja Matarić"],"pdf_url":"https://arxiv.org/pdf/2411.11182v1.pdf","comment":"Accepted to ISRR"},{"id":"http://arxiv.org/abs/2402.03607v2","updated":"2024-11-17T21:40:50Z","published":"2024-02-06T00:51:27Z","title":"Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success\n using Knowledge-infused Learning","summary":" The digital landscape continually evolves with multimodality, enriching the\nonline experience for users. Creators and marketers aim to weave subtle\ncontextual cues from various modalities into congruent content to engage users\nwith a harmonious message. This interplay of multimodal cues is often a crucial\nfactor in attracting users' attention. However, this richness of multimodality\npresents a challenge to computational modeling, as the semantic contextual cues\nspanning across modalities need to be unified to capture the true holistic\nmeaning of the multimodal content. This contextual meaning is critical in\nattracting user engagement as it conveys the intended message of the brand or\nthe organization. In this work, we incorporate external commonsense knowledge\nfrom knowledge graphs to enhance the representation of multimodal data using\ncompact Visual Language Models (VLMs) and predict the success of multi-modal\ncrowdfunding campaigns. Our results show that external knowledge commonsense\nbridges the semantic gap between text and image modalities, and the enhanced\nknowledge-infused representations improve the predictive performance of models\nfor campaign success upon the baselines without knowledge. Our findings\nhighlight the significance of contextual congruence in online multimodal\ncontent for engaging and successful crowdfunding campaigns.\n","authors":["Trilok Padhi","Ugur Kursuncu","Yaman Kumar","Valerie L. 
Shalin","Lane Peterson Fronczek"],"pdf_url":"https://arxiv.org/pdf/2402.03607v2.pdf","comment":"Accepted at IEEE International Conference on Big Data 2024 (IEEE\n BigData 2024)"},{"id":"http://arxiv.org/abs/2409.13870v3","updated":"2024-11-17T21:28:01Z","published":"2024-09-20T19:49:45Z","title":"Instruct-Tuning Pretrained Causal Language Models for Ancient Greek\n Papyrology and Epigraphy","summary":" This article presents an experiment in fine-tuning a pretrained causal\nlanguage model (Meta's Llama 3.1 8B Instruct) to assist with restoring missing\nor illegible characters in ancient Greek inscriptions and documentary papyri.\nUtilizing a straightforward instruction-based approach and a 95%/5% train/test\nsplit, the papyrus restoration model achieved a character error rate (CER) of\n14.9%, a top-1 accuracy of 73.5%, and a top-20 accuracy of 86.0% for sequences\nup to 10 characters. A model was also fine-tuned for geographic attribution,\nreaching a top-1 accuracy of 66.4% and a top-3 accuracy of 79.9%. In\nchronological attribution, it demonstrated an average deviation of 21.7 years\nfrom the actual terminus post/ante quem, with a median deviation of 0 years.\nFor inscriptions, the restoration model achieved a CER of 20.5%, a top-1\naccuracy of 63.7%, and a top-20 accuracy of 83.0% for sequences up to 10\ncharacters. In geographic attribution, it attained a top-1 accuracy of 75.0%\nand a top-3 accuracy of 83.7%, while in dating, it had an average deviation of\n37.1 years and a median deviation of 3 years from the actual date range.\nBenchmarked against the state-of-the-art model (Ithaca) on a shared test set\nand on recently edited inscriptions, the instruction-tuned models excelled in\ntext restoration, while also offering the practical advantage of ignoring\nspaces during reconstruction, which aligns with the scriptio continua of\nancient textual artifacts. However, their performance in geographic and\nchronological attribution was lower than Ithaca's. 
To evaluate the approach in\na more even setup, the instruction model was retrained with an 80%/10%/10%\ntrain-validation-test split, and still outperformed Ithaca in text restoration.\nThe results suggest that fine-tuning larger pretrained causal language models\nusing instruction templates for emendations and conjectures to ancient texts\nholds promise.\n","authors":["Eric Cullhed"],"pdf_url":"https://arxiv.org/pdf/2409.13870v3.pdf","comment":"9 pages, 1 table. To be submitted"},{"id":"http://arxiv.org/abs/2411.11179v1","updated":"2024-11-17T21:25:24Z","published":"2024-11-17T21:25:24Z","title":"Enhanced Anime Image Generation Using USE-CMHSA-GAN","summary":" With the growing popularity of ACG (Anime, Comics, and Games) culture,\ngenerating high-quality anime character images has become an important research\ntopic. This paper introduces a novel Generative Adversarial Network model,\nUSE-CMHSA-GAN, designed to produce high-quality anime character images. The\nmodel builds upon the traditional DCGAN framework, incorporating USE and CMHSA\nmodules to enhance feature extraction capabilities for anime character images.\nExperiments were conducted on the anime-face-dataset, and the results\ndemonstrate that USE-CMHSA-GAN outperforms other benchmark models, including\nDCGAN, VAE-GAN, and WGAN, in terms of FID and IS scores, indicating superior\nimage quality. These findings suggest that USE-CMHSA-GAN is highly effective\nfor anime character image generation and provides new insights for further\nimproving the quality of generative models.\n","authors":["J. Lu"],"pdf_url":"https://arxiv.org/pdf/2411.11179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04793v2","updated":"2024-11-17T21:13:54Z","published":"2024-06-07T09:40:09Z","title":"Learning-Augmented Priority Queues","summary":" Priority queues are one of the most fundamental and widely used data\nstructures in computer science. 
Their primary objective is to efficiently\nsupport the insertion of new elements with assigned priorities and the\nextraction of the highest priority element. In this study, we investigate the\ndesign of priority queues within the learning-augmented framework, where\nalgorithms use potentially inaccurate predictions to enhance their worst-case\nperformance. We examine three prediction models spanning different use cases,\nand show how the predictions can be leveraged to enhance the performance of\npriority queue operations. Moreover, we demonstrate the optimality of our\nsolution and discuss some possible applications.\n","authors":["Ziyad Benomar","Christian Coester"],"pdf_url":"https://arxiv.org/pdf/2406.04793v2.pdf","comment":"Accepted as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11171v1","updated":"2024-11-17T20:44:34Z","published":"2024-11-17T20:44:34Z","title":"LLäMmlein: Compact and Competitive German-Only Language Models from\n Scratch","summary":" We create two German-only decoder models, LL\\\"aMmlein 120M and 1B,\ntransparently from scratch and publish them, along with the training data, for\nthe German NLP research community to use. The model training involved several\nkey steps, including extensive data preprocessing, the creation of a custom\nGerman tokenizer, the training itself, as well as the evaluation of the final\nmodels on various benchmarks. Throughout the training process, multiple\ncheckpoints were saved and analyzed using the SuperGLEBer benchmark to monitor\nthe models' learning dynamics. Compared to state-of-the-art models on the\nSuperGLEBer benchmark, both LL\\\"aMmlein models performed competitively,\nconsistently matching or surpassing models with similar parameter sizes. 
The\nresults show that the models' quality scales with size as expected, but\nperformance improvements on some tasks plateaued early, offering valuable\ninsights into resource allocation for future model development.\n","authors":["Jan Pfister","Julia Wunderle","Andreas Hotho"],"pdf_url":"https://arxiv.org/pdf/2411.11171v1.pdf","comment":"first draft;\n https://www.informatik.uni-wuerzburg.de/datascience/projects/nlp/llammlein/"},{"id":"http://arxiv.org/abs/2404.19336v3","updated":"2024-11-17T19:49:58Z","published":"2024-04-30T08:03:22Z","title":"Improving LLM Classification of Logical Errors by Integrating Error\n Relationship into Prompts","summary":" LLMs trained in the understanding of programming syntax are now providing\neffective assistance to developers and are being used in programming education\nsuch as in generation of coding problem examples or providing code\nexplanations. A key aspect of programming education is understanding and\ndealing with error message. However, 'logical errors' in which the program\noperates against the programmer's intentions do not receive error messages from\nthe compiler. In this study, building on existing research on programming\nerrors, we first define the types of logical errors that can occur in\nprogramming in general. Based on the definition, we propose an effective\napproach for detecting logical errors with LLMs that makes use of relations\namong error types in the Chain-of-Thought and Tree-of-Thought prompts. The\nexperimental results indicate that when such logical error descriptions in the\nprompt are used, the average classifition performance is about 21% higher than\nthe ones without them. We also conducted an experiment for exploiting the\nrelations among errors in generating a new logical error dataset using LLMs. As\nthere is very limited dataset for logical errors such benchmark dataset can be\nvery useful for various programming related applications. 
We expect that our\nwork can assist novice programmers in identifying the causes of code errors and\ncorrect them more effectively.\n","authors":["Yanggyu Lee","Suchae Jeong","Jihie Kim"],"pdf_url":"https://arxiv.org/pdf/2404.19336v3.pdf","comment":"Published in ITS 2024 (Best Paper Award)"},{"id":"http://arxiv.org/abs/2411.11162v1","updated":"2024-11-17T19:45:26Z","published":"2024-11-17T19:45:26Z","title":"RPN 2: On Interdependence Function Learning Towards Unifying and\n Advancing CNN, RNN, GNN, and Transformer","summary":" This paper builds upon our previous work on the Reconciled Polynomial Network\n(RPN). The original RPN model was designed under the assumption of input data\nindependence, presuming the independence among both individual instances within\ndata batches and attributes in each data instance. However, this assumption\noften proves invalid for function learning tasks involving complex,\ninterdependent data such as language, images, time series, and graphs. Ignoring\nsuch data interdependence may inevitably lead to significant performance\ndegradation.\n To overcome these limitations, we introduce the new Reconciled Polynomial\nNetwork (version 2), namely RPN 2, in this paper. By incorporating data and\nstructural interdependence functions, RPN 2 explicitly models data\ninterdependence via new component functions in its architecture.\n This enhancement not only significantly improves RPN 2's learning performance\nbut also substantially expands its unifying potential, enabling it to encompass\na broader range of contemporary dominant backbone models within its canonical\nrepresentation. These backbones include, but are not limited to, convolutional\nneural networks (CNNs), recurrent neural networks (RNNs), graph neural networks\n(GNNs), and Transformers. Our analysis reveals that the fundamental\ndistinctions among these backbone models primarily stem from their diverse\napproaches to defining the interdependence functions. 
Furthermore, this unified\nrepresentation opens up new opportunities for designing innovative\narchitectures with the potential to surpass the performance of these dominant\nbackbones.\n","authors":["Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11162v1.pdf","comment":"105 pages, 37 figures, 6 tables, preprint version"},{"id":"http://arxiv.org/abs/2411.11161v1","updated":"2024-11-17T19:43:10Z","published":"2024-11-17T19:43:10Z","title":"MPLite: Multi-Aspect Pretraining for Mining Clinical Health Records","summary":" The adoption of digital systems in healthcare has resulted in the\naccumulation of vast electronic health records (EHRs), offering valuable data\nfor machine learning methods to predict patient health outcomes. However,\nsingle-visit records of patients are often neglected in the training process\ndue to the lack of annotations of next-visit information, thereby limiting the\npredictive and expressive power of machine learning models. In this paper, we\npresent a novel framework MPLite that utilizes Multi-aspect Pretraining with\nLab results through a light-weight neural network to enhance medical concept\nrepresentation and predict future health outcomes of individuals. By\nincorporating both structured medical data and additional information from lab\nresults, our approach fully leverages patient admission records. We design a\npretraining module that predicts medical codes based on lab results, ensuring\nrobust prediction by fusing multiple aspects of features. Our experimental\nevaluation using both MIMIC-III and MIMIC-IV datasets demonstrates improvements\nover existing models in diagnosis prediction and heart failure prediction\ntasks, achieving a higher weighted-F1 and recall with MPLite. 
This work reveals\nthe potential of integrating diverse aspects of data to advance predictive\nmodeling in healthcare.\n","authors":["Eric Yang","Pengfei Hu","Xiaoxue Han","Yue Ning"],"pdf_url":"https://arxiv.org/pdf/2411.11161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11148v1","updated":"2024-11-17T18:42:46Z","published":"2024-11-17T18:42:46Z","title":"TabDeco: A Comprehensive Contrastive Framework for Decoupled\n Representations in Tabular Data","summary":" Representation learning is a fundamental aspect of modern artificial\nintelligence, driving substantial improvements across diverse applications.\nWhile selfsupervised contrastive learning has led to significant advancements\nin fields like computer vision and natural language processing, its adaptation\nto tabular data presents unique challenges. Traditional approaches often\nprioritize optimizing model architecture and loss functions but may overlook\nthe crucial task of constructing meaningful positive and negative sample pairs\nfrom various perspectives like feature interactions, instance-level patterns\nand batch-specific contexts. To address these challenges, we introduce TabDeco,\na novel method that leverages attention-based encoding strategies across both\nrows and columns and employs contrastive learning framework to effectively\ndisentangle feature representations at multiple levels, including features,\ninstances and data batches. 
With the innovative feature decoupling hierarchies,\nTabDeco consistently surpasses existing deep learning methods and leading\ngradient boosting algorithms, including XG-Boost, CatBoost, and LightGBM,\nacross various benchmark tasks, underscoring its effectiveness in advancing\ntabular data representation learning.\n","authors":["Suiyao Chen","Jing Wu","Yunxiao Wang","Cheng Ji","Tianpei Xie","Daniel Cociorva","Michael Sharps","Cecile Levasseur","Hakan Brunzell"],"pdf_url":"https://arxiv.org/pdf/2411.11148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11144v1","updated":"2024-11-17T18:25:01Z","published":"2024-11-17T18:25:01Z","title":"CLMIA: Membership Inference Attacks via Unsupervised Contrastive\n Learning","summary":" Since machine learning model is often trained on a limited data set, the\nmodel is trained multiple times on the same data sample, which causes the model\nto memorize most of the training set data. Membership Inference Attacks (MIAs)\nexploit this feature to determine whether a data sample is used for training a\nmachine learning model. However, in realistic scenarios, it is difficult for\nthe adversary to obtain enough qualified samples that mark accurate identity\ninformation, especially since most samples are non-members in real world\napplications. To address this limitation, in this paper, we propose a new\nattack method called CLMIA, which uses unsupervised contrastive learning to\ntrain an attack model without using extra membership status information.\nMeanwhile, in CLMIA, we require only a small amount of data with known\nmembership status to fine-tune the attack model. Experimental results\ndemonstrate that CLMIA performs better than existing attack methods for\ndifferent datasets and model structures, especially with data with less marked\nidentity information. In addition, we experimentally find that the attack\nperforms differently for different proportions of labeled identity information\nfor member and non-member data. 
More analysis proves that our attack method\nperforms better with less labeled identity information, which applies to more\nrealistic scenarios.\n","authors":["Depeng Chen","Xiao Liu","Jie Cui","Hong Zhong"],"pdf_url":"https://arxiv.org/pdf/2411.11144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12366v3","updated":"2024-11-17T18:03:40Z","published":"2023-01-29T06:03:20Z","title":"Smooth Non-Stationary Bandits","summary":" In many applications of online decision making, the environment is\nnon-stationary and it is therefore crucial to use bandit algorithms that handle\nchanges. Most existing approaches are designed to protect against non-smooth\nchanges, constrained only by total variation or Lipschitzness over time.\nHowever, in practice, environments often change {\\em smoothly}, so such\nalgorithms may incur higher-than-necessary regret. We study a non-stationary\nbandits problem where each arm's mean reward sequence can be embedded into a\n$\\beta$-H\\\"older function, i.e., a function that is $(\\beta-1)$-times\nLipschitz-continuously differentiable. The non-stationarity becomes more smooth\nas $\\beta$ increases. When $\\beta=1$, this corresponds to the non-smooth\nregime, where \\cite{besbes2014stochastic} established a minimax regret of\n$\\tilde \\Theta(T^{2/3})$. We show the first separation between the smooth\n(i.e., $\\beta\\ge 2$) and non-smooth (i.e., $\\beta=1$) regimes by presenting a\npolicy with $\\tilde O(k^{4/5} T^{3/5})$ regret on any $k$-armed, $2$-H\\\"older\ninstance. We complement this result by showing that the minimax regret on the\n$\\beta$-H\\\"older family of instances is $\\Omega(T^{(\\beta+1)/(2\\beta+1)})$ for\nany integer $\\beta\\ge 1$. This matches our upper bound for $\\beta=2$ up to\nlogarithmic factors. Furthermore, we validated the effectiveness of our policy\nthrough a comprehensive numerical study using real-world click-through rate\ndata.\n","authors":["Su Jia","Qian Xie","Nathan Kallus","Peter I. 
Frazier"],"pdf_url":"https://arxiv.org/pdf/2301.12366v3.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2410.05558v2","updated":"2024-11-17T17:00:11Z","published":"2024-10-07T23:36:05Z","title":"Narrative-of-Thought: Improving Temporal Reasoning of Large Language\n Models via Recounted Narratives","summary":" Reasoning about time and temporal relations is an integral aspect of human\ncognition, essential for perceiving the world and navigating our experiences.\nThough large language models (LLMs) have demonstrated impressive performance in\nmany reasoning tasks, temporal reasoning remains challenging due to its\nintrinsic complexity. In this work, we first study an essential task of\ntemporal reasoning -- temporal graph generation, to unveil LLMs' inherent,\nglobal reasoning capabilities. We show that this task presents great challenges\neven for the most powerful LLMs, such as GPT-3.5/4. We also notice a\nsignificant performance gap by small models (<10B) that lag behind LLMs by 50%.\nNext, we study how to close this gap with a budget constraint, e.g., not using\nmodel finetuning. We propose a new prompting technique tailored for temporal\nreasoning, Narrative-of-Thought (NoT), that first converts the events set to a\nPython class, then prompts a small model to generate a temporally grounded\nnarrative, guiding the final generation of a temporal graph. Extensive\nexperiments showcase the efficacy of NoT in improving various metrics. Notably,\nNoT attains the highest F1 on the Schema-11 evaluation set, while securing an\noverall F1 on par with GPT-3.5. NoT also achieves the best structural\nsimilarity across the board, even compared with GPT-3.5/4. 
Our code is\navailable at https://github.com/launchnlp/NoT.\n","authors":["Xinliang Frederick Zhang","Nick Beauchamp","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2410.05558v2.pdf","comment":"EMNLP'24 Findings"},{"id":"http://arxiv.org/abs/2406.20098v2","updated":"2024-11-17T16:11:00Z","published":"2024-06-28T17:59:46Z","title":"Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework\n for Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown impressive success across\nmodalities such as image, video, and audio in a variety of understanding and\ngeneration tasks. However, current MLLMs are surprisingly poor at understanding\nwebpage screenshots and generating their corresponding HTML code. To address\nthis problem, we propose $\\texttt{Web2Code}$, a benchmark consisting of a new\nlarge-scale webpage-to-code dataset for instruction tuning and an evaluation\nframework for the webpage understanding and HTML code translation abilities of\nMLLMs. For dataset construction, we leverage pretrained LLMs to enhance\nexisting webpage-to-code datasets as well as generate a diverse pool of new\nwebpages rendered into images. Specifically, the inputs are webpage images and\ninstructions, while the responses are the webpage's HTML code. We further\ninclude diverse natural language QA pairs about the webpage content in the\nresponses to enable a more comprehensive understanding of the web content. To\nevaluate model performance in these tasks, we develop an evaluation framework\nfor testing MLLMs' abilities in webpage understanding and web-to-code\ngeneration. Extensive experiments show that our proposed dataset is beneficial\nnot only to our proposed tasks but also in the general visual domain. We hope\nour work will contribute to the development of general MLLMs suitable for\nweb-based content generation and task automation. 
Our data and code are\navailable at https://github.com/MBZUAI-LLM/web2code.\n","authors":["Sukmin Yun","Haokun Lin","Rusiru Thushara","Mohammad Qazim Bhat","Yongxin Wang","Zutao Jiang","Mingkai Deng","Jinhong Wang","Tianhua Tao","Junbo Li","Haonan Li","Preslav Nakov","Timothy Baldwin","Zhengzhong Liu","Eric P. Xing","Xiaodan Liang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2406.20098v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at\n https://mbzuai-llm.github.io/webpage2code/"},{"id":"http://arxiv.org/abs/2410.14970v3","updated":"2024-11-17T16:09:12Z","published":"2024-10-19T04:28:44Z","title":"Taming the Long Tail in Human Mobility Prediction","summary":" With the popularity of location-based services, human mobility prediction\nplays a key role in enhancing personalized navigation, optimizing\nrecommendation systems, and facilitating urban mobility and planning. This\ninvolves predicting a user's next POI (point-of-interest) visit using their\npast visit history. However, the uneven distribution of visitations over time\nand space, namely the long-tail problem in spatial distribution, makes it\ndifficult for AI models to predict those POIs that are less visited by humans.\nIn light of this issue, we propose the Long-Tail Adjusted Next POI Prediction\n(LoTNext) framework for mobility prediction, combining a Long-Tailed Graph\nAdjustment module to reduce the impact of the long-tailed nodes in the user-POI\ninteraction graph and a novel Long-Tailed Loss Adjustment module to adjust loss\nby logit score and sample weight adjustment strategy. Also, we employ the\nauxiliary prediction task to enhance generalization and accuracy. Our\nexperiments with two real-world trajectory datasets demonstrate that LoTNext\nsignificantly surpasses existing state-of-the-art works. 
Our code is available\nat https://github.com/Yukayo/LoTNext.\n","authors":["Xiaohang Xu","Renhe Jiang","Chuang Yang","Zipei Fan","Kaoru Sezaki"],"pdf_url":"https://arxiv.org/pdf/2410.14970v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11105v1","updated":"2024-11-17T15:50:25Z","published":"2024-11-17T15:50:25Z","title":"Label Sharing Incremental Learning Framework for Independent Multi-Label\n Segmentation Tasks","summary":" In a setting where segmentation models have to be built for multiple\ndatasets, each with its own corresponding label set, a straightforward way is\nto learn one model for every dataset and its labels. Alternatively, multi-task\narchitectures with shared encoders and multiple segmentation heads or shared\nweights with compound labels can also be made use of. This work proposes a\nnovel label sharing framework where a shared common label space is constructed\nand each of the individual label sets are systematically mapped to the common\nlabels. This transforms multiple datasets with disparate label sets into a\nsingle large dataset with shared labels, and therefore all the segmentation\ntasks can be addressed by learning a single model. This eliminates the need for\ntask specific adaptations in network architectures and also results in\nparameter and data efficient models. Furthermore, label sharing framework is\nnaturally amenable for incremental learning where segmentations for new\ndatasets can be easily learnt. 
We experimentally validate our method on various\nmedical image segmentation datasets, each involving multi-label segmentation.\nFurthermore, we demonstrate the efficacy of the proposed method in terms of\nperformance and incremental learning ability vis-a-vis alternative methods.\n","authors":["Deepa Anand","Bipul Das","Vyshnav Dangeti","Antony Jerald","Rakesh Mullick","Uday Patil","Pakhi Sharma","Prasad Sudhakar"],"pdf_url":"https://arxiv.org/pdf/2411.11105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11101v1","updated":"2024-11-17T15:17:08Z","published":"2024-11-17T15:17:08Z","title":"Different Horses for Different Courses: Comparing Bias Mitigation\n Algorithms in ML","summary":" With fairness concerns gaining significant attention in Machine Learning\n(ML), several bias mitigation techniques have been proposed, often compared\nagainst each other to find the best method. These benchmarking efforts tend to\nuse a common setup for evaluation under the assumption that providing a uniform\nenvironment ensures a fair comparison. However, bias mitigation techniques are\nsensitive to hyperparameter choices, random seeds, feature selection, etc.,\nmeaning that comparison on just one setting can unfairly favour certain\nalgorithms. In this work, we show significant variance in fairness achieved by\nseveral algorithms and the influence of the learning pipeline on fairness\nscores. We highlight that most bias mitigation techniques can achieve\ncomparable performance, given the freedom to perform hyperparameter\noptimization, suggesting that the choice of the evaluation parameters-rather\nthan the mitigation technique itself-can sometimes create the perceived\nsuperiority of one method over another. 
We hope our work encourages future\nresearch on how various choices in the lifecycle of developing an algorithm\nimpact fairness, and trends that guide the selection of appropriate algorithms.\n","authors":["Prakhar Ganeesh","Usman Gohar","Lu Cheng","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2411.11101v1.pdf","comment":"To appear at AFME@NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11099v1","updated":"2024-11-17T15:00:39Z","published":"2024-11-17T15:00:39Z","title":"Mitigating Relative Over-Generalization in Multi-Agent Reinforcement\n Learning","summary":" In decentralized multi-agent reinforcement learning, agents learning in\nisolation can lead to relative over-generalization (RO), where optimal joint\nactions are undervalued in favor of suboptimal ones. This hinders effective\ncoordination in cooperative tasks, as agents tend to choose actions that are\nindividually rational but collectively suboptimal. To address this issue, we\nintroduce MaxMax Q-Learning (MMQ), which employs an iterative process of\nsampling and evaluating potential next states, selecting those with maximal\nQ-values for learning. This approach refines approximations of ideal state\ntransitions, aligning more closely with the optimal joint policy of\ncollaborating agents. We provide theoretical analysis supporting MMQ's\npotential and present empirical evaluations across various environments\nsusceptible to RO. 
Our results demonstrate that MMQ frequently outperforms\nexisting baselines, exhibiting enhanced convergence and sample efficiency.\n","authors":["Ting Zhu","Yue Jin","Jeremie Houssineau","Giovanni Montana"],"pdf_url":"https://arxiv.org/pdf/2411.11099v1.pdf","comment":"Published in Transactions on Machine Learning Research (11/2024)"},{"id":"http://arxiv.org/abs/2403.05100v2","updated":"2024-11-17T14:42:51Z","published":"2024-03-08T07:03:18Z","title":"Exploring the Adversarial Frontier: Quantifying Robustness via\n Adversarial Hypervolume","summary":" The escalating threat of adversarial attacks on deep learning models,\nparticularly in security-critical fields, has underscored the need for robust\ndeep learning systems. Conventional robustness evaluations have relied on\nadversarial accuracy, which measures a model's performance under a specific\nperturbation intensity. However, this singular metric does not fully\nencapsulate the overall resilience of a model against varying degrees of\nperturbation. To address this gap, we propose a new metric termed adversarial\nhypervolume, assessing the robustness of deep learning models comprehensively\nover a range of perturbation intensities from a multi-objective optimization\nstandpoint. This metric allows for an in-depth comparison of defense mechanisms\nand recognizes the trivial improvements in robustness afforded by less potent\ndefensive strategies. Additionally, we adopt a novel training algorithm that\nenhances adversarial robustness uniformly across various perturbation\nintensities, in contrast to methods narrowly focused on optimizing adversarial\naccuracy. Our extensive empirical studies validate the effectiveness of the\nadversarial hypervolume metric, demonstrating its ability to reveal subtle\ndifferences in robustness that adversarial accuracy overlooks. 
This research\ncontributes a new measure of robustness and establishes a standard for\nassessing and benchmarking the resilience of current and future defensive\nmodels against adversarial threats.\n","authors":["Ping Guo","Cheng Gong","Xi Lin","Zhiyuan Yang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11057v1","updated":"2024-11-17T12:38:13Z","published":"2024-11-17T12:38:13Z","title":"Reinforcing Competitive Multi-Agents for Playing So Long Sucker","summary":" This paper examines the use of classical deep reinforcement learning (DRL)\nalgorithms, DQN, DDQN, and Dueling DQN, in the strategy game So Long Sucker\n(SLS), a diplomacy-driven game defined by coalition-building and strategic\nbetrayal. SLS poses unique challenges due to its blend of cooperative and\nadversarial dynamics, making it an ideal platform for studying multi-agent\nlearning and game theory. The study's primary goal is to teach autonomous\nagents the game's rules and strategies using classical DRL methods. To support\nthis effort, the authors developed a novel, publicly available implementation\nof SLS, featuring a graphical user interface (GUI) and benchmarking tools for\nDRL algorithms. Experimental results reveal that while considered basic by\nmodern DRL standards, DQN, DDQN, and Dueling DQN agents achieved roughly 50% of\nthe maximum possible game reward. This suggests a baseline understanding of the\ngame's mechanics, with agents favoring legal moves over illegal ones. However,\na significant limitation was the extensive training required, around 2000\ngames, for agents to reach peak performance, compared to human players who\ngrasp the game within a few rounds. Even after prolonged training, agents\noccasionally made illegal moves, highlighting both the potential and\nlimitations of these classical DRL methods in semi-complex, socially driven\ngames. 
The findings establish a foundational benchmark for training agents in\nSLS and similar negotiation-based environments while underscoring the need for\nadvanced or hybrid DRL approaches to improve learning efficiency and\nadaptability. Future research could incorporate game-theoretic strategies to\nenhance agent decision-making in dynamic multi-agent contexts.\n","authors":["Medant Sharan","Chandranath Adak"],"pdf_url":"https://arxiv.org/pdf/2411.11057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11053v1","updated":"2024-11-17T12:31:04Z","published":"2024-11-17T12:31:04Z","title":"SRA-MCTS: Self-driven Reasoning Aurmentation with Monte Carlo Tree\n Search for Enhanced Code Generation","summary":" Large language models demonstrate exceptional performance in simple code\ngeneration tasks but still face challenges in tackling complex problems. These\nchallenges may stem from insufficient reasoning and problem decomposition\ncapabilities. To address this issue, we propose a reasoning-augmented data\ngeneration process, SRA-MCTS, which guides the model to autonomously generate\nhigh-quality intermediate reasoning paths. This creates a positive feedback\nloop, enabling continuous improvement. Our method operates entirely through the\nmodel itself without requiring additional supervision. By synthesizing natural\nlanguage reasoning paths and translating them into executable code, the\napproach ensures analytical accuracy and enhances the success rate in solving\ncomplex tasks. Experimental results show that, even without additional\nsupervisory signals, our method achieves performance improvements across\ndifferent model scales, demonstrating the significant potential of\nself-improvement in small models. Furthermore, the method remains robust when\ntraditional Chain-of-Thought (CoT) approaches exhibit performance degradation,\nwith notable improvements observed in diversity metrics such as pass@10. 
We\nencourage further exploration of reasoning processes within training data to\nenhance the ability of language models to address complex problems.\n","authors":["Bin Xu","Yiguan Lin","Yinghao Li"," YangGao"],"pdf_url":"https://arxiv.org/pdf/2411.11053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17747v5","updated":"2024-11-17T12:18:45Z","published":"2024-02-27T18:32:11Z","title":"When Your AIs Deceive You: Challenges of Partial Observability in\n Reinforcement Learning from Human Feedback","summary":" Past analyses of reinforcement learning from human feedback (RLHF) assume\nthat the human evaluators fully observe the environment. What happens when\nhuman feedback is based only on partial observations? We formally define two\nfailure cases: deceptive inflation and overjustification. Modeling the human as\nBoltzmann-rational w.r.t. a belief over trajectories, we prove conditions under\nwhich RLHF is guaranteed to result in policies that deceptively inflate their\nperformance, overjustify their behavior to make an impression, or both. Under\nthe new assumption that the human's partial observability is known and\naccounted for, we then analyze how much information the feedback process\nprovides about the return function. We show that sometimes, the human's\nfeedback determines the return function uniquely up to an additive constant,\nbut in other realistic cases, there is irreducible ambiguity. 
We propose\nexploratory research directions to help tackle these challenges, experimentally\nvalidate both the theoretical concerns and potential mitigations, and caution\nagainst blindly applying RLHF in partially observable settings.\n","authors":["Leon Lang","Davis Foote","Stuart Russell","Anca Dragan","Erik Jenner","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2402.17747v5.pdf","comment":"Advances in Neural Information Processing Systems 37 (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2311.18252v3","updated":"2024-11-17T12:09:49Z","published":"2023-11-30T05:03:08Z","title":"Privacy and Copyright Protection in Generative AI: A Lifecycle\n Perspective","summary":" The advent of Generative AI has marked a significant milestone in artificial\nintelligence, demonstrating remarkable capabilities in generating realistic\nimages, texts, and data patterns. However, these advancements come with\nheightened concerns over data privacy and copyright infringement, primarily due\nto the reliance on vast datasets for model training. Traditional approaches\nlike differential privacy, machine unlearning, and data poisoning only offer\nfragmented solutions to these complex issues. Our paper delves into the\nmultifaceted challenges of privacy and copyright protection within the data\nlifecycle. We advocate for integrated approaches that combines technical\ninnovation with ethical foresight, holistically addressing these concerns by\ninvestigating and devising solutions that are informed by the lifecycle\nperspective. 
This work aims to catalyze a broader discussion and inspire\nconcerted efforts towards data privacy and copyright integrity in Generative\nAI.\n","authors":["Dawen Zhang","Boming Xia","Yue Liu","Xiwei Xu","Thong Hoang","Zhenchang Xing","Mark Staples","Qinghua Lu","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.18252v3.pdf","comment":"Accepted by 2024 IEEE/ACM 3rd International Conference on AI\n Engineering - Software Engineering for AI (CAIN)"},{"id":"http://arxiv.org/abs/2411.11046v1","updated":"2024-11-17T11:53:54Z","published":"2024-11-17T11:53:54Z","title":"Knowledge-enhanced Transformer for Multivariate Long Sequence\n Time-series Forecasting","summary":" Multivariate Long Sequence Time-series Forecasting (LSTF) has been a critical\ntask across various real-world applications. Recent advancements focus on the\napplication of transformer architectures attributable to their ability to\ncapture temporal patterns effectively over extended periods. However, these\napproaches often overlook the inherent relationships and interactions between\nthe input variables that could be drawn from their characteristic properties.\nIn this paper, we aim to bridge this gap by integrating information-rich\nKnowledge Graph Embeddings (KGE) with state-of-the-art transformer-based\narchitectures. We introduce a novel approach that encapsulates conceptual\nrelationships among variables within a well-defined knowledge graph, forming\ndynamic and learnable KGEs for seamless integration into the transformer\narchitecture. We investigate the influence of this integration into seminal\narchitectures such as PatchTST, Autoformer, Informer, and Vanilla Transformer.\nFurthermore, we thoroughly investigate the performance of these\nknowledge-enhanced architectures along with their original implementations for\nlong forecasting horizons and demonstrate significant improvement in the\nbenchmark results. 
This enhancement empowers transformer-based architectures to\naddress the inherent structural relation between variables. Our\nknowledge-enhanced approach improves the accuracy of multivariate LSTF by\ncapturing complex temporal and relational dynamics across multiple domains. To\nsubstantiate the validity of our model, we conduct comprehensive experiments\nusing Weather and Electric Transformer Temperature (ETT) datasets.\n","authors":["Shubham Tanaji Kakde","Rony Mitra","Jasashwi Mandal","Manoj Kumar Tiwari"],"pdf_url":"https://arxiv.org/pdf/2411.11046v1.pdf","comment":"9 pages, 4 figures, 4 tables"}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.11206v1","updated":"2024-11-17T23:40:00Z","published":"2024-11-17T23:40:00Z","title":"Capturing Sparks of Abstraction for the ARC Challenge","summary":" Excellent progress has been made recently in solving ARC Challenge problems.\nHowever, it seems that new techniques may be required to push beyond 60%\naccuracy. Even commercial Large Language Models (LLMs) struggle to 'understand'\nmany of the problems (when given the input and output grids), which makes\ndiscovering solutions by LLM-lead program search somewhat futile.\n In this work, LLM 'understanding' is attempted from a stronger starting\nposition : An LLM is given complete solutions to tasks in code, and then asked\nto explain how the task is being solved at various levels of abstraction.\nSpecifically, the LLM was given code solutions implemented in arc-dsl-llm (an\nLLM-legible version of Hodel's arc-dsl to obtain: (a) commented code; (b) code\nrefactored into reusable functional chunks; (c) problem solution steps; and (d)\nhigh-level problem-solving tactics.\n We demonstrate that 'Sparks of Abstraction' can be extracted from the LLM\noutput - in a form that could be used in downstream tasks with Local LLMs\neligible to enter the ARC Prize.\n Both the arc-dsl-llm DSL framework (with the re-engineered solutions) and the\nGemini LLM-generated data 
(along with the generation code) are made Open\nSource.\n","authors":["Martin Andrews"],"pdf_url":"https://arxiv.org/pdf/2411.11206v1.pdf","comment":"Submitted as a paper entry for the 2024 ARC Prize"},{"id":"http://arxiv.org/abs/2411.11203v1","updated":"2024-11-17T23:36:37Z","published":"2024-11-17T23:36:37Z","title":"Debiasing Watermarks for Large Language Models via Maximal Coupling","summary":" Watermarking language models is essential for distinguishing between human\nand machine-generated text and thus maintaining the integrity and\ntrustworthiness of digital communication. We present a novel green/red list\nwatermarking approach that partitions the token set into ``green'' and ``red''\nlists, subtly increasing the generation probability for green tokens. To\ncorrect token distribution bias, our method employs maximal coupling, using a\nuniform coin flip to decide whether to apply bias correction, with the result\nembedded as a pseudorandom watermark signal. Theoretical analysis confirms this\napproach's unbiased nature and robust detection capabilities. Experimental\nresults show that it outperforms prior techniques by preserving text quality\nwhile maintaining high detectability, and it demonstrates resilience to\ntargeted modifications aimed at improving text quality. This research provides\na promising watermarking solution for language models, balancing effective\ndetection with minimal impact on text quality.\n","authors":["Yangxinyu Xie","Xiang Li","Tanwi Mallick","Weijie J. Su","Ruixun Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06304v2","updated":"2024-11-17T23:22:18Z","published":"2024-10-08T19:25:26Z","title":"FG-PRM: Fine-grained Hallucination Detection and Mitigation in Language\n Model Mathematical Reasoning","summary":" Hallucinations in large language models (LLMs) pose significant challenges in\ntasks requiring complex multi-step reasoning, such as mathematical\nproblem-solving. 
Existing approaches primarily detect the presence of\nhallucinations but lack a nuanced understanding of their types and\nmanifestations. In this paper, we first introduce a comprehensive taxonomy that\ncategorizes the common hallucinations in mathematical reasoning task into six\ntypes: fabrication, factual inconsistency, context inconsistency, instruction\ninconsistency, logical inconsistency, and logical error. We then propose FG-PRM\n(Fine-Grained Process Reward Model), an augmented model designed to detect and\nmitigate hallucinations in a fine-grained, step-level manner. To address the\nlimitations of manually labeling training data, we propose an automated method\nfor generating fine-grained hallucination data using LLMs. By injecting\nhallucinations into reasoning steps of correct solutions, we create a diverse\nand balanced synthetic dataset for training FG-PRM, which consists of six\nspecialized Process Reward Models (PRMs), each tailored to detect a specific\nhallucination type. Our FG-PRM demonstrates superior performance across two key\ntasks: 1) Fine-grained hallucination detection: classifying hallucination types\nfor each reasoning step; and 2) Verification: ranking multiple LLM-generated\noutputs to select the most accurate solution, mitigating reasoning\nhallucinations. Our experiments show that FG-PRM outperforms ChatGPT-3.5 and\nClaude-3 on fine-grained hallucination detection and substantially boosts the\nperformance of LLMs on GSM8K and MATH benchmarks.\n","authors":["Ruosen Li","Ziming Luo","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2410.06304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13710v2","updated":"2024-11-17T22:32:53Z","published":"2024-09-06T16:17:06Z","title":"You can remove GPT2's LayerNorm by fine-tuning","summary":" The LayerNorm (LN) layer in GPT-style transformer models has long been a\nhindrance to mechanistic interpretability. 
LN is a crucial component required\nto stabilize the training of large language models, and LN or the similar\nRMSNorm have been used in practically all large language models based on the\ntransformer architecture. The non-linear nature of the LN layers is a hindrance\nfor mechanistic interpretability as it hinders interpretation of the residual\nstream, and makes it difficult to decompose the model into circuits. Some\nresearchers have gone so far as to name \"reasons interpretability researchers\nhate layer norm.\"\n In this paper we show that it is possible to remove the LN layers from a\npre-trained GPT2-small model by fine-tuning on a fraction (500M tokens) of the\ntraining data. We demonstrate that this LN-free model achieves similar\nperformance to the original model on the OpenWebText and ThePile datasets\n(-0.05 cross-entropy loss), and the Hellaswag benchmark (-0.5% accuracy). We\nprovide our implementation at https://github.com/ApolloResearch/gpt2_noLN, and\nfine-tuned GPT2-small models at\nhttps://huggingface.co/apollo-research/gpt2_noLN.\n Our work not only provides a simplified model for mechanistic\ninterpretability research, but also provides evidence that the LN layers, at\ninference time, do not play a crucial role in transformer models.\n","authors":["Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.13710v2.pdf","comment":"Presented at the Attributing Model Behavior at Scale (ATTRIB) and\n Interpretable AI: Past, Present, and Future workshops at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.03607v2","updated":"2024-11-17T21:40:50Z","published":"2024-02-06T00:51:27Z","title":"Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success\n using Knowledge-infused Learning","summary":" The digital landscape continually evolves with multimodality, enriching the\nonline experience for users. 
Creators and marketers aim to weave subtle\ncontextual cues from various modalities into congruent content to engage users\nwith a harmonious message. This interplay of multimodal cues is often a crucial\nfactor in attracting users' attention. However, this richness of multimodality\npresents a challenge to computational modeling, as the semantic contextual cues\nspanning across modalities need to be unified to capture the true holistic\nmeaning of the multimodal content. This contextual meaning is critical in\nattracting user engagement as it conveys the intended message of the brand or\nthe organization. In this work, we incorporate external commonsense knowledge\nfrom knowledge graphs to enhance the representation of multimodal data using\ncompact Visual Language Models (VLMs) and predict the success of multi-modal\ncrowdfunding campaigns. Our results show that external knowledge commonsense\nbridges the semantic gap between text and image modalities, and the enhanced\nknowledge-infused representations improve the predictive performance of models\nfor campaign success upon the baselines without knowledge. Our findings\nhighlight the significance of contextual congruence in online multimodal\ncontent for engaging and successful crowdfunding campaigns.\n","authors":["Trilok Padhi","Ugur Kursuncu","Yaman Kumar","Valerie L. 
Shalin","Lane Peterson Fronczek"],"pdf_url":"https://arxiv.org/pdf/2402.03607v2.pdf","comment":"Accepted at IEEE International Conference on Big Data 2024 (IEEE\n BigData 2024)"},{"id":"http://arxiv.org/abs/2409.13870v3","updated":"2024-11-17T21:28:01Z","published":"2024-09-20T19:49:45Z","title":"Instruct-Tuning Pretrained Causal Language Models for Ancient Greek\n Papyrology and Epigraphy","summary":" This article presents an experiment in fine-tuning a pretrained causal\nlanguage model (Meta's Llama 3.1 8B Instruct) to assist with restoring missing\nor illegible characters in ancient Greek inscriptions and documentary papyri.\nUtilizing a straightforward instruction-based approach and a 95%/5% train/test\nsplit, the papyrus restoration model achieved a character error rate (CER) of\n14.9%, a top-1 accuracy of 73.5%, and a top-20 accuracy of 86.0% for sequences\nup to 10 characters. A model was also fine-tuned for geographic attribution,\nreaching a top-1 accuracy of 66.4% and a top-3 accuracy of 79.9%. In\nchronological attribution, it demonstrated an average deviation of 21.7 years\nfrom the actual terminus post/ante quem, with a median deviation of 0 years.\nFor inscriptions, the restoration model achieved a CER of 20.5%, a top-1\naccuracy of 63.7%, and a top-20 accuracy of 83.0% for sequences up to 10\ncharacters. In geographic attribution, it attained a top-1 accuracy of 75.0%\nand a top-3 accuracy of 83.7%, while in dating, it had an average deviation of\n37.1 years and a median deviation of 3 years from the actual date range.\nBenchmarked against the state-of-the-art model (Ithaca) on a shared test set\nand on recently edited inscriptions, the instruction-tuned models excelled in\ntext restoration, while also offering the practical advantage of ignoring\nspaces during reconstruction, which aligns with the scriptio continua of\nancient textual artifacts. However, their performance in geographic and\nchronological attribution was lower than Ithaca's. 
To evaluate the approach in\na more even setup, the instruction model was retrained with an 80%/10%/10%\ntrain-validation-test split, and still outperformed Ithaca in text restoration.\nThe results suggest that fine-tuning larger pretrained causal language models\nusing instruction templates for emendations and conjectures to ancient texts\nholds promise.\n","authors":["Eric Cullhed"],"pdf_url":"https://arxiv.org/pdf/2409.13870v3.pdf","comment":"9 pages, 1 table. To be submitted"},{"id":"http://arxiv.org/abs/2411.11171v1","updated":"2024-11-17T20:44:34Z","published":"2024-11-17T20:44:34Z","title":"LLäMmlein: Compact and Competitive German-Only Language Models from\n Scratch","summary":" We create two German-only decoder models, LL\\\"aMmlein 120M and 1B,\ntransparently from scratch and publish them, along with the training data, for\nthe German NLP research community to use. The model training involved several\nkey steps, including extensive data preprocessing, the creation of a custom\nGerman tokenizer, the training itself, as well as the evaluation of the final\nmodels on various benchmarks. Throughout the training process, multiple\ncheckpoints were saved and analyzed using the SuperGLEBer benchmark to monitor\nthe models' learning dynamics. Compared to state-of-the-art models on the\nSuperGLEBer benchmark, both LL\\\"aMmlein models performed competitively,\nconsistently matching or surpassing models with similar parameter sizes. 
The\nresults show that the models' quality scales with size as expected, but\nperformance improvements on some tasks plateaued early, offering valuable\ninsights into resource allocation for future model development.\n","authors":["Jan Pfister","Julia Wunderle","Andreas Hotho"],"pdf_url":"https://arxiv.org/pdf/2411.11171v1.pdf","comment":"first draft;\n https://www.informatik.uni-wuerzburg.de/datascience/projects/nlp/llammlein/"},{"id":"http://arxiv.org/abs/2406.10965v2","updated":"2024-11-17T17:30:24Z","published":"2024-06-16T14:51:12Z","title":"DocNet: Semantic Structure in Inductive Bias Detection Models","summary":" News will have biases so long as people have opinions. It is increasingly\nimportant for informed citizens to be able to identify bias as social media\nbecomes the primary entry point for news and partisan differences increase. If\npeople know the biases of the news they are consuming, they will be able to\ntake action to avoid polarizing echo chambers. In this paper, we explore an\noften overlooked aspect of bias detection in documents: the semantic structure\nof news articles. We present DocNet, a novel, inductive, and low-resource\ndocument embedding and bias detection model that outperforms large language\nmodels. We also demonstrate that the semantic structure of news articles from\nopposing partisan sides, as represented in document-level graph embeddings,\nhave significant similarities. These results can be used to advance bias\ndetection in low-resource environments. 
Our code, data, and the corresponding\ndatasheet are made available at: https://anonymous.4open.science/r/DocNet/.\n","authors":["Jessica Zhu","Iain Cruickshank","Michel Cukier"],"pdf_url":"https://arxiv.org/pdf/2406.10965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05168v2","updated":"2024-11-17T17:26:23Z","published":"2024-10-07T16:25:39Z","title":"ReasoningRank: Teaching Student Models to Rank through Reasoning-Based\n Knowledge Distillation","summary":" Reranking documents based on their relevance to a given query is a critical\ntask in information retrieval. Traditional reranking methods often lack\ntransparency and rely on proprietary models, hindering reproducibility and\ninterpretability. We propose Reason-to-Rank (R2R), a novel open-source\nreranking approach that enhances transparency by generating two types of\nreasoning: direct relevance reasoning, which explains how a document addresses\nthe query, and comparison reasoning, which justifies the relevance of one\ndocument over another. We leverage large language models (LLMs) as teacher\nmodels to generate these explanations and distill this knowledge into smaller,\nopenly available student models. Our student models are trained to generate\nmeaningful reasoning and rerank documents, achieving competitive performance\nacross multiple datasets, including MSMARCO and BRIGHT. Experiments demonstrate\nthat R2R not only improves reranking accuracy but also provides valuable\ninsights into the decision-making process. 
By offering a structured and\ninterpretable solution with openly accessible resources, R2R aims to bridge the\ngap between effectiveness and transparency in information retrieval, fostering\nreproducibility and further research in the field.\n","authors":["Yuelyu Ji","Zhuochun Li","Rui Meng","Daqing He"],"pdf_url":"https://arxiv.org/pdf/2410.05168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05558v2","updated":"2024-11-17T17:00:11Z","published":"2024-10-07T23:36:05Z","title":"Narrative-of-Thought: Improving Temporal Reasoning of Large Language\n Models via Recounted Narratives","summary":" Reasoning about time and temporal relations is an integral aspect of human\ncognition, essential for perceiving the world and navigating our experiences.\nThough large language models (LLMs) have demonstrated impressive performance in\nmany reasoning tasks, temporal reasoning remains challenging due to its\nintrinsic complexity. In this work, we first study an essential task of\ntemporal reasoning -- temporal graph generation, to unveil LLMs' inherent,\nglobal reasoning capabilities. We show that this task presents great challenges\neven for the most powerful LLMs, such as GPT-3.5/4. We also notice a\nsignificant performance gap by small models (<10B) that lag behind LLMs by 50%.\nNext, we study how to close this gap with a budget constraint, e.g., not using\nmodel finetuning. We propose a new prompting technique tailored for temporal\nreasoning, Narrative-of-Thought (NoT), that first converts the events set to a\nPython class, then prompts a small model to generate a temporally grounded\nnarrative, guiding the final generation of a temporal graph. Extensive\nexperiments showcase the efficacy of NoT in improving various metrics. Notably,\nNoT attains the highest F1 on the Schema-11 evaluation set, while securing an\noverall F1 on par with GPT-3.5. NoT also achieves the best structural\nsimilarity across the board, even compared with GPT-3.5/4. 
Our code is\navailable at https://github.com/launchnlp/NoT.\n","authors":["Xinliang Frederick Zhang","Nick Beauchamp","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2410.05558v2.pdf","comment":"EMNLP'24 Findings"},{"id":"http://arxiv.org/abs/2406.20098v2","updated":"2024-11-17T16:11:00Z","published":"2024-06-28T17:59:46Z","title":"Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework\n for Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown impressive success across\nmodalities such as image, video, and audio in a variety of understanding and\ngeneration tasks. However, current MLLMs are surprisingly poor at understanding\nwebpage screenshots and generating their corresponding HTML code. To address\nthis problem, we propose $\\texttt{Web2Code}$, a benchmark consisting of a new\nlarge-scale webpage-to-code dataset for instruction tuning and an evaluation\nframework for the webpage understanding and HTML code translation abilities of\nMLLMs. For dataset construction, we leverage pretrained LLMs to enhance\nexisting webpage-to-code datasets as well as generate a diverse pool of new\nwebpages rendered into images. Specifically, the inputs are webpage images and\ninstructions, while the responses are the webpage's HTML code. We further\ninclude diverse natural language QA pairs about the webpage content in the\nresponses to enable a more comprehensive understanding of the web content. To\nevaluate model performance in these tasks, we develop an evaluation framework\nfor testing MLLMs' abilities in webpage understanding and web-to-code\ngeneration. Extensive experiments show that our proposed dataset is beneficial\nnot only to our proposed tasks but also in the general visual domain. We hope\nour work will contribute to the development of general MLLMs suitable for\nweb-based content generation and task automation. 
Our data and code are\navailable at https://github.com/MBZUAI-LLM/web2code.\n","authors":["Sukmin Yun","Haokun Lin","Rusiru Thushara","Mohammad Qazim Bhat","Yongxin Wang","Zutao Jiang","Mingkai Deng","Jinhong Wang","Tianhua Tao","Junbo Li","Haonan Li","Preslav Nakov","Timothy Baldwin","Zhengzhong Liu","Eric P. Xing","Xiaodan Liang","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2406.20098v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at\n https://mbzuai-llm.github.io/webpage2code/"},{"id":"http://arxiv.org/abs/2406.18528v2","updated":"2024-11-17T15:09:54Z","published":"2024-06-26T17:56:29Z","title":"PrExMe! Large Scale Prompt Exploration of Open Source LLMs for Machine\n Translation and Summarization Evaluation","summary":" Large language models (LLMs) have revolutionized NLP research. Notably,\nin-context learning enables their use as evaluation metrics for natural\nlanguage generation, making them particularly advantageous in low-resource\nscenarios and time-restricted applications. In this work, we introduce PrExMe,\na large-scale Prompt Exploration for Metrics, where we evaluate more than 720\nprompt templates for open-source LLM-based metrics on machine translation (MT)\nand summarization datasets, totalling over 6.6M evaluations. This extensive\ncomparison (1) benchmarks recent open-source LLMs as metrics and (2) explores\nthe stability and variability of different prompting strategies. We discover\nthat, on the one hand, there are scenarios for which prompts are stable. For\ninstance, some LLMs show idiosyncratic preferences and favor to grade generated\ntexts with textual labels while others prefer to return numeric scores. On the\nother hand, the stability of prompts and model rankings can be susceptible to\nseemingly innocuous changes. For example, changing the requested output format\nfrom \"0 to 100\" to \"-1 to +1\" can strongly affect the rankings in our\nevaluation. 
Our study contributes to understanding the impact of different\nprompting approaches on LLM-based metrics for MT and summarization evaluation,\nhighlighting the most stable prompting patterns and potential limitations.\n","authors":["Christoph Leiter","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2406.18528v2.pdf","comment":"EMNLP 2024 main; camera-ready"},{"id":"http://arxiv.org/abs/2306.13041v2","updated":"2024-11-17T14:17:08Z","published":"2023-06-22T17:07:57Z","title":"Towards Explainable Evaluation Metrics for Machine Translation","summary":" Unlike classical lexical overlap metrics such as BLEU, most current\nevaluation metrics for machine translation (for example, COMET or BERTScore)\nare based on black-box large language models. They often achieve strong\ncorrelations with human judgments, but recent research indicates that the\nlower-quality classical metrics remain dominant, one of the potential reasons\nbeing that their decision processes are more transparent. To foster more\nwidespread acceptance of novel high-quality metrics, explainability thus\nbecomes crucial. In this concept paper, we identify key properties as well as\nkey goals of explainable machine translation metrics and provide a\ncomprehensive synthesis of recent techniques, relating them to our established\ngoals and properties. In this context, we also discuss the latest\nstate-of-the-art approaches to explainable metrics based on generative models\nsuch as ChatGPT and GPT4. Finally, we contribute a vision of next-generation\napproaches, including natural language explanations. We hope that our work can\nhelp catalyze and guide future research on explainable evaluation metrics and,\nmediately, also contribute to better and more transparent machine translation\nsystems.\n","authors":["Christoph Leiter","Piyawat Lertvittayakumjorn","Marina Fomicheva","Wei Zhao","Yang Gao","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2306.13041v2.pdf","comment":"Published at JMLR 3/24. 
We released an earlier preprint of this paper\n under a different title (arXiv:2203.11131)"},{"id":"http://arxiv.org/abs/2411.11081v1","updated":"2024-11-17T14:14:36Z","published":"2024-11-17T14:14:36Z","title":"The Promises and Pitfalls of LLM Annotations in Dataset Labeling: a Case\n Study on Media Bias Detection","summary":" High annotation costs from hiring or crowdsourcing complicate the creation of\nlarge, high-quality datasets needed for training reliable text classifiers.\nRecent research suggests using Large Language Models (LLMs) to automate the\nannotation process, reducing these costs while maintaining data quality. LLMs\nhave shown promising results in annotating downstream tasks like hate speech\ndetection and political framing. Building on the success in these areas, this\nstudy investigates whether LLMs are viable for annotating the complex task of\nmedia bias detection and whether a downstream media bias classifier can be\ntrained on such data. We create annolexical, the first large-scale dataset for\nmedia bias classification with over 48000 synthetically annotated examples. Our\nclassifier, fine-tuned on this dataset, surpasses all of the annotator LLMs by\n5-9 percent in Matthews Correlation Coefficient (MCC) and performs close to or\noutperforms the model trained on human-labeled data when evaluated on two media\nbias benchmark datasets (BABE and BASIL). 
This study demonstrates how our\napproach significantly reduces the cost of dataset creation in the media bias\ndomain and, by extension, the development of classifiers, while our subsequent\nbehavioral stress-testing reveals some of its current limitations and\ntrade-offs.\n","authors":["Tomas Horych","Christoph Mandl","Terry Ruas","Andre Greiner-Petter","Bela Gipp","Akiko Aizawa","Timo Spinde"],"pdf_url":"https://arxiv.org/pdf/2411.11081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11072v1","updated":"2024-11-17T13:21:26Z","published":"2024-11-17T13:21:26Z","title":"Multilingual Large Language Models: A Systematic Survey","summary":" This paper provides a comprehensive survey of the latest research on\nmultilingual large language models (MLLMs). MLLMs not only are able to\nunderstand and generate language across linguistic boundaries, but also\nrepresent an important advancement in artificial intelligence. We first discuss\nthe architecture and pre-training objectives of MLLMs, highlighting the key\ncomponents and methodologies that contribute to their multilingual\ncapabilities. We then discuss the construction of multilingual pre-training and\nalignment datasets, underscoring the importance of data quality and diversity\nin enhancing MLLM performance. An important focus of this survey is on the\nevaluation of MLLMs. We present a detailed taxonomy and roadmap covering the\nassessment of MLLMs' cross-lingual knowledge, reasoning, alignment with human\nvalues, safety, interpretability and specialized applications. Specifically, we\nextensively discuss multilingual evaluation benchmarks and datasets, and\nexplore the use of LLMs themselves as multilingual evaluators. 
To enhance MLLMs\nfrom black to white boxes, we also address the interpretability of multilingual\ncapabilities, cross-lingual transfer and language bias within these models.\nFinally, we provide a comprehensive review of real-world applications of MLLMs\nacross diverse domains, including biology, medicine, computer science,\nmathematics and law. We showcase how these models have driven innovation and\nimprovements in these specialized fields while also highlighting the challenges\nand opportunities in deploying MLLMs within diverse language communities and\napplication scenarios.We listed the paper related in this survey and publicly\navailable at https://github.com/tjunlp-lab/Awesome-Multilingual-LLMs-Papers .\n","authors":["Shaolin Zhu"," Supryadi","Shaoyang Xu","Haoran Sun","Leiyu Pan","Menglong Cui","Jiangcun Du","Renren Jin","António Branco","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2411.11072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11061v1","updated":"2024-11-17T12:48:24Z","published":"2024-11-17T12:48:24Z","title":"Beyond Human-Like Processing: Large Language Models Perform Equivalently\n on Forward and Backward Scientific Text","summary":" The impressive performance of large language models (LLMs) has led to their\nconsideration as models of human language processing. Instead, we suggest that\nthe success of LLMs arises from the flexibility of the transformer learning\narchitecture. To evaluate this conjecture, we trained LLMs on scientific texts\nthat were either in a forward or backward format. Despite backward text being\ninconsistent with the structure of human languages, we found that LLMs\nperformed equally well in either format on a neuroscience benchmark, eclipsing\nhuman expert performance for both forward and backward orders. Our results are\nconsistent with the success of transformers across diverse domains, such as\nweather prediction and protein design. 
This widespread success is attributable\nto LLM's ability to extract predictive patterns from any sufficiently\nstructured input. Given their generality, we suggest caution in interpreting\nLLM's success in linguistic tasks as evidence for human-like mechanisms.\n","authors":["Xiaoliang Luo","Michael Ramscar","Bradley C. Love"],"pdf_url":"https://arxiv.org/pdf/2411.11061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11055v1","updated":"2024-11-17T12:32:44Z","published":"2024-11-17T12:32:44Z","title":"FastDraft: How to Train Your Draft","summary":" Speculative Decoding has gained popularity as an effective technique for\naccelerating the auto-regressive inference process of Large Language Models\n(LLMs). However, Speculative Decoding entirely relies on the availability of\nefficient draft models, which are often lacking for many existing language\nmodels due to a stringent constraint of vocabulary incompatibility. In this\nwork we introduce FastDraft, a novel and efficient approach for pre-training\nand aligning a draft model to any large language model by incorporating\nefficient pre-training, followed by fine-tuning over synthetic datasets\ngenerated by the target model. We demonstrate FastDraft by training two highly\nparameter efficient drafts for the popular Phi-3-mini and Llama-3.1-8B models.\nUsing FastDraft, we were able to produce a draft with approximately 10 billion\ntokens on a single server with 8 Intel$^\\circledR$ Gaudi$^\\circledR$ 2\naccelerators in under 24 hours. Our results show that the draft model achieves\nimpressive results in key metrics of acceptance rate, block efficiency and up\nto 3x memory bound speed up when evaluated on code completion and up to 2x in\nsummarization, text completion and instruction tasks. We validate our\ntheoretical findings through benchmarking on the latest Intel$^\\circledR$\nCore$^{\\tiny \\text{TM}}$ Ultra, achieving a wall-clock time speedup of up to\n2x, indicating a significant reduction in runtime. 
Due to its high quality,\nFastDraft unlocks large language models inference on AI-PC and other\nedge-devices.\n","authors":["Ofir Zafrir","Igor Margulis","Dorin Shteyman","Guy Boudoukh"],"pdf_url":"https://arxiv.org/pdf/2411.11055v1.pdf","comment":"ENLSP NeurIPS Workshop 2024"},{"id":"http://arxiv.org/abs/2411.11053v1","updated":"2024-11-17T12:31:04Z","published":"2024-11-17T12:31:04Z","title":"SRA-MCTS: Self-driven Reasoning Aurmentation with Monte Carlo Tree\n Search for Enhanced Code Generation","summary":" Large language models demonstrate exceptional performance in simple code\ngeneration tasks but still face challenges in tackling complex problems. These\nchallenges may stem from insufficient reasoning and problem decomposition\ncapabilities. To address this issue, we propose a reasoning-augmented data\ngeneration process, SRA-MCTS, which guides the model to autonomously generate\nhigh-quality intermediate reasoning paths. This creates a positive feedback\nloop, enabling continuous improvement. Our method operates entirely through the\nmodel itself without requiring additional supervision. By synthesizing natural\nlanguage reasoning paths and translating them into executable code, the\napproach ensures analytical accuracy and enhances the success rate in solving\ncomplex tasks. Experimental results show that, even without additional\nsupervisory signals, our method achieves performance improvements across\ndifferent model scales, demonstrating the significant potential of\nself-improvement in small models. Furthermore, the method remains robust when\ntraditional Chain-of-Thought (CoT) approaches exhibit performance degradation,\nwith notable improvements observed in diversity metrics such as pass@10. 
We\nencourage further exploration of reasoning processes within training data to\nenhance the ability of language models to address complex problems.\n","authors":["Bin Xu","Yiguan Lin","Yinghao Li"," YangGao"],"pdf_url":"https://arxiv.org/pdf/2411.11053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03472v3","updated":"2024-11-17T12:22:47Z","published":"2024-01-07T12:48:07Z","title":"PEneo: Unifying Line Extraction, Line Grouping, and Entity Linking for\n End-to-end Document Pair Extraction","summary":" Document pair extraction aims to identify key and value entities as well as\ntheir relationships from visually-rich documents. Most existing methods divide\nit into two separate tasks: semantic entity recognition (SER) and relation\nextraction (RE). However, simply concatenating SER and RE serially can lead to\nsevere error propagation, and it fails to handle cases like multi-line entities\nin real scenarios. To address these issues, this paper introduces a novel\nframework, PEneo (Pair Extraction new decoder option), which performs document\npair extraction in a unified pipeline, incorporating three concurrent\nsub-tasks: line extraction, line grouping, and entity linking. This approach\nalleviates the error accumulation problem and can handle the case of multi-line\nentities. Furthermore, to better evaluate the model's performance and to\nfacilitate future research on pair extraction, we introduce RFUND, a\nre-annotated version of the commonly used FUNSD and XFUND datasets, to make\nthem more accurate and cover realistic situations. Experiments on various\nbenchmarks demonstrate PEneo's superiority over previous pipelines, boosting\nthe performance by a large margin (e.g., 19.89%-22.91% F1 score on RFUND-EN)\nwhen combined with various backbones like LiLT and LayoutLMv3, showing its\neffectiveness and generality. 
Codes and the new annotations are available at\nhttps://github.com/ZeningLin/PEneo.\n","authors":["Zening Lin","Jiapeng Wang","Teng Li","Wenhui Liao","Dayi Huang","Longfei Xiong","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2401.03472v3.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2411.11027v1","updated":"2024-11-17T10:17:01Z","published":"2024-11-17T10:17:01Z","title":"BianCang: A Traditional Chinese Medicine Large Language Model","summary":" The rise of large language models (LLMs) has driven significant progress in\nmedical applications, including traditional Chinese medicine (TCM). However,\ncurrent medical LLMs struggle with TCM diagnosis and syndrome differentiation\ndue to substantial differences between TCM and modern medical theory, and the\nscarcity of specialized, high-quality corpora. This paper addresses these\nchallenges by proposing BianCang, a TCM-specific LLM, using a two-stage\ntraining process that first injects domain-specific knowledge and then aligns\nit through targeted stimulation. To enhance diagnostic and differentiation\ncapabilities, we constructed pre-training corpora, instruction-aligned datasets\nbased on real hospital records, and the ChP-TCM dataset derived from the\nPharmacopoeia of the People's Republic of China. 
We compiled extensive TCM and\nmedical corpora for continuous pre-training and supervised fine-tuning,\nbuilding a comprehensive dataset to refine the model's understanding of TCM.\nEvaluations across 11 test sets involving 29 models and 4 tasks demonstrate the\neffectiveness of BianCang, offering valuable insights for future research.\nCode, datasets, and models are available at\nhttps://github.com/QLU-NLP/BianCang.\n","authors":["Sibo Wei","Xueping Peng","Yi-fei Wang","Jiasheng Si","Weiyu Zhang","Wenpeng Lu","Xiaoming Wu","Yinglong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02272v2","updated":"2024-11-17T09:55:35Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC, a highly diverse dataset of abstract reasoning tasks. We train\nneural models for induction (inferring latent functions) and transduction\n(directly predicting the test output for a given test input). Our models are\ntrained on synthetic data generated by prompting LLMs to produce Python code\nspecifying a function to be inferred, plus a stochastic subroutine for\ngenerating inputs to that function. We find inductive and transductive models\nsolve very different problems, despite training on the same problems, and\ndespite sharing the same neural architecture.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. 
Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14898v3","updated":"2024-11-17T08:45:44Z","published":"2024-06-21T06:43:15Z","title":"Safely Learning with Private Data: A Federated Learning Framework for\n Large Language Model","summary":" Private data, being larger and quality-higher than public data, can greatly\nimprove large language models (LLM). However, due to privacy concerns, this\ndata is often dispersed in multiple silos, making its secure utilization for\nLLM training a challenge. Federated learning (FL) is an ideal solution for\ntraining models with distributed private data, but traditional frameworks like\nFedAvg are unsuitable for LLM due to their high computational demands on\nclients. An alternative, split learning, offloads most training parameters to\nthe server while training embedding and output layers locally, making it more\nsuitable for LLM. Nonetheless, it faces significant challenges in security and\nefficiency. Firstly, the gradients of embeddings are prone to attacks, leading\nto potential reverse engineering of private data. Furthermore, the server's\nlimitation of handle only one client's training request at a time hinders\nparallel training, severely impacting training efficiency. In this paper, we\npropose a Federated Learning framework for LLM, named FL-GLM, which prevents\ndata leakage caused by both server-side and peer-client attacks while improving\ntraining efficiency. Specifically, we first place the input block and output\nblock on local client to prevent embedding gradient attacks from server.\nSecondly, we employ key-encryption during client-server communication to\nprevent reverse engineering attacks from peer-clients. 
Lastly, we employ\noptimization methods like client-batching or server-hierarchical, adopting\ndifferent acceleration methods based on the actual computational capabilities\nof the server. Experimental results on NLU and generation tasks demonstrate\nthat FL-GLM achieves comparable metrics to centralized chatGLM model,\nvalidating the effectiveness of our federated learning framework.\n","authors":["JiaYing Zheng","HaiNan Zhang","LingXiang Wang","WangJie Qiu","HongWei Zheng","ZhiMing Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.14898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05365v4","updated":"2024-11-17T07:22:31Z","published":"2024-08-09T22:29:23Z","title":"FiSTECH: Financial Style Transfer to Enhance Creativity without\n Hallucinations in LLMs","summary":" Recent trends in Generative AI have emerged towards fine-tuning foundational\nlarge language models (LLMs) to create domain-specific LLMs for automation and\nchatbot-like applications. Specialized applications for analytics-heavy domains\nsuch as Financial report generation require specific writing styles that\ncomprise compound and creative sentences with minimized hallucinations. In this\nwork, we explore the self-corrective auto-regressive qualities of LLMs to learn\ncreativity in writing styles with minimal prompting. We propose a novel\ntwo-stage fine-tuning (FT) strategy wherein in the first stage public domain\nfinancial reports are used to train for writing styles while allowing the LLM\nto hallucinate. In the second stage the examples of hallucinations are manually\ncorrected and further used to fine-tune the LLM. The finally trained LLM learns\nto generate specific financial report sections using minimal instructions and\ntabular data inputs while ensuring low fine-tuning costs. Our proposed\ntwo-stage fine-tuning boosts the accuracy of financial questions answering by\ntwo-folds while reducing hallucinations by over 50%. 
Also, the fine-tuned model\nhas lower perplexity, improved ROUGE, TER and BLEU scores, higher creativity\nand knowledge density with lower uncertainty and cross entropy than base LLMs.\nThus, the proposed framework can be generalized to train creativity in LLMs by\nfirst allowing them to hallucinate.\n","authors":["Sohini Roychowdhury","Marko Krema","Brian Moore","Xingjian Lai","Dike Effedua","Bharat Jethwani"],"pdf_url":"https://arxiv.org/pdf/2408.05365v4.pdf","comment":"10 pages, 14 figures, 5 tables, conference"},{"id":"http://arxiv.org/abs/2401.01286v5","updated":"2024-11-17T06:50:44Z","published":"2024-01-02T16:54:58Z","title":"A Comprehensive Study of Knowledge Editing for Large Language Models","summary":" Large Language Models (LLMs) have shown extraordinary capabilities in\nunderstanding and generating text that closely mirrors human communication.\nHowever, a primary limitation lies in the significant computational demands\nduring training, arising from their extensive parameterization. This challenge\nis further intensified by the dynamic nature of the world, necessitating\nfrequent updates to LLMs to correct outdated information or integrate new\nknowledge, thereby ensuring their continued relevance. Note that many\napplications demand continual model adjustments post-training to address\ndeficiencies or undesirable behaviors. There is an increasing interest in\nefficient, lightweight methods for on-the-fly model modifications. To this end,\nrecent years have seen a burgeoning in the techniques of knowledge editing for\nLLMs, which aim to efficiently modify LLMs' behaviors within specific domains\nwhile preserving overall performance across various inputs. In this paper, we\nfirst define the knowledge editing problem and then provide a comprehensive\nreview of cutting-edge approaches. 
Drawing inspiration from educational and\ncognitive research theories, we propose a unified categorization criterion that\nclassifies knowledge editing methods into three groups: resorting to external\nknowledge, merging knowledge into the model, and editing intrinsic knowledge.\nFurthermore, we introduce a new benchmark, KnowEdit, for a comprehensive\nempirical evaluation of representative knowledge editing approaches.\nAdditionally, we provide an in-depth analysis of knowledge location, which can\ngive a deeper understanding of the knowledge structures inherent within LLMs.\nFinally, we discuss several potential applications of knowledge editing,\noutlining its broad and impactful implications.\n","authors":["Ningyu Zhang","Yunzhi Yao","Bozhong Tian","Peng Wang","Shumin Deng","Mengru Wang","Zekun Xi","Shengyu Mao","Jintian Zhang","Yuansheng Ni","Siyuan Cheng","Ziwen Xu","Xin Xu","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Lei Liang","Zhiqiang Zhang","Xiaowei Zhu","Jun Zhou","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2401.01286v5.pdf","comment":"Ongoing work (v5): we have updated the Table 4 results after\n optimizing certain methods (related to AdaLoRA) and fixing computational bugs\n (related to ROME and MEMIT) in the EasyEdit. These improvements have led to\n better results than before. We will continue updating this paper and welcome\n everyone to discuss and exchange ideas"},{"id":"http://arxiv.org/abs/2411.05281v2","updated":"2024-11-17T05:40:44Z","published":"2024-11-08T02:24:29Z","title":"Fox-1 Technical Report","summary":" We present Fox-1, a series of small language models (SLMs) consisting of\nFox-1-1.6B and Fox-1-1.6B-Instruct-v0.1. These models are pre-trained on 3\ntrillion tokens of web-scraped document data and fine-tuned with 5 billion\ntokens of instruction-following and multi-turn conversation data. 
Aiming to\nimprove the pre-training efficiency, Fox-1-1.6B model introduces a novel\n3-stage data curriculum across all the training data with 2K-8K sequence\nlength. In architecture design, Fox-1 features a deeper layer structure, an\nexpanded vocabulary, and utilizes Grouped Query Attention (GQA), offering a\nperformant and efficient architecture compared to other SLMs. Fox-1 achieves\nbetter or on-par performance in various benchmarks compared to StableLM-2-1.6B,\nGemma-2B, Qwen1.5-1.8B, and OpenELM1.1B, with competitive inference speed and\nthroughput. The model weights have been released under the Apache 2.0 license,\nwhere we aim to promote the democratization of LLMs and make them fully\naccessible to the whole open-source community.\n","authors":["Zijian Hu","Jipeng Zhang","Rui Pan","Zhaozhuo Xu","Shanshan Han","Han Jin","Alay Dilipbhai Shah","Dimitris Stripelis","Yuhang Yao","Salman Avestimehr","Chaoyang He","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.05281v2.pdf","comment":"Base model is available at\n https://huggingface.co/tensoropera/Fox-1-1.6B and the instruction-tuned\n version is available at\n https://huggingface.co/tensoropera/Fox-1-1.6B-Instruct-v0.1"},{"id":"http://arxiv.org/abs/2411.10955v1","updated":"2024-11-17T04:06:12Z","published":"2024-11-17T04:06:12Z","title":"A Topic-aware Comparable Corpus of Chinese Variations","summary":" This study aims to fill the gap by constructing a topic-aware comparable\ncorpus of Mainland Chinese Mandarin and Taiwanese Mandarin from the social\nmedia in Mainland China and Taiwan, respectively. 
Using Dcard for Taiwanese\nMandarin and Sina Weibo for Mainland Chinese, we create a comparable corpus\nthat updates regularly and reflects modern language use on social media.\n","authors":["Da-Chen Lian","Shu-Kai Hsieh"],"pdf_url":"https://arxiv.org/pdf/2411.10955v1.pdf","comment":"4 pages, 4 figures, presented at APCLC2018: ASIA-PACIFIC CORPUS\n LINGUISTICS CONFERENCE 2018"},{"id":"http://arxiv.org/abs/2411.10954v1","updated":"2024-11-17T03:53:24Z","published":"2024-11-17T03:53:24Z","title":"Dialectal Toxicity Detection: Evaluating LLM-as-a-Judge Consistency\n Across Language Varieties","summary":" There has been little systematic study on how dialectal differences affect\ntoxicity detection by modern LLMs. Furthermore, although using LLMs as\nevaluators (\"LLM-as-a-judge\") is a growing research area, their sensitivity to\ndialectal nuances is still underexplored and requires more focused attention.\nIn this paper, we address these gaps through a comprehensive toxicity\nevaluation of LLMs across diverse dialects. We create a multi-dialect dataset\nthrough synthetic transformations and human-assisted translations, covering 10\nlanguage clusters and 60 varieties. We then evaluated three LLMs on their\nability to assess toxicity across multilingual, dialectal, and LLM-human\nconsistency. Our findings show that LLMs are sensitive in handling both\nmultilingual and dialectal variations. However, if we have to rank the\nconsistency, the weakest area is LLM-human agreement, followed by dialectal\nconsistency. 
Code repository:\n\\url{https://github.com/ffaisal93/dialect_toxicity_llm_judge}\n","authors":["Fahim Faisal","Md Mushfiqur Rahman","Antonios Anastasopoulos"],"pdf_url":"https://arxiv.org/pdf/2411.10954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10950v1","updated":"2024-11-17T03:32:50Z","published":"2024-11-17T03:32:50Z","title":"Understanding Multimodal LLMs: the Mechanistic Interpretability of Llava\n in Visual Question Answering","summary":" Understanding the mechanisms behind Large Language Models (LLMs) is crucial\nfor designing improved models and strategies. While recent studies have yielded\nvaluable insights into the mechanisms of textual LLMs, the mechanisms of\nMulti-modal Large Language Models (MLLMs) remain underexplored. In this paper,\nwe apply mechanistic interpretability methods to analyze the visual question\nanswering (VQA) mechanisms in the first MLLM, Llava. We compare the mechanisms\nbetween VQA and textual QA (TQA) in color answering tasks and find that: a) VQA\nexhibits a mechanism similar to the in-context learning mechanism observed in\nTQA; b) the visual features exhibit significant interpretability when\nprojecting the visual embeddings into the embedding space; and c) Llava\nenhances the existing capabilities of the corresponding textual LLM Vicuna\nduring visual instruction tuning. Based on these findings, we develop an\ninterpretability tool to help users and researchers identify important visual\nlocations for final predictions, aiding in the understanding of visual\nhallucination. Our method demonstrates faster and more effective results\ncompared to existing interpretability approaches. 
Code:\n\\url{https://github.com/zepingyu0512/llava-mechanism}\n","authors":["Zeping Yu","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2411.10950v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2408.03047v2","updated":"2024-11-17T02:53:34Z","published":"2024-08-06T09:02:53Z","title":"OpenOmni: A Collaborative Open Source Tool for Building Future-Ready\n Multimodal Conversational Agents","summary":" Multimodal conversational agents are highly desirable because they offer\nnatural and human-like interaction. However, there is a lack of comprehensive\nend-to-end solutions to support collaborative development and benchmarking.\nWhile proprietary systems like GPT-4o and Gemini demonstrating impressive\nintegration of audio, video, and text with response times of 200-250ms,\nchallenges remain in balancing latency, accuracy, cost, and data privacy. To\nbetter understand and quantify these issues, we developed OpenOmni, an\nopen-source, end-to-end pipeline benchmarking tool that integrates advanced\ntechnologies such as Speech-to-Text, Emotion Detection, Retrieval Augmented\nGeneration, Large Language Models, along with the ability to integrate\ncustomized models. OpenOmni supports local and cloud deployment, ensuring data\nprivacy and supporting latency and accuracy benchmarking. This flexible\nframework allows researchers to customize the pipeline, focusing on real\nbottlenecks and facilitating rapid proof-of-concept development. OpenOmni can\nsignificantly enhance applications like indoor assistance for visually impaired\nindividuals, advancing human-computer interaction. 
Our demonstration video is\navailable https://www.youtube.com/watch?v=zaSiT3clWqY, demo is available via\nhttps://openomni.ai4wa.com, code is available via\nhttps://github.com/AI4WA/OpenOmniFramework.\n","authors":["Qiang Sun","Yuanyi Luo","Sirui Li","Wenxiao Zhang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2408.03047v2.pdf","comment":"Published in Proceedings of the 2024 Conference on Empirical Methods\n in Natural Language Processing: System Demonstrations (EMNLP 2024) Best Demo\n Paper Award at EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.10937v1","updated":"2024-11-17T02:23:45Z","published":"2024-11-17T02:23:45Z","title":"Memory-Augmented Multimodal LLMs for Surgical VQA via Self-Contained\n Inquiry","summary":" Comprehensively understanding surgical scenes in Surgical Visual Question\nAnswering (Surgical VQA) requires reasoning over multiple objects. Previous\napproaches address this task using cross-modal fusion strategies to enhance\nreasoning ability. However, these methods often struggle with limited scene\nunderstanding and question comprehension, and some rely on external resources\n(e.g., pre-extracted object features), which can introduce errors and\ngeneralize poorly across diverse surgical environments. To address these\nchallenges, we propose SCAN, a simple yet effective memory-augmented framework\nthat leverages Multimodal LLMs to improve surgical context comprehension via\nSelf-Contained Inquiry. SCAN operates autonomously, generating two types of\nmemory for context augmentation: Direct Memory (DM), which provides multiple\ncandidates (or hints) to the final answer, and Indirect Memory (IM), which\nconsists of self-contained question-hint pairs to capture broader scene\ncontext. DM directly assists in answering the question, while IM enhances\nunderstanding of the surgical scene beyond the immediate query. Reasoning over\nthese object-aware memories enables the model to accurately interpret images\nand respond to questions. 
Extensive experiments on three publicly available\nSurgical VQA datasets demonstrate that SCAN achieves state-of-the-art\nperformance, offering improved accuracy and robustness across various surgical\nscenarios.\n","authors":["Wenjun Hou","Yi Cheng","Kaishuai Xu","Yan Hu","Wenjie Li","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.10937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10934v1","updated":"2024-11-17T02:08:03Z","published":"2024-11-17T02:08:03Z","title":"Analyzing Pokémon and Mario Streamers' Twitch Chat with LLM-based User\n Embeddings","summary":" We present a novel digital humanities method for representing our Twitch\nchatters as user embeddings created by a large language model (LLM). We cluster\nthese embeddings automatically using affinity propagation and further narrow\nthis clustering down through manual analysis. We analyze the chat of one stream\nby each Twitch streamer: SmallAnt, DougDoug and PointCrow. Our findings suggest\nthat each streamer has their own type of chatters, however two categories\nemerge for all of the streamers: supportive viewers and emoji and reaction\nsenders. Repetitive message spammers is a shared chatter category for two of\nthe streamers.\n","authors":["Mika Hämäläinen","Jack Rueter","Khalid Alnajjar"],"pdf_url":"https://arxiv.org/pdf/2411.10934v1.pdf","comment":"NLP4DH 2024"},{"id":"http://arxiv.org/abs/2411.10928v1","updated":"2024-11-17T01:16:37Z","published":"2024-11-17T01:16:37Z","title":"Learn from Downstream and Be Yourself in Multimodal Large Language Model\n Fine-Tuning","summary":" Multimodal Large Language Model (MLLM) have demonstrated strong\ngeneralization capabilities across diverse distributions and tasks, largely due\nto extensive pre-training datasets. Fine-tuning MLLM has become a common\npractice to improve performance on specific downstream tasks. 
However, during\nfine-tuning, MLLM often faces the risk of forgetting knowledge acquired during\npre-training, which can result in a decline in generalization abilities. To\nbalance the trade-off between generalization and specialization, we propose\nmeasuring the parameter importance for both pre-trained and fine-tuning\ndistributions, based on frozen pre-trained weight magnitude and accumulated\nfine-tuning gradient values. We further apply an importance-aware weight\nallocation strategy, selectively updating relatively important parameters for\ndownstream tasks. We conduct empirical evaluations on both image captioning and\nvisual question-answering tasks using various MLLM architectures. The\ncomprehensive experimental analysis demonstrates the effectiveness of the\nproposed solution, highlighting the efficiency of the crucial modules in\nenhancing downstream specialization performance while mitigating generalization\ndegradation in MLLM Fine-Tuning.\n","authors":["Wenke Huang","Jian Liang","Zekun Shi","Didi Zhu","Guancheng Wan","He Li","Bo Du","Dacheng Tao","Mang Ye"],"pdf_url":"https://arxiv.org/pdf/2411.10928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10927v1","updated":"2024-11-17T01:15:58Z","published":"2024-11-17T01:15:58Z","title":"Inter-linguistic Phonetic Composition (IPC): A Theoretical and\n Computational Approach to Enhance Second Language Pronunciation","summary":" Learners of a second language (L2) often unconsciously substitute unfamiliar\nL2 phonemes with similar phonemes from their native language (L1), even though\nnative speakers of the L2 perceive these sounds as distinct and\nnon-interchangeable. This phonemic substitution leads to deviations from the\nstandard phonological patterns of the L2, creating challenges for learners in\nacquiring accurate L2 pronunciation. 
To address this, we propose\nInter-linguistic Phonetic Composition (IPC), a novel computational method\ndesigned to minimize incorrect phonological transfer by reconstructing L2\nphonemes as composite sounds derived from multiple L1 phonemes. Tests with two\nautomatic speech recognition models demonstrated that when L2 speakers produced\nIPC-generated composite sounds, the recognition rate of target L2 phonemes\nimproved by 20% compared to when their pronunciation was influenced by original\nphonological transfer patterns. The improvement was observed within a\nrelatively shorter time frame, demonstrating rapid acquisition of the composite\nsound.\n","authors":["Jisang Park","Minu Kim","DaYoung Hong","Jongha Lee"],"pdf_url":"https://arxiv.org/pdf/2411.10927v1.pdf","comment":"10 pages, 6 Figures, submitted to ACL ARR October 2024 for NAACL 2025"},{"id":"http://arxiv.org/abs/2411.00042v2","updated":"2024-11-17T00:59:42Z","published":"2024-10-29T16:06:26Z","title":"Improving Math Problem Solving in Large Language Models Through\n Categorization and Strategy Tailoring","summary":" In this paper, we investigate how to harness large language models (LLMs) to\nsolve mathematical problems both quickly and accurately. Specifically, we\ndemonstrate the effectiveness of classifying problems into distinct categories\nand applying category-specific problem-solving strategies to enhance the math\nperformance of LLMs. We develop a straightforward machine learning model for\nproblem categorization and show that its accuracy can be significantly improved\nthrough the creation of well-designed training datasets. 
We believe that our\napproach works by helping reduce hallucinations in LLMs, which is a critical\nstep toward unlocking their potential to tackle advanced mathematical problems.\n","authors":["Amogh Akella"],"pdf_url":"https://arxiv.org/pdf/2411.00042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08818v2","updated":"2024-11-17T00:41:01Z","published":"2024-07-11T18:59:21Z","title":"MAGNET: Improving the Multilingual Fairness of Language Models with\n Adaptive Gradient-Based Tokenization","summary":" In multilingual settings, non-Latin scripts and low-resource languages are\nusually disadvantaged in terms of language models' utility, efficiency, and\ncost. Specifically, previous studies have reported multiple modeling biases\nthat the current tokenization algorithms introduce to non-Latin script\nlanguages, the main one being over-segmentation. In this work, we propose\nMAGNET; multilingual adaptive gradient-based tokenization to reduce\nover-segmentation via adaptive gradient-based subword tokenization. MAGNET\nlearns to predict segment boundaries between byte tokens in a sequence via\nsub-modules within the model, which act as internal boundary predictors\n(tokenizers). Previous gradient-based tokenization methods aimed for uniform\ncompression across sequences by integrating a single boundary predictor during\ntraining and optimizing it end-to-end through stochastic reparameterization\nalongside the next token prediction objective. However, this approach still\nresults in over-segmentation for non-Latin script languages in multilingual\nsettings. In contrast, MAGNET offers a customizable architecture where\nbyte-level sequences are routed through language-script-specific predictors,\neach optimized for its respective language script. This modularity enforces\nequitable segmentation granularity across different language scripts compared\nto previous methods. 
Through extensive experiments, we demonstrate that in\naddition to reducing segmentation disparities, MAGNET also enables faster\nlanguage modelling and improves downstream utility.\n","authors":["Orevaoghene Ahia","Sachin Kumar","Hila Gonen","Valentin Hofmann","Tomasz Limisiewicz","Yulia Tsvetkov","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2407.08818v2.pdf","comment":null}]},"2024-11-16T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.10899v1","updated":"2024-11-16T21:58:26Z","published":"2024-11-16T21:58:26Z","title":"Planning for Tabletop Object Rearrangement","summary":" Finding an high-quality solution for the tabletop object rearrangement\nplanning is a challenging problem. Compared to determining a goal arrangement,\nrearrangement planning is challenging due to the dependencies between objects\nand the buffer capacity available to hold objects. Although orla* has proposed\nan A* based searching strategy with lazy evaluation for the high-quality\nsolution, it is not scalable, with the success rate decreasing as the number of\nobjects increases. To overcome this limitation, we propose an enhanced A*-based\nalgorithm that improves state representation and employs incremental goal\nattempts with lazy evaluation at each iteration. This approach aims to enhance\nscalability while maintaining solution quality. Our evaluation demonstrates\nthat our algorithm can provide superior solutions compared to orla*, in a\nshorter time, for both stationary and mobile robots.\n","authors":["Jiaming Hu","Jan Szczekulski","Sudhansh Peddabomma","Henrik I. 
Christensen"],"pdf_url":"https://arxiv.org/pdf/2411.10899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10886v1","updated":"2024-11-16T20:59:01Z","published":"2024-11-16T20:59:01Z","title":"MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric\n Depth Estimation","summary":" Recovering metric depth from a single image remains a fundamental challenge\nin computer vision, requiring both scene understanding and accurate scaling.\nWhile deep learning has advanced monocular depth estimation, current models\noften struggle with unfamiliar scenes and layouts, particularly in zero-shot\nscenarios and when predicting scale-ergodic metric depth. We present\nMetricGold, a novel approach that harnesses generative diffusion model's rich\npriors to improve metric depth estimation. Building upon recent advances in\nMariGold, DDVM and Depth Anything V2 respectively, our method combines latent\ndiffusion, log-scaled metric depth representation, and synthetic data training.\nMetricGold achieves efficient training on a single RTX 3090 within two days\nusing photo-realistic synthetic data from HyperSIM, VirtualKitti, and\nTartanAir. Our experiments demonstrate robust generalization across diverse\ndatasets, producing sharper and higher quality metric depth estimates compared\nto existing approaches.\n","authors":["Ansh Shah","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.10886v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.02145 by\n other authors"},{"id":"http://arxiv.org/abs/2309.07496v4","updated":"2024-11-16T20:31:56Z","published":"2023-09-14T08:01:11Z","title":"Comparison of Middlewares in Edge-to-Edge and Edge-to-Cloud\n Communication for Distributed ROS2 Systems","summary":" The increased data transmission and number of devices involved in\ncommunications among distributed systems make it challenging yet significantly\nnecessary to have an efficient and reliable networking middleware. 
In robotics\nand autonomous systems, the wide application of ROS\\,2 brings the possibility\nof utilizing various networking middlewares together with DDS in ROS\\,2 for\nbetter communication among edge devices or between edge devices and the cloud.\nHowever, there is a lack of comprehensive communication performance comparison\nof integrating these networking middlewares with ROS\\,2. In this study, we\nprovide a quantitative analysis for the communication performance of utilized\nnetworking middlewares including MQTT and Zenoh alongside DDS in ROS\\,2 among a\nmultiple host system. For a complete and reliable comparison, we calculate the\nlatency and throughput of these middlewares by sending distinct amounts and\ntypes of data through different network setups including Ethernet, Wi-Fi, and\n4G. To further extend the evaluation to real-world application scenarios, we\nassess the drift error (the position changes) over time caused by these\nnetworking middlewares with the robot moving in an identical square-shaped\npath. Our results show that CycloneDDS performs better under Ethernet while\nZenoh performs better under Wi-Fi and 4G. In the actual robot test, the robot\nmoving trajectory drift error over time (96\\,s) via Zenoh is the smallest. 
It\nis worth noting we have a discussion of the CPU utilization of these networking\nmiddlewares and the performance impact caused by enabling the security feature\nin ROS\\,2 at the end of the paper.\n","authors":["Jiaqiang Zhang","Xianjia Yu","Sier Ha","Jorge Pena Queralta","Tomi Westerlund"],"pdf_url":"https://arxiv.org/pdf/2309.07496v4.pdf","comment":"Accepted by the Journal of Intelligent & Robotic Systems"},{"id":"http://arxiv.org/abs/2303.09565v6","updated":"2024-11-16T17:01:49Z","published":"2023-03-17T16:56:48Z","title":"A SysML-based language for evaluating digital twin software reusability\n in cyber-physical system structure","summary":" Evaluating early design concepts is crucial as it impacts quality and cost.\nThis process is often hindered by vague and uncertain design information. This\narticle introduces the SysML-based Simulated-Physical Systems Modeling Language\n(SPSysML). It is a Domain-Specification Language for evaluating component\nreusability in Cyber-Physical Systems incorporating Digital Twins and other\nsimulated parts. The proposed factors assess the design quantitatively. SPSysML\nuses a requirement-based system structuring method to couple simulated and\nphysical parts with requirements. SPSysML enables DTs to perceive exogenous\nactions in the simulated world.\n SPSysML validation is survey- and application-based. First, we develop a\nrobotic system for an assisted living project. As a result of the SPSysML\napplication, we observed an integrity improvement between the simulated and\nphysical parts of the system. Thus, more system components are shared between\nthe simulated and physical setups. The system was deployed on the physical\nrobot and two simulators based on ROS and ROS2. Additionally, we share a\nquestionnaire for SPSysML assessment. 
The feedback that we already received is\npublished in this article.\n","authors":["Wojciech Dudek","Narcis Miguel","Tomasz Winiarski"],"pdf_url":"https://arxiv.org/pdf/2303.09565v6.pdf","comment":"This work has been submitted to the Elsevier Robotics and Autonomous\n Systems Journal"},{"id":"http://arxiv.org/abs/2411.06414v2","updated":"2024-11-16T15:29:30Z","published":"2024-11-10T10:31:21Z","title":"Psycho Gundam: Electroencephalography based real-time robotic control\n system with deep learning","summary":" The Psycho Frame, a sophisticated system primarily used in Universal Century\n(U.C.) series mobile suits for NEWTYPE pilots, has evolved as an integral\ncomponent in harnessing the latent potential of mental energy. Its ability to\namplify and resonate with the pilot's psyche enables real-time mental control,\ncreating unique applications such as psychomagnetic fields and sensory-based\nweaponry. This paper presents the development of a novel robotic control system\ninspired by the Psycho Frame, combining electroencephalography (EEG) and deep\nlearning for real-time control of robotic systems. By capturing and\ninterpreting brainwave data through EEG, the system extends human cognitive\ncommands to robotic actions, reflecting the seamless synchronization of thought\nand machine, much like the Psyco Frame's integration with a Newtype pilot's\nmental faculties. 
This research demonstrates how modern AI techniques can\nexpand the limits of human-machine interaction, potentially transcending\ntraditional input methods and enabling a deeper, more intuitive control of\ncomplex robotic systems.\n","authors":["Chi-Sheng Chen","Wei-Sheng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.06414v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10071v3","updated":"2024-11-16T15:10:15Z","published":"2024-09-16T08:21:22Z","title":"Towards Physically-Realizable Adversarial Attacks in Embodied Vision\n Navigation","summary":" The deployment of embodied navigation agents in safety-critical environments\nraises concerns about their vulnerability to adversarial attacks on deep neural\nnetworks. However, current attack methods often lack practicality due to\nchallenges in transitioning from the digital to the physical world, while\nexisting physical attacks for object detection fail to achieve both multi-view\neffectiveness and naturalness. To address this, we propose a practical attack\nmethod for embodied navigation by attaching adversarial patches with learnable\ntextures and opacity to objects. Specifically, to ensure effectiveness across\nvarying viewpoints, we employ a multi-view optimization strategy based on\nobject-aware sampling, which uses feedback from the navigation model to\noptimize the patch's texture. To make the patch inconspicuous to human\nobservers, we introduce a two-stage opacity optimization mechanism, where\nopacity is refined after texture optimization. Experimental results show our\nadversarial patches reduce navigation success rates by about 40%, outperforming\nprevious methods in practicality, effectiveness, and naturalness. 
Code is\navailable at:\n[https://github.com/chen37058/Physical-Attacks-in-Embodied-Navigation].\n","authors":["Meng Chen","Jiawei Tu","Chao Qi","Yonghao Dang","Feng Zhou","Wei Wei","Jianqin Yin"],"pdf_url":"https://arxiv.org/pdf/2409.10071v3.pdf","comment":"8 pages, 6 figures, submitted to the 2025 IEEE International\n Conference on Robotics & Automation (ICRA)"},{"id":"http://arxiv.org/abs/2410.01440v4","updated":"2024-11-16T13:58:45Z","published":"2024-10-02T11:42:49Z","title":"Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence\n Modeling","summary":" In the endeavor to make autonomous robots take actions, task planning is a\nmajor challenge that requires translating high-level task descriptions into\nlong-horizon action sequences. Despite recent advances in language model\nagents, they remain prone to planning errors and limited in their ability to\nplan ahead. To address these limitations in robotic planning, we advocate a\nself-refining scheme that iteratively refines a draft plan until an equilibrium\nis reached. Remarkably, this process can be optimized end-to-end from an\nanalytical perspective without the need to curate additional verifiers or\nreward models, allowing us to train self-refining planners in a simple\nsupervised learning fashion. Meanwhile, a nested equilibrium sequence modeling\nprocedure is devised for efficient closed-loop planning that incorporates\nuseful feedback from the environment (or an internal world model). Our method\nis evaluated on the VirtualHome-Env benchmark, showing advanced performance\nwith better scaling for inference computation. 
Code is available at\nhttps://github.com/Singularity0104/equilibrium-planner.\n","authors":["Jinghan Li","Zhicheng Sun","Fei Li","Cao Sheng","Jiazhong Yu","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2410.01440v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10760v1","updated":"2024-11-16T09:33:11Z","published":"2024-11-16T09:33:11Z","title":"Experimental study of fish-like bodies with passive tail and tunable\n stiffness","summary":" Scombrid fishes and tuna are efficient swimmers capable of maximizing\nperformance to escape predators and save energy during long journeys. A key\naspect in achieving these goals is the flexibility of the tail, which the fish\noptimizes during swimming. Though, the robotic counterparts, although highly\nefficient, have partially investigated the importance of flexibility. We have\ndesigned and tested a fish-like robotic platform (of 30 cm in length) to\nquantify performance with a tail made flexible through a torsional spring\nplaced at the peduncle. Body kinematics, forces, and power have been measured\nand compared with real fish. The platform can vary its frequency between 1 and\n3 Hz, reaching self-propulsion conditions with speed over 1 BL/s and Strouhal\nnumber in the optimal range. We show that changing the frequency of the robot\ncan influence the thrust and power achieved by the fish-like robot.\nFurthermore, by using appropriately tuned stiffness, the robot deforms in\naccordance with the travelling wave mechanism, which has been revealed to be\nthe actual motion of real fish. These findings demonstrate the potential of\ntuning the stiffness in fish swimming and offer a basis for investigating\nfish-like flexibility in bio-inspired underwater vehicles.\n","authors":["L. Padovani","G. Manduca","D. Paniccia","G. Graziani","R. Piva","C. 
Lugni"],"pdf_url":"https://arxiv.org/pdf/2411.10760v1.pdf","comment":"Conference Paper submitted to the 15th International Conference on\n Hydrodynamics (ICHD 2024)"},{"id":"http://arxiv.org/abs/2407.04292v4","updated":"2024-11-16T08:49:03Z","published":"2024-07-05T06:54:26Z","title":"Software-Hardware Co-Design For Embodied AI Robots","summary":" Embodied AI robots have the potential to fundamentally improve the way human\nbeings live and manufacture. Continued progress in the burgeoning field of\nusing large language models to control robots depends critically on an\nefficient computing substrate. In particular, today's computing systems for\nembodied AI robots are designed purely based on the interest of algorithm\ndevelopers, where robot actions are divided into a discrete frame-basis. Such\nan execution pipeline creates high latency and energy consumption. This paper\nproposes Corki, an algorithm-architecture co-design framework for real-time\nembodied AI robot control. Our idea is to decouple LLM inference, robotic\ncontrol and data communication in the embodied AI robots compute pipeline.\nInstead of predicting action for one single frame, Corki predicts the\ntrajectory for the near future to reduce the frequency of LLM inference. The\nalgorithm is coupled with a hardware that accelerates transforming trajectory\ninto actual torque signals used to control robots and an execution pipeline\nthat parallels data communication with computation. Corki largely reduces LLM\ninference frequency by up to 8.0x, resulting in up to 3.6x speed up. The\nsuccess rate improvement can be up to 17.3%. Code is provided for\nre-implementation. 
https://github.com/hyy0613/Corki\n","authors":["Yiyang Huang","Yuhui Hao","Bo Yu","Feng Yan","Yuxin Yang","Feng Min","Yinhe Han","Lin Ma","Shaoshan Liu","Qiang Liu","Yiming Gan"],"pdf_url":"https://arxiv.org/pdf/2407.04292v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11548v6","updated":"2024-11-16T07:31:18Z","published":"2024-06-17T13:44:53Z","title":"AIC MLLM: Autonomous Interactive Correction MLLM for Robust Robotic\n Manipulation","summary":" The ability to reflect on and correct failures is crucial for robotic systems\nto interact stably with real-life objects.Observing the generalization and\nreasoning capabilities of Multimodal Large Language Models (MLLMs), previous\napproaches have aimed to utilize these models to enhance robotic systems\naccordingly.However, these methods typically focus on high-level planning\ncorrections using an additional MLLM, with limited utilization of failed\nsamples to correct low-level contact poses which is particularly prone to occur\nduring articulated object manipulation.To address this gap, we propose an\nAutonomous Interactive Correction (AIC) MLLM, which makes use of previous\nlow-level interaction experiences to correct SE(3) pose predictions for\narticulated object. Specifically, AIC MLLM is initially fine-tuned to acquire\nboth pose prediction and feedback prompt comprehension abilities.We design two\ntypes of prompt instructions for interactions with objects: 1) visual masks to\nhighlight unmovable parts for position correction, and 2) textual descriptions\nto indicate potential directions for rotation correction. 
During inference, a\nFeedback Information Extraction module is introduced to recognize the failure\ncause, allowing AIC MLLM to adaptively correct the pose prediction using the\ncorresponding prompts.To further enhance manipulation stability, we devise a\nTest Time Adaptation strategy that enables AIC MLLM to better adapt to the\ncurrent scene configuration.Finally, extensive experiments are conducted in\nboth simulated and real-world environments to evaluate the proposed method. The\nresults demonstrate that our AIC MLLM can efficiently correct failure samples\nby leveraging interaction experience prompts.Our project website is\nhttps://sites.google.com/view/aic-mllm.\n","authors":["Chuyan Xiong","Chengyu Shen","Xiaoqi Li","Kaichen Zhou","Jeremy Liu","Ruiping Wang","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2406.11548v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10722v1","updated":"2024-11-16T07:02:46Z","published":"2024-11-16T07:02:46Z","title":"DGS-SLAM: Gaussian Splatting SLAM in Dynamic Environment","summary":" We introduce Dynamic Gaussian Splatting SLAM (DGS-SLAM), the first dynamic\nSLAM framework built on the foundation of Gaussian Splatting. While recent\nadvancements in dense SLAM have leveraged Gaussian Splatting to enhance scene\nrepresentation, most approaches assume a static environment, making them\nvulnerable to photometric and geometric inconsistencies caused by dynamic\nobjects. To address these challenges, we integrate Gaussian Splatting SLAM with\na robust filtering process to handle dynamic objects throughout the entire\npipeline, including Gaussian insertion and keyframe selection. Within this\nframework, to further improve the accuracy of dynamic object removal, we\nintroduce a robust mask generation method that enforces photometric consistency\nacross keyframes, reducing noise from inaccurate segmentation and artifacts\nsuch as shadows. 
Additionally, we propose the loop-aware window selection\nmechanism, which utilizes unique keyframe IDs of 3D Gaussians to detect loops\nbetween the current and past frames, facilitating joint optimization of the\ncurrent camera poses and the Gaussian map. DGS-SLAM achieves state-of-the-art\nperformance in both camera tracking and novel view synthesis on various dynamic\nSLAM benchmarks, proving its effectiveness in handling real-world dynamic\nscenes.\n","authors":["Mangyu Kong","Jaewon Lee","Seongwon Lee","Euntai Kim"],"pdf_url":"https://arxiv.org/pdf/2411.10722v1.pdf","comment":"Preprint, Under review"},{"id":"http://arxiv.org/abs/2411.10699v1","updated":"2024-11-16T04:48:03Z","published":"2024-11-16T04:48:03Z","title":"Hierarchical Adaptive Motion Planning with Nonlinear Model Predictive\n Control for Safety-Critical Collaborative Loco-Manipulation","summary":" As legged robots take on roles in industrial and autonomous construction,\ncollaborative loco-manipulation is crucial for handling large and heavy objects\nthat exceed the capabilities of a single robot. However, ensuring the safety of\nthese multi-robot tasks is essential to prevent accidents and guarantee\nreliable operation. This paper presents a hierarchical control system for\nobject manipulation using a team of quadrupedal robots. The combination of the\nmotion planner and the decentralized locomotion controller in a hierarchical\nstructure enables safe, adaptive planning for teams in complex scenarios. A\nhigh-level nonlinear model predictive control planner generates collision-free\npaths by incorporating control barrier functions, accounting for static and\ndynamic obstacles. This process involves calculating contact points and forces\nwhile adapting to unknown objects and terrain properties. The decentralized\nloco-manipulation controller then ensures each robot maintains stable\nlocomotion and manipulation based on the planner's guidance. 
The effectiveness\nof our method is carefully examined in simulations under various conditions and\nvalidated in real-life setups with robot hardware. By modifying the object's\nconfiguration, the robot team can maneuver unknown objects through an\nenvironment containing both static and dynamic obstacles. We have made our code\npublicly available in an open-source repository at\n\\url{https://github.com/DRCL-USC/collaborative_loco_manipulation}.\n","authors":["Mohsen Sombolestan","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.10699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07612v2","updated":"2024-11-16T03:16:41Z","published":"2024-11-12T07:38:57Z","title":"A Simple Multi-agent Joint Prediction Method for Autonomous Driving","summary":" Predicting future motions of road participants is an important task for\ndriving autonomously. Most existing models excel at predicting the marginal\ntrajectory of a single agent, but predicting joint trajectories for multiple\nagents that are consistent within a scene remains a challenge. Previous\nresearch has often focused on marginal predictions, but the importance of joint\npredictions has become increasingly apparent. Joint prediction aims to generate\ntrajectories that are consistent across the entire scene. Our research builds\nupon the SIMPL baseline to explore methods for generating scene-consistent\ntrajectories. We tested our algorithm on the Argoverse 2 dataset, and\nexperimental results demonstrate that our approach can generate\nscene-consistent trajectories. 
Compared to the SIMPL baseline, our method\nsignificantly reduces the collision rate of joint trajectories within the\nscene.\n","authors":["Mingyi Wang","Hongqun Zou","Yifan Liu","You Wang","Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.07612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07699v2","updated":"2024-11-16T01:48:01Z","published":"2024-11-12T10:26:23Z","title":"RINO: Accurate, Robust Radar-Inertial Odometry with Non-Iterative\n Estimation","summary":" Precise localization and mapping are critical for achieving autonomous\nnavigation in self-driving vehicles. However, ego-motion estimation still faces\nsignificant challenges, particularly when GNSS failures occur or under extreme\nweather conditions (e.g., fog, rain, and snow). In recent years, scanning radar\nhas emerged as an effective solution due to its strong penetration\ncapabilities. Nevertheless, scanning radar data inherently contains high levels\nof noise, necessitating hundreds to thousands of iterations of optimization to\nestimate a reliable transformation from the noisy data. Such iterative solving\nis time-consuming, unstable, and prone to failure. To address these challenges,\nwe propose an accurate and robust Radar-Inertial Odometry system, RINO, which\nemploys a non-iterative solving approach. Our method decouples rotation and\ntranslation estimation and applies an adaptive voting scheme for 2D rotation\nestimation, enhancing efficiency while ensuring consistent solving time.\nAdditionally, the approach implements a loosely coupled system between the\nscanning radar and an inertial measurement unit (IMU), leveraging Error-State\nKalman Filtering (ESKF). Notably, we successfully estimated the uncertainty of\nthe pose estimation from the scanning radar, incorporating this into the\nfilter's Maximum A Posteriori estimation, a consideration that has been\npreviously overlooked. 
Validation on publicly available datasets demonstrates\nthat RINO outperforms state-of-the-art methods and baselines in both accuracy\nand robustness. Our code is available at https://github.com/yangsc4063/rino.\n","authors":["Shuocheng Yang","Yueming Cao","Shengbo Eben Li","Jianqiang Wang","Shaobing Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07699v2.pdf","comment":null}],"Systems and Control":[{"id":"http://arxiv.org/abs/2206.05276v3","updated":"2024-11-16T21:40:31Z","published":"2022-06-10T23:16:41Z","title":"Game-Theoretic Neyman-Pearson Detection to Combat Strategic Evasion","summary":" The security in networked systems depends greatly on recognizing and\nidentifying adversarial behaviors. Traditional detection methods focus on\nspecific categories of attacks and have become inadequate for increasingly\nstealthy and deceptive attacks that are designed to bypass detection\nstrategically. This work aims to develop a holistic theory to countermeasure\nsuch evasive attacks. We focus on extending a fundamental class of\nstatistical-based detection methods based on Neyman-Pearson's (NP) hypothesis\ntesting formulation. We propose game-theoretic frameworks to capture the\nconflicting relationship between a strategic evasive attacker and an\nevasion-aware NP detector. By analyzing both the equilibrium behaviors of the\nattacker and the NP detector, we characterize their performance using\nEquilibrium Receiver-Operational-Characteristic (EROC) curves. We show that the\nevasion-aware NP detectors outperform the passive ones in the way that the\nformer can act strategically against the attacker's behavior and adaptively\nmodify their decision rules based on the received messages. In addition, we\nextend our framework to a sequential setting where the user sends out\nidentically distributed messages. 
We corroborate the analytical results with a\ncase study of anomaly detection.\n","authors":["Yinan Hu","Quanyan Zhu"],"pdf_url":"https://arxiv.org/pdf/2206.05276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10882v1","updated":"2024-11-16T20:34:03Z","published":"2024-11-16T20:34:03Z","title":"Adaptive Soft Actor-Critic Framework for RIS-Assisted and UAV-Aided\n Communication","summary":" In this work, we explore UAV-assisted reconfigurable intelligent surface\n(RIS) technology to enhance downlink communications in wireless networks. By\nintegrating RIS on both UAVs and ground infrastructure, we aim to boost network\ncoverage, fairness, and resilience against challenges such as UAV jitter. To\nmaximize the minimum achievable user rate, we formulate a joint optimization\nproblem involving beamforming, phase shifts, and UAV trajectory. To address\nthis problem, we propose an adaptive soft actor-critic (ASAC) framework. In\nthis approach, agents are built using adaptive sparse transformers with\nattentive feature refinement (ASTAFER), enabling dynamic feature processing\nthat adapts to real-time network conditions. The ASAC model learns optimal\nsolutions to the coupled subproblems in real time, delivering an end-to-end\nsolution without relying on iterative or relaxation-based methods. Simulation\nresults demonstrate that our ASAC-based approach achieves better performance\ncompared to the conventional SAC. This makes it a robust, adaptable solution\nfor real-time, fair, and efficient downlink communication in UAV-RIS networks.\n","authors":["Abuzar B. M. Adam","Elhadj Moustapha Diallo","Mohammed A. M. 
Elhassan"],"pdf_url":"https://arxiv.org/pdf/2411.10882v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2303.09565v6","updated":"2024-11-16T17:01:49Z","published":"2023-03-17T16:56:48Z","title":"A SysML-based language for evaluating digital twin software reusability\n in cyber-physical system structure","summary":" Evaluating early design concepts is crucial as it impacts quality and cost.\nThis process is often hindered by vague and uncertain design information. This\narticle introduces the SysML-based Simulated-Physical Systems Modeling Language\n(SPSysML). It is a Domain-Specification Language for evaluating component\nreusability in Cyber-Physical Systems incorporating Digital Twins and other\nsimulated parts. The proposed factors assess the design quantitatively. SPSysML\nuses a requirement-based system structuring method to couple simulated and\nphysical parts with requirements. SPSysML enables DTs to perceive exogenous\nactions in the simulated world.\n SPSysML validation is survey- and application-based. First, we develop a\nrobotic system for an assisted living project. As a result of the SPSysML\napplication, we observed an integrity improvement between the simulated and\nphysical parts of the system. Thus, more system components are shared between\nthe simulated and physical setups. The system was deployed on the physical\nrobot and two simulators based on ROS and ROS2. Additionally, we share a\nquestionnaire for SPSysML assessment. 
The feedback that we already received is\npublished in this article.\n","authors":["Wojciech Dudek","Narcis Miguel","Tomasz Winiarski"],"pdf_url":"https://arxiv.org/pdf/2303.09565v6.pdf","comment":"This work has been submitted to the Elsevier Robotics and Autonomous\n Systems Journal"},{"id":"http://arxiv.org/abs/2411.10820v1","updated":"2024-11-16T14:55:30Z","published":"2024-11-16T14:55:30Z","title":"Molecular Dynamics Study of Liquid Condensation on Nano-structured\n Sinusoidal Hybrid Wetting Surfaces","summary":" Although real surfaces exhibit intricate topologies at the nanoscale, rough\nsurface consideration is often overlooked in nanoscale heat transfer studies.\nSuperimposed sinusoidal functions effectively model the complexity of these\nsurfaces. This study investigates the impact of sinusoidal roughness on liquid\nargon condensation over a functional gradient wetting (FGW) surface with 84%\nhydrophilic content using molecular dynamics simulations. Argon atoms are\nconfined between two platinum substrates: a flat lower substrate heated to 130K\nand a rough upper substrate at 90K. Key metrics of the nanoscale condensation\nprocess, such as nucleation, surface heat flux, and total energy per atom, are\nanalyzed. Rough surfaces significantly enhance nucleation, nearly doubling\ncluster counts compared to smooth surfaces and achieving a more extended atomic\ndensity profile with a peak of approximately and improved heat flux. Stronger\natom-surface interactions also lead to more efficient energy dissipation. 
These\nfindings underscore the importance of surface roughness in optimizing\ncondensation and heat transfer, offering a more accurate representation of\nsurface textures and a basis for designing surfaces that achieve superior heat\ntransfer performance.\n","authors":["Taskin Mehereen","Shorup Chanda","Afrina Ayrin Nitu","Jubaer Tanjil Jami","Rafia Rizwana Rahim","Md Ashiqur Rahman"],"pdf_url":"https://arxiv.org/pdf/2411.10820v1.pdf","comment":"9 pages, 7 figures, conference"},{"id":"http://arxiv.org/abs/2411.10805v1","updated":"2024-11-16T13:49:06Z","published":"2024-11-16T13:49:06Z","title":"Existence of $ε$-Nash Equilibria in Nonzero-Sum Borel Stochastic\n Games and Equilibria of Quantized Models","summary":" Establishing the existence of exact or near Markov or stationary perfect Nash\nequilibria in nonzero-sum Markov games over Borel spaces remains a challenging\nproblem, with few positive results to date. In this paper, we establish the\nexistence of approximate Markov and stationary Nash equilibria for nonzero-sum\nstochastic games over Borel spaces, assuming only mild regularity conditions on\nthe model. Our approach involves analyzing a quantized version of the game, for\nwhich we provide an explicit construction under both finite-horizon and\ndiscounted cost criteria. This work has significant implications for emerging\napplications such as multi-agent learning. Our results apply to both compact\nand non-compact state spaces. For the compact state space case, we first\napproximate the standard Borel model with a finite state-action model. Using\nthe existence of Markov and stationary perfect Nash equilibria for these finite\nmodels under finite-horizon and discounted cost criteria, we demonstrate that\nthese joint policies constitute approximate Markov and stationary perfect\nequilibria under mild continuity conditions on the one-stage costs and\ntransition probabilities. 
For the non-compact state space case, we achieve\nsimilar results by first approximating the model with a compact-state model.\nCompared with previous results in the literature, which we comprehensively\nreview, we provide more general and complementary conditions, along with\nexplicit approximation models whose equilibria are $\\epsilon$-equilibria for\nthe original model.\n","authors":["Naci Saldi","Gurdal Arslan","Serdar Yuksel"],"pdf_url":"https://arxiv.org/pdf/2411.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10769v1","updated":"2024-11-16T10:25:49Z","published":"2024-11-16T10:25:49Z","title":"Demonstrating Remote Synchronization: An Experimental Approach with\n Nonlinear Oscillators","summary":" This study investigates remote synchronization in arbitrary network clusters\nof coupled nonlinear oscillators, a phenomenon inspired by neural\nsynchronization in the brain. Employing a multi-faceted approach encompassing\nanalytical, numerical, and experimental methodologies, we leverage the Master\nStability Function (MSF) to analyze network stability. We provide experimental\nevidence of remote synchronization between two clusters of nonlinear\noscillators, where oscillators within each cluster are also remotely connected.\nThis observation parallels the thalamus-mediated synchronization of neuronal\npopulations in the brain. An electronic circuit testbed, supported by nonlinear\nODE modeling and LT Spice simulation, was developed to validate our theoretical\npredictions. 
Future work will extend this investigation to encompass diverse\nnetwork topologies and explore potential applications in neuroscience,\ncommunication networks, and power systems.\n","authors":["Sanjeev Kumar Pandey","Neetish Patel"],"pdf_url":"https://arxiv.org/pdf/2411.10769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10739v1","updated":"2024-11-16T08:25:22Z","published":"2024-11-16T08:25:22Z","title":"A Wearable Gait Monitoring System for 17 Gait Parameters Based on\n Computer Vision","summary":" We developed a shoe-mounted gait monitoring system capable of tracking up to\n17 gait parameters, including gait length, step time, stride velocity, and\nothers. The system employs a stereo camera mounted on one shoe to track a\nmarker placed on the opposite shoe, enabling the estimation of spatial gait\nparameters. Additionally, a Force Sensitive Resistor (FSR) affixed to the heel\nof the shoe, combined with a custom-designed algorithm, is utilized to measure\ntemporal gait parameters. Through testing on multiple participants and\ncomparison with the gait mat, the proposed gait monitoring system exhibited\nnotable performance, with the accuracy of all measured gait parameters\nexceeding 93.61%. The system also demonstrated a low drift of 4.89% during\nlong-distance walking. A gait identification task conducted on participants\nusing a trained Transformer model achieved 95.7% accuracy on the dataset\ncollected by the proposed system, demonstrating that our hardware has the\npotential to collect long-sequence gait data suitable for integration with\ncurrent Large Language Models (LLMs). The system is cost-effective,\nuser-friendly, and well-suited for real-life measurements.\n","authors":["Jiangang Chen","Yung-Hong Sun","Kristen Pickett","Barbara King","Yu Hen Hu","Hongrui Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.10739v1.pdf","comment":"13 pages, 14 figures. 
This paper was submitted for publication to the\n IEEE Transactions on Instrumentation and Measurement"},{"id":"http://arxiv.org/abs/2401.15299v2","updated":"2024-11-16T07:54:05Z","published":"2024-01-27T05:14:17Z","title":"SupplyGraph: A Benchmark Dataset for Supply Chain Planning using Graph\n Neural Networks","summary":" Graph Neural Networks (GNNs) have gained traction across different domains\nsuch as transportation, bio-informatics, language processing, and computer\nvision. However, there is a noticeable absence of research on applying GNNs to\nsupply chain networks. Supply chain networks are inherently graph-like in\nstructure, making them prime candidates for applying GNN methodologies. This\nopens up a world of possibilities for optimizing, predicting, and solving even\nthe most complex supply chain problems. A major setback in this approach lies\nin the absence of real-world benchmark datasets to facilitate the research and\nresolution of supply chain problems using GNNs. To address the issue, we\npresent a real-world benchmark dataset for temporal tasks, obtained from one of\nthe leading FMCG companies in Bangladesh, focusing on supply chain planning for\nproduction purposes. The dataset includes temporal data as node features to\nenable sales predictions, production planning, and the identification of\nfactory issues. By utilizing this dataset, researchers can employ GNNs to\naddress numerous supply chain problems, thereby advancing the field of supply\nchain analytics and planning. Source: https://github.com/CIOL-SUST/SupplyGraph\n","authors":["Azmine Toushik Wasi","MD Shafikul Islam","Adipto Raihan Akib"],"pdf_url":"https://arxiv.org/pdf/2401.15299v2.pdf","comment":"Accepted to 4th workshop on Graphs and more Complex structures for\n Learning and Reasoning, colocated with AAAI 2024. 
Extended journal version\n with experiments is available here: arXiv:2411.08550"},{"id":"http://arxiv.org/abs/2411.10727v1","updated":"2024-11-16T07:31:19Z","published":"2024-11-16T07:31:19Z","title":"Self-Triggered Control in Artificial Pancreas","summary":" The management of type 1 diabetes has been revolutionized by the artificial\npancreas system (APS), which automates insulin delivery based on continuous\nglucose monitor (CGM). While conventional closed-loop systems rely on CGM data,\nwhich leads to higher energy consumption at the sensors and increased data\nredundancy in the underlying communication network. In contrast, this paper\nproposes a self-triggered control mechanism that can potentially achieve lower\nlatency and energy efficiency. The model for the APS consists of a state and\ninput-constrained dynamical system affected by exogenous meal disturbances. Our\nself-triggered mechanism relies on restricting the state evolution within the\nrobust control invariant of such a system at all times. To that end, using\ntools from reachability, we associate a safe time interval with such invariant\nsets, which denotes the maximum time for which the invariant set remains\ninvariant, even without transmission of CGM data at all times.\n","authors":["Debayani Ghosh","Sahaj Saxena","Navin Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.10727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10702v1","updated":"2024-11-16T04:56:23Z","published":"2024-11-16T04:56:23Z","title":"Wireless Resource Allocation with Collaborative Distributed and\n Centralized DRL under Control Channel Attacks","summary":" In this paper, we consider a wireless resource allocation problem in a\ncyber-physical system (CPS) where the control channel, carrying resource\nallocation commands, is subjected to denial-of-service (DoS) attacks. We\npropose a novel concept of collaborative distributed and centralized (CDC)\nresource allocation to effectively mitigate the impact of these attacks. 
To\noptimize the CDC resource allocation policy, we develop a new CDC-deep\nreinforcement learning (DRL) algorithm, whereas existing DRL frameworks only\nformulate either centralized or distributed decision-making problems.\nSimulation results demonstrate that the CDC-DRL algorithm significantly\noutperforms state-of-the-art DRL benchmarks, showcasing its ability to address\nresource allocation problems in large-scale CPSs under control channel attacks.\n","authors":["Ke Wang","Wanchun Liu","Teng Joon Lim"],"pdf_url":"https://arxiv.org/pdf/2411.10702v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.10915v1","updated":"2024-11-16T23:54:53Z","published":"2024-11-16T23:54:53Z","title":"Bias in Large Language Models: Origin, Evaluation, and Mitigation","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nbut their susceptibility to biases poses significant challenges. This\ncomprehensive review examines the landscape of bias in LLMs, from its origins\nto current mitigation strategies. We categorize biases as intrinsic and\nextrinsic, analyzing their manifestations in various NLP tasks. The review\ncritically assesses a range of bias evaluation methods, including data-level,\nmodel-level, and output-level approaches, providing researchers with a robust\ntoolkit for bias detection. We further explore mitigation strategies,\ncategorizing them into pre-model, intra-model, and post-model techniques,\nhighlighting their effectiveness and limitations. Ethical and legal\nimplications of biased LLMs are discussed, emphasizing potential harms in\nreal-world applications such as healthcare and criminal justice. By\nsynthesizing current knowledge on bias in LLMs, this review contributes to the\nongoing effort to develop fair and responsible AI systems. 
Our work serves as a\ncomprehensive resource for researchers and practitioners working towards\nunderstanding, evaluating, and mitigating bias in LLMs, fostering the\ndevelopment of more equitable AI technologies.\n","authors":["Yufei Guo","Muzhe Guo","Juntao Su","Zhou Yang","Mengqiu Zhu","Hongfei Li","Mengyang Qiu","Shuo Shuo Liu"],"pdf_url":"https://arxiv.org/pdf/2411.10915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10914v1","updated":"2024-11-16T23:53:27Z","published":"2024-11-16T23:53:27Z","title":"BPO: Towards Balanced Preference Optimization between Knowledge Breadth\n and Depth in Alignment","summary":" Reinforcement Learning with Human Feedback (RLHF) is the key to the success\nof large language models (LLMs) in recent years. In this work, we first\nintroduce the concepts of knowledge breadth and knowledge depth, which measure\nthe comprehensiveness and depth of an LLM or knowledge source respectively. We\nreveal that the imbalance in the number of prompts and responses can lead to a\npotential disparity in breadth and depth learning within alignment tuning\ndatasets by showing that even a simple uniform method for balancing the number\nof instructions and responses can lead to significant improvements. Building on\nthis, we further propose Balanced Preference Optimization (BPO), designed to\ndynamically augment the knowledge depth of each sample. BPO is motivated by the\nobservation that the usefulness of knowledge varies across samples,\nnecessitating tailored learning of knowledge depth. To achieve this, we\nintroduce gradient-based clustering, estimating the knowledge informativeness\nand usefulness of each augmented sample based on the model's optimization\ndirection. Our experimental results across various benchmarks demonstrate that\nBPO outperforms other baseline methods in alignment tuning while maintaining\ntraining efficiency. 
Furthermore, we conduct a detailed analysis of each\ncomponent of BPO, providing guidelines for future research in preference data\noptimization.\n","authors":["Sizhe Wang","Yongqi Tong","Hengyuan Zhang","Dawei Li","Xin Zhang","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10912v1","updated":"2024-11-16T23:29:32Z","published":"2024-11-16T23:29:32Z","title":"SPICA: Retrieving Scenarios for Pluralistic In-Context Alignment","summary":" Alignment of large language models (LLMs) to societal values should account\nfor pluralistic values from diverse groups. One technique uses in-context\nlearning for inference-time alignment, but only considers similarity when\ndrawing few-shot examples, not accounting for cross-group differences in value\nprioritization. We propose SPICA, a framework for pluralistic alignment that\naccounts for group-level differences during in-context example retrieval. SPICA\nintroduces three designs to facilitate pluralistic alignment: scenario banks,\ngroup-informed metrics, and in-context alignment prompts. From an evaluation of\nSPICA on an alignment task collecting inputs from four demographic groups ($n =\n544$), our metrics retrieve in-context examples that more closely match\nobserved preferences, with the best prompt configuration using multiple\ncontrastive responses to demonstrate examples. In an end-to-end evaluation ($n\n= 80$), we observe that SPICA-aligned models are higher rated than a baseline\nsimilarity-only retrieval approach, with groups seeing up to a +0.16 point\nimprovement on a 5 point scale. Additionally, gains from SPICA were more\nuniform, with all groups benefiting from alignment rather than only some.\nFinally, we find that while a group-agnostic approach can effectively align to\naggregated values, it is not most suited for aligning to divergent groups.\n","authors":["Quan Ze Chen","K. J. Kevin Feng","Chan Young Park","Amy X. 
Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11798v2","updated":"2024-11-16T23:19:51Z","published":"2024-07-16T14:52:02Z","title":"PipeInfer: Accelerating LLM Inference using Asynchronous Pipelined\n Speculation","summary":" Inference of Large Language Models (LLMs) across computer clusters has become\na focal point of research in recent times, with many acceleration techniques\ntaking inspiration from CPU speculative execution. These techniques reduce\nbottlenecks associated with memory bandwidth, but also increase end-to-end\nlatency per inference run, requiring high speculation acceptance rates to\nimprove performance. Combined with a variable rate of acceptance across tasks,\nspeculative inference techniques can result in reduced performance.\nAdditionally, pipeline-parallel designs require many user requests to maintain\nmaximum utilization. As a remedy, we propose PipeInfer, a pipelined speculative\nacceleration technique to reduce inter-token latency and improve system\nutilization for single-request scenarios while also improving tolerance to low\nspeculation acceptance rates and low-bandwidth interconnects. PipeInfer\nexhibits up to a 2.15$\\times$ improvement in generation speed over standard\nspeculative inference. 
PipeInfer achieves its improvement through Continuous\nAsynchronous Speculation and Early Inference Cancellation, the former improving\nlatency and generation speed by running single-token inference simultaneously\nwith several speculative runs, while the latter improves speed and latency by\nskipping the computation of invalidated runs, even in the middle of inference.\n","authors":["Branden Butler","Sixing Yu","Arya Mazaheri","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2407.11798v2.pdf","comment":"11 pages, submitted to SC24 conference"},{"id":"http://arxiv.org/abs/2409.10715v2","updated":"2024-11-16T20:50:11Z","published":"2024-09-16T20:38:35Z","title":"Self-Attention Limits Working Memory Capacity of Transformer-Based\n Models","summary":" Recent work on Transformer-based large language models (LLMs) has revealed\nstriking limits in their working memory capacity, similar to what has been\nfound in human behavioral studies. Specifically, these models' performance\ndrops significantly on N-back tasks as N increases. However, there is still a\nlack of mechanistic interpretability as to why this phenomenon would arise.\nInspired by the executive attention theory from behavioral sciences, we\nhypothesize that the self-attention mechanism within Transformer-based models\nmight be responsible for their working memory capacity limits. To test this\nhypothesis, we train vanilla decoder-only transformers to perform N-back tasks\nand find that attention scores gradually aggregate to the N-back positions over\ntraining, suggesting that the model masters the task by learning a strategy to\npay attention to the relationship between the current position and the N-back\nposition. Critically, we find that the total entropy of the attention score\nmatrix increases as N increases, suggesting that the dispersion of attention\nscores might be the cause of the capacity limit observed in N-back tasks. 
Our\nfindings thus offer insights into the shared role of attention in both human\nand artificial intelligence. Moreover, the limitations of the self-attention\nmechanism revealed in the current study could inform future efforts to design\nmore powerful model architectures with enhanced working memory capacity and\ncognitive capabilities.\n","authors":["Dongyu Gong","Hantao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.10715v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2403.13112v3","updated":"2024-11-16T20:39:46Z","published":"2024-03-19T19:27:23Z","title":"Efficient Encoder-Decoder Transformer Decoding for Decomposable Tasks","summary":" Transformer-based NLP models are powerful but have high computational costs\nthat limit deployment. Finetuned encoder-decoder models are popular in\nspecialized domains and can outperform larger more generalized decoder-only\nmodels, such as GPT-4. We introduce a new configuration for encoder-decoder\nmodels that improves efficiency on structured output and decomposable tasks\nwhere multiple outputs are required for a single shared input. Our method,\nprompt-in-decoder (PiD), encodes the input once and decodes the output in\nparallel, boosting both training and inference efficiency by avoiding duplicate\ninput encoding and increasing the operational intensity (ratio of numbers of\narithmetic operation to memory access) of decoding process by sharing the input\nkey-value cache. We achieve computation reduction that roughly scales with the\nnumber of subtasks, gaining up to 4.6x speed-up over state-of-the-art models\nfor dialogue state tracking, summarization, and question-answering tasks, with\ncomparable or better performance.\n","authors":["Bo-Ru Lu","Nikita Haduong","Chien-Yu Lin","Hao Cheng","Noah A. 
Smith","Mari Ostendorf"],"pdf_url":"https://arxiv.org/pdf/2403.13112v3.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2411.10879v1","updated":"2024-11-16T20:20:15Z","published":"2024-11-16T20:20:15Z","title":"BanglaDialecto: An End-to-End AI-Powered Regional Speech Standardization","summary":" This study focuses on recognizing Bangladeshi dialects and converting diverse\nBengali accents into standardized formal Bengali speech. Dialects, often\nreferred to as regional languages, are distinctive variations of a language\nspoken in a particular location and are identified by their phonetics,\npronunciations, and lexicon. Subtle changes in pronunciation and intonation are\nalso influenced by geographic location, educational attainment, and\nsocioeconomic status. Dialect standardization is needed to ensure effective\ncommunication, educational consistency, access to technology, economic\nopportunities, and the preservation of linguistic resources while respecting\ncultural diversity. Being the fifth most spoken language with around 55\ndistinct dialects spoken by 160 million people, addressing Bangla dialects is\ncrucial for developing inclusive communication tools. However, limited research\nexists due to a lack of comprehensive datasets and the challenges of handling\ndiverse dialects. With the advancement in multilingual Large Language Models\n(mLLMs), emerging possibilities have been created to address the challenges of\ndialectal Automated Speech Recognition (ASR) and Machine Translation (MT). This\nstudy presents an end-to-end pipeline for converting dialectal Noakhali speech\nto standard Bangla speech. This investigation includes constructing a\nlarge-scale diverse dataset with dialectal speech signals that tailored the\nfine-tuning process in ASR and LLM for transcribing the dialect speech to\ndialect text and translating the dialect text to standard Bangla text. 
Our\nexperiments demonstrated that fine-tuning the Whisper ASR model achieved a CER\nof 0.8% and WER of 1.5%, while the BanglaT5 model attained a BLEU score of\n41.6% for dialect-to-standard text translation.\n","authors":["Md. Nazmus Sadat Samin","Jawad Ibn Ahad","Tanjila Ahmed Medha","Fuad Rahman","Mohammad Ruhul Amin","Nabeel Mohammed","Shafin Rahman"],"pdf_url":"https://arxiv.org/pdf/2411.10879v1.pdf","comment":"Accepted in 2024 IEEE International Conference on Big Data (IEEE\n BigData)"},{"id":"http://arxiv.org/abs/2411.10878v1","updated":"2024-11-16T20:18:57Z","published":"2024-11-16T20:18:57Z","title":"Empowering Meta-Analysis: Leveraging Large Language Models for\n Scientific Synthesis","summary":" This study investigates the automation of meta-analysis in scientific\ndocuments using large language models (LLMs). Meta-analysis is a robust\nstatistical method that synthesizes the findings of multiple studies support\narticles to provide a comprehensive understanding. We know that a meta-article\nprovides a structured analysis of several articles. However, conducting\nmeta-analysis by hand is labor-intensive, time-consuming, and susceptible to\nhuman error, highlighting the need for automated pipelines to streamline the\nprocess. Our research introduces a novel approach that fine-tunes the LLM on\nextensive scientific datasets to address challenges in big data handling and\nstructured data extraction. We automate and optimize the meta-analysis process\nby integrating Retrieval Augmented Generation (RAG). Tailored through prompt\nengineering and a new loss metric, Inverse Cosine Distance (ICD), designed for\nfine-tuning on large contextual datasets, LLMs efficiently generate structured\nmeta-analysis content. Human evaluation then assesses relevance and provides\ninformation on model performance in key metrics. 
This research demonstrates\nthat fine-tuned models outperform non-fine-tuned models, with fine-tuned LLMs\ngenerating 87.6% relevant meta-analysis abstracts. The relevance of the\ncontext, based on human evaluation, shows a reduction in irrelevancy from 4.56%\nto 1.9%. These experiments were conducted in a low-resource environment,\nhighlighting the study's contribution to enhancing the efficiency and\nreliability of meta-analysis automation.\n","authors":["Jawad Ibn Ahad","Rafeed Mohammad Sultan","Abraham Kaikobad","Fuad Rahman","Mohammad Ruhul Amin","Nabeel Mohammed","Shafin Rahman"],"pdf_url":"https://arxiv.org/pdf/2411.10878v1.pdf","comment":"Accepted in 2024 IEEE International Conference on Big Data (IEEE\n BigData)"},{"id":"http://arxiv.org/abs/2411.10869v1","updated":"2024-11-16T19:23:52Z","published":"2024-11-16T19:23:52Z","title":"Large Language Models (LLMs) as Traffic Control Systems at Urban\n Intersections: A New Paradigm","summary":" This study introduces a novel approach for traffic control systems by using\nLarge Language Models (LLMs) as traffic controllers. The study utilizes their\nlogical reasoning, scene understanding, and decision-making capabilities to\noptimize throughput and provide feedback based on traffic conditions in\nreal-time. LLMs centralize traditionally disconnected traffic control processes\nand can integrate traffic data from diverse sources to provide context-aware\ndecisions. LLMs can also deliver tailored outputs using various means such as\nwireless signals and visuals to drivers, infrastructures, and autonomous\nvehicles. To evaluate LLMs ability as traffic controllers, this study proposed\na four-stage methodology. The methodology includes data creation and\nenvironment initialization, prompt engineering, conflict identification, and\nfine-tuning. We simulated multi-lane four-leg intersection scenarios and\ngenerates detailed datasets to enable conflict detection using LLMs and Python\nsimulation as a ground truth. 
We used chain-of-thought prompts to lead LLMs in\nunderstanding the context, detecting conflicts, resolving them using traffic\nrules, and delivering context-sensitive traffic management solutions. We\nevaluated the prformance GPT-mini, Gemini, and Llama as traffic controllers.\nResults showed that the fine-tuned GPT-mini achieved 83% accuracy and an\nF1-score of 0.84. GPT-mini model exhibited a promising performance in\ngenerating actionable traffic management insights, with high ROUGE-L scores\nacross conflict identification of 0.95, decision-making of 0.91, priority\nassignment of 0.94, and waiting time optimization of 0.92. We demonstrated that\nLLMs can offer precise recommendations to drivers in real-time including\nyielding, slowing, or stopping based on vehicle dynamics.\n","authors":["Sari Masri","Huthaifa I. Ashqar","Mohammed Elhenawy"],"pdf_url":"https://arxiv.org/pdf/2411.10869v1.pdf","comment":"The data and code that support the findings of this study are openly\n available in Zenodo at https://doi.org/10.5281/zenodo.14171745, reference\n number 14171745"},{"id":"http://arxiv.org/abs/2402.15302v5","updated":"2024-11-16T19:21:32Z","published":"2024-02-23T13:03:12Z","title":"How (un)ethical are instruction-centric responses of LLMs? Unveiling the\n vulnerabilities of safety guardrails to harmful queries","summary":" In this study, we tackle a growing concern around the safety and ethical use\nof large language models (LLMs). Despite their potential, these models can be\ntricked into producing harmful or unethical content through various\nsophisticated methods, including 'jailbreaking' techniques and targeted\nmanipulation. Our work zeroes in on a specific issue: to what extent LLMs can\nbe led astray by asking them to generate responses that are instruction-centric\nsuch as a pseudocode, a program or a software snippet as opposed to vanilla\ntext. 
To investigate this question, we introduce TechHazardQA, a dataset\ncontaining complex queries which should be answered in both text and\ninstruction-centric formats (e.g., pseudocodes), aimed at identifying triggers\nfor unethical responses. We query a series of LLMs -- Llama-2-13b, Llama-2-7b,\nMistral-V2 and Mistral 8X7B -- and ask them to generate both text and\ninstruction-centric responses. For evaluation we report the harmfulness score\nmetric as well as judgements from GPT-4 and humans. Overall, we observe that\nasking LLMs to produce instruction-centric responses enhances the unethical\nresponse generation by ~2-38% across the models. As an additional objective, we\ninvestigate the impact of model editing using the ROME technique, which further\nincreases the propensity for generating undesirable content. In particular,\nasking edited LLMs to generate instruction-centric responses further increases\nthe unethical response generation by ~3-16% across the different models.\n","authors":["Somnath Banerjee","Sayan Layek","Rima Hazra","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2402.15302v5.pdf","comment":"Accepted at AAAI Conference on Web and Social Media (ICWSM) 2025.\n [Dataset](https://huggingface.co/datasets/SoftMINER-Group/TechHazardQA)"},{"id":"http://arxiv.org/abs/2411.05049v2","updated":"2024-11-16T18:58:35Z","published":"2024-11-07T06:34:48Z","title":"ProverbEval: Exploring LLM Evaluation Challenges for Low-resource\n Language Understanding","summary":" With the rapid development of evaluation datasets to assess LLMs\nunderstanding across a wide range of subjects and domains, identifying a\nsuitable language understanding benchmark has become increasingly challenging.\nIn this work, we explore LLM evaluation challenges for low-resource language\nunderstanding and introduce ProverbEval, LLM evaluation benchmark for\nlow-resource languages based on proverbs to focus on low-resource language\nunderstanding in culture-specific scenarios. 
We benchmark various LLMs and\nexplore factors that create variability in the benchmarking process. We\nobserved performance variances of up to 50%, depending on the order in which\nanswer choices were presented in multiple-choice tasks. Native language proverb\ndescriptions significantly improve tasks such as proverb generation,\ncontributing to improved outcomes. Additionally, monolingual evaluations\nconsistently outperformed their cross-lingual counterparts. We argue special\nattention must be given to the order of choices, choice of prompt language,\ntask variability, and generation tasks when creating LLM evaluation benchmarks.\n","authors":["Israel Abebe Azime","Atnafu Lambebo Tonja","Tadesse Destaw Belay","Yonas Chanie","Bontu Fufa Balcha","Negasi Haile Abadi","Henok Biadglign Ademtew","Mulubrhan Abebe Nerea","Debela Desalegn Yadeta","Derartu Dagne Geremew","Assefa Atsbiha tesfau","Philipp Slusallek","Thamar Solorio","Dietrich Klakow"],"pdf_url":"https://arxiv.org/pdf/2411.05049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11109v5","updated":"2024-11-16T18:56:32Z","published":"2024-06-17T00:18:31Z","title":"Investigating Annotator Bias in Large Language Models for Hate Speech\n Detection","summary":" Data annotation, the practice of assigning descriptive labels to raw data, is\npivotal in optimizing the performance of machine learning models. However, it\nis a resource-intensive process susceptible to biases introduced by annotators.\nThe emergence of sophisticated Large Language Models (LLMs) presents a unique\nopportunity to modernize and streamline this complex procedure. While existing\nresearch extensively evaluates the efficacy of LLMs, as annotators, this paper\ndelves into the biases present in LLMs when annotating hate speech data. Our\nresearch contributes to understanding biases in four key categories: gender,\nrace, religion, and disability with four LLMs: GPT-3.5, GPT-4o, Llama-3.1 and\nGemma-2. 
Specifically targeting highly vulnerable groups within these\ncategories, we analyze annotator biases. Furthermore, we conduct a\ncomprehensive examination of potential factors contributing to these biases by\nscrutinizing the annotated data. We introduce our custom hate speech detection\ndataset, HateBiasNet, to conduct this research. Additionally, we perform the\nsame experiments on the ETHOS (Mollas et al. 2022) dataset also for comparative\nanalysis. This paper serves as a crucial resource, guiding researchers and\npractitioners in harnessing the potential of LLMs for data annotation, thereby\nfostering advancements in this critical field.\n","authors":["Amit Das","Zheng Zhang","Najib Hasan","Souvika Sarkar","Fatemeh Jamshidi","Tathagata Bhattacharya","Mostafa Rahgouy","Nilanjana Raychawdhary","Dongji Feng","Vinija Jain","Aman Chadha","Mary Sandage","Lauramarie Pope","Gerry Dozier","Cheryl Seals"],"pdf_url":"https://arxiv.org/pdf/2406.11109v5.pdf","comment":"Accepted at NeurIPS Safe Generative AI Workshop, 2024"},{"id":"http://arxiv.org/abs/2410.14668v3","updated":"2024-11-16T18:47:18Z","published":"2024-10-18T17:57:40Z","title":"MiCEval: Unveiling Multimodal Chain of Thought's Quality via Image\n Description and Reasoning Steps","summary":" Multimodal Chain of Thought (MCoT) is a popular prompting strategy for\nimproving the performance of multimodal large language models (MLLMs) across a\nrange of complex reasoning tasks. Despite its popularity, there is a notable\nabsence of automated methods for evaluating the quality of reasoning steps in\nMCoT. To address this gap, we propose Multimodal Chain-of-Thought Evaluation\n(MiCEval), a framework designed to assess the correctness of reasoning chains\nby evaluating the quality of both the description and each reasoning step. 
The\nevaluation of the description component focuses on the accuracy of the image\ndescriptions, while the reasoning step evaluates the quality of each step as it\nis conditionally generated based on the preceding steps. MiCEval is built upon\na fine-grained dataset with annotations that rate each step according to\ncorrectness, relevance, and informativeness. Extensive experiments on four\nstate-of-the-art MLLMs show that step-wise evaluations using MiCEval align more\nclosely with human judgments compared to existing methods based on cosine\nsimilarity or fine-tuning approaches. MiCEval datasets and code can be found in\nhttps://github.com/alenai97/MiCEval.\n","authors":["Xiongtao Zhou","Jie He","Lanyu Chen","Jingyu Li","Haojing Chen","Víctor Gutiérrez-Basulto","Jeff Z. Pan","Hanjie Chen"],"pdf_url":"https://arxiv.org/pdf/2410.14668v3.pdf","comment":"41 pages"},{"id":"http://arxiv.org/abs/2411.10857v1","updated":"2024-11-16T18:32:38Z","published":"2024-11-16T18:32:38Z","title":"Large Vision-Language Models for Remote Sensing Visual Question\n Answering","summary":" Remote Sensing Visual Question Answering (RSVQA) is a challenging task that\ninvolves interpreting complex satellite imagery to answer natural language\nquestions. Traditional approaches often rely on separate visual feature\nextractors and language processing models, which can be computationally\nintensive and limited in their ability to handle open-ended questions. In this\npaper, we propose a novel method that leverages a generative Large\nVision-Language Model (LVLM) to streamline the RSVQA process. Our approach\nconsists of a two-step training strategy: domain-adaptive pretraining and\nprompt-based finetuning. This method enables the LVLM to generate natural\nlanguage answers by conditioning on both visual and textual inputs, without the\nneed for predefined answer categories. 
We evaluate our model on the RSVQAxBEN\ndataset, demonstrating superior performance compared to state-of-the-art\nbaselines. Additionally, a human evaluation study shows that our method\nproduces answers that are more accurate, relevant, and fluent. The results\nhighlight the potential of generative LVLMs in advancing the field of remote\nsensing analysis.\n","authors":["Surasakdi Siripong","Apirak Chaiyapan","Thanakorn Phonchai"],"pdf_url":"https://arxiv.org/pdf/2411.10857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16692v3","updated":"2024-11-16T16:09:11Z","published":"2024-04-25T15:53:00Z","title":"Influence of Solution Efficiency and Valence of Instruction on Additive\n and Subtractive Solution Strategies in Humans and GPT-4","summary":" Generative artificial intelligences, particularly large language models\n(LLMs), play an increasingly prominent role in human decision-making contexts,\nnecessitating transparency about their capabilities. While prior studies have\nshown addition biases in humans (Adams et al., 2021) and OpenAI's GPT-3 (Winter\net al., 2023), this study extends the research by comparing human and GPT-4\nproblem-solving across both spatial and linguistic tasks, with variations in\nsolution efficiency and valence of task instruction. Four preregistered\nexperiments with 588 participants from the U.S. and 680 GPT-4 iterations\nrevealed a stronger tendency towards additive transformations in GPT-4 than in\nhumans. Human participants were less likely to use additive strategies when\nsubtraction was relatively more efficient than when addition and subtraction\nwere equally efficient. GPT-4 exhibited the opposite behavior, with a strong\naddition bias when subtraction was more efficient. In terms of valence of task\ninstruction, GPT-4's use of additive strategies increased when instructed to\n\"improve\" (positive) rather than \"edit\" (neutral). 
These findings demonstrate\nthat biases in human problem-solving are amplified in GPT-4, and that LLM\nbehavior differs from human efficiency-based strategies. This highlights the\nlimitations of LLMs and the need for caution when using them in real-world\napplications.\n","authors":["Lydia Uhler","Verena Jordan","Jürgen Buder","Markus Huff","Frank Papenmeier"],"pdf_url":"https://arxiv.org/pdf/2404.16692v3.pdf","comment":"29 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.10828v1","updated":"2024-11-16T15:53:03Z","published":"2024-11-16T15:53:03Z","title":"Bilingual Text-dependent Speaker Verification with Pre-trained Models\n for TdSV Challenge 2024","summary":" This paper presents our submissions to the Iranian division of the\nText-dependent Speaker Verification Challenge (TdSV) 2024. TdSV aims to\ndetermine if a specific phrase was spoken by a target speaker. We developed two\nindependent subsystems based on pre-trained models: For phrase verification, a\nphrase classifier rejected incorrect phrases, while for speaker verification, a\npre-trained ResNet293 with domain adaptation extracted speaker embeddings for\ncomputing cosine similarity scores. In addition, we evaluated Whisper-PMFA, a\npre-trained ASR model adapted for speaker verification, and found that,\nalthough it outperforms randomly initialized ResNets, it falls short of the\nperformance of pre-trained ResNets, highlighting the importance of large-scale\npre-training. The results also demonstrate that achieving competitive\nperformance on TdSV without joint modeling of speaker and text is possible. 
Our\nbest system achieved a MinDCF of 0.0358 on the evaluation subset and won the\nchallenge.\n","authors":["Seyed Ali Farokh"],"pdf_url":"https://arxiv.org/pdf/2411.10828v1.pdf","comment":"5 pages, no figures"},{"id":"http://arxiv.org/abs/2407.16222v3","updated":"2024-11-16T14:44:50Z","published":"2024-07-23T06:59:53Z","title":"PreAlign: Boosting Cross-Lingual Transfer by Early Establishment of\n Multilingual Alignment","summary":" Large language models demonstrate reasonable multilingual abilities, despite\npredominantly English-centric pretraining. However, the spontaneous\nmultilingual alignment in these models is shown to be weak, leading to\nunsatisfactory cross-lingual transfer and knowledge sharing. Previous works\nattempt to address this issue by explicitly injecting multilingual alignment\ninformation during or after pretraining. Thus for the early stage in\npretraining, the alignment is weak for sharing information or knowledge across\nlanguages. In this paper, we propose PreAlign, a framework that establishes\nmultilingual alignment prior to language model pretraining. PreAlign injects\nmultilingual alignment by initializing the model to generate similar\nrepresentations of aligned words and preserves this alignment using a\ncode-switching strategy during pretraining. Extensive experiments in a\nsynthetic English to English-Clone setting demonstrate that PreAlign\nsignificantly outperforms standard multilingual joint training in language\nmodeling, zero-shot cross-lingual transfer, and cross-lingual knowledge\napplication. 
Further experiments in real-world scenarios further validate\nPreAlign's effectiveness across various model sizes.\n","authors":["Jiahuan Li","Shujian Huang","Aarron Ching","Xinyu Dai","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.16222v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10813v1","updated":"2024-11-16T14:28:33Z","published":"2024-11-16T14:28:33Z","title":"Information Anxiety in Large Language Models","summary":" Large Language Models (LLMs) have demonstrated strong performance as\nknowledge repositories, enabling models to understand user queries and generate\naccurate and context-aware responses. Extensive evaluation setups have\ncorroborated the positive correlation between the retrieval capability of LLMs\nand the frequency of entities in their pretraining corpus. We take the\ninvestigation further by conducting a comprehensive analysis of the internal\nreasoning and retrieval mechanisms of LLMs. Our work focuses on three critical\ndimensions - the impact of entity popularity, the models' sensitivity to\nlexical variations in query formulation, and the progression of hidden state\nrepresentations across LLM layers. Our preliminary findings reveal that popular\nquestions facilitate early convergence of internal states toward the correct\nanswer. However, as the popularity of a query increases, retrieved attributes\nacross lexical variations become increasingly dissimilar and less accurate.\nInterestingly, we find that LLMs struggle to disentangle facts, grounded in\ndistinct relations, from their parametric memory when dealing with highly\npopular subjects. Through a case study, we explore these latent strains within\nLLMs when processing highly popular queries, a phenomenon we term information\nanxiety. 
The emergence of information anxiety in LLMs underscores the\nadversarial injection in the form of linguistic variations and calls for a more\nholistic evaluation of frequently occurring entities.\n","authors":["Prasoon Bajpai","Sarah Masud","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2411.10813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10761v1","updated":"2024-11-16T09:36:56Z","published":"2024-11-16T09:36:56Z","title":"Can Generic LLMs Help Analyze Child-adult Interactions Involving\n Children with Autism in Clinical Observation?","summary":" Large Language Models (LLMs) have shown significant potential in\nunderstanding human communication and interaction. However, their performance\nin the domain of child-inclusive interactions, including in clinical settings,\nremains less explored. In this work, we evaluate generic LLMs' ability to\nanalyze child-adult dyadic interactions in a clinically relevant context\ninvolving children with ASD. Specifically, we explore LLMs in performing four\ntasks: classifying child-adult utterances, predicting engaged activities,\nrecognizing language skills and understanding traits that are clinically\nrelevant. Our evaluation shows that generic LLMs are highly capable of\nanalyzing long and complex conversations in clinical observation sessions,\noften surpassing the performance of non-expert human evaluators. 
The results\nshow their potential to segment interactions of interest, assist in language\nskills evaluation, identify engaged activities, and offer clinical-relevant\ncontext for assessments.\n","authors":["Tiantian Feng","Anfeng Xu","Rimita Lahiri","Helen Tager-Flusberg","So Hyun Kim","Somer Bishop","Catherine Lord","Shrikanth Narayanan"],"pdf_url":"https://arxiv.org/pdf/2411.10761v1.pdf","comment":"GenAI for Health Workshop, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08506v2","updated":"2024-11-16T09:36:36Z","published":"2024-11-13T10:43:31Z","title":"Towards Operationalizing Right to Data Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. 
Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v2.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2411.10753v1","updated":"2024-11-16T09:20:35Z","published":"2024-11-16T09:20:35Z","title":"Chain-of-Programming (CoP) : Empowering Large Language Models for\n Geospatial Code Generation","summary":" With the rapid growth of interdisciplinary demands for geospatial modeling\nand the rise of large language models (LLMs), geospatial code generation\ntechnology has seen significant advancements. However, existing LLMs often face\nchallenges in the geospatial code generation process due to incomplete or\nunclear user requirements and insufficient knowledge of specific platform\nsyntax rules, leading to the generation of non-executable code, a phenomenon\nknown as \"code hallucination.\" To address this issue, this paper proposes a\nChain of Programming (CoP) framework, which decomposes the code generation\nprocess into five steps: requirement analysis, algorithm design, code\nimplementation, code debugging, and code annotation. The framework incorporates\na shared information pool, knowledge base retrieval, and user feedback\nmechanisms, forming an end-to-end code generation flow from requirements to\ncode without the need for model fine-tuning. Based on a geospatial problem\nclassification framework and evaluation benchmarks, the CoP strategy\nsignificantly improves the logical clarity, syntactical correctness, and\nexecutability of the generated code, with improvements ranging from 3.0% to\n48.8%. 
Comparative and ablation experiments further validate the superiority of\nthe CoP strategy over other optimization approaches and confirm the rationality\nand necessity of its key components. Through case studies on building data\nvisualization and fire data analysis, this paper demonstrates the application\nand effectiveness of CoP in various geospatial scenarios. The CoP framework\noffers a systematic, step-by-step approach to LLM-based geospatial code\ngeneration tasks, significantly enhancing code generation performance in\ngeospatial tasks and providing valuable insights for code generation in other\nvertical domains.\n","authors":["Shuyang Hou","Haoyue Jiao","Zhangxiao Shen","Jianyuan Liang","Anqi Zhao","Xiaopu Zhang","Jianxun Wang","Huayi Wu"],"pdf_url":"https://arxiv.org/pdf/2411.10753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17086v3","updated":"2024-11-16T08:20:24Z","published":"2023-10-26T01:08:47Z","title":"Transformers Learn to Achieve Second-Order Convergence Rates for\n In-Context Linear Regression","summary":" Transformers excel at in-context learning (ICL) -- learning from\ndemonstrations without parameter updates -- but how they do so remains a\nmystery. Recent work suggests that Transformers may internally run Gradient\nDescent (GD), a first-order optimization method, to perform ICL. In this paper,\nwe instead demonstrate that Transformers learn to approximate second-order\noptimization methods for ICL. For in-context linear regression, Transformers\nshare a similar convergence rate as Iterative Newton's Method, both\nexponentially faster than GD. Empirically, predictions from successive\nTransformer layers closely match different iterations of Newton's Method\nlinearly, with each middle layer roughly computing 3 iterations; thus,\nTransformers and Newton's method converge at roughly the same rate. In\ncontrast, Gradient Descent converges exponentially more slowly. 
We also show\nthat Transformers can learn in-context on ill-conditioned data, a setting where\nGradient Descent struggles but Iterative Newton succeeds. Finally, to\ncorroborate our empirical findings, we prove that Transformers can implement\n$k$ iterations of Newton's method with $k + \\mathcal{O}(1)$ layers.\n","authors":["Deqing Fu","Tian-Qi Chen","Robin Jia","Vatsal Sharan"],"pdf_url":"https://arxiv.org/pdf/2310.17086v3.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10730v1","updated":"2024-11-16T07:49:15Z","published":"2024-11-16T07:49:15Z","title":"Comparison of Multilingual and Bilingual Models for Satirical News\n Detection of Arabic and English","summary":" Satirical news is real news combined with a humorous comment or exaggerated\ncontent, and it often mimics the format and style of real news. However,\nsatirical news is often misunderstood as misinformation, especially by\nindividuals from different cultural and social backgrounds. This research\naddresses the challenge of distinguishing satire from truthful news by\nleveraging multilingual satire detection methods in English and Arabic. We\nexplore both zero-shot and chain-of-thought (CoT) prompting using two language\nmodels, Jais-chat(13B) and LLaMA-2-chat(7B). Our results show that CoT\nprompting offers a significant advantage for the Jais-chat model over the\nLLaMA-2-chat model. Specifically, Jais-chat achieved the best performance, with\nan F1-score of 80\\% in English when using CoT prompting. These results\nhighlight the importance of structured reasoning in CoT, which enhances\ncontextual understanding and is vital for complex tasks like satire detection.\n","authors":["Omar W. Abdalla","Aditya Joshi","Rahat Masood","Salil S. 
Kanhere"],"pdf_url":"https://arxiv.org/pdf/2411.10730v1.pdf","comment":"ALTA 2024 (Selected for publication)"},{"id":"http://arxiv.org/abs/2411.10724v1","updated":"2024-11-16T07:14:32Z","published":"2024-11-16T07:14:32Z","title":"HJ-Ky-0.1: an Evaluation Dataset for Kyrgyz Word Embeddings","summary":" One of the key tasks in modern applied computational linguistics is\nconstructing word vector representations (word embeddings), which are widely\nused to address natural language processing tasks such as sentiment analysis,\ninformation extraction, and more. To choose an appropriate method for\ngenerating these word embeddings, quality assessment techniques are often\nnecessary. A standard approach involves calculating distances between vectors\nfor words with expert-assessed 'similarity'. This work introduces the first\n'silver standard' dataset for such tasks in the Kyrgyz language, alongside\ntraining corresponding models and validating the dataset's suitability through\nquality evaluation metrics.\n","authors":["Anton Alekseev","Gulnara Kabaeva"],"pdf_url":"https://arxiv.org/pdf/2411.10724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10713v1","updated":"2024-11-16T05:54:36Z","published":"2024-11-16T05:54:36Z","title":"A Regularized LSTM Method for Detecting Fake News Articles","summary":" Nowadays, the rapid diffusion of fake news poses a significant problem, as it\ncan spread misinformation and confusion. This paper aims to develop an advanced\nmachine learning solution for detecting fake news articles. Leveraging a\ncomprehensive dataset of news articles, including 23,502 fake news articles and\n21,417 accurate news articles, we implemented and evaluated three\nmachine-learning models. Our dataset, curated from diverse sources, provides\nrich textual content categorized into title, text, subject, and Date features.\nThese features are essential for training robust classification models to\ndistinguish between fake and authentic news articles. 
The initial model\nemployed a Long Short-Term Memory (LSTM) network, achieving an accuracy of 94%.\nThe second model improved upon this by incorporating additional regularization\ntechniques and fine-tuning hyperparameters, resulting in a 97% accuracy. The\nfinal model combined the strengths of previous architectures with advanced\noptimization strategies, achieving a peak accuracy of 98%. These results\ndemonstrate the effectiveness of our approach in identifying fake news with\nhigh precision. Implementing these models showcases significant advancements in\nnatural language processing and machine learning techniques, contributing\nvaluable tools for combating misinformation. Our work highlights the potential\nfor deploying such models in real-world applications, providing a reliable\nmethod for automated fake news detection and enhancing the credibility of news\ndissemination.\n","authors":["Tanjina Sultana Camelia","Faizur Rahman Fahim","Md. Musfique Anwar"],"pdf_url":"https://arxiv.org/pdf/2411.10713v1.pdf","comment":"6 pages, 7 figures, 2024 IEEE International Conference on Signal\n Processing, Information, Communication and Systems (SPICSCON)"},{"id":"http://arxiv.org/abs/2411.01045v2","updated":"2024-11-16T05:22:52Z","published":"2024-11-01T21:29:07Z","title":"Towards Robust Text Classification: Mitigating Spurious Correlations\n with Causal Learning","summary":" In text classification tasks, models often rely on spurious correlations for\npredictions, incorrectly associating irrelevant features with the target\nlabels. This issue limits the robustness and generalization of models,\nespecially when faced with out-of-distribution data where such spurious\ncorrelations no longer hold. To address this challenge, we propose the Causally\nCalibrated Robust Classifier (CCR), which aims to reduce models' reliance on\nspurious correlations and improve model robustness. 
Our approach integrates a\ncausal feature selection method based on counterfactual reasoning, along with\nan unbiased inverse propensity weighting (IPW) loss function. By focusing on\nselecting causal features, we ensure that the model relies less on spurious\nfeatures during prediction. We theoretically justify our approach and\nempirically show that CCR achieves state-of-the-art performance among methods\nwithout group labels, and in some cases, it can compete with the models that\nutilize group labels.\n","authors":["Yuqing Zhou","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.01045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08273v3","updated":"2024-11-16T04:23:20Z","published":"2024-01-16T10:53:11Z","title":"Large Language Models are Null-Shot Learners","summary":" This paper presents null-shot prompting. Null-shot prompting exploits\nhallucination in large language models (LLMs) by instructing LLMs to utilize\ninformation from the \"Examples\" section that never exists within the provided\ncontext to perform a task. While reducing hallucination is crucial and\nnon-negligible for daily and critical uses of LLMs, we propose that in the\ncurrent landscape in which these LLMs still hallucinate, it is possible, in\nfact, to exploit hallucination to increase performance in performing tasks\ncompared to standard zero-shot prompting. Experiments with eight LLMs show\nimprovements in performance across the majority of eight datasets, including\nreading comprehension, arithmetic reasoning, and closed-book question\nanswering. The observed inconsistency in increased relative performance across\nthe LLMs also potentially indicates a different degree of inherent\nhallucination in each model. These differences show that it is possible to\nutilize null-shot prompting as a way to detect degrees of hallucination in LLMs\nusing existing benchmarking datasets. 
We also perform ablation studies,\nincluding experimenting with a modified version of null-shot prompting that\nincorporates ideas from zero-shot chain-of-thought prompting, which shows\ndifferent trends of results.\n","authors":["Pittawat Taveekitworachai","Febri Abdullah","Ruck Thawonmas"],"pdf_url":"https://arxiv.org/pdf/2401.08273v3.pdf","comment":"28 pages; v2: added Gemini Pro results, error analysis, and a\n discussion on confabulation; v3: see its extended version, an EMNLP 2024\n paper, at https://aclanthology.org/2024.emnlp-main.740/"},{"id":"http://arxiv.org/abs/2411.05503v2","updated":"2024-11-16T03:53:07Z","published":"2024-11-08T12:03:31Z","title":"KyrgyzNLP: Challenges, Progress, and Future","summary":" Large language models (LLMs) have excelled in numerous benchmarks, advancing\nAI applications in both linguistic and non-linguistic tasks. However, this has\nprimarily benefited well-resourced languages, leaving less-resourced ones\n(LRLs) at a disadvantage. In this paper, we highlight the current state of the\nNLP field in the specific LRL: kyrgyz tili.\n Human evaluation, including annotated datasets created by native speakers,\nremains an irreplaceable component of reliable NLP performance, especially for\nLRLs where automatic evaluations can fall short. In recent assessments of the\nresources for Turkic languages, Kyrgyz is labeled with the status 'Scraping\nBy', a severely under-resourced language spoken by millions. This is concerning\ngiven the growing importance of the language, not only in Kyrgyzstan but also\namong diaspora communities where it holds no official status.\n We review prior efforts in the field, noting that many of the publicly\navailable resources have only recently been developed, with few exceptions\nbeyond dictionaries (the processed data used for the analysis is presented at\nhttps://kyrgyznlp.github.io/). While recent papers have made some headway, much\nmore remains to be done. 
Despite interest and support from both business and\ngovernment sectors in the Kyrgyz Republic, the situation for Kyrgyz language\nresources remains challenging. We stress the importance of community-driven\nefforts to build these resources, ensuring the future advancement\nsustainability. We then share our view of the most pressing challenges in\nKyrgyz NLP. Finally, we propose a roadmap for future development in terms of\nresearch topics and language resources.\n","authors":["Anton Alekseev","Timur Turatali"],"pdf_url":"https://arxiv.org/pdf/2411.05503v2.pdf","comment":"Keynote talk at the 12th International Conference on Analysis of\n Images, Social Networks and Texts (AIST-2024)"},{"id":"http://arxiv.org/abs/2411.10681v1","updated":"2024-11-16T03:12:17Z","published":"2024-11-16T03:12:17Z","title":"Structured Dialogue System for Mental Health: An LLM Chatbot Leveraging\n the PM+ Guidelines","summary":" The Structured Dialogue System, referred to as SuDoSys, is an innovative\nLarge Language Model (LLM)-based chatbot designed to provide psychological\ncounseling. SuDoSys leverages the World Health Organization (WHO)'s Problem\nManagement Plus (PM+) guidelines to deliver stage-aware multi-turn dialogues.\nExisting methods for employing an LLM in multi-turn psychological counseling\ntypically involve direct fine-tuning using generated dialogues, often\nneglecting the dynamic stage shifts of counseling sessions. Unlike previous\napproaches, SuDoSys considers the different stages of counseling and stores\nessential information throughout the counseling process, ensuring coherent and\ndirected conversations. The system employs an LLM, a stage-aware instruction\ngenerator, a response unpacker, a topic database, and a stage controller to\nmaintain dialogue flow. In addition, we propose a novel technique that\nsimulates counseling clients to interact with the evaluated system and evaluate\nits performance automatically. 
When assessed using both objective and\nsubjective evaluations, SuDoSys demonstrates its effectiveness in generating\nlogically coherent responses. The system's code and program scripts for\nevaluation are open-sourced.\n","authors":["Yixiang Chen","Xinyu Zhang","Jinran Wang","Xurong Xie","Nan Yan","Hui Chen","Lan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.10681v1.pdf","comment":"Accepted to the 16th International Conference on Social Robotic (ICSR\n 2024)"},{"id":"http://arxiv.org/abs/2406.06046v2","updated":"2024-11-16T02:59:22Z","published":"2024-06-10T06:27:42Z","title":"MATES: Model-Aware Data Selection for Efficient Pretraining with Data\n Influence Models","summary":" Pretraining data selection has the potential to improve language model\npretraining efficiency by utilizing higher-quality data from massive web data\ncorpora. Current data selection methods, which rely on either hand-crafted\nrules or larger reference models, are conducted statically and do not capture\nthe evolving data preferences during pretraining. In this paper, we introduce\nmodel-aware data selection with data influence models (MATES), where a data\ninfluence model continuously adapts to the evolving data preferences of the\npretraining model and then selects the data most effective for the current\npretraining progress. Specifically, we collect oracle data influence by locally\nprobing the pretraining model and fine-tune a small data influence model to\napproximate it accurately. The data influence model then predicts data\ninfluence over the whole pretraining corpus and selects the most influential\ndata for the next pretraining stage. Experiments of pretraining 410M and 1B\nmodels on the C4 dataset demonstrate that MATES significantly outperforms\nrandom data selection on extensive downstream tasks. 
It doubles the gains\nachieved by the state-of-the-art data selection approach that leverages larger\nreference models and reduces the total FLOPs required to reach certain\nperformances by half. Further analyses validate the effectiveness of the\nlocally probed oracle data influence and the approximation with data influence\nmodels. Our code is open-sourced at https://github.com/cxcscmu/MATES.\n","authors":["Zichun Yu","Spandan Das","Chenyan Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.06046v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10670v1","updated":"2024-11-16T02:16:59Z","published":"2024-11-16T02:16:59Z","title":"IntentGPT: Few-shot Intent Discovery with Large Language Models","summary":" In today's digitally driven world, dialogue systems play a pivotal role in\nenhancing user interactions, from customer service to virtual assistants. In\nthese dialogues, it is important to identify user's goals automatically to\nresolve their needs promptly. This has necessitated the integration of models\nthat perform Intent Detection. However, users' intents are diverse and dynamic,\nmaking it challenging to maintain a fixed set of predefined intents. As a\nresult, a more practical approach is to develop a model capable of identifying\nnew intents as they emerge. We address the challenge of Intent Discovery, an\narea that has drawn significant attention in recent research efforts. Existing\nmethods need to train on a substantial amount of data for correctly identifying\nnew intents, demanding significant human effort. 
To overcome this, we introduce\nIntentGPT, a novel training-free method that effectively prompts Large Language\nModels (LLMs) such as GPT-4 to discover new intents with minimal labeled data.\nIntentGPT comprises an \\textit{In-Context Prompt Generator}, which generates\ninformative prompts for In-Context Learning, an \\textit{Intent Predictor} for\nclassifying and discovering user intents from utterances, and a\n\\textit{Semantic Few-Shot Sampler} that selects relevant few-shot examples and\na set of known intents to be injected into the prompt. Our experiments show\nthat IntentGPT outperforms previous methods that require extensive\ndomain-specific data and fine-tuning, in popular benchmarks, including CLINC\nand BANKING, among others.\n","authors":["Juan A. Rodriguez","Nicholas Botzer","David Vazquez","Christopher Pal","Marco Pedersoli","Issam Laradji"],"pdf_url":"https://arxiv.org/pdf/2411.10670v1.pdf","comment":"ICLR 2024 Workshop on LLM Agents"},{"id":"http://arxiv.org/abs/2408.13545v2","updated":"2024-11-16T02:08:31Z","published":"2024-08-24T10:34:20Z","title":"IQA-EVAL: Automatic Evaluation of Human-Model Interactive Question\n Answering","summary":" To evaluate Large Language Models (LLMs) for question answering (QA),\ntraditional methods typically focus on assessing single-turn responses to given\nquestions. However, this approach doesn't capture the dynamic nature of\nhuman-AI interactions, where humans actively seek information through\nconversation. Recent works in human-computer interaction (HCI) have employed\nhuman evaluators to conduct interactions and evaluations, but they are often\nprohibitively expensive and time-consuming to scale. We introduce an automatic\nevaluation framework IQA-EVAL to achieve Interactive Question Answering\nEvaluations, more specifically, we introduce a LLM-based Evaluation Agent (LEA)\nthat can: (1) simulate human behaviors to generate interactions with IQA\nmodels; (2) automatically evaluate the generated interactions. 
Moreover, we\npropose assigning personas to LEAs to better simulate groups of real human\nevaluators. We show that: (1) our evaluation framework with GPT-4 (or Claude)\nas the backbone model achieves a high correlation with human evaluations on the\nIQA task; (2) assigning personas to LEA to better represent the crowd further\nsignificantly improves correlations. Finally, we use our automatic metric to\nevaluate five recent representative LLMs with over 1000 questions from complex\nand ambiguous question answering tasks, which comes with a substantial cost of\n$5k if evaluated by humans.\n","authors":["Ruosen Li","Ruochen Li","Barry Wang","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.13545v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.10666v1","updated":"2024-11-16T02:02:49Z","published":"2024-11-16T02:02:49Z","title":"SAM Decoding: Speculative Decoding via Suffix Automaton","summary":" Large Language Models (LLMs) have revolutionized natural language processing\nby unifying tasks into text generation, yet their large parameter sizes and\nautoregressive nature limit inference speed. SAM-Decoding addresses this by\nintroducing a novel retrieval-based speculative decoding method that uses a\nsuffix automaton for efficient and accurate draft generation. Unlike n-gram\nmatching used by the existing method, SAM-Decoding finds the longest suffix\nmatch in generating text and text corpuss, achieving an average time complexity\nof $O(1)$ per generation step. SAM-Decoding constructs static and dynamic\nsuffix automatons for the text corpus and input prompts, respectively, enabling\nfast and precise draft generation. Meanwhile, it is designed as an approach\nthat can be combined with existing methods, allowing SAM-Decoding to adaptively\nselect a draft generation strategy based on the matching length, thus\nincreasing the inference speed of the LLM. 
When combined with Token Recycling,\nevaluations show SAM-Decoding outperforms existing model-free methods,\nachieving a speedup of $2.27\\times$ over autoregressive decoding on Spec-Bench.\nWhen combined with EAGLE2, it reaches a speedup of $2.49\\times$, surpassing all\ncurrent approaches. Our code is available at\nhttps://github.com/hyx1999/SAM-Decoding.\n","authors":["Yuxuan Hu","Ke Wang","Jing Zhang","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10666v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.01768v2","updated":"2024-11-16T00:54:09Z","published":"2024-04-02T09:31:32Z","title":"Stereotype Detection in LLMs: A Multiclass, Explainable, and\n Benchmark-Driven Approach","summary":" Stereotype detection is a challenging and subjective task, as certain\nstatements, such as \"Black people like to play basketball,\" may not appear\novertly toxic but still reinforce racial stereotypes. With the increasing\nprevalence of large language models (LLMs) in human-facing artificial\nintelligence (AI) applications, detecting these types of biases is essential.\nHowever, LLMs risk perpetuating and amplifying stereotypical outputs derived\nfrom their training data. A reliable stereotype detector is crucial for\nbenchmarking bias, monitoring model input and output, filtering training data,\nand ensuring fairer model behavior in downstream applications. This paper\nintroduces the Multi-Grain Stereotype (MGS) dataset, consisting of 51,867\ninstances across gender, race, profession, religion, and other stereotypes,\ncurated from multiple existing datasets. We evaluate various machine learning\napproaches to establish baselines and fine-tune language models of different\narchitectures and sizes, presenting a suite of stereotype multiclass\nclassifiers trained on the MGS dataset. Given the subjectivity of stereotypes,\nexplainability is essential to align model learning with human understanding of\nstereotypes. 
We employ explainable AI (XAI) tools, including SHAP, LIME, and\nBertViz, to assess whether the model's learned patterns align with human\nintuitions about stereotypes.Additionally, we develop stereotype elicitation\nprompts and benchmark the presence of stereotypes in text generation tasks\nusing popular LLMs, employing the best-performing stereotype classifiers.\n","authors":["Zekun Wu","Sahan Bulathwela","Maria Perez-Ortiz","Adriano Soares Koshiyama"],"pdf_url":"https://arxiv.org/pdf/2404.01768v2.pdf","comment":"Under review as a conference paper at ARR October 2024"},{"id":"http://arxiv.org/abs/2409.11149v3","updated":"2024-11-16T00:28:03Z","published":"2024-09-17T13:03:12Z","title":"SAGED: A Holistic Bias-Benchmarking Pipeline for Language Models with\n Customisable Fairness Calibration","summary":" The development of unbiased large language models is widely recognized as\ncrucial, yet existing benchmarks fall short in detecting biases due to limited\nscope, contamination, and lack of a fairness baseline. SAGED(-Bias) is the\nfirst holistic benchmarking pipeline to address these problems. The pipeline\nencompasses five core stages: scraping materials, assembling benchmarks,\ngenerating responses, extracting numeric features, and diagnosing with\ndisparity metrics. SAGED includes metrics for max disparity, such as impact\nratio, and bias concentration, such as Max Z-scores. Noticing that assessment\ntool bias and contextual bias in prompts can distort evaluation, SAGED\nimplements counterfactual branching and baseline calibration for mitigation.\nFor demonstration, we use SAGED on G20 Countries with popular 8b-level models\nincluding Gemma2, Llama3.1, Mistral, and Qwen2. With sentiment analysis, we\nfind that while Mistral and Qwen2 show lower max disparity and higher bias\nconcentration than Gemma2 and Llama3.1, all models are notably biased against\ncountries like Russia and (except for Qwen2) China. With further experiments to\nhave models role-playing U.S. 
(vice-/former-) presidents, we see bias amplifies\nand shifts in heterogeneous directions. Moreover, we see Qwen2 and Mistral not\nengage in role-playing, while Llama3.1 and Gemma2 role-play Trump notably more\nintensively than Biden and Harris, indicating role-playing performance bias in\nthese models.\n","authors":["Xin Guan","Nathaniel Demchak","Saloni Gupta","Ze Wang","Ediz Ertekin Jr.","Adriano Koshiyama","Emre Kazim","Zekun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11149v3.pdf","comment":"Submitted to COLING 2025 Main Conference"},{"id":"http://arxiv.org/abs/2411.10640v1","updated":"2024-11-16T00:14:51Z","published":"2024-11-16T00:14:51Z","title":"BlueLM-V-3B: Algorithm and System Co-Design for Multimodal Large\n Language Models on Mobile Devices","summary":" The emergence and growing popularity of multimodal large language models\n(MLLMs) have significant potential to enhance various aspects of daily life,\nfrom improving communication to facilitating learning and problem-solving.\nMobile phones, as essential daily companions, represent the most effective and\naccessible deployment platform for MLLMs, enabling seamless integration into\neveryday tasks. However, deploying MLLMs on mobile phones presents challenges\ndue to limitations in memory size and computational capability, making it\ndifficult to achieve smooth and real-time processing without extensive\noptimization. In this paper, we present BlueLM-V-3B, an algorithm and system\nco-design approach specifically tailored for the efficient deployment of MLLMs\non mobile platforms. To be specific, we redesign the dynamic resolution scheme\nadopted by mainstream MLLMs and implement system optimization for\nhardware-aware deployment to optimize model inference on mobile phones.\nBlueLM-V-3B boasts the following key highlights: (1) Small Size: BlueLM-V-3B\nfeatures a language model with 2.7B parameters and a vision encoder with 400M\nparameters. 
(2) Fast Speed: BlueLM-V-3B achieves a generation speed of 24.4\ntoken/s on the MediaTek Dimensity 9300 processor with 4-bit LLM weight\nquantization. (3) Strong Performance: BlueLM-V-3B has attained the highest\naverage score of 66.1 on the OpenCompass benchmark among models with $\\leq$ 4B\nparameters and surpassed a series of models with much larger parameter sizes\n(e.g., MiniCPM-V-2.6, InternVL2-8B).\n","authors":["Xudong Lu","Yinghao Chen","Cheng Chen","Hui Tan","Boheng Chen","Yina Xie","Rui Hu","Guanxin Tan","Renshou Wu","Yan Hu","Yi Zeng","Lei Wu","Liuyang Bian","Zhaoxiong Wang","Long Liu","Yanzhou Yang","Han Xiao","Aojun Zhou","Yafei Wen","Xiaoxin Chen","Shuai Ren","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2411.10640v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2411.10639v1","updated":"2024-11-16T00:14:13Z","published":"2024-11-16T00:14:13Z","title":"MTA: Multimodal Task Alignment for BEV Perception and Captioning","summary":" Bird's eye view (BEV)-based 3D perception plays a crucial role in autonomous\ndriving applications. The rise of large language models has spurred interest in\nBEV-based captioning to understand object behavior in the surrounding\nenvironment. However, existing approaches treat perception and captioning as\nseparate tasks, focusing on the performance of only one of the tasks and\noverlooking the potential benefits of multimodal alignment. To bridge this gap\nbetween modalities, we introduce MTA, a novel multimodal task alignment\nframework that boosts both BEV perception and captioning. MTA consists of two\nkey components: (1) BEV-Language Alignment (BLA), a contextual learning\nmechanism that aligns the BEV scene representations with ground-truth language\nrepresentations, and (2) Detection-Captioning Alignment (DCA), a cross-modal\nprompting mechanism that aligns detection and captioning outputs. MTA\nintegrates into state-of-the-art baselines during training, adding no extra\ncomputational complexity at runtime. 
Extensive experiments on the nuScenes and\nTOD3Cap datasets show that MTA significantly outperforms state-of-the-art\nbaselines, achieving a 4.9% improvement in perception and a 9.2% improvement in\ncaptioning. These results underscore the effectiveness of unified alignment in\nreconciling BEV-based perception and captioning.\n","authors":["Yunsheng Ma","Burhaneddin Yaman","Xin Ye","Feng Tao","Abhirup Mallik","Ziran Wang","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2411.10639v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.10636v1","updated":"2024-11-16T00:04:45Z","published":"2024-11-16T00:04:45Z","title":"Gender Bias Mitigation for Bangla Classification Tasks","summary":" In this study, we investigate gender bias in Bangla pretrained language\nmodels, a largely under explored area in low-resource languages. To assess this\nbias, we applied gender-name swapping techniques to existing datasets, creating\nfour manually annotated, task-specific datasets for sentiment analysis,\ntoxicity detection, hate speech detection, and sarcasm detection. By altering\nnames and gender-specific terms, we ensured these datasets were suitable for\ndetecting and mitigating gender bias. We then proposed a joint loss\noptimization technique to mitigate gender bias across task-specific pretrained\nmodels. Our approach was evaluated against existing bias mitigation methods,\nwith results showing that our technique not only effectively reduces bias but\nalso maintains competitive accuracy compared to other baseline approaches. 
To\npromote further research, we have made both our implementation and datasets\npublicly available\nhttps://github.com/sajib-kumar/Gender-Bias-Mitigation-From-Bangla-PLM\n","authors":["Sajib Kumar Saha Joy","Arman Hassan Mahy","Meherin Sultana","Azizah Mamun Abha","MD Piyal Ahmmed","Yue Dong","G M Shahariar"],"pdf_url":"https://arxiv.org/pdf/2411.10636v1.pdf","comment":null}]},"2024-11-19T00:00:00Z":{"Robotics":[{"id":"http://arxiv.org/abs/2411.12734v1","updated":"2024-11-19T18:57:41Z","published":"2024-11-19T18:57:41Z","title":"Soft Robotic Dynamic In-Hand Pen Spinning","summary":" Dynamic in-hand manipulation remains a challenging task for soft robotic\nsystems that have demonstrated advantages in safe compliant interactions but\nstruggle with high-speed dynamic tasks. In this work, we present SWIFT, a\nsystem for learning dynamic tasks using a soft and compliant robotic hand.\nUnlike previous works that rely on simulation, quasi-static actions and precise\nobject models, the proposed system learns to spin a pen through trial-and-error\nusing only real-world data without requiring explicit prior knowledge of the\npen's physical attributes. With self-labeled trials sampled from the real\nworld, the system discovers the set of pen grasping and spinning primitive\nparameters that enables a soft hand to spin a pen robustly and reliably. After\n130 sampled actions per object, SWIFT achieves 100% success rate across three\npens with different weights and weight distributions, demonstrating the\nsystem's generalizability and robustness to changes in object properties. The\nresults highlight the potential for soft robotic end-effectors to perform\ndynamic tasks including rapid in-hand manipulation. We also demonstrate that\nSWIFT generalizes to spinning items with different shapes and weights such as a\nbrush and a screwdriver which we spin with 10/10 and 5/10 success rates\nrespectively. 
Videos, data, and code are available at\nhttps://soft-spin.github.io.\n","authors":["Yunchao Yao","Uksang Yoo","Jean Oh","Christopher G. Atkeson","Jeffrey Ichnowski"],"pdf_url":"https://arxiv.org/pdf/2411.12734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12711v1","updated":"2024-11-19T18:25:38Z","published":"2024-11-19T18:25:38Z","title":"UBSoft: A Simulation Platform for Robotic Skill Learning in Unbounded\n Soft Environments","summary":" It is desired to equip robots with the capability of interacting with various\nsoft materials as they are ubiquitous in the real world. While physics\nsimulations are one of the predominant methods for data collection and robot\ntraining, simulating soft materials presents considerable challenges.\nSpecifically, it is significantly more costly than simulating rigid objects in\nterms of simulation speed and storage requirements. These limitations typically\nrestrict the scope of studies on soft materials to small and bounded areas,\nthereby hindering the learning of skills in broader spaces. To address this\nissue, we introduce UBSoft, a new simulation platform designed to support\nunbounded soft environments for robot skill acquisition. Our platform utilizes\nspatially adaptive resolution scales, where simulation resolution dynamically\nadjusts based on proximity to active robotic agents. Our framework markedly\nreduces the demand for extensive storage space and computation costs required\nfor large-scale scenarios involving soft materials. We also establish a set of\nbenchmark tasks in our platform, including both locomotion and manipulation\ntasks, and conduct experiments to evaluate the efficacy of various\nreinforcement learning algorithms and trajectory optimization techniques, both\ngradient-based and sampling-based. Preliminary results indicate that\nsampling-based trajectory optimization generally achieves better results for\nobtaining one trajectory to solve the task. 
Additionally, we conduct\nexperiments in real-world environments to demonstrate that advancements made in\nour UBSoft simulator could translate to improved robot interactions with\nlarge-scale soft material. More videos can be found at\nhttps://vis-www.cs.umass.edu/ubsoft/.\n","authors":["Chunru Lin","Jugang Fan","Yian Wang","Zeyuan Yang","Zhehuan Chen","Lixing Fang","Tsun-Hsuan Wang","Zhou Xian","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2411.12711v1.pdf","comment":"CoRL 2024. The first two authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2306.03530v4","updated":"2024-11-19T17:41:00Z","published":"2023-06-06T09:26:43Z","title":"RLtools: A Fast, Portable Deep Reinforcement Learning Library for\n Continuous Control","summary":" Deep Reinforcement Learning (RL) can yield capable agents and control\npolicies in several domains but is commonly plagued by prohibitively long\ntraining times. Additionally, in the case of continuous control problems, the\napplicability of learned policies on real-world embedded devices is limited due\nto the lack of real-time guarantees and portability of existing libraries. To\naddress these challenges, we present RLtools, a dependency-free, header-only,\npure C++ library for deep supervised and reinforcement learning. Its novel\narchitecture allows RLtools to be used on a wide variety of platforms, from HPC\nclusters over workstations and laptops to smartphones, smartwatches, and\nmicrocontrollers. Specifically, due to the tight integration of the RL\nalgorithms with simulation environments, RLtools can solve popular RL problems\nup to 76 times faster than other popular RL frameworks. We also benchmark the\ninference on a diverse set of microcontrollers and show that in most cases our\noptimized implementation is by far the fastest. Finally, RLtools enables the\nfirst-ever demonstration of training a deep RL algorithm directly on a\nmicrocontroller, giving rise to the field of TinyRL. 
The source code as well as\ndocumentation and live demos are available through our project page at\nhttps://rl.tools.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2306.03530v4.pdf","comment":"Project page: https://rl.tools"},{"id":"http://arxiv.org/abs/2411.12664v1","updated":"2024-11-19T17:16:47Z","published":"2024-11-19T17:16:47Z","title":"Identifying patterns of proprioception and target matching acuity in\n healthy humans","summary":" Traditional approaches to measurement in upper-limb therapy have gaps that\nelectronic sensing and recording can help fill. We highlight shortcomings in\ncurrent kinematic recording devices, and we introduce a wrist sensing device\nthat performs multimodal sensing during single-axis rotation. Our goal is to\ncharacterize normative kinesthetic perception and real-world performance as a\nmultimodal sensory \"fingerprint\" that can serve as a reference point for\nidentifying deficit in persons affected by stroke, and then as a jumping point\nfor later neuroscientific interrogation. We present an experiment involving\npsychophysical measurements of passive stimuli discrimination, matching\nadjustment acuity, and ADL performance in 11 neurologically-intact persons. We\nfound that passive velocity sense and active position sense of healthy\ncontrols, measured by velocity discrimination and position matching\nrespectively, correlated in rank with each other, but other score comparisons\nof acuity or task performance had no statistically significant correlations. We\nalso found that participants differed in acuity between passive and active\nvelocity sense, which supports current understanding about muscle spindle\nactivation being modulated by conscious motor command. 
The potential for our\nnull correlation results to reveal dissociable aspects of deficit is discussed,\nas well as implications for future neuroscientific study with more kinematic\nmeasures and larger datasets.\n","authors":["Jacob Carducci","Jeremy D. Brown"],"pdf_url":"https://arxiv.org/pdf/2411.12664v1.pdf","comment":"14 pages, 15 figures; A newer version of this work has been submitted\n to the 2024 IEEE EMBC for possible publication in their conference\n proceedings"},{"id":"http://arxiv.org/abs/2411.12658v1","updated":"2024-11-19T17:06:24Z","published":"2024-11-19T17:06:24Z","title":"Data-efficient Tactile Sensing with Electrical Impedance Tomography","summary":" Electrical Impedance Tomography (EIT)-inspired tactile sensors are gaining\nattention in robotic tactile sensing due to their cost-effectiveness, safety,\nand scalability with sparse electrode configurations. This paper presents a\ndata augmentation strategy for learning-based tactile reconstruction that\namplifies the original single-frame signal measurement into 32 distinct,\neffective signal data for training. This approach supplements uncollected\nconditions of position information, resulting in more accurate and\nhigh-resolution tactile reconstructions. Data augmentation for EIT\nsignificantly reduces the required EIT measurements and achieves promising\nperformance with even limited samples. Simulation results show that the\nproposed method improves the correlation coefficient by over 12% and reduces\nthe relative error by over 21% under various noise levels. Furthermore, we\ndemonstrate that a standard deep neural network (DNN) utilizing the proposed\ndata augmentation reduces the required data down to 1/31 while achieving a\nsimilar tactile reconstruction quality. Real-world tests further validate the\napproach's effectiveness on a flexible EIT-based tactile sensor. 
These results\ncould help address the challenge of training tactile sensing networks with\nlimited available measurements, improving the accuracy and applicability of\nEIT-based tactile sensing systems.\n","authors":["Huazhi Dong","Ronald B. Liu","Leo Micklem","Peisan Sharel E","Francesco Giorgio-Serchi","Yunjie Yang"],"pdf_url":"https://arxiv.org/pdf/2411.12658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12633v1","updated":"2024-11-19T16:45:52Z","published":"2024-11-19T16:45:52Z","title":"Instant Policy: In-Context Imitation Learning via Graph Diffusion","summary":" Following the impressive capabilities of in-context learning with large\ntransformers, In-Context Imitation Learning (ICIL) is a promising opportunity\nfor robotics. We introduce Instant Policy, which learns new tasks instantly\n(without further training) from just one or two demonstrations, achieving ICIL\nthrough two key components. First, we introduce inductive biases through a\ngraph representation and model ICIL as a graph generation problem with a\nlearned diffusion process, enabling structured reasoning over demonstrations,\nobservations, and actions. Second, we show that such a model can be trained\nusing pseudo-demonstrations - arbitrary trajectories generated in simulation -\nas a virtually infinite pool of training data. Simulated and real experiments\nshow that Instant Policy enables rapid learning of various everyday robot\ntasks. We also show how it can serve as a foundation for cross-embodiment and\nzero-shot transfer to language-defined tasks. 
Code and videos are available at\nhttps://www.robot-learning.uk/instant-policy.\n","authors":["Vitalis Vosylius","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2411.12633v1.pdf","comment":"Code and videos are available on our project webpage at\n https://www.robot-learning.uk/instant-policy"},{"id":"http://arxiv.org/abs/2411.08566v2","updated":"2024-11-19T16:03:58Z","published":"2024-11-13T12:26:08Z","title":"Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space\n Exploration by Reinforcement Learning Agent","summary":" Grasping by a robot in unstructured environments is deemed a critical\nchallenge because of the requirement for effective adaptation to a wide\nvariation in object geometries, material properties, and other environmental\nfactors. In this paper, we propose a novel framework for robotic grasping based\non the idea of compressing high-dimensional target and gripper features in a\ncommon latent space using a set of autoencoders. Our approach simplifies\ngrasping by using three autoencoders dedicated to the target, the gripper, and\na third one that fuses their latent representations. This allows the RL agent\nto achieve higher learning rates at the initial stages of exploration of a new\nenvironment, as well as at non-zero shot grasp attempts. The agent explores the\nlatent space of the third autoencoder for better quality grasp without explicit\nreconstruction of objects. By implementing the PoWER algorithm into the RL\ntraining process, updates on the agent's policy will be made through the\nperturbation in the reward-weighted latent space. The successful exploration\nefficiently constrains both position and pose integrity for feasible executions\nof grasps. We evaluate our system on a diverse set of objects, demonstrating\nthe high success rate in grasping with minimum computational overhead. 
We found\nthat approach enhances the adaptation of the RL agent by more than 35 % in\nsimulation experiments.\n","authors":["Leonidas Askianakis"],"pdf_url":"https://arxiv.org/pdf/2411.08566v2.pdf","comment":"Submitted for review at IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2411.12573v1","updated":"2024-11-19T15:41:43Z","published":"2024-11-19T15:41:43Z","title":"Locomotion Mode Transitions: Tackling System- and User-Specific\n Variability in Lower-Limb Exoskeletons","summary":" Accurate detection of locomotion transitions, such as walk to sit, walk to\nstair ascent, and descent, is crucial to effectively control robotic assistive\ndevices, such as lower-limb exoskeletons, as each locomotion mode requires\nspecific assistance. Variability in collected sensor data introduced by user-\nor system-specific characteristics makes it challenging to maintain high\ntransition detection accuracy while avoiding latency using non-adaptive\nclassification models. In this study, we identified key factors influencing\ntransition detection performance, including variations in user behavior, and\ndifferent mechanical designs of the exoskeletons. To boost the transition\ndetection accuracy, we introduced two methods for adapting a finite-state\nmachine classifier to system- and user-specific variability: a Statistics-Based\napproach and Bayesian Optimization. Our experimental results demonstrate that\nboth methods remarkably improve transition detection accuracy across diverse\nusers, achieving up to an 80% increase in certain scenarios compared to the\nnon-personalized threshold method. These findings emphasize the importance of\npersonalization in adaptive control systems, underscoring the potential for\nenhanced user experience and effectiveness in assistive devices. 
By\nincorporating subject- and system-specific data into the model training\nprocess, our approach offers a precise and reliable solution for detecting\nlocomotion transitions, catering to individual user needs, and ultimately\nimproving the performance of assistive devices.\n","authors":["Andrea Dal Prete","Zeynep Özge Orhan","Anastasia Bolotnikova","Marta Gandolla","Auke Ijspeert","Mohamed Bouri"],"pdf_url":"https://arxiv.org/pdf/2411.12573v1.pdf","comment":"16 pages, 16 figures"},{"id":"http://arxiv.org/abs/2411.12549v1","updated":"2024-11-19T15:03:02Z","published":"2024-11-19T15:03:02Z","title":"Tactile interaction with social robots influences attitudes and\n behaviour","summary":" Tactile interaction plays an essential role in human-to-human interaction.\nPeople gain comfort and support from tactile interactions with others and touch\nis an important predictor for trust. While touch has been explored as a\ncommunicative modality in HCI and HRI, we here report on two studies in which\ntouching a social robot is used to regulate people's stress levels and\nconsequently their actions. In the first study, we look at whether different\nintensities of tactile interaction result in a physiological response related\nto stress, and whether the interaction impacts risk-taking behaviour and trust.\nWe let 38 participants complete a Balloon Analogue Risk Task (BART), a\ncomputer-based game that serves as a proxy for risk-taking behaviour. In our\nstudy, participants are supported by a robot during the BART task. The robot\nbuilds trust and encourages participants to take more risk. The results show\nthat affective tactile interaction with the robot increases participants'\nrisk-taking behaviour, but gentle affective tactile interaction increases\ncomfort and lowers stress whereas high-intensity touch does not. We also find\nthat male participants exhibit more risk-taking behaviour than females while\nbeing less stressed. 
Based on this experiment, a second study is used to\nascertain whether these effects are caused by the social nature of tactile\ninteraction or by the physical interaction alone. For this, instead of a social\nrobot, participants now have a tactile interaction with a non-social device.\nThe non-social interaction does not result in any effect, leading us to\nconclude that tactile interaction with humanoid robots is a social phenomenon\nrather than a mere physical phenomenon.\n","authors":["Qiaoqiao Ren","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2411.12549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08234v2","updated":"2024-11-19T14:44:36Z","published":"2024-06-12T14:01:12Z","title":"MaIL: Improving Imitation Learning with Mamba","summary":" This work presents Mamba Imitation Learning (MaIL), a novel imitation\nlearning (IL) architecture that provides an alternative to state-of-the-art\n(SoTA) Transformer-based policies. MaIL leverages Mamba, a state-space model\ndesigned to selectively focus on key features of the data. While Transformers\nare highly effective in data-rich environments due to their dense attention\nmechanisms, they can struggle with smaller datasets, often leading to\noverfitting or suboptimal representation learning. In contrast, Mamba's\narchitecture enhances representation learning efficiency by focusing on key\nfeatures and reducing model complexity. This approach mitigates overfitting and\nenhances generalization, even when working with limited data. Extensive\nevaluations on the LIBERO benchmark demonstrate that MaIL consistently\noutperforms Transformers on all LIBERO tasks with limited data and matches\ntheir performance when the full dataset is available. Additionally, MaIL's\neffectiveness is validated through its superior performance in three real robot\nexperiments. 
Our code is available at https://github.com/ALRhub/MaIL.\n","authors":["Xiaogang Jia","Qian Wang","Atalay Donat","Bowen Xing","Ge Li","Hongyi Zhou","Onur Celik","Denis Blessing","Rudolf Lioutikov","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2406.08234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12535v1","updated":"2024-11-19T14:33:47Z","published":"2024-11-19T14:33:47Z","title":"Multilayer occupancy grid for obstacle avoidance in an autonomous ground\n vehicle using RGB-D camera","summary":" This work describes the process of integrating a depth camera into the\nnavigation system of a self-driving ground vehicle (SDV) and the implementation\nof a multilayer costmap that enhances the vehicle's obstacle identification\nprocess by expanding its two-dimensional field of view, based on 2D LIDAR, to a\nthree-dimensional perception system using an RGB-D camera. This approach lays\nthe foundation for a robust vision-based navigation and obstacle detection\nsystem. A theoretical review is presented and implementation results are\ndiscussed for future work.\n","authors":["Jhair S. Gallego","Ricardo E. Ramirez"],"pdf_url":"https://arxiv.org/pdf/2411.12535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12520v1","updated":"2024-11-19T14:07:17Z","published":"2024-11-19T14:07:17Z","title":"VMGNet: A Low Computational Complexity Robotic Grasping Network Based on\n VMamba with Multi-Scale Feature Fusion","summary":" While deep learning-based robotic grasping technology has demonstrated strong\nadaptability, its computational complexity has also significantly increased,\nmaking it unsuitable for scenarios with high real-time requirements. Therefore,\nwe propose a low computational complexity and high accuracy model named VMGNet\nfor robotic grasping. For the first time, we introduce the Visual State Space\ninto the robotic grasping field to achieve linear computational complexity,\nthereby greatly reducing the model's computational cost. 
Meanwhile, to improve\nthe accuracy of the model, we propose an efficient and lightweight multi-scale\nfeature fusion module, named Fusion Bridge Module, to extract and fuse\ninformation at different scales. We also present a new loss function\ncalculation method to enhance the importance differences between subtasks,\nimproving the model's fitting ability. Experiments show that VMGNet has only\n8.7G Floating Point Operations and an inference time of 8.1 ms on our devices.\nVMGNet also achieved state-of-the-art performance on the Cornell and Jacquard\npublic datasets. To validate VMGNet's effectiveness in practical applications,\nwe conducted real grasping experiments in multi-object scenarios, and VMGNet\nachieved an excellent performance with a 94.4% success rate in real-world\ngrasping tasks. The video for the real-world robotic grasping experiments is\navailable at https://youtu.be/S-QHBtbmLc4.\n","authors":["Yuhao Jin","Qizhong Gao","Xiaohui Zhu","Yong Yue","Eng Gee Lim","Yuqing Chen","Prudence Wong","Yijie Chu"],"pdf_url":"https://arxiv.org/pdf/2411.12520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12503v1","updated":"2024-11-19T13:42:18Z","published":"2024-11-19T13:42:18Z","title":"ManiSkill-ViTac 2025: Challenge on Manipulation Skill Learning With\n Vision and Tactile Sensing","summary":" This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on\nlearning contact-rich manipulation skills using both tactile and visual\nsensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3\nindependent tracks: tactile manipulation, tactile-vision fusion manipulation,\nand tactile sensor structure design. The challenge aims to push the boundaries\nof robotic manipulation skills, emphasizing the integration of tactile and\nvisual data to enhance performance in complex, real-world tasks. 
Participants\nwill be evaluated using standardized metrics across both simulated and\nreal-world environments, spurring innovations in sensor design and\nsignificantly advancing the field of vision-tactile fusion in robotics.\n","authors":["Chuanyu Li","Renjun Dang","Xiang Li","Zhiyuan Wu","Jing Xu","Hamidreza Kasaei","Roberto Calandra","Nathan Lepora","Shan Luo","Hao Su","Rui Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12503v1.pdf","comment":"Challenge webpage:\n https://ai-workshops.github.io/maniskill-vitac-challenge-2025/"},{"id":"http://arxiv.org/abs/2411.12478v1","updated":"2024-11-19T13:00:47Z","published":"2024-11-19T13:00:47Z","title":"Robotic transcatheter tricuspid valve replacement with hybrid enhanced\n intelligence: a new paradigm and first-in-vivo study","summary":" Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for\ntricuspid regurgitation and is in the early stages of clinical adoption.\nIntelligent robotic approaches are expected to overcome the challenges of\nsurgical manipulation and widespread dissemination, but systems and protocols\nwith high clinical utility have not yet been reported. In this study, we\npropose a complete solution that includes a passive stabilizer, robotic drive,\ndetachable delivery catheter and valve manipulation mechanism. Working towards\nautonomy, a hybrid augmented intelligence approach based on reinforcement\nlearning, Monte Carlo probabilistic maps and human-robot co-piloted control was\nintroduced. Systematic tests in phantom and first-in-vivo animal experiments\nwere performed to verify that the system design met the clinical requirement.\nFurthermore, the experimental results confirmed the advantages of co-piloted\ncontrol over conventional master-slave control in terms of time efficiency,\ncontrol efficiency, autonomy and stability of operation. 
In conclusion, this\nstudy provides a comprehensive pathway for robotic TTVR and, to our knowledge,\ncompletes the first animal study that not only successfully demonstrates the\napplication of hybrid enhanced intelligence in interventional robotics, but\nalso provides a solution with high application value for a cutting-edge\nprocedure.\n","authors":["Shuangyi Wang","Haichuan Lin","Yiping Xie","Ziqi Wang","Dong Chen","Longyue Tan","Xilong Hou","Chen Chen","Xiao-Hu Zhou","Shengtao Lin","Fei Pan","Kent Chak-Yu So","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2411.12478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12425v1","updated":"2024-11-19T11:23:19Z","published":"2024-11-19T11:23:19Z","title":"Behaviour diversity in a walking and climbing centipede-like virtual\n creature","summary":" Robot controllers are often optimised for a single robot in a single\nenvironment. This approach proves brittle, as such a controller will often fail\nto produce sensible behavior for a new morphology or environment. In\ncomparison, animal gaits are robust and versatile. By observing animals, and\nattempting to extract general principles of locomotion from their movement, we\naim to design a single decentralised controller applicable to diverse\nmorphologies and environments. The controller implements the three components\n1) undulation, 2) peristalsis, and 3) leg motion, which we believe are the\nessential elements in most animal gaits. The controller is tested on a variety\nof simulated centipede-like robots. The centipede is chosen as inspiration\nbecause it moves using both body contractions and legged locomotion. For a\ncontroller to work in qualitatively different settings, it must also be able to\nexhibit qualitatively different behaviors. We find that six different modes of\nlocomotion emerge from our controller in response to environmental and\nmorphological changes. 
We also find that different parts of the centipede model\ncan exhibit different modes of locomotion, simultaneously, based on local\nmorphological features. This controller can potentially aid in the design or\nevolution of robots, by quickly testing the potential of a morphology, or be\nused to get insights about underlying locomotion principles in the centipede.\n","authors":["Emma Stensby Norstein","Kotaro Yasui","Takeshi Kano","Akio Ishiguro","Kyrre Glette"],"pdf_url":"https://arxiv.org/pdf/2411.12425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17394v2","updated":"2024-11-19T10:27:37Z","published":"2024-04-26T13:14:28Z","title":"Child Speech Recognition in Human-Robot Interaction: Problem Solved?","summary":" Automated Speech Recognition shows superhuman performance for adult English\nspeech on a range of benchmarks, but disappoints when fed children's speech.\nThis has long sat in the way of child-robot interaction. Recent evolutions in\ndata-driven speech recognition, including the availability of Transformer\narchitectures and unprecedented volumes of training data, might mean a\nbreakthrough for child speech recognition and social robot applications aimed\nat children. We revisit a study on child speech recognition from 2017 and show\nthat indeed performance has increased, with newcomer OpenAI Whisper doing\nmarkedly better than leading commercial cloud services. Performance improves\neven more in highly structured interactions when priming models with specific\nphrases. 
While transcription is not perfect yet, the best model recognises\n60.3% of sentences correctly barring small grammatical differences, with\nsub-second transcription time running on a local GPU, showing potential for\nusable autonomous child-robot speech interactions.\n","authors":["Ruben Janssens","Eva Verhelst","Giulio Antonio Abbo","Qiaoqiao Ren","Maria Jose Pinto Bernal","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2404.17394v2.pdf","comment":"Submitted to 2024 International Conference on Social Robotics"},{"id":"http://arxiv.org/abs/2411.09623v2","updated":"2024-11-19T10:15:56Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. 
The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11616v2","updated":"2024-11-19T10:11:04Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. 
The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v2.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2409.16718v2","updated":"2024-11-19T09:27:37Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. 
We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.12361v1","updated":"2024-11-19T09:20:51Z","published":"2024-11-19T09:20:51Z","title":"Breathless: An 8-hour Performance Contrasting Human and Robot\n Expressiveness","summary":" This paper describes the robot technology behind an original performance that\npairs a human dancer (Cuan) with an industrial robot arm for an eight-hour\ndance that unfolds over the timespan of an American workday. To control the\nrobot arm, we combine a range of sinusoidal motions with varying amplitude,\nfrequency and offset at each joint to evoke human motions common in physical\nlabor such as stirring, digging, and stacking. More motions were developed\nusing deep learning techniques for video-based human-pose tracking and\nextraction. We combine these pre-recorded motions with improvised robot motions\ncreated live by putting the robot into teach-mode and triggering force sensing\nfrom the robot joints onstage. All motions are combined with commercial and\noriginal music using a custom suite of python software with AppleScript,\nKeynote, and Zoom to facilitate on-stage communication with the dancer. The\nresulting performance contrasts the expressivity of the human body with the\nprecision of robot machinery. 
Video, code and data are available on the project\nwebsite: https://sites.google.com/playing.studio/breathless\n","authors":["Catie Cuan","Tianshuang Qiu","Shreya Ganti","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2411.12361v1.pdf","comment":"15 pages, 9 figures, accepted for ISRR (International Symposium of\n Robotics Research) 2024"},{"id":"http://arxiv.org/abs/2411.12359v1","updated":"2024-11-19T09:19:35Z","published":"2024-11-19T09:19:35Z","title":"TactV: A Class of Hybrid Terrestrial/Aerial Coaxial Tilt-Rotor Vehicles","summary":" To enhance the obstacle-crossing and endurance capabilities of vehicles\noperating in complex environments, this paper presents the design of a hybrid\nterrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates\nadvantages such as lightweight construction and high maneuverability. Unlike\nexisting tandem dual-rotor vehicles, TactV employs a tiltable coaxial\ndual-rotor design and features a spherical cage structure that encases the\nbody, allowing for omnidirectional movement while further reducing its overall\ndimensions. To enable TactV to maneuver flexibly in aerial, planar, and\ninclined surfaces, we established corresponding dynamic and control models for\neach mode. Additionally, we leveraged TactV's tiltable center of gravity to\ndesign energy-saving and high-mobility modes for ground operations, thereby\nfurther enhancing its endurance. Experimental designs for both aerial and\nground tests corroborated the superiority of TactV's movement capabilities and\ncontrol strategies.\n","authors":["Yifei Dong","Yimin Zhu","Lixian Zhang","Yihang Ding"],"pdf_url":"https://arxiv.org/pdf/2411.12359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11607v2","updated":"2024-11-19T08:52:04Z","published":"2024-11-18T14:29:22Z","title":"Performance evaluation of a ROS2 based Automated Driving System","summary":" Automated driving is currently a prominent area of scientific work. 
In the\nfuture, highly automated driving and new Advanced Driver Assistance Systems\nwill become reality. While Advanced Driver Assistance Systems and automated\ndriving functions for certain domains are already commercially available,\nubiquitous automated driving in complex scenarios remains a subject of ongoing\nresearch. Contrarily to single-purpose Electronic Control Units, the software\nfor automated driving is often executed on high performance PCs. The Robot\nOperating System 2 (ROS2) is commonly used to connect components in an\nautomated driving system. Due to the time critical nature of automated driving\nsystems, the performance of the framework is especially important. In this\npaper, a thorough performance evaluation of ROS2 is conducted, both in terms of\ntimeliness and error rate. The results show that ROS2 is a suitable framework\nfor automated driving systems.\n","authors":["Jorin Kouril","Bernd Schäufele","Ilja Radusch","Bettina Schnor"],"pdf_url":"https://arxiv.org/pdf/2411.11607v2.pdf","comment":"Published and presented at VEHITS 2024, Proceedings of the 10th\n International Conference on Vehicle Technology and Intelligent Transport\n Systems - VEHITS; 2024"},{"id":"http://arxiv.org/abs/2411.12338v1","updated":"2024-11-19T08:42:24Z","published":"2024-11-19T08:42:24Z","title":"Target Height Estimation Using a Single Acoustic Camera for Compensation\n in 2D Seabed Mosaicking","summary":" This letter proposes a novel approach for compensating target height data in\n2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras\nare effective sensors for sensing the marine environments due to their\nhigh-resolution imaging capabilities and robustness to darkness and turbidity.\nHowever, the loss of elevation angle during the imaging process results in a\nlack of target height information in the original acoustic camera images,\nleading to a simplistic 2D representation of the seabed mosaicking. 
In\nperceiving cluttered and unexplored marine environments, target height data is\ncrucial for avoiding collisions with marine robots. This study proposes a novel\napproach for estimating seabed target height using a single acoustic camera and\nintegrates height data into 2D seabed mosaicking to compensate for the missing\n3D dimension of seabed targets. Unlike classic methods that model the loss of\nelevation angle to achieve seabed 3D reconstruction, this study focuses on\nutilizing available acoustic cast shadow clues and simple sensor motion to\nquickly estimate target height. The feasibility of our proposal is verified\nthrough a water tank experiment and a simulation experiment.\n","authors":["Xiaoteng Zhou","Yusheng Wang","Katsunori Mizuno"],"pdf_url":"https://arxiv.org/pdf/2411.12338v1.pdf","comment":"8 pages,conference"},{"id":"http://arxiv.org/abs/2411.12310v1","updated":"2024-11-19T07:55:01Z","published":"2024-11-19T07:55:01Z","title":"Variable-Frequency Imitation Learning for Variable-Speed Motion","summary":" Conventional methods of imitation learning for variable-speed motion have\ndifficulty extrapolating speeds because they rely on learning models running at\na constant sampling frequency. This study proposes variable-frequency imitation\nlearning (VFIL), a novel method for imitation learning with learning models\ntrained to run at variable sampling frequencies along with the desired speeds\nof motion. The experimental results showed that the proposed method improved\nthe velocity-wise accuracy along both the interpolated and extrapolated\nfrequency labels, in addition to a 12.5 % increase in the overall success rate.\n","authors":["Nozomu Masuya","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2411.12310v1.pdf","comment":"7 pages, 9 figures, 2 tables. 
Submitted to IEEE ICM 2025"},{"id":"http://arxiv.org/abs/2411.12308v1","updated":"2024-11-19T07:49:22Z","published":"2024-11-19T07:49:22Z","title":"SNN-Based Online Learning of Concepts and Action Laws in an Open World","summary":" We present the architecture of a fully autonomous, bio-inspired cognitive\nagent built around a spiking neural network (SNN) implementing the agent's\nsemantic memory. The agent explores its universe and learns concepts of\nobjects/situations and of its own actions in a one-shot manner. While\nobject/situation concepts are unary, action concepts are triples made up of an\ninitial situation, a motor activity, and an outcome. They embody the agent's\nknowledge of its universe's actions laws. Both kinds of concepts have different\ndegrees of generality. To make decisions the agent queries its semantic memory\nfor the expected outcomes of envisaged actions and chooses the action to take\non the basis of these predictions. Our experiments show that the agent handles\nnew situations by appealing to previously learned general concepts and rapidly\nmodifies its concepts to adapt to environment changes.\n","authors":["Christel Grimaud","Dominique Longin","Andreas Herzig"],"pdf_url":"https://arxiv.org/pdf/2411.12308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12286v1","updated":"2024-11-19T07:12:48Z","published":"2024-11-19T07:12:48Z","title":"GLOVER: Generalizable Open-Vocabulary Affordance Reasoning for\n Task-Oriented Grasping","summary":" Inferring affordable (i.e., graspable) parts of arbitrary objects based on\nhuman specifications is essential for robots advancing toward open-vocabulary\nmanipulation. Current grasp planners, however, are hindered by limited\nvision-language comprehension and time-consuming 3D radiance modeling,\nrestricting real-time, open-vocabulary interactions with objects. 
To address\nthese limitations, we propose GLOVER, a unified Generalizable Open-Vocabulary\nAffordance Reasoning framework, which fine-tunes the Large Language Models\n(LLMs) to predict visual affordance of graspable object parts within RGB\nfeature space. We compile a dataset of over 10,000 images from human-object\ninteractions, annotated with unified visual and linguistic affordance labels,\nto enable multi-modal fine-tuning. GLOVER inherits world knowledge and\ncommon-sense reasoning from LLMs, facilitating more fine-grained object\nunderstanding and sophisticated tool-use reasoning. To enable effective\nreal-world deployment, we present Affordance-Aware Grasping Estimation (AGE), a\nnon-parametric grasp planner that aligns the gripper pose with a superquadric\nsurface derived from affordance data. In evaluations across 30 real-world\nscenes, GLOVER achieves success rates of 86.0% in part identification and 76.3%\nin grasping, with speeds approximately 330 times faster in affordance reasoning\nand 40 times faster in grasping pose estimation than the previous\nstate-of-the-art.\n","authors":["Teli Ma","Zifan Wang","Jiaming Zhou","Mengmeng Wang","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2411.12286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12255v1","updated":"2024-11-19T06:09:09Z","published":"2024-11-19T06:09:09Z","title":"Error-Feedback Model for Output Correction in Bilateral Control-Based\n Imitation Learning","summary":" In recent years, imitation learning using neural networks has enabled robots\nto perform flexible tasks. However, since neural networks operate in a\nfeedforward structure, they do not possess a mechanism to compensate for output\nerrors. To address this limitation, we developed a feedback mechanism to\ncorrect these errors. By employing a hierarchical structure for neural networks\ncomprising lower and upper layers, the lower layer was controlled to follow the\nupper layer. 
Additionally, using a multi-layer perceptron in the lower layer,\nwhich lacks an internal state, enhanced the error feedback. In the\ncharacter-writing task, this model demonstrated improved accuracy in writing\npreviously untrained characters. In the character-writing task, this model\ndemonstrated improved accuracy in writing previously untrained characters.\nThrough autonomous control with error feedback, we confirmed that the lower\nlayer could effectively track the output of the upper layer. This study\nrepresents a promising step toward integrating neural networks with control\ntheories.\n","authors":["Hiroshi Sato","Masashi Konosu","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2411.12255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12250v1","updated":"2024-11-19T05:52:51Z","published":"2024-11-19T05:52:51Z","title":"ADV2E: Bridging the Gap Between Analogue Circuit and Discrete Frames in\n the Video-to-Events Simulator","summary":" Event cameras operate fundamentally differently from traditional Active Pixel\nSensor (APS) cameras, offering significant advantages. Recent research has\ndeveloped simulators to convert video frames into events, addressing the\nshortage of real event datasets. Current simulators primarily focus on the\nlogical behavior of event cameras. However, the fundamental analogue properties\nof pixel circuits are seldom considered in simulator design. The gap between\nanalogue pixel circuit and discrete video frames causes the degeneration of\nsynthetic events, particularly in high-contrast scenes. In this paper, we\npropose a novel method of generating reliable event data based on a detailed\nanalysis of the pixel circuitry in event cameras. We incorporate the analogue\nproperties of event camera pixel circuits into the simulator design: (1)\nanalogue filtering of signals from light intensity to events, and (2) a cutoff\nfrequency that is independent of video frame rate. 
Experimental results on two\nrelevant tasks, including semantic segmentation and image reconstruction,\nvalidate the reliability of simulated event data, even in high-contrast scenes.\nThis demonstrates that deep neural networks exhibit strong generalization from\nsimulated to real event data, confirming that the synthetic events generated by\nthe proposed method are both realistic and well-suited for effective training.\n","authors":["Xiao Jiang","Fei Zhou","Jiongzhi Lin"],"pdf_url":"https://arxiv.org/pdf/2411.12250v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12206v1","updated":"2024-11-19T03:49:57Z","published":"2024-11-19T03:49:57Z","title":"Safe Navigation in Dynamic Environments using Density Functions","summary":" This work uses density functions for safe navigation in dynamic environments.\nThe dynamic environment consists of time-varying obstacles as well as\ntime-varying target sets. We propose an analytical construction of time-varying\ndensity functions to solve these navigation problems. The proposed approach\nleads to a time-varying feedback controller obtained as a positive gradient of\nthe density function. This paper's main contribution is providing convergence\nproof using the analytically constructed density function for safe navigation\nin the presence of a dynamic obstacle set and time-varying target set. The\nresults are the first of this kind developed for a system with integrator\ndynamics and open up the possibility for application to systems with more\ncomplex dynamics using methods based on control density function and inverse\nkinematic-based control design. 
We present the application of the developed\napproach for collision avoidance in multi-agent systems and robotic systems.\nWhile the theoretical results are produced for first-order integrator systems,\nwe demonstrate how the framework can be applied for systems with non-trivial\ndynamics, such as Dubin's car model and fully actuated Euler-Lagrange system\nwith robotics applications.\n","authors":["Sriram S. K. S Narayanan","Joseph Moyalan","Umesh Vaidya"],"pdf_url":"https://arxiv.org/pdf/2411.12206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08162v2","updated":"2024-11-19T03:30:11Z","published":"2024-07-11T03:47:14Z","title":"Improving Visual Place Recognition Based Robot Navigation By Verifying\n Localization Estimates","summary":" Visual Place Recognition (VPR) systems often have imperfect performance,\naffecting the `integrity' of position estimates and subsequent robot navigation\ndecisions. Previously, SVM classifiers have been used to monitor VPR integrity.\nThis research introduces a novel Multi-Layer Perceptron (MLP) integrity monitor\nwhich demonstrates improved performance and generalizability, removing\nper-environment training and reducing manual tuning requirements. We test our\nproposed system in extensive real-world experiments, presenting two real-time\nintegrity-based VPR verification methods: a single-query rejection method for\nrobot navigation to a goal zone (Experiment 1); and a history-of-queries method\nthat takes a best, verified, match from its recent trajectory and uses an\nodometer to extrapolate a current position estimate (Experiment 2). Noteworthy\nresults for Experiment 1 include a decrease in aggregate mean along-track goal\nerror from ~9.8m to ~3.1m, and an increase in the aggregate rate of successful\nmission completion from ~41% to ~55%. Experiment 2 showed a decrease in\naggregate mean along-track localization error from ~2.0m to ~0.5m, and an\nincrease in the aggregate localization precision from ~97% to ~99%. 
Overall,\nour results demonstrate the practical usefulness of a VPR integrity monitor in\nreal-world robotics to improve VPR localization and consequent navigation\nperformance.\n","authors":["Owen Claxton","Connor Malone","Helen Carson","Jason Ford","Gabe Bolton","Iman Shames","Michael Milford"],"pdf_url":"https://arxiv.org/pdf/2407.08162v2.pdf","comment":"Author Accepted Preprint for Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.12185v1","updated":"2024-11-19T02:56:51Z","published":"2024-11-19T02:56:51Z","title":"LiV-GS: LiDAR-Vision Integration for 3D Gaussian Splatting SLAM in\n Outdoor Environments","summary":" We present LiV-GS, a LiDAR-visual SLAM system in outdoor environments that\nleverages 3D Gaussian as a differentiable spatial representation. Notably,\nLiV-GS is the first method that directly aligns discrete and sparse LiDAR data\nwith continuous differentiable Gaussian maps in large-scale outdoor scenes,\novercoming the limitation of fixed resolution in traditional LiDAR mapping. The\nsystem aligns point clouds with Gaussian maps using shared covariance\nattributes for front-end tracking and integrates the normal orientation into\nthe loss function to refines the Gaussian map. To reliably and stably update\nGaussians outside the LiDAR field of view, we introduce a novel conditional\nGaussian constraint that aligns these Gaussians closely with the nearest\nreliable ones. The targeted adjustment enables LiV-GS to achieve fast and\naccurate mapping with novel view synthesis at a rate of 7.98 FPS. Extensive\ncomparative experiments demonstrate LiV-GS's superior performance in SLAM,\nimage rendering and mapping. 
The successful cross-modal radar-LiDAR\nlocalization highlights the potential of LiV-GS for applications in cross-modal\nsemantic positioning and object segmentation with Gaussian maps.\n","authors":["Renxiang Xiao","Wei Liu","Yushuai Chen","Liang Hu"],"pdf_url":"https://arxiv.org/pdf/2411.12185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12175v1","updated":"2024-11-19T02:39:57Z","published":"2024-11-19T02:39:57Z","title":"AsynEIO: Asynchronous Monocular Event-Inertial Odometry Using Gaussian\n Process Regression","summary":" Event cameras, when combined with inertial sensors, show significant\npotential for motion estimation in challenging scenarios, such as high-speed\nmaneuvers and low-light environments. There are many methods for producing such\nestimations, but most boil down to a synchronous discrete-time fusion problem.\nHowever, the asynchronous nature of event cameras and their unique fusion\nmechanism with inertial sensors remain underexplored. In this paper, we\nintroduce a monocular event-inertial odometry method called AsynEIO, designed\nto fuse asynchronous event and inertial data within a unified Gaussian Process\n(GP) regression framework. Our approach incorporates an event-driven frontend\nthat tracks feature trajectories directly from raw event streams at a high\ntemporal resolution. These tracked feature trajectories, along with various\ninertial factors, are integrated into the same GP regression framework to\nenable asynchronous fusion. With deriving analytical residual Jacobians and\nnoise models, our method constructs a factor graph that is iteratively\noptimized and pruned using a sliding-window optimizer. Comparative assessments\nhighlight the performance of different inertial fusion strategies, suggesting\noptimal choices for varying conditions. 
Experimental results on both public\ndatasets and our own event-inertial sequences indicate that AsynEIO outperforms\nexisting methods, especially in high-speed and low-illumination scenarios.\n","authors":["Zhixiang Wang","Xudong Li","Yizhai Zhang","Fan Zhang"," Panfeng"],"pdf_url":"https://arxiv.org/pdf/2411.12175v1.pdf","comment":"Submitted to IEEE (2024-11-4)"},{"id":"http://arxiv.org/abs/2411.12155v1","updated":"2024-11-19T01:23:52Z","published":"2024-11-19T01:23:52Z","title":"Reinforcement Learning with Action Sequence for Data-Efficient Robot\n Learning","summary":" Training reinforcement learning (RL) agents on robotic tasks typically\nrequires a large number of training samples. This is because training data\noften consists of noisy trajectories, whether from exploration or\nhuman-collected demonstrations, making it difficult to learn value functions\nthat understand the effect of taking each action. On the other hand, recent\nbehavior-cloning (BC) approaches have shown that predicting a sequence of\nactions enables policies to effectively approximate noisy, multi-modal\ndistributions of expert demonstrations. Can we use a similar idea for improving\nRL on robotic tasks? In this paper, we introduce a novel RL algorithm that\nlearns a critic network that outputs Q-values over a sequence of actions. By\nexplicitly training the value functions to learn the consequence of executing a\nseries of current and future actions, our algorithm allows for learning useful\nvalue functions from noisy trajectories. We study our algorithm across various\nsetups with sparse and dense rewards, and with or without demonstrations,\nspanning mobile bi-manual manipulation, whole-body control, and tabletop\nmanipulation tasks from BiGym, HumanoidBench, and RLBench. 
We find that, by\nlearning the critic network with action sequences, our algorithm outperforms\nvarious RL and BC baselines, in particular on challenging humanoid control\ntasks.\n","authors":["Younggyo Seo","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2411.12155v1.pdf","comment":"17 Pages. Website: https://younggyo.me/cqn-as/"},{"id":"http://arxiv.org/abs/2403.15993v2","updated":"2024-11-19T01:00:14Z","published":"2024-03-24T03:10:18Z","title":"Robust-Locomotion-by-Logic: Perturbation-Resilient Bipedal Locomotion\n via Signal Temporal Logic Guided Model Predictive Control","summary":" This study introduces a robust planning framework that utilizes a model\npredictive control (MPC) approach, enhanced by incorporating signal temporal\nlogic (STL) specifications. This marks the first-ever study to apply STL-guided\ntrajectory optimization for bipedal locomotion, specifically designed to handle\nboth translational and orientational perturbations. Existing recovery\nstrategies often struggle with reasoning complex task logic and evaluating\nlocomotion robustness systematically, making them susceptible to failures\ncaused by inappropriate recovery strategies or lack of robustness. To address\nthese issues, we design an analytical stability metric for bipedal locomotion\nand quantify this metric using STL specifications, which guide the generation\nof recovery trajectories to achieve maximum robustness degree. To enable safe\nand computational-efficient crossed-leg maneuver, we design data-driven\nself-leg-collision constraints that are $1000$ times faster than the\ntraditional inverse-kinematics-based approach. Our framework outperforms a\nstate-of-the-art locomotion controller, a standard MPC without STL, and a\nlinear-temporal-logic-based planner in a high-fidelity dynamic simulation,\nespecially in scenarios involving crossed-leg maneuvers. 
Additionally, the\nCassie bipedal robot achieves robust performance under horizontal and\norientational perturbations such as those observed in ship motions. These\nenvironments are validated in simulations and deployed on hardware.\nFurthermore, our proposed method demonstrates versatility on stepping stones\nand terrain-agnostic features on inclined terrains.\n","authors":["Zhaoyuan Gu","Yuntian Zhao","Yipu Chen","Rongming Guo","Jennifer K. Leestma","Gregory S. Sawicki","Ye Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.15993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12150v1","updated":"2024-11-19T00:56:35Z","published":"2024-11-19T00:56:35Z","title":"HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation\n in Crowded and Constrained Environments","summary":" We study the problem of robot navigation in dense and interactive crowds with\nenvironmental constraints such as corridors and furniture. Previous methods\nfail to consider all types of interactions among agents and obstacles, leading\nto unsafe and inefficient robot paths. In this article, we leverage a\ngraph-based representation of crowded and constrained scenarios and propose a\nstructured framework to learn robot navigation policies with deep reinforcement\nlearning. We first split the representations of different components in the\nenvironment and propose a heterogeneous spatio-temporal (st) graph to model\ndistinct interactions among humans, robots, and obstacles. Based on the\nheterogeneous st-graph, we propose HEIGHT, a novel navigation policy network\narchitecture with different components to capture heterogeneous interactions\namong entities through space and time. HEIGHT utilizes attention mechanisms to\nprioritize important interactions and a recurrent network to track changes in\nthe dynamic scene over time, encouraging the robot to avoid collisions\nadaptively. 
Through extensive simulation and real-world experiments, we\ndemonstrate that HEIGHT outperforms state-of-the-art baselines in terms of\nsuccess and efficiency in challenging navigation scenarios. Furthermore, we\ndemonstrate that our pipeline achieves better zero-shot generalization\ncapability than previous works when the densities of humans and obstacles\nchange. More videos are available at\nhttps://sites.google.com/view/crowdnav-height/home.\n","authors":["Shuijing Liu","Haochen Xia","Fatemeh Cheraghi Pouria","Kaiwen Hong","Neeloy Chakraborty","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2411.12150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05249v2","updated":"2024-11-19T00:05:02Z","published":"2024-04-08T07:25:25Z","title":"SAFE-GIL: SAFEty Guided Imitation Learning for Robotic Systems","summary":" Behavior cloning (BC) is a widely-used approach in imitation learning, where\na robot learns a control policy by observing an expert supervisor. However, the\nlearned policy can make errors and might lead to safety violations, which\nlimits their utility in safety-critical robotics applications. While prior\nworks have tried improving a BC policy via additional real or synthetic action\nlabels, adversarial training, or runtime filtering, none of them explicitly\nfocus on reducing the BC policy's safety violations during training time. We\npropose SAFE-GIL, a design-time method to learn safety-aware behavior cloning\npolicies. SAFE-GIL deliberately injects adversarial disturbance in the system\nduring data collection to guide the expert towards safety-critical states. This\ndisturbance injection simulates potential policy errors that the system might\nencounter during the test time. By ensuring that training more closely\nreplicates expert behavior in safety-critical states, our approach results in\nsafer policies despite policy errors during the test time. 
We further develop a\nreachability-based method to compute this adversarial disturbance. We compare\nSAFE-GIL with various behavior cloning techniques and online safety-filtering\nmethods in three domains: autonomous ground navigation, aircraft taxiing, and\naerial navigation on a quadrotor testbed. Our method demonstrates a significant\nreduction in safety failures, particularly in low data regimes where the\nlikelihood of learning errors, and therefore safety violations, is higher. See\nour website here: https://y-u-c.github.io/safegil/\n","authors":["Yusuf Umut Ciftci","Darren Chiu","Zeyuan Feng","Gaurav S. Sukhatme","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2404.05249v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.11844v2","updated":"2024-11-19T18:59:42Z","published":"2024-11-18T18:59:31Z","title":"Generative World Explorer","summary":" Planning with partial observation is a central challenge in embodied AI. A\nmajority of prior works have tackled this challenge by developing agents that\nphysically explore their environment to update their beliefs about the world\nstate. In contrast, humans can $\\textit{imagine}$ unseen parts of the world\nthrough a mental exploration and $\\textit{revise}$ their beliefs with imagined\nobservations. Such updated beliefs can allow them to make more informed\ndecisions, without necessitating the physical exploration of the world at all\ntimes. To achieve this human-like ability, we introduce the $\\textit{Generative\nWorld Explorer (Genex)}$, an egocentric world exploration framework that allows\nan agent to mentally explore a large-scale 3D world (e.g., urban scenes) and\nacquire imagined observations to update its belief. This updated belief will\nthen help the agent to make a more informed decision at the current step. 
To\ntrain $\\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB.\nOur experimental results demonstrate that (1) $\\textit{Genex}$ can generate\nhigh-quality and consistent observations during long-horizon exploration of a\nlarge virtual physical world and (2) the beliefs updated with the generated\nobservations can inform an existing decision-making model (e.g., an LLM agent)\nto make better plans.\n","authors":["Taiming Lu","Tianmin Shu","Alan Yuille","Daniel Khashabi","Jieneng Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11844v2.pdf","comment":"Website: generative-world-explorer.github.io"},{"id":"http://arxiv.org/abs/2411.12724v1","updated":"2024-11-19T18:45:16Z","published":"2024-11-19T18:45:16Z","title":"Heuristic-Free Multi-Teacher Learning","summary":" We introduce Teacher2Task, a novel framework for multi-teacher learning that\neliminates the need for manual aggregation heuristics. Existing multi-teacher\nmethods typically rely on such heuristics to combine predictions from multiple\nteachers, often resulting in sub-optimal aggregated labels and the propagation\nof aggregation errors. Teacher2Task addresses these limitations by introducing\nteacher-specific input tokens and reformulating the training process. Instead\nof relying on aggregated labels, the framework transforms the training data,\nconsisting of ground truth labels and annotations from N teachers, into N+1\ndistinct tasks: N auxiliary tasks that predict the labeling styles of the N\nindividual teachers, and one primary task that focuses on the ground truth\nlabels. 
This approach, drawing upon principles from multiple learning\nparadigms, demonstrates strong empirical results across a range of\narchitectures, modalities, and tasks.\n","authors":["Huy Thong Nguyen","En-Hung Chu","Lenord Melvix","Jazon Jiao","Chunglin Wen","Benjamin Louie"],"pdf_url":"https://arxiv.org/pdf/2411.12724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12713v1","updated":"2024-11-19T18:27:31Z","published":"2024-11-19T18:27:31Z","title":"CATCH: Complementary Adaptive Token-level Contrastive Decoding to\n Mitigate Hallucinations in LVLMs","summary":" Large Vision-Language Model (LVLM) systems have demonstrated impressive\nvision-language reasoning capabilities but suffer from pervasive and severe\nhallucination issues, posing significant risks in critical domains such as\nhealthcare and autonomous systems. Despite previous efforts to mitigate\nhallucinations, a persistent issue remains: visual defect from vision-language\nmisalignment, creating a bottleneck in visual processing capacity. To address\nthis challenge, we develop Complementary Adaptive Token-level Contrastive\nDecoding to Mitigate Hallucinations in LVLMs (CATCH), based on the Information\nBottleneck theory. CATCH introduces Complementary Visual Decoupling (CVD) for\nvisual information separation, Non-Visual Screening (NVS) for hallucination\ndetection, and Adaptive Token-level Contrastive Decoding (ATCD) for\nhallucination mitigation. CATCH addresses issues related to visual defects that\ncause diminished fine-grained feature perception and cumulative hallucinations\nin open-ended scenarios. 
It is applicable to various visual question-answering\ntasks without requiring any specific data or prior knowledge, and generalizes\nrobustly to new tasks without additional training, opening new possibilities\nfor advancing LVLM in various challenging applications.\n","authors":["Zhehan Kan","Ce Zhang","Zihan Liao","Yapeng Tian","Wenming Yang","Junyuan Xiao","Xu Li","Dongmei Jiang","Yaowei Wang","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2411.12713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12707v1","updated":"2024-11-19T18:22:25Z","published":"2024-11-19T18:22:25Z","title":"Barttender: An approachable & interpretable way to compare medical\n imaging and non-imaging data","summary":" Imaging-based deep learning has transformed healthcare research, yet its\nclinical adoption remains limited due to challenges in comparing imaging models\nwith traditional non-imaging and tabular data. To bridge this gap, we introduce\nBarttender, an interpretable framework that uses deep learning for the direct\ncomparison of the utility of imaging versus non-imaging tabular data for tasks\nlike disease prediction.\n Barttender converts non-imaging tabular features, such as scalar data from\nelectronic health records, into grayscale bars, facilitating an interpretable\nand scalable deep learning based modeling of both data modalities. Our\nframework allows researchers to evaluate differences in utility through\nperformance measures, as well as local (sample-level) and global\n(population-level) explanations. We introduce a novel measure to define global\nfeature importances for image-based deep learning models, which we call gIoU.\nExperiments on the CheXpert and MIMIC datasets with chest X-rays and scalar\ndata from electronic health records show that Barttender performs comparably to\ntraditional methods and offers enhanced explainability using deep learning\nmodels.\n","authors":["Ayush Singla","Shakson Isaac","Chirag J. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2411.12707v1.pdf","comment":"Accepted to the Proceedings Track at Machine Learning for Health\n (ML4H 2024) conference, held on December 15-16, 2024 in Vancouver, Canada"},{"id":"http://arxiv.org/abs/2411.12593v1","updated":"2024-11-19T18:04:13Z","published":"2024-11-19T18:04:13Z","title":"AdaCM$^2$: On Understanding Extremely Long-Term Video with Adaptive\n Cross-Modality Memory Reduction","summary":" The advancements in large language models (LLMs) have propelled the\nimprovement of video understanding tasks by incorporating LLMs with visual\nmodels. However, most existing LLM-based models (e.g., VideoLLaMA, VideoChat)\nare constrained to processing short-duration videos. Recent attempts to\nunderstand long-term videos by extracting and compressing visual features into\na fixed memory size. Nevertheless, those methods leverage only visual modality\nto merge video tokens and overlook the correlation between visual and textual\nqueries, leading to difficulties in effectively handling complex\nquestion-answering tasks. To address the challenges of long videos and complex\nprompts, we propose AdaCM$^2$, which, for the first time, introduces an\nadaptive cross-modality memory reduction approach to video-text alignment in an\nauto-regressive manner on video streams. 
Our extensive experiments on various\nvideo understanding tasks, such as video captioning, video question answering,\nand video classification, demonstrate that AdaCM$^2$ achieves state-of-the-art\nperformance across multiple datasets while significantly reducing memory usage.\nNotably, it achieves a 4.5% improvement across multiple tasks in the LVU\ndataset with a GPU memory consumption reduction of up to 65%.\n","authors":["Yuanbin Man","Ying Huang","Chengming Zhang","Bingzhe Li","Wei Niu","Miao Yin"],"pdf_url":"https://arxiv.org/pdf/2411.12593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10979v2","updated":"2024-11-19T17:46:27Z","published":"2024-11-17T06:23:46Z","title":"VidComposition: Can MLLMs Analyze Compositions in Compiled Videos?","summary":" The advancement of Multimodal Large Language Models (MLLMs) has enabled\nsignificant progress in multimodal understanding, expanding their capacity to\nanalyze video content. However, existing evaluation benchmarks for MLLMs\nprimarily focus on abstract video comprehension, lacking a detailed assessment\nof their ability to understand video compositions, the nuanced interpretation\nof how visual elements combine and interact within highly compiled video\ncontexts. We introduce VidComposition, a new benchmark specifically designed to\nevaluate the video composition understanding capabilities of MLLMs using\ncarefully curated compiled videos and cinematic-level annotations.\nVidComposition includes 982 videos with 1706 multiple-choice questions,\ncovering various compositional aspects such as camera movement, angle, shot\nsize, narrative structure, character actions and emotions, etc. Our\ncomprehensive evaluation of 33 open-source and proprietary MLLMs reveals a\nsignificant performance gap between human and model capabilities. This\nhighlights the limitations of current MLLMs in understanding complex, compiled\nvideo compositions and offers insights into areas for further improvement. 
The\nleaderboard and evaluation code are available at\nhttps://yunlong10.github.io/VidComposition/.\n","authors":["Yunlong Tang","Junjia Guo","Hang Hua","Susan Liang","Mingqian Feng","Xinyang Li","Rui Mao","Chao Huang","Jing Bi","Zeliang Zhang","Pooyan Fazli","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12681v1","updated":"2024-11-19T17:39:03Z","published":"2024-11-19T17:39:03Z","title":"AI Guided Early Screening of Cervical Cancer","summary":" In order to support the creation of reliable machine learning models for\nanomaly detection, this project focuses on preprocessing, enhancing, and\norganizing a medical imaging dataset. There are two classifications in the\ndataset: normal and abnormal, along with extra noise fluctuations. In order to\nimprove the photographs' quality, undesirable artifacts, including visible\nmedical equipment at the edges, were eliminated using central cropping.\nAdjusting the brightness and contrast was one of the additional preprocessing\nprocesses. Normalization was then performed to normalize the data. To make\nclassification jobs easier, the dataset was methodically handled by combining\nseveral image subsets into two primary categories: normal and pathological. To\nprovide a strong training set that adapts well to real-world situations,\nsophisticated picture preprocessing techniques were used, such as contrast\nenhancement and real-time augmentation (including rotations, zooms, and\nbrightness modifications). To guarantee efficient model evaluation, the data\nwas subsequently divided into training and testing subsets. In order to create\nprecise and effective machine learning models for medical anomaly detection,\nhigh-quality input data is ensured via this thorough approach. 
Because of the\nproject pipeline's flexible and scalable design, it can be easily integrated\nwith bigger clinical decision-support systems.\n","authors":["Dharanidharan S I","Suhitha Renuka S V","Ajishi Singh","Sheena Christabel Pravin"],"pdf_url":"https://arxiv.org/pdf/2411.12681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12678v1","updated":"2024-11-19T17:31:36Z","published":"2024-11-19T17:31:36Z","title":"Deep Learning-Driven Heat Map Analysis for Evaluating thickness of\n Wounded Skin Layers","summary":" Understanding the appropriate skin layer thickness in wounded sites is an\nimportant tool to move forward on wound healing practices and treatment\nprotocols. Methods to measure depth often are invasive and less specific. This\npaper introduces a novel method that is non-invasive with deep learning\ntechniques using classifying of skin layers that helps in measurement of wound\ndepth through heatmap analysis. A set of approximately 200 labeled images of\nskin allows five classes to be distinguished: scars, wounds, and healthy skin,\namong others. Each image has annotated key layers, namely the stratum cornetum,\nthe epidermis, and the dermis, in the software Roboflow. In the preliminary\nstage, the Heatmap generator VGG16 was used to enhance the visibility of tissue\nlayers, based upon which their annotated images were used to train ResNet18\nwith early stopping techniques. It ended up at a very high accuracy rate of\n97.67%. To do this, the comparison of the models ResNet18, VGG16, DenseNet121,\nand EfficientNet has been done where both EfficientNet and ResNet18 have\nattained accuracy rates of almost 95.35%. For further hyperparameter tuning,\nEfficientNet and ResNet18 were trained at six different learning rates to\ndetermine the best model configuration. It has been noted that the accuracy has\nhuge variations with different learning rates. In the case of EfficientNet, the\nmaximum achievable accuracy was 95.35% at the rate of 0.0001. 
The same was true\nfor ResNet18, which also attained its peak value of 95.35% at the same rate.\nThese facts indicate that the model can be applied and utilized in actual-time,\nnon-invasive wound assessment, which holds a great promise to improve clinical\ndiagnosis and treatment planning.\n","authors":["Devakumar GR","JB Kaarthikeyan","Dominic Immanuel T","Sheena Christabel Pravin"],"pdf_url":"https://arxiv.org/pdf/2411.12678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12676v1","updated":"2024-11-19T17:29:59Z","published":"2024-11-19T17:29:59Z","title":"IoT-Based 3D Pose Estimation and Motion Optimization for Athletes:\n Application of C3D and OpenPose","summary":" This study proposes the IoT-Enhanced Pose Optimization Network (IE-PONet) for\nhigh-precision 3D pose estimation and motion optimization of track and field\nathletes. IE-PONet integrates C3D for spatiotemporal feature extraction,\nOpenPose for real-time keypoint detection, and Bayesian optimization for\nhyperparameter tuning. Experimental results on NTURGB+D and FineGYM datasets\ndemonstrate superior performance, with AP\\(^p50\\) scores of 90.5 and 91.0, and\nmAP scores of 74.3 and 74.0, respectively. Ablation studies confirm the\nessential roles of each module in enhancing model accuracy. IE-PONet provides a\nrobust tool for athletic performance analysis and optimization, offering\nprecise technical insights for training and injury prevention. 
Future work will\nfocus on further model optimization, multimodal data integration, and\ndeveloping real-time feedback mechanisms to enhance practical applications.\n","authors":["Fei Ren","Chao Ren","Tianyi Lyu"],"pdf_url":"https://arxiv.org/pdf/2411.12676v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2411.12667v1","updated":"2024-11-19T17:19:20Z","published":"2024-11-19T17:19:20Z","title":"Machine Learning Approaches on Crop Pattern Recognition a Comparative\n Analysis","summary":" Monitoring agricultural activities is important to ensure food security.\nRemote sensing plays a significant role for large-scale continuous monitoring\nof cultivation activities. Time series remote sensing data were used for the\ngeneration of the cropping pattern. Classification algorithms are used to\nclassify crop patterns and mapped agriculture land used. Some conventional\nclassification methods including support vector machine (SVM) and decision\ntrees were applied for crop pattern recognition. However, in this paper, we are\nproposing Deep Neural Network (DNN) based classification to improve the\nperformance of crop pattern recognition and make a comparative analysis with\ntwo (2) other machine learning approaches including Naive Bayes and Random\nForest.\n","authors":["Kazi Hasibul Kabir","Md. Zahiruddin Aqib","Sharmin Sultana","Shamim Akhter"],"pdf_url":"https://arxiv.org/pdf/2411.12667v1.pdf","comment":"Published in ICNTET2018: International Conference on New Trends in\n Engineering & Technology Tirupathi Highway, Tiruvallur Dist Chennai, India,\n September 7-8, 2018"},{"id":"http://arxiv.org/abs/2411.12663v1","updated":"2024-11-19T17:16:31Z","published":"2024-11-19T17:16:31Z","title":"PoM: Efficient Image and Video Generation with the Polynomial Mixer","summary":" Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous\nto generate high quality images and videos. 
However, encoding an image or a\nvideo as a sequence of patches results in costly attention patterns, as the\nrequirements both in terms of memory and compute grow quadratically. To\nalleviate this problem, we propose a drop-in replacement for MHA called the\nPolynomial Mixer (PoM) that has the benefit of encoding the entire sequence\ninto an explicit state. PoM has a linear complexity with respect to the number\nof tokens. This explicit state also allows us to generate frames in a\nsequential fashion, minimizing memory and compute requirement, while still\nbeing able to train in parallel. We show the Polynomial Mixer is a universal\nsequence-to-sequence approximator, just like regular MHA. We adapt several\nDiffusion Transformers (DiT) for generating images and videos with PoM\nreplacing MHA, and we obtain high quality samples while using less\ncomputational resources. The code is available at\nhttps://github.com/davidpicard/HoMM.\n","authors":["David Picard","Nicolas Dufour"],"pdf_url":"https://arxiv.org/pdf/2411.12663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12635v1","updated":"2024-11-19T16:49:24Z","published":"2024-11-19T16:49:24Z","title":"M3D: Dual-Stream Selective State Spaces and Depth-Driven Framework for\n High-Fidelity Single-View 3D Reconstruction","summary":" The precise reconstruction of 3D objects from a single RGB image in complex\nscenes presents a critical challenge in virtual reality, autonomous driving,\nand robotics. Existing neural implicit 3D representation methods face\nsignificant difficulties in balancing the extraction of global and local\nfeatures, particularly in diverse and complex environments, leading to\ninsufficient reconstruction precision and quality. We propose M3D, a novel\nsingle-view 3D reconstruction framework, to tackle these challenges. 
This\nframework adopts a dual-stream feature extraction strategy based on Selective\nState Spaces to effectively balance the extraction of global and local\nfeatures, thereby improving scene comprehension and representation precision.\nAdditionally, a parallel branch extracts depth information, effectively\nintegrating visual and geometric features to enhance reconstruction quality and\npreserve intricate details. Experimental results indicate that the fusion of\nmulti-scale features with depth information via the dual-branch feature\nextraction significantly boosts geometric consistency and fidelity, achieving\nstate-of-the-art reconstruction performance.\n","authors":["Luoxi Zhang","Pragyan Shrestha","Yu Zhou","Chun Xie","Itaru Kitahara"],"pdf_url":"https://arxiv.org/pdf/2411.12635v1.pdf","comment":"9 pages, 4 figures, submitted to CVPR 2025 for review"},{"id":"http://arxiv.org/abs/2411.12633v1","updated":"2024-11-19T16:45:52Z","published":"2024-11-19T16:45:52Z","title":"Instant Policy: In-Context Imitation Learning via Graph Diffusion","summary":" Following the impressive capabilities of in-context learning with large\ntransformers, In-Context Imitation Learning (ICIL) is a promising opportunity\nfor robotics. We introduce Instant Policy, which learns new tasks instantly\n(without further training) from just one or two demonstrations, achieving ICIL\nthrough two key components. First, we introduce inductive biases through a\ngraph representation and model ICIL as a graph generation problem with a\nlearned diffusion process, enabling structured reasoning over demonstrations,\nobservations, and actions. Second, we show that such a model can be trained\nusing pseudo-demonstrations - arbitrary trajectories generated in simulation -\nas a virtually infinite pool of training data. Simulated and real experiments\nshow that Instant Policy enables rapid learning of various everyday robot\ntasks. 
We also show how it can serve as a foundation for cross-embodiment and\nzero-shot transfer to language-defined tasks. Code and videos are available at\nhttps://www.robot-learning.uk/instant-policy.\n","authors":["Vitalis Vosylius","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2411.12633v1.pdf","comment":"Code and videos are available on our project webpage at\n https://www.robot-learning.uk/instant-policy"},{"id":"http://arxiv.org/abs/2409.20098v2","updated":"2024-11-19T16:45:34Z","published":"2024-09-30T08:50:22Z","title":"DIG-FACE: De-biased Learning for Generalized Facial Expression Category\n Discovery","summary":" We introduce a novel task, Generalized Facial Expression Category Discovery\n(G-FACE), that discovers new, unseen facial expressions while recognizing known\ncategories effectively. Even though there are generalized category discovery\nmethods for natural images, they show compromised performance on G-FACE. We\nidentified two biases that affect the learning: implicit bias, coming from an\nunderlying distributional gap between new categories in unlabeled data and\nknown categories in labeled data, and explicit bias, coming from shifted\npreference on explicit visual facial change characteristics from known\nexpressions to unknown expressions. By addressing the challenges caused by both\nbiases, we propose a Debiased G-FACE method, namely DIG-FACE, that facilitates\nthe debiasing of both implicit and explicit biases. In the implicit debiasing\nprocess of DIG-FACE, we devise a novel learning strategy that aims at\nestimating and minimizing the upper bound of implicit bias. In the explicit\ndebiasing process, we optimize the model's ability to handle nuanced visual\nfacial expression data by introducing a hierarchical category-discrimination\nrefinement strategy: sample-level, triplet-level, and distribution-level\noptimizations. 
Extensive experiments demonstrate that our DIG-FACE\nsignificantly enhances recognition accuracy for both known and new categories,\nsetting a first-of-its-kind standard for the task.\n","authors":["Tingzhang Luo","Yichao Liu","Yuanyuan Liu","Andi Zhang","Xin Wang","Yibing Zhan","Chang Tang","Leyuan Liu","Zhe Chen"],"pdf_url":"https://arxiv.org/pdf/2409.20098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11636v3","updated":"2024-11-19T16:27:45Z","published":"2024-06-17T15:16:18Z","title":"Feasibility of Federated Learning from Client Databases with Different\n Brain Diseases and MRI Modalities","summary":" Segmentation models for brain lesions in MRI are typically developed for a\nspecific disease and trained on data with a predefined set of MRI modalities.\nSuch models cannot segment the disease using data with a different set of MRI\nmodalities, nor can they segment other types of diseases. Moreover, this\ntraining paradigm prevents a model from using the advantages of learning from\nheterogeneous databases that may contain scans and segmentation labels for\ndifferent brain pathologies and diverse sets of MRI modalities. Additionally,\nthe confidentiality of patient data often prevents central data aggregation,\nnecessitating a decentralized approach. Is it feasible to use Federated\nLearning (FL) to train a single model on client databases that contain scans\nand labels of different brain pathologies and diverse sets of MRI modalities?\nWe demonstrate promising results by combining appropriate, simple, and\npractical modifications to the model and training strategy: Designing a model\nwith input channels that cover the whole set of modalities available across\nclients, training with random modality drop, and exploring the effects of\nfeature normalization methods. 
Evaluation on 7 brain MRI databases with 5\ndifferent diseases shows that this FL framework can train a single model\nachieving very promising results in segmenting all disease types seen during\ntraining. Importantly, it can segment these diseases in new databases that\ncontain sets of modalities different from those in training clients. These\nresults demonstrate, for the first time, the feasibility and effectiveness of\nusing FL to train a single 3D segmentation model on decentralised data with\ndiverse brain diseases and MRI modalities, a necessary step towards leveraging\nheterogeneous real-world databases. Code:\nhttps://github.com/FelixWag/FedUniBrain\n","authors":["Felix Wagner","Wentian Xu","Pramit Saha","Ziyun Liang","Daniel Whitehouse","David Menon","Virginia Newcombe","Natalie Voets","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.11636v3.pdf","comment":"Accepted as a conference paper at WACV 2025"},{"id":"http://arxiv.org/abs/2411.12620v1","updated":"2024-11-19T16:27:31Z","published":"2024-11-19T16:27:31Z","title":"Maps from Motion (MfM): Generating 2D Semantic Maps from Sparse\n Multi-view Images","summary":" World-wide detailed 2D maps require enormous collective efforts.\nOpenStreetMap is the result of 11 million registered users manually annotating\nthe GPS location of over 1.75 billion entries, including distinctive landmarks\nand common urban objects. At the same time, manual annotations can include\nerrors and are slow to update, limiting the map's accuracy. Maps from Motion\n(MfM) is a step forward to automatize such time-consuming map making procedure\nby computing 2D maps of semantic objects directly from a collection of\nuncalibrated multi-view images. From each image, we extract a set of object\ndetections, and estimate their spatial arrangement in a top-down local map\ncentered in the reference frame of the camera that captured the image. 
Aligning\nthese local maps is not a trivial problem, since they provide incomplete, noisy\nfragments of the scene, and matching detections across them is unreliable\nbecause of the presence of repeated pattern and the limited appearance\nvariability of urban objects. We address this with a novel graph-based\nframework, that encodes the spatial and semantic distribution of the objects\ndetected in each image, and learns how to combine them to predict the objects'\nposes in a global reference system, while taking into account all possible\ndetection matches and preserving the topology observed in each image. Despite\nthe complexity of the problem, our best model achieves global 2D registration\nwith an average accuracy within 4 meters (i.e., below GPS accuracy) even on\nsparse sequences with strong viewpoint change, on which COLMAP has an 80%\nfailure rate. We provide extensive evaluation on synthetic and real-world data,\nshowing how the method obtains a solution even in scenarios where standard\noptimization techniques fail.\n","authors":["Matteo Toso","Stefano Fiorini","Stuart James","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2411.12620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12615v1","updated":"2024-11-19T16:20:27Z","published":"2024-11-19T16:20:27Z","title":"A Multimodal Approach Combining Structural and Cross-domain Textual\n Guidance for Weakly Supervised OCT Segmentation","summary":" Accurate segmentation of Optical Coherence Tomography (OCT) images is crucial\nfor diagnosing and monitoring retinal diseases. However, the labor-intensive\nnature of pixel-level annotation limits the scalability of supervised learning\nwith large datasets. Weakly Supervised Semantic Segmentation (WSSS) provides a\npromising alternative by leveraging image-level labels. 
In this study, we\npropose a novel WSSS approach that integrates structural guidance with\ntext-driven strategies to generate high-quality pseudo labels, significantly\nimproving segmentation performance. In terms of visual information, our method\nemploys two processing modules that exchange raw image features and structural\nfeatures from OCT images, guiding the model to identify where lesions are\nlikely to occur. In terms of textual information, we utilize large-scale\npretrained models from cross-domain sources to implement label-informed textual\nguidance and synthetic descriptive integration with two textual processing\nmodules that combine local semantic features with consistent synthetic\ndescriptions. By fusing these visual and textual components within a multimodal\nframework, our approach enhances lesion localization accuracy. Experimental\nresults on three OCT datasets demonstrate that our method achieves\nstate-of-the-art performance, highlighting its potential to improve diagnostic\naccuracy and efficiency in medical imaging.\n","authors":["Jiaqi Yang","Nitish Mehta","Xiaoling Hu","Chao Chen","Chia-Ling Tsai"],"pdf_url":"https://arxiv.org/pdf/2411.12615v1.pdf","comment":"21 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2411.12604v1","updated":"2024-11-19T16:07:58Z","published":"2024-11-19T16:07:58Z","title":"SG-LRA: Self-Generating Automatic Scoliosis Cobb Angle Measurement with\n Low-Rank Approximation","summary":" Automatic Cobb angle measurement from X-ray images is crucial for scoliosis\nscreening and diagnosis. However, most existing regression-based methods and\nsegmentation-based methods struggle with inaccurate spine representations or\nmask connectivity/fragmentation issues. Besides, landmark-based methods suffer\nfrom insufficient training data and annotations. 
To address these challenges,\nwe propose a novel framework including Self-Generation pipeline and Low-Rank\nApproximation representation (SG-LRA) for automatic Cobb angle measurement.\nSpecifically, we propose a parameterized spine contour representation based on\nLRA, which enables eigen-spine decomposition and spine contour reconstruction.\nWe can directly obtain spine contour with only regressed LRA coefficients,\nwhich form a more accurate spine representation than rectangular boxes. Also,\nwe combine LRA coefficient regression with anchor box classification to solve\ninaccurate predictions and mask connectivity issues. Moreover, we develop a\ndata engine with automatic annotation and automatic selection in an iterative\nmanner, which is trained on a private Spinal2023 dataset. With our data engine,\nwe generate the largest scoliosis X-ray dataset named Spinal-AI2024 largely\nwithout privacy leaks. Extensive experiments on public AASCE2019, private\nSpinal2023, and generated Spinal-AI2024 datasets demonstrate that our method\nachieves state-of-the-art Cobb angle measurement performance. Our code and\nSpinal-AI2024 dataset are available at https://github.com/Ernestchenchen/SG-LRA\nand https://github.com/Ernestchenchen/Spinal-AI2024, respectively.\n","authors":["Zhiwen Shao","Yichen Yuan","Lizhuang Ma","Dit-Yan Yeung","Xiaojia Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.12604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12603v1","updated":"2024-11-19T16:06:32Z","published":"2024-11-19T16:06:32Z","title":"STREAM: A Universal State-Space Model for Sparse Geometric Data","summary":" Handling sparse and unstructured geometric data, such as point clouds or\nevent-based vision, is a pressing challenge in the field of machine vision.\nRecently, sequence models such as Transformers and state-space models entered\nthe domain of geometric data. These methods require specialized preprocessing\nto create a sequential view of a set of points. 
Furthermore, prior works\ninvolving sequence models iterate geometric data with either uniform or learned\nstep sizes, implicitly relying on the model to infer the underlying geometric\nstructure. In this work, we propose to encode geometric structure explicitly\ninto the parameterization of a state-space model. State-space models are based\non linear dynamics governed by a one-dimensional variable such as time or a\nspatial coordinate. We exploit this dynamic variable to inject relative\ndifferences of coordinates into the step size of the state-space model. The\nresulting geometric operation computes interactions between all pairs of N\npoints in O(N) steps. Our model deploys the Mamba selective state-space model\nwith a modified CUDA kernel to efficiently map sparse geometric data to modern\nhardware. The resulting sequence model, which we call STREAM, achieves\ncompetitive results on a range of benchmarks from point-cloud classification to\nevent-based vision and audio classification. STREAM demonstrates a powerful\ninductive bias for sparse geometric data by improving the PointMamba baseline\nwhen trained from scratch on the ModelNet40 and ScanObjectNN point cloud\nanalysis datasets. It further achieves, for the first time, 100% test accuracy\non all 11 classes of the DVS128 Gestures dataset.\n","authors":["Mark Schöne","Yash Bhisikar","Karan Bania","Khaleelulla Khan Nazeer","Christian Mayr","Anand Subramoney","David Kappel"],"pdf_url":"https://arxiv.org/pdf/2411.12603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12602v1","updated":"2024-11-19T16:06:21Z","published":"2024-11-19T16:06:21Z","title":"SAM Carries the Burden: A Semi-Supervised Approach Refining Pseudo\n Labels for Medical Segmentation","summary":" Semantic segmentation is a crucial task in medical imaging. Although\nsupervised learning techniques have proven to be effective in performing this\ntask, they heavily depend on large amounts of annotated training data. 
The\nrecently introduced Segment Anything Model (SAM) enables prompt-based\nsegmentation and offers zero-shot generalization to unfamiliar objects. In our\nwork, we leverage SAM's abstract object understanding for medical image\nsegmentation to provide pseudo labels for semi-supervised learning, thereby\nmitigating the need for extensive annotated training data. Our approach refines\ninitial segmentations that are derived from a limited amount of annotated data\n(comprising up to 43 cases) by extracting bounding boxes and seed points as\nprompts forwarded to SAM. Thus, it enables the generation of dense segmentation\nmasks as pseudo labels for unlabelled data. The results show that training with\nour pseudo labels yields an improvement in Dice score from $74.29\\,\\%$ to\n$84.17\\,\\%$ and from $66.63\\,\\%$ to $74.87\\,\\%$ for the segmentation of bones\nof the paediatric wrist and teeth in dental radiographs, respectively. As a\nresult, our method outperforms intensity-based post-processing methods,\nstate-of-the-art supervised learning for segmentation (nnU-Net), and the\nsemi-supervised mean teacher approach. Our Code is available on GitHub.\n","authors":["Ron Keuth","Lasse Hansen","Maren Balks","Ronja Jäger","Anne-Nele Schröder","Ludger Tüshaus","Mattias Heinrich"],"pdf_url":"https://arxiv.org/pdf/2411.12602v1.pdf","comment":"Presented at MICCAI Workshop on Advancing Data Solutions in Medical\n Imaging AI 2024; Code and data:\n https://github.com/multimodallearning/SamCarriesTheBurden"},{"id":"http://arxiv.org/abs/2411.05879v2","updated":"2024-11-19T16:00:19Z","published":"2024-11-08T04:53:55Z","title":"Smile upon the Face but Sadness in the Eyes: Emotion Recognition based\n on Facial Expressions and Eye Behaviors","summary":" Emotion Recognition (ER) is the process of identifying human emotions from\ngiven data. 
Currently, the field heavily relies on facial expression\nrecognition (FER) because facial expressions contain rich emotional cues.\nHowever, it is important to note that facial expressions may not always\nprecisely reflect genuine emotions and FER-based results may yield misleading\nER. To understand and bridge this gap between FER and ER, we introduce eye\nbehaviors as an important emotional cues for the creation of a new\nEye-behavior-aided Multimodal Emotion Recognition (EMER) dataset. Different\nfrom existing multimodal ER datasets, the EMER dataset employs a stimulus\nmaterial-induced spontaneous emotion generation method to integrate\nnon-invasive eye behavior data, like eye movements and eye fixation maps, with\nfacial videos, aiming to obtain natural and accurate human emotions. Notably,\nfor the first time, we provide annotations for both ER and FER in the EMER,\nenabling a comprehensive analysis to better illustrate the gap between both\ntasks. Furthermore, we specifically design a new EMERT architecture to\nconcurrently enhance performance in both ER and FER by efficiently identifying\nand bridging the emotion gap between the two.Specifically, our EMERT employs\nmodality-adversarial feature decoupling and multi-task Transformer to augment\nthe modeling of eye behaviors, thus providing an effective complement to facial\nexpressions. In the experiment, we introduce seven multimodal benchmark\nprotocols for a variety of comprehensive evaluations of the EMER dataset. The\nresults show that the EMERT outperforms other state-of-the-art multimodal\nmethods by a great margin, revealing the importance of modeling eye behaviors\nfor robust ER. 
To sum up, we provide a comprehensive analysis of the importance\nof eye behaviors in ER, advancing the study on addressing the gap between FER\nand ER for more robust ER performance.\n","authors":["Yuanyuan Liu","Lin Wei","Kejun Liu","Yibing Zhan","Zijing Chen","Zhe Chen","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2411.05879v2.pdf","comment":"The paper is part of ongoing work and we request to withdraw it from\n arXiv to revise it further. And The paper was submitted without agreement\n from all co-authors"},{"id":"http://arxiv.org/abs/2402.19145v2","updated":"2024-11-19T15:54:14Z","published":"2024-02-29T13:29:10Z","title":"A SAM-guided Two-stream Lightweight Model for Anomaly Detection","summary":" In industrial anomaly detection, model efficiency and mobile-friendliness\nbecome the primary concerns in real-world applications. Simultaneously, the\nimpressive generalization capabilities of Segment Anything (SAM) have garnered\nbroad academic attention, making it an ideal choice for localizing unseen\nanomalies and diverse real-world patterns. In this paper, considering these two\ncritical factors, we propose a SAM-guided Two-stream Lightweight Model for\nunsupervised anomaly detection (STLM) that not only aligns with the two\npractical application requirements but also harnesses the robust generalization\ncapabilities of SAM. We employ two lightweight image encoders, i.e., our\ntwo-stream lightweight module, guided by SAM's knowledge. To be specific, one\nstream is trained to generate discriminative and general feature\nrepresentations in both normal and anomalous regions, while the other stream\nreconstructs the same images without anomalies, which effectively enhances the\ndifferentiation of two-stream representations when facing anomalous regions.\nFurthermore, we employ a shared mask decoder and a feature aggregation module\nto generate anomaly maps. 
Our experiments conducted on MVTec AD benchmark show\nthat STLM, with about 16M parameters and achieving an inference time in 20ms,\ncompetes effectively with state-of-the-art methods in terms of performance,\n98.26% on pixel-level AUC and 94.92% on PRO. We further experiment on more\ndifficult datasets, e.g., VisA and DAGM, to demonstrate the effectiveness and\ngeneralizability of STLM.\n","authors":["Chenghao Li","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2402.19145v2.pdf","comment":"Accepted by ACM TOMM"},{"id":"http://arxiv.org/abs/2406.18558v2","updated":"2024-11-19T15:50:24Z","published":"2024-05-27T15:14:09Z","title":"BAISeg: Boundary Assisted Weakly Supervised Instance Segmentation","summary":" How to extract instance-level masks without instance-level supervision is the\nmain challenge of weakly supervised instance segmentation (WSIS). Popular WSIS\nmethods estimate a displacement field (DF) via learning inter-pixel relations\nand perform clustering to identify instances. However, the resulting instance\ncentroids are inherently unstable and vary significantly across different\nclustering algorithms. In this paper, we propose Boundary-Assisted Instance\nSegmentation (BAISeg), which is a novel paradigm for WSIS that realizes\ninstance segmentation with pixel-level annotations. BAISeg comprises an\ninstance-aware boundary detection (IABD) branch and a semantic segmentation\nbranch. The IABD branch identifies instances by predicting class-agnostic\ninstance boundaries rather than instance centroids, therefore, it is different\nfrom previous DF-based approaches. In particular, we proposed the Cascade\nFusion Module (CFM) and the Deep Mutual Attention (DMA) in the IABD branch to\nobtain rich contextual information and capture instance boundaries with weak\nresponses. During the training phase, we employed Pixel-to-Pixel Contrast to\nenhance the discriminative capacity of the IABD branch. 
This further\nstrengthens the continuity and closedness of the instance boundaries. Extensive\nexperiments on PASCAL VOC 2012 and MS COCO demonstrate the effectiveness of our\napproach, and we achieve considerable performance with only pixel-level\nannotations. The code will be available at https://github.com/wsis-seg/BAISeg.\n","authors":["Tengbo Wang","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2406.18558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12575v1","updated":"2024-11-19T15:42:48Z","published":"2024-11-19T15:42:48Z","title":"Stochastic BIQA: Median Randomized Smoothing for Certified Blind Image\n Quality Assessment","summary":" Most modern No-Reference Image-Quality Assessment (NR-IQA) metrics are based\non neural networks vulnerable to adversarial attacks. Attacks on such metrics\nlead to incorrect image/video quality predictions, which poses significant\nrisks, especially in public benchmarks. Developers of image processing\nalgorithms may unfairly increase the score of a target IQA metric without\nimproving the actual quality of the adversarial image. Although some empirical\ndefenses for IQA metrics were proposed, they do not provide theoretical\nguarantees and may be vulnerable to adaptive attacks. This work focuses on\ndeveloping a provably robust no-reference IQA metric. 
Our method is based on\nMedian Smoothing (MS) combined with an additional convolution denoiser with\nranking loss to improve the SROCC and PLCC scores of the defended IQA metric.\nCompared with two prior methods on three datasets, our method exhibited\nsuperior SROCC and PLCC scores while maintaining comparable certified\nguarantees.\n","authors":["Ekaterina Shumitskaya","Mikhail Pautov","Dmitriy Vatolin","Anastasia Antsiferova"],"pdf_url":"https://arxiv.org/pdf/2411.12575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11613v2","updated":"2024-11-19T15:36:38Z","published":"2024-11-18T14:35:01Z","title":"Leveraging Computational Pathology AI for Noninvasive Optical Imaging\n Analysis Without Retraining","summary":" Noninvasive optical imaging modalities can probe patient's tissue in 3D and\nover time generate gigabytes of clinically relevant data per sample. There is a\nneed for AI models to analyze this data and assist clinical workflow. The lack\nof expert labelers and the large dataset required (>100,000 images) for model\ntraining and tuning are the main hurdles in creating foundation models. In this\npaper we introduce FoundationShift, a method to apply any AI model from\ncomputational pathology without retraining. We show our method is more accurate\nthan state of the art models (SAM, MedSAM, SAM-Med2D, CellProfiler, Hover-Net,\nPLIP, UNI and ChatGPT), with multiple imaging modalities (OCT and RCM). This is\nachieved without the need for model retraining or fine-tuning. Applying our\nmethod to noninvasive in vivo images could enable physicians to readily\nincorporate optical imaging modalities into their clinical practice, providing\nreal time tissue analysis and improving patient care.\n","authors":["Danny Barash","Emilie Manning","Aidan Van Vleck","Omri Hirsch","Kyi Lei Aye","Jingxi Li","Philip O. Scumpia","Aydogan Ozcan","Sumaira Aasi","Kerri E. Rieger","Kavita Y. 
Sarin","Oren Freifeld","Yonatan Winetraub"],"pdf_url":"https://arxiv.org/pdf/2411.11613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12560v1","updated":"2024-11-19T15:23:59Z","published":"2024-11-19T15:23:59Z","title":"Topological Symmetry Enhanced Graph Convolution for Skeleton-Based\n Action Recognition","summary":" Skeleton-based action recognition has achieved remarkable performance with\nthe development of graph convolutional networks (GCNs). However, most of these\nmethods tend to construct complex topology learning mechanisms while neglecting\nthe inherent symmetry of the human body. Additionally, the use of temporal\nconvolutions with certain fixed receptive fields limits their capacity to\neffectively capture dependencies in time sequences. To address the issues, we\n(1) propose a novel Topological Symmetry Enhanced Graph Convolution (TSE-GC) to\nenable distinct topology learning across different channel partitions while\nincorporating topological symmetry awareness and (2) construct a Multi-Branch\nDeformable Temporal Convolution (MBDTC) for skeleton-based action recognition.\nThe proposed TSE-GC emphasizes the inherent symmetry of the human body while\nenabling efficient learning of dynamic topologies. Meanwhile, the design of\nMBDTC introduces the concept of deformable modeling, leading to more flexible\nreceptive fields and stronger modeling capacity of temporal dependencies.\nCombining TSE-GC with MBDTC, our final model, TSE-GCN, achieves competitive\nperformance with fewer parameters compared with state-of-the-art methods on\nthree large datasets, NTU RGB+D, NTU RGB+D 120, and NW-UCLA. 
On the\ncross-subject and cross-set evaluations of NTU RGB+D 120, the accuracies of our\nmodel reach 90.0\\% and 91.1\\%, with 1.1M parameters and 1.38 GFLOPS for one\nstream.\n","authors":["Zeyu Liang","Hailun Xia","Naichuan Zheng","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2411.12560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12966v4","updated":"2024-11-19T15:22:16Z","published":"2024-04-19T15:53:27Z","title":"Look Before You Decide: Prompting Active Deduction of MLLMs for\n Assumptive Reasoning","summary":" Recently, Multimodal Large Language Models (MLLMs) have achieved significant\nsuccess across multiple disciplines due to their exceptional\ninstruction-following capabilities and extensive world knowledge. However,\nwhether these MLLMs possess human-like compositional reasoning abilities\nremains an open problem. To unveil their reasoning behaviors, we first curate a\n\\textbf{M}ultimodal \\textbf{A}ssumptive \\textbf{R}ea\\textbf{s}oning Benchmark\n(MARS-Bench) in this paper. Interestingly, we find that most prevalent MLLMs\ncan be easily fooled by the introduction of a presupposition into the question,\nwhereas such presuppositions appear naive to human reasoning. Besides, we also\npropose a simple yet effective method, Active Deduction (AD), to encourage the\nmodel to actively perform composite deduction before reaching a final decision.\nEquipped with the proposed AD method, a MLLM demonstrates significant\nimprovements in assumptive reasoning abilities without compromising its\ngeneral-purpose question-answering performance. 
We also provide extensive\nevaluations of both open-source and private MLLMs on MARS-Bench, along with\nexperimental analyses of the AD method.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen","Na Zhao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.12966v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12558v1","updated":"2024-11-19T15:18:50Z","published":"2024-11-19T15:18:50Z","title":"Recall and Refine: A Simple but Effective Source-free Open-set Domain\n Adaptation Framework","summary":" Open-set Domain Adaptation (OSDA) aims to adapt a model from a labeled source\ndomain to an unlabeled target domain, where novel classes - also referred to as\ntarget-private unknown classes - are present. Source-free Open-set Domain\nAdaptation (SF-OSDA) methods address OSDA without accessing labeled source\ndata, making them particularly relevant under privacy constraints. However,\nSF-OSDA presents significant challenges due to distribution shifts and the\nintroduction of novel classes. Existing SF-OSDA methods typically rely on\nthresholding the prediction entropy of a sample to identify it as either a\nknown or unknown class but fail to explicitly learn discriminative features for\nthe target-private unknown classes. We propose Recall and Refine (RRDA), a\nnovel SF-OSDA framework designed to address these limitations by explicitly\nlearning features for target-private unknown classes. RRDA employs a two-step\nprocess. First, we enhance the model's capacity to recognize unknown classes by\ntraining a target classifier with an additional decision boundary, guided by\nsynthetic samples generated from target domain features. This enables the\nclassifier to effectively separate known and unknown classes. In the second\nstep, we adapt the entire model to the target domain, addressing both domain\nshifts and improving generalization to unknown classes. 
Any off-the-shelf\nsource-free domain adaptation method (e.g., SHOT, AaD) can be seamlessly\nintegrated into our framework at this stage. Extensive experiments on three\nbenchmark datasets demonstrate that RRDA significantly outperforms existing\nSF-OSDA and OSDA methods.\n","authors":["Ismail Nejjar","Hao Dong","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2411.12558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18238v3","updated":"2024-11-19T15:06:20Z","published":"2024-10-23T19:33:30Z","title":"CARLA2Real: a tool for reducing the sim2real gap in CARLA simulator","summary":" Simulators are indispensable for research in autonomous systems such as\nself-driving cars, autonomous robots and drones. Despite significant progress\nin various simulation aspects, such as graphical realism, an evident gap\npersists between the virtual and real-world environments. Since the ultimate\ngoal is to deploy the autonomous systems in the real world, closing the\nsim2real gap is of utmost importance. In this paper, we employ a\nstate-of-the-art approach to enhance the photorealism of simulated data,\naligning them with the visual characteristics of real-world datasets. Based on\nthis, we developed CARLA2Real, an easy-to-use, publicly available tool\n(plug-in) for the widely used and open-source CARLA simulator. This tool\nenhances the output of CARLA in near real-time, achieving a frame rate of 13\nFPS, translating it to the visual style and realism of real-world datasets such\nas Cityscapes, KITTI, and Mapillary Vistas. By employing the proposed tool, we\ngenerated synthetic datasets from both the simulator and the enhancement model\noutputs, including their corresponding ground truth annotations for tasks\nrelated to autonomous driving. Then, we performed a number of experiments to\nevaluate the impact of the proposed approach on feature extraction and semantic\nsegmentation methods when trained on the enhanced synthetic data. 
The results\ndemonstrate that the sim2real gap is significant and can indeed be reduced by\nthe introduced approach.\n","authors":["Stefanos Pasios","Nikos Nikolaidis"],"pdf_url":"https://arxiv.org/pdf/2410.18238v3.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2411.12547v1","updated":"2024-11-19T15:00:18Z","published":"2024-11-19T15:00:18Z","title":"S3TU-Net: Structured Convolution and Superpixel Transformer for Lung\n Nodule Segmentation","summary":" The irregular and challenging characteristics of lung adenocarcinoma nodules\nin computed tomography (CT) images complicate staging diagnosis, making\naccurate segmentation critical for clinicians to extract detailed lesion\ninformation. In this study, we propose a segmentation model, S3TU-Net, which\nintegrates multi-dimensional spatial connectors and a superpixel-based visual\ntransformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid\narchitecture, incorporating superpixel algorithms, structured weighting, and\nspatial shifting techniques to achieve superior segmentation performance. The\nmodel leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract\nmulti-scale local features while mitigating overfitting. To enhance multi-scale\nfeature fusion, we introduce the S2-MLP Link, integrating spatial shifting and\nattention mechanisms at the skip connections. Additionally, the residual-based\nsuperpixel visual transformer (RM-SViT) effectively merges global and local\nfeatures by employing sparse correlation learning and multi-branch attention to\ncapture long-range dependencies, with residual connections enhancing stability\nand computational efficiency. Experimental results on the LIDC-IDRI dataset\ndemonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%,\nand 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by\n4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2%\nincrease. 
In addition to comparison and ablation studies, we validated the\ngeneralization ability of our model on the EPDB private dataset, achieving a\nDSC of 86.40%.\n","authors":["Yuke Wu","Xiang Liu","Yunyu Shi","Xinyi Chen","Zhenglei Wang","YuQing Xu","Shuo Hong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.12547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09821v2","updated":"2024-11-19T14:57:40Z","published":"2024-11-14T21:53:46Z","title":"Automatic Classification of General Movements in Newborns","summary":" General movements (GMs) are spontaneous, coordinated body movements in\ninfants that offer valuable insights into the developing nervous system.\nAssessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors\nfor neurodevelopmental disorders. However, GMA requires specifically trained\nclinicians, who are limited in number. To scale up newborn screening, there is\na need for an algorithm that can automatically classify GMs from infant video\nrecordings. This data poses challenges, including variability in recording\nlength, device type, and setting, with each video coarsely annotated for\noverall movement quality. In this work, we introduce a tool for extracting\nfeatures from these recordings and explore various machine learning techniques\nfor automated GM classification.\n","authors":["Daphné Chopard","Sonia Laguna","Kieran Chin-Cheong","Annika Dietz","Anna Badura","Sven Wellmann","Julia E. 
Vogt"],"pdf_url":"https://arxiv.org/pdf/2411.09821v2.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages"},{"id":"http://arxiv.org/abs/2405.00448v3","updated":"2024-11-19T14:52:59Z","published":"2024-05-01T11:04:22Z","title":"MMTryon: Multi-Modal Multi-Reference Control for High-Quality Fashion\n Generation","summary":" This paper introduces MMTryon, a multi-modal multi-reference VIrtual Try-ON\n(VITON) framework, which can generate high-quality compositional try-on results\nby taking a text instruction and multiple garment images as inputs. Our MMTryon\naddresses three problems overlooked in prior literature: 1) Support of multiple\ntry-on items. Existing methods are commonly designed for single-item try-on\ntasks (e.g., upper/lower garments, dresses). 2)Specification of dressing style.\nExisting methods are unable to customize dressing styles based on instructions\n(e.g., zipped/unzipped, tuck-in/tuck-out, etc.) 3) Segmentation Dependency.\nThey further heavily rely on category-specific segmentation models to identify\nthe replacement regions, with segmentation errors directly leading to\nsignificant artifacts in the try-on results. To address the first two issues,\nour MMTryon introduces a novel multi-modality and multi-reference attention\nmechanism to combine the garment information from reference images and\ndressing-style information from text instructions. Besides, to remove the\nsegmentation dependency, MMTryon uses a parsing-free garment encoder and\nleverages a novel scalable data generation pipeline to convert existing VITON\ndatasets to a form that allows MMTryon to be trained without requiring any\nexplicit segmentation. Extensive experiments on high-resolution benchmarks and\nin-the-wild test sets demonstrate MMTryon's superiority over existing SOTA\nmethods both qualitatively and quantitatively. 
MMTryon's impressive performance\non multi-item and style-controllable virtual try-on scenarios and its ability\nto try on any outfit in a large variety of scenarios from any source image,\nopens up a new avenue for future investigation in the fashion community.\n","authors":["Xujie Zhang","Ente Lin","Xiu Li","Yuxuan Luo","Michael Kampffmeyer","Xin Dong","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.00448v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08685v5","updated":"2024-11-19T14:52:04Z","published":"2023-05-15T14:42:02Z","title":"CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding","summary":" Visual Grounding (VG) is a crucial topic in the field of vision and language,\nwhich involves locating a specific region described by expressions within an\nimage. To reduce the reliance on manually labeled data, unsupervised visual\ngrounding have been developed to locate regions using pseudo-labels. However,\nthe performance of existing unsupervised methods is highly dependent on the\nquality of pseudo-labels and these methods always encounter issues with limited\ndiversity. In order to utilize vision and language pre-trained models to\naddress the grounding problem, and reasonably take advantage of pseudo-labels,\nwe propose CLIP-VG, a novel method that can conduct self-paced curriculum\nadapting of CLIP with pseudo-language labels. We propose a simple yet efficient\nend-to-end network architecture to realize the transfer of CLIP to the visual\ngrounding. Based on the CLIP-based architecture, we further propose\nsingle-source and multi-source curriculum adapting algorithms, which can\nprogressively find more reliable pseudo-labels to learn an optimal model,\nthereby achieving a balance between reliability and diversity for the\npseudo-language labels. 
Our method outperforms the current state-of-the-art\nunsupervised method by a significant margin on RefCOCO/+/g datasets in both\nsingle-source and multi-source scenarios, with improvements ranging from\n6.78$\\%$ to 10.67$\\%$ and 11.39$\\%$ to 14.87$\\%$, respectively. The results\neven outperform existing weakly supervised visual grounding methods.\nFurthermore, our method is also competitive in fully supervised setting. The\ncode and models are available at https://github.com/linhuixiao/CLIP-VG.\n","authors":["Linhui Xiao","Xiaoshan Yang","Fang Peng","Ming Yan","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2305.08685v5.pdf","comment":"Accepted by IEEE Transaction on Multimedia (2023), Paper page:\n https://ieeexplore.ieee.org/abstract/document/10269126. Code are available at\n https://github.com/linhuixiao/CLIP-VG"},{"id":"http://arxiv.org/abs/2406.14264v2","updated":"2024-11-19T14:51:58Z","published":"2024-06-20T12:40:18Z","title":"Zero-Shot Image Denoising for High-Resolution Electron Microscopy","summary":" High-resolution electron microscopy (HREM) imaging technique is a powerful\ntool for directly visualizing a broad range of materials in real-space.\nHowever, it faces challenges in denoising due to ultra-low signal-to-noise\nratio (SNR) and scarce data availability. In this work, we propose Noise2SR, a\nzero-shot self-supervised learning (ZS-SSL) denoising framework for HREM.\nWithin our framework, we propose a super-resolution (SR) based self-supervised\ntraining strategy, incorporating the Random Sub-sampler module. The Random\nSub-sampler is designed to generate approximate infinite noisy pairs from a\nsingle noisy image, serving as an effective data augmentation in zero-shot\ndenoising. Noise2SR trains the network with paired noisy images of different\nresolutions, which is conducted via SR strategy. 
The SR-based training\nfacilitates the network adopting more pixels for supervision, and the random\nsub-sampling helps compel the network to learn continuous signals enhancing the\nrobustness. Meanwhile, we mitigate the uncertainty caused by random-sampling by\nadopting minimum mean squared error (MMSE) estimation for the denoised results.\nWith the distinctive integration of training strategy and proposed designs,\nNoise2SR can achieve superior denoising performance using a single noisy HREM\nimage. We evaluate the performance of Noise2SR in both simulated and real HREM\ndenoising tasks. It outperforms state-of-the-art ZS-SSL methods and achieves\ncomparable denoising performance with supervised methods. The success of\nNoise2SR suggests its potential for improving the SNR of images in material\nimaging domains.\n","authors":["Xuanyu Tian","Zhuoya Dong","Xiyue Lin","Yue Gao","Hongjiang Wei","Yanhang Ma","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14264v2.pdf","comment":"12 pages, 12 figures"},{"id":"http://arxiv.org/abs/2411.08992v2","updated":"2024-11-19T14:51:07Z","published":"2024-11-13T19:33:08Z","title":"IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis","summary":" We present a new annotated microscopic cellular image dataset to improve the\neffectiveness of machine learning methods for cellular image analysis. Cell\ncounting is an important step in cell analysis. Typically, domain experts\nmanually count cells in a microscopic image. Automated cell counting can\npotentially eliminate this tedious, time-consuming process. However, a good,\nlabeled dataset is required for training an accurate machine learning model.\nOur dataset includes microscopic images of cells, and for each image, the cell\ncount and the location of individual cells. 
The data were collected as part of\nan ongoing study investigating the potential of electrical stimulation to\nmodulate stem cell differentiation and possible applications for neural repair.\nCompared to existing publicly available datasets, our dataset has more images\nof cells stained with more variety of antibodies (protein components of immune\nresponses against invaders) typically used for cell analysis. The experimental\nresults on this dataset indicate that none of the five existing models under\nthis study are able to achieve sufficiently accurate count to replace the\nmanual methods. The dataset is available at\nhttps://figshare.com/articles/dataset/Dataset/21970604.\n","authors":["Abdurahman Ali Mohammed","Catherine Fonder","Donald S. Sakaguchi","Wallapak Tavanapong","Surya K. Mallapragada","Azeez Idris"],"pdf_url":"https://arxiv.org/pdf/2411.08992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09151v2","updated":"2024-11-19T14:41:13Z","published":"2024-08-17T09:51:42Z","title":"One-step Generative Diffusion for Realistic Extreme Image Rescaling","summary":" Image rescaling aims to learn the optimal low-resolution (LR) image that can\nbe accurately reconstructed to its original high-resolution (HR) counterpart,\nproviding an efficient image processing and storage method for ultra-high\ndefinition media. However, extreme downscaling factors pose significant\nchallenges to the upscaling process due to its highly ill-posed nature, causing\nexisting image rescaling methods to struggle in generating semantically correct\nstructures and perceptual friendly textures. In this work, we propose a novel\nframework called One-Step Image Rescaling Diffusion (OSIRDiff) for extreme\nimage rescaling, which performs rescaling operations in the latent space of a\npre-trained autoencoder and effectively leverages powerful natural image priors\nlearned by a pre-trained text-to-image diffusion model. 
Specifically, OSIRDiff\nadopts a pseudo-invertible module to establish the bidirectional mapping\nbetween the latent features of the HR image and the target-sized LR image.\nThen, the rescaled features are refined by a pre-trained diffusion model to\ngenerate more faithful and visually pleasing details. The entire model is\nend-to-end trained to enable the diffusion priors to guide the rescaling\nprocess. Considering the spatially non-uniform reconstruction quality of the\nrescaled latent features, we propose a novel time-step alignment strategy,\nwhich can adaptively determine the generative strength of the diffusion model\nbased on the degree of latent reconstruction errors. Extensive experiments\ndemonstrate the superiority of OSIRDiff over previous methods in both\nquantitative and qualitative evaluations.\n","authors":["Ce Wang","Zhenyu Hu","Wanjie Sun","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.09151v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06807v3","updated":"2024-11-19T14:31:02Z","published":"2024-03-11T15:26:34Z","title":"Multistep Consistency Models","summary":" Diffusion models are relatively easy to train but require many steps to\ngenerate samples. Consistency models are far more difficult to train, but\ngenerate samples in a single step.\n In this paper we propose Multistep Consistency Models: A unification between\nConsistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that\ncan interpolate between a consistency model and a diffusion model: a trade-off\nbetween sampling speed and sampling quality. Specifically, a 1-step consistency\nmodel is a conventional consistency model whereas a $\\infty$-step consistency\nmodel is a diffusion model.\n Multistep Consistency Models work really well in practice. By increasing the\nsample budget from a single step to 2-8 steps, we can train models more easily\nthat generate higher quality samples, while retaining much of the sampling\nspeed benefits. 
Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1\nFID on Imagenet128 in 8 steps with consistency distillation, using simple\nlosses without adversarial training. We also show that our method scales to a\ntext-to-image diffusion model, generating samples that are close to the quality\nof the original model.\n","authors":["Jonathan Heek","Emiel Hoogeboom","Tim Salimans"],"pdf_url":"https://arxiv.org/pdf/2403.06807v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12530v1","updated":"2024-11-19T14:24:03Z","published":"2024-11-19T14:24:03Z","title":"Contourlet Refinement Gate Framework for Thermal Spectrum Distribution\n Regularized Infrared Image Super-Resolution","summary":" Image super-resolution (SR) is a classical yet still active low-level vision\nproblem that aims to reconstruct high-resolution (HR) images from their\nlow-resolution (LR) counterparts, serving as a key technique for image\nenhancement. Current approaches to address SR tasks, such as transformer-based\nand diffusion-based methods, are either dedicated to extracting RGB image\nfeatures or assuming similar degradation patterns, neglecting the inherent\nmodal disparities between infrared and visible images. When directly applied to\ninfrared image SR tasks, these methods inevitably distort the infrared spectral\ndistribution, compromising the machine perception in downstream tasks. In this\nwork, we emphasize the infrared spectral distribution fidelity and propose a\nContourlet refinement gate framework to restore infrared modal-specific\nfeatures while preserving spectral distribution fidelity. Our approach captures\nhigh-pass subbands from multi-scale and multi-directional infrared spectral\ndecomposition to recover infrared-degraded information through a gate\narchitecture. 
The proposed Spectral Fidelity Loss regularizes the spectral\nfrequency distribution during reconstruction, which ensures the preservation of\nboth high- and low-frequency components and maintains the fidelity of\ninfrared-specific features. We propose a two-stage prompt-learning optimization\nto guide the model in learning infrared HR characteristics from LR degradation.\nExtensive experiments demonstrate that our approach outperforms existing image\nSR models in both visual and perceptual tasks while notably enhancing machine\nperception in downstream tasks. Our code is available at\nhttps://github.com/hey-it-s-me/CoRPLE.\n","authors":["Yang Zou","Zhixin Chen","Zhipeng Zhang","Xingyuan Li","Long Ma","Jinyuan Liu","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12530v1.pdf","comment":"13 figures, 6 tables"},{"id":"http://arxiv.org/abs/2411.12525v1","updated":"2024-11-19T14:18:02Z","published":"2024-11-19T14:18:02Z","title":"Rethinking Top Probability from Multi-view for Distracted Driver\n Behaviour Localization","summary":" Naturalistic driving action localization task aims to recognize and\ncomprehend human behaviors and actions from video data captured during\nreal-world driving scenarios. Previous studies have shown great action\nlocalization performance by applying a recognition model followed by\nprobability-based post-processing. Nevertheless, the probabilities provided by\nthe recognition model frequently contain confused information causing challenge\nfor post-processing. In this work, we adopt an action recognition model based\non self-supervise learning to detect distracted activities and give potential\naction probabilities. Subsequently, a constraint ensemble strategy takes\nadvantages of multi-camera views to provide robust predictions. Finally, we\nintroduce a conditional post-processing operation to locate distracted\nbehaviours and action temporal boundaries precisely. 
Experimenting on test set\nA2, our method obtains the sixth position on the public leaderboard of track 3\nof the 2024 AI City Challenge.\n","authors":["Quang Vinh Nguyen","Vo Hoang Thanh Son","Chau Truong Vinh Hoang","Duc Duy Nguyen","Nhat Huy Nguyen Minh","Soo-Hyung Kim"],"pdf_url":"https://arxiv.org/pdf/2411.12525v1.pdf","comment":"Computer Vision and Pattern Recognition Workshop 2024"},{"id":"http://arxiv.org/abs/2411.12523v1","updated":"2024-11-19T14:13:25Z","published":"2024-11-19T14:13:25Z","title":"Data Pruning in Generative Diffusion Models","summary":" Data pruning is the problem of identifying a core subset that is most\nbeneficial to training and discarding the remainder. While pruning strategies\nare well studied for discriminative models like those used in classification,\nlittle research has gone into their application to generative models.\nGenerative models aim to estimate the underlying distribution of the data, so\npresumably they should benefit from larger datasets. In this work we aim to\nshed light on the accuracy of this statement, specifically answer the question\nof whether data pruning for generative diffusion models could have a positive\nimpact. Contrary to intuition, we show that eliminating redundant or noisy data\nin large datasets is beneficial particularly when done strategically. We\nexperiment with several pruning methods including recent-state-of-art methods,\nand evaluate over CelebA-HQ and ImageNet datasets. We demonstrate that a simple\nclustering method outperforms other sophisticated and computationally demanding\nmethods. 
We further exhibit how we can leverage clustering to balance skewed\ndatasets in an unsupervised manner to allow fair sampling for underrepresented\npopulations in the data distribution, which is a crucial problem in generative\nmodels.\n","authors":["Rania Briq","Jiangtao Wang","Steffan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2411.12523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12520v1","updated":"2024-11-19T14:07:17Z","published":"2024-11-19T14:07:17Z","title":"VMGNet: A Low Computational Complexity Robotic Grasping Network Based on\n VMamba with Multi-Scale Feature Fusion","summary":" While deep learning-based robotic grasping technology has demonstrated strong\nadaptability, its computational complexity has also significantly increased,\nmaking it unsuitable for scenarios with high real-time requirements. Therefore,\nwe propose a low computational complexity and high accuracy model named VMGNet\nfor robotic grasping. For the first time, we introduce the Visual State Space\ninto the robotic grasping field to achieve linear computational complexity,\nthereby greatly reducing the model's computational cost. Meanwhile, to improve\nthe accuracy of the model, we propose an efficient and lightweight multi-scale\nfeature fusion module, named Fusion Bridge Module, to extract and fuse\ninformation at different scales. We also present a new loss function\ncalculation method to enhance the importance differences between subtasks,\nimproving the model's fitting ability. Experiments show that VMGNet has only\n8.7G Floating Point Operations and an inference time of 8.1 ms on our devices.\nVMGNet also achieved state-of-the-art performance on the Cornell and Jacquard\npublic datasets. To validate VMGNet's effectiveness in practical applications,\nwe conducted real grasping experiments in multi-object scenarios, and VMGNet\nachieved an excellent performance with a 94.4% success rate in real-world\ngrasping tasks. 
The video for the real-world robotic grasping experiments is\navailable at https://youtu.be/S-QHBtbmLc4.\n","authors":["Yuhao Jin","Qizhong Gao","Xiaohui Zhu","Yong Yue","Eng Gee Lim","Yuqing Chen","Prudence Wong","Yijie Chu"],"pdf_url":"https://arxiv.org/pdf/2411.12520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12516v1","updated":"2024-11-19T13:58:20Z","published":"2024-11-19T13:58:20Z","title":"MAViS: Modular Autonomous Virtualization System for Two-Dimensional\n Semiconductor Quantum Dot Arrays","summary":" Arrays of gate-defined semiconductor quantum dots are among the leading\ncandidates for building scalable quantum processors. High-fidelity\ninitialization, control, and readout of spin qubit registers require exquisite\nand targeted control over key Hamiltonian parameters that define the\nelectrostatic environment. However, due to the tight gate pitch, capacitive\ncrosstalk between gates hinders independent tuning of chemical potentials and\ninterdot couplings. While virtual gates offer a practical solution, determining\nall the required cross-capacitance matrices accurately and efficiently in large\nquantum dot registers is an open challenge. Here, we establish a Modular\nAutomated Virtualization System (MAViS) -- a general and modular framework for\nautonomously constructing a complete stack of multi-layer virtual gates in real\ntime. Our method employs machine learning techniques to rapidly extract\nfeatures from two-dimensional charge stability diagrams. We then utilize\ncomputer vision and regression models to self-consistently determine all\nrelative capacitive couplings necessary for virtualizing plunger and barrier\ngates in both low- and high-tunnel-coupling regimes. Using MAViS, we\nsuccessfully demonstrate accurate virtualization of a dense two-dimensional\narray comprising ten quantum dots defined in a high-quality Ge/SiGe\nheterostructure. 
Our work offers an elegant and practical solution for the\nefficient control of large-scale semiconductor quantum dot systems.\n","authors":["Anantha S. Rao","Donovan Buterakos","Barnaby van Straaten","Valentin John","Cécile X. Yu","Stefan D. Oosterhout","Lucas Stehouwer","Giordano Scappucci","Menno Veldhorst","Francesco Borsoi","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2411.12516v1.pdf","comment":"14 pages, 5 figures, 8 pages of supplemental material"},{"id":"http://arxiv.org/abs/2411.12514v1","updated":"2024-11-19T13:55:58Z","published":"2024-11-19T13:55:58Z","title":"3D Reconstruction by Looking: Instantaneous Blind Spot Detector for\n Indoor SLAM through Mixed Reality","summary":" Indoor SLAM often suffers from issues such as scene drifting, double walls,\nand blind spots, particularly in confined spaces with objects close to the\nsensors (e.g. LiDAR and cameras) in reconstruction tasks. Real-time\nvisualization of point cloud registration during data collection may help\nmitigate these issues, but a significant limitation remains in the inability to\nin-depth compare the scanned data with actual physical environments. These\nchallenges obstruct the quality of reconstruction products, frequently\nnecessitating revisit and rescan efforts. For this regard, we developed the\nLiMRSF (LiDAR-MR-RGB Sensor Fusion) system, allowing users to perceive the\nin-situ point cloud registration by looking through a Mixed-Reality (MR)\nheadset. This tailored framework visualizes point cloud meshes as holograms,\nseamlessly matching with the real-time scene on see-through glasses, and\nautomatically highlights errors detected while they overlap. Such holographic\nelements are transmitted via a TCP server to an MR headset, where it is\ncalibrated to align with the world coordinate, the physical location. 
This\nallows users to view the localized reconstruction product instantaneously,\nenabling them to quickly identify blind spots and errors, and take prompt\naction on-site. Our blind spot detector achieves an error detection precision\nwith an F1 Score of 75.76% with acceptably high fidelity of monitoring through\nthe LiMRSF system (highest SSIM of 0.5619, PSNR of 14.1004, and lowest MSE of\n0.0389 in the five different sections of the simplified mesh model which users\nvisualize through the LiMRSF device see-through glasses). This method ensures\nthe creation of detailed, high-quality datasets for 3D models, with potential\napplications in Building Information Modeling (BIM) but not limited.\n","authors":["Hanbeom Chang","Jongseong Brad Choi","Chul Min Yeum"],"pdf_url":"https://arxiv.org/pdf/2411.12514v1.pdf","comment":"21 pages, 13 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.12510v1","updated":"2024-11-19T13:52:30Z","published":"2024-11-19T13:52:30Z","title":"PR-ENDO: Physically Based Relightable Gaussian Splatting for Endoscopy","summary":" Endoscopic procedures are crucial for colorectal cancer diagnosis, and\nthree-dimensional reconstruction of the environment for real-time novel-view\nsynthesis can significantly enhance diagnosis. We present PR-ENDO, a framework\nthat leverages 3D Gaussian Splatting within a physically based, relightable\nmodel tailored for the complex acquisition conditions in endoscopy, such as\nrestricted camera rotations and strong view-dependent illumination. By\nexploiting the connection between the camera and light source, our approach\nintroduces a relighting model to capture the intricate interactions between\nlight and tissue using physically based rendering and MLP. 
Existing methods\noften produce artifacts and inconsistencies under these conditions, which\nPR-ENDO overcomes by incorporating a specialized diffuse MLP that utilizes\nlight angles and normal vectors, achieving stable reconstructions even with\nlimited training camera rotations. We benchmarked our framework using a\npublicly available dataset and a newly introduced dataset with wider camera\nrotations. Our methods demonstrated superior image quality compared to baseline\napproaches.\n","authors":["Joanna Kaleta","Weronika Smolak-Dyżewska","Dawid Malarz","Diego Dall'Alba","Przemysław Korzeniowski","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2411.12510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01728v2","updated":"2024-11-19T13:42:21Z","published":"2024-08-03T10:01:29Z","title":"Survey on Emotion Recognition through Posture Detection and the\n possibility of its application in Virtual Reality","summary":" A survey is presented focused on using pose estimation techniques in\nEmotional recognition using various technologies normal cameras, and depth\ncameras for real-time, and the potential use of VR and inputs including images,\nvideos, and 3-dimensional poses described in vector space. We discussed 19\nresearch papers collected from selected journals and databases highlighting\ntheir methodology, classification algorithm, and the used datasets that relate\nto emotion recognition and pose estimation. A benchmark has been made according\nto their accuracy as it was the most common performance measurement metric\nused. 
We concluded that the multimodal Approaches overall made the best\naccuracy and then we mentioned futuristic concerns that can improve the\ndevelopment of this research topic.\n","authors":["Leina Elansary","Zaki Taha","Walaa Gad"],"pdf_url":"https://arxiv.org/pdf/2408.01728v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00234v2","updated":"2024-11-19T13:34:58Z","published":"2023-09-30T02:54:51Z","title":"Pixel-Inconsistency Modeling for Image Manipulation Localization","summary":" Digital image forensics plays a crucial role in image authentication and\nmanipulation localization. Despite the progress powered by deep neural\nnetworks, existing forgery localization methodologies exhibit limitations when\ndeployed to unseen datasets and perturbed images (i.e., lack of generalization\nand robustness to real-world applications). To circumvent these problems and\naid image integrity, this paper presents a generalized and robust manipulation\nlocalization model through the analysis of pixel inconsistency artifacts. The\nrationale is grounded on the observation that most image signal processors\n(ISP) involve the demosaicing process, which introduces pixel correlations in\npristine images. Moreover, manipulating operations, including splicing,\ncopy-move, and inpainting, directly affect such pixel regularity. We,\ntherefore, first split the input image into several blocks and design masked\nself-attention mechanisms to model the global pixel dependency in input images.\nSimultaneously, we optimize another local pixel dependency stream to mine local\nmanipulation clues within input forgery images. In addition, we design novel\nLearning-to-Weight Modules (LWM) to combine features from the two streams,\nthereby enhancing the final forgery localization performance. 
To improve the\ntraining process, we propose a novel Pixel-Inconsistency Data Augmentation\n(PIDA) strategy, driving the model to focus on capturing inherent pixel-level\nartifacts instead of mining semantic forgery traces. This work establishes a\ncomprehensive benchmark integrating 15 representative detection models across\n12 datasets. Extensive experiments show that our method successfully extracts\ninherent pixel-inconsistency forgery fingerprints and achieve state-of-the-art\ngeneralization and robustness performances in image manipulation localization.\n","authors":["Chenqi Kong","Anwei Luo","Shiqi Wang","Haoliang Li","Anderson Rocha","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2310.00234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15356v2","updated":"2024-11-19T13:18:57Z","published":"2024-05-24T08:46:31Z","title":"Alleviating Hallucinations in Large Vision-Language Models through\n Hallucination-Induced Optimization","summary":" Although Large Visual Language Models (LVLMs) have demonstrated exceptional\nabilities in understanding multimodal data, they invariably suffer from\nhallucinations, leading to a disconnect between the generated text and the\ncorresponding images. Almost all current visual contrastive decoding methods\nattempt to mitigate these hallucinations by introducing visual uncertainty\ninformation that appropriately widens the contrastive logits gap between\nhallucinatory and targeted ones. However, due to uncontrollable nature of the\nglobal visual uncertainty, they struggle to precisely induce the hallucinatory\ntokens, which severely limits their effectiveness in mitigating hallucinations\nand may even lead to the generation of undesired hallucinations. To tackle this\nissue, we conducted the theoretical analysis to promote the effectiveness of\ncontrast decoding. Building on this insight, we introduce a novel optimization\nstrategy named Hallucination-Induced Optimization (HIO). 
This strategy seeks to\namplify the contrast between hallucinatory and targeted tokens relying on a\nfine-tuned theoretical preference model (i.e., Contrary Bradley-Terry Model),\nthereby facilitating efficient contrast decoding to alleviate hallucinations in\nLVLMs. Extensive experimental research demonstrates that our HIO strategy can\neffectively reduce hallucinations in LVLMs, outperforming state-of-the-art\nmethods across various benchmarks.\n","authors":["Beitao Chen","Xinyu Lyu","Lianli Gao","Jingkuan Song","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2405.15356v2.pdf","comment":"Accepted by NeurIPS 2024. arXiv admin note: text overlap with\n arXiv:2311.16922 by other authors"},{"id":"http://arxiv.org/abs/2307.06701v3","updated":"2024-11-19T13:09:06Z","published":"2023-07-13T11:58:27Z","title":"S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized\n Variational Autoencoder for Video Prediction","summary":" We address the video prediction task by putting forth a novel model that\ncombines (i) a novel hierarchical residual learning vector quantized\nvariational autoencoder (HR-VQVAE), and (ii) a novel autoregressive\nspatiotemporal predictive model (AST-PM). We refer to this approach as a\nsequential hierarchical residual learning vector quantized variational\nautoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE\nat modeling still images with a parsimonious representation, combined with the\nAST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better\ndeal with major challenges in video prediction. These include learning\nspatiotemporal information, handling high dimensional data, combating blurry\nprediction, and implicit modeling of physical characteristics. 
Extensive\nexperimental results on four challenging tasks, namely KTH Human Action,\nTrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably\nagainst state-of-the-art video prediction techniques both in quantitative and\nqualitative evaluations despite a much smaller model size. Finally, we boost\nS-HR-VQVAE by proposing a novel training method to jointly estimate the\nHR-VQVAE and AST-PM parameters.\n","authors":["Mohammad Adiban","Kalin Stefanov","Sabato Marco Siniscalchi","Giampiero Salvi"],"pdf_url":"https://arxiv.org/pdf/2307.06701v3.pdf","comment":"12 pages, 6 figures, 5 tables. Accepted for publication on IEEE\n Transactions on Multimedia on 2024-11-19"},{"id":"http://arxiv.org/abs/2411.12471v1","updated":"2024-11-19T12:52:37Z","published":"2024-11-19T12:52:37Z","title":"SCIGS: 3D Gaussians Splatting from a Snapshot Compressive Image","summary":" Snapshot Compressive Imaging (SCI) offers a possibility for capturing\ninformation in high-speed dynamic scenes, requiring efficient reconstruction\nmethod to recover scene information. Despite promising results, current deep\nlearning-based and NeRF-based reconstruction methods face challenges: 1) deep\nlearning-based reconstruction methods struggle to maintain 3D structural\nconsistency within scenes, and 2) NeRF-based reconstruction methods still face\nlimitations in handling dynamic scenes. To address these challenges, we propose\nSCIGS, a variant of 3DGS, and develop a primitive-level transformation network\nthat utilizes camera pose stamps and Gaussian primitive coordinates as\nembedding vectors. This approach resolves the necessity of camera pose in\nvanilla 3DGS and enhances multi-view 3D structural consistency in dynamic\nscenes by utilizing transformed primitives. Additionally, a high-frequency\nfilter is introduced to eliminate the artifacts generated during the\ntransformation. 
The proposed SCIGS is the first to reconstruct a 3D explicit\nscene from a single compressed image, extending its application to dynamic 3D\nscenes. Experiments on both static and dynamic scenes demonstrate that SCIGS\nnot only enhances SCI decoding but also outperforms current state-of-the-art\nmethods in reconstructing dynamic 3D scenes from a single compressed image. The\ncode will be made available upon publication.\n","authors":["Zixu Wang","Hao Yang","Yu Guo","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2411.12471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09663v3","updated":"2024-11-19T12:49:14Z","published":"2024-08-19T02:46:23Z","title":"3D-Consistent Human Avatars with Sparse Inputs via Gaussian Splatting\n and Contrastive Learning","summary":" Existing approaches for human avatar generation--both NeRF-based and 3D\nGaussian Splatting (3DGS) based--struggle with maintaining 3D consistency and\nexhibit degraded detail reconstruction, particularly when training with sparse\ninputs. To address this challenge, we propose CHASE, a novel framework that\nachieves dense-input-level performance using only sparse inputs through two key\ninnovations: cross-pose intrinsic 3D consistency supervision and 3D geometry\ncontrastive learning. Building upon prior skeleton-driven approaches that\ncombine rigid deformation with non-rigid cloth dynamics, we first establish\nbaseline avatars with fundamental 3D consistency. To enhance 3D consistency\nunder sparse inputs, we introduce a Dynamic Avatar Adjustment (DAA) module,\nwhich refines deformed Gaussians by leveraging similar poses from the training\nset. By minimizing the rendering discrepancy between adjusted Gaussians and\nreference poses, DAA provides additional supervision for avatar reconstruction.\nWe further maintain global 3D consistency through a novel geometry-aware\ncontrastive learning strategy. 
While designed for sparse inputs, CHASE\nsurpasses state-of-the-art methods across both full and sparse settings on\nZJU-MoCap and H36M datasets, demonstrating that our enhanced 3D consistency\nleads to superior rendering quality.\n","authors":["Haoyu Zhao","Hao Wang","Chen Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09663v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09665v2","updated":"2024-11-19T12:48:43Z","published":"2024-08-19T02:58:20Z","title":"Topology-aware Human Avatars with Semantically-guided Gaussian Splatting","summary":" Reconstructing photo-realistic and topology-aware animatable human avatars\nfrom monocular videos remains challenging in computer vision and graphics.\nRecently, methods using 3D Gaussians to represent the human body have emerged,\noffering faster optimization and real-time rendering. However, due to ignoring\nthe crucial role of human body semantic information which represents the\nexplicit topological and intrinsic structure within human body, they fail to\nachieve fine-detail reconstruction of human avatars. To address this issue, we\npropose SG-GS, which uses semantics-embedded 3D Gaussians, skeleton-driven\nrigid deformation, and non-rigid cloth dynamics deformation to create\nphoto-realistic human avatars. We then design a Semantic Human-Body Annotator\n(SHA) which utilizes SMPL's semantic prior for efficient body part semantic\nlabeling. The generated labels are used to guide the optimization of semantic\nattributes of Gaussian. To capture the explicit topological structure of the\nhuman body, we employ a 3D network that integrates both topological and\ngeometric associations for human avatar deformation. 
We further implement three\nkey strategies to enhance the semantic accuracy of 3D Gaussians and rendering\nquality: semantic projection with 2D regularization, semantic-guided density\nregularization and semantic-aware regularization with neighborhood consistency.\nExtensive experiments demonstrate that SG-GS achieves state-of-the-art geometry\nand appearance reconstruction performance.\n","authors":["Haoyu Zhao","Chen Yang","Hao Wang","Xingyue Zhao","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11219v2","updated":"2024-11-19T12:40:06Z","published":"2024-11-18T01:11:47Z","title":"Relational Contrastive Learning and Masked Image Modeling for Scene Text\n Recognition","summary":" Context-aware methods have achieved remarkable advancements in supervised\nscene text recognition by leveraging semantic priors from words. Considering\nthe heterogeneity of text and background in STR, we propose that such\ncontextual priors can be reinterpreted as the relations between textual\nelements, serving as effective self-supervised labels for representation\nlearning. However, textual relations are restricted to the finite size of the\ndataset due to lexical dependencies, which causes over-fitting problem, thus\ncompromising the representation quality. To address this, our work introduces a\nunified framework of Relational Contrastive Learning and Masked Image Modeling\nfor STR (RCMSTR), which explicitly models the enriched textual relations. For\nthe RCL branch, we first introduce the relational rearrangement module to\ncultivate new relations on the fly. Based on this, we further conduct\nrelational contrastive learning to model the intra- and inter-hierarchical\nrelations for frames, sub-words and words. On the other hand, MIM can naturally\nboost the context information via masking, where we find that the block masking\nstrategy is more effective for STR. 
For the effective integration of RCL and\nMIM, we also introduce a novel decoupling design aimed at mitigating the impact\nof masked images on contrastive learning. Additionally, to enhance the\ncompatibility of MIM with CNNs, we propose the adoption of sparse convolutions\nand directly sharing the weights with dense convolutions in training. The\nproposed RCMSTR demonstrates superior performance in various evaluation\nprotocols for different STR-related downstream tasks, outperforming the\nexisting state-of-the-art self-supervised STR techniques. Ablation studies and\nqualitative experimental results further validate the effectiveness of our\nmethod. The code and pre-trained models will be available at\nhttps://github.com/ThunderVVV/RCMSTR .\n","authors":["Tiancheng Lin","Jinglei Zhang","Yi Xu","Kai Chen","Rui Zhang","Chang-Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.11219v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.00508"},{"id":"http://arxiv.org/abs/2406.19997v2","updated":"2024-11-19T12:28:19Z","published":"2024-06-28T15:32:59Z","title":"Wavelets Are All You Need for Autoregressive Image Generation","summary":" In this paper, we take a new approach to autoregressive image generation that\nis based on two main ingredients. The first is wavelet image coding, which\nallows to tokenize the visual details of an image from coarse to fine details\nby ordering the information starting with the most significant bits of the most\nsignificant wavelet coefficients. The second is a variant of a language\ntransformer whose architecture is re-designed and optimized for token sequences\nin this 'wavelet language'. The transformer learns the significant statistical\ncorrelations within a token sequence, which are the manifestations of\nwell-known correlations between the wavelet subbands at various resolutions. 
We\nshow experimental results with conditioning on the generation process.\n","authors":["Wael Mattar","Idan Levy","Nir Sharon","Shai Dekel"],"pdf_url":"https://arxiv.org/pdf/2406.19997v2.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.12452v1","updated":"2024-11-19T12:19:45Z","published":"2024-11-19T12:19:45Z","title":"GaussianPretrain: A Simple Unified 3D Gaussian Representation for Visual\n Pre-training in Autonomous Driving","summary":" Self-supervised learning has made substantial strides in image processing,\nwhile visual pre-training for autonomous driving is still in its infancy.\nExisting methods often focus on learning geometric scene information while\nneglecting texture or treating both aspects separately, hindering comprehensive\nscene understanding. In this context, we are excited to introduce\nGaussianPretrain, a novel pre-training paradigm that achieves a holistic\nunderstanding of the scene by uniformly integrating geometric and texture\nrepresentations. Conceptualizing 3D Gaussian anchors as volumetric LiDAR\npoints, our method learns a deepened understanding of scenes to enhance\npre-training performance with detailed spatial structure and texture, achieving\nthat 40.6% faster than NeRF-based method UniPAD with 70% GPU memory only. We\ndemonstrate the effectiveness of GaussianPretrain across multiple 3D perception\ntasks, showing significant performance improvements, such as a 7.05% increase\nin NDS for 3D object detection, boosts mAP by 1.9% in HD map construction and\n0.8% improvement on Occupancy prediction. These significant gains highlight\nGaussianPretrain's theoretical innovation and strong practical potential,\npromoting visual pre-training development for autonomous driving. 
Source code\nwill be available at https://github.com/Public-BOTs/GaussianPretrain\n","authors":["Shaoqing Xu","Fang Li","Shengyin Jiang","Ziying Song","Li Liu","Zhi-xin Yang"],"pdf_url":"https://arxiv.org/pdf/2411.12452v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.12450v1","updated":"2024-11-19T12:18:16Z","published":"2024-11-19T12:18:16Z","title":"Frequency-Aware Guidance for Blind Image Restoration via Diffusion\n Models","summary":" Blind image restoration remains a significant challenge in low-level vision\ntasks. Recently, denoising diffusion models have shown remarkable performance\nin image synthesis. Guided diffusion models, leveraging the potent generative\npriors of pre-trained models along with a differential guidance loss, have\nachieved promising results in blind image restoration. However, these models\ntypically consider data consistency solely in the spatial domain, often\nresulting in distorted image content. In this paper, we propose a novel\nfrequency-aware guidance loss that can be integrated into various diffusion\nmodels in a plug-and-play manner. Our proposed guidance loss, based on 2D\ndiscrete wavelet transform, simultaneously enforces content consistency in both\nthe spatial and frequency domains. Experimental results demonstrate the\neffectiveness of our method in three blind restoration tasks: blind image\ndeblurring, imaging through turbulence, and blind restoration for multiple\ndegradations. Notably, our method achieves a significant improvement in PSNR\nscore, with a remarkable enhancement of 3.72\\,dB in image deblurring. 
Moreover,\nour method exhibits superior capability in generating images with rich details\nand reduced distortion, leading to the best visual quality.\n","authors":["Jun Xiao","Zihang Lyu","Hao Xie","Cong Zhang","Yakun Ju","Changjian Shui","Kin-Man Lam"],"pdf_url":"https://arxiv.org/pdf/2411.12450v1.pdf","comment":"17 pages, 6 figures, has been accepted by the ECCV 2024: AIM workshop"},{"id":"http://arxiv.org/abs/2411.12448v1","updated":"2024-11-19T12:15:40Z","published":"2024-11-19T12:15:40Z","title":"Large Language Models for Lossless Image Compression: Next-Pixel\n Prediction in Language Space is All You Need","summary":" We have recently witnessed that ``Intelligence\" and `` Compression\" are the\ntwo sides of the same coin, where the language large model (LLM) with\nunprecedented intelligence is a general-purpose lossless compressor for various\ndata modalities. This attribute particularly appeals to the lossless image\ncompression community, given the increasing need to compress high-resolution\nimages in the current streaming media era. Consequently, a spontaneous envision\nemerges: Can the compression performance of the LLM elevate lossless image\ncompression to new heights? However, our findings indicate that the naive\napplication of LLM-based lossless image compressors suffers from a considerable\nperformance gap compared with existing state-of-the-art (SOTA) codecs on common\nbenchmark datasets. In light of this, we are dedicated to fulfilling the\nunprecedented intelligence (compression) capacity of the LLM for lossless image\ncompression tasks, thereby bridging the gap between theoretical and practical\ncompression performance. 
Specifically, we propose P$^{2}$-LLM, a next-pixel\nprediction-based LLM, which integrates various elaborated insights and\nmethodologies, \\textit{e.g.,} pixel-level priors, the in-context ability of\nLLM, and a pixel-level semantic preservation strategy, to enhance the\nunderstanding capacity of pixel sequences for better next-pixel predictions.\nExtensive experiments on benchmark datasets demonstrate that P$^{2}$-LLM can\nbeat SOTA classical and learned codecs.\n","authors":["Kecheng Chen","Pingping Zhang","Hui Liu","Jie Liu","Yibing Liu","Jixin Huang","Shiqi Wang","Hong Yan","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2411.12448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12440v1","updated":"2024-11-19T11:59:54Z","published":"2024-11-19T11:59:54Z","title":"Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear\n Kernels","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have substantially\nimproved novel view synthesis, enabling high-quality reconstruction and\nreal-time rendering. However, blurring artifacts, such as floating primitives\nand over-reconstruction, remain challenging. Current methods address these\nissues by refining scene structure, enhancing geometric representations,\naddressing blur in training images, improving rendering consistency, and\noptimizing density control, yet the role of kernel design remains\nunderexplored. We identify the soft boundaries of Gaussian ellipsoids as one of\nthe causes of these artifacts, limiting detail capture in high-frequency\nregions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which\nreplaces Gaussian kernels with linear kernels to achieve sharper and more\nprecise results, particularly in high-frequency regions. Through evaluations on\nthree datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along\nwith a 30% FPS improvement over baseline 3DGS. The implementation will be made\npublicly available upon acceptance. 
\\freefootnote{*Corresponding author.\n","authors":["Haodong Chen","Runnan Chen","Qiang Qu","Zhaoqing Wang","Tongliang Liu","Xiaoming Chen","Yuk Ying Chung"],"pdf_url":"https://arxiv.org/pdf/2411.12440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12431v1","updated":"2024-11-19T11:41:22Z","published":"2024-11-19T11:41:22Z","title":"CV-Cities: Advancing Cross-View Geo-Localization in Global Cities","summary":" Cross-view geo-localization (CVGL), which involves matching and retrieving\nsatellite images to determine the geographic location of a ground image, is\ncrucial in GNSS-constrained scenarios. However, this task faces significant\nchallenges due to substantial viewpoint discrepancies, the complexity of\nlocalization scenarios, and the need for global localization. To address these\nissues, we propose a novel CVGL framework that integrates the vision\nfoundational model DINOv2 with an advanced feature mixer. Our framework\nintroduces the symmetric InfoNCE loss and incorporates near-neighbor sampling\nand dynamic similarity sampling strategies, significantly enhancing\nlocalization accuracy. Experimental results show that our framework surpasses\nexisting methods across multiple public and self-built datasets. To further\nimprove globalscale performance, we have developed CV-Cities, a novel dataset\nfor global CVGL. CV-Cities includes 223,736 ground-satellite image pairs with\ngeolocation data, spanning sixteen cities across six continents and covering a\nwide range of complex scenarios, providing a challenging benchmark for CVGL.\nThe framework trained with CV-Cities demonstrates high localization accuracy in\nvarious test cities, highlighting its strong globalization and generalization\ncapabilities. 
Our datasets and codes are available at\nhttps://github.com/GaoShuang98/CVCities.\n","authors":["Gaoshuang Huang","Yang Zhou","Luying Zhao","Wenjian Gan"],"pdf_url":"https://arxiv.org/pdf/2411.12431v1.pdf","comment":"Datasets and codes are available, accepted by IEEE JSTARS"},{"id":"http://arxiv.org/abs/2411.12426v1","updated":"2024-11-19T11:26:21Z","published":"2024-11-19T11:26:21Z","title":"Motif Channel Opened in a White-Box: Stereo Matching via Motif\n Correlation Graph","summary":" Real-world applications of stereo matching, such as autonomous driving, place\nstringent demands on both safety and accuracy. However, learning-based stereo\nmatching methods inherently suffer from the loss of geometric structures in\ncertain feature channels, creating a bottleneck in achieving precise detail\nmatching. Additionally, these methods lack interpretability due to the\nblack-box nature of deep learning. In this paper, we propose MoCha-V2, a novel\nlearning-based paradigm for stereo matching. MoCha-V2 introduces the Motif\nCorrelation Graph (MCG) to capture recurring textures, which are referred to as\n``motifs\" within feature channels. These motifs reconstruct geometric\nstructures and are learned in a more interpretable way. Subsequently, we\nintegrate features from multiple frequency domains through wavelet inverse\ntransformation. The resulting motif features are utilized to restore geometric\nstructures in the stereo matching process. Experimental results demonstrate the\neffectiveness of MoCha-V2. MoCha-V2 achieved 1st place on the Middlebury\nbenchmark at the time of its release. Code is available at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Yongjun Zhang","Wenting Li","Bingshu Wang","Yong Zhao","C. L. 
Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02799v2","updated":"2024-11-19T11:13:43Z","published":"2024-11-05T04:20:06Z","title":"ERUP-YOLO: Enhancing Object Detection Robustness for Adverse Weather\n Condition by Unified Image-Adaptive Processing","summary":" We propose an image-adaptive object detection method for adverse weather\nconditions such as fog and low-light. Our framework employs differentiable\npreprocessing filters to perform image enhancement suitable for later-stage\nobject detections. Our framework introduces two differentiable filters: a\nB\\'ezier curve-based pixel-wise (BPW) filter and a kernel-based local (KBL)\nfilter. These filters unify the functions of classical image processing filters\nand improve performance of object detection. We also propose a domain-agnostic\ndata augmentation strategy using the BPW filter. Our method does not require\ndata-specific customization of the filter combinations, parameter ranges, and\ndata augmentation. We evaluate our proposed approach, called Enhanced\nRobustness by Unified Image Processing (ERUP)-YOLO, by applying it to the\nYOLOv3 detector. 
Experiments on adverse weather datasets demonstrate that our\nproposed filters match or exceed the expressiveness of conventional methods and\nour ERUP-YOLO achieved superior performance in a wide range of adverse weather\nconditions, including fog and low-light conditions.\n","authors":["Yuka Ogino","Yuho Shoji","Takahiro Toizumi","Atsushi Ito"],"pdf_url":"https://arxiv.org/pdf/2411.02799v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2411.12415v1","updated":"2024-11-19T11:01:30Z","published":"2024-11-19T11:01:30Z","title":"Classification of Geographical Land Structure Using Convolution Neural\n Network and Transfer Learning","summary":" Satellite imagery has dramatically revolutionized the field of geography by\ngiving academics, scientists, and policymakers unprecedented global access to\nspatial data. Manual methods typically require significant time and effort to\ndetect the generic land structure in satellite images. This study can produce a\nset of applications such as urban planning and development, environmental\nmonitoring, disaster management, etc. Therefore, the research presents a\nmethodology to minimize human labor, reducing the expenses and duration needed\nto identify the land structure. This article developed a deep learning-based\napproach to automate the process of classifying geographical land structures.\nWe used a satellite image dataset acquired from MLRSNet. The study compared the\nperformance of three architectures, namely CNN, ResNet-50, and Inception-v3. We\nused three optimizers with any model: Adam, SGD, and RMSProp. We conduct the\ntraining process for a fixed number of epochs, specifically 100 epochs, with a\nbatch size of 64. The ResNet-50 achieved an accuracy of 76.5% with the ADAM\noptimizer, the Inception-v3 with RMSProp achieved an accuracy of 93.8%, and the\nproposed approach, CNN with RMSProp optimizer, achieved the highest level of\nperformance and an accuracy of 94.8%. 
Moreover, a thorough examination of the\nCNN model demonstrated its exceptional accuracy, recall, and F1 scores for all\ncategories, confirming its resilience and dependability in precisely detecting\nvarious terrain formations. The results highlight the potential of deep\nlearning models in scene understanding, as well as their significance in\nefficiently identifying and categorizing land structures from satellite\nimagery.\n","authors":["Mustafa M. Abd Zaid","Ahmed Abed Mohammed","Putra Sumari"],"pdf_url":"https://arxiv.org/pdf/2411.12415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00570v2","updated":"2024-11-19T11:00:38Z","published":"2024-03-01T14:47:46Z","title":"Rethinking cluster-conditioned diffusion models for label-free image\n synthesis","summary":" Diffusion-based image generation models can enhance image quality when\nconditioned on ground truth labels. Here, we conduct a comprehensive\nexperimental study on image-level conditioning for diffusion models using\ncluster assignments. We investigate how individual clustering determinants,\nsuch as the number of clusters and the clustering method, impact image\nsynthesis across three different datasets. Given the optimal number of clusters\nwith respect to image synthesis, we show that cluster-conditioning can achieve\nstate-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for\nCIFAR100, along with a strong increase in training sample efficiency. We\nfurther propose a novel empirical method to estimate an upper bound for the\noptimal number of clusters. Unlike existing approaches, we find no significant\nassociation between clustering performance and the corresponding\ncluster-conditional FID scores. The code is available at\nhttps://github.com/HHU-MMBS/cedm-official-wavc2025.\n","authors":["Nikolas Adaloglou","Tim Kaiser","Felix Michels","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2403.00570v2.pdf","comment":"Accepted in WAVC2025 (21 pages, 15 figures). 
Code is available at\n https://github.com/HHU-MMBS/cedm-official-wavc2025"},{"id":"http://arxiv.org/abs/2309.10987v4","updated":"2024-11-19T10:55:52Z","published":"2023-09-20T01:04:57Z","title":"SpikingNeRF: Making Bio-inspired Neural Networks See through the Real\n World","summary":" In this paper, we propose SpikingNeRF, which aligns the temporal dimension of\nspiking neural networks (SNNs) with the radiance rays, to seamlessly\naccommodate SNNs to the reconstruction of neural radiance fields (NeRF). Thus,\nthe computation turns into a spike-based, multiplication-free manner, reducing\nenergy consumption and making high-quality 3D rendering, for the first time,\naccessible to neuromorphic hardware. In SpikingNeRF, each sampled point on the\nray is matched to a particular time step and represented in a hybrid manner\nwhere the voxel grids are maintained as well. Based on the voxel grids, sampled\npoints are determined whether to be masked out for faster training and\ninference. However, this masking operation also incurs irregular temporal\nlength, making it intractable for hardware processors, e.g., GPUs, to conduct\nparallel training. To address this problem, we develop the temporal padding\nstrategy to tackle the masked samples to maintain regular temporal length,\ni.e., regular tensors, and further propose the temporal condensing strategy to\nform a denser data structure for hardware-friendly computation. Experiments on\nvarious datasets demonstrate that our method can reduce energy consumption by\nan average of 70.79\\% and obtain comparable synthesis quality with the ANN\nbaseline. Verification on the neuromorphic hardware accelerator also shows that\nSpikingNeRF can further benefit from neuromorphic computing over the ANN\nbaselines on energy efficiency. 
Codes and the appendix are in\n\\url{https://github.com/Ikarosy/SpikingNeRF-of-CASIA}.\n","authors":["Xingting Yao","Qinghao Hu","Fei Zhou","Tielong Liu","Zitao Mo","Zeyu Zhu","Zhengyang Zhuge","Jian Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.10987v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09623v2","updated":"2024-11-19T10:15:56Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. 
Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12383v1","updated":"2024-11-19T10:09:08Z","published":"2024-11-19T10:09:08Z","title":"Automatic staff reconstruction within SIMSSA proect","summary":" The automatic analysis of scores has been a research topic of interest for\nthe last few decades and still is since music databases that include musical\nscores are currently being created to make musical content available to the\npublic, including scores of ancient music. For the correct analysis of music\nelements and their interpretation, the identification of staff lines is of key\nimportance. In this paper, a scheme to post-process the output of a previous\nmusical object identification system is described. This system allows the\nreconstruction by means of detection, tracking and interpolation of the staff\nlines of ancient scores from the digital Salzinnes Database. The scheme\ndeveloped shows a remarkable performance on the specific task it was created\nfor.\n","authors":["Lorenzo J. Tardon","Isabel Barbancho","Ana M. Barbancho","Ichiro Fujinaga"],"pdf_url":"https://arxiv.org/pdf/2411.12383v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.06071v2","updated":"2024-11-19T09:55:37Z","published":"2024-08-12T11:44:47Z","title":"A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in\n Adverse Weather and Lighting","summary":" High-autonomy vehicle functions rely on machine learning (ML) algorithms to\nunderstand the environment. Despite displaying remarkable performance in fair\nweather scenarios, perception algorithms are heavily affected by adverse\nweather and lighting conditions. To overcome these difficulties, ML engineers\nmainly rely on comprehensive real-world datasets. 
However, the difficulties in\nreal-world data collection for critical areas of the operational design domain\n(ODD) often means synthetic data is required for perception training and safety\nvalidation. Thus, we present A-BDD, a large set of over 60,000 synthetically\naugmented images based on BDD100K that are equipped with semantic segmentation\nand bounding box annotations (inherited from the BDD100K dataset). The dataset\ncontains augmented data for rain, fog, overcast and sunglare/shadow with\nvarying intensity levels. We further introduce novel strategies utilizing\nfeature-based image quality metrics like FID and CMMD, which help identify\nuseful augmented and real-world data for ML training and testing. By conducting\nexperiments on A-BDD, we provide evidence that data augmentations can play a\npivotal role in closing performance gaps in adverse weather and lighting\nconditions.\n","authors":["Felix Assion","Florens Gressner","Nitin Augustine","Jona Klemenc","Ahmed Hammam","Alexandre Krattinger","Holger Trittenbach","Anja Philippsen","Sascha Riemer"],"pdf_url":"https://arxiv.org/pdf/2408.06071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10755v2","updated":"2024-11-19T09:30:44Z","published":"2024-11-16T09:22:46Z","title":"Diffusion-Based Semantic Segmentation of Lumbar Spine MRI Scans of Lower\n Back Pain Patients","summary":" This study introduces a diffusion-based framework for robust and accurate\nsegmenton of vertebrae, intervertebral discs (IVDs), and spinal canal from\nMagnetic Resonance Imaging~(MRI) scans of patients with low back pain (LBP),\nregardless of whether the scans are T1w or T2-weighted. The results showed that\nSpineSegDiff achieved comparable outperformed non-diffusion state-of-the-art\nmodels in the identification of degenerated IVDs. 
Our findings highlight the\npotential of diffusion models to improve LBP diagnosis and management through\nprecise spine MRI analysis.\n","authors":["Maria Monzon","Thomas Iff","Ender Konukoglu","Catherine R. Jutzeler"],"pdf_url":"https://arxiv.org/pdf/2411.10755v2.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 5 pages"},{"id":"http://arxiv.org/abs/2409.16718v2","updated":"2024-11-19T09:27:37Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. 
The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.12361v1","updated":"2024-11-19T09:20:51Z","published":"2024-11-19T09:20:51Z","title":"Breathless: An 8-hour Performance Contrasting Human and Robot\n Expressiveness","summary":" This paper describes the robot technology behind an original performance that\npairs a human dancer (Cuan) with an industrial robot arm for an eight-hour\ndance that unfolds over the timespan of an American workday. To control the\nrobot arm, we combine a range of sinusoidal motions with varying amplitude,\nfrequency and offset at each joint to evoke human motions common in physical\nlabor such as stirring, digging, and stacking. More motions were developed\nusing deep learning techniques for video-based human-pose tracking and\nextraction. We combine these pre-recorded motions with improvised robot motions\ncreated live by putting the robot into teach-mode and triggering force sensing\nfrom the robot joints onstage. All motions are combined with commercial and\noriginal music using a custom suite of python software with AppleScript,\nKeynote, and Zoom to facilitate on-stage communication with the dancer. The\nresulting performance contrasts the expressivity of the human body with the\nprecision of robot machinery. 
Video, code and data are available on the project\nwebsite: https://sites.google.com/playing.studio/breathless\n","authors":["Catie Cuan","Tianshuang Qiu","Shreya Ganti","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2411.12361v1.pdf","comment":"15 pages, 9 figures, accepted for ISRR (International Symposium of\n Robotics Research) 2024"},{"id":"http://arxiv.org/abs/2409.00647v2","updated":"2024-11-19T09:18:02Z","published":"2024-09-01T07:47:48Z","title":"Modifying the U-Net's Encoder-Decoder Architecture for Segmentation of\n Tumors in Breast Ultrasound Images","summary":" Segmentation is one of the most significant steps in image processing.\nSegmenting an image is a technique that makes it possible to separate a digital\nimage into various areas based on the different characteristics of pixels in\nthe image. In particular, segmentation of breast ultrasound images is widely\nused for cancer identification. As a result of image segmentation, it is\npossible to make early diagnoses of a diseases via medical images in a very\neffective way. Due to various ultrasound artifacts and noises, including\nspeckle noise, low signal-to-noise ratio, and intensity heterogeneity, the\nprocess of accurately segmenting medical images, such as ultrasound images, is\nstill a challenging task. In this paper, we present a new method to improve the\naccuracy and effectiveness of breast ultrasound image segmentation. More\nprecisely, we propose a Neural Network (NN) based on U-Net and an\nencoder-decoder architecture. By taking U-Net as the basis, both encoder and\ndecoder parts are developed by combining U-Net with other Deep Neural Networks\n(Res-Net and MultiResUNet) and introducing a new approach and block (Co-Block),\nwhich preserve as much as possible the low-level and the high-level features.\nDesigned network is evaluated using the Breast Ultrasound Images (BUSI)\nDataset. 
It consists of 780 images and the images are categorized into three\nclasses, which are normal, benign, and malignant. According to our extensive\nevaluations on a public breast ultrasound dataset, designed network segments\nthe breast lesions more accurately than other state-of-the-art deep learning\nmethods. With only 8.88M parameters, our network (CResU-Net) obtained 82.88%,\n77.5%, 90.3%, and 98.4% in terms of Dice similarity coefficients (DSC),\nIntersection over Union (IoU), Area under curve (AUC), and global accuracy\n(ACC), respectively, on BUSI dataset.\n","authors":["Sina Derakhshandeh","Ali Mahloojifar"],"pdf_url":"https://arxiv.org/pdf/2409.00647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12355v1","updated":"2024-11-19T09:16:54Z","published":"2024-11-19T09:16:54Z","title":"DynFocus: Dynamic Cooperative Network Empowers LLMs with Video\n Understanding","summary":" The challenge in LLM-based video understanding lies in preserving visual and\nsemantic information in long videos while maintaining a memory-affordable token\ncount. However, redundancy and correspondence in videos have hindered the\nperformance potential of existing methods. Through statistical learning on\ncurrent datasets, we observe that redundancy occurs in both repeated and\nanswer-irrelevant frames, and the corresponding frames vary with different\nquestions. This suggests the possibility of adopting dynamic encoding to\nbalance detailed video information preservation with token budget reduction. To\nthis end, we propose a dynamic cooperative network, DynFocus, for\nmemory-efficient video encoding in this paper. Specifically, i) a Dynamic Event\nPrototype Estimation (DPE) module to dynamically select meaningful frames for\nquestion answering; (ii) a Compact Cooperative Encoding (CCE) module that\nencodes meaningful frames with detailed visual appearance and the remaining\nframes with sketchy perception separately. 
We evaluate our method on five\npublicly available benchmarks, and experimental results consistently\ndemonstrate that our method achieves competitive performance.\n","authors":["Yudong Han","Qingpei Guo","Liyuan Pan","Liu Liu","Yu Guan","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2411.12355v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12350v1","updated":"2024-11-19T09:07:26Z","published":"2024-11-19T09:07:26Z","title":"DiM: $f$-Divergence Minimization Guided Sharpness-Aware Optimization for\n Semi-supervised Medical Image Segmentation","summary":" As a technique to alleviate the pressure of data annotation, semi-supervised\nlearning (SSL) has attracted widespread attention. In the specific domain of\nmedical image segmentation, semi-supervised methods (SSMIS) have become a\nresearch hotspot due to their ability to reduce the need for large amounts of\nprecisely annotated data. SSMIS focuses on enhancing the model's generalization\nperformance by leveraging a small number of labeled samples and a large number\nof unlabeled samples. The latest sharpness-aware optimization (SAM) technique,\nwhich optimizes the model by reducing the sharpness of the loss function, has\nshown significant success in SSMIS. However, SAM and its variants may not fully\naccount for the distribution differences between different datasets. To address\nthis issue, we propose a sharpness-aware optimization method based on\n$f$-divergence minimization (DiM) for semi-supervised medical image\nsegmentation. This method enhances the model's stability by fine-tuning the\nsensitivity of model parameters and improves the model's adaptability to\ndifferent datasets through the introduction of $f$-divergence. 
By reducing\n$f$-divergence, the DiM method not only improves the performance balance\nbetween the source and target datasets but also prevents performance\ndegradation due to overfitting on the source dataset.\n","authors":["Bingli Wang","Houcheng Su","Nan Yin","Mengzhu Wang","Li Shen"],"pdf_url":"https://arxiv.org/pdf/2411.12350v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2411.10715v2","updated":"2024-11-19T09:07:18Z","published":"2024-11-16T06:11:10Z","title":"EVT: Efficient View Transformation for Multi-Modal 3D Object Detection","summary":" Multi-modal sensor fusion in bird's-eye-view (BEV) representation has become\nthe leading approach in 3D object detection. However, existing methods often\nrely on depth estimators or transformer encoders for view transformation,\nincurring substantial computational overhead. Furthermore, the lack of precise\ngeometric correspondence between 2D and 3D spaces leads to spatial and\nray-directional misalignments, restricting the effectiveness of BEV\nrepresentations. To address these challenges, we propose a novel 3D object\ndetector via efficient view transformation (EVT), which leverages a\nwell-structured BEV representation to enhance accuracy and efficiency. EVT\nfocuses on two main areas. First, it employs Adaptive Sampling and Adaptive\nProjection (ASAP), using LiDAR guidance to generate 3D sampling points and\nadaptive kernels. The generated points and kernels are then used to facilitate\nthe transformation of image features into BEV space and refine the BEV\nfeatures. Second, EVT includes an improved transformer-based detection\nframework, which contains a group-wise query initialization method and an\nenhanced query update framework. It is designed to effectively utilize the\nobtained multi-modal BEV features within the transformer decoder. 
By leveraging\nthe geometric properties of object queries, this framework significantly\nenhances detection performance, especially in a multi-layer transformer decoder\nstructure. EVT achieves state-of-the-art performance on the nuScenes test set\nwith real-time inference speed.\n","authors":["Yongjin Lee","Hyeon-Mun Jeong","Yurim Jeon","Sanghyun Kim"],"pdf_url":"https://arxiv.org/pdf/2411.10715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11515v2","updated":"2024-11-19T08:50:38Z","published":"2024-11-18T12:22:37Z","title":"Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to\n Enhance Cell Segmentation","summary":" Automated cell segmentation in microscopy images is essential for biomedical\nresearch, yet conventional methods are labor-intensive and prone to error.\nWhile deep learning-based approaches have proven effective, they often require\nlarge annotated datasets, which are scarce due to the challenges of manual\nannotation. To overcome this, we propose a novel framework for synthesizing\ndensely annotated 2D and 3D cell microscopy images using cascaded diffusion\nmodels. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations\nusing multi-level diffusion models and NeuS, a 3D surface reconstruction\napproach. Following that, a pretrained 2D Stable Diffusion model is finetuned\nto generate realistic cell textures and the final outputs are combined to form\ncell populations. We show that training a segmentation model with a combination\nof our synthetic data and real data improves cell segmentation performance by\nup to 9\\% across multiple datasets. Additionally, the FID scores indicate that\nthe synthetic data closely resembles real data. 
The code for our proposed\napproach will be available at\nhttps://github.com/ruveydayilmaz0/cascaded_diffusion.\n","authors":["Rüveyda Yilmaz","Kaan Keven","Yuli Wu","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2411.11515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13383v2","updated":"2024-11-19T08:49:45Z","published":"2024-10-17T09:36:19Z","title":"Railway LiDAR semantic segmentation based on intelligent semi-automated\n data annotation","summary":" Automated vehicles rely on an accurate and robust perception of the\nenvironment. Similarly to automated cars, highly automated trains require an\nenvironmental perception. Although there is a lot of research based on either\ncamera or LiDAR sensors in the automotive domain, very few contributions for\nthis task exist yet for automated trains. Additionally, no public dataset or\ndescribed approach for a 3D LiDAR semantic segmentation in the railway\nenvironment exists yet. Thus, we propose an approach for a point-wise 3D\nsemantic segmentation based on the 2DPass network architecture using scans and\nimages jointly. In addition, we present a semi-automated intelligent data\nannotation approach, which we use to efficiently and accurately label the\nrequired dataset recorded on a railway track in Germany. To improve performance\ndespite a still small number of labeled scans, we apply an active learning\napproach to intelligently select scans for the training dataset. Our\ncontributions are threefold: We annotate rail data including camera and LiDAR\ndata from the railway environment, transfer label the raw LiDAR point clouds\nusing an image segmentation network, and train a state-of-the-art 3D LiDAR\nsemantic segmentation network efficiently leveraging active learning. 
The\ntrained network achieves good segmentation results with a mean IoU of 71.48% of\n9 classes.\n","authors":["Florian Wulff","Bernd Schaeufele","Julian Pfeifer","Ilja Radusch"],"pdf_url":"https://arxiv.org/pdf/2410.13383v2.pdf","comment":"This article has been accepted for publication in the IEEE VTC Fall\n 2024"},{"id":"http://arxiv.org/abs/2411.12338v1","updated":"2024-11-19T08:42:24Z","published":"2024-11-19T08:42:24Z","title":"Target Height Estimation Using a Single Acoustic Camera for Compensation\n in 2D Seabed Mosaicking","summary":" This letter proposes a novel approach for compensating target height data in\n2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras\nare effective sensors for sensing the marine environments due to their\nhigh-resolution imaging capabilities and robustness to darkness and turbidity.\nHowever, the loss of elevation angle during the imaging process results in a\nlack of target height information in the original acoustic camera images,\nleading to a simplistic 2D representation of the seabed mosaicking. In\nperceiving cluttered and unexplored marine environments, target height data is\ncrucial for avoiding collisions with marine robots. This study proposes a novel\napproach for estimating seabed target height using a single acoustic camera and\nintegrates height data into 2D seabed mosaicking to compensate for the missing\n3D dimension of seabed targets. Unlike classic methods that model the loss of\nelevation angle to achieve seabed 3D reconstruction, this study focuses on\nutilizing available acoustic cast shadow clues and simple sensor motion to\nquickly estimate target height. 
The feasibility of our proposal is verified\nthrough a water tank experiment and a simulation experiment.\n","authors":["Xiaoteng Zhou","Yusheng Wang","Katsunori Mizuno"],"pdf_url":"https://arxiv.org/pdf/2411.12338v1.pdf","comment":"8 pages, conference"},{"id":"http://arxiv.org/abs/2411.12331v1","updated":"2024-11-19T08:32:17Z","published":"2024-11-19T08:32:17Z","title":"Accelerating UMAP for Large-Scale Datasets Through Spectral Coarsening","summary":" This paper introduces an innovative approach to dramatically accelerate UMAP\nusing spectral data compression. The proposed method significantly reduces the\nsize of the dataset, preserving its essential manifold structure through an\nadvanced spectral compression technique. This allows UMAP to perform much\nfaster while maintaining the quality of its embeddings. Experiments on\nreal-world datasets, such as USPS, demonstrate the method's ability to achieve\nsubstantial data reduction without compromising embedding fidelity.\n","authors":["Yongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.12331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12321v1","updated":"2024-11-19T08:24:01Z","published":"2024-11-19T08:24:01Z","title":"Enhancing Blind Source Separation with Dissociative Principal Component\n Analysis","summary":" Sparse principal component analysis (sPCA) enhances the interpretability of\nprincipal components (PCs) by imposing sparsity constraints on loading vectors\n(LVs). However, when used as a precursor to independent component analysis\n(ICA) for blind source separation (BSS), sPCA may underperform due to its focus\non simplicity, potentially disregarding some statistical information essential\nfor effective ICA. To overcome this limitation, a sophisticated approach is\nproposed that preserves the interpretability advantages of sPCA while\nsignificantly enhancing its source extraction capabilities. 
This consists of\ntwo tailored algorithms, dissociative PCA (DPCA1 and DPCA2), which employ\nadaptive and firm thresholding alongside gradient and coordinate descent\napproaches to optimize the proposed model dynamically. These algorithms\nintegrate left and right singular vectors from singular value decomposition\n(SVD) through dissociation matrices (DMs) that replace traditional singular\nvalues, thus capturing latent interdependencies effectively to model complex\nsource relationships. This leads to refined PCs and LVs that more accurately\nrepresent the underlying data structure. The proposed approach avoids focusing\non individual eigenvectors, instead, it collaboratively combines multiple\neigenvectors to disentangle interdependencies within each SVD variate. The\nsuperior performance of the proposed DPCA algorithms is demonstrated across\nfour varied imaging applications including functional magnetic resonance\nimaging (fMRI) source retrieval, foreground-background separation, image\nreconstruction, and image inpainting. They outperformed traditional methods\nsuch as PCA+ICA, PPCA+ICA, SPCA+ICA, PMD, and GPower.\n","authors":["Muhammad Usman Khalid"],"pdf_url":"https://arxiv.org/pdf/2411.12321v1.pdf","comment":"1. 13 pages with 6 figures, this work has not been published before.\n 2. The paper is yet to be peer-reviewed and I am planning to submit it to\n IEEE Transactions on Image Processing. 3. There is no supplementary material.\n 4. There is no funding for this work as of now"},{"id":"http://arxiv.org/abs/2411.12319v1","updated":"2024-11-19T08:23:52Z","published":"2024-11-19T08:23:52Z","title":"CLIP Unreasonable Potential in Single-Shot Face Recognition","summary":" Face recognition is a core task in computer vision designed to identify and\nauthenticate individuals by analyzing facial patterns and features. 
This field\nintersects with artificial intelligence image processing and machine learning\nwith applications in security authentication and personalization. Traditional\napproaches in facial recognition focus on capturing facial features like the\neyes, nose and mouth and matching these against a database to verify identities.\nHowever, challenges such as high false positive rates have persisted often due\nto the similarity among individuals' facial features. Recently Contrastive\nLanguage Image Pretraining (CLIP) a model developed by OpenAI has shown\npromising advancements by linking natural language processing with vision tasks\nallowing it to generalize across modalities. Using CLIP's vision language\ncorrespondence and single-shot finetuning the model can achieve lower false\npositive rates upon deployment without the need of mass facial features\nextraction. This integration demonstrates CLIP's potential to address\npersistent issues in face recognition model performance without complicating\nour training paradigm.\n","authors":["Nhan T. Luu"],"pdf_url":"https://arxiv.org/pdf/2411.12319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19954v2","updated":"2024-11-19T08:17:30Z","published":"2024-09-30T05:19:09Z","title":"Domain Consistency Representation Learning for Lifelong Person\n Re-Identification","summary":" Lifelong person re-identification (LReID) exhibits a contradictory\nrelationship between intra-domain discrimination and inter-domain gaps when\nlearning from continuous data. Intra-domain discrimination focuses on\nindividual nuances (e.g. clothing type, accessories, etc.), while inter-domain\ngaps emphasize domain consistency. Achieving a trade-off between maximizing\nintra-domain discrimination and minimizing inter-domain gaps is a crucial\nchallenge for improving LReID performance. Most existing methods aim to reduce\ninter-domain gaps through knowledge distillation to maintain domain\nconsistency. 
However, they often ignore intra-domain discrimination. To address\nthis challenge, we propose a novel domain consistency representation learning\n(DCR) model that explores global and attribute-wise representations as a bridge\nto balance intra-domain discrimination and inter-domain gaps. At the\nintra-domain level, we explore the complementary relationship between global\nand attribute-wise representations to improve discrimination among similar\nidentities. Excessive learning intra-domain discrimination can lead to\ncatastrophic forgetting. We further develop an attribute-oriented\nanti-forgetting (AF) strategy that explores attribute-wise representations to\nenhance inter-domain consistency, and propose a knowledge consolidation (KC)\nstrategy to facilitate knowledge transfer. Extensive experiments show that our\nDCR model achieves superior performance compared to state-of-the-art LReID\nmethods. Our code will be available soon.\n","authors":["Shiben Liu","Qiang Wang","Huijie Fan","Weihong Ren","Baojie Fan","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2409.19954v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.14923v4","updated":"2024-11-19T08:02:15Z","published":"2024-07-20T16:23:57Z","title":"RayFormer: Improving Query-Based Multi-Camera 3D Object Detection via\n Ray-Centric Strategies","summary":" The recent advances in query-based multi-camera 3D object detection are\nfeatured by initializing object queries in the 3D space, and then sampling\nfeatures from perspective-view images to perform multi-round query refinement.\nIn such a framework, query points near the same camera ray are likely to sample\nsimilar features from very close pixels, resulting in ambiguous query features\nand degraded detection accuracy. To this end, we introduce RayFormer, a\ncamera-ray-inspired query-based 3D object detector that aligns the\ninitialization and feature extraction of object queries with the optical\ncharacteristics of cameras. 
Specifically, RayFormer transforms perspective-view\nimage features into bird's eye view (BEV) via the lift-splat-shoot method and\nsegments the BEV map to sectors based on the camera rays. Object queries are\nuniformly and sparsely initialized along each camera ray, facilitating the\nprojection of different queries onto different areas in the image to extract\ndistinct features. Besides, we leverage the instance information of images to\nsupplement the uniformly initialized object queries by further involving\nadditional queries along the ray from 2D object detection boxes. To extract\nunique object-level features that cater to distinct queries, we design a ray\nsampling method that suitably organizes the distribution of feature sampling\npoints on both images and bird's eye view. Extensive experiments are conducted\non the nuScenes dataset to validate our proposed ray-inspired model design. The\nproposed RayFormer achieves superior performance of 55.5% mAP and 63.3% NDS,\nrespectively.\n","authors":["Xiaomeng Chu","Jiajun Deng","Guoliang You","Yifan Duan","Yao Li","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.14923v4.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2411.12313v1","updated":"2024-11-19T08:01:20Z","published":"2024-11-19T08:01:20Z","title":"C$^{2}$INet: Realizing Incremental Trajectory Prediction with\n Prior-Aware Continual Causal Intervention","summary":" Trajectory prediction for multi-agents in complex scenarios is crucial for\napplications like autonomous driving. However, existing methods often overlook\nenvironmental biases, which leads to poor generalization. Additionally,\nhardware constraints limit the use of large-scale data across environments, and\ncontinual learning settings exacerbate the challenge of catastrophic\nforgetting. 
To address these issues, we propose the Continual Causal\nIntervention (C$^{2}$INet) method for generalizable multi-agent trajectory\nprediction within a continual learning framework. Using variational inference,\nwe align environment-related prior with posterior estimator of confounding\nfactors in the latent space, thereby intervening in causal correlations that\naffect trajectory representation. Furthermore, we store optimal variational\npriors across various scenarios using a memory queue, ensuring continuous\ndebiasing during incremental task training. The proposed C$^{2}$INet enhances\nadaptability to diverse tasks while preserving previous task information to\nprevent catastrophic forgetting. It also incorporates pruning strategies to\nmitigate overfitting. Comparative evaluations on three real and synthetic\ncomplex datasets against state-of-the-art methods demonstrate that our proposed\nmethod consistently achieves reliable prediction performance, effectively\nmitigating confounding factors unique to different scenarios. This highlights\nthe practical value of our method for real-world applications.\n","authors":["Xiaohe Li","Feilong Huang","Zide Fan","Fangli Mou","Leilei Lin","Yingyan Hou","Lijie Wen"],"pdf_url":"https://arxiv.org/pdf/2411.12313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11985v3","updated":"2024-11-19T07:59:39Z","published":"2024-05-20T12:35:01Z","title":"MTVQA: Benchmarking Multilingual Text-Centric Visual Question Answering","summary":" Text-Centric Visual Question Answering (TEC-VQA) in its proper format not\nonly facilitates human-machine interaction in text-centric visual environments\nbut also serves as a de facto gold proxy to evaluate AI models in the domain of\ntext-centric scene understanding. Nonetheless, most existing TEC-VQA benchmarks\nhave focused on high-resource languages like English and Chinese. 
Despite\npioneering works to expand multilingual QA pairs in non-text-centric VQA\ndatasets through translation engines, the translation-based protocol encounters\na substantial \"visual-textual misalignment\" problem when applied to TEC-VQA.\nSpecifically, it prioritizes the text in question-answer pairs while\ndisregarding the visual text present in images. Moreover, it fails to address\ncomplexities related to nuanced meaning, contextual distortion, language bias,\nand question-type diversity. In this work, we tackle multilingual TEC-VQA by\nintroducing MTVQA, the first benchmark featuring high-quality human expert\nannotations across 9 diverse languages, consisting of 6,778 question-answer\npairs across 2,116 images. Further, by comprehensively evaluating numerous\nstate-of-the-art Multimodal Large Language Models~(MLLMs), including Qwen2-VL,\nGPT-4o, GPT-4V, Claude3, and Gemini, on the MTVQA benchmark, it is evident that\nthere is still a large room for performance improvement (Qwen2-VL scoring 30.9\nversus 79.7 for human performance), underscoring the value of MTVQA.\nAdditionally, we supply multilingual training data within the MTVQA dataset,\ndemonstrating that straightforward fine-tuning with this data can substantially\nenhance multilingual TEC-VQA performance. We aspire that MTVQA will offer the\nresearch community fresh insights and stimulate further exploration in\nmultilingual visual text comprehension. 
The project homepage is available at\nhttps://bytedance.github.io/MTVQA/.\n","authors":["Jingqun Tang","Qi Liu","Yongjie Ye","Jinghui Lu","Shu Wei","Chunhui Lin","Wanqing Li","Mohamad Fitri Faiz Bin Mahmood","Hao Feng","Zhen Zhao","Yanjie Wang","Yuliang Liu","Hao Liu","Xiang Bai","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2405.11985v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12309v1","updated":"2024-11-19T07:51:44Z","published":"2024-11-19T07:51:44Z","title":"DGTR: Distributed Gaussian Turbo-Reconstruction for Sparse-View Vast\n Scenes","summary":" Novel-view synthesis (NVS) approaches play a critical role in vast scene\nreconstruction. However, these methods rely heavily on dense image inputs and\nprolonged training times, making them unsuitable where computational resources\nare limited. Additionally, few-shot methods often struggle with poor\nreconstruction quality in vast environments. This paper presents DGTR, a novel\ndistributed framework for efficient Gaussian reconstruction for sparse-view\nvast scenes. Our approach divides the scene into regions, processed\nindependently by drones with sparse image inputs. Using a feed-forward Gaussian\nmodel, we predict high-quality Gaussian primitives, followed by a global\nalignment algorithm to ensure geometric consistency. Synthetic views and depth\npriors are incorporated to further enhance training, while a distillation-based\nmodel aggregation mechanism enables efficient reconstruction. Our method\nachieves high-quality large-scale scene reconstruction and novel-view synthesis\nin significantly reduced training times, outperforming existing approaches in\nboth speed and scalability. We demonstrate the effectiveness of our framework\non vast aerial scenes, achieving high-quality results within minutes. 
Code will be\nreleased on our ![project page](https://3d-aigc.github.com/DGTR).\n","authors":["Hao Li","Yuanyuan Gao","Haosong Peng","Chenming Wu","Weicai Ye","Yufeng Zhan","Chen Zhao","Dingwen Zhang","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2411.12309v1.pdf","comment":"Code will be released on our ![project\n page](https://3d-aigc.github.com/DGTR)"},{"id":"http://arxiv.org/abs/2411.12306v1","updated":"2024-11-19T07:47:37Z","published":"2024-11-19T07:47:37Z","title":"Diffusion Product Quantization","summary":" In this work, we explore the quantization of diffusion models in extreme\ncompression regimes to reduce model size while maintaining performance. We\nbegin by investigating classical vector quantization but find that diffusion\nmodels are particularly susceptible to quantization error, with the codebook\nsize limiting generation quality. To address this, we introduce product\nquantization, which offers improved reconstruction precision and larger\ncapacity -- crucial for preserving the generative capabilities of diffusion\nmodels. Furthermore, we propose a method to compress the codebook by evaluating\nthe importance of each vector and removing redundancy, ensuring the model size\nremains within the desired range. We also introduce an end-to-end calibration\napproach that adjusts assignments during the forward pass and optimizes the\ncodebook using the DDPM loss. By compressing the model to as low as 1 bit\n(resulting in over 24 times reduction in model size), we achieve a balance\nbetween compression and quality. 
We apply our compression method to the DiT\nmodel on ImageNet and consistently outperform other quantization approaches,\ndemonstrating competitive generative performance.\n","authors":["Jie Shao","Hanxiao Zhang","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2411.12306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13999v2","updated":"2024-11-19T07:45:30Z","published":"2024-05-22T21:15:03Z","title":"Computer-Vision-Enabled Worker Video Analysis for Motion Amount\n Quantification","summary":" The performance of physical workers is significantly influenced by the extent\nof their motions. However, monitoring and assessing these motions remains a\nchallenge. Recent advancements have enabled in-situ video analysis for\nreal-time observation of worker behaviors. This paper introduces a novel\nframework for tracking and quantifying upper and lower limb motions, issuing\nalerts when critical thresholds are reached. Using joint position data from\nposture estimation, the framework employs Hotelling's $T^2$ statistic to\nquantify and monitor motion amounts. The results indicate that the correlation\nbetween workers' joint motion amounts and Hotelling's $T^2$ statistic is\napproximately 35\\% higher for micro-tasks than macro-tasks, demonstrating the\nframework's ability to detect fine-grained motion differences. 
This study\nhighlights the proposed system's effectiveness in real-time applications across\nvarious industry settings, providing a valuable tool for precision motion\nanalysis and proactive ergonomic adjustments.\n","authors":["Hari Iyer","Neel Macwan","Shenghan Guo","Heejin Jeong"],"pdf_url":"https://arxiv.org/pdf/2405.13999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06173v2","updated":"2024-11-19T07:42:32Z","published":"2024-11-09T13:03:54Z","title":"LSSInst: Improving Geometric Modeling in LSS-Based BEV Perception with\n Instance Representation","summary":" With the attention gained by camera-only 3D object detection in autonomous\ndriving, methods based on Bird-Eye-View (BEV) representation especially derived\nfrom the forward view transformation paradigm, i.e., lift-splat-shoot (LSS),\nhave recently seen significant progress. The BEV representation formulated by\nthe frustum based on depth distribution prediction is ideal for learning the\nroad structure and scene layout from multi-view images. However, to retain\ncomputational efficiency, the compressed BEV representation such as in\nresolution and axis is inevitably weak in retaining the individual geometric\ndetails, undermining the methodological generality and applicability. With this\nin mind, to compensate for the missing details and utilize multi-view geometry\nconstraints, we propose LSSInst, a two-stage object detector incorporating BEV\nand instance representations in tandem. The proposed detector exploits\nfine-grained pixel-level features that can be flexibly integrated into existing\nLSS-based BEV networks. Having said that, due to the inherent gap between two\nrepresentation spaces, we design the instance adaptor for the BEV-to-instance\nsemantic coherence rather than pass the proposal naively. 
Extensive experiments\ndemonstrated that our proposed framework is of excellent generalization ability\nand performance, which boosts the performances of modern LSS-based BEV\nperception methods without bells and whistles and outperforms current LSS-based\nstate-of-the-art works on the large-scale nuScenes benchmark.\n","authors":["Weijie Ma","Jingwei Jiang","Yang Yang","Zehui Chen","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.06173v2.pdf","comment":"Accepted by 3DV 2025"},{"id":"http://arxiv.org/abs/2411.12301v1","updated":"2024-11-19T07:41:09Z","published":"2024-11-19T07:41:09Z","title":"Physics-Guided Detector for SAR Airplanes","summary":" The disperse structure distributions (discreteness) and variant scattering\ncharacteristics (variability) of SAR airplane targets lead to special\nchallenges of object detection and recognition. The current deep learning-based\ndetectors encounter challenges in distinguishing fine-grained SAR airplanes\nagainst complex backgrounds. To address it, we propose a novel physics-guided\ndetector (PGD) learning paradigm for SAR airplanes that comprehensively\ninvestigate their discreteness and variability to improve the detection\nperformance. It is a general learning paradigm that can be extended to\ndifferent existing deep learning-based detectors with \"backbone-neck-head\"\narchitectures. The main contributions of PGD include the physics-guided\nself-supervised learning, feature enhancement, and instance perception, denoted\nas PGSSL, PGFE, and PGIP, respectively. PGSSL aims to construct a\nself-supervised learning task based on a wide range of SAR airplane targets\nthat encodes the prior knowledge of various discrete structure distributions\ninto the embedded space. Then, PGFE enhances the multi-scale feature\nrepresentation of a detector, guided by the physics-aware information learned\nfrom PGSSL. 
PGIP is constructed at the detection head to learn the refined and\ndominant scattering point of each SAR airplane instance, thus alleviating the\ninterference from the complex background. We propose two implementations,\ndenoted as PGD and PGD-Lite, and apply them to various existing detectors with\ndifferent backbones and detection heads. The experiments demonstrate the\nflexibility and effectiveness of the proposed PGD, which can improve existing\ndetectors on SAR airplane detection with fine-grained classification task (an\nimprovement of 3.1\\% mAP most), and achieve the state-of-the-art performance\n(90.7\\% mAP) on SAR-AIRcraft-1.0 dataset. The project is open-source at\n\\url{https://github.com/XAI4SAR/PGD}.\n","authors":["Zhongling Huang","Long Liu","Shuxin Yang","Zhirui Wang","Gong Cheng","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2411.12301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12293v1","updated":"2024-11-19T07:26:30Z","published":"2024-11-19T07:26:30Z","title":"Generative Timelines for Instructed Visual Assembly","summary":" The objective of this work is to manipulate visual timelines (e.g. a video)\nthrough natural language instructions, making complex timeline editing tasks\naccessible to non-expert or potentially even disabled users. We call this task\nInstructed visual assembly. This task is challenging as it requires (i)\nidentifying relevant visual content in the input timeline as well as retrieving\nrelevant visual content in a given input (video) collection, (ii) understanding\nthe input natural language instruction, and (iii) performing the desired edits\nof the input visual timeline to produce an output timeline. To address these\nchallenges, we propose the Timeline Assembler, a generative model trained to\nperform instructed visual assembly tasks. The contributions of this work are\nthree-fold. 
First, we develop a large multimodal language model, which is\ndesigned to process visual content, compactly represent timelines and\naccurately interpret timeline editing instructions. Second, we introduce a\nnovel method for automatically generating datasets for visual assembly tasks,\nenabling efficient training of our model without the need for human-labeled\ndata. Third, we validate our approach by creating two novel datasets for image\nand video assembly, demonstrating that the Timeline Assembler substantially\noutperforms established baseline models, including the recent GPT-4o, in\naccurately executing complex assembly instructions across various real-world\ninspired scenarios.\n","authors":["Alejandro Pardo","Jui-Hsien Wang","Bernard Ghanem","Josef Sivic","Bryan Russell","Fabian Caba Heilbron"],"pdf_url":"https://arxiv.org/pdf/2411.12293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11396v2","updated":"2024-11-19T07:20:03Z","published":"2024-11-18T09:18:36Z","title":"Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face\n Forgery Detection","summary":" The rapid advancement of face forgery techniques has introduced a growing\nvariety of forgeries. Incremental Face Forgery Detection (IFFD), involving\ngradually adding new forgery data to fine-tune the previously trained model,\nhas been introduced as a promising strategy to deal with evolving forgery\nmethods. However, a naively trained IFFD model is prone to catastrophic\nforgetting when new forgeries are integrated, as treating all forgeries as a\nsingle ''Fake\" class in the Real/Fake classification can cause different\nforgery types overriding one another, thereby resulting in the forgetting of\nunique characteristics from earlier tasks and limiting the model's\neffectiveness in learning forgery specificity and generality. 
In this paper, we\npropose to stack the latent feature distributions of previous and new tasks\nbrick by brick, $\\textit{i.e.}$, achieving $\\textbf{aligned feature\nisolation}$. In this manner, we aim to preserve learned forgery information and\naccumulate new knowledge by minimizing distribution overriding, thereby\nmitigating catastrophic forgetting. To achieve this, we first introduce Sparse\nUniform Replay (SUR) to obtain the representative subsets that could be treated\nas the uniformly sparse versions of the previous global distributions. We then\npropose a Latent-space Incremental Detector (LID) that leverages SUR data to\nisolate and align distributions. For evaluation, we construct a more advanced\nand comprehensive benchmark tailored for IFFD. The leading experimental results\nvalidate the superiority of our method.\n","authors":["Jikang Cheng","Zhiyuan Yan","Ying Zhang","Li Hao","Jiaxin Ai","Qin Zou","Chen Li","Zhongyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11396v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12290v1","updated":"2024-11-19T07:19:05Z","published":"2024-11-19T07:19:05Z","title":"SSEditor: Controllable Mask-to-Scene Generation with Diffusion Model","summary":" Recent advancements in 3D diffusion-based semantic scene generation have\ngained attention. However, existing methods rely on unconditional generation\nand require multiple resampling steps when editing scenes, which significantly\nlimits their controllability and flexibility. To this end, we propose SSEditor,\na controllable Semantic Scene Editor that can generate specified target\ncategories without multiple-step resampling. SSEditor employs a two-stage\ndiffusion-based framework: (1) a 3D scene autoencoder is trained to obtain\nlatent triplane features, and (2) a mask-conditional diffusion model is trained\nfor customizable 3D semantic scene generation. 
In the second stage, we\nintroduce a geometric-semantic fusion module that enhance the model's ability\nto learn geometric and semantic information. This ensures that objects are\ngenerated with correct positions, sizes, and categories. Extensive experiments\non SemanticKITTI and CarlaSC demonstrate that SSEditor outperforms previous\napproaches in terms of controllability and flexibility in target generation, as\nwell as the quality of semantic scene generation and reconstruction. More\nimportantly, experiments on the unseen Occ-3D Waymo dataset show that SSEditor\nis capable of generating novel urban scenes, enabling the rapid construction of\n3D scenes.\n","authors":["Haowen Zheng","Yanyan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.12290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12286v1","updated":"2024-11-19T07:12:48Z","published":"2024-11-19T07:12:48Z","title":"GLOVER: Generalizable Open-Vocabulary Affordance Reasoning for\n Task-Oriented Grasping","summary":" Inferring affordable (i.e., graspable) parts of arbitrary objects based on\nhuman specifications is essential for robots advancing toward open-vocabulary\nmanipulation. Current grasp planners, however, are hindered by limited\nvision-language comprehension and time-consuming 3D radiance modeling,\nrestricting real-time, open-vocabulary interactions with objects. To address\nthese limitations, we propose GLOVER, a unified Generalizable Open-Vocabulary\nAffordance Reasoning framework, which fine-tunes the Large Language Models\n(LLMs) to predict visual affordance of graspable object parts within RGB\nfeature space. We compile a dataset of over 10,000 images from human-object\ninteractions, annotated with unified visual and linguistic affordance labels,\nto enable multi-modal fine-tuning. GLOVER inherits world knowledge and\ncommon-sense reasoning from LLMs, facilitating more fine-grained object\nunderstanding and sophisticated tool-use reasoning. 
To enable effective\nreal-world deployment, we present Affordance-Aware Grasping Estimation (AGE), a\nnon-parametric grasp planner that aligns the gripper pose with a superquadric\nsurface derived from affordance data. In evaluations across 30 real-world\nscenes, GLOVER achieves success rates of 86.0% in part identification and 76.3%\nin grasping, with speeds approximately 330 times faster in affordance reasoning\nand 40 times faster in grasping pose estimation than the previous\nstate-of-the-art.\n","authors":["Teli Ma","Zifan Wang","Jiaming Zhou","Mengmeng Wang","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2411.12286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12279v1","updated":"2024-11-19T06:57:45Z","published":"2024-11-19T06:57:45Z","title":"HouseLLM: LLM-Assisted Two-Phase Text-to-Floorplan Generation","summary":" This paper proposes a two-phase text-to-floorplan generation method, which\nguides a Large Language Model (LLM) to generate an initial layout (Layout-LLM)\nand refines them into the final floorplans through conditional diffusion model.\nWe incorporate a Chain-of-Thought approach to prompt the LLM based on user text\nspecifications, enabling a more user-friendly and intuitive house layout\ndesign. This method allows users to describe their needs in natural language,\nenhancing accessibility and providing clearer geometric constraints. The final\nfloorplans generated by Layout-LLM through conditional diffusion refinement are\nmore accurate and better meet user requirements. Experimental results\ndemonstrate that our approach achieves state-of-the-art performance across all\nmetrics, validating its effectiveness in practical home design applications. 
We\nplan to release our code for public use.\n","authors":["Ziyang Zong","Zhaohuan Zhan","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2411.12279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12278v1","updated":"2024-11-19T06:57:16Z","published":"2024-11-19T06:57:16Z","title":"Versatile Cataract Fundus Image Restoration Model Utilizing Unpaired\n Cataract and High-quality Images","summary":" Cataract is one of the most common blinding eye diseases and can be treated\nby surgery. However, because cataract patients may also suffer from other\nblinding eye diseases, ophthalmologists must diagnose them before surgery. The\ncloudy lens of cataract patients forms a hazy degeneration in the fundus\nimages, making it challenging to observe the patient's fundus vessels, which\nbrings difficulties to the diagnosis process. To address this issue, this paper\nestablishes a new cataract image restoration method named Catintell. It\ncontains a cataract image synthesizing model, Catintell-Syn, and a restoration\nmodel, Catintell-Res. Catintell-Syn uses GAN architecture with fully\nunsupervised data to generate paired cataract-like images with realistic style\nand texture rather than the conventional Gaussian degradation algorithm.\nMeanwhile, Catintell-Res is an image restoration network that can improve the\nquality of real cataract fundus images using the knowledge learned from\nsynthetic cataract images. Extensive experiments show that Catintell-Res\noutperforms other cataract image restoration methods in PSNR with 39.03 and\nSSIM with 0.9476. Furthermore, the universal restoration ability that\nCatintell-Res gained from unpaired cataract images can process cataract images\nfrom various datasets. 
We hope the models can help ophthalmologists identify\nother blinding eye diseases of cataract patients and inspire more medical image\nrestoration methods in the future.\n","authors":["Zheng Gong","Zhuo Deng","Weihao Gao","Wenda Zhou","Yuhang Yang","Hanqing Zhao","Zhiyuan Niu","Lei Shao","Wenbin Wei","Lan Ma"],"pdf_url":"https://arxiv.org/pdf/2411.12278v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.12276v1","updated":"2024-11-19T06:56:24Z","published":"2024-11-19T06:56:24Z","title":"libcll: an Extendable Python Toolkit for Complementary-Label Learning","summary":" Complementary-label learning (CLL) is a weakly supervised learning paradigm\nfor multiclass classification, where only complementary labels -- indicating\nclasses an instance does not belong to -- are provided to the learning\nalgorithm. Despite CLL's increasing popularity, previous studies highlight two\nmain challenges: (1) inconsistent results arising from varied assumptions on\ncomplementary label generation, and (2) high barriers to entry due to the lack\nof a standardized evaluation platform across datasets and algorithms. To\naddress these challenges, we introduce \\texttt{libcll}, an extensible Python\ntoolkit for CLL research. \\texttt{libcll} provides a universal interface that\nsupports a wide range of generation assumptions, both synthetic and real-world\ndatasets, and key CLL algorithms. The toolkit is designed to mitigate\ninconsistencies and streamline the research process, with easy installation,\ncomprehensive usage guides, and quickstart tutorials that facilitate efficient\nadoption and implementation of CLL techniques. 
Extensive ablation studies\nconducted with \\texttt{libcll} demonstrate its utility in generating valuable\ninsights to advance future CLL research.\n","authors":["Nai-Xuan Ye","Tan-Ha Mai","Hsiu-Hsuan Wang","Wei-I Lin","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2411.12276v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.12273v1","updated":"2024-11-19T06:52:28Z","published":"2024-11-19T06:52:28Z","title":"Acquire Precise and Comparable Fundus Image Quality Score: FTHNet and\n FQS Dataset","summary":" The retinal fundus images are utilized extensively in the diagnosis, and\ntheir quality can directly affect the diagnosis results. However, due to the\ninsufficient dataset and algorithm application, current fundus image quality\nassessment (FIQA) methods are not powerful enough to meet ophthalmologists`\ndemands. In this paper, we address the limitations of datasets and algorithms\nin FIQA. First, we establish a new FIQA dataset, Fundus Quality Score(FQS),\nwhich includes 2246 fundus images with two labels: a continuous Mean Opinion\nScore varying from 0 to 100 and a three-level quality label. Then, we propose a\nFIQA Transformer-based Hypernetwork (FTHNet) to solve these tasks with\nregression results rather than classification results in conventional FIQA\nworks. The FTHNet is optimized for the FIQA tasks with extensive experiments.\nResults on our FQS dataset show that the FTHNet can give quality scores for\nfundus images with PLCC of 0.9423 and SRCC of 0.9488, significantly\noutperforming other methods with fewer parameters and less computation\ncomplexity.We successfully build a dataset and model addressing the problems of\ncurrent FIQA methods. Furthermore, the model deployment experiments demonstrate\nits potential in automatic medical image quality control. 
All experiments are\ncarried out with 10-fold cross-validation to ensure the significance of the\nresults.\n","authors":["Zheng Gong","Zhuo Deng","Run Gan","Zhiyuan Niu","Lu Chen","Canfeng Huang","Jia Liang","Weihao Gao","Fang Li","Shaochong Zhang","Lan Ma"],"pdf_url":"https://arxiv.org/pdf/2411.12273v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.12270v1","updated":"2024-11-19T06:47:56Z","published":"2024-11-19T06:47:56Z","title":"KDC-MAE: Knowledge Distilled Contrastive Mask Auto-Encoder","summary":" In this work, we attempted to extend the thought and showcase a way forward\nfor the Self-supervised Learning (SSL) learning paradigm by combining\ncontrastive learning, self-distillation (knowledge distillation) and masked\ndata modelling, the three major SSL frameworks, to learn a joint and\ncoordinated representation. The proposed technique of SSL learns by the\ncollaborative power of different learning objectives of SSL. Hence to jointly\nlearn the different SSL objectives we proposed a new SSL architecture KDC-MAE,\na complementary masking strategy to learn the modular correspondence, and a\nweighted way to combine them coordinately. Experimental results conclude that\nthe contrastive masking correspondence along with the KD learning objective has\nlent a hand to performing better learning for multiple modalities over multiple\ntasks.\n","authors":["Maheswar Bora","Saurabh Atreya","Aritra Mukherjee","Abhijit Das"],"pdf_url":"https://arxiv.org/pdf/2411.12270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11287v2","updated":"2024-11-19T06:36:59Z","published":"2024-08-21T02:19:54Z","title":"Taming Generative Diffusion Prior for Universal Blind Image Restoration","summary":" Diffusion models have been widely utilized for image restoration. However,\nprevious blind image restoration methods still need to assume the type of\ndegradation model while leaving the parameters to be optimized, limiting their\nreal-world applications. 
Therefore, we aim to tame generative diffusion prior\nfor universal blind image restoration dubbed BIR-D, which utilizes an\noptimizable convolutional kernel to simulate the degradation model and\ndynamically update the parameters of the kernel in the diffusion steps,\nenabling it to achieve blind image restoration results even in various complex\nsituations. Besides, based on mathematical reasoning, we have provided an\nempirical formula for the chosen of adaptive guidance scale, eliminating the\nneed for a grid search for the optimal parameter. Experimentally, Our BIR-D has\ndemonstrated superior practicality and versatility than off-the-shelf\nunsupervised methods across various tasks both on real-world and synthetic\ndatasets, qualitatively and quantitatively. BIR-D is able to fulfill\nmulti-guidance blind image restoration. Moreover, BIR-D can also restore images\nthat undergo multiple and complicated degradations, demonstrating the practical\napplications.\n","authors":["Siwei Tu","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2408.11287v2.pdf","comment":"15 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2411.12259v1","updated":"2024-11-19T06:17:25Z","published":"2024-11-19T06:17:25Z","title":"Prototype Optimization with Neural ODE for Few-Shot Learning","summary":" Few-Shot Learning (FSL) is a challenging task, which aims to recognize novel\nclasses with few examples. Pre-training based methods effectively tackle the\nproblem by pre-training a feature extractor and then performing class\nprediction via a cosine classifier with mean-based prototypes. Nevertheless,\ndue to the data scarcity, the mean-based prototypes are usually biased. In this\npaper, we attempt to diminish the prototype bias by regarding it as a prototype\noptimization problem. To this end, we propose a novel prototype optimization\nframework to rectify prototypes, i.e., introducing a meta-optimizer to optimize\nprototypes. 
Although the existing meta-optimizers can also be adapted to our\nframework, they all overlook a crucial gradient bias issue, i.e., the\nmean-based gradient estimation is also biased on sparse data. To address this\nissue, in this paper, we regard the gradient and its flow as meta-knowledge and\nthen propose a novel Neural Ordinary Differential Equation (ODE)-based\nmeta-optimizer to optimize prototypes, called MetaNODE. Although MetaNODE has\nshown superior performance, it suffers from a huge computational burden. To\nfurther improve its computation efficiency, we conduct a detailed analysis on\nMetaNODE and then design an effective and efficient MetaNODE extension version\n(called E2MetaNODE). It consists of two novel modules: E2GradNet and E2Solver,\nwhich aim to estimate accurate gradient flows and solve optimal prototypes in\nan effective and efficient manner, respectively. Extensive experiments show\nthat 1) our methods achieve superior performance over previous FSL methods and\n2) our E2MetaNODE significantly improves computation efficiency meanwhile\nwithout performance degradation.\n","authors":["Baoquan Zhang","Shanshan Feng","Bingqi Shan","Xutao Li","Yunming Ye","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.12259v1.pdf","comment":"An extended version of metanode: prototype optimization as a neural\n ode for few-shot learning. arXiv admin note: text overlap with\n arXiv:2103.14341"},{"id":"http://arxiv.org/abs/2404.06842v3","updated":"2024-11-19T06:09:48Z","published":"2024-04-10T09:14:28Z","title":"MoCha-Stereo: Motif Channel Attention Network for Stereo Matching","summary":" Learning-based stereo matching techniques have made significant progress.\nHowever, existing methods inevitably lose geometrical structure information\nduring the feature channel generation process, resulting in edge detail\nmismatches. In this paper, the Motif Cha}nnel Attention Stereo Matching Network\n(MoCha-Stereo) is designed to address this problem. 
We provide the Motif\nChannel Correlation Volume (MCCV) to determine more accurate edge matching\ncosts. MCCV is achieved by projecting motif channels, which capture common\ngeometric structures in feature channels, onto feature maps and cost volumes.\nIn addition, edge variations in %potential feature channels of the\nreconstruction error map also affect details matching, we propose the\nReconstruction Error Motif Penalty (REMP) module to further refine the\nfull-resolution disparity estimation. REMP integrates the frequency information\nof typical channel features from the reconstruction error. MoCha-Stereo ranks\n1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure\nalso shows excellent performance in Multi-View Stereo. Code is avaliable at\nhttps://github.com/ZYangChen/MoCha-Stereo.\n","authors":["Ziyang Chen","Wei Long","He Yao","Yongjun Zhang","Bingshu Wang","Yongbin Qin","Jia Wu"],"pdf_url":"https://arxiv.org/pdf/2404.06842v3.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2208.09315v2","updated":"2024-11-19T05:57:57Z","published":"2022-08-19T12:59:46Z","title":"Self-Supervised Place Recognition by Refining Temporal and Featural\n Pseudo Labels from Panoramic Data","summary":" Visual place recognition (VPR) using deep networks has achieved\nstate-of-the-art performance. However, most of them require a training set with\nground truth sensor poses to obtain positive and negative samples of each\nobservation's spatial neighborhood for supervised learning. When such\ninformation is unavailable, temporal neighborhoods from a sequentially\ncollected data stream could be exploited for self-supervised training, although\nwe find its performance suboptimal. Inspired by noisy label learning, we\npropose a novel self-supervised framework named TF-VPR that uses temporal\nneighborhoods and learnable feature neighborhoods to discover unknown spatial\nneighborhoods. 
Our method follows an iterative training paradigm which\nalternates between: (1) representation learning with data augmentation, (2)\npositive set expansion to include the current feature space neighbors, and (3)\npositive set contraction via geometric verification. We conduct auto-labeling\nand generalization tests on both simulated and real datasets, with either RGB\nimages or point clouds as inputs. The results show that our method outperforms\nself-supervised baselines in recall rate, robustness, and heading diversity, a\nnovel metric we propose for VPR. Our code and datasets can be found at\nhttps://ai4ce.github.io/TF-VPR/\n","authors":["Chao Chen","Xinhao Liu","Xuchu Xu","Yiming Li","Li Ding","Ruoyu Wang","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2208.09315v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12250v1","updated":"2024-11-19T05:52:51Z","published":"2024-11-19T05:52:51Z","title":"ADV2E: Bridging the Gap Between Analogue Circuit and Discrete Frames in\n the Video-to-Events Simulator","summary":" Event cameras operate fundamentally differently from traditional Active Pixel\nSensor (APS) cameras, offering significant advantages. Recent research has\ndeveloped simulators to convert video frames into events, addressing the\nshortage of real event datasets. Current simulators primarily focus on the\nlogical behavior of event cameras. However, the fundamental analogue properties\nof pixel circuits are seldom considered in simulator design. The gap between\nanalogue pixel circuit and discrete video frames causes the degeneration of\nsynthetic events, particularly in high-contrast scenes. In this paper, we\npropose a novel method of generating reliable event data based on a detailed\nanalysis of the pixel circuitry in event cameras. 
We incorporate the analogue\nproperties of event camera pixel circuits into the simulator design: (1)\nanalogue filtering of signals from light intensity to events, and (2) a cutoff\nfrequency that is independent of video frame rate. Experimental results on two\nrelevant tasks, including semantic segmentation and image reconstruction,\nvalidate the reliability of simulated event data, even in high-contrast scenes.\nThis demonstrates that deep neural networks exhibit strong generalization from\nsimulated to real event data, confirming that the synthetic events generated by\nthe proposed method are both realistic and well-suited for effective training.\n","authors":["Xiao Jiang","Fei Zhou","Jiongzhi Lin"],"pdf_url":"https://arxiv.org/pdf/2411.12250v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12248v1","updated":"2024-11-19T05:52:17Z","published":"2024-11-19T05:52:17Z","title":"Neuro-3D: Towards 3D Visual Decoding from EEG Signals","summary":" Human's perception of the visual world is shaped by the stereo processing of\n3D information. Understanding how the brain perceives and processes 3D visual\nstimuli in the real world has been a longstanding endeavor in neuroscience.\nTowards this goal, we introduce a new neuroscience task: decoding 3D visual\nperception from EEG signals, a neuroimaging technique that enables real-time\nmonitoring of neural dynamics enriched with complex visual cues. To provide the\nessential benchmark, we first present EEG-3D, a pioneering dataset featuring\nmultimodal analysis data and extensive EEG recordings from 12 subjects viewing\n72 categories of 3D objects rendered in both videos and images. Furthermore, we\npropose Neuro-3D, a 3D visual decoding framework based on EEG signals. 
This\nframework adaptively integrates EEG features derived from static and dynamic\nstimuli to learn complementary and robust neural representations, which are\nsubsequently utilized to recover both the shape and color of 3D objects through\nthe proposed diffusion-based colored point cloud decoder. To the best of our\nknowledge, we are the first to explore EEG-based 3D visual decoding.\nExperiments indicate that Neuro-3D not only reconstructs colored 3D objects\nwith high fidelity, but also learns effective neural representations that\nenable insightful brain region analysis. The dataset and associated code will\nbe made publicly available.\n","authors":["Zhanqiang Guo","Jiamin Wu","Yonghao Song","Weijian Mai","Qihao Zheng","Wanli Ouyang","Chunfeng Song"],"pdf_url":"https://arxiv.org/pdf/2411.12248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09766v2","updated":"2024-11-19T05:43:22Z","published":"2024-11-14T19:22:36Z","title":"NACNet: A Histology Context-aware Transformer Graph Convolution Network\n for Predicting Treatment Response to Neoadjuvant Chemotherapy in Triple\n Negative Breast Cancer","summary":" Neoadjuvant chemotherapy (NAC) response prediction for triple negative breast\ncancer (TNBC) patients is a challenging task clinically as it requires\nunderstanding complex histology interactions within the tumor microenvironment\n(TME). Digital whole slide images (WSIs) capture detailed tissue information,\nbut their giga-pixel size necessitates computational methods based on multiple\ninstance learning, which typically analyze small, isolated image tiles without\nthe spatial context of the TME. To address this limitation and incorporate TME\nspatial histology interactions in predicting NAC response for TNBC patients, we\ndeveloped a histology context-aware transformer graph convolution network\n(NACNet). 
Our deep learning method identifies the histopathological labels on\nindividual image tiles from WSIs, constructs a spatial TME graph, and\nrepresents each node with features derived from tissue texture and social\nnetwork analysis. It predicts NAC response using a transformer graph\nconvolution network model enhanced with graph isomorphism network layers. We\nevaluate our method with WSIs of a cohort of TNBC patient (N=105) and compared\nits performance with multiple state-of-the-art machine learning and deep\nlearning models, including both graph and non-graph approaches. Our NACNet\nachieves 90.0% accuracy, 96.0% sensitivity, 88.0% specificity, and an AUC of\n0.82, through eight-fold cross-validation, outperforming baseline models. These\ncomprehensive experimental results suggest that NACNet holds strong potential\nfor stratifying TNBC patients by NAC response, thereby helping to prevent\novertreatment, improve patient quality of life, reduce treatment cost, and\nenhance clinical outcomes, marking an important advancement toward personalized\nbreast cancer treatment.\n","authors":["Qiang Li","George Teodoro","Yi Jiang","Jun Kong"],"pdf_url":"https://arxiv.org/pdf/2411.09766v2.pdf","comment":"This paper is accepted by Computerized Medical Imaging and Graphics\n (Nov 07 2024)"},{"id":"http://arxiv.org/abs/2409.05442v3","updated":"2024-11-19T04:43:30Z","published":"2024-09-09T08:46:45Z","title":"EndoOmni: Zero-Shot Cross-Dataset Depth Estimation in Endoscopy by\n Robust Self-Learning from Noisy Labels","summary":" Single-image depth estimation is essential for endoscopy tasks such as\nlocalization, reconstruction, and augmented reality. Most existing methods in\nsurgical scenes focus on in-domain depth estimation, limiting their real-world\napplicability. This constraint stems from the scarcity and inferior labeling\nquality of medical data for training. 
In this work, we present EndoOmni, the\nfirst foundation model for zero-shot cross-domain depth estimation for\nendoscopy. To harness the potential of diverse training data, we refine the\nadvanced self-learning paradigm that employs a teacher model to generate\npseudo-labels, guiding a student model trained on large-scale labeled and\nunlabeled data. To address training disturbance caused by inherent noise in\ndepth labels, we propose a robust training framework that leverages both depth\nlabels and estimated confidence from the teacher model to jointly guide the\nstudent model training. Moreover, we propose a weighted scale-and-shift\ninvariant loss to adaptively adjust learning weights based on label confidence,\nthus imposing learning bias towards cleaner label pixels while reducing the\ninfluence of highly noisy pixels. Experiments on zero-shot relative depth\nestimation show that our EndoOmni improves state-of-the-art methods in medical\nimaging for 33\\% and existing foundation models for 34\\% in terms of absolute\nrelative error on specific datasets. Furthermore, our model provides strong\ninitialization for fine-tuning metric depth estimation, maintaining superior\nperformance in both in-domain and out-of-domain scenarios. The source code is\npublicly available at https://github.com/TianCuteQY/EndoOmni.\n","authors":["Qingyao Tian","Zhen Chen","Huai Liao","Xinyan Huang","Lujie Li","Sebastien Ourselin","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06486v2","updated":"2024-11-19T04:31:31Z","published":"2024-11-10T14:59:29Z","title":"DDIM-Driven Coverless Steganography Scheme with Real Key","summary":" Typical steganography embeds secret information into images by exploiting\ntheir redundancy. Since the visual imperceptibility of secret information is a\nkey factor in scheme evaluation, conventional methods aim to balance this\nrequirement with embedding capacity. 
Consequently, integrating emerging image\ngeneration models and secret transmission has been extensively explored to\nachieve a higher embedding capacity. Previous works mostly focus on generating\nstego-images with Generative Adversarial Networks (GANs) and usually rely on\npseudo-keys, namely conditions or parameters involved in the generation\nprocess, which are related to secret images. However, studies on\ndiffusion-based coverless steganography remain insufficient. In this work, we\nleverage the Denoising Diffusion Implicit Model (DDIM) to generate high-quality\nstego-images without introducing pseudo-keys, instead employing real keys to\nenhance security. Furthermore, our method offers low-image-correlation real-key\nprotection by incorporating chaotic encryption. Another core innovation is that\nour method requires only one-time negotiation for multiple communications,\nunlike prior methods that necessitate negotiation for each interaction.\n","authors":["Mingyu Yu","Haonan Miao","Zhengping Jin","Sujuan Qin"],"pdf_url":"https://arxiv.org/pdf/2411.06486v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10060v4","updated":"2024-11-19T03:58:40Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. It includes 1,000 images with human\nlabels and 50,000 images with automatically generated weak labels. This dataset\ncould serve as a foundation for the research community to develop advanced\nwrinkle detection algorithms. 
Second, we introduce a simple training strategy\nutilizing texture maps, applicable to various segmentation models, to detect\nwrinkles across the face. Our two-stage training strategy first pretrain models\non a large dataset with weak labels (N=50k), or masked texture maps generated\nthrough computer vision techniques, without human intervention. We then\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. The network takes as input a combination of RGB and\nmasked texture map of the image, comprising four channels, in finetuning. We\neffectively combine labels from multiple annotators to minimize subjectivity in\nmanual labeling. Our strategies demonstrate improved segmentation performance\nin facial wrinkle segmentation both quantitatively and visually compared to\nexisting pretraining methods. The dataset is available at\nhttps://github.com/labhai/ffhq-wrinkle-dataset.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v4.pdf","comment":"Accepted at International Conference on Pattern Recognition (ICPR),\n 2024"},{"id":"http://arxiv.org/abs/2411.12201v1","updated":"2024-11-19T03:39:43Z","published":"2024-11-19T03:39:43Z","title":"Invariant Shape Representation Learning For Image Classification","summary":" Geometric shape features have been widely used as strong predictors for image\nclassification. Nevertheless, most existing classifiers such as deep neural\nnetworks (DNNs) directly leverage the statistical correlations between these\nshape features and target variables. 
However, these correlations can often be\nspurious and unstable across different environments (e.g., in different age\ngroups, certain types of brain changes have unstable relations with\nneurodegenerative disease); hence leading to biased or inaccurate predictions.\nIn this paper, we introduce a novel framework that for the first time develops\ninvariant shape representation learning (ISRL) to further strengthen the\nrobustness of image classifiers. In contrast to existing approaches that mainly\nderive features in the image space, our model ISRL is designed to jointly\ncapture invariant features in latent shape spaces parameterized by deformable\ntransformations. To achieve this goal, we develop a new learning paradigm based\non invariant risk minimization (IRM) to learn invariant representations of\nimage and shape features across multiple training distributions/environments.\nBy embedding the features that are invariant with regard to target variables in\ndifferent environments, our model consistently offers more accurate\npredictions. We validate our method by performing classification tasks on both\nsimulated 2D images, real 3D brain and cine cardiovascular magnetic resonance\nimages (MRIs). Our code is publicly available at\nhttps://github.com/tonmoy-hossain/ISRL.\n","authors":["Tonmoy Hossain","Jing Ma","Jundong Li","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12199v1","updated":"2024-11-19T03:30:44Z","published":"2024-11-19T03:30:44Z","title":"RoSIS: Robust Framework for Text-Promptable Surgical Instrument\n Segmentation Using Vision-Language Fusion","summary":" Surgical instrument segmentation (SIS) is an essential task in\ncomputer-assisted surgeries, with deep learning-based research improving\naccuracy in complex environments. Recently, text-promptable segmentation\nmethods have been introduced to generate masks based on text prompts describing\ntarget objects. 
However, these methods assume that the object described by a\ngiven text prompt exists in the scene. This results in mask generation whenever\na related text prompt is provided, even if the object is absent from the image.\nExisting methods handle this by using prompts only for objects known to be\npresent in the image, which introduces inaccessible information in a\nvision-based method setting and results in unfair comparisons. For fair\ncomparison, we redefine existing text-promptable SIS settings to robust\nconditions, called Robust text-promptable SIS (R-SIS), designed to forward\nprompts of all classes and determine the existence of an object from a given\ntext prompt for the fair comparison. Furthermore, we propose a novel framework,\nRobust Surgical Instrument Segmentation (RoSIS), which combines visual and\nlanguage features for promptable segmentation in the R-SIS setting. RoSIS\nemploys an encoder-decoder architecture with a Multi-Modal Fusion Block (MMFB)\nand a Selective Gate Block (SGB) to achieve balanced integration of vision and\nlanguage features. Additionally, we introduce an iterative inference strategy\nthat refines segmentation masks in two steps: an initial pass using name-based\nprompts, followed by a refinement step using location prompts. 
Experiments on\nvarious datasets and settings demonstrate that RoSIS outperforms existing\nvision-based and promptable methods under robust conditions.\n","authors":["Tae-Min Choi","Juyoun Park"],"pdf_url":"https://arxiv.org/pdf/2411.12199v1.pdf","comment":"10 pages, 6 figures, submitted to IEEE transactions on Medical\n Imaging"},{"id":"http://arxiv.org/abs/2003.13648v4","updated":"2024-11-19T03:30:22Z","published":"2020-03-30T17:32:49Z","title":"Weakly-supervised land classification for coastal zone based on deep\n convolutional neural networks by incorporating dual-polarimetric\n characteristics into training dataset","summary":" In this work we explore the performance of DCNNs on semantic segmentation\nusing spaceborne polarimetric synthetic aperture radar (PolSAR) datasets. The\nsemantic segmentation task using PolSAR data can be categorized as weakly\nsupervised learning when the characteristics of SAR data and data annotating\nprocedures are factored in. Datasets are initially analyzed for selecting\nfeasible pre-training images. Then the differences between spaceborne and\nairborne datasets are examined in terms of spatial resolution and viewing\ngeometry. In this study we used two dual-polarimetric images acquired by\nTerraSAR-X DLR. A novel method to produce training dataset with more supervised\ninformation is developed. Specifically, a series of typical classified images\nas well as intensity images serve as training datasets. A field survey is\nconducted for an area of about 20 square kilometers to obtain a ground truth\ndataset used for accuracy evaluation. Several transfer learning strategies are\nmade for aforementioned training datasets which will be combined in a\npracticable order. 
Three DCNN models, including SegNet, U-Net, and LinkNet, are\nimplemented next.\n","authors":["Sheng Sun","Armando Marino","Wenze Shui","Zhongwen Hu"],"pdf_url":"https://arxiv.org/pdf/2003.13648v4.pdf","comment":"We are sorry for that there are some minor errors in the experimental\n results. We need to make some improvements to the results and request to\n withdraw the submission"},{"id":"http://arxiv.org/abs/2407.08162v2","updated":"2024-11-19T03:30:11Z","published":"2024-07-11T03:47:14Z","title":"Improving Visual Place Recognition Based Robot Navigation By Verifying\n Localization Estimates","summary":" Visual Place Recognition (VPR) systems often have imperfect performance,\naffecting the `integrity' of position estimates and subsequent robot navigation\ndecisions. Previously, SVM classifiers have been used to monitor VPR integrity.\nThis research introduces a novel Multi-Layer Perceptron (MLP) integrity monitor\nwhich demonstrates improved performance and generalizability, removing\nper-environment training and reducing manual tuning requirements. We test our\nproposed system in extensive real-world experiments, presenting two real-time\nintegrity-based VPR verification methods: a single-query rejection method for\nrobot navigation to a goal zone (Experiment 1); and a history-of-queries method\nthat takes a best, verified, match from its recent trajectory and uses an\nodometer to extrapolate a current position estimate (Experiment 2). Noteworthy\nresults for Experiment 1 include a decrease in aggregate mean along-track goal\nerror from ~9.8m to ~3.1m, and an increase in the aggregate rate of successful\nmission completion from ~41% to ~55%. Experiment 2 showed a decrease in\naggregate mean along-track localization error from ~2.0m to ~0.5m, and an\nincrease in the aggregate localization precision from ~97% to ~99%. 
Overall,\nour results demonstrate the practical usefulness of a VPR integrity monitor in\nreal-world robotics to improve VPR localization and consequent navigation\nperformance.\n","authors":["Owen Claxton","Connor Malone","Helen Carson","Jason Ford","Gabe Bolton","Iman Shames","Michael Milford"],"pdf_url":"https://arxiv.org/pdf/2407.08162v2.pdf","comment":"Author Accepted Preprint for Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.12198v1","updated":"2024-11-19T03:30:06Z","published":"2024-11-19T03:30:06Z","title":"CCIS-Diff: A Generative Model with Stable Diffusion Prior for Controlled\n Colonoscopy Image Synthesis","summary":" Colonoscopy is crucial for identifying adenomatous polyps and preventing\ncolorectal cancer. However, developing robust models for polyp detection is\nchallenging by the limited size and accessibility of existing colonoscopy\ndatasets. While previous efforts have attempted to synthesize colonoscopy\nimages, current methods suffer from instability and insufficient data\ndiversity. Moreover, these approaches lack precise control over the generation\nprocess, resulting in images that fail to meet clinical quality standards. To\naddress these challenges, we propose CCIS-DIFF, a Controlled generative model\nfor high-quality Colonoscopy Image Synthesis based on a Diffusion architecture.\nOur method offers precise control over both the spatial attributes (polyp\nlocation and shape) and clinical characteristics of polyps that align with\nclinical descriptions. Specifically, we introduce a blur mask weighting\nstrategy to seamlessly blend synthesized polyps with the colonic mucosa, and a\ntext-aware attention mechanism to guide the generated images to reflect\nclinical characteristics. Notably, to achieve this, we construct a new\nmulti-modal colonoscopy dataset that integrates images, mask annotations, and\ncorresponding clinical text descriptions. 
Experimental results demonstrate that\nour method generates high-quality, diverse colonoscopy images with fine control\nover both spatial constraints and clinical consistency, offering valuable\nsupport for downstream segmentation and diagnostic tasks.\n","authors":["Yifan Xie","Jingge Wang","Tao Feng","Fei Ma","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2411.12198v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.12197v1","updated":"2024-11-19T03:29:18Z","published":"2024-11-19T03:29:18Z","title":"MTFusion: Reconstructing Any 3D Object from Single Image Using\n Multi-word Textual Inversion","summary":" Reconstructing 3D models from single-view images is a long-standing problem\nin computer vision. The latest advances for single-image 3D reconstruction\nextract a textual description from the input image and further utilize it to\nsynthesize 3D models. However, existing methods focus on capturing a single key\nattribute of the image (e.g., object type, artistic style) and fail to consider\nthe multi-perspective information required for accurate 3D reconstruction, such\nas object shape and material properties. Besides, the reliance on Neural\nRadiance Fields hinders their ability to reconstruct intricate surfaces and\ntexture details. In this work, we propose MTFusion, which leverages both image\ndata and textual descriptions for high-fidelity 3D reconstruction. Our approach\nconsists of two stages. First, we adopt a novel multi-word textual inversion\ntechnique to extract a detailed text description capturing the image's\ncharacteristics. Then, we use this description and the image to generate a 3D\nmodel with FlexiCubes. Additionally, MTFusion enhances FlexiCubes by employing\na special decoder network for Signed Distance Functions, leading to faster\ntraining and finer surface representation. Extensive evaluations demonstrate\nthat our MTFusion surpasses existing image-to-3D methods on a wide range of\nsynthetic and real-world images. 
Furthermore, the ablation study proves the\neffectiveness of our network designs.\n","authors":["Yu Liu","Ruowei Wang","Jiaqi Li","Zixiang Xu","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.12197v1.pdf","comment":"PRCV 2024"},{"id":"http://arxiv.org/abs/2411.12195v1","updated":"2024-11-19T03:27:05Z","published":"2024-11-19T03:27:05Z","title":"A Survey of Medical Vision-and-Language Applications and Their\n Techniques","summary":" Medical vision-and-language models (MVLMs) have attracted substantial\ninterest due to their capability to offer a natural language interface for\ninterpreting complex medical data. Their applications are versatile and have\nthe potential to improve diagnostic accuracy and decision-making for individual\npatients while also contributing to enhanced public health monitoring, disease\nsurveillance, and policy-making through more efficient analysis of large data\nsets. MVLMS integrate natural language processing with medical images to enable\na more comprehensive and contextual understanding of medical images alongside\ntheir corresponding textual information. Unlike general vision-and-language\nmodels trained on diverse, non-specialized datasets, MVLMs are purpose-built\nfor the medical domain, automatically extracting and interpreting critical\ninformation from medical images and textual reports to support clinical\ndecision-making. Popular clinical applications of MVLMs include automated\nmedical report generation, medical visual question answering, medical\nmultimodal segmentation, diagnosis and prognosis and medical image-text\nretrieval. Here, we provide a comprehensive overview of MVLMs and the various\nmedical tasks to which they have been applied. We conduct a detailed analysis\nof various vision-and-language model architectures, focusing on their distinct\nstrategies for cross-modal integration/exploitation of medical visual and\ntextual features. 
We also examine the datasets used for these tasks and compare\nthe performance of different models based on standardized evaluation metrics.\nFurthermore, we highlight potential challenges and summarize future research\ntrends and directions. The full collection of papers and codes is available at:\nhttps://github.com/YtongXie/Medical-Vision-and-Language-Tasks-and-Methodologies-A-Survey.\n","authors":["Qi Chen","Ruoshan Zhao","Sinuo Wang","Vu Minh Hieu Phan","Anton van den Hengel","Johan Verjans","Zhibin Liao","Minh-Son To","Yong Xia","Jian Chen","Yutong Xie","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2411.12195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07987v4","updated":"2024-11-19T03:23:20Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. 
To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions. All\nthe code, models, demo and organized data have been open sourced on our Github\nRepo.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v4.pdf","comment":"Camera Ready Version. Project Page:\n https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data:\n https://github.com/liming-ai/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2410.14148v3","updated":"2024-11-19T03:08:34Z","published":"2024-10-18T03:34:32Z","title":"Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in\n Vision-Language Alignment","summary":" The recent advancements in large language models (LLMs) and pre-trained\nvision models have accelerated the development of vision-language large models\n(VLLMs), enhancing the interaction between visual and linguistic modalities.\nDespite their notable success across various domains, VLLMs face challenges in\nmodality alignment, which can lead to issues like hallucinations and unsafe\ncontent generation. Current alignment techniques often rely on coarse feedback\nand external datasets, limiting scalability and performance. 
In this paper, we\npropose FiSAO (Fine-Grained Self-Alignment Optimization), a novel\nself-alignment method that utilizes the model's own visual encoder as a\nfine-grained verifier to improve vision-language alignment without the need for\nadditional data. By leveraging token-level feedback from the vision encoder,\nFiSAO significantly improves vision-language alignment, even surpassing\ntraditional preference tuning methods that require additional data. Through\nboth theoretical analysis and experimental validation, we demonstrate that\nFiSAO effectively addresses the misalignment problem in VLLMs, marking the\nfirst instance of token-level rewards being applied to such models.\n","authors":["Chenhang Cui","An Zhang","Yiyang Zhou","Zhaorun Chen","Gelei Deng","Huaxiu Yao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.14148v3.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2401.16861v3","updated":"2024-11-19T03:08:33Z","published":"2024-01-30T10:04:49Z","title":"Repositioning the Subject within Image","summary":" Current image manipulation primarily centers on static manipulation, such as\nreplacing specific regions within an image or altering its overall style. In\nthis paper, we introduce an innovative dynamic manipulation task, subject\nrepositioning. This task involves relocating a user-specified subject to a\ndesired position while preserving the image's fidelity. Our research reveals\nthat the fundamental sub-tasks of subject repositioning, which include filling\nthe void left by the repositioned subject, reconstructing obscured portions of\nthe subject and blending the subject to be consistent with surrounding areas,\ncan be effectively reformulated as a unified, prompt-guided inpainting task.\nConsequently, we can employ a single diffusion generative model to address\nthese sub-tasks using various task prompts learned through our proposed task\ninversion technique. 
Additionally, we integrate pre-processing and\npost-processing techniques to further enhance the quality of subject\nrepositioning. These elements together form our SEgment-gEnerate-and-bLEnd\n(SEELE) framework. To assess SEELE's effectiveness in subject repositioning, we\nassemble a real-world subject repositioning dataset called ReS. Results of\nSEELE on ReS demonstrate its efficacy. Code and ReS dataset are available at\nhttps://yikai-wang.github.io/seele/.\n","authors":["Yikai Wang","Chenjie Cao","Ke Fan","Qiaole Dong","Yifan Li","Xiangyang Xue","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2401.16861v3.pdf","comment":"Accepted by TMLR. Arxiv version uses small size images. Full size\n PDF, code, and dataset are available at https://yikai-wang.github.io/seele/"},{"id":"http://arxiv.org/abs/2411.12188v1","updated":"2024-11-19T03:02:39Z","published":"2024-11-19T03:02:39Z","title":"Constant Rate Schedule: Constant-Rate Distributional Change for\n Efficient Training and Sampling in Diffusion Models","summary":" We propose a noise schedule that ensures a constant rate of change in the\nprobability distribution of diffused data throughout the diffusion process. To\nobtain this noise schedule, we measure the rate of change in the probability\ndistribution of the forward process and use it to determine the noise schedule\nbefore training diffusion models. The functional form of the noise schedule is\nautomatically determined and tailored to each dataset and type of diffusion\nmodel. We evaluate the effectiveness of our noise schedule on unconditional and\nclass-conditional image generation tasks using the LSUN\n(bedroom/church/cat/horse), ImageNet, and FFHQ datasets. 
Through extensive\nexperiments, we confirmed that our noise schedule broadly improves the\nperformance of the diffusion models regardless of the dataset, sampler, number\nof function evaluations, or type of diffusion model.\n","authors":["Shuntaro Okada","Kenji Doi","Ryota Yoshihashi","Hirokatsu Kataoka","Tomohiro Tanaka"],"pdf_url":"https://arxiv.org/pdf/2411.12188v1.pdf","comment":"33 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.10094v2","updated":"2024-11-19T02:55:22Z","published":"2024-09-16T08:50:47Z","title":"Beyond Perceptual Distances: Rethinking Disparity Assessment for\n Out-of-Distribution Detection with Diffusion Models","summary":" Out-of-Distribution (OoD) detection aims to justify whether a given sample is\nfrom the training distribution of the classifier-under-protection, i.e.,\nIn-Distribution (InD), or from OoD. Diffusion Models (DMs) are recently\nutilized in OoD detection by using the perceptual distances between the given\nimage and its DM generation. DM-based methods bring fresh insights to the\nfield, yet remain under-explored.\n In this work, we point out two main limitations in DM-based OoD detection\nmethods: (i) the perceptual metrics on the disparities between the given sample\nand its generation are devised only at human-perceived levels, ignoring the\nabstract or high-level patterns that help better reflect the intrinsic\ndisparities in distribution; (ii) only the raw image contents are taken to\nmeasure the disparities, while other representations, i.e., the features and\nprobabilities from the classifier-under-protection, are easy to access at hand\nbut are ignored. To this end, our proposed detection framework goes beyond the\nperceptual distances and looks into the deep representations from the\nclassifier-under-protection with our novel metrics devised correspondingly,\nleading to more informative disparity assessments between InD and OoD. 
An\nanomaly-removal strategy is integrated to remove the abnormal OoD information\nin the generation, further enhancing the distinctiveness of disparities. Our\nwork has demonstrated state-of-the-art detection performances among DM-based\nmethods in extensive experiments.\n","authors":["Kun Fang","Qinghua Tao","Zuopeng Yang","Xiaolin Huang","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2409.10094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16843v2","updated":"2024-11-19T02:52:45Z","published":"2024-02-26T18:59:18Z","title":"Multi-LoRA Composition for Image Generation","summary":" Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models\nfor the accurate rendition of specific elements like distinct characters or\nunique styles in generated images. Nonetheless, existing methods face\nchallenges in effectively composing multiple LoRAs, especially as the number of\nLoRAs to be integrated grows, thus hindering the creation of complex imagery.\nIn this paper, we study multi-LoRA composition through a decoding-centric\nperspective. We present two training-free methods: LoRA Switch, which\nalternates between different LoRAs at each denoising step, and LoRA Composite,\nwhich simultaneously incorporates all LoRAs to guide more cohesive image\nsynthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new\ncomprehensive testbed as part of this research. It features a diverse range of\nLoRA categories with 480 composition sets. Utilizing an evaluation framework\nbased on GPT-4V, our findings demonstrate a clear improvement in performance\nwith our methods over the prevalent baseline, particularly evident when\nincreasing the number of LoRAs in a composition. 
The code, benchmarks, LoRA\nweights, and all evaluation details are available on our project website:\nhttps://maszhongming.github.io/Multi-LoRA-Composition.\n","authors":["Ming Zhong","Yelong Shen","Shuohang Wang","Yadong Lu","Yizhu Jiao","Siru Ouyang","Donghan Yu","Jiawei Han","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16843v2.pdf","comment":"Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2411.12181v1","updated":"2024-11-19T02:48:36Z","published":"2024-11-19T02:48:36Z","title":"Enhancing Low Dose Computed Tomography Images Using Consistency Training\n Techniques","summary":" Diffusion models have significant impact on wide range of generative tasks,\nespecially on image inpainting and restoration. Although the improvements on\naiming for decreasing number of function evaluations (NFE), the iterative\nresults are still computationally expensive. Consistency models are as a new\nfamily of generative models, enable single-step sampling of high quality data\nwithout the need for adversarial training. In this paper, we introduce the beta\nnoise distribution, which provides flexibility in adjusting noise levels. This\nis combined with a sinusoidal curriculum that enhances the learning of the\ntrajectory between the noise distribution and the posterior distribution of\ninterest, allowing High Noise Improved Consistency Training (HN-iCT) to be\ntrained in a supervised fashion. Additionally, High Noise Improved Consistency\nTraining with Image Condition (HN-iCT-CN) architecture is introduced, enables\nto take Low Dose images as a condition for extracting significant features by\nWeighted Attention Gates (WAG).Our results indicate that unconditional image\ngeneration using HN-iCT significantly outperforms basic CT and iCT training\ntechniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our\nimage-conditioned model demonstrates exceptional performance in enhancing\nlow-dose (LD) CT scans.\n","authors":["Mahmut S. 
Gokmen","Jie Zhang","Ge Wang","Jin Chen","Cody Bumgardner"],"pdf_url":"https://arxiv.org/pdf/2411.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17593v2","updated":"2024-11-19T02:48:07Z","published":"2024-01-31T04:34:31Z","title":"Head and Neck Tumor Segmentation from [18F]F-FDG PET/CT Images Based on\n 3D Diffusion Model","summary":" Head and neck (H&N) cancers are among the most prevalent types of cancer\nworldwide, and [18F]F-FDG PET/CT is widely used for H&N cancer management.\nRecently, the diffusion model has demonstrated remarkable performance in\nvarious image-generation tasks. In this work, we proposed a 3D diffusion model\nto accurately perform H&N tumor segmentation from 3D PET and CT volumes. The 3D\ndiffusion model was developed considering the 3D nature of PET and CT images\nacquired. During the reverse process, the model utilized a 3D UNet structure\nand took the concatenation of PET, CT, and Gaussian noise volumes as the\nnetwork input to generate the tumor mask. Experiments based on the HECKTOR\nchallenge dataset were conducted to evaluate the effectiveness of the proposed\ndiffusion model. Several state-of-the-art techniques based on U-Net and\nTransformer structures were adopted as the reference methods. Benefits of\nemploying both PET and CT as the network input as well as further extending the\ndiffusion model from 2D to 3D were investigated based on various quantitative\nmetrics and the uncertainty maps generated. Results showed that the proposed 3D\ndiffusion model could generate more accurate segmentation results compared with\nother methods. Compared to the diffusion model in 2D format, the proposed 3D\nmodel yielded superior results. 
Our experiments also highlighted the advantage\nof utilizing dual-modality PET and CT data over only single-modality data for\nH&N tumor segmentation.\n","authors":["Yafei Dong","Kuang Gong"],"pdf_url":"https://arxiv.org/pdf/2401.17593v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12177v1","updated":"2024-11-19T02:40:42Z","published":"2024-11-19T02:40:42Z","title":"Robust 3D Semantic Occupancy Prediction with Calibration-free Spatial\n Transformation","summary":" 3D semantic occupancy prediction, which seeks to provide accurate and\ncomprehensive representations of environment scenes, is important to autonomous\ndriving systems. For autonomous cars equipped with multi-camera and LiDAR, it\nis critical to aggregate multi-sensor information into a unified 3D space for\naccurate and robust predictions. Recent methods are mainly built on the\n2D-to-3D transformation that relies on sensor calibration to project the 2D\nimage information into the 3D space. These methods, however, suffer from two\nmajor limitations: First, they rely on accurate sensor calibration and are\nsensitive to the calibration noise, which limits their application in real\ncomplex environments. Second, the spatial transformation layers are\ncomputationally expensive and limit their running on an autonomous vehicle. In\nthis work, we attempt to exploit a Robust and Efficient 3D semantic Occupancy\n(REO) prediction scheme. To this end, we propose a calibration-free spatial\ntransformation based on vanilla attention to implicitly model the spatial\ncorrespondence. In this way, we robustly project the 2D features to a\npredefined BEV plane without using sensor calibration as input. Then, we\nintroduce 2D and 3D auxiliary training tasks to enhance the discrimination\npower of 2D backbones on spatial, semantic, and texture features. Last, we\npropose a query-based prediction scheme to efficiently generate large-scale\nfine-grained occupancy predictions. 
By fusing point clouds that provide\ncomplementary spatial information, our REO surpasses the existing methods by a\nlarge margin on three benchmarks, including OpenOccupancy, Occ3D-nuScenes, and\nSemanticKITTI Scene Completion. For instance, our REO achieves 19.8$\\times$\nspeedup compared to Co-Occ, with 1.1 improvements in geometry IoU on\nOpenOccupancy. Our code will be available at https://github.com/ICEORY/REO.\n","authors":["Zhuangwei Zhuang","Ziyin Wang","Sitao Chen","Lizhao Liu","Hui Luo","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2411.12177v1.pdf","comment":"13 pages, 11 figures, 18 tables"},{"id":"http://arxiv.org/abs/2411.12175v1","updated":"2024-11-19T02:39:57Z","published":"2024-11-19T02:39:57Z","title":"AsynEIO: Asynchronous Monocular Event-Inertial Odometry Using Gaussian\n Process Regression","summary":" Event cameras, when combined with inertial sensors, show significant\npotential for motion estimation in challenging scenarios, such as high-speed\nmaneuvers and low-light environments. There are many methods for producing such\nestimations, but most boil down to a synchronous discrete-time fusion problem.\nHowever, the asynchronous nature of event cameras and their unique fusion\nmechanism with inertial sensors remain underexplored. In this paper, we\nintroduce a monocular event-inertial odometry method called AsynEIO, designed\nto fuse asynchronous event and inertial data within a unified Gaussian Process\n(GP) regression framework. Our approach incorporates an event-driven frontend\nthat tracks feature trajectories directly from raw event streams at a high\ntemporal resolution. These tracked feature trajectories, along with various\ninertial factors, are integrated into the same GP regression framework to\nenable asynchronous fusion. With deriving analytical residual Jacobians and\nnoise models, our method constructs a factor graph that is iteratively\noptimized and pruned using a sliding-window optimizer. 
Comparative assessments\nhighlight the performance of different inertial fusion strategies, suggesting\noptimal choices for varying conditions. Experimental results on both public\ndatasets and our own event-inertial sequences indicate that AsynEIO outperforms\nexisting methods, especially in high-speed and low-illumination scenarios.\n","authors":["Zhixiang Wang","Xudong Li","Yizhai Zhang","Fan Zhang"," Panfeng"],"pdf_url":"https://arxiv.org/pdf/2411.12175v1.pdf","comment":"Submitted to IEEE (2024-11-4)"},{"id":"http://arxiv.org/abs/2411.12174v1","updated":"2024-11-19T02:39:28Z","published":"2024-11-19T02:39:28Z","title":"Just KIDDIN: Knowledge Infusion and Distillation for Detection of\n INdecent Memes","summary":" Toxicity identification in online multimodal environments remains a\nchallenging task due to the complexity of contextual connections across\nmodalities (e.g., textual and visual). In this paper, we propose a novel\nframework that integrates Knowledge Distillation (KD) from Large Visual\nLanguage Models (LVLMs) and knowledge infusion to enhance the performance of\ntoxicity detection in hateful memes. Our approach extracts sub-knowledge graphs\nfrom ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused\nwithin a compact VLM framework. The relational context between toxic phrases in\ncaptions and memes, as well as visual concepts in memes enhance the model's\nreasoning capabilities. Experimental results from our study on two hate speech\nbenchmark datasets demonstrate superior performance over the state-of-the-art\nbaselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%,\nrespectively. Given the contextual complexity of the toxicity detection task,\nour approach showcases the significance of learning from both explicit (i.e.\nKG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a\nhybrid neurosymbolic approach. 
This is crucial for real-world applications\nwhere accurate and scalable recognition of toxic content is critical for\ncreating safer online environments.\n","authors":["Rahul Garg","Trilok Padhi","Hemang Jain","Ugur Kursuncu","Ugur Kursuncu","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2411.12174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08000v3","updated":"2024-11-19T02:31:45Z","published":"2024-08-15T07:57:28Z","title":"MVInpainter: Learning Multi-View Consistent Inpainting to Bridge 2D and\n 3D Editing","summary":" Novel View Synthesis (NVS) and 3D generation have recently achieved prominent\nimprovements. However, these works mainly focus on confined categories or\nsynthetic 3D assets, which are discouraged from generalizing to challenging\nin-the-wild scenes and fail to be employed with 2D synthesis directly.\nMoreover, these methods heavily depended on camera poses, limiting their\nreal-world applications. To overcome these issues, we propose MVInpainter,\nre-formulating the 3D editing as a multi-view 2D inpainting task. Specifically,\nMVInpainter partially inpaints multi-view images with the reference guidance\nrather than intractably generating an entirely novel view from scratch, which\nlargely simplifies the difficulty of in-the-wild NVS and leverages unmasked\nclues instead of explicit pose conditions. To ensure cross-view consistency,\nMVInpainter is enhanced by video priors from motion components and appearance\nguidance from concatenated reference key&value attention. Furthermore,\nMVInpainter incorporates slot attention to aggregate high-level optical flow\nfeatures from unmasked regions to control the camera movement with pose-free\ntraining and inference. Sufficient scene-level experiments on both\nobject-centric and forward-facing datasets verify the effectiveness of\nMVInpainter, including diverse tasks, such as multi-view object removal,\nsynthesis, insertion, and replacement. 
The project page is\nhttps://ewrfcas.github.io/MVInpainter/.\n","authors":["Chenjie Cao","Chaohui Yu","Fan Wang","Xiangyang Xue","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2408.08000v3.pdf","comment":"Project page: https://ewrfcas.github.io/MVInpainter/. Accepted at\n NeurIPS2024"},{"id":"http://arxiv.org/abs/2411.12168v1","updated":"2024-11-19T02:18:19Z","published":"2024-11-19T02:18:19Z","title":"Sketch-guided Cage-based 3D Gaussian Splatting Deformation","summary":" 3D Gaussian Splatting (GS) is one of the most promising novel 3D\nrepresentations that has received great interest in computer graphics and\ncomputer vision. While various systems have introduced editing capabilities for\n3D GS, such as those guided by text prompts, fine-grained control over\ndeformation remains an open challenge. In this work, we present a novel\nsketch-guided 3D GS deformation system that allows users to intuitively modify\nthe geometry of a 3D GS model by drawing a silhouette sketch from a single\nviewpoint. Our approach introduces a new deformation method that combines\ncage-based deformations with a variant of Neural Jacobian Fields, enabling\nprecise, fine-grained control. Additionally, it leverages large-scale 2D\ndiffusion priors and ControlNet to ensure the generated deformations are\nsemantically plausible. 
Through a series of experiments, we demonstrate the\neffectiveness of our method and showcase its ability to animate static 3D GS\nmodels as one of its key applications.\n","authors":["Tianhao Xie","Noam Aigerman","Eugene Belilovsky","Tiberiu Popa"],"pdf_url":"https://arxiv.org/pdf/2411.12168v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.19957v4","updated":"2024-11-19T02:12:54Z","published":"2024-05-30T11:23:01Z","title":"PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting","summary":" Previous text-to-4D methods have leveraged multiple Score Distillation\nSampling (SDS) techniques, combining motion priors from video-based diffusion\nmodels (DMs) with geometric priors from multiview DMs to implicitly guide 4D\nrenderings. However, differences in these priors result in conflicting gradient\ndirections during optimization, causing trade-offs between motion fidelity and\ngeometry accuracy, and requiring substantial optimization time to reconcile the\nmodels. In this paper, we introduce \\textbf{P}ixel-\\textbf{L}evel\n\\textbf{A}lignment for text-driven \\textbf{4D} Gaussian splatting (PLA4D) to\nresolve this motion-geometry conflict. PLA4D provides an anchor reference,\ni.e., text-generated video, to align the rendering process conditioned by\ndifferent DMs in pixel space. For static alignment, our approach introduces a\nfocal alignment method and Gaussian-Mesh contrastive learning to iteratively\nadjust focal lengths and provide explicit geometric priors at each timestep. At\nthe dynamic level, a motion alignment technique and T-MV refinement method are\nemployed to enforce both pose alignment and motion continuity across unknown\nviewpoints, ensuring intrinsic geometric consistency across views. With such\npixel-level multi-DM alignment, our PLA4D framework is able to generate 4D\nobjects with superior geometric, motion, and semantic consistency. 
Fully\nimplemented with open-source tools, PLA4D offers an efficient and accessible\nsolution for high-quality 4D digital content creation with significantly\nreduced generation time.\n","authors":["Qiaowei Miao","JinSheng Quan","Kehan Li","Yawei Luo"],"pdf_url":"https://arxiv.org/pdf/2405.19957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09209v2","updated":"2024-11-19T02:09:23Z","published":"2024-11-14T06:13:05Z","title":"JoyVASA: Portrait and Animal Image Animation with Diffusion-Based\n Audio-Driven Facial Dynamics and Head Motion Generation","summary":" Audio-driven portrait animation has made significant advances with\ndiffusion-based models, improving video quality and lipsync accuracy. However,\nthe increasing complexity of these models has led to inefficiencies in training\nand inference, as well as constraints on video length and inter-frame\ncontinuity. In this paper, we propose JoyVASA, a diffusion-based method for\ngenerating facial dynamics and head motion in audio-driven facial animation.\nSpecifically, in the first stage, we introduce a decoupled facial\nrepresentation framework that separates dynamic facial expressions from static\n3D facial representations. This decoupling allows the system to generate longer\nvideos by combining any static 3D facial representation with dynamic motion\nsequences. Then, in the second stage, a diffusion transformer is trained to\ngenerate motion sequences directly from audio cues, independent of character\nidentity. Finally, a generator trained in the first stage uses the 3D facial\nrepresentation and the generated motion sequences as inputs to render\nhigh-quality animations. With the decoupled facial representation and the\nidentity-independent motion generation process, JoyVASA extends beyond human\nportraits to animate animal faces seamlessly. The model is trained on a hybrid\ndataset of private Chinese and public English data, enabling multilingual\nsupport. 
Experimental results validate the effectiveness of our approach.\nFuture work will focus on improving real-time performance and refining\nexpression control, further expanding the applications in portrait animation.\nThe code is available at: https://github.com/jdh-algo/JoyVASA.\n","authors":["Xuyang Cao","Guoxin Wang","Sheng Shi","Jun Zhao","Yang Yao","Jintao Fei","Minyu Gao"],"pdf_url":"https://arxiv.org/pdf/2411.09209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v3","updated":"2024-11-19T02:05:56Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic datasets that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. 
Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense. Our code is available at\nhttps://github.com/AngusDujw/Diversity-Driven-Synthesis.https://github.com/AngusDujw/Diversity-Driven-Synthesis.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03634v4","updated":"2024-11-19T02:01:50Z","published":"2024-07-04T04:54:03Z","title":"SOWA: Adapting Hierarchical Frozen Window Self-Attention to\n Visual-Language Models for Better Anomaly Detection","summary":" Visual anomaly detection is essential in industrial manufacturing, yet\ntraditional methods often rely heavily on extensive normal datasets and\ntask-specific models, limiting their scalability. Recent advancements in\nlarge-scale vision-language models have significantly enhanced zero- and\nfew-shot anomaly detection. However, these approaches may not fully leverage\nhierarchical features, potentially overlooking nuanced details crucial for\naccurate detection. To address this, we introduce a novel window self-attention\nmechanism based on the CLIP model, augmented with learnable prompts to process\nmulti-level features within a Soldier-Officer Window Self-Attention (SOWA)\nframework. 
Our method has been rigorously evaluated on five benchmark datasets,\nachieving superior performance by leading in 18 out of 20 metrics, setting a\nnew standard against existing state-of-the-art techniques.\n","authors":["Zongxiang Hu","Zhaosheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03634v4.pdf","comment":"8 pages, 9 figures, conference"},{"id":"http://arxiv.org/abs/2411.12151v1","updated":"2024-11-19T01:01:56Z","published":"2024-11-19T01:01:56Z","title":"Self-Supervised Learning in Deep Networks: A Pathway to Robust Few-Shot\n Classification","summary":" This study aims to optimize the few-shot image classification task and\nimprove the model's feature extraction and classification performance by\ncombining self-supervised learning with the deep network model ResNet-101.\nDuring the training process, we first pre-train the model with self-supervision\nto enable it to learn common feature expressions on a large amount of unlabeled\ndata; then fine-tune it on the few-shot dataset Mini-ImageNet to improve the\nmodel's accuracy and generalization ability under limited data. The\nexperimental results show that compared with traditional convolutional neural\nnetworks, ResNet-50, DenseNet, and other models, our method has achieved\nexcellent performance of about 95.12% in classification accuracy (ACC) and F1\nscore, verifying the effectiveness of self-supervised learning in few-shot\nclassification. 
This method provides an efficient and reliable solution for the\nfield of few-shot image classification.\n","authors":["Yuyang Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.12151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12146v1","updated":"2024-11-19T00:50:01Z","published":"2024-11-19T00:50:01Z","title":"Self-supervised denoising of visual field data improves detection of\n glaucoma progression","summary":" Perimetric measurements provide insight into a patient's peripheral vision\nand day-to-day functioning and are the main outcome measure for identifying\nprogression of visual damage from glaucoma. However, visual field data can be\nnoisy, exhibiting high variance, especially with increasing damage. In this\nstudy, we demonstrate the utility of self-supervised deep learning in denoising\nvisual field data from over 4000 patients to enhance its signal-to-noise ratio\nand its ability to detect true glaucoma progression. We deployed both a\nvariational autoencoder (VAE) and a masked autoencoder to determine which\nself-supervised model best smooths the visual field data while reconstructing\nsalient features that are less noisy and more predictive of worsening disease.\nOur results indicate that including a categorical p-value at every visual field\nlocation improves the smoothing of visual field data. Masked autoencoders led\nto cleaner denoised data than previous methods, such as variational\nautoencoders. A 4.7% increase in detection of progressing eyes with pointwise\nlinear regression (PLR) was observed. The masked and variational autoencoders'\nsmoothed data predicted glaucoma progression 2.3 months earlier when p-values\nwere included compared to when they were not. 
The faster prediction of time to\nprogression (TTP) and the higher percentage progression detected support our\nhypothesis that masking out visual field elements during training while\nincluding p-values at each location would improve the task of detection of\nvisual field progression. Our study has clinically relevant implications\nregarding masking when training neural networks to denoise visual field data,\nresulting in earlier and more accurate detection of glaucoma progression. This\ndenoising model can be integrated into future models for visual field analysis\nto enhance detection of glaucoma progression.\n","authors":["Sean Wu","Jun Yu Chen","Vahid Mohammadzadeh","Sajad Besharati","Jaewon Lee","Kouros Nouri-Mahdavi","Joseph Caprioli","Zhe Fei","Fabien Scalzo"],"pdf_url":"https://arxiv.org/pdf/2411.12146v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.04618v2","updated":"2024-11-19T00:46:01Z","published":"2024-10-06T20:38:14Z","title":"Towards Unsupervised Blind Face Restoration using Diffusion Prior","summary":" Blind face restoration methods have shown remarkable performance,\nparticularly when trained on large-scale synthetic datasets with supervised\nlearning. These datasets are often generated by simulating low-quality face\nimages with a handcrafted image degradation pipeline. The models trained on\nsuch synthetic degradations, however, cannot deal with inputs of unseen\ndegradations. In this paper, we address this issue by using only a set of input\nimages, with unknown degradations and without ground truth targets, to\nfine-tune a restoration model that learns to map them to clean and contextually\nconsistent outputs. We utilize a pre-trained diffusion model as a generative\nprior through which we generate high quality images from the natural image\ndistribution while maintaining the input image content through consistency\nconstraints. These generated images are then used as pseudo targets to\nfine-tune a pre-trained restoration model. 
Unlike many recent approaches that\nemploy diffusion models at test time, we only do so during training and thus\nmaintain an efficient inference-time performance. Extensive experiments show\nthat the proposed approach can consistently improve the perceptual quality of\npre-trained blind face restoration models while maintaining great consistency\nwith the input contents. Our best model also achieves the state-of-the-art\nresults on both synthetic and real-world datasets.\n","authors":["Tianshu Kuai","Sina Honari","Igor Gilitschenski","Alex Levinshtein"],"pdf_url":"https://arxiv.org/pdf/2410.04618v2.pdf","comment":"WACV 2025. Project page: https://dt-bfr.github.io/"}],"Systems and Control":[{"id":"http://arxiv.org/abs/2411.12736v1","updated":"2024-11-19T18:58:03Z","published":"2024-11-19T18:58:03Z","title":"ACING: Actor-Critic for Instruction Learning in Black-Box Large Language\n Models","summary":" The effectiveness of Large Language Models (LLMs) in solving tasks vastly\ndepends on the quality of the instructions, which often require fine-tuning\nthrough extensive human effort. This highlights the need for automated\ninstruction optimization; however, this optimization is particularly\nchallenging when dealing with black-box LLMs, where model parameters and\ngradients remain inaccessible. We propose ACING, a task-specific prompt\noptimization approach framed as a stateless continuous-action Reinforcement\nLearning (RL) problem, known as the continuum bandit setting. ACING leverages\nan actor-critic-based method to optimize prompts, learning from\nnon-differentiable reward signals. We validate ACING by optimizing prompts for\nChatGPT on 30 instruction-based tasks. 
ACING consistently outperforms baseline\nmethods, achieving a median score improvement of 10 percentage points.\nFurthermore, ACING not only recovers but also surpasses human-crafted expert\ninstructions, achieving up to a 39 percentage point improvement against human\nbenchmarks.\n","authors":["Salma Kharrat","Fares Fourati","Marco Canini"],"pdf_url":"https://arxiv.org/pdf/2411.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14010v2","updated":"2024-11-19T18:06:30Z","published":"2024-03-20T22:15:33Z","title":"When are Lossy Energy Storage Optimization Models Convex?","summary":" We examine a class of optimization problems involving the optimal operation\nof a single lossy energy storage system, where energy losses occur during\ncharging and discharging. These inefficiencies typically lead to a nonconvex\nset of feasible charging and discharging power profiles. In this paper, we\nderive an equivalent reformulation of this class of optimization problems by\neliminating the charging and discharging power variables and recasting the\nproblem entirely in terms of the storage state-of-charge variables. We show\nthat the feasible set of the proposed reformulation is always convex. We also\nprovide sufficient conditions under which the objective function of the\nproposed reformulation is guaranteed to be convex. 
The conditions provided both\nunify and generalize many existing conditions for convexity in the literature.\n","authors":["Feras Al Taha","Eilyan Bitar"],"pdf_url":"https://arxiv.org/pdf/2403.14010v2.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.12682v1","updated":"2024-11-19T17:39:51Z","published":"2024-11-19T17:39:51Z","title":"Distributed Coordination of Grid-Forming and Grid-Following\n Inverter-Based Resources for Optimal Frequency Control in Power Systems","summary":" With the fast-growing penetration of power inverter-interfaced renewable\ngeneration, power systems face significant challenges in maintaining power\nbalance and the nominal frequency. This paper studies the grid-level\ncoordinated control of a mix of grid-forming (GFM) and grid-following (GFL)\ninverter-based resources (IBRs) for power system frequency regulation at scale.\nSpecifically, a fully distributed optimal frequency control algorithm is\nproposed by leveraging the projected primal-dual gradient method and the\nstructure of the physical system dynamics. 
This algorithm 1) restores the\nnominal frequency, 2) minimizes the total control cost, 3) respects the IBR\npower limits and the line thermal constraints, and 4) is implemented in a\ndistributed fashion that only needs local measurement and local communication.\nThe effectiveness and optimality of the proposed algorithm are demonstrated\nthrough high-fidelity electromagnetic transient (EMT) simulations on the IEEE\n39-bus system.\n","authors":["Xiaoyang Wang","Xin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12666v1","updated":"2024-11-19T17:17:50Z","published":"2024-11-19T17:17:50Z","title":"Steady-State Initialization of Object-Oriented Advanced Thermal Power\n Generation System Models with Application to the Case of the SOS-CO2 Cycle","summary":" The forthcoming energy transition calls for a new generation of thermal power\ngeneration systems with low- or zero-emission and highly flexible operation.\nDynamic modelling and simulation is a key enabling factor in this field, as\ncontrolling such plants is a difficult task for which there is no previous\nexperience and very short design times are expected. The steady-state\ninitialization of those dynamic models is an essential step in the design\nprocess, but is unfortunately a difficult task which involves the numerical\nsolution of large systems of nonlinear equations with iterative Newton methods,\nwhich is often prone to numerical failures.\n In this work, several strategies and methodologies are discussed to\nsuccessfully achieve steady-state initialization of first-principles\nequation-based, object-oriented models of advanced thermal power generation\nsystems. 
These are presented in the context of the Modelica modelling language,\nbut could be applied to other equation-based, object-oriented modelling and\nsimulation environments.\n Finally, the successful application of such strategies and methodologies to\nthe SOS-CO2 advanced power generation system is presented.\n","authors":["Matteo Luigi De Pascali","Francesco Casella"],"pdf_url":"https://arxiv.org/pdf/2411.12666v1.pdf","comment":"Submitted to Simulation Modelling Practice and Theory"},{"id":"http://arxiv.org/abs/2411.12653v1","updated":"2024-11-19T17:02:04Z","published":"2024-11-19T17:02:04Z","title":"Smart Predict-then-Optimize Method with Dependent Data: Risk Bounds and\n Calibration of Autoregression","summary":" The predict-then-optimize (PTO) framework is indispensable for addressing\npractical stochastic decision-making tasks. It consists of two crucial steps:\ninitially predicting unknown parameters of an optimization model and\nsubsequently solving the problem based on these predictions. Elmachtoub and\nGrigas [1] introduced the Smart Predict-then-Optimize (SPO) loss for the\nframework, which gauges the decision error arising from predicted parameters,\nand a convex surrogate, the SPO+ loss, which incorporates the underlying\nstructure of the optimization model. The consistency of these different loss\nfunctions is guaranteed under the assumption of i.i.d. training data.\nNevertheless, various types of data are often dependent, such as power load\nfluctuations over time. This dependent nature can lead to diminished model\nperformance in testing or real-world applications. Motivated to make\nintelligent predictions for time series data, we present an autoregressive SPO\nmethod directly targeting the optimization problem at the decision stage in\nthis paper, where the conditions of consistency are no longer met. Therefore,\nwe first analyze the generalization bounds of the SPO loss within our\nautoregressive model. 
Subsequently, the uniform calibration results in Liu and\nGrigas [2] are extended in the proposed model. Finally, we conduct experiments\nto empirically demonstrate the effectiveness of the SPO+ surrogate compared to\nthe absolute loss and the least squares loss, especially when the cost vectors\nare determined by stationary dynamical systems and demonstrate the relationship\nbetween normalized regret and mixing coefficients.\n","authors":["Jixian Liu","Tao Xu","Jianping He","Chongrong Fang"],"pdf_url":"https://arxiv.org/pdf/2411.12653v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.12480v1","updated":"2024-11-19T13:01:38Z","published":"2024-11-19T13:01:38Z","title":"Probabilistic Day-Ahead Battery Scheduling based on Mixed Random\n Variables for Enhanced Grid Operation","summary":" The increasing penetration of renewable energy sources introduces significant\nchallenges to power grid stability, primarily due to their inherent\nvariability. A new opportunity for grid operation is the smart integration of\nelectricity production combined with battery storages in residential buildings.\nThis study explores how residential battery systems can aid in stabilizing the\npower grid by flexibly managing deviations from forecasted residential power\nconsumption and PV generation. The key contribution of this work is the\ndevelopment of an analytical approach that enables the asymmetric allocation of\nquantified power uncertainties between a residential battery system and the\npower grid, introducing a new degree of freedom into the scheduling problem.\nThis is accomplished by employing mixed random variables - characterized by\nboth continuous and discrete events - to model battery and grid power\nuncertainties. These variables are embedded into a continuous stochastic\noptimization framework, which computes probabilistic schedules for battery\noperation and power exchange with the grid. 
Test cases demonstrate that the\nproposed framework can be used effectively to reduce and quantify grid\nuncertainties while minimizing electricity costs. It is also shown that\nresidential battery systems can be actively used to provide flexibility during\ncritical periods of grid operation. Overall, this framework empowers prosumers\nto take an active role in grid stabilization, contributing to a more resilient\nand adaptive energy system.\n","authors":["Janik Pinter","Frederik Zahn","Maximilian Beichter","Ralf Mikut","Veit Hagenmeyer"],"pdf_url":"https://arxiv.org/pdf/2411.12480v1.pdf","comment":"12 pages, 7 figures, submitted to IREP 2025 Symposium"},{"id":"http://arxiv.org/abs/2411.12478v1","updated":"2024-11-19T13:00:47Z","published":"2024-11-19T13:00:47Z","title":"Robotic transcatheter tricuspid valve replacement with hybrid enhanced\n intelligence: a new paradigm and first-in-vivo study","summary":" Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for\ntricuspid regurgitation and is in the early stages of clinical adoption.\nIntelligent robotic approaches are expected to overcome the challenges of\nsurgical manipulation and widespread dissemination, but systems and protocols\nwith high clinical utility have not yet been reported. In this study, we\npropose a complete solution that includes a passive stabilizer, robotic drive,\ndetachable delivery catheter and valve manipulation mechanism. Working towards\nautonomy, a hybrid augmented intelligence approach based on reinforcement\nlearning, Monte Carlo probabilistic maps and human-robot co-piloted control was\nintroduced. Systematic tests in phantom and first-in-vivo animal experiments\nwere performed to verify that the system design met the clinical requirement.\nFurthermore, the experimental results confirmed the advantages of co-piloted\ncontrol over conventional master-slave control in terms of time efficiency,\ncontrol efficiency, autonomy and stability of operation. 
In conclusion, this\nstudy provides a comprehensive pathway for robotic TTVR and, to our knowledge,\ncompletes the first animal study that not only successfully demonstrates the\napplication of hybrid enhanced intelligence in interventional robotics, but\nalso provides a solution with high application value for a cutting-edge\nprocedure.\n","authors":["Shuangyi Wang","Haichuan Lin","Yiping Xie","Ziqi Wang","Dong Chen","Longyue Tan","Xilong Hou","Chen Chen","Xiao-Hu Zhou","Shengtao Lin","Fei Pan","Kent Chak-Yu So","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2411.12478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13376v4","updated":"2024-11-19T09:22:31Z","published":"2024-04-20T13:32:27Z","title":"Cross-Forming Control and Fault Current Limiting for Grid-Forming\n Inverters","summary":" This article proposes a \"cross-forming\" control concept for grid-forming\ninverters operating against grid faults. Cross-forming refers to voltage angle\nforming and current magnitude forming. It differs from classical grid-forming\nand grid-following paradigms that feature voltage magnitude-and-angle forming\nand voltage magnitude-and-angle following (or current magnitude-and-angle\nforming), respectively. The cross-forming concept addresses the need for\ninverters to remain grid-forming (particularly voltage angle forming, as\nrequired by grid codes) while managing fault current limitation. Simple and\nfeasible cross-forming control implementations are proposed, enabling inverters\nto quickly limit fault currents to a prescribed level while preserving voltage\nangle forming for grid-forming synchronization and providing dynamic ancillary\nservices, during symmetrical or asymmetrical fault ride-through. 
Moreover, the\ncross-forming control yields an equivalent system featuring a constant virtual\nimpedance and a \"normal form\" representation, allowing for the extension of\npreviously established transient stability results to include scenarios\ninvolving current saturation. Simulations and experiments validate the efficacy\nof the proposed cross-forming control implementations.\n","authors":["Xiuqiang He","Maitraya Avadhut Desai","Linbin Huang","Florian Dörfler"],"pdf_url":"https://arxiv.org/pdf/2404.13376v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12353v1","updated":"2024-11-19T09:15:56Z","published":"2024-11-19T09:15:56Z","title":"Service Restoration for Distribution Systems Based on Semi-Analytical\n Metamodeling of Decision-Dependent Interruption Cost and Cold Load Pickup","summary":" Developing optimized restoration strategies for power distribution systems\n(PDSs) is essential to meet the pressing demand for enhanced resilience. Prior\nknowledge of customer interruption cost (CIC) and load restoration behaviors,\nparticularly cold load pickup (CLPU), is crucial for guiding effective\nrestoration; however, both are reciprocally affected by the realized customer\ninterruption duration (CID), making them decision-dependent and challenging to\nmodel especially given the limited understanding of underlying physical\nmechanisms. This paper presents a novel approach by constructing tractable\nmetamodels to capture the varying patterns of CIC and CLPU with CID - patterns\nwhich can be derived from limited data and reflect observed surface-level\ncorrelations rather than underlying mechanisms, thereby enabling practical\nsurrogate modeling of these decision-dependencies. Specifically, quadratic\nfunctions are used to model the increasing rate of CIC with CID based on data\nfitting. 
Several defining characteristics of CLPU are extracted, each modeled\nin a piecewise linear form relative to CID, and the actual restored load\naccounting for CLPU is subsequently retrieved. Building on these metamodels, a\nPDS restoration optimization model is constructed, incorporating mobile energy\nstorage systems (MESSs) and network reconfiguration. Case studies validate our\napproach and also highlight MESS's unique potential to accelerate CLPU-related\nrestoration.\n","authors":["Wei Wang","Minwu Chen","Hongbin Wang","Gaoqiang Peng","Hongzhou Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12353v1.pdf","comment":"10 pages, 10 figures, submitted to IEEE Transactions on Smart Grid"},{"id":"http://arxiv.org/abs/2410.13383v2","updated":"2024-11-19T08:49:45Z","published":"2024-10-17T09:36:19Z","title":"Railway LiDAR semantic segmentation based on intelligent semi-automated\n data annotation","summary":" Automated vehicles rely on an accurate and robust perception of the\nenvironment. Similarly to automated cars, highly automated trains require an\nenvironmental perception. Although there is a lot of research based on either\ncamera or LiDAR sensors in the automotive domain, very few contributions for\nthis task exist yet for automated trains. Additionally, no public dataset or\ndescribed approach for a 3D LiDAR semantic segmentation in the railway\nenvironment exists yet. Thus, we propose an approach for a point-wise 3D\nsemantic segmentation based on the 2DPass network architecture using scans and\nimages jointly. In addition, we present a semi-automated intelligent data\nannotation approach, which we use to efficiently and accurately label the\nrequired dataset recorded on a railway track in Germany. To improve performance\ndespite a still small number of labeled scans, we apply an active learning\napproach to intelligently select scans for the training dataset. 
Our\ncontributions are threefold: We annotate rail data including camera and LiDAR\ndata from the railway environment, transfer label the raw LiDAR point clouds\nusing an image segmentation network, and train a state-of-the-art 3D LiDAR\nsemantic segmentation network efficiently leveraging active learning. The\ntrained network achieves good segmentation results with a mean IoU of 71.48% of\n9 classes.\n","authors":["Florian Wulff","Bernd Schaeufele","Julian Pfeifer","Ilja Radusch"],"pdf_url":"https://arxiv.org/pdf/2410.13383v2.pdf","comment":"This article has been accepted for publication in the IEEE VTC Fall\n 2024"},{"id":"http://arxiv.org/abs/2411.12312v1","updated":"2024-11-19T07:57:04Z","published":"2024-11-19T07:57:04Z","title":"Age of Information Minimization in UAV-Assisted Covert Communication:\n Trajectory and Beamforming Design","summary":" Unmanned aerial vehicles (UAVs) have the potential for time-sensitive\napplications. Due to wireless channel variation, received data may have an\nexpiration time, particularly in critical situations such as rescue operations,\nnatural disasters, or the military. Age of Information (AoI) is a metric that\nmeasures the freshness of received packets to specify the validity period of\ninformation. In addition, it is necessary to guarantee the privacy of\nconfidential information transmission through air-to-ground links against\neavesdroppers. This paper investigates UAV-assisted covert communication to\nminimize AoI in the presence of an aerial eavesdropper for the first time.\nHowever, to ensure the eavesdropper's error detection rate, UAV-enabled\nbeamforming employs the power-domain non-orthogonal multiple access (PD-NOMA)\ntechnique to cover the covert user by a public user. PD-NOMA technique\nsignificantly improves the user's AoI, too. 
The joint optimization problem\ncontains non-convex constraints and coupled optimization variables, including\nUAV trajectory, beamforming design, and the user's AoI which is challenging to\nderive a direct solution. We have developed an efficient alternating\noptimization technique to address the formulated optimization problem.\nNumerical results demonstrate the impact of the main parameters on the\nperformance of the proposed communication system.\n","authors":["Shima Salar Hosseini","Paeiz Azmi","Ali Nazari"],"pdf_url":"https://arxiv.org/pdf/2411.12312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12292v1","updated":"2024-11-19T07:23:31Z","published":"2024-11-19T07:23:31Z","title":"The Soft-PVTOL: modeling and control","summary":" This paper presents, for the first time, the soft planar vertical take-off\nand landing (Soft-PVTOL) aircraft. This concept captures the soft aerial\nvehicle's fundamental dynamics with a minimum number of states and inputs but\nretains the main features to consider when designing control laws. Unlike\nconventional PVTOL and multi-rotors, where altering position inevitably impacts\norientation due to their underactuated design, the Soft-PVTOL offers the unique\nadvantage of separating these dynamics, opening doors to unparalleled\nmaneuverability and precision. We demonstrate that the Soft-PVTOL can be\nmodeled using the Euler-Lagrange equations by assuming a constant curvature\nmodel in the aerial robot's arms. Such a mathematical model is presented in\ndetail and can be extended to several constant curvature segments in each\nSoft-PVTOL arm. Moreover, we design a passivity-based control law that exploits\nthe flexibility of the robot's arms. 
We solve the tracking control problem,\nproving that the error equilibrium globally exponentially converges to zero.\nThe controller is tested in numerical simulations, demonstrating robust\nperformance and ensuring the efficacy of the closed-loop system.\n","authors":["Gerardo Flores","Mark W. Spong"],"pdf_url":"https://arxiv.org/pdf/2411.12292v1.pdf","comment":"This manuscript has been submitted for peer review"},{"id":"http://arxiv.org/abs/2411.12239v1","updated":"2024-11-19T05:35:50Z","published":"2024-11-19T05:35:50Z","title":"A Control Lyapunov Function Approach to Event-Triggered Parameterized\n Control for Discrete-Time Linear Systems","summary":" This paper proposes an event-triggered parameterized control method using a\ncontrol Lyapunov function approach for discrete time linear systems with\nexternal disturbances. In this control method, each control input to the plant\nis a linear combination of a fixed set of linearly independent scalar\nfunctions. The controller updates the coefficients of the parameterized control\ninput in an event-triggered manner so as to minimize a quadratic cost function\nsubject to quadratic constraints and communicates the same to the actuator. We\ndesign an event-triggering rule that guarantees global uniform ultimate\nboundedness of trajectories of the closed loop system and non-trivial\ninter-event times. 
We illustrate our results through numerical examples and we\nalso compare the performance of the proposed control method with other existing\ncontrol methods in the literature.\n","authors":["Anusree Rajan","Kushagra Parmeshwar","Pavankumar Tallapragada"],"pdf_url":"https://arxiv.org/pdf/2411.12239v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.16337"},{"id":"http://arxiv.org/abs/2411.12183v1","updated":"2024-11-19T02:50:11Z","published":"2024-11-19T02:50:11Z","title":"Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of\n Beamlines","summary":" Synchrotron radiation sources play a crucial role in fields such as materials\nscience, biology, and chemistry. The beamline, a key subsystem of the\nsynchrotron, modulates and directs the radiation to the sample for analysis.\nHowever, the alignment of beamlines is a complex and time-consuming process,\nprimarily carried out manually by experienced engineers. Even minor\nmisalignments in optical components can significantly affect the beam's\nproperties, leading to suboptimal experimental outcomes. Current automated\nmethods, such as bayesian optimization (BO) and reinforcement learning (RL),\nalthough these methods enhance performance, limitations remain. The\nrelationship between the current and target beam properties, crucial for\ndetermining the adjustment, is not fully considered. Additionally, the physical\ncharacteristics of optical elements are overlooked, such as the need to adjust\nspecific devices to control the output beam's spot size or position. This paper\naddresses the alignment of beamlines by modeling it as a Markov Decision\nProcess (MDP) and training an intelligent agent using RL. The agent calculates\nadjustment values based on the current and target beam states, executes\nactions, and iterates until optimal parameters are achieved. 
A policy network\nwith action attention is designed to improve decision-making by considering\nboth state differences and the impact of optical components. Experiments on two\nsimulated beamlines demonstrate that our algorithm outperforms existing\nmethods, with ablation studies highlighting the effectiveness of the action\nattention-based policy network.\n","authors":["Siyu Wang","Shengran Dai","Jianhui Jiang","Shuang Wu","Yufei Peng","Junbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12183v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.12162v1","updated":"2024-11-19T01:58:40Z","published":"2024-11-19T01:58:40Z","title":"Microsegmented Cloud Network Architecture Using Open-Source Tools for a\n Zero Trust Foundation","summary":" This paper presents a multi-cloud networking architecture built on zero trust\nprinciples and micro-segmentation to provide secure connectivity with\nauthentication, authorization, and encryption in transit. The proposed design\nincludes the multi-cloud network to support a wide range of applications and\nworkload use cases, compute resources including containers, virtual machines,\nand cloud-native services, including IaaS (Infrastructure as a Service (IaaS),\nPaaS (Platform as a service). 
Furthermore, open-source tools provide\nflexibility, agility, and independence from locking to one vendor technology.\nThe paper provides a secure architecture with micro-segmentation and follows\nzero trust principles to solve multi-fold security and operational challenges.\n","authors":["Sunil Arora","John Hastings"],"pdf_url":"https://arxiv.org/pdf/2411.12162v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12159v1","updated":"2024-11-19T01:52:59Z","published":"2024-11-19T01:52:59Z","title":"Sensor-fusion based Prognostics Framework for Complex Engineering\n Systems Exhibiting Multiple Failure Modes","summary":" Complex engineering systems are often subject to multiple failure modes.\nDeveloping a remaining useful life (RUL) prediction model that does not\nconsider the failure mode causing degradation is likely to result in inaccurate\npredictions. However, distinguishing between causes of failure without manually\ninspecting the system is nontrivial. This challenge is increased when the\ncauses of historically observed failures are unknown. Sensors, which are useful\nfor monitoring the state-of-health of systems, can also be used for\ndistinguishing between multiple failure modes as the presence of multiple\nfailure modes results in discriminatory behavior of the sensor signals. When\nsystems are equipped with multiple sensors, some sensors may exhibit behavior\ncorrelated with degradation, while other sensors do not. Furthermore, which\nsensors exhibit this behavior may differ for each failure mode. In this paper,\nwe present a simultaneous clustering and sensor selection approach for\nunlabeled training datasets of systems exhibiting multiple failure modes. The\ncluster assignments and the selected sensors are then utilized in real-time to\nfirst diagnose the active failure mode and then to predict the system RUL. 
We\nvalidate the complete pipeline of the methodology using a simulated dataset of\nsystems exhibiting two failure modes and on a turbofan degradation dataset from\nNASA.\n","authors":["Benjamin Peters","Ayush Mohanty","Xiaolei Fang","Stephen K. Robinson","Nagi Gebraeel"],"pdf_url":"https://arxiv.org/pdf/2411.12159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12154v1","updated":"2024-11-19T01:08:13Z","published":"2024-11-19T01:08:13Z","title":"Tangential Randomization in Linear Bandits (TRAiL): Guaranteed Inference\n and Regret Bounds","summary":" We propose and analyze TRAiL (Tangential Randomization in Linear Bandits), a\ncomputationally efficient regret-optimal forced exploration algorithm for\nlinear bandits on action sets that are sublevel sets of strongly convex\nfunctions. TRAiL estimates the governing parameter of the linear bandit problem\nthrough a standard regularized least squares and perturbs the reward-maximizing\naction corresponding to said point estimate along the tangent plane of the\nconvex compact action set before projecting back to it. Exploiting\nconcentration results for matrix martingales, we prove that TRAiL ensures a\n$\\Omega(\\sqrt{T})$ growth in the inference quality, measured via the minimum\neigenvalue of the design (regressor) matrix with high-probability over a\n$T$-length period. We build on this result to obtain an $\\mathcal{O}(\\sqrt{T}\n\\log(T))$ upper bound on cumulative regret with probability at least $ 1 - 1/T$\nover $T$ periods, and compare TRAiL to other popular algorithms for linear\nbandits. Then, we characterize an $\\Omega(\\sqrt{T})$ minimax lower bound for\nany algorithm on the expected regret that covers a wide variety of\naction/parameter sets and noise processes. Our analysis not only expands the\nrealm of lower-bounds in linear bandits significantly, but as a byproduct,\nyields a trade-off between regret and inference quality. 
Specifically, we prove\nthat any algorithm with an $\\mathcal{O}(T^\\alpha)$ expected regret growth must\nhave an $\\Omega(T^{1-\\alpha})$ asymptotic growth in expected inference quality.\nOur experiments on the $L^p$ unit ball as action sets reveal how this relation\ncan be violated, but only in the short-run, before returning to respect the\nbound asymptotically. In effect, regret-minimizing algorithms must have just\nthe right rate of inference -- too fast or too slow inference will incur\nsub-optimal regret growth.\n","authors":["Arda Güçlü","Subhonmesh Bose"],"pdf_url":"https://arxiv.org/pdf/2411.12154v1.pdf","comment":"42 pages, 6 Figures"},{"id":"http://arxiv.org/abs/2411.12152v1","updated":"2024-11-19T01:03:02Z","published":"2024-11-19T01:03:02Z","title":"Development of a Comprehensive Physics-Based Battery Model and Its\n Multidimensional Comparison with an Equivalent-Circuit Model: Accuracy,\n Complexity, and Real-World Performance under Varying Conditions","summary":" This paper develops a comprehensive physics-based model (PBM) that spans a\nwide operational range, including varying temperatures, charge/discharge\nconditions, and real-world field data cycles. The PBM incorporates key factors\nsuch as hysteresis effects, concentration-dependent diffusivity, and the\nArrhenius law to provide a realistic depiction of battery behavior.\nAdditionally, the paper presents an in-depth analysis comparing the PBM with an\nequivalent-circuit model (ECM) for accurately capturing the dynamics of\nlithium-ion batteries under diverse operating conditions. To ensure a fair\ncomparison, both the PBM and ECM are rigorously calibrated and validated\nthrough parameter identification and testing across 55 different operating\nconditions. 
To the best of the authors' knowledge, this represents the most\ncomprehensive model calibration and validation effort for PBM and ECM in the\nliterature to date, encompassing large temperature variations (-20 to\n40{\\deg}C), various charging/discharging C-rates, and real-world driving\ncycles. Comparative analysis between the PBM and ECM highlights key differences\nin accuracy, computational complexity, parameterization requirements, and\nperformance under varying temperature conditions. appropriate models for\nbattery management applications.\n","authors":["Guodong Fan","Boru Zhou","Chengwen Meng","Tengwei Pang","Xi Zhang","Mingshu Du","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.12152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12130v1","updated":"2024-11-19T00:06:46Z","published":"2024-11-19T00:06:46Z","title":"Adversarial Multi-Agent Reinforcement Learning for Proactive False Data\n Injection Detection","summary":" Smart inverters are instrumental in the integration of renewable and\ndistributed energy resources (DERs) into the electric grid. Such inverters rely\non communication layers for continuous control and monitoring, potentially\nexposing them to cyber-physical attacks such as false data injection attacks\n(FDIAs). We propose to construct a defense strategy against a priori unknown\nFDIAs with a multi-agent reinforcement learning (MARL) framework. The first\nagent is an adversary that simulates and discovers various FDIA strategies,\nwhile the second agent is a defender in charge of detecting and localizing\nFDIAs. This approach enables the defender to be trained against new FDIAs\ncontinuously generated by the adversary. 
The numerical results demonstrate that\nthe proposed MARL defender outperforms a supervised offline defender.\nAdditionally, we show that the detection skills of an MARL defender can be\ncombined with that of an offline defender through a transfer learning approach.\n","authors":["Kejun Chen","Truc Nguyen","Malik Hassanaly"],"pdf_url":"https://arxiv.org/pdf/2411.12130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05249v2","updated":"2024-11-19T00:05:02Z","published":"2024-04-08T07:25:25Z","title":"SAFE-GIL: SAFEty Guided Imitation Learning for Robotic Systems","summary":" Behavior cloning (BC) is a widely-used approach in imitation learning, where\na robot learns a control policy by observing an expert supervisor. However, the\nlearned policy can make errors and might lead to safety violations, which\nlimits their utility in safety-critical robotics applications. While prior\nworks have tried improving a BC policy via additional real or synthetic action\nlabels, adversarial training, or runtime filtering, none of them explicitly\nfocus on reducing the BC policy's safety violations during training time. We\npropose SAFE-GIL, a design-time method to learn safety-aware behavior cloning\npolicies. SAFE-GIL deliberately injects adversarial disturbance in the system\nduring data collection to guide the expert towards safety-critical states. This\ndisturbance injection simulates potential policy errors that the system might\nencounter during the test time. By ensuring that training more closely\nreplicates expert behavior in safety-critical states, our approach results in\nsafer policies despite policy errors during the test time. We further develop a\nreachability-based method to compute this adversarial disturbance. We compare\nSAFE-GIL with various behavior cloning techniques and online safety-filtering\nmethods in three domains: autonomous ground navigation, aircraft taxiing, and\naerial navigation on a quadrotor testbed. 
Our method demonstrates a significant\nreduction in safety failures, particularly in low data regimes where the\nlikelihood of learning errors, and therefore safety violations, is higher. See\nour website here: https://y-u-c.github.io/safegil/\n","authors":["Yusuf Umut Ciftci","Darren Chiu","Zeyuan Feng","Gaurav S. Sukhatme","Somil Bansal"],"pdf_url":"https://arxiv.org/pdf/2404.05249v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.12736v1","updated":"2024-11-19T18:58:03Z","published":"2024-11-19T18:58:03Z","title":"ACING: Actor-Critic for Instruction Learning in Black-Box Large Language\n Models","summary":" The effectiveness of Large Language Models (LLMs) in solving tasks vastly\ndepends on the quality of the instructions, which often require fine-tuning\nthrough extensive human effort. This highlights the need for automated\ninstruction optimization; however, this optimization is particularly\nchallenging when dealing with black-box LLMs, where model parameters and\ngradients remain inaccessible. We propose ACING, a task-specific prompt\noptimization approach framed as a stateless continuous-action Reinforcement\nLearning (RL) problem, known as the continuum bandit setting. ACING leverages\nan actor-critic-based method to optimize prompts, learning from\nnon-differentiable reward signals. We validate ACING by optimizing prompts for\nChatGPT on 30 instruction-based tasks. 
ACING consistently outperforms baseline\nmethods, achieving a median score improvement of 10 percentage points.\nFurthermore, ACING not only recovers but also surpasses human-crafted expert\ninstructions, achieving up to a 39 percentage point improvement against human\nbenchmarks.\n","authors":["Salma Kharrat","Fares Fourati","Marco Canini"],"pdf_url":"https://arxiv.org/pdf/2411.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12732v1","updated":"2024-11-19T18:57:01Z","published":"2024-11-19T18:57:01Z","title":"Benchmarking Positional Encodings for GNNs and Graph Transformers","summary":" Recent advances in Graph Neural Networks (GNNs) and Graph Transformers (GTs)\nhave been driven by innovations in architectures and Positional Encodings\n(PEs), which are critical for augmenting node features and capturing graph\ntopology. PEs are essential for GTs, where topological information would\notherwise be lost without message-passing. However, PEs are often tested\nalongside novel architectures, making it difficult to isolate their effect on\nestablished models. To address this, we present a comprehensive benchmark of\nPEs in a unified framework that includes both message-passing GNNs and GTs. We\nalso establish theoretical connections between MPNNs and GTs and introduce a\nsparsified GRIT attention mechanism to examine the influence of global\nconnectivity. Our findings demonstrate that previously untested combinations of\nGNN architectures and PEs can outperform existing methods and offer a more\ncomprehensive picture of the state-of-the-art. 
To support future research and\nexperimentation in our framework, we make the code publicly available.\n","authors":["Florian Grötschla","Jiaqing Xie","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2411.12732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06818v4","updated":"2024-11-19T18:56:07Z","published":"2024-06-10T22:01:34Z","title":"Conformal Prediction for Class-wise Coverage via Augmented Label Rank\n Calibration","summary":" Conformal prediction (CP) is an emerging uncertainty quantification framework\nthat allows us to construct a prediction set to cover the true label with a\npre-specified marginal or conditional probability. Although the valid coverage\nguarantee has been extensively studied for classification problems, CP often\nproduces large prediction sets which may not be practically useful. This issue\nis exacerbated for the setting of class-conditional coverage on imbalanced\nclassification tasks with many and/or imbalanced classes. This paper proposes\nthe Rank Calibrated Class-conditional CP (RC3P) algorithm to reduce the\nprediction set sizes to achieve class-conditional coverage, where the valid\ncoverage holds for each class. In contrast to the standard class-conditional CP\n(CCP) method that uniformly thresholds the class-wise conformity score for each\nclass, the augmented label rank calibration step allows RC3P to selectively\niterate this class-wise thresholding subroutine only for a subset of classes\nwhose class-wise top-k error is small. We prove that agnostic to the classifier\nand data distribution, RC3P achieves class-wise coverage. 
We also show that\nRC3P reduces the size of prediction sets compared to the CCP method.\nComprehensive experiments on multiple real-world datasets demonstrate that RC3P\nachieves class-wise coverage and 26.25% reduction in prediction set sizes on\naverage.\n","authors":["Yuanjie Shi","Subhankar Ghosh","Taha Belkhouja","Janardhan Rao Doppa","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2406.06818v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12730v1","updated":"2024-11-19T18:52:55Z","published":"2024-11-19T18:52:55Z","title":"Testing classical properties from quantum data","summary":" Many properties of Boolean functions can be tested far more efficiently than\nthe function can be learned. However, this advantage often disappears when\ntesters are limited to random samples--a natural setting for data\nscience--rather than queries. In this work we investigate the quantum version\nof this scenario: quantum algorithms that test properties of a function $f$\nsolely from quantum data in the form of copies of the function state for $f$.\n For three well-established properties, we show that the speedup lost when\nrestricting classical testers to samples can be recovered by testers that use\nquantum data. For monotonicity testing, we give a quantum algorithm that uses\n$\\tilde{\\mathcal{O}}(n^2)$ function state copies as compared to the\n$2^{\\Omega(\\sqrt{n})}$ samples required classically. We also present\n$\\mathcal{O}(1)$-copy testers for symmetry and triangle-freeness, comparing\nfavorably to classical lower bounds of $\\Omega(n^{1/4})$ and $\\Omega(n)$\nsamples respectively. These algorithms are time-efficient and necessarily\ninclude techniques beyond the Fourier sampling approaches applied to earlier\ntesting problems.\n These results make the case for a general study of the advantages afforded by\nquantum data for testing. 
We contribute to this project by complementing our\nupper bounds with a lower bound of $\\Omega(1/\\varepsilon)$ for monotonicity\ntesting from quantum data in the proximity regime\n$\\varepsilon\\leq\\mathcal{O}(n^{-3/2})$. This implies a strict separation\nbetween testing monotonicity from quantum data and from quantum queries--where\n$\\tilde{\\mathcal{O}}(n)$ queries suffice when $\\varepsilon=\\Theta(n^{-3/2})$.\nWe also exhibit a testing problem that can be solved from $\\mathcal{O}(1)$\nclassical queries but requires $\\Omega(2^{n/2})$ function state copies,\ncomplementing a separation of the same magnitude in the opposite direction\nderived from the Forrelation problem.\n","authors":["Matthias C. Caro","Preksha Naik","Joseph Slote"],"pdf_url":"https://arxiv.org/pdf/2411.12730v1.pdf","comment":"38 + 14 pages, 2 tables, 2 figures"},{"id":"http://arxiv.org/abs/2411.11748v2","updated":"2024-11-19T18:50:19Z","published":"2024-11-18T17:25:06Z","title":"Debiased Regression for Root-N-Consistent Conditional Mean Estimation","summary":" This study introduces a debiasing method for regression estimators, including\nhigh-dimensional and nonparametric regression estimators. For example,\nnonparametric regression methods allow for the estimation of regression\nfunctions in a data-driven manner with minimal assumptions; however, these\nmethods typically fail to achieve $\\sqrt{n}$-consistency in their convergence\nrates, and many, including those in machine learning, lack guarantees that\ntheir estimators asymptotically follow a normal distribution. To address these\nchallenges, we propose a debiasing technique for nonparametric estimators by\nadding a bias-correction term to the original estimators, extending the\nconventional one-step estimator used in semiparametric analysis. 
Specifically,\nfor each data point, we estimate the conditional expected residual of the\noriginal nonparametric estimator, which can, for instance, be computed using\nkernel (Nadaraya-Watson) regression, and incorporate it as a bias-reduction\nterm. Our theoretical analysis demonstrates that the proposed estimator\nachieves $\\sqrt{n}$-consistency and asymptotic normality under a mild\nconvergence rate condition for both the original nonparametric estimator and\nthe conditional expected residual estimator. Notably, this approach remains\nmodel-free as long as the original estimator and the conditional expected\nresidual estimator satisfy the convergence rate condition. The proposed method\noffers several advantages, including improved estimation accuracy and\nsimplified construction of confidence intervals.\n","authors":["Masahiro Kato"],"pdf_url":"https://arxiv.org/pdf/2411.11748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12726v1","updated":"2024-11-19T18:48:00Z","published":"2024-11-19T18:48:00Z","title":"LazyDINO: Fast, scalable, and efficiently amortized Bayesian inversion\n via structure-exploiting and surrogate-driven measure transport","summary":" We present LazyDINO, a transport map variational inference method for fast,\nscalable, and efficiently amortized solutions of high-dimensional nonlinear\nBayesian inverse problems with expensive parameter-to-observable (PtO) maps.\nOur method consists of an offline phase in which we construct a\nderivative-informed neural surrogate of the PtO map using joint samples of the\nPtO map and its Jacobian. During the online phase, when given observational\ndata, we seek rapid posterior approximation using surrogate-driven training of\na lazy map [Brennan et al., NeurIPS, (2020)], i.e., a structure-exploiting\ntransport map with low-dimensional nonlinearity. The trained lazy map then\nproduces approximate posterior samples or density evaluations. 
Our surrogate\nconstruction is optimized for amortized Bayesian inversion using lazy map\nvariational inference. We show that (i) the derivative-based reduced basis\narchitecture [O'Leary-Roseberry et al., Comput. Methods Appl. Mech. Eng., 388\n(2022)] minimizes the upper bound on the expected error in surrogate posterior\napproximation, and (ii) the derivative-informed training formulation\n[O'Leary-Roseberry et al., J. Comput. Phys., 496 (2024)] minimizes the expected\nerror due to surrogate-driven transport map optimization. Our numerical results\ndemonstrate that LazyDINO is highly efficient in cost amortization for Bayesian\ninversion. We observe one to two orders of magnitude reduction of offline cost\nfor accurate posterior approximation, compared to simulation-based amortized\ninference via conditional transport and conventional surrogate-driven\ntransport. In particular, LazyDINO outperforms Laplace approximation\nconsistently using fewer than 1000 offline samples, while other amortized\ninference methods struggle and sometimes fail at 16,000 offline samples.\n","authors":["Lianghao Cao","Joshua Chen","Michael Brennan","Thomas O'Leary-Roseberry","Youssef Marzouk","Omar Ghattas"],"pdf_url":"https://arxiv.org/pdf/2411.12726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12724v1","updated":"2024-11-19T18:45:16Z","published":"2024-11-19T18:45:16Z","title":"Heuristic-Free Multi-Teacher Learning","summary":" We introduce Teacher2Task, a novel framework for multi-teacher learning that\neliminates the need for manual aggregation heuristics. Existing multi-teacher\nmethods typically rely on such heuristics to combine predictions from multiple\nteachers, often resulting in sub-optimal aggregated labels and the propagation\nof aggregation errors. Teacher2Task addresses these limitations by introducing\nteacher-specific input tokens and reformulating the training process. 
Instead\nof relying on aggregated labels, the framework transforms the training data,\nconsisting of ground truth labels and annotations from N teachers, into N+1\ndistinct tasks: N auxiliary tasks that predict the labeling styles of the N\nindividual teachers, and one primary task that focuses on the ground truth\nlabels. This approach, drawing upon principles from multiple learning\nparadigms, demonstrates strong empirical results across a range of\narchitectures, modalities, and tasks.\n","authors":["Huy Thong Nguyen","En-Hung Chu","Lenord Melvix","Jazon Jiao","Chunglin Wen","Benjamin Louie"],"pdf_url":"https://arxiv.org/pdf/2411.12724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12719v1","updated":"2024-11-19T18:37:45Z","published":"2024-11-19T18:37:45Z","title":"Rethinking MUSHRA: Addressing Modern Challenges in Text-to-Speech\n Evaluation","summary":" Despite rapid advancements in TTS models, a consistent and robust human\nevaluation framework is still lacking. For example, MOS tests fail to\ndifferentiate between similar models, and CMOS's pairwise comparisons are\ntime-intensive. The MUSHRA test is a promising alternative for evaluating\nmultiple TTS systems simultaneously, but in this work we show that its reliance\non matching human reference speech unduly penalises the scores of modern TTS\nsystems that can exceed human speech quality. More specifically, we conduct a\ncomprehensive assessment of the MUSHRA test, focusing on its sensitivity to\nfactors such as rater variability, listener fatigue, and reference bias. Based\non our extensive evaluation involving 471 human listeners across Hindi and\nTamil we identify two primary shortcomings: (i) reference-matching bias, where\nraters are unduly influenced by the human reference, and (ii) judgement\nambiguity, arising from a lack of clear fine-grained guidelines. To address\nthese issues, we propose two refined variants of the MUSHRA test. 
The first\nvariant enables fairer ratings for synthesized samples that surpass human\nreference quality. The second variant reduces ambiguity, as indicated by the\nrelatively lower variance across raters. By combining these approaches, we\nachieve both more reliable and more fine-grained assessments. We also release\nMANGO, a massive dataset of 47,100 human ratings, the first-of-its-kind\ncollection for Indian languages, aiding in analyzing human preferences and\ndeveloping automatic metrics for evaluating TTS systems.\n","authors":["Praveen Srinivasa Varadhan","Amogh Gulati","Ashwin Sankar","Srija Anand","Anirudh Gupta","Anirudh Mukherjee","Shiva Kumar Marepally","Ankur Bhatia","Saloni Jaju","Suvrat Bhooshan","Mitesh M. Khapra"],"pdf_url":"https://arxiv.org/pdf/2411.12719v1.pdf","comment":"19 pages, 12 Figures"},{"id":"http://arxiv.org/abs/2406.17918v3","updated":"2024-11-19T18:24:03Z","published":"2024-06-25T20:00:32Z","title":"GraphSnapShot: Graph Machine Learning Acceleration with Fast Storage and\n Retrieval","summary":" In our recent research, we have developed a framework called GraphSnapShot,\nwhich has been proven an useful tool for graph learning acceleration.\nGraphSnapShot is a framework for fast cache, storage, retrieval and computation\nfor graph learning. It can quickly store and update the local topology of graph\nstructure and allows us to track patterns in the structure of graph networks,\njust like take snapshots of the graphs. 
In experiments, GraphSnapShot shows\nefficiency, it can achieve up to 30% training acceleration and 73% memory\nreduction for lossless graph ML training compared to current baselines such as\ndgl.This technique is particular useful for large dynamic graph learning tasks\nsuch as social media analysis and recommendation systems to process complex\nrelationships between entities.\n The code for GraphSnapShot is publicly available at\nhttps://github.com/NoakLiu/GraphSnapShot.\n","authors":["Dong Liu","Roger Waleffe","Meng Jiang","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2406.17918v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11046v2","updated":"2024-11-19T18:18:04Z","published":"2024-03-17T00:11:15Z","title":"Regulating Chatbot Output via Inter-Informational Competition","summary":" The advent of ChatGPT has sparked over a year of regulatory frenzy. However,\nfew existing studies have rigorously questioned the assumption that, if left\nunregulated, AI chatbot's output would inflict tangible, severe real harm on\nhuman affairs. Most researchers have overlooked the critical possibility that\nthe information market itself can effectively mitigate these risks and, as a\nresult, they tend to use regulatory tools to address the issue directly. This\nArticle develops a yardstick for reevaluating both AI-related content risks and\ncorresponding regulatory proposals by focusing on inter-informational\ncompetition among various outlets. 
The decades-long history of regulating\ninformation and communications technologies indicates that regulators tend to\nerr too much on the side of caution and to put forward excessive regulatory\nmeasures when encountering the uncertainties brought about by new technologies.\nIn fact, a trove of empirical evidence has demonstrated that market competition\namong information outlets can effectively mitigate most risks and that\noverreliance on regulation is not only unnecessary but detrimental, as well.\nThis Article argues that sufficient competition among chatbots and other\ninformation outlets in the information marketplace can sufficiently mitigate\nand even resolve most content risks posed by generative AI technologies. This\nrenders certain loudly advocated regulatory strategies, like mandatory\nprohibitions, licensure, curation of datasets, and notice-and-response regimes,\ntruly unnecessary and even toxic to desirable competition and innovation\nthroughout the AI industry. Ultimately, the ideas that I advance in this\nArticle should pour some much-needed cold water on the regulatory frenzy over\ngenerative AI and steer the issue back to a rational track.\n","authors":["Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11046v2.pdf","comment":"50-page legal Article, forthcoming in Northwestern Journal of\n Technology and Intellectual Property"},{"id":"http://arxiv.org/abs/2402.01306v4","updated":"2024-11-19T18:12:45Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":" Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner (1992); for example,\nhumans are famously loss-averse. 
We show that objectives for aligning LLMs with\nhuman feedback implicitly incorporate many of these biases -- the success of\nthese objectives (e.g., DPO) over cross-entropy minimization can partly be\nascribed to them belonging to a family of loss functions that we call\n$\\textit{human-aware losses}$ (HALOs). However, the utility functions these\nmethods attribute to humans still differ from those in the prospect theory\nliterature. Using a Kahneman-Tversky model of human utility, we propose a HALO\nthat directly maximizes the utility of generations instead of maximizing the\nlog-likelihood of preferences, as current methods do. We call this approach\nKTO, and it matches or exceeds the performance of preference-based methods at\nscales from 1B to 30B, despite only learning from a binary signal of whether an\noutput is desirable. More broadly, our work suggests that there is no one HALO\nthat is universally superior; the best loss depends on the inductive biases\nmost appropriate for a given setting, an oft-overlooked consideration.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2411.12700v1","updated":"2024-11-19T18:08:01Z","published":"2024-11-19T18:08:01Z","title":"Learning multivariate Gaussians with imperfect advice","summary":" We revisit the problem of distribution learning within the framework of\nlearning-augmented algorithms. In this setting, we explore the scenario where a\nprobability distribution is provided as potentially inaccurate advice on the\ntrue, unknown distribution. 
Our objective is to develop learning algorithms\nwhose sample complexity decreases as the quality of the advice improves,\nthereby surpassing standard learning lower bounds when the advice is\nsufficiently accurate.\n Specifically, we demonstrate that this outcome is achievable for the problem\nof learning a multivariate Gaussian distribution $N(\\boldsymbol{\\mu},\n\\boldsymbol{\\Sigma})$ in the PAC learning setting. Classically, in the\nadvice-free setting, $\\tilde{\\Theta}(d^2/\\varepsilon^2)$ samples are sufficient\nand worst case necessary to learn $d$-dimensional Gaussians up to TV distance\n$\\varepsilon$ with constant probability. When we are additionally given a\nparameter $\\tilde{\\boldsymbol{\\Sigma}}$ as advice, we show that\n$\\tilde{O}(d^{2-\\beta}/\\varepsilon^2)$ samples suffices whenever $\\|\n\\tilde{\\boldsymbol{\\Sigma}}^{-1/2} \\boldsymbol{\\Sigma}\n\\tilde{\\boldsymbol{\\Sigma}}^{-1/2} - \\boldsymbol{I_d} \\|_1 \\leq \\varepsilon\nd^{1-\\beta}$ (where $\\|\\cdot\\|_1$ denotes the entrywise $\\ell_1$ norm) for any\n$\\beta > 0$, yielding a polynomial improvement over the advice-free setting.\n","authors":["Arnab Bhattacharyya","Davin Choo","Philips George John","Themis Gouleakis"],"pdf_url":"https://arxiv.org/pdf/2411.12700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12697v1","updated":"2024-11-19T18:06:06Z","published":"2024-11-19T18:06:06Z","title":"Attribute Inference Attacks for Federated Regression Tasks","summary":" Federated Learning (FL) enables multiple clients, such as mobile phones and\nIoT devices, to collaboratively train a global machine learning model while\nkeeping their data localized. However, recent studies have revealed that the\ntraining phase of FL is vulnerable to reconstruction attacks, such as attribute\ninference attacks (AIA), where adversaries exploit exchanged messages and\nauxiliary public information to uncover sensitive attributes of targeted\nclients. 
While these attacks have been extensively studied in the context of\nclassification tasks, their impact on regression tasks remains largely\nunexplored. In this paper, we address this gap by proposing novel model-based\nAIAs specifically designed for regression tasks in FL environments. Our\napproach considers scenarios where adversaries can either eavesdrop on\nexchanged messages or directly interfere with the training process. We\nbenchmark our proposed attacks against state-of-the-art methods using\nreal-world datasets. The results demonstrate a significant increase in\nreconstruction accuracy, particularly in heterogeneous client datasets, a\ncommon scenario in FL. The efficacy of our model-based AIAs makes them better\ncandidates for empirically quantifying privacy leakage for federated regression\ntasks.\n","authors":["Francesco Diana","Othmane Marfoq","Chuan Xu","Giovanni Neglia","Frédéric Giroire","Eoin Thomas"],"pdf_url":"https://arxiv.org/pdf/2411.12697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12689v1","updated":"2024-11-19T17:53:30Z","published":"2024-11-19T17:53:30Z","title":"IMUVIE: Pickup Timeline Action Localization via Motion Movies","summary":" Falls among seniors due to difficulties with tasks such as picking up objects\npose significant health and safety risks, impacting quality of life and\nindependence. Reliable, accessible assessment tools are critical for early\nintervention but often require costly clinic-based equipment and trained\npersonnel, limiting their use in daily life. Existing wearable-based pickup\nmeasurement solutions address some needs but face limitations in\ngeneralizability.\n We present IMUVIE, a wearable system that uses motion movies and a\nmachine-learning model to automatically detect and measure pickup events,\nproviding a practical solution for frequent monitoring. 
IMUVIE's design\nprinciples-data normalization, occlusion handling, and streamlined\nvisuals-enhance model performance and are adaptable to tasks beyond pickup\nclassification.\n In rigorous leave one subject out cross validation evaluations, IMUVIE\nachieves exceptional window level localization accuracy of 91-92% for pickup\naction classification on 256,291 motion movie frame candidates while\nmaintaining an event level recall of 97% when evaluated on 129 pickup events.\nIMUVIE has strong generalization and performs well on unseen subjects. In an\ninterview survey, IMUVIE demonstrated strong user interest and trust, with ease\nof use identified as the most critical factor for adoption. IMUVIE offers a\npractical, at-home solution for fall risk assessment, facilitating early\ndetection of movement deterioration, and supporting safer, independent living\nfor seniors.\n","authors":["John Clapham","Kenneth Koltermann","Yanfu Zhang","Yuming Sun","Evie N Burnet","Gang Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.12689v1.pdf","comment":"This is a preprint version, 12 pages, 20 figures, 3 tables"},{"id":"http://arxiv.org/abs/2406.08316v3","updated":"2024-11-19T17:49:27Z","published":"2024-06-12T15:16:40Z","title":"Is Programming by Example solved by LLMs?","summary":" Programming-by-Examples (PBE) aims to generate an algorithm from input-output\nexamples. Such systems are practically and theoretically important: from an\nend-user perspective, they are deployed to millions of people, and from an AI\nperspective, PBE corresponds to a very general form of few-shot inductive\ninference. Given the success of Large Language Models (LLMs) in code-generation\ntasks, we investigate here the extent to which LLMs can be said to have\n\"solved\" PBE. We experiment on classic domains such as lists and strings, and\nan uncommon graphics programming domain not well represented in typical\npretraining data. 
We find that pretrained models are not effective at PBE, but\nthat they can be fine-tuned for much higher performance, provided the test\nproblems are in-distribution. We analyze empirically what causes these models\nto succeed and fail, and take steps toward understanding how to achieve better\nout-of-distribution generalization. Collectively these results suggest that\nLLMs make strong progress toward solving the typical suite of PBE tasks,\npotentially increasing the flexibility and applicability of PBE systems, while\nalso identifying ways in which LLMs still fall short.\n","authors":["Wen-Ding Li","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2406.08316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03530v4","updated":"2024-11-19T17:41:00Z","published":"2023-06-06T09:26:43Z","title":"RLtools: A Fast, Portable Deep Reinforcement Learning Library for\n Continuous Control","summary":" Deep Reinforcement Learning (RL) can yield capable agents and control\npolicies in several domains but is commonly plagued by prohibitively long\ntraining times. Additionally, in the case of continuous control problems, the\napplicability of learned policies on real-world embedded devices is limited due\nto the lack of real-time guarantees and portability of existing libraries. To\naddress these challenges, we present RLtools, a dependency-free, header-only,\npure C++ library for deep supervised and reinforcement learning. Its novel\narchitecture allows RLtools to be used on a wide variety of platforms, from HPC\nclusters over workstations and laptops to smartphones, smartwatches, and\nmicrocontrollers. Specifically, due to the tight integration of the RL\nalgorithms with simulation environments, RLtools can solve popular RL problems\nup to 76 times faster than other popular RL frameworks. We also benchmark the\ninference on a diverse set of microcontrollers and show that in most cases our\noptimized implementation is by far the fastest. 
Finally, RLtools enables the\nfirst-ever demonstration of training a deep RL algorithm directly on a\nmicrocontroller, giving rise to the field of TinyRL. The source code as well as\ndocumentation and live demos are available through our project page at\nhttps://rl.tools.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2306.03530v4.pdf","comment":"Project page: https://rl.tools"},{"id":"http://arxiv.org/abs/2108.00480v4","updated":"2024-11-19T17:33:22Z","published":"2021-08-01T15:43:57Z","title":"Realised Volatility Forecasting: Machine Learning via Financial Word\n Embedding","summary":" This study develops a financial word embedding using 15 years of business\nnews. Our results show that this specialised language model produces more\naccurate results than general word embeddings, based on a financial benchmark\nwe established. As an application, we incorporate this word embedding into a\nsimple machine learning model to enhance the HAR model for forecasting realised\nvolatility. This approach statistically and economically outperforms\nestablished econometric models. Using an explainable AI method, we also\nidentify key phrases in business news that contribute significantly to\nvolatility, offering insights into language patterns tied to market dynamics.\n","authors":["Eghbal Rahimikia","Stefan Zohren","Ser-Huang Poon"],"pdf_url":"https://arxiv.org/pdf/2108.00480v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12676v1","updated":"2024-11-19T17:29:59Z","published":"2024-11-19T17:29:59Z","title":"IoT-Based 3D Pose Estimation and Motion Optimization for Athletes:\n Application of C3D and OpenPose","summary":" This study proposes the IoT-Enhanced Pose Optimization Network (IE-PONet) for\nhigh-precision 3D pose estimation and motion optimization of track and field\nathletes. 
IE-PONet integrates C3D for spatiotemporal feature extraction,\nOpenPose for real-time keypoint detection, and Bayesian optimization for\nhyperparameter tuning. Experimental results on NTURGB+D and FineGYM datasets\ndemonstrate superior performance, with AP\\(^p50\\) scores of 90.5 and 91.0, and\nmAP scores of 74.3 and 74.0, respectively. Ablation studies confirm the\nessential roles of each module in enhancing model accuracy. IE-PONet provides a\nrobust tool for athletic performance analysis and optimization, offering\nprecise technical insights for training and injury prevention. Future work will\nfocus on further model optimization, multimodal data integration, and\ndeveloping real-time feedback mechanisms to enhance practical applications.\n","authors":["Fei Ren","Chao Ren","Tianyi Lyu"],"pdf_url":"https://arxiv.org/pdf/2411.12676v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2411.02272v3","updated":"2024-11-19T17:29:58Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC, a highly diverse dataset of abstract reasoning tasks. We train\nneural models for induction (inferring latent functions) and transduction\n(directly predicting the test output for a given test input). Our models are\ntrained on synthetic data generated by prompting LLMs to produce Python code\nspecifying a function to be inferred, plus a stochastic subroutine for\ngenerating inputs to that function. We find inductive and transductive models\nsolve very different problems, despite training on the same problems, and\ndespite sharing the same neural architecture.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. 
Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12667v1","updated":"2024-11-19T17:19:20Z","published":"2024-11-19T17:19:20Z","title":"Machine Learning Approaches on Crop Pattern Recognition a Comparative\n Analysis","summary":" Monitoring agricultural activities is important to ensure food security.\nRemote sensing plays a significant role for large-scale continuous monitoring\nof cultivation activities. Time series remote sensing data were used for the\ngeneration of the cropping pattern. Classification algorithms are used to\nclassify crop patterns and mapped agriculture land used. Some conventional\nclassification methods including support vector machine (SVM) and decision\ntrees were applied for crop pattern recognition. However, in this paper, we are\nproposing Deep Neural Network (DNN) based classification to improve the\nperformance of crop pattern recognition and make a comparative analysis with\ntwo (2) other machine learning approaches including Naive Bayes and Random\nForest.\n","authors":["Kazi Hasibul Kabir","Md. Zahiruddin Aqib","Sharmin Sultana","Shamim Akhter"],"pdf_url":"https://arxiv.org/pdf/2411.12667v1.pdf","comment":"Published in ICNTET2018: International Conference on New Trends in\n Engineering & Technology Tirupathi Highway, Tiruvallur Dist Chennai, India,\n September 7-8, 2018"},{"id":"http://arxiv.org/abs/2411.12665v1","updated":"2024-11-19T17:17:46Z","published":"2024-11-19T17:17:46Z","title":"Auto-Evaluation with Few Labels through Post-hoc Regression","summary":" Continually evaluating large generative models provides a unique challenge.\nOften, human annotations are necessary to evaluate high-level properties of\nthese models (e.g. in text or images). 
However, collecting human annotations of\nsamples can be resource intensive, and using other machine learning systems to\nprovide the annotations, or automatic evaluation, can introduce systematic\nerrors into the evaluation. The Prediction Powered Inference (PPI) framework\nprovides a way of leveraging both the statistical power of automatic evaluation\nand a small pool of labelled data to produce a low-variance, unbiased estimate\nof the quantity being evaluated for. However, most work on PPI considers a\nrelatively sizable set of labelled samples, which is not always practical to\nobtain. To this end, we present two new PPI-based techniques that leverage\nrobust regressors to produce even lower variance estimators in the few-label\nregime.\n","authors":["Benjamin Eyre","David Madras"],"pdf_url":"https://arxiv.org/pdf/2411.12665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12663v1","updated":"2024-11-19T17:16:31Z","published":"2024-11-19T17:16:31Z","title":"PoM: Efficient Image and Video Generation with the Polynomial Mixer","summary":" Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous\nto generate high quality images and videos. However, encoding an image or a\nvideo as a sequence of patches results in costly attention patterns, as the\nrequirements both in terms of memory and compute grow quadratically. To\nalleviate this problem, we propose a drop-in replacement for MHA called the\nPolynomial Mixer (PoM) that has the benefit of encoding the entire sequence\ninto an explicit state. PoM has a linear complexity with respect to the number\nof tokens. This explicit state also allows us to generate frames in a\nsequential fashion, minimizing memory and compute requirement, while still\nbeing able to train in parallel. We show the Polynomial Mixer is a universal\nsequence-to-sequence approximator, just like regular MHA. 
We adapt several\nDiffusion Transformers (DiT) for generating images and videos with PoM\nreplacing MHA, and we obtain high quality samples while using less\ncomputational resources. The code is available at\nhttps://github.com/davidpicard/HoMM.\n","authors":["David Picard","Nicolas Dufour"],"pdf_url":"https://arxiv.org/pdf/2411.12663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05972v3","updated":"2024-11-19T17:16:03Z","published":"2024-01-11T15:20:06Z","title":"Scientific Machine Learning Based Reduced-Order Models for Plasma\n Turbulence Simulations","summary":" This paper investigates non-intrusive Scientific Machine Learning (SciML)\nReduced-Order Models (ROMs) for plasma turbulence simulations. In particular,\nwe focus on Operator Inference (OpInf) to build low-cost physics-based ROMs\nfrom data for such simulations. As a representative example, we consider the\n(classical) Hasegawa-Wakatani (HW) equations used for modeling two-dimensional\nelectrostatic drift-wave turbulence. For a comprehensive perspective of the\npotential of OpInf to construct predictive ROMs, we consider three setups for\nthe HW equations by varying a key parameter, namely the adiabaticity\ncoefficient. These setups lead to the formation of complex and nonlinear\ndynamics, which makes the construction of predictive ROMs of any kind\nchallenging. We generate the training datasets by performing direct numerical\nsimulations of the HW equations and recording the computed state data and\noutputs the over a time horizon of $100$ time units in the turbulent phase. We\nthen use these datasets to construct OpInf ROMs for predictions over $400$\nadditional time units, that is, $400\\%$ more than the training horizon. Our\nresults show that the OpInf ROMs capture important statistical features of the\nturbulent dynamics and generalize beyond the training time horizon while\nreducing the computational effort of the high-fidelity simulation by up to five\norders of magnitude. 
In the broader context of fusion research, this shows that\nnon-intrusive SciML ROMs have the potential to drastically accelerate numerical\nstudies, which can ultimately enable tasks such as the design of optimized\nfusion devices.\n","authors":["Constantin Gahr","Ionut-Gabriel Farcas","Frank Jenko"],"pdf_url":"https://arxiv.org/pdf/2401.05972v3.pdf","comment":"14 pages in double column format, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2202.09481v2","updated":"2024-11-19T16:55:55Z","published":"2022-02-19T00:30:52Z","title":"TransDreamer: Reinforcement Learning with Transformer World Models","summary":" The Dreamer agent provides various benefits of Model-Based Reinforcement\nLearning (MBRL) such as sample efficiency, reusable knowledge, and safe\nplanning. However, its world model and policy networks inherit the limitations\nof recurrent neural networks and thus an important question is how an MBRL\nframework can benefit from the recent advances of transformers and what the\nchallenges are in doing so. In this paper, we propose a transformer-based MBRL\nagent, called TransDreamer. We first introduce the Transformer State-Space\nModel, a world model that leverages a transformer for dynamics predictions. We\nthen share this world model with a transformer-based policy network and obtain\nstability in training a transformer-based RL agent. In experiments, we apply\nthe proposed model to 2D visual RL and 3D first-person visual RL tasks both\nrequiring long-range memory access for memory-based reasoning. 
We show that the\nproposed model outperforms Dreamer in these complex tasks.\n","authors":["Chang Chen","Yi-Fu Wu","Jaesik Yoon","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2202.09481v2.pdf","comment":"Deep RL Workshop NeurIPS 2021"},{"id":"http://arxiv.org/abs/2411.12643v1","updated":"2024-11-19T16:54:30Z","published":"2024-11-19T16:54:30Z","title":"DLBacktrace: A Model Agnostic Explainability for any Deep Learning\n Models","summary":" The rapid advancement of artificial intelligence has led to increasingly\nsophisticated deep learning models, which frequently operate as opaque 'black\nboxes' with limited transparency in their decision-making processes. This lack\nof interpretability presents considerable challenges, especially in high-stakes\napplications where understanding the rationale behind a model's outputs is as\nessential as the outputs themselves. This study addresses the pressing need for\ninterpretability in AI systems, emphasizing its role in fostering trust,\nensuring accountability, and promoting responsible deployment in\nmission-critical fields. To address the interpretability challenge in deep\nlearning, we introduce DLBacktrace, an innovative technique developed by the\nAryaXAI team to illuminate model decisions across a wide array of domains,\nincluding simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks\n(CNNs), Large Language Models (LLMs), Computer Vision Models, and more.\n We provide a comprehensive overview of the DLBacktrace algorithm and present\nbenchmarking results, comparing its performance against established\ninterpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients,\nSmoothGrad, and Attention Rollout, using diverse task-based metrics. 
The\nproposed DLBacktrace technique is compatible with various model architectures\nbuilt in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP\narchitectures such as BERT and LSTMs, computer vision models like ResNet and\nU-Net, as well as custom deep neural network (DNN) models for tabular data.\nThis flexibility underscores DLBacktrace's adaptability and effectiveness in\nenhancing model transparency across a broad spectrum of applications. The\nlibrary is open-sourced and available at https://github.com/AryaXAI/DLBacktrace .\n","authors":["Vinay Kumar Sankarapu","Chintan Chitroda","Yashwardhan Rathore","Neeraj Kumar Singh","Pratinav Seth"],"pdf_url":"https://arxiv.org/pdf/2411.12643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12640v1","updated":"2024-11-19T16:51:56Z","published":"2024-11-19T16:51:56Z","title":"Leadsee-Precip: A Deep Learning Diagnostic Model for Precipitation","summary":" Recently, deep-learning weather forecasting models have surpassed traditional\nnumerical models in terms of the accuracy of meteorological variables. However,\nthere is considerable potential for improvements in precipitation forecasts,\nespecially for heavy precipitation events. To address this deficiency, we\npropose Leadsee-Precip, a global deep learning model to generate precipitation\nfrom meteorological circulation fields. The model utilizes an information\nbalance scheme to tackle the challenges of predicting heavy precipitation\ncaused by the long-tail distribution of precipitation data. Additionally, more\naccurate satellite and radar-based precipitation retrievals are used as\ntraining targets. Compared to artificial intelligence global weather models,\nthe heavy precipitation from Leadsee-Precip is more consistent with\nobservations and shows competitive performance against global numerical weather\nprediction models. Leadsee-Precip can be integrated with any global circulation\nmodel to generate precipitation forecasts. 
But the deviations between the\npredicted and the ground-truth circulation fields may lead to a weakened\nprecipitation forecast, which could potentially be mitigated by further\nfine-tuning based on the predicted circulation fields.\n","authors":["Weiwen Ji","Jin Feng","Yueqi Liu","Yulu Qiu","Hua Gao"],"pdf_url":"https://arxiv.org/pdf/2411.12640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12636v1","updated":"2024-11-19T16:49:58Z","published":"2024-11-19T16:49:58Z","title":"PyAWD: A Library for Generating Large Synthetic Datasets of Acoustic\n Wave Propagation with Devito","summary":" Seismic data is often sparse and unevenly distributed due to the high costs\nand logistical challenges associated with deploying physical seismometers,\nlimiting the application of Machine Learning (ML) in earthquake analysis. To\naddress this gap, we introduce PyAWD, a Python library designed to generate\nhigh-resolution synthetic datasets simulating spatio-temporal acoustic wave\npropagation in both two-dimensional and three-dimensional heterogeneous media.\nBy allowing fine control over parameters such as wave speed, external forces,\nspatial and temporal discretization, and media composition, PyAWD enables the\ncreation of ML-scale datasets that capture the complexity of seismic wave\nbehavior. 
We illustrate the library's potential with an epicenter retrieval\ntask, showcasing its suitability for designing complex, accurate seismic\nproblems that support advanced ML approaches in the absence or lack of dense\nreal-world data.\n","authors":["Pascal Tribel","Gianluca Bontempi"],"pdf_url":"https://arxiv.org/pdf/2411.12636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03320v3","updated":"2024-11-19T16:49:12Z","published":"2024-10-20T18:35:56Z","title":"log-RRIM: Yield Prediction via Local-to-global Reaction Representation\n Learning and Interaction Modeling","summary":" Accurate prediction of chemical reaction yields is crucial for optimizing\norganic synthesis, potentially reducing time and resources spent on\nexperimentation. With the rise of artificial intelligence (AI), there is\ngrowing interest in leveraging AI-based methods to accelerate yield predictions\nwithout conducting in vitro experiments. We present log-RRIM, an innovative\ngraph transformer-based framework designed for predicting chemical reaction\nyields. Our approach implements a unique local-to-global reaction\nrepresentation learning strategy. This approach initially captures detailed\nmolecule-level information and then models and aggregates intermolecular\ninteractions, ensuring that the impact of varying-sizes molecular fragments on\nyield is accurately accounted for. Another key feature of log-RRIM is its\nintegration of a cross-attention mechanism that focuses on the interplay\nbetween reagents and reaction centers. This design reflects a fundamental\nprinciple in chemical reactions: the crucial role of reagents in influencing\nbond-breaking and formation processes, which ultimately affect reaction yields.\nlog-RRIM outperforms existing methods in our experiments, especially for medium\nto high-yielding reactions, proving its reliability as a predictor. 
Its\nadvanced modeling of reactant-reagent interactions and sensitivity to small\nmolecular fragments make it a valuable tool for reaction planning and\noptimization in chemical synthesis. The data and codes of log-RRIM are\naccessible through https://github.com/ninglab/Yield_log_RRIM.\n","authors":["Xiao Hu","Ziqi Chen","Bo Peng","Daniel Adu-Ampratwum","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2411.03320v3.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.12633v1","updated":"2024-11-19T16:45:52Z","published":"2024-11-19T16:45:52Z","title":"Instant Policy: In-Context Imitation Learning via Graph Diffusion","summary":" Following the impressive capabilities of in-context learning with large\ntransformers, In-Context Imitation Learning (ICIL) is a promising opportunity\nfor robotics. We introduce Instant Policy, which learns new tasks instantly\n(without further training) from just one or two demonstrations, achieving ICIL\nthrough two key components. First, we introduce inductive biases through a\ngraph representation and model ICIL as a graph generation problem with a\nlearned diffusion process, enabling structured reasoning over demonstrations,\nobservations, and actions. Second, we show that such a model can be trained\nusing pseudo-demonstrations - arbitrary trajectories generated in simulation -\nas a virtually infinite pool of training data. Simulated and real experiments\nshow that Instant Policy enables rapid learning of various everyday robot\ntasks. We also show how it can serve as a foundation for cross-embodiment and\nzero-shot transfer to language-defined tasks. 
Code and videos are available at\nhttps://www.robot-learning.uk/instant-policy.\n","authors":["Vitalis Vosylius","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2411.12633v1.pdf","comment":"Code and videos are available on our project webpage at\n https://www.robot-learning.uk/instant-policy"},{"id":"http://arxiv.org/abs/2411.12626v1","updated":"2024-11-19T16:34:45Z","published":"2024-11-19T16:34:45Z","title":"Exploring the Manifold of Neural Networks Using Diffusion Geometry","summary":" Drawing motivation from the manifold hypothesis, which posits that most\nhigh-dimensional data lies on or near low-dimensional manifolds, we apply\nmanifold learning to the space of neural networks. We learn manifolds where\ndatapoints are neural networks by introducing a distance between the hidden\nlayer representations of the neural networks. These distances are then fed to\nthe non-linear dimensionality reduction algorithm PHATE to create a manifold of\nneural networks. We characterize this manifold using features of the\nrepresentation, including class separation, hierarchical cluster structure,\nspectral entropy, and topological structure. Our analysis reveals that\nhigh-performing networks cluster together in the manifold, displaying\nconsistent embedding patterns across all these features. Finally, we\ndemonstrate the utility of this approach for guiding hyperparameter\noptimization and neural architecture search by sampling from the manifold.\n","authors":["Elliott Abel","Peyton Crevasse","Yvan Grinspan","Selma Mazioud","Folu Ogundipe","Kristof Reimann","Ellie Schueler","Andrew J. Steindl","Ellen Zhang","Dhananjay Bhaskar","Siddharth Viswanath","Yanlei Zhang","Tim G. J. 
Rudner","Ian Adelstein","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2411.12626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11636v3","updated":"2024-11-19T16:27:45Z","published":"2024-06-17T15:16:18Z","title":"Feasibility of Federated Learning from Client Databases with Different\n Brain Diseases and MRI Modalities","summary":" Segmentation models for brain lesions in MRI are typically developed for a\nspecific disease and trained on data with a predefined set of MRI modalities.\nSuch models cannot segment the disease using data with a different set of MRI\nmodalities, nor can they segment other types of diseases. Moreover, this\ntraining paradigm prevents a model from using the advantages of learning from\nheterogeneous databases that may contain scans and segmentation labels for\ndifferent brain pathologies and diverse sets of MRI modalities. Additionally,\nthe confidentiality of patient data often prevents central data aggregation,\nnecessitating a decentralized approach. Is it feasible to use Federated\nLearning (FL) to train a single model on client databases that contain scans\nand labels of different brain pathologies and diverse sets of MRI modalities?\nWe demonstrate promising results by combining appropriate, simple, and\npractical modifications to the model and training strategy: Designing a model\nwith input channels that cover the whole set of modalities available across\nclients, training with random modality drop, and exploring the effects of\nfeature normalization methods. Evaluation on 7 brain MRI databases with 5\ndifferent diseases shows that this FL framework can train a single model\nachieving very promising results in segmenting all disease types seen during\ntraining. Importantly, it can segment these diseases in new databases that\ncontain sets of modalities different from those in training clients. 
These\nresults demonstrate, for the first time, the feasibility and effectiveness of\nusing FL to train a single 3D segmentation model on decentralised data with\ndiverse brain diseases and MRI modalities, a necessary step towards leveraging\nheterogeneous real-world databases. Code:\nhttps://github.com/FelixWag/FedUniBrain\n","authors":["Felix Wagner","Wentian Xu","Pramit Saha","Ziyun Liang","Daniel Whitehouse","David Menon","Virginia Newcombe","Natalie Voets","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.11636v3.pdf","comment":"Accepted as a conference paper at WACV 2025"},{"id":"http://arxiv.org/abs/2411.12615v1","updated":"2024-11-19T16:20:27Z","published":"2024-11-19T16:20:27Z","title":"A Multimodal Approach Combining Structural and Cross-domain Textual\n Guidance for Weakly Supervised OCT Segmentation","summary":" Accurate segmentation of Optical Coherence Tomography (OCT) images is crucial\nfor diagnosing and monitoring retinal diseases. However, the labor-intensive\nnature of pixel-level annotation limits the scalability of supervised learning\nwith large datasets. Weakly Supervised Semantic Segmentation (WSSS) provides a\npromising alternative by leveraging image-level labels. In this study, we\npropose a novel WSSS approach that integrates structural guidance with\ntext-driven strategies to generate high-quality pseudo labels, significantly\nimproving segmentation performance. In terms of visual information, our method\nemploys two processing modules that exchange raw image features and structural\nfeatures from OCT images, guiding the model to identify where lesions are\nlikely to occur. In terms of textual information, we utilize large-scale\npretrained models from cross-domain sources to implement label-informed textual\nguidance and synthetic descriptive integration with two textual processing\nmodules that combine local semantic features with consistent synthetic\ndescriptions. 
By fusing these visual and textual components within a multimodal\nframework, our approach enhances lesion localization accuracy. Experimental\nresults on three OCT datasets demonstrate that our method achieves\nstate-of-the-art performance, highlighting its potential to improve diagnostic\naccuracy and efficiency in medical imaging.\n","authors":["Jiaqi Yang","Nitish Mehta","Xiaoling Hu","Chao Chen","Chia-Ling Tsai"],"pdf_url":"https://arxiv.org/pdf/2411.12615v1.pdf","comment":"21 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2411.12612v1","updated":"2024-11-19T16:18:20Z","published":"2024-11-19T16:18:20Z","title":"Reward driven workflows for unsupervised explainable analysis of phases\n and ferroic variants from atomically resolved imaging data","summary":" Rapid progress in aberration corrected electron microscopy necessitates\ndevelopment of robust methods for the identification of phases, ferroic\nvariants, and other pertinent aspects of materials structure from imaging data.\nWhile unsupervised methods for clustering and classification are widely used\nfor these tasks, their performance can be sensitive to hyperparameter selection\nin the analysis workflow. In this study, we explore the effects of descriptors\nand hyperparameters on the capability of unsupervised ML methods to distill\nlocal structural information, exemplified by discovery of polarization and\nlattice distortion in Sm doped BiFeO3 (BFO) thin films. We demonstrate that a\nreward-driven approach can be used to optimize these key hyperparameters across\nthe full workflow, where rewards were designed to reflect domain wall\ncontinuity and straightness, ensuring that the analysis aligns with the\nmaterial's physical behavior. This approach allows us to discover local\ndescriptors that are best aligned with the specific physical behavior,\nproviding insight into the fundamental physics of materials. 
We further extend\nthe reward driven workflows to disentangle structural factors of variation via\noptimized variational autoencoder (VAE). Finally, the importance of\nwell-defined rewards was explored as a quantifiable measure of success of the\nworkflow.\n","authors":["Kamyar Barakati","Yu Liu","Chris Nelson","Maxim A. Ziatdinov","Xiaohang Zhang","Ichiro Takeuchi","Sergei V. Kalinin"],"pdf_url":"https://arxiv.org/pdf/2411.12612v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2211.13723v3","updated":"2024-11-19T16:17:58Z","published":"2022-11-24T17:19:30Z","title":"Improving Multi-task Learning via Seeking Task-based Flat Regions","summary":" Multi-Task Learning (MTL) is a widely-used and powerful learning paradigm for\ntraining deep neural networks that allows learning more than one objective by a\nsingle backbone. Compared to training tasks separately, MTL significantly\nreduces computational costs, improves data efficiency, and potentially enhances\nmodel performance by leveraging knowledge across tasks. Hence, it has been\nadopted in a variety of applications, ranging from computer vision to natural\nlanguage processing and speech recognition. Among them, there is an emerging\nline of work in MTL that focuses on manipulating the task gradient to derive an\nultimate gradient descent direction to benefit all tasks. Despite achieving\nimpressive results on many benchmarks, directly applying these approaches\nwithout using appropriate regularization techniques might lead to suboptimal\nsolutions on real-world problems. In particular, standard training that\nminimizes the empirical loss on the training data can easily suffer from\noverfitting to low-resource tasks or be spoiled by noisy-labeled ones, which\ncan cause negative transfer between tasks and overall performance drop. 
To\nalleviate such problems, we propose to leverage a recently introduced training\nmethod, named Sharpness-aware Minimization, which can enhance model\ngeneralization ability on single-task learning. Accordingly, we present a novel\nMTL training methodology, encouraging the model to find task-based flat minima\nfor coherently improving its generalization capability on all tasks. Finally,\nwe conduct comprehensive experiments on a variety of applications to\ndemonstrate the merit of our proposed approach to existing gradient-based MTL\nmethods, as suggested by our developed theory.\n","authors":["Hoang Phan","Lam Tran","Quyen Tran","Ngoc N. Tran","Tuan Truong","Nhat Ho","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2211.13723v3.pdf","comment":"35 pages, 17 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.12603v1","updated":"2024-11-19T16:06:32Z","published":"2024-11-19T16:06:32Z","title":"STREAM: A Universal State-Space Model for Sparse Geometric Data","summary":" Handling sparse and unstructured geometric data, such as point clouds or\nevent-based vision, is a pressing challenge in the field of machine vision.\nRecently, sequence models such as Transformers and state-space models entered\nthe domain of geometric data. These methods require specialized preprocessing\nto create a sequential view of a set of points. Furthermore, prior works\ninvolving sequence models iterate geometric data with either uniform or learned\nstep sizes, implicitly relying on the model to infer the underlying geometric\nstructure. In this work, we propose to encode geometric structure explicitly\ninto the parameterization of a state-space model. State-space models are based\non linear dynamics governed by a one-dimensional variable such as time or a\nspatial coordinate. We exploit this dynamic variable to inject relative\ndifferences of coordinates into the step size of the state-space model. 
The\nresulting geometric operation computes interactions between all pairs of N\npoints in O(N) steps. Our model deploys the Mamba selective state-space model\nwith a modified CUDA kernel to efficiently map sparse geometric data to modern\nhardware. The resulting sequence model, which we call STREAM, achieves\ncompetitive results on a range of benchmarks from point-cloud classification to\nevent-based vision and audio classification. STREAM demonstrates a powerful\ninductive bias for sparse geometric data by improving the PointMamba baseline\nwhen trained from scratch on the ModelNet40 and ScanObjectNN point cloud\nanalysis datasets. It further achieves, for the first time, 100% test accuracy\non all 11 classes of the DVS128 Gestures dataset.\n","authors":["Mark Schöne","Yash Bhisikar","Karan Bania","Khaleelulla Khan Nazeer","Christian Mayr","Anand Subramoney","David Kappel"],"pdf_url":"https://arxiv.org/pdf/2411.12603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12601v1","updated":"2024-11-19T16:05:35Z","published":"2024-11-19T16:05:35Z","title":"Hypergraph $p$-Laplacian equations for data interpolation and\n semi-supervised learning","summary":" Hypergraph learning with $p$-Laplacian regularization has attracted a lot of\nattention due to its flexibility in modeling higher-order relationships in\ndata. This paper focuses on its fast numerical implementation, which is\nchallenging due to the non-differentiability of the objective function and the\nnon-uniqueness of the minimizer. We derive a hypergraph $p$-Laplacian equation\nfrom the subdifferential of the $p$-Laplacian regularization. A simplified\nequation that is mathematically well-posed and computationally efficient is\nproposed as an alternative. Numerical experiments verify that the simplified\n$p$-Laplacian equation suppresses spiky solutions in data interpolation and\nimproves classification accuracy in semi-supervised learning. 
The remarkably\nlow computational cost enables further applications.\n","authors":["Kehan Shi","Martin Burger"],"pdf_url":"https://arxiv.org/pdf/2411.12601v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2411.12600v1","updated":"2024-11-19T16:04:31Z","published":"2024-11-19T16:04:31Z","title":"Provable unlearning in topic modeling and downstream tasks","summary":" Machine unlearning algorithms are increasingly important as legal concerns\narise around the provenance of training data, but verifying the success of\nunlearning is often difficult. Provable guarantees for unlearning are often\nlimited to supervised learning settings. In this paper, we provide the first\ntheoretical guarantees for unlearning in the pre-training and fine-tuning\nparadigm by studying topic models, simple bag-of-words language models that can\nbe adapted to solve downstream tasks like retrieval and classification. First,\nwe design a provably effective unlearning algorithm for topic models that\nincurs a computational overhead independent of the size of the original\ndataset. Our analysis additionally quantifies the deletion capacity of the\nmodel -- i.e., the number of examples that can be unlearned without incurring a\nsignificant cost in model performance. Finally, we formally extend our analyses\nto account for adaptation to a given downstream task. In particular, we design\nan efficient algorithm to perform unlearning after fine-tuning the topic model\nvia a linear head. 
Notably, we show that it is easier to unlearn pre-training\ndata from models that have been fine-tuned to a particular task, and one can\nunlearn this data without modifying the base model.\n","authors":["Stanley Wei","Sadhika Malladi","Sanjeev Arora","Amartya Sanyal"],"pdf_url":"https://arxiv.org/pdf/2411.12600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08566v2","updated":"2024-11-19T16:03:58Z","published":"2024-11-13T12:26:08Z","title":"Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space\n Exploration by Reinforcement Learning Agent","summary":" Grasping by a robot in unstructured environments is deemed a critical\nchallenge because of the requirement for effective adaptation to a wide\nvariation in object geometries, material properties, and other environmental\nfactors. In this paper, we propose a novel framework for robotic grasping based\non the idea of compressing high-dimensional target and gripper features in a\ncommon latent space using a set of autoencoders. Our approach simplifies\ngrasping by using three autoencoders dedicated to the target, the gripper, and\na third one that fuses their latent representations. This allows the RL agent\nto achieve higher learning rates at the initial stages of exploration of a new\nenvironment, as well as at non-zero shot grasp attempts. The agent explores the\nlatent space of the third autoencoder for better quality grasp without explicit\nreconstruction of objects. By implementing the PoWER algorithm into the RL\ntraining process, updates on the agent's policy will be made through the\nperturbation in the reward-weighted latent space. The successful exploration\nefficiently constrains both position and pose integrity for feasible executions\nof grasps. We evaluate our system on a diverse set of objects, demonstrating\nthe high success rate in grasping with minimum computational overhead. 
We found\nthat approach enhances the adaptation of the RL agent by more than 35 % in\nsimulation experiments.\n","authors":["Leonidas Askianakis"],"pdf_url":"https://arxiv.org/pdf/2411.08566v2.pdf","comment":"Submitted for review at IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2411.12597v1","updated":"2024-11-19T16:01:54Z","published":"2024-11-19T16:01:54Z","title":"GNNAS-Dock: Budget Aware Algorithm Selection with Graph Neural Networks\n for Molecular Docking","summary":" Molecular docking is a major element in drug discovery and design. It enables\nthe prediction of ligand-protein interactions by simulating the binding of\nsmall molecules to proteins. Despite the availability of numerous docking\nalgorithms, there is no single algorithm consistently outperforms the others\nacross a diverse set of docking scenarios. This paper introduces GNNAS-Dock, a\nnovel Graph Neural Network (GNN)-based automated algorithm selection system for\nmolecular docking in blind docking situations. GNNs are accommodated to process\nthe complex structural data of both ligands and proteins. They benefit from the\ninherent graph-like properties to predict the performance of various docking\nalgorithms under different conditions. The present study pursues two main\nobjectives: 1) predict the performance of each candidate docking algorithm, in\nterms of Root Mean Square Deviation (RMSD), thereby identifying the most\naccurate method for specific scenarios; and 2) choose the best computationally\nefficient docking algorithm for each docking case, aiming to reduce the time\nrequired for docking while maintaining high accuracy. 
We validate our approach\non PDBBind 2020 refined set, which contains about 5,300 pairs of protein-ligand\ncomplexes.\n","authors":["Yiliang Yuan","Mustafa Misir"],"pdf_url":"https://arxiv.org/pdf/2411.12597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04720v2","updated":"2024-11-19T15:57:07Z","published":"2024-08-08T18:36:43Z","title":"Learning the Simplicity of Scattering Amplitudes","summary":" The simplification and reorganization of complex expressions lies at the core\nof scientific progress, particularly in theoretical high-energy physics. This\nwork explores the application of machine learning to a particular facet of this\nchallenge: the task of simplifying scattering amplitudes expressed in terms of\nspinor-helicity variables. We demonstrate that an encoder-decoder transformer\narchitecture achieves impressive simplification capabilities for expressions\ncomposed of handfuls of terms. Lengthier expressions are implemented in an\nadditional embedding network, trained using contrastive learning, which\nisolates subexpressions that are more likely to simplify. The resulting\nframework is capable of reducing expressions with hundreds of terms - a regular\noccurrence in quantum field theory calculations - to vastly simpler equivalent\nexpressions. Starting from lengthy input expressions, our networks can generate\nthe Parke-Taylor formula for five-point gluon scattering, as well as new\ncompact expressions for five-point amplitudes involving scalars and gravitons.\nAn interactive demonstration can be found at\nhttps://spinorhelicity.streamlit.app .\n","authors":["Clifford Cheung","Aurélien Dersy","Matthew D. 
Schwartz"],"pdf_url":"https://arxiv.org/pdf/2408.04720v2.pdf","comment":"25+15 pages, 9+6 figures, v2: typos correction and extended the\n introduction, conclusion, sections 2.2, 2.4 and appendix F"},{"id":"http://arxiv.org/abs/2411.07372v2","updated":"2024-11-19T15:53:51Z","published":"2024-11-11T21:21:32Z","title":"Identifying Differential Patient Care Through Inverse Intent Inference","summary":" Sepsis is a life-threatening condition defined by end-organ dysfunction due\nto a dysregulated host response to infection. Although the Surviving Sepsis\nCampaign has launched and has been releasing sepsis treatment guidelines to\nunify and normalize the care for sepsis patients, it has been reported in\nnumerous studies that disparities in care exist across the trajectory of\npatient stay in the emergency department and intensive care unit. Here, we\napply a number of reinforcement learning techniques including behavioral\ncloning, imitation learning, and inverse reinforcement learning, to learn the\noptimal policy in the management of septic patient subgroups using expert\ndemonstrations. Then we estimate the counterfactual optimal policies by\napplying the model to another subset of unseen medical populations and identify\nthe difference in cure by comparing it to the real policy. Our data comes from\nthe sepsis cohort of MIMIC-IV and the clinical data warehouses of the Mass\nGeneral Brigham healthcare system. The ultimate objective of this work is to\nuse the optimal learned policy function to estimate the counterfactual\ntreatment policy and identify deviations across sub-populations of interest. 
We\nhope this approach would help us identify any disparities in care and also\nchanges in cure in response to the publication of national sepsis treatment\nguidelines.\n","authors":["Hyewon Jeong","Siddharth Nayak","Taylor Killian","Sanjat Kanjilal"],"pdf_url":"https://arxiv.org/pdf/2411.07372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17075v2","updated":"2024-11-19T15:50:58Z","published":"2024-10-22T14:52:46Z","title":"Combinatorial Logistic Bandits","summary":" We introduce a novel framework called combinatorial logistic bandits (CLogB),\nwhere in each round, a subset of base arms (called the super arm) is selected,\nwith the outcome of each base arm being binary and its expectation following a\nlogistic parametric model. The feedback is governed by a general arm triggering\nprocess. Our study covers CLogB with reward functions satisfying two smoothness\nconditions, capturing application scenarios such as online content delivery,\nonline learning to rank, and dynamic channel allocation. We first propose a\nsimple yet efficient algorithm, CLogUCB, utilizing a variance-agnostic\nexploration bonus. Under the 1-norm triggering probability modulated (TPM)\nsmoothness condition, CLogUCB achieves a regret bound of\n$\\tilde{O}(d\\sqrt{\\kappa KT})$, where $\\tilde{O}$ ignores logarithmic factors,\n$d$ is the dimension of the feature vector, $\\kappa$ represents the\nnonlinearity of the logistic model, and $K$ is the maximum number of base arms\na super arm can trigger. This result improves on prior work by a factor of\n$\\tilde{O}(\\sqrt{\\kappa})$. We then enhance CLogUCB with a variance-adaptive\nversion, VA-CLogUCB, which attains a regret bound of $\\tilde{O}(d\\sqrt{KT})$\nunder the same 1-norm TPM condition, improving another\n$\\tilde{O}(\\sqrt{\\kappa})$ factor. 
VA-CLogUCB shows even greater promise under\nthe stronger triggering probability and variance modulated (TPVM) condition,\nachieving a leading $\\tilde{O}(d\\sqrt{T})$ regret, thus removing the additional\ndependency on the action-size $K$. Furthermore, we enhance the computational\nefficiency of VA-CLogUCB by eliminating the nonconvex optimization process when\nthe context feature map is time-invariant while maintaining the tight\n$\\tilde{O}(d\\sqrt{T})$ regret. Finally, experiments on synthetic and real-world\ndatasets demonstrate the superior performance of our algorithms compared to\nbenchmark algorithms.\n","authors":["Xutong Liu","Xiangxiang Dai","Xuchuang Wang","Mohammad Hajiesmaili","John C. S. Lui"],"pdf_url":"https://arxiv.org/pdf/2410.17075v2.pdf","comment":"Accepted in ACM SIGMETRICS 2025"},{"id":"http://arxiv.org/abs/2411.12580v1","updated":"2024-11-19T15:47:12Z","published":"2024-11-19T15:47:12Z","title":"Procedural Knowledge in Pretraining Drives Reasoning in Large Language\n Models","summary":" The capabilities and limitations of Large Language Models have been sketched\nout in great detail in recent years, providing an intriguing yet conflicting\npicture. On the one hand, LLMs demonstrate a general ability to solve problems.\nOn the other hand, they show surprising reasoning gaps when compared to humans,\ncasting doubt on the robustness of their generalisation strategies. The sheer\nvolume of data used in the design of LLMs has precluded us from applying the\nmethod traditionally used to measure generalisation: train-test set separation.\nTo overcome this, we study what kind of generalisation strategies LLMs employ\nwhen performing reasoning tasks by investigating the pretraining data they rely\non. 
For two models of different sizes (7B and 35B) and 2.5B of their\npretraining tokens, we identify what documents influence the model outputs for\nthree simple mathematical reasoning tasks and contrast this to the data that\nare influential for answering factual questions. We find that, while the models\nrely on mostly distinct sets of data for each factual question, a document\noften has a similar influence across different reasoning questions within the\nsame task, indicating the presence of procedural knowledge. We further find\nthat the answers to factual questions often show up in the most influential\ndata. However, for reasoning questions the answers usually do not show up as\nhighly influential, nor do the answers to the intermediate reasoning steps.\nWhen we characterise the top ranked documents for the reasoning questions\nqualitatively, we confirm that the influential documents often contain\nprocedural knowledge, like demonstrating how to obtain a solution using\nformulae or code. Our findings indicate that the approach to reasoning the\nmodels use is unlike retrieval, and more like a generalisable strategy that\nsynthesises procedural knowledge from documents doing a similar form of\nreasoning.\n","authors":["Laura Ruis","Maximilian Mozes","Juhan Bae","Siddhartha Rao Kamalakara","Dwarak Talupuru","Acyr Locatelli","Robert Kirk","Tim Rocktäschel","Edward Grefenstette","Max Bartolo"],"pdf_url":"https://arxiv.org/pdf/2411.12580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06750v2","updated":"2024-11-19T15:44:30Z","published":"2024-09-10T13:39:29Z","title":"Can Agents Spontaneously Form a Society? Introducing a Novel\n Architecture for Generative Multi-Agents to Elicit Social Emergence","summary":" Generative agents have demonstrated impressive capabilities in specific\ntasks, but most of these frameworks focus on independent tasks and lack\nattention to social interactions. 
We introduce a generative agent architecture\ncalled ITCMA-S, which includes a basic framework for individual agents and a\nframework called LTRHA that supports social interactions among multi-agents.\nThis architecture enables agents to identify and filter out behaviors that are\ndetrimental to social interactions, guiding them to choose more favorable\nactions. We designed a sandbox environment to simulate the natural evolution of\nsocial relationships among multiple identity-less agents for experimental\nevaluation. The results showed that ITCMA-S performed well on multiple\nevaluation indicators, demonstrating its ability to actively explore the\nenvironment, recognize new agents, and acquire new information through\ncontinuous actions and dialogue. Observations show that as agents establish\nconnections with each other, they spontaneously form cliques with internal\nhierarchies around a selected leader and organize collective activities.\n","authors":["H. Zhang","J. Yin","M. Jiang","C. Su"],"pdf_url":"https://arxiv.org/pdf/2409.06750v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.02387v2","updated":"2024-11-19T15:39:41Z","published":"2024-10-03T11:07:43Z","title":"BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and\n Fine-Tuning","summary":" In this work, we present BiSSL, a first-of-its-kind training framework that\nintroduces bilevel optimization to enhance the alignment between the pretext\npre-training and downstream fine-tuning stages in self-supervised learning.\nBiSSL formulates the pretext and downstream task objectives as the lower- and\nupper-level objectives in a bilevel optimization problem and serves as an\nintermediate training stage within the self-supervised learning pipeline. 
By\nmore explicitly modeling the interdependence of these training stages, BiSSL\nfacilitates enhanced information sharing between them, ultimately leading to a\nbackbone parameter initialization that is better suited for the downstream\ntask. We propose a training algorithm that alternates between optimizing the\ntwo objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with\nSimCLR on the STL10 dataset, we demonstrate that our proposed framework\nconsistently achieves improved or competitive classification accuracies across\nvarious downstream image classification datasets compared to the conventional\nself-supervised learning pipeline. Qualitative analyses of the backbone\nfeatures further suggest that BiSSL enhances the alignment of downstream\nfeatures in the backbone prior to fine-tuning.\n","authors":["Gustav Wagner Zakarias","Lars Kai Hansen","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2410.02387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12570v1","updated":"2024-11-19T15:39:25Z","published":"2024-11-19T15:39:25Z","title":"A data driven approach to classify descriptors based on their efficiency\n in translating noisy trajectories into physically-relevant information","summary":" Reconstructing the physical complexity of many-body dynamical systems can be\nchallenging. Starting from the trajectories of their constitutive units (raw\ndata), typical approaches require selecting appropriate descriptors to convert\nthem into time-series, which are then analyzed to extract interpretable\ninformation. However, identifying the most effective descriptor is often\nnon-trivial. Here, we report a data-driven approach to compare the efficiency\nof various descriptors in extracting information from noisy trajectories and\ntranslating it into physically relevant insights. 
As a prototypical system with\nnon-trivial internal complexity, we analyze molecular dynamics trajectories of\nan atomistic system where ice and water coexist in equilibrium near the\nsolid/liquid transition temperature. We compare general and specific\ndescriptors often used in aqueous systems: number of neighbors, molecular\nvelocities, Smooth Overlap of Atomic Positions (SOAP), Local Environments and\nNeighbors Shuffling (LENS), Orientational Tetrahedral Order, and distance from\nthe fifth neighbor ($d_5$). Using Onion Clustering -- an efficient unsupervised\nmethod for single-point time-series analysis -- we assess the maximum\nextractable information for each descriptor and rank them via a\nhigh-dimensional metric. Our results show that advanced descriptors like SOAP\nand LENS outperform classical ones due to higher signal-to-noise ratios.\nNonetheless, even simple descriptors can rival or exceed advanced ones after\nlocal signal denoising. For example, $d_5$, initially among the weakest,\nbecomes the most effective at resolving the system's non-local dynamical\ncomplexity after denoising. This work highlights the critical role of noise in\ninformation extraction from molecular trajectories and offers a data-driven\napproach to identify optimal descriptors for systems with characteristic\ninternal complexity.\n","authors":["Simone Martino","Domiziano Doria","Chiara Lionello","Matteo Becchi","Giovanni M. Pavan"],"pdf_url":"https://arxiv.org/pdf/2411.12570v1.pdf","comment":"19 pages, 5 figures + 3 in supporting information (at the bottom of\n the manuscript)"},{"id":"http://arxiv.org/abs/2402.08313v2","updated":"2024-11-19T15:29:44Z","published":"2024-02-13T09:17:20Z","title":"Approximating Families of Sharp Solutions to Fisher's Equation with\n Physics-Informed Neural Networks","summary":" This paper employs physics-informed neural networks (PINNs) to solve Fisher's\nequation, a fundamental reaction-diffusion system with both simplicity and\nsignificance. 
The focus is on investigating Fisher's equation under conditions\nof large reaction rate coefficients, where solutions exhibit steep traveling\nwaves that often present challenges for traditional numerical methods. To\naddress these challenges, a residual weighting scheme is introduced in the\nnetwork training to mitigate the difficulties associated with standard PINN\napproaches. Additionally, a specialized network architecture designed to\ncapture traveling wave solutions is explored. The paper also assesses the\nability of PINNs to approximate a family of solutions by generalizing across\nmultiple reaction rate coefficients. The proposed method demonstrates high\neffectiveness in solving Fisher's equation with large reaction rate\ncoefficients and shows promise for meshfree solutions of generalized\nreaction-diffusion systems.\n","authors":["Franz M. Rohrhofer","Stefan Posch","Clemens Gößnitzer","Bernhard C. Geiger"],"pdf_url":"https://arxiv.org/pdf/2402.08313v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12563v1","updated":"2024-11-19T15:27:54Z","published":"2024-11-19T15:27:54Z","title":"Stream-Based Active Learning for Process Monitoring","summary":" Statistical process monitoring (SPM) methods are essential tools in quality\nmanagement to check the stability of industrial processes, i.e., to dynamically\nclassify the process state as in control (IC), under normal operating\nconditions, or out of control (OC), otherwise. Traditional SPM methods are\nbased on unsupervised approaches, which are popular because in most industrial\napplications the true OC states of the process are not explicitly known. 
This\nhampered the development of supervised methods that could instead take\nadvantage of process data containing labels on the true process state, although\nthey still need improvement in dealing with class imbalance, as OC states are\nrare in high-quality processes, and the dynamic recognition of unseen classes,\ne.g., the number of possible OC states. This article presents a novel\nstream-based active learning strategy for SPM that enhances partially hidden\nMarkov models to deal with data streams. The ultimate goal is to optimize\nlabeling resources constrained by a limited budget and dynamically update the\npossible OC states. The proposed method performance in classifying the true\nstate of the process is assessed through a simulation and a case study on the\nSPM of a resistance spot welding process in the automotive industry, which\nmotivated this research.\n","authors":["Christian Capezza","Antonio Lepore","Kamran Paynabar"],"pdf_url":"https://arxiv.org/pdf/2411.12563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10263v2","updated":"2024-11-19T15:27:27Z","published":"2024-05-16T17:13:55Z","title":"Partially Unitary Learning","summary":" The problem of an optimal mapping between Hilbert spaces $IN$ of\n$\\left|\\psi\\right\\rangle$ and $OUT$ of $\\left|\\phi\\right\\rangle$ based on a set\nof wavefunction measurements (within a phase) $\\psi_l \\to \\phi_l$, $l=1\\dots\nM$, is formulated as an optimization problem maximizing the total fidelity\n$\\sum_{l=1}^{M} \\omega^{(l)}\n\\left|\\langle\\phi_l|\\mathcal{U}|\\psi_l\\rangle\\right|^2$ subject to probability\npreservation constraints on $\\mathcal{U}$ (partial unitarity). The constructed\noperator $\\mathcal{U}$ can be considered as an $IN$ to $OUT$ quantum channel;\nit is a partially unitary rectangular matrix (an isometry) of dimension\n$\\dim(OUT) \\times \\dim(IN)$ transforming operators as $A^{OUT}=\\mathcal{U}\nA^{IN} \\mathcal{U}^{\\dagger}$. 
An iterative algorithm for finding the global\nmaximum of this optimization problem is developed, and its application to a\nnumber of problems is demonstrated. A software product implementing the\nalgorithm is available from the authors.\n","authors":["Mikhail Gennadievich Belov","Vladislav Gennadievich Malyshkin"],"pdf_url":"https://arxiv.org/pdf/2405.10263v2.pdf","comment":"A working algorithm implementing Partially Unitary Learning\n arXiv:2212.14810 has been developed and generalized. See arXiv:2407.04406 for\n further generalization to density matrix mappings"},{"id":"http://arxiv.org/abs/2403.20212v2","updated":"2024-11-19T15:23:29Z","published":"2024-03-29T14:47:54Z","title":"On Size and Hardness Generalization in Unsupervised Learning for the\n Travelling Salesman Problem","summary":" We study the generalization capability of Unsupervised Learning in solving\nthe Travelling Salesman Problem (TSP). We use a Graph Neural Network (GNN)\ntrained with a surrogate loss function to generate an embedding for each node.\nWe use these embeddings to construct a heat map that indicates the likelihood\nof each edge being part of the optimal route. We then apply local search to\ngenerate our final predictions. Our investigation explores how different\ntraining instance sizes, embedding dimensions, and distributions influence the\noutcomes of Unsupervised Learning methods. Our results show that training with\nlarger instance sizes and increasing embedding dimensions can build a more\neffective representation, enhancing the model's ability to solve TSP.\nFurthermore, in evaluating generalization across different distributions, we\nfirst determine the hardness of various distributions and explore how different\nhardnesses affect the final results. 
Our findings suggest that models trained\non harder instances exhibit better generalization capabilities, highlighting\nthe importance of selecting appropriate training instances in solving TSP using\nUnsupervised Learning.\n","authors":["Yimeng Min","Carla P. Gomes"],"pdf_url":"https://arxiv.org/pdf/2403.20212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12556v1","updated":"2024-11-19T15:15:45Z","published":"2024-11-19T15:15:45Z","title":"UMGAD: Unsupervised Multiplex Graph Anomaly Detection","summary":" Graph anomaly detection (GAD) is a critical task in graph machine learning,\nwith the primary objective of identifying anomalous nodes that deviate\nsignificantly from the majority. This task is widely applied in various\nreal-world scenarios, including fraud detection and social network analysis.\nHowever, existing GAD methods still face two major challenges: (1) They are\noften limited to detecting anomalies in single-type interaction graphs and\nstruggle with multiple interaction types in multiplex heterogeneous graphs; (2)\nIn unsupervised scenarios, selecting appropriate anomaly score thresholds\nremains a significant challenge for accurate anomaly detection. To address the\nabove challenges, we propose a novel Unsupervised Multiplex Graph Anomaly\nDetection method, named UMGAD. We first learn multi-relational correlations\namong nodes in multiplex heterogeneous graphs and capture anomaly information\nduring node attribute and structure reconstruction through graph-masked\nautoencoder (GMAE). Then, to further weaken the influence of noise and\nredundant information on abnormal information extraction, we generate\nattribute-level and subgraph-level augmented-view graphs respectively, and\nperform attribute and structure reconstruction through GMAE. Finally, We learn\nto optimize node attributes and structural features through contrastive\nlearning between original-view and augmented-view graphs to improve the model's\nability to capture anomalies. 
Meanwhile, we also propose a new anomaly score\nthreshold selection strategy, which allows the model to be independent of the\nground truth in real unsupervised scenarios. Extensive experiments on four\ndatasets show that our \\model significantly outperforms state-of-the-art\nmethods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1\nacross all datasets.\n","authors":["Xiang Li","Jianpeng Qi","Zhongying Zhao","Guanjie Zheng","Lei Cao","Junyu Dong","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2411.12556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10214v2","updated":"2024-11-19T15:09:10Z","published":"2024-11-15T14:21:32Z","title":"Machine Learning Algorithms to Assess Site Closure Time Frames for Soil\n and Groundwater Contamination","summary":" Monitored Natural Attenuation (MNA) is gaining prominence as an effective\nmethod for managing soil and groundwater contamination due to its\ncost-efficiency and minimal environmental disruption. Despite its benefits, MNA\nnecessitates extensive groundwater monitoring to ensure that contaminant levels\ndecrease to meet safety standards. This study expands the capabilities of\nPyLEnM, a Python package designed for long-term environmental monitoring, by\nincorporating new algorithms to enhance its predictive and analytical\nfunctionalities. We introduce methods to estimate the timeframe required for\ncontaminants like Sr-90 and I-129 to reach regulatory safety standards using\nlinear regression and to forecast future contaminant levels with the\nBidirectional Long Short-Term Memory (Bi-LSTM) networks. Additionally, Random\nForest regression is employed to identify factors influencing the time to reach\nsafety standards. Our methods are illustrated using data from the Savannah\nRiver Site (SRS) F-Area, where preliminary findings reveal a notable downward\ntrend in contaminant levels, with variability linked to initial concentrations\nand groundwater flow dynamics. 
The Bi-LSTM model effectively predicts\ncontaminant concentrations for the next four years, demonstrating the potential\nof advanced time series analysis to improve MNA strategies and reduce reliance\non manual groundwater sampling. The code, along with its usage instructions,\nvalidation, and requirements, is available at:\nhttps://github.com/csplevuanh/pylenm_extension.\n","authors":["Vu-Anh Le","Haruko Murakami Wainwright","Hansell Gonzalez-Raymat","Carol Eddy-Dilek"],"pdf_url":"https://arxiv.org/pdf/2411.10214v2.pdf","comment":"The paper will be withdrawn to fix some work issues with the sections\n on Bi-LSTM models"},{"id":"http://arxiv.org/abs/2411.12547v1","updated":"2024-11-19T15:00:18Z","published":"2024-11-19T15:00:18Z","title":"S3TU-Net: Structured Convolution and Superpixel Transformer for Lung\n Nodule Segmentation","summary":" The irregular and challenging characteristics of lung adenocarcinoma nodules\nin computed tomography (CT) images complicate staging diagnosis, making\naccurate segmentation critical for clinicians to extract detailed lesion\ninformation. In this study, we propose a segmentation model, S3TU-Net, which\nintegrates multi-dimensional spatial connectors and a superpixel-based visual\ntransformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid\narchitecture, incorporating superpixel algorithms, structured weighting, and\nspatial shifting techniques to achieve superior segmentation performance. The\nmodel leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract\nmulti-scale local features while mitigating overfitting. To enhance multi-scale\nfeature fusion, we introduce the S2-MLP Link, integrating spatial shifting and\nattention mechanisms at the skip connections. 
Additionally, the residual-based\nsuperpixel visual transformer (RM-SViT) effectively merges global and local\nfeatures by employing sparse correlation learning and multi-branch attention to\ncapture long-range dependencies, with residual connections enhancing stability\nand computational efficiency. Experimental results on the LIDC-IDRI dataset\ndemonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%,\nand 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by\n4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2%\nincrease. In addition to comparison and ablation studies, we validated the\ngeneralization ability of our model on the EPDB private dataset, achieving a\nDSC of 86.40%.\n","authors":["Yuke Wu","Xiang Liu","Yunyu Shi","Xinyi Chen","Zhenglei Wang","YuQing Xu","Shuo Hong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.12547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09821v2","updated":"2024-11-19T14:57:40Z","published":"2024-11-14T21:53:46Z","title":"Automatic Classification of General Movements in Newborns","summary":" General movements (GMs) are spontaneous, coordinated body movements in\ninfants that offer valuable insights into the developing nervous system.\nAssessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors\nfor neurodevelopmental disorders. However, GMA requires specifically trained\nclinicians, who are limited in number. To scale up newborn screening, there is\na need for an algorithm that can automatically classify GMs from infant video\nrecordings. This data poses challenges, including variability in recording\nlength, device type, and setting, with each video coarsely annotated for\noverall movement quality. 
In this work, we introduce a tool for extracting\nfeatures from these recordings and explore various machine learning techniques\nfor automated GM classification.\n","authors":["Daphné Chopard","Sonia Laguna","Kieran Chin-Cheong","Annika Dietz","Anna Badura","Sven Wellmann","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2411.09821v2.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages"},{"id":"http://arxiv.org/abs/2406.08234v2","updated":"2024-11-19T14:44:36Z","published":"2024-06-12T14:01:12Z","title":"MaIL: Improving Imitation Learning with Mamba","summary":" This work presents Mamba Imitation Learning (MaIL), a novel imitation\nlearning (IL) architecture that provides an alternative to state-of-the-art\n(SoTA) Transformer-based policies. MaIL leverages Mamba, a state-space model\ndesigned to selectively focus on key features of the data. While Transformers\nare highly effective in data-rich environments due to their dense attention\nmechanisms, they can struggle with smaller datasets, often leading to\noverfitting or suboptimal representation learning. In contrast, Mamba's\narchitecture enhances representation learning efficiency by focusing on key\nfeatures and reducing model complexity. This approach mitigates overfitting and\nenhances generalization, even when working with limited data. Extensive\nevaluations on the LIBERO benchmark demonstrate that MaIL consistently\noutperforms Transformers on all LIBERO tasks with limited data and matches\ntheir performance when the full dataset is available. Additionally, MaIL's\neffectiveness is validated through its superior performance in three real robot\nexperiments. 
Our code is available at https://github.com/ALRhub/MaIL.\n","authors":["Xiaogang Jia","Qian Wang","Atalay Donat","Bowen Xing","Ge Li","Hongyi Zhou","Onur Celik","Denis Blessing","Rudolf Lioutikov","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2406.08234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12539v1","updated":"2024-11-19T14:39:29Z","published":"2024-11-19T14:39:29Z","title":"Predicting Customer Satisfaction by Replicating the Survey Response\n Distribution","summary":" For many call centers, customer satisfaction (CSAT) is a key performance\nindicator (KPI). However, only a fraction of customers take the CSAT survey\nafter the call, leading to a biased and inaccurate average CSAT value, and\nmissed opportunities for coaching, follow-up, and rectification. Therefore,\ncall centers can benefit from a model predicting customer satisfaction on calls\nwhere the customer did not complete the survey. Given that CSAT is a closely\nmonitored KPI, it is critical to minimize any bias in the average predicted\nCSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT\n(pCSAT) scores accurately replicate the distribution of survey CSAT responses\nfor every call center with sufficient data in a live production environment.\nThe method can be applied to many multiclass classification problems to improve\nthe class balance and minimize its changes upon model updates.\n","authors":["Etienne Manderscheid","Matthias Lee"],"pdf_url":"https://arxiv.org/pdf/2411.12539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12537v1","updated":"2024-11-19T14:35:38Z","published":"2024-11-19T14:35:38Z","title":"Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues","summary":" Linear Recurrent Neural Networks (LRNNs) such as Mamba, RWKV, GLA, mLSTM, and\nDeltaNet have emerged as efficient alternatives to Transformers in large\nlanguage modeling, offering linear scaling with sequence length and improved\ntraining efficiency. 
However, LRNNs struggle to perform state-tracking which\nmay impair performance in tasks such as code evaluation or tracking a chess\ngame. Even parity, the simplest state-tracking task, which non-linear RNNs like\nLSTM handle effectively, cannot be solved by current LRNNs. Recently, Sarrof et\nal. (2024) demonstrated that the failure of LRNNs like Mamba to solve parity\nstems from restricting the value range of their diagonal state-transition\nmatrices to $[0, 1]$ and that incorporating negative values can resolve this\nissue. We extend this result to non-diagonal LRNNs, which have recently shown\npromise in models such as DeltaNet. We prove that finite precision LRNNs with\nstate-transition matrices having only positive eigenvalues cannot solve parity,\nwhile complex eigenvalues are needed to count modulo $3$. Notably, we also\nprove that LRNNs can learn any regular language when their state-transition\nmatrices are products of identity minus vector outer product matrices, each\nwith eigenvalues in the range $[-1, 1]$. Our empirical results confirm that\nextending the eigenvalue range of models like Mamba and DeltaNet to include\nnegative values not only enables them to solve parity but consistently improves\ntheir performance on state-tracking tasks. Furthermore, pre-training LRNNs with\nan extended eigenvalue range for language modeling achieves comparable\nperformance and stability while showing promise on code and math data. Our work\nenhances the expressivity of modern LRNNs, broadening their applicability\nwithout changing the cost of training or inference.\n","authors":["Riccardo Grazzi","Julien Siems","Jörg K. H. 
Franke","Arber Zela","Frank Hutter","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2411.12537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06807v3","updated":"2024-11-19T14:31:02Z","published":"2024-03-11T15:26:34Z","title":"Multistep Consistency Models","summary":" Diffusion models are relatively easy to train but require many steps to\ngenerate samples. Consistency models are far more difficult to train, but\ngenerate samples in a single step.\n In this paper we propose Multistep Consistency Models: A unification between\nConsistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that\ncan interpolate between a consistency model and a diffusion model: a trade-off\nbetween sampling speed and sampling quality. Specifically, a 1-step consistency\nmodel is a conventional consistency model whereas a $\\infty$-step consistency\nmodel is a diffusion model.\n Multistep Consistency Models work really well in practice. By increasing the\nsample budget from a single step to 2-8 steps, we can train models more easily\nthat generate higher quality samples, while retaining much of the sampling\nspeed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1\nFID on Imagenet128 in 8 steps with consistency distillation, using simple\nlosses without adversarial training. We also show that our method scales to a\ntext-to-image diffusion model, generating samples that are close to the quality\nof the original model.\n","authors":["Jonathan Heek","Emiel Hoogeboom","Tim Salimans"],"pdf_url":"https://arxiv.org/pdf/2403.06807v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02666v2","updated":"2024-11-19T14:30:22Z","published":"2022-06-06T15:09:09Z","title":"Robust Pareto Set Identification with Contaminated Bandit Feedback","summary":" We consider the Pareto set identification (PSI) problem in multi-objective\nmulti-armed bandits (MO-MAB) with contaminated reward observations. 
At each arm\npull, with some fixed probability, the true reward samples are replaced with\nthe samples from an arbitrary contamination distribution chosen by an\nadversary. We consider ({\\alpha}, {\\delta})-PAC PSI and propose a sample\nmedian-based multi-objective adaptive elimination algorithm that returns an\n({\\alpha}, {\\delta})- PAC Pareto set upon termination with a sample complexity\nbound that depends on the contamination probability. As the contamination\nprobability decreases, we recover the wellknown sample complexity results in\nMO-MAB. We compare the proposed algorithm with a mean-based method from MO-MAB\nliterature, as well as an extended version that uses median estimators, on\nseveral PSI problems under adversarial corruptions, including review bombing\nand diabetes management. Our numerical results support our theoretical findings\nand demonstrate that robust algorithm design is crucial for accurate PSI under\ncontaminated reward observations.\n","authors":["İlter Onat Korkmaz","Efe Eren Ceyani","Kerem Bozgan","Cem Tekin"],"pdf_url":"https://arxiv.org/pdf/2206.02666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00028v2","updated":"2024-11-19T14:29:32Z","published":"2024-10-29T04:03:15Z","title":"Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction\n in LBSN","summary":" The fast development of location-based social networks (LBSNs) has led to\nsignificant changes in society, resulting in popular studies of using LBSN data\nfor socioeconomic prediction, e.g., regional population and commercial activity\nestimation. Existing studies design various graphs to model heterogeneous LBSN\ndata, and further apply graph representation learning methods for socioeconomic\nprediction. However, these approaches heavily rely on heuristic ideas and\nexpertise to extract task-relevant knowledge from diverse data, which may not\nbe optimal for specific tasks. 
Additionally, they tend to overlook the inherent\nrelationships between different indicators, limiting the prediction accuracy.\nMotivated by the remarkable abilities of large language models (LLMs) in\ncommonsense reasoning, embedding, and multi-agent collaboration, in this work,\nwe synergize LLM agents and knowledge graph for socioeconomic prediction. We\nfirst construct a location-based knowledge graph (LBKG) to integrate\nmulti-sourced LBSN data. Then we leverage the reasoning power of LLM agent to\nidentify relevant meta-paths in the LBKG for each type of socioeconomic\nprediction task, and design a semantic-guided attention module for knowledge\nfusion with meta-paths. Moreover, we introduce a cross-task communication\nmechanism to further enhance performance by enabling knowledge sharing across\ntasks at both LLM agent and KG levels. On the one hand, the LLM agents for\ndifferent tasks collaborate to generate more diverse and comprehensive\nmeta-paths. On the other hand, the embeddings from different tasks are\nadaptively merged for better socioeconomic prediction. Experiments on two\ndatasets demonstrate the effectiveness of the synergistic design between LLM\nand KG, providing insights for information sharing across socioeconomic\nprediction tasks.\n","authors":["Zhilun Zhou","Jingyang Fan","Yu Liu","Fengli Xu","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.00028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14690v2","updated":"2024-11-19T14:16:38Z","published":"2023-02-28T16:01:38Z","title":"On the existence of minimizers in shallow residual ReLU neural network\n optimization landscapes","summary":" In this article, we show existence of minimizers in the loss landscape for\nresidual artificial neural networks (ANNs) with multi-dimensional input layer\nand one hidden layer with ReLU activation. Our work contrasts earlier results\nin [D. Gallon, A. Jentzen, and F. Lindner, preprint, arXiv:2211.15641, 2022]\nand [P. Petersen, M. 
Raslan, and F. Voigtlaender, Found. Comput. Math., 21\n(2021), pp. 375-444] which showed that in many situations minimizers do not\nexist for common smooth activation functions even in the case where the target\nfunctions are polynomials. The proof of the existence property makes use of a\nclosure of the search space containing all functions generated by ANNs and\nadditional discontinuous generalized responses. As we will show, the additional\ngeneralized responses in this larger space are suboptimal so that the minimum\nis attained in the original function class.\n","authors":["Steffen Dereich","Arnulf Jentzen","Sebastian Kassing"],"pdf_url":"https://arxiv.org/pdf/2302.14690v2.pdf","comment":"Author's Accepted Manuscript version. To appear in SINUM"},{"id":"http://arxiv.org/abs/2411.12523v1","updated":"2024-11-19T14:13:25Z","published":"2024-11-19T14:13:25Z","title":"Data Pruning in Generative Diffusion Models","summary":" Data pruning is the problem of identifying a core subset that is most\nbeneficial to training and discarding the remainder. While pruning strategies\nare well studied for discriminative models like those used in classification,\nlittle research has gone into their application to generative models.\nGenerative models aim to estimate the underlying distribution of the data, so\npresumably they should benefit from larger datasets. In this work we aim to\nshed light on the accuracy of this statement, specifically answer the question\nof whether data pruning for generative diffusion models could have a positive\nimpact. Contrary to intuition, we show that eliminating redundant or noisy data\nin large datasets is beneficial particularly when done strategically. We\nexperiment with several pruning methods including recent-state-of-art methods,\nand evaluate over CelebA-HQ and ImageNet datasets. We demonstrate that a simple\nclustering method outperforms other sophisticated and computationally demanding\nmethods. 
We further exhibit how we can leverage clustering to balance skewed\ndatasets in an unsupervised manner to allow fair sampling for underrepresented\npopulations in the data distribution, which is a crucial problem in generative\nmodels.\n","authors":["Rania Briq","Jiangtao Wang","Steffan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2411.12523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12516v1","updated":"2024-11-19T13:58:20Z","published":"2024-11-19T13:58:20Z","title":"MAViS: Modular Autonomous Virtualization System for Two-Dimensional\n Semiconductor Quantum Dot Arrays","summary":" Arrays of gate-defined semiconductor quantum dots are among the leading\ncandidates for building scalable quantum processors. High-fidelity\ninitialization, control, and readout of spin qubit registers require exquisite\nand targeted control over key Hamiltonian parameters that define the\nelectrostatic environment. However, due to the tight gate pitch, capacitive\ncrosstalk between gates hinders independent tuning of chemical potentials and\ninterdot couplings. While virtual gates offer a practical solution, determining\nall the required cross-capacitance matrices accurately and efficiently in large\nquantum dot registers is an open challenge. Here, we establish a Modular\nAutomated Virtualization System (MAViS) -- a general and modular framework for\nautonomously constructing a complete stack of multi-layer virtual gates in real\ntime. Our method employs machine learning techniques to rapidly extract\nfeatures from two-dimensional charge stability diagrams. We then utilize\ncomputer vision and regression models to self-consistently determine all\nrelative capacitive couplings necessary for virtualizing plunger and barrier\ngates in both low- and high-tunnel-coupling regimes. Using MAViS, we\nsuccessfully demonstrate accurate virtualization of a dense two-dimensional\narray comprising ten quantum dots defined in a high-quality Ge/SiGe\nheterostructure. 
Our work offers an elegant and practical solution for the\nefficient control of large-scale semiconductor quantum dot systems.\n","authors":["Anantha S. Rao","Donovan Buterakos","Barnaby van Straaten","Valentin John","Cécile X. Yu","Stefan D. Oosterhout","Lucas Stehouwer","Giordano Scappucci","Menno Veldhorst","Francesco Borsoi","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2411.12516v1.pdf","comment":"14 pages, 5 figures, 8 pages of supplemental material"},{"id":"http://arxiv.org/abs/2409.05023v2","updated":"2024-11-19T13:57:39Z","published":"2024-09-08T08:29:51Z","title":"Asymptotic and Non-Asymptotic Convergence of AdaGrad for Non-Convex\n Optimization via Novel Stopping Time-based Analysis","summary":" Adaptive optimizers have emerged as powerful tools in deep learning,\ndynamically adjusting the learning rate based on iterative gradients. These\nadaptive methods have significantly succeeded in various deep learning tasks,\noutperforming stochastic gradient descent (SGD). However, despite AdaGrad's\nstatus as a cornerstone of adaptive optimization, its theoretical analysis has\nnot adequately addressed key aspects such as asymptotic convergence and\nnon-asymptotic convergence rates in non-convex optimization scenarios. This\nstudy aims to provide a comprehensive analysis of AdaGrad, filling the existing\ngaps in the literature. We introduce an innovative stopping time technique from\nprobabilistic theory, which allows us to establish the stability of AdaGrad\nunder mild conditions for the first time. We further derive the asymptotically\nalmost sure and mean-square convergence for AdaGrad. In addition, we\ndemonstrate the near-optimal non-asymptotic convergence rate measured by the\naverage-squared gradients in expectation, which is stronger than the existing\nhigh-probability results. 
The techniques developed in this work are potentially\nindependent of interest for future research on other adaptive stochastic\nalgorithms.\n","authors":["Ruinan Jin","Xiaoyu Wang","Baoxiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.05023v2.pdf","comment":"50 pages"},{"id":"http://arxiv.org/abs/2411.12502v1","updated":"2024-11-19T13:40:49Z","published":"2024-11-19T13:40:49Z","title":"Transformer Neural Processes -- Kernel Regression","summary":" Stochastic processes model various natural phenomena from disease\ntransmission to stock prices, but simulating and quantifying their uncertainty\ncan be computationally challenging. For example, modeling a Gaussian Process\nwith standard statistical methods incurs an $\\mathcal{O}(n^3)$ penalty, and\neven using state-of-the-art Neural Processes (NPs) incurs an $\\mathcal{O}(n^2)$\npenalty due to the attention mechanism. We introduce the Transformer Neural\nProcess - Kernel Regression (TNP-KR), a new architecture that incorporates a\nnovel transformer block we call a Kernel Regression Block (KRBlock), which\nreduces the computational complexity of attention in transformer-based Neural\nProcesses (TNPs) from $\\mathcal{O}((n_C+n_T)^2)$ to $O(n_C^2+n_Cn_T)$ by\neliminating masked computations, where $n_C$ is the number of context, and\n$n_T$ is the number of test points, respectively, and a fast attention variant\nthat further reduces all attention calculations to $\\mathcal{O}(n_C)$ in space\nand time complexity. 
In benchmarks spanning such tasks as meta-regression,\nBayesian optimization, and image completion, we demonstrate that the full\nvariant matches the performance of state-of-the-art methods while training\nfaster and scaling two orders of magnitude higher in number of test points, and\nthe fast variant nearly matches that performance while scaling to millions of\nboth test and context points on consumer hardware.\n","authors":["Daniel Jenson","Jhonathan Navott","Mengyan Zhang","Makkunda Sharma","Elizaveta Semenova","Seth Flaxman"],"pdf_url":"https://arxiv.org/pdf/2411.12502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12498v1","updated":"2024-11-19T13:31:53Z","published":"2024-11-19T13:31:53Z","title":"Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic\n Corpus","summary":" Large language models (LLMs) are capable of solving a wide range of tasks,\nyet they have struggled with reasoning. To address this, we propose\n$\\textbf{Additional Logic Training (ALT)}$, which aims to enhance LLMs'\nreasoning capabilities by program-generated logical reasoning samples. We first\nestablish principles for designing high-quality samples by integrating symbolic\nlogic theory and previous empirical insights. Then, based on these principles,\nwe construct a synthetic corpus named $\\textbf{Formal Logic Deduction Diverse}$\n($\\textbf{FLD}$$^{\\times 2}$), comprising numerous samples of multi-step\ndeduction with unknown facts, diverse reasoning rules, diverse linguistic\nexpressions, and challenging distractors. Finally, we empirically show that ALT\non FLD$^{\\times2}$ substantially enhances the reasoning capabilities of\nstate-of-the-art LLMs, including LLaMA-3.1-70B. 
Improvements include gains of\nup to 30 points on logical reasoning benchmarks, up to 10 points on math and\ncoding benchmarks, and 5 points on the benchmark suite BBH.\n","authors":["Terufumi Morishita","Gaku Morio","Atsuki Yamaguchi","Yasuhiro Sogawa"],"pdf_url":"https://arxiv.org/pdf/2411.12498v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.19262v3","updated":"2024-11-19T13:27:30Z","published":"2024-05-29T16:55:32Z","title":"Weak-to-Strong Search: Align Large Language Models via Searching over\n Small Language Models","summary":" Large language models are usually fine-tuned to align with human preferences.\nHowever, fine-tuning a large language model can be challenging. In this work,\nwe introduce $\\textit{weak-to-strong search}$, framing the alignment of a large\nlanguage model as a test-time greedy search to maximize the log-probability\ndifference between small tuned and untuned models while sampling from the\nfrozen large model. This method serves both as (1) a compute-efficient model\nup-scaling strategy that avoids directly tuning the large model and as (2) an\ninstance of weak-to-strong generalization that enhances a strong model with\nweak test-time guidance. Empirically, we demonstrate the flexibility of\nweak-to-strong search across different tasks. In controlled-sentiment\ngeneration and summarization, we use tuned and untuned $\\texttt{gpt2}$s to\nimprove the alignment of large models without additional training. 
Crucially,\nin a more difficult instruction-following benchmark, AlpacaEval 2.0, we show\nthat reusing off-the-shelf small models (e.g., $\\texttt{zephyr-7b-beta}$ and\nits untuned version) can improve the length-controlled win rates of both\nwhite-box and black-box large models against $\\texttt{gpt-4-turbo}$ (e.g.,\n$34.4\\% \\rightarrow 37.9\\%$ for $\\texttt{Llama-3-70B-Instruct}$ and $16.0\\%\n\\rightarrow 20.1\\%$ for $\\texttt{gpt-3.5-turbo-instruct}$), despite the small\nmodels' low win rates $\\approx 10.0\\%$.\n","authors":["Zhanhui Zhou","Zhixuan Liu","Jie Liu","Zhichen Dong","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2405.19262v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.11132v2","updated":"2024-11-19T13:13:58Z","published":"2024-11-17T17:36:30Z","title":"Variational Bayesian Bow tie Neural Networks with Shrinkage","summary":" Despite the dominant role of deep models in machine learning, limitations\npersist, including overconfident predictions, susceptibility to adversarial\nattacks, and underestimation of variability in predictions. The Bayesian\nparadigm provides a natural framework to overcome such issues and has become\nthe gold standard for uncertainty estimation with deep models, also providing\nimproved accuracy and a framework for tuning critical hyperparameters. However,\nexact Bayesian inference is challenging, typically involving variational\nalgorithms that impose strong independence and distributional assumptions.\nMoreover, existing methods are sensitive to the architectural choice of the\nnetwork. We address these issues by constructing a relaxed version of the\nstandard feed-forward rectified neural network, and employing Polya-Gamma data\naugmentation tricks to render a conditionally linear and Gaussian model.\nAdditionally, we use sparsity-promoting priors on the weights of the neural\nnetwork for data-driven architectural design. 
To approximate the posterior, we\nderive a variational inference algorithm that avoids distributional assumptions\nand independence across layers and is a faster alternative to the usual Markov\nChain Monte Carlo schemes.\n","authors":["Alisa Sheinkman","Sara Wade"],"pdf_url":"https://arxiv.org/pdf/2411.11132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06701v3","updated":"2024-11-19T13:09:06Z","published":"2023-07-13T11:58:27Z","title":"S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized\n Variational Autoencoder for Video Prediction","summary":" We address the video prediction task by putting forth a novel model that\ncombines (i) a novel hierarchical residual learning vector quantized\nvariational autoencoder (HR-VQVAE), and (ii) a novel autoregressive\nspatiotemporal predictive model (AST-PM). We refer to this approach as a\nsequential hierarchical residual learning vector quantized variational\nautoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE\nat modeling still images with a parsimonious representation, combined with the\nAST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better\ndeal with major challenges in video prediction. These include learning\nspatiotemporal information, handling high dimensional data, combating blurry\nprediction, and implicit modeling of physical characteristics. Extensive\nexperimental results on four challenging tasks, namely KTH Human Action,\nTrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably\nagainst state-of-the-art video prediction techniques both in quantitative and\nqualitative evaluations despite a much smaller model size. 
Finally, we boost\nS-HR-VQVAE by proposing a novel training method to jointly estimate the\nHR-VQVAE and AST-PM parameters.\n","authors":["Mohammad Adiban","Kalin Stefanov","Sabato Marco Siniscalchi","Giampiero Salvi"],"pdf_url":"https://arxiv.org/pdf/2307.06701v3.pdf","comment":"12 pages, 6 figures, 5 tables. Accepted for publication on IEEE\n Transactions on Multimedia on 2024-11-19"},{"id":"http://arxiv.org/abs/2411.12484v1","updated":"2024-11-19T13:08:03Z","published":"2024-11-19T13:08:03Z","title":"Regular-pattern-sensitive CRFs for Distant Label Interactions","summary":" Linear-chain conditional random fields (CRFs) are a common model component\nfor sequence labeling tasks when modeling the interactions between different\nlabels is important. However, the Markov assumption limits linear-chain CRFs to\nonly directly modeling interactions between adjacent labels. Weighted\nfinite-state transducers (FSTs) are a related approach which can be made to\nmodel distant label-label interactions, but exact label inference is\nintractable for these models in the general case, and the task of selecting an\nappropriate automaton structure for the desired interaction types poses a\npractical challenge. In this work, we present regular-pattern-sensitive CRFs\n(RPCRFs), a method of enriching standard linear-chain CRFs with the ability to\nlearn long-distance label interactions which occur in user-specified patterns.\nThis approach allows users to write regular-expression label patterns concisely\nspecifying which types of interactions the model should take into account,\nallowing the model to learn from data whether and in which contexts these\npatterns occur. The result can be interpreted alternatively as a CRF augmented\nwith additional, non-local potentials, or as a finite-state transducer whose\nstructure is defined by a set of easily-interpretable patterns. 
Critically,\nunlike the general case for FSTs (and for non-chain CRFs), exact training and\ninference are tractable for many pattern sets. In this work, we detail how a\nRPCRF can be automatically constructed from a set of user-specified patterns,\nand demonstrate the model's effectiveness on synthetic data, showing how\ndifferent types of patterns can capture different nonlocal dependency\nstructures in label sequences.\n","authors":["Sean Papay","Roman Klinger","Sebastian Pado"],"pdf_url":"https://arxiv.org/pdf/2411.12484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12476v1","updated":"2024-11-19T12:56:43Z","published":"2024-11-19T12:56:43Z","title":"Comparing Prior and Learned Time Representations in Transformer Models\n of Timeseries","summary":" What sets timeseries analysis apart from other machine learning exercises is\nthat time representation becomes a primary aspect of the experiment setup, as\nit must adequately represent the temporal relations that are relevant for the\napplication at hand. In the work described here we study wo different\nvariations of the Transformer architecture: one where we use the fixed time\nrepresentation proposed in the literature and one where the time representation\nis learned from the data. Our experiments use data from predicting the energy\noutput of solar panels, a task that exhibits known periodicities (daily and\nseasonal) that is straight-forward to encode in the fixed time representation.\nOur results indicate that even in an experiment where the phenomenon is\nwell-understood, it is difficult to encode prior knowledge due to side-effects\nthat are difficult to mitigate. 
We conclude that research work is needed to\nwork the human into the learning loop in ways that improve the robustness and\ntrust-worthiness of the network.\n","authors":["Natalia Koliou","Tatiana Boura","Stasinos Konstantopoulos","George Meramveliotakis","George Kosmadakis"],"pdf_url":"https://arxiv.org/pdf/2411.12476v1.pdf","comment":"Presented at the AI in Natural Sciences and Technology (AINST) track\n of the 13th Conference on Artificial Intelligence (SETN 2024), 11-13\n September 2024, Piraeus, Greece"},{"id":"http://arxiv.org/abs/2411.12469v1","updated":"2024-11-19T12:51:17Z","published":"2024-11-19T12:51:17Z","title":"AI Flow at the Network Edge","summary":" Recent advancements in large language models (LLMs) and their multimodal\nvariants have led to remarkable progress across various domains, demonstrating\nimpressive capabilities and unprecedented potential. In the era of ubiquitous\nconnectivity, leveraging communication networks to distribute intelligence is a\ntransformative concept, envisioning AI-powered services accessible at the\nnetwork edge. However, pushing large models from the cloud to\nresource-constrained environments faces critical challenges. Model inference on\nlow-end devices leads to excessive latency and performance bottlenecks, while\nraw data transmission over limited bandwidth networks causes high communication\noverhead. This article presents AI Flow, a framework that streamlines the\ninference process by jointly leveraging the heterogeneous resources available\nacross devices, edge nodes, and cloud servers, making intelligence flow across\nnetworks. To facilitate cooperation among multiple computational nodes, the\nproposed framework explores a paradigm shift in the design of communication\nnetwork systems from transmitting information flow to intelligence flow, where\nthe goal of communications is task-oriented and folded into the inference\nprocess. 
Experimental results demonstrate the effectiveness of the proposed\nframework through an image captioning use case, showcasing the ability to\nreduce response latency while maintaining high-quality captions. This article\nserves as a position paper for identifying the motivation, challenges, and\nprinciples of AI Flow.\n","authors":["Jiawei Shao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2411.12469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09122v2","updated":"2024-11-19T12:40:43Z","published":"2024-02-14T12:18:23Z","title":"Mixed-Output Gaussian Process Latent Variable Models","summary":" This work develops a Bayesian non-parametric approach to signal separation\nwhere the signals may vary according to latent variables. Our key contribution\nis to augment Gaussian Process Latent Variable Models (GPLVMs) for the case\nwhere each data point comprises the weighted sum of a known number of pure\ncomponent signals, observed across several input locations. Our framework\nallows arbitrary non-linear variations in the signals while being able to\nincorporate useful priors for the linear weights, such as summing-to-one. Our\ncontributions are particularly relevant to spectroscopy, where changing\nconditions may cause the underlying pure component signals to vary from sample\nto sample. 
To demonstrate the applicability to both spectroscopy and other\ndomains, we consider several applications: a near-infrared spectroscopy dataset\nwith varying temperatures, a simulated dataset for identifying flow\nconfiguration through a pipe, and a dataset for determining the type of rock\nfrom its reflectance.\n","authors":["James Odgers","Ruby Sedgwick","Chrysoula Kappatou","Ruth Misener","Sarah Filippi"],"pdf_url":"https://arxiv.org/pdf/2402.09122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19997v2","updated":"2024-11-19T12:28:19Z","published":"2024-06-28T15:32:59Z","title":"Wavelets Are All You Need for Autoregressive Image Generation","summary":" In this paper, we take a new approach to autoregressive image generation that\nis based on two main ingredients. The first is wavelet image coding, which\nallows to tokenize the visual details of an image from coarse to fine details\nby ordering the information starting with the most significant bits of the most\nsignificant wavelet coefficients. The second is a variant of a language\ntransformer whose architecture is re-designed and optimized for token sequences\nin this 'wavelet language'. The transformer learns the significant statistical\ncorrelations within a token sequence, which are the manifestations of\nwell-known correlations between the wavelet subbands at various resolutions. 
We\nshow experimental results with conditioning on the generation process.\n","authors":["Wael Mattar","Idan Levy","Nir Sharon","Shai Dekel"],"pdf_url":"https://arxiv.org/pdf/2406.19997v2.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.12451v1","updated":"2024-11-19T12:19:28Z","published":"2024-11-19T12:19:28Z","title":"Empirical Privacy Evaluations of Generative and Predictive Machine\n Learning Models -- A review and challenges for practice","summary":" Synthetic data generators, when trained using privacy-preserving techniques\nlike differential privacy, promise to produce synthetic data with formal\nprivacy guarantees, facilitating the sharing of sensitive data. However, it is\ncrucial to empirically assess the privacy risks associated with the generated\nsynthetic data before deploying generative technologies. This paper outlines\nthe key concepts and assumptions underlying empirical privacy evaluation in\nmachine learning-based generative and predictive models. Then, this paper\nexplores the practical challenges for privacy evaluations of generative models\nfor use cases with millions of training records, such as data from statistical\nagencies and healthcare providers. Our findings indicate that methods designed\nto verify the correct operation of the training algorithm are effective for\nlarge datasets, but they often assume an adversary that is unrealistic in many\nscenarios. Based on the findings, we highlight a crucial trade-off between the\ncomputational feasibility of the evaluation and the level of realism of the\nassumed threat model. 
Finally, we conclude with ideas and suggestions for\nfuture research.\n","authors":["Flavio Hafner","Chang Sun"],"pdf_url":"https://arxiv.org/pdf/2411.12451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12438v1","updated":"2024-11-19T11:58:51Z","published":"2024-11-19T11:58:51Z","title":"Dimension Reduction via Sum-of-Squares and Improved Clustering\n Algorithms for Non-Spherical Mixtures","summary":" We develop a new approach for clustering non-spherical (i.e., arbitrary\ncomponent covariances) Gaussian mixture models via a subroutine, based on the\nsum-of-squares method, that finds a low-dimensional separation-preserving\nprojection of the input data. Our method gives a non-spherical analog of the\nclassical dimension reduction, based on singular value decomposition, that\nforms a key component of the celebrated spherical clustering algorithm of\nVempala and Wang [VW04] (in addition to several other applications).\n As applications, we obtain an algorithm to (1) cluster an arbitrary\ntotal-variation separated mixture of $k$ centered (i.e., zero-mean) Gaussians\nwith $n\\geq \\operatorname{poly}(d) f(w_{\\min}^{-1})$ samples and\n$\\operatorname{poly}(n)$ time, and (2) cluster an arbitrary total-variation\nseparated mixture of $k$ Gaussians with identical but arbitrary unknown\ncovariance with $n \\geq d^{O(\\log w_{\\min}^{-1})} f(w_{\\min}^{-1})$ samples and\n$n^{O(\\log w_{\\min}^{-1})}$ time. Here, $w_{\\min}$ is the minimum mixing weight\nof the input mixture, and $f$ does not depend on the dimension $d$. Our\nalgorithms naturally extend to tolerating a dimension-independent fraction of\narbitrary outliers. Before this work, the techniques in the state-of-the-art\nnon-spherical clustering algorithms needed $d^{O(k)} f(w_{\\min}^{-1})$ time and\nsamples for clustering such mixtures.\n Our results may come as a surprise in the context of the $d^{\\Omega(k)}$\nstatistical query lower bound [DKS17] for clustering non-spherical Gaussian\nmixtures. 
While this result is usually thought to rule out $d^{o(k)}$ cost\nalgorithms for the problem, our results show that the lower bounds can in fact\nbe circumvented for a remarkably general class of Gaussian mixtures.\n","authors":["Prashanti Anderson","Mitali Bafna","Rares-Darius Buhai","Pravesh K. Kothari","David Steurer"],"pdf_url":"https://arxiv.org/pdf/2411.12438v1.pdf","comment":"64 pages"},{"id":"http://arxiv.org/abs/2411.12435v1","updated":"2024-11-19T11:52:10Z","published":"2024-11-19T11:52:10Z","title":"STRisk: A Socio-Technical Approach to Assess Hacking Breaches Risk","summary":" Data breaches have begun to take on new dimensions and their prediction is\nbecoming of great importance to organizations. Prior work has addressed this\nissue mainly from a technical perspective and neglected other interfering\naspects such as the social media dimension. To fill this gap, we propose STRisk\nwhich is a predictive system where we expand the scope of the prediction task\nby bringing into play the social media dimension. We study over 3800 US\norganizations including both victim and non-victim organizations. For each\norganization, we design a profile composed of a variety of externally measured\ntechnical indicators and social factors. In addition, to account for unreported\nincidents, we consider the non-victim sample to be noisy and propose a noise\ncorrection approach to correct mislabeled organizations. We then build several\nmachine learning models to predict whether an organization is exposed to\nexperience a hacking breach. By exploiting both technical and social features,\nwe achieve a Area Under Curve (AUC) score exceeding 98%, which is 12% higher\nthan the AUC achieved using only technical features. 
Furthermore, our feature\nimportance analysis reveals that open ports and expired certificates are the\nbest technical predictors, while spreadability and agreeability are the best\nsocial predictors.\n","authors":["Hicham Hammouchi","Narjisse Nejjari","Ghita Mezzour","Mounir Ghogho","Houda Benbrahim"],"pdf_url":"https://arxiv.org/pdf/2411.12435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00570v2","updated":"2024-11-19T11:00:38Z","published":"2024-03-01T14:47:46Z","title":"Rethinking cluster-conditioned diffusion models for label-free image\n synthesis","summary":" Diffusion-based image generation models can enhance image quality when\nconditioned on ground truth labels. Here, we conduct a comprehensive\nexperimental study on image-level conditioning for diffusion models using\ncluster assignments. We investigate how individual clustering determinants,\nsuch as the number of clusters and the clustering method, impact image\nsynthesis across three different datasets. Given the optimal number of clusters\nwith respect to image synthesis, we show that cluster-conditioning can achieve\nstate-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for\nCIFAR100, along with a strong increase in training sample efficiency. We\nfurther propose a novel empirical method to estimate an upper bound for the\noptimal number of clusters. Unlike existing approaches, we find no significant\nassociation between clustering performance and the corresponding\ncluster-conditional FID scores. The code is available at\nhttps://github.com/HHU-MMBS/cedm-official-wavc2025.\n","authors":["Nikolas Adaloglou","Tim Kaiser","Felix Michels","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2403.00570v2.pdf","comment":"Accepted in WAVC2025 (21 pages, 15 figures). 
Code is available at\n https://github.com/HHU-MMBS/cedm-official-wavc2025"},{"id":"http://arxiv.org/abs/2405.04793v2","updated":"2024-11-19T10:59:30Z","published":"2024-05-08T03:57:45Z","title":"Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP\n Model Evaluation","summary":" With the development and proliferation of large, complex, black-box models\nfor solving many natural language processing (NLP) tasks, there is also an\nincreasing necessity of methods to stress-test these models and provide some\ndegree of interpretability or explainability. While counterfactual examples are\nuseful in this regard, automated generation of counterfactuals is a data and\nresource intensive process. such methods depend on models such as pre-trained\nlanguage models that are then fine-tuned on auxiliary, often task-specific\ndatasets, that may be infeasible to build in practice, especially for new tasks\nand data domains. Therefore, in this work we explore the possibility of\nleveraging large language models (LLMs) for zero-shot counterfactual generation\nin order to stress-test NLP models. We propose a structured pipeline to\nfacilitate this generation, and we hypothesize that the instruction-following\nand textual understanding capabilities of recent LLMs can be effectively\nleveraged for generating high quality counterfactuals in a zero-shot manner,\nwithout requiring any training or fine-tuning. 
Through comprehensive\nexperiments on a variety of propreitary and open-source LLMs, along with\nvarious downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot\ncounterfactual generators in evaluating and explaining black-box NLP models.\n","authors":["Amrita Bhattacharjee","Raha Moraffah","Joshua Garland","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2405.04793v2.pdf","comment":"Longer version of short paper accepted at IEEE BigData 2024 (Main\n Track)"},{"id":"http://arxiv.org/abs/2207.01581v2","updated":"2024-11-19T10:28:29Z","published":"2022-07-04T17:01:18Z","title":"Interpretable Fusion Analytics Framework for fMRI Connectivity:\n Self-Attention Mechanism and Latent Space Item-Response Model","summary":" There have been several attempts to use deep learning based on brain fMRI\nsignals to classify cognitive impairment diseases. However, deep learning is a\nhidden black box model that makes it difficult to interpret the process of\nclassification. To address this issue, we propose a novel analytical framework\nthat interprets the classification result from deep learning processes. We\nfirst derive the region of interest (ROI) functional connectivity network (FCN)\nby embedding functions based on their similar signal patterns. Then, using the\nself-attention equipped deep learning model, we classify diseases based on\ntheir FCN. Finally, in order to interpret the classification results, we employ\na latent space item-response interaction network model to identify the\nsignificant functions that exhibit distinct connectivity patterns when compared\nto other diseases. 
The application of this proposed framework to the four types\nof cognitive impairment shows that our approach is valid for determining the\nsignificant ROI functions.\n","authors":["Jeong-Jae Kim","Yeseul Jeon","SuMin Yu","Junggu Choi","Sanghoon Han"],"pdf_url":"https://arxiv.org/pdf/2207.01581v2.pdf","comment":"This submission is a duplicate of another manuscript from our\n research group [arXiv preprint arXiv:2401.09028] due to a misunderstanding in\n communication among co-authors"},{"id":"http://arxiv.org/abs/2411.11616v2","updated":"2024-11-19T10:11:04Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. 
The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v2.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2408.06071v2","updated":"2024-11-19T09:55:37Z","published":"2024-08-12T11:44:47Z","title":"A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in\n Adverse Weather and Lighting","summary":" High-autonomy vehicle functions rely on machine learning (ML) algorithms to\nunderstand the environment. Despite displaying remarkable performance in fair\nweather scenarios, perception algorithms are heavily affected by adverse\nweather and lighting conditions. To overcome these difficulties, ML engineers\nmainly rely on comprehensive real-world datasets. However, the difficulties in\nreal-world data collection for critical areas of the operational design domain\n(ODD) often means synthetic data is required for perception training and safety\nvalidation. Thus, we present A-BDD, a large set of over 60,000 synthetically\naugmented images based on BDD100K that are equipped with semantic segmentation\nand bounding box annotations (inherited from the BDD100K dataset). The dataset\ncontains augmented data for rain, fog, overcast and sunglare/shadow with\nvarying intensity levels. We further introduce novel strategies utilizing\nfeature-based image quality metrics like FID and CMMD, which help identify\nuseful augmented and real-world data for ML training and testing. 
By conducting\nexperiments on A-BDD, we provide evidence that data augmentations can play a\npivotal role in closing performance gaps in adverse weather and lighting\nconditions.\n","authors":["Felix Assion","Florens Gressner","Nitin Augustine","Jona Klemenc","Ahmed Hammam","Alexandre Krattinger","Holger Trittenbach","Anja Philippsen","Sascha Riemer"],"pdf_url":"https://arxiv.org/pdf/2408.06071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12377v1","updated":"2024-11-19T09:53:28Z","published":"2024-11-19T09:53:28Z","title":"Non-IID data in Federated Learning: A Systematic Review with Taxonomy,\n Metrics, Methods, Frameworks and Future Directions","summary":" Recent advances in machine learning have highlighted Federated Learning (FL)\nas a promising approach that enables multiple distributed users (so-called\nclients) to collectively train ML models without sharing their private data.\nWhile this privacy-preserving method shows potential, it struggles when data\nacross clients is not independent and identically distributed (non-IID) data.\nThe latter remains an unsolved challenge that can result in poorer model\nperformance and slower training times. Despite the significance of non-IID data\nin FL, there is a lack of consensus among researchers about its classification\nand quantification. This systematic review aims to fill that gap by providing a\ndetailed taxonomy for non-IID data, partition protocols, and metrics to\nquantify data heterogeneity. Additionally, we describe popular solutions to\naddress non-IID data and standardized frameworks employed in FL with\nheterogeneous data. Based on our state-of-the-art review, we present key\nlessons learned and suggest promising future research directions.\n","authors":["Daniel M. 
Jimenez G.","David Solans","Mikko Heikkila","Andrea Vitaletti","Nicolas Kourtellis","Aris Anagnostopoulos","Ioannis Chatzigiannakis"],"pdf_url":"https://arxiv.org/pdf/2411.12377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12044v4","updated":"2024-11-19T09:52:55Z","published":"2023-12-19T10:57:12Z","title":"XLand-MiniGrid: Scalable Meta-Reinforcement Learning Environments in JAX","summary":" Inspired by the diversity and depth of XLand and the simplicity and\nminimalism of MiniGrid, we present XLand-MiniGrid, a suite of tools and\ngrid-world environments for meta-reinforcement learning research. Written in\nJAX, XLand-MiniGrid is designed to be highly scalable and can potentially run\non GPU or TPU accelerators, democratizing large-scale experimentation with\nlimited resources. Along with the environments, XLand-MiniGrid provides\npre-sampled benchmarks with millions of unique tasks of varying difficulty and\neasy-to-use baselines that allow users to quickly start training adaptive\nagents. In addition, we have conducted a preliminary analysis of scaling and\ngeneralization, showing that our baselines are capable of reaching millions of\nsteps per second during training and validating that the proposed benchmarks\nare challenging. XLand-MiniGrid is open-source and available at\nhttps://github.com/dunnolab/xland-minigrid.\n","authors":["Alexander Nikulin","Vladislav Kurenkov","Ilya Zisman","Artem Agarkov","Viacheslav Sinii","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.12044v4.pdf","comment":"Neural Information Processing Systems (NeurIPS 2024) Track on\n Datasets and Benchmarks. 
Source code at\n https://github.com/dunnolab/xland-minigrid"},{"id":"http://arxiv.org/abs/2405.17151v3","updated":"2024-11-19T09:48:17Z","published":"2024-05-27T13:26:34Z","title":"Smoke and Mirrors in Causal Downstream Tasks","summary":" Machine Learning and AI have the potential to transform data-driven\nscientific discovery, enabling accurate predictions for several scientific\nphenomena. As many scientific questions are inherently causal, this paper looks\nat the causal inference task of treatment effect estimation, where the outcome\nof interest is recorded in high-dimensional observations in a Randomized\nControlled Trial (RCT). Despite being the simplest possible causal setting and\na perfect fit for deep learning, we theoretically find that many common choices\nin the literature may lead to biased estimates. To test the practical impact of\nthese considerations, we recorded ISTAnt, the first real-world benchmark for\ncausal inference downstream tasks on high-dimensional observations as an RCT\nstudying how garden ants (Lasius neglectus) respond to microparticles applied\nonto their colony members by hygienic grooming. Comparing 6 480 models\nfine-tuned from state-of-the-art visual backbones, we find that the sampling\nand modeling choices significantly affect the accuracy of the causal estimate,\nand that classification accuracy is not a proxy thereof. We further validated\nthe analysis, repeating it on a synthetically generated visual data set\ncontrolling the causal model. Our results suggest that future benchmarks should\ncarefully consider real downstream scientific questions, especially causal\nones. 
Further, we highlight guidelines for representation learning methods to\nhelp answer causal questions in the sciences.\n","authors":["Riccardo Cadei","Lukas Lindorfer","Sylvia Cremer","Cordelia Schmid","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2405.17151v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12372v1","updated":"2024-11-19T09:35:28Z","published":"2024-11-19T09:35:28Z","title":"RedPajama: an Open Dataset for Training Large Language Models","summary":" Large language models are increasingly becoming a cornerstone technology in\nartificial intelligence, the sciences, and society as a whole, yet the optimal\nstrategies for dataset composition and filtering remain largely elusive. Many\nof the top-performing models lack transparency in their dataset curation and\nmodel development processes, posing an obstacle to the development of fully\nopen language models. In this paper, we identify three core data-related\nchallenges that must be addressed to advance open-source language models. These\ninclude (1) transparency in model development, including the data curation\nprocess, (2) access to large quantities of high-quality data, and (3)\navailability of artifacts and metadata for dataset curation and analysis. To\naddress these challenges, we release RedPajama-V1, an open reproduction of the\nLLaMA training dataset. In addition, we release RedPajama-V2, a massive\nweb-only dataset consisting of raw, unfiltered text data together with quality\nsignals and metadata. Together, the RedPajama datasets comprise over 100\ntrillion tokens spanning multiple domains and with their quality signals\nfacilitate the filtering of data, aiming to inspire the development of numerous\nnew datasets. To date, these datasets have already been used in the training of\nstrong language models used in production, such as Snowflake Arctic,\nSalesforce's XGen and AI2's OLMo. 
To provide insight into the quality of\nRedPajama, we present a series of analyses and ablation studies with\ndecoder-only language models with up to 1.6B parameters. Our findings\ndemonstrate how quality signals for web data can be effectively leveraged to\ncurate high-quality subsets of the dataset, underscoring the potential of\nRedPajama to advance the development of transparent and high-performing\nlanguage models at scale.\n","authors":["Maurice Weber","Daniel Fu","Quentin Anthony","Yonatan Oren","Shane Adams","Anton Alexandrov","Xiaozhong Lyu","Huu Nguyen","Xiaozhe Yao","Virginia Adams","Ben Athiwaratkun","Rahul Chalamala","Kezhen Chen","Max Ryabinin","Tri Dao","Percy Liang","Christopher Ré","Irina Rish","Ce Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12372v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.10755v2","updated":"2024-11-19T09:30:44Z","published":"2024-11-16T09:22:46Z","title":"Diffusion-Based Semantic Segmentation of Lumbar Spine MRI Scans of Lower\n Back Pain Patients","summary":" This study introduces a diffusion-based framework for robust and accurate\nsegmenton of vertebrae, intervertebral discs (IVDs), and spinal canal from\nMagnetic Resonance Imaging~(MRI) scans of patients with low back pain (LBP),\nregardless of whether the scans are T1w or T2-weighted. The results showed that\nSpineSegDiff achieved comparable outperformed non-diffusion state-of-the-art\nmodels in the identification of degenerated IVDs. Our findings highlight the\npotential of diffusion models to improve LBP diagnosis and management through\nprecise spine MRI analysis.\n","authors":["Maria Monzon","Thomas Iff","Ender Konukoglu","Catherine R. 
Jutzeler"],"pdf_url":"https://arxiv.org/pdf/2411.10755v2.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 5 pages"},{"id":"http://arxiv.org/abs/2409.16718v2","updated":"2024-11-19T09:27:37Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. 
The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.12364v1","updated":"2024-11-19T09:24:34Z","published":"2024-11-19T09:24:34Z","title":"Ultra-Sparse Memory Network","summary":" It is widely acknowledged that the performance of Transformer models is\nexponentially related to their number of parameters and computational\ncomplexity. While approaches like Mixture of Experts (MoE) decouple parameter\ncount from computational complexity, they still face challenges in inference\ndue to high memory access costs. This work introduces UltraMem, incorporating\nlarge-scale, ultra-sparse memory layer to address these limitations. Our\napproach significantly reduces inference latency while maintaining model\nperformance. We also investigate the scaling laws of this new architecture,\ndemonstrating that it not only exhibits favorable scaling properties but\noutperforms traditional models. In our experiments, we train networks with up\nto 20 million memory slots. The results show that our method achieves\nstate-of-the-art inference speed and model performance within a given\ncomputational budget.\n","authors":["Zihao Huang","Qiyang Min","Hongzhi Huang","Defa Zhu","Yutao Zeng","Ran Guo","Xun Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.12364v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.12352v1","updated":"2024-11-19T09:15:08Z","published":"2024-11-19T09:15:08Z","title":"Perfecting Imperfect Physical Neural Networks with Transferable\n Robustness using Sharpness-Aware Training","summary":" AI models are essential in science and engineering, but recent advances are\npushing the limits of traditional digital hardware. 
To address these\nlimitations, physical neural networks (PNNs), which use physical substrates for\ncomputation, have gained increasing attention. However, developing effective\ntraining methods for PNNs remains a significant challenge. Current approaches,\nregardless of offline and online training, suffer from significant accuracy\nloss. Offline training is hindered by imprecise modeling, while online training\nyields device-specific models that can't be transferred to other devices due to\nmanufacturing variances. Both methods face challenges from perturbations after\ndeployment, such as thermal drift or alignment errors, which make trained\nmodels invalid and require retraining. Here, we address the challenges with\nboth offline and online training through a novel technique called\nSharpness-Aware Training (SAT), where we innovatively leverage the geometry of\nthe loss landscape to tackle the problems in training physical systems. SAT\nenables accurate training using efficient backpropagation algorithms, even with\nimprecise models. PNNs trained by SAT offline even outperform those trained\nonline, despite modeling and fabrication errors. SAT also overcomes online\ntraining limitations by enabling reliable transfer of models between devices.\nFinally, SAT is highly resilient to perturbations after deployment, allowing\nPNNs to continuously operate accurately under perturbations without retraining.\nWe demonstrate SAT across three types of PNNs, showing it is universally\napplicable, regardless of whether the models are explicitly known. 
This work\noffers a transformative, efficient approach to training PNNs, addressing\ncritical challenges in analog computing and enabling real-world deployment.\n","authors":["Tengji Xu","Zeyu Luo","Shaojie Liu","Li Fan","Qiarong Xiao","Benshan Wang","Dongliang Wang","Chaoran Huang"],"pdf_url":"https://arxiv.org/pdf/2411.12352v1.pdf","comment":"24 pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.00970v3","updated":"2024-11-19T08:55:53Z","published":"2023-03-02T05:08:15Z","title":"PAPAL: A Provable PArticle-based Primal-Dual ALgorithm for Mixed Nash\n Equilibrium","summary":" We consider the non-convex non-concave objective function in two-player\nzero-sum continuous games. The existence of pure Nash equilibrium requires\nstringent conditions, posing a major challenge for this problem. To circumvent\nthis difficulty, we examine the problem of identifying a mixed Nash\nequilibrium, where strategies are randomized and characterized by probability\ndistributions over continuous domains. To this end, we propose PArticle-based\nPrimal-dual ALgorithm (PAPAL) tailored for a weakly entropy-regularized min-max\noptimization over probability distributions. This algorithm employs the\nstochastic movements of particles to represent the updates of random strategies\nfor the $\\epsilon$-mixed Nash equilibrium. We offer a comprehensive convergence\nanalysis of the proposed algorithm, demonstrating its effectiveness. In\ncontrast to prior research that attempted to update particle importance without\nmovements, PAPAL is the first implementable particle-based algorithm\naccompanied by non-asymptotic quantitative convergence results, running time,\nand sample complexity guarantees. 
Our framework contributes novel insights into\nthe particle-based algorithms for continuous min-max optimization in the\ngeneral non-convex non-concave setting.\n","authors":["Shihong Ding","Hanze Dong","Cong Fang","Zhouchen Lin","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.00970v3.pdf","comment":"Published in Journal of Machine Learning Research 25 (2024) 1-48"},{"id":"http://arxiv.org/abs/2411.11515v2","updated":"2024-11-19T08:50:38Z","published":"2024-11-18T12:22:37Z","title":"Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to\n Enhance Cell Segmentation","summary":" Automated cell segmentation in microscopy images is essential for biomedical\nresearch, yet conventional methods are labor-intensive and prone to error.\nWhile deep learning-based approaches have proven effective, they often require\nlarge annotated datasets, which are scarce due to the challenges of manual\nannotation. To overcome this, we propose a novel framework for synthesizing\ndensely annotated 2D and 3D cell microscopy images using cascaded diffusion\nmodels. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations\nusing multi-level diffusion models and NeuS, a 3D surface reconstruction\napproach. Following that, a pretrained 2D Stable Diffusion model is finetuned\nto generate realistic cell textures and the final outputs are combined to form\ncell populations. We show that training a segmentation model with a combination\nof our synthetic data and real data improves cell segmentation performance by\nup to 9\\% across multiple datasets. Additionally, the FID scores indicate that\nthe synthetic data closely resembles real data. 
The code for our proposed\napproach will be available at\nhttps://github.com/ruveydayilmaz0/cascaded_diffusion.\n","authors":["Rüveyda Yilmaz","Kaan Keven","Yuli Wu","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2411.11515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12334v1","updated":"2024-11-19T08:36:34Z","published":"2024-11-19T08:36:34Z","title":"Learning from Label Proportions and Covariate-shifted Instances","summary":" In many applications, especially due to lack of supervision or privacy\nconcerns, the training data is grouped into bags of instances (feature-vectors)\nand for each bag we have only an aggregate label derived from the\ninstance-labels in the bag. In learning from label proportions (LLP) the\naggregate label is the average of the instance-labels in a bag, and a\nsignificant body of work has focused on training models in the LLP setting to\npredict instance-labels. In practice however, the training data may have fully\nsupervised albeit covariate-shifted source data, along with the usual target\ndata with bag-labels, and we wish to train a good instance-level predictor on\nthe target domain. We call this the covariate-shifted hybrid LLP problem. Fully\nsupervised covariate shifted data often has useful training signals and the\ngoal is to leverage them for better predictive performance in the hybrid LLP\nsetting. To achieve this, we develop methods for hybrid LLP which naturally\nincorporate the target bag-labels along with the source instance-labels, in the\ndomain adaptation framework. 
Apart from proving theoretical guarantees bounding\nthe target generalization error, we also conduct experiments on several\npublicly available datasets showing that our methods outperform LLP and domain\nadaptation baselines as well techniques from previous related work.\n","authors":["Sagalpreet Singh","Navodita Sharma","Shreyas Havaldar","Rishi Saket","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2411.12334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11567v2","updated":"2024-11-19T08:35:01Z","published":"2024-11-18T13:40:03Z","title":"GNN-Based Code Annotation Logic for Establishing Security Boundaries in\n C Code","summary":" Securing sensitive operations in today's interconnected software landscape is\ncrucial yet challenging. Modern platforms rely on Trusted Execution\nEnvironments (TEEs), such as Intel SGX and ARM TrustZone, to isolate security\nsensitive code from the main system, reducing the Trusted Computing Base (TCB)\nand providing stronger assurances. However, identifying which code should\nreside in TEEs is complex and requires specialized expertise, which is not\nsupported by current automated tools. Existing solutions often migrate entire\napplications to TEEs, leading to suboptimal use and an increased TCB. To\naddress this gap, we propose Code Annotation Logic (CAL), a pioneering tool\nthat automatically identifies security sensitive components for TEE isolation.\nCAL analyzes codebases, leveraging a graph-based approach with novel feature\nconstruction and employing a custom graph neural network model to accurately\ndetermine which parts of the code should be isolated. CAL effectively optimizes\nTCB, reducing the burden of manual analysis and enhancing overall security. Our\ncontributions include the definition of security sensitive code, the\nconstruction and labeling of a comprehensive dataset of source files, a feature\nrich graph based data preparation pipeline, and the CAL model for TEE\nintegration. 
Evaluation results demonstrate CAL's efficacy in identifying\nsensitive code with a recall of 86.05%, an F1 score of 81.56%, and an\nidentification rate of 91.59% for security sensitive functions. By enabling\nefficient code isolation, CAL advances the secure development of applications\nusing TEEs, offering a practical solution for developers to reduce attack\nvectors.\n","authors":["Varun Gadey","Raphael Goetz","Christoph Sendner","Sampo Sovio","Alexandra Dmitrienko"],"pdf_url":"https://arxiv.org/pdf/2411.11567v2.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2401.12843v3","updated":"2024-11-19T08:34:20Z","published":"2024-01-23T15:25:21Z","title":"An embedding-based distance for temporal graphs","summary":" Temporal graphs are commonly used to represent time-resolved relations\nbetween entities in many natural and artificial systems. Many techniques were\ndevised to investigate the evolution of temporal graphs by comparing their\nstate at different time points. However, quantifying the similarity between\ntemporal graphs as a whole is an open problem. Here, we use embeddings based on\ntime-respecting random walks to introduce a new notion of distance between\ntemporal graphs. This distance is well-defined for pairs of temporal graphs\nwith different numbers of nodes and different time spans. We study the case of\na matched pair of graphs, when a known relation exists between their nodes, and\nthe case of unmatched graphs, when such a relation is unavailable and the\ngraphs may be of different sizes. We use empirical and synthetic temporal\nnetwork data to show that the distance we introduce discriminates graphs with\ndifferent topological and temporal properties. 
We provide an efficient\nimplementation of the distance computation suitable for large-scale temporal\ngraphs.\n","authors":["Lorenzo Dall'Amico","Alain Barrat","Ciro Cattuto"],"pdf_url":"https://arxiv.org/pdf/2401.12843v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12330v1","updated":"2024-11-19T08:32:14Z","published":"2024-11-19T08:32:14Z","title":"Graph as a feature: improving node classification with non-neural\n graph-aware logistic regression","summary":" Graph Neural Networks (GNNs) and their message passing framework that\nleverages both structural and feature information, have become a standard\nmethod for solving graph-based machine learning problems. However, these\napproaches still struggle to generalise well beyond datasets that exhibit\nstrong homophily, where nodes of the same class tend to connect. This\nlimitation has led to the development of complex neural architectures that pose\nchallenges in terms of efficiency and scalability. In response to these\nlimitations, we focus on simpler and more scalable approaches and introduce\nGraph-aware Logistic Regression (GLR), a non-neural model designed for node\nclassification tasks. Unlike traditional graph algorithms that use only a\nfraction of the information accessible to GNNs, our proposed model\nsimultaneously leverages both node features and the relationships between\nentities. However instead of relying on message passing, our approach encodes\neach node's relationships as an additional feature vector, which is then\ncombined with the node's self attributes. Extensive experimental results,\nconducted within a rigorous evaluation framework, show that our proposed GLR\napproach outperforms both foundational and sophisticated state-of-the-art GNN\nmodels in node classification tasks. 
Going beyond the traditional limited\nbenchmarks, our experiments indicate that GLR increases generalisation ability\nwhile reaching performance gains in computation time up to two orders of\nmagnitude compared to it best neural competitor.\n","authors":["Simon Delarue","Thomas Bonald","Tiphaine Viard"],"pdf_url":"https://arxiv.org/pdf/2411.12330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12329v1","updated":"2024-11-19T08:30:22Z","published":"2024-11-19T08:30:22Z","title":"Attributed Graph Clustering in Collaborative Settings","summary":" Graph clustering is an unsupervised machine learning method that partitions\nthe nodes in a graph into different groups. Despite achieving significant\nprogress in exploiting both attributed and structured data information, graph\nclustering methods often face practical challenges related to data isolation.\nMoreover, the absence of collaborative methods for graph clustering limits\ntheir effectiveness.\n In this paper, we propose a collaborative graph clustering framework for\nattributed graphs, supporting attributed graph clustering over vertically\npartitioned data with different participants holding distinct features of the\nsame data. Our method leverages a novel technique that reduces the sample\nspace, improving the efficiency of the attributed graph clustering method.\nFurthermore, we compare our method to its centralized counterpart under a\nproximity condition, demonstrating that the successful local results of each\nparticipant contribute to the overall success of the collaboration.\n We fully implement our approach and evaluate its utility and efficiency by\nconducting experiments on four public datasets. The results demonstrate that\nour method achieves comparable accuracy levels to centralized attributed graph\nclustering methods. 
Our collaborative graph clustering framework provides an\nefficient and effective solution for graph clustering challenges related to\ndata isolation.\n","authors":["Rui Zhang","Xiaoyang Hou","Zhihua Tian","Jian Liu","Qingbiao Wu","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2411.12329v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.09273v5","updated":"2024-11-19T08:17:57Z","published":"2024-05-15T11:42:41Z","title":"Fair Generalized Linear Mixed Models","summary":" When using machine learning for automated prediction, it is important to\naccount for fairness in the prediction. Fairness in machine learning aims to\nensure that biases in the data and model inaccuracies do not lead to\ndiscriminatory decisions. E.g., predictions from fair machine learning models\nshould not discriminate against sensitive variables such as sexual orientation\nand ethnicity. The training data often in obtained from social surveys. In\nsocial surveys, oftentimes the data collection process is a strata sampling,\ne.g. due to cost restrictions. In strata samples, the assumption of\nindependence between the observation is not fulfilled. Hence, if the machine\nlearning models do not account for the strata correlations, the results may be\nbiased. Especially high is the bias in cases where the strata assignment is\ncorrelated to the variable of interest. We present in this paper an algorithm\nthat can handle both problems simultaneously, and we demonstrate the impact of\nstratified sampling on the quality of fair machine learning predictions in a\nreproducible simulation study.\n","authors":["Jan Pablo Burgard","João Vitor Pamplona"],"pdf_url":"https://arxiv.org/pdf/2405.09273v5.pdf","comment":"25 pages, 12 figures. 
arXiv admin note: text overlap with\n arXiv:2405.06433"},{"id":"http://arxiv.org/abs/2409.15761v2","updated":"2024-11-19T08:12:46Z","published":"2024-09-24T05:31:17Z","title":"TFG: Unified Training-Free Guidance for Diffusion Models","summary":" Given an unconditional diffusion model and a predictor for a target property\nof interest (e.g., a classifier), the goal of training-free guidance is to\ngenerate samples with desirable target properties without additional training.\nExisting methods, though effective in various individual applications, often\nlack theoretical grounding and rigorous testing on extensive benchmarks. As a\nresult, they could even fail on simple tasks, and applying them to a new\nproblem becomes unavoidably difficult. This paper introduces a novel\nalgorithmic framework encompassing existing methods as special cases, unifying\nthe study of training-free guidance into the analysis of an algorithm-agnostic\ndesign space. Via theoretical and empirical investigation, we propose an\nefficient and effective hyper-parameter searching strategy that can be readily\napplied to any downstream task. We systematically benchmark across 7 diffusion\nmodels on 16 tasks with 40 targets, and improve performance by 8.5% on average.\nOur framework and benchmark offer a solid foundation for conditional generation\nin a training-free manner.\n","authors":["Haotian Ye","Haowei Lin","Jiaqi Han","Minkai Xu","Sheng Liu","Yitao Liang","Jianzhu Ma","James Zou","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2409.15761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12313v1","updated":"2024-11-19T08:01:20Z","published":"2024-11-19T08:01:20Z","title":"C$^{2}$INet: Realizing Incremental Trajectory Prediction with\n Prior-Aware Continual Causal Intervention","summary":" Trajectory prediction for multi-agents in complex scenarios is crucial for\napplications like autonomous driving. 
However, existing methods often overlook\nenvironmental biases, which leads to poor generalization. Additionally,\nhardware constraints limit the use of large-scale data across environments, and\ncontinual learning settings exacerbate the challenge of catastrophic\nforgetting. To address these issues, we propose the Continual Causal\nIntervention (C$^{2}$INet) method for generalizable multi-agent trajectory\nprediction within a continual learning framework. Using variational inference,\nwe align environment-related prior with posterior estimator of confounding\nfactors in the latent space, thereby intervening in causal correlations that\naffect trajectory representation. Furthermore, we store optimal variational\npriors across various scenarios using a memory queue, ensuring continuous\ndebiasing during incremental task training. The proposed C$^{2}$INet enhances\nadaptability to diverse tasks while preserving previous task information to\nprevent catastrophic forgetting. It also incorporates pruning strategies to\nmitigate overfitting. Comparative evaluations on three real and synthetic\ncomplex datasets against state-of-the-art methods demonstrate that our proposed\nmethod consistently achieves reliable prediction performance, effectively\nmitigating confounding factors unique to different scenarios. This highlights\nthe practical value of our method for real-world applications.\n","authors":["Xiaohe Li","Feilong Huang","Zide Fan","Fangli Mou","Leilei Lin","Yingyan Hou","Lijie Wen"],"pdf_url":"https://arxiv.org/pdf/2411.12313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12308v1","updated":"2024-11-19T07:49:22Z","published":"2024-11-19T07:49:22Z","title":"SNN-Based Online Learning of Concepts and Action Laws in an Open World","summary":" We present the architecture of a fully autonomous, bio-inspired cognitive\nagent built around a spiking neural network (SNN) implementing the agent's\nsemantic memory. 
The agent explores its universe and learns concepts of\nobjects/situations and of its own actions in a one-shot manner. While\nobject/situation concepts are unary, action concepts are triples made up of an\ninitial situation, a motor activity, and an outcome. They embody the agent's\nknowledge of its universe's actions laws. Both kinds of concepts have different\ndegrees of generality. To make decisions the agent queries its semantic memory\nfor the expected outcomes of envisaged actions and chooses the action to take\non the basis of these predictions. Our experiments show that the agent handles\nnew situations by appealing to previously learned general concepts and rapidly\nmodifies its concepts to adapt to environment changes.\n","authors":["Christel Grimaud","Dominique Longin","Andreas Herzig"],"pdf_url":"https://arxiv.org/pdf/2411.12308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15000v3","updated":"2024-11-19T07:46:16Z","published":"2024-02-22T22:28:46Z","title":"Divide-or-Conquer? Which Part Should You Distill Your LLM?","summary":" Recent methods have demonstrated that Large Language Models (LLMs) can solve\nreasoning tasks better when they are encouraged to solve subtasks of the main\ntask first. In this paper we devise a similar strategy that breaks down\nreasoning tasks into a problem decomposition phase and a problem solving phase\nand show that the strategy is able to outperform a single stage solution.\nFurther, we hypothesize that the decomposition should be easier to distill into\na smaller model compared to the problem solving because the latter requires\nlarge amounts of domain knowledge while the former only requires learning\ngeneral problem solving strategies. We propose methods to distill these two\ncapabilities and evaluate their impact on reasoning outcomes and inference\ncost. 
We find that we can distill the problem decomposition phase and at the\nsame time achieve good generalization across tasks, datasets, and models.\nHowever, it is harder to distill the problem solving capability without losing\nperformance and the resulting distilled model struggles with generalization.\nThese results indicate that by using smaller, distilled problem decomposition\nmodels in combination with problem solving LLMs we can achieve reasoning with\ncost-efficient inference and local adaptation.\n","authors":["Zhuofeng Wu","He Bai","Aonan Zhang","Jiatao Gu","VG Vinod Vydiswaran","Navdeep Jaitly","Yizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15000v3.pdf","comment":"Findings of the Association for Computational Linguistics: EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.04001v2","updated":"2024-11-19T07:44:51Z","published":"2024-09-06T03:05:35Z","title":"A semi-supervised learning using over-parameterized regression","summary":" Semi-supervised learning (SSL) is an important theme in machine learning, in\nwhich we have a few labeled samples and many unlabeled samples. In this paper,\nfor SSL in a regression problem, we consider a method of incorporating\ninformation on unlabeled samples into kernel functions. As a typical\nimplementation, we employ Gaussian kernels whose centers are labeled and\nunlabeled input samples. Since the number of coefficients is larger than the\nnumber of labeled samples in this setting, this is an over-parameterized\nregression roblem. A ridge regression is a typical estimation method under this\nsetting. In this paper, alternatively, we consider to apply the minimum norm\nleast squares (MNLS), which is known as a helpful tool for understanding deep\nlearning behavior while it may not be application oriented. 
Then, in applying\nthe MNLS for SSL, we established several methods based on feature\nextraction/dimension reduction in the SVD (singular value decomposition)\nrepresentation of a Gram type matrix appeared in the over-parameterized\nregression problem. The methods are thresholding according to singular value\nmagnitude with cross validation, hard-thresholding with cross validation,\nuniversal thresholding and bridge thresholding methods. The first one is\nequivalent to a method using a well-known low rank approximation of a Gram type\nmatrix. We refer to these methods as SVD regression methods. In the experiments\nfor real data, depending on datasets, clear superiority of the proposed SVD\nregression methods over ridge regression methods was observed. And, depending\non datasets, incorporation of information on unlabeled input samples into\nkernels was found to be clearly effective.\n","authors":["Katsuyuki Hagiwara"],"pdf_url":"https://arxiv.org/pdf/2409.04001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12304v1","updated":"2024-11-19T07:43:30Z","published":"2024-11-19T07:43:30Z","title":"Emergence of Implicit World Models from Mortal Agents","summary":" We discuss the possibility of world models and active exploration as emergent\nproperties of open-ended behavior optimization in autonomous agents. In\ndiscussing the source of the open-endedness of living things, we start from the\nperspective of biological systems as understood by the mechanistic approach of\ntheoretical biology and artificial life. From this perspective, we discuss the\npotential of homeostasis in particular as an open-ended objective for\nautonomous agents and as a general, integrative extrinsic motivation. 
We then\ndiscuss the possibility of implicitly acquiring a world model and active\nexploration through the internal dynamics of a network, and a hypothetical\narchitecture for this, by combining meta-reinforcement learning, which assumes\ndomain adaptation as a system that achieves robust homeostasis.\n","authors":["Kazuya Horibe","Naoto Yoshida"],"pdf_url":"https://arxiv.org/pdf/2411.12304v1.pdf","comment":"Accepted as a 1-page tiny paper in the Intrinsically Motivated\n Open-ended Learning workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.09961v2","updated":"2024-11-19T07:27:10Z","published":"2024-11-15T05:30:36Z","title":"Dense ReLU Neural Networks for Temporal-spatial Model","summary":" In this paper, we focus on fully connected deep neural networks utilizing the\nRectified Linear Unit (ReLU) activation function for nonparametric estimation.\nWe derive non-asymptotic bounds that lead to convergence rates, addressing both\ntemporal and spatial dependence in the observed measurements. By accounting for\ndependencies across time and space, our models better reflect the complexities\nof real-world data, enhancing both predictive performance and theoretical\nrobustness. We also tackle the curse of dimensionality by modeling the data on\na manifold, exploring the intrinsic dimensionality of high-dimensional data. We\nbroaden existing theoretical findings of temporal-spatial analysis by applying\nthem to neural networks in more general contexts and demonstrate that our proof\ntechniques are effective for models with short-range dependence. Our empirical\nsimulations across various synthetic response functions underscore the superior\nperformance of our method, outperforming established approaches in the existing\nliterature. 
These findings provide valuable insights into the strong\ncapabilities of dense neural networks for temporal-spatial modeling across a\nbroad range of function classes.\n","authors":["Zhi Zhang","Carlos Misael Madrid Padilla","Xiaokai Luo","Oscar Hernan Madrid Padilla","Daren Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13157v3","updated":"2024-11-19T07:10:34Z","published":"2022-11-23T17:32:52Z","title":"A Hybrid Data-Driven Multi-Stage Deep Learning Framework for Enhanced\n Nuclear Reactor Power Prediction","summary":" The accurate and efficient modeling of nuclear reactor transients is crucial\nfor ensuring safe and optimal reactor operation. Traditional physics-based\nmodels, while valuable, can be computationally intensive and may not fully\ncapture the complexities of real-world reactor behavior. This paper introduces\na novel multi-stage deep learning framework that addresses these limitations,\noffering a faster and more robust solution for predicting the final\nsteady-state power of reactor transients. By leveraging a combination of\nfeed-forward neural networks with both classification and regression stages,\nand training on a unique dataset that integrates real-world measurements of\nreactor power and controls state from the Missouri University of Science and\nTechnology Reactor (MSTR) with noise-enhanced simulated data, our approach\nachieves remarkable accuracy (96% classification, 2.3% MAPE). The incorporation\nof simulated data with noise significantly improves the model's generalization\ncapabilities, mitigating the risk of overfitting. 
This innovative solution not\nonly enables rapid and precise prediction of reactor behavior but also has the\npotential to revolutionize nuclear reactor operations, facilitating enhanced\nsafety protocols, optimized performance, and streamlined decision-making\nprocesses.\n","authors":["James Daniell","Kazuma Kobayashi","Ayodeji Alajo","Syed Bahauddin Alam"],"pdf_url":"https://arxiv.org/pdf/2211.13157v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18893v2","updated":"2024-11-19T07:08:17Z","published":"2024-04-29T17:30:36Z","title":"Learning general Gaussian mixtures with efficient score matching","summary":" We study the problem of learning mixtures of $k$ Gaussians in $d$ dimensions.\nWe make no separation assumptions on the underlying mixture components: we only\nrequire that the covariance matrices have bounded condition number and that the\nmeans and covariances lie in a ball of bounded radius. We give an algorithm\nthat draws $d^{\\mathrm{poly}(k/\\varepsilon)}$ samples from the target mixture,\nruns in sample-polynomial time, and constructs a sampler whose output\ndistribution is $\\varepsilon$-far from the unknown mixture in total variation.\nPrior works for this problem either (i) required exponential runtime in the\ndimension $d$, (ii) placed strong assumptions on the instance (e.g., spherical\ncovariances or clusterability), or (iii) had doubly exponential dependence on\nthe number of components $k$.\n Our approach departs from commonly used techniques for this problem like the\nmethod of moments. Instead, we leverage a recently developed reduction, based\non diffusion models, from distribution learning to a supervised learning task\ncalled score matching. We give an algorithm for the latter by proving a\nstructural result showing that the score function of a Gaussian mixture can be\napproximated by a piecewise-polynomial function, and there is an efficient\nalgorithm for finding it. 
To our knowledge, this is the first example of\ndiffusion models achieving a state-of-the-art theoretical guarantee for an\nunsupervised learning task.\n","authors":["Sitan Chen","Vasilis Kontonis","Kulin Shah"],"pdf_url":"https://arxiv.org/pdf/2404.18893v2.pdf","comment":"57 pages"},{"id":"http://arxiv.org/abs/2404.08901v3","updated":"2024-11-19T07:04:06Z","published":"2024-04-13T05:01:54Z","title":"Bullion: A Column Store for Machine Learning","summary":" The past two decades have witnessed significant success in applying columnar\nstorage to data warehousing and analytics. However, the rapid growth of machine\nlearning poses new challenges. This paper presents Bullion, a columnar storage\nsystem tailored for machine learning workloads. Bullion addresses the\ncomplexities of data compliance, optimizes the encoding of long sequence sparse\nfeatures, efficiently manages wide-table projections, introduces feature\nquantization in storage, enables quality-aware sequential reads for multimodal\ntraining data, and provides a comprehensive cascading encoding framework that\nunifies diverse encoding schemes through modular, composable interfaces. By\naligning with the evolving requirements of ML applications, Bullion facilitates\nthe application of columnar storage and processing to modern application\nscenarios such as those within advertising, recommendation systems, and\nGenerative AI.\n Preliminary experimental results and theoretical analysis demonstrate\nBullion's improved ability to deliver strong performance in the face of the\nunique demands of machine learning workloads compared to existing columnar\nstorage solutions. Bullion significantly reduces I/O costs for deletion\ncompliance, achieves substantial storage savings with its optimized encoding\nscheme for sparse features, and improves metadata parsing speed for wide-table\nprojections. 
These advancements enable Bullion to become an important component\nin the future of machine learning infrastructure, enabling organizations to\nefficiently manage and process the massive volumes of data required for\ntraining and inference in modern AI applications.\n","authors":["Gang Liao","Ye Liu","Jianjun Chen","Daniel J. Abadi"],"pdf_url":"https://arxiv.org/pdf/2404.08901v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12276v1","updated":"2024-11-19T06:56:24Z","published":"2024-11-19T06:56:24Z","title":"libcll: an Extendable Python Toolkit for Complementary-Label Learning","summary":" Complementary-label learning (CLL) is a weakly supervised learning paradigm\nfor multiclass classification, where only complementary labels -- indicating\nclasses an instance does not belong to -- are provided to the learning\nalgorithm. Despite CLL's increasing popularity, previous studies highlight two\nmain challenges: (1) inconsistent results arising from varied assumptions on\ncomplementary label generation, and (2) high barriers to entry due to the lack\nof a standardized evaluation platform across datasets and algorithms. To\naddress these challenges, we introduce \\texttt{libcll}, an extensible Python\ntoolkit for CLL research. \\texttt{libcll} provides a universal interface that\nsupports a wide range of generation assumptions, both synthetic and real-world\ndatasets, and key CLL algorithms. The toolkit is designed to mitigate\ninconsistencies and streamline the research process, with easy installation,\ncomprehensive usage guides, and quickstart tutorials that facilitate efficient\nadoption and implementation of CLL techniques. 
Extensive ablation studies\nconducted with \\texttt{libcll} demonstrate its utility in generating valuable\ninsights to advance future CLL research.\n","authors":["Nai-Xuan Ye","Tan-Ha Mai","Hsiu-Hsuan Wang","Wei-I Lin","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2411.12276v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.12274v1","updated":"2024-11-19T06:53:54Z","published":"2024-11-19T06:53:54Z","title":"A Review on Generative AI Models for Synthetic Medical Text, Time\n Series, and Longitudinal Data","summary":" This paper presents the results of a novel scoping review on the practical\nmodels for generating three different types of synthetic health records (SHRs):\nmedical text, time series, and longitudinal data. The innovative aspects of the\nreview, which incorporate study objectives, data modality, and research\nmethodology of the reviewed studies, uncover the importance and the scope of\nthe topic for the digital medicine context. In total, 52 publications met the\neligibility criteria for generating medical time series (22), longitudinal data\n(17), and medical text (13). Privacy preservation was found to be the main\nresearch objective of the studied papers, along with class imbalance, data\nscarcity, and data imputation as the other objectives. The adversarial\nnetwork-based, probabilistic, and large language models exhibited superiority\nfor generating synthetic longitudinal data, time series, and medical texts,\nrespectively. 
Finding a reliable performance measure to quantify SHR\nre-identification risk is the major research gap of the topic.\n","authors":["Mohammad Loni","Fatemeh Poursalim","Mehdi Asadi","Arash Gharehbaghi"],"pdf_url":"https://arxiv.org/pdf/2411.12274v1.pdf","comment":"27 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11287v2","updated":"2024-11-19T06:36:59Z","published":"2024-08-21T02:19:54Z","title":"Taming Generative Diffusion Prior for Universal Blind Image Restoration","summary":" Diffusion models have been widely utilized for image restoration. However,\nprevious blind image restoration methods still need to assume the type of\ndegradation model while leaving the parameters to be optimized, limiting their\nreal-world applications. Therefore, we aim to tame generative diffusion prior\nfor universal blind image restoration dubbed BIR-D, which utilizes an\noptimizable convolutional kernel to simulate the degradation model and\ndynamically update the parameters of the kernel in the diffusion steps,\nenabling it to achieve blind image restoration results even in various complex\nsituations. Besides, based on mathematical reasoning, we have provided an\nempirical formula for the chosen of adaptive guidance scale, eliminating the\nneed for a grid search for the optimal parameter. Experimentally, Our BIR-D has\ndemonstrated superior practicality and versatility than off-the-shelf\nunsupervised methods across various tasks both on real-world and synthetic\ndatasets, qualitatively and quantitatively. BIR-D is able to fulfill\nmulti-guidance blind image restoration. 
Moreover, BIR-D can also restore images\nthat undergo multiple and complicated degradations, demonstrating the practical\napplications.\n","authors":["Siwei Tu","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2408.11287v2.pdf","comment":"15 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.07929v3","updated":"2024-11-19T06:34:03Z","published":"2023-11-14T06:15:16Z","title":"Variational Graph Autoencoder for Heterogeneous Information Networks\n with Missing and Inaccurate Attributes","summary":" Heterogeneous Information Networks (HINs), which consist of various types of\nnodes and edges, have recently demonstrated excellent performance in graph\nmining. However, most existing heterogeneous graph neural networks (HGNNs)\nignore the problems of missing attributes, inaccurate attributes and scarce\nlabels for nodes, which limits their expressiveness. In this paper, we propose\na generative self-supervised model GraMI to address these issues\nsimultaneously. Specifically, GraMI first initializes all the nodes in the\ngraph with a low-dimensional representation matrix. After that, based on the\nvariational graph autoencoder framework, GraMI learns both node-level and\nattribute-level embeddings in the encoder, which can provide fine-grained\nsemantic information to construct node attributes. In the decoder, GraMI\nreconstructs both links and attributes. Instead of directly reconstructing raw\nfeatures for attributed nodes, GraMI generates the initial low-dimensional\nrepresentation matrix for all the nodes, based on which raw features of\nattributed nodes are further reconstructed to leverage accurate attributes. In\nthis way, GraMI can not only complete informative features for non-attributed\nnodes, but rectify inaccurate ones for attributed nodes. 
Finally, we conduct\nextensive experiments to show the superiority of GraMI in tackling HINs with\nmissing and inaccurate attributes.\n","authors":["Yige Zhao","Jianxiang Yu","Yao Cheng","Chengcheng Yu","Yiding Liu","Xiang Li","Shuaiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07929v3.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2411.12265v1","updated":"2024-11-19T06:28:58Z","published":"2024-11-19T06:28:58Z","title":"On the Accuracy and Precision of Moving Averages to Estimate Wi-Fi Link\n Quality","summary":" The radio spectrum is characterized by a noticeable variability, which\nimpairs performance and determinism of every wireless communication technology.\nTo counteract this aspect, mechanisms like Minstrel are customarily employed in\nreal Wi-Fi devices, and the adoption of machine learning for optimization is\nenvisaged in next-generation Wi-Fi 8. All these approaches require\ncommunication quality to be monitored at runtime.\n In this paper, the effectiveness of simple techniques based on moving\naverages to estimate wireless link quality is analyzed, to assess their\nadvantages and weaknesses. Results can be used, e.g., as a baseline when\nstudying how artificial intelligence can be employed to mitigate\nunpredictability of wireless networks by providing reliable estimates about\ncurrent spectrum conditions.\n","authors":["Gianluca Cena","Gabriele Formis","Matteo Rosani","Stefano Scanzio"],"pdf_url":"https://arxiv.org/pdf/2411.12265v1.pdf","comment":"preprint, 8 pages, 2024"},{"id":"http://arxiv.org/abs/2403.01420v3","updated":"2024-11-19T06:10:32Z","published":"2024-03-03T07:38:24Z","title":"The Implicit Bias of Heterogeneity towards Invariance: A Study of\n Multi-Environment Matrix Sensing","summary":" Models are expected to engage in invariance learning, which involves\ndistinguishing the core relations that remain consistent across varying\nenvironments to ensure the predictions are safe, robust and fair. 
While\nexisting works consider specific algorithms to realize invariance learning, we\nshow that model has the potential to learn invariance through standard training\nprocedures. In other words, this paper studies the implicit bias of Stochastic\nGradient Descent (SGD) over heterogeneous data and shows that the implicit bias\ndrives the model learning towards an invariant solution. We call the phenomenon\nthe implicit invariance learning. Specifically, we theoretically investigate\nthe multi-environment low-rank matrix sensing problem where in each\nenvironment, the signal comprises (i) a lower-rank invariant part shared across\nall environments; and (ii) a significantly varying environment-dependent\nspurious component. The key insight is, through simply employing the large step\nsize large-batch SGD sequentially in each environment without any explicit\nregularization, the oscillation caused by heterogeneity can provably prevent\nmodel learning spurious signals. The model reaches the invariant solution after\ncertain iterations. In contrast, model learned using pooled SGD over all data\nwould simultaneously learn both the invariant and spurious signals. Overall, we\nunveil another implicit bias that is a result of the symbiosis between the\nheterogeneity of data and modern algorithms, which is, to the best of our\nknowledge, first in the literature.\n","authors":["Yang Xu","Yihong Gu","Cong Fang"],"pdf_url":"https://arxiv.org/pdf/2403.01420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12256v1","updated":"2024-11-19T06:10:22Z","published":"2024-11-19T06:10:22Z","title":"Restructuring Tractable Probabilistic Circuits","summary":" Probabilistic circuits (PCs) is a unifying representation for probabilistic\nmodels that support tractable inference. Numerous applications of PCs like\ncontrollable text generation depend on the ability to efficiently multiply two\ncircuits. 
Existing multiplication algorithms require that the circuits respect\nthe same structure, i.e. variable scopes decomposes according to the same\nvtree. In this work, we propose and study the task of restructuring\nstructured(-decomposable) PCs, that is, transforming a structured PC such that\nit conforms to a target vtree. We propose a generic approach for this problem\nand show that it leads to novel polynomial-time algorithms for multiplying\ncircuits respecting different vtrees, as well as a practical depth-reduction\nalgorithm that preserves structured decomposibility. Our work opens up new\navenues for tractable PC inference, suggesting the possibility of training with\nless restrictive PC structures while enabling efficient inference by changing\ntheir structures at inference time.\n","authors":["Honghua Zhang","Benjie Wang","Marcelo Arenas","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2411.12256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12255v1","updated":"2024-11-19T06:09:09Z","published":"2024-11-19T06:09:09Z","title":"Error-Feedback Model for Output Correction in Bilateral Control-Based\n Imitation Learning","summary":" In recent years, imitation learning using neural networks has enabled robots\nto perform flexible tasks. However, since neural networks operate in a\nfeedforward structure, they do not possess a mechanism to compensate for output\nerrors. To address this limitation, we developed a feedback mechanism to\ncorrect these errors. By employing a hierarchical structure for neural networks\ncomprising lower and upper layers, the lower layer was controlled to follow the\nupper layer. Additionally, using a multi-layer perceptron in the lower layer,\nwhich lacks an internal state, enhanced the error feedback. In the\ncharacter-writing task, this model demonstrated improved accuracy in writing\npreviously untrained characters. 
In the character-writing task, this model\ndemonstrated improved accuracy in writing previously untrained characters.\nThrough autonomous control with error feedback, we confirmed that the lower\nlayer could effectively track the output of the upper layer. This study\nrepresents a promising step toward integrating neural networks with control\ntheories.\n","authors":["Hiroshi Sato","Masashi Konosu","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2411.12255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12254v1","updated":"2024-11-19T05:58:22Z","published":"2024-11-19T05:58:22Z","title":"Predicting User Intents and Musical Attributes from Music Discovery\n Conversations","summary":" Intent classification is a text understanding task that identifies user needs\nfrom input text queries. While intent classification has been extensively\nstudied in various domains, it has not received much attention in the music\ndomain. In this paper, we investigate intent classification models for music\ndiscovery conversation, focusing on pre-trained language models. Rather than\nonly predicting functional needs: intent classification, we also include a task\nfor classifying musical needs: musical attribute classification. Additionally,\nwe propose a method of concatenating previous chat history with just\nsingle-turn user queries in the input text, allowing the model to understand\nthe overall conversation context better. 
Our proposed model significantly\nimproves the F1 score for both user intent and musical attribute\nclassification, and surpasses the zero-shot and few-shot performance of the\npretrained Llama 3 model.\n","authors":["Daeyong Kwon","SeungHeon Doh","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2411.12254v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.12244v1","updated":"2024-11-19T05:49:00Z","published":"2024-11-19T05:49:00Z","title":"Hyper-parameter Optimization for Federated Learning with Step-wise\n Adaptive Mechanism","summary":" Federated Learning (FL) is a decentralized learning approach that protects\nsensitive information by utilizing local model parameters rather than sharing\nclients' raw datasets. While this privacy-preserving method is widely employed\nacross various applications, it still requires significant development and\noptimization. Automated Machine Learning (Auto-ML) has been adapted for\nreducing the need for manual adjustments. Previous studies have explored the\nintegration of AutoML with different FL algorithms to evaluate their\neffectiveness in enhancing FL settings. However, Automated FL (Auto-FL) faces\nadditional challenges due to the involvement of a large cohort of clients and\nglobal training rounds between clients and the server, rendering the tuning\nprocess time-consuming and nearly impossible on resource-constrained edge\ndevices (e.g., IoT devices). This paper investigates the deployment and\nintegration of two lightweight Hyper-Parameter Optimization (HPO) tools,\nRaytune and Optuna, within the context of FL settings. A step-wise feedback\nmechanism has also been designed to accelerate the hyper-parameter tuning\nprocess and coordinate AutoML toolkits with the FL server. To this end, both\nlocal and global feedback mechanisms are integrated to limit the search space\nand expedite the HPO process. Further, a novel client selection technique is\nintroduced to mitigate the straggler effect in Auto-FL. 
The selected\nhyper-parameter tuning tools are evaluated using two benchmark datasets,\nFEMNIST, and CIFAR10. Further, the paper discusses the essential properties of\nsuccessful HPO tools, the integration mechanism with the FL pipeline, and the\nchallenges posed by the distributed and heterogeneous nature of FL\nenvironments.\n","authors":["Yasaman Saadati","M. Hadi Amini"],"pdf_url":"https://arxiv.org/pdf/2411.12244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08511v3","updated":"2024-11-19T05:45:14Z","published":"2024-10-11T04:23:56Z","title":"Distributionally robust self-supervised learning for tabular data","summary":" Machine learning (ML) models trained using Empirical Risk Minimization (ERM)\noften exhibit systematic errors on specific subpopulations of tabular data,\nknown as error slices. Learning robust representation in presence of error\nslices is challenging, especially in self-supervised settings during the\nfeature reconstruction phase, due to high cardinality features and the\ncomplexity of constructing error sets. Traditional robust representation\nlearning methods are largely focused on improving worst group performance in\nsupervised setting in computer vision, leaving a gap in approaches tailored for\ntabular data. We address this gap by developing a framework to learn robust\nrepresentation in tabular data during self-supervised pre-training. Our\napproach utilizes an encoder-decoder model trained with Masked Language\nModeling (MLM) loss to learn robust latent representations. This paper applies\nthe Just Train Twice (JTT) and Deep Feature Reweighting (DFR) methods during\nthe pre-training phase for tabular data. These methods fine-tune the ERM\npre-trained model by up-weighting error-prone samples or creating balanced\ndatasets for specific categorical features. This results in specialized models\nfor each feature, which are then used in an ensemble approach to enhance\ndownstream classification performance. 
This methodology improves robustness\nacross slices, thus enhancing overall generalization performance. Extensive\nexperiments across various datasets demonstrate the efficacy of our approach.\nThe code is available:\n\\url{https://github.com/amazon-science/distributionally-robust-self-supervised-learning-for-tabular-data}.\n","authors":["Shantanu Ghosh","Tiankang Xie","Mikhail Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2410.08511v3.pdf","comment":"TRL Workshop@NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.16561v3","updated":"2024-11-19T05:34:33Z","published":"2024-10-21T22:40:42Z","title":"Gradient Normalization Provably Benefits Nonconvex SGD under\n Heavy-Tailed Noise","summary":" This paper investigates the roles of gradient normalization and clipping in\nensuring the convergence of Stochastic Gradient Descent (SGD) under\nheavy-tailed noise. While existing approaches consider gradient clipping\nindispensable for SGD convergence, we theoretically demonstrate that gradient\nnormalization alone without clipping is sufficient to ensure convergence.\nFurthermore, we establish that combining gradient normalization with clipping\noffers significantly improved convergence rates compared to using either\ntechnique in isolation, notably as gradient noise diminishes. With these\nresults, our work provides the first theoretical evidence demonstrating the\nbenefits of gradient normalization in SGD under heavy-tailed noise. 
Finally, we\nintroduce an accelerated SGD variant incorporating gradient normalization and\nclipping, further enhancing convergence rates under heavy-tailed noise.\n","authors":["Tao Sun","Xinwang Liu","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2410.16561v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00068v3","updated":"2024-11-19T05:08:44Z","published":"2024-01-30T14:47:15Z","title":"Adapting Amidst Degradation: Cross Domain Li-ion Battery Health\n Estimation via Physics-Guided Test-Time Training","summary":" Health modeling of lithium-ion batteries (LIBs) is crucial for safe and\nefficient energy management and carries significant socio-economic\nimplications. Although Machine Learning (ML)-based State of Health (SOH)\nestimation methods have made significant progress in accuracy, the scarcity of\nhigh-quality LIB data remains a major obstacle. Existing transfer learning\nmethods for cross-domain LIB SOH estimation have significantly alleviated the\nlabeling burden of target LIB data, however, they still require sufficient\nunlabeled target data (UTD) for effective adaptation to the target domain.\nCollecting this UTD is challenging due to the time-consuming nature of\ndegradation experiments. To address this issue, we introduce a practical\nTest-Time Training framework, BatteryTTT, which adapts the model continually\nusing each UTD collected amidst degradation, thereby significantly reducing\ndata collection time. To fully utilize each UTD, BatteryTTT integrates the\ninherent physical laws of modern LIBs into self-supervised learning, termed\nPhyscics-Guided Test-Time Training. Additionally, we explore the potential of\nlarge language models (LLMs) in battery sequence modeling by evaluating their\nperformance in SOH estimation through model reprogramming and prefix prompt\nadaptation. 
The combination of BatteryTTT and LLM modeling, termed GPT4Battery,\nachieves state-of-the-art generalization results across current LIB benchmarks.\nFurthermore, we demonstrate the practical value and scalability of our approach\nby deploying it in our real-world battery management system (BMS) for 300Ah\nlarge-scale energy storage LIBs.\n","authors":["Yuyuan Feng","Guosheng Hu","Xiaodong Li","Zhihong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00068v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09003v2","updated":"2024-11-19T04:53:47Z","published":"2024-11-13T20:12:55Z","title":"Refusal in LLMs is an Affine Function","summary":" We propose affine concept editing (ACE) as an approach for steering language\nmodels' behavior by intervening directly in activations. We begin with an\naffine decomposition of model activation vectors and show that prior methods\nfor steering model behavior correspond to subsets of terms of this\ndecomposition. We then provide a derivation of ACE and use it to control\nrefusal behavior on ten different models, including Llama 3 70B. ACE combines\naffine subspace projection and activation addition to reliably control the\nmodel's refusal responses across prompt types. We evaluate the results using\nLLM-based scoring on a collection of harmful and harmless prompts. Our\nexperiments demonstrate that ACE consistently achieves more precise control\nover model behavior than existing methods and generalizes to models where\ndirectional ablation via affine subspace projection alone produces incoherent\noutputs. 
Code for reproducing our results is available at\nhttps://github.com/EleutherAI/steering-llama3 .\n","authors":["Thomas Marshall","Adam Scherlis","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2411.09003v2.pdf","comment":"added plots for results from additional models"},{"id":"http://arxiv.org/abs/2411.12222v1","updated":"2024-11-19T04:32:41Z","published":"2024-11-19T04:32:41Z","title":"Contrast Similarity-Aware Dual-Pathway Mamba for Multivariate Time\n Series Node Classification","summary":" Multivariate time series (MTS) data is generated through multiple sensors\nacross various domains such as engineering application, health monitoring, and\nthe internet of things, characterized by its temporal changes and high\ndimensional characteristics. Over the past few years, many studies have\nexplored the long-range dependencies and similarities in MTS. However,\nlong-range dependencies are difficult to model due to their temporal changes\nand high dimensionality makes it difficult to obtain similarities effectively\nand efficiently. Thus, to address these issues, we propose contrast\nsimilarity-aware dual-pathway Mamba for MTS node classification (CS-DPMamba).\nFirstly, to obtain the dynamic similarity of each sample, we initially use\ntemporal contrast learning module to acquire MTS representations. And then we\nconstruct a similarity matrix between MTS representations using Fast Dynamic\nTime Warping (FastDTW). Secondly, we apply the DPMamba to consider the\nbidirectional nature of MTS, allowing us to better capture long-range and\nshort-range dependencies within the data. Finally, we utilize the\nKolmogorov-Arnold Network enhanced Graph Isomorphism Network to complete the\ninformation interaction in the matrix and MTS node classification task. By\ncomprehensively considering the long-range dependencies and dynamic similarity\nfeatures, we achieved precise MTS node classification. 
We conducted experiments\non multiple University of East Anglia (UEA) MTS datasets, which encompass\ndiverse application scenarios. Our results demonstrate the superiority of our\nmethod through both supervised and semi-supervised experiments on the MTS\nclassification task.\n","authors":["Mingsen Du","Meng Chen","Yongjian Li","Xiuxin Zhang","Jiahui Gao","Cun Ji","Shoushui Wei"],"pdf_url":"https://arxiv.org/pdf/2411.12222v1.pdf","comment":"Submitted to Knowledge-Based Systems on Nov 17, 2024"},{"id":"http://arxiv.org/abs/2408.03195v2","updated":"2024-11-19T04:21:54Z","published":"2024-08-06T13:55:51Z","title":"RELIEF: Reinforcement Learning Empowered Graph Feature Prompt Tuning","summary":" The advent of the \"pre-train, prompt\" paradigm has recently extended its\ngeneralization ability and data efficiency to graph representation learning,\nfollowing its achievements in Natural Language Processing (NLP). Initial graph\nprompt tuning approaches tailored specialized prompting functions for Graph\nNeural Network (GNN) models pre-trained with specific strategies, such as edge\nprediction, thus limiting their applicability. In contrast, another pioneering\nline of research has explored universal prompting via adding prompts to the\ninput graph's feature space, thereby removing the reliance on specific\npre-training strategies. However, the necessity to add feature prompts to all\nnodes remains an open question. Motivated by findings from prompt tuning\nresearch in the NLP domain, which suggest that highly capable pre-trained\nmodels need less conditioning signal to achieve desired behaviors, we advocate\nfor strategically incorporating necessary and lightweight feature prompts to\ncertain graph nodes to enhance downstream task performance. This introduces a\ncombinatorial optimization problem, requiring a policy to decide 1) which nodes\nto prompt and 2) what specific feature prompts to attach. 
We then address the\nproblem by framing the prompt incorporation process as a sequential\ndecision-making problem and propose our method, RELIEF, which employs\nReinforcement Learning (RL) to optimize it. At each step, the RL agent selects\na node (discrete action) and determines the prompt content (continuous action),\naiming to maximize cumulative performance gain. Extensive experiments on graph\nand node-level tasks with various pre-training strategies in few-shot scenarios\ndemonstrate that our RELIEF outperforms fine-tuning and other prompt-based\napproaches in classification performance and data efficiency.\n","authors":["Jiapeng Zhu","Zichen Ding","Jianxiang Yu","Jiaqi Tan","Xiang Li","Weining Qian"],"pdf_url":"https://arxiv.org/pdf/2408.03195v2.pdf","comment":"Accepted by SIGKDD 2025"},{"id":"http://arxiv.org/abs/2411.12220v1","updated":"2024-11-19T04:12:14Z","published":"2024-11-19T04:12:14Z","title":"DeTrigger: A Gradient-Centric Approach to Backdoor Attack Mitigation in\n Federated Learning","summary":" Federated Learning (FL) enables collaborative model training across\ndistributed devices while preserving local data privacy, making it ideal for\nmobile and embedded systems. However, the decentralized nature of FL also opens\nvulnerabilities to model poisoning attacks, particularly backdoor attacks,\nwhere adversaries implant trigger patterns to manipulate model predictions. In\nthis paper, we propose DeTrigger, a scalable and efficient backdoor-robust\nfederated learning framework that leverages insights from adversarial attack\nmethodologies. By employing gradient analysis with temperature scaling,\nDeTrigger detects and isolates backdoor triggers, allowing for precise model\nweight pruning of backdoor activations without sacrificing benign model\nknowledge. 
Extensive evaluations across four widely used datasets demonstrate\nthat DeTrigger achieves up to 251x faster detection than traditional methods\nand mitigates backdoor attacks by up to 98.9%, with minimal impact on global\nmodel accuracy. Our findings establish DeTrigger as a robust and scalable\nsolution to protect federated learning environments against sophisticated\nbackdoor threats.\n","authors":["Kichang Lee","Yujin Shin","Jonghyuk Yun","Jun Han","JeongGil Ko"],"pdf_url":"https://arxiv.org/pdf/2411.12220v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.10060v4","updated":"2024-11-19T03:58:40Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. It includes 1,000 images with human\nlabels and 50,000 images with automatically generated weak labels. This dataset\ncould serve as a foundation for the research community to develop advanced\nwrinkle detection algorithms. Second, we introduce a simple training strategy\nutilizing texture maps, applicable to various segmentation models, to detect\nwrinkles across the face. Our two-stage training strategy first pretrain models\non a large dataset with weak labels (N=50k), or masked texture maps generated\nthrough computer vision techniques, without human intervention. We then\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. 
The network takes as input a combination of RGB and\nmasked texture map of the image, comprising four channels, in finetuning. We\neffectively combine labels from multiple annotators to minimize subjectivity in\nmanual labeling. Our strategies demonstrate improved segmentation performance\nin facial wrinkle segmentation both quantitatively and visually compared to\nexisting pretraining methods. The dataset is available at\nhttps://github.com/labhai/ffhq-wrinkle-dataset.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v4.pdf","comment":"Accepted at International Conference on Pattern Recognition (ICPR),\n 2024"},{"id":"http://arxiv.org/abs/2411.11764v2","updated":"2024-11-19T03:48:26Z","published":"2024-11-18T17:43:43Z","title":"Freezing of Gait Detection Using Gramian Angular Fields and Federated\n Learning from Wearable Sensors","summary":" Freezing of gait (FOG) is a debilitating symptom of Parkinson's disease (PD)\nthat impairs mobility and safety. Traditional detection methods face challenges\ndue to intra and inter-patient variability, and most systems are tested in\ncontrolled settings, limiting their real-world applicability. Addressing these\ngaps, we present FOGSense, a novel FOG detection system designed for\nuncontrolled, free-living conditions. It uses Gramian Angular Field (GAF)\ntransformations and federated deep learning to capture temporal and spatial\ngait patterns missed by traditional methods. We evaluated our FOGSense system\nusing a public PD dataset, 'tdcsfog'. FOGSense improves accuracy by 10.4% over\na single-axis accelerometer, reduces failure points compared to multi-sensor\nsystems, and demonstrates robustness to missing values. The federated\narchitecture allows personalized model adaptation and efficient smartphone\nsynchronization during off-peak hours, making it effective for long-term\nmonitoring as symptoms evolve. 
Overall, FOGSense achieves a 22.2% improvement\nin F1-score compared to state-of-the-art methods, along with enhanced\nsensitivity for FOG episode detection. Code is available:\nhttps://github.com/shovito66/FOGSense.\n","authors":["Shovito Barua Soumma","S M Raihanul Alam","Rudmila Rahman","Umme Niraj Mahi","Abdullah Mamun","Sayyed Mostafa Mostafavi","Hassan Ghasemzadeh"],"pdf_url":"https://arxiv.org/pdf/2411.11764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07987v4","updated":"2024-11-19T03:23:20Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. 
Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions. All\nthe code, models, demo and organized data have been open sourced on our Github\nRepo.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v4.pdf","comment":"Camera Ready Version. Project Page:\n https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data:\n https://github.com/liming-ai/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2411.12193v1","updated":"2024-11-19T03:18:31Z","published":"2024-11-19T03:18:31Z","title":"Hierarchical Spatio-Temporal Uncertainty Quantification for Distributed\n Energy Adoption","summary":" The rapid deployment of distributed energy resources (DER) has introduced\nsignificant spatio-temporal uncertainties in power grid management,\nnecessitating accurate multilevel forecasting methods. However, existing\napproaches often produce overly conservative uncertainty intervals at\nindividual spatial units and fail to properly capture uncertainties when\naggregating predictions across different spatial scales. This paper presents a\nnovel hierarchical spatio-temporal model based on the conformal prediction\nframework to address these challenges. 
Our approach generates circuit-level DER\ngrowth predictions and efficiently aggregates them to the substation level\nwhile maintaining statistical validity through a tailored non-conformity score.\nApplied to a decade of DER installation data from a local utility network, our\nmethod demonstrates superior performance over existing approaches, particularly\nin reducing prediction interval widths while maintaining coverage.\n","authors":["Wenbin Zhou","Shixiang Zhu","Feng Qiu","Xuan Wu"],"pdf_url":"https://arxiv.org/pdf/2411.12193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08446v2","updated":"2024-11-19T03:06:15Z","published":"2024-08-15T22:53:35Z","title":"Lifelong Reinforcement Learning via Neuromodulation","summary":" Navigating multiple tasks$\\unicode{x2014}$for instance in succession as in\ncontinual or lifelong learning, or in distributions as in meta or multi-task\nlearning$\\unicode{x2014}$requires some notion of adaptation. Evolution over\ntimescales of millennia has imbued humans and other animals with highly\neffective adaptive learning and decision-making strategies. Central to these\nfunctions are so-called neuromodulatory systems. In this work we introduce an\nabstract framework for integrating theories and evidence from neuroscience and\nthe cognitive sciences into the design of adaptive artificial reinforcement\nlearning algorithms. We give a concrete instance of this framework built on\nliterature surrounding the neuromodulators Acetylcholine (ACh) and\nNoradrenaline (NA), and empirically validate the effectiveness of the resulting\nadaptive algorithm in a non-stationary multi-armed bandit problem. 
We conclude\nwith a theory-based experiment proposal providing an avenue to link our\nframework back to efforts in experimental neuroscience.\n","authors":["Sebastian Lee","Samuel Liebana","Claudia Clopath","Will Dabney"],"pdf_url":"https://arxiv.org/pdf/2408.08446v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12188v1","updated":"2024-11-19T03:02:39Z","published":"2024-11-19T03:02:39Z","title":"Constant Rate Schedule: Constant-Rate Distributional Change for\n Efficient Training and Sampling in Diffusion Models","summary":" We propose a noise schedule that ensures a constant rate of change in the\nprobability distribution of diffused data throughout the diffusion process. To\nobtain this noise schedule, we measure the rate of change in the probability\ndistribution of the forward process and use it to determine the noise schedule\nbefore training diffusion models. The functional form of the noise schedule is\nautomatically determined and tailored to each dataset and type of diffusion\nmodel. We evaluate the effectiveness of our noise schedule on unconditional and\nclass-conditional image generation tasks using the LSUN\n(bedroom/church/cat/horse), ImageNet, and FFHQ datasets. Through extensive\nexperiments, we confirmed that our noise schedule broadly improves the\nperformance of the diffusion models regardless of the dataset, sampler, number\nof function evaluations, or type of diffusion model.\n","authors":["Shuntaro Okada","Kenji Doi","Ryota Yoshihashi","Hirokatsu Kataoka","Tomohiro Tanaka"],"pdf_url":"https://arxiv.org/pdf/2411.12188v1.pdf","comment":"33 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.12184v1","updated":"2024-11-19T02:56:45Z","published":"2024-11-19T02:56:45Z","title":"Testability of Instrumental Variables in Additive Nonlinear,\n Non-Constant Effects Models","summary":" We address the issue of the testability of instrumental variables derived\nfrom observational data. 
Most existing testable implications are centered on\nscenarios where the treatment is a discrete variable, e.g., instrumental\ninequality (Pearl, 1995), or where the effect is assumed to be constant, e.g.,\ninstrumental variables condition based on the principle of independent\nmechanisms (Burauel, 2023). However, treatments can often be continuous\nvariables, such as drug dosages or nutritional content levels, and non-constant\neffects may occur in many real-world scenarios. In this paper, we consider an\nadditive nonlinear, non-constant effects model with unmeasured confounders, in\nwhich treatments can be either discrete or continuous, and propose an\nAuxiliary-based Independence Test (AIT) condition to test whether a variable is\na valid instrument. We first show that if the candidate instrument is valid,\nthen the AIT condition holds. Moreover, we illustrate the implications of the\nAIT condition and demonstrate that, in certain conditions, AIT conditions are\nnecessary and sufficient to detect all invalid IVs. We also extend the AIT\ncondition to include covariates and introduce a practical testing algorithm.\nExperimental results on both synthetic and three different real-world datasets\nshow the effectiveness of our proposed condition.\n","authors":["Xichen Guo","Zheng Li","Biwei Huang","Yan Zeng","Zhi Geng","Feng Xie"],"pdf_url":"https://arxiv.org/pdf/2411.12184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10094v2","updated":"2024-11-19T02:55:22Z","published":"2024-09-16T08:50:47Z","title":"Beyond Perceptual Distances: Rethinking Disparity Assessment for\n Out-of-Distribution Detection with Diffusion Models","summary":" Out-of-Distribution (OoD) detection aims to justify whether a given sample is\nfrom the training distribution of the classifier-under-protection, i.e.,\nIn-Distribution (InD), or from OoD. 
Diffusion Models (DMs) are recently\nutilized in OoD detection by using the perceptual distances between the given\nimage and its DM generation. DM-based methods bring fresh insights to the\nfield, yet remain under-explored.\n In this work, we point out two main limitations in DM-based OoD detection\nmethods: (i) the perceptual metrics on the disparities between the given sample\nand its generation are devised only at human-perceived levels, ignoring the\nabstract or high-level patterns that help better reflect the intrinsic\ndisparities in distribution; (ii) only the raw image contents are taken to\nmeasure the disparities, while other representations, i.e., the features and\nprobabilities from the classifier-under-protection, are easy to access at hand\nbut are ignored. To this end, our proposed detection framework goes beyond the\nperceptual distances and looks into the deep representations from the\nclassifier-under-protection with our novel metrics devised correspondingly,\nleading to more informative disparity assessments between InD and OoD. An\nanomaly-removal strategy is integrated to remove the abnormal OoD information\nin the generation, further enhancing the distinctiveness of disparities. Our\nwork has demonstrated state-of-the-art detection performances among DM-based\nmethods in extensive experiments.\n","authors":["Kun Fang","Qinghua Tao","Zuopeng Yang","Xiaolin Huang","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2409.10094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16843v2","updated":"2024-11-19T02:52:45Z","published":"2024-02-26T18:59:18Z","title":"Multi-LoRA Composition for Image Generation","summary":" Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models\nfor the accurate rendition of specific elements like distinct characters or\nunique styles in generated images. 
Nonetheless, existing methods face\nchallenges in effectively composing multiple LoRAs, especially as the number of\nLoRAs to be integrated grows, thus hindering the creation of complex imagery.\nIn this paper, we study multi-LoRA composition through a decoding-centric\nperspective. We present two training-free methods: LoRA Switch, which\nalternates between different LoRAs at each denoising step, and LoRA Composite,\nwhich simultaneously incorporates all LoRAs to guide more cohesive image\nsynthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new\ncomprehensive testbed as part of this research. It features a diverse range of\nLoRA categories with 480 composition sets. Utilizing an evaluation framework\nbased on GPT-4V, our findings demonstrate a clear improvement in performance\nwith our methods over the prevalent baseline, particularly evident when\nincreasing the number of LoRAs in a composition. The code, benchmarks, LoRA\nweights, and all evaluation details are available on our project website:\nhttps://maszhongming.github.io/Multi-LoRA-Composition.\n","authors":["Ming Zhong","Yelong Shen","Shuohang Wang","Yadong Lu","Yizhu Jiao","Siru Ouyang","Donghan Yu","Jiawei Han","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16843v2.pdf","comment":"Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2411.12183v1","updated":"2024-11-19T02:50:11Z","published":"2024-11-19T02:50:11Z","title":"Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of\n Beamlines","summary":" Synchrotron radiation sources play a crucial role in fields such as materials\nscience, biology, and chemistry. The beamline, a key subsystem of the\nsynchrotron, modulates and directs the radiation to the sample for analysis.\nHowever, the alignment of beamlines is a complex and time-consuming process,\nprimarily carried out manually by experienced engineers. 
Even minor\nmisalignments in optical components can significantly affect the beam's\nproperties, leading to suboptimal experimental outcomes. Current automated\nmethods, such as bayesian optimization (BO) and reinforcement learning (RL),\nalthough these methods enhance performance, limitations remain. The\nrelationship between the current and target beam properties, crucial for\ndetermining the adjustment, is not fully considered. Additionally, the physical\ncharacteristics of optical elements are overlooked, such as the need to adjust\nspecific devices to control the output beam's spot size or position. This paper\naddresses the alignment of beamlines by modeling it as a Markov Decision\nProcess (MDP) and training an intelligent agent using RL. The agent calculates\nadjustment values based on the current and target beam states, executes\nactions, and iterates until optimal parameters are achieved. A policy network\nwith action attention is designed to improve decision-making by considering\nboth state differences and the impact of optical components. Experiments on two\nsimulated beamlines demonstrate that our algorithm outperforms existing\nmethods, with ablation studies highlighting the effectiveness of the action\nattention-based policy network.\n","authors":["Siyu Wang","Shengran Dai","Jianhui Jiang","Shuang Wu","Yufei Peng","Junbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12183v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.12182v1","updated":"2024-11-19T02:48:58Z","published":"2024-11-19T02:48:58Z","title":"Diffusion-Inspired Cold Start with Sufficient Prior in Computerized\n Adaptive Testing","summary":" Computerized Adaptive Testing (CAT) aims to select the most appropriate\nquestions based on the examinee's ability and is widely used in online\neducation. However, existing CAT systems often lack initial understanding of\nthe examinee's ability, requiring random probing questions. 
This can lead to\npoorly matched questions, extending the test duration and negatively impacting\nthe examinee's mindset, a phenomenon referred to as the Cold Start with\nInsufficient Prior (CSIP) task. This issue occurs because CAT systems do not\neffectively utilize the abundant prior information about the examinee available\nfrom other courses on online platforms. These response records, due to the\ncommonality of cognitive states across different knowledge domains, can provide\nvaluable prior information for the target domain. However, no prior work has\nexplored solutions for the CSIP task. In response to this gap, we propose\nDiffusion Cognitive States TransfeR Framework (DCSR), a novel domain transfer\nframework based on Diffusion Models (DMs) to address the CSIP task.\nSpecifically, we construct a cognitive state transition bridge between domains,\nguided by the common cognitive states of examinees, encouraging the model to\nreconstruct the initial ability state in the target domain. To enrich the\nexpressive power of the generated data, we analyze the causal relationships in\nthe generation process from a causal perspective. Redundant and extraneous\ncognitive states can lead to limited transfer and negative transfer effects.\nOur DCSR can seamlessly apply the generated initial ability states in the\ntarget domain to existing question selection algorithms, thus improving the\ncold start performance of the CAT system. 
Extensive experiments conducted on\nfive real-world datasets demonstrate that DCSR significantly outperforms\nexisting baseline methods in addressing the CSIP task.\n","authors":["Haiping Ma","Aoqing Xia","Changqian Wang","Hai Wang","Xingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12182v1.pdf","comment":"Accepted by KDD2025"},{"id":"http://arxiv.org/abs/2312.15063v2","updated":"2024-11-19T02:40:19Z","published":"2023-12-22T21:01:16Z","title":"A universal approximation theorem for nonlinear resistive networks","summary":" Resistor networks have recently attracted interest as analog computing\nplatforms for machine learning, particularly due to their compatibility with\nthe Equilibrium Propagation training framework. In this work, we explore the\ncomputational capabilities of these networks. We prove that electrical networks\nconsisting of voltage sources, linear resistors, diodes, and voltage-controlled\nvoltage sources (VCVS) can approximate any continuous function to arbitrary\nprecision. Central to our proof is a method for translating a ReLU neural\nnetwork into an approximately equivalent electrical network comprising these\nfour elements. Our proof relies on two assumptions: (a) circuit elements are\nideal, and (b) variable resistor conductances and VCVS amplification factors\ncan take any value (arbitrarily small or large). 
Our findings provide insights\nthat could guide the development of universal self-learning electrical\nnetworks.\n","authors":["Benjamin Scellier","Siddhartha Mishra"],"pdf_url":"https://arxiv.org/pdf/2312.15063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11101v2","updated":"2024-11-19T02:39:53Z","published":"2024-11-17T15:17:08Z","title":"Different Horses for Different Courses: Comparing Bias Mitigation\n Algorithms in ML","summary":" With fairness concerns gaining significant attention in Machine Learning\n(ML), several bias mitigation techniques have been proposed, often compared\nagainst each other to find the best method. These benchmarking efforts tend to\nuse a common setup for evaluation under the assumption that providing a uniform\nenvironment ensures a fair comparison. However, bias mitigation techniques are\nsensitive to hyperparameter choices, random seeds, feature selection, etc.,\nmeaning that comparison on just one setting can unfairly favour certain\nalgorithms. In this work, we show significant variance in fairness achieved by\nseveral algorithms and the influence of the learning pipeline on fairness\nscores. We highlight that most bias mitigation techniques can achieve\ncomparable performance, given the freedom to perform hyperparameter\noptimization, suggesting that the choice of the evaluation parameters-rather\nthan the mitigation technique itself-can sometimes create the perceived\nsuperiority of one method over another. 
We hope our work encourages future\nresearch on how various choices in the lifecycle of developing an algorithm\nimpact fairness, and trends that guide the selection of appropriate algorithms.\n","authors":["Prakhar Ganesh","Usman Gohar","Lu Cheng","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2411.11101v2.pdf","comment":"To appear at AFME@NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.12174v1","updated":"2024-11-19T02:39:28Z","published":"2024-11-19T02:39:28Z","title":"Just KIDDIN: Knowledge Infusion and Distillation for Detection of\n INdecent Memes","summary":" Toxicity identification in online multimodal environments remains a\nchallenging task due to the complexity of contextual connections across\nmodalities (e.g., textual and visual). In this paper, we propose a novel\nframework that integrates Knowledge Distillation (KD) from Large Visual\nLanguage Models (LVLMs) and knowledge infusion to enhance the performance of\ntoxicity detection in hateful memes. Our approach extracts sub-knowledge graphs\nfrom ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused\nwithin a compact VLM framework. The relational context between toxic phrases in\ncaptions and memes, as well as visual concepts in memes enhance the model's\nreasoning capabilities. Experimental results from our study on two hate speech\nbenchmark datasets demonstrate superior performance over the state-of-the-art\nbaselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%,\nrespectively. Given the contextual complexity of the toxicity detection task,\nour approach showcases the significance of learning from both explicit (i.e.\nKG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a\nhybrid neurosymbolic approach. 
This is crucial for real-world applications\nwhere accurate and scalable recognition of toxic content is critical for\ncreating safer online environments.\n","authors":["Rahul Garg","Trilok Padhi","Hemang Jain","Ugur Kursuncu","Ugur Kursuncu","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2411.12174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12173v1","updated":"2024-11-19T02:35:14Z","published":"2024-11-19T02:35:14Z","title":"SkillTree: Explainable Skill-Based Deep Reinforcement Learning for\n Long-Horizon Control Tasks","summary":" Deep reinforcement learning (DRL) has achieved remarkable success in various\nresearch domains. However, its reliance on neural networks results in a lack of\ntransparency, which limits its practical applications. To achieve\nexplainability, decision trees have emerged as a popular and promising\nalternative to neural networks. Nonetheless, due to their limited\nexpressiveness, traditional decision trees struggle with high-dimensional\nlong-horizon continuous control tasks. In this paper, we proposes SkillTree, a\nnovel framework that reduces complex continuous action spaces into discrete\nskill spaces. Our hierarchical approach integrates a differentiable decision\ntree within the high-level policy to generate skill embeddings, which\nsubsequently guide the low-level policy in executing skills. By making skill\ndecisions explainable, we achieve skill-level explainability, enhancing the\nunderstanding of the decision-making process in complex tasks. 
Experimental\nresults demonstrate that our method achieves performance comparable to\nskill-based neural networks in complex robotic arm control domains.\nFurthermore, SkillTree offers explanations at the skill level, thereby\nincreasing the transparency of the decision-making process.\n","authors":["Yongyan Wen","Siyuan Li","Rongchang Zuo","Lei Yuan","Hangyu Mao","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17612v3","updated":"2024-11-19T02:05:56Z","published":"2024-09-26T08:03:19Z","title":"Diversity-Driven Synthesis: Enhancing Dataset Distillation through\n Directed Weight Adjustment","summary":" The sharp increase in data-related expenses has motivated research into\ncondensing datasets while retaining the most informative features. Dataset\ndistillation has thus recently come to the fore. This paradigm generates\nsynthetic datasets that are representative enough to replace the original\ndataset in training a neural network. To avoid redundancy in these synthetic\ndatasets, it is crucial that each element contains unique features and remains\ndiverse from others during the synthesis stage. In this paper, we provide a\nthorough theoretical and empirical analysis of diversity within synthesized\ndatasets. We argue that enhancing diversity can improve the parallelizable yet\nisolated synthesizing approach. Specifically, we introduce a novel method that\nemploys dynamic and directed weight adjustment techniques to modulate the\nsynthesis process, thereby maximizing the representativeness and diversity of\neach synthetic instance. 
Our method ensures that each batch of synthetic data\nmirrors the characteristics of a large, varying subset of the original dataset.\nExtensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet,\nand ImageNet-1K, demonstrate the superior performance of our method,\nhighlighting its effectiveness in producing diverse and representative\nsynthetic datasets with minimal computational expense. Our code is available at\nhttps://github.com/AngusDujw/Diversity-Driven-Synthesis.https://github.com/AngusDujw/Diversity-Driven-Synthesis.\n","authors":["Jiawei Du","Xin Zhang","Juncheng Hu","Wenxin Huang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.17612v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12164v1","updated":"2024-11-19T02:01:07Z","published":"2024-11-19T02:01:07Z","title":"UrbanDiT: A Foundation Model for Open-World Urban Spatio-Temporal\n Learning","summary":" The urban environment is characterized by complex spatio-temporal dynamics\narising from diverse human activities and interactions. Effectively modeling\nthese dynamics is essential for understanding and optimizing urban systems In\nthis work, we introduce UrbanDiT, a foundation model for open-world urban\nspatio-temporal learning that successfully scale up diffusion transformers in\nthis field. UrbanDiT pioneers a unified model that integrates diverse\nspatio-temporal data sources and types while learning universal spatio-temporal\npatterns across different cities and scenarios. This allows the model to unify\nboth multi-data and multi-task learning, and effectively support a wide range\nof spatio-temporal applications. Its key innovation lies in the elaborated\nprompt learning framework, which adaptively generates both data-driven and\ntask-specific prompts, guiding the model to deliver superior performance across\nvarious urban applications. 
UrbanDiT offers three primary advantages: 1) It\nunifies diverse data types, such as grid-based and graph-based data, into a\nsequential format, allowing to capture spatio-temporal dynamics across diverse\nscenarios of different cities; 2) With masking strategies and task-specific\nprompts, it supports a wide range of tasks, including bi-directional\nspatio-temporal prediction, temporal interpolation, spatial extrapolation, and\nspatio-temporal imputation; and 3) It generalizes effectively to open-world\nscenarios, with its powerful zero-shot capabilities outperforming nearly all\nbaselines with training data. These features allow UrbanDiT to achieves\nstate-of-the-art performance in different domains such as transportation\ntraffic, crowd flows, taxi demand, bike usage, and cellular traffic, across\nmultiple cities and tasks. UrbanDiT sets up a new benchmark for foundation\nmodels in the urban spatio-temporal domain.\n","authors":["Yuan Yuan","Chonghua Han","Jingtao Ding","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.12164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12159v1","updated":"2024-11-19T01:52:59Z","published":"2024-11-19T01:52:59Z","title":"Sensor-fusion based Prognostics Framework for Complex Engineering\n Systems Exhibiting Multiple Failure Modes","summary":" Complex engineering systems are often subject to multiple failure modes.\nDeveloping a remaining useful life (RUL) prediction model that does not\nconsider the failure mode causing degradation is likely to result in inaccurate\npredictions. However, distinguishing between causes of failure without manually\ninspecting the system is nontrivial. This challenge is increased when the\ncauses of historically observed failures are unknown. 
Sensors, which are useful\nfor monitoring the state-of-health of systems, can also be used for\ndistinguishing between multiple failure modes as the presence of multiple\nfailure modes results in discriminatory behavior of the sensor signals. When\nsystems are equipped with multiple sensors, some sensors may exhibit behavior\ncorrelated with degradation, while other sensors do not. Furthermore, which\nsensors exhibit this behavior may differ for each failure mode. In this paper,\nwe present a simultaneous clustering and sensor selection approach for\nunlabeled training datasets of systems exhibiting multiple failure modes. The\ncluster assignments and the selected sensors are then utilized in real-time to\nfirst diagnose the active failure mode and then to predict the system RUL. We\nvalidate the complete pipeline of the methodology using a simulated dataset of\nsystems exhibiting two failure modes and on a turbofan degradation dataset from\nNASA.\n","authors":["Benjamin Peters","Ayush Mohanty","Xiaolei Fang","Stephen K. Robinson","Nagi Gebraeel"],"pdf_url":"https://arxiv.org/pdf/2411.12159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10825v2","updated":"2024-11-19T01:51:37Z","published":"2024-09-17T01:37:57Z","title":"Unveiling and Mitigating Bias in Large Language Model Recommendations: A\n Path to Fairness","summary":" Large Language Model (LLM)-based recommendation systems provide more\ncomprehensive recommendations than traditional systems by deeply analyzing\ncontent and user behavior. However, these systems often exhibit biases,\nfavoring mainstream content while marginalizing non-traditional options due to\nskewed training data. 
This study investigates the intricate relationship\nbetween bias and LLM-based recommendation systems, with a focus on music, song,\nand book recommendations across diverse demographic and cultural groups.\nThrough a comprehensive analysis conducted over different LLM-models, this\npaper evaluates the impact of bias on recommendation outcomes. Our findings\nhighlight that biases are not only deeply embedded but also widely pervasive\nacross these systems, emphasizing the substantial and widespread nature of the\nissue. Moreover, contextual information, such as socioeconomic status, further\namplify these biases, demonstrating the complexity and depth of the challenges\nfaced in creating fair recommendations across different groups.\n","authors":["Shahnewaz Karim Sakib","Anindya Bijoy Das"],"pdf_url":"https://arxiv.org/pdf/2409.10825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15887v3","updated":"2024-11-19T01:30:14Z","published":"2024-07-21T16:33:56Z","title":"Separable DeepONet: Breaking the Curse of Dimensionality in\n Physics-Informed Machine Learning","summary":" The deep operator network (DeepONet) is a popular neural operator\narchitecture that has shown promise in solving partial differential equations\n(PDEs) by using deep neural networks to map between infinite-dimensional\nfunction spaces. In the absence of labeled datasets, we utilize the PDE\nresidual loss to learn the physical system, an approach known as\nphysics-informed DeepONet. This method faces significant computational\nchallenges, primarily due to the curse of dimensionality, as the computational\ncost increases exponentially with finer discretization. In this paper, we\nintroduce the Separable DeepONet framework to address these challenges and\nimprove scalability for high-dimensional PDEs. 
Our approach involves a\nfactorization technique where sub-networks handle individual one-dimensional\ncoordinates, thereby reducing the number of forward passes and the size of the\nJacobian matrix. By using forward-mode automatic differentiation, we further\noptimize the computational cost related to the Jacobian matrix. As a result,\nour modifications lead to a linear scaling of computational cost with\ndiscretization density, making Separable DeepONet suitable for high-dimensional\nPDEs. We validate the effectiveness of the separable architecture through three\nbenchmark PDE models: the viscous Burgers equation, Biot's consolidation\ntheory, and a parametrized heat equation. In all cases, our proposed framework\nachieves comparable or improved accuracy while significantly reducing\ncomputational time compared to conventional DeepONet. These results demonstrate\nthe potential of Separable DeepONet in efficiently solving complex,\nhigh-dimensional PDEs, advancing the field of physics-informed machine\nlearning.\n","authors":["Luis Mandl","Somdatta Goswami","Lena Lambers","Tim Ricken"],"pdf_url":"https://arxiv.org/pdf/2407.15887v3.pdf","comment":"23 Pages, 9 Figures and 1 Table"},{"id":"http://arxiv.org/abs/2411.12155v1","updated":"2024-11-19T01:23:52Z","published":"2024-11-19T01:23:52Z","title":"Reinforcement Learning with Action Sequence for Data-Efficient Robot\n Learning","summary":" Training reinforcement learning (RL) agents on robotic tasks typically\nrequires a large number of training samples. This is because training data\noften consists of noisy trajectories, whether from exploration or\nhuman-collected demonstrations, making it difficult to learn value functions\nthat understand the effect of taking each action. On the other hand, recent\nbehavior-cloning (BC) approaches have shown that predicting a sequence of\nactions enables policies to effectively approximate noisy, multi-modal\ndistributions of expert demonstrations. 
Can we use a similar idea for improving\nRL on robotic tasks? In this paper, we introduce a novel RL algorithm that\nlearns a critic network that outputs Q-values over a sequence of actions. By\nexplicitly training the value functions to learn the consequence of executing a\nseries of current and future actions, our algorithm allows for learning useful\nvalue functions from noisy trajectories. We study our algorithm across various\nsetups with sparse and dense rewards, and with or without demonstrations,\nspanning mobile bi-manual manipulation, whole-body control, and tabletop\nmanipulation tasks from BiGym, HumanoidBench, and RLBench. We find that, by\nlearning the critic network with action sequences, our algorithm outperforms\nvarious RL and BC baselines, in particular on challenging humanoid control\ntasks.\n","authors":["Younggyo Seo","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2411.12155v1.pdf","comment":"17 Pages. Website: https://younggyo.me/cqn-as/"},{"id":"http://arxiv.org/abs/2411.12154v1","updated":"2024-11-19T01:08:13Z","published":"2024-11-19T01:08:13Z","title":"Tangential Randomization in Linear Bandits (TRAiL): Guaranteed Inference\n and Regret Bounds","summary":" We propose and analyze TRAiL (Tangential Randomization in Linear Bandits), a\ncomputationally efficient regret-optimal forced exploration algorithm for\nlinear bandits on action sets that are sublevel sets of strongly convex\nfunctions. TRAiL estimates the governing parameter of the linear bandit problem\nthrough a standard regularized least squares and perturbs the reward-maximizing\naction corresponding to said point estimate along the tangent plane of the\nconvex compact action set before projecting back to it. Exploiting\nconcentration results for matrix martingales, we prove that TRAiL ensures a\n$\\Omega(\\sqrt{T})$ growth in the inference quality, measured via the minimum\neigenvalue of the design (regressor) matrix with high-probability over a\n$T$-length period. 
We build on this result to obtain an $\\mathcal{O}(\\sqrt{T}\n\\log(T))$ upper bound on cumulative regret with probability at least $ 1 - 1/T$\nover $T$ periods, and compare TRAiL to other popular algorithms for linear\nbandits. Then, we characterize an $\\Omega(\\sqrt{T})$ minimax lower bound for\nany algorithm on the expected regret that covers a wide variety of\naction/parameter sets and noise processes. Our analysis not only expands the\nrealm of lower-bounds in linear bandits significantly, but as a byproduct,\nyields a trade-off between regret and inference quality. Specifically, we prove\nthat any algorithm with an $\\mathcal{O}(T^\\alpha)$ expected regret growth must\nhave an $\\Omega(T^{1-\\alpha})$ asymptotic growth in expected inference quality.\nOur experiments on the $L^p$ unit ball as action sets reveal how this relation\ncan be violated, but only in the short-run, before returning to respect the\nbound asymptotically. In effect, regret-minimizing algorithms must have just\nthe right rate of inference -- too fast or too slow inference will incur\nsub-optimal regret growth.\n","authors":["Arda Güçlü","Subhonmesh Bose"],"pdf_url":"https://arxiv.org/pdf/2411.12154v1.pdf","comment":"42 pages, 6 Figures"},{"id":"http://arxiv.org/abs/2411.12150v1","updated":"2024-11-19T00:56:35Z","published":"2024-11-19T00:56:35Z","title":"HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation\n in Crowded and Constrained Environments","summary":" We study the problem of robot navigation in dense and interactive crowds with\nenvironmental constraints such as corridors and furniture. Previous methods\nfail to consider all types of interactions among agents and obstacles, leading\nto unsafe and inefficient robot paths. In this article, we leverage a\ngraph-based representation of crowded and constrained scenarios and propose a\nstructured framework to learn robot navigation policies with deep reinforcement\nlearning. 
We first split the representations of different components in the\nenvironment and propose a heterogeneous spatio-temporal (st) graph to model\ndistinct interactions among humans, robots, and obstacles. Based on the\nheterogeneous st-graph, we propose HEIGHT, a novel navigation policy network\narchitecture with different components to capture heterogeneous interactions\namong entities through space and time. HEIGHT utilizes attention mechanisms to\nprioritize important interactions and a recurrent network to track changes in\nthe dynamic scene over time, encouraging the robot to avoid collisions\nadaptively. Through extensive simulation and real-world experiments, we\ndemonstrate that HEIGHT outperforms state-of-the-art baselines in terms of\nsuccess and efficiency in challenging navigation scenarios. Furthermore, we\ndemonstrate that our pipeline achieves better zero-shot generalization\ncapability than previous works when the densities of humans and obstacles\nchange. More videos are available at\nhttps://sites.google.com/view/crowdnav-height/home.\n","authors":["Shuijing Liu","Haochen Xia","Fatemeh Cheraghi Pouria","Kaiwen Hong","Neeloy Chakraborty","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2411.12150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12146v1","updated":"2024-11-19T00:50:01Z","published":"2024-11-19T00:50:01Z","title":"Self-supervised denoising of visual field data improves detection of\n glaucoma progression","summary":" Perimetric measurements provide insight into a patient's peripheral vision\nand day-to-day functioning and are the main outcome measure for identifying\nprogression of visual damage from glaucoma. However, visual field data can be\nnoisy, exhibiting high variance, especially with increasing damage. 
In this\nstudy, we demonstrate the utility of self-supervised deep learning in denoising\nvisual field data from over 4000 patients to enhance its signal-to-noise ratio\nand its ability to detect true glaucoma progression. We deployed both a\nvariational autoencoder (VAE) and a masked autoencoder to determine which\nself-supervised model best smooths the visual field data while reconstructing\nsalient features that are less noisy and more predictive of worsening disease.\nOur results indicate that including a categorical p-value at every visual field\nlocation improves the smoothing of visual field data. Masked autoencoders led\nto cleaner denoised data than previous methods, such as variational\nautoencoders. A 4.7% increase in detection of progressing eyes with pointwise\nlinear regression (PLR) was observed. The masked and variational autoencoders'\nsmoothed data predicted glaucoma progression 2.3 months earlier when p-values\nwere included compared to when they were not. The faster prediction of time to\nprogression (TTP) and the higher percentage progression detected support our\nhypothesis that masking out visual field elements during training while\nincluding p-values at each location would improve the task of detection of\nvisual field progression. Our study has clinically relevant implications\nregarding masking when training neural networks to denoise visual field data,\nresulting in earlier and more accurate detection of glaucoma progression. 
This\ndenoising model can be integrated into future models for visual field analysis\nto enhance detection of glaucoma progression.\n","authors":["Sean Wu","Jun Yu Chen","Vahid Mohammadzadeh","Sajad Besharati","Jaewon Lee","Kouros Nouri-Mahdavi","Joseph Caprioli","Zhe Fei","Fabien Scalzo"],"pdf_url":"https://arxiv.org/pdf/2411.12146v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.12142v1","updated":"2024-11-19T00:44:56Z","published":"2024-11-19T00:44:56Z","title":"A Computational Method for Measuring \"Open Codes\" in Qualitative\n Analysis","summary":" Qualitative analysis is critical to understanding human datasets in many\nsocial science disciplines. Open coding is an inductive qualitative process\nthat identifies and interprets \"open codes\" from datasets. Yet, meeting\nmethodological expectations (such as \"as exhaustive as possible\") can be\nchallenging. While many machine learning (ML)/generative AI (GAI) studies have\nattempted to support open coding, few have systematically measured or evaluated\nGAI outcomes, increasing potential bias risks. Building on Grounded Theory and\nThematic Analysis theories, we present a computational method to measure and\nidentify potential biases from \"open codes\" systematically. Instead of\noperationalizing human expert results as the \"ground truth,\" our method is\nbuilt upon a team-based approach between human and machine coders. We\nexperiment with two HCI datasets to establish this method's reliability by 1)\ncomparing it with human analysis, and 2) analyzing its output stability. 
We\npresent evidence-based suggestions and example workflows for ML/GAI to support\nopen coding.\n","authors":["John Chen","Alexandros Lotsos","Lexie Zhao","Jessica Hullman","Bruce Sherin","Uri Wilensky","Michael Horn"],"pdf_url":"https://arxiv.org/pdf/2411.12142v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.12736v1","updated":"2024-11-19T18:58:03Z","published":"2024-11-19T18:58:03Z","title":"ACING: Actor-Critic for Instruction Learning in Black-Box Large Language\n Models","summary":" The effectiveness of Large Language Models (LLMs) in solving tasks vastly\ndepends on the quality of the instructions, which often require fine-tuning\nthrough extensive human effort. This highlights the need for automated\ninstruction optimization; however, this optimization is particularly\nchallenging when dealing with black-box LLMs, where model parameters and\ngradients remain inaccessible. We propose ACING, a task-specific prompt\noptimization approach framed as a stateless continuous-action Reinforcement\nLearning (RL) problem, known as the continuum bandit setting. ACING leverages\nan actor-critic-based method to optimize prompts, learning from\nnon-differentiable reward signals. We validate ACING by optimizing prompts for\nChatGPT on 30 instruction-based tasks. 
ACING consistently outperforms baseline\nmethods, achieving a median score improvement of 10 percentage points.\nFurthermore, ACING not only recovers but also surpasses human-crafted expert\ninstructions, achieving up to a 39 percentage point improvement against human\nbenchmarks.\n","authors":["Salma Kharrat","Fares Fourati","Marco Canini"],"pdf_url":"https://arxiv.org/pdf/2411.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12732v1","updated":"2024-11-19T18:57:01Z","published":"2024-11-19T18:57:01Z","title":"Benchmarking Positional Encodings for GNNs and Graph Transformers","summary":" Recent advances in Graph Neural Networks (GNNs) and Graph Transformers (GTs)\nhave been driven by innovations in architectures and Positional Encodings\n(PEs), which are critical for augmenting node features and capturing graph\ntopology. PEs are essential for GTs, where topological information would\notherwise be lost without message-passing. However, PEs are often tested\nalongside novel architectures, making it difficult to isolate their effect on\nestablished models. To address this, we present a comprehensive benchmark of\nPEs in a unified framework that includes both message-passing GNNs and GTs. We\nalso establish theoretical connections between MPNNs and GTs and introduce a\nsparsified GRIT attention mechanism to examine the influence of global\nconnectivity. Our findings demonstrate that previously untested combinations of\nGNN architectures and PEs can outperform existing methods and offer a more\ncomprehensive picture of the state-of-the-art. 
To support future research and\nexperimentation in our framework, we make the code publicly available.\n","authors":["Florian Grötschla","Jiaqing Xie","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2411.12732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12724v1","updated":"2024-11-19T18:45:16Z","published":"2024-11-19T18:45:16Z","title":"Heuristic-Free Multi-Teacher Learning","summary":" We introduce Teacher2Task, a novel framework for multi-teacher learning that\neliminates the need for manual aggregation heuristics. Existing multi-teacher\nmethods typically rely on such heuristics to combine predictions from multiple\nteachers, often resulting in sub-optimal aggregated labels and the propagation\nof aggregation errors. Teacher2Task addresses these limitations by introducing\nteacher-specific input tokens and reformulating the training process. Instead\nof relying on aggregated labels, the framework transforms the training data,\nconsisting of ground truth labels and annotations from N teachers, into N+1\ndistinct tasks: N auxiliary tasks that predict the labeling styles of the N\nindividual teachers, and one primary task that focuses on the ground truth\nlabels. 
This approach, drawing upon principles from multiple learning\nparadigms, demonstrates strong empirical results across a range of\narchitectures, modalities, and tasks.\n","authors":["Huy Thong Nguyen","En-Hung Chu","Lenord Melvix","Jazon Jiao","Chunglin Wen","Benjamin Louie"],"pdf_url":"https://arxiv.org/pdf/2411.12724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12713v1","updated":"2024-11-19T18:27:31Z","published":"2024-11-19T18:27:31Z","title":"CATCH: Complementary Adaptive Token-level Contrastive Decoding to\n Mitigate Hallucinations in LVLMs","summary":" Large Vision-Language Model (LVLM) systems have demonstrated impressive\nvision-language reasoning capabilities but suffer from pervasive and severe\nhallucination issues, posing significant risks in critical domains such as\nhealthcare and autonomous systems. Despite previous efforts to mitigate\nhallucinations, a persistent issue remains: visual defect from vision-language\nmisalignment, creating a bottleneck in visual processing capacity. To address\nthis challenge, we develop Complementary Adaptive Token-level Contrastive\nDecoding to Mitigate Hallucinations in LVLMs (CATCH), based on the Information\nBottleneck theory. CATCH introduces Complementary Visual Decoupling (CVD) for\nvisual information separation, Non-Visual Screening (NVS) for hallucination\ndetection, and Adaptive Token-level Contrastive Decoding (ATCD) for\nhallucination mitigation. CATCH addresses issues related to visual defects that\ncause diminished fine-grained feature perception and cumulative hallucinations\nin open-ended scenarios. 
It is applicable to various visual question-answering\ntasks without requiring any specific data or prior knowledge, and generalizes\nrobustly to new tasks without additional training, opening new possibilities\nfor advancing LVLM in various challenging applications.\n","authors":["Zhehan Kan","Ce Zhang","Zihan Liao","Yapeng Tian","Wenming Yang","Junyuan Xiao","Xu Li","Dongmei Jiang","Yaowei Wang","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2411.12713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12712v1","updated":"2024-11-19T18:27:25Z","published":"2024-11-19T18:27:25Z","title":"Enhancing Multi-Class Disease Classification: Neoplasms, Cardiovascular,\n Nervous System, and Digestive Disorders Using Advanced LLMs","summary":" In this research, we explored the improvement in terms of multi-class disease\nclassification via pre-trained language models over Medical-Abstracts-TC-Corpus\nthat spans five medical conditions. We excluded non-cancer conditions and\nexamined four specific diseases. We assessed four LLMs, BioBERT, XLNet, and\nBERT, as well as a novel base model (Last-BERT). BioBERT, which was pre-trained\non medical data, demonstrated superior performance in medical text\nclassification (97% accuracy). Surprisingly, XLNet followed closely (96%\naccuracy), demonstrating its generalizability across domains even though it was\nnot pre-trained on medical data. LastBERT, a custom model based on the lighter\nversion of BERT, also proved competitive with 87.10% accuracy (just under\nBERT's 89.33%). Our findings confirm the importance of specialized models such\nas BioBERT and also support impressions around more general solutions like\nXLNet and well-tuned transformer architectures with fewer parameters (in this\ncase, LastBERT) in medical domain tasks.\n","authors":["Ahmed Akib Jawad Karim","Muhammad Zawad Mahmud","Samiha Islam","Aznur Azam"],"pdf_url":"https://arxiv.org/pdf/2411.12712v1.pdf","comment":"7 Pages, 4 tables and 11 figures. 
Under review in a IEEE conference"},{"id":"http://arxiv.org/abs/2403.11046v2","updated":"2024-11-19T18:18:04Z","published":"2024-03-17T00:11:15Z","title":"Regulating Chatbot Output via Inter-Informational Competition","summary":" The advent of ChatGPT has sparked over a year of regulatory frenzy. However,\nfew existing studies have rigorously questioned the assumption that, if left\nunregulated, AI chatbot's output would inflict tangible, severe real harm on\nhuman affairs. Most researchers have overlooked the critical possibility that\nthe information market itself can effectively mitigate these risks and, as a\nresult, they tend to use regulatory tools to address the issue directly. This\nArticle develops a yardstick for reevaluating both AI-related content risks and\ncorresponding regulatory proposals by focusing on inter-informational\ncompetition among various outlets. The decades-long history of regulating\ninformation and communications technologies indicates that regulators tend to\nerr too much on the side of caution and to put forward excessive regulatory\nmeasures when encountering the uncertainties brought about by new technologies.\nIn fact, a trove of empirical evidence has demonstrated that market competition\namong information outlets can effectively mitigate most risks and that\noverreliance on regulation is not only unnecessary but detrimental, as well.\nThis Article argues that sufficient competition among chatbots and other\ninformation outlets in the information marketplace can sufficiently mitigate\nand even resolve most content risks posed by generative AI technologies. This\nrenders certain loudly advocated regulatory strategies, like mandatory\nprohibitions, licensure, curation of datasets, and notice-and-response regimes,\ntruly unnecessary and even toxic to desirable competition and innovation\nthroughout the AI industry. 
Ultimately, the ideas that I advance in this\nArticle should pour some much-needed cold water on the regulatory frenzy over\ngenerative AI and steer the issue back to a rational track.\n","authors":["Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11046v2.pdf","comment":"50-page legal Article, forthcoming in Northwestern Journal of\n Technology and Intellectual Property"},{"id":"http://arxiv.org/abs/2402.01306v4","updated":"2024-11-19T18:12:45Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":" Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner (1992); for example,\nhumans are famously loss-averse. We show that objectives for aligning LLMs with\nhuman feedback implicitly incorporate many of these biases -- the success of\nthese objectives (e.g., DPO) over cross-entropy minimization can partly be\nascribed to them belonging to a family of loss functions that we call\n$\\textit{human-aware losses}$ (HALOs). However, the utility functions these\nmethods attribute to humans still differ from those in the prospect theory\nliterature. Using a Kahneman-Tversky model of human utility, we propose a HALO\nthat directly maximizes the utility of generations instead of maximizing the\nlog-likelihood of preferences, as current methods do. We call this approach\nKTO, and it matches or exceeds the performance of preference-based methods at\nscales from 1B to 30B, despite only learning from a binary signal of whether an\noutput is desirable. 
More broadly, our work suggests that there is no one HALO\nthat is universally superior; the best loss depends on the inductive biases\nmost appropriate for a given setting, an oft-overlooked consideration.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2411.12701v1","updated":"2024-11-19T18:11:36Z","published":"2024-11-19T18:11:36Z","title":"When Backdoors Speak: Understanding LLM Backdoor Attacks Through\n Model-Generated Explanations","summary":" Large Language Models (LLMs) are vulnerable to backdoor attacks, where hidden\ntriggers can maliciously manipulate model behavior. While several backdoor\nattack methods have been proposed, the mechanisms by which backdoor functions\noperate in LLMs remain underexplored. In this paper, we move beyond attacking\nLLMs and investigate backdoor functionality through the novel lens of natural\nlanguage explanations. Specifically, we leverage LLMs' generative capabilities\nto produce human-understandable explanations for their decisions, allowing us\nto compare explanations for clean and poisoned samples. We explore various\nbackdoor attacks and embed the backdoor into LLaMA models for multiple tasks.\nOur experiments show that backdoored models produce higher-quality explanations\nfor clean data compared to poisoned data, while generating significantly more\nconsistent explanations for poisoned data than for clean data. We further\nanalyze the explanation generation process, revealing that at the token level,\nthe explanation token of poisoned samples only appears in the final few\ntransformer layers of the LLM. At the sentence level, attention dynamics\nindicate that poisoned inputs shift attention from the input context when\ngenerating the explanation. 
These findings deepen our understanding of backdoor\nattack mechanisms in LLMs and offer a framework for detecting such\nvulnerabilities through explainability techniques, contributing to the\ndevelopment of more secure LLMs.\n","authors":["Huaizhi Ge","Yiming Li","Qifan Wang","Yongfeng Zhang","Ruixiang Tang"],"pdf_url":"https://arxiv.org/pdf/2411.12701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12697v1","updated":"2024-11-19T18:06:06Z","published":"2024-11-19T18:06:06Z","title":"Attribute Inference Attacks for Federated Regression Tasks","summary":" Federated Learning (FL) enables multiple clients, such as mobile phones and\nIoT devices, to collaboratively train a global machine learning model while\nkeeping their data localized. However, recent studies have revealed that the\ntraining phase of FL is vulnerable to reconstruction attacks, such as attribute\ninference attacks (AIA), where adversaries exploit exchanged messages and\nauxiliary public information to uncover sensitive attributes of targeted\nclients. While these attacks have been extensively studied in the context of\nclassification tasks, their impact on regression tasks remains largely\nunexplored. In this paper, we address this gap by proposing novel model-based\nAIAs specifically designed for regression tasks in FL environments. Our\napproach considers scenarios where adversaries can either eavesdrop on\nexchanged messages or directly interfere with the training process. We\nbenchmark our proposed attacks against state-of-the-art methods using\nreal-world datasets. The results demonstrate a significant increase in\nreconstruction accuracy, particularly in heterogeneous client datasets, a\ncommon scenario in FL. 
The efficacy of our model-based AIAs makes them better\ncandidates for empirically quantifying privacy leakage for federated regression\ntasks.\n","authors":["Francesco Diana","Othmane Marfoq","Chuan Xu","Giovanni Neglia","Frédéric Giroire","Eoin Thomas"],"pdf_url":"https://arxiv.org/pdf/2411.12697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12593v1","updated":"2024-11-19T18:04:13Z","published":"2024-11-19T18:04:13Z","title":"AdaCM$^2$: On Understanding Extremely Long-Term Video with Adaptive\n Cross-Modality Memory Reduction","summary":" The advancements in large language models (LLMs) have propelled the\nimprovement of video understanding tasks by incorporating LLMs with visual\nmodels. However, most existing LLM-based models (e.g., VideoLLaMA, VideoChat)\nare constrained to processing short-duration videos. Recent attempts to\nunderstand long-term videos by extracting and compressing visual features into\na fixed memory size. Nevertheless, those methods leverage only visual modality\nto merge video tokens and overlook the correlation between visual and textual\nqueries, leading to difficulties in effectively handling complex\nquestion-answering tasks. To address the challenges of long videos and complex\nprompts, we propose AdaCM$^2$, which, for the first time, introduces an\nadaptive cross-modality memory reduction approach to video-text alignment in an\nauto-regressive manner on video streams. 
Our extensive experiments on various\nvideo understanding tasks, such as video captioning, video question answering,\nand video classification, demonstrate that AdaCM$^2$ achieves state-of-the-art\nperformance across multiple datasets while significantly reducing memory usage.\nNotably, it achieves a 4.5% improvement across multiple tasks in the LVU\ndataset with a GPU memory consumption reduction of up to 65%.\n","authors":["Yuanbin Man","Ying Huang","Chengming Zhang","Bingzhe Li","Wei Niu","Miao Yin"],"pdf_url":"https://arxiv.org/pdf/2411.12593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08316v3","updated":"2024-11-19T17:49:27Z","published":"2024-06-12T15:16:40Z","title":"Is Programming by Example solved by LLMs?","summary":" Programming-by-Examples (PBE) aims to generate an algorithm from input-output\nexamples. Such systems are practically and theoretically important: from an\nend-user perspective, they are deployed to millions of people, and from an AI\nperspective, PBE corresponds to a very general form of few-shot inductive\ninference. Given the success of Large Language Models (LLMs) in code-generation\ntasks, we investigate here the extent to which LLMs can be said to have\n\"solved\" PBE. We experiment on classic domains such as lists and strings, and\nan uncommon graphics programming domain not well represented in typical\npretraining data. We find that pretrained models are not effective at PBE, but\nthat they can be fine-tuned for much higher performance, provided the test\nproblems are in-distribution. We analyze empirically what causes these models\nto succeed and fail, and take steps toward understanding how to achieve better\nout-of-distribution generalization. 
Collectively these results suggest that\nLLMs make strong progress toward solving the typical suite of PBE tasks,\npotentially increasing the flexibility and applicability of PBE systems, while\nalso identifying ways in which LLMs still fall short.\n","authors":["Wen-Ding Li","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2406.08316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10979v2","updated":"2024-11-19T17:46:27Z","published":"2024-11-17T06:23:46Z","title":"VidComposition: Can MLLMs Analyze Compositions in Compiled Videos?","summary":" The advancement of Multimodal Large Language Models (MLLMs) has enabled\nsignificant progress in multimodal understanding, expanding their capacity to\nanalyze video content. However, existing evaluation benchmarks for MLLMs\nprimarily focus on abstract video comprehension, lacking a detailed assessment\nof their ability to understand video compositions, the nuanced interpretation\nof how visual elements combine and interact within highly compiled video\ncontexts. We introduce VidComposition, a new benchmark specifically designed to\nevaluate the video composition understanding capabilities of MLLMs using\ncarefully curated compiled videos and cinematic-level annotations.\nVidComposition includes 982 videos with 1706 multiple-choice questions,\ncovering various compositional aspects such as camera movement, angle, shot\nsize, narrative structure, character actions and emotions, etc. Our\ncomprehensive evaluation of 33 open-source and proprietary MLLMs reveals a\nsignificant performance gap between human and model capabilities. This\nhighlights the limitations of current MLLMs in understanding complex, compiled\nvideo compositions and offers insights into areas for further improvement. 
The\nleaderboard and evaluation code are available at\nhttps://yunlong10.github.io/VidComposition/.\n","authors":["Yunlong Tang","Junjia Guo","Hang Hua","Susan Liang","Mingqian Feng","Xinyang Li","Rui Mao","Chao Huang","Jing Bi","Zeliang Zhang","Pooyan Fazli","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12685v1","updated":"2024-11-19T17:45:12Z","published":"2024-11-19T17:45:12Z","title":"Enhanced Sign Language Translation between American Sign Language (ASL)\n and Indian Sign Language (ISL) Using LLMs","summary":" We have come up with a research that hopes to provide a bridge between the\nusers of American Sign Language and the users of spoken language and Indian\nSign Language (ISL). The research enabled us to create a novel framework that\nwe have developed for Learner Systems. Leveraging art of Large models to create\nkey features including: - Real-time translation between these two sign\nlanguages in an efficient manner. Making LLM's capability available for\nseamless translations to ISL. Here is the full study showing its implementation\nin this paper. The core of the system is a sophisticated pipeline that begins\nwith reclassification and recognition of ASL gestures based on a strong Random\nForest Classifier. By recognizing the ASL, it is translated into text which can\nbe more easily processed. Highly evolved natural language NLP (Natural Language\nProcessing) techniques come in handy as they play a role in our LLM integration\nwhere you then use LLMs to be able to convert the ASL text to ISL which\nprovides you with the intent of sentence or phrase. The final step is to\nsynthesize the translated text back into ISL gestures, creating an end-to-end\ntranslation experience using RIFE-Net. This framework is tasked with key\nchallenges such as automatically dealing with gesture variability and\novercoming the linguistic differences between ASL and ISL. 
By automating the\ntranslation process, we hope to vastly improve accessibility for sign language\nusers. No longer will the communication gap between ASL and ISL create\nbarriers; this totally cool innovation aims to bring our communities closer\ntogether. And we believe, with full confidence in our framework, that we're\nable to apply the same principles across a wide variety of sign language\ndialects.\n","authors":["Malay Kumar","S. Sarvajit Visagan","Tanish Sarang Mahajan","Anisha Natarajan"],"pdf_url":"https://arxiv.org/pdf/2411.12685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03530v4","updated":"2024-11-19T17:41:00Z","published":"2023-06-06T09:26:43Z","title":"RLtools: A Fast, Portable Deep Reinforcement Learning Library for\n Continuous Control","summary":" Deep Reinforcement Learning (RL) can yield capable agents and control\npolicies in several domains but is commonly plagued by prohibitively long\ntraining times. Additionally, in the case of continuous control problems, the\napplicability of learned policies on real-world embedded devices is limited due\nto the lack of real-time guarantees and portability of existing libraries. To\naddress these challenges, we present RLtools, a dependency-free, header-only,\npure C++ library for deep supervised and reinforcement learning. Its novel\narchitecture allows RLtools to be used on a wide variety of platforms, from HPC\nclusters over workstations and laptops to smartphones, smartwatches, and\nmicrocontrollers. Specifically, due to the tight integration of the RL\nalgorithms with simulation environments, RLtools can solve popular RL problems\nup to 76 times faster than other popular RL frameworks. We also benchmark the\ninference on a diverse set of microcontrollers and show that in most cases our\noptimized implementation is by far the fastest. Finally, RLtools enables the\nfirst-ever demonstration of training a deep RL algorithm directly on a\nmicrocontroller, giving rise to the field of TinyRL. 
The source code as well as\ndocumentation and live demos are available through our project page at\nhttps://rl.tools.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2306.03530v4.pdf","comment":"Project page: https://rl.tools"},{"id":"http://arxiv.org/abs/2411.12681v1","updated":"2024-11-19T17:39:03Z","published":"2024-11-19T17:39:03Z","title":"AI Guided Early Screening of Cervical Cancer","summary":" In order to support the creation of reliable machine learning models for\nanomaly detection, this project focuses on preprocessing, enhancing, and\norganizing a medical imaging dataset. There are two classifications in the\ndataset: normal and abnormal, along with extra noise fluctuations. In order to\nimprove the photographs' quality, undesirable artifacts, including visible\nmedical equipment at the edges, were eliminated using central cropping.\nAdjusting the brightness and contrast was one of the additional preprocessing\nprocesses. Normalization was then performed to normalize the data. To make\nclassification jobs easier, the dataset was methodically handled by combining\nseveral image subsets into two primary categories: normal and pathological. To\nprovide a strong training set that adapts well to real-world situations,\nsophisticated picture preprocessing techniques were used, such as contrast\nenhancement and real-time augmentation (including rotations, zooms, and\nbrightness modifications). To guarantee efficient model evaluation, the data\nwas subsequently divided into training and testing subsets. In order to create\nprecise and effective machine learning models for medical anomaly detection,\nhigh-quality input data is ensured via this thorough approach. 
Because of the\nproject pipeline's flexible and scalable design, it can be easily integrated\nwith bigger clinical decision-support systems.\n","authors":["Dharanidharan S I","Suhitha Renuka S V","Ajishi Singh","Sheena Christabel Pravin"],"pdf_url":"https://arxiv.org/pdf/2411.12681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12678v1","updated":"2024-11-19T17:31:36Z","published":"2024-11-19T17:31:36Z","title":"Deep Learning-Driven Heat Map Analysis for Evaluating thickness of\n Wounded Skin Layers","summary":" Understanding the appropriate skin layer thickness in wounded sites is an\nimportant tool to move forward on wound healing practices and treatment\nprotocols. Methods to measure depth often are invasive and less specific. This\npaper introduces a novel method that is non-invasive with deep learning\ntechniques using classifying of skin layers that helps in measurement of wound\ndepth through heatmap analysis. A set of approximately 200 labeled images of\nskin allows five classes to be distinguished: scars, wounds, and healthy skin,\namong others. Each image has annotated key layers, namely the stratum cornetum,\nthe epidermis, and the dermis, in the software Roboflow. In the preliminary\nstage, the Heatmap generator VGG16 was used to enhance the visibility of tissue\nlayers, based upon which their annotated images were used to train ResNet18\nwith early stopping techniques. It ended up at a very high accuracy rate of\n97.67%. To do this, the comparison of the models ResNet18, VGG16, DenseNet121,\nand EfficientNet has been done where both EfficientNet and ResNet18 have\nattained accuracy rates of almost 95.35%. For further hyperparameter tuning,\nEfficientNet and ResNet18 were trained at six different learning rates to\ndetermine the best model configuration. It has been noted that the accuracy has\nhuge variations with different learning rates. In the case of EfficientNet, the\nmaximum achievable accuracy was 95.35% at the rate of 0.0001. 
The same was true\nfor ResNet18, which also attained its peak value of 95.35% at the same rate.\nThese facts indicate that the model can be applied and utilized in actual-time,\nnon-invasive wound assessment, which holds a great promise to improve clinical\ndiagnosis and treatment planning.\n","authors":["Devakumar GR","JB Kaarthikeyan","Dominic Immanuel T","Sheena Christabel Pravin"],"pdf_url":"https://arxiv.org/pdf/2411.12678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02272v3","updated":"2024-11-19T17:29:58Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC, a highly diverse dataset of abstract reasoning tasks. We train\nneural models for induction (inferring latent functions) and transduction\n(directly predicting the test output for a given test input). Our models are\ntrained on synthetic data generated by prompting LLMs to produce Python code\nspecifying a function to be inferred, plus a stochastic subroutine for\ngenerating inputs to that function. We find inductive and transductive models\nsolve very different problems, despite training on the same problems, and\ndespite sharing the same neural architecture.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. 
Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12671v1","updated":"2024-11-19T17:23:55Z","published":"2024-11-19T17:23:55Z","title":"Neurosymbolic Graph Enrichment for Grounded World Models","summary":" The development of artificial intelligence systems capable of understanding\nand reasoning about complex real-world scenarios is a significant challenge. In\nthis work we present a novel approach to enhance and exploit LLM reactive\ncapability to address complex problems and interpret deeply contextual\nreal-world meaning. We introduce a method and a tool for creating a multimodal,\nknowledge-augmented formal representation of meaning that combines the\nstrengths of large language models with structured semantic representations.\nOur method begins with an image input, utilizing state-of-the-art large\nlanguage models to generate a natural language description. This description is\nthen transformed into an Abstract Meaning Representation (AMR) graph, which is\nformalized and enriched with logical design patterns, and layered semantics\nderived from linguistic and factual knowledge bases. The resulting graph is\nthen fed back into the LLM to be extended with implicit knowledge activated by\ncomplex heuristic learning, including semantic implicatures, moral values,\nembodied cognition, and metaphorical representations. 
By bridging the gap\nbetween unstructured language models and formal semantic structures, our method\nopens new avenues for tackling intricate problems in natural language\nunderstanding and reasoning.\n","authors":["Stefano De Giorgis","Aldo Gangemi","Alessandro Russo"],"pdf_url":"https://arxiv.org/pdf/2411.12671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12663v1","updated":"2024-11-19T17:16:31Z","published":"2024-11-19T17:16:31Z","title":"PoM: Efficient Image and Video Generation with the Polynomial Mixer","summary":" Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous\nto generate high quality images and videos. However, encoding an image or a\nvideo as a sequence of patches results in costly attention patterns, as the\nrequirements both in terms of memory and compute grow quadratically. To\nalleviate this problem, we propose a drop-in replacement for MHA called the\nPolynomial Mixer (PoM) that has the benefit of encoding the entire sequence\ninto an explicit state. PoM has a linear complexity with respect to the number\nof tokens. This explicit state also allows us to generate frames in a\nsequential fashion, minimizing memory and compute requirement, while still\nbeing able to train in parallel. We show the Polynomial Mixer is a universal\nsequence-to-sequence approximator, just like regular MHA. We adapt several\nDiffusion Transformers (DiT) for generating images and videos with PoM\nreplacing MHA, and we obtain high quality samples while using less\ncomputational resources. 
The code is available at\nhttps://github.com/davidpicard/HoMM.\n","authors":["David Picard","Nicolas Dufour"],"pdf_url":"https://arxiv.org/pdf/2411.12663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12650v1","updated":"2024-11-19T16:58:15Z","published":"2024-11-19T16:58:15Z","title":"Optimizing Airline Reservation Systems with Edge-Enabled Microservices:\n A Framework for Real-Time Data Processing and Enhanced User Responsiveness","summary":" The growing complexity of the operations of airline reservations requires a\nsmart solution for the adoption of novel approaches to the development of\nquick, efficient, and adaptive reservation systems. This paper outlines in\ndetail a conceptual framework for the implementation of edge computing\nmicroservices in order to address the shortcomings of traditional centralized\narchitectures. Specifically, as edge computing allows for certain activities\nsuch as seat inventory checks, booking processes and even confirmation to be\ndone nearer to the user, thus lessening the overall response time and improving\nthe performance of the system. In addition, the framework value should include\nachieving the high performance of the system such as low latency, high\nthroughput and higher user experience. The major design components include\ndeployed distributed computing microservices orchestrated by Kubernetes,\nreal-time message processing system with Kafka and its elastic scaling. Other\noperational components include Prometheus and Grafana, which are used to\nmonitor and manage resources, ensuring that all operational processes are\noptimized. 
Although this research focuses on a design and theoretical scheming\nof the framework, its use is foreseen to be more advantageous in facilitating a\ntransform in the provision of services in the airline industry by improving\ncustomers' satisfaction, providing infrastructure which is cheap to install and\nefficiently supporting technology changes such as artificial intelligence and\ninternet of things embedded systems. This research addresses the increasing\ndemand for new technologies with modern well-distributed and real-time-centric\nsystems and also provides a basis for future case implementation and testing.\nAs such, the proposed architecture offers a market-ready, extensible solution\nto the problems posed by existing airline reservation systems .\n","authors":["Biman Barua","M. Shamim Kaiser"],"pdf_url":"https://arxiv.org/pdf/2411.12650v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.12644v1","updated":"2024-11-19T16:54:45Z","published":"2024-11-19T16:54:45Z","title":"CodeXEmbed: A Generalist Embedding Model Family for Multiligual and\n Multi-task Code Retrieval","summary":" Despite the success of text retrieval in many NLP tasks, code retrieval\nremains a largely underexplored area. Most text retrieval systems are tailored\nfor natural language queries, often neglecting the specific challenges of\nretrieving code. This gap leaves existing models unable to effectively capture\nthe diversity of programming languages and tasks across different domains,\nhighlighting the need for more focused research in code retrieval. To address\nthis, we introduce CodeXEmbed, a family of large-scale code embedding models\nranging from 400M to 7B parameters. Our novel training pipeline unifies\nmultiple programming languages and transforms various code-related tasks into a\ncommon retrieval framework, enhancing model generalizability and retrieval\nperformance. 
Our 7B model sets a new state-of-the-art (SOTA) in code retrieval,\noutperforming the previous leading model, Voyage-Code, by over 20% on CoIR\nbenchmark. In addition to excelling in code retrieval, our models demonstrate\ncompetitive performance on the widely adopted BeIR text retrieval benchmark,\noffering versatility across domains. Experimental results demonstrate that\nimproving retrieval performance significantly enhances end-to-end\nRetrieval-Augmented Generation (RAG) performance for code-related tasks.\n","authors":["Ye Liu","Rui Meng","Shafiq Jot","Silvio Savarese","Caiming Xiong","Yingbo Zhou","Semih Yavuz"],"pdf_url":"https://arxiv.org/pdf/2411.12644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12643v1","updated":"2024-11-19T16:54:30Z","published":"2024-11-19T16:54:30Z","title":"DLBacktrace: A Model Agnostic Explainability for any Deep Learning\n Models","summary":" The rapid advancement of artificial intelligence has led to increasingly\nsophisticated deep learning models, which frequently operate as opaque 'black\nboxes' with limited transparency in their decision-making processes. This lack\nof interpretability presents considerable challenges, especially in high-stakes\napplications where understanding the rationale behind a model's outputs is as\nessential as the outputs themselves. This study addresses the pressing need for\ninterpretability in AI systems, emphasizing its role in fostering trust,\nensuring accountability, and promoting responsible deployment in\nmission-critical fields. 
To address the interpretability challenge in deep\nlearning, we introduce DLBacktrace, an innovative technique developed by the\nAryaXAI team to illuminate model decisions across a wide array of domains,\nincluding simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks\n(CNNs), Large Language Models (LLMs), Computer Vision Models, and more.\n We provide a comprehensive overview of the DLBacktrace algorithm and present\nbenchmarking results, comparing its performance against established\ninterpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients,\nSmoothGrad, and Attention Rollout, using diverse task-based metrics. The\nproposed DLBacktrace technique is compatible with various model architectures\nbuilt in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP\narchitectures such as BERT and LSTMs, computer vision models like ResNet and\nU-Net, as well as custom deep neural network (DNN) models for tabular data.\nThis flexibility underscores DLBacktrace's adaptability and effectiveness in\nenhancing model transparency across a broad spectrum of applications. The\nlibrary is open-sourced and available at https://github.com/AryaXAI/DLBacktrace .\n","authors":["Vinay Kumar Sankarapu","Chintan Chitroda","Yashwardhan Rathore","Neeraj Kumar Singh","Pratinav Seth"],"pdf_url":"https://arxiv.org/pdf/2411.12643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03320v3","updated":"2024-11-19T16:49:12Z","published":"2024-10-20T18:35:56Z","title":"log-RRIM: Yield Prediction via Local-to-global Reaction Representation\n Learning and Interaction Modeling","summary":" Accurate prediction of chemical reaction yields is crucial for optimizing\norganic synthesis, potentially reducing time and resources spent on\nexperimentation. With the rise of artificial intelligence (AI), there is\ngrowing interest in leveraging AI-based methods to accelerate yield predictions\nwithout conducting in vitro experiments. 
We present log-RRIM, an innovative\ngraph transformer-based framework designed for predicting chemical reaction\nyields. Our approach implements a unique local-to-global reaction\nrepresentation learning strategy. This approach initially captures detailed\nmolecule-level information and then models and aggregates intermolecular\ninteractions, ensuring that the impact of varying-sizes molecular fragments on\nyield is accurately accounted for. Another key feature of log-RRIM is its\nintegration of a cross-attention mechanism that focuses on the interplay\nbetween reagents and reaction centers. This design reflects a fundamental\nprinciple in chemical reactions: the crucial role of reagents in influencing\nbond-breaking and formation processes, which ultimately affect reaction yields.\nlog-RRIM outperforms existing methods in our experiments, especially for medium\nto high-yielding reactions, proving its reliability as a predictor. Its\nadvanced modeling of reactant-reagent interactions and sensitivity to small\nmolecular fragments make it a valuable tool for reaction planning and\noptimization in chemical synthesis. The data and codes of log-RRIM are\naccessible through https://github.com/ninglab/Yield_log_RRIM.\n","authors":["Xiao Hu","Ziqi Chen","Bo Peng","Daniel Adu-Ampratwum","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2411.03320v3.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.12633v1","updated":"2024-11-19T16:45:52Z","published":"2024-11-19T16:45:52Z","title":"Instant Policy: In-Context Imitation Learning via Graph Diffusion","summary":" Following the impressive capabilities of in-context learning with large\ntransformers, In-Context Imitation Learning (ICIL) is a promising opportunity\nfor robotics. We introduce Instant Policy, which learns new tasks instantly\n(without further training) from just one or two demonstrations, achieving ICIL\nthrough two key components. 
First, we introduce inductive biases through a\ngraph representation and model ICIL as a graph generation problem with a\nlearned diffusion process, enabling structured reasoning over demonstrations,\nobservations, and actions. Second, we show that such a model can be trained\nusing pseudo-demonstrations - arbitrary trajectories generated in simulation -\nas a virtually infinite pool of training data. Simulated and real experiments\nshow that Instant Policy enables rapid learning of various everyday robot\ntasks. We also show how it can serve as a foundation for cross-embodiment and\nzero-shot transfer to language-defined tasks. Code and videos are available at\nhttps://www.robot-learning.uk/instant-policy.\n","authors":["Vitalis Vosylius","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2411.12633v1.pdf","comment":"Code and videos are available on our project webpage at\n https://www.robot-learning.uk/instant-policy"},{"id":"http://arxiv.org/abs/2411.12629v1","updated":"2024-11-19T16:40:17Z","published":"2024-11-19T16:40:17Z","title":"Estimating Dark Matter Halo Masses in Simulated Galaxy Clusters with\n Graph Neural Networks","summary":" Galaxies grow and evolve in dark matter halos. Because dark matter is not\nvisible, galaxies' halo masses ($\\rm{M}_{\\rm{halo}}$) must be inferred\nindirectly. We present a graph neural network (GNN) model for predicting\n$\\rm{M}_{\\rm{halo}}$ from stellar mass ($\\rm{M}_{*}$) in simulated galaxy\nclusters using data from the IllustrisTNG simulation suite. Unlike traditional\nmachine learning models like random forests, our GNN captures the\ninformation-rich substructure of galaxy clusters by using spatial and kinematic\nrelationships between galaxy neighbour. A GNN model trained on the TNG-Cluster\ndataset and independently tested on the TNG300 simulation achieves superior\npredictive performance compared to other baseline models we tested. 
Future work\nwill extend this approach to different simulations and real observational\ndatasets to further validate the GNN model's ability to generalise.\n","authors":["Nikhil Garuda","John F. Wu","Dylan Nelson","Annalisa Pillepich"],"pdf_url":"https://arxiv.org/pdf/2411.12629v1.pdf","comment":"9 pages, 4 figures, accepted at the NeurIPS ML4PS 2024 workshop"},{"id":"http://arxiv.org/abs/2211.13723v3","updated":"2024-11-19T16:17:58Z","published":"2022-11-24T17:19:30Z","title":"Improving Multi-task Learning via Seeking Task-based Flat Regions","summary":" Multi-Task Learning (MTL) is a widely-used and powerful learning paradigm for\ntraining deep neural networks that allows learning more than one objective by a\nsingle backbone. Compared to training tasks separately, MTL significantly\nreduces computational costs, improves data efficiency, and potentially enhances\nmodel performance by leveraging knowledge across tasks. Hence, it has been\nadopted in a variety of applications, ranging from computer vision to natural\nlanguage processing and speech recognition. Among them, there is an emerging\nline of work in MTL that focuses on manipulating the task gradient to derive an\nultimate gradient descent direction to benefit all tasks. Despite achieving\nimpressive results on many benchmarks, directly applying these approaches\nwithout using appropriate regularization techniques might lead to suboptimal\nsolutions on real-world problems. In particular, standard training that\nminimizes the empirical loss on the training data can easily suffer from\noverfitting to low-resource tasks or be spoiled by noisy-labeled ones, which\ncan cause negative transfer between tasks and overall performance drop. To\nalleviate such problems, we propose to leverage a recently introduced training\nmethod, named Sharpness-aware Minimization, which can enhance model\ngeneralization ability on single-task learning. 
Accordingly, we present a novel\nMTL training methodology, encouraging the model to find task-based flat minima\nfor coherently improving its generalization capability on all tasks. Finally,\nwe conduct comprehensive experiments on a variety of applications to\ndemonstrate the merit of our proposed approach to existing gradient-based MTL\nmethods, as suggested by our developed theory.\n","authors":["Hoang Phan","Lam Tran","Quyen Tran","Ngoc N. Tran","Tuan Truong","Nhat Ho","Dinh Phung","Trung Le"],"pdf_url":"https://arxiv.org/pdf/2211.13723v3.pdf","comment":"35 pages, 17 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.12603v1","updated":"2024-11-19T16:06:32Z","published":"2024-11-19T16:06:32Z","title":"STREAM: A Universal State-Space Model for Sparse Geometric Data","summary":" Handling sparse and unstructured geometric data, such as point clouds or\nevent-based vision, is a pressing challenge in the field of machine vision.\nRecently, sequence models such as Transformers and state-space models entered\nthe domain of geometric data. These methods require specialized preprocessing\nto create a sequential view of a set of points. Furthermore, prior works\ninvolving sequence models iterate geometric data with either uniform or learned\nstep sizes, implicitly relying on the model to infer the underlying geometric\nstructure. In this work, we propose to encode geometric structure explicitly\ninto the parameterization of a state-space model. State-space models are based\non linear dynamics governed by a one-dimensional variable such as time or a\nspatial coordinate. We exploit this dynamic variable to inject relative\ndifferences of coordinates into the step size of the state-space model. The\nresulting geometric operation computes interactions between all pairs of N\npoints in O(N) steps. Our model deploys the Mamba selective state-space model\nwith a modified CUDA kernel to efficiently map sparse geometric data to modern\nhardware. 
The resulting sequence model, which we call STREAM, achieves\ncompetitive results on a range of benchmarks from point-cloud classification to\nevent-based vision and audio classification. STREAM demonstrates a powerful\ninductive bias for sparse geometric data by improving the PointMamba baseline\nwhen trained from scratch on the ModelNet40 and ScanObjectNN point cloud\nanalysis datasets. It further achieves, for the first time, 100% test accuracy\non all 11 classes of the DVS128 Gestures dataset.\n","authors":["Mark Schöne","Yash Bhisikar","Karan Bania","Khaleelulla Khan Nazeer","Christian Mayr","Anand Subramoney","David Kappel"],"pdf_url":"https://arxiv.org/pdf/2411.12603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12600v1","updated":"2024-11-19T16:04:31Z","published":"2024-11-19T16:04:31Z","title":"Provable unlearning in topic modeling and downstream tasks","summary":" Machine unlearning algorithms are increasingly important as legal concerns\narise around the provenance of training data, but verifying the success of\nunlearning is often difficult. Provable guarantees for unlearning are often\nlimited to supervised learning settings. In this paper, we provide the first\ntheoretical guarantees for unlearning in the pre-training and fine-tuning\nparadigm by studying topic models, simple bag-of-words language models that can\nbe adapted to solve downstream tasks like retrieval and classification. First,\nwe design a provably effective unlearning algorithm for topic models that\nincurs a computational overhead independent of the size of the original\ndataset. Our analysis additionally quantifies the deletion capacity of the\nmodel -- i.e., the number of examples that can be unlearned without incurring a\nsignificant cost in model performance. Finally, we formally extend our analyses\nto account for adaptation to a given downstream task. 
In particular, we design\nan efficient algorithm to perform unlearning after fine-tuning the topic model\nvia a linear head. Notably, we show that it is easier to unlearn pre-training\ndata from models that have been fine-tuned to a particular task, and one can\nunlearn this data without modifying the base model.\n","authors":["Stanley Wei","Sadhika Malladi","Sanjeev Arora","Amartya Sanyal"],"pdf_url":"https://arxiv.org/pdf/2411.12600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12587v1","updated":"2024-11-19T15:55:56Z","published":"2024-11-19T15:55:56Z","title":"Whisper Finetuning on Nepali Language","summary":" Despite the growing advancements in Automatic Speech Recognition (ASR)\nmodels, the development of robust models for underrepresented languages, such\nas Nepali, remains a challenge. This research focuses on making an exhaustive\nand generalized dataset followed by fine-tuning OpenAI's Whisper models of\ndifferent sizes to improve transcription (speech-to-text) accuracy for the\nNepali language. We leverage publicly available ASR datasets and self-recorded\ncustom datasets with a diverse range of accents, dialects, and speaking styles\nfurther enriched through augmentation. Our experimental results demonstrate\nthat fine-tuning Whisper models on our curated custom dataset substantially\nreduces the Word Error Rate (WER) across all model sizes attributed to larger\ndata variations in terms of speaker's age, gender, and sentiment, acoustic\nenvironment, dialect, denser audio segments (15-30 seconds) that are more\ncompatible with Whisper's input, and manual curation of audios and\ntranscriptions. Notably, our approach outperforms Whisper's baseline models\ntrained on Fleur's dataset, achieving WER reductions of up to 36.2% on the\nsmall and 23.8% on medium models. Furthermore, we show that data augmentation\nplays a significant role in enhancing model robustness. 
Our approach underlines\nthe importance of dataset quality, variation, and augmentation in the\nadaptation of state-of-the-art models to underrepresented languages for\ndeveloping accurate ASR systems.\n","authors":["Sanjay Rijal","Shital Adhikari","Manish Dahal","Manish Awale","Vaghawan Ojha"],"pdf_url":"https://arxiv.org/pdf/2411.12587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06750v2","updated":"2024-11-19T15:44:30Z","published":"2024-09-10T13:39:29Z","title":"Can Agents Spontaneously Form a Society? Introducing a Novel\n Architecture for Generative Multi-Agents to Elicit Social Emergence","summary":" Generative agents have demonstrated impressive capabilities in specific\ntasks, but most of these frameworks focus on independent tasks and lack\nattention to social interactions. We introduce a generative agent architecture\ncalled ITCMA-S, which includes a basic framework for individual agents and a\nframework called LTRHA that supports social interactions among multi-agents.\nThis architecture enables agents to identify and filter out behaviors that are\ndetrimental to social interactions, guiding them to choose more favorable\nactions. We designed a sandbox environment to simulate the natural evolution of\nsocial relationships among multiple identity-less agents for experimental\nevaluation. The results showed that ITCMA-S performed well on multiple\nevaluation indicators, demonstrating its ability to actively explore the\nenvironment, recognize new agents, and acquire new information through\ncontinuous actions and dialogue. Observations show that as agents establish\nconnections with each other, they spontaneously form cliques with internal\nhierarchies around a selected leader and organize collective activities.\n","authors":["H. Zhang","J. Yin","M. Jiang","C. 
Su"],"pdf_url":"https://arxiv.org/pdf/2409.06750v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.12571v1","updated":"2024-11-19T15:39:51Z","published":"2024-11-19T15:39:51Z","title":"Large Language Models for Combinatorial Optimization of Design Structure\n Matrix","summary":" Combinatorial optimization (CO) is essential for improving efficiency and\nperformance in engineering applications. As complexity increases with larger\nproblem sizes and more intricate dependencies, identifying the optimal solution\nbecome challenging. When it comes to real-world engineering problems,\nalgorithms based on pure mathematical reasoning are limited and incapable to\ncapture the contextual nuances necessary for optimization. This study explores\nthe potential of Large Language Models (LLMs) in solving engineering CO\nproblems by leveraging their reasoning power and contextual knowledge. We\npropose a novel LLM-based framework that integrates network topology and domain\nknowledge to optimize the sequencing of Design Structure Matrix (DSM)-a common\nCO problem. Our experiments on various DSM cases demonstrate that the proposed\nmethod achieves faster convergence and higher solution quality than benchmark\nmethods. Moreover, results show that incorporating contextual domain knowledge\nsignificantly improves performance despite the choice of LLMs. These findings\nhighlight the potential of LLMs in tackling complex real-world CO problems by\ncombining semantic and mathematical reasoning. 
This approach paves the way for\na new paradigm in in real-world combinatorial optimization.\n","authors":["Shuo Jiang","Min Xie","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2411.12571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02387v2","updated":"2024-11-19T15:39:41Z","published":"2024-10-03T11:07:43Z","title":"BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and\n Fine-Tuning","summary":" In this work, we present BiSSL, a first-of-its-kind training framework that\nintroduces bilevel optimization to enhance the alignment between the pretext\npre-training and downstream fine-tuning stages in self-supervised learning.\nBiSSL formulates the pretext and downstream task objectives as the lower- and\nupper-level objectives in a bilevel optimization problem and serves as an\nintermediate training stage within the self-supervised learning pipeline. By\nmore explicitly modeling the interdependence of these training stages, BiSSL\nfacilitates enhanced information sharing between them, ultimately leading to a\nbackbone parameter initialization that is better suited for the downstream\ntask. We propose a training algorithm that alternates between optimizing the\ntwo objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with\nSimCLR on the STL10 dataset, we demonstrate that our proposed framework\nconsistently achieves improved or competitive classification accuracies across\nvarious downstream image classification datasets compared to the conventional\nself-supervised learning pipeline. 
Qualitative analyses of the backbone\nfeatures further suggest that BiSSL enhances the alignment of downstream\nfeatures in the backbone prior to fine-tuning.\n","authors":["Gustav Wagner Zakarias","Lars Kai Hansen","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2410.02387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17213v5","updated":"2024-11-19T15:37:57Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by deliberative democracy, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. 
The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12560v1","updated":"2024-11-19T15:23:59Z","published":"2024-11-19T15:23:59Z","title":"Topological Symmetry Enhanced Graph Convolution for Skeleton-Based\n Action Recognition","summary":" Skeleton-based action recognition has achieved remarkable performance with\nthe development of graph convolutional networks (GCNs). However, most of these\nmethods tend to construct complex topology learning mechanisms while neglecting\nthe inherent symmetry of the human body. Additionally, the use of temporal\nconvolutions with certain fixed receptive fields limits their capacity to\neffectively capture dependencies in time sequences. To address the issues, we\n(1) propose a novel Topological Symmetry Enhanced Graph Convolution (TSE-GC) to\nenable distinct topology learning across different channel partitions while\nincorporating topological symmetry awareness and (2) construct a Multi-Branch\nDeformable Temporal Convolution (MBDTC) for skeleton-based action recognition.\nThe proposed TSE-GC emphasizes the inherent symmetry of the human body while\nenabling efficient learning of dynamic topologies. Meanwhile, the design of\nMBDTC introduces the concept of deformable modeling, leading to more flexible\nreceptive fields and stronger modeling capacity of temporal dependencies.\nCombining TSE-GC with MBDTC, our final model, TSE-GCN, achieves competitive\nperformance with fewer parameters compared with state-of-the-art methods on\nthree large datasets, NTU RGB+D, NTU RGB+D 120, and NW-UCLA. 
On the\ncross-subject and cross-set evaluations of NTU RGB+D 120, the accuracies of our\nmodel reach 90.0\\% and 91.1\\%, with 1.1M parameters and 1.38 GFLOPS for one\nstream.\n","authors":["Zeyu Liang","Hailun Xia","Naichuan Zheng","Huan Xu"],"pdf_url":"https://arxiv.org/pdf/2411.12560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20212v2","updated":"2024-11-19T15:23:29Z","published":"2024-03-29T14:47:54Z","title":"On Size and Hardness Generalization in Unsupervised Learning for the\n Travelling Salesman Problem","summary":" We study the generalization capability of Unsupervised Learning in solving\nthe Travelling Salesman Problem (TSP). We use a Graph Neural Network (GNN)\ntrained with a surrogate loss function to generate an embedding for each node.\nWe use these embeddings to construct a heat map that indicates the likelihood\nof each edge being part of the optimal route. We then apply local search to\ngenerate our final predictions. Our investigation explores how different\ntraining instance sizes, embedding dimensions, and distributions influence the\noutcomes of Unsupervised Learning methods. Our results show that training with\nlarger instance sizes and increasing embedding dimensions can build a more\neffective representation, enhancing the model's ability to solve TSP.\nFurthermore, in evaluating generalization across different distributions, we\nfirst determine the hardness of various distributions and explore how different\nhardnesses affect the final results. Our findings suggest that models trained\non harder instances exhibit better generalization capabilities, highlighting\nthe importance of selecting appropriate training instances in solving TSP using\nUnsupervised Learning.\n","authors":["Yimeng Min","Carla P. 
Gomes"],"pdf_url":"https://arxiv.org/pdf/2403.20212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12966v4","updated":"2024-11-19T15:22:16Z","published":"2024-04-19T15:53:27Z","title":"Look Before You Decide: Prompting Active Deduction of MLLMs for\n Assumptive Reasoning","summary":" Recently, Multimodal Large Language Models (MLLMs) have achieved significant\nsuccess across multiple disciplines due to their exceptional\ninstruction-following capabilities and extensive world knowledge. However,\nwhether these MLLMs possess human-like compositional reasoning abilities\nremains an open problem. To unveil their reasoning behaviors, we first curate a\n\\textbf{M}ultimodal \\textbf{A}ssumptive \\textbf{R}ea\\textbf{s}oning Benchmark\n(MARS-Bench) in this paper. Interestingly, we find that most prevalent MLLMs\ncan be easily fooled by the introduction of a presupposition into the question,\nwhereas such presuppositions appear naive to human reasoning. Besides, we also\npropose a simple yet effective method, Active Deduction (AD), to encourage the\nmodel to actively perform composite deduction before reaching a final decision.\nEquipped with the proposed AD method, a MLLM demonstrates significant\nimprovements in assumptive reasoning abilities without compromising its\ngeneral-purpose question-answering performance. 
We also provide extensive\nevaluations of both open-source and private MLLMs on MARS-Bench, along with\nexperimental analyses of the AD method.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen","Na Zhao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.12966v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12558v1","updated":"2024-11-19T15:18:50Z","published":"2024-11-19T15:18:50Z","title":"Recall and Refine: A Simple but Effective Source-free Open-set Domain\n Adaptation Framework","summary":" Open-set Domain Adaptation (OSDA) aims to adapt a model from a labeled source\ndomain to an unlabeled target domain, where novel classes - also referred to as\ntarget-private unknown classes - are present. Source-free Open-set Domain\nAdaptation (SF-OSDA) methods address OSDA without accessing labeled source\ndata, making them particularly relevant under privacy constraints. However,\nSF-OSDA presents significant challenges due to distribution shifts and the\nintroduction of novel classes. Existing SF-OSDA methods typically rely on\nthresholding the prediction entropy of a sample to identify it as either a\nknown or unknown class but fail to explicitly learn discriminative features for\nthe target-private unknown classes. We propose Recall and Refine (RRDA), a\nnovel SF-OSDA framework designed to address these limitations by explicitly\nlearning features for target-private unknown classes. RRDA employs a two-step\nprocess. First, we enhance the model's capacity to recognize unknown classes by\ntraining a target classifier with an additional decision boundary, guided by\nsynthetic samples generated from target domain features. This enables the\nclassifier to effectively separate known and unknown classes. In the second\nstep, we adapt the entire model to the target domain, addressing both domain\nshifts and improving generalization to unknown classes. 
Any off-the-shelf\nsource-free domain adaptation method (e.g., SHOT, AaD) can be seamlessly\nintegrated into our framework at this stage. Extensive experiments on three\nbenchmark datasets demonstrate that RRDA significantly outperforms existing\nSF-OSDA and OSDA methods.\n","authors":["Ismail Nejjar","Hao Dong","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2411.12558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17994v4","updated":"2024-11-19T14:51:14Z","published":"2024-09-26T16:06:38Z","title":"CRoP: Context-wise Robust Static Human-Sensing Personalization","summary":" The advancement in deep learning and internet-of-things have led to diverse\nhuman sensing applications. However, distinct patterns in human sensing,\ninfluenced by various factors or contexts, challenge the generic neural network\nmodel's performance due to natural distribution shifts. To address this,\npersonalization tailors models to individual users. Yet most personalization\nstudies overlook intra-user heterogeneity across contexts in sensory data,\nlimiting intra-user generalizability. This limitation is especially critical in\nclinical applications, where limited data availability hampers both\ngeneralizability and personalization. Notably, intra-user sensing attributes\nare expected to change due to external factors such as treatment progression,\nfurther complicating the challenges. To address the intra-user generalization\nchallenge, this work introduces CRoP, a novel static personalization approach.\nCRoP leverages off-the-shelf pre-trained models as generic starting points and\ncaptures user-specific traits through adaptive pruning on a minimal sub-network\nwhile preserving generic knowledge in the remaining parameters. CRoP\ndemonstrates superior personalization effectiveness and intra-user robustness\nacross four human-sensing datasets, including two from real-world health\ndomains, underscoring its practical and social impact. 
Additionally, to support\nCRoP's generalization ability and design choices, we provide empirical\njustification through gradient inner product analysis, ablation studies, and\ncomparisons against state-of-the-art baselines.\n","authors":["Sawinder Kaur","Avery Gump","Jingyu Xin","Yi Xiao","Harshit Sharma","Nina R Benway","Jonathan L Preston","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2409.17994v4.pdf","comment":"33 pages, 6 figues and 12 tables"},{"id":"http://arxiv.org/abs/2411.08992v2","updated":"2024-11-19T14:51:07Z","published":"2024-11-13T19:33:08Z","title":"IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis","summary":" We present a new annotated microscopic cellular image dataset to improve the\neffectiveness of machine learning methods for cellular image analysis. Cell\ncounting is an important step in cell analysis. Typically, domain experts\nmanually count cells in a microscopic image. Automated cell counting can\npotentially eliminate this tedious, time-consuming process. However, a good,\nlabeled dataset is required for training an accurate machine learning model.\nOur dataset includes microscopic images of cells, and for each image, the cell\ncount and the location of individual cells. The data were collected as part of\nan ongoing study investigating the potential of electrical stimulation to\nmodulate stem cell differentiation and possible applications for neural repair.\nCompared to existing publicly available datasets, our dataset has more images\nof cells stained with more variety of antibodies (protein components of immune\nresponses against invaders) typically used for cell analysis. The experimental\nresults on this dataset indicate that none of the five existing models under\nthis study are able to achieve sufficiently accurate count to replace the\nmanual methods. The dataset is available at\nhttps://figshare.com/articles/dataset/Dataset/21970604.\n","authors":["Abdurahman Ali Mohammed","Catherine Fonder","Donald S. 
Sakaguchi","Wallapak Tavanapong","Surya K. Mallapragada","Azeez Idris"],"pdf_url":"https://arxiv.org/pdf/2411.08992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12539v1","updated":"2024-11-19T14:39:29Z","published":"2024-11-19T14:39:29Z","title":"Predicting Customer Satisfaction by Replicating the Survey Response\n Distribution","summary":" For many call centers, customer satisfaction (CSAT) is a key performance\nindicator (KPI). However, only a fraction of customers take the CSAT survey\nafter the call, leading to a biased and inaccurate average CSAT value, and\nmissed opportunities for coaching, follow-up, and rectification. Therefore,\ncall centers can benefit from a model predicting customer satisfaction on calls\nwhere the customer did not complete the survey. Given that CSAT is a closely\nmonitored KPI, it is critical to minimize any bias in the average predicted\nCSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT\n(pCSAT) scores accurately replicate the distribution of survey CSAT responses\nfor every call center with sufficient data in a live production environment.\nThe method can be applied to many multiclass classification problems to improve\nthe class balance and minimize its changes upon model updates.\n","authors":["Etienne Manderscheid","Matthias Lee"],"pdf_url":"https://arxiv.org/pdf/2411.12539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00028v2","updated":"2024-11-19T14:29:32Z","published":"2024-10-29T04:03:15Z","title":"Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction\n in LBSN","summary":" The fast development of location-based social networks (LBSNs) has led to\nsignificant changes in society, resulting in popular studies of using LBSN data\nfor socioeconomic prediction, e.g., regional population and commercial activity\nestimation. 
Existing studies design various graphs to model heterogeneous LBSN\ndata, and further apply graph representation learning methods for socioeconomic\nprediction. However, these approaches heavily rely on heuristic ideas and\nexpertise to extract task-relevant knowledge from diverse data, which may not\nbe optimal for specific tasks. Additionally, they tend to overlook the inherent\nrelationships between different indicators, limiting the prediction accuracy.\nMotivated by the remarkable abilities of large language models (LLMs) in\ncommonsense reasoning, embedding, and multi-agent collaboration, in this work,\nwe synergize LLM agents and knowledge graph for socioeconomic prediction. We\nfirst construct a location-based knowledge graph (LBKG) to integrate\nmulti-sourced LBSN data. Then we leverage the reasoning power of LLM agent to\nidentify relevant meta-paths in the LBKG for each type of socioeconomic\nprediction task, and design a semantic-guided attention module for knowledge\nfusion with meta-paths. Moreover, we introduce a cross-task communication\nmechanism to further enhance performance by enabling knowledge sharing across\ntasks at both LLM agent and KG levels. On the one hand, the LLM agents for\ndifferent tasks collaborate to generate more diverse and comprehensive\nmeta-paths. On the other hand, the embeddings from different tasks are\nadaptively merged for better socioeconomic prediction. 
Experiments on two\ndatasets demonstrate the effectiveness of the synergistic design between LLM\nand KG, providing insights for information sharing across socioeconomic\nprediction tasks.\n","authors":["Zhilun Zhou","Jingyang Fan","Yu Liu","Fengli Xu","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.00028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12525v1","updated":"2024-11-19T14:18:02Z","published":"2024-11-19T14:18:02Z","title":"Rethinking Top Probability from Multi-view for Distracted Driver\n Behaviour Localization","summary":" Naturalistic driving action localization task aims to recognize and\ncomprehend human behaviors and actions from video data captured during\nreal-world driving scenarios. Previous studies have shown great action\nlocalization performance by applying a recognition model followed by\nprobability-based post-processing. Nevertheless, the probabilities provided by\nthe recognition model frequently contain confused information causing challenge\nfor post-processing. In this work, we adopt an action recognition model based\non self-supervise learning to detect distracted activities and give potential\naction probabilities. Subsequently, a constraint ensemble strategy takes\nadvantages of multi-camera views to provide robust predictions. Finally, we\nintroduce a conditional post-processing operation to locate distracted\nbehaviours and action temporal boundaries precisely. 
Experimenting on test set\nA2, our method obtains the sixth position on the public leaderboard of track 3\nof the 2024 AI City Challenge.\n","authors":["Quang Vinh Nguyen","Vo Hoang Thanh Son","Chau Truong Vinh Hoang","Duc Duy Nguyen","Nhat Huy Nguyen Minh","Soo-Hyung Kim"],"pdf_url":"https://arxiv.org/pdf/2411.12525v1.pdf","comment":"Computer Vision and Pattern Recognition Workshop 2024"},{"id":"http://arxiv.org/abs/2411.12517v1","updated":"2024-11-19T13:59:16Z","published":"2024-11-19T13:59:16Z","title":"The Hermeneutic Turn of AI: Is the Machine Capable of Interpreting?","summary":" This article aims to demonstrate how the approach to computing is being\ndisrupted by deep learning (artificial neural networks), not only in terms of\ntechniques but also in our interactions with machines. It also addresses the\nphilosophical tradition of hermeneutics (Don Ihde, Wilhelm Dilthey) to\nhighlight a parallel with this movement and to demystify the idea of human-like\nAI.\n","authors":["Remy Demichelis"],"pdf_url":"https://arxiv.org/pdf/2411.12517v1.pdf","comment":"4 pages."},{"id":"http://arxiv.org/abs/2408.01728v2","updated":"2024-11-19T13:42:21Z","published":"2024-08-03T10:01:29Z","title":"Survey on Emotion Recognition through Posture Detection and the\n possibility of its application in Virtual Reality","summary":" A survey is presented focused on using pose estimation techniques in\nEmotional recognition using various technologies normal cameras, and depth\ncameras for real-time, and the potential use of VR and inputs including images,\nvideos, and 3-dimensional poses described in vector space. We discussed 19\nresearch papers collected from selected journals and databases highlighting\ntheir methodology, classification algorithm, and the used datasets that relate\nto emotion recognition and pose estimation. A benchmark has been made according\nto their accuracy as it was the most common performance measurement metric\nused. 
We concluded that the multimodal Approaches overall made the best\naccuracy and then we mentioned futuristic concerns that can improve the\ndevelopment of this research topic.\n","authors":["Leina Elansary","Zaki Taha","Walaa Gad"],"pdf_url":"https://arxiv.org/pdf/2408.01728v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12502v1","updated":"2024-11-19T13:40:49Z","published":"2024-11-19T13:40:49Z","title":"Transformer Neural Processes -- Kernel Regression","summary":" Stochastic processes model various natural phenomena from disease\ntransmission to stock prices, but simulating and quantifying their uncertainty\ncan be computationally challenging. For example, modeling a Gaussian Process\nwith standard statistical methods incurs an $\\mathcal{O}(n^3)$ penalty, and\neven using state-of-the-art Neural Processes (NPs) incurs an $\\mathcal{O}(n^2)$\npenalty due to the attention mechanism. We introduce the Transformer Neural\nProcess - Kernel Regression (TNP-KR), a new architecture that incorporates a\nnovel transformer block we call a Kernel Regression Block (KRBlock), which\nreduces the computational complexity of attention in transformer-based Neural\nProcesses (TNPs) from $\\mathcal{O}((n_C+n_T)^2)$ to $O(n_C^2+n_Cn_T)$ by\neliminating masked computations, where $n_C$ is the number of context, and\n$n_T$ is the number of test points, respectively, and a fast attention variant\nthat further reduces all attention calculations to $\\mathcal{O}(n_C)$ in space\nand time complexity. 
In benchmarks spanning such tasks as meta-regression,\nBayesian optimization, and image completion, we demonstrate that the full\nvariant matches the performance of state-of-the-art methods while training\nfaster and scaling two orders of magnitude higher in number of test points, and\nthe fast variant nearly matches that performance while scaling to millions of\nboth test and context points on consumer hardware.\n","authors":["Daniel Jenson","Jhonathan Navott","Mengyan Zhang","Makkunda Sharma","Elizaveta Semenova","Seth Flaxman"],"pdf_url":"https://arxiv.org/pdf/2411.12502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12498v1","updated":"2024-11-19T13:31:53Z","published":"2024-11-19T13:31:53Z","title":"Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic\n Corpus","summary":" Large language models (LLMs) are capable of solving a wide range of tasks,\nyet they have struggled with reasoning. To address this, we propose\n$\\textbf{Additional Logic Training (ALT)}$, which aims to enhance LLMs'\nreasoning capabilities by program-generated logical reasoning samples. We first\nestablish principles for designing high-quality samples by integrating symbolic\nlogic theory and previous empirical insights. Then, based on these principles,\nwe construct a synthetic corpus named $\\textbf{Formal Logic Deduction Diverse}$\n($\\textbf{FLD}$$^{\\times 2}$), comprising numerous samples of multi-step\ndeduction with unknown facts, diverse reasoning rules, diverse linguistic\nexpressions, and challenging distractors. Finally, we empirically show that ALT\non FLD$^{\\times2}$ substantially enhances the reasoning capabilities of\nstate-of-the-art LLMs, including LLaMA-3.1-70B. 
Improvements include gains of\nup to 30 points on logical reasoning benchmarks, up to 10 points on math and\ncoding benchmarks, and 5 points on the benchmark suite BBH.\n","authors":["Terufumi Morishita","Gaku Morio","Atsuki Yamaguchi","Yasuhiro Sogawa"],"pdf_url":"https://arxiv.org/pdf/2411.12498v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.19262v3","updated":"2024-11-19T13:27:30Z","published":"2024-05-29T16:55:32Z","title":"Weak-to-Strong Search: Align Large Language Models via Searching over\n Small Language Models","summary":" Large language models are usually fine-tuned to align with human preferences.\nHowever, fine-tuning a large language model can be challenging. In this work,\nwe introduce $\\textit{weak-to-strong search}$, framing the alignment of a large\nlanguage model as a test-time greedy search to maximize the log-probability\ndifference between small tuned and untuned models while sampling from the\nfrozen large model. This method serves both as (1) a compute-efficient model\nup-scaling strategy that avoids directly tuning the large model and as (2) an\ninstance of weak-to-strong generalization that enhances a strong model with\nweak test-time guidance. Empirically, we demonstrate the flexibility of\nweak-to-strong search across different tasks. In controlled-sentiment\ngeneration and summarization, we use tuned and untuned $\\texttt{gpt2}$s to\nimprove the alignment of large models without additional training. 
Crucially,\nin a more difficult instruction-following benchmark, AlpacaEval 2.0, we show\nthat reusing off-the-shelf small models (e.g., $\\texttt{zephyr-7b-beta}$ and\nits untuned version) can improve the length-controlled win rates of both\nwhite-box and black-box large models against $\\texttt{gpt-4-turbo}$ (e.g.,\n$34.4\\% \\rightarrow 37.9\\%$ for $\\texttt{Llama-3-70B-Instruct}$ and $16.0\\%\n\\rightarrow 20.1\\%$ for $\\texttt{gpt-3.5-turbo-instruct}$), despite the small\nmodels' low win rates $\\approx 10.0\\%$.\n","authors":["Zhanhui Zhou","Zhixuan Liu","Jie Liu","Zhichen Dong","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2405.19262v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.05767v2","updated":"2024-11-19T13:22:21Z","published":"2024-08-11T13:17:14Z","title":"Reference-free Hallucination Detection for Large Vision-Language Models","summary":" Large vision-language models (LVLMs) have made significant progress in recent\nyears. While LVLMs exhibit excellent ability in language understanding,\nquestion answering, and conversations of visual inputs, they are prone to\nproducing hallucinations. While several methods are proposed to evaluate the\nhallucinations in LVLMs, most are reference-based and depend on external tools,\nwhich complicates their practical application. To assess the viability of\nalternative methods, it is critical to understand whether the reference-free\napproaches, which do not rely on any external tools, can efficiently detect\nhallucinations. Therefore, we initiate an exploratory study to demonstrate the\neffectiveness of different reference-free solutions in detecting hallucinations\nin LVLMs. 
In particular, we conduct an extensive study on three kinds of\ntechniques: uncertainty-based, consistency-based, and supervised uncertainty\nquantification methods on four representative LVLMs across two different tasks.\nThe empirical results show that the reference-free approaches are capable of\neffectively detecting non-factual responses in LVLMs, with the supervised\nuncertainty quantification method outperforming the others, achieving the best\nperformance across different settings.\n","authors":["Qing Li","Jiahui Geng","Chenyang Lyu","Derui Zhu","Maxim Panov","Fakhri Karray"],"pdf_url":"https://arxiv.org/pdf/2408.05767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06701v3","updated":"2024-11-19T13:09:06Z","published":"2023-07-13T11:58:27Z","title":"S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized\n Variational Autoencoder for Video Prediction","summary":" We address the video prediction task by putting forth a novel model that\ncombines (i) a novel hierarchical residual learning vector quantized\nvariational autoencoder (HR-VQVAE), and (ii) a novel autoregressive\nspatiotemporal predictive model (AST-PM). We refer to this approach as a\nsequential hierarchical residual learning vector quantized variational\nautoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE\nat modeling still images with a parsimonious representation, combined with the\nAST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better\ndeal with major challenges in video prediction. These include learning\nspatiotemporal information, handling high dimensional data, combating blurry\nprediction, and implicit modeling of physical characteristics. 
Extensive\nexperimental results on four challenging tasks, namely KTH Human Action,\nTrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably\nagainst state-of-the-art video prediction techniques both in quantitative and\nqualitative evaluations despite a much smaller model size. Finally, we boost\nS-HR-VQVAE by proposing a novel training method to jointly estimate the\nHR-VQVAE and AST-PM parameters.\n","authors":["Mohammad Adiban","Kalin Stefanov","Sabato Marco Siniscalchi","Giampiero Salvi"],"pdf_url":"https://arxiv.org/pdf/2307.06701v3.pdf","comment":"12 pages, 6 figures, 5 tables. Accepted for publication on IEEE\n Transactions on Multimedia on 2024-11-19"},{"id":"http://arxiv.org/abs/2411.12483v1","updated":"2024-11-19T13:07:04Z","published":"2024-11-19T13:07:04Z","title":"Analysing Explanation-Related Interactions in Collaborative\n Perception-Cognition-Communication-Action","summary":" Effective communication is essential in collaborative tasks, so AI-equipped\nrobots working alongside humans need to be able to explain their behaviour in\norder to cooperate effectively and earn trust. We analyse and classify\ncommunications among human participants collaborating to complete a simulated\nemergency response task. The analysis identifies messages that relate to\nvarious kinds of interactive explanations identified in the explainable AI\nliterature. This allows us to understand what type of explanations humans\nexpect from their teammates in such settings, and thus where AI-equipped robots\nmost need explanation capabilities. We find that most explanation-related\nmessages seek clarification in the decisions or actions taken. 
We also confirm\nthat messages have an impact on the performance of our simulated task.\n","authors":["Marc Roig Vilamala","Jack Furby","Julian de Gortari Briseno","Mani Srivastava","Alun Preece","Carolina Fuentes Toro"],"pdf_url":"https://arxiv.org/pdf/2411.12483v1.pdf","comment":"4 pages, 3 figures, published as a Late Breaking Report in RO-MAN\n 2024"},{"id":"http://arxiv.org/abs/2411.12476v1","updated":"2024-11-19T12:56:43Z","published":"2024-11-19T12:56:43Z","title":"Comparing Prior and Learned Time Representations in Transformer Models\n of Timeseries","summary":" What sets timeseries analysis apart from other machine learning exercises is\nthat time representation becomes a primary aspect of the experiment setup, as\nit must adequately represent the temporal relations that are relevant for the\napplication at hand. In the work described here we study wo different\nvariations of the Transformer architecture: one where we use the fixed time\nrepresentation proposed in the literature and one where the time representation\nis learned from the data. Our experiments use data from predicting the energy\noutput of solar panels, a task that exhibits known periodicities (daily and\nseasonal) that is straight-forward to encode in the fixed time representation.\nOur results indicate that even in an experiment where the phenomenon is\nwell-understood, it is difficult to encode prior knowledge due to side-effects\nthat are difficult to mitigate. 
We conclude that research work is needed to\nwork the human into the learning loop in ways that improve the robustness and\ntrust-worthiness of the network.\n","authors":["Natalia Koliou","Tatiana Boura","Stasinos Konstantopoulos","George Meramveliotakis","George Kosmadakis"],"pdf_url":"https://arxiv.org/pdf/2411.12476v1.pdf","comment":"Presented at the AI in Natural Sciences and Technology (AINST) track\n of the 13th Conference on Artificial Intelligence (SETN 2024), 11-13\n September 2024, Piraeus, Greece"},{"id":"http://arxiv.org/abs/2411.12469v1","updated":"2024-11-19T12:51:17Z","published":"2024-11-19T12:51:17Z","title":"AI Flow at the Network Edge","summary":" Recent advancements in large language models (LLMs) and their multimodal\nvariants have led to remarkable progress across various domains, demonstrating\nimpressive capabilities and unprecedented potential. In the era of ubiquitous\nconnectivity, leveraging communication networks to distribute intelligence is a\ntransformative concept, envisioning AI-powered services accessible at the\nnetwork edge. However, pushing large models from the cloud to\nresource-constrained environments faces critical challenges. Model inference on\nlow-end devices leads to excessive latency and performance bottlenecks, while\nraw data transmission over limited bandwidth networks causes high communication\noverhead. This article presents AI Flow, a framework that streamlines the\ninference process by jointly leveraging the heterogeneous resources available\nacross devices, edge nodes, and cloud servers, making intelligence flow across\nnetworks. To facilitate cooperation among multiple computational nodes, the\nproposed framework explores a paradigm shift in the design of communication\nnetwork systems from transmitting information flow to intelligence flow, where\nthe goal of communications is task-oriented and folded into the inference\nprocess. 
Experimental results demonstrate the effectiveness of the proposed\nframework through an image captioning use case, showcasing the ability to\nreduce response latency while maintaining high-quality captions. This article\nserves as a position paper for identifying the motivation, challenges, and\nprinciples of AI Flow.\n","authors":["Jiawei Shao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2411.12469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04625v3","updated":"2024-11-19T12:41:04Z","published":"2024-06-07T04:19:01Z","title":"Key-Element-Informed sLLM Tuning for Document Summarization","summary":" Remarkable advances in large language models (LLMs) have enabled high-quality\ntext summarization. However, this capability is currently accessible only\nthrough LLMs of substantial size or proprietary LLMs with usage fees. In\nresponse, smaller-scale LLMs (sLLMs) of easy accessibility and low costs have\nbeen extensively studied, yet they often suffer from missing key information\nand entities, i.e., low relevance, in particular, when input documents are\nlong. We hence propose a key-element-informed instruction tuning for\nsummarization, so-called KEITSum, which identifies key elements in documents\nand instructs sLLM to generate summaries capturing these key elements.\nExperimental results on dialogue and news datasets demonstrate that sLLM with\nKEITSum indeed provides high-quality summarization with higher relevance and\nless hallucinations, competitive to proprietary LLM.\n","authors":["Sangwon Ryu","Heejin Do","Yunsu Kim","Gary Geunbae Lee","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2406.04625v3.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2411.12460v1","updated":"2024-11-19T12:36:02Z","published":"2024-11-19T12:36:02Z","title":"Guide-to-Explain for Controllable Summarization","summary":" Recently, large language models (LLMs) have demonstrated remarkable\nperformance in abstractive summarization tasks. 
However, controllable\nsummarization with LLMs remains underexplored, limiting their ability to\ngenerate summaries that align with specific user preferences. In this paper, we\nfirst investigate the capability of LLMs to control diverse attributes,\nrevealing that they encounter greater challenges with numerical attributes,\nsuch as length and extractiveness, compared to linguistic attributes. To\naddress this challenge, we propose a guide-to-explain framework (GTE) for\ncontrollable summarization. Our GTE framework enables the model to identify\nmisaligned attributes in the initial draft and guides it in explaining errors\nin the previous output. Based on this reflection, the model generates a\nwell-adjusted summary. As a result, by allowing the model to reflect on its\nmisalignment, we generate summaries that satisfy the desired attributes in\nsurprisingly fewer iterations than other iterative methods solely using LLMs.\n","authors":["Sangwon Ryu","Heejin Do","Daehee Kim","Yunsu Kim","Gary Geunbae Lee","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2411.12460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19997v2","updated":"2024-11-19T12:28:19Z","published":"2024-06-28T15:32:59Z","title":"Wavelets Are All You Need for Autoregressive Image Generation","summary":" In this paper, we take a new approach to autoregressive image generation that\nis based on two main ingredients. The first is wavelet image coding, which\nallows to tokenize the visual details of an image from coarse to fine details\nby ordering the information starting with the most significant bits of the most\nsignificant wavelet coefficients. The second is a variant of a language\ntransformer whose architecture is re-designed and optimized for token sequences\nin this 'wavelet language'. The transformer learns the significant statistical\ncorrelations within a token sequence, which are the manifestations of\nwell-known correlations between the wavelet subbands at various resolutions. 
We\nshow experimental results with conditioning on the generation process.\n","authors":["Wael Mattar","Idan Levy","Nir Sharon","Shai Dekel"],"pdf_url":"https://arxiv.org/pdf/2406.19997v2.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.12433v1","updated":"2024-11-19T11:50:03Z","published":"2024-11-19T11:50:03Z","title":"Preference-Conditioned Gradient Variations for Multi-Objective\n Quality-Diversity","summary":" In a variety of domains, from robotics to finance, Quality-Diversity\nalgorithms have been used to generate collections of both diverse and\nhigh-performing solutions. Multi-Objective Quality-Diversity algorithms have\nemerged as a promising approach for applying these methods to complex,\nmulti-objective problems. However, existing methods are limited by their search\ncapabilities. For example, Multi-Objective Map-Elites depends on random genetic\nvariations which struggle in high-dimensional search spaces. Despite efforts to\nenhance search efficiency with gradient-based mutation operators, existing\napproaches consider updating solutions to improve on each objective separately\nrather than achieving desired trade-offs. In this work, we address this\nlimitation by introducing Multi-Objective Map-Elites with\nPreference-Conditioned Policy-Gradient and Crowding Mechanisms: a new\nMulti-Objective Quality-Diversity algorithm that uses preference-conditioned\npolicy-gradient mutations to efficiently discover promising regions of the\nobjective space and crowding mechanisms to promote a uniform distribution of\nsolutions on the Pareto front. We evaluate our approach on six robotics\nlocomotion tasks and show that our method outperforms or matches all\nstate-of-the-art Multi-Objective Quality-Diversity methods in all six,\nincluding two newly proposed tri-objective tasks. Importantly, our method also\nachieves a smoother set of trade-offs, as measured by newly-proposed\nsparsity-based metrics. 
This performance comes at a lower computational storage\ncost compared to previous methods.\n","authors":["Hannah Janmohamed","Maxence Faldor","Thomas Pierrot","Antoine Cully"],"pdf_url":"https://arxiv.org/pdf/2411.12433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00570v2","updated":"2024-11-19T11:00:38Z","published":"2024-03-01T14:47:46Z","title":"Rethinking cluster-conditioned diffusion models for label-free image\n synthesis","summary":" Diffusion-based image generation models can enhance image quality when\nconditioned on ground truth labels. Here, we conduct a comprehensive\nexperimental study on image-level conditioning for diffusion models using\ncluster assignments. We investigate how individual clustering determinants,\nsuch as the number of clusters and the clustering method, impact image\nsynthesis across three different datasets. Given the optimal number of clusters\nwith respect to image synthesis, we show that cluster-conditioning can achieve\nstate-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for\nCIFAR100, along with a strong increase in training sample efficiency. We\nfurther propose a novel empirical method to estimate an upper bound for the\noptimal number of clusters. Unlike existing approaches, we find no significant\nassociation between clustering performance and the corresponding\ncluster-conditional FID scores. The code is available at\nhttps://github.com/HHU-MMBS/cedm-official-wavc2025.\n","authors":["Nikolas Adaloglou","Tim Kaiser","Felix Michels","Markus Kollmann"],"pdf_url":"https://arxiv.org/pdf/2403.00570v2.pdf","comment":"Accepted in WAVC2025 (21 pages, 15 figures). 
Code is available at\n https://github.com/HHU-MMBS/cedm-official-wavc2025"},{"id":"http://arxiv.org/abs/2405.04793v2","updated":"2024-11-19T10:59:30Z","published":"2024-05-08T03:57:45Z","title":"Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP\n Model Evaluation","summary":" With the development and proliferation of large, complex, black-box models\nfor solving many natural language processing (NLP) tasks, there is also an\nincreasing necessity of methods to stress-test these models and provide some\ndegree of interpretability or explainability. While counterfactual examples are\nuseful in this regard, automated generation of counterfactuals is a data and\nresource intensive process. such methods depend on models such as pre-trained\nlanguage models that are then fine-tuned on auxiliary, often task-specific\ndatasets, that may be infeasible to build in practice, especially for new tasks\nand data domains. Therefore, in this work we explore the possibility of\nleveraging large language models (LLMs) for zero-shot counterfactual generation\nin order to stress-test NLP models. We propose a structured pipeline to\nfacilitate this generation, and we hypothesize that the instruction-following\nand textual understanding capabilities of recent LLMs can be effectively\nleveraged for generating high quality counterfactuals in a zero-shot manner,\nwithout requiring any training or fine-tuning. 
Through comprehensive\nexperiments on a variety of propreitary and open-source LLMs, along with\nvarious downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot\ncounterfactual generators in evaluating and explaining black-box NLP models.\n","authors":["Amrita Bhattacharjee","Raha Moraffah","Joshua Garland","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2405.04793v2.pdf","comment":"Longer version of short paper accepted at IEEE BigData 2024 (Main\n Track)"},{"id":"http://arxiv.org/abs/2309.10987v4","updated":"2024-11-19T10:55:52Z","published":"2023-09-20T01:04:57Z","title":"SpikingNeRF: Making Bio-inspired Neural Networks See through the Real\n World","summary":" In this paper, we propose SpikingNeRF, which aligns the temporal dimension of\nspiking neural networks (SNNs) with the radiance rays, to seamlessly\naccommodate SNNs to the reconstruction of neural radiance fields (NeRF). Thus,\nthe computation turns into a spike-based, multiplication-free manner, reducing\nenergy consumption and making high-quality 3D rendering, for the first time,\naccessible to neuromorphic hardware. In SpikingNeRF, each sampled point on the\nray is matched to a particular time step and represented in a hybrid manner\nwhere the voxel grids are maintained as well. Based on the voxel grids, sampled\npoints are determined whether to be masked out for faster training and\ninference. However, this masking operation also incurs irregular temporal\nlength, making it intractable for hardware processors, e.g., GPUs, to conduct\nparallel training. To address this problem, we develop the temporal padding\nstrategy to tackle the masked samples to maintain regular temporal length,\ni.e., regular tensors, and further propose the temporal condensing strategy to\nform a denser data structure for hardware-friendly computation. 
Experiments on\nvarious datasets demonstrate that our method can reduce energy consumption by\nan average of 70.79\\% and obtain comparable synthesis quality with the ANN\nbaseline. Verification on the neuromorphic hardware accelerator also shows that\nSpikingNeRF can further benefit from neuromorphic computing over the ANN\nbaselines on energy efficiency. Codes and the appendix are in\n\\url{https://github.com/Ikarosy/SpikingNeRF-of-CASIA}.\n","authors":["Xingting Yao","Qinghao Hu","Fei Zhou","Tielong Liu","Zitao Mo","Zeyu Zhu","Zhengyang Zhuge","Jian Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.10987v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12405v1","updated":"2024-11-19T10:41:54Z","published":"2024-11-19T10:41:54Z","title":"Evaluating the Prompt Steerability of Large Language Models","summary":" Building pluralistic AI requires designing models that are able to be shaped\nto represent a wide range of value systems and cultures. Achieving this\nrequires first being able to evaluate the degree to which a given model is\ncapable of reflecting various personas. To this end, we propose a benchmark for\nevaluating the steerability of model personas as a function of prompting. Our\ndesign is based on a formal definition of prompt steerability, which analyzes\nthe degree to which a model's joint behavioral distribution can be shifted from\nits baseline behavior. By defining steerability indices and inspecting how\nthese indices change as a function of steering effort, we can estimate the\nsteerability of a model across various persona dimensions and directions. Our\nbenchmark reveals that the steerability of many current models is limited --\ndue to both a skew in their baseline behavior and an asymmetry in their\nsteerability across many persona dimensions. We release an implementation of\nour benchmark at https://github.com/IBM/prompt-steering.\n","authors":["Erik Miehling","Michael Desmond","Karthikeyan Natesan Ramamurthy","Elizabeth M. 
Daly","Pierre Dognin","Jesus Rios","Djallel Bouneffouf","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01581v2","updated":"2024-11-19T10:28:29Z","published":"2022-07-04T17:01:18Z","title":"Interpretable Fusion Analytics Framework for fMRI Connectivity:\n Self-Attention Mechanism and Latent Space Item-Response Model","summary":" There have been several attempts to use deep learning based on brain fMRI\nsignals to classify cognitive impairment diseases. However, deep learning is a\nhidden black box model that makes it difficult to interpret the process of\nclassification. To address this issue, we propose a novel analytical framework\nthat interprets the classification result from deep learning processes. We\nfirst derive the region of interest (ROI) functional connectivity network (FCN)\nby embedding functions based on their similar signal patterns. Then, using the\nself-attention equipped deep learning model, we classify diseases based on\ntheir FCN. Finally, in order to interpret the classification results, we employ\na latent space item-response interaction network model to identify the\nsignificant functions that exhibit distinct connectivity patterns when compared\nto other diseases. The application of this proposed framework to the four types\nof cognitive impairment shows that our approach is valid for determining the\nsignificant ROI functions.\n","authors":["Jeong-Jae Kim","Yeseul Jeon","SuMin Yu","Junggu Choi","Sanghoon Han"],"pdf_url":"https://arxiv.org/pdf/2207.01581v2.pdf","comment":"This submission is a duplicate of another manuscript from our\n research group [arXiv preprint arXiv:2401.09028] due to a misunderstanding in\n communication among co-authors"},{"id":"http://arxiv.org/abs/2411.12395v1","updated":"2024-11-19T10:27:26Z","published":"2024-11-19T10:27:26Z","title":"Do LLMs Understand Ambiguity in Text? 
A Case Study in Open-world\n Question Answering","summary":" Ambiguity in natural language poses significant challenges to Large Language\nModels (LLMs) used for open-domain question answering. LLMs often struggle with\nthe inherent uncertainties of human communication, leading to\nmisinterpretations, miscommunications, hallucinations, and biased responses.\nThis significantly weakens their ability to be used for tasks like\nfact-checking, question answering, feature extraction, and sentiment analysis.\nUsing open-domain question answering as a test case, we compare off-the-shelf\nand few-shot LLM performance, focusing on measuring the impact of explicit\ndisambiguation strategies. We demonstrate how simple, training-free,\ntoken-level disambiguation methods may be effectively used to improve LLM\nperformance for ambiguous question answering tasks. We empirically show our\nfindings and discuss best practices and broader impacts regarding ambiguity in\nLLMs.\n","authors":["Aryan Keluskar","Amrita Bhattacharjee","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12395v1.pdf","comment":"Accepted at the REU Symposium at IEEE BigData 2024"},{"id":"http://arxiv.org/abs/2411.09623v2","updated":"2024-11-19T10:15:56Z","published":"2024-11-14T17:47:54Z","title":"Vision-based Manipulation of Transparent Plastic Bags in Industrial\n Setups","summary":" This paper addresses the challenges of vision-based manipulation for\nautonomous cutting and unpacking of transparent plastic bags in industrial\nsetups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data,\nconnectivity, analytics, and robotics, promises enhanced accessibility and\nsustainability throughout the value chain. The integration of autonomous\nsystems, including collaborative robots (cobots), into industrial processes is\npivotal for efficiency and safety. 
The proposed solution employs advanced\nMachine Learning algorithms, particularly Convolutional Neural Networks (CNNs),\nto identify transparent plastic bags under varying lighting and background\nconditions. Tracking algorithms and depth sensing technologies are utilized for\n3D spatial awareness during pick and placement. The system addresses challenges\nin grasping and manipulation, considering optimal points, compliance control\nwith vacuum gripping technology, and real-time automation for safe interaction\nin dynamic environments. The system's successful testing and validation in the\nlab with the FRANKA robot arm, showcases its potential for widespread\nindustrial applications, while demonstrating effectiveness in automating the\nunpacking and cutting of transparent plastic bags for an 8-stack bulk-loader\nbased on specific requirements and rigorous testing.\n","authors":["F. Adetunji","A. Karukayil","P. Samant","S. Shabana","F. Varghese","U. Upadhyay","R. A. Yadav","A. Partridge","E. Pendleton","R. Plant","Y. Petillot","M. Koskinopoulou"],"pdf_url":"https://arxiv.org/pdf/2411.09623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11616v2","updated":"2024-11-19T10:11:04Z","published":"2024-11-18T14:42:15Z","title":"Signaling and Social Learning in Swarms of Robots","summary":" This paper investigates the role of communication in improving coordination\nwithin robot swarms, focusing on a paradigm where learning and execution occur\nsimultaneously in a decentralized manner. We highlight the role communication\ncan play in addressing the credit assignment problem (individual contribution\nto the overall performance), and how it can be influenced by it. 
We propose a\ntaxonomy of existing and future works on communication, focusing on information\nselection and physical abstraction as principal axes for classification: from\nlow-level lossless compression with raw signal extraction and processing to\nhigh-level lossy compression with structured communication models. The paper\nreviews current research from evolutionary robotics, multi-agent (deep)\nreinforcement learning, language models, and biophysics models to outline the\nchallenges and opportunities of communication in a collective of robots that\ncontinuously learn from one another through local message exchanges,\nillustrating a form of social learning.\n","authors":["Leo Cazenille","Maxime Toquebiau","Nicolas Lobato-Dauzier","Alessia Loi","Loona Macabre","Nathanael Aubert-Kato","Anthony Genot","Nicolas Bredeche"],"pdf_url":"https://arxiv.org/pdf/2411.11616v2.pdf","comment":"17 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2409.16718v2","updated":"2024-11-19T09:27:37Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. 
We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.12357v1","updated":"2024-11-19T09:18:20Z","published":"2024-11-19T09:18:20Z","title":"A Layered Architecture for Developing and Enhancing Capabilities in\n Large Language Model-based Software Systems","summary":" Significant efforts has been made to expand the use of Large Language Models\n(LLMs) beyond basic language tasks. While the generalizability and versatility\nof LLMs have enabled widespread adoption, evolving demands in application\ndevelopment often exceed their native capabilities. Meeting these demands may\ninvolve a diverse set of methods, such as enhancing creativity through either\ninference temperature adjustments or creativity-provoking prompts. Selecting\nthe right approach is critical, as different methods lead to trade-offs in\nengineering complexity, scalability, and operational costs. This paper\nintroduces a layered architecture that organizes LLM software system\ndevelopment into distinct layers, each characterized by specific attributes. By\naligning capabilities with these layers, the framework encourages the\nsystematic implementation of capabilities in effective and efficient ways that\nultimately supports desired functionalities and qualities. 
Through practical\ncase studies, we illustrate the utility of the framework. This work offers\ndevelopers actionable insights for selecting suitable technologies in LLM-based\nsoftware system development, promoting robustness and scalability.\n","authors":["Dawen Zhang","Xiwei Xu","Chen Wang","Zhenchang Xing","Robert Mao"],"pdf_url":"https://arxiv.org/pdf/2411.12357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12350v1","updated":"2024-11-19T09:07:26Z","published":"2024-11-19T09:07:26Z","title":"DiM: $f$-Divergence Minimization Guided Sharpness-Aware Optimization for\n Semi-supervised Medical Image Segmentation","summary":" As a technique to alleviate the pressure of data annotation, semi-supervised\nlearning (SSL) has attracted widespread attention. In the specific domain of\nmedical image segmentation, semi-supervised methods (SSMIS) have become a\nresearch hotspot due to their ability to reduce the need for large amounts of\nprecisely annotated data. SSMIS focuses on enhancing the model's generalization\nperformance by leveraging a small number of labeled samples and a large number\nof unlabeled samples. The latest sharpness-aware optimization (SAM) technique,\nwhich optimizes the model by reducing the sharpness of the loss function, has\nshown significant success in SSMIS. However, SAM and its variants may not fully\naccount for the distribution differences between different datasets. To address\nthis issue, we propose a sharpness-aware optimization method based on\n$f$-divergence minimization (DiM) for semi-supervised medical image\nsegmentation. This method enhances the model's stability by fine-tuning the\nsensitivity of model parameters and improves the model's adaptability to\ndifferent datasets through the introduction of $f$-divergence. 
By reducing\n$f$-divergence, the DiM method not only improves the performance balance\nbetween the source and target datasets but also prevents performance\ndegradation due to overfitting on the source dataset.\n","authors":["Bingli Wang","Houcheng Su","Nan Yin","Mengzhu Wang","Li Shen"],"pdf_url":"https://arxiv.org/pdf/2411.12350v1.pdf","comment":"8page"},{"id":"http://arxiv.org/abs/2406.05085v2","updated":"2024-11-19T08:46:34Z","published":"2024-06-07T16:59:38Z","title":"Multi-Head RAG: Solving Multi-Aspect Problems with LLMs","summary":" Retrieval Augmented Generation (RAG) enhances the abilities of Large Language\nModels (LLMs) by enabling the retrieval of documents into the LLM context to\nprovide more accurate and relevant responses. Existing RAG solutions do not\nfocus on queries that may require fetching multiple documents with\nsubstantially different contents. Such queries occur frequently, but are\nchallenging because the embeddings of these documents may be distant in the\nembedding space, making it hard to retrieve them all. This paper introduces\nMulti-Head RAG (MRAG), a novel scheme designed to address this gap with a\nsimple yet powerful idea: leveraging activations of Transformer's multi-head\nattention layer, instead of the decoder layer, as keys for fetching\nmulti-aspect documents. The driving motivation is that different attention\nheads can learn to capture different data aspects. Harnessing the corresponding\nactivations results in embeddings that represent various facets of data items\nand queries, improving the retrieval accuracy for complex queries. We provide\nan evaluation methodology and metrics, multi-aspect datasets that we release\nonline, and real-world use cases to demonstrate MRAG's effectiveness, showing\nimprovements of up to 20% in relevance over standard RAG baselines. 
MRAG can be\nseamlessly integrated with existing RAG frameworks and benchmarking tools like\nRAGAS as well as different classes of data stores.\n","authors":["Maciej Besta","Ales Kubicek","Roman Niggli","Robert Gerstenberger","Lucas Weitzendorf","Mingyuan Chi","Patrick Iff","Joanna Gajda","Piotr Nyczyk","Jürgen Müller","Hubert Niewiadomski","Marcin Chrapek","Michał Podstawski","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2406.05085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02856v5","updated":"2024-11-19T08:38:55Z","published":"2024-06-05T02:12:06Z","title":"Xmodel-LM Technical Report","summary":" We introduce Xmodel-LM, a compact and efficient 1.1B language model\npre-trained on around 2 trillion tokens. Trained on our self-built dataset\n(Xdata), which balances Chinese and English corpora based on downstream task\noptimization, Xmodel-LM exhibits remarkable performance despite its smaller\nsize. It notably surpasses existing open-source language models of similar\nscale. Our model checkpoints and code are publicly accessible on GitHub at\nhttps://github.com/XiaoduoAILab/XmodelLM.\n","authors":["Yichuan Wang","Yang Liu","Yu Yan","Qun Wang","Xucheng Huang","Ling Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.02856v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12319v1","updated":"2024-11-19T08:23:52Z","published":"2024-11-19T08:23:52Z","title":"CLIP Unreasonable Potential in Single-Shot Face Recognition","summary":" Face recognition is a core task in computer vision designed to identify and\nauthenticate individuals by analyzing facial patterns and features. This field\nintersects with artificial intelligence image processing and machine learning\nwith applications in security authentication and personalization. 
Traditional\napproaches in facial recognition focus on capturing facial features like the\neyes, nose and mouth and matching these against a database to verify identities\nHowever challenges such as high false positive rates have persisted often due\nto the similarity among individuals facial features. Recently Contrastive\nLanguage Image Pretraining (CLIP) a model developed by OpenAI has shown\npromising advancements by linking natural language processing with vision tasks\nallowing it to generalize across modalities. Using CLIP's vision language\ncorrespondence and single-shot finetuning the model can achieve lower false\npositive rates upon deployment without the need of mass facial features\nextraction. This integration demonstrating CLIP's potential to address\npersistent issues in face recognition model performance without complicating\nour training paradigm.\n","authors":["Nhan T. Luu"],"pdf_url":"https://arxiv.org/pdf/2411.12319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19954v2","updated":"2024-11-19T08:17:30Z","published":"2024-09-30T05:19:09Z","title":"Domain Consistency Representation Learning for Lifelong Person\n Re-Identification","summary":" Lifelong person re-identification (LReID) exhibits a contradictory\nrelationship between intra-domain discrimination and inter-domain gaps when\nlearning from continuous data. Intra-domain discrimination focuses on\nindividual nuances (e.g. clothing type, accessories, etc.), while inter-domain\ngaps emphasize domain consistency. Achieving a trade-off between maximizing\nintra-domain discrimination and minimizing inter-domain gaps is a crucial\nchallenge for improving LReID performance. Most existing methods aim to reduce\ninter-domain gaps through knowledge distillation to maintain domain\nconsistency. However, they often ignore intra-domain discrimination. 
To address\nthis challenge, we propose a novel domain consistency representation learning\n(DCR) model that explores global and attribute-wise representations as a bridge\nto balance intra-domain discrimination and inter-domain gaps. At the\nintra-domain level, we explore the complementary relationship between global\nand attribute-wise representations to improve discrimination among similar\nidentities. Excessive learning intra-domain discrimination can lead to\ncatastrophic forgetting. We further develop an attribute-oriented\nanti-forgetting (AF) strategy that explores attribute-wise representations to\nenhance inter-domain consistency, and propose a knowledge consolidation (KC)\nstrategy to facilitate knowledge transfer. Extensive experiments show that our\nDCR model achieves superior performance compared to state-of-the-art LReID\nmethods. Our code will be available soon.\n","authors":["Shiben Liu","Qiang Wang","Huijie Fan","Weihong Ren","Baojie Fan","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2409.19954v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.15761v2","updated":"2024-11-19T08:12:46Z","published":"2024-09-24T05:31:17Z","title":"TFG: Unified Training-Free Guidance for Diffusion Models","summary":" Given an unconditional diffusion model and a predictor for a target property\nof interest (e.g., a classifier), the goal of training-free guidance is to\ngenerate samples with desirable target properties without additional training.\nExisting methods, though effective in various individual applications, often\nlack theoretical grounding and rigorous testing on extensive benchmarks. As a\nresult, they could even fail on simple tasks, and applying them to a new\nproblem becomes unavoidably difficult. This paper introduces a novel\nalgorithmic framework encompassing existing methods as special cases, unifying\nthe study of training-free guidance into the analysis of an algorithm-agnostic\ndesign space. 
Via theoretical and empirical investigation, we propose an\nefficient and effective hyper-parameter searching strategy that can be readily\napplied to any downstream task. We systematically benchmark across 7 diffusion\nmodels on 16 tasks with 40 targets, and improve performance by 8.5% on average.\nOur framework and benchmark offer a solid foundation for conditional generation\nin a training-free manner.\n","authors":["Haotian Ye","Haowei Lin","Jiaqi Han","Minkai Xu","Sheng Liu","Yitao Liang","Jianzhu Ma","James Zou","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2409.15761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12308v1","updated":"2024-11-19T07:49:22Z","published":"2024-11-19T07:49:22Z","title":"SNN-Based Online Learning of Concepts and Action Laws in an Open World","summary":" We present the architecture of a fully autonomous, bio-inspired cognitive\nagent built around a spiking neural network (SNN) implementing the agent's\nsemantic memory. The agent explores its universe and learns concepts of\nobjects/situations and of its own actions in a one-shot manner. While\nobject/situation concepts are unary, action concepts are triples made up of an\ninitial situation, a motor activity, and an outcome. They embody the agent's\nknowledge of its universe's actions laws. Both kinds of concepts have different\ndegrees of generality. To make decisions the agent queries its semantic memory\nfor the expected outcomes of envisaged actions and chooses the action to take\non the basis of these predictions. 
Our experiments show that the agent handles\nnew situations by appealing to previously learned general concepts and rapidly\nmodifies its concepts to adapt to environment changes.\n","authors":["Christel Grimaud","Dominique Longin","Andreas Herzig"],"pdf_url":"https://arxiv.org/pdf/2411.12308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12307v1","updated":"2024-11-19T07:48:35Z","published":"2024-11-19T07:48:35Z","title":"Balancing Accuracy and Efficiency in Multi-Turn Intent Classification\n for LLM-Powered Dialog Systems in Production","summary":" Accurate multi-turn intent classification is essential for advancing\nconversational AI systems. However, challenges such as the scarcity of\ncomprehensive datasets and the complexity of contextual dependencies across\ndialogue turns hinder progress. This paper presents two novel approaches\nleveraging Large Language Models (LLMs) to enhance scalability and reduce\nlatency in production dialogue systems. First, we introduce Symbol Tuning,\nwhich simplifies intent labels to reduce task complexity and improve\nperformance in multi-turn dialogues. Second, we propose C-LARA\n(Consistency-aware, Linguistics Adaptive Retrieval Augmentation), a framework\nthat employs LLMs for data augmentation and pseudo-labeling to generate\nsynthetic multi-turn dialogues. These enriched datasets are used to fine-tune a\nsmall, efficient model suitable for deployment. Experiments conducted on\nmultilingual dialogue datasets demonstrate significant improvements in\nclassification accuracy and resource efficiency. 
Our methods enhance multi-turn\nintent classification accuracy by 5.09%, reduce annotation costs by 40%, and\nenable scalable deployment in low-resource multilingual industrial systems,\nhighlighting their practicality and impact.\n","authors":["Junhua Liu","Yong Keat Tan","Bin Fu","Kwan Hui Lim"],"pdf_url":"https://arxiv.org/pdf/2411.12307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12290v1","updated":"2024-11-19T07:19:05Z","published":"2024-11-19T07:19:05Z","title":"SSEditor: Controllable Mask-to-Scene Generation with Diffusion Model","summary":" Recent advancements in 3D diffusion-based semantic scene generation have\ngained attention. However, existing methods rely on unconditional generation\nand require multiple resampling steps when editing scenes, which significantly\nlimits their controllability and flexibility. To this end, we propose SSEditor,\na controllable Semantic Scene Editor that can generate specified target\ncategories without multiple-step resampling. SSEditor employs a two-stage\ndiffusion-based framework: (1) a 3D scene autoencoder is trained to obtain\nlatent triplane features, and (2) a mask-conditional diffusion model is trained\nfor customizable 3D semantic scene generation. In the second stage, we\nintroduce a geometric-semantic fusion module that enhance the model's ability\nto learn geometric and semantic information. This ensures that objects are\ngenerated with correct positions, sizes, and categories. Extensive experiments\non SemanticKITTI and CarlaSC demonstrate that SSEditor outperforms previous\napproaches in terms of controllability and flexibility in target generation, as\nwell as the quality of semantic scene generation and reconstruction. 
More\nimportantly, experiments on the unseen Occ-3D Waymo dataset show that SSEditor\nis capable of generating novel urban scenes, enabling the rapid construction of\n3D scenes.\n","authors":["Haowen Zheng","Yanyan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.12290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16721v2","updated":"2024-11-19T07:01:03Z","published":"2024-09-25T08:13:39Z","title":"Grading and Anomaly Detection for Automated Retinal Image Analysis using\n Deep Learning","summary":" The significant portion of diabetic patients was affected due to major\nblindness caused by Diabetic retinopathy (DR). For diabetic retinopathy, lesion\nsegmentation, and detection the comprehensive examination is delved into the\ndeep learning techniques application. The study conducted a systematic\nliterature review using the PRISMA analysis and 62 articles has been\ninvestigated in the research. By including CNN-based models for DR grading, and\nfeature fusion several deep-learning methodologies are explored during the\nstudy. For enhancing effectiveness in classification accuracy and robustness\nthe data augmentation and ensemble learning strategies are scrutinized. By\ndemonstrating the superior performance compared to individual models the\nefficacy of ensemble learning methods is investigated. The potential ensemble\napproaches in DR diagnosis are shown by the integration of multiple pre-trained\nnetworks with custom classifiers that yield high specificity. The diverse\ndeep-learning techniques that are employed for detecting DR lesions are\ndiscussed within the diabetic retinopathy lesions segmentation and detection\nsection. 
By emphasizing the requirement for continued research and integration\ninto clinical practice deep learning shows promise for personalized healthcare\nand early detection of diabetics.\n","authors":["Syed Mohd Faisal Malik","Md Tabrez Nafis","Mohd Abdul Ahad","Safdar Tanweer"],"pdf_url":"https://arxiv.org/pdf/2409.16721v2.pdf","comment":"Diabetic retinopathy, segmentation, images on retinal fundus,\n convolutional neural network"},{"id":"http://arxiv.org/abs/2411.12276v1","updated":"2024-11-19T06:56:24Z","published":"2024-11-19T06:56:24Z","title":"libcll: an Extendable Python Toolkit for Complementary-Label Learning","summary":" Complementary-label learning (CLL) is a weakly supervised learning paradigm\nfor multiclass classification, where only complementary labels -- indicating\nclasses an instance does not belong to -- are provided to the learning\nalgorithm. Despite CLL's increasing popularity, previous studies highlight two\nmain challenges: (1) inconsistent results arising from varied assumptions on\ncomplementary label generation, and (2) high barriers to entry due to the lack\nof a standardized evaluation platform across datasets and algorithms. To\naddress these challenges, we introduce \\texttt{libcll}, an extensible Python\ntoolkit for CLL research. \\texttt{libcll} provides a universal interface that\nsupports a wide range of generation assumptions, both synthetic and real-world\ndatasets, and key CLL algorithms. The toolkit is designed to mitigate\ninconsistencies and streamline the research process, with easy installation,\ncomprehensive usage guides, and quickstart tutorials that facilitate efficient\nadoption and implementation of CLL techniques. 
Extensive ablation studies\nconducted with \\texttt{libcll} demonstrate its utility in generating valuable\ninsights to advance future CLL research.\n","authors":["Nai-Xuan Ye","Tan-Ha Mai","Hsiu-Hsuan Wang","Wei-I Lin","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2411.12276v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.12275v1","updated":"2024-11-19T06:55:57Z","published":"2024-11-19T06:55:57Z","title":"Building Trust: Foundations of Security, Safety and Transparency in AI","summary":" This paper explores the rapidly evolving ecosystem of publicly available AI\nmodels, and their potential implications on the security and safety landscape.\nAs AI models become increasingly prevalent, understanding their potential risks\nand vulnerabilities is crucial. We review the current security and safety\nscenarios while highlighting challenges such as tracking issues, remediation,\nand the apparent absence of AI model lifecycle and ownership processes.\nComprehensive strategies to enhance security and safety for both model\ndevelopers and end-users are proposed. This paper aims to provide some of the\nfoundational pieces for more standardized security, safety, and transparency in\nthe development and operation of AI models and the larger open ecosystems and\ncommunities forming around them.\n","authors":["Huzaifa Sidhpurwala","Garth Mollett","Emily Fox","Mark Bestavros","Huamin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01812v4","updated":"2024-11-19T06:14:31Z","published":"2024-09-14T02:35:29Z","title":"From Text to Multimodality: Exploring the Evolution and Impact of Large\n Language Models in Medical Practice","summary":" Large Language Models (LLMs) have rapidly evolved from text-based systems to\nmultimodal platforms, significantly impacting various sectors including\nhealthcare. 
This comprehensive review explores the progression of LLMs to\nMultimodal Large Language Models (MLLMs) and their growing influence in medical\npractice. We examine the current landscape of MLLMs in healthcare, analyzing\ntheir applications across clinical decision support, medical imaging, patient\nengagement, and research. The review highlights the unique capabilities of\nMLLMs in integrating diverse data types, such as text, images, and audio, to\nprovide more comprehensive insights into patient health. We also address the\nchallenges facing MLLM implementation, including data limitations, technical\nhurdles, and ethical considerations. By identifying key research gaps, this\npaper aims to guide future investigations in areas such as dataset development,\nmodality alignment methods, and the establishment of ethical guidelines. As\nMLLMs continue to shape the future of healthcare, understanding their potential\nand limitations is crucial for their responsible and effective integration into\nmedical practice.\n","authors":["Qian Niu","Keyu Chen","Ming Li","Pohsun Feng","Ziqian Bi","Lawrence KQ Yan","Yichao Zhang","Caitlyn Heqi Yin","Cheng Fei","Junyu Liu","Benji Peng","Tianyang Wang","Yunze Wang","Silin Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01812v4.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2310.16334v3","updated":"2024-11-19T06:12:47Z","published":"2023-10-25T03:30:37Z","title":"Structured Multi-Track Accompaniment Arrangement via Style Prior\n Modelling","summary":" In the realm of music AI, arranging rich and structured multi-track\naccompaniments from a simple lead sheet presents significant challenges. Such\nchallenges include maintaining track cohesion, ensuring long-term coherence,\nand optimizing computational efficiency. In this paper, we introduce a novel\nsystem that leverages prior modelling over disentangled style factors to\naddress these challenges. 
Our method presents a two-stage process: initially, a\npiano arrangement is derived from the lead sheet by retrieving piano texture\nstyles; subsequently, a multi-track orchestration is generated by infusing\norchestral function styles into the piano arrangement. Our key design is the\nuse of vector quantization and a unique multi-stream Transformer to model the\nlong-term flow of the orchestration style, which enables flexible,\ncontrollable, and structured music generation. Experiments show that by\nfactorizing the arrangement task into interpretable sub-stages, our approach\nenhances generative capacity while improving efficiency. Additionally, our\nsystem supports a variety of music genres and provides style control at\ndifferent composition hierarchies. We further show that our system achieves\nsuperior coherence, structure, and overall arrangement quality compared to\nexisting baselines.\n","authors":["Jingwei Zhao","Gus Xia","Ziyu Wang","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2310.16334v3.pdf","comment":"Accepted by NeurIPS 2024; significance test updated with Bonferroni\n correction"},{"id":"http://arxiv.org/abs/2411.12256v1","updated":"2024-11-19T06:10:22Z","published":"2024-11-19T06:10:22Z","title":"Restructuring Tractable Probabilistic Circuits","summary":" Probabilistic circuits (PCs) is a unifying representation for probabilistic\nmodels that support tractable inference. Numerous applications of PCs like\ncontrollable text generation depend on the ability to efficiently multiply two\ncircuits. Existing multiplication algorithms require that the circuits respect\nthe same structure, i.e. variable scopes decomposes according to the same\nvtree. In this work, we propose and study the task of restructuring\nstructured(-decomposable) PCs, that is, transforming a structured PC such that\nit conforms to a target vtree. 
We propose a generic approach for this problem\nand show that it leads to novel polynomial-time algorithms for multiplying\ncircuits respecting different vtrees, as well as a practical depth-reduction\nalgorithm that preserves structured decomposibility. Our work opens up new\navenues for tractable PC inference, suggesting the possibility of training with\nless restrictive PC structures while enabling efficient inference by changing\ntheir structures at inference time.\n","authors":["Honghua Zhang","Benjie Wang","Marcelo Arenas","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2411.12256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12255v1","updated":"2024-11-19T06:09:09Z","published":"2024-11-19T06:09:09Z","title":"Error-Feedback Model for Output Correction in Bilateral Control-Based\n Imitation Learning","summary":" In recent years, imitation learning using neural networks has enabled robots\nto perform flexible tasks. However, since neural networks operate in a\nfeedforward structure, they do not possess a mechanism to compensate for output\nerrors. To address this limitation, we developed a feedback mechanism to\ncorrect these errors. By employing a hierarchical structure for neural networks\ncomprising lower and upper layers, the lower layer was controlled to follow the\nupper layer. Additionally, using a multi-layer perceptron in the lower layer,\nwhich lacks an internal state, enhanced the error feedback. In the\ncharacter-writing task, this model demonstrated improved accuracy in writing\npreviously untrained characters. In the character-writing task, this model\ndemonstrated improved accuracy in writing previously untrained characters.\nThrough autonomous control with error feedback, we confirmed that the lower\nlayer could effectively track the output of the upper layer. 
This study\nrepresents a promising step toward integrating neural networks with control\ntheories.\n","authors":["Hiroshi Sato","Masashi Konosu","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2411.12255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04420v2","updated":"2024-11-19T05:51:39Z","published":"2023-07-10T08:54:07Z","title":"FedDCT: A Dynamic Cross-Tier Federated Learning Framework in Wireless\n Networks","summary":" Federated Learning (FL), as a privacy-preserving machine learning paradigm,\ntrains a global model across devices without exposing local data. However,\nresource heterogeneity and inevitable stragglers in wireless networks severely\nimpact the efficiency and accuracy of FL training. In this paper, we propose a\nnovel Dynamic Cross-Tier Federated Learning framework (FedDCT). Firstly, we\ndesign a dynamic tiering strategy that dynamically partitions devices into\ndifferent tiers based on their response times and assigns specific timeout\nthresholds to each tier to reduce single-round training time. Then, we propose\na cross-tier device selection algorithm that selects devices that respond\nquickly and are conducive to model convergence to improve convergence\nefficiency and accuracy. 
Experimental results demonstrate that the proposed\napproach under wireless networks outperforms the baseline approach, with an\naverage reduction of 54.7\\% in convergence time and an average improvement of\n1.83\\% in convergence accuracy.\n","authors":["Youquan Xian","Xiaoyun Gan","Chuanjian Yao","Dongcheng Li","Peng Wang","Peng Liu","Ying Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.04420v2.pdf","comment":"Published in WASA 2024"},{"id":"http://arxiv.org/abs/2411.12246v1","updated":"2024-11-19T05:51:10Z","published":"2024-11-19T05:51:10Z","title":"Efficient Training in Multi-Agent Reinforcement Learning: A\n Communication-Free Framework for the Box-Pushing Problem","summary":" Self-organizing systems consist of autonomous agents that can perform complex\ntasks and adapt to dynamic environments without a central controller. Prior\nresearch often relies on reinforcement learning to enable agents to gain the\nskills needed for task completion, such as in the box-pushing environment.\nHowever, when agents push from opposing directions during exploration, they\ntend to exert equal and opposite forces on the box, resulting in minimal\ndisplacement and inefficient training. This paper proposes a model called\nShared Pool of Information (SPI), which enables information to be accessible to\nall agents and facilitates coordination, reducing force conflicts among agents\nand enhancing exploration efficiency. 
Through computer simulations, we\ndemonstrate that SPI not only expedites the training process but also requires\nfewer steps per episode, significantly improving the agents' collaborative\neffectiveness.\n","authors":["David Ge","Hao Ji"],"pdf_url":"https://arxiv.org/pdf/2411.12246v1.pdf","comment":"17 pages, 16 figures"},{"id":"http://arxiv.org/abs/2411.12240v1","updated":"2024-11-19T05:37:17Z","published":"2024-11-19T05:37:17Z","title":"Evaluating Tokenizer Performance of Large Language Models Across\n Official Indian Languages","summary":" Large Language Models (LLMs) based on transformer architectures have\nrevolutionized a variety of domains, with tokenization playing a pivotal role\nin their pre-processing and fine-tuning stages. In multilingual models,\nparticularly those tailored for Indic languages, effective tokenization is\ncrucial for optimizing performance. This paper presents a comprehensive\nevaluation of tokenizers used by 12 LLMs across all 22 official languages of\nIndia, with a focus on comparing the efficiency of their tokenization\nprocesses. We employed the Normalized Sequence Length (NSL) as a key metric in\nour analysis. Our findings reveal that the SUTRA tokenizer outperforms all\nother models, including several Indic-specific models, excelling in 14\nlanguages. Notable insights include the SUTRA tokenizer's superior handling of\nIndic languages, GPT-4o's advancement over its predecessor GPT-4 in processing\nIndian languages, and the limited performance of Project Indus in certain\nlanguages. This study underscores the critical importance of developing\ntargeted tokenization strategies for multilingual and Indic-centric models,\nlaying the groundwork for future improvements in tokenizer design to enhance\nlinguistic coverage and model efficiency.\n","authors":["S. Tamang","D. J. 
Bora"],"pdf_url":"https://arxiv.org/pdf/2411.12240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00068v3","updated":"2024-11-19T05:08:44Z","published":"2024-01-30T14:47:15Z","title":"Adapting Amidst Degradation: Cross Domain Li-ion Battery Health\n Estimation via Physics-Guided Test-Time Training","summary":" Health modeling of lithium-ion batteries (LIBs) is crucial for safe and\nefficient energy management and carries significant socio-economic\nimplications. Although Machine Learning (ML)-based State of Health (SOH)\nestimation methods have made significant progress in accuracy, the scarcity of\nhigh-quality LIB data remains a major obstacle. Existing transfer learning\nmethods for cross-domain LIB SOH estimation have significantly alleviated the\nlabeling burden of target LIB data, however, they still require sufficient\nunlabeled target data (UTD) for effective adaptation to the target domain.\nCollecting this UTD is challenging due to the time-consuming nature of\ndegradation experiments. To address this issue, we introduce a practical\nTest-Time Training framework, BatteryTTT, which adapts the model continually\nusing each UTD collected amidst degradation, thereby significantly reducing\ndata collection time. To fully utilize each UTD, BatteryTTT integrates the\ninherent physical laws of modern LIBs into self-supervised learning, termed\nPhyscics-Guided Test-Time Training. Additionally, we explore the potential of\nlarge language models (LLMs) in battery sequence modeling by evaluating their\nperformance in SOH estimation through model reprogramming and prefix prompt\nadaptation. 
The combination of BatteryTTT and LLM modeling, termed GPT4Battery,\nachieves state-of-the-art generalization results across current LIB benchmarks.\nFurthermore, we demonstrate the practical value and scalability of our approach\nby deploying it in our real-world battery management system (BMS) for 300Ah\nlarge-scale energy storage LIBs.\n","authors":["Yuyuan Feng","Guosheng Hu","Xiaodong Li","Zhihong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00068v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12222v1","updated":"2024-11-19T04:32:41Z","published":"2024-11-19T04:32:41Z","title":"Contrast Similarity-Aware Dual-Pathway Mamba for Multivariate Time\n Series Node Classification","summary":" Multivariate time series (MTS) data is generated through multiple sensors\nacross various domains such as engineering application, health monitoring, and\nthe internet of things, characterized by its temporal changes and high\ndimensional characteristics. Over the past few years, many studies have\nexplored the long-range dependencies and similarities in MTS. However,\nlong-range dependencies are difficult to model due to their temporal changes\nand high dimensionality makes it difficult to obtain similarities effectively\nand efficiently. Thus, to address these issues, we propose contrast\nsimilarity-aware dual-pathway Mamba for MTS node classification (CS-DPMamba).\nFirstly, to obtain the dynamic similarity of each sample, we initially use\ntemporal contrast learning module to acquire MTS representations. And then we\nconstruct a similarity matrix between MTS representations using Fast Dynamic\nTime Warping (FastDTW). Secondly, we apply the DPMamba to consider the\nbidirectional nature of MTS, allowing us to better capture long-range and\nshort-range dependencies within the data. Finally, we utilize the\nKolmogorov-Arnold Network enhanced Graph Isomorphism Network to complete the\ninformation interaction in the matrix and MTS node classification task. 
By\ncomprehensively considering the long-range dependencies and dynamic similarity\nfeatures, we achieved precise MTS node classification. We conducted experiments\non multiple University of East Anglia (UEA) MTS datasets, which encompass\ndiverse application scenarios. Our results demonstrate the superiority of our\nmethod through both supervised and semi-supervised experiments on the MTS\nclassification task.\n","authors":["Mingsen Du","Meng Chen","Yongjian Li","Xiuxin Zhang","Jiahui Gao","Cun Ji","Shoushui Wei"],"pdf_url":"https://arxiv.org/pdf/2411.12222v1.pdf","comment":"Submitted to Knowledge-Based Systems on Nov 17, 2024"},{"id":"http://arxiv.org/abs/2411.12220v1","updated":"2024-11-19T04:12:14Z","published":"2024-11-19T04:12:14Z","title":"DeTrigger: A Gradient-Centric Approach to Backdoor Attack Mitigation in\n Federated Learning","summary":" Federated Learning (FL) enables collaborative model training across\ndistributed devices while preserving local data privacy, making it ideal for\nmobile and embedded systems. However, the decentralized nature of FL also opens\nvulnerabilities to model poisoning attacks, particularly backdoor attacks,\nwhere adversaries implant trigger patterns to manipulate model predictions. In\nthis paper, we propose DeTrigger, a scalable and efficient backdoor-robust\nfederated learning framework that leverages insights from adversarial attack\nmethodologies. By employing gradient analysis with temperature scaling,\nDeTrigger detects and isolates backdoor triggers, allowing for precise model\nweight pruning of backdoor activations without sacrificing benign model\nknowledge. Extensive evaluations across four widely used datasets demonstrate\nthat DeTrigger achieves up to 251x faster detection than traditional methods\nand mitigates backdoor attacks by up to 98.9%, with minimal impact on global\nmodel accuracy. 
Our findings establish DeTrigger as a robust and scalable\nsolution to protect federated learning environments against sophisticated\nbackdoor threats.\n","authors":["Kichang Lee","Yujin Shin","Jonghyuk Yun","Jun Han","JeongGil Ko"],"pdf_url":"https://arxiv.org/pdf/2411.12220v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.10060v4","updated":"2024-11-19T03:58:40Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. It includes 1,000 images with human\nlabels and 50,000 images with automatically generated weak labels. This dataset\ncould serve as a foundation for the research community to develop advanced\nwrinkle detection algorithms. Second, we introduce a simple training strategy\nutilizing texture maps, applicable to various segmentation models, to detect\nwrinkles across the face. Our two-stage training strategy first pretrain models\non a large dataset with weak labels (N=50k), or masked texture maps generated\nthrough computer vision techniques, without human intervention. We then\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. The network takes as input a combination of RGB and\nmasked texture map of the image, comprising four channels, in finetuning. We\neffectively combine labels from multiple annotators to minimize subjectivity in\nmanual labeling. 
Our strategies demonstrate improved segmentation performance\nin facial wrinkle segmentation both quantitatively and visually compared to\nexisting pretraining methods. The dataset is available at\nhttps://github.com/labhai/ffhq-wrinkle-dataset.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v4.pdf","comment":"Accepted at International Conference on Pattern Recognition (ICPR),\n 2024"},{"id":"http://arxiv.org/abs/2411.06269v3","updated":"2024-11-19T03:37:03Z","published":"2024-11-09T19:53:15Z","title":"AI's Spatial Intelligence: Evaluating AI's Understanding of Spatial\n Transformations in PSVT:R and Augmented Reality","summary":" Spatial intelligence is important in Architecture, Construction, Science,\nTechnology, Engineering, and Mathematics (STEM), and Medicine. Understanding\nthree-dimensional (3D) spatial rotations can involve verbal descriptions and\nvisual or interactive examples, illustrating how objects change orientation in\n3D space. Recent studies show Artificial Intelligence (AI) with language and\nvision capabilities still face limitations in spatial reasoning. In this paper,\nwe have studied generative AI's spatial capabilities of understanding rotations\nof objects utilizing its image and language processing features. We examined\nthe spatial intelligence of the GPT-4 model with vision in understanding\nspatial rotation process with diagrams based on the Revised Purdue Spatial\nVisualization Test: Visualization of Rotations (Revised PSVT:R). Next, we\nincorporated a layer of coordinate system axes on Revised PSVT:R to study the\nvariations in GPT-4's performance. 
We also examined GPT-4's understanding of 3D\nrotations in Augmented Reality (AR) scenes that visualize spatial rotations of\nan object in 3D space and observed increased accuracy of GPT-4's understanding\nof the rotations by adding supplementary textual information depicting the\nrotation process or mathematical representations of the rotation (e.g.,\nmatrices). The results indicate that while GPT-4 as a major current Generative\nAI model lacks the understanding of a spatial rotation process, it has the\npotential to understand the rotation process with additional information that\ncan be provided by methods such as AR. By combining the potentials in spatial\nintelligence of AI with AR's interactive visualization abilities, we expect to\noffer enhanced guidance for students' spatial learning activities. Such spatial\nguidance can benefit understanding spatial transformations and additionally\nsupport processes like assembly, fabrication, and manufacturing.\n","authors":["Uttamasha Monjoree","Wei Yan"],"pdf_url":"https://arxiv.org/pdf/2411.06269v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12198v1","updated":"2024-11-19T03:30:06Z","published":"2024-11-19T03:30:06Z","title":"CCIS-Diff: A Generative Model with Stable Diffusion Prior for Controlled\n Colonoscopy Image Synthesis","summary":" Colonoscopy is crucial for identifying adenomatous polyps and preventing\ncolorectal cancer. However, developing robust models for polyp detection is\nchallenging by the limited size and accessibility of existing colonoscopy\ndatasets. While previous efforts have attempted to synthesize colonoscopy\nimages, current methods suffer from instability and insufficient data\ndiversity. Moreover, these approaches lack precise control over the generation\nprocess, resulting in images that fail to meet clinical quality standards. 
To\naddress these challenges, we propose CCIS-DIFF, a Controlled generative model\nfor high-quality Colonoscopy Image Synthesis based on a Diffusion architecture.\nOur method offers precise control over both the spatial attributes (polyp\nlocation and shape) and clinical characteristics of polyps that align with\nclinical descriptions. Specifically, we introduce a blur mask weighting\nstrategy to seamlessly blend synthesized polyps with the colonic mucosa, and a\ntext-aware attention mechanism to guide the generated images to reflect\nclinical characteristics. Notably, to achieve this, we construct a new\nmulti-modal colonoscopy dataset that integrates images, mask annotations, and\ncorresponding clinical text descriptions. Experimental results demonstrate that\nour method generates high-quality, diverse colonoscopy images with fine control\nover both spatial constraints and clinical consistency, offering valuable\nsupport for downstream segmentation and diagnostic tasks.\n","authors":["Yifan Xie","Jingge Wang","Tao Feng","Fei Ma","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2411.12198v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.12196v1","updated":"2024-11-19T03:29:17Z","published":"2024-11-19T03:29:17Z","title":"A More Advanced Group Polarization Measurement Approach Based on\n LLM-Based Agents and Graphs","summary":" Group polarization is an important research direction in social media content\nanalysis, attracting many researchers to explore this field. Therefore, how to\neffectively measure group polarization has become a critical topic. Measuring\ngroup polarization on social media presents several challenges that have not\nyet been addressed by existing solutions. First, social media group\npolarization measurement involves processing vast amounts of text, which poses\na significant challenge for information extraction. 
Second, social media texts\noften contain hard-to-understand content, including sarcasm, memes, and\ninternet slang. Additionally, group polarization research focuses on holistic\nanalysis, while texts is typically fragmented. To address these challenges, we\ndesigned a solution based on a multi-agent system and used a graph-structured\nCommunity Sentiment Network (CSN) to represent polarization states.\nFurthermore, we developed a metric called Community Opposition Index (COI)\nbased on the CSN to quantify polarization. Finally, we tested our multi-agent\nsystem through a zero-shot stance detection task and achieved outstanding\nresults. In summary, the proposed approach has significant value in terms of\nusability, accuracy, and interpretability.\n","authors":["Zixin Liu","Ji Zhang","Yiran Ding"],"pdf_url":"https://arxiv.org/pdf/2411.12196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07987v4","updated":"2024-11-19T03:23:20Z","published":"2024-04-11T17:59:09Z","title":"ControlNet++: Improving Conditional Controls with Efficient Consistency\n Feedback","summary":" To enhance the controllability of text-to-image diffusion models, existing\nefforts like ControlNet incorporated image-based conditional controls. In this\npaper, we reveal that existing methods still face significant challenges in\ngenerating images that align with the image conditional controls. To this end,\nwe propose ControlNet++, a novel approach that improves controllable generation\nby explicitly optimizing pixel-level cycle consistency between generated images\nand conditional controls. Specifically, for an input conditional control, we\nuse a pre-trained discriminative reward model to extract the corresponding\ncondition of the generated images, and then optimize the consistency loss\nbetween the input conditional control and extracted condition. 
A\nstraightforward implementation would be generating images from random noises\nand then calculating the consistency loss, but such an approach requires\nstoring gradients for multiple sampling timesteps, leading to considerable time\nand memory costs. To address this, we introduce an efficient reward strategy\nthat deliberately disturbs the input images by adding noise, and then uses the\nsingle-step denoised images for reward fine-tuning. This avoids the extensive\ncosts associated with image sampling, allowing for more efficient reward\nfine-tuning. Extensive experiments show that ControlNet++ significantly\nimproves controllability under various conditional controls. For example, it\nachieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE,\nrespectively, for segmentation mask, line-art edge, and depth conditions. All\nthe code, models, demo and organized data have been open sourced on our Github\nRepo.\n","authors":["Ming Li","Taojiannan Yang","Huafeng Kuang","Jie Wu","Zhaoning Wang","Xuefeng Xiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.07987v4.pdf","comment":"Camera Ready Version. Project Page:\n https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data:\n https://github.com/liming-ai/ControlNet_Plus_Plus"},{"id":"http://arxiv.org/abs/2411.12184v1","updated":"2024-11-19T02:56:45Z","published":"2024-11-19T02:56:45Z","title":"Testability of Instrumental Variables in Additive Nonlinear,\n Non-Constant Effects Models","summary":" We address the issue of the testability of instrumental variables derived\nfrom observational data. Most existing testable implications are centered on\nscenarios where the treatment is a discrete variable, e.g., instrumental\ninequality (Pearl, 1995), or where the effect is assumed to be constant, e.g.,\ninstrumental variables condition based on the principle of independent\nmechanisms (Burauel, 2023). 
However, treatments can often be continuous\nvariables, such as drug dosages or nutritional content levels, and non-constant\neffects may occur in many real-world scenarios. In this paper, we consider an\nadditive nonlinear, non-constant effects model with unmeasured confounders, in\nwhich treatments can be either discrete or continuous, and propose an\nAuxiliary-based Independence Test (AIT) condition to test whether a variable is\na valid instrument. We first show that if the candidate instrument is valid,\nthen the AIT condition holds. Moreover, we illustrate the implications of the\nAIT condition and demonstrate that, in certain conditions, AIT conditions are\nnecessary and sufficient to detect all invalid IVs. We also extend the AIT\ncondition to include covariates and introduce a practical testing algorithm.\nExperimental results on both synthetic and three different real-world datasets\nshow the effectiveness of our proposed condition.\n","authors":["Xichen Guo","Zheng Li","Biwei Huang","Yan Zeng","Zhi Geng","Feng Xie"],"pdf_url":"https://arxiv.org/pdf/2411.12184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16843v2","updated":"2024-11-19T02:52:45Z","published":"2024-02-26T18:59:18Z","title":"Multi-LoRA Composition for Image Generation","summary":" Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models\nfor the accurate rendition of specific elements like distinct characters or\nunique styles in generated images. Nonetheless, existing methods face\nchallenges in effectively composing multiple LoRAs, especially as the number of\nLoRAs to be integrated grows, thus hindering the creation of complex imagery.\nIn this paper, we study multi-LoRA composition through a decoding-centric\nperspective. We present two training-free methods: LoRA Switch, which\nalternates between different LoRAs at each denoising step, and LoRA Composite,\nwhich simultaneously incorporates all LoRAs to guide more cohesive image\nsynthesis. 
To evaluate the proposed approaches, we establish ComposLoRA, a new\ncomprehensive testbed as part of this research. It features a diverse range of\nLoRA categories with 480 composition sets. Utilizing an evaluation framework\nbased on GPT-4V, our findings demonstrate a clear improvement in performance\nwith our methods over the prevalent baseline, particularly evident when\nincreasing the number of LoRAs in a composition. The code, benchmarks, LoRA\nweights, and all evaluation details are available on our project website:\nhttps://maszhongming.github.io/Multi-LoRA-Composition.\n","authors":["Ming Zhong","Yelong Shen","Shuohang Wang","Yadong Lu","Yizhu Jiao","Siru Ouyang","Donghan Yu","Jiawei Han","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16843v2.pdf","comment":"Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2411.12182v1","updated":"2024-11-19T02:48:58Z","published":"2024-11-19T02:48:58Z","title":"Diffusion-Inspired Cold Start with Sufficient Prior in Computerized\n Adaptive Testing","summary":" Computerized Adaptive Testing (CAT) aims to select the most appropriate\nquestions based on the examinee's ability and is widely used in online\neducation. However, existing CAT systems often lack initial understanding of\nthe examinee's ability, requiring random probing questions. This can lead to\npoorly matched questions, extending the test duration and negatively impacting\nthe examinee's mindset, a phenomenon referred to as the Cold Start with\nInsufficient Prior (CSIP) task. This issue occurs because CAT systems do not\neffectively utilize the abundant prior information about the examinee available\nfrom other courses on online platforms. These response records, due to the\ncommonality of cognitive states across different knowledge domains, can provide\nvaluable prior information for the target domain. However, no prior work has\nexplored solutions for the CSIP task. 
In response to this gap, we propose\nDiffusion Cognitive States TransfeR Framework (DCSR), a novel domain transfer\nframework based on Diffusion Models (DMs) to address the CSIP task.\nSpecifically, we construct a cognitive state transition bridge between domains,\nguided by the common cognitive states of examinees, encouraging the model to\nreconstruct the initial ability state in the target domain. To enrich the\nexpressive power of the generated data, we analyze the causal relationships in\nthe generation process from a causal perspective. Redundant and extraneous\ncognitive states can lead to limited transfer and negative transfer effects.\nOur DCSR can seamlessly apply the generated initial ability states in the\ntarget domain to existing question selection algorithms, thus improving the\ncold start performance of the CAT system. Extensive experiments conducted on\nfive real-world datasets demonstrate that DCSR significantly outperforms\nexisting baseline methods in addressing the CSIP task.\n","authors":["Haiping Ma","Aoqing Xia","Changqian Wang","Hai Wang","Xingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12182v1.pdf","comment":"Accepted by KDD2025"},{"id":"http://arxiv.org/abs/2411.12181v1","updated":"2024-11-19T02:48:36Z","published":"2024-11-19T02:48:36Z","title":"Enhancing Low Dose Computed Tomography Images Using Consistency Training\n Techniques","summary":" Diffusion models have significant impact on wide range of generative tasks,\nespecially on image inpainting and restoration. Although the improvements on\naiming for decreasing number of function evaluations (NFE), the iterative\nresults are still computationally expensive. Consistency models are as a new\nfamily of generative models, enable single-step sampling of high quality data\nwithout the need for adversarial training. In this paper, we introduce the beta\nnoise distribution, which provides flexibility in adjusting noise levels. 
This\nis combined with a sinusoidal curriculum that enhances the learning of the\ntrajectory between the noise distribution and the posterior distribution of\ninterest, allowing High Noise Improved Consistency Training (HN-iCT) to be\ntrained in a supervised fashion. Additionally, High Noise Improved Consistency\nTraining with Image Condition (HN-iCT-CN) architecture is introduced, enables\nto take Low Dose images as a condition for extracting significant features by\nWeighted Attention Gates (WAG).Our results indicate that unconditional image\ngeneration using HN-iCT significantly outperforms basic CT and iCT training\ntechniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our\nimage-conditioned model demonstrates exceptional performance in enhancing\nlow-dose (LD) CT scans.\n","authors":["Mahmut S. Gokmen","Jie Zhang","Ge Wang","Jin Chen","Cody Bumgardner"],"pdf_url":"https://arxiv.org/pdf/2411.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11101v2","updated":"2024-11-19T02:39:53Z","published":"2024-11-17T15:17:08Z","title":"Different Horses for Different Courses: Comparing Bias Mitigation\n Algorithms in ML","summary":" With fairness concerns gaining significant attention in Machine Learning\n(ML), several bias mitigation techniques have been proposed, often compared\nagainst each other to find the best method. These benchmarking efforts tend to\nuse a common setup for evaluation under the assumption that providing a uniform\nenvironment ensures a fair comparison. However, bias mitigation techniques are\nsensitive to hyperparameter choices, random seeds, feature selection, etc.,\nmeaning that comparison on just one setting can unfairly favour certain\nalgorithms. In this work, we show significant variance in fairness achieved by\nseveral algorithms and the influence of the learning pipeline on fairness\nscores. 
We highlight that most bias mitigation techniques can achieve\ncomparable performance, given the freedom to perform hyperparameter\noptimization, suggesting that the choice of the evaluation parameters-rather\nthan the mitigation technique itself-can sometimes create the perceived\nsuperiority of one method over another. We hope our work encourages future\nresearch on how various choices in the lifecycle of developing an algorithm\nimpact fairness, and trends that guide the selection of appropriate algorithms.\n","authors":["Prakhar Ganesh","Usman Gohar","Lu Cheng","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2411.11101v2.pdf","comment":"To appear at AFME@NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.12173v1","updated":"2024-11-19T02:35:14Z","published":"2024-11-19T02:35:14Z","title":"SkillTree: Explainable Skill-Based Deep Reinforcement Learning for\n Long-Horizon Control Tasks","summary":" Deep reinforcement learning (DRL) has achieved remarkable success in various\nresearch domains. However, its reliance on neural networks results in a lack of\ntransparency, which limits its practical applications. To achieve\nexplainability, decision trees have emerged as a popular and promising\nalternative to neural networks. Nonetheless, due to their limited\nexpressiveness, traditional decision trees struggle with high-dimensional\nlong-horizon continuous control tasks. In this paper, we proposes SkillTree, a\nnovel framework that reduces complex continuous action spaces into discrete\nskill spaces. Our hierarchical approach integrates a differentiable decision\ntree within the high-level policy to generate skill embeddings, which\nsubsequently guide the low-level policy in executing skills. By making skill\ndecisions explainable, we achieve skill-level explainability, enhancing the\nunderstanding of the decision-making process in complex tasks. 
Experimental\nresults demonstrate that our method achieves performance comparable to\nskill-based neural networks in complex robotic arm control domains.\nFurthermore, SkillTree offers explanations at the skill level, thereby\nincreasing the transparency of the decision-making process.\n","authors":["Yongyan Wen","Siyuan Li","Rongchang Zuo","Lei Yuan","Hangyu Mao","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19957v4","updated":"2024-11-19T02:12:54Z","published":"2024-05-30T11:23:01Z","title":"PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting","summary":" Previous text-to-4D methods have leveraged multiple Score Distillation\nSampling (SDS) techniques, combining motion priors from video-based diffusion\nmodels (DMs) with geometric priors from multiview DMs to implicitly guide 4D\nrenderings. However, differences in these priors result in conflicting gradient\ndirections during optimization, causing trade-offs between motion fidelity and\ngeometry accuracy, and requiring substantial optimization time to reconcile the\nmodels. In this paper, we introduce \\textbf{P}ixel-\\textbf{L}evel\n\\textbf{A}lignment for text-driven \\textbf{4D} Gaussian splatting (PLA4D) to\nresolve this motion-geometry conflict. PLA4D provides an anchor reference,\ni.e., text-generated video, to align the rendering process conditioned by\ndifferent DMs in pixel space. For static alignment, our approach introduces a\nfocal alignment method and Gaussian-Mesh contrastive learning to iteratively\nadjust focal lengths and provide explicit geometric priors at each timestep. At\nthe dynamic level, a motion alignment technique and T-MV refinement method are\nemployed to enforce both pose alignment and motion continuity across unknown\nviewpoints, ensuring intrinsic geometric consistency across views. 
With such\npixel-level multi-DM alignment, our PLA4D framework is able to generate 4D\nobjects with superior geometric, motion, and semantic consistency. Fully\nimplemented with open-source tools, PLA4D offers an efficient and accessible\nsolution for high-quality 4D digital content creation with significantly\nreduced generation time.\n","authors":["Qiaowei Miao","JinSheng Quan","Kehan Li","Yawei Luo"],"pdf_url":"https://arxiv.org/pdf/2405.19957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12164v1","updated":"2024-11-19T02:01:07Z","published":"2024-11-19T02:01:07Z","title":"UrbanDiT: A Foundation Model for Open-World Urban Spatio-Temporal\n Learning","summary":" The urban environment is characterized by complex spatio-temporal dynamics\narising from diverse human activities and interactions. Effectively modeling\nthese dynamics is essential for understanding and optimizing urban systems In\nthis work, we introduce UrbanDiT, a foundation model for open-world urban\nspatio-temporal learning that successfully scale up diffusion transformers in\nthis field. UrbanDiT pioneers a unified model that integrates diverse\nspatio-temporal data sources and types while learning universal spatio-temporal\npatterns across different cities and scenarios. This allows the model to unify\nboth multi-data and multi-task learning, and effectively support a wide range\nof spatio-temporal applications. Its key innovation lies in the elaborated\nprompt learning framework, which adaptively generates both data-driven and\ntask-specific prompts, guiding the model to deliver superior performance across\nvarious urban applications. 
UrbanDiT offers three primary advantages: 1) It\nunifies diverse data types, such as grid-based and graph-based data, into a\nsequential format, allowing to capture spatio-temporal dynamics across diverse\nscenarios of different cities; 2) With masking strategies and task-specific\nprompts, it supports a wide range of tasks, including bi-directional\nspatio-temporal prediction, temporal interpolation, spatial extrapolation, and\nspatio-temporal imputation; and 3) It generalizes effectively to open-world\nscenarios, with its powerful zero-shot capabilities outperforming nearly all\nbaselines with training data. These features allow UrbanDiT to achieves\nstate-of-the-art performance in different domains such as transportation\ntraffic, crowd flows, taxi demand, bike usage, and cellular traffic, across\nmultiple cities and tasks. UrbanDiT sets up a new benchmark for foundation\nmodels in the urban spatio-temporal domain.\n","authors":["Yuan Yuan","Chonghua Han","Jingtao Ding","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.12164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10825v2","updated":"2024-11-19T01:51:37Z","published":"2024-09-17T01:37:57Z","title":"Unveiling and Mitigating Bias in Large Language Model Recommendations: A\n Path to Fairness","summary":" Large Language Model (LLM)-based recommendation systems provide more\ncomprehensive recommendations than traditional systems by deeply analyzing\ncontent and user behavior. However, these systems often exhibit biases,\nfavoring mainstream content while marginalizing non-traditional options due to\nskewed training data. This study investigates the intricate relationship\nbetween bias and LLM-based recommendation systems, with a focus on music, song,\nand book recommendations across diverse demographic and cultural groups.\nThrough a comprehensive analysis conducted over different LLM-models, this\npaper evaluates the impact of bias on recommendation outcomes. 
Our findings\nhighlight that biases are not only deeply embedded but also widely pervasive\nacross these systems, emphasizing the substantial and widespread nature of the\nissue. Moreover, contextual information, such as socioeconomic status, further\namplify these biases, demonstrating the complexity and depth of the challenges\nfaced in creating fair recommendations across different groups.\n","authors":["Shahnewaz Karim Sakib","Anindya Bijoy Das"],"pdf_url":"https://arxiv.org/pdf/2409.10825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07213v4","updated":"2024-11-19T01:27:27Z","published":"2023-12-12T12:26:37Z","title":"Brain-inspired Computing Based on Deep Learning for Human-computer\n Interaction: A Review","summary":" The continuous development of artificial intelligence has a profound impact\non biomedicine and other fields, providing new research ideas and technical\nmethods. Brain-inspired computing is an important intersection between\nmultimodal technology and biomedical field. Focusing on the application\nscenarios of decoding text and speech from brain signals in human-computer\ninteraction, this paper presents a comprehensive review of the brain-inspired\ncomputing models based on deep learning (DL), tracking its evolution,\napplication value, challenges and potential research trends. We first reviews\nits basic concepts and development history, and divides its evolution into two\nstages: recent machine learning and current deep learning, emphasizing the\nimportance of each stage in the research of brain-inspired computing for\nhuman-computer interaction. In addition, the latest progress of deep learning\nin different tasks of brain-inspired computing for human-computer interaction\nis reviewed from five perspectives, including datasets and different brain\nsignals, and the application of key technologies in the model is elaborated in\ndetail. 
Despite significant advances in brain-inspired computational models,\nchallenges remain to fully exploit their capabilities, and we provide insights\ninto possible directions for future academic research. For more detailed\ninformation, please visit our GitHub page:\nhttps://github.com/ultracoolHub/brain-inspired-computing.\n","authors":["Bihui Yu","Sibo Zhang","Lili Zhou","Jingxuan Wei","Linzhuang Sun","Liping Bu"],"pdf_url":"https://arxiv.org/pdf/2312.07213v4.pdf","comment":"26pages, 8 figures and 4 tables"},{"id":"http://arxiv.org/abs/2411.12156v1","updated":"2024-11-19T01:26:20Z","published":"2024-11-19T01:26:20Z","title":"HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning\n with Hard Negatives","summary":" Unsupervised sentence representation learning remains a critical challenge in\nmodern natural language processing (NLP) research. Recently, contrastive\nlearning techniques have achieved significant success in addressing this issue\nby effectively capturing textual semantics. Many such approaches prioritize the\noptimization using negative samples. In fields such as computer vision, hard\nnegative samples (samples that are close to the decision boundary and thus more\ndifficult to distinguish) have been shown to enhance representation learning.\nHowever, adapting hard negatives to contrastive sentence learning is complex\ndue to the intricate syntactic and semantic details of text. To address this\nproblem, we propose HNCSE, a novel contrastive learning framework that extends\nthe leading SimCSE approach. The hallmark of HNCSE is its innovative use of\nhard negative samples to enhance the learning of both positive and negative\nsamples, thereby achieving a deeper semantic understanding. 
Empirical tests on\nsemantic textual similarity and transfer task datasets validate the superiority\nof HNCSE.\n","authors":["Wenxiao Liu","Zihong Yang","Chaozhuo Li","Zijin Hong","Jianfeng Ma","Zhiquan Liu","Litian Zhang","Feiran Huang"],"pdf_url":"https://arxiv.org/pdf/2411.12156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12155v1","updated":"2024-11-19T01:23:52Z","published":"2024-11-19T01:23:52Z","title":"Reinforcement Learning with Action Sequence for Data-Efficient Robot\n Learning","summary":" Training reinforcement learning (RL) agents on robotic tasks typically\nrequires a large number of training samples. This is because training data\noften consists of noisy trajectories, whether from exploration or\nhuman-collected demonstrations, making it difficult to learn value functions\nthat understand the effect of taking each action. On the other hand, recent\nbehavior-cloning (BC) approaches have shown that predicting a sequence of\nactions enables policies to effectively approximate noisy, multi-modal\ndistributions of expert demonstrations. Can we use a similar idea for improving\nRL on robotic tasks? In this paper, we introduce a novel RL algorithm that\nlearns a critic network that outputs Q-values over a sequence of actions. By\nexplicitly training the value functions to learn the consequence of executing a\nseries of current and future actions, our algorithm allows for learning useful\nvalue functions from noisy trajectories. We study our algorithm across various\nsetups with sparse and dense rewards, and with or without demonstrations,\nspanning mobile bi-manual manipulation, whole-body control, and tabletop\nmanipulation tasks from BiGym, HumanoidBench, and RLBench. 
We find that, by\nlearning the critic network with action sequences, our algorithm outperforms\nvarious RL and BC baselines, in particular on challenging humanoid control\ntasks.\n","authors":["Younggyo Seo","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2411.12155v1.pdf","comment":"17 Pages. Website: https://younggyo.me/cqn-as/"},{"id":"http://arxiv.org/abs/2408.02205v3","updated":"2024-11-19T01:10:07Z","published":"2024-08-05T03:08:51Z","title":"Designing Multi-layered Runtime Guardrails for Foundation Model Based\n Agents: Swiss Cheese Model for AI Safety by Design","summary":" Foundation Model (FM)-based agents are revolutionizing application\ndevelopment across various domains. However, their rapidly growing capabilities\nand autonomy have raised significant concerns about AI safety. Researchers are\nexploring better ways to design guardrails to ensure that the runtime behavior\nof FM-based agents remains within specific boundaries. Nevertheless, designing\neffective runtime guardrails is challenging due to the agents' autonomous and\nnon-deterministic behavior. The involvement of multiple pipeline stages and\nagent artifacts, such as goals, plans, tools, at runtime further complicates\nthese issues. Addressing these challenges at runtime requires multi-layered\nguardrails that operate effectively at various levels of the agent\narchitecture. Thus, in this paper, we present a comprehensive taxonomy of\nruntime guardrails for FM-based agents to identify the key quality attributes\nfor guardrails and design dimensions based on the results of a systematic\nliterature review. Inspired by the Swiss Cheese Model, we also propose a\nreference architecture for designing multi-layered runtime guardrails for\nFM-based agents, which includes three dimensions: quality attributes,\npipelines, and artifacts. 
The proposed taxonomy and reference architecture\nprovide concrete and robust guidance for researchers and practitioners to build\nAI-safety-by-design from a software architecture perspective.\n","authors":["Md Shamsujjoha","Qinghua Lu","Dehai Zhao","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.02205v3.pdf","comment":"17 Pages"},{"id":"http://arxiv.org/abs/2411.12150v1","updated":"2024-11-19T00:56:35Z","published":"2024-11-19T00:56:35Z","title":"HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation\n in Crowded and Constrained Environments","summary":" We study the problem of robot navigation in dense and interactive crowds with\nenvironmental constraints such as corridors and furniture. Previous methods\nfail to consider all types of interactions among agents and obstacles, leading\nto unsafe and inefficient robot paths. In this article, we leverage a\ngraph-based representation of crowded and constrained scenarios and propose a\nstructured framework to learn robot navigation policies with deep reinforcement\nlearning. We first split the representations of different components in the\nenvironment and propose a heterogeneous spatio-temporal (st) graph to model\ndistinct interactions among humans, robots, and obstacles. Based on the\nheterogeneous st-graph, we propose HEIGHT, a novel navigation policy network\narchitecture with different components to capture heterogeneous interactions\namong entities through space and time. HEIGHT utilizes attention mechanisms to\nprioritize important interactions and a recurrent network to track changes in\nthe dynamic scene over time, encouraging the robot to avoid collisions\nadaptively. Through extensive simulation and real-world experiments, we\ndemonstrate that HEIGHT outperforms state-of-the-art baselines in terms of\nsuccess and efficiency in challenging navigation scenarios. 
Furthermore, we\ndemonstrate that our pipeline achieves better zero-shot generalization\ncapability than previous works when the densities of humans and obstacles\nchange. More videos are available at\nhttps://sites.google.com/view/crowdnav-height/home.\n","authors":["Shuijing Liu","Haochen Xia","Fatemeh Cheraghi Pouria","Kaiwen Hong","Neeloy Chakraborty","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2411.12150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12142v1","updated":"2024-11-19T00:44:56Z","published":"2024-11-19T00:44:56Z","title":"A Computational Method for Measuring \"Open Codes\" in Qualitative\n Analysis","summary":" Qualitative analysis is critical to understanding human datasets in many\nsocial science disciplines. Open coding is an inductive qualitative process\nthat identifies and interprets \"open codes\" from datasets. Yet, meeting\nmethodological expectations (such as \"as exhaustive as possible\") can be\nchallenging. While many machine learning (ML)/generative AI (GAI) studies have\nattempted to support open coding, few have systematically measured or evaluated\nGAI outcomes, increasing potential bias risks. Building on Grounded Theory and\nThematic Analysis theories, we present a computational method to measure and\nidentify potential biases from \"open codes\" systematically. Instead of\noperationalizing human expert results as the \"ground truth,\" our method is\nbuilt upon a team-based approach between human and machine coders. We\nexperiment with two HCI datasets to establish this method's reliability by 1)\ncomparing it with human analysis, and 2) analyzing its output stability. 
We\npresent evidence-based suggestions and example workflows for ML/GAI to support\nopen coding.\n","authors":["John Chen","Alexandros Lotsos","Lexie Zhao","Jessica Hullman","Bruce Sherin","Uri Wilensky","Michael Horn"],"pdf_url":"https://arxiv.org/pdf/2411.12142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12136v1","updated":"2024-11-19T00:28:14Z","published":"2024-11-19T00:28:14Z","title":"Visualizing Loss Functions as Topological Landscape Profiles","summary":" In machine learning, a loss function measures the difference between model\npredictions and ground-truth (or target) values. For neural network models,\nvisualizing how this loss changes as model parameters are varied can provide\ninsights into the local structure of the so-called loss landscape (e.g.,\nsmoothness) as well as global properties of the underlying model (e.g.,\ngeneralization performance). While various methods for visualizing the loss\nlandscape have been proposed, many approaches limit sampling to just one or two\ndirections, ignoring potentially relevant information in this extremely\nhigh-dimensional space. This paper introduces a new representation based on\ntopological data analysis that enables the visualization of higher-dimensional\nloss landscapes. After describing this new topological landscape profile\nrepresentation, we show how the shape of loss landscapes can reveal new details\nabout model performance and learning dynamics, highlighting several use cases,\nincluding image segmentation (e.g., UNet) and scientific machine learning\n(e.g., physics-informed neural networks). 
Through these examples, we provide\nnew insights into how loss landscapes vary across distinct hyperparameter\nspaces: we find that the topology of the loss landscape is simpler for\nbetter-performing models; and we observe greater variation in the shape of loss\nlandscapes near transitions from low to high model performance.\n","authors":["Caleb Geniesse","Jiaqing Chen","Tiankai Xie","Ge Shi","Yaoqing Yang","Dmitriy Morozov","Talita Perciano","Michael W. Mahoney","Ross Maciejewski","Gunther H. Weber"],"pdf_url":"https://arxiv.org/pdf/2411.12136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13533v3","updated":"2024-11-19T00:06:13Z","published":"2022-10-24T18:34:24Z","title":"Sufficient Invariant Learning for Distribution Shift","summary":" Learning robust models under distribution shifts between training and test\ndatasets is a fundamental challenge in machine learning. While learning\ninvariant features across environments is a popular approach, it often assumes\nthat these features are fully observed in both training and test sets-a\ncondition frequently violated in practice. When models rely on invariant\nfeatures absent in the test set, their robustness in new environments can\ndeteriorate. To tackle this problem, we introduce a novel learning principle\ncalled the Sufficient Invariant Learning (SIL) framework, which focuses on\nlearning a sufficient subset of invariant features rather than relying on a\nsingle feature. After demonstrating the limitation of existing invariant\nlearning methods, we propose a new algorithm, Adaptive Sharpness-aware Group\nDistributionally Robust Optimization (ASGDRO), to learn diverse invariant\nfeatures by seeking common flat minima across the environments. We\ntheoretically demonstrate that finding a common flat minima enables robust\npredictions based on diverse invariant features. 
Empirical evaluations on\nmultiple datasets, including our new benchmark, confirm ASGDRO's robustness\nagainst distribution shifts, highlighting the limitations of existing methods.\n","authors":["Taero Kim","Subeen Park","Sungjun Lim","Yonghan Jung","Krikamol Muandet","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2210.13533v3.pdf","comment":null}],"Computation and Language":[{"id":"http://arxiv.org/abs/2411.12736v1","updated":"2024-11-19T18:58:03Z","published":"2024-11-19T18:58:03Z","title":"ACING: Actor-Critic for Instruction Learning in Black-Box Large Language\n Models","summary":" The effectiveness of Large Language Models (LLMs) in solving tasks vastly\ndepends on the quality of the instructions, which often require fine-tuning\nthrough extensive human effort. This highlights the need for automated\ninstruction optimization; however, this optimization is particularly\nchallenging when dealing with black-box LLMs, where model parameters and\ngradients remain inaccessible. We propose ACING, a task-specific prompt\noptimization approach framed as a stateless continuous-action Reinforcement\nLearning (RL) problem, known as the continuum bandit setting. ACING leverages\nan actor-critic-based method to optimize prompts, learning from\nnon-differentiable reward signals. We validate ACING by optimizing prompts for\nChatGPT on 30 instruction-based tasks. 
ACING consistently outperforms baseline\nmethods, achieving a median score improvement of 10 percentage points.\nFurthermore, ACING not only recovers but also surpasses human-crafted expert\ninstructions, achieving up to a 39 percentage point improvement against human\nbenchmarks.\n","authors":["Salma Kharrat","Fares Fourati","Marco Canini"],"pdf_url":"https://arxiv.org/pdf/2411.12736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12728v1","updated":"2024-11-19T18:51:23Z","published":"2024-11-19T18:51:23Z","title":"Information Theory of Meaningful Communication","summary":" In Shannon's seminal paper, entropy of printed English, treated as a\nstationary stochastic process, was estimated to be roughly 1 bit per character.\nHowever, considered as a means of communication, language differs considerably\nfrom its printed form: (i) the units of information are not characters or even\nwords but clauses, i.e. shortest meaningful parts of speech; and (ii) what is\ntransmitted is principally the meaning of what is being said or written, while\nthe precise phrasing that was used to communicate the meaning is typically\nignored. In this study, we show that one can leverage recently developed large\nlanguage models to quantify information communicated in meaningful narratives\nin terms of bits of meaning per clause.\n","authors":["Doron Sivan","Misha Tsodyks"],"pdf_url":"https://arxiv.org/pdf/2411.12728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12720v1","updated":"2024-11-19T18:38:01Z","published":"2024-11-19T18:38:01Z","title":"Scaling laws for nonlinear dynamical models of speech","summary":" The addition of a nonlinear restoring force to dynamical models of the speech\ngesture significantly improves the empirical accuracy of model predictions, but\nnonlinearity introduces challenges in selecting appropriate parameters and\nnumerical stability, especially when modelling variation in empirical data. 
We\naddress this issue by introducing simple numerical methods for parameterization\nof nonlinear task dynamic models. We first illustrate the problem and then\noutline solutions in the form of power laws that scale nonlinear stiffness\nterms. We apply the scaling laws to a cubic model and show how they facilitate\ninterpretable simulations of the nonlinear gestural dynamics underpinning\nspeech production.\n","authors":["Sam Kirkham"],"pdf_url":"https://arxiv.org/pdf/2411.12720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12719v1","updated":"2024-11-19T18:37:45Z","published":"2024-11-19T18:37:45Z","title":"Rethinking MUSHRA: Addressing Modern Challenges in Text-to-Speech\n Evaluation","summary":" Despite rapid advancements in TTS models, a consistent and robust human\nevaluation framework is still lacking. For example, MOS tests fail to\ndifferentiate between similar models, and CMOS's pairwise comparisons are\ntime-intensive. The MUSHRA test is a promising alternative for evaluating\nmultiple TTS systems simultaneously, but in this work we show that its reliance\non matching human reference speech unduly penalises the scores of modern TTS\nsystems that can exceed human speech quality. More specifically, we conduct a\ncomprehensive assessment of the MUSHRA test, focusing on its sensitivity to\nfactors such as rater variability, listener fatigue, and reference bias. Based\non our extensive evaluation involving 471 human listeners across Hindi and\nTamil we identify two primary shortcomings: (i) reference-matching bias, where\nraters are unduly influenced by the human reference, and (ii) judgement\nambiguity, arising from a lack of clear fine-grained guidelines. To address\nthese issues, we propose two refined variants of the MUSHRA test. The first\nvariant enables fairer ratings for synthesized samples that surpass human\nreference quality. The second variant reduces ambiguity, as indicated by the\nrelatively lower variance across raters. 
By combining these approaches, we\nachieve both more reliable and more fine-grained assessments. We also release\nMANGO, a massive dataset of 47,100 human ratings, the first-of-its-kind\ncollection for Indian languages, aiding in analyzing human preferences and\ndeveloping automatic metrics for evaluating TTS systems.\n","authors":["Praveen Srinivasa Varadhan","Amogh Gulati","Ashwin Sankar","Srija Anand","Anirudh Gupta","Anirudh Mukherjee","Shiva Kumar Marepally","Ankur Bhatia","Saloni Jaju","Suvrat Bhooshan","Mitesh M. Khapra"],"pdf_url":"https://arxiv.org/pdf/2411.12719v1.pdf","comment":"19 pages, 12 Figures"},{"id":"http://arxiv.org/abs/2411.12712v1","updated":"2024-11-19T18:27:25Z","published":"2024-11-19T18:27:25Z","title":"Enhancing Multi-Class Disease Classification: Neoplasms, Cardiovascular,\n Nervous System, and Digestive Disorders Using Advanced LLMs","summary":" In this research, we explored the improvement in terms of multi-class disease\nclassification via pre-trained language models over Medical-Abstracts-TC-Corpus\nthat spans five medical conditions. We excluded non-cancer conditions and\nexamined four specific diseases. We assessed four LLMs, BioBERT, XLNet, and\nBERT, as well as a novel base model (Last-BERT). BioBERT, which was pre-trained\non medical data, demonstrated superior performance in medical text\nclassification (97% accuracy). Surprisingly, XLNet followed closely (96%\naccuracy), demonstrating its generalizability across domains even though it was\nnot pre-trained on medical data. LastBERT, a custom model based on the lighter\nversion of BERT, also proved competitive with 87.10% accuracy (just under\nBERT's 89.33%). 
Our findings confirm the importance of specialized models such\nas BioBERT and also support impressions around more general solutions like\nXLNet and well-tuned transformer architectures with fewer parameters (in this\ncase, LastBERT) in medical domain tasks.\n","authors":["Ahmed Akib Jawad Karim","Muhammad Zawad Mahmud","Samiha Islam","Aznur Azam"],"pdf_url":"https://arxiv.org/pdf/2411.12712v1.pdf","comment":"7 Pages, 4 tables and 11 figures. Under review in a IEEE conference"},{"id":"http://arxiv.org/abs/2411.12703v1","updated":"2024-11-19T18:15:46Z","published":"2024-11-19T18:15:46Z","title":"Strengthening Fake News Detection: Leveraging SVM and Sophisticated Text\n Vectorization Techniques. Defying BERT?","summary":" The rapid spread of misinformation, particularly through online platforms,\nunderscores the urgent need for reliable detection systems. This study explores\nthe utilization of machine learning and natural language processing,\nspecifically Support Vector Machines (SVM) and BERT, to detect news that are\nfake. We employ three distinct text vectorization methods for SVM: Term\nFrequency Inverse Document Frequency (TF-IDF), Word2Vec, and Bag of Words (BoW)\nevaluating their effectiveness in distinguishing between genuine and fake news.\nAdditionally, we compare these methods against the transformer large language\nmodel, BERT. Our comprehensive approach includes detailed preprocessing steps,\nrigorous model implementation, and thorough evaluation to determine the most\neffective techniques. The results demonstrate that while BERT achieves superior\naccuracy with 99.98% and an F1-score of 0.9998, the SVM model with a linear\nkernel and BoW vectorization also performs exceptionally well, achieving 99.81%\naccuracy and an F1-score of 0.9980. 
These findings highlight that, despite\nBERT's superior performance, SVM models with BoW and TF-IDF vectorization\nmethods come remarkably close, offering highly competitive performance with the\nadvantage of lower computational requirements.\n","authors":["Ahmed Akib Jawad Karim","Kazi Hafiz Md Asad","Aznur Azam"],"pdf_url":"https://arxiv.org/pdf/2411.12703v1.pdf","comment":"6 pages, 3 tables and 6 Figures. Submitted to a conference"},{"id":"http://arxiv.org/abs/2406.08316v3","updated":"2024-11-19T17:49:27Z","published":"2024-06-12T15:16:40Z","title":"Is Programming by Example solved by LLMs?","summary":" Programming-by-Examples (PBE) aims to generate an algorithm from input-output\nexamples. Such systems are practically and theoretically important: from an\nend-user perspective, they are deployed to millions of people, and from an AI\nperspective, PBE corresponds to a very general form of few-shot inductive\ninference. Given the success of Large Language Models (LLMs) in code-generation\ntasks, we investigate here the extent to which LLMs can be said to have\n\"solved\" PBE. We experiment on classic domains such as lists and strings, and\nan uncommon graphics programming domain not well represented in typical\npretraining data. We find that pretrained models are not effective at PBE, but\nthat they can be fine-tuned for much higher performance, provided the test\nproblems are in-distribution. We analyze empirically what causes these models\nto succeed and fail, and take steps toward understanding how to achieve better\nout-of-distribution generalization. 
Collectively these results suggest that\nLLMs make strong progress toward solving the typical suite of PBE tasks,\npotentially increasing the flexibility and applicability of PBE systems, while\nalso identifying ways in which LLMs still fall short.\n","authors":["Wen-Ding Li","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2406.08316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00248v2","updated":"2024-11-19T17:46:48Z","published":"2024-10-31T22:58:08Z","title":"A Demonstration of Adaptive Collaboration of Large Language Models for\n Medical Decision-Making","summary":" Medical Decision-Making (MDM) is a multi-faceted process that requires\nclinicians to assess complex multi-modal patient data patient, often\ncollaboratively. Large Language Models (LLMs) promise to streamline this\nprocess by synthesizing vast medical knowledge and multi-modal health data.\nHowever, single-agent are often ill-suited for nuanced medical contexts\nrequiring adaptable, collaborative problem-solving. 
Our MDAgents addresses this\nneed by dynamically assigning collaboration structures to LLMs based on task\ncomplexity, mimicking real-world clinical collaboration and decision-making.\nThis framework improves diagnostic accuracy and supports adaptive responses in\ncomplex, real-world medical scenarios, making it a valuable tool for clinicians\nin various healthcare settings, and at the same time, being more efficient in\nterms of computing cost than static multi-agent decision making methods.\n","authors":["Yubin Kim","Chanwoo Park","Hyewon Jeong","Cristina Grau-Vilchez","Yik Siu Chan","Xuhai Xu","Daniel McDuff","Hyeonhoon Lee","Cynthia Breazeal","Hae Won Park"],"pdf_url":"https://arxiv.org/pdf/2411.00248v2.pdf","comment":"Under Review for ML4H 2024"},{"id":"http://arxiv.org/abs/2411.12685v1","updated":"2024-11-19T17:45:12Z","published":"2024-11-19T17:45:12Z","title":"Enhanced Sign Language Translation between American Sign Language (ASL)\n and Indian Sign Language (ISL) Using LLMs","summary":" We have come up with a research that hopes to provide a bridge between the\nusers of American Sign Language and the users of spoken language and Indian\nSign Language (ISL). The research enabled us to create a novel framework that\nwe have developed for Learner Systems. Leveraging art of Large models to create\nkey features including: - Real-time translation between these two sign\nlanguages in an efficient manner. Making LLM's capability available for\nseamless translations to ISL. Here is the full study showing its implementation\nin this paper. The core of the system is a sophisticated pipeline that begins\nwith reclassification and recognition of ASL gestures based on a strong Random\nForest Classifier. By recognizing the ASL, it is translated into text which can\nbe more easily processed. 
Highly evolved natural language NLP (Natural Language\nProcessing) techniques come in handy as they play a role in our LLM integration\nwhere you then use LLMs to be able to convert the ASL text to ISL which\nprovides you with the intent of sentence or phrase. The final step is to\nsynthesize the translated text back into ISL gestures, creating an end-to-end\ntranslation experience using RIFE-Net. This framework is tasked with key\nchallenges such as automatically dealing with gesture variability and\novercoming the linguistic differences between ASL and ISL. By automating the\ntranslation process, we hope to vastly improve accessibility for sign language\nusers. No longer will the communication gap between ASL and ISL create\nbarriers; this totally cool innovation aims to bring our communities closer\ntogether. And we believe, with full confidence in our framework, that we're\nable to apply the same principles across a wide variety of sign language\ndialects.\n","authors":["Malay Kumar","S. Sarvajit Visagan","Tanish Sarang Mahajan","Anisha Natarajan"],"pdf_url":"https://arxiv.org/pdf/2411.12685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.00480v4","updated":"2024-11-19T17:33:22Z","published":"2021-08-01T15:43:57Z","title":"Realised Volatility Forecasting: Machine Learning via Financial Word\n Embedding","summary":" This study develops a financial word embedding using 15 years of business\nnews. Our results show that this specialised language model produces more\naccurate results than general word embeddings, based on a financial benchmark\nwe established. As an application, we incorporate this word embedding into a\nsimple machine learning model to enhance the HAR model for forecasting realised\nvolatility. This approach statistically and economically outperforms\nestablished econometric models. 
Using an explainable AI method, we also\nidentify key phrases in business news that contribute significantly to\nvolatility, offering insights into language patterns tied to market dynamics.\n","authors":["Eghbal Rahimikia","Stefan Zohren","Ser-Huang Poon"],"pdf_url":"https://arxiv.org/pdf/2108.00480v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02272v3","updated":"2024-11-19T17:29:58Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC, a highly diverse dataset of abstract reasoning tasks. We train\nneural models for induction (inferring latent functions) and transduction\n(directly predicting the test output for a given test input). Our models are\ntrained on synthetic data generated by prompting LLMs to produce Python code\nspecifying a function to be inferred, plus a stochastic subroutine for\ngenerating inputs to that function. We find inductive and transductive models\nsolve very different problems, despite training on the same problems, and\ndespite sharing the same neural architecture.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12671v1","updated":"2024-11-19T17:23:55Z","published":"2024-11-19T17:23:55Z","title":"Neurosymbolic Graph Enrichment for Grounded World Models","summary":" The development of artificial intelligence systems capable of understanding\nand reasoning about complex real-world scenarios is a significant challenge. 
In\nthis work we present a novel approach to enhance and exploit LLM reactive\ncapability to address complex problems and interpret deeply contextual\nreal-world meaning. We introduce a method and a tool for creating a multimodal,\nknowledge-augmented formal representation of meaning that combines the\nstrengths of large language models with structured semantic representations.\nOur method begins with an image input, utilizing state-of-the-art large\nlanguage models to generate a natural language description. This description is\nthen transformed into an Abstract Meaning Representation (AMR) graph, which is\nformalized and enriched with logical design patterns, and layered semantics\nderived from linguistic and factual knowledge bases. The resulting graph is\nthen fed back into the LLM to be extended with implicit knowledge activated by\ncomplex heuristic learning, including semantic implicatures, moral values,\nembodied cognition, and metaphorical representations. By bridging the gap\nbetween unstructured language models and formal semantic structures, our method\nopens new avenues for tackling intricate problems in natural language\nunderstanding and reasoning.\n","authors":["Stefano De Giorgis","Aldo Gangemi","Alessandro Russo"],"pdf_url":"https://arxiv.org/pdf/2411.12671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10020v3","updated":"2024-11-19T17:14:57Z","published":"2024-11-15T07:54:19Z","title":"Information Extraction from Clinical Notes: Are We Ready to Switch to\n Large Language Models?","summary":" Backgrounds: Information extraction (IE) is critical in clinical natural\nlanguage processing (NLP). While large language models (LLMs) excel on\ngenerative tasks, their performance on extractive tasks remains debated.\nMethods: We investigated Named Entity Recognition (NER) and Relation Extraction\n(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples,\nMIMIC-III, and i2b2). 
We developed an annotated corpus covering 4 clinical\nentities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3\nagainst BiomedBERT in terms of performance, generalizability, computational\nresources, and throughput to BiomedBERT. Results: LLaMA models outperformed\nBiomedBERT across datasets. With sufficient training data, LLaMA showed modest\nimprovements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited\ntraining data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7%\n(F1) on NER and 4% on RE. However, LLaMA models required more computing\nresources and ran up to 28 times slower. We implemented \"Kiwi,\" a clinical IE\npackage featuring both models, available at https://kiwi.clinicalnlp.org/.\nConclusion: This study is among the first to develop and evaluate a\ncomprehensive clinical IE system using open-source LLMs. Results indicate that\nLLaMA models outperform BiomedBERT for clinical NER and RE but with higher\ncomputational costs and lower throughputs. These findings highlight that\nchoosing between LLMs and traditional deep learning methods for clinical IE\napplications should remain task-specific, taking into account both performance\nmetrics and practical considerations such as available computing resources and\nthe intended use case scenarios.\n","authors":["Yan Hu","Xu Zuo","Yujia Zhou","Xueqing Peng","Jimin Huang","Vipina K. Keloth","Vincent J. Zhang","Ruey-Ling Weng","Qingyu Chen","Xiaoqian Jiang","Kirk E. 
Roberts","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2411.10020v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12650v1","updated":"2024-11-19T16:58:15Z","published":"2024-11-19T16:58:15Z","title":"Optimizing Airline Reservation Systems with Edge-Enabled Microservices:\n A Framework for Real-Time Data Processing and Enhanced User Responsiveness","summary":" The growing complexity of the operations of airline reservations requires a\nsmart solution for the adoption of novel approaches to the development of\nquick, efficient, and adaptive reservation systems. This paper outlines in\ndetail a conceptual framework for the implementation of edge computing\nmicroservices in order to address the shortcomings of traditional centralized\narchitectures. Specifically, as edge computing allows for certain activities\nsuch as seat inventory checks, booking processes and even confirmation to be\ndone nearer to the user, thus lessening the overall response time and improving\nthe performance of the system. In addition, the framework value should include\nachieving the high performance of the system such as low latency, high\nthroughput and higher user experience. The major design components include\ndeployed distributed computing microservices orchestrated by Kubernetes,\nreal-time message processing system with Kafka and its elastic scaling. Other\noperational components include Prometheus and Grafana, which are used to\nmonitor and manage resources, ensuring that all operational processes are\noptimized. Although this research focuses on a design and theoretical scheming\nof the framework, its use is foreseen to be more advantageous in facilitating a\ntransform in the provision of services in the airline industry by improving\ncustomers' satisfaction, providing infrastructure which is cheap to install and\nefficiently supporting technology changes such as artificial intelligence and\ninternet of things embedded systems. 
This research addresses the increasing\ndemand for new technologies with modern well-distributed and real-time-centric\nsystems and also provides a basis for future case implementation and testing.\nAs such, the proposed architecture offers a market-ready, extensible solution\nto the problems posed by existing airline reservation systems .\n","authors":["Biman Barua","M. Shamim Kaiser"],"pdf_url":"https://arxiv.org/pdf/2411.12650v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.12643v1","updated":"2024-11-19T16:54:30Z","published":"2024-11-19T16:54:30Z","title":"DLBacktrace: A Model Agnostic Explainability for any Deep Learning\n Models","summary":" The rapid advancement of artificial intelligence has led to increasingly\nsophisticated deep learning models, which frequently operate as opaque 'black\nboxes' with limited transparency in their decision-making processes. This lack\nof interpretability presents considerable challenges, especially in high-stakes\napplications where understanding the rationale behind a model's outputs is as\nessential as the outputs themselves. This study addresses the pressing need for\ninterpretability in AI systems, emphasizing its role in fostering trust,\nensuring accountability, and promoting responsible deployment in\nmission-critical fields. To address the interpretability challenge in deep\nlearning, we introduce DLBacktrace, an innovative technique developed by the\nAryaXAI team to illuminate model decisions across a wide array of domains,\nincluding simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks\n(CNNs), Large Language Models (LLMs), Computer Vision Models, and more.\n We provide a comprehensive overview of the DLBacktrace algorithm and present\nbenchmarking results, comparing its performance against established\ninterpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients,\nSmoothGrad, and Attention Rollout, using diverse task-based metrics. 
The\nproposed DLBacktrace technique is compatible with various model architectures\nbuilt in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP\narchitectures such as BERT and LSTMs, computer vision models like ResNet and\nU-Net, as well as custom deep neural network (DNN) models for tabular data.\nThis flexibility underscores DLBacktrace's adaptability and effectiveness in\nenhancing model transparency across a broad spectrum of applications. The\nlibrary is open-sourced and available at https://github.com/AryaXAI/DLBacktrace .\n","authors":["Vinay Kumar Sankarapu","Chintan Chitroda","Yashwardhan Rathore","Neeraj Kumar Singh","Pratinav Seth"],"pdf_url":"https://arxiv.org/pdf/2411.12643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11579v2","updated":"2024-11-19T16:39:57Z","published":"2024-09-17T22:06:46Z","title":"HEARTS: A Holistic Framework for Explainable, Sustainable and Robust\n Text Stereotype Detection","summary":" Stereotypes are generalised assumptions about societal groups, and even\nstate-of-the-art LLMs using in-context learning struggle to identify them\naccurately. Due to the subjective nature of stereotypes, where what constitutes\na stereotype can vary widely depending on cultural, social, and individual\nperspectives, robust explainability is crucial. Explainable models ensure that\nthese nuanced judgments can be understood and validated by human users,\npromoting trust and accountability. We address these challenges by introducing\nHEARTS (Holistic Framework for Explainable, Sustainable, and Robust Text\nStereotype Detection), a framework that enhances model performance, minimises\ncarbon footprint, and provides transparent, interpretable explanations. We\nestablish the Expanded Multi-Grain Stereotype Dataset (EMGSD), comprising\n57,201 labelled texts across six groups, including under-represented\ndemographics like LGBTQ+ and regional stereotypes. 
Ablation studies confirm\nthat BERT models fine-tuned on EMGSD outperform those trained on individual\ncomponents. We then analyse a fine-tuned, carbon-efficient ALBERT-V2 model\nusing SHAP to generate token-level importance values, ensuring alignment with\nhuman understanding, and calculate explainability confidence scores by\ncomparing SHAP and LIME outputs...\n","authors":["Theo King","Zekun Wu","Adriano Koshiyama","Emre Kazim","Philip Treleaven"],"pdf_url":"https://arxiv.org/pdf/2409.11579v2.pdf","comment":"Accepted in NeurIPS 2024 SoLaR Workshop and Safety Gen AI Workshop"},{"id":"http://arxiv.org/abs/2305.14533v2","updated":"2024-11-19T16:34:17Z","published":"2023-05-23T21:33:43Z","title":"How to Choose How to Choose Your Chatbot: A Massively Multi-System\n MultiReference Data Set for Dialog Metric Evaluation","summary":" We release MMSMR, a Massively Multi-System MultiReference dataset to enable\nfuture work on metrics and evaluation for dialog. Automatic metrics for\ndialogue evaluation should be robust proxies for human judgments; however, the\nverification of robustness is currently far from satisfactory. To quantify the\nrobustness correlation and understand what is necessary in a test set, we\ncreate and release an 8-reference dialog dataset by extending single-reference\nevaluation sets and introduce this new language learning conversation dataset.\nWe then train 1750 systems and evaluate them on our novel test set and the\nDailyDialog dataset. 
We release the novel test set, and model hyper parameters,\ninference outputs, and metric scores for each system on a variety of datasets.\n","authors":["Huda Khayrallah","Zuhaib Akhtar","Edward Cohen","Jyothir S V","João Sedoc"],"pdf_url":"https://arxiv.org/pdf/2305.14533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12619v1","updated":"2024-11-19T16:26:19Z","published":"2024-11-19T16:26:19Z","title":"Leveraging Virtual Reality and AI Tutoring for Language Learning: A Case\n Study of a Virtual Campus Environment with OpenAI GPT Integration with Unity\n 3D","summary":" This paper presents a new approach to multiple language learning, with Hindi\nthe language to be learnt in our case, by using the integration of virtual\nreality environments and AI enabled tutoring systems using OpenAIs GPT api\ncalls. We have developed a scenario which has a virtual campus environment\nusing Unity which focuses on a detailed representation of our universitys\nbuildings 11th floor, where most of the cultural and technological activities\ntake place. Within this virtual environment that we have created, we have an AI\ntutor powered by OpenAI's GPT model which was called using an api which moves\naround with the user. This provided language learning support in Hindi, as GPT\nis able to take care of language translation. Our approach mainly involves\nutilising speech to text, text to text conversion and text to speech\ncapabilities to facilitate real time interaction between users and the AI tutor\nin the presence of internet. 
This research demonstrates the use of combining VR\ntechnology with AI tutoring for immersive language learning experiences and\nprovides interaction.\n","authors":["Adithya TG","Abhinavaram N","Gowri Srinivasa"],"pdf_url":"https://arxiv.org/pdf/2411.12619v1.pdf","comment":"5 pages, 2 tables, 8 figures"},{"id":"http://arxiv.org/abs/2411.12587v1","updated":"2024-11-19T15:55:56Z","published":"2024-11-19T15:55:56Z","title":"Whisper Finetuning on Nepali Language","summary":" Despite the growing advancements in Automatic Speech Recognition (ASR)\nmodels, the development of robust models for underrepresented languages, such\nas Nepali, remains a challenge. This research focuses on making an exhaustive\nand generalized dataset followed by fine-tuning OpenAI's Whisper models of\ndifferent sizes to improve transcription (speech-to-text) accuracy for the\nNepali language. We leverage publicly available ASR datasets and self-recorded\ncustom datasets with a diverse range of accents, dialects, and speaking styles\nfurther enriched through augmentation. Our experimental results demonstrate\nthat fine-tuning Whisper models on our curated custom dataset substantially\nreduces the Word Error Rate (WER) across all model sizes attributed to larger\ndata variations in terms of speaker's age, gender, and sentiment, acoustic\nenvironment, dialect, denser audio segments (15-30 seconds) that are more\ncompatible with Whisper's input, and manual curation of audios and\ntranscriptions. Notably, our approach outperforms Whisper's baseline models\ntrained on Fleur's dataset, achieving WER reductions of up to 36.2% on the\nsmall and 23.8% on medium models. Furthermore, we show that data augmentation\nplays a significant role in enhancing model robustness. 
Our approach underlines\nthe importance of dataset quality, variation, and augmentation in the\nadaptation of state-of-the-art models to underrepresented languages for\ndeveloping accurate ASR systems.\n","authors":["Sanjay Rijal","Shital Adhikari","Manish Dahal","Manish Awale","Vaghawan Ojha"],"pdf_url":"https://arxiv.org/pdf/2411.12587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12580v1","updated":"2024-11-19T15:47:12Z","published":"2024-11-19T15:47:12Z","title":"Procedural Knowledge in Pretraining Drives Reasoning in Large Language\n Models","summary":" The capabilities and limitations of Large Language Models have been sketched\nout in great detail in recent years, providing an intriguing yet conflicting\npicture. On the one hand, LLMs demonstrate a general ability to solve problems.\nOn the other hand, they show surprising reasoning gaps when compared to humans,\ncasting doubt on the robustness of their generalisation strategies. The sheer\nvolume of data used in the design of LLMs has precluded us from applying the\nmethod traditionally used to measure generalisation: train-test set separation.\nTo overcome this, we study what kind of generalisation strategies LLMs employ\nwhen performing reasoning tasks by investigating the pretraining data they rely\non. For two models of different sizes (7B and 35B) and 2.5B of their\npretraining tokens, we identify what documents influence the model outputs for\nthree simple mathematical reasoning tasks and contrast this to the data that\nare influential for answering factual questions. We find that, while the models\nrely on mostly distinct sets of data for each factual question, a document\noften has a similar influence across different reasoning questions within the\nsame task, indicating the presence of procedural knowledge. We further find\nthat the answers to factual questions often show up in the most influential\ndata. 
However, for reasoning questions the answers usually do not show up as\nhighly influential, nor do the answers to the intermediate reasoning steps.\nWhen we characterise the top ranked documents for the reasoning questions\nqualitatively, we confirm that the influential documents often contain\nprocedural knowledge, like demonstrating how to obtain a solution using\nformulae or code. Our findings indicate that the approach to reasoning the\nmodels use is unlike retrieval, and more like a generalisable strategy that\nsynthesises procedural knowledge from documents doing a similar form of\nreasoning.\n","authors":["Laura Ruis","Maximilian Mozes","Juhan Bae","Siddhartha Rao Kamalakara","Dwarak Talupuru","Acyr Locatelli","Robert Kirk","Tim Rocktäschel","Edward Grefenstette","Max Bartolo"],"pdf_url":"https://arxiv.org/pdf/2411.12580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12571v1","updated":"2024-11-19T15:39:51Z","published":"2024-11-19T15:39:51Z","title":"Large Language Models for Combinatorial Optimization of Design Structure\n Matrix","summary":" Combinatorial optimization (CO) is essential for improving efficiency and\nperformance in engineering applications. As complexity increases with larger\nproblem sizes and more intricate dependencies, identifying the optimal solution\nbecome challenging. When it comes to real-world engineering problems,\nalgorithms based on pure mathematical reasoning are limited and incapable to\ncapture the contextual nuances necessary for optimization. This study explores\nthe potential of Large Language Models (LLMs) in solving engineering CO\nproblems by leveraging their reasoning power and contextual knowledge. We\npropose a novel LLM-based framework that integrates network topology and domain\nknowledge to optimize the sequencing of Design Structure Matrix (DSM)-a common\nCO problem. 
Our experiments on various DSM cases demonstrate that the proposed\nmethod achieves faster convergence and higher solution quality than benchmark\nmethods. Moreover, results show that incorporating contextual domain knowledge\nsignificantly improves performance despite the choice of LLMs. These findings\nhighlight the potential of LLMs in tackling complex real-world CO problems by\ncombining semantic and mathematical reasoning. This approach paves the way for\na new paradigm in in real-world combinatorial optimization.\n","authors":["Shuo Jiang","Min Xie","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2411.12571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17213v5","updated":"2024-11-19T15:37:57Z","published":"2024-09-25T17:38:39Z","title":"Plurals: A System for Guiding LLMs Via Simulated Social Ensembles","summary":" Recent debates raised concerns that language models may favor certain\nviewpoints. But what if the solution is not to aim for a 'view from nowhere'\nbut rather to leverage different viewpoints? We introduce Plurals, a system and\nPython library for pluralistic AI deliberation. Plurals consists of Agents\n(LLMs, optionally with personas) which deliberate within customizable\nStructures, with Moderators overseeing deliberation. Plurals is a generator of\nsimulated social ensembles. Plurals integrates with government datasets to\ncreate nationally representative personas, includes deliberation templates\ninspired by deliberative democracy, and allows users to customize both\ninformation-sharing structures and deliberation behavior within Structures. Six\ncase studies demonstrate fidelity to theoretical constructs and efficacy. Three\nrandomized experiments show simulated focus groups produced output resonant\nwith an online sample of the relevant audiences (chosen over zero-shot\ngeneration in 75% of trials). Plurals is both a paradigm and a concrete system\nfor pluralistic AI. 
The Plurals library is available at\nhttps://github.com/josh-ashkinaze/plurals and will be continually updated.\n","authors":["Joshua Ashkinaze","Emily Fry","Narendra Edara","Eric Gilbert","Ceren Budak"],"pdf_url":"https://arxiv.org/pdf/2409.17213v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12539v1","updated":"2024-11-19T14:39:29Z","published":"2024-11-19T14:39:29Z","title":"Predicting Customer Satisfaction by Replicating the Survey Response\n Distribution","summary":" For many call centers, customer satisfaction (CSAT) is a key performance\nindicator (KPI). However, only a fraction of customers take the CSAT survey\nafter the call, leading to a biased and inaccurate average CSAT value, and\nmissed opportunities for coaching, follow-up, and rectification. Therefore,\ncall centers can benefit from a model predicting customer satisfaction on calls\nwhere the customer did not complete the survey. Given that CSAT is a closely\nmonitored KPI, it is critical to minimize any bias in the average predicted\nCSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT\n(pCSAT) scores accurately replicate the distribution of survey CSAT responses\nfor every call center with sufficient data in a live production environment.\nThe method can be applied to many multiclass classification problems to improve\nthe class balance and minimize its changes upon model updates.\n","authors":["Etienne Manderscheid","Matthias Lee"],"pdf_url":"https://arxiv.org/pdf/2411.12539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12537v1","updated":"2024-11-19T14:35:38Z","published":"2024-11-19T14:35:38Z","title":"Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues","summary":" Linear Recurrent Neural Networks (LRNNs) such as Mamba, RWKV, GLA, mLSTM, and\nDeltaNet have emerged as efficient alternatives to Transformers in large\nlanguage modeling, offering linear scaling with sequence length and improved\ntraining efficiency. 
However, LRNNs struggle to perform state-tracking which\nmay impair performance in tasks such as code evaluation or tracking a chess\ngame. Even parity, the simplest state-tracking task, which non-linear RNNs like\nLSTM handle effectively, cannot be solved by current LRNNs. Recently, Sarrof et\nal. (2024) demonstrated that the failure of LRNNs like Mamba to solve parity\nstems from restricting the value range of their diagonal state-transition\nmatrices to $[0, 1]$ and that incorporating negative values can resolve this\nissue. We extend this result to non-diagonal LRNNs, which have recently shown\npromise in models such as DeltaNet. We prove that finite precision LRNNs with\nstate-transition matrices having only positive eigenvalues cannot solve parity,\nwhile complex eigenvalues are needed to count modulo $3$. Notably, we also\nprove that LRNNs can learn any regular language when their state-transition\nmatrices are products of identity minus vector outer product matrices, each\nwith eigenvalues in the range $[-1, 1]$. Our empirical results confirm that\nextending the eigenvalue range of models like Mamba and DeltaNet to include\nnegative values not only enables them to solve parity but consistently improves\ntheir performance on state-tracking tasks. Furthermore, pre-training LRNNs with\nan extended eigenvalue range for language modeling achieves comparable\nperformance and stability while showing promise on code and math data. Our work\nenhances the expressivity of modern LRNNs, broadening their applicability\nwithout changing the cost of training or inference.\n","authors":["Riccardo Grazzi","Julien Siems","Jörg K. H. 
Franke","Arber Zela","Frank Hutter","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2411.12537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00028v2","updated":"2024-11-19T14:29:32Z","published":"2024-10-29T04:03:15Z","title":"Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction\n in LBSN","summary":" The fast development of location-based social networks (LBSNs) has led to\nsignificant changes in society, resulting in popular studies of using LBSN data\nfor socioeconomic prediction, e.g., regional population and commercial activity\nestimation. Existing studies design various graphs to model heterogeneous LBSN\ndata, and further apply graph representation learning methods for socioeconomic\nprediction. However, these approaches heavily rely on heuristic ideas and\nexpertise to extract task-relevant knowledge from diverse data, which may not\nbe optimal for specific tasks. Additionally, they tend to overlook the inherent\nrelationships between different indicators, limiting the prediction accuracy.\nMotivated by the remarkable abilities of large language models (LLMs) in\ncommonsense reasoning, embedding, and multi-agent collaboration, in this work,\nwe synergize LLM agents and knowledge graph for socioeconomic prediction. We\nfirst construct a location-based knowledge graph (LBKG) to integrate\nmulti-sourced LBSN data. Then we leverage the reasoning power of LLM agent to\nidentify relevant meta-paths in the LBKG for each type of socioeconomic\nprediction task, and design a semantic-guided attention module for knowledge\nfusion with meta-paths. Moreover, we introduce a cross-task communication\nmechanism to further enhance performance by enabling knowledge sharing across\ntasks at both LLM agent and KG levels. On the one hand, the LLM agents for\ndifferent tasks collaborate to generate more diverse and comprehensive\nmeta-paths. 
On the other hand, the embeddings from different tasks are\nadaptively merged for better socioeconomic prediction. Experiments on two\ndatasets demonstrate the effectiveness of the synergistic design between LLM\nand KG, providing insights for information sharing across socioeconomic\nprediction tasks.\n","authors":["Zhilun Zhou","Jingyang Fan","Yu Liu","Fengli Xu","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.00028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19262v3","updated":"2024-11-19T13:27:30Z","published":"2024-05-29T16:55:32Z","title":"Weak-to-Strong Search: Align Large Language Models via Searching over\n Small Language Models","summary":" Large language models are usually fine-tuned to align with human preferences.\nHowever, fine-tuning a large language model can be challenging. In this work,\nwe introduce $\\textit{weak-to-strong search}$, framing the alignment of a large\nlanguage model as a test-time greedy search to maximize the log-probability\ndifference between small tuned and untuned models while sampling from the\nfrozen large model. This method serves both as (1) a compute-efficient model\nup-scaling strategy that avoids directly tuning the large model and as (2) an\ninstance of weak-to-strong generalization that enhances a strong model with\nweak test-time guidance. Empirically, we demonstrate the flexibility of\nweak-to-strong search across different tasks. In controlled-sentiment\ngeneration and summarization, we use tuned and untuned $\\texttt{gpt2}$s to\nimprove the alignment of large models without additional training. 
Crucially,\nin a more difficult instruction-following benchmark, AlpacaEval 2.0, we show\nthat reusing off-the-shelf small models (e.g., $\\texttt{zephyr-7b-beta}$ and\nits untuned version) can improve the length-controlled win rates of both\nwhite-box and black-box large models against $\\texttt{gpt-4-turbo}$ (e.g.,\n$34.4\\% \\rightarrow 37.9\\%$ for $\\texttt{Llama-3-70B-Instruct}$ and $16.0\\%\n\\rightarrow 20.1\\%$ for $\\texttt{gpt-3.5-turbo-instruct}$), despite the small\nmodels' low win rates $\\approx 10.0\\%$.\n","authors":["Zhanhui Zhou","Zhixuan Liu","Jie Liu","Zhichen Dong","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2405.19262v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.12493v1","updated":"2024-11-19T13:23:53Z","published":"2024-11-19T13:23:53Z","title":"Bias Free Sentiment Analysis","summary":" This paper introduces the Semantic Propagation Graph Neural Network (SProp\nGNN), a machine learning sentiment analysis (SA) architecture that relies\nexclusively on syntactic structures and word-level emotional cues to predict\nemotions in text. By semantically blinding the model to information about\nspecific words, it is robust to biases such as political or gender bias that\nhave been plaguing previous machine learning-based SA systems. The SProp GNN\nshows performance superior to lexicon-based alternatives such as VADER and\nEmoAtlas on two different prediction tasks, and across two languages.\nAdditionally, it approaches the accuracy of transformer-based models while\nsignificantly reducing bias in emotion prediction tasks. 
By offering improved\nexplainability and reducing bias, the SProp GNN bridges the methodological gap\nbetween interpretable lexicon approaches and powerful, yet often opaque, deep\nlearning models, offering a robust tool for fair and effective emotion analysis\nin understanding human behavior through text.\n","authors":["Hubert Plisiecki"],"pdf_url":"https://arxiv.org/pdf/2411.12493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05767v2","updated":"2024-11-19T13:22:21Z","published":"2024-08-11T13:17:14Z","title":"Reference-free Hallucination Detection for Large Vision-Language Models","summary":" Large vision-language models (LVLMs) have made significant progress in recent\nyears. While LVLMs exhibit excellent ability in language understanding,\nquestion answering, and conversations of visual inputs, they are prone to\nproducing hallucinations. While several methods are proposed to evaluate the\nhallucinations in LVLMs, most are reference-based and depend on external tools,\nwhich complicates their practical application. To assess the viability of\nalternative methods, it is critical to understand whether the reference-free\napproaches, which do not rely on any external tools, can efficiently detect\nhallucinations. Therefore, we initiate an exploratory study to demonstrate the\neffectiveness of different reference-free solutions in detecting hallucinations\nin LVLMs. 
In particular, we conduct an extensive study on three kinds of\ntechniques: uncertainty-based, consistency-based, and supervised uncertainty\nquantification methods on four representative LVLMs across two different tasks.\nThe empirical results show that the reference-free approaches are capable of\neffectively detecting non-factual responses in LVLMs, with the supervised\nuncertainty quantification method outperforming the others, achieving the best\nperformance across different settings.\n","authors":["Qing Li","Jiahui Geng","Chenyang Lyu","Derui Zhu","Maxim Panov","Fakhri Karray"],"pdf_url":"https://arxiv.org/pdf/2408.05767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12484v1","updated":"2024-11-19T13:08:03Z","published":"2024-11-19T13:08:03Z","title":"Regular-pattern-sensitive CRFs for Distant Label Interactions","summary":" Linear-chain conditional random fields (CRFs) are a common model component\nfor sequence labeling tasks when modeling the interactions between different\nlabels is important. However, the Markov assumption limits linear-chain CRFs to\nonly directly modeling interactions between adjacent labels. Weighted\nfinite-state transducers (FSTs) are a related approach which can be made to\nmodel distant label-label interactions, but exact label inference is\nintractable for these models in the general case, and the task of selecting an\nappropriate automaton structure for the desired interaction types poses a\npractical challenge. In this work, we present regular-pattern-sensitive CRFs\n(RPCRFs), a method of enriching standard linear-chain CRFs with the ability to\nlearn long-distance label interactions which occur in user-specified patterns.\nThis approach allows users to write regular-expression label patterns concisely\nspecifying which types of interactions the model should take into account,\nallowing the model to learn from data whether and in which contexts these\npatterns occur. 
The result can be interpreted alternatively as a CRF augmented\nwith additional, non-local potentials, or as a finite-state transducer whose\nstructure is defined by a set of easily-interpretable patterns. Critically,\nunlike the general case for FSTs (and for non-chain CRFs), exact training and\ninference are tractable for many pattern sets. In this work, we detail how a\nRPCRF can be automatically constructed from a set of user-specified patterns,\nand demonstrate the model's effectiveness on synthetic data, showing how\ndifferent types of patterns can capture different nonlocal dependency\nstructures in label sequences.\n","authors":["Sean Papay","Roman Klinger","Sebastian Pado"],"pdf_url":"https://arxiv.org/pdf/2411.12484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12483v1","updated":"2024-11-19T13:07:04Z","published":"2024-11-19T13:07:04Z","title":"Analysing Explanation-Related Interactions in Collaborative\n Perception-Cognition-Communication-Action","summary":" Effective communication is essential in collaborative tasks, so AI-equipped\nrobots working alongside humans need to be able to explain their behaviour in\norder to cooperate effectively and earn trust. We analyse and classify\ncommunications among human participants collaborating to complete a simulated\nemergency response task. The analysis identifies messages that relate to\nvarious kinds of interactive explanations identified in the explainable AI\nliterature. This allows us to understand what type of explanations humans\nexpect from their teammates in such settings, and thus where AI-equipped robots\nmost need explanation capabilities. We find that most explanation-related\nmessages seek clarification in the decisions or actions taken. 
We also confirm\nthat messages have an impact on the performance of our simulated task.\n","authors":["Marc Roig Vilamala","Jack Furby","Julian de Gortari Briseno","Mani Srivastava","Alun Preece","Carolina Fuentes Toro"],"pdf_url":"https://arxiv.org/pdf/2411.12483v1.pdf","comment":"4 pages, 3 figures, published as a Late Breaking Report in RO-MAN\n 2024"},{"id":"http://arxiv.org/abs/2411.12473v1","updated":"2024-11-19T12:55:22Z","published":"2024-11-19T12:55:22Z","title":"NMT-Obfuscator Attack: Ignore a sentence in translation with only one\n word","summary":" Neural Machine Translation systems are used in diverse applications due to\ntheir impressive performance. However, recent studies have shown that these\nsystems are vulnerable to carefully crafted small perturbations to their\ninputs, known as adversarial attacks. In this paper, we propose a new type of\nadversarial attack against NMT models. In this attack, we find a word to be\nadded between two sentences such that the second sentence is ignored and not\ntranslated by the NMT model. The word added between the two sentences is such\nthat the whole adversarial text is natural in the source language. This type of\nattack can be harmful in practical scenarios since the attacker can hide\nmalicious information in the automatic translation made by the target NMT\nmodel. Our experiments show that different NMT models and translation tasks are\nvulnerable to this type of attack. 
Our attack can successfully force the NMT\nmodels to ignore the second part of the input in the translation for more than\n50% of all cases while being able to maintain low perplexity for the whole\ninput.\n","authors":["Sahar Sadrizadeh","César Descalzo","Ljiljana Dolamic","Pascal Frossard"],"pdf_url":"https://arxiv.org/pdf/2411.12473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05232v2","updated":"2024-11-19T12:42:45Z","published":"2023-11-09T09:25:37Z","title":"A Survey on Hallucination in Large Language Models: Principles,\n Taxonomy, Challenges, and Open Questions","summary":" The emergence of large language models (LLMs) has marked a significant\nbreakthrough in natural language processing (NLP), fueling a paradigm shift in\ninformation acquisition. Nevertheless, LLMs are prone to hallucination,\ngenerating plausible yet nonfactual content. This phenomenon raises significant\nconcerns over the reliability of LLMs in real-world information retrieval (IR)\nsystems and has attracted intensive research to detect and mitigate such\nhallucinations. Given the open-ended general-purpose attributes inherent to\nLLMs, LLM hallucinations present distinct challenges that diverge from prior\ntask-specific models. This divergence highlights the urgency for a nuanced\nunderstanding and comprehensive overview of recent advances in LLM\nhallucinations. In this survey, we begin with an innovative taxonomy of\nhallucination in the era of LLM and then delve into the factors contributing to\nhallucinations. Subsequently, we present a thorough overview of hallucination\ndetection methods and benchmarks. Our discussion then transfers to\nrepresentative methodologies for mitigating LLM hallucinations. Additionally,\nwe delve into the current limitations faced by retrieval-augmented LLMs in\ncombating hallucinations, offering insights for developing more robust IR\nsystems. 
Finally, we highlight the promising research directions on LLM\nhallucinations, including hallucination in large vision-language models and\nunderstanding of knowledge boundaries in LLM hallucinations.\n","authors":["Lei Huang","Weijiang Yu","Weitao Ma","Weihong Zhong","Zhangyin Feng","Haotian Wang","Qianglong Chen","Weihua Peng","Xiaocheng Feng","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2311.05232v2.pdf","comment":"Accepted by ACM Transactions on Information Systems (TOIS)"},{"id":"http://arxiv.org/abs/2402.06420v2","updated":"2024-11-19T12:41:04Z","published":"2024-02-09T14:08:23Z","title":"Findings of the First Workshop on Simulating Conversational Intelligence\n in Chat","summary":" The aim of the workshop was to bring together experts working on open-domain\ndialogue research. In this speedily advancing research area many challenges\nstill exist, such as learning information from conversations, and engaging in a\nrealistic and convincing simulation of human intelligence and reasoning.\nSCI-CHAT follows previous workshops on open domain dialogue but in contrast the\nfocus of the shared task is simulation of intelligent conversation as judged in\na live human evaluation. Models aim to include the ability to follow a\nchallenging topic over a multi-turn conversation, while positing, refuting and\nreasoning over arguments. The workshop included both a research track and\nshared task. The main goal of this paper is to provide an overview of the\nshared task, and an in depth analysis of the shared task results following\npresentation at the workshop. The current paper is an extension of that made\navailable prior to presentation of results at the workshop at EACL Malta\n(Graham et al., 2024). The data collected in the evaluation was made publicly\navailable to aide future research. 
The code was also made available for the\nsame purpose.\n","authors":["Yvette Graham","Mohammed Rameez Qureshi","Haider Khalid","Gerasimos Lampouras","Ignacio Iacobacci","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2402.06420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04625v3","updated":"2024-11-19T12:41:04Z","published":"2024-06-07T04:19:01Z","title":"Key-Element-Informed sLLM Tuning for Document Summarization","summary":" Remarkable advances in large language models (LLMs) have enabled high-quality\ntext summarization. However, this capability is currently accessible only\nthrough LLMs of substantial size or proprietary LLMs with usage fees. In\nresponse, smaller-scale LLMs (sLLMs) of easy accessibility and low costs have\nbeen extensively studied, yet they often suffer from missing key information\nand entities, i.e., low relevance, in particular, when input documents are\nlong. We hence propose a key-element-informed instruction tuning for\nsummarization, so-called KEITSum, which identifies key elements in documents\nand instructs sLLM to generate summaries capturing these key elements.\nExperimental results on dialogue and news datasets demonstrate that sLLM with\nKEITSum indeed provides high-quality summarization with higher relevance and\nless hallucinations, competitive to proprietary LLM.\n","authors":["Sangwon Ryu","Heejin Do","Yunsu Kim","Gary Geunbae Lee","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2406.04625v3.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2411.12460v1","updated":"2024-11-19T12:36:02Z","published":"2024-11-19T12:36:02Z","title":"Guide-to-Explain for Controllable Summarization","summary":" Recently, large language models (LLMs) have demonstrated remarkable\nperformance in abstractive summarization tasks. However, controllable\nsummarization with LLMs remains underexplored, limiting their ability to\ngenerate summaries that align with specific user preferences. 
In this paper, we\nfirst investigate the capability of LLMs to control diverse attributes,\nrevealing that they encounter greater challenges with numerical attributes,\nsuch as length and extractiveness, compared to linguistic attributes. To\naddress this challenge, we propose a guide-to-explain framework (GTE) for\ncontrollable summarization. Our GTE framework enables the model to identify\nmisaligned attributes in the initial draft and guides it in explaining errors\nin the previous output. Based on this reflection, the model generates a\nwell-adjusted summary. As a result, by allowing the model to reflect on its\nmisalignment, we generate summaries that satisfy the desired attributes in\nsurprisingly fewer iterations than other iterative methods solely using LLMs.\n","authors":["Sangwon Ryu","Heejin Do","Daehee Kim","Yunsu Kim","Gary Geunbae Lee","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2411.12460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12458v1","updated":"2024-11-19T12:29:30Z","published":"2024-11-19T12:29:30Z","title":"Variation between Credible and Non-Credible News Across Topics","summary":" 'Fake News' continues to undermine trust in modern journalism and politics.\nDespite continued efforts to study fake news, results have been conflicting.\nPrevious attempts to analyse and combat fake news have largely focused on\ndistinguishing fake news from truth, or differentiating between its various\nsub-types (such as propaganda, satire, misinformation, etc.) This paper\nconducts a linguistic and stylistic analysis of fake news, focusing on\nvariation between various news topics. 
It builds on related work identifying\nfeatures from discourse and linguistics in deception detection by analysing\nfive distinct news topics: Economy, Entertainment, Health, Science, and Sports.\nThe results emphasize that linguistic features vary between credible and\ndeceptive news in each domain and highlight the importance of adapting\nclassification tasks to accommodate variety-based stylistic and linguistic\ndifferences in order to achieve better real-world performance.\n","authors":["Emilie Francis"],"pdf_url":"https://arxiv.org/pdf/2411.12458v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.12449v1","updated":"2024-11-19T12:17:43Z","published":"2024-11-19T12:17:43Z","title":"\\textsc{Neon}: News Entity-Interaction Extraction for Enhanced Question\n Answering","summary":" Capturing fresh information in near real-time and using it to augment\nexisting large language models (LLMs) is essential to generate up-to-date,\ngrounded, and reliable output. This problem becomes particularly challenging\nwhen LLMs are used for informational tasks in rapidly evolving fields, such as\nWeb search related to recent or unfolding events involving entities, where\ngenerating temporally relevant responses requires access to up-to-the-hour news\nsources. However, the information modeled by the parametric memory of LLMs is\noften outdated, and Web results from prototypical retrieval systems may fail to\ncapture the latest relevant information and struggle to handle conflicting\nreports in evolving news. To address this challenge, we present the NEON\nframework, designed to extract emerging entity interactions -- such as events\nor activities -- as described in news articles. NEON constructs an\nentity-centric timestamped knowledge graph that captures such interactions,\nthereby facilitating enhanced QA capabilities related to news events. 
Our\nframework innovates by integrating open Information Extraction (openIE) style\ntuples into LLMs to enable in-context retrieval-augmented generation. This\nintegration demonstrates substantial improvements in QA performance when\ntackling temporal, entity-centric search queries. Through NEON, LLMs can\ndeliver more accurate, reliable, and up-to-date responses.\n","authors":["Sneha Singhania","Silviu Cucerzan","Allen Herring","Sujay Kumar Jauhar"],"pdf_url":"https://arxiv.org/pdf/2411.12449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04793v2","updated":"2024-11-19T10:59:30Z","published":"2024-05-08T03:57:45Z","title":"Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP\n Model Evaluation","summary":" With the development and proliferation of large, complex, black-box models\nfor solving many natural language processing (NLP) tasks, there is also an\nincreasing necessity of methods to stress-test these models and provide some\ndegree of interpretability or explainability. While counterfactual examples are\nuseful in this regard, automated generation of counterfactuals is a data and\nresource intensive process. such methods depend on models such as pre-trained\nlanguage models that are then fine-tuned on auxiliary, often task-specific\ndatasets, that may be infeasible to build in practice, especially for new tasks\nand data domains. Therefore, in this work we explore the possibility of\nleveraging large language models (LLMs) for zero-shot counterfactual generation\nin order to stress-test NLP models. We propose a structured pipeline to\nfacilitate this generation, and we hypothesize that the instruction-following\nand textual understanding capabilities of recent LLMs can be effectively\nleveraged for generating high quality counterfactuals in a zero-shot manner,\nwithout requiring any training or fine-tuning. 
Through comprehensive\nexperiments on a variety of propreitary and open-source LLMs, along with\nvarious downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot\ncounterfactual generators in evaluating and explaining black-box NLP models.\n","authors":["Amrita Bhattacharjee","Raha Moraffah","Joshua Garland","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2405.04793v2.pdf","comment":"Longer version of short paper accepted at IEEE BigData 2024 (Main\n Track)"},{"id":"http://arxiv.org/abs/2411.12405v1","updated":"2024-11-19T10:41:54Z","published":"2024-11-19T10:41:54Z","title":"Evaluating the Prompt Steerability of Large Language Models","summary":" Building pluralistic AI requires designing models that are able to be shaped\nto represent a wide range of value systems and cultures. Achieving this\nrequires first being able to evaluate the degree to which a given model is\ncapable of reflecting various personas. To this end, we propose a benchmark for\nevaluating the steerability of model personas as a function of prompting. Our\ndesign is based on a formal definition of prompt steerability, which analyzes\nthe degree to which a model's joint behavioral distribution can be shifted from\nits baseline behavior. By defining steerability indices and inspecting how\nthese indices change as a function of steering effort, we can estimate the\nsteerability of a model across various persona dimensions and directions. Our\nbenchmark reveals that the steerability of many current models is limited --\ndue to both a skew in their baseline behavior and an asymmetry in their\nsteerability across many persona dimensions. We release an implementation of\nour benchmark at https://github.com/IBM/prompt-steering.\n","authors":["Erik Miehling","Michael Desmond","Karthikeyan Natesan Ramamurthy","Elizabeth M. 
Daly","Pierre Dognin","Jesus Rios","Djallel Bouneffouf","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17394v2","updated":"2024-11-19T10:27:37Z","published":"2024-04-26T13:14:28Z","title":"Child Speech Recognition in Human-Robot Interaction: Problem Solved?","summary":" Automated Speech Recognition shows superhuman performance for adult English\nspeech on a range of benchmarks, but disappoints when fed children's speech.\nThis has long sat in the way of child-robot interaction. Recent evolutions in\ndata-driven speech recognition, including the availability of Transformer\narchitectures and unprecedented volumes of training data, might mean a\nbreakthrough for child speech recognition and social robot applications aimed\nat children. We revisit a study on child speech recognition from 2017 and show\nthat indeed performance has increased, with newcomer OpenAI Whisper doing\nmarkedly better than leading commercial cloud services. Performance improves\neven more in highly structured interactions when priming models with specific\nphrases. While transcription is not perfect yet, the best model recognises\n60.3% of sentences correctly barring small grammatical differences, with\nsub-second transcription time running on a local GPU, showing potential for\nusable autonomous child-robot speech interactions.\n","authors":["Ruben Janssens","Eva Verhelst","Giulio Antonio Abbo","Qiaoqiao Ren","Maria Jose Pinto Bernal","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2404.17394v2.pdf","comment":"Submitted to 2024 International Conference on Social Robotics"},{"id":"http://arxiv.org/abs/2411.12395v1","updated":"2024-11-19T10:27:26Z","published":"2024-11-19T10:27:26Z","title":"Do LLMs Understand Ambiguity in Text? A Case Study in Open-world\n Question Answering","summary":" Ambiguity in natural language poses significant challenges to Large Language\nModels (LLMs) used for open-domain question answering. 
LLMs often struggle with\nthe inherent uncertainties of human communication, leading to\nmisinterpretations, miscommunications, hallucinations, and biased responses.\nThis significantly weakens their ability to be used for tasks like\nfact-checking, question answering, feature extraction, and sentiment analysis.\nUsing open-domain question answering as a test case, we compare off-the-shelf\nand few-shot LLM performance, focusing on measuring the impact of explicit\ndisambiguation strategies. We demonstrate how simple, training-free,\ntoken-level disambiguation methods may be effectively used to improve LLM\nperformance for ambiguous question answering tasks. We empirically show our\nfindings and discuss best practices and broader impacts regarding ambiguity in\nLLMs.\n","authors":["Aryan Keluskar","Amrita Bhattacharjee","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12395v1.pdf","comment":"Accepted at the REU Symposium at IEEE BigData 2024"},{"id":"http://arxiv.org/abs/2410.19572v4","updated":"2024-11-19T10:00:41Z","published":"2024-10-25T14:07:53Z","title":"ChunkRAG: Novel LLM-Chunk Filtering Method for RAG Systems","summary":" Retrieval-Augmented Generation (RAG) systems using large language models\n(LLMs) often generate inaccurate responses due to the retrieval of irrelevant\nor loosely related information. Existing methods, which operate at the document\nlevel, fail to effectively filter out such content. We propose LLM-driven chunk\nfiltering, ChunkRAG, a framework that enhances RAG systems by evaluating and\nfiltering retrieved information at the chunk level. Our approach employs\nsemantic chunking to divide documents into coherent sections and utilizes\nLLM-based relevance scoring to assess each chunk's alignment with the user's\nquery. By filtering out less pertinent chunks before the generation phase, we\nsignificantly reduce hallucinations and improve factual accuracy. 
Experiments\nshow that our method outperforms existing RAG models, achieving higher accuracy\non tasks requiring precise information retrieval. This advancement enhances the\nreliability of RAG systems, making them particularly beneficial for\napplications like fact-checking and multi-hop reasoning.\n","authors":["Ishneet Sukhvinder Singh","Ritvik Aggarwal","Ibrahim Allahverdiyev","Muhammad Taha","Aslihan Akalin","Kevin Zhu","Sean O'Brien"],"pdf_url":"https://arxiv.org/pdf/2410.19572v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12372v1","updated":"2024-11-19T09:35:28Z","published":"2024-11-19T09:35:28Z","title":"RedPajama: an Open Dataset for Training Large Language Models","summary":" Large language models are increasingly becoming a cornerstone technology in\nartificial intelligence, the sciences, and society as a whole, yet the optimal\nstrategies for dataset composition and filtering remain largely elusive. Many\nof the top-performing models lack transparency in their dataset curation and\nmodel development processes, posing an obstacle to the development of fully\nopen language models. In this paper, we identify three core data-related\nchallenges that must be addressed to advance open-source language models. These\ninclude (1) transparency in model development, including the data curation\nprocess, (2) access to large quantities of high-quality data, and (3)\navailability of artifacts and metadata for dataset curation and analysis. To\naddress these challenges, we release RedPajama-V1, an open reproduction of the\nLLaMA training dataset. In addition, we release RedPajama-V2, a massive\nweb-only dataset consisting of raw, unfiltered text data together with quality\nsignals and metadata. Together, the RedPajama datasets comprise over 100\ntrillion tokens spanning multiple domains and with their quality signals\nfacilitate the filtering of data, aiming to inspire the development of numerous\nnew datasets. 
To date, these datasets have already been used in the training of\nstrong language models used in production, such as Snowflake Arctic,\nSalesforce's XGen and AI2's OLMo. To provide insight into the quality of\nRedPajama, we present a series of analyses and ablation studies with\ndecoder-only language models with up to 1.6B parameters. Our findings\ndemonstrate how quality signals for web data can be effectively leveraged to\ncurate high-quality subsets of the dataset, underscoring the potential of\nRedPajama to advance the development of transparent and high-performing\nlanguage models at scale.\n","authors":["Maurice Weber","Daniel Fu","Quentin Anthony","Yonatan Oren","Shane Adams","Anton Alexandrov","Xiaozhong Lyu","Huu Nguyen","Xiaozhe Yao","Virginia Adams","Ben Athiwaratkun","Rahul Chalamala","Kezhen Chen","Max Ryabinin","Tri Dao","Percy Liang","Christopher Ré","Irina Rish","Ce Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.12372v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2312.07141v3","updated":"2024-11-19T09:33:17Z","published":"2023-12-12T10:24:17Z","title":"Multilingual large language models leak human stereotypes across\n language boundaries","summary":" Multilingual large language models have gained prominence for their\nproficiency in processing and generating text across languages. Like their\nmonolingual counterparts, multilingual models are likely to pick up on\nstereotypes and other social biases present in their training data. In this\npaper, we study a phenomenon we term stereotype leakage, which refers to how\ntraining a model multilingually may lead to stereotypes expressed in one\nlanguage showing up in the models' behaviour in another. We propose a\nmeasurement framework for stereotype leakage and investigate its effect across\nEnglish, Russian, Chinese, and Hindi and with GPT-3.5, mT5, and mBERT. 
Our\nfindings show a noticeable leakage of positive, negative, and non-polar\nassociations across all languages. We find that of these models, GPT-3.5\nexhibits the most stereotype leakage, and Hindi is the most susceptible to\nleakage effects. WARNING: This paper contains model outputs which could be\noffensive in nature.\n","authors":["Yang Trista Cao","Anna Sotnikova","Jieyu Zhao","Linda X. Zou","Rachel Rudinger","Hal Daume III"],"pdf_url":"https://arxiv.org/pdf/2312.07141v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16718v2","updated":"2024-11-19T09:27:37Z","published":"2024-09-25T08:07:18Z","title":"Vision-Language Model Fine-Tuning via Simple Parameter-Efficient\n Modification","summary":" Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed\nthe success of prompt tuning and adapter tuning, while the classic model\nfine-tuning on inherent parameters seems to be overlooked. It is believed that\nfine-tuning the parameters of VLMs with few-shot samples corrupts the\npre-trained knowledge since fine-tuning the CLIP model even degrades\nperformance. In this paper, we revisit this viewpoint, and propose a new\nperspective: fine-tuning the specific parameters instead of all will uncover\nthe power of classic model fine-tuning on VLMs. Through our meticulous study,\nwe propose ClipFit, a simple yet effective method to fine-tune CLIP without\nintroducing any overhead of extra parameters. We demonstrate that by only\nfine-tuning the specific bias terms and normalization layers, ClipFit can\nimprove the performance of zero-shot CLIP by 7.27\\% average harmonic mean\naccuracy. Lastly, to understand how fine-tuning in CLIPFit affects the\npre-trained models, we conducted extensive experimental analyses w.r.t. changes\nin internal parameters and representations. We found that low-level text bias\nlayers and the first layer normalization layer change much more than other\nlayers. 
The code is available at \\url{https://github.com/minglllli/CLIPFit}.\n","authors":["Ming Li","Jike Zhong","Chenxin Li","Liuzhuozheng Li","Nie Lin","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2409.16718v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.12357v1","updated":"2024-11-19T09:18:20Z","published":"2024-11-19T09:18:20Z","title":"A Layered Architecture for Developing and Enhancing Capabilities in\n Large Language Model-based Software Systems","summary":" Significant efforts has been made to expand the use of Large Language Models\n(LLMs) beyond basic language tasks. While the generalizability and versatility\nof LLMs have enabled widespread adoption, evolving demands in application\ndevelopment often exceed their native capabilities. Meeting these demands may\ninvolve a diverse set of methods, such as enhancing creativity through either\ninference temperature adjustments or creativity-provoking prompts. Selecting\nthe right approach is critical, as different methods lead to trade-offs in\nengineering complexity, scalability, and operational costs. This paper\nintroduces a layered architecture that organizes LLM software system\ndevelopment into distinct layers, each characterized by specific attributes. By\naligning capabilities with these layers, the framework encourages the\nsystematic implementation of capabilities in effective and efficient ways that\nultimately supports desired functionalities and qualities. Through practical\ncase studies, we illustrate the utility of the framework. 
This work offers\ndevelopers actionable insights for selecting suitable technologies in LLM-based\nsoftware system development, promoting robustness and scalability.\n","authors":["Dawen Zhang","Xiwei Xu","Chen Wang","Zhenchang Xing","Robert Mao"],"pdf_url":"https://arxiv.org/pdf/2411.12357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06275v4","updated":"2024-11-19T09:06:33Z","published":"2023-09-12T14:36:23Z","title":"Re-Reading Improves Reasoning in Large Language Models","summary":" To enhance the reasoning capabilities of off-the-shelf Large Language Models\n(LLMs), we introduce a simple, yet general and effective prompting method, Re2,\ni.e., \\textbf{Re}-\\textbf{Re}ading the question as input. Unlike most\nthought-eliciting prompting methods, such as Chain-of-Thought (CoT), which aim\nto elicit the reasoning process in the output, Re2 shifts the focus to the\ninput by processing questions twice, thereby enhancing the understanding\nprocess. Consequently, Re2 demonstrates strong generality and compatibility\nwith most thought-eliciting prompting methods, including CoT. Crucially, Re2\nfacilitates a \"bidirectional\" encoding in unidirectional decoder-only LLMs\nbecause the first pass could provide global information for the second pass. We\nbegin with a preliminary empirical study as the foundation of Re2, illustrating\nits potential to enable \"bidirectional\" attention mechanisms. We then evaluate\nRe2 on extensive reasoning benchmarks across 14 datasets, spanning 112\nexperiments, to validate its effectiveness and generality. Our findings\nindicate that, with the exception of a few scenarios on vanilla ChatGPT, Re2\nconsistently enhances the reasoning performance of LLMs through a simple\nre-reading strategy. Further analyses reveal Re2's adaptability, showing how it\ncan be effectively integrated with different LLMs, thought-eliciting prompting,\nand ensemble strategies. 
Our code is available at\n\\url{https://github.com/Tebmer/Rereading-LLM-Reasoning/}\n","authors":["Xiaohan Xu","Chongyang Tao","Tao Shen","Can Xu","Hongbo Xu","Guodong Long","Jian-guang Lou","Shuai Ma"],"pdf_url":"https://arxiv.org/pdf/2309.06275v4.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2406.05085v2","updated":"2024-11-19T08:46:34Z","published":"2024-06-07T16:59:38Z","title":"Multi-Head RAG: Solving Multi-Aspect Problems with LLMs","summary":" Retrieval Augmented Generation (RAG) enhances the abilities of Large Language\nModels (LLMs) by enabling the retrieval of documents into the LLM context to\nprovide more accurate and relevant responses. Existing RAG solutions do not\nfocus on queries that may require fetching multiple documents with\nsubstantially different contents. Such queries occur frequently, but are\nchallenging because the embeddings of these documents may be distant in the\nembedding space, making it hard to retrieve them all. This paper introduces\nMulti-Head RAG (MRAG), a novel scheme designed to address this gap with a\nsimple yet powerful idea: leveraging activations of Transformer's multi-head\nattention layer, instead of the decoder layer, as keys for fetching\nmulti-aspect documents. The driving motivation is that different attention\nheads can learn to capture different data aspects. Harnessing the corresponding\nactivations results in embeddings that represent various facets of data items\nand queries, improving the retrieval accuracy for complex queries. We provide\nan evaluation methodology and metrics, multi-aspect datasets that we release\nonline, and real-world use cases to demonstrate MRAG's effectiveness, showing\nimprovements of up to 20% in relevance over standard RAG baselines. 
MRAG can be\nseamlessly integrated with existing RAG frameworks and benchmarking tools like\nRAGAS as well as different classes of data stores.\n","authors":["Maciej Besta","Ales Kubicek","Roman Niggli","Robert Gerstenberger","Lucas Weitzendorf","Mingyuan Chi","Patrick Iff","Joanna Gajda","Piotr Nyczyk","Jürgen Müller","Hubert Niewiadomski","Marcin Chrapek","Michał Podstawski","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2406.05085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02856v5","updated":"2024-11-19T08:38:55Z","published":"2024-06-05T02:12:06Z","title":"Xmodel-LM Technical Report","summary":" We introduce Xmodel-LM, a compact and efficient 1.1B language model\npre-trained on around 2 trillion tokens. Trained on our self-built dataset\n(Xdata), which balances Chinese and English corpora based on downstream task\noptimization, Xmodel-LM exhibits remarkable performance despite its smaller\nsize. It notably surpasses existing open-source language models of similar\nscale. Our model checkpoints and code are publicly accessible on GitHub at\nhttps://github.com/XiaoduoAILab/XmodelLM.\n","authors":["Yichuan Wang","Yang Liu","Yu Yan","Qun Wang","Xucheng Huang","Ling Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.02856v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01285v2","updated":"2024-11-19T08:08:38Z","published":"2024-10-02T07:14:26Z","title":"Enhancing Training Data Attribution for Large Language Models with\n Fitting Error Consideration","summary":" The black-box nature of large language models (LLMs) poses challenges in\ninterpreting results, impacting issues such as data intellectual property\nprotection and hallucination tracing. Training data attribution (TDA) methods\nare considered effective solutions to address these challenges. Most recent TDA\nmethods rely on influence functions, assuming the model achieves minimized\nempirical risk. 
However, achieving this criterion is difficult, and sourcing\naccuracy can be compromised by fitting errors during model training. In this\npaper, we introduce a novel TDA method called Debias and Denoise Attribution\n(DDA), which enhances influence functions by addressing fitting errors.\nSpecifically, the debias strategy seeks to improve the performance of influence\nfunctions by eliminating the knowledge bias present in the base model before\nfine-tuning, while the denoise strategy aims to reduce discrepancies in\ninfluence scores arising from varying degrees of fitting during the training\nprocess through smoothing techniques. Experimental results demonstrate that our\nmethod significantly outperforms existing approaches, achieving an averaged AUC\nof 91.64%. Moreover, DDA exhibits strong generality and scalability across\nvarious sources and different-scale models like LLaMA2, QWEN2, and Mistral.\n","authors":["Kangxi Wu","Liang Pang","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.01285v2.pdf","comment":"Accepted to the EMNLP 2024 main"},{"id":"http://arxiv.org/abs/2411.12307v1","updated":"2024-11-19T07:48:35Z","published":"2024-11-19T07:48:35Z","title":"Balancing Accuracy and Efficiency in Multi-Turn Intent Classification\n for LLM-Powered Dialog Systems in Production","summary":" Accurate multi-turn intent classification is essential for advancing\nconversational AI systems. However, challenges such as the scarcity of\ncomprehensive datasets and the complexity of contextual dependencies across\ndialogue turns hinder progress. This paper presents two novel approaches\nleveraging Large Language Models (LLMs) to enhance scalability and reduce\nlatency in production dialogue systems. First, we introduce Symbol Tuning,\nwhich simplifies intent labels to reduce task complexity and improve\nperformance in multi-turn dialogues. 
Second, we propose C-LARA\n(Consistency-aware, Linguistics Adaptive Retrieval Augmentation), a framework\nthat employs LLMs for data augmentation and pseudo-labeling to generate\nsynthetic multi-turn dialogues. These enriched datasets are used to fine-tune a\nsmall, efficient model suitable for deployment. Experiments conducted on\nmultilingual dialogue datasets demonstrate significant improvements in\nclassification accuracy and resource efficiency. Our methods enhance multi-turn\nintent classification accuracy by 5.09%, reduce annotation costs by 40%, and\nenable scalable deployment in low-resource multilingual industrial systems,\nhighlighting their practicality and impact.\n","authors":["Junhua Liu","Yong Keat Tan","Bin Fu","Kwan Hui Lim"],"pdf_url":"https://arxiv.org/pdf/2411.12307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15000v3","updated":"2024-11-19T07:46:16Z","published":"2024-02-22T22:28:46Z","title":"Divide-or-Conquer? Which Part Should You Distill Your LLM?","summary":" Recent methods have demonstrated that Large Language Models (LLMs) can solve\nreasoning tasks better when they are encouraged to solve subtasks of the main\ntask first. In this paper we devise a similar strategy that breaks down\nreasoning tasks into a problem decomposition phase and a problem solving phase\nand show that the strategy is able to outperform a single stage solution.\nFurther, we hypothesize that the decomposition should be easier to distill into\na smaller model compared to the problem solving because the latter requires\nlarge amounts of domain knowledge while the former only requires learning\ngeneral problem solving strategies. We propose methods to distill these two\ncapabilities and evaluate their impact on reasoning outcomes and inference\ncost. 
We find that we can distill the problem decomposition phase and at the\nsame time achieve good generalization across tasks, datasets, and models.\nHowever, it is harder to distill the problem solving capability without losing\nperformance and the resulting distilled model struggles with generalization.\nThese results indicate that by using smaller, distilled problem decomposition\nmodels in combination with problem solving LLMs we can achieve reasoning with\ncost-efficient inference and local adaptation.\n","authors":["Zhuofeng Wu","He Bai","Aonan Zhang","Jiatao Gu","VG Vinod Vydiswaran","Navdeep Jaitly","Yizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15000v3.pdf","comment":"Findings of the Association for Computational Linguistics: EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.12287v1","updated":"2024-11-19T07:16:48Z","published":"2024-11-19T07:16:48Z","title":"CUE-M: Contextual Understanding and Enhanced Search with Multimodal\n Large Language Model","summary":" The integration of Retrieval-Augmented Generation (RAG) with Multimodal Large\nLanguage Models (MLLMs) has expanded the scope of multimodal query resolution.\nHowever, current systems struggle with intent understanding, information\nretrieval, and safety filtering, limiting their effectiveness. This paper\nintroduces Contextual Understanding and Enhanced Search with MLLM (CUE-M), a\nnovel multimodal search pipeline that addresses these challenges through a\nmulti-stage framework comprising image context enrichment, intent refinement,\ncontextual query generation, external API integration, and relevance-based\nfiltering. CUE-M incorporates a robust safety framework combining image-based,\ntext-based, and multimodal classifiers, dynamically adapting to instance- and\ncategory-specific risks. 
Evaluations on a multimodal Q&A dataset and a public\nsafety benchmark demonstrate that CUE-M outperforms baselines in accuracy,\nknowledge integration, and safety, advancing the capabilities of multimodal\nretrieval systems.\n","authors":["Dongyoung Go","Taesun Whang","Chanhee Lee","Hwayeon Kim","Sunghoon Park","Seunghwan Ji","Dongchan Kim","Young-Bum Kim"],"pdf_url":"https://arxiv.org/pdf/2411.12287v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2411.12275v1","updated":"2024-11-19T06:55:57Z","published":"2024-11-19T06:55:57Z","title":"Building Trust: Foundations of Security, Safety and Transparency in AI","summary":" This paper explores the rapidly evolving ecosystem of publicly available AI\nmodels, and their potential implications on the security and safety landscape.\nAs AI models become increasingly prevalent, understanding their potential risks\nand vulnerabilities is crucial. We review the current security and safety\nscenarios while highlighting challenges such as tracking issues, remediation,\nand the apparent absence of AI model lifecycle and ownership processes.\nComprehensive strategies to enhance security and safety for both model\ndevelopers and end-users are proposed. This paper aims to provide some of the\nfoundational pieces for more standardized security, safety, and transparency in\nthe development and operation of AI models and the larger open ecosystems and\ncommunities forming around them.\n","authors":["Huzaifa Sidhpurwala","Garth Mollett","Emily Fox","Mark Bestavros","Huamin Chen"],"pdf_url":"https://arxiv.org/pdf/2411.12275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11072v2","updated":"2024-11-19T06:45:13Z","published":"2024-11-17T13:21:26Z","title":"Multilingual Large Language Models: A Systematic Survey","summary":" This paper provides a comprehensive survey of the latest research on\nmultilingual large language models (MLLMs). 
MLLMs not only are able to\nunderstand and generate language across linguistic boundaries, but also\nrepresent an important advancement in artificial intelligence. We first discuss\nthe architecture and pre-training objectives of MLLMs, highlighting the key\ncomponents and methodologies that contribute to their multilingual\ncapabilities. We then discuss the construction of multilingual pre-training and\nalignment datasets, underscoring the importance of data quality and diversity\nin enhancing MLLM performance. An important focus of this survey is on the\nevaluation of MLLMs. We present a detailed taxonomy and roadmap covering the\nassessment of MLLMs' cross-lingual knowledge, reasoning, alignment with human\nvalues, safety, interpretability and specialized applications. Specifically, we\nextensively discuss multilingual evaluation benchmarks and datasets, and\nexplore the use of LLMs themselves as multilingual evaluators. To enhance MLLMs\nfrom black to white boxes, we also address the interpretability of multilingual\ncapabilities, cross-lingual transfer and language bias within these models.\nFinally, we provide a comprehensive review of real-world applications of MLLMs\nacross diverse domains, including biology, medicine, computer science,\nmathematics and law. We showcase how these models have driven innovation and\nimprovements in these specialized fields while also highlighting the challenges\nand opportunities in deploying MLLMs within diverse language communities and\napplication scenarios. 
We listed the paper related in this survey and publicly\navailable at https://github.com/tjunlp-lab/Awesome-Multilingual-LLMs-Papers.\n","authors":["Shaolin Zhu"," Supryadi","Shaoyang Xu","Haoran Sun","Leiyu Pan","Menglong Cui","Jiangcun Du","Renren Jin","António Branco","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2411.11072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12262v1","updated":"2024-11-19T06:21:51Z","published":"2024-11-19T06:21:51Z","title":"Low-resource Machine Translation: what for? who for? An observational\n study on a dedicated Tetun language translation service","summary":" The impact of machine translation (MT) on low-resource languages remains\npoorly understood. In particular, observational studies of actual usage\npatterns are scarce. Such studies could provide valuable insights into user\nneeds and behaviours, complementing survey-based methods. Here we present an\nobservational analysis of real-world MT usage for Tetun, the lingua franca of\nTimor-Leste, using server logs from a widely-used MT service with over $70,000$\nmonthly active users. Our analysis of $100,000$ translation requests reveals\npatterns that challenge assumptions based on existing corpora. We find that\nusers, many of them students on mobile devices, typically translate short texts\ninto Tetun across diverse domains including science, healthcare, and daily\nlife. This contrasts sharply with available Tetun corpora, which are dominated\nby news articles covering government and social issues. Our results suggest\nthat MT systems for languages like Tetun should prioritise translating into the\nlow-resource language, handling brief inputs effectively, and covering a wide\nrange of domains relevant to educational contexts. 
More broadly, this study\ndemonstrates how observational analysis can inform low-resource language\ntechnology development, by grounding research in practical community needs.\n","authors":["Raphael Merx","Hanna Suominen","Adérito José Guterres Correia","Trevor Cohn","Ekaterina Vylomova"],"pdf_url":"https://arxiv.org/pdf/2411.12262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01812v4","updated":"2024-11-19T06:14:31Z","published":"2024-09-14T02:35:29Z","title":"From Text to Multimodality: Exploring the Evolution and Impact of Large\n Language Models in Medical Practice","summary":" Large Language Models (LLMs) have rapidly evolved from text-based systems to\nmultimodal platforms, significantly impacting various sectors including\nhealthcare. This comprehensive review explores the progression of LLMs to\nMultimodal Large Language Models (MLLMs) and their growing influence in medical\npractice. We examine the current landscape of MLLMs in healthcare, analyzing\ntheir applications across clinical decision support, medical imaging, patient\nengagement, and research. The review highlights the unique capabilities of\nMLLMs in integrating diverse data types, such as text, images, and audio, to\nprovide more comprehensive insights into patient health. We also address the\nchallenges facing MLLM implementation, including data limitations, technical\nhurdles, and ethical considerations. By identifying key research gaps, this\npaper aims to guide future investigations in areas such as dataset development,\nmodality alignment methods, and the establishment of ethical guidelines. 
As\nMLLMs continue to shape the future of healthcare, understanding their potential\nand limitations is crucial for their responsible and effective integration into\nmedical practice.\n","authors":["Qian Niu","Keyu Chen","Ming Li","Pohsun Feng","Ziqian Bi","Lawrence KQ Yan","Yichao Zhang","Caitlyn Heqi Yin","Cheng Fei","Junyu Liu","Benji Peng","Tianyang Wang","Yunze Wang","Silin Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01812v4.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.12254v1","updated":"2024-11-19T05:58:22Z","published":"2024-11-19T05:58:22Z","title":"Predicting User Intents and Musical Attributes from Music Discovery\n Conversations","summary":" Intent classification is a text understanding task that identifies user needs\nfrom input text queries. While intent classification has been extensively\nstudied in various domains, it has not received much attention in the music\ndomain. In this paper, we investigate intent classification models for music\ndiscovery conversation, focusing on pre-trained language models. Rather than\nonly predicting functional needs: intent classification, we also include a task\nfor classifying musical needs: musical attribute classification. Additionally,\nwe propose a method of concatenating previous chat history with just\nsingle-turn user queries in the input text, allowing the model to understand\nthe overall conversation context better. 
Our proposed model significantly\nimproves the F1 score for both user intent and musical attribute\nclassification, and surpasses the zero-shot and few-shot performance of the\npretrained Llama 3 model.\n","authors":["Daeyong Kwon","SeungHeon Doh","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2411.12254v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.03182v2","updated":"2024-11-19T05:57:28Z","published":"2024-10-04T06:45:48Z","title":"Generating bilingual example sentences with large language models as\n lexicography assistants","summary":" We present a study of LLMs' performance in generating and rating example\nsentences for bilingual dictionaries across languages with varying resource\nlevels: French (high-resource), Indonesian (mid-resource), and Tetun\n(low-resource), with English as the target language. We evaluate the quality of\nLLM-generated examples against the GDEX (Good Dictionary EXample) criteria:\ntypicality, informativeness, and intelligibility. Our findings reveal that\nwhile LLMs can generate reasonably good dictionary examples, their performance\ndegrades significantly for lower-resourced languages. We also observe high\nvariability in human preferences for example quality, reflected in low\ninter-annotator agreement rates. To address this, we demonstrate that\nin-context learning can successfully align LLMs with individual annotator\npreferences. 
Additionally, we explore the use of pre-trained language models\nfor automated rating of examples, finding that sentence perplexity serves as a\ngood proxy for typicality and intelligibility in higher-resourced languages.\nOur study also contributes a novel dataset of 600 ratings for LLM-generated\nsentence pairs, and provides insights into the potential of LLMs in reducing\nthe cost of lexicographic work, particularly for low-resource languages.\n","authors":["Raphael Merx","Ekaterina Vylomova","Kemal Kurniawan"],"pdf_url":"https://arxiv.org/pdf/2410.03182v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12240v1","updated":"2024-11-19T05:37:17Z","published":"2024-11-19T05:37:17Z","title":"Evaluating Tokenizer Performance of Large Language Models Across\n Official Indian Languages","summary":" Large Language Models (LLMs) based on transformer architectures have\nrevolutionized a variety of domains, with tokenization playing a pivotal role\nin their pre-processing and fine-tuning stages. In multilingual models,\nparticularly those tailored for Indic languages, effective tokenization is\ncrucial for optimizing performance. This paper presents a comprehensive\nevaluation of tokenizers used by 12 LLMs across all 22 official languages of\nIndia, with a focus on comparing the efficiency of their tokenization\nprocesses. We employed the Normalized Sequence Length (NSL) as a key metric in\nour analysis. Our findings reveal that the SUTRA tokenizer outperforms all\nother models, including several Indic-specific models, excelling in 14\nlanguages. Notable insights include the SUTRA tokenizer's superior handling of\nIndic languages, GPT-4o's advancement over its predecessor GPT-4 in processing\nIndian languages, and the limited performance of Project Indus in certain\nlanguages. 
This study underscores the critical importance of developing\ntargeted tokenization strategies for multilingual and Indic-centric models,\nlaying the groundwork for future improvements in tokenizer design to enhance\nlinguistic coverage and model efficiency.\n","authors":["S. Tamang","D. J. Bora"],"pdf_url":"https://arxiv.org/pdf/2411.12240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11019v3","updated":"2024-11-19T05:35:02Z","published":"2023-07-20T16:46:10Z","title":"Investigating the Factual Knowledge Boundary of Large Language Models\n with Retrieval Augmentation","summary":" Large language models (LLMs) have shown impressive prowess in solving a wide\nrange of tasks with world knowledge. However, it remains unclear how well LLMs\nare able to perceive their factual knowledge boundaries, particularly under\nretrieval augmentation settings. In this study, we present the first analysis\non the factual knowledge boundaries of LLMs and how retrieval augmentation\naffects LLMs on open-domain question answering (QA), with a bunch of important\nfindings. Specifically, we focus on three research questions and analyze them\nby examining QA, priori judgement and posteriori judgement capabilities of\nLLMs. We show evidence that LLMs possess unwavering confidence in their\nknowledge and cannot handle the conflict between internal and external\nknowledge well. Furthermore, retrieval augmentation proves to be an effective\napproach in enhancing LLMs' awareness of knowledge boundaries. We further\nconduct thorough experiments to examine how different factors affect LLMs and\npropose a simple method to dynamically utilize supporting documents with our\njudgement strategy. Additionally, we find that the relevance between the\nsupporting documents and the questions significantly impacts LLMs' QA and\njudgemental capabilities. 
The code to reproduce this work is available at\nhttps://github.com/RUCAIBox/LLM-Knowledge-Boundary.\n","authors":["Ruiyang Ren","Yuhao Wang","Yingqi Qu","Wayne Xin Zhao","Jing Liu","Hao Tian","Hua Wu","Ji-Rong Wen","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11019v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12235v1","updated":"2024-11-19T05:19:53Z","published":"2024-11-19T05:19:53Z","title":"BoolQuestions: Does Dense Retrieval Understand Boolean Logic in\n Language?","summary":" Dense retrieval, which aims to encode the semantic information of arbitrary\ntext into dense vector representations or embeddings, has emerged as an\neffective and efficient paradigm for text retrieval, consequently becoming an\nessential component in various natural language processing systems. These\nsystems typically focus on optimizing the embedding space by attending to the\nrelevance of text pairs, while overlooking the Boolean logic inherent in\nlanguage, which may not be captured by current training objectives. In this\nwork, we first investigate whether current retrieval systems can comprehend the\nBoolean logic implied in language. To answer this question, we formulate the\ntask of Boolean Dense Retrieval and collect a benchmark dataset, BoolQuestions,\nwhich covers complex queries containing basic Boolean logic and corresponding\nannotated passages. Through extensive experimental results on the proposed task\nand benchmark dataset, we draw the conclusion that current dense retrieval\nsystems do not fully understand Boolean logic in language, and there is a long\nway to go to improve our dense retrieval systems. 
Furthermore, to promote\nfurther research on enhancing the understanding of Boolean logic for language\nmodels, we explore Boolean operation on decomposed query and propose a\ncontrastive continual training method that serves as a strong baseline for the\nresearch community.\n","authors":["Zongmeng Zhang","Jinhua Zhu","Wengang Zhou","Xiang Qi","Peng Zhang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2411.12235v1.pdf","comment":"Findings of the Association for Computational Linguistics: EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.10557v2","updated":"2024-11-19T05:16:28Z","published":"2024-11-15T20:09:59Z","title":"MLAN: Language-Based Instruction Tuning Improves Zero-Shot\n Generalization of Multimodal Large Language Models","summary":" We present a novel instruction tuning recipe to improve the zero-shot task\ngeneralization of multimodal large language models. In contrast to existing\ninstruction tuning mechanisms that heavily rely on visual instructions, our\napproach focuses on language-based instruction tuning, offering a distinct and\nmore training efficient path for multimodal instruction tuning. We evaluate the\nperformance of the proposed approach on 9 unseen datasets across both language\nand vision modalities. Our results show that our language-only instruction\ntuning is able to significantly improve the performance of two pretrained\nmultimodal models based on Llama 2 and Vicuna on those unseen datasets.\nInterestingly, the language instruction following ability also helps unlock the\nmodels to follow vision instructions without explicit training. Compared to the\nstate of the art multimodal instruction tuning approaches that are mainly based\non visual instructions, our language-based method not only achieves superior\nperformance but also significantly enhances training efficiency. 
For instance,\nthe language-only instruction tuning produces competitive average performance\nacross the evaluated datasets (with even better performance on language\ndatasets) with significant training efficiency improvements (on average 4x),\nthanks to the striking reduction in the need for vision data. With a small\nnumber of visual instructions, this emerging language instruction following\nability transfers well to the unseen vision datasets, outperforming the state\nof the art with greater training efficiency.\n","authors":["Jianhong Tu","Zhuohao Ni","Nicholas Crispino","Zihao Yu","Michael Bendersky","Beliz Gunel","Ruoxi Jia","Xin Liu","Lingjuan Lyu","Dawn Song","Chenguang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.10557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09003v2","updated":"2024-11-19T04:53:47Z","published":"2024-11-13T20:12:55Z","title":"Refusal in LLMs is an Affine Function","summary":" We propose affine concept editing (ACE) as an approach for steering language\nmodels' behavior by intervening directly in activations. We begin with an\naffine decomposition of model activation vectors and show that prior methods\nfor steering model behavior correspond to subsets of terms of this\ndecomposition. We then provide a derivation of ACE and use it to control\nrefusal behavior on ten different models, including Llama 3 70B. ACE combines\naffine subspace projection and activation addition to reliably control the\nmodel's refusal responses across prompt types. We evaluate the results using\nLLM-based scoring on a collection of harmful and harmless prompts. Our\nexperiments demonstrate that ACE consistently achieves more precise control\nover model behavior than existing methods and generalizes to models where\ndirectional ablation via affine subspace projection alone produces incoherent\noutputs. 
Code for reproducing our results is available at\nhttps://github.com/EleutherAI/steering-llama3 .\n","authors":["Thomas Marshall","Adam Scherlis","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2411.09003v2.pdf","comment":"added plots for results from additional models"},{"id":"http://arxiv.org/abs/2410.14148v3","updated":"2024-11-19T03:08:34Z","published":"2024-10-18T03:34:32Z","title":"Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in\n Vision-Language Alignment","summary":" The recent advancements in large language models (LLMs) and pre-trained\nvision models have accelerated the development of vision-language large models\n(VLLMs), enhancing the interaction between visual and linguistic modalities.\nDespite their notable success across various domains, VLLMs face challenges in\nmodality alignment, which can lead to issues like hallucinations and unsafe\ncontent generation. Current alignment techniques often rely on coarse feedback\nand external datasets, limiting scalability and performance. In this paper, we\npropose FiSAO (Fine-Grained Self-Alignment Optimization), a novel\nself-alignment method that utilizes the model's own visual encoder as a\nfine-grained verifier to improve vision-language alignment without the need for\nadditional data. By leveraging token-level feedback from the vision encoder,\nFiSAO significantly improves vision-language alignment, even surpassing\ntraditional preference tuning methods that require additional data. 
Through\nboth theoretical analysis and experimental validation, we demonstrate that\nFiSAO effectively addresses the misalignment problem in VLLMs, marking the\nfirst instance of token-level rewards being applied to such models.\n","authors":["Chenhang Cui","An Zhang","Yiyang Zhou","Zhaorun Chen","Gelei Deng","Huaxiu Yao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.14148v3.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2411.11496v2","updated":"2024-11-19T03:01:43Z","published":"2024-11-18T11:58:07Z","title":"Safe + Safe = Unsafe? Exploring How Safe Images Can Be Exploited to\n Jailbreak Large Vision-Language Models","summary":" Recent advances in Large Vision-Language Models (LVLMs) have showcased strong\nreasoning abilities across multiple modalities, achieving significant\nbreakthroughs in various real-world applications. Despite this great success,\nthe safety guardrail of LVLMs may not cover the unforeseen domains introduced\nby the visual modality. Existing studies primarily focus on eliciting LVLMs to\ngenerate harmful responses via carefully crafted image-based jailbreaks\ndesigned to bypass alignment defenses. In this study, we reveal that a safe\nimage can be exploited to achieve the same jailbreak consequence when combined\nwith additional safe images and prompts. This stems from two fundamental\nproperties of LVLMs: universal reasoning capabilities and safety snowball\neffect. Building on these insights, we propose Safety Snowball Agent (SSA), a\nnovel agent-based framework leveraging agents' autonomous and tool-using\nabilities to jailbreak LVLMs. SSA operates through two principal stages: (1)\ninitial response generation, where tools generate or retrieve jailbreak images\nbased on potential harmful intents, and (2) harmful snowballing, where refined\nsubsequent prompts induce progressively harmful outputs. 
Our experiments\ndemonstrate that \\ours can use nearly any image to induce LVLMs to produce\nunsafe content, achieving high success jailbreaking rates against the latest\nLVLMs. Unlike prior works that exploit alignment flaws, \\ours leverages the\ninherent properties of LVLMs, presenting a profound challenge for enforcing\nsafety in generative multimodal systems. Our code is avaliable at\n\\url{https://github.com/gzcch/Safety_Snowball_Agent}.\n","authors":["Chenhang Cui","Gelei Deng","An Zhang","Jingnan Zheng","Yicong Li","Lianli Gao","Tianwei Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2411.11496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16843v2","updated":"2024-11-19T02:52:45Z","published":"2024-02-26T18:59:18Z","title":"Multi-LoRA Composition for Image Generation","summary":" Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models\nfor the accurate rendition of specific elements like distinct characters or\nunique styles in generated images. Nonetheless, existing methods face\nchallenges in effectively composing multiple LoRAs, especially as the number of\nLoRAs to be integrated grows, thus hindering the creation of complex imagery.\nIn this paper, we study multi-LoRA composition through a decoding-centric\nperspective. We present two training-free methods: LoRA Switch, which\nalternates between different LoRAs at each denoising step, and LoRA Composite,\nwhich simultaneously incorporates all LoRAs to guide more cohesive image\nsynthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new\ncomprehensive testbed as part of this research. It features a diverse range of\nLoRA categories with 480 composition sets. Utilizing an evaluation framework\nbased on GPT-4V, our findings demonstrate a clear improvement in performance\nwith our methods over the prevalent baseline, particularly evident when\nincreasing the number of LoRAs in a composition. 
The code, benchmarks, LoRA\nweights, and all evaluation details are available on our project website:\nhttps://maszhongming.github.io/Multi-LoRA-Composition.\n","authors":["Ming Zhong","Yelong Shen","Shuohang Wang","Yadong Lu","Yizhu Jiao","Siru Ouyang","Donghan Yu","Jiawei Han","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16843v2.pdf","comment":"Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2411.12174v1","updated":"2024-11-19T02:39:28Z","published":"2024-11-19T02:39:28Z","title":"Just KIDDIN: Knowledge Infusion and Distillation for Detection of\n INdecent Memes","summary":" Toxicity identification in online multimodal environments remains a\nchallenging task due to the complexity of contextual connections across\nmodalities (e.g., textual and visual). In this paper, we propose a novel\nframework that integrates Knowledge Distillation (KD) from Large Visual\nLanguage Models (LVLMs) and knowledge infusion to enhance the performance of\ntoxicity detection in hateful memes. Our approach extracts sub-knowledge graphs\nfrom ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused\nwithin a compact VLM framework. The relational context between toxic phrases in\ncaptions and memes, as well as visual concepts in memes enhance the model's\nreasoning capabilities. Experimental results from our study on two hate speech\nbenchmark datasets demonstrate superior performance over the state-of-the-art\nbaselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%,\nrespectively. Given the contextual complexity of the toxicity detection task,\nour approach showcases the significance of learning from both explicit (i.e.\nKG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a\nhybrid neurosymbolic approach. 
This is crucial for real-world applications\nwhere accurate and scalable recognition of toxic content is critical for\ncreating safer online environments.\n","authors":["Rahul Garg","Trilok Padhi","Hemang Jain","Ugur Kursuncu","Ugur Kursuncu","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2411.12174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12157v1","updated":"2024-11-19T01:41:56Z","published":"2024-11-19T01:41:56Z","title":"A Combined Encoder and Transformer Approach for Coherent and\n High-Quality Text Generation","summary":" This research introduces a novel text generation model that combines BERT's\nsemantic interpretation strengths with GPT-4's generative capabilities,\nestablishing a high standard in generating coherent, contextually accurate\nlanguage. Through the combined architecture, the model enhances semantic depth\nand maintains smooth, human-like text flow, overcoming limitations seen in\nprior models. Experimental benchmarks reveal that BERT-GPT-4 surpasses\ntraditional models, including GPT-3, T5, BART, Transformer-XL, and CTRL, in key\nmetrics like Perplexity and BLEU, showcasing its superior natural language\ngeneration performance. By fully utilizing contextual information, this hybrid\nmodel generates text that is not only logically coherent but also aligns\nclosely with human language patterns, providing an advanced solution for text\ngeneration tasks. 
This research highlights the potential of integrating\nsemantic understanding with advanced generative models, contributing new\ninsights for NLP, and setting a foundation for broader applications of\nlarge-scale generative architectures in areas such as automated writing,\nquestion-answer systems, and adaptive conversational agents.\n","authors":["Jiajing Chen","Shuo Wang","Zhen Qi","Zhenhong Zhang","Chihang Wang","Hongye Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.12157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12156v1","updated":"2024-11-19T01:26:20Z","published":"2024-11-19T01:26:20Z","title":"HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning\n with Hard Negatives","summary":" Unsupervised sentence representation learning remains a critical challenge in\nmodern natural language processing (NLP) research. Recently, contrastive\nlearning techniques have achieved significant success in addressing this issue\nby effectively capturing textual semantics. Many such approaches prioritize the\noptimization using negative samples. In fields such as computer vision, hard\nnegative samples (samples that are close to the decision boundary and thus more\ndifficult to distinguish) have been shown to enhance representation learning.\nHowever, adapting hard negatives to contrastive sentence learning is complex\ndue to the intricate syntactic and semantic details of text. To address this\nproblem, we propose HNCSE, a novel contrastive learning framework that extends\nthe leading SimCSE approach. The hallmark of HNCSE is its innovative use of\nhard negative samples to enhance the learning of both positive and negative\nsamples, thereby achieving a deeper semantic understanding. 
Empirical tests on\nsemantic textual similarity and transfer task datasets validate the superiority\nof HNCSE.\n","authors":["Wenxiao Liu","Zihong Yang","Chaozhuo Li","Zijin Hong","Jianfeng Ma","Zhiquan Liu","Litian Zhang","Feiran Huang"],"pdf_url":"https://arxiv.org/pdf/2411.12156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12147v1","updated":"2024-11-19T00:50:06Z","published":"2024-11-19T00:50:06Z","title":"CoMeDi Shared Task: Models as Annotators in Lexical Semantics\n Disagreements","summary":" We present the results of our system for the CoMeDi Shared Task, which\npredicts majority votes (Subtask 1) and annotator disagreements (Subtask 2).\nOur approach combines model ensemble strategies with MLP-based and\nthreshold-based methods trained on pretrained language models. Treating\nindividual models as virtual annotators, we simulate the annotation process by\ndesigning aggregation measures that incorporate continuous similarity scores\nand discrete classification labels to capture both majority and disagreement.\nAdditionally, we employ anisotropy removal techniques to enhance performance.\nExperimental results demonstrate the effectiveness of our methods, particularly\nfor Subtask 2. Notably, we find that continuous similarity scores, even within\nthe same model, align better with human disagreement patterns compared to\naggregated discrete labels.\n","authors":["Zhu Liu","Zhen Hu","Ying Liu"],"pdf_url":"https://arxiv.org/pdf/2411.12147v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.12142v1","updated":"2024-11-19T00:44:56Z","published":"2024-11-19T00:44:56Z","title":"A Computational Method for Measuring \"Open Codes\" in Qualitative\n Analysis","summary":" Qualitative analysis is critical to understanding human datasets in many\nsocial science disciplines. Open coding is an inductive qualitative process\nthat identifies and interprets \"open codes\" from datasets. 
Yet, meeting\nmethodological expectations (such as \"as exhaustive as possible\") can be\nchallenging. While many machine learning (ML)/generative AI (GAI) studies have\nattempted to support open coding, few have systematically measured or evaluated\nGAI outcomes, increasing potential bias risks. Building on Grounded Theory and\nThematic Analysis theories, we present a computational method to measure and\nidentify potential biases from \"open codes\" systematically. Instead of\noperationalizing human expert results as the \"ground truth,\" our method is\nbuilt upon a team-based approach between human and machine coders. We\nexperiment with two HCI datasets to establish this method's reliability by 1)\ncomparing it with human analysis, and 2) analyzing its output stability. We\npresent evidence-based suggestions and example workflows for ML/GAI to support\nopen coding.\n","authors":["John Chen","Alexandros Lotsos","Lexie Zhao","Jessica Hullman","Bruce Sherin","Uri Wilensky","Michael Horn"],"pdf_url":"https://arxiv.org/pdf/2411.12142v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z 
z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 
z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Robotics 36 + +
+
+
+ + ☆ Soft Robotic Dynamic In-Hand Pen Spinning + + +
+ Dynamic in-hand manipulation remains a challenging task for soft robotic +systems that have demonstrated advantages in safe compliant interactions but +struggle with high-speed dynamic tasks. In this work, we present SWIFT, a +system for learning dynamic tasks using a soft and compliant robotic hand. +Unlike previous works that rely on simulation, quasi-static actions and precise +object models, the proposed system learns to spin a pen through trial-and-error +using only real-world data without requiring explicit prior knowledge of the +pen's physical attributes. With self-labeled trials sampled from the real +world, the system discovers the set of pen grasping and spinning primitive +parameters that enables a soft hand to spin a pen robustly and reliably. After +130 sampled actions per object, SWIFT achieves 100% success rate across three +pens with different weights and weight distributions, demonstrating the +system's generalizability and robustness to changes in object properties. The +results highlight the potential for soft robotic end-effectors to perform +dynamic tasks including rapid in-hand manipulation. We also demonstrate that +SWIFT generalizes to spinning items with different shapes and weights such as a +brush and a screwdriver which we spin with 10/10 and 5/10 success rates +respectively. Videos, data, and code are available at +https://soft-spin.github.io. + +
+
+
+
+
+ + ☆ UBSoft: A Simulation Platform for Robotic Skill Learning in Unbounded + Soft Environments CoRL 2024 + + +
+ It is desired to equip robots with the capability of interacting with various +soft materials as they are ubiquitous in the real world. While physics +simulations are one of the predominant methods for data collection and robot +training, simulating soft materials presents considerable challenges. +Specifically, it is significantly more costly than simulating rigid objects in +terms of simulation speed and storage requirements. These limitations typically +restrict the scope of studies on soft materials to small and bounded areas, +thereby hindering the learning of skills in broader spaces. To address this +issue, we introduce UBSoft, a new simulation platform designed to support +unbounded soft environments for robot skill acquisition. Our platform utilizes +spatially adaptive resolution scales, where simulation resolution dynamically +adjusts based on proximity to active robotic agents. Our framework markedly +reduces the demand for extensive storage space and computation costs required +for large-scale scenarios involving soft materials. We also establish a set of +benchmark tasks in our platform, including both locomotion and manipulation +tasks, and conduct experiments to evaluate the efficacy of various +reinforcement learning algorithms and trajectory optimization techniques, both +gradient-based and sampling-based. Preliminary results indicate that +sampling-based trajectory optimization generally achieves better results for +obtaining one trajectory to solve the task. Additionally, we conduct +experiments in real-world environments to demonstrate that advancements made in +our UBSoft simulator could translate to improved robot interactions with +large-scale soft material. More videos can be found at +https://vis-www.cs.umass.edu/ubsoft/. + +
+
+ comment: CoRL 2024. The first two authors contributed equally to this paper +
+
+
+
+
+ + ☆ Identifying patterns of proprioception and target matching acuity in + healthy humans + + +
+ Traditional approaches to measurement in upper-limb therapy have gaps that +electronic sensing and recording can help fill. We highlight shortcomings in +current kinematic recording devices, and we introduce a wrist sensing device +that performs multimodal sensing during single-axis rotation. Our goal is to +characterize normative kinesthetic perception and real-world performance as a +multimodal sensory "fingerprint" that can serve as a reference point for +identifying deficits in persons affected by stroke, and then as a jumping-off point +for later neuroscientific interrogation. We present an experiment involving +psychophysical measurements of passive stimuli discrimination, matching +adjustment acuity, and ADL performance in 11 neurologically-intact persons. We +found that passive velocity sense and active position sense of healthy +controls, measured by velocity discrimination and position matching +respectively, correlated in rank with each other, but other score comparisons +of acuity or task performance had no statistically significant correlations. We +also found that participants differed in acuity between passive and active +velocity sense, which supports current understanding about muscle spindle +activation being modulated by conscious motor command. The potential for our +null correlation results to reveal dissociable aspects of deficit is discussed, +as well as implications for future neuroscientific study with more kinematic +measures and larger datasets. + +
+
+ comment: 14 pages, 15 figures; A newer version of this work has been submitted + to the 2024 IEEE EMBC for possible publication in their conference + proceedings +
+
+
+
+
+ + ☆ Data-efficient Tactile Sensing with Electrical Impedance Tomography + + +
+ Electrical Impedance Tomography (EIT)-inspired tactile sensors are gaining +attention in robotic tactile sensing due to their cost-effectiveness, safety, +and scalability with sparse electrode configurations. This paper presents a +data augmentation strategy for learning-based tactile reconstruction that +amplifies the original single-frame signal measurement into 32 distinct, +effective signal data for training. This approach supplements uncollected +conditions of position information, resulting in more accurate and +high-resolution tactile reconstructions. Data augmentation for EIT +significantly reduces the required EIT measurements and achieves promising +performance with even limited samples. Simulation results show that the +proposed method improves the correlation coefficient by over 12% and reduces +the relative error by over 21% under various noise levels. Furthermore, we +demonstrate that a standard deep neural network (DNN) utilizing the proposed +data augmentation reduces the required data down to 1/31 while achieving a +similar tactile reconstruction quality. Real-world tests further validate the +approach's effectiveness on a flexible EIT-based tactile sensor. These results +could help address the challenge of training tactile sensing networks with +limited available measurements, improving the accuracy and applicability of +EIT-based tactile sensing systems. + +
+
+
+
+
+ + ☆ Instant Policy: In-Context Imitation Learning via Graph Diffusion + + +
+ Following the impressive capabilities of in-context learning with large +transformers, In-Context Imitation Learning (ICIL) is a promising opportunity +for robotics. We introduce Instant Policy, which learns new tasks instantly +(without further training) from just one or two demonstrations, achieving ICIL +through two key components. First, we introduce inductive biases through a +graph representation and model ICIL as a graph generation problem with a +learned diffusion process, enabling structured reasoning over demonstrations, +observations, and actions. Second, we show that such a model can be trained +using pseudo-demonstrations - arbitrary trajectories generated in simulation - +as a virtually infinite pool of training data. Simulated and real experiments +show that Instant Policy enables rapid learning of various everyday robot +tasks. We also show how it can serve as a foundation for cross-embodiment and +zero-shot transfer to language-defined tasks. Code and videos are available at +https://www.robot-learning.uk/instant-policy. + +
+
+ comment: Code and videos are available on our project webpage at + https://www.robot-learning.uk/instant-policy +
+
+
+
+
+ + ☆ Locomotion Mode Transitions: Tackling System- and User-Specific + Variability in Lower-Limb Exoskeletons + + +
+ Accurate detection of locomotion transitions, such as walk to sit, walk to +stair ascent, and descent, is crucial to effectively control robotic assistive +devices, such as lower-limb exoskeletons, as each locomotion mode requires +specific assistance. Variability in collected sensor data introduced by user- +or system-specific characteristics makes it challenging to maintain high +transition detection accuracy while avoiding latency using non-adaptive +classification models. In this study, we identified key factors influencing +transition detection performance, including variations in user behavior, and +different mechanical designs of the exoskeletons. To boost the transition +detection accuracy, we introduced two methods for adapting a finite-state +machine classifier to system- and user-specific variability: a Statistics-Based +approach and Bayesian Optimization. Our experimental results demonstrate that +both methods remarkably improve transition detection accuracy across diverse +users, achieving up to an 80% increase in certain scenarios compared to the +non-personalized threshold method. These findings emphasize the importance of +personalization in adaptive control systems, underscoring the potential for +enhanced user experience and effectiveness in assistive devices. By +incorporating subject- and system-specific data into the model training +process, our approach offers a precise and reliable solution for detecting +locomotion transitions, catering to individual user needs, and ultimately +improving the performance of assistive devices. + +
+
+ comment: 16 pages, 16 figures +
+
+
+
+
+ + ☆ Tactile interaction with social robots influences attitudes and + behaviour + + +
+ Tactile interaction plays an essential role in human-to-human interaction. +People gain comfort and support from tactile interactions with others and touch +is an important predictor for trust. While touch has been explored as a +communicative modality in HCI and HRI, we here report on two studies in which +touching a social robot is used to regulate people's stress levels and +consequently their actions. In the first study, we look at whether different +intensities of tactile interaction result in a physiological response related +to stress, and whether the interaction impacts risk-taking behaviour and trust. +We let 38 participants complete a Balloon Analogue Risk Task (BART), a +computer-based game that serves as a proxy for risk-taking behaviour. In our +study, participants are supported by a robot during the BART task. The robot +builds trust and encourages participants to take more risk. The results show +that affective tactile interaction with the robot increases participants' +risk-taking behaviour, but gentle affective tactile interaction increases +comfort and lowers stress whereas high-intensity touch does not. We also find +that male participants exhibit more risk-taking behaviour than females while +being less stressed. Based on this experiment, a second study is used to +ascertain whether these effects are caused by the social nature of tactile +interaction or by the physical interaction alone. For this, instead of a social +robot, participants now have a tactile interaction with a non-social device. +The non-social interaction does not result in any effect, leading us to +conclude that tactile interaction with humanoid robots is a social phenomenon +rather than a mere physical phenomenon. + +
+
+
+
+
+ + ☆ Multilayer occupancy grid for obstacle avoidance in an autonomous ground + vehicle using RGB-D camera + + +
+ This work describes the process of integrating a depth camera into the +navigation system of a self-driving ground vehicle (SDV) and the implementation +of a multilayer costmap that enhances the vehicle's obstacle identification +process by expanding its two-dimensional field of view, based on 2D LIDAR, to a +three-dimensional perception system using an RGB-D camera. This approach lays +the foundation for a robust vision-based navigation and obstacle detection +system. A theoretical review is presented and implementation results are +discussed for future work. + +
+
+
+
+
+ + ☆ VMGNet: A Low Computational Complexity Robotic Grasping Network Based on + VMamba with Multi-Scale Feature Fusion + + +
+ While deep learning-based robotic grasping technology has demonstrated strong +adaptability, its computational complexity has also significantly increased, +making it unsuitable for scenarios with high real-time requirements. Therefore, +we propose a low computational complexity and high accuracy model named VMGNet +for robotic grasping. For the first time, we introduce the Visual State Space +into the robotic grasping field to achieve linear computational complexity, +thereby greatly reducing the model's computational cost. Meanwhile, to improve +the accuracy of the model, we propose an efficient and lightweight multi-scale +feature fusion module, named Fusion Bridge Module, to extract and fuse +information at different scales. We also present a new loss function +calculation method to enhance the importance differences between subtasks, +improving the model's fitting ability. Experiments show that VMGNet has only +8.7G Floating Point Operations and an inference time of 8.1 ms on our devices. +VMGNet also achieved state-of-the-art performance on the Cornell and Jacquard +public datasets. To validate VMGNet's effectiveness in practical applications, +we conducted real grasping experiments in multi-object scenarios, and VMGNet +achieved an excellent performance with a 94.4% success rate in real-world +grasping tasks. The video for the real-world robotic grasping experiments is +available at https://youtu.be/S-QHBtbmLc4. + +
+
+
+
+
+ + ☆ ManiSkill-ViTac 2025: Challenge on Manipulation Skill Learning With + Vision and Tactile Sensing + + +
+ This article introduces the ManiSkill-ViTac Challenge 2025, which focuses on +learning contact-rich manipulation skills using both tactile and visual +sensing. Expanding upon the 2024 challenge, ManiSkill-ViTac 2025 includes 3 +independent tracks: tactile manipulation, tactile-vision fusion manipulation, +and tactile sensor structure design. The challenge aims to push the boundaries +of robotic manipulation skills, emphasizing the integration of tactile and +visual data to enhance performance in complex, real-world tasks. Participants +will be evaluated using standardized metrics across both simulated and +real-world environments, spurring innovations in sensor design and +significantly advancing the field of vision-tactile fusion in robotics. + +
+
+ comment: Challenge webpage: + https://ai-workshops.github.io/maniskill-vitac-challenge-2025/ +
+
+
+
+
+ + ☆ Robotic transcatheter tricuspid valve replacement with hybrid enhanced + intelligence: a new paradigm and first-in-vivo study + + +
+ Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for +tricuspid regurgitation and is in the early stages of clinical adoption. +Intelligent robotic approaches are expected to overcome the challenges of +surgical manipulation and widespread dissemination, but systems and protocols +with high clinical utility have not yet been reported. In this study, we +propose a complete solution that includes a passive stabilizer, robotic drive, +detachable delivery catheter and valve manipulation mechanism. Working towards +autonomy, a hybrid augmented intelligence approach based on reinforcement +learning, Monte Carlo probabilistic maps and human-robot co-piloted control was +introduced. Systematic tests in phantom and first-in-vivo animal experiments +were performed to verify that the system design met the clinical requirement. +Furthermore, the experimental results confirmed the advantages of co-piloted +control over conventional master-slave control in terms of time efficiency, +control efficiency, autonomy and stability of operation. In conclusion, this +study provides a comprehensive pathway for robotic TTVR and, to our knowledge, +completes the first animal study that not only successfully demonstrates the +application of hybrid enhanced intelligence in interventional robotics, but +also provides a solution with high application value for a cutting-edge +procedure. + +
+
+
+
+
+ + ☆ Behaviour diversity in a walking and climbing centipede-like virtual + creature + + +
+ Robot controllers are often optimised for a single robot in a single +environment. This approach proves brittle, as such a controller will often fail +to produce sensible behavior for a new morphology or environment. In +comparison, animal gaits are robust and versatile. By observing animals, and +attempting to extract general principles of locomotion from their movement, we +aim to design a single decentralised controller applicable to diverse +morphologies and environments. The controller implements the three components +1) undulation, 2) peristalsis, and 3) leg motion, which we believe are the +essential elements in most animal gaits. The controller is tested on a variety +of simulated centipede-like robots. The centipede is chosen as inspiration +because it moves using both body contractions and legged locomotion. For a +controller to work in qualitatively different settings, it must also be able to +exhibit qualitatively different behaviors. We find that six different modes of +locomotion emerge from our controller in response to environmental and +morphological changes. We also find that different parts of the centipede model +can exhibit different modes of locomotion, simultaneously, based on local +morphological features. This controller can potentially aid in the design or +evolution of robots, by quickly testing the potential of a morphology, or be +used to get insights about underlying locomotion principles in the centipede. + +
+
+
+
+
+ + ☆ Breathless: An 8-hour Performance Contrasting Human and Robot + Expressiveness + + +
+ This paper describes the robot technology behind an original performance that +pairs a human dancer (Cuan) with an industrial robot arm for an eight-hour +dance that unfolds over the timespan of an American workday. To control the +robot arm, we combine a range of sinusoidal motions with varying amplitude, +frequency and offset at each joint to evoke human motions common in physical +labor such as stirring, digging, and stacking. More motions were developed +using deep learning techniques for video-based human-pose tracking and +extraction. We combine these pre-recorded motions with improvised robot motions +created live by putting the robot into teach-mode and triggering force sensing +from the robot joints onstage. All motions are combined with commercial and +original music using a custom suite of python software with AppleScript, +Keynote, and Zoom to facilitate on-stage communication with the dancer. The +resulting performance contrasts the expressivity of the human body with the +precision of robot machinery. Video, code and data are available on the project +website: https://sites.google.com/playing.studio/breathless + +
+
+ comment: 15 pages, 9 figures, accepted for ISRR (International Symposium of + Robotics Research) 2024 +
+
+
+
+
+ + ☆ TactV: A Class of Hybrid Terrestrial/Aerial Coaxial Tilt-Rotor Vehicles + + +
+ To enhance the obstacle-crossing and endurance capabilities of vehicles +operating in complex environments, this paper presents the design of a hybrid +terrestrial/aerial coaxial tilt-rotor vehicle, TactV, which integrates +advantages such as lightweight construction and high maneuverability. Unlike +existing tandem dual-rotor vehicles, TactV employs a tiltable coaxial +dual-rotor design and features a spherical cage structure that encases the +body, allowing for omnidirectional movement while further reducing its overall +dimensions. To enable TactV to maneuver flexibly in aerial, planar, and +inclined surfaces, we established corresponding dynamic and control models for +each mode. Additionally, we leveraged TactV's tiltable center of gravity to +design energy-saving and high-mobility modes for ground operations, thereby +further enhancing its endurance. Experimental designs for both aerial and +ground tests corroborated the superiority of TactV's movement capabilities and +control strategies. + +
+
+
+
+
+ + ☆ Target Height Estimation Using a Single Acoustic Camera for Compensation + in 2D Seabed Mosaicking + + +
+ This letter proposes a novel approach for compensating target height data in +2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras +are effective sensors for sensing marine environments due to their +high-resolution imaging capabilities and robustness to darkness and turbidity. +However, the loss of elevation angle during the imaging process results in a +lack of target height information in the original acoustic camera images, +leading to a simplistic 2D representation of the seabed mosaicking. In +perceiving cluttered and unexplored marine environments, target height data is +crucial for avoiding collisions with marine robots. This study proposes a novel +approach for estimating seabed target height using a single acoustic camera and +integrates height data into 2D seabed mosaicking to compensate for the missing +3D dimension of seabed targets. Unlike classic methods that model the loss of +elevation angle to achieve seabed 3D reconstruction, this study focuses on +utilizing available acoustic cast shadow clues and simple sensor motion to +quickly estimate target height. The feasibility of our proposal is verified +through a water tank experiment and a simulation experiment. + +
+
+ comment: 8 pages, conference +
+
+
+
+
+ + ☆ Variable-Frequency Imitation Learning for Variable-Speed Motion + + +
+ Conventional methods of imitation learning for variable-speed motion have +difficulty extrapolating speeds because they rely on learning models running at +a constant sampling frequency. This study proposes variable-frequency imitation +learning (VFIL), a novel method for imitation learning with learning models +trained to run at variable sampling frequencies along with the desired speeds +of motion. The experimental results showed that the proposed method improved +the velocity-wise accuracy along both the interpolated and extrapolated +frequency labels, in addition to a 12.5 % increase in the overall success rate. + +
+
+ comment: 7 pages, 9 figures, 2 tables. Submitted to IEEE ICM 2025 +
+
+
+
+
+ + ☆ SNN-Based Online Learning of Concepts and Action Laws in an Open World + + +
+ We present the architecture of a fully autonomous, bio-inspired cognitive +agent built around a spiking neural network (SNN) implementing the agent's +semantic memory. The agent explores its universe and learns concepts of +objects/situations and of its own actions in a one-shot manner. While +object/situation concepts are unary, action concepts are triples made up of an +initial situation, a motor activity, and an outcome. They embody the agent's +knowledge of its universe's action laws. Both kinds of concepts have different +degrees of generality. To make decisions the agent queries its semantic memory +for the expected outcomes of envisaged actions and chooses the action to take +on the basis of these predictions. Our experiments show that the agent handles +new situations by appealing to previously learned general concepts and rapidly +modifies its concepts to adapt to environment changes. + +
+
+
+
+
+ + ☆ GLOVER: Generalizable Open-Vocabulary Affordance Reasoning for + Task-Oriented Grasping + + +
+ Inferring affordable (i.e., graspable) parts of arbitrary objects based on +human specifications is essential for robots advancing toward open-vocabulary +manipulation. Current grasp planners, however, are hindered by limited +vision-language comprehension and time-consuming 3D radiance modeling, +restricting real-time, open-vocabulary interactions with objects. To address +these limitations, we propose GLOVER, a unified Generalizable Open-Vocabulary +Affordance Reasoning framework, which fine-tunes the Large Language Models +(LLMs) to predict visual affordance of graspable object parts within RGB +feature space. We compile a dataset of over 10,000 images from human-object +interactions, annotated with unified visual and linguistic affordance labels, +to enable multi-modal fine-tuning. GLOVER inherits world knowledge and +common-sense reasoning from LLMs, facilitating more fine-grained object +understanding and sophisticated tool-use reasoning. To enable effective +real-world deployment, we present Affordance-Aware Grasping Estimation (AGE), a +non-parametric grasp planner that aligns the gripper pose with a superquadric +surface derived from affordance data. In evaluations across 30 real-world +scenes, GLOVER achieves success rates of 86.0% in part identification and 76.3% +in grasping, with speeds approximately 330 times faster in affordance reasoning +and 40 times faster in grasping pose estimation than the previous +state-of-the-art. + +
+
+
+
+
+ + ☆ Error-Feedback Model for Output Correction in Bilateral Control-Based + Imitation Learning + + +
+ In recent years, imitation learning using neural networks has enabled robots +to perform flexible tasks. However, since neural networks operate in a +feedforward structure, they do not possess a mechanism to compensate for output +errors. To address this limitation, we developed a feedback mechanism to +correct these errors. By employing a hierarchical structure for neural networks +comprising lower and upper layers, the lower layer was controlled to follow the +upper layer. Additionally, using a multi-layer perceptron in the lower layer, +which lacks an internal state, enhanced the error feedback. In the +character-writing task, this model demonstrated improved accuracy in writing +previously untrained characters. +Through autonomous control with error feedback, we confirmed that the lower +layer could effectively track the output of the upper layer. This study +represents a promising step toward integrating neural networks with control +theories. + +
+
+
+
+
+ + ☆ ADV2E: Bridging the Gap Between Analogue Circuit and Discrete Frames in + the Video-to-Events Simulator + + +
+ Event cameras operate fundamentally differently from traditional Active Pixel +Sensor (APS) cameras, offering significant advantages. Recent research has +developed simulators to convert video frames into events, addressing the +shortage of real event datasets. Current simulators primarily focus on the +logical behavior of event cameras. However, the fundamental analogue properties +of pixel circuits are seldom considered in simulator design. The gap between +analogue pixel circuit and discrete video frames causes the degeneration of +synthetic events, particularly in high-contrast scenes. In this paper, we +propose a novel method of generating reliable event data based on a detailed +analysis of the pixel circuitry in event cameras. We incorporate the analogue +properties of event camera pixel circuits into the simulator design: (1) +analogue filtering of signals from light intensity to events, and (2) a cutoff +frequency that is independent of video frame rate. Experimental results on two +relevant tasks, including semantic segmentation and image reconstruction, +validate the reliability of simulated event data, even in high-contrast scenes. +This demonstrates that deep neural networks exhibit strong generalization from +simulated to real event data, confirming that the synthetic events generated by +the proposed method are both realistic and well-suited for effective training. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Safe Navigation in Dynamic Environments using Density Functions + + +
+ This work uses density functions for safe navigation in dynamic environments. +The dynamic environment consists of time-varying obstacles as well as +time-varying target sets. We propose an analytical construction of time-varying +density functions to solve these navigation problems. The proposed approach +leads to a time-varying feedback controller obtained as a positive gradient of +the density function. This paper's main contribution is providing a convergence +proof using the analytically constructed density function for safe navigation +in the presence of a dynamic obstacle set and time-varying target set. The +results are the first of this kind developed for a system with integrator +dynamics and open up the possibility for application to systems with more +complex dynamics using methods based on control density function and inverse +kinematic-based control design. We present the application of the developed +approach for collision avoidance in multi-agent systems and robotic systems. +While the theoretical results are produced for first-order integrator systems, +we demonstrate how the framework can be applied for systems with non-trivial +dynamics, such as the Dubins car model and a fully actuated Euler-Lagrange system +with robotics applications. + +
+
+
+
+
+ + ☆ LiV-GS: LiDAR-Vision Integration for 3D Gaussian Splatting SLAM in + Outdoor Environments + + +
+ We present LiV-GS, a LiDAR-visual SLAM system in outdoor environments that +leverages 3D Gaussian as a differentiable spatial representation. Notably, +LiV-GS is the first method that directly aligns discrete and sparse LiDAR data +with continuous differentiable Gaussian maps in large-scale outdoor scenes, +overcoming the limitation of fixed resolution in traditional LiDAR mapping. The +system aligns point clouds with Gaussian maps using shared covariance +attributes for front-end tracking and integrates the normal orientation into +the loss function to refine the Gaussian map. To reliably and stably update +Gaussians outside the LiDAR field of view, we introduce a novel conditional +Gaussian constraint that aligns these Gaussians closely with the nearest +reliable ones. The targeted adjustment enables LiV-GS to achieve fast and +accurate mapping with novel view synthesis at a rate of 7.98 FPS. Extensive +comparative experiments demonstrate LiV-GS's superior performance in SLAM, +image rendering and mapping. The successful cross-modal radar-LiDAR +localization highlights the potential of LiV-GS for applications in cross-modal +semantic positioning and object segmentation with Gaussian maps. + +
+
+
+
+
+ + ☆ AsynEIO: Asynchronous Monocular Event-Inertial Odometry Using Gaussian + Process Regression + + +
+ Event cameras, when combined with inertial sensors, show significant +potential for motion estimation in challenging scenarios, such as high-speed +maneuvers and low-light environments. There are many methods for producing such +estimations, but most boil down to a synchronous discrete-time fusion problem. +However, the asynchronous nature of event cameras and their unique fusion +mechanism with inertial sensors remain underexplored. In this paper, we +introduce a monocular event-inertial odometry method called AsynEIO, designed +to fuse asynchronous event and inertial data within a unified Gaussian Process +(GP) regression framework. Our approach incorporates an event-driven frontend +that tracks feature trajectories directly from raw event streams at a high +temporal resolution. These tracked feature trajectories, along with various +inertial factors, are integrated into the same GP regression framework to +enable asynchronous fusion. By deriving analytical residual Jacobians and +noise models, our method constructs a factor graph that is iteratively +optimized and pruned using a sliding-window optimizer. Comparative assessments +highlight the performance of different inertial fusion strategies, suggesting +optimal choices for varying conditions. Experimental results on both public +datasets and our own event-inertial sequences indicate that AsynEIO outperforms +existing methods, especially in high-speed and low-illumination scenarios. + +
+
+ comment: Submitted to IEEE (2024-11-4) +
+
+
+
+
+ + ☆ Reinforcement Learning with Action Sequence for Data-Efficient Robot + Learning + + +
+ Training reinforcement learning (RL) agents on robotic tasks typically +requires a large number of training samples. This is because training data +often consists of noisy trajectories, whether from exploration or +human-collected demonstrations, making it difficult to learn value functions +that understand the effect of taking each action. On the other hand, recent +behavior-cloning (BC) approaches have shown that predicting a sequence of +actions enables policies to effectively approximate noisy, multi-modal +distributions of expert demonstrations. Can we use a similar idea for improving +RL on robotic tasks? In this paper, we introduce a novel RL algorithm that +learns a critic network that outputs Q-values over a sequence of actions. By +explicitly training the value functions to learn the consequence of executing a +series of current and future actions, our algorithm allows for learning useful +value functions from noisy trajectories. We study our algorithm across various +setups with sparse and dense rewards, and with or without demonstrations, +spanning mobile bi-manual manipulation, whole-body control, and tabletop +manipulation tasks from BiGym, HumanoidBench, and RLBench. We find that, by +learning the critic network with action sequences, our algorithm outperforms +various RL and BC baselines, in particular on challenging humanoid control +tasks. + +
+
+ comment: 17 Pages. Website: https://younggyo.me/cqn-as/ +
+
+
+
+
+ + ☆ HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation + in Crowded and Constrained Environments + + +
+ We study the problem of robot navigation in dense and interactive crowds with +environmental constraints such as corridors and furniture. Previous methods +fail to consider all types of interactions among agents and obstacles, leading +to unsafe and inefficient robot paths. In this article, we leverage a +graph-based representation of crowded and constrained scenarios and propose a +structured framework to learn robot navigation policies with deep reinforcement +learning. We first split the representations of different components in the +environment and propose a heterogeneous spatio-temporal (st) graph to model +distinct interactions among humans, robots, and obstacles. Based on the +heterogeneous st-graph, we propose HEIGHT, a novel navigation policy network +architecture with different components to capture heterogeneous interactions +among entities through space and time. HEIGHT utilizes attention mechanisms to +prioritize important interactions and a recurrent network to track changes in +the dynamic scene over time, encouraging the robot to avoid collisions +adaptively. Through extensive simulation and real-world experiments, we +demonstrate that HEIGHT outperforms state-of-the-art baselines in terms of +success and efficiency in challenging navigation scenarios. Furthermore, we +demonstrate that our pipeline achieves better zero-shot generalization +capability than previous works when the densities of humans and obstacles +change. More videos are available at +https://sites.google.com/view/crowdnav-height/home. + +
+
+
+
+
+ + ♻ ☆ RLtools: A Fast, Portable Deep Reinforcement Learning Library for + Continuous Control + + +
+ Deep Reinforcement Learning (RL) can yield capable agents and control +policies in several domains but is commonly plagued by prohibitively long +training times. Additionally, in the case of continuous control problems, the +applicability of learned policies on real-world embedded devices is limited due +to the lack of real-time guarantees and portability of existing libraries. To +address these challenges, we present RLtools, a dependency-free, header-only, +pure C++ library for deep supervised and reinforcement learning. Its novel +architecture allows RLtools to be used on a wide variety of platforms, from HPC +clusters over workstations and laptops to smartphones, smartwatches, and +microcontrollers. Specifically, due to the tight integration of the RL +algorithms with simulation environments, RLtools can solve popular RL problems +up to 76 times faster than other popular RL frameworks. We also benchmark the +inference on a diverse set of microcontrollers and show that in most cases our +optimized implementation is by far the fastest. Finally, RLtools enables the +first-ever demonstration of training a deep RL algorithm directly on a +microcontroller, giving rise to the field of TinyRL. The source code as well as +documentation and live demos are available through our project page at +https://rl.tools. + +
+
+ comment: Project page: https://rl.tools +
+
+
+
+
+ + ♻ ☆ Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space + Exploration by Reinforcement Learning Agent ICRA 2025 + + +
+ Grasping by a robot in unstructured environments is deemed a critical +challenge because of the requirement for effective adaptation to a wide +variation in object geometries, material properties, and other environmental +factors. In this paper, we propose a novel framework for robotic grasping based +on the idea of compressing high-dimensional target and gripper features in a +common latent space using a set of autoencoders. Our approach simplifies +grasping by using three autoencoders dedicated to the target, the gripper, and +a third one that fuses their latent representations. This allows the RL agent +to achieve higher learning rates at the initial stages of exploration of a new +environment, as well as at non-zero shot grasp attempts. The agent explores the +latent space of the third autoencoder for better quality grasp without explicit +reconstruction of objects. By implementing the PoWER algorithm into the RL +training process, updates on the agent's policy will be made through the +perturbation in the reward-weighted latent space. The successful exploration +efficiently constrains both position and pose integrity for feasible executions +of grasps. We evaluate our system on a diverse set of objects, demonstrating +the high success rate in grasping with minimum computational overhead. We found +that our approach enhances the adaptation of the RL agent by more than 35% in +simulation experiments. + +
+
+ comment: Submitted for review at IEEE ICRA 2025 +
+
+
+
+
+ + ♻ ☆ MaIL: Improving Imitation Learning with Mamba + + +
+ This work presents Mamba Imitation Learning (MaIL), a novel imitation +learning (IL) architecture that provides an alternative to state-of-the-art +(SoTA) Transformer-based policies. MaIL leverages Mamba, a state-space model +designed to selectively focus on key features of the data. While Transformers +are highly effective in data-rich environments due to their dense attention +mechanisms, they can struggle with smaller datasets, often leading to +overfitting or suboptimal representation learning. In contrast, Mamba's +architecture enhances representation learning efficiency by focusing on key +features and reducing model complexity. This approach mitigates overfitting and +enhances generalization, even when working with limited data. Extensive +evaluations on the LIBERO benchmark demonstrate that MaIL consistently +outperforms Transformers on all LIBERO tasks with limited data and matches +their performance when the full dataset is available. Additionally, MaIL's +effectiveness is validated through its superior performance in three real robot +experiments. Our code is available at https://github.com/ALRhub/MaIL. + +
+
+
+
+
+ + ♻ ☆ Child Speech Recognition in Human-Robot Interaction: Problem Solved? + + +
+ Automated Speech Recognition shows superhuman performance for adult English +speech on a range of benchmarks, but disappoints when fed children's speech. +This has long sat in the way of child-robot interaction. Recent evolutions in +data-driven speech recognition, including the availability of Transformer +architectures and unprecedented volumes of training data, might mean a +breakthrough for child speech recognition and social robot applications aimed +at children. We revisit a study on child speech recognition from 2017 and show +that indeed performance has increased, with newcomer OpenAI Whisper doing +markedly better than leading commercial cloud services. Performance improves +even more in highly structured interactions when priming models with specific +phrases. While transcription is not perfect yet, the best model recognises +60.3% of sentences correctly barring small grammatical differences, with +sub-second transcription time running on a local GPU, showing potential for +usable autonomous child-robot speech interactions. + +
+
+ comment: Submitted to 2024 International Conference on Social Robotics +
+
+
+
+
+ + ♻ ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm, showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ♻ ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose CLIPFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, CLIPFit can +improve the performance of zero-shot CLIP by 7.27% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at https://github.com/minglllli/CLIPFit. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Performance evaluation of a ROS2 based Automated Driving System + + +
+ Automated driving is currently a prominent area of scientific work. In the +future, highly automated driving and new Advanced Driver Assistance Systems +will become reality. While Advanced Driver Assistance Systems and automated +driving functions for certain domains are already commercially available, +ubiquitous automated driving in complex scenarios remains a subject of ongoing +research. In contrast to single-purpose Electronic Control Units, the software +for automated driving is often executed on high performance PCs. The Robot +Operating System 2 (ROS2) is commonly used to connect components in an +automated driving system. Due to the time critical nature of automated driving +systems, the performance of the framework is especially important. In this +paper, a thorough performance evaluation of ROS2 is conducted, both in terms of +timeliness and error rate. The results show that ROS2 is a suitable framework +for automated driving systems. + +
+
+ comment: Published and presented at VEHITS 2024, Proceedings of the 10th + International Conference on Vehicle Technology and Intelligent Transport + Systems - VEHITS; 2024 +
+
+
+
+
+ + ♻ ☆ Improving Visual Place Recognition Based Robot Navigation By Verifying + Localization Estimates + + +
+ Visual Place Recognition (VPR) systems often have imperfect performance, +affecting the `integrity' of position estimates and subsequent robot navigation +decisions. Previously, SVM classifiers have been used to monitor VPR integrity. +This research introduces a novel Multi-Layer Perceptron (MLP) integrity monitor +which demonstrates improved performance and generalizability, removing +per-environment training and reducing manual tuning requirements. We test our +proposed system in extensive real-world experiments, presenting two real-time +integrity-based VPR verification methods: a single-query rejection method for +robot navigation to a goal zone (Experiment 1); and a history-of-queries method +that takes a best, verified, match from its recent trajectory and uses an +odometer to extrapolate a current position estimate (Experiment 2). Noteworthy +results for Experiment 1 include a decrease in aggregate mean along-track goal +error from ~9.8m to ~3.1m, and an increase in the aggregate rate of successful +mission completion from ~41% to ~55%. Experiment 2 showed a decrease in +aggregate mean along-track localization error from ~2.0m to ~0.5m, and an +increase in the aggregate localization precision from ~97% to ~99%. Overall, +our results demonstrate the practical usefulness of a VPR integrity monitor in +real-world robotics to improve VPR localization and consequent navigation +performance. + +
+
+ comment: Author Accepted Preprint for Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ Robust-Locomotion-by-Logic: Perturbation-Resilient Bipedal Locomotion + via Signal Temporal Logic Guided Model Predictive Control + + +
+ This study introduces a robust planning framework that utilizes a model +predictive control (MPC) approach, enhanced by incorporating signal temporal +logic (STL) specifications. This marks the first-ever study to apply STL-guided +trajectory optimization for bipedal locomotion, specifically designed to handle +both translational and orientational perturbations. Existing recovery +strategies often struggle with reasoning complex task logic and evaluating +locomotion robustness systematically, making them susceptible to failures +caused by inappropriate recovery strategies or lack of robustness. To address +these issues, we design an analytical stability metric for bipedal locomotion +and quantify this metric using STL specifications, which guide the generation +of recovery trajectories to achieve maximum robustness degree. To enable safe +and computational-efficient crossed-leg maneuver, we design data-driven +self-leg-collision constraints that are $1000$ times faster than the +traditional inverse-kinematics-based approach. Our framework outperforms a +state-of-the-art locomotion controller, a standard MPC without STL, and a +linear-temporal-logic-based planner in a high-fidelity dynamic simulation, +especially in scenarios involving crossed-leg maneuvers. Additionally, the +Cassie bipedal robot achieves robust performance under horizontal and +orientational perturbations such as those observed in ship motions. These +environments are validated in simulations and deployed on hardware. +Furthermore, our proposed method demonstrates versatility on stepping stones +and terrain-agnostic features on inclined terrains. + +
+
+
+
+
+ + ♻ ☆ SAFE-GIL: SAFEty Guided Imitation Learning for Robotic Systems + + +
+ Behavior cloning (BC) is a widely-used approach in imitation learning, where +a robot learns a control policy by observing an expert supervisor. However, the +learned policy can make errors and might lead to safety violations, which +limits their utility in safety-critical robotics applications. While prior +works have tried improving a BC policy via additional real or synthetic action +labels, adversarial training, or runtime filtering, none of them explicitly +focus on reducing the BC policy's safety violations during training time. We +propose SAFE-GIL, a design-time method to learn safety-aware behavior cloning +policies. SAFE-GIL deliberately injects adversarial disturbance in the system +during data collection to guide the expert towards safety-critical states. This +disturbance injection simulates potential policy errors that the system might +encounter during the test time. By ensuring that training more closely +replicates expert behavior in safety-critical states, our approach results in +safer policies despite policy errors during the test time. We further develop a +reachability-based method to compute this adversarial disturbance. We compare +SAFE-GIL with various behavior cloning techniques and online safety-filtering +methods in three domains: autonomous ground navigation, aircraft taxiing, and +aerial navigation on a quadrotor testbed. Our method demonstrates a significant +reduction in safety failures, particularly in low data regimes where the +likelihood of learning errors, and therefore safety violations, is higher. See +our website here: https://y-u-c.github.io/safegil/ + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 134 + +
+
+
+ + ☆ Heuristic-Free Multi-Teacher Learning + + +
+ We introduce Teacher2Task, a novel framework for multi-teacher learning that +eliminates the need for manual aggregation heuristics. Existing multi-teacher +methods typically rely on such heuristics to combine predictions from multiple +teachers, often resulting in sub-optimal aggregated labels and the propagation +of aggregation errors. Teacher2Task addresses these limitations by introducing +teacher-specific input tokens and reformulating the training process. Instead +of relying on aggregated labels, the framework transforms the training data, +consisting of ground truth labels and annotations from N teachers, into N+1 +distinct tasks: N auxiliary tasks that predict the labeling styles of the N +individual teachers, and one primary task that focuses on the ground truth +labels. This approach, drawing upon principles from multiple learning +paradigms, demonstrates strong empirical results across a range of +architectures, modalities, and tasks. + +
+
+
+
+
+ + ☆ CATCH: Complementary Adaptive Token-level Contrastive Decoding to + Mitigate Hallucinations in LVLMs + + +
+ Large Vision-Language Model (LVLM) systems have demonstrated impressive +vision-language reasoning capabilities but suffer from pervasive and severe +hallucination issues, posing significant risks in critical domains such as +healthcare and autonomous systems. Despite previous efforts to mitigate +hallucinations, a persistent issue remains: visual defect from vision-language +misalignment, creating a bottleneck in visual processing capacity. To address +this challenge, we develop Complementary Adaptive Token-level Contrastive +Decoding to Mitigate Hallucinations in LVLMs (CATCH), based on the Information +Bottleneck theory. CATCH introduces Complementary Visual Decoupling (CVD) for +visual information separation, Non-Visual Screening (NVS) for hallucination +detection, and Adaptive Token-level Contrastive Decoding (ATCD) for +hallucination mitigation. CATCH addresses issues related to visual defects that +cause diminished fine-grained feature perception and cumulative hallucinations +in open-ended scenarios. It is applicable to various visual question-answering +tasks without requiring any specific data or prior knowledge, and generalizes +robustly to new tasks without additional training, opening new possibilities +for advancing LVLM in various challenging applications. + +
+
+
+
+
+ + ☆ Barttender: An approachable & interpretable way to compare medical + imaging and non-imaging data ML4H 2024 + + +
+ Imaging-based deep learning has transformed healthcare research, yet its +clinical adoption remains limited due to challenges in comparing imaging models +with traditional non-imaging and tabular data. To bridge this gap, we introduce +Barttender, an interpretable framework that uses deep learning for the direct +comparison of the utility of imaging versus non-imaging tabular data for tasks +like disease prediction. + Barttender converts non-imaging tabular features, such as scalar data from +electronic health records, into grayscale bars, facilitating an interpretable +and scalable deep learning based modeling of both data modalities. Our +framework allows researchers to evaluate differences in utility through +performance measures, as well as local (sample-level) and global +(population-level) explanations. We introduce a novel measure to define global +feature importances for image-based deep learning models, which we call gIoU. +Experiments on the CheXpert and MIMIC datasets with chest X-rays and scalar +data from electronic health records show that Barttender performs comparably to +traditional methods and offers enhanced explainability using deep learning +models. + +
+
+ comment: Accepted to the Proceedings Track at Machine Learning for Health + (ML4H 2024) conference, held on December 15-16, 2024 in Vancouver, Canada +
+
+
+
+
+ + ☆ AdaCM$^2$: On Understanding Extremely Long-Term Video with Adaptive + Cross-Modality Memory Reduction + + +
+ The advancements in large language models (LLMs) have propelled the +improvement of video understanding tasks by incorporating LLMs with visual +models. However, most existing LLM-based models (e.g., VideoLLaMA, VideoChat) +are constrained to processing short-duration videos. Recent approaches attempt to +understand long-term videos by extracting and compressing visual features into +a fixed memory size. Nevertheless, those methods leverage only visual modality +to merge video tokens and overlook the correlation between visual and textual +queries, leading to difficulties in effectively handling complex +question-answering tasks. To address the challenges of long videos and complex +prompts, we propose AdaCM$^2$, which, for the first time, introduces an +adaptive cross-modality memory reduction approach to video-text alignment in an +auto-regressive manner on video streams. Our extensive experiments on various +video understanding tasks, such as video captioning, video question answering, +and video classification, demonstrate that AdaCM$^2$ achieves state-of-the-art +performance across multiple datasets while significantly reducing memory usage. +Notably, it achieves a 4.5% improvement across multiple tasks in the LVU +dataset with a GPU memory consumption reduction of up to 65%. + +
+
+
+
+
+ + ☆ AI Guided Early Screening of Cervical Cancer + + +
+ In order to support the creation of reliable machine learning models for +anomaly detection, this project focuses on preprocessing, enhancing, and +organizing a medical imaging dataset. There are two classifications in the +dataset: normal and abnormal, along with extra noise fluctuations. In order to +improve the photographs' quality, undesirable artifacts, including visible +medical equipment at the edges, were eliminated using central cropping. +Adjusting the brightness and contrast was one of the additional preprocessing +processes. Normalization was then performed to normalize the data. To make +classification jobs easier, the dataset was methodically handled by combining +several image subsets into two primary categories: normal and pathological. To +provide a strong training set that adapts well to real-world situations, +sophisticated picture preprocessing techniques were used, such as contrast +enhancement and real-time augmentation (including rotations, zooms, and +brightness modifications). To guarantee efficient model evaluation, the data +was subsequently divided into training and testing subsets. In order to create +precise and effective machine learning models for medical anomaly detection, +high-quality input data is ensured via this thorough approach. Because of the +project pipeline's flexible and scalable design, it can be easily integrated +with bigger clinical decision-support systems. + +
+
+
+
+
+ + ☆ Deep Learning-Driven Heat Map Analysis for Evaluating thickness of + Wounded Skin Layers + + +
+ Understanding the appropriate skin layer thickness in wounded sites is an +important tool to move forward on wound healing practices and treatment +protocols. Methods to measure depth often are invasive and less specific. This +paper introduces a novel non-invasive method that uses deep learning +techniques to classify skin layers, which helps in the measurement of wound +depth through heatmap analysis. A set of approximately 200 labeled images of +skin allows five classes to be distinguished: scars, wounds, and healthy skin, +among others. Each image has annotated key layers, namely the stratum corneum, +the epidermis, and the dermis, in the software Roboflow. In the preliminary +stage, the Heatmap generator VGG16 was used to enhance the visibility of tissue +layers, based upon which their annotated images were used to train ResNet18 +with early stopping techniques. It ended up at a very high accuracy rate of +97.67%. To do this, the comparison of the models ResNet18, VGG16, DenseNet121, +and EfficientNet has been done where both EfficientNet and ResNet18 have +attained accuracy rates of almost 95.35%. For further hyperparameter tuning, +EfficientNet and ResNet18 were trained at six different learning rates to +determine the best model configuration. It has been noted that the accuracy has +huge variations with different learning rates. In the case of EfficientNet, the +maximum achievable accuracy was 95.35% at the rate of 0.0001. The same was true +for ResNet18, which also attained its peak value of 95.35% at the same rate. +These facts indicate that the model can be applied and utilized in real-time, +non-invasive wound assessment, which holds a great promise to improve clinical +diagnosis and treatment planning. + +
+
+
+
+
+ + ☆ IoT-Based 3D Pose Estimation and Motion Optimization for Athletes: + Application of C3D and OpenPose + + +
+ This study proposes the IoT-Enhanced Pose Optimization Network (IE-PONet) for +high-precision 3D pose estimation and motion optimization of track and field +athletes. IE-PONet integrates C3D for spatiotemporal feature extraction, +OpenPose for real-time keypoint detection, and Bayesian optimization for +hyperparameter tuning. Experimental results on NTURGB+D and FineGYM datasets +demonstrate superior performance, with AP\(^p50\) scores of 90.5 and 91.0, and +mAP scores of 74.3 and 74.0, respectively. Ablation studies confirm the +essential roles of each module in enhancing model accuracy. IE-PONet provides a +robust tool for athletic performance analysis and optimization, offering +precise technical insights for training and injury prevention. Future work will +focus on further model optimization, multimodal data integration, and +developing real-time feedback mechanisms to enhance practical applications. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Machine Learning Approaches on Crop Pattern Recognition a Comparative + Analysis + + +
+ Monitoring agricultural activities is important to ensure food security. +Remote sensing plays a significant role for large-scale continuous monitoring +of cultivation activities. Time series remote sensing data were used for the +generation of the cropping pattern. Classification algorithms are used to +classify crop patterns and map agricultural land use. Some conventional +classification methods including support vector machine (SVM) and decision +trees were applied for crop pattern recognition. However, in this paper, we are +proposing Deep Neural Network (DNN) based classification to improve the +performance of crop pattern recognition and make a comparative analysis with +two (2) other machine learning approaches including Naive Bayes and Random +Forest. + +
+
+ comment: Published in ICNTET2018: International Conference on New Trends in + Engineering & Technology Tirupathi Highway, Tiruvallur Dist Chennai, India, + September 7-8, 2018 +
+
+
+
+
+ + ☆ PoM: Efficient Image and Video Generation with the Polynomial Mixer + + +
+ Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous +to generate high quality images and videos. However, encoding an image or a +video as a sequence of patches results in costly attention patterns, as the +requirements both in terms of memory and compute grow quadratically. To +alleviate this problem, we propose a drop-in replacement for MHA called the +Polynomial Mixer (PoM) that has the benefit of encoding the entire sequence +into an explicit state. PoM has a linear complexity with respect to the number +of tokens. This explicit state also allows us to generate frames in a +sequential fashion, minimizing memory and compute requirement, while still +being able to train in parallel. We show the Polynomial Mixer is a universal +sequence-to-sequence approximator, just like regular MHA. We adapt several +Diffusion Transformers (DiT) for generating images and videos with PoM +replacing MHA, and we obtain high quality samples while using less +computational resources. The code is available at +https://github.com/davidpicard/HoMM. + +
+
+
+
+
+ + ☆ M3D: Dual-Stream Selective State Spaces and Depth-Driven Framework for + High-Fidelity Single-View 3D Reconstruction CVPR 2025 + + +
+ The precise reconstruction of 3D objects from a single RGB image in complex +scenes presents a critical challenge in virtual reality, autonomous driving, +and robotics. Existing neural implicit 3D representation methods face +significant difficulties in balancing the extraction of global and local +features, particularly in diverse and complex environments, leading to +insufficient reconstruction precision and quality. We propose M3D, a novel +single-view 3D reconstruction framework, to tackle these challenges. This +framework adopts a dual-stream feature extraction strategy based on Selective +State Spaces to effectively balance the extraction of global and local +features, thereby improving scene comprehension and representation precision. +Additionally, a parallel branch extracts depth information, effectively +integrating visual and geometric features to enhance reconstruction quality and +preserve intricate details. Experimental results indicate that the fusion of +multi-scale features with depth information via the dual-branch feature +extraction significantly boosts geometric consistency and fidelity, achieving +state-of-the-art reconstruction performance. + +
+
+ comment: 9 pages, 4 figures, submitted to CVPR 2025 for review +
+
+
+
+
+ + ☆ Instant Policy: In-Context Imitation Learning via Graph Diffusion + + +
+ Following the impressive capabilities of in-context learning with large +transformers, In-Context Imitation Learning (ICIL) is a promising opportunity +for robotics. We introduce Instant Policy, which learns new tasks instantly +(without further training) from just one or two demonstrations, achieving ICIL +through two key components. First, we introduce inductive biases through a +graph representation and model ICIL as a graph generation problem with a +learned diffusion process, enabling structured reasoning over demonstrations, +observations, and actions. Second, we show that such a model can be trained +using pseudo-demonstrations - arbitrary trajectories generated in simulation - +as a virtually infinite pool of training data. Simulated and real experiments +show that Instant Policy enables rapid learning of various everyday robot +tasks. We also show how it can serve as a foundation for cross-embodiment and +zero-shot transfer to language-defined tasks. Code and videos are available at +https://www.robot-learning.uk/instant-policy. + +
+
+ comment: Code and videos are available on our project webpage at + https://www.robot-learning.uk/instant-policy +
+
+
+
+
+ + ☆ Maps from Motion (MfM): Generating 2D Semantic Maps from Sparse + Multi-view Images + + +
+ World-wide detailed 2D maps require enormous collective efforts. +OpenStreetMap is the result of 11 million registered users manually annotating +the GPS location of over 1.75 billion entries, including distinctive landmarks +and common urban objects. At the same time, manual annotations can include +errors and are slow to update, limiting the map's accuracy. Maps from Motion +(MfM) is a step forward to automate such a time-consuming map-making procedure +by computing 2D maps of semantic objects directly from a collection of +uncalibrated multi-view images. From each image, we extract a set of object +detections, and estimate their spatial arrangement in a top-down local map +centered in the reference frame of the camera that captured the image. Aligning +these local maps is not a trivial problem, since they provide incomplete, noisy +fragments of the scene, and matching detections across them is unreliable +because of the presence of repeated patterns and the limited appearance +variability of urban objects. We address this with a novel graph-based +framework, that encodes the spatial and semantic distribution of the objects +detected in each image, and learns how to combine them to predict the objects' +poses in a global reference system, while taking into account all possible +detection matches and preserving the topology observed in each image. Despite +the complexity of the problem, our best model achieves global 2D registration +with an average accuracy within 4 meters (i.e., below GPS accuracy) even on +sparse sequences with strong viewpoint change, on which COLMAP has an 80% +failure rate. We provide extensive evaluation on synthetic and real-world data, +showing how the method obtains a solution even in scenarios where standard +optimization techniques fail. + +
+
+
+
+
+ + ☆ A Multimodal Approach Combining Structural and Cross-domain Textual + Guidance for Weakly Supervised OCT Segmentation + + +
+ Accurate segmentation of Optical Coherence Tomography (OCT) images is crucial +for diagnosing and monitoring retinal diseases. However, the labor-intensive +nature of pixel-level annotation limits the scalability of supervised learning +with large datasets. Weakly Supervised Semantic Segmentation (WSSS) provides a +promising alternative by leveraging image-level labels. In this study, we +propose a novel WSSS approach that integrates structural guidance with +text-driven strategies to generate high-quality pseudo labels, significantly +improving segmentation performance. In terms of visual information, our method +employs two processing modules that exchange raw image features and structural +features from OCT images, guiding the model to identify where lesions are +likely to occur. In terms of textual information, we utilize large-scale +pretrained models from cross-domain sources to implement label-informed textual +guidance and synthetic descriptive integration with two textual processing +modules that combine local semantic features with consistent synthetic +descriptions. By fusing these visual and textual components within a multimodal +framework, our approach enhances lesion localization accuracy. Experimental +results on three OCT datasets demonstrate that our method achieves +state-of-the-art performance, highlighting its potential to improve diagnostic +accuracy and efficiency in medical imaging. + +
+
+ comment: 21 pages, 9 figures, 8 tables +
+
+
+
+
+ + ☆ SG-LRA: Self-Generating Automatic Scoliosis Cobb Angle Measurement with + Low-Rank Approximation + + +
+ Automatic Cobb angle measurement from X-ray images is crucial for scoliosis +screening and diagnosis. However, most existing regression-based methods and +segmentation-based methods struggle with inaccurate spine representations or +mask connectivity/fragmentation issues. Besides, landmark-based methods suffer +from insufficient training data and annotations. To address these challenges, +we propose a novel framework including Self-Generation pipeline and Low-Rank +Approximation representation (SG-LRA) for automatic Cobb angle measurement. +Specifically, we propose a parameterized spine contour representation based on +LRA, which enables eigen-spine decomposition and spine contour reconstruction. +We can directly obtain spine contour with only regressed LRA coefficients, +which form a more accurate spine representation than rectangular boxes. Also, +we combine LRA coefficient regression with anchor box classification to solve +inaccurate predictions and mask connectivity issues. Moreover, we develop a +data engine with automatic annotation and automatic selection in an iterative +manner, which is trained on a private Spinal2023 dataset. With our data engine, +we generate the largest scoliosis X-ray dataset named Spinal-AI2024 largely +without privacy leaks. Extensive experiments on public AASCE2019, private +Spinal2023, and generated Spinal-AI2024 datasets demonstrate that our method +achieves state-of-the-art Cobb angle measurement performance. Our code and +Spinal-AI2024 dataset are available at https://github.com/Ernestchenchen/SG-LRA +and https://github.com/Ernestchenchen/Spinal-AI2024, respectively. + +
+
+
+
+
+ + ☆ STREAM: A Universal State-Space Model for Sparse Geometric Data + + +
+ Handling sparse and unstructured geometric data, such as point clouds or +event-based vision, is a pressing challenge in the field of machine vision. +Recently, sequence models such as Transformers and state-space models entered +the domain of geometric data. These methods require specialized preprocessing +to create a sequential view of a set of points. Furthermore, prior works +involving sequence models iterate geometric data with either uniform or learned +step sizes, implicitly relying on the model to infer the underlying geometric +structure. In this work, we propose to encode geometric structure explicitly +into the parameterization of a state-space model. State-space models are based +on linear dynamics governed by a one-dimensional variable such as time or a +spatial coordinate. We exploit this dynamic variable to inject relative +differences of coordinates into the step size of the state-space model. The +resulting geometric operation computes interactions between all pairs of N +points in O(N) steps. Our model deploys the Mamba selective state-space model +with a modified CUDA kernel to efficiently map sparse geometric data to modern +hardware. The resulting sequence model, which we call STREAM, achieves +competitive results on a range of benchmarks from point-cloud classification to +event-based vision and audio classification. STREAM demonstrates a powerful +inductive bias for sparse geometric data by improving the PointMamba baseline +when trained from scratch on the ModelNet40 and ScanObjectNN point cloud +analysis datasets. It further achieves, for the first time, 100% test accuracy +on all 11 classes of the DVS128 Gestures dataset. + +
+
+
+
+
+ + ☆ SAM Carries the Burden: A Semi-Supervised Approach Refining Pseudo + Labels for Medical Segmentation MICCAI + + +
+ Semantic segmentation is a crucial task in medical imaging. Although
+supervised learning techniques have proven to be effective in performing this
+task, they heavily depend on large amounts of annotated training data. The
+recently introduced Segment Anything Model (SAM) enables prompt-based
+segmentation and offers zero-shot generalization to unfamiliar objects. In our
+work, we leverage SAM's abstract object understanding for medical image
+segmentation to provide pseudo labels for semi-supervised learning, thereby
+mitigating the need for extensive annotated training data. Our approach refines
+initial segmentations that are derived from a limited amount of annotated data
+(comprising up to 43 cases) by extracting bounding boxes and seed points as
+prompts forwarded to SAM. Thus, it enables the generation of dense segmentation
+masks as pseudo labels for unlabelled data. The results show that training with
+our pseudo labels yields an improvement in Dice score from 74.29% to
+84.17% and from 66.63% to 74.87% for the segmentation of bones
+of the paediatric wrist and teeth in dental radiographs, respectively. As a
+result, our method outperforms intensity-based post-processing methods,
+state-of-the-art supervised learning for segmentation (nnU-Net), and the
+semi-supervised mean teacher approach. Our code is available on GitHub.
+
+
+
+ comment: Presented at MICCAI Workshop on Advancing Data Solutions in Medical + Imaging AI 2024; Code and data: + https://github.com/multimodallearning/SamCarriesTheBurden +
+
+
+
+
+ + ☆ Stochastic BIQA: Median Randomized Smoothing for Certified Blind Image + Quality Assessment + + +
+ Most modern No-Reference Image-Quality Assessment (NR-IQA) metrics are based +on neural networks vulnerable to adversarial attacks. Attacks on such metrics +lead to incorrect image/video quality predictions, which poses significant +risks, especially in public benchmarks. Developers of image processing +algorithms may unfairly increase the score of a target IQA metric without +improving the actual quality of the adversarial image. Although some empirical +defenses for IQA metrics were proposed, they do not provide theoretical +guarantees and may be vulnerable to adaptive attacks. This work focuses on +developing a provably robust no-reference IQA metric. Our method is based on +Median Smoothing (MS) combined with an additional convolution denoiser with +ranking loss to improve the SROCC and PLCC scores of the defended IQA metric. +Compared with two prior methods on three datasets, our method exhibited +superior SROCC and PLCC scores while maintaining comparable certified +guarantees. + +
+
+
+
+
+ + ☆ Topological Symmetry Enhanced Graph Convolution for Skeleton-Based + Action Recognition + + +
+ Skeleton-based action recognition has achieved remarkable performance with
+the development of graph convolutional networks (GCNs). However, most of these
+methods tend to construct complex topology learning mechanisms while neglecting
+the inherent symmetry of the human body. Additionally, the use of temporal
+convolutions with certain fixed receptive fields limits their capacity to
+effectively capture dependencies in time sequences. To address these issues, we
+(1) propose a novel Topological Symmetry Enhanced Graph Convolution (TSE-GC) to
+enable distinct topology learning across different channel partitions while
+incorporating topological symmetry awareness and (2) construct a Multi-Branch
+Deformable Temporal Convolution (MBDTC) for skeleton-based action recognition.
+The proposed TSE-GC emphasizes the inherent symmetry of the human body while
+enabling efficient learning of dynamic topologies. Meanwhile, the design of
+MBDTC introduces the concept of deformable modeling, leading to more flexible
+receptive fields and stronger modeling capacity of temporal dependencies.
+Combining TSE-GC with MBDTC, our final model, TSE-GCN, achieves competitive
+performance with fewer parameters compared with state-of-the-art methods on
+three large datasets, NTU RGB+D, NTU RGB+D 120, and NW-UCLA. On the
+cross-subject and cross-set evaluations of NTU RGB+D 120, the accuracies of our
+model reach 90.0% and 91.1%, with 1.1M parameters and 1.38 GFLOPS for one
+stream.
+
+
+
+
+
+
+ + ☆ Recall and Refine: A Simple but Effective Source-free Open-set Domain + Adaptation Framework + + +
+ Open-set Domain Adaptation (OSDA) aims to adapt a model from a labeled source +domain to an unlabeled target domain, where novel classes - also referred to as +target-private unknown classes - are present. Source-free Open-set Domain +Adaptation (SF-OSDA) methods address OSDA without accessing labeled source +data, making them particularly relevant under privacy constraints. However, +SF-OSDA presents significant challenges due to distribution shifts and the +introduction of novel classes. Existing SF-OSDA methods typically rely on +thresholding the prediction entropy of a sample to identify it as either a +known or unknown class but fail to explicitly learn discriminative features for +the target-private unknown classes. We propose Recall and Refine (RRDA), a +novel SF-OSDA framework designed to address these limitations by explicitly +learning features for target-private unknown classes. RRDA employs a two-step +process. First, we enhance the model's capacity to recognize unknown classes by +training a target classifier with an additional decision boundary, guided by +synthetic samples generated from target domain features. This enables the +classifier to effectively separate known and unknown classes. In the second +step, we adapt the entire model to the target domain, addressing both domain +shifts and improving generalization to unknown classes. Any off-the-shelf +source-free domain adaptation method (e.g., SHOT, AaD) can be seamlessly +integrated into our framework at this stage. Extensive experiments on three +benchmark datasets demonstrate that RRDA significantly outperforms existing +SF-OSDA and OSDA methods. + +
+
+
+
+
+ + ☆ S3TU-Net: Structured Convolution and Superpixel Transformer for Lung + Nodule Segmentation + + +
+ The irregular and challenging characteristics of lung adenocarcinoma nodules +in computed tomography (CT) images complicate staging diagnosis, making +accurate segmentation critical for clinicians to extract detailed lesion +information. In this study, we propose a segmentation model, S3TU-Net, which +integrates multi-dimensional spatial connectors and a superpixel-based visual +transformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid +architecture, incorporating superpixel algorithms, structured weighting, and +spatial shifting techniques to achieve superior segmentation performance. The +model leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract +multi-scale local features while mitigating overfitting. To enhance multi-scale +feature fusion, we introduce the S2-MLP Link, integrating spatial shifting and +attention mechanisms at the skip connections. Additionally, the residual-based +superpixel visual transformer (RM-SViT) effectively merges global and local +features by employing sparse correlation learning and multi-branch attention to +capture long-range dependencies, with residual connections enhancing stability +and computational efficiency. Experimental results on the LIDC-IDRI dataset +demonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%, +and 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by +4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2% +increase. In addition to comparison and ablation studies, we validated the +generalization ability of our model on the EPDB private dataset, achieving a +DSC of 86.40%. + +
+
+
+
+
+ + ☆ Contourlet Refinement Gate Framework for Thermal Spectrum Distribution + Regularized Infrared Image Super-Resolution + + +
+ Image super-resolution (SR) is a classical yet still active low-level vision +problem that aims to reconstruct high-resolution (HR) images from their +low-resolution (LR) counterparts, serving as a key technique for image +enhancement. Current approaches to address SR tasks, such as transformer-based +and diffusion-based methods, are either dedicated to extracting RGB image +features or assuming similar degradation patterns, neglecting the inherent +modal disparities between infrared and visible images. When directly applied to +infrared image SR tasks, these methods inevitably distort the infrared spectral +distribution, compromising the machine perception in downstream tasks. In this +work, we emphasize the infrared spectral distribution fidelity and propose a +Contourlet refinement gate framework to restore infrared modal-specific +features while preserving spectral distribution fidelity. Our approach captures +high-pass subbands from multi-scale and multi-directional infrared spectral +decomposition to recover infrared-degraded information through a gate +architecture. The proposed Spectral Fidelity Loss regularizes the spectral +frequency distribution during reconstruction, which ensures the preservation of +both high- and low-frequency components and maintains the fidelity of +infrared-specific features. We propose a two-stage prompt-learning optimization +to guide the model in learning infrared HR characteristics from LR degradation. +Extensive experiments demonstrate that our approach outperforms existing image +SR models in both visual and perceptual tasks while notably enhancing machine +perception in downstream tasks. Our code is available at +https://github.com/hey-it-s-me/CoRPLE. + +
+
+ comment: 13 figures, 6 tables +
+
+
+
+
+ + ☆ Rethinking Top Probability from Multi-view for Distracted Driver + Behaviour Localization + + +
+ The naturalistic driving action localization task aims to recognize and
+comprehend human behaviors and actions from video data captured during
+real-world driving scenarios. Previous studies have shown great action
+localization performance by applying a recognition model followed by
+probability-based post-processing. Nevertheless, the probabilities provided by
+the recognition model frequently contain confused information, posing a challenge
+for post-processing. In this work, we adopt an action recognition model based
+on self-supervised learning to detect distracted activities and give potential
+action probabilities. Subsequently, a constraint ensemble strategy takes
+advantage of multi-camera views to provide robust predictions. Finally, we
+introduce a conditional post-processing operation to locate distracted
+behaviours and action temporal boundaries precisely. Experimenting on test set
+A2, our method obtains the sixth position on the public leaderboard of track 3
+of the 2024 AI City Challenge.
+
+
+
+ comment: Computer Vision and Pattern Recognition Workshop 2024 +
+
+
+
+
+ + ☆ Data Pruning in Generative Diffusion Models + + +
+ Data pruning is the problem of identifying a core subset that is most
+beneficial to training and discarding the remainder. While pruning strategies
+are well studied for discriminative models like those used in classification,
+little research has gone into their application to generative models.
+Generative models aim to estimate the underlying distribution of the data, so
+presumably they should benefit from larger datasets. In this work we aim to
+shed light on the accuracy of this statement, specifically to answer the question
+of whether data pruning for generative diffusion models could have a positive
+impact. Contrary to intuition, we show that eliminating redundant or noisy data
+in large datasets is beneficial particularly when done strategically. We
+experiment with several pruning methods including recent state-of-the-art methods,
+and evaluate over CelebA-HQ and ImageNet datasets. We demonstrate that a simple
+clustering method outperforms other sophisticated and computationally demanding
+methods. We further exhibit how we can leverage clustering to balance skewed
+datasets in an unsupervised manner to allow fair sampling for underrepresented
+populations in the data distribution, which is a crucial problem in generative
+models.
+
+
+
+
+
+
+ + ☆ VMGNet: A Low Computational Complexity Robotic Grasping Network Based on + VMamba with Multi-Scale Feature Fusion + + +
+ While deep learning-based robotic grasping technology has demonstrated strong +adaptability, its computational complexity has also significantly increased, +making it unsuitable for scenarios with high real-time requirements. Therefore, +we propose a low computational complexity and high accuracy model named VMGNet +for robotic grasping. For the first time, we introduce the Visual State Space +into the robotic grasping field to achieve linear computational complexity, +thereby greatly reducing the model's computational cost. Meanwhile, to improve +the accuracy of the model, we propose an efficient and lightweight multi-scale +feature fusion module, named Fusion Bridge Module, to extract and fuse +information at different scales. We also present a new loss function +calculation method to enhance the importance differences between subtasks, +improving the model's fitting ability. Experiments show that VMGNet has only +8.7G Floating Point Operations and an inference time of 8.1 ms on our devices. +VMGNet also achieved state-of-the-art performance on the Cornell and Jacquard +public datasets. To validate VMGNet's effectiveness in practical applications, +we conducted real grasping experiments in multi-object scenarios, and VMGNet +achieved an excellent performance with a 94.4% success rate in real-world +grasping tasks. The video for the real-world robotic grasping experiments is +available at https://youtu.be/S-QHBtbmLc4. + +
+
+
+
+
+ + ☆ MAViS: Modular Autonomous Virtualization System for Two-Dimensional + Semiconductor Quantum Dot Arrays + + +
+ Arrays of gate-defined semiconductor quantum dots are among the leading +candidates for building scalable quantum processors. High-fidelity +initialization, control, and readout of spin qubit registers require exquisite +and targeted control over key Hamiltonian parameters that define the +electrostatic environment. However, due to the tight gate pitch, capacitive +crosstalk between gates hinders independent tuning of chemical potentials and +interdot couplings. While virtual gates offer a practical solution, determining +all the required cross-capacitance matrices accurately and efficiently in large +quantum dot registers is an open challenge. Here, we establish a Modular +Automated Virtualization System (MAViS) -- a general and modular framework for +autonomously constructing a complete stack of multi-layer virtual gates in real +time. Our method employs machine learning techniques to rapidly extract +features from two-dimensional charge stability diagrams. We then utilize +computer vision and regression models to self-consistently determine all +relative capacitive couplings necessary for virtualizing plunger and barrier +gates in both low- and high-tunnel-coupling regimes. Using MAViS, we +successfully demonstrate accurate virtualization of a dense two-dimensional +array comprising ten quantum dots defined in a high-quality Ge/SiGe +heterostructure. Our work offers an elegant and practical solution for the +efficient control of large-scale semiconductor quantum dot systems. + +
+
+ comment: 14 pages, 5 figures, 8 pages of supplemental material +
+
+
+
+
+ + ☆ 3D Reconstruction by Looking: Instantaneous Blind Spot Detector for + Indoor SLAM through Mixed Reality + + +
+ Indoor SLAM often suffers from issues such as scene drifting, double walls,
+and blind spots, particularly in confined spaces with objects close to the
+sensors (e.g. LiDAR and cameras) in reconstruction tasks. Real-time
+visualization of point cloud registration during data collection may help
+mitigate these issues, but a significant limitation remains in the inability to
+compare the scanned data in depth with actual physical environments. These
+challenges obstruct the quality of reconstruction products, frequently
+necessitating revisit and rescan efforts. In this regard, we developed the
+LiMRSF (LiDAR-MR-RGB Sensor Fusion) system, allowing users to perceive the
+in-situ point cloud registration by looking through a Mixed-Reality (MR)
+headset. This tailored framework visualizes point cloud meshes as holograms,
+seamlessly matching with the real-time scene on see-through glasses, and
+automatically highlights errors detected while they overlap. Such holographic
+elements are transmitted via a TCP server to an MR headset, where they are
+calibrated to align with the world coordinate, the physical location. This
+allows users to view the localized reconstruction product instantaneously,
+enabling them to quickly identify blind spots and errors, and take prompt
+action on-site. Our blind spot detector achieves an error detection precision
+with an F1 Score of 75.76% with acceptably high fidelity of monitoring through
+the LiMRSF system (highest SSIM of 0.5619, PSNR of 14.1004, and lowest MSE of
+0.0389 in the five different sections of the simplified mesh model which users
+visualize through the LiMRSF device see-through glasses). This method ensures
+the creation of detailed, high-quality datasets for 3D models, with potential
+applications in, but not limited to, Building Information Modeling (BIM).
+
+
+
+ comment: 21 pages, 13 figures, 3 tables +
+
+
+
+
+ + ☆ PR-ENDO: Physically Based Relightable Gaussian Splatting for Endoscopy + + +
+ Endoscopic procedures are crucial for colorectal cancer diagnosis, and +three-dimensional reconstruction of the environment for real-time novel-view +synthesis can significantly enhance diagnosis. We present PR-ENDO, a framework +that leverages 3D Gaussian Splatting within a physically based, relightable +model tailored for the complex acquisition conditions in endoscopy, such as +restricted camera rotations and strong view-dependent illumination. By +exploiting the connection between the camera and light source, our approach +introduces a relighting model to capture the intricate interactions between +light and tissue using physically based rendering and MLP. Existing methods +often produce artifacts and inconsistencies under these conditions, which +PR-ENDO overcomes by incorporating a specialized diffuse MLP that utilizes +light angles and normal vectors, achieving stable reconstructions even with +limited training camera rotations. We benchmarked our framework using a +publicly available dataset and a newly introduced dataset with wider camera +rotations. Our methods demonstrated superior image quality compared to baseline +approaches. + +
+
+
+
+
+ + ☆ SCIGS: 3D Gaussians Splatting from a Snapshot Compressive Image + + +
+ Snapshot Compressive Imaging (SCI) offers a possibility for capturing +information in high-speed dynamic scenes, requiring efficient reconstruction +method to recover scene information. Despite promising results, current deep +learning-based and NeRF-based reconstruction methods face challenges: 1) deep +learning-based reconstruction methods struggle to maintain 3D structural +consistency within scenes, and 2) NeRF-based reconstruction methods still face +limitations in handling dynamic scenes. To address these challenges, we propose +SCIGS, a variant of 3DGS, and develop a primitive-level transformation network +that utilizes camera pose stamps and Gaussian primitive coordinates as +embedding vectors. This approach resolves the necessity of camera pose in +vanilla 3DGS and enhances multi-view 3D structural consistency in dynamic +scenes by utilizing transformed primitives. Additionally, a high-frequency +filter is introduced to eliminate the artifacts generated during the +transformation. The proposed SCIGS is the first to reconstruct a 3D explicit +scene from a single compressed image, extending its application to dynamic 3D +scenes. Experiments on both static and dynamic scenes demonstrate that SCIGS +not only enhances SCI decoding but also outperforms current state-of-the-art +methods in reconstructing dynamic 3D scenes from a single compressed image. The +code will be made available upon publication. + +
+
+
+
+
+ + ☆ GaussianPretrain: A Simple Unified 3D Gaussian Representation for Visual + Pre-training in Autonomous Driving + + +
+ Self-supervised learning has made substantial strides in image processing, +while visual pre-training for autonomous driving is still in its infancy. +Existing methods often focus on learning geometric scene information while +neglecting texture or treating both aspects separately, hindering comprehensive +scene understanding. In this context, we are excited to introduce +GaussianPretrain, a novel pre-training paradigm that achieves a holistic +understanding of the scene by uniformly integrating geometric and texture +representations. Conceptualizing 3D Gaussian anchors as volumetric LiDAR +points, our method learns a deepened understanding of scenes to enhance +pre-training performance with detailed spatial structure and texture, achieving +that 40.6% faster than NeRF-based method UniPAD with 70% GPU memory only. We +demonstrate the effectiveness of GaussianPretrain across multiple 3D perception +tasks, showing significant performance improvements, such as a 7.05% increase +in NDS for 3D object detection, boosts mAP by 1.9% in HD map construction and +0.8% improvement on Occupancy prediction. These significant gains highlight +GaussianPretrain's theoretical innovation and strong practical potential, +promoting visual pre-training development for autonomous driving. Source code +will be available at https://github.com/Public-BOTs/GaussianPretrain + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Frequency-Aware Guidance for Blind Image Restoration via Diffusion + Models ECCV 2024 + + +
+ Blind image restoration remains a significant challenge in low-level vision
+tasks. Recently, denoising diffusion models have shown remarkable performance
+in image synthesis. Guided diffusion models, leveraging the potent generative
+priors of pre-trained models along with a differential guidance loss, have
+achieved promising results in blind image restoration. However, these models
+typically consider data consistency solely in the spatial domain, often
+resulting in distorted image content. In this paper, we propose a novel
+frequency-aware guidance loss that can be integrated into various diffusion
+models in a plug-and-play manner. Our proposed guidance loss, based on 2D
+discrete wavelet transform, simultaneously enforces content consistency in both
+the spatial and frequency domains. Experimental results demonstrate the
+effectiveness of our method in three blind restoration tasks: blind image
+deblurring, imaging through turbulence, and blind restoration for multiple
+degradations. Notably, our method achieves a significant improvement in PSNR
+score, with a remarkable enhancement of 3.72 dB in image deblurring. Moreover,
+our method exhibits superior capability in generating images with rich details
+and reduced distortion, leading to the best visual quality.
+
+
+
+ comment: 17 pages, 6 figures, has been accepted by the ECCV 2024: AIM workshop +
+
+
+
+
+ + ☆ Large Language Models for Lossless Image Compression: Next-Pixel + Prediction in Language Space is All You Need + + +
+ We have recently witnessed that "Intelligence" and "Compression" are the
+two sides of the same coin, where the large language model (LLM) with
+unprecedented intelligence is a general-purpose lossless compressor for various
+data modalities. This attribute particularly appeals to the lossless image
+compression community, given the increasing need to compress high-resolution
+images in the current streaming media era. Consequently, a natural question
+emerges: Can the compression performance of the LLM elevate lossless image
+compression to new heights? However, our findings indicate that the naive
+application of LLM-based lossless image compressors suffers from a considerable
+performance gap compared with existing state-of-the-art (SOTA) codecs on common
+benchmark datasets. In light of this, we are dedicated to fulfilling the
+unprecedented intelligence (compression) capacity of the LLM for lossless image
+compression tasks, thereby bridging the gap between theoretical and practical
+compression performance. Specifically, we propose P^2-LLM, a next-pixel
+prediction-based LLM, which integrates various elaborated insights and
+methodologies, e.g., pixel-level priors, the in-context ability of
+LLM, and a pixel-level semantic preservation strategy, to enhance the
+understanding capacity of pixel sequences for better next-pixel predictions.
+Extensive experiments on benchmark datasets demonstrate that P^2-LLM can
+beat SOTA classical and learned codecs.
+
+
+
+
+
+
+ + ☆ Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear + Kernels + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have substantially
+improved novel view synthesis, enabling high-quality reconstruction and
+real-time rendering. However, blurring artifacts, such as floating primitives
+and over-reconstruction, remain challenging. Current methods address these
+issues by refining scene structure, enhancing geometric representations,
+addressing blur in training images, improving rendering consistency, and
+optimizing density control, yet the role of kernel design remains
+underexplored. We identify the soft boundaries of Gaussian ellipsoids as one of
+the causes of these artifacts, limiting detail capture in high-frequency
+regions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which
+replaces Gaussian kernels with linear kernels to achieve sharper and more
+precise results, particularly in high-frequency regions. Through evaluations on
+three datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along
+with a 30% FPS improvement over baseline 3DGS. The implementation will be made
+publicly available upon acceptance. (*Corresponding author.)
+
+
+
+
+
+
+ + ☆ CV-Cities: Advancing Cross-View Geo-Localization in Global Cities + + +
+ Cross-view geo-localization (CVGL), which involves matching and retrieving
+satellite images to determine the geographic location of a ground image, is
+crucial in GNSS-constrained scenarios. However, this task faces significant
+challenges due to substantial viewpoint discrepancies, the complexity of
+localization scenarios, and the need for global localization. To address these
+issues, we propose a novel CVGL framework that integrates the vision
+foundational model DINOv2 with an advanced feature mixer. Our framework
+introduces the symmetric InfoNCE loss and incorporates near-neighbor sampling
+and dynamic similarity sampling strategies, significantly enhancing
+localization accuracy. Experimental results show that our framework surpasses
+existing methods across multiple public and self-built datasets. To further
+improve global-scale performance, we have developed CV-Cities, a novel dataset
+for global CVGL. CV-Cities includes 223,736 ground-satellite image pairs with
+geolocation data, spanning sixteen cities across six continents and covering a
+wide range of complex scenarios, providing a challenging benchmark for CVGL.
+The framework trained with CV-Cities demonstrates high localization accuracy in
+various test cities, highlighting its strong globalization and generalization
+capabilities. Our datasets and codes are available at
+https://github.com/GaoShuang98/CVCities.
+
+
+
+ comment: Datasets and codes are available, accepted by IEEE JSTARS +
+
+
+
+
+ + ☆ Motif Channel Opened in a White-Box: Stereo Matching via Motif + Correlation Graph + + +
+ Real-world applications of stereo matching, such as autonomous driving, place +stringent demands on both safety and accuracy. However, learning-based stereo +matching methods inherently suffer from the loss of geometric structures in +certain feature channels, creating a bottleneck in achieving precise detail +matching. Additionally, these methods lack interpretability due to the +black-box nature of deep learning. In this paper, we propose MoCha-V2, a novel +learning-based paradigm for stereo matching. MoCha-V2 introduces the Motif +Correlation Graph (MCG) to capture recurring textures, which are referred to as +"motifs" within feature channels. These motifs reconstruct geometric +structures and are learned in a more interpretable way. Subsequently, we +integrate features from multiple frequency domains through wavelet inverse +transformation. The resulting motif features are utilized to restore geometric +structures in the stereo matching process. Experimental results demonstrate the +effectiveness of MoCha-V2. MoCha-V2 achieved 1st place on the Middlebury +benchmark at the time of its release. Code is available at +https://github.com/ZYangChen/MoCha-Stereo. + +
+
+
+
+
+ + ☆ Classification of Geographical Land Structure Using Convolution Neural + Network and Transfer Learning + + +
+ Satellite imagery has dramatically revolutionized the field of geography by +giving academics, scientists, and policymakers unprecedented global access to +spatial data. Manual methods typically require significant time and effort to +detect the generic land structure in satellite images. This study can produce a +set of applications such as urban planning and development, environmental +monitoring, disaster management, etc. Therefore, the research presents a +methodology to minimize human labor, reducing the expenses and duration needed +to identify the land structure. This article developed a deep learning-based +approach to automate the process of classifying geographical land structures. +We used a satellite image dataset acquired from MLRSNet. The study compared the +performance of three architectures, namely CNN, ResNet-50, and Inception-v3. We +used three optimizers with any model: Adam, SGD, and RMSProp. We conduct the +training process for a fixed number of epochs, specifically 100 epochs, with a +batch size of 64. The ResNet-50 achieved an accuracy of 76.5% with the ADAM +optimizer, the Inception-v3 with RMSProp achieved an accuracy of 93.8%, and the +proposed approach, CNN with RMSProp optimizer, achieved the highest level of +performance and an accuracy of 94.8%. Moreover, a thorough examination of the +CNN model demonstrated its exceptional accuracy, recall, and F1 scores for all +categories, confirming its resilience and dependability in precisely detecting +various terrain formations. The results highlight the potential of deep +learning models in scene understanding, as well as their significance in +efficiently identifying and categorizing land structures from satellite +imagery. + +
+
+
+
+
+ + ☆ Automatic staff reconstruction within SIMSSA project + + +
+ The automatic analysis of scores has been a research topic of interest for +the last few decades and still is since music databases that include musical +scores are currently being created to make musical content available to the +public, including scores of ancient music. For the correct analysis of music +elements and their interpretation, the identification of staff lines is of key +importance. In this paper, a scheme to post-process the output of a previous +musical object identification system is described. This system allows the +reconstruction by means of detection, tracking and interpolation of the staff +lines of ancient scores from the digital Salzinnes Database. The scheme +developed shows a remarkable performance on the specific task it was created +for. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Breathless: An 8-hour Performance Contrasting Human and Robot + Expressiveness + + +
+ This paper describes the robot technology behind an original performance that +pairs a human dancer (Cuan) with an industrial robot arm for an eight-hour +dance that unfolds over the timespan of an American workday. To control the +robot arm, we combine a range of sinusoidal motions with varying amplitude, +frequency and offset at each joint to evoke human motions common in physical +labor such as stirring, digging, and stacking. More motions were developed +using deep learning techniques for video-based human-pose tracking and +extraction. We combine these pre-recorded motions with improvised robot motions +created live by putting the robot into teach-mode and triggering force sensing +from the robot joints onstage. All motions are combined with commercial and +original music using a custom suite of python software with AppleScript, +Keynote, and Zoom to facilitate on-stage communication with the dancer. The +resulting performance contrasts the expressivity of the human body with the +precision of robot machinery. Video, code and data are available on the project +website: https://sites.google.com/playing.studio/breathless + +
+
+ comment: 15 pages, 9 figures, accepted for ISRR (International Symposium of + Robotics Research) 2024 +
+
+
+
+
+ + ☆ DynFocus: Dynamic Cooperative Network Empowers LLMs with Video + Understanding + + +
+ The challenge in LLM-based video understanding lies in preserving visual and +semantic information in long videos while maintaining a memory-affordable token +count. However, redundancy and correspondence in videos have hindered the +performance potential of existing methods. Through statistical learning on +current datasets, we observe that redundancy occurs in both repeated and +answer-irrelevant frames, and the corresponding frames vary with different +questions. This suggests the possibility of adopting dynamic encoding to +balance detailed video information preservation with token budget reduction. To +this end, we propose a dynamic cooperative network, DynFocus, for +memory-efficient video encoding in this paper. Specifically, DynFocus comprises (i) a Dynamic Event +Prototype Estimation (DPE) module to dynamically select meaningful frames for +question answering, and (ii) a Compact Cooperative Encoding (CCE) module that +encodes meaningful frames with detailed visual appearance and the remaining +frames with sketchy perception separately. We evaluate our method on five +publicly available benchmarks, and experimental results consistently +demonstrate that our method achieves competitive performance. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ DiM: $f$-Divergence Minimization Guided Sharpness-Aware Optimization for + Semi-supervised Medical Image Segmentation + + +
+ As a technique to alleviate the pressure of data annotation, semi-supervised +learning (SSL) has attracted widespread attention. In the specific domain of +medical image segmentation, semi-supervised methods (SSMIS) have become a +research hotspot due to their ability to reduce the need for large amounts of +precisely annotated data. SSMIS focuses on enhancing the model's generalization +performance by leveraging a small number of labeled samples and a large number +of unlabeled samples. The latest sharpness-aware optimization (SAM) technique, +which optimizes the model by reducing the sharpness of the loss function, has +shown significant success in SSMIS. However, SAM and its variants may not fully +account for the distribution differences between different datasets. To address +this issue, we propose a sharpness-aware optimization method based on +$f$-divergence minimization (DiM) for semi-supervised medical image +segmentation. This method enhances the model's stability by fine-tuning the +sensitivity of model parameters and improves the model's adaptability to +different datasets through the introduction of $f$-divergence. By reducing +$f$-divergence, the DiM method not only improves the performance balance +between the source and target datasets but also prevents performance +degradation due to overfitting on the source dataset. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Target Height Estimation Using a Single Acoustic Camera for Compensation + in 2D Seabed Mosaicking + + +
+ This letter proposes a novel approach for compensating target height data in +2D seabed mosaicking for low-visibility underwater perception. Acoustic cameras +are effective sensors for sensing the marine environments due to their +high-resolution imaging capabilities and robustness to darkness and turbidity. +However, the loss of elevation angle during the imaging process results in a +lack of target height information in the original acoustic camera images, +leading to a simplistic 2D representation of the seabed mosaicking. In +perceiving cluttered and unexplored marine environments, target height data is +crucial for avoiding collisions with marine robots. This study proposes a novel +approach for estimating seabed target height using a single acoustic camera and +integrates height data into 2D seabed mosaicking to compensate for the missing +3D dimension of seabed targets. Unlike classic methods that model the loss of +elevation angle to achieve seabed 3D reconstruction, this study focuses on +utilizing available acoustic cast shadow clues and simple sensor motion to +quickly estimate target height. The feasibility of our proposal is verified +through a water tank experiment and a simulation experiment. + +
+
+ comment: 8 pages, conference +
+
+
+
+
+ + ☆ Accelerating UMAP for Large-Scale Datasets Through Spectral Coarsening + + +
+ This paper introduces an innovative approach to dramatically accelerate UMAP +using spectral data compression. The proposed method significantly reduces the +size of the dataset, preserving its essential manifold structure through an +advanced spectral compression technique. This allows UMAP to perform much +faster while maintaining the quality of its embeddings. Experiments on +real-world datasets, such as USPS, demonstrate the method's ability to achieve +substantial data reduction without compromising embedding fidelity. + +
+
+
+
+
+ + ☆ Enhancing Blind Source Separation with Dissociative Principal Component + Analysis + + +
+ Sparse principal component analysis (sPCA) enhances the interpretability of +principal components (PCs) by imposing sparsity constraints on loading vectors +(LVs). However, when used as a precursor to independent component analysis +(ICA) for blind source separation (BSS), sPCA may underperform due to its focus +on simplicity, potentially disregarding some statistical information essential +for effective ICA. To overcome this limitation, a sophisticated approach is +proposed that preserves the interpretability advantages of sPCA while +significantly enhancing its source extraction capabilities. This consists of +two tailored algorithms, dissociative PCA (DPCA1 and DPCA2), which employ +adaptive and firm thresholding alongside gradient and coordinate descent +approaches to optimize the proposed model dynamically. These algorithms +integrate left and right singular vectors from singular value decomposition +(SVD) through dissociation matrices (DMs) that replace traditional singular +values, thus capturing latent interdependencies effectively to model complex +source relationships. This leads to refined PCs and LVs that more accurately +represent the underlying data structure. The proposed approach avoids focusing +on individual eigenvectors, instead, it collaboratively combines multiple +eigenvectors to disentangle interdependencies within each SVD variate. The +superior performance of the proposed DPCA algorithms is demonstrated across +four varied imaging applications including functional magnetic resonance +imaging (fMRI) source retrieval, foreground-background separation, image +reconstruction, and image inpainting. They outperformed traditional methods +such as PCA+ICA, PPCA+ICA, SPCA+ICA, PMD, and GPower. + +
+
+ comment: 1. 13 pages with 6 figures, this work has not been published before. + 2. The paper is yet to be peer-reviewed and I am planning to submit it to + IEEE Transactions on Image Processing. 3. There is no supplementary material. + 4. There is no funding for this work as of now +
+
+
+
+
+ + ☆ CLIP Unreasonable Potential in Single-Shot Face Recognition + + +
+ Face recognition is a core task in computer vision designed to identify and +authenticate individuals by analyzing facial patterns and features. This field +intersects with artificial intelligence, image processing, and machine learning, +with applications in security, authentication, and personalization. Traditional +approaches in facial recognition focus on capturing facial features like the +eyes, nose, and mouth and matching these against a database to verify identities. +However, challenges such as high false positive rates have persisted, often due +to the similarity among individuals' facial features. Recently, Contrastive +Language-Image Pretraining (CLIP), a model developed by OpenAI, has shown +promising advancements by linking natural language processing with vision tasks, +allowing it to generalize across modalities. Using CLIP's vision-language +correspondence and single-shot finetuning, the model can achieve lower false +positive rates upon deployment without the need for mass facial feature +extraction. This integration demonstrates CLIP's potential to address +persistent issues in face recognition model performance without complicating +our training paradigm. + +
+
+
+
+
+ + ☆ C$^{2}$INet: Realizing Incremental Trajectory Prediction with + Prior-Aware Continual Causal Intervention + + +
+ Trajectory prediction for multi-agents in complex scenarios is crucial for +applications like autonomous driving. However, existing methods often overlook +environmental biases, which leads to poor generalization. Additionally, +hardware constraints limit the use of large-scale data across environments, and +continual learning settings exacerbate the challenge of catastrophic +forgetting. To address these issues, we propose the Continual Causal +Intervention (C$^{2}$INet) method for generalizable multi-agent trajectory +prediction within a continual learning framework. Using variational inference, +we align environment-related prior with posterior estimator of confounding +factors in the latent space, thereby intervening in causal correlations that +affect trajectory representation. Furthermore, we store optimal variational +priors across various scenarios using a memory queue, ensuring continuous +debiasing during incremental task training. The proposed C$^{2}$INet enhances +adaptability to diverse tasks while preserving previous task information to +prevent catastrophic forgetting. It also incorporates pruning strategies to +mitigate overfitting. Comparative evaluations on three real and synthetic +complex datasets against state-of-the-art methods demonstrate that our proposed +method consistently achieves reliable prediction performance, effectively +mitigating confounding factors unique to different scenarios. This highlights +the practical value of our method for real-world applications. + +
+
+
+
+
+ + ☆ DGTR: Distributed Gaussian Turbo-Reconstruction for Sparse-View Vast + Scenes + + +
+ Novel-view synthesis (NVS) approaches play a critical role in vast scene +reconstruction. However, these methods rely heavily on dense image inputs and +prolonged training times, making them unsuitable where computational resources +are limited. Additionally, few-shot methods often struggle with poor +reconstruction quality in vast environments. This paper presents DGTR, a novel +distributed framework for efficient Gaussian reconstruction for sparse-view +vast scenes. Our approach divides the scene into regions, processed +independently by drones with sparse image inputs. Using a feed-forward Gaussian +model, we predict high-quality Gaussian primitives, followed by a global +alignment algorithm to ensure geometric consistency. Synthetic views and depth +priors are incorporated to further enhance training, while a distillation-based +model aggregation mechanism enables efficient reconstruction. Our method +achieves high-quality large-scale scene reconstruction and novel-view synthesis +in significantly reduced training times, outperforming existing approaches in +both speed and scalability. We demonstrate the effectiveness of our framework +on vast aerial scenes, achieving high-quality results within minutes. Code will +be released on our project page: https://3d-aigc.github.com/DGTR. + +
+
+ comment: Code will be released on our project + page: https://3d-aigc.github.com/DGTR +
+
+
+
+
+ + ☆ Diffusion Product Quantization + + +
+ In this work, we explore the quantization of diffusion models in extreme +compression regimes to reduce model size while maintaining performance. We +begin by investigating classical vector quantization but find that diffusion +models are particularly susceptible to quantization error, with the codebook +size limiting generation quality. To address this, we introduce product +quantization, which offers improved reconstruction precision and larger +capacity -- crucial for preserving the generative capabilities of diffusion +models. Furthermore, we propose a method to compress the codebook by evaluating +the importance of each vector and removing redundancy, ensuring the model size +remaining within the desired range. We also introduce an end-to-end calibration +approach that adjusts assignments during the forward pass and optimizes the +codebook using the DDPM loss. By compressing the model to as low as 1 bit +(resulting in over 24 times reduction in model size), we achieve a balance +between compression and quality. We apply our compression method to the DiT +model on ImageNet and consistently outperform other quantization approaches, +demonstrating competitive generative performance. + +
+
+
+
+
+ + ☆ Physics-Guided Detector for SAR Airplanes + + +
+ The disperse structure distributions (discreteness) and variant scattering +characteristics (variability) of SAR airplane targets lead to special +challenges of object detection and recognition. The current deep learning-based +detectors encounter challenges in distinguishing fine-grained SAR airplanes +against complex backgrounds. To address it, we propose a novel physics-guided +detector (PGD) learning paradigm for SAR airplanes that comprehensively +investigates their discreteness and variability to improve the detection +performance. It is a general learning paradigm that can be extended to +different existing deep learning-based detectors with "backbone-neck-head" +architectures. The main contributions of PGD include the physics-guided +self-supervised learning, feature enhancement, and instance perception, denoted +as PGSSL, PGFE, and PGIP, respectively. PGSSL aims to construct a +self-supervised learning task based on a wide range of SAR airplane targets +that encodes the prior knowledge of various discrete structure distributions +into the embedded space. Then, PGFE enhances the multi-scale feature +representation of a detector, guided by the physics-aware information learned +from PGSSL. PGIP is constructed at the detection head to learn the refined and +dominant scattering point of each SAR airplane instance, thus alleviating the +interference from the complex background. We propose two implementations, +denoted as PGD and PGD-Lite, and apply them to various existing detectors with +different backbones and detection heads. The experiments demonstrate the +flexibility and effectiveness of the proposed PGD, which can improve existing +detectors on SAR airplane detection with fine-grained classification task (an +improvement of up to 3.1% mAP), and achieve the state-of-the-art performance +(90.7% mAP) on SAR-AIRcraft-1.0 dataset. The project is open-source at +https://github.com/XAI4SAR/PGD. + +
+
+
+
+
+ + ☆ Generative Timelines for Instructed Visual Assembly + + +
+ The objective of this work is to manipulate visual timelines (e.g. a video) +through natural language instructions, making complex timeline editing tasks +accessible to non-expert or potentially even disabled users. We call this task +Instructed visual assembly. This task is challenging as it requires (i) +identifying relevant visual content in the input timeline as well as retrieving +relevant visual content in a given input (video) collection, (ii) understanding +the input natural language instruction, and (iii) performing the desired edits +of the input visual timeline to produce an output timeline. To address these +challenges, we propose the Timeline Assembler, a generative model trained to +perform instructed visual assembly tasks. The contributions of this work are +three-fold. First, we develop a large multimodal language model, which is +designed to process visual content, compactly represent timelines and +accurately interpret timeline editing instructions. Second, we introduce a +novel method for automatically generating datasets for visual assembly tasks, +enabling efficient training of our model without the need for human-labeled +data. Third, we validate our approach by creating two novel datasets for image +and video assembly, demonstrating that the Timeline Assembler substantially +outperforms established baseline models, including the recent GPT-4o, in +accurately executing complex assembly instructions across various real-world +inspired scenarios. + +
+
+
+
+
+ + ☆ SSEditor: Controllable Mask-to-Scene Generation with Diffusion Model + + +
+ Recent advancements in 3D diffusion-based semantic scene generation have +gained attention. However, existing methods rely on unconditional generation +and require multiple resampling steps when editing scenes, which significantly +limits their controllability and flexibility. To this end, we propose SSEditor, +a controllable Semantic Scene Editor that can generate specified target +categories without multiple-step resampling. SSEditor employs a two-stage +diffusion-based framework: (1) a 3D scene autoencoder is trained to obtain +latent triplane features, and (2) a mask-conditional diffusion model is trained +for customizable 3D semantic scene generation. In the second stage, we +introduce a geometric-semantic fusion module that enhances the model's ability +to learn geometric and semantic information. This ensures that objects are +generated with correct positions, sizes, and categories. Extensive experiments +on SemanticKITTI and CarlaSC demonstrate that SSEditor outperforms previous +approaches in terms of controllability and flexibility in target generation, as +well as the quality of semantic scene generation and reconstruction. More +importantly, experiments on the unseen Occ-3D Waymo dataset show that SSEditor +is capable of generating novel urban scenes, enabling the rapid construction of +3D scenes. + +
+
+
+
+
+ + ☆ GLOVER: Generalizable Open-Vocabulary Affordance Reasoning for + Task-Oriented Grasping + + +
+ Inferring affordable (i.e., graspable) parts of arbitrary objects based on +human specifications is essential for robots advancing toward open-vocabulary +manipulation. Current grasp planners, however, are hindered by limited +vision-language comprehension and time-consuming 3D radiance modeling, +restricting real-time, open-vocabulary interactions with objects. To address +these limitations, we propose GLOVER, a unified Generalizable Open-Vocabulary +Affordance Reasoning framework, which fine-tunes the Large Language Models +(LLMs) to predict visual affordance of graspable object parts within RGB +feature space. We compile a dataset of over 10,000 images from human-object +interactions, annotated with unified visual and linguistic affordance labels, +to enable multi-modal fine-tuning. GLOVER inherits world knowledge and +common-sense reasoning from LLMs, facilitating more fine-grained object +understanding and sophisticated tool-use reasoning. To enable effective +real-world deployment, we present Affordance-Aware Grasping Estimation (AGE), a +non-parametric grasp planner that aligns the gripper pose with a superquadric +surface derived from affordance data. In evaluations across 30 real-world +scenes, GLOVER achieves success rates of 86.0% in part identification and 76.3% +in grasping, with speeds approximately 330 times faster in affordance reasoning +and 40 times faster in grasping pose estimation than the previous +state-of-the-art. + +
+
+
+
+
+ + ☆ HouseLLM: LLM-Assisted Two-Phase Text-to-Floorplan Generation + + +
+ This paper proposes a two-phase text-to-floorplan generation method, which +guides a Large Language Model (LLM) to generate an initial layout (Layout-LLM) +and refines it into the final floorplans through a conditional diffusion model. +We incorporate a Chain-of-Thought approach to prompt the LLM based on user text +specifications, enabling a more user-friendly and intuitive house layout +design. This method allows users to describe their needs in natural language, +enhancing accessibility and providing clearer geometric constraints. The final +floorplans generated by Layout-LLM through conditional diffusion refinement are +more accurate and better meet user requirements. Experimental results +demonstrate that our approach achieves state-of-the-art performance across all +metrics, validating its effectiveness in practical home design applications. We +plan to release our code for public use. + +
+
+
+
+
+ + ☆ Versatile Cataract Fundus Image Restoration Model Utilizing Unpaired + Cataract and High-quality Images + + +
+ Cataract is one of the most common blinding eye diseases and can be treated +by surgery. However, because cataract patients may also suffer from other +blinding eye diseases, ophthalmologists must diagnose them before surgery. The +cloudy lens of cataract patients forms a hazy degeneration in the fundus +images, making it challenging to observe the patient's fundus vessels, which +brings difficulties to the diagnosis process. To address this issue, this paper +establishes a new cataract image restoration method named Catintell. It +contains a cataract image synthesizing model, Catintell-Syn, and a restoration +model, Catintell-Res. Catintell-Syn uses GAN architecture with fully +unsupervised data to generate paired cataract-like images with realistic style +and texture rather than the conventional Gaussian degradation algorithm. +Meanwhile, Catintell-Res is an image restoration network that can improve the +quality of real cataract fundus images using the knowledge learned from +synthetic cataract images. Extensive experiments show that Catintell-Res +outperforms other cataract image restoration methods in PSNR with 39.03 and +SSIM with 0.9476. Furthermore, the universal restoration ability that +Catintell-Res gained from unpaired cataract images can process cataract images +from various datasets. We hope the models can help ophthalmologists identify +other blinding eye diseases of cataract patients and inspire more medical image +restoration methods in the future. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ libcll: an Extendable Python Toolkit for Complementary-Label Learning + + +
+ Complementary-label learning (CLL) is a weakly supervised learning paradigm +for multiclass classification, where only complementary labels -- indicating +classes an instance does not belong to -- are provided to the learning +algorithm. Despite CLL's increasing popularity, previous studies highlight two +main challenges: (1) inconsistent results arising from varied assumptions on +complementary label generation, and (2) high barriers to entry due to the lack +of a standardized evaluation platform across datasets and algorithms. To +address these challenges, we introduce libcll, an extensible Python +toolkit for CLL research. libcll provides a universal interface that +supports a wide range of generation assumptions, both synthetic and real-world +datasets, and key CLL algorithms. The toolkit is designed to mitigate +inconsistencies and streamline the research process, with easy installation, +comprehensive usage guides, and quickstart tutorials that facilitate efficient +adoption and implementation of CLL techniques. Extensive ablation studies +conducted with libcll demonstrate its utility in generating valuable +insights to advance future CLL research. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Acquire Precise and Comparable Fundus Image Quality Score: FTHNet and + FQS Dataset + + +
+ The retinal fundus images are utilized extensively in the diagnosis, and +their quality can directly affect the diagnosis results. However, due to the +insufficient dataset and algorithm application, current fundus image quality +assessment (FIQA) methods are not powerful enough to meet ophthalmologists' +demands. In this paper, we address the limitations of datasets and algorithms +in FIQA. First, we establish a new FIQA dataset, Fundus Quality Score (FQS), +which includes 2246 fundus images with two labels: a continuous Mean Opinion +Score varying from 0 to 100 and a three-level quality label. Then, we propose a +FIQA Transformer-based Hypernetwork (FTHNet) to solve these tasks with +regression results rather than classification results in conventional FIQA +works. The FTHNet is optimized for the FIQA tasks with extensive experiments. +Results on our FQS dataset show that the FTHNet can give quality scores for +fundus images with PLCC of 0.9423 and SRCC of 0.9488, significantly +outperforming other methods with fewer parameters and less computation +complexity. We successfully build a dataset and model addressing the problems of +current FIQA methods. Furthermore, the model deployment experiments demonstrate +its potential in automatic medical image quality control. All experiments are +carried out with 10-fold cross-validation to ensure the significance of the +results. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ KDC-MAE: Knowledge Distilled Contrastive Mask Auto-Encoder + + +
+ In this work, we attempted to extend the thought and showcase a way forward +for the Self-supervised Learning (SSL) learning paradigm by combining +contrastive learning, self-distillation (knowledge distillation) and masked +data modelling, the three major SSL frameworks, to learn a joint and +coordinated representation. The proposed technique of SSL learns by the +collaborative power of different learning objectives of SSL. Hence to jointly +learn the different SSL objectives we proposed a new SSL architecture KDC-MAE, +a complementary masking strategy to learn the modular correspondence, and a +weighted way to combine them coordinately. Experimental results conclude that +the contrastive masking correspondence along with the KD learning objective has +lent a hand to performing better learning for multiple modalities over multiple +tasks. + +
+
+
+
+
+ + ☆ Prototype Optimization with Neural ODE for Few-Shot Learning + + +
+ Few-Shot Learning (FSL) is a challenging task, which aims to recognize novel +classes with few examples. Pre-training based methods effectively tackle the +problem by pre-training a feature extractor and then performing class +prediction via a cosine classifier with mean-based prototypes. Nevertheless, +due to the data scarcity, the mean-based prototypes are usually biased. In this +paper, we attempt to diminish the prototype bias by regarding it as a prototype +optimization problem. To this end, we propose a novel prototype optimization +framework to rectify prototypes, i.e., introducing a meta-optimizer to optimize +prototypes. Although the existing meta-optimizers can also be adapted to our +framework, they all overlook a crucial gradient bias issue, i.e., the +mean-based gradient estimation is also biased on sparse data. To address this +issue, in this paper, we regard the gradient and its flow as meta-knowledge and +then propose a novel Neural Ordinary Differential Equation (ODE)-based +meta-optimizer to optimize prototypes, called MetaNODE. Although MetaNODE has +shown superior performance, it suffers from a huge computational burden. To +further improve its computation efficiency, we conduct a detailed analysis on +MetaNODE and then design an effective and efficient MetaNODE extension version +(called E2MetaNODE). It consists of two novel modules: E2GradNet and E2Solver, +which aim to estimate accurate gradient flows and solve optimal prototypes in +an effective and efficient manner, respectively. Extensive experiments show +that 1) our methods achieve superior performance over previous FSL methods and +2) our E2MetaNODE significantly improves computation efficiency meanwhile +without performance degradation. + +
+
+ comment: An extended version of metanode: prototype optimization as a neural + ode for few-shot learning. arXiv admin note: text overlap with + arXiv:2103.14341 +
+
+
+
+
+ + ☆ ADV2E: Bridging the Gap Between Analogue Circuit and Discrete Frames in + the Video-to-Events Simulator + + +
+ Event cameras operate fundamentally differently from traditional Active Pixel +Sensor (APS) cameras, offering significant advantages. Recent research has +developed simulators to convert video frames into events, addressing the +shortage of real event datasets. Current simulators primarily focus on the +logical behavior of event cameras. However, the fundamental analogue properties +of pixel circuits are seldom considered in simulator design. The gap between +analogue pixel circuit and discrete video frames causes the degeneration of +synthetic events, particularly in high-contrast scenes. In this paper, we +propose a novel method of generating reliable event data based on a detailed +analysis of the pixel circuitry in event cameras. We incorporate the analogue +properties of event camera pixel circuits into the simulator design: (1) +analogue filtering of signals from light intensity to events, and (2) a cutoff +frequency that is independent of video frame rate. Experimental results on two +relevant tasks, including semantic segmentation and image reconstruction, +validate the reliability of simulated event data, even in high-contrast scenes. +This demonstrates that deep neural networks exhibit strong generalization from +simulated to real event data, confirming that the synthetic events generated by +the proposed method are both realistic and well-suited for effective training. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Neuro-3D: Towards 3D Visual Decoding from EEG Signals + + +
+ Human's perception of the visual world is shaped by the stereo processing of +3D information. Understanding how the brain perceives and processes 3D visual +stimuli in the real world has been a longstanding endeavor in neuroscience. +Towards this goal, we introduce a new neuroscience task: decoding 3D visual +perception from EEG signals, a neuroimaging technique that enables real-time +monitoring of neural dynamics enriched with complex visual cues. To provide the +essential benchmark, we first present EEG-3D, a pioneering dataset featuring +multimodal analysis data and extensive EEG recordings from 12 subjects viewing +72 categories of 3D objects rendered in both videos and images. Furthermore, we +propose Neuro-3D, a 3D visual decoding framework based on EEG signals. This +framework adaptively integrates EEG features derived from static and dynamic +stimuli to learn complementary and robust neural representations, which are +subsequently utilized to recover both the shape and color of 3D objects through +the proposed diffusion-based colored point cloud decoder. To the best of our +knowledge, we are the first to explore EEG-based 3D visual decoding. +Experiments indicate that Neuro-3D not only reconstructs colored 3D objects +with high fidelity, but also learns effective neural representations that +enable insightful brain region analysis. The dataset and associated code will +be made publicly available. + +
+
+
+
+
+ + ☆ Invariant Shape Representation Learning For Image Classification + + +
+ Geometric shape features have been widely used as strong predictors for image +classification. Nevertheless, most existing classifiers such as deep neural +networks (DNNs) directly leverage the statistical correlations between these +shape features and target variables. However, these correlations can often be +spurious and unstable across different environments (e.g., in different age +groups, certain types of brain changes have unstable relations with +neurodegenerative disease); hence leading to biased or inaccurate predictions. +In this paper, we introduce a novel framework that for the first time develops +invariant shape representation learning (ISRL) to further strengthen the +robustness of image classifiers. In contrast to existing approaches that mainly +derive features in the image space, our model ISRL is designed to jointly +capture invariant features in latent shape spaces parameterized by deformable +transformations. To achieve this goal, we develop a new learning paradigm based +on invariant risk minimization (IRM) to learn invariant representations of +image and shape features across multiple training distributions/environments. +By embedding the features that are invariant with regard to target variables in +different environments, our model consistently offers more accurate +predictions. We validate our method by performing classification tasks on both +simulated 2D images, real 3D brain and cine cardiovascular magnetic resonance +images (MRIs). Our code is publicly available at +https://github.com/tonmoy-hossain/ISRL. + +
+
+
+
+
+ + ☆ RoSIS: Robust Framework for Text-Promptable Surgical Instrument + Segmentation Using Vision-Language Fusion + + +
+ Surgical instrument segmentation (SIS) is an essential task in +computer-assisted surgeries, with deep learning-based research improving +accuracy in complex environments. Recently, text-promptable segmentation +methods have been introduced to generate masks based on text prompts describing +target objects. However, these methods assume that the object described by a +given text prompt exists in the scene. This results in mask generation whenever +a related text prompt is provided, even if the object is absent from the image. +Existing methods handle this by using prompts only for objects known to be +present in the image, which introduces inaccessible information in a +vision-based method setting and results in unfair comparisons. For fair +comparison, we redefine existing text-promptable SIS settings to robust +conditions, called Robust text-promptable SIS (R-SIS), designed to forward +prompts of all classes and determine the existence of an object from a given +text prompt for the fair comparison. Furthermore, we propose a novel framework, +Robust Surgical Instrument Segmentation (RoSIS), which combines visual and +language features for promptable segmentation in the R-SIS setting. RoSIS +employs an encoder-decoder architecture with a Multi-Modal Fusion Block (MMFB) +and a Selective Gate Block (SGB) to achieve balanced integration of vision and +language features. Additionally, we introduce an iterative inference strategy +that refines segmentation masks in two steps: an initial pass using name-based +prompts, followed by a refinement step using location prompts. Experiments on +various datasets and settings demonstrate that RoSIS outperforms existing +vision-based and promptable methods under robust conditions. + +
+
+ comment: 10 pages, 6 figures, submitted to IEEE transactions on Medical + Imaging +
+
+
+
+
+ + ☆ CCIS-Diff: A Generative Model with Stable Diffusion Prior for Controlled + Colonoscopy Image Synthesis + + +
+ Colonoscopy is crucial for identifying adenomatous polyps and preventing +colorectal cancer. However, developing robust models for polyp detection is +challenged by the limited size and accessibility of existing colonoscopy +datasets. While previous efforts have attempted to synthesize colonoscopy +images, current methods suffer from instability and insufficient data +diversity. Moreover, these approaches lack precise control over the generation +process, resulting in images that fail to meet clinical quality standards. To +address these challenges, we propose CCIS-DIFF, a Controlled generative model +for high-quality Colonoscopy Image Synthesis based on a Diffusion architecture. +Our method offers precise control over both the spatial attributes (polyp +location and shape) and clinical characteristics of polyps that align with +clinical descriptions. Specifically, we introduce a blur mask weighting +strategy to seamlessly blend synthesized polyps with the colonic mucosa, and a +text-aware attention mechanism to guide the generated images to reflect +clinical characteristics. Notably, to achieve this, we construct a new +multi-modal colonoscopy dataset that integrates images, mask annotations, and +corresponding clinical text descriptions. Experimental results demonstrate that +our method generates high-quality, diverse colonoscopy images with fine control +over both spatial constraints and clinical consistency, offering valuable +support for downstream segmentation and diagnostic tasks. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ MTFusion: Reconstructing Any 3D Object from Single Image Using + Multi-word Textual Inversion + + +
+ Reconstructing 3D models from single-view images is a long-standing problem +in computer vision. The latest advances for single-image 3D reconstruction +extract a textual description from the input image and further utilize it to +synthesize 3D models. However, existing methods focus on capturing a single key +attribute of the image (e.g., object type, artistic style) and fail to consider +the multi-perspective information required for accurate 3D reconstruction, such +as object shape and material properties. Besides, the reliance on Neural +Radiance Fields hinders their ability to reconstruct intricate surfaces and +texture details. In this work, we propose MTFusion, which leverages both image +data and textual descriptions for high-fidelity 3D reconstruction. Our approach +consists of two stages. First, we adopt a novel multi-word textual inversion +technique to extract a detailed text description capturing the image's +characteristics. Then, we use this description and the image to generate a 3D +model with FlexiCubes. Additionally, MTFusion enhances FlexiCubes by employing +a special decoder network for Signed Distance Functions, leading to faster +training and finer surface representation. Extensive evaluations demonstrate +that our MTFusion surpasses existing image-to-3D methods on a wide range of +synthetic and real-world images. Furthermore, the ablation study proves the +effectiveness of our network designs. + +
+
+ comment: PRCV 2024 +
+
+
+
+
+ + ☆ A Survey of Medical Vision-and-Language Applications and Their + Techniques + + +
+ Medical vision-and-language models (MVLMs) have attracted substantial +interest due to their capability to offer a natural language interface for +interpreting complex medical data. Their applications are versatile and have +the potential to improve diagnostic accuracy and decision-making for individual +patients while also contributing to enhanced public health monitoring, disease +surveillance, and policy-making through more efficient analysis of large data +sets. MVLMs integrate natural language processing with medical images to enable +a more comprehensive and contextual understanding of medical images alongside +their corresponding textual information. Unlike general vision-and-language +models trained on diverse, non-specialized datasets, MVLMs are purpose-built +for the medical domain, automatically extracting and interpreting critical +information from medical images and textual reports to support clinical +decision-making. Popular clinical applications of MVLMs include automated +medical report generation, medical visual question answering, medical +multimodal segmentation, diagnosis and prognosis and medical image-text +retrieval. Here, we provide a comprehensive overview of MVLMs and the various +medical tasks to which they have been applied. We conduct a detailed analysis +of various vision-and-language model architectures, focusing on their distinct +strategies for cross-modal integration/exploitation of medical visual and +textual features. We also examine the datasets used for these tasks and compare +the performance of different models based on standardized evaluation metrics. +Furthermore, we highlight potential challenges and summarize future research +trends and directions. The full collection of papers and codes is available at: +https://github.com/YtongXie/Medical-Vision-and-Language-Tasks-and-Methodologies-A-Survey. + +
+
+
+
+
+ + ☆ Constant Rate Schedule: Constant-Rate Distributional Change for + Efficient Training and Sampling in Diffusion Models + + +
+ We propose a noise schedule that ensures a constant rate of change in the +probability distribution of diffused data throughout the diffusion process. To +obtain this noise schedule, we measure the rate of change in the probability +distribution of the forward process and use it to determine the noise schedule +before training diffusion models. The functional form of the noise schedule is +automatically determined and tailored to each dataset and type of diffusion +model. We evaluate the effectiveness of our noise schedule on unconditional and +class-conditional image generation tasks using the LSUN +(bedroom/church/cat/horse), ImageNet, and FFHQ datasets. Through extensive +experiments, we confirmed that our noise schedule broadly improves the +performance of the diffusion models regardless of the dataset, sampler, number +of function evaluations, or type of diffusion model. + +
+
+ comment: 33 pages, 9 figures +
+
+
+
+
+ + ☆ Enhancing Low Dose Computed Tomography Images Using Consistency Training + Techniques + + +
+ Diffusion models have a significant impact on a wide range of generative tasks, +especially on image inpainting and restoration. Despite improvements +aimed at decreasing the number of function evaluations (NFE), the iterative +results are still computationally expensive. Consistency models, a new +family of generative models, enable single-step sampling of high quality data +without the need for adversarial training. In this paper, we introduce the beta +noise distribution, which provides flexibility in adjusting noise levels. This +is combined with a sinusoidal curriculum that enhances the learning of the +trajectory between the noise distribution and the posterior distribution of +interest, allowing High Noise Improved Consistency Training (HN-iCT) to be +trained in a supervised fashion. Additionally, the High Noise Improved Consistency +Training with Image Condition (HN-iCT-CN) architecture is introduced, which +takes Low Dose images as a condition for extracting significant features by +Weighted Attention Gates (WAG). Our results indicate that unconditional image +generation using HN-iCT significantly outperforms basic CT and iCT training +techniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our +image-conditioned model demonstrates exceptional performance in enhancing +low-dose (LD) CT scans. + +
+
+
+
+
+ + ☆ Robust 3D Semantic Occupancy Prediction with Calibration-free Spatial + Transformation + + +
+ 3D semantic occupancy prediction, which seeks to provide accurate and +comprehensive representations of environment scenes, is important to autonomous +driving systems. For autonomous cars equipped with multi-camera and LiDAR, it +is critical to aggregate multi-sensor information into a unified 3D space for +accurate and robust predictions. Recent methods are mainly built on the +2D-to-3D transformation that relies on sensor calibration to project the 2D +image information into the 3D space. These methods, however, suffer from two +major limitations: First, they rely on accurate sensor calibration and are +sensitive to the calibration noise, which limits their application in real +complex environments. Second, the spatial transformation layers are +computationally expensive and limit their running on an autonomous vehicle. In +this work, we attempt to exploit a Robust and Efficient 3D semantic Occupancy +(REO) prediction scheme. To this end, we propose a calibration-free spatial +transformation based on vanilla attention to implicitly model the spatial +correspondence. In this way, we robustly project the 2D features to a +predefined BEV plane without using sensor calibration as input. Then, we +introduce 2D and 3D auxiliary training tasks to enhance the discrimination +power of 2D backbones on spatial, semantic, and texture features. Last, we +propose a query-based prediction scheme to efficiently generate large-scale +fine-grained occupancy predictions. By fusing point clouds that provide +complementary spatial information, our REO surpasses the existing methods by a +large margin on three benchmarks, including OpenOccupancy, Occ3D-nuScenes, and +SemanticKITTI Scene Completion. For instance, our REO achieves 19.8$\times$ +speedup compared to Co-Occ, with 1.1 improvements in geometry IoU on +OpenOccupancy. Our code will be available at https://github.com/ICEORY/REO. + +
+
+ comment: 13 pages, 11 figures, 18 tables +
+
+
+
+
+ + ☆ AsynEIO: Asynchronous Monocular Event-Inertial Odometry Using Gaussian + Process Regression + + +
+ Event cameras, when combined with inertial sensors, show significant +potential for motion estimation in challenging scenarios, such as high-speed +maneuvers and low-light environments. There are many methods for producing such +estimations, but most boil down to a synchronous discrete-time fusion problem. +However, the asynchronous nature of event cameras and their unique fusion +mechanism with inertial sensors remain underexplored. In this paper, we +introduce a monocular event-inertial odometry method called AsynEIO, designed +to fuse asynchronous event and inertial data within a unified Gaussian Process +(GP) regression framework. Our approach incorporates an event-driven frontend +that tracks feature trajectories directly from raw event streams at a high +temporal resolution. These tracked feature trajectories, along with various +inertial factors, are integrated into the same GP regression framework to +enable asynchronous fusion. By deriving analytical residual Jacobians and +noise models, our method constructs a factor graph that is iteratively +optimized and pruned using a sliding-window optimizer. Comparative assessments +highlight the performance of different inertial fusion strategies, suggesting +optimal choices for varying conditions. Experimental results on both public +datasets and our own event-inertial sequences indicate that AsynEIO outperforms +existing methods, especially in high-speed and low-illumination scenarios. + +
+
+ comment: Submitted to IEEE (2024-11-4) +
+
+
+
+
+ + ☆ Just KIDDIN: Knowledge Infusion and Distillation for Detection of + INdecent Memes + + +
+ Toxicity identification in online multimodal environments remains a +challenging task due to the complexity of contextual connections across +modalities (e.g., textual and visual). In this paper, we propose a novel +framework that integrates Knowledge Distillation (KD) from Large Visual +Language Models (LVLMs) and knowledge infusion to enhance the performance of +toxicity detection in hateful memes. Our approach extracts sub-knowledge graphs +from ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused +within a compact VLM framework. The relational context between toxic phrases in +captions and memes, as well as visual concepts in memes enhance the model's +reasoning capabilities. Experimental results from our study on two hate speech +benchmark datasets demonstrate superior performance over the state-of-the-art +baselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%, +respectively. Given the contextual complexity of the toxicity detection task, +our approach showcases the significance of learning from both explicit (i.e. +KG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a +hybrid neurosymbolic approach. This is crucial for real-world applications +where accurate and scalable recognition of toxic content is critical for +creating safer online environments. + +
+
+
+
+
+ + ☆ Sketch-guided Cage-based 3D Gaussian Splatting Deformation + + +
+ 3D Gaussian Splatting (GS) is one of the most promising novel 3D +representations that has received great interest in computer graphics and +computer vision. While various systems have introduced editing capabilities for +3D GS, such as those guided by text prompts, fine-grained control over +deformation remains an open challenge. In this work, we present a novel +sketch-guided 3D GS deformation system that allows users to intuitively modify +the geometry of a 3D GS model by drawing a silhouette sketch from a single +viewpoint. Our approach introduces a new deformation method that combines +cage-based deformations with a variant of Neural Jacobian Fields, enabling +precise, fine-grained control. Additionally, it leverages large-scale 2D +diffusion priors and ControlNet to ensure the generated deformations are +semantically plausible. Through a series of experiments, we demonstrate the +effectiveness of our method and showcase its ability to animate static 3D GS +models as one of its key applications. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Self-Supervised Learning in Deep Networks: A Pathway to Robust Few-Shot + Classification + + +
+ This study aims to optimize the few-shot image classification task and +improve the model's feature extraction and classification performance by +combining self-supervised learning with the deep network model ResNet-101. +During the training process, we first pre-train the model with self-supervision +to enable it to learn common feature expressions on a large amount of unlabeled +data; then fine-tune it on the few-shot dataset Mini-ImageNet to improve the +model's accuracy and generalization ability under limited data. The +experimental results show that compared with traditional convolutional neural +networks, ResNet-50, DenseNet, and other models, our method has achieved +excellent performance of about 95.12% in classification accuracy (ACC) and F1 +score, verifying the effectiveness of self-supervised learning in few-shot +classification. This method provides an efficient and reliable solution for the +field of few-shot image classification. + +
+
+
+
+
+ + ☆ Self-supervised denoising of visual field data improves detection of + glaucoma progression + + +
+ Perimetric measurements provide insight into a patient's peripheral vision +and day-to-day functioning and are the main outcome measure for identifying +progression of visual damage from glaucoma. However, visual field data can be +noisy, exhibiting high variance, especially with increasing damage. In this +study, we demonstrate the utility of self-supervised deep learning in denoising +visual field data from over 4000 patients to enhance its signal-to-noise ratio +and its ability to detect true glaucoma progression. We deployed both a +variational autoencoder (VAE) and a masked autoencoder to determine which +self-supervised model best smooths the visual field data while reconstructing +salient features that are less noisy and more predictive of worsening disease. +Our results indicate that including a categorical p-value at every visual field +location improves the smoothing of visual field data. Masked autoencoders led +to cleaner denoised data than previous methods, such as variational +autoencoders. A 4.7% increase in detection of progressing eyes with pointwise +linear regression (PLR) was observed. The masked and variational autoencoders' +smoothed data predicted glaucoma progression 2.3 months earlier when p-values +were included compared to when they were not. The faster prediction of time to +progression (TTP) and the higher percentage progression detected support our +hypothesis that masking out visual field elements during training while +including p-values at each location would improve the task of detection of +visual field progression. Our study has clinically relevant implications +regarding masking when training neural networks to denoise visual field data, +resulting in earlier and more accurate detection of glaucoma progression. This +denoising model can be integrated into future models for visual field analysis +to enhance detection of glaucoma progression. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Generative World Explorer + + +
+ Planning with partial observation is a central challenge in embodied AI. A +majority of prior works have tackled this challenge by developing agents that +physically explore their environment to update their beliefs about the world +state. In contrast, humans can $\textit{imagine}$ unseen parts of the world +through a mental exploration and $\textit{revise}$ their beliefs with imagined +observations. Such updated beliefs can allow them to make more informed +decisions, without necessitating the physical exploration of the world at all +times. To achieve this human-like ability, we introduce the $\textit{Generative +World Explorer (Genex)}$, an egocentric world exploration framework that allows +an agent to mentally explore a large-scale 3D world (e.g., urban scenes) and +acquire imagined observations to update its belief. This updated belief will +then help the agent to make a more informed decision at the current step. To +train $\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB. +Our experimental results demonstrate that (1) $\textit{Genex}$ can generate +high-quality and consistent observations during long-horizon exploration of a +large virtual physical world and (2) the beliefs updated with the generated +observations can inform an existing decision-making model (e.g., an LLM agent) +to make better plans. + +
+
+ comment: Website: generative-world-explorer.github.io +
+
+
+
+
+ + ♻ ☆ VidComposition: Can MLLMs Analyze Compositions in Compiled Videos? + + +
+ The advancement of Multimodal Large Language Models (MLLMs) has enabled +significant progress in multimodal understanding, expanding their capacity to +analyze video content. However, existing evaluation benchmarks for MLLMs +primarily focus on abstract video comprehension, lacking a detailed assessment +of their ability to understand video compositions, the nuanced interpretation +of how visual elements combine and interact within highly compiled video +contexts. We introduce VidComposition, a new benchmark specifically designed to +evaluate the video composition understanding capabilities of MLLMs using +carefully curated compiled videos and cinematic-level annotations. +VidComposition includes 982 videos with 1706 multiple-choice questions, +covering various compositional aspects such as camera movement, angle, shot +size, narrative structure, character actions and emotions, etc. Our +comprehensive evaluation of 33 open-source and proprietary MLLMs reveals a +significant performance gap between human and model capabilities. This +highlights the limitations of current MLLMs in understanding complex, compiled +video compositions and offers insights into areas for further improvement. The +leaderboard and evaluation code are available at +https://yunlong10.github.io/VidComposition/. + +
+
+
+
+
+ + ♻ ☆ DIG-FACE: De-biased Learning for Generalized Facial Expression Category + Discovery + + +
+ We introduce a novel task, Generalized Facial Expression Category Discovery +(G-FACE), that discovers new, unseen facial expressions while recognizing known +categories effectively. Even though there are generalized category discovery +methods for natural images, they show compromised performance on G-FACE. We +identified two biases that affect the learning: implicit bias, coming from an +underlying distributional gap between new categories in unlabeled data and +known categories in labeled data, and explicit bias, coming from shifted +preference on explicit visual facial change characteristics from known +expressions to unknown expressions. By addressing the challenges caused by both +biases, we propose a Debiased G-FACE method, namely DIG-FACE, that facilitates +the debiasing of both implicit and explicit biases. In the implicit debiasing +process of DIG-FACE, we devise a novel learning strategy that aims at +estimating and minimizing the upper bound of implicit bias. In the explicit +debiasing process, we optimize the model's ability to handle nuanced visual +facial expression data by introducing a hierarchical category-discrimination +refinement strategy: sample-level, triplet-level, and distribution-level +optimizations. Extensive experiments demonstrate that our DIG-FACE +significantly enhances recognition accuracy for both known and new categories, +setting a first-of-its-kind standard for the task. + +
+
+
+
+
+ + ♻ ☆ Feasibility of Federated Learning from Client Databases with Different + Brain Diseases and MRI Modalities WACV 2025 + + +
+ Segmentation models for brain lesions in MRI are typically developed for a +specific disease and trained on data with a predefined set of MRI modalities. +Such models cannot segment the disease using data with a different set of MRI +modalities, nor can they segment other types of diseases. Moreover, this +training paradigm prevents a model from using the advantages of learning from +heterogeneous databases that may contain scans and segmentation labels for +different brain pathologies and diverse sets of MRI modalities. Additionally, +the confidentiality of patient data often prevents central data aggregation, +necessitating a decentralized approach. Is it feasible to use Federated +Learning (FL) to train a single model on client databases that contain scans +and labels of different brain pathologies and diverse sets of MRI modalities? +We demonstrate promising results by combining appropriate, simple, and +practical modifications to the model and training strategy: Designing a model +with input channels that cover the whole set of modalities available across +clients, training with random modality drop, and exploring the effects of +feature normalization methods. Evaluation on 7 brain MRI databases with 5 +different diseases shows that this FL framework can train a single model +achieving very promising results in segmenting all disease types seen during +training. Importantly, it can segment these diseases in new databases that +contain sets of modalities different from those in training clients. These +results demonstrate, for the first time, the feasibility and effectiveness of +using FL to train a single 3D segmentation model on decentralised data with +diverse brain diseases and MRI modalities, a necessary step towards leveraging +heterogeneous real-world databases. Code: +https://github.com/FelixWag/FedUniBrain + +
+
+ comment: Accepted as a conference paper at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Smile upon the Face but Sadness in the Eyes: Emotion Recognition based + on Facial Expressions and Eye Behaviors + + +
+ Emotion Recognition (ER) is the process of identifying human emotions from +given data. Currently, the field heavily relies on facial expression +recognition (FER) because facial expressions contain rich emotional cues. +However, it is important to note that facial expressions may not always +precisely reflect genuine emotions and FER-based results may yield misleading +ER. To understand and bridge this gap between FER and ER, we introduce eye +behaviors as an important emotional cue for the creation of a new +Eye-behavior-aided Multimodal Emotion Recognition (EMER) dataset. Different +from existing multimodal ER datasets, the EMER dataset employs a stimulus +material-induced spontaneous emotion generation method to integrate +non-invasive eye behavior data, like eye movements and eye fixation maps, with +facial videos, aiming to obtain natural and accurate human emotions. Notably, +for the first time, we provide annotations for both ER and FER in the EMER, +enabling a comprehensive analysis to better illustrate the gap between both +tasks. Furthermore, we specifically design a new EMERT architecture to +concurrently enhance performance in both ER and FER by efficiently identifying +and bridging the emotion gap between the two. Specifically, our EMERT employs +modality-adversarial feature decoupling and multi-task Transformer to augment +the modeling of eye behaviors, thus providing an effective complement to facial +expressions. In the experiment, we introduce seven multimodal benchmark +protocols for a variety of comprehensive evaluations of the EMER dataset. The +results show that the EMERT outperforms other state-of-the-art multimodal +methods by a great margin, revealing the importance of modeling eye behaviors +for robust ER. To sum up, we provide a comprehensive analysis of the importance +of eye behaviors in ER, advancing the study on addressing the gap between FER +and ER for more robust ER performance. + +
+
+ comment: The paper is part of ongoing work and we request to withdraw it from + arXiv to revise it further. The paper was also submitted without agreement + from all co-authors. +
+
+
+
+
+ + ♻ ☆ A SAM-guided Two-stream Lightweight Model for Anomaly Detection + + +
+ In industrial anomaly detection, model efficiency and mobile-friendliness +become the primary concerns in real-world applications. Simultaneously, the +impressive generalization capabilities of Segment Anything (SAM) have garnered +broad academic attention, making it an ideal choice for localizing unseen +anomalies and diverse real-world patterns. In this paper, considering these two +critical factors, we propose a SAM-guided Two-stream Lightweight Model for +unsupervised anomaly detection (STLM) that not only aligns with the two +practical application requirements but also harnesses the robust generalization +capabilities of SAM. We employ two lightweight image encoders, i.e., our +two-stream lightweight module, guided by SAM's knowledge. To be specific, one +stream is trained to generate discriminative and general feature +representations in both normal and anomalous regions, while the other stream +reconstructs the same images without anomalies, which effectively enhances the +differentiation of two-stream representations when facing anomalous regions. +Furthermore, we employ a shared mask decoder and a feature aggregation module +to generate anomaly maps. Our experiments conducted on MVTec AD benchmark show +that STLM, with about 16M parameters and achieving an inference time in 20ms, +competes effectively with state-of-the-art methods in terms of performance, +98.26% on pixel-level AUC and 94.92% on PRO. We further experiment on more +difficult datasets, e.g., VisA and DAGM, to demonstrate the effectiveness and +generalizability of STLM. + +
+
+ comment: Accepted by ACM TOMM +
+
+
+
+
+ + ♻ ☆ BAISeg: Boundary Assisted Weakly Supervised Instance Segmentation + + +
+ How to extract instance-level masks without instance-level supervision is the +main challenge of weakly supervised instance segmentation (WSIS). Popular WSIS +methods estimate a displacement field (DF) via learning inter-pixel relations +and perform clustering to identify instances. However, the resulting instance +centroids are inherently unstable and vary significantly across different +clustering algorithms. In this paper, we propose Boundary-Assisted Instance +Segmentation (BAISeg), which is a novel paradigm for WSIS that realizes +instance segmentation with pixel-level annotations. BAISeg comprises an +instance-aware boundary detection (IABD) branch and a semantic segmentation +branch. The IABD branch identifies instances by predicting class-agnostic +instance boundaries rather than instance centroids, therefore, it is different +from previous DF-based approaches. In particular, we proposed the Cascade +Fusion Module (CFM) and the Deep Mutual Attention (DMA) in the IABD branch to +obtain rich contextual information and capture instance boundaries with weak +responses. During the training phase, we employed Pixel-to-Pixel Contrast to +enhance the discriminative capacity of the IABD branch. This further +strengthens the continuity and closedness of the instance boundaries. Extensive +experiments on PASCAL VOC 2012 and MS COCO demonstrate the effectiveness of our +approach, and we achieve considerable performance with only pixel-level +annotations. The code will be available at https://github.com/wsis-seg/BAISeg. + +
+
+
+
+
+ + ♻ ☆ Leveraging Computational Pathology AI for Noninvasive Optical Imaging + Analysis Without Retraining + + +
+ Noninvasive optical imaging modalities can probe patient's tissue in 3D and +over time generate gigabytes of clinically relevant data per sample. There is a +need for AI models to analyze this data and assist clinical workflow. The lack +of expert labelers and the large dataset required (>100,000 images) for model +training and tuning are the main hurdles in creating foundation models. In this +paper we introduce FoundationShift, a method to apply any AI model from +computational pathology without retraining. We show our method is more accurate +than state of the art models (SAM, MedSAM, SAM-Med2D, CellProfiler, Hover-Net, +PLIP, UNI and ChatGPT), with multiple imaging modalities (OCT and RCM). This is +achieved without the need for model retraining or fine-tuning. Applying our +method to noninvasive in vivo images could enable physicians to readily +incorporate optical imaging modalities into their clinical practice, providing +real time tissue analysis and improving patient care. + +
+
+
+
+
+ + ♻ ☆ Look Before You Decide: Prompting Active Deduction of MLLMs for + Assumptive Reasoning + + +
+ Recently, Multimodal Large Language Models (MLLMs) have achieved significant +success across multiple disciplines due to their exceptional +instruction-following capabilities and extensive world knowledge. However, +whether these MLLMs possess human-like compositional reasoning abilities +remains an open problem. To unveil their reasoning behaviors, we first curate a +\textbf{M}ultimodal \textbf{A}ssumptive \textbf{R}ea\textbf{s}oning Benchmark +(MARS-Bench) in this paper. Interestingly, we find that most prevalent MLLMs +can be easily fooled by the introduction of a presupposition into the question, +whereas such presuppositions appear naive to human reasoning. Besides, we also +propose a simple yet effective method, Active Deduction (AD), to encourage the +model to actively perform composite deduction before reaching a final decision. +Equipped with the proposed AD method, a MLLM demonstrates significant +improvements in assumptive reasoning abilities without compromising its +general-purpose question-answering performance. We also provide extensive +evaluations of both open-source and private MLLMs on MARS-Bench, along with +experimental analyses of the AD method. + +
+
+
+
+
+ + ♻ ☆ CARLA2Real: a tool for reducing the sim2real gap in CARLA simulator + + +
+ Simulators are indispensable for research in autonomous systems such as +self-driving cars, autonomous robots and drones. Despite significant progress +in various simulation aspects, such as graphical realism, an evident gap +persists between the virtual and real-world environments. Since the ultimate +goal is to deploy the autonomous systems in the real world, closing the +sim2real gap is of utmost importance. In this paper, we employ a +state-of-the-art approach to enhance the photorealism of simulated data, +aligning them with the visual characteristics of real-world datasets. Based on +this, we developed CARLA2Real, an easy-to-use, publicly available tool +(plug-in) for the widely used and open-source CARLA simulator. This tool +enhances the output of CARLA in near real-time, achieving a frame rate of 13 +FPS, translating it to the visual style and realism of real-world datasets such +as Cityscapes, KITTI, and Mapillary Vistas. By employing the proposed tool, we +generated synthetic datasets from both the simulator and the enhancement model +outputs, including their corresponding ground truth annotations for tasks +related to autonomous driving. Then, we performed a number of experiments to +evaluate the impact of the proposed approach on feature extraction and semantic +segmentation methods when trained on the enhanced synthetic data. The results +demonstrate that the sim2real gap is significant and can indeed be reduced by +the introduced approach. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Automatic Classification of General Movements in Newborns ML4H + + +
+ General movements (GMs) are spontaneous, coordinated body movements in +infants that offer valuable insights into the developing nervous system. +Assessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors +for neurodevelopmental disorders. However, GMA requires specifically trained +clinicians, who are limited in number. To scale up newborn screening, there is +a need for an algorithm that can automatically classify GMs from infant video +recordings. This data poses challenges, including variability in recording +length, device type, and setting, with each video coarsely annotated for +overall movement quality. In this work, we introduce a tool for extracting +features from these recordings and explore various machine learning techniques +for automated GM classification. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages +
+
+
+
+
+ + ♻ ☆ MMTryon: Multi-Modal Multi-Reference Control for High-Quality Fashion + Generation + + +
+ This paper introduces MMTryon, a multi-modal multi-reference VIrtual Try-ON +(VITON) framework, which can generate high-quality compositional try-on results +by taking a text instruction and multiple garment images as inputs. Our MMTryon +addresses three problems overlooked in prior literature: 1) Support of multiple +try-on items. Existing methods are commonly designed for single-item try-on +tasks (e.g., upper/lower garments, dresses). 2) Specification of dressing style. +Existing methods are unable to customize dressing styles based on instructions +(e.g., zipped/unzipped, tuck-in/tuck-out, etc.). 3) Segmentation Dependency. +They further heavily rely on category-specific segmentation models to identify +the replacement regions, with segmentation errors directly leading to +significant artifacts in the try-on results. To address the first two issues, +our MMTryon introduces a novel multi-modality and multi-reference attention +mechanism to combine the garment information from reference images and +dressing-style information from text instructions. Besides, to remove the +segmentation dependency, MMTryon uses a parsing-free garment encoder and +leverages a novel scalable data generation pipeline to convert existing VITON +datasets to a form that allows MMTryon to be trained without requiring any +explicit segmentation. Extensive experiments on high-resolution benchmarks and +in-the-wild test sets demonstrate MMTryon's superiority over existing SOTA +methods both qualitatively and quantitatively. MMTryon's impressive performance +on multi-item and style-controllable virtual try-on scenarios and its ability +to try on any outfit in a large variety of scenarios from any source image +open up a new avenue for future investigation in the fashion community. + +
+
+
+
+
+ + ♻ ☆ CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding + + +
+ Visual Grounding (VG) is a crucial topic in the field of vision and language, +which involves locating a specific region described by expressions within an +image. To reduce the reliance on manually labeled data, unsupervised visual +grounding methods have been developed to locate regions using pseudo-labels. However, +the performance of existing unsupervised methods is highly dependent on the +quality of pseudo-labels and these methods always encounter issues with limited +diversity. In order to utilize vision and language pre-trained models to +address the grounding problem, and reasonably take advantage of pseudo-labels, +we propose CLIP-VG, a novel method that can conduct self-paced curriculum +adapting of CLIP with pseudo-language labels. We propose a simple yet efficient +end-to-end network architecture to realize the transfer of CLIP to the visual +grounding. Based on the CLIP-based architecture, we further propose +single-source and multi-source curriculum adapting algorithms, which can +progressively find more reliable pseudo-labels to learn an optimal model, +thereby achieving a balance between reliability and diversity for the +pseudo-language labels. Our method outperforms the current state-of-the-art +unsupervised method by a significant margin on RefCOCO/+/g datasets in both +single-source and multi-source scenarios, with improvements ranging from +6.78$\%$ to 10.67$\%$ and 11.39$\%$ to 14.87$\%$, respectively. The results +even outperform existing weakly supervised visual grounding methods. +Furthermore, our method is also competitive in fully supervised setting. The +code and models are available at https://github.com/linhuixiao/CLIP-VG. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia (2023), Paper page: + https://ieeexplore.ieee.org/abstract/document/10269126. Code is available at + https://github.com/linhuixiao/CLIP-VG +
+
+
+
+
+ + ♻ ☆ Zero-Shot Image Denoising for High-Resolution Electron Microscopy + + +
+ High-resolution electron microscopy (HREM) imaging technique is a powerful +tool for directly visualizing a broad range of materials in real-space. +However, it faces challenges in denoising due to ultra-low signal-to-noise +ratio (SNR) and scarce data availability. In this work, we propose Noise2SR, a +zero-shot self-supervised learning (ZS-SSL) denoising framework for HREM. +Within our framework, we propose a super-resolution (SR) based self-supervised +training strategy, incorporating the Random Sub-sampler module. The Random +Sub-sampler is designed to generate approximate infinite noisy pairs from a +single noisy image, serving as an effective data augmentation in zero-shot +denoising. Noise2SR trains the network with paired noisy images of different +resolutions, which is conducted via SR strategy. The SR-based training +facilitates the network adopting more pixels for supervision, and the random +sub-sampling helps compel the network to learn continuous signals enhancing the +robustness. Meanwhile, we mitigate the uncertainty caused by random-sampling by +adopting minimum mean squared error (MMSE) estimation for the denoised results. +With the distinctive integration of training strategy and proposed designs, +Noise2SR can achieve superior denoising performance using a single noisy HREM +image. We evaluate the performance of Noise2SR in both simulated and real HREM +denoising tasks. It outperforms state-of-the-art ZS-SSL methods and achieves +comparable denoising performance with supervised methods. The success of +Noise2SR suggests its potential for improving the SNR of images in material +imaging domains. + +
+
+ comment: 12 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis + + +
+ We present a new annotated microscopic cellular image dataset to improve the +effectiveness of machine learning methods for cellular image analysis. Cell +counting is an important step in cell analysis. Typically, domain experts +manually count cells in a microscopic image. Automated cell counting can +potentially eliminate this tedious, time-consuming process. However, a good, +labeled dataset is required for training an accurate machine learning model. +Our dataset includes microscopic images of cells, and for each image, the cell +count and the location of individual cells. The data were collected as part of +an ongoing study investigating the potential of electrical stimulation to +modulate stem cell differentiation and possible applications for neural repair. +Compared to existing publicly available datasets, our dataset has more images +of cells stained with more variety of antibodies (protein components of immune +responses against invaders) typically used for cell analysis. The experimental +results on this dataset indicate that none of the five existing models under +this study are able to achieve sufficiently accurate count to replace the +manual methods. The dataset is available at +https://figshare.com/articles/dataset/Dataset/21970604. + +
+
+
+
+
+ + ♻ ☆ One-step Generative Diffusion for Realistic Extreme Image Rescaling + + +
+ Image rescaling aims to learn the optimal low-resolution (LR) image that can +be accurately reconstructed to its original high-resolution (HR) counterpart, +providing an efficient image processing and storage method for ultra-high +definition media. However, extreme downscaling factors pose significant +challenges to the upscaling process due to its highly ill-posed nature, causing +existing image rescaling methods to struggle in generating semantically correct +structures and perceptual friendly textures. In this work, we propose a novel +framework called One-Step Image Rescaling Diffusion (OSIRDiff) for extreme +image rescaling, which performs rescaling operations in the latent space of a +pre-trained autoencoder and effectively leverages powerful natural image priors +learned by a pre-trained text-to-image diffusion model. Specifically, OSIRDiff +adopts a pseudo-invertible module to establish the bidirectional mapping +between the latent features of the HR image and the target-sized LR image. +Then, the rescaled features are refined by a pre-trained diffusion model to +generate more faithful and visually pleasing details. The entire model is +end-to-end trained to enable the diffusion priors to guide the rescaling +process. Considering the spatially non-uniform reconstruction quality of the +rescaled latent features, we propose a novel time-step alignment strategy, +which can adaptively determine the generative strength of the diffusion model +based on the degree of latent reconstruction errors. Extensive experiments +demonstrate the superiority of OSIRDiff over previous methods in both +quantitative and qualitative evaluations. + +
+
+
+
+
+ + ♻ ☆ Multistep Consistency Models + + +
+ Diffusion models are relatively easy to train but require many steps to +generate samples. Consistency models are far more difficult to train, but +generate samples in a single step. + In this paper we propose Multistep Consistency Models: A unification between +Consistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that +can interpolate between a consistency model and a diffusion model: a trade-off +between sampling speed and sampling quality. Specifically, a 1-step consistency +model is a conventional consistency model whereas a $\infty$-step consistency +model is a diffusion model. + Multistep Consistency Models work really well in practice. By increasing the +sample budget from a single step to 2-8 steps, we can train models more easily +that generate higher quality samples, while retaining much of the sampling +speed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1 +FID on Imagenet128 in 8 steps with consistency distillation, using simple +losses without adversarial training. We also show that our method scales to a +text-to-image diffusion model, generating samples that are close to the quality +of the original model. + +
+
+
+
+
+ + ♻ ☆ Survey on Emotion Recognition through Posture Detection and the + possibility of its application in Virtual Reality + + +
+ A survey is presented focused on using pose estimation techniques in +emotion recognition using various technologies, such as normal cameras and depth +cameras for real-time use, and the potential use of VR and inputs including images, +videos, and 3-dimensional poses described in vector space. We discussed 19 +research papers collected from selected journals and databases highlighting +their methodology, classification algorithm, and the used datasets that relate +to emotion recognition and pose estimation. A benchmark has been made according +to their accuracy as it was the most common performance measurement metric +used. We concluded that the multimodal approaches overall achieved the best +accuracy and then we mentioned futuristic concerns that can improve the +development of this research topic. + +
+
+
+
+
+ + ♻ ☆ Pixel-Inconsistency Modeling for Image Manipulation Localization + + +
+ Digital image forensics plays a crucial role in image authentication and +manipulation localization. Despite the progress powered by deep neural +networks, existing forgery localization methodologies exhibit limitations when +deployed to unseen datasets and perturbed images (i.e., lack of generalization +and robustness to real-world applications). To circumvent these problems and +aid image integrity, this paper presents a generalized and robust manipulation +localization model through the analysis of pixel inconsistency artifacts. The +rationale is grounded on the observation that most image signal processors +(ISP) involve the demosaicing process, which introduces pixel correlations in +pristine images. Moreover, manipulating operations, including splicing, +copy-move, and inpainting, directly affect such pixel regularity. We, +therefore, first split the input image into several blocks and design masked +self-attention mechanisms to model the global pixel dependency in input images. +Simultaneously, we optimize another local pixel dependency stream to mine local +manipulation clues within input forgery images. In addition, we design novel +Learning-to-Weight Modules (LWM) to combine features from the two streams, +thereby enhancing the final forgery localization performance. To improve the +training process, we propose a novel Pixel-Inconsistency Data Augmentation +(PIDA) strategy, driving the model to focus on capturing inherent pixel-level +artifacts instead of mining semantic forgery traces. This work establishes a +comprehensive benchmark integrating 15 representative detection models across +12 datasets. Extensive experiments show that our method successfully extracts +inherent pixel-inconsistency forgery fingerprints and achieves state-of-the-art +generalization and robustness performance in image manipulation localization. + +
+
+
+
+
+ + ♻ ☆ Alleviating Hallucinations in Large Vision-Language Models through + Hallucination-Induced Optimization NeurIPS 2024 + + +
+ Although Large Visual Language Models (LVLMs) have demonstrated exceptional +abilities in understanding multimodal data, they invariably suffer from +hallucinations, leading to a disconnect between the generated text and the +corresponding images. Almost all current visual contrastive decoding methods +attempt to mitigate these hallucinations by introducing visual uncertainty +information that appropriately widens the contrastive logits gap between +hallucinatory and targeted ones. However, due to the uncontrollable nature of the +global visual uncertainty, they struggle to precisely induce the hallucinatory +tokens, which severely limits their effectiveness in mitigating hallucinations +and may even lead to the generation of undesired hallucinations. To tackle this +issue, we conduct a theoretical analysis to promote the effectiveness of +contrast decoding. Building on this insight, we introduce a novel optimization +strategy named Hallucination-Induced Optimization (HIO). This strategy seeks to +amplify the contrast between hallucinatory and targeted tokens relying on a +fine-tuned theoretical preference model (i.e., Contrary Bradley-Terry Model), +thereby facilitating efficient contrast decoding to alleviate hallucinations in +LVLMs. Extensive experimental research demonstrates that our HIO strategy can +effectively reduce hallucinations in LVLMs, outperforming state-of-the-art +methods across various benchmarks. + +
+
+ comment: Accepted by NeurIPS 2024. arXiv admin note: text overlap with + arXiv:2311.16922 by other authors +
+
+
+
+
+ + ♻ ☆ S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized + Variational Autoencoder for Video Prediction + + +
+ We address the video prediction task by putting forth a novel model that +combines (i) a novel hierarchical residual learning vector quantized +variational autoencoder (HR-VQVAE), and (ii) a novel autoregressive +spatiotemporal predictive model (AST-PM). We refer to this approach as a +sequential hierarchical residual learning vector quantized variational +autoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE +at modeling still images with a parsimonious representation, combined with the +AST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better +deal with major challenges in video prediction. These include learning +spatiotemporal information, handling high dimensional data, combating blurry +prediction, and implicit modeling of physical characteristics. Extensive +experimental results on four challenging tasks, namely KTH Human Action, +TrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably +against state-of-the-art video prediction techniques both in quantitative and +qualitative evaluations despite a much smaller model size. Finally, we boost +S-HR-VQVAE by proposing a novel training method to jointly estimate the +HR-VQVAE and AST-PM parameters. + +
+
+ comment: 12 pages, 6 figures, 5 tables. Accepted for publication on IEEE + Transactions on Multimedia on 2024-11-19 +
+
+
+
+
+ + ♻ ☆ 3D-Consistent Human Avatars with Sparse Inputs via Gaussian Splatting + and Contrastive Learning + + +
+ Existing approaches for human avatar generation--both NeRF-based and 3D +Gaussian Splatting (3DGS) based--struggle with maintaining 3D consistency and +exhibit degraded detail reconstruction, particularly when training with sparse +inputs. To address this challenge, we propose CHASE, a novel framework that +achieves dense-input-level performance using only sparse inputs through two key +innovations: cross-pose intrinsic 3D consistency supervision and 3D geometry +contrastive learning. Building upon prior skeleton-driven approaches that +combine rigid deformation with non-rigid cloth dynamics, we first establish +baseline avatars with fundamental 3D consistency. To enhance 3D consistency +under sparse inputs, we introduce a Dynamic Avatar Adjustment (DAA) module, +which refines deformed Gaussians by leveraging similar poses from the training +set. By minimizing the rendering discrepancy between adjusted Gaussians and +reference poses, DAA provides additional supervision for avatar reconstruction. +We further maintain global 3D consistency through a novel geometry-aware +contrastive learning strategy. While designed for sparse inputs, CHASE +surpasses state-of-the-art methods across both full and sparse settings on +ZJU-MoCap and H36M datasets, demonstrating that our enhanced 3D consistency +leads to superior rendering quality. + +
+
+
+
+
+ + ♻ ☆ Topology-aware Human Avatars with Semantically-guided Gaussian Splatting + + +
+ Reconstructing photo-realistic and topology-aware animatable human avatars +from monocular videos remains challenging in computer vision and graphics. +Recently, methods using 3D Gaussians to represent the human body have emerged, +offering faster optimization and real-time rendering. However, due to ignoring +the crucial role of human body semantic information which represents the +explicit topological and intrinsic structure within human body, they fail to +achieve fine-detail reconstruction of human avatars. To address this issue, we +propose SG-GS, which uses semantics-embedded 3D Gaussians, skeleton-driven +rigid deformation, and non-rigid cloth dynamics deformation to create +photo-realistic human avatars. We then design a Semantic Human-Body Annotator +(SHA) which utilizes SMPL's semantic prior for efficient body part semantic +labeling. The generated labels are used to guide the optimization of semantic +attributes of Gaussian. To capture the explicit topological structure of the +human body, we employ a 3D network that integrates both topological and +geometric associations for human avatar deformation. We further implement three +key strategies to enhance the semantic accuracy of 3D Gaussians and rendering +quality: semantic projection with 2D regularization, semantic-guided density +regularization and semantic-aware regularization with neighborhood consistency. +Extensive experiments demonstrate that SG-GS achieves state-of-the-art geometry +and appearance reconstruction performance. + +
+
+
+
+
+ + ♻ ☆ Relational Contrastive Learning and Masked Image Modeling for Scene Text + Recognition + + +
+ Context-aware methods have achieved remarkable advancements in supervised +scene text recognition by leveraging semantic priors from words. Considering +the heterogeneity of text and background in STR, we propose that such +contextual priors can be reinterpreted as the relations between textual +elements, serving as effective self-supervised labels for representation +learning. However, textual relations are restricted to the finite size of the +dataset due to lexical dependencies, which causes over-fitting problem, thus +compromising the representation quality. To address this, our work introduces a +unified framework of Relational Contrastive Learning and Masked Image Modeling +for STR (RCMSTR), which explicitly models the enriched textual relations. For +the RCL branch, we first introduce the relational rearrangement module to +cultivate new relations on the fly. Based on this, we further conduct +relational contrastive learning to model the intra- and inter-hierarchical +relations for frames, sub-words and words. On the other hand, MIM can naturally +boost the context information via masking, where we find that the block masking +strategy is more effective for STR. For the effective integration of RCL and +MIM, we also introduce a novel decoupling design aimed at mitigating the impact +of masked images on contrastive learning. Additionally, to enhance the +compatibility of MIM with CNNs, we propose the adoption of sparse convolutions +and directly sharing the weights with dense convolutions in training. The +proposed RCMSTR demonstrates superior performance in various evaluation +protocols for different STR-related downstream tasks, outperforming the +existing state-of-the-art self-supervised STR techniques. Ablation studies and +qualitative experimental results further validate the effectiveness of our +method. The code and pre-trained models will be available at +https://github.com/ThunderVVV/RCMSTR . + +
+
+ comment: arXiv admin note: text overlap with arXiv:2308.00508 +
+
+
+
+
+ + ♻ ☆ Wavelets Are All You Need for Autoregressive Image Generation + + +
+ In this paper, we take a new approach to autoregressive image generation that +is based on two main ingredients. The first is wavelet image coding, which +allows us to tokenize the visual details of an image from coarse to fine details +by ordering the information starting with the most significant bits of the most +significant wavelet coefficients. The second is a variant of a language +transformer whose architecture is re-designed and optimized for token sequences +in this 'wavelet language'. The transformer learns the significant statistical +correlations within a token sequence, which are the manifestations of +well-known correlations between the wavelet subbands at various resolutions. We +show experimental results with conditioning on the generation process. + +
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ ERUP-YOLO: Enhancing Object Detection Robustness for Adverse Weather + Condition by Unified Image-Adaptive Processing WACV 2025 + + +
+ We propose an image-adaptive object detection method for adverse weather +conditions such as fog and low-light. Our framework employs differentiable +preprocessing filters to perform image enhancement suitable for later-stage +object detections. Our framework introduces two differentiable filters: a +B\'ezier curve-based pixel-wise (BPW) filter and a kernel-based local (KBL) +filter. These filters unify the functions of classical image processing filters +and improve performance of object detection. We also propose a domain-agnostic +data augmentation strategy using the BPW filter. Our method does not require +data-specific customization of the filter combinations, parameter ranges, and +data augmentation. We evaluate our proposed approach, called Enhanced +Robustness by Unified Image Processing (ERUP)-YOLO, by applying it to the +YOLOv3 detector. Experiments on adverse weather datasets demonstrate that our +proposed filters match or exceed the expressiveness of conventional methods and +our ERUP-YOLO achieved superior performance in a wide range of adverse weather +conditions, including fog and low-light conditions. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Rethinking cluster-conditioned diffusion models for label-free image + synthesis + + +
+ Diffusion-based image generation models can enhance image quality when +conditioned on ground truth labels. Here, we conduct a comprehensive +experimental study on image-level conditioning for diffusion models using +cluster assignments. We investigate how individual clustering determinants, +such as the number of clusters and the clustering method, impact image +synthesis across three different datasets. Given the optimal number of clusters +with respect to image synthesis, we show that cluster-conditioning can achieve +state-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for +CIFAR100, along with a strong increase in training sample efficiency. We +further propose a novel empirical method to estimate an upper bound for the +optimal number of clusters. Unlike existing approaches, we find no significant +association between clustering performance and the corresponding +cluster-conditional FID scores. The code is available at +https://github.com/HHU-MMBS/cedm-official-wavc2025. + +
+
+ comment: Accepted in WACV 2025 (21 pages, 15 figures). Code is available at + https://github.com/HHU-MMBS/cedm-official-wavc2025 +
+
+
+
+
+ + ♻ ☆ SpikingNeRF: Making Bio-inspired Neural Networks See through the Real + World + + +
+ In this paper, we propose SpikingNeRF, which aligns the temporal dimension of +spiking neural networks (SNNs) with the radiance rays, to seamlessly +accommodate SNNs to the reconstruction of neural radiance fields (NeRF). Thus, +the computation turns into a spike-based, multiplication-free manner, reducing +energy consumption and making high-quality 3D rendering, for the first time, +accessible to neuromorphic hardware. In SpikingNeRF, each sampled point on the +ray is matched to a particular time step and represented in a hybrid manner +where the voxel grids are maintained as well. Based on the voxel grids, sampled +points are determined whether to be masked out for faster training and +inference. However, this masking operation also incurs irregular temporal +length, making it intractable for hardware processors, e.g., GPUs, to conduct +parallel training. To address this problem, we develop the temporal padding +strategy to tackle the masked samples to maintain regular temporal length, +i.e., regular tensors, and further propose the temporal condensing strategy to +form a denser data structure for hardware-friendly computation. Experiments on +various datasets demonstrate that our method can reduce energy consumption by +an average of 70.79\% and obtain comparable synthesis quality with the ANN +baseline. Verification on the neuromorphic hardware accelerator also shows that +SpikingNeRF can further benefit from neuromorphic computing over the ANN +baselines on energy efficiency. Codes and the appendix are in +\url{https://github.com/Ikarosy/SpikingNeRF-of-CASIA}. + +
+
+
+
+
+ + ♻ ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm, showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ♻ ☆ A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in + Adverse Weather and Lighting + + +
+ High-autonomy vehicle functions rely on machine learning (ML) algorithms to +understand the environment. Despite displaying remarkable performance in fair +weather scenarios, perception algorithms are heavily affected by adverse +weather and lighting conditions. To overcome these difficulties, ML engineers +mainly rely on comprehensive real-world datasets. However, the difficulties in +real-world data collection for critical areas of the operational design domain +(ODD) often means synthetic data is required for perception training and safety +validation. Thus, we present A-BDD, a large set of over 60,000 synthetically +augmented images based on BDD100K that are equipped with semantic segmentation +and bounding box annotations (inherited from the BDD100K dataset). The dataset +contains augmented data for rain, fog, overcast and sunglare/shadow with +varying intensity levels. We further introduce novel strategies utilizing +feature-based image quality metrics like FID and CMMD, which help identify +useful augmented and real-world data for ML training and testing. By conducting +experiments on A-BDD, we provide evidence that data augmentations can play a +pivotal role in closing performance gaps in adverse weather and lighting +conditions. + +
+
+
+
+
+ + ♻ ☆ Diffusion-Based Semantic Segmentation of Lumbar Spine MRI Scans of Lower + Back Pain Patients ML4H + + +
+ This study introduces a diffusion-based framework for robust and accurate +segmentation of vertebrae, intervertebral discs (IVDs), and spinal canal from +Magnetic Resonance Imaging~(MRI) scans of patients with low back pain (LBP), +regardless of whether the scans are T1w or T2-weighted. The results showed that +SpineSegDiff performed comparably to or outperformed non-diffusion state-of-the-art +models in the identification of degenerated IVDs. Our findings highlight the +potential of diffusion models to improve LBP diagnosis and management through +precise spine MRI analysis. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 5 pages +
+
+
+
+
+ + ♻ ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose ClipFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, ClipFit can +improve the performance of zero-shot CLIP by 7.27\% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at \url{https://github.com/minglllli/CLIPFit}. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Modifying the U-Net's Encoder-Decoder Architecture for Segmentation of + Tumors in Breast Ultrasound Images + + +
+ Segmentation is one of the most significant steps in image processing. +Segmenting an image is a technique that makes it possible to separate a digital +image into various areas based on the different characteristics of pixels in +the image. In particular, segmentation of breast ultrasound images is widely +used for cancer identification. As a result of image segmentation, it is +possible to make early diagnoses of diseases via medical images in a very +effective way. Due to various ultrasound artifacts and noises, including +speckle noise, low signal-to-noise ratio, and intensity heterogeneity, the +process of accurately segmenting medical images, such as ultrasound images, is +still a challenging task. In this paper, we present a new method to improve the +accuracy and effectiveness of breast ultrasound image segmentation. More +precisely, we propose a Neural Network (NN) based on U-Net and an +encoder-decoder architecture. By taking U-Net as the basis, both encoder and +decoder parts are developed by combining U-Net with other Deep Neural Networks +(Res-Net and MultiResUNet) and introducing a new approach and block (Co-Block), +which preserve as much as possible the low-level and the high-level features. +Designed network is evaluated using the Breast Ultrasound Images (BUSI) +Dataset. It consists of 780 images and the images are categorized into three +classes, which are normal, benign, and malignant. According to our extensive +evaluations on a public breast ultrasound dataset, designed network segments +the breast lesions more accurately than other state-of-the-art deep learning +methods. With only 8.88M parameters, our network (CResU-Net) obtained 82.88%, +77.5%, 90.3%, and 98.4% in terms of Dice similarity coefficients (DSC), +Intersection over Union (IoU), Area under curve (AUC), and global accuracy +(ACC), respectively, on BUSI dataset. + +
+
+
+
+
+ + ♻ ☆ EVT: Efficient View Transformation for Multi-Modal 3D Object Detection + + +
+ Multi-modal sensor fusion in bird's-eye-view (BEV) representation has become +the leading approach in 3D object detection. However, existing methods often +rely on depth estimators or transformer encoders for view transformation, +incurring substantial computational overhead. Furthermore, the lack of precise +geometric correspondence between 2D and 3D spaces leads to spatial and +ray-directional misalignments, restricting the effectiveness of BEV +representations. To address these challenges, we propose a novel 3D object +detector via efficient view transformation (EVT), which leverages a +well-structured BEV representation to enhance accuracy and efficiency. EVT +focuses on two main areas. First, it employs Adaptive Sampling and Adaptive +Projection (ASAP), using LiDAR guidance to generate 3D sampling points and +adaptive kernels. The generated points and kernels are then used to facilitate +the transformation of image features into BEV space and refine the BEV +features. Second, EVT includes an improved transformer-based detection +framework, which contains a group-wise query initialization method and an +enhanced query update framework. It is designed to effectively utilize the +obtained multi-modal BEV features within the transformer decoder. By leveraging +the geometric properties of object queries, this framework significantly +enhances detection performance, especially in a multi-layer transformer decoder +structure. EVT achieves state-of-the-art performance on the nuScenes test set +with real-time inference speed. + +
+
+
+
+
+ + ♻ ☆ Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to + Enhance Cell Segmentation + + +
+ Automated cell segmentation in microscopy images is essential for biomedical +research, yet conventional methods are labor-intensive and prone to error. +While deep learning-based approaches have proven effective, they often require +large annotated datasets, which are scarce due to the challenges of manual +annotation. To overcome this, we propose a novel framework for synthesizing +densely annotated 2D and 3D cell microscopy images using cascaded diffusion +models. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations +using multi-level diffusion models and NeuS, a 3D surface reconstruction +approach. Following that, a pretrained 2D Stable Diffusion model is finetuned +to generate realistic cell textures and the final outputs are combined to form +cell populations. We show that training a segmentation model with a combination +of our synthetic data and real data improves cell segmentation performance by +up to 9\% across multiple datasets. Additionally, the FID scores indicate that +the synthetic data closely resembles real data. The code for our proposed +approach will be available at +https://github.com/ruveydayilmaz0/cascaded_diffusion. + +
+
+
+
+
+ + ♻ ☆ Railway LiDAR semantic segmentation based on intelligent semi-automated + data annotation + + +
+ Automated vehicles rely on an accurate and robust perception of the +environment. Similarly to automated cars, highly automated trains require an +environmental perception. Although there is a lot of research based on either +camera or LiDAR sensors in the automotive domain, very few contributions for +this task exist yet for automated trains. Additionally, no public dataset or +described approach for a 3D LiDAR semantic segmentation in the railway +environment exists yet. Thus, we propose an approach for a point-wise 3D +semantic segmentation based on the 2DPass network architecture using scans and +images jointly. In addition, we present a semi-automated intelligent data +annotation approach, which we use to efficiently and accurately label the +required dataset recorded on a railway track in Germany. To improve performance +despite a still small number of labeled scans, we apply an active learning +approach to intelligently select scans for the training dataset. Our +contributions are threefold: We annotate rail data including camera and LiDAR +data from the railway environment, transfer label the raw LiDAR point clouds +using an image segmentation network, and train a state-of-the-art 3D LiDAR +semantic segmentation network efficiently leveraging active learning. The +trained network achieves good segmentation results with a mean IoU of 71.48% of +9 classes. + +
+
+ comment: This article has been accepted for publication in the IEEE VTC Fall + 2024 +
+
+
+
+
+ + ♻ ☆ Domain Consistency Representation Learning for Lifelong Person + Re-Identification + + +
+ Lifelong person re-identification (LReID) exhibits a contradictory +relationship between intra-domain discrimination and inter-domain gaps when +learning from continuous data. Intra-domain discrimination focuses on +individual nuances (e.g. clothing type, accessories, etc.), while inter-domain +gaps emphasize domain consistency. Achieving a trade-off between maximizing +intra-domain discrimination and minimizing inter-domain gaps is a crucial +challenge for improving LReID performance. Most existing methods aim to reduce +inter-domain gaps through knowledge distillation to maintain domain +consistency. However, they often ignore intra-domain discrimination. To address +this challenge, we propose a novel domain consistency representation learning +(DCR) model that explores global and attribute-wise representations as a bridge +to balance intra-domain discrimination and inter-domain gaps. At the +intra-domain level, we explore the complementary relationship between global +and attribute-wise representations to improve discrimination among similar +identities. Excessive learning intra-domain discrimination can lead to +catastrophic forgetting. We further develop an attribute-oriented +anti-forgetting (AF) strategy that explores attribute-wise representations to +enhance inter-domain consistency, and propose a knowledge consolidation (KC) +strategy to facilitate knowledge transfer. Extensive experiments show that our +DCR model achieves superior performance compared to state-of-the-art LReID +methods. Our code will be available soon. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ RayFormer: Improving Query-Based Multi-Camera 3D Object Detection via + Ray-Centric Strategies + + +
+ The recent advances in query-based multi-camera 3D object detection are +featured by initializing object queries in the 3D space, and then sampling +features from perspective-view images to perform multi-round query refinement. +In such a framework, query points near the same camera ray are likely to sample +similar features from very close pixels, resulting in ambiguous query features +and degraded detection accuracy. To this end, we introduce RayFormer, a +camera-ray-inspired query-based 3D object detector that aligns the +initialization and feature extraction of object queries with the optical +characteristics of cameras. Specifically, RayFormer transforms perspective-view +image features into bird's eye view (BEV) via the lift-splat-shoot method and +segments the BEV map to sectors based on the camera rays. Object queries are +uniformly and sparsely initialized along each camera ray, facilitating the +projection of different queries onto different areas in the image to extract +distinct features. Besides, we leverage the instance information of images to +supplement the uniformly initialized object queries by further involving +additional queries along the ray from 2D object detection boxes. To extract +unique object-level features that cater to distinct queries, we design a ray +sampling method that suitably organizes the distribution of feature sampling +points on both images and bird's eye view. Extensive experiments are conducted +on the nuScenes dataset to validate our proposed ray-inspired model design. The +proposed RayFormer achieves superior performance of 55.5% mAP and 63.3% NDS, +respectively. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ MTVQA: Benchmarking Multilingual Text-Centric Visual Question Answering + + +
+ Text-Centric Visual Question Answering (TEC-VQA) in its proper format not +only facilitates human-machine interaction in text-centric visual environments +but also serves as a de facto gold proxy to evaluate AI models in the domain of +text-centric scene understanding. Nonetheless, most existing TEC-VQA benchmarks +have focused on high-resource languages like English and Chinese. Despite +pioneering works to expand multilingual QA pairs in non-text-centric VQA +datasets through translation engines, the translation-based protocol encounters +a substantial "visual-textual misalignment" problem when applied to TEC-VQA. +Specifically, it prioritizes the text in question-answer pairs while +disregarding the visual text present in images. Moreover, it fails to address +complexities related to nuanced meaning, contextual distortion, language bias, +and question-type diversity. In this work, we tackle multilingual TEC-VQA by +introducing MTVQA, the first benchmark featuring high-quality human expert +annotations across 9 diverse languages, consisting of 6,778 question-answer +pairs across 2,116 images. Further, by comprehensively evaluating numerous +state-of-the-art Multimodal Large Language Models~(MLLMs), including Qwen2-VL, +GPT-4o, GPT-4V, Claude3, and Gemini, on the MTVQA benchmark, it is evident that +there is still a large room for performance improvement (Qwen2-VL scoring 30.9 +versus 79.7 for human performance), underscoring the value of MTVQA. +Additionally, we supply multilingual training data within the MTVQA dataset, +demonstrating that straightforward fine-tuning with this data can substantially +enhance multilingual TEC-VQA performance. We aspire that MTVQA will offer the +research community fresh insights and stimulate further exploration in +multilingual visual text comprehension. The project homepage is available at +https://bytedance.github.io/MTVQA/. + +
+
+
+
+
+ + ♻ ☆ Computer-Vision-Enabled Worker Video Analysis for Motion Amount + Quantification + + +
+ The performance of physical workers is significantly influenced by the extent +of their motions. However, monitoring and assessing these motions remains a +challenge. Recent advancements have enabled in-situ video analysis for +real-time observation of worker behaviors. This paper introduces a novel +framework for tracking and quantifying upper and lower limb motions, issuing +alerts when critical thresholds are reached. Using joint position data from +posture estimation, the framework employs Hotelling's $T^2$ statistic to +quantify and monitor motion amounts. The results indicate that the correlation +between workers' joint motion amounts and Hotelling's $T^2$ statistic is +approximately 35\% higher for micro-tasks than macro-tasks, demonstrating the +framework's ability to detect fine-grained motion differences. This study +highlights the proposed system's effectiveness in real-time applications across +various industry settings, providing a valuable tool for precision motion +analysis and proactive ergonomic adjustments. + +
+
+
+
+
+ + ♻ ☆ LSSInst: Improving Geometric Modeling in LSS-Based BEV Perception with + Instance Representation 3DV 2025 + + +
+ With the attention gained by camera-only 3D object detection in autonomous +driving, methods based on Bird-Eye-View (BEV) representation especially derived +from the forward view transformation paradigm, i.e., lift-splat-shoot (LSS), +have recently seen significant progress. The BEV representation formulated by +the frustum based on depth distribution prediction is ideal for learning the +road structure and scene layout from multi-view images. However, to retain +computational efficiency, the compressed BEV representation such as in +resolution and axis is inevitably weak in retaining the individual geometric +details, undermining the methodological generality and applicability. With this +in mind, to compensate for the missing details and utilize multi-view geometry +constraints, we propose LSSInst, a two-stage object detector incorporating BEV +and instance representations in tandem. The proposed detector exploits +fine-grained pixel-level features that can be flexibly integrated into existing +LSS-based BEV networks. Having said that, due to the inherent gap between two +representation spaces, we design the instance adaptor for the BEV-to-instance +semantic coherence rather than pass the proposal naively. Extensive experiments +demonstrated that our proposed framework is of excellent generalization ability +and performance, which boosts the performances of modern LSS-based BEV +perception methods without bells and whistles and outperforms current LSS-based +state-of-the-art works on the large-scale nuScenes benchmark. + +
+
+ comment: Accepted by 3DV 2025 +
+
+
+
+
+ + ♻ ☆ Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face + Forgery Detection + + +
+ The rapid advancement of face forgery techniques has introduced a growing +variety of forgeries. Incremental Face Forgery Detection (IFFD), involving +gradually adding new forgery data to fine-tune the previously trained model, +has been introduced as a promising strategy to deal with evolving forgery +methods. However, a naively trained IFFD model is prone to catastrophic +forgetting when new forgeries are integrated, as treating all forgeries as a +single "Fake" class in the Real/Fake classification can cause different +forgery types overriding one another, thereby resulting in the forgetting of +unique characteristics from earlier tasks and limiting the model's +effectiveness in learning forgery specificity and generality. In this paper, we +propose to stack the latent feature distributions of previous and new tasks +brick by brick, $\textit{i.e.}$, achieving $\textbf{aligned feature +isolation}$. In this manner, we aim to preserve learned forgery information and +accumulate new knowledge by minimizing distribution overriding, thereby +mitigating catastrophic forgetting. To achieve this, we first introduce Sparse +Uniform Replay (SUR) to obtain the representative subsets that could be treated +as the uniformly sparse versions of the previous global distributions. We then +propose a Latent-space Incremental Detector (LID) that leverages SUR data to +isolate and align distributions. For evaluation, we construct a more advanced +and comprehensive benchmark tailored for IFFD. The leading experimental results +validate the superiority of our method. + +
+
+
+
+
+ + ♻ ☆ Taming Generative Diffusion Prior for Universal Blind Image Restoration + + +
+ Diffusion models have been widely utilized for image restoration. However, +previous blind image restoration methods still need to assume the type of +degradation model while leaving the parameters to be optimized, limiting their +real-world applications. Therefore, we aim to tame generative diffusion prior +for universal blind image restoration dubbed BIR-D, which utilizes an +optimizable convolutional kernel to simulate the degradation model and +dynamically update the parameters of the kernel in the diffusion steps, +enabling it to achieve blind image restoration results even in various complex +situations. Besides, based on mathematical reasoning, we have provided an +empirical formula for the choice of the adaptive guidance scale, eliminating the +need for a grid search for the optimal parameter. Experimentally, our BIR-D has +demonstrated superior practicality and versatility than off-the-shelf +unsupervised methods across various tasks both on real-world and synthetic +datasets, qualitatively and quantitatively. BIR-D is able to fulfill +multi-guidance blind image restoration. Moreover, BIR-D can also restore images +that undergo multiple and complicated degradations, demonstrating the practical +applications. + +
+
+ comment: 15 pages, 12 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ MoCha-Stereo: Motif Channel Attention Network for Stereo Matching CVPR 2024 + + +
+ Learning-based stereo matching techniques have made significant progress. +However, existing methods inevitably lose geometrical structure information +during the feature channel generation process, resulting in edge detail +mismatches. In this paper, the Motif Channel Attention Stereo Matching Network +(MoCha-Stereo) is designed to address this problem. We provide the Motif +Channel Correlation Volume (MCCV) to determine more accurate edge matching +costs. MCCV is achieved by projecting motif channels, which capture common +geometric structures in feature channels, onto feature maps and cost volumes. +In addition, edge variations in potential feature channels of the +reconstruction error map also affect details matching, we propose the +Reconstruction Error Motif Penalty (REMP) module to further refine the +full-resolution disparity estimation. REMP integrates the frequency information +of typical channel features from the reconstruction error. MoCha-Stereo ranks +1st on the KITTI-2015 and KITTI-2012 Reflective leaderboards. Our structure +also shows excellent performance in Multi-View Stereo. Code is available at +https://github.com/ZYangChen/MoCha-Stereo. + +
+
+ comment: Accepted to CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Place Recognition by Refining Temporal and Featural + Pseudo Labels from Panoramic Data + + +
+ Visual place recognition (VPR) using deep networks has achieved +state-of-the-art performance. However, most of them require a training set with +ground truth sensor poses to obtain positive and negative samples of each +observation's spatial neighborhood for supervised learning. When such +information is unavailable, temporal neighborhoods from a sequentially +collected data stream could be exploited for self-supervised training, although +we find its performance suboptimal. Inspired by noisy label learning, we +propose a novel self-supervised framework named TF-VPR that uses temporal +neighborhoods and learnable feature neighborhoods to discover unknown spatial +neighborhoods. Our method follows an iterative training paradigm which +alternates between: (1) representation learning with data augmentation, (2) +positive set expansion to include the current feature space neighbors, and (3) +positive set contraction via geometric verification. We conduct auto-labeling +and generalization tests on both simulated and real datasets, with either RGB +images or point clouds as inputs. The results show that our method outperforms +self-supervised baselines in recall rate, robustness, and heading diversity, a +novel metric we propose for VPR. Our code and datasets can be found at +https://ai4ce.github.io/TF-VPR/ + +
+
+
+
+
+ + ♻ ☆ NACNet: A Histology Context-aware Transformer Graph Convolution Network + for Predicting Treatment Response to Neoadjuvant Chemotherapy in Triple + Negative Breast Cancer + + +
+ Neoadjuvant chemotherapy (NAC) response prediction for triple negative breast +cancer (TNBC) patients is a challenging task clinically as it requires +understanding complex histology interactions within the tumor microenvironment +(TME). Digital whole slide images (WSIs) capture detailed tissue information, +but their giga-pixel size necessitates computational methods based on multiple +instance learning, which typically analyze small, isolated image tiles without +the spatial context of the TME. To address this limitation and incorporate TME +spatial histology interactions in predicting NAC response for TNBC patients, we +developed a histology context-aware transformer graph convolution network +(NACNet). Our deep learning method identifies the histopathological labels on +individual image tiles from WSIs, constructs a spatial TME graph, and +represents each node with features derived from tissue texture and social +network analysis. It predicts NAC response using a transformer graph +convolution network model enhanced with graph isomorphism network layers. We +evaluate our method with WSIs of a cohort of TNBC patients (N=105) and compared +its performance with multiple state-of-the-art machine learning and deep +learning models, including both graph and non-graph approaches. Our NACNet +achieves 90.0% accuracy, 96.0% sensitivity, 88.0% specificity, and an AUC of +0.82, through eight-fold cross-validation, outperforming baseline models. These +comprehensive experimental results suggest that NACNet holds strong potential +for stratifying TNBC patients by NAC response, thereby helping to prevent +overtreatment, improve patient quality of life, reduce treatment cost, and +enhance clinical outcomes, marking an important advancement toward personalized +breast cancer treatment. + +
+
+ comment: This paper is accepted by Computerized Medical Imaging and Graphics + (Nov 07 2024) +
+
+
+
+
+ + ♻ ☆ EndoOmni: Zero-Shot Cross-Dataset Depth Estimation in Endoscopy by + Robust Self-Learning from Noisy Labels + + +
+ Single-image depth estimation is essential for endoscopy tasks such as +localization, reconstruction, and augmented reality. Most existing methods in +surgical scenes focus on in-domain depth estimation, limiting their real-world +applicability. This constraint stems from the scarcity and inferior labeling +quality of medical data for training. In this work, we present EndoOmni, the +first foundation model for zero-shot cross-domain depth estimation for +endoscopy. To harness the potential of diverse training data, we refine the +advanced self-learning paradigm that employs a teacher model to generate +pseudo-labels, guiding a student model trained on large-scale labeled and +unlabeled data. To address training disturbance caused by inherent noise in +depth labels, we propose a robust training framework that leverages both depth +labels and estimated confidence from the teacher model to jointly guide the +student model training. Moreover, we propose a weighted scale-and-shift +invariant loss to adaptively adjust learning weights based on label confidence, +thus imposing learning bias towards cleaner label pixels while reducing the +influence of highly noisy pixels. Experiments on zero-shot relative depth +estimation show that our EndoOmni improves state-of-the-art methods in medical +imaging for 33\% and existing foundation models for 34\% in terms of absolute +relative error on specific datasets. Furthermore, our model provides strong +initialization for fine-tuning metric depth estimation, maintaining superior +performance in both in-domain and out-of-domain scenarios. The source code is +publicly available at https://github.com/TianCuteQY/EndoOmni. + +
+
+
+
+
+ + ♻ ☆ DDIM-Driven Coverless Steganography Scheme with Real Key + + +
+ Typical steganography embeds secret information into images by exploiting +their redundancy. Since the visual imperceptibility of secret information is a +key factor in scheme evaluation, conventional methods aim to balance this +requirement with embedding capacity. Consequently, integrating emerging image +generation models and secret transmission has been extensively explored to +achieve a higher embedding capacity. Previous works mostly focus on generating +stego-images with Generative Adversarial Networks (GANs) and usually rely on +pseudo-keys, namely conditions or parameters involved in the generation +process, which are related to secret images. However, studies on +diffusion-based coverless steganography remain insufficient. In this work, we +leverage the Denoising Diffusion Implicit Model (DDIM) to generate high-quality +stego-images without introducing pseudo-keys, instead employing real keys to +enhance security. Furthermore, our method offers low-image-correlation real-key +protection by incorporating chaotic encryption. Another core innovation is that +our method requires only one-time negotiation for multiple communications, +unlike prior methods that necessitate negotiation for each interaction. + +
+
+
+
+
+ + ♻ ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision ICPR + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. It includes 1,000 images with human +labels and 50,000 images with automatically generated weak labels. This dataset +could serve as a foundation for the research community to develop advanced +wrinkle detection algorithms. Second, we introduce a simple training strategy +utilizing texture maps, applicable to various segmentation models, to detect +wrinkles across the face. Our two-stage training strategy first pretrains models +on a large dataset with weak labels (N=50k), or masked texture maps generated +through computer vision techniques, without human intervention. We then +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. The network takes as input a combination of RGB and +masked texture map of the image, comprising four channels, in finetuning. We +effectively combine labels from multiple annotators to minimize subjectivity in +manual labeling. Our strategies demonstrate improved segmentation performance +in facial wrinkle segmentation both quantitatively and visually compared to +existing pretraining methods. The dataset is available at +https://github.com/labhai/ffhq-wrinkle-dataset. + +
+
+ comment: Accepted at International Conference on Pattern Recognition (ICPR), + 2024 +
+
+
+
+
+ + ♻ ☆ Weakly-supervised land classification for coastal zone based on deep + convolutional neural networks by incorporating dual-polarimetric + characteristics into training dataset + + +
+ In this work we explore the performance of DCNNs on semantic segmentation +using spaceborne polarimetric synthetic aperture radar (PolSAR) datasets. The +semantic segmentation task using PolSAR data can be categorized as weakly +supervised learning when the characteristics of SAR data and data annotating +procedures are factored in. Datasets are initially analyzed for selecting +feasible pre-training images. Then the differences between spaceborne and +airborne datasets are examined in terms of spatial resolution and viewing +geometry. In this study we used two dual-polarimetric images acquired by +TerraSAR-X DLR. A novel method to produce training dataset with more supervised +information is developed. Specifically, a series of typical classified images +as well as intensity images serve as training datasets. A field survey is +conducted for an area of about 20 square kilometers to obtain a ground truth +dataset used for accuracy evaluation. Several transfer learning strategies are +made for aforementioned training datasets which will be combined in a +practicable order. Three DCNN models, including SegNet, U-Net, and LinkNet, are +implemented next. + +
+
+ comment: We are sorry that there are some minor errors in the experimental + results. We need to make some improvements to the results and request to + withdraw the submission. +
+
+
+
+
+ + ♻ ☆ Improving Visual Place Recognition Based Robot Navigation By Verifying + Localization Estimates + + +
+ Visual Place Recognition (VPR) systems often have imperfect performance, +affecting the `integrity' of position estimates and subsequent robot navigation +decisions. Previously, SVM classifiers have been used to monitor VPR integrity. +This research introduces a novel Multi-Layer Perceptron (MLP) integrity monitor +which demonstrates improved performance and generalizability, removing +per-environment training and reducing manual tuning requirements. We test our +proposed system in extensive real-world experiments, presenting two real-time +integrity-based VPR verification methods: a single-query rejection method for +robot navigation to a goal zone (Experiment 1); and a history-of-queries method +that takes a best, verified, match from its recent trajectory and uses an +odometer to extrapolate a current position estimate (Experiment 2). Noteworthy +results for Experiment 1 include a decrease in aggregate mean along-track goal +error from ~9.8m to ~3.1m, and an increase in the aggregate rate of successful +mission completion from ~41% to ~55%. Experiment 2 showed a decrease in +aggregate mean along-track localization error from ~2.0m to ~0.5m, and an +increase in the aggregate localization precision from ~97% to ~99%. Overall, +our results demonstrate the practical usefulness of a VPR integrity monitor in +real-world robotics to improve VPR localization and consequent navigation +performance. + +
+
+ comment: Author Accepted Preprint for Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. All +the code, models, demo and organized data have been open sourced on our Github +Repo. + +
+
+ comment: Camera Ready Version. Project Page: + https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data: + https://github.com/liming-ai/ControlNet_Plus_Plus +
+
+
+
+
+ + ♻ ☆ Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in + Vision-Language Alignment + + +
+ The recent advancements in large language models (LLMs) and pre-trained +vision models have accelerated the development of vision-language large models +(VLLMs), enhancing the interaction between visual and linguistic modalities. +Despite their notable success across various domains, VLLMs face challenges in +modality alignment, which can lead to issues like hallucinations and unsafe +content generation. Current alignment techniques often rely on coarse feedback +and external datasets, limiting scalability and performance. In this paper, we +propose FiSAO (Fine-Grained Self-Alignment Optimization), a novel +self-alignment method that utilizes the model's own visual encoder as a +fine-grained verifier to improve vision-language alignment without the need for +additional data. By leveraging token-level feedback from the vision encoder, +FiSAO significantly improves vision-language alignment, even surpassing +traditional preference tuning methods that require additional data. Through +both theoretical analysis and experimental validation, we demonstrate that +FiSAO effectively addresses the misalignment problem in VLLMs, marking the +first instance of token-level rewards being applied to such models. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Repositioning the Subject within Image + + +
+ Current image manipulation primarily centers on static manipulation, such as +replacing specific regions within an image or altering its overall style. In +this paper, we introduce an innovative dynamic manipulation task, subject +repositioning. This task involves relocating a user-specified subject to a +desired position while preserving the image's fidelity. Our research reveals +that the fundamental sub-tasks of subject repositioning, which include filling +the void left by the repositioned subject, reconstructing obscured portions of +the subject and blending the subject to be consistent with surrounding areas, +can be effectively reformulated as a unified, prompt-guided inpainting task. +Consequently, we can employ a single diffusion generative model to address +these sub-tasks using various task prompts learned through our proposed task +inversion technique. Additionally, we integrate pre-processing and +post-processing techniques to further enhance the quality of subject +repositioning. These elements together form our SEgment-gEnerate-and-bLEnd +(SEELE) framework. To assess SEELE's effectiveness in subject repositioning, we +assemble a real-world subject repositioning dataset called ReS. Results of +SEELE on ReS demonstrate its efficacy. Code and ReS dataset are available at +https://yikai-wang.github.io/seele/. + +
+
+ comment: Accepted by TMLR. Arxiv version uses small size images. Full size + PDF, code, and dataset are available at https://yikai-wang.github.io/seele/ +
+
+
+
+
+ + ♻ ☆ Beyond Perceptual Distances: Rethinking Disparity Assessment for + Out-of-Distribution Detection with Diffusion Models + + +
+ Out-of-Distribution (OoD) detection aims to justify whether a given sample is +from the training distribution of the classifier-under-protection, i.e., +In-Distribution (InD), or from OoD. Diffusion Models (DMs) are recently +utilized in OoD detection by using the perceptual distances between the given +image and its DM generation. DM-based methods bring fresh insights to the +field, yet remain under-explored. + In this work, we point out two main limitations in DM-based OoD detection +methods: (i) the perceptual metrics on the disparities between the given sample +and its generation are devised only at human-perceived levels, ignoring the +abstract or high-level patterns that help better reflect the intrinsic +disparities in distribution; (ii) only the raw image contents are taken to +measure the disparities, while other representations, i.e., the features and +probabilities from the classifier-under-protection, are easy to access at hand +but are ignored. To this end, our proposed detection framework goes beyond the +perceptual distances and looks into the deep representations from the +classifier-under-protection with our novel metrics devised correspondingly, +leading to more informative disparity assessments between InD and OoD. An +anomaly-removal strategy is integrated to remove the abnormal OoD information +in the generation, further enhancing the distinctiveness of disparities. Our +work has demonstrated state-of-the-art detection performances among DM-based +methods in extensive experiments. + +
+
+
+
+
+ + ♻ ☆ Multi-LoRA Composition for Image Generation + + +
+ Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models +for the accurate rendition of specific elements like distinct characters or +unique styles in generated images. Nonetheless, existing methods face +challenges in effectively composing multiple LoRAs, especially as the number of +LoRAs to be integrated grows, thus hindering the creation of complex imagery. +In this paper, we study multi-LoRA composition through a decoding-centric +perspective. We present two training-free methods: LoRA Switch, which +alternates between different LoRAs at each denoising step, and LoRA Composite, +which simultaneously incorporates all LoRAs to guide more cohesive image +synthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new +comprehensive testbed as part of this research. It features a diverse range of +LoRA categories with 480 composition sets. Utilizing an evaluation framework +based on GPT-4V, our findings demonstrate a clear improvement in performance +with our methods over the prevalent baseline, particularly evident when +increasing the number of LoRAs in a composition. The code, benchmarks, LoRA +weights, and all evaluation details are available on our project website: +https://maszhongming.github.io/Multi-LoRA-Composition. + +
+
+ comment: Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+ + ♻ ☆ Head and Neck Tumor Segmentation from [18F]F-FDG PET/CT Images Based on + 3D Diffusion Model + + +
+ Head and neck (H&N) cancers are among the most prevalent types of cancer +worldwide, and [18F]F-FDG PET/CT is widely used for H&N cancer management. +Recently, the diffusion model has demonstrated remarkable performance in +various image-generation tasks. In this work, we proposed a 3D diffusion model +to accurately perform H&N tumor segmentation from 3D PET and CT volumes. The 3D +diffusion model was developed considering the 3D nature of PET and CT images +acquired. During the reverse process, the model utilized a 3D UNet structure +and took the concatenation of PET, CT, and Gaussian noise volumes as the +network input to generate the tumor mask. Experiments based on the HECKTOR +challenge dataset were conducted to evaluate the effectiveness of the proposed +diffusion model. Several state-of-the-art techniques based on U-Net and +Transformer structures were adopted as the reference methods. Benefits of +employing both PET and CT as the network input as well as further extending the +diffusion model from 2D to 3D were investigated based on various quantitative +metrics and the uncertainty maps generated. Results showed that the proposed 3D +diffusion model could generate more accurate segmentation results compared with +other methods. Compared to the diffusion model in 2D format, the proposed 3D +model yielded superior results. Our experiments also highlighted the advantage +of utilizing dual-modality PET and CT data over only single-modality data for +H&N tumor segmentation. + +
+
+
+
+
+ + ♻ ☆ MVInpainter: Learning Multi-View Consistent Inpainting to Bridge 2D and + 3D Editing NeurIPS2024 + + +
+ Novel View Synthesis (NVS) and 3D generation have recently achieved prominent +improvements. However, these works mainly focus on confined categories or +synthetic 3D assets, which are discouraged from generalizing to challenging +in-the-wild scenes and fail to be employed with 2D synthesis directly. +Moreover, these methods heavily depended on camera poses, limiting their +real-world applications. To overcome these issues, we propose MVInpainter, +re-formulating the 3D editing as a multi-view 2D inpainting task. Specifically, +MVInpainter partially inpaints multi-view images with the reference guidance +rather than intractably generating an entirely novel view from scratch, which +largely simplifies the difficulty of in-the-wild NVS and leverages unmasked +clues instead of explicit pose conditions. To ensure cross-view consistency, +MVInpainter is enhanced by video priors from motion components and appearance +guidance from concatenated reference key&value attention. Furthermore, +MVInpainter incorporates slot attention to aggregate high-level optical flow +features from unmasked regions to control the camera movement with pose-free +training and inference. Sufficient scene-level experiments on both +object-centric and forward-facing datasets verify the effectiveness of +MVInpainter, including diverse tasks, such as multi-view object removal, +synthesis, insertion, and replacement. The project page is +https://ewrfcas.github.io/MVInpainter/. + +
+
+ comment: Project page: https://ewrfcas.github.io/MVInpainter/. Accepted at + NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting + + +
+ Previous text-to-4D methods have leveraged multiple Score Distillation +Sampling (SDS) techniques, combining motion priors from video-based diffusion +models (DMs) with geometric priors from multiview DMs to implicitly guide 4D +renderings. However, differences in these priors result in conflicting gradient +directions during optimization, causing trade-offs between motion fidelity and +geometry accuracy, and requiring substantial optimization time to reconcile the +models. In this paper, we introduce Pixel-Level +Alignment for text-driven 4D Gaussian splatting (PLA4D) to +resolve this motion-geometry conflict. PLA4D provides an anchor reference, +i.e., text-generated video, to align the rendering process conditioned by +different DMs in pixel space. For static alignment, our approach introduces a +focal alignment method and Gaussian-Mesh contrastive learning to iteratively +adjust focal lengths and provide explicit geometric priors at each timestep. At +the dynamic level, a motion alignment technique and T-MV refinement method are +employed to enforce both pose alignment and motion continuity across unknown +viewpoints, ensuring intrinsic geometric consistency across views. With such +pixel-level multi-DM alignment, our PLA4D framework is able to generate 4D +objects with superior geometric, motion, and semantic consistency. Fully +implemented with open-source tools, PLA4D offers an efficient and accessible +solution for high-quality 4D digital content creation with significantly +reduced generation time. + +
+
+
+
+
+ + ♻ ☆ JoyVASA: Portrait and Animal Image Animation with Diffusion-Based + Audio-Driven Facial Dynamics and Head Motion Generation + + +
+ Audio-driven portrait animation has made significant advances with +diffusion-based models, improving video quality and lipsync accuracy. However, +the increasing complexity of these models has led to inefficiencies in training +and inference, as well as constraints on video length and inter-frame +continuity. In this paper, we propose JoyVASA, a diffusion-based method for +generating facial dynamics and head motion in audio-driven facial animation. +Specifically, in the first stage, we introduce a decoupled facial +representation framework that separates dynamic facial expressions from static +3D facial representations. This decoupling allows the system to generate longer +videos by combining any static 3D facial representation with dynamic motion +sequences. Then, in the second stage, a diffusion transformer is trained to +generate motion sequences directly from audio cues, independent of character +identity. Finally, a generator trained in the first stage uses the 3D facial +representation and the generated motion sequences as inputs to render +high-quality animations. With the decoupled facial representation and the +identity-independent motion generation process, JoyVASA extends beyond human +portraits to animate animal faces seamlessly. The model is trained on a hybrid +dataset of private Chinese and public English data, enabling multilingual +support. Experimental results validate the effectiveness of our approach. +Future work will focus on improving real-time performance and refining +expression control, further expanding the applications in portrait animation. +The code is available at: https://github.com/jdh-algo/JoyVASA. + +
+
+
+
+
+ + ♻ ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic datasets that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. Our code is available at +https://github.com/AngusDujw/Diversity-Driven-Synthesis. + +
+
+
+
+
+ + ♻ ☆ SOWA: Adapting Hierarchical Frozen Window Self-Attention to + Visual-Language Models for Better Anomaly Detection + + +
+ Visual anomaly detection is essential in industrial manufacturing, yet +traditional methods often rely heavily on extensive normal datasets and +task-specific models, limiting their scalability. Recent advancements in +large-scale vision-language models have significantly enhanced zero- and +few-shot anomaly detection. However, these approaches may not fully leverage +hierarchical features, potentially overlooking nuanced details crucial for +accurate detection. To address this, we introduce a novel window self-attention +mechanism based on the CLIP model, augmented with learnable prompts to process +multi-level features within a Soldier-Officer Window Self-Attention (SOWA) +framework. Our method has been rigorously evaluated on five benchmark datasets, +achieving superior performance by leading in 18 out of 20 metrics, setting a +new standard against existing state-of-the-art techniques. + +
+
+ comment: 8 pages, 9 figures, conference +
+
+
+
+
+ + ♻ ☆ Towards Unsupervised Blind Face Restoration using Diffusion Prior WACV 2025 + + +
+ Blind face restoration methods have shown remarkable performance, +particularly when trained on large-scale synthetic datasets with supervised +learning. These datasets are often generated by simulating low-quality face +images with a handcrafted image degradation pipeline. The models trained on +such synthetic degradations, however, cannot deal with inputs of unseen +degradations. In this paper, we address this issue by using only a set of input +images, with unknown degradations and without ground truth targets, to +fine-tune a restoration model that learns to map them to clean and contextually +consistent outputs. We utilize a pre-trained diffusion model as a generative +prior through which we generate high quality images from the natural image +distribution while maintaining the input image content through consistency +constraints. These generated images are then used as pseudo targets to +fine-tune a pre-trained restoration model. Unlike many recent approaches that +employ diffusion models at test time, we only do so during training and thus +maintain an efficient inference-time performance. Extensive experiments show +that the proposed approach can consistently improve the perceptual quality of +pre-trained blind face restoration models while maintaining great consistency +with the input contents. Our best model also achieves the state-of-the-art +results on both synthetic and real-world datasets. + +
+
+ comment: WACV 2025. Project page: https://dt-bfr.github.io/ +
+
+
+
+
+
+
+
+ + Systems and Control 20 + +
+
+
+ + ☆ ACING: Actor-Critic for Instruction Learning in Black-Box Large Language + Models + + +
+ The effectiveness of Large Language Models (LLMs) in solving tasks vastly +depends on the quality of the instructions, which often require fine-tuning +through extensive human effort. This highlights the need for automated +instruction optimization; however, this optimization is particularly +challenging when dealing with black-box LLMs, where model parameters and +gradients remain inaccessible. We propose ACING, a task-specific prompt +optimization approach framed as a stateless continuous-action Reinforcement +Learning (RL) problem, known as the continuum bandit setting. ACING leverages +an actor-critic-based method to optimize prompts, learning from +non-differentiable reward signals. We validate ACING by optimizing prompts for +ChatGPT on 30 instruction-based tasks. ACING consistently outperforms baseline +methods, achieving a median score improvement of 10 percentage points. +Furthermore, ACING not only recovers but also surpasses human-crafted expert +instructions, achieving up to a 39 percentage point improvement against human +benchmarks. + +
+
+
+
+
+ + ☆ Distributed Coordination of Grid-Forming and Grid-Following + Inverter-Based Resources for Optimal Frequency Control in Power Systems + + +
+ With the fast-growing penetration of power inverter-interfaced renewable +generation, power systems face significant challenges in maintaining power +balance and the nominal frequency. This paper studies the grid-level +coordinated control of a mix of grid-forming (GFM) and grid-following (GFL) +inverter-based resources (IBRs) for power system frequency regulation at scale. +Specifically, a fully distributed optimal frequency control algorithm is +proposed by leveraging the projected primal-dual gradient method and the +structure of the physical system dynamics. This algorithm 1) restores the +nominal frequency, 2) minimizes the total control cost, 3) respects the IBR +power limits and the line thermal constraints, and 4) is implemented in a +distributed fashion that only needs local measurement and local communication. +The effectiveness and optimality of the proposed algorithm are demonstrated +through high-fidelity electromagnetic transient (EMT) simulations on the IEEE +39-bus system. + +
+
+
+
+
+ + ☆ Steady-State Initialization of Object-Oriented Advanced Thermal Power + Generation System Models with Application to the Case of the SOS-CO2 Cycle + + +
+ The forthcoming energy transition calls for a new generation of thermal power +generation systems with low- or zero-emission and highly flexible operation. +Dynamic modelling and simulation is a key enabling factor in this field, as +controlling such plants is a difficult task for which there is no previous +experience and very short design times are expected. The steady-state +initialization of those dynamic models is an essential step in the design +process, but is unfortunately a difficult task which involves the numerical +solution of large systems of nonlinear equations with iterative Newton methods, +which is often prone to numerical failures. + In this work, several strategies and methodologies are discussed to +successfully achieve steady-state initialization of first-principles +equation-based, object-oriented models of advanced thermal power generation +systems. These are presented in the context of the Modelica modelling language, +but could be applied to other equation-based, object-oriented modelling and +simulation environments. + Finally, the successful application of such strategies and methodologies to +the SOS-CO2 advanced power generation system is presented. + +
+
+ comment: Submitted to Simulation Modelling Practice and Theory +
+
+
+
+
+ + ☆ Smart Predict-then-Optimize Method with Dependent Data: Risk Bounds and + Calibration of Autoregression + + +
+ The predict-then-optimize (PTO) framework is indispensable for addressing +practical stochastic decision-making tasks. It consists of two crucial steps: +initially predicting unknown parameters of an optimization model and +subsequently solving the problem based on these predictions. Elmachtoub and +Grigas [1] introduced the Smart Predict-then-Optimize (SPO) loss for the +framework, which gauges the decision error arising from predicted parameters, +and a convex surrogate, the SPO+ loss, which incorporates the underlying +structure of the optimization model. The consistency of these different loss +functions is guaranteed under the assumption of i.i.d. training data. +Nevertheless, various types of data are often dependent, such as power load +fluctuations over time. This dependent nature can lead to diminished model +performance in testing or real-world applications. Motivated to make +intelligent predictions for time series data, we present an autoregressive SPO +method directly targeting the optimization problem at the decision stage in +this paper, where the conditions of consistency are no longer met. Therefore, +we first analyze the generalization bounds of the SPO loss within our +autoregressive model. Subsequently, the uniform calibration results in Liu and +Grigas [2] are extended in the proposed model. Finally, we conduct experiments +to empirically demonstrate the effectiveness of the SPO+ surrogate compared to +the absolute loss and the least squares loss, especially when the cost vectors +are determined by stationary dynamical systems and demonstrate the relationship +between normalized regret and mixing coefficients. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Probabilistic Day-Ahead Battery Scheduling based on Mixed Random + Variables for Enhanced Grid Operation + + +
+ The increasing penetration of renewable energy sources introduces significant +challenges to power grid stability, primarily due to their inherent +variability. A new opportunity for grid operation is the smart integration of +electricity production combined with battery storages in residential buildings. +This study explores how residential battery systems can aid in stabilizing the +power grid by flexibly managing deviations from forecasted residential power +consumption and PV generation. The key contribution of this work is the +development of an analytical approach that enables the asymmetric allocation of +quantified power uncertainties between a residential battery system and the +power grid, introducing a new degree of freedom into the scheduling problem. +This is accomplished by employing mixed random variables - characterized by +both continuous and discrete events - to model battery and grid power +uncertainties. These variables are embedded into a continuous stochastic +optimization framework, which computes probabilistic schedules for battery +operation and power exchange with the grid. Test cases demonstrate that the +proposed framework can be used effectively to reduce and quantify grid +uncertainties while minimizing electricity costs. It is also shown that +residential battery systems can be actively used to provide flexibility during +critical periods of grid operation. Overall, this framework empowers prosumers +to take an active role in grid stabilization, contributing to a more resilient +and adaptive energy system. + +
+
+ comment: 12 pages, 7 figures, submitted to IREP 2025 Symposium +
+
+
+
+
+ + ☆ Robotic transcatheter tricuspid valve replacement with hybrid enhanced + intelligence: a new paradigm and first-in-vivo study + + +
+ Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for +tricuspid regurgitation and is in the early stages of clinical adoption. +Intelligent robotic approaches are expected to overcome the challenges of +surgical manipulation and widespread dissemination, but systems and protocols +with high clinical utility have not yet been reported. In this study, we +propose a complete solution that includes a passive stabilizer, robotic drive, +detachable delivery catheter and valve manipulation mechanism. Working towards +autonomy, a hybrid augmented intelligence approach based on reinforcement +learning, Monte Carlo probabilistic maps and human-robot co-piloted control was +introduced. Systematic tests in phantom and first-in-vivo animal experiments +were performed to verify that the system design met the clinical requirement. +Furthermore, the experimental results confirmed the advantages of co-piloted +control over conventional master-slave control in terms of time efficiency, +control efficiency, autonomy and stability of operation. In conclusion, this +study provides a comprehensive pathway for robotic TTVR and, to our knowledge, +completes the first animal study that not only successfully demonstrates the +application of hybrid enhanced intelligence in interventional robotics, but +also provides a solution with high application value for a cutting-edge +procedure. + +
+
+
+
+
+ + ☆ Service Restoration for Distribution Systems Based on Semi-Analytical + Metamodeling of Decision-Dependent Interruption Cost and Cold Load Pickup + + +
+ Developing optimized restoration strategies for power distribution systems +(PDSs) is essential to meet the pressing demand for enhanced resilience. Prior +knowledge of customer interruption cost (CIC) and load restoration behaviors, +particularly cold load pickup (CLPU), is crucial for guiding effective +restoration; however, both are reciprocally affected by the realized customer +interruption duration (CID), making them decision-dependent and challenging to +model especially given the limited understanding of underlying physical +mechanisms. This paper presents a novel approach by constructing tractable +metamodels to capture the varying patterns of CIC and CLPU with CID - patterns +which can be derived from limited data and reflect observed surface-level +correlations rather than underlying mechanisms, thereby enabling practical +surrogate modeling of these decision-dependencies. Specifically, quadratic +functions are used to model the increasing rate of CIC with CID based on data +fitting. Several defining characteristics of CLPU are extracted, each modeled +in a piecewise linear form relative to CID, and the actual restored load +accounting for CLPU is subsequently retrieved. Building on these metamodels, a +PDS restoration optimization model is constructed, incorporating mobile energy +storage systems (MESSs) and network reconfiguration. Case studies validate our +approach and also highlight MESS's unique potential to accelerate CLPU-related +restoration. + +
+
+ comment: 10 pages, 10 figures, submitted to IEEE Transactions on Smart Grid +
+
+
+
+
+ + ☆ Age of Information Minimization in UAV-Assisted Covert Communication: + Trajectory and Beamforming Design + + +
+ Unmanned aerial vehicles (UAVs) have the potential for time-sensitive +applications. Due to wireless channel variation, received data may have an +expiration time, particularly in critical situations such as rescue operations, +natural disasters, or the military. Age of Information (AoI) is a metric that +measures the freshness of received packets to specify the validity period of +information. In addition, it is necessary to guarantee the privacy of +confidential information transmission through air-to-ground links against +eavesdroppers. This paper investigates UAV-assisted covert communication to +minimize AoI in the presence of an aerial eavesdropper for the first time. +However, to ensure the eavesdropper's error detection rate, UAV-enabled +beamforming employs the power-domain non-orthogonal multiple access (PD-NOMA) +technique to cover the covert user by a public user. The PD-NOMA technique +also significantly improves the user's AoI. The joint optimization problem +contains non-convex constraints and coupled optimization variables, including +UAV trajectory, beamforming design, and the user's AoI, which makes it challenging to +derive a direct solution. We have developed an efficient alternating +optimization technique to address the formulated optimization problem. +Numerical results demonstrate the impact of the main parameters on the +performance of the proposed communication system. + +
+
+
+
+
+ + ☆ The Soft-PVTOL: modeling and control + + +
+ This paper presents, for the first time, the soft planar vertical take-off +and landing (Soft-PVTOL) aircraft. This concept captures the soft aerial +vehicle's fundamental dynamics with a minimum number of states and inputs but +retains the main features to consider when designing control laws. Unlike +conventional PVTOL and multi-rotors, where altering position inevitably impacts +orientation due to their underactuated design, the Soft-PVTOL offers the unique +advantage of separating these dynamics, opening doors to unparalleled +maneuverability and precision. We demonstrate that the Soft-PVTOL can be +modeled using the Euler-Lagrange equations by assuming a constant curvature +model in the aerial robot's arms. Such a mathematical model is presented in +detail and can be extended to several constant curvature segments in each +Soft-PVTOL arm. Moreover, we design a passivity-based control law that exploits +the flexibility of the robot's arms. We solve the tracking control problem, +proving that the error equilibrium globally exponentially converges to zero. +The controller is tested in numerical simulations, demonstrating robust +performance and ensuring the efficacy of the closed-loop system. + +
+
+ comment: This manuscript has been submitted for peer review +
+
+
+
+
+ + ☆ A Control Lyapunov Function Approach to Event-Triggered Parameterized + Control for Discrete-Time Linear Systems + + +
+ This paper proposes an event-triggered parameterized control method using a +control Lyapunov function approach for discrete time linear systems with +external disturbances. In this control method, each control input to the plant +is a linear combination of a fixed set of linearly independent scalar +functions. The controller updates the coefficients of the parameterized control +input in an event-triggered manner so as to minimize a quadratic cost function +subject to quadratic constraints and communicates the same to the actuator. We +design an event-triggering rule that guarantees global uniform ultimate +boundedness of trajectories of the closed loop system and non-trivial +inter-event times. We illustrate our results through numerical examples and we +also compare the performance of the proposed control method with other existing +control methods in the literature. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2402.16337 +
+
+
+
+
+ + ☆ Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of + Beamlines + + +
+ Synchrotron radiation sources play a crucial role in fields such as materials +science, biology, and chemistry. The beamline, a key subsystem of the +synchrotron, modulates and directs the radiation to the sample for analysis. +However, the alignment of beamlines is a complex and time-consuming process, +primarily carried out manually by experienced engineers. Even minor +misalignments in optical components can significantly affect the beam's +properties, leading to suboptimal experimental outcomes. Although current automated +methods, such as Bayesian optimization (BO) and reinforcement learning (RL), +enhance performance, limitations remain. The +relationship between the current and target beam properties, crucial for +determining the adjustment, is not fully considered. Additionally, the physical +characteristics of optical elements are overlooked, such as the need to adjust +specific devices to control the output beam's spot size or position. This paper +addresses the alignment of beamlines by modeling it as a Markov Decision +Process (MDP) and training an intelligent agent using RL. The agent calculates +adjustment values based on the current and target beam states, executes +actions, and iterates until optimal parameters are achieved. A policy network +with action attention is designed to improve decision-making by considering +both state differences and the impact of optical components. Experiments on two +simulated beamlines demonstrate that our algorithm outperforms existing +methods, with ablation studies highlighting the effectiveness of the action +attention-based policy network. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ Microsegmented Cloud Network Architecture Using Open-Source Tools for a + Zero Trust Foundation + + +
+ This paper presents a multi-cloud networking architecture built on zero trust +principles and micro-segmentation to provide secure connectivity with +authentication, authorization, and encryption in transit. The proposed design +includes the multi-cloud network to support a wide range of applications and +workload use cases, compute resources including containers, virtual machines, +and cloud-native services, including IaaS (Infrastructure as a Service) and +PaaS (Platform as a Service). Furthermore, open-source tools provide +flexibility, agility, and independence from locking to one vendor technology. +The paper provides a secure architecture with micro-segmentation and follows +zero trust principles to solve multi-fold security and operational challenges. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Sensor-fusion based Prognostics Framework for Complex Engineering + Systems Exhibiting Multiple Failure Modes + + +
+ Complex engineering systems are often subject to multiple failure modes. +Developing a remaining useful life (RUL) prediction model that does not +consider the failure mode causing degradation is likely to result in inaccurate +predictions. However, distinguishing between causes of failure without manually +inspecting the system is nontrivial. This challenge is increased when the +causes of historically observed failures are unknown. Sensors, which are useful +for monitoring the state-of-health of systems, can also be used for +distinguishing between multiple failure modes as the presence of multiple +failure modes results in discriminatory behavior of the sensor signals. When +systems are equipped with multiple sensors, some sensors may exhibit behavior +correlated with degradation, while other sensors do not. Furthermore, which +sensors exhibit this behavior may differ for each failure mode. In this paper, +we present a simultaneous clustering and sensor selection approach for +unlabeled training datasets of systems exhibiting multiple failure modes. The +cluster assignments and the selected sensors are then utilized in real-time to +first diagnose the active failure mode and then to predict the system RUL. We +validate the complete pipeline of the methodology using a simulated dataset of +systems exhibiting two failure modes and on a turbofan degradation dataset from +NASA. + +
+
+
+
+
+ + ☆ Tangential Randomization in Linear Bandits (TRAiL): Guaranteed Inference + and Regret Bounds + + +
+ We propose and analyze TRAiL (Tangential Randomization in Linear Bandits), a +computationally efficient regret-optimal forced exploration algorithm for +linear bandits on action sets that are sublevel sets of strongly convex +functions. TRAiL estimates the governing parameter of the linear bandit problem +through a standard regularized least squares and perturbs the reward-maximizing +action corresponding to said point estimate along the tangent plane of the +convex compact action set before projecting back to it. Exploiting +concentration results for matrix martingales, we prove that TRAiL ensures an +$\Omega(\sqrt{T})$ growth in the inference quality, measured via the minimum +eigenvalue of the design (regressor) matrix with high probability over a +$T$-length period. We build on this result to obtain an $\mathcal{O}(\sqrt{T} +\log(T))$ upper bound on cumulative regret with probability at least $ 1 - 1/T$ +over $T$ periods, and compare TRAiL to other popular algorithms for linear +bandits. Then, we characterize an $\Omega(\sqrt{T})$ minimax lower bound for +any algorithm on the expected regret that covers a wide variety of +action/parameter sets and noise processes. Our analysis not only expands the +realm of lower-bounds in linear bandits significantly, but as a byproduct, +yields a trade-off between regret and inference quality. Specifically, we prove +that any algorithm with an $\mathcal{O}(T^\alpha)$ expected regret growth must +have an $\Omega(T^{1-\alpha})$ asymptotic growth in expected inference quality. +Our experiments on the $L^p$ unit ball as action sets reveal how this relation +can be violated, but only in the short-run, before returning to respect the +bound asymptotically. In effect, regret-minimizing algorithms must have just +the right rate of inference -- too fast or too slow inference will incur +sub-optimal regret growth. + +
+
+ comment: 42 pages, 6 Figures +
+
+
+
+
+ + ☆ Development of a Comprehensive Physics-Based Battery Model and Its + Multidimensional Comparison with an Equivalent-Circuit Model: Accuracy, + Complexity, and Real-World Performance under Varying Conditions + + +
+ This paper develops a comprehensive physics-based model (PBM) that spans a +wide operational range, including varying temperatures, charge/discharge +conditions, and real-world field data cycles. The PBM incorporates key factors +such as hysteresis effects, concentration-dependent diffusivity, and the +Arrhenius law to provide a realistic depiction of battery behavior. +Additionally, the paper presents an in-depth analysis comparing the PBM with an +equivalent-circuit model (ECM) for accurately capturing the dynamics of +lithium-ion batteries under diverse operating conditions. To ensure a fair +comparison, both the PBM and ECM are rigorously calibrated and validated +through parameter identification and testing across 55 different operating +conditions. To the best of the authors' knowledge, this represents the most +comprehensive model calibration and validation effort for PBM and ECM in the +literature to date, encompassing large temperature variations (-20 to +40 °C), various charging/discharging C-rates, and real-world driving +cycles. Comparative analysis between the PBM and ECM highlights key differences +in accuracy, computational complexity, parameterization requirements, and +performance under varying temperature conditions, offering guidance for selecting appropriate models for +battery management applications. + +
+
+
+
+
+ + ☆ Adversarial Multi-Agent Reinforcement Learning for Proactive False Data + Injection Detection + + +
+ Smart inverters are instrumental in the integration of renewable and +distributed energy resources (DERs) into the electric grid. Such inverters rely +on communication layers for continuous control and monitoring, potentially +exposing them to cyber-physical attacks such as false data injection attacks +(FDIAs). We propose to construct a defense strategy against a priori unknown +FDIAs with a multi-agent reinforcement learning (MARL) framework. The first +agent is an adversary that simulates and discovers various FDIA strategies, +while the second agent is a defender in charge of detecting and localizing +FDIAs. This approach enables the defender to be trained against new FDIAs +continuously generated by the adversary. The numerical results demonstrate that +the proposed MARL defender outperforms a supervised offline defender. +Additionally, we show that the detection skills of an MARL defender can be +combined with that of an offline defender through a transfer learning approach. + +
+
+
+
+
+ + ♻ ☆ When are Lossy Energy Storage Optimization Models Convex? + + +
+ We examine a class of optimization problems involving the optimal operation +of a single lossy energy storage system, where energy losses occur during +charging and discharging. These inefficiencies typically lead to a nonconvex +set of feasible charging and discharging power profiles. In this paper, we +derive an equivalent reformulation of this class of optimization problems by +eliminating the charging and discharging power variables and recasting the +problem entirely in terms of the storage state-of-charge variables. We show +that the feasible set of the proposed reformulation is always convex. We also +provide sufficient conditions under which the objective function of the +proposed reformulation is guaranteed to be convex. The conditions provided both +unify and generalize many existing conditions for convexity in the literature. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Cross-Forming Control and Fault Current Limiting for Grid-Forming + Inverters + + +
+ This article proposes a "cross-forming" control concept for grid-forming +inverters operating against grid faults. Cross-forming refers to voltage angle +forming and current magnitude forming. It differs from classical grid-forming +and grid-following paradigms that feature voltage magnitude-and-angle forming +and voltage magnitude-and-angle following (or current magnitude-and-angle +forming), respectively. The cross-forming concept addresses the need for +inverters to remain grid-forming (particularly voltage angle forming, as +required by grid codes) while managing fault current limitation. Simple and +feasible cross-forming control implementations are proposed, enabling inverters +to quickly limit fault currents to a prescribed level while preserving voltage +angle forming for grid-forming synchronization and providing dynamic ancillary +services, during symmetrical or asymmetrical fault ride-through. Moreover, the +cross-forming control yields an equivalent system featuring a constant virtual +impedance and a "normal form" representation, allowing for the extension of +previously established transient stability results to include scenarios +involving current saturation. Simulations and experiments validate the efficacy +of the proposed cross-forming control implementations. + +
+
+
+
+
+ + ♻ ☆ Railway LiDAR semantic segmentation based on intelligent semi-automated + data annotation + + +
+ Automated vehicles rely on an accurate and robust perception of the +environment. Similarly to automated cars, highly automated trains require an +environmental perception. Although there is a lot of research based on either +camera or LiDAR sensors in the automotive domain, very few contributions for +this task exist yet for automated trains. Additionally, no public dataset or +described approach for a 3D LiDAR semantic segmentation in the railway +environment exists yet. Thus, we propose an approach for a point-wise 3D +semantic segmentation based on the 2DPass network architecture using scans and +images jointly. In addition, we present a semi-automated intelligent data +annotation approach, which we use to efficiently and accurately label the +required dataset recorded on a railway track in Germany. To improve performance +despite a still small number of labeled scans, we apply an active learning +approach to intelligently select scans for the training dataset. Our +contributions are threefold: We annotate rail data including camera and LiDAR +data from the railway environment, transfer label the raw LiDAR point clouds +using an image segmentation network, and train a state-of-the-art 3D LiDAR +semantic segmentation network efficiently leveraging active learning. The +trained network achieves good segmentation results with a mean IoU of 71.48% of +9 classes. + +
+
+ comment: This article has been accepted for publication in the IEEE VTC Fall + 2024 +
+
+
+
+
+ + ♻ ☆ SAFE-GIL: SAFEty Guided Imitation Learning for Robotic Systems + + +
+ Behavior cloning (BC) is a widely-used approach in imitation learning, where +a robot learns a control policy by observing an expert supervisor. However, the +learned policy can make errors and might lead to safety violations, which +limits their utility in safety-critical robotics applications. While prior +works have tried improving a BC policy via additional real or synthetic action +labels, adversarial training, or runtime filtering, none of them explicitly +focus on reducing the BC policy's safety violations during training time. We +propose SAFE-GIL, a design-time method to learn safety-aware behavior cloning +policies. SAFE-GIL deliberately injects adversarial disturbance in the system +during data collection to guide the expert towards safety-critical states. This +disturbance injection simulates potential policy errors that the system might +encounter during the test time. By ensuring that training more closely +replicates expert behavior in safety-critical states, our approach results in +safer policies despite policy errors during the test time. We further develop a +reachability-based method to compute this adversarial disturbance. We compare +SAFE-GIL with various behavior cloning techniques and online safety-filtering +methods in three domains: autonomous ground navigation, aircraft taxiing, and +aerial navigation on a quadrotor testbed. Our method demonstrates a significant +reduction in safety failures, particularly in low data regimes where the +likelihood of learning errors, and therefore safety violations, is higher. See +our website here: https://y-u-c.github.io/safegil/ + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ ACING: Actor-Critic for Instruction Learning in Black-Box Large Language + Models + + +
+ The effectiveness of Large Language Models (LLMs) in solving tasks vastly +depends on the quality of the instructions, which often require fine-tuning +through extensive human effort. This highlights the need for automated +instruction optimization; however, this optimization is particularly +challenging when dealing with black-box LLMs, where model parameters and +gradients remain inaccessible. We propose ACING, a task-specific prompt +optimization approach framed as a stateless continuous-action Reinforcement +Learning (RL) problem, known as the continuum bandit setting. ACING leverages +an actor-critic-based method to optimize prompts, learning from +non-differentiable reward signals. We validate ACING by optimizing prompts for +ChatGPT on 30 instruction-based tasks. ACING consistently outperforms baseline +methods, achieving a median score improvement of 10 percentage points. +Furthermore, ACING not only recovers but also surpasses human-crafted expert +instructions, achieving up to a 39 percentage point improvement against human +benchmarks. + +
+
+
+
+
+ + ☆ Benchmarking Positional Encodings for GNNs and Graph Transformers + + +
+ Recent advances in Graph Neural Networks (GNNs) and Graph Transformers (GTs) +have been driven by innovations in architectures and Positional Encodings +(PEs), which are critical for augmenting node features and capturing graph +topology. PEs are essential for GTs, where topological information would +otherwise be lost without message-passing. However, PEs are often tested +alongside novel architectures, making it difficult to isolate their effect on +established models. To address this, we present a comprehensive benchmark of +PEs in a unified framework that includes both message-passing GNNs and GTs. We +also establish theoretical connections between MPNNs and GTs and introduce a +sparsified GRIT attention mechanism to examine the influence of global +connectivity. Our findings demonstrate that previously untested combinations of +GNN architectures and PEs can outperform existing methods and offer a more +comprehensive picture of the state-of-the-art. To support future research and +experimentation in our framework, we make the code publicly available. + +
+
+
+
+
+ + ☆ Testing classical properties from quantum data + + +
+ Many properties of Boolean functions can be tested far more efficiently than +the function can be learned. However, this advantage often disappears when +testers are limited to random samples--a natural setting for data +science--rather than queries. In this work we investigate the quantum version +of this scenario: quantum algorithms that test properties of a function $f$ +solely from quantum data in the form of copies of the function state for $f$. + For three well-established properties, we show that the speedup lost when +restricting classical testers to samples can be recovered by testers that use +quantum data. For monotonicity testing, we give a quantum algorithm that uses +$\tilde{\mathcal{O}}(n^2)$ function state copies as compared to the +$2^{\Omega(\sqrt{n})}$ samples required classically. We also present +$\mathcal{O}(1)$-copy testers for symmetry and triangle-freeness, comparing +favorably to classical lower bounds of $\Omega(n^{1/4})$ and $\Omega(n)$ +samples respectively. These algorithms are time-efficient and necessarily +include techniques beyond the Fourier sampling approaches applied to earlier +testing problems. + These results make the case for a general study of the advantages afforded by +quantum data for testing. We contribute to this project by complementing our +upper bounds with a lower bound of $\Omega(1/\varepsilon)$ for monotonicity +testing from quantum data in the proximity regime +$\varepsilon\leq\mathcal{O}(n^{-3/2})$. This implies a strict separation +between testing monotonicity from quantum data and from quantum queries--where +$\tilde{\mathcal{O}}(n)$ queries suffice when $\varepsilon=\Theta(n^{-3/2})$. +We also exhibit a testing problem that can be solved from $\mathcal{O}(1)$ +classical queries but requires $\Omega(2^{n/2})$ function state copies, +complementing a separation of the same magnitude in the opposite direction +derived from the Forrelation problem. + +
+
+ comment: 38 + 14 pages, 2 tables, 2 figures +
+
+
+
+
+ + ☆ LazyDINO: Fast, scalable, and efficiently amortized Bayesian inversion + via structure-exploiting and surrogate-driven measure transport + + +
+ We present LazyDINO, a transport map variational inference method for fast, +scalable, and efficiently amortized solutions of high-dimensional nonlinear +Bayesian inverse problems with expensive parameter-to-observable (PtO) maps. +Our method consists of an offline phase in which we construct a +derivative-informed neural surrogate of the PtO map using joint samples of the +PtO map and its Jacobian. During the online phase, when given observational +data, we seek rapid posterior approximation using surrogate-driven training of +a lazy map [Brennan et al., NeurIPS, (2020)], i.e., a structure-exploiting +transport map with low-dimensional nonlinearity. The trained lazy map then +produces approximate posterior samples or density evaluations. Our surrogate +construction is optimized for amortized Bayesian inversion using lazy map +variational inference. We show that (i) the derivative-based reduced basis +architecture [O'Leary-Roseberry et al., Comput. Methods Appl. Mech. Eng., 388 +(2022)] minimizes the upper bound on the expected error in surrogate posterior +approximation, and (ii) the derivative-informed training formulation +[O'Leary-Roseberry et al., J. Comput. Phys., 496 (2024)] minimizes the expected +error due to surrogate-driven transport map optimization. Our numerical results +demonstrate that LazyDINO is highly efficient in cost amortization for Bayesian +inversion. We observe one to two orders of magnitude reduction of offline cost +for accurate posterior approximation, compared to simulation-based amortized +inference via conditional transport and conventional surrogate-driven +transport. In particular, LazyDINO outperforms Laplace approximation +consistently using fewer than 1000 offline samples, while other amortized +inference methods struggle and sometimes fail at 16,000 offline samples. + +
+
+
+
+
+ + ☆ Heuristic-Free Multi-Teacher Learning + + +
+ We introduce Teacher2Task, a novel framework for multi-teacher learning that +eliminates the need for manual aggregation heuristics. Existing multi-teacher +methods typically rely on such heuristics to combine predictions from multiple +teachers, often resulting in sub-optimal aggregated labels and the propagation +of aggregation errors. Teacher2Task addresses these limitations by introducing +teacher-specific input tokens and reformulating the training process. Instead +of relying on aggregated labels, the framework transforms the training data, +consisting of ground truth labels and annotations from N teachers, into N+1 +distinct tasks: N auxiliary tasks that predict the labeling styles of the N +individual teachers, and one primary task that focuses on the ground truth +labels. This approach, drawing upon principles from multiple learning +paradigms, demonstrates strong empirical results across a range of +architectures, modalities, and tasks. + +
+
+
+
+
+ + ☆ Rethinking MUSHRA: Addressing Modern Challenges in Text-to-Speech + Evaluation + + +
+ Despite rapid advancements in TTS models, a consistent and robust human +evaluation framework is still lacking. For example, MOS tests fail to +differentiate between similar models, and CMOS's pairwise comparisons are +time-intensive. The MUSHRA test is a promising alternative for evaluating +multiple TTS systems simultaneously, but in this work we show that its reliance +on matching human reference speech unduly penalises the scores of modern TTS +systems that can exceed human speech quality. More specifically, we conduct a +comprehensive assessment of the MUSHRA test, focusing on its sensitivity to +factors such as rater variability, listener fatigue, and reference bias. Based +on our extensive evaluation involving 471 human listeners across Hindi and +Tamil we identify two primary shortcomings: (i) reference-matching bias, where +raters are unduly influenced by the human reference, and (ii) judgement +ambiguity, arising from a lack of clear fine-grained guidelines. To address +these issues, we propose two refined variants of the MUSHRA test. The first +variant enables fairer ratings for synthesized samples that surpass human +reference quality. The second variant reduces ambiguity, as indicated by the +relatively lower variance across raters. By combining these approaches, we +achieve both more reliable and more fine-grained assessments. We also release +MANGO, a massive dataset of 47,100 human ratings, the first-of-its-kind +collection for Indian languages, aiding in analyzing human preferences and +developing automatic metrics for evaluating TTS systems. + +
+
+ comment: 19 pages, 12 Figures +
+
+
+
+
+ + ☆ Learning multivariate Gaussians with imperfect advice + + +
+ We revisit the problem of distribution learning within the framework of +learning-augmented algorithms. In this setting, we explore the scenario where a +probability distribution is provided as potentially inaccurate advice on the +true, unknown distribution. Our objective is to develop learning algorithms +whose sample complexity decreases as the quality of the advice improves, +thereby surpassing standard learning lower bounds when the advice is +sufficiently accurate. + Specifically, we demonstrate that this outcome is achievable for the problem +of learning a multivariate Gaussian distribution $N(\boldsymbol{\mu}, +\boldsymbol{\Sigma})$ in the PAC learning setting. Classically, in the +advice-free setting, $\tilde{\Theta}(d^2/\varepsilon^2)$ samples are sufficient +and, in the worst case, necessary to learn $d$-dimensional Gaussians up to TV distance +$\varepsilon$ with constant probability. When we are additionally given a +parameter $\tilde{\boldsymbol{\Sigma}}$ as advice, we show that +$\tilde{O}(d^{2-\beta}/\varepsilon^2)$ samples suffice whenever $\| +\tilde{\boldsymbol{\Sigma}}^{-1/2} \boldsymbol{\Sigma} +\tilde{\boldsymbol{\Sigma}}^{-1/2} - \boldsymbol{I_d} \|_1 \leq \varepsilon +d^{1-\beta}$ (where $\|\cdot\|_1$ denotes the entrywise $\ell_1$ norm) for any +$\beta > 0$, yielding a polynomial improvement over the advice-free setting. + +
+
+
+
+
+ + ☆ Attribute Inference Attacks for Federated Regression Tasks + + +
+ Federated Learning (FL) enables multiple clients, such as mobile phones and +IoT devices, to collaboratively train a global machine learning model while +keeping their data localized. However, recent studies have revealed that the +training phase of FL is vulnerable to reconstruction attacks, such as attribute +inference attacks (AIA), where adversaries exploit exchanged messages and +auxiliary public information to uncover sensitive attributes of targeted +clients. While these attacks have been extensively studied in the context of +classification tasks, their impact on regression tasks remains largely +unexplored. In this paper, we address this gap by proposing novel model-based +AIAs specifically designed for regression tasks in FL environments. Our +approach considers scenarios where adversaries can either eavesdrop on +exchanged messages or directly interfere with the training process. We +benchmark our proposed attacks against state-of-the-art methods using +real-world datasets. The results demonstrate a significant increase in +reconstruction accuracy, particularly in heterogeneous client datasets, a +common scenario in FL. The efficacy of our model-based AIAs makes them better +candidates for empirically quantifying privacy leakage for federated regression +tasks. + +
+
+
+
+
+ + ☆ IMUVIE: Pickup Timeline Action Localization via Motion Movies + + +
+ Falls among seniors due to difficulties with tasks such as picking up objects +pose significant health and safety risks, impacting quality of life and +independence. Reliable, accessible assessment tools are critical for early +intervention but often require costly clinic-based equipment and trained +personnel, limiting their use in daily life. Existing wearable-based pickup +measurement solutions address some needs but face limitations in +generalizability. + We present IMUVIE, a wearable system that uses motion movies and a +machine-learning model to automatically detect and measure pickup events, +providing a practical solution for frequent monitoring. IMUVIE's design +principles -- data normalization, occlusion handling, and streamlined +visuals -- enhance model performance and are adaptable to tasks beyond pickup +classification. + In rigorous leave-one-subject-out cross-validation evaluations, IMUVIE +achieves exceptional window-level localization accuracy of 91-92% for pickup +action classification on 256,291 motion movie frame candidates while +maintaining an event-level recall of 97% when evaluated on 129 pickup events. +IMUVIE has strong generalization and performs well on unseen subjects. In an +interview survey, IMUVIE demonstrated strong user interest and trust, with ease +of use identified as the most critical factor for adoption. IMUVIE offers a +practical, at-home solution for fall risk assessment, facilitating early +detection of movement deterioration, and supporting safer, independent living +for seniors. + +
+
+ comment: This is a preprint version, 12 pages, 20 figures, 3 tables +
+
+
+
+
+ + ☆ IoT-Based 3D Pose Estimation and Motion Optimization for Athletes: + Application of C3D and OpenPose + + +
+ This study proposes the IoT-Enhanced Pose Optimization Network (IE-PONet) for +high-precision 3D pose estimation and motion optimization of track and field +athletes. IE-PONet integrates C3D for spatiotemporal feature extraction, +OpenPose for real-time keypoint detection, and Bayesian optimization for +hyperparameter tuning. Experimental results on NTURGB+D and FineGYM datasets +demonstrate superior performance, with AP\(^p50\) scores of 90.5 and 91.0, and +mAP scores of 74.3 and 74.0, respectively. Ablation studies confirm the +essential roles of each module in enhancing model accuracy. IE-PONet provides a +robust tool for athletic performance analysis and optimization, offering +precise technical insights for training and injury prevention. Future work will +focus on further model optimization, multimodal data integration, and +developing real-time feedback mechanisms to enhance practical applications. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Machine Learning Approaches on Crop Pattern Recognition a Comparative + Analysis + + +
+ Monitoring agricultural activities is important to ensure food security. +Remote sensing plays a significant role for large-scale continuous monitoring +of cultivation activities. Time series remote sensing data were used for the +generation of the cropping pattern. Classification algorithms are used to +classify crop patterns and map agricultural land use. Some conventional +classification methods including support vector machine (SVM) and decision +trees were applied for crop pattern recognition. However, in this paper, we are +proposing Deep Neural Network (DNN) based classification to improve the +performance of crop pattern recognition and make a comparative analysis with +two (2) other machine learning approaches including Naive Bayes and Random +Forest. + +
+
+ comment: Published in ICNTET2018: International Conference on New Trends in + Engineering & Technology Tirupathi Highway, Tiruvallur Dist Chennai, India, + September 7-8, 2018 +
+
+
+
+
+ + ☆ Auto-Evaluation with Few Labels through Post-hoc Regression + + +
+ Continually evaluating large generative models provides a unique challenge. +Often, human annotations are necessary to evaluate high-level properties of +these models (e.g. in text or images). However, collecting human annotations of +samples can be resource intensive, and using other machine learning systems to +provide the annotations, or automatic evaluation, can introduce systematic +errors into the evaluation. The Prediction Powered Inference (PPI) framework +provides a way of leveraging both the statistical power of automatic evaluation +and a small pool of labelled data to produce a low-variance, unbiased estimate +of the quantity being evaluated for. However, most work on PPI considers a +relatively sizable set of labelled samples, which is not always practical to +obtain. To this end, we present two new PPI-based techniques that leverage +robust regressors to produce even lower variance estimators in the few-label +regime. + +
+
+
+
+
+ + ☆ PoM: Efficient Image and Video Generation with the Polynomial Mixer + + +
+ Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous +to generate high quality images and videos. However, encoding an image or a +video as a sequence of patches results in costly attention patterns, as the +requirements both in terms of memory and compute grow quadratically. To +alleviate this problem, we propose a drop-in replacement for MHA called the +Polynomial Mixer (PoM) that has the benefit of encoding the entire sequence +into an explicit state. PoM has a linear complexity with respect to the number +of tokens. This explicit state also allows us to generate frames in a +sequential fashion, minimizing memory and compute requirement, while still +being able to train in parallel. We show the Polynomial Mixer is a universal +sequence-to-sequence approximator, just like regular MHA. We adapt several +Diffusion Transformers (DiT) for generating images and videos with PoM +replacing MHA, and we obtain high quality samples while using less +computational resources. The code is available at +https://github.com/davidpicard/HoMM. + +
+
+
+
+
+ + ☆ DLBacktrace: A Model Agnostic Explainability for any Deep Learning + Models + + +
+ The rapid advancement of artificial intelligence has led to increasingly +sophisticated deep learning models, which frequently operate as opaque 'black +boxes' with limited transparency in their decision-making processes. This lack +of interpretability presents considerable challenges, especially in high-stakes +applications where understanding the rationale behind a model's outputs is as +essential as the outputs themselves. This study addresses the pressing need for +interpretability in AI systems, emphasizing its role in fostering trust, +ensuring accountability, and promoting responsible deployment in +mission-critical fields. To address the interpretability challenge in deep +learning, we introduce DLBacktrace, an innovative technique developed by the +AryaXAI team to illuminate model decisions across a wide array of domains, +including simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks +(CNNs), Large Language Models (LLMs), Computer Vision Models, and more. + We provide a comprehensive overview of the DLBacktrace algorithm and present +benchmarking results, comparing its performance against established +interpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients, +SmoothGrad, and Attention Rollout, using diverse task-based metrics. The +proposed DLBacktrace technique is compatible with various model architectures +built in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP +architectures such as BERT and LSTMs, computer vision models like ResNet and +U-Net, as well as custom deep neural network (DNN) models for tabular data. +This flexibility underscores DLBacktrace's adaptability and effectiveness in +enhancing model transparency across a broad spectrum of applications. The +library is open-sourced and available at https://github.com/AryaXAI/DLBacktrace . + +
+
+
+
+
+ + ☆ Leadsee-Precip: A Deep Learning Diagnostic Model for Precipitation + + +
+ Recently, deep-learning weather forecasting models have surpassed traditional +numerical models in terms of the accuracy of meteorological variables. However, +there is considerable potential for improvements in precipitation forecasts, +especially for heavy precipitation events. To address this deficiency, we +propose Leadsee-Precip, a global deep learning model to generate precipitation +from meteorological circulation fields. The model utilizes an information +balance scheme to tackle the challenges of predicting heavy precipitation +caused by the long-tail distribution of precipitation data. Additionally, more +accurate satellite and radar-based precipitation retrievals are used as +training targets. Compared to artificial intelligence global weather models, +the heavy precipitation from Leadsee-Precip is more consistent with +observations and shows competitive performance against global numerical weather +prediction models. Leadsee-Precip can be integrated with any global circulation +model to generate precipitation forecasts. But the deviations between the +predicted and the ground-truth circulation fields may lead to a weakened +precipitation forecast, which could potentially be mitigated by further +fine-tuning based on the predicted circulation fields. + +
+
+
+
+
+ + ☆ PyAWD: A Library for Generating Large Synthetic Datasets of Acoustic + Wave Propagation with Devito + + +
+ Seismic data is often sparse and unevenly distributed due to the high costs +and logistical challenges associated with deploying physical seismometers, +limiting the application of Machine Learning (ML) in earthquake analysis. To +address this gap, we introduce PyAWD, a Python library designed to generate +high-resolution synthetic datasets simulating spatio-temporal acoustic wave +propagation in both two-dimensional and three-dimensional heterogeneous media. +By allowing fine control over parameters such as wave speed, external forces, +spatial and temporal discretization, and media composition, PyAWD enables the +creation of ML-scale datasets that capture the complexity of seismic wave +behavior. We illustrate the library's potential with an epicenter retrieval +task, showcasing its suitability for designing complex, accurate seismic +problems that support advanced ML approaches in the absence or lack of dense +real-world data. + +
+
+
+
+
+ + ☆ Instant Policy: In-Context Imitation Learning via Graph Diffusion + + +
+ Following the impressive capabilities of in-context learning with large +transformers, In-Context Imitation Learning (ICIL) is a promising opportunity +for robotics. We introduce Instant Policy, which learns new tasks instantly +(without further training) from just one or two demonstrations, achieving ICIL +through two key components. First, we introduce inductive biases through a +graph representation and model ICIL as a graph generation problem with a +learned diffusion process, enabling structured reasoning over demonstrations, +observations, and actions. Second, we show that such a model can be trained +using pseudo-demonstrations - arbitrary trajectories generated in simulation - +as a virtually infinite pool of training data. Simulated and real experiments +show that Instant Policy enables rapid learning of various everyday robot +tasks. We also show how it can serve as a foundation for cross-embodiment and +zero-shot transfer to language-defined tasks. Code and videos are available at +https://www.robot-learning.uk/instant-policy. + +
+
+ comment: Code and videos are available on our project webpage at + https://www.robot-learning.uk/instant-policy +
+
+
+
+
+ + ☆ Exploring the Manifold of Neural Networks Using Diffusion Geometry + + +
+ Drawing motivation from the manifold hypothesis, which posits that most +high-dimensional data lies on or near low-dimensional manifolds, we apply +manifold learning to the space of neural networks. We learn manifolds where +datapoints are neural networks by introducing a distance between the hidden +layer representations of the neural networks. These distances are then fed to +the non-linear dimensionality reduction algorithm PHATE to create a manifold of +neural networks. We characterize this manifold using features of the +representation, including class separation, hierarchical cluster structure, +spectral entropy, and topological structure. Our analysis reveals that +high-performing networks cluster together in the manifold, displaying +consistent embedding patterns across all these features. Finally, we +demonstrate the utility of this approach for guiding hyperparameter +optimization and neural architecture search by sampling from the manifold. + +
+
+
+
+
+ + ☆ A Multimodal Approach Combining Structural and Cross-domain Textual + Guidance for Weakly Supervised OCT Segmentation + + +
+ Accurate segmentation of Optical Coherence Tomography (OCT) images is crucial +for diagnosing and monitoring retinal diseases. However, the labor-intensive +nature of pixel-level annotation limits the scalability of supervised learning +with large datasets. Weakly Supervised Semantic Segmentation (WSSS) provides a +promising alternative by leveraging image-level labels. In this study, we +propose a novel WSSS approach that integrates structural guidance with +text-driven strategies to generate high-quality pseudo labels, significantly +improving segmentation performance. In terms of visual information, our method +employs two processing modules that exchange raw image features and structural +features from OCT images, guiding the model to identify where lesions are +likely to occur. In terms of textual information, we utilize large-scale +pretrained models from cross-domain sources to implement label-informed textual +guidance and synthetic descriptive integration with two textual processing +modules that combine local semantic features with consistent synthetic +descriptions. By fusing these visual and textual components within a multimodal +framework, our approach enhances lesion localization accuracy. Experimental +results on three OCT datasets demonstrate that our method achieves +state-of-the-art performance, highlighting its potential to improve diagnostic +accuracy and efficiency in medical imaging. + +
+
+ comment: 21 pages, 9 figures, 8 tables +
+
+
+
+
+ + ☆ Reward driven workflows for unsupervised explainable analysis of phases + and ferroic variants from atomically resolved imaging data + + +
+ Rapid progress in aberration corrected electron microscopy necessitates +development of robust methods for the identification of phases, ferroic +variants, and other pertinent aspects of materials structure from imaging data. +While unsupervised methods for clustering and classification are widely used +for these tasks, their performance can be sensitive to hyperparameter selection +in the analysis workflow. In this study, we explore the effects of descriptors +and hyperparameters on the capability of unsupervised ML methods to distill +local structural information, exemplified by discovery of polarization and +lattice distortion in Sm doped BiFeO3 (BFO) thin films. We demonstrate that a +reward-driven approach can be used to optimize these key hyperparameters across +the full workflow, where rewards were designed to reflect domain wall +continuity and straightness, ensuring that the analysis aligns with the +material's physical behavior. This approach allows us to discover local +descriptors that are best aligned with the specific physical behavior, +providing insight into the fundamental physics of materials. We further extend +the reward driven workflows to disentangle structural factors of variation via +optimized variational autoencoder (VAE). Finally, the importance of +well-defined rewards was explored as a quantifiable measure of success of the +workflow. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ STREAM: A Universal State-Space Model for Sparse Geometric Data + + +
+ Handling sparse and unstructured geometric data, such as point clouds or +event-based vision, is a pressing challenge in the field of machine vision. +Recently, sequence models such as Transformers and state-space models entered +the domain of geometric data. These methods require specialized preprocessing +to create a sequential view of a set of points. Furthermore, prior works +involving sequence models iterate geometric data with either uniform or learned +step sizes, implicitly relying on the model to infer the underlying geometric +structure. In this work, we propose to encode geometric structure explicitly +into the parameterization of a state-space model. State-space models are based +on linear dynamics governed by a one-dimensional variable such as time or a +spatial coordinate. We exploit this dynamic variable to inject relative +differences of coordinates into the step size of the state-space model. The +resulting geometric operation computes interactions between all pairs of N +points in O(N) steps. Our model deploys the Mamba selective state-space model +with a modified CUDA kernel to efficiently map sparse geometric data to modern +hardware. The resulting sequence model, which we call STREAM, achieves +competitive results on a range of benchmarks from point-cloud classification to +event-based vision and audio classification. STREAM demonstrates a powerful +inductive bias for sparse geometric data by improving the PointMamba baseline +when trained from scratch on the ModelNet40 and ScanObjectNN point cloud +analysis datasets. It further achieves, for the first time, 100% test accuracy +on all 11 classes of the DVS128 Gestures dataset. + +
+
+
+
+
+ + ☆ Hypergraph $p$-Laplacian equations for data interpolation and + semi-supervised learning + + +
+ Hypergraph learning with $p$-Laplacian regularization has attracted a lot of +attention due to its flexibility in modeling higher-order relationships in +data. This paper focuses on its fast numerical implementation, which is +challenging due to the non-differentiability of the objective function and the +non-uniqueness of the minimizer. We derive a hypergraph $p$-Laplacian equation +from the subdifferential of the $p$-Laplacian regularization. A simplified +equation that is mathematically well-posed and computationally efficient is +proposed as an alternative. Numerical experiments verify that the simplified +$p$-Laplacian equation suppresses spiky solutions in data interpolation and +improves classification accuracy in semi-supervised learning. The remarkably +low computational cost enables further applications. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Provable unlearning in topic modeling and downstream tasks + + +
+ Machine unlearning algorithms are increasingly important as legal concerns +arise around the provenance of training data, but verifying the success of +unlearning is often difficult. Provable guarantees for unlearning are often +limited to supervised learning settings. In this paper, we provide the first +theoretical guarantees for unlearning in the pre-training and fine-tuning +paradigm by studying topic models, simple bag-of-words language models that can +be adapted to solve downstream tasks like retrieval and classification. First, +we design a provably effective unlearning algorithm for topic models that +incurs a computational overhead independent of the size of the original +dataset. Our analysis additionally quantifies the deletion capacity of the +model -- i.e., the number of examples that can be unlearned without incurring a +significant cost in model performance. Finally, we formally extend our analyses +to account for adaptation to a given downstream task. In particular, we design +an efficient algorithm to perform unlearning after fine-tuning the topic model +via a linear head. Notably, we show that it is easier to unlearn pre-training +data from models that have been fine-tuned to a particular task, and one can +unlearn this data without modifying the base model. + +
+
+
+
+
+ + ☆ GNNAS-Dock: Budget Aware Algorithm Selection with Graph Neural Networks + for Molecular Docking + + +
+ Molecular docking is a major element in drug discovery and design. It enables +the prediction of ligand-protein interactions by simulating the binding of +small molecules to proteins. Despite the availability of numerous docking +algorithms, no single algorithm consistently outperforms the others +across a diverse set of docking scenarios. This paper introduces GNNAS-Dock, a +novel Graph Neural Network (GNN)-based automated algorithm selection system for +molecular docking in blind docking situations. GNNs are accommodated to process +the complex structural data of both ligands and proteins. They benefit from the +inherent graph-like properties to predict the performance of various docking +algorithms under different conditions. The present study pursues two main +objectives: 1) predict the performance of each candidate docking algorithm, in +terms of Root Mean Square Deviation (RMSD), thereby identifying the most +accurate method for specific scenarios; and 2) choose the best computationally +efficient docking algorithm for each docking case, aiming to reduce the time +required for docking while maintaining high accuracy. We validate our approach +on the PDBBind 2020 refined set, which contains about 5,300 pairs of protein-ligand +complexes. + +
+
+
+
+
+ + ☆ Procedural Knowledge in Pretraining Drives Reasoning in Large Language + Models + + +
+ The capabilities and limitations of Large Language Models have been sketched +out in great detail in recent years, providing an intriguing yet conflicting +picture. On the one hand, LLMs demonstrate a general ability to solve problems. +On the other hand, they show surprising reasoning gaps when compared to humans, +casting doubt on the robustness of their generalisation strategies. The sheer +volume of data used in the design of LLMs has precluded us from applying the +method traditionally used to measure generalisation: train-test set separation. +To overcome this, we study what kind of generalisation strategies LLMs employ +when performing reasoning tasks by investigating the pretraining data they rely +on. For two models of different sizes (7B and 35B) and 2.5B of their +pretraining tokens, we identify what documents influence the model outputs for +three simple mathematical reasoning tasks and contrast this to the data that +are influential for answering factual questions. We find that, while the models +rely on mostly distinct sets of data for each factual question, a document +often has a similar influence across different reasoning questions within the +same task, indicating the presence of procedural knowledge. We further find +that the answers to factual questions often show up in the most influential +data. However, for reasoning questions the answers usually do not show up as +highly influential, nor do the answers to the intermediate reasoning steps. +When we characterise the top ranked documents for the reasoning questions +qualitatively, we confirm that the influential documents often contain +procedural knowledge, like demonstrating how to obtain a solution using +formulae or code. Our findings indicate that the approach to reasoning the +models use is unlike retrieval, and more like a generalisable strategy that +synthesises procedural knowledge from documents doing a similar form of +reasoning. + +
+
+
+
+
+ + ☆ A data driven approach to classify descriptors based on their efficiency + in translating noisy trajectories into physically-relevant information + + +
+ Reconstructing the physical complexity of many-body dynamical systems can be +challenging. Starting from the trajectories of their constitutive units (raw +data), typical approaches require selecting appropriate descriptors to convert +them into time-series, which are then analyzed to extract interpretable +information. However, identifying the most effective descriptor is often +non-trivial. Here, we report a data-driven approach to compare the efficiency +of various descriptors in extracting information from noisy trajectories and +translating it into physically relevant insights. As a prototypical system with +non-trivial internal complexity, we analyze molecular dynamics trajectories of +an atomistic system where ice and water coexist in equilibrium near the +solid/liquid transition temperature. We compare general and specific +descriptors often used in aqueous systems: number of neighbors, molecular +velocities, Smooth Overlap of Atomic Positions (SOAP), Local Environments and +Neighbors Shuffling (LENS), Orientational Tetrahedral Order, and distance from +the fifth neighbor ($d_5$). Using Onion Clustering -- an efficient unsupervised +method for single-point time-series analysis -- we assess the maximum +extractable information for each descriptor and rank them via a +high-dimensional metric. Our results show that advanced descriptors like SOAP +and LENS outperform classical ones due to higher signal-to-noise ratios. +Nonetheless, even simple descriptors can rival or exceed advanced ones after +local signal denoising. For example, $d_5$, initially among the weakest, +becomes the most effective at resolving the system's non-local dynamical +complexity after denoising. This work highlights the critical role of noise in +information extraction from molecular trajectories and offers a data-driven +approach to identify optimal descriptors for systems with characteristic +internal complexity. + +
+
+ comment: 19 pages, 5 figures + 3 in supporting information (at the bottom of + the manuscript) +
+
+
+
+
+ + ☆ Stream-Based Active Learning for Process Monitoring + + +
+ Statistical process monitoring (SPM) methods are essential tools in quality +management to check the stability of industrial processes, i.e., to dynamically +classify the process state as in control (IC), under normal operating +conditions, or out of control (OC), otherwise. Traditional SPM methods are +based on unsupervised approaches, which are popular because in most industrial +applications the true OC states of the process are not explicitly known. This +hampered the development of supervised methods that could instead take +advantage of process data containing labels on the true process state, although +they still need improvement in dealing with class imbalance, as OC states are +rare in high-quality processes, and the dynamic recognition of unseen classes, +e.g., the number of possible OC states. This article presents a novel +stream-based active learning strategy for SPM that enhances partially hidden +Markov models to deal with data streams. The ultimate goal is to optimize +labeling resources constrained by a limited budget and dynamically update the +possible OC states. The proposed method's performance in classifying the true +state of the process is assessed through a simulation and a case study on the +SPM of a resistance spot welding process in the automotive industry, which +motivated this research. + +
+
+
+
+
+ + ☆ UMGAD: Unsupervised Multiplex Graph Anomaly Detection + + +
+ Graph anomaly detection (GAD) is a critical task in graph machine learning, +with the primary objective of identifying anomalous nodes that deviate +significantly from the majority. This task is widely applied in various +real-world scenarios, including fraud detection and social network analysis. +However, existing GAD methods still face two major challenges: (1) They are +often limited to detecting anomalies in single-type interaction graphs and +struggle with multiple interaction types in multiplex heterogeneous graphs; (2) +In unsupervised scenarios, selecting appropriate anomaly score thresholds +remains a significant challenge for accurate anomaly detection. To address the +above challenges, we propose a novel Unsupervised Multiplex Graph Anomaly +Detection method, named UMGAD. We first learn multi-relational correlations +among nodes in multiplex heterogeneous graphs and capture anomaly information +during node attribute and structure reconstruction through graph-masked +autoencoder (GMAE). Then, to further weaken the influence of noise and +redundant information on abnormal information extraction, we generate +attribute-level and subgraph-level augmented-view graphs respectively, and +perform attribute and structure reconstruction through GMAE. Finally, we learn +to optimize node attributes and structural features through contrastive +learning between original-view and augmented-view graphs to improve the model's +ability to capture anomalies. Meanwhile, we also propose a new anomaly score +threshold selection strategy, which allows the model to be independent of the +ground truth in real unsupervised scenarios. Extensive experiments on four +datasets show that UMGAD significantly outperforms state-of-the-art +methods, achieving average improvements of 13.48% in AUC and 11.68% in Macro-F1 +across all datasets. + +
+
+
+
+
+ + ☆ S3TU-Net: Structured Convolution and Superpixel Transformer for Lung + Nodule Segmentation + + +
+ The irregular and challenging characteristics of lung adenocarcinoma nodules +in computed tomography (CT) images complicate staging diagnosis, making +accurate segmentation critical for clinicians to extract detailed lesion +information. In this study, we propose a segmentation model, S3TU-Net, which +integrates multi-dimensional spatial connectors and a superpixel-based visual +transformer. S3TU-Net is built on a multi-view CNN-Transformer hybrid +architecture, incorporating superpixel algorithms, structured weighting, and +spatial shifting techniques to achieve superior segmentation performance. The +model leverages structured convolution blocks (DWF-Conv/D2BR-Conv) to extract +multi-scale local features while mitigating overfitting. To enhance multi-scale +feature fusion, we introduce the S2-MLP Link, integrating spatial shifting and +attention mechanisms at the skip connections. Additionally, the residual-based +superpixel visual transformer (RM-SViT) effectively merges global and local +features by employing sparse correlation learning and multi-branch attention to +capture long-range dependencies, with residual connections enhancing stability +and computational efficiency. Experimental results on the LIDC-IDRI dataset +demonstrate that S3TU-Net achieves a DSC, precision, and IoU of 89.04%, 90.73%, +and 90.70%, respectively. Compared to recent methods, S3TU-Net improves DSC by +4.52% and sensitivity by 3.16%, with other metrics showing an approximate 2% +increase. In addition to comparison and ablation studies, we validated the +generalization ability of our model on the EPDB private dataset, achieving a +DSC of 86.40%. + +
+
+
+
+
+ + ☆ Predicting Customer Satisfaction by Replicating the Survey Response + Distribution + + +
+ For many call centers, customer satisfaction (CSAT) is a key performance +indicator (KPI). However, only a fraction of customers take the CSAT survey +after the call, leading to a biased and inaccurate average CSAT value, and +missed opportunities for coaching, follow-up, and rectification. Therefore, +call centers can benefit from a model predicting customer satisfaction on calls +where the customer did not complete the survey. Given that CSAT is a closely +monitored KPI, it is critical to minimize any bias in the average predicted +CSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT +(pCSAT) scores accurately replicate the distribution of survey CSAT responses +for every call center with sufficient data in a live production environment. +The method can be applied to many multiclass classification problems to improve +the class balance and minimize its changes upon model updates. + +
+
+
+
+
+ + ☆ Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues + + +
+ Linear Recurrent Neural Networks (LRNNs) such as Mamba, RWKV, GLA, mLSTM, and +DeltaNet have emerged as efficient alternatives to Transformers in large +language modeling, offering linear scaling with sequence length and improved +training efficiency. However, LRNNs struggle to perform state-tracking which +may impair performance in tasks such as code evaluation or tracking a chess +game. Even parity, the simplest state-tracking task, which non-linear RNNs like +LSTM handle effectively, cannot be solved by current LRNNs. Recently, Sarrof et +al. (2024) demonstrated that the failure of LRNNs like Mamba to solve parity +stems from restricting the value range of their diagonal state-transition +matrices to $[0, 1]$ and that incorporating negative values can resolve this +issue. We extend this result to non-diagonal LRNNs, which have recently shown +promise in models such as DeltaNet. We prove that finite precision LRNNs with +state-transition matrices having only positive eigenvalues cannot solve parity, +while complex eigenvalues are needed to count modulo $3$. Notably, we also +prove that LRNNs can learn any regular language when their state-transition +matrices are products of identity minus vector outer product matrices, each +with eigenvalues in the range $[-1, 1]$. Our empirical results confirm that +extending the eigenvalue range of models like Mamba and DeltaNet to include +negative values not only enables them to solve parity but consistently improves +their performance on state-tracking tasks. Furthermore, pre-training LRNNs with +an extended eigenvalue range for language modeling achieves comparable +performance and stability while showing promise on code and math data. Our work +enhances the expressivity of modern LRNNs, broadening their applicability +without changing the cost of training or inference. + +
+
+
+
+
+ + ☆ Data Pruning in Generative Diffusion Models + + +
+ Data pruning is the problem of identifying a core subset that is most +beneficial to training and discarding the remainder. While pruning strategies +are well studied for discriminative models like those used in classification, +little research has gone into their application to generative models. +Generative models aim to estimate the underlying distribution of the data, so +presumably they should benefit from larger datasets. In this work we aim to +shed light on the accuracy of this statement, specifically to answer the question +of whether data pruning for generative diffusion models could have a positive +impact. Contrary to intuition, we show that eliminating redundant or noisy data +in large datasets is beneficial particularly when done strategically. We +experiment with several pruning methods including recent state-of-the-art methods, +and evaluate over CelebA-HQ and ImageNet datasets. We demonstrate that a simple +clustering method outperforms other sophisticated and computationally demanding +methods. We further exhibit how we can leverage clustering to balance skewed +datasets in an unsupervised manner to allow fair sampling for underrepresented +populations in the data distribution, which is a crucial problem in generative +models. + +
+
+
+
+
+ + ☆ MAViS: Modular Autonomous Virtualization System for Two-Dimensional + Semiconductor Quantum Dot Arrays + + +
+ Arrays of gate-defined semiconductor quantum dots are among the leading +candidates for building scalable quantum processors. High-fidelity +initialization, control, and readout of spin qubit registers require exquisite +and targeted control over key Hamiltonian parameters that define the +electrostatic environment. However, due to the tight gate pitch, capacitive +crosstalk between gates hinders independent tuning of chemical potentials and +interdot couplings. While virtual gates offer a practical solution, determining +all the required cross-capacitance matrices accurately and efficiently in large +quantum dot registers is an open challenge. Here, we establish a Modular +Automated Virtualization System (MAViS) -- a general and modular framework for +autonomously constructing a complete stack of multi-layer virtual gates in real +time. Our method employs machine learning techniques to rapidly extract +features from two-dimensional charge stability diagrams. We then utilize +computer vision and regression models to self-consistently determine all +relative capacitive couplings necessary for virtualizing plunger and barrier +gates in both low- and high-tunnel-coupling regimes. Using MAViS, we +successfully demonstrate accurate virtualization of a dense two-dimensional +array comprising ten quantum dots defined in a high-quality Ge/SiGe +heterostructure. Our work offers an elegant and practical solution for the +efficient control of large-scale semiconductor quantum dot systems. + +
+
+ comment: 14 pages, 5 figures, 8 pages of supplemental material +
+
+
+
+
+ + ☆ Transformer Neural Processes -- Kernel Regression + + +
+ Stochastic processes model various natural phenomena from disease +transmission to stock prices, but simulating and quantifying their uncertainty +can be computationally challenging. For example, modeling a Gaussian Process +with standard statistical methods incurs an $\mathcal{O}(n^3)$ penalty, and +even using state-of-the-art Neural Processes (NPs) incurs an $\mathcal{O}(n^2)$ +penalty due to the attention mechanism. We introduce the Transformer Neural +Process - Kernel Regression (TNP-KR), a new architecture that incorporates a +novel transformer block we call a Kernel Regression Block (KRBlock), which +reduces the computational complexity of attention in transformer-based Neural +Processes (TNPs) from $\mathcal{O}((n_C+n_T)^2)$ to $\mathcal{O}(n_C^2+n_Cn_T)$ by +eliminating masked computations, where $n_C$ and $n_T$ are the numbers of context +and test points, respectively, and a fast attention variant +that further reduces all attention calculations to $\mathcal{O}(n_C)$ in space +and time complexity. In benchmarks spanning such tasks as meta-regression, +Bayesian optimization, and image completion, we demonstrate that the full +variant matches the performance of state-of-the-art methods while training +faster and scaling two orders of magnitude higher in number of test points, and +the fast variant nearly matches that performance while scaling to millions of +both test and context points on consumer hardware. + +
+
+
+
+
+ + ☆ Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic + Corpus NeurIPS 2024 + + +
+ Large language models (LLMs) are capable of solving a wide range of tasks, +yet they have struggled with reasoning. To address this, we propose +$\textbf{Additional Logic Training (ALT)}$, which aims to enhance LLMs' +reasoning capabilities by program-generated logical reasoning samples. We first +establish principles for designing high-quality samples by integrating symbolic +logic theory and previous empirical insights. Then, based on these principles, +we construct a synthetic corpus named $\textbf{Formal Logic Deduction Diverse}$ +($\textbf{FLD}$$^{\times 2}$), comprising numerous samples of multi-step +deduction with unknown facts, diverse reasoning rules, diverse linguistic +expressions, and challenging distractors. Finally, we empirically show that ALT +on FLD$^{\times2}$ substantially enhances the reasoning capabilities of +state-of-the-art LLMs, including LLaMA-3.1-70B. Improvements include gains of +up to 30 points on logical reasoning benchmarks, up to 10 points on math and +coding benchmarks, and 5 points on the benchmark suite BBH. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Regular-pattern-sensitive CRFs for Distant Label Interactions + + +
+ Linear-chain conditional random fields (CRFs) are a common model component +for sequence labeling tasks when modeling the interactions between different +labels is important. However, the Markov assumption limits linear-chain CRFs to +only directly modeling interactions between adjacent labels. Weighted +finite-state transducers (FSTs) are a related approach which can be made to +model distant label-label interactions, but exact label inference is +intractable for these models in the general case, and the task of selecting an +appropriate automaton structure for the desired interaction types poses a +practical challenge. In this work, we present regular-pattern-sensitive CRFs +(RPCRFs), a method of enriching standard linear-chain CRFs with the ability to +learn long-distance label interactions which occur in user-specified patterns. +This approach allows users to write regular-expression label patterns concisely +specifying which types of interactions the model should take into account, +allowing the model to learn from data whether and in which contexts these +patterns occur. The result can be interpreted alternatively as a CRF augmented +with additional, non-local potentials, or as a finite-state transducer whose +structure is defined by a set of easily-interpretable patterns. Critically, +unlike the general case for FSTs (and for non-chain CRFs), exact training and +inference are tractable for many pattern sets. In this work, we detail how a +RPCRF can be automatically constructed from a set of user-specified patterns, +and demonstrate the model's effectiveness on synthetic data, showing how +different types of patterns can capture different nonlocal dependency +structures in label sequences. + +
+
+
+
+
+ + ☆ Comparing Prior and Learned Time Representations in Transformer Models + of Timeseries + + +
+ What sets timeseries analysis apart from other machine learning exercises is +that time representation becomes a primary aspect of the experiment setup, as +it must adequately represent the temporal relations that are relevant for the +application at hand. In the work described here we study two different +variations of the Transformer architecture: one where we use the fixed time +representation proposed in the literature and one where the time representation +is learned from the data. Our experiments use data from predicting the energy +output of solar panels, a task that exhibits known periodicities (daily and +seasonal) that is straight-forward to encode in the fixed time representation. +Our results indicate that even in an experiment where the phenomenon is +well-understood, it is difficult to encode prior knowledge due to side-effects +that are difficult to mitigate. We conclude that research work is needed to +work the human into the learning loop in ways that improve the robustness and +trust-worthiness of the network. + +
+
+ comment: Presented at the AI in Natural Sciences and Technology (AINST) track + of the 13th Conference on Artificial Intelligence (SETN 2024), 11-13 + September 2024, Piraeus, Greece +
+
+
+
+
+ + ☆ AI Flow at the Network Edge + + +
+ Recent advancements in large language models (LLMs) and their multimodal +variants have led to remarkable progress across various domains, demonstrating +impressive capabilities and unprecedented potential. In the era of ubiquitous +connectivity, leveraging communication networks to distribute intelligence is a +transformative concept, envisioning AI-powered services accessible at the +network edge. However, pushing large models from the cloud to +resource-constrained environments faces critical challenges. Model inference on +low-end devices leads to excessive latency and performance bottlenecks, while +raw data transmission over limited bandwidth networks causes high communication +overhead. This article presents AI Flow, a framework that streamlines the +inference process by jointly leveraging the heterogeneous resources available +across devices, edge nodes, and cloud servers, making intelligence flow across +networks. To facilitate cooperation among multiple computational nodes, the +proposed framework explores a paradigm shift in the design of communication +network systems from transmitting information flow to intelligence flow, where +the goal of communications is task-oriented and folded into the inference +process. Experimental results demonstrate the effectiveness of the proposed +framework through an image captioning use case, showcasing the ability to +reduce response latency while maintaining high-quality captions. This article +serves as a position paper for identifying the motivation, challenges, and +principles of AI Flow. + +
+
+
+
+
+ + ☆ Empirical Privacy Evaluations of Generative and Predictive Machine + Learning Models -- A review and challenges for practice + + +
+ Synthetic data generators, when trained using privacy-preserving techniques +like differential privacy, promise to produce synthetic data with formal +privacy guarantees, facilitating the sharing of sensitive data. However, it is +crucial to empirically assess the privacy risks associated with the generated +synthetic data before deploying generative technologies. This paper outlines +the key concepts and assumptions underlying empirical privacy evaluation in +machine learning-based generative and predictive models. Then, this paper +explores the practical challenges for privacy evaluations of generative models +for use cases with millions of training records, such as data from statistical +agencies and healthcare providers. Our findings indicate that methods designed +to verify the correct operation of the training algorithm are effective for +large datasets, but they often assume an adversary that is unrealistic in many +scenarios. Based on the findings, we highlight a crucial trade-off between the +computational feasibility of the evaluation and the level of realism of the +assumed threat model. Finally, we conclude with ideas and suggestions for +future research. + +
+
+
+
+
+ + ☆ Dimension Reduction via Sum-of-Squares and Improved Clustering + Algorithms for Non-Spherical Mixtures + + +
+ We develop a new approach for clustering non-spherical (i.e., arbitrary +component covariances) Gaussian mixture models via a subroutine, based on the +sum-of-squares method, that finds a low-dimensional separation-preserving +projection of the input data. Our method gives a non-spherical analog of the +classical dimension reduction, based on singular value decomposition, that +forms a key component of the celebrated spherical clustering algorithm of +Vempala and Wang [VW04] (in addition to several other applications). + As applications, we obtain an algorithm to (1) cluster an arbitrary +total-variation separated mixture of $k$ centered (i.e., zero-mean) Gaussians +with $n\geq \operatorname{poly}(d) f(w_{\min}^{-1})$ samples and +$\operatorname{poly}(n)$ time, and (2) cluster an arbitrary total-variation +separated mixture of $k$ Gaussians with identical but arbitrary unknown +covariance with $n \geq d^{O(\log w_{\min}^{-1})} f(w_{\min}^{-1})$ samples and +$n^{O(\log w_{\min}^{-1})}$ time. Here, $w_{\min}$ is the minimum mixing weight +of the input mixture, and $f$ does not depend on the dimension $d$. Our +algorithms naturally extend to tolerating a dimension-independent fraction of +arbitrary outliers. Before this work, the techniques in the state-of-the-art +non-spherical clustering algorithms needed $d^{O(k)} f(w_{\min}^{-1})$ time and +samples for clustering such mixtures. + Our results may come as a surprise in the context of the $d^{\Omega(k)}$ +statistical query lower bound [DKS17] for clustering non-spherical Gaussian +mixtures. While this result is usually thought to rule out $d^{o(k)}$ cost +algorithms for the problem, our results show that the lower bounds can in fact +be circumvented for a remarkably general class of Gaussian mixtures. + +
+
+ comment: 64 pages +
+
+
+
+
+ + ☆ STRisk: A Socio-Technical Approach to Assess Hacking Breaches Risk + + +
+ Data breaches have begun to take on new dimensions and their prediction is +becoming of great importance to organizations. Prior work has addressed this +issue mainly from a technical perspective and neglected other interfering +aspects such as the social media dimension. To fill this gap, we propose STRisk +which is a predictive system where we expand the scope of the prediction task +by bringing into play the social media dimension. We study over 3800 US +organizations including both victim and non-victim organizations. For each +organization, we design a profile composed of a variety of externally measured +technical indicators and social factors. In addition, to account for unreported +incidents, we consider the non-victim sample to be noisy and propose a noise +correction approach to correct mislabeled organizations. We then build several +machine learning models to predict whether an organization is exposed to +experience a hacking breach. By exploiting both technical and social features, +we achieve an Area Under Curve (AUC) score exceeding 98%, which is 12% higher +than the AUC achieved using only technical features. Furthermore, our feature +importance analysis reveals that open ports and expired certificates are the +best technical predictors, while spreadability and agreeability are the best +social predictors. + +
+
+
+
+
+ + ☆ Non-IID data in Federated Learning: A Systematic Review with Taxonomy, + Metrics, Methods, Frameworks and Future Directions + + +
+ Recent advances in machine learning have highlighted Federated Learning (FL) +as a promising approach that enables multiple distributed users (so-called +clients) to collectively train ML models without sharing their private data. +While this privacy-preserving method shows potential, it struggles when data +across clients is not independent and identically distributed (non-IID). +The latter remains an unsolved challenge that can result in poorer model +performance and slower training times. Despite the significance of non-IID data +in FL, there is a lack of consensus among researchers about its classification +and quantification. This systematic review aims to fill that gap by providing a +detailed taxonomy for non-IID data, partition protocols, and metrics to +quantify data heterogeneity. Additionally, we describe popular solutions to +address non-IID data and standardized frameworks employed in FL with +heterogeneous data. Based on our state-of-the-art review, we present key +lessons learned and suggest promising future research directions. + +
+
+
+
+
+ + ☆ RedPajama: an Open Dataset for Training Large Language Models NeurIPS + 2024 + + +
+ Large language models are increasingly becoming a cornerstone technology in +artificial intelligence, the sciences, and society as a whole, yet the optimal +strategies for dataset composition and filtering remain largely elusive. Many +of the top-performing models lack transparency in their dataset curation and +model development processes, posing an obstacle to the development of fully +open language models. In this paper, we identify three core data-related +challenges that must be addressed to advance open-source language models. These +include (1) transparency in model development, including the data curation +process, (2) access to large quantities of high-quality data, and (3) +availability of artifacts and metadata for dataset curation and analysis. To +address these challenges, we release RedPajama-V1, an open reproduction of the +LLaMA training dataset. In addition, we release RedPajama-V2, a massive +web-only dataset consisting of raw, unfiltered text data together with quality +signals and metadata. Together, the RedPajama datasets comprise over 100 +trillion tokens spanning multiple domains and with their quality signals +facilitate the filtering of data, aiming to inspire the development of numerous +new datasets. To date, these datasets have already been used in the training of +strong language models used in production, such as Snowflake Arctic, +Salesforce's XGen and AI2's OLMo. To provide insight into the quality of +RedPajama, we present a series of analyses and ablation studies with +decoder-only language models with up to 1.6B parameters. Our findings +demonstrate how quality signals for web data can be effectively leveraged to +curate high-quality subsets of the dataset, underscoring the potential of +RedPajama to advance the development of transparent and high-performing +language models at scale. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ☆ Ultra-Sparse Memory Network + + +
+ It is widely acknowledged that the performance of Transformer models is +exponentially related to their number of parameters and computational +complexity. While approaches like Mixture of Experts (MoE) decouple parameter +count from computational complexity, they still face challenges in inference +due to high memory access costs. This work introduces UltraMem, incorporating +a large-scale, ultra-sparse memory layer to address these limitations. Our +approach significantly reduces inference latency while maintaining model +performance. We also investigate the scaling laws of this new architecture, +demonstrating that it not only exhibits favorable scaling properties but +outperforms traditional models. In our experiments, we train networks with up +to 20 million memory slots. The results show that our method achieves +state-of-the-art inference speed and model performance within a given +computational budget. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Perfecting Imperfect Physical Neural Networks with Transferable + Robustness using Sharpness-Aware Training + + +
+ AI models are essential in science and engineering, but recent advances are +pushing the limits of traditional digital hardware. To address these +limitations, physical neural networks (PNNs), which use physical substrates for +computation, have gained increasing attention. However, developing effective +training methods for PNNs remains a significant challenge. Current approaches, +regardless of offline and online training, suffer from significant accuracy +loss. Offline training is hindered by imprecise modeling, while online training +yields device-specific models that can't be transferred to other devices due to +manufacturing variances. Both methods face challenges from perturbations after +deployment, such as thermal drift or alignment errors, which make trained +models invalid and require retraining. Here, we address the challenges with +both offline and online training through a novel technique called +Sharpness-Aware Training (SAT), where we innovatively leverage the geometry of +the loss landscape to tackle the problems in training physical systems. SAT +enables accurate training using efficient backpropagation algorithms, even with +imprecise models. PNNs trained by SAT offline even outperform those trained +online, despite modeling and fabrication errors. SAT also overcomes online +training limitations by enabling reliable transfer of models between devices. +Finally, SAT is highly resilient to perturbations after deployment, allowing +PNNs to continuously operate accurately under perturbations without retraining. +We demonstrate SAT across three types of PNNs, showing it is universally +applicable, regardless of whether the models are explicitly known. This work +offers a transformative, efficient approach to training PNNs, addressing +critical challenges in analog computing and enabling real-world deployment. + +
+
+ comment: 24 pages, 4 figures +
+
+
+
+
+ + ☆ Learning from Label Proportions and Covariate-shifted Instances + + +
+ In many applications, especially due to lack of supervision or privacy +concerns, the training data is grouped into bags of instances (feature-vectors) +and for each bag we have only an aggregate label derived from the +instance-labels in the bag. In learning from label proportions (LLP) the +aggregate label is the average of the instance-labels in a bag, and a +significant body of work has focused on training models in the LLP setting to +predict instance-labels. In practice however, the training data may have fully +supervised albeit covariate-shifted source data, along with the usual target +data with bag-labels, and we wish to train a good instance-level predictor on +the target domain. We call this the covariate-shifted hybrid LLP problem. Fully +supervised covariate shifted data often has useful training signals and the +goal is to leverage them for better predictive performance in the hybrid LLP +setting. To achieve this, we develop methods for hybrid LLP which naturally +incorporate the target bag-labels along with the source instance-labels, in the +domain adaptation framework. Apart from proving theoretical guarantees bounding +the target generalization error, we also conduct experiments on several +publicly available datasets showing that our methods outperform LLP and domain +adaptation baselines as well as techniques from previous related work. + +
+
+
+
+
+ + ☆ Graph as a feature: improving node classification with non-neural + graph-aware logistic regression + + +
+ Graph Neural Networks (GNNs) and their message passing framework that +leverages both structural and feature information, have become a standard +method for solving graph-based machine learning problems. However, these +approaches still struggle to generalise well beyond datasets that exhibit +strong homophily, where nodes of the same class tend to connect. This +limitation has led to the development of complex neural architectures that pose +challenges in terms of efficiency and scalability. In response to these +limitations, we focus on simpler and more scalable approaches and introduce +Graph-aware Logistic Regression (GLR), a non-neural model designed for node +classification tasks. Unlike traditional graph algorithms that use only a +fraction of the information accessible to GNNs, our proposed model +simultaneously leverages both node features and the relationships between +entities. However, instead of relying on message passing, our approach encodes +each node's relationships as an additional feature vector, which is then +combined with the node's self attributes. Extensive experimental results, +conducted within a rigorous evaluation framework, show that our proposed GLR +approach outperforms both foundational and sophisticated state-of-the-art GNN +models in node classification tasks. Going beyond the traditional limited +benchmarks, our experiments indicate that GLR increases generalisation ability +while reaching performance gains in computation time up to two orders of +magnitude compared to its best neural competitor. + +
+
+
+
+
+ + ☆ Attributed Graph Clustering in Collaborative Settings + + +
+ Graph clustering is an unsupervised machine learning method that partitions +the nodes in a graph into different groups. Despite achieving significant +progress in exploiting both attributed and structured data information, graph +clustering methods often face practical challenges related to data isolation. +Moreover, the absence of collaborative methods for graph clustering limits +their effectiveness. + In this paper, we propose a collaborative graph clustering framework for +attributed graphs, supporting attributed graph clustering over vertically +partitioned data with different participants holding distinct features of the +same data. Our method leverages a novel technique that reduces the sample +space, improving the efficiency of the attributed graph clustering method. +Furthermore, we compare our method to its centralized counterpart under a +proximity condition, demonstrating that the successful local results of each +participant contribute to the overall success of the collaboration. + We fully implement our approach and evaluate its utility and efficiency by +conducting experiments on four public datasets. The results demonstrate that +our method achieves comparable accuracy levels to centralized attributed graph +clustering methods. Our collaborative graph clustering framework provides an +efficient and effective solution for graph clustering challenges related to +data isolation. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ C$^{2}$INet: Realizing Incremental Trajectory Prediction with + Prior-Aware Continual Causal Intervention + + +
+ Trajectory prediction for multi-agents in complex scenarios is crucial for +applications like autonomous driving. However, existing methods often overlook +environmental biases, which leads to poor generalization. Additionally, +hardware constraints limit the use of large-scale data across environments, and +continual learning settings exacerbate the challenge of catastrophic +forgetting. To address these issues, we propose the Continual Causal +Intervention (C$^{2}$INet) method for generalizable multi-agent trajectory +prediction within a continual learning framework. Using variational inference, +we align environment-related prior with posterior estimator of confounding +factors in the latent space, thereby intervening in causal correlations that +affect trajectory representation. Furthermore, we store optimal variational +priors across various scenarios using a memory queue, ensuring continuous +debiasing during incremental task training. The proposed C$^{2}$INet enhances +adaptability to diverse tasks while preserving previous task information to +prevent catastrophic forgetting. It also incorporates pruning strategies to +mitigate overfitting. Comparative evaluations on three real and synthetic +complex datasets against state-of-the-art methods demonstrate that our proposed +method consistently achieves reliable prediction performance, effectively +mitigating confounding factors unique to different scenarios. This highlights +the practical value of our method for real-world applications. + +
+
+
+
+
+ + ☆ SNN-Based Online Learning of Concepts and Action Laws in an Open World + + +
+ We present the architecture of a fully autonomous, bio-inspired cognitive +agent built around a spiking neural network (SNN) implementing the agent's +semantic memory. The agent explores its universe and learns concepts of +objects/situations and of its own actions in a one-shot manner. While +object/situation concepts are unary, action concepts are triples made up of an +initial situation, a motor activity, and an outcome. They embody the agent's +knowledge of its universe's action laws. Both kinds of concepts have different +degrees of generality. To make decisions the agent queries its semantic memory +for the expected outcomes of envisaged actions and chooses the action to take +on the basis of these predictions. Our experiments show that the agent handles +new situations by appealing to previously learned general concepts and rapidly +modifies its concepts to adapt to environment changes. + +
+
+
+
+
+ + ☆ Emergence of Implicit World Models from Mortal Agents NeurIPS 2024 + + +
+ We discuss the possibility of world models and active exploration as emergent +properties of open-ended behavior optimization in autonomous agents. In +discussing the source of the open-endedness of living things, we start from the +perspective of biological systems as understood by the mechanistic approach of +theoretical biology and artificial life. From this perspective, we discuss the +potential of homeostasis in particular as an open-ended objective for +autonomous agents and as a general, integrative extrinsic motivation. We then +discuss the possibility of implicitly acquiring a world model and active +exploration through the internal dynamics of a network, and a hypothetical +architecture for this, by combining meta-reinforcement learning, which assumes +domain adaptation as a system that achieves robust homeostasis. + +
+
+ comment: Accepted as a 1-page tiny paper in the Intrinsically Motivated + Open-ended Learning workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ libcll: an Extendable Python Toolkit for Complementary-Label Learning + + +
+ Complementary-label learning (CLL) is a weakly supervised learning paradigm +for multiclass classification, where only complementary labels -- indicating +classes an instance does not belong to -- are provided to the learning +algorithm. Despite CLL's increasing popularity, previous studies highlight two +main challenges: (1) inconsistent results arising from varied assumptions on +complementary label generation, and (2) high barriers to entry due to the lack +of a standardized evaluation platform across datasets and algorithms. To +address these challenges, we introduce \texttt{libcll}, an extensible Python +toolkit for CLL research. \texttt{libcll} provides a universal interface that +supports a wide range of generation assumptions, both synthetic and real-world +datasets, and key CLL algorithms. The toolkit is designed to mitigate +inconsistencies and streamline the research process, with easy installation, +comprehensive usage guides, and quickstart tutorials that facilitate efficient +adoption and implementation of CLL techniques. Extensive ablation studies +conducted with \texttt{libcll} demonstrate its utility in generating valuable +insights to advance future CLL research. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ A Review on Generative AI Models for Synthetic Medical Text, Time + Series, and Longitudinal Data + + +
+ This paper presents the results of a novel scoping review on the practical +models for generating three different types of synthetic health records (SHRs): +medical text, time series, and longitudinal data. The innovative aspects of the +review, which incorporate study objectives, data modality, and research +methodology of the reviewed studies, uncover the importance and the scope of +the topic for the digital medicine context. In total, 52 publications met the +eligibility criteria for generating medical time series (22), longitudinal data +(17), and medical text (13). Privacy preservation was found to be the main +research objective of the studied papers, along with class imbalance, data +scarcity, and data imputation as the other objectives. The adversarial +network-based, probabilistic, and large language models exhibited superiority +for generating synthetic longitudinal data, time series, and medical texts, +respectively. Finding a reliable performance measure to quantify SHR +re-identification risk is the major research gap of the topic. + +
+
+ comment: 27 pages, 3 figures +
+
+
+
+
+ + ☆ On the Accuracy and Precision of Moving Averages to Estimate Wi-Fi Link + Quality + + +
+ The radio spectrum is characterized by a noticeable variability, which +impairs performance and determinism of every wireless communication technology. +To counteract this aspect, mechanisms like Minstrel are customarily employed in +real Wi-Fi devices, and the adoption of machine learning for optimization is +envisaged in next-generation Wi-Fi 8. All these approaches require +communication quality to be monitored at runtime. + In this paper, the effectiveness of simple techniques based on moving +averages to estimate wireless link quality is analyzed, to assess their +advantages and weaknesses. Results can be used, e.g., as a baseline when +studying how artificial intelligence can be employed to mitigate +unpredictability of wireless networks by providing reliable estimates about +current spectrum conditions. + +
+
+ comment: preprint, 8 pages, 2024 +
+
+
+
+
+ + ☆ Restructuring Tractable Probabilistic Circuits + + +
+ Probabilistic circuits (PCs) are a unifying representation for probabilistic +models that support tractable inference. Numerous applications of PCs like +controllable text generation depend on the ability to efficiently multiply two +circuits. Existing multiplication algorithms require that the circuits respect +the same structure, i.e., variable scopes decompose according to the same +vtree. In this work, we propose and study the task of restructuring +structured(-decomposable) PCs, that is, transforming a structured PC such that +it conforms to a target vtree. We propose a generic approach for this problem +and show that it leads to novel polynomial-time algorithms for multiplying +circuits respecting different vtrees, as well as a practical depth-reduction +algorithm that preserves structured decomposability. Our work opens up new +avenues for tractable PC inference, suggesting the possibility of training with +less restrictive PC structures while enabling efficient inference by changing +their structures at inference time. + +
+
+
+
+
+ + ☆ Error-Feedback Model for Output Correction in Bilateral Control-Based + Imitation Learning + + +
+ In recent years, imitation learning using neural networks has enabled robots +to perform flexible tasks. However, since neural networks operate in a +feedforward structure, they do not possess a mechanism to compensate for output +errors. To address this limitation, we developed a feedback mechanism to +correct these errors. By employing a hierarchical structure for neural networks +comprising lower and upper layers, the lower layer was controlled to follow the +upper layer. Additionally, using a multi-layer perceptron in the lower layer, +which lacks an internal state, enhanced the error feedback. In the +character-writing task, this model demonstrated improved accuracy in writing +previously untrained characters. +Through autonomous control with error feedback, we confirmed that the lower +layer could effectively track the output of the upper layer. This study +represents a promising step toward integrating neural networks with control +theories. + +
+
+
+
+
+ + ☆ Predicting User Intents and Musical Attributes from Music Discovery + Conversations + + +
+ Intent classification is a text understanding task that identifies user needs +from input text queries. While intent classification has been extensively +studied in various domains, it has not received much attention in the music +domain. In this paper, we investigate intent classification models for music +discovery conversation, focusing on pre-trained language models. Rather than +only predicting functional needs: intent classification, we also include a task +for classifying musical needs: musical attribute classification. Additionally, +we propose a method of concatenating previous chat history with just +single-turn user queries in the input text, allowing the model to understand +the overall conversation context better. Our proposed model significantly +improves the F1 score for both user intent and musical attribute +classification, and surpasses the zero-shot and few-shot performance of the +pretrained Llama 3 model. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Hyper-parameter Optimization for Federated Learning with Step-wise + Adaptive Mechanism + + +
+ Federated Learning (FL) is a decentralized learning approach that protects +sensitive information by utilizing local model parameters rather than sharing +clients' raw datasets. While this privacy-preserving method is widely employed +across various applications, it still requires significant development and +optimization. Automated Machine Learning (Auto-ML) has been adapted for +reducing the need for manual adjustments. Previous studies have explored the +integration of AutoML with different FL algorithms to evaluate their +effectiveness in enhancing FL settings. However, Automated FL (Auto-FL) faces +additional challenges due to the involvement of a large cohort of clients and +global training rounds between clients and the server, rendering the tuning +process time-consuming and nearly impossible on resource-constrained edge +devices (e.g., IoT devices). This paper investigates the deployment and +integration of two lightweight Hyper-Parameter Optimization (HPO) tools, +Raytune and Optuna, within the context of FL settings. A step-wise feedback +mechanism has also been designed to accelerate the hyper-parameter tuning +process and coordinate AutoML toolkits with the FL server. To this end, both +local and global feedback mechanisms are integrated to limit the search space +and expedite the HPO process. Further, a novel client selection technique is +introduced to mitigate the straggler effect in Auto-FL. The selected +hyper-parameter tuning tools are evaluated using two benchmark datasets, +FEMNIST, and CIFAR10. Further, the paper discusses the essential properties of +successful HPO tools, the integration mechanism with the FL pipeline, and the +challenges posed by the distributed and heterogeneous nature of FL +environments. + +
+
+
+
+
+ + ☆ Contrast Similarity-Aware Dual-Pathway Mamba for Multivariate Time + Series Node Classification + + +
+ Multivariate time series (MTS) data is generated through multiple sensors +across various domains such as engineering application, health monitoring, and +the internet of things, characterized by its temporal changes and high +dimensional characteristics. Over the past few years, many studies have +explored the long-range dependencies and similarities in MTS. However, +long-range dependencies are difficult to model due to their temporal changes, +and high dimensionality makes it difficult to obtain similarities effectively +and efficiently. Thus, to address these issues, we propose contrast +similarity-aware dual-pathway Mamba for MTS node classification (CS-DPMamba). +Firstly, to obtain the dynamic similarity of each sample, we first use a +temporal contrast learning module to acquire MTS representations. And then we +construct a similarity matrix between MTS representations using Fast Dynamic +Time Warping (FastDTW). Secondly, we apply the DPMamba to consider the +bidirectional nature of MTS, allowing us to better capture long-range and +short-range dependencies within the data. Finally, we utilize the +Kolmogorov-Arnold Network enhanced Graph Isomorphism Network to complete the +information interaction in the matrix and MTS node classification task. By +comprehensively considering the long-range dependencies and dynamic similarity +features, we achieved precise MTS node classification. We conducted experiments +on multiple University of East Anglia (UEA) MTS datasets, which encompass +diverse application scenarios. Our results demonstrate the superiority of our +method through both supervised and semi-supervised experiments on the MTS +classification task. + +
+
+ comment: Submitted to Knowledge-Based Systems on Nov 17, 2024 +
+
+
+
+
+ + ☆ DeTrigger: A Gradient-Centric Approach to Backdoor Attack Mitigation in + Federated Learning + + +
+ Federated Learning (FL) enables collaborative model training across +distributed devices while preserving local data privacy, making it ideal for +mobile and embedded systems. However, the decentralized nature of FL also opens +vulnerabilities to model poisoning attacks, particularly backdoor attacks, +where adversaries implant trigger patterns to manipulate model predictions. In +this paper, we propose DeTrigger, a scalable and efficient backdoor-robust +federated learning framework that leverages insights from adversarial attack +methodologies. By employing gradient analysis with temperature scaling, +DeTrigger detects and isolates backdoor triggers, allowing for precise model +weight pruning of backdoor activations without sacrificing benign model +knowledge. Extensive evaluations across four widely used datasets demonstrate +that DeTrigger achieves up to 251x faster detection than traditional methods +and mitigates backdoor attacks by up to 98.9%, with minimal impact on global +model accuracy. Our findings establish DeTrigger as a robust and scalable +solution to protect federated learning environments against sophisticated +backdoor threats. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Hierarchical Spatio-Temporal Uncertainty Quantification for Distributed + Energy Adoption + + +
+ The rapid deployment of distributed energy resources (DER) has introduced +significant spatio-temporal uncertainties in power grid management, +necessitating accurate multilevel forecasting methods. However, existing +approaches often produce overly conservative uncertainty intervals at +individual spatial units and fail to properly capture uncertainties when +aggregating predictions across different spatial scales. This paper presents a +novel hierarchical spatio-temporal model based on the conformal prediction +framework to address these challenges. Our approach generates circuit-level DER +growth predictions and efficiently aggregates them to the substation level +while maintaining statistical validity through a tailored non-conformity score. +Applied to a decade of DER installation data from a local utility network, our +method demonstrates superior performance over existing approaches, particularly +in reducing prediction interval widths while maintaining coverage. + +
+
+
+
+
+ + ☆ Constant Rate Schedule: Constant-Rate Distributional Change for + Efficient Training and Sampling in Diffusion Models + + +
+ We propose a noise schedule that ensures a constant rate of change in the +probability distribution of diffused data throughout the diffusion process. To +obtain this noise schedule, we measure the rate of change in the probability +distribution of the forward process and use it to determine the noise schedule +before training diffusion models. The functional form of the noise schedule is +automatically determined and tailored to each dataset and type of diffusion +model. We evaluate the effectiveness of our noise schedule on unconditional and +class-conditional image generation tasks using the LSUN +(bedroom/church/cat/horse), ImageNet, and FFHQ datasets. Through extensive +experiments, we confirmed that our noise schedule broadly improves the +performance of the diffusion models regardless of the dataset, sampler, number +of function evaluations, or type of diffusion model. + +
+
+ comment: 33 pages, 9 figures +
+
+
+
+
+ + ☆ Testability of Instrumental Variables in Additive Nonlinear, + Non-Constant Effects Models + + +
+ We address the issue of the testability of instrumental variables derived +from observational data. Most existing testable implications are centered on +scenarios where the treatment is a discrete variable, e.g., instrumental +inequality (Pearl, 1995), or where the effect is assumed to be constant, e.g., +instrumental variables condition based on the principle of independent +mechanisms (Burauel, 2023). However, treatments can often be continuous +variables, such as drug dosages or nutritional content levels, and non-constant +effects may occur in many real-world scenarios. In this paper, we consider an +additive nonlinear, non-constant effects model with unmeasured confounders, in +which treatments can be either discrete or continuous, and propose an +Auxiliary-based Independence Test (AIT) condition to test whether a variable is +a valid instrument. We first show that if the candidate instrument is valid, +then the AIT condition holds. Moreover, we illustrate the implications of the +AIT condition and demonstrate that, in certain conditions, AIT conditions are +necessary and sufficient to detect all invalid IVs. We also extend the AIT +condition to include covariates and introduce a practical testing algorithm. +Experimental results on both synthetic and three different real-world datasets +show the effectiveness of our proposed condition. + +
+
+
+
+
+ + ☆ Action-Attentive Deep Reinforcement Learning for Autonomous Alignment of + Beamlines + + +
+ Synchrotron radiation sources play a crucial role in fields such as materials +science, biology, and chemistry. The beamline, a key subsystem of the +synchrotron, modulates and directs the radiation to the sample for analysis. +However, the alignment of beamlines is a complex and time-consuming process, +primarily carried out manually by experienced engineers. Even minor +misalignments in optical components can significantly affect the beam's +properties, leading to suboptimal experimental outcomes. Although current automated +methods, such as Bayesian optimization (BO) and reinforcement learning (RL), +enhance performance, limitations remain. The +relationship between the current and target beam properties, crucial for +determining the adjustment, is not fully considered. Additionally, the physical +characteristics of optical elements are overlooked, such as the need to adjust +specific devices to control the output beam's spot size or position. This paper +addresses the alignment of beamlines by modeling it as a Markov Decision +Process (MDP) and training an intelligent agent using RL. The agent calculates +adjustment values based on the current and target beam states, executes +actions, and iterates until optimal parameters are achieved. A policy network +with action attention is designed to improve decision-making by considering +both state differences and the impact of optical components. Experiments on two +simulated beamlines demonstrate that our algorithm outperforms existing +methods, with ablation studies highlighting the effectiveness of the action +attention-based policy network. + +
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ Diffusion-Inspired Cold Start with Sufficient Prior in Computerized + Adaptive Testing + + +
+ Computerized Adaptive Testing (CAT) aims to select the most appropriate +questions based on the examinee's ability and is widely used in online +education. However, existing CAT systems often lack initial understanding of +the examinee's ability, requiring random probing questions. This can lead to +poorly matched questions, extending the test duration and negatively impacting +the examinee's mindset, a phenomenon referred to as the Cold Start with +Insufficient Prior (CSIP) task. This issue occurs because CAT systems do not +effectively utilize the abundant prior information about the examinee available +from other courses on online platforms. These response records, due to the +commonality of cognitive states across different knowledge domains, can provide +valuable prior information for the target domain. However, no prior work has +explored solutions for the CSIP task. In response to this gap, we propose +Diffusion Cognitive States TransfeR Framework (DCSR), a novel domain transfer +framework based on Diffusion Models (DMs) to address the CSIP task. +Specifically, we construct a cognitive state transition bridge between domains, +guided by the common cognitive states of examinees, encouraging the model to +reconstruct the initial ability state in the target domain. To enrich the +expressive power of the generated data, we analyze the causal relationships in +the generation process from a causal perspective. Redundant and extraneous +cognitive states can lead to limited transfer and negative transfer effects. +Our DCSR can seamlessly apply the generated initial ability states in the +target domain to existing question selection algorithms, thus improving the +cold start performance of the CAT system. Extensive experiments conducted on +five real-world datasets demonstrate that DCSR significantly outperforms +existing baseline methods in addressing the CSIP task. + +
+
+ comment: Accepted by KDD2025 +
+
+
+
+
+ + ☆ Just KIDDIN: Knowledge Infusion and Distillation for Detection of + INdecent Memes + + +
+ Toxicity identification in online multimodal environments remains a +challenging task due to the complexity of contextual connections across +modalities (e.g., textual and visual). In this paper, we propose a novel +framework that integrates Knowledge Distillation (KD) from Large Visual +Language Models (LVLMs) and knowledge infusion to enhance the performance of +toxicity detection in hateful memes. Our approach extracts sub-knowledge graphs +from ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused +within a compact VLM framework. The relational context between toxic phrases in +captions and memes, as well as visual concepts in memes enhance the model's +reasoning capabilities. Experimental results from our study on two hate speech +benchmark datasets demonstrate superior performance over the state-of-the-art +baselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%, +respectively. Given the contextual complexity of the toxicity detection task, +our approach showcases the significance of learning from both explicit (i.e. +KG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a +hybrid neurosymbolic approach. This is crucial for real-world applications +where accurate and scalable recognition of toxic content is critical for +creating safer online environments. + +
+
+
+
+
+ + ☆ SkillTree: Explainable Skill-Based Deep Reinforcement Learning for + Long-Horizon Control Tasks + + +
+ Deep reinforcement learning (DRL) has achieved remarkable success in various +research domains. However, its reliance on neural networks results in a lack of +transparency, which limits its practical applications. To achieve +explainability, decision trees have emerged as a popular and promising +alternative to neural networks. Nonetheless, due to their limited +expressiveness, traditional decision trees struggle with high-dimensional +long-horizon continuous control tasks. In this paper, we propose SkillTree, a +novel framework that reduces complex continuous action spaces into discrete +skill spaces. Our hierarchical approach integrates a differentiable decision +tree within the high-level policy to generate skill embeddings, which +subsequently guide the low-level policy in executing skills. By making skill +decisions explainable, we achieve skill-level explainability, enhancing the +understanding of the decision-making process in complex tasks. Experimental +results demonstrate that our method achieves performance comparable to +skill-based neural networks in complex robotic arm control domains. +Furthermore, SkillTree offers explanations at the skill level, thereby +increasing the transparency of the decision-making process. + +
+
+
+
+
+ + ☆ UrbanDiT: A Foundation Model for Open-World Urban Spatio-Temporal + Learning + + +
+ The urban environment is characterized by complex spatio-temporal dynamics +arising from diverse human activities and interactions. Effectively modeling +these dynamics is essential for understanding and optimizing urban systems. In +this work, we introduce UrbanDiT, a foundation model for open-world urban +spatio-temporal learning that successfully scales up diffusion transformers in +this field. UrbanDiT pioneers a unified model that integrates diverse +spatio-temporal data sources and types while learning universal spatio-temporal +patterns across different cities and scenarios. This allows the model to unify +both multi-data and multi-task learning, and effectively support a wide range +of spatio-temporal applications. Its key innovation lies in the elaborated +prompt learning framework, which adaptively generates both data-driven and +task-specific prompts, guiding the model to deliver superior performance across +various urban applications. UrbanDiT offers three primary advantages: 1) It +unifies diverse data types, such as grid-based and graph-based data, into a +sequential format, allowing it to capture spatio-temporal dynamics across diverse +scenarios of different cities; 2) With masking strategies and task-specific +prompts, it supports a wide range of tasks, including bi-directional +spatio-temporal prediction, temporal interpolation, spatial extrapolation, and +spatio-temporal imputation; and 3) It generalizes effectively to open-world +scenarios, with its powerful zero-shot capabilities outperforming nearly all +baselines with training data. These features allow UrbanDiT to achieve +state-of-the-art performance in different domains such as transportation +traffic, crowd flows, taxi demand, bike usage, and cellular traffic, across +multiple cities and tasks. UrbanDiT sets up a new benchmark for foundation +models in the urban spatio-temporal domain. + +
+
+
+
+
+ + ☆ Sensor-fusion based Prognostics Framework for Complex Engineering + Systems Exhibiting Multiple Failure Modes + + +
+ Complex engineering systems are often subject to multiple failure modes. +Developing a remaining useful life (RUL) prediction model that does not +consider the failure mode causing degradation is likely to result in inaccurate +predictions. However, distinguishing between causes of failure without manually +inspecting the system is nontrivial. This challenge is increased when the +causes of historically observed failures are unknown. Sensors, which are useful +for monitoring the state-of-health of systems, can also be used for +distinguishing between multiple failure modes as the presence of multiple +failure modes results in discriminatory behavior of the sensor signals. When +systems are equipped with multiple sensors, some sensors may exhibit behavior +correlated with degradation, while other sensors do not. Furthermore, which +sensors exhibit this behavior may differ for each failure mode. In this paper, +we present a simultaneous clustering and sensor selection approach for +unlabeled training datasets of systems exhibiting multiple failure modes. The +cluster assignments and the selected sensors are then utilized in real-time to +first diagnose the active failure mode and then to predict the system RUL. We +validate the complete pipeline of the methodology using a simulated dataset of +systems exhibiting two failure modes and on a turbofan degradation dataset from +NASA. + +
+
+
+
+
+ + ☆ Reinforcement Learning with Action Sequence for Data-Efficient Robot + Learning + + +
+ Training reinforcement learning (RL) agents on robotic tasks typically +requires a large number of training samples. This is because training data +often consists of noisy trajectories, whether from exploration or +human-collected demonstrations, making it difficult to learn value functions +that understand the effect of taking each action. On the other hand, recent +behavior-cloning (BC) approaches have shown that predicting a sequence of +actions enables policies to effectively approximate noisy, multi-modal +distributions of expert demonstrations. Can we use a similar idea for improving +RL on robotic tasks? In this paper, we introduce a novel RL algorithm that +learns a critic network that outputs Q-values over a sequence of actions. By +explicitly training the value functions to learn the consequence of executing a +series of current and future actions, our algorithm allows for learning useful +value functions from noisy trajectories. We study our algorithm across various +setups with sparse and dense rewards, and with or without demonstrations, +spanning mobile bi-manual manipulation, whole-body control, and tabletop +manipulation tasks from BiGym, HumanoidBench, and RLBench. We find that, by +learning the critic network with action sequences, our algorithm outperforms +various RL and BC baselines, in particular on challenging humanoid control +tasks. + +
+
+ comment: 17 Pages. Website: https://younggyo.me/cqn-as/ +
+
+
+
+
+ + ☆ Tangential Randomization in Linear Bandits (TRAiL): Guaranteed Inference + and Regret Bounds + + +
+ We propose and analyze TRAiL (Tangential Randomization in Linear Bandits), a +computationally efficient regret-optimal forced exploration algorithm for +linear bandits on action sets that are sublevel sets of strongly convex +functions. TRAiL estimates the governing parameter of the linear bandit problem +through a standard regularized least squares and perturbs the reward-maximizing +action corresponding to said point estimate along the tangent plane of the +convex compact action set before projecting back to it. Exploiting +concentration results for matrix martingales, we prove that TRAiL ensures an +$\Omega(\sqrt{T})$ growth in the inference quality, measured via the minimum +eigenvalue of the design (regressor) matrix with high probability over a +$T$-length period. We build on this result to obtain an $\mathcal{O}(\sqrt{T} +\log(T))$ upper bound on cumulative regret with probability at least $ 1 - 1/T$ +over $T$ periods, and compare TRAiL to other popular algorithms for linear +bandits. Then, we characterize an $\Omega(\sqrt{T})$ minimax lower bound for +any algorithm on the expected regret that covers a wide variety of +action/parameter sets and noise processes. Our analysis not only expands the +realm of lower-bounds in linear bandits significantly, but as a byproduct, +yields a trade-off between regret and inference quality. Specifically, we prove +that any algorithm with an $\mathcal{O}(T^\alpha)$ expected regret growth must +have an $\Omega(T^{1-\alpha})$ asymptotic growth in expected inference quality. +Our experiments on the $L^p$ unit ball as action sets reveal how this relation +can be violated, but only in the short-run, before returning to respect the +bound asymptotically. In effect, regret-minimizing algorithms must have just +the right rate of inference -- too fast or too slow inference will incur +sub-optimal regret growth. + +
+
+ comment: 42 pages, 6 Figures +
+
+
+
+
+ + ☆ HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation + in Crowded and Constrained Environments + + +
+ We study the problem of robot navigation in dense and interactive crowds with +environmental constraints such as corridors and furniture. Previous methods +fail to consider all types of interactions among agents and obstacles, leading +to unsafe and inefficient robot paths. In this article, we leverage a +graph-based representation of crowded and constrained scenarios and propose a +structured framework to learn robot navigation policies with deep reinforcement +learning. We first split the representations of different components in the +environment and propose a heterogeneous spatio-temporal (st) graph to model +distinct interactions among humans, robots, and obstacles. Based on the +heterogeneous st-graph, we propose HEIGHT, a novel navigation policy network +architecture with different components to capture heterogeneous interactions +among entities through space and time. HEIGHT utilizes attention mechanisms to +prioritize important interactions and a recurrent network to track changes in +the dynamic scene over time, encouraging the robot to avoid collisions +adaptively. Through extensive simulation and real-world experiments, we +demonstrate that HEIGHT outperforms state-of-the-art baselines in terms of +success and efficiency in challenging navigation scenarios. Furthermore, we +demonstrate that our pipeline achieves better zero-shot generalization +capability than previous works when the densities of humans and obstacles +change. More videos are available at +https://sites.google.com/view/crowdnav-height/home. + +
+
+
+
+
+ + ☆ Self-supervised denoising of visual field data improves detection of + glaucoma progression + + +
+ Perimetric measurements provide insight into a patient's peripheral vision +and day-to-day functioning and are the main outcome measure for identifying +progression of visual damage from glaucoma. However, visual field data can be +noisy, exhibiting high variance, especially with increasing damage. In this +study, we demonstrate the utility of self-supervised deep learning in denoising +visual field data from over 4000 patients to enhance its signal-to-noise ratio +and its ability to detect true glaucoma progression. We deployed both a +variational autoencoder (VAE) and a masked autoencoder to determine which +self-supervised model best smooths the visual field data while reconstructing +salient features that are less noisy and more predictive of worsening disease. +Our results indicate that including a categorical p-value at every visual field +location improves the smoothing of visual field data. Masked autoencoders led +to cleaner denoised data than previous methods, such as variational +autoencoders. A 4.7% increase in detection of progressing eyes with pointwise +linear regression (PLR) was observed. The masked and variational autoencoders' +smoothed data predicted glaucoma progression 2.3 months earlier when p-values +were included compared to when they were not. The faster prediction of time to +progression (TTP) and the higher percentage progression detected support our +hypothesis that masking out visual field elements during training while +including p-values at each location would improve the task of detection of +visual field progression. Our study has clinically relevant implications +regarding masking when training neural networks to denoise visual field data, +resulting in earlier and more accurate detection of glaucoma progression. This +denoising model can be integrated into future models for visual field analysis +to enhance detection of glaucoma progression. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ A Computational Method for Measuring "Open Codes" in Qualitative + Analysis + + +
+ Qualitative analysis is critical to understanding human datasets in many +social science disciplines. Open coding is an inductive qualitative process +that identifies and interprets "open codes" from datasets. Yet, meeting +methodological expectations (such as "as exhaustive as possible") can be +challenging. While many machine learning (ML)/generative AI (GAI) studies have +attempted to support open coding, few have systematically measured or evaluated +GAI outcomes, increasing potential bias risks. Building on Grounded Theory and +Thematic Analysis theories, we present a computational method to measure and +identify potential biases from "open codes" systematically. Instead of +operationalizing human expert results as the "ground truth," our method is +built upon a team-based approach between human and machine coders. We +experiment with two HCI datasets to establish this method's reliability by 1) +comparing it with human analysis, and 2) analyzing its output stability. We +present evidence-based suggestions and example workflows for ML/GAI to support +open coding. + +
+
+
+
+
+ + ♻ ☆ Conformal Prediction for Class-wise Coverage via Augmented Label Rank + Calibration + + +
+ Conformal prediction (CP) is an emerging uncertainty quantification framework +that allows us to construct a prediction set to cover the true label with a +pre-specified marginal or conditional probability. Although the valid coverage +guarantee has been extensively studied for classification problems, CP often +produces large prediction sets which may not be practically useful. This issue +is exacerbated for the setting of class-conditional coverage on imbalanced +classification tasks with many and/or imbalanced classes. This paper proposes +the Rank Calibrated Class-conditional CP (RC3P) algorithm to reduce the +prediction set sizes to achieve class-conditional coverage, where the valid +coverage holds for each class. In contrast to the standard class-conditional CP +(CCP) method that uniformly thresholds the class-wise conformity score for each +class, the augmented label rank calibration step allows RC3P to selectively +iterate this class-wise thresholding subroutine only for a subset of classes +whose class-wise top-k error is small. We prove that agnostic to the classifier +and data distribution, RC3P achieves class-wise coverage. We also show that +RC3P reduces the size of prediction sets compared to the CCP method. +Comprehensive experiments on multiple real-world datasets demonstrate that RC3P +achieves class-wise coverage and 26.25% reduction in prediction set sizes on +average. + +
+
+
+
+
+ + ♻ ☆ Debiased Regression for Root-N-Consistent Conditional Mean Estimation + + +
+ This study introduces a debiasing method for regression estimators, including +high-dimensional and nonparametric regression estimators. For example, +nonparametric regression methods allow for the estimation of regression +functions in a data-driven manner with minimal assumptions; however, these +methods typically fail to achieve $\sqrt{n}$-consistency in their convergence +rates, and many, including those in machine learning, lack guarantees that +their estimators asymptotically follow a normal distribution. To address these +challenges, we propose a debiasing technique for nonparametric estimators by +adding a bias-correction term to the original estimators, extending the +conventional one-step estimator used in semiparametric analysis. Specifically, +for each data point, we estimate the conditional expected residual of the +original nonparametric estimator, which can, for instance, be computed using +kernel (Nadaraya-Watson) regression, and incorporate it as a bias-reduction +term. Our theoretical analysis demonstrates that the proposed estimator +achieves $\sqrt{n}$-consistency and asymptotic normality under a mild +convergence rate condition for both the original nonparametric estimator and +the conditional expected residual estimator. Notably, this approach remains +model-free as long as the original estimator and the conditional expected +residual estimator satisfy the convergence rate condition. The proposed method +offers several advantages, including improved estimation accuracy and +simplified construction of confidence intervals. + +
+
+
+
+
+ + ♻ ☆ GraphSnapShot: Graph Machine Learning Acceleration with Fast Storage and + Retrieval + + +
+ In our recent research, we have developed a framework called GraphSnapShot, +which has proven to be a useful tool for graph learning acceleration. +GraphSnapShot is a framework for fast cache, storage, retrieval and computation +for graph learning. It can quickly store and update the local topology of graph +structure and allows us to track patterns in the structure of graph networks, +just like taking snapshots of the graphs. In experiments, GraphSnapShot shows +efficiency: it can achieve up to 30% training acceleration and 73% memory +reduction for lossless graph ML training compared to current baselines such as +dgl. This technique is particularly useful for large dynamic graph learning tasks +such as social media analysis and recommendation systems to process complex +relationships between entities. + The code for GraphSnapShot is publicly available at +https://github.com/NoakLiu/GraphSnapShot. + +
+
+
+
+
+ + ♻ ☆ Regulating Chatbot Output via Inter-Informational Competition + + +
+ The advent of ChatGPT has sparked over a year of regulatory frenzy. However, +few existing studies have rigorously questioned the assumption that, if left +unregulated, AI chatbot's output would inflict tangible, severe real harm on +human affairs. Most researchers have overlooked the critical possibility that +the information market itself can effectively mitigate these risks and, as a +result, they tend to use regulatory tools to address the issue directly. This +Article develops a yardstick for reevaluating both AI-related content risks and +corresponding regulatory proposals by focusing on inter-informational +competition among various outlets. The decades-long history of regulating +information and communications technologies indicates that regulators tend to +err too much on the side of caution and to put forward excessive regulatory +measures when encountering the uncertainties brought about by new technologies. +In fact, a trove of empirical evidence has demonstrated that market competition +among information outlets can effectively mitigate most risks and that +overreliance on regulation is not only unnecessary but detrimental, as well. +This Article argues that sufficient competition among chatbots and other +information outlets in the information marketplace can sufficiently mitigate +and even resolve most content risks posed by generative AI technologies. This +renders certain loudly advocated regulatory strategies, like mandatory +prohibitions, licensure, curation of datasets, and notice-and-response regimes, +truly unnecessary and even toxic to desirable competition and innovation +throughout the AI industry. Ultimately, the ideas that I advance in this +Article should pour some much-needed cold water on the regulatory frenzy over +generative AI and steer the issue back to a rational track. + +
+
+ comment: 50-page legal Article, forthcoming in Northwestern Journal of + Technology and Intellectual Property +
+
+
+
+
+ + ♻ ☆ KTO: Model Alignment as Prospect Theoretic Optimization ICML 2024 + + +
+ Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive +random variables in a biased but well-defined manner (1992); for example, +humans are famously loss-averse. We show that objectives for aligning LLMs with +human feedback implicitly incorporate many of these biases -- the success of +these objectives (e.g., DPO) over cross-entropy minimization can partly be +ascribed to them belonging to a family of loss functions that we call +$\textit{human-aware losses}$ (HALOs). However, the utility functions these +methods attribute to humans still differ from those in the prospect theory +literature. Using a Kahneman-Tversky model of human utility, we propose a HALO +that directly maximizes the utility of generations instead of maximizing the +log-likelihood of preferences, as current methods do. We call this approach +KTO, and it matches or exceeds the performance of preference-based methods at +scales from 1B to 30B, despite only learning from a binary signal of whether an +output is desirable. More broadly, our work suggests that there is no one HALO +that is universally superior; the best loss depends on the inductive biases +most appropriate for a given setting, an oft-overlooked consideration. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Is Programming by Example solved by LLMs? + + +
+ Programming-by-Examples (PBE) aims to generate an algorithm from input-output +examples. Such systems are practically and theoretically important: from an +end-user perspective, they are deployed to millions of people, and from an AI +perspective, PBE corresponds to a very general form of few-shot inductive +inference. Given the success of Large Language Models (LLMs) in code-generation +tasks, we investigate here the extent to which LLMs can be said to have +"solved" PBE. We experiment on classic domains such as lists and strings, and +an uncommon graphics programming domain not well represented in typical +pretraining data. We find that pretrained models are not effective at PBE, but +that they can be fine-tuned for much higher performance, provided the test +problems are in-distribution. We analyze empirically what causes these models +to succeed and fail, and take steps toward understanding how to achieve better +out-of-distribution generalization. Collectively these results suggest that +LLMs make strong progress toward solving the typical suite of PBE tasks, +potentially increasing the flexibility and applicability of PBE systems, while +also identifying ways in which LLMs still fall short. + +
+
+
+
+
+ + ♻ ☆ RLtools: A Fast, Portable Deep Reinforcement Learning Library for + Continuous Control + + +
+ Deep Reinforcement Learning (RL) can yield capable agents and control +policies in several domains but is commonly plagued by prohibitively long +training times. Additionally, in the case of continuous control problems, the +applicability of learned policies on real-world embedded devices is limited due +to the lack of real-time guarantees and portability of existing libraries. To +address these challenges, we present RLtools, a dependency-free, header-only, +pure C++ library for deep supervised and reinforcement learning. Its novel +architecture allows RLtools to be used on a wide variety of platforms, from HPC +clusters over workstations and laptops to smartphones, smartwatches, and +microcontrollers. Specifically, due to the tight integration of the RL +algorithms with simulation environments, RLtools can solve popular RL problems +up to 76 times faster than other popular RL frameworks. We also benchmark the +inference on a diverse set of microcontrollers and show that in most cases our +optimized implementation is by far the fastest. Finally, RLtools enables the +first-ever demonstration of training a deep RL algorithm directly on a +microcontroller, giving rise to the field of TinyRL. The source code as well as +documentation and live demos are available through our project page at +https://rl.tools. + +
+
+ comment: Project page: https://rl.tools +
+
+
+
+
+ + ♻ ☆ Realised Volatility Forecasting: Machine Learning via Financial Word + Embedding + + +
+ This study develops a financial word embedding using 15 years of business +news. Our results show that this specialised language model produces more +accurate results than general word embeddings, based on a financial benchmark +we established. As an application, we incorporate this word embedding into a +simple machine learning model to enhance the HAR model for forecasting realised +volatility. This approach statistically and economically outperforms +established econometric models. Using an explainable AI method, we also +identify key phrases in business news that contribute significantly to +volatility, offering insights into language patterns tied to market dynamics. + +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC, a highly diverse dataset of abstract reasoning tasks. We train +neural models for induction (inferring latent functions) and transduction +(directly predicting the test output for a given test input). Our models are +trained on synthetic data generated by prompting LLMs to produce Python code +specifying a function to be inferred, plus a stochastic subroutine for +generating inputs to that function. We find inductive and transductive models +solve very different problems, despite training on the same problems, and +despite sharing the same neural architecture. + +
+
+
+
+
+ + ♻ ☆ Scientific Machine Learning Based Reduced-Order Models for Plasma + Turbulence Simulations + + +
+ This paper investigates non-intrusive Scientific Machine Learning (SciML) +Reduced-Order Models (ROMs) for plasma turbulence simulations. In particular, +we focus on Operator Inference (OpInf) to build low-cost physics-based ROMs +from data for such simulations. As a representative example, we consider the +(classical) Hasegawa-Wakatani (HW) equations used for modeling two-dimensional +electrostatic drift-wave turbulence. For a comprehensive perspective of the +potential of OpInf to construct predictive ROMs, we consider three setups for +the HW equations by varying a key parameter, namely the adiabaticity +coefficient. These setups lead to the formation of complex and nonlinear +dynamics, which makes the construction of predictive ROMs of any kind +challenging. We generate the training datasets by performing direct numerical +simulations of the HW equations and recording the computed state data and +outputs over a time horizon of $100$ time units in the turbulent phase. We +then use these datasets to construct OpInf ROMs for predictions over $400$ +additional time units, that is, $400\%$ more than the training horizon. Our +results show that the OpInf ROMs capture important statistical features of the +turbulent dynamics and generalize beyond the training time horizon while +reducing the computational effort of the high-fidelity simulation by up to five +orders of magnitude. In the broader context of fusion research, this shows that +non-intrusive SciML ROMs have the potential to drastically accelerate numerical +studies, which can ultimately enable tasks such as the design of optimized +fusion devices. + +
+
+ comment: 14 pages in double column format, 9 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ TransDreamer: Reinforcement Learning with Transformer World Models NeurIPS 2021 + + +
+ The Dreamer agent provides various benefits of Model-Based Reinforcement +Learning (MBRL) such as sample efficiency, reusable knowledge, and safe +planning. However, its world model and policy networks inherit the limitations +of recurrent neural networks and thus an important question is how an MBRL +framework can benefit from the recent advances of transformers and what the +challenges are in doing so. In this paper, we propose a transformer-based MBRL +agent, called TransDreamer. We first introduce the Transformer State-Space +Model, a world model that leverages a transformer for dynamics predictions. We +then share this world model with a transformer-based policy network and obtain +stability in training a transformer-based RL agent. In experiments, we apply +the proposed model to 2D visual RL and 3D first-person visual RL tasks both +requiring long-range memory access for memory-based reasoning. We show that the +proposed model outperforms Dreamer in these complex tasks. + +
+
+ comment: Deep RL Workshop NeurIPS 2021 +
+
+
+
+
+ + ♻ ☆ log-RRIM: Yield Prediction via Local-to-global Reaction Representation + Learning and Interaction Modeling + + +
+ Accurate prediction of chemical reaction yields is crucial for optimizing +organic synthesis, potentially reducing time and resources spent on +experimentation. With the rise of artificial intelligence (AI), there is +growing interest in leveraging AI-based methods to accelerate yield predictions +without conducting in vitro experiments. We present log-RRIM, an innovative +graph transformer-based framework designed for predicting chemical reaction +yields. Our approach implements a unique local-to-global reaction +representation learning strategy. This approach initially captures detailed +molecule-level information and then models and aggregates intermolecular +interactions, ensuring that the impact of varying-sizes molecular fragments on +yield is accurately accounted for. Another key feature of log-RRIM is its +integration of a cross-attention mechanism that focuses on the interplay +between reagents and reaction centers. This design reflects a fundamental +principle in chemical reactions: the crucial role of reagents in influencing +bond-breaking and formation processes, which ultimately affect reaction yields. +log-RRIM outperforms existing methods in our experiments, especially for medium +to high-yielding reactions, proving its reliability as a predictor. Its +advanced modeling of reactant-reagent interactions and sensitivity to small +molecular fragments make it a valuable tool for reaction planning and +optimization in chemical synthesis. The data and codes of log-RRIM are +accessible through https://github.com/ninglab/Yield_log_RRIM. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Feasibility of Federated Learning from Client Databases with Different + Brain Diseases and MRI Modalities WACV 2025 + + +
+ Segmentation models for brain lesions in MRI are typically developed for a +specific disease and trained on data with a predefined set of MRI modalities. +Such models cannot segment the disease using data with a different set of MRI +modalities, nor can they segment other types of diseases. Moreover, this +training paradigm prevents a model from using the advantages of learning from +heterogeneous databases that may contain scans and segmentation labels for +different brain pathologies and diverse sets of MRI modalities. Additionally, +the confidentiality of patient data often prevents central data aggregation, +necessitating a decentralized approach. Is it feasible to use Federated +Learning (FL) to train a single model on client databases that contain scans +and labels of different brain pathologies and diverse sets of MRI modalities? +We demonstrate promising results by combining appropriate, simple, and +practical modifications to the model and training strategy: Designing a model +with input channels that cover the whole set of modalities available across +clients, training with random modality drop, and exploring the effects of +feature normalization methods. Evaluation on 7 brain MRI databases with 5 +different diseases shows that this FL framework can train a single model +achieving very promising results in segmenting all disease types seen during +training. Importantly, it can segment these diseases in new databases that +contain sets of modalities different from those in training clients. These +results demonstrate, for the first time, the feasibility and effectiveness of +using FL to train a single 3D segmentation model on decentralised data with +diverse brain diseases and MRI modalities, a necessary step towards leveraging +heterogeneous real-world databases. Code: +https://github.com/FelixWag/FedUniBrain + +
+
+ comment: Accepted as a conference paper at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Improving Multi-task Learning via Seeking Task-based Flat Regions + + +
+ Multi-Task Learning (MTL) is a widely-used and powerful learning paradigm for +training deep neural networks that allows learning more than one objective by a +single backbone. Compared to training tasks separately, MTL significantly +reduces computational costs, improves data efficiency, and potentially enhances +model performance by leveraging knowledge across tasks. Hence, it has been +adopted in a variety of applications, ranging from computer vision to natural +language processing and speech recognition. Among them, there is an emerging +line of work in MTL that focuses on manipulating the task gradient to derive an +ultimate gradient descent direction to benefit all tasks. Despite achieving +impressive results on many benchmarks, directly applying these approaches +without using appropriate regularization techniques might lead to suboptimal +solutions on real-world problems. In particular, standard training that +minimizes the empirical loss on the training data can easily suffer from +overfitting to low-resource tasks or be spoiled by noisy-labeled ones, which +can cause negative transfer between tasks and overall performance drop. To +alleviate such problems, we propose to leverage a recently introduced training +method, named Sharpness-aware Minimization, which can enhance model +generalization ability on single-task learning. Accordingly, we present a novel +MTL training methodology, encouraging the model to find task-based flat minima +for coherently improving its generalization capability on all tasks. Finally, +we conduct comprehensive experiments on a variety of applications to +demonstrate the merit of our proposed approach to existing gradient-based MTL +methods, as suggested by our developed theory. + +
+
+ comment: 35 pages, 17 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space + Exploration by Reinforcement Learning Agent ICRA 2025 + + +
+ Grasping by a robot in unstructured environments is deemed a critical +challenge because of the requirement for effective adaptation to a wide +variation in object geometries, material properties, and other environmental +factors. In this paper, we propose a novel framework for robotic grasping based +on the idea of compressing high-dimensional target and gripper features in a +common latent space using a set of autoencoders. Our approach simplifies +grasping by using three autoencoders dedicated to the target, the gripper, and +a third one that fuses their latent representations. This allows the RL agent +to achieve higher learning rates at the initial stages of exploration of a new +environment, as well as at non-zero shot grasp attempts. The agent explores the +latent space of the third autoencoder for better quality grasp without explicit +reconstruction of objects. By implementing the PoWER algorithm into the RL +training process, updates on the agent's policy will be made through the +perturbation in the reward-weighted latent space. The successful exploration +efficiently constrains both position and pose integrity for feasible executions +of grasps. We evaluate our system on a diverse set of objects, demonstrating +the high success rate in grasping with minimum computational overhead. We found +that this approach enhances the adaptation of the RL agent by more than 35% in +simulation experiments. + +
+
+ comment: Submitted for review at IEEE ICRA 2025 +
+
+
+
+
+ + ♻ ☆ Learning the Simplicity of Scattering Amplitudes + + +
+ The simplification and reorganization of complex expressions lies at the core +of scientific progress, particularly in theoretical high-energy physics. This +work explores the application of machine learning to a particular facet of this +challenge: the task of simplifying scattering amplitudes expressed in terms of +spinor-helicity variables. We demonstrate that an encoder-decoder transformer +architecture achieves impressive simplification capabilities for expressions +composed of handfuls of terms. Lengthier expressions are implemented in an +additional embedding network, trained using contrastive learning, which +isolates subexpressions that are more likely to simplify. The resulting +framework is capable of reducing expressions with hundreds of terms - a regular +occurrence in quantum field theory calculations - to vastly simpler equivalent +expressions. Starting from lengthy input expressions, our networks can generate +the Parke-Taylor formula for five-point gluon scattering, as well as new +compact expressions for five-point amplitudes involving scalars and gravitons. +An interactive demonstration can be found at +https://spinorhelicity.streamlit.app . + +
+
+ comment: 25+15 pages, 9+6 figures, v2: typos correction and extended the + introduction, conclusion, sections 2.2, 2.4 and appendix F +
+
+
+
+
+ + ♻ ☆ Identifying Differential Patient Care Through Inverse Intent Inference + + +
+ Sepsis is a life-threatening condition defined by end-organ dysfunction due +to a dysregulated host response to infection. Although the Surviving Sepsis +Campaign has launched and has been releasing sepsis treatment guidelines to +unify and normalize the care for sepsis patients, it has been reported in +numerous studies that disparities in care exist across the trajectory of +patient stay in the emergency department and intensive care unit. Here, we +apply a number of reinforcement learning techniques including behavioral +cloning, imitation learning, and inverse reinforcement learning, to learn the +optimal policy in the management of septic patient subgroups using expert +demonstrations. Then we estimate the counterfactual optimal policies by +applying the model to another subset of unseen medical populations and identify +the difference in care by comparing it to the real policy. Our data comes from +the sepsis cohort of MIMIC-IV and the clinical data warehouses of the Mass +General Brigham healthcare system. The ultimate objective of this work is to +use the optimal learned policy function to estimate the counterfactual +treatment policy and identify deviations across sub-populations of interest. We +hope this approach would help us identify any disparities in care and also +changes in care in response to the publication of national sepsis treatment +guidelines. + +
+
+
+
+
+ + ♻ ☆ Combinatorial Logistic Bandits + + +
+ We introduce a novel framework called combinatorial logistic bandits (CLogB), +where in each round, a subset of base arms (called the super arm) is selected, +with the outcome of each base arm being binary and its expectation following a +logistic parametric model. The feedback is governed by a general arm triggering +process. Our study covers CLogB with reward functions satisfying two smoothness +conditions, capturing application scenarios such as online content delivery, +online learning to rank, and dynamic channel allocation. We first propose a +simple yet efficient algorithm, CLogUCB, utilizing a variance-agnostic +exploration bonus. Under the 1-norm triggering probability modulated (TPM) +smoothness condition, CLogUCB achieves a regret bound of +$\tilde{O}(d\sqrt{\kappa KT})$, where $\tilde{O}$ ignores logarithmic factors, +$d$ is the dimension of the feature vector, $\kappa$ represents the +nonlinearity of the logistic model, and $K$ is the maximum number of base arms +a super arm can trigger. This result improves on prior work by a factor of +$\tilde{O}(\sqrt{\kappa})$. We then enhance CLogUCB with a variance-adaptive +version, VA-CLogUCB, which attains a regret bound of $\tilde{O}(d\sqrt{KT})$ +under the same 1-norm TPM condition, improving another +$\tilde{O}(\sqrt{\kappa})$ factor. VA-CLogUCB shows even greater promise under +the stronger triggering probability and variance modulated (TPVM) condition, +achieving a leading $\tilde{O}(d\sqrt{T})$ regret, thus removing the additional +dependency on the action-size $K$. Furthermore, we enhance the computational +efficiency of VA-CLogUCB by eliminating the nonconvex optimization process when +the context feature map is time-invariant while maintaining the tight +$\tilde{O}(d\sqrt{T})$ regret. Finally, experiments on synthetic and real-world +datasets demonstrate the superior performance of our algorithms compared to +benchmark algorithms. + +
+
+ comment: Accepted in ACM SIGMETRICS 2025 +
+
+
+
+
+ + ♻ ☆ Can Agents Spontaneously Form a Society? Introducing a Novel + Architecture for Generative Multi-Agents to Elicit Social Emergence + + +
+ Generative agents have demonstrated impressive capabilities in specific +tasks, but most of these frameworks focus on independent tasks and lack +attention to social interactions. We introduce a generative agent architecture +called ITCMA-S, which includes a basic framework for individual agents and a +framework called LTRHA that supports social interactions among multi-agents. +This architecture enables agents to identify and filter out behaviors that are +detrimental to social interactions, guiding them to choose more favorable +actions. We designed a sandbox environment to simulate the natural evolution of +social relationships among multiple identity-less agents for experimental +evaluation. The results showed that ITCMA-S performed well on multiple +evaluation indicators, demonstrating its ability to actively explore the +environment, recognize new agents, and acquire new information through +continuous actions and dialogue. Observations show that as agents establish +connections with each other, they spontaneously form cliques with internal +hierarchies around a selected leader and organize collective activities. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and + Fine-Tuning + + +
+ In this work, we present BiSSL, a first-of-its-kind training framework that +introduces bilevel optimization to enhance the alignment between the pretext +pre-training and downstream fine-tuning stages in self-supervised learning. +BiSSL formulates the pretext and downstream task objectives as the lower- and +upper-level objectives in a bilevel optimization problem and serves as an +intermediate training stage within the self-supervised learning pipeline. By +more explicitly modeling the interdependence of these training stages, BiSSL +facilitates enhanced information sharing between them, ultimately leading to a +backbone parameter initialization that is better suited for the downstream +task. We propose a training algorithm that alternates between optimizing the +two objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with +SimCLR on the STL10 dataset, we demonstrate that our proposed framework +consistently achieves improved or competitive classification accuracies across +various downstream image classification datasets compared to the conventional +self-supervised learning pipeline. Qualitative analyses of the backbone +features further suggest that BiSSL enhances the alignment of downstream +features in the backbone prior to fine-tuning. + +
+
+
+
+
+ + ♻ ☆ Approximating Families of Sharp Solutions to Fisher's Equation with + Physics-Informed Neural Networks + + +
+ This paper employs physics-informed neural networks (PINNs) to solve Fisher's +equation, a fundamental reaction-diffusion system with both simplicity and +significance. The focus is on investigating Fisher's equation under conditions +of large reaction rate coefficients, where solutions exhibit steep traveling +waves that often present challenges for traditional numerical methods. To +address these challenges, a residual weighting scheme is introduced in the +network training to mitigate the difficulties associated with standard PINN +approaches. Additionally, a specialized network architecture designed to +capture traveling wave solutions is explored. The paper also assesses the +ability of PINNs to approximate a family of solutions by generalizing across +multiple reaction rate coefficients. The proposed method demonstrates high +effectiveness in solving Fisher's equation with large reaction rate +coefficients and shows promise for meshfree solutions of generalized +reaction-diffusion systems. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Partially Unitary Learning + + +
+ The problem of an optimal mapping between Hilbert spaces $IN$ of +$\left|\psi\right\rangle$ and $OUT$ of $\left|\phi\right\rangle$ based on a set +of wavefunction measurements (within a phase) $\psi_l \to \phi_l$, $l=1\dots +M$, is formulated as an optimization problem maximizing the total fidelity +$\sum_{l=1}^{M} \omega^{(l)} +\left|\langle\phi_l|\mathcal{U}|\psi_l\rangle\right|^2$ subject to probability +preservation constraints on $\mathcal{U}$ (partial unitarity). The constructed +operator $\mathcal{U}$ can be considered as an $IN$ to $OUT$ quantum channel; +it is a partially unitary rectangular matrix (an isometry) of dimension +$\dim(OUT) \times \dim(IN)$ transforming operators as $A^{OUT}=\mathcal{U} +A^{IN} \mathcal{U}^{\dagger}$. An iterative algorithm for finding the global +maximum of this optimization problem is developed, and its application to a +number of problems is demonstrated. A software product implementing the +algorithm is available from the authors. + +
+
+ comment: A working algorithm implementing Partially Unitary Learning + arXiv:2212.14810 has been developed and generalized. See arXiv:2407.04406 for + further generalization to density matrix mappings +
+
+
+
+
+ + ♻ ☆ On Size and Hardness Generalization in Unsupervised Learning for the + Travelling Salesman Problem + + +
+ We study the generalization capability of Unsupervised Learning in solving +the Travelling Salesman Problem (TSP). We use a Graph Neural Network (GNN) +trained with a surrogate loss function to generate an embedding for each node. +We use these embeddings to construct a heat map that indicates the likelihood +of each edge being part of the optimal route. We then apply local search to +generate our final predictions. Our investigation explores how different +training instance sizes, embedding dimensions, and distributions influence the +outcomes of Unsupervised Learning methods. Our results show that training with +larger instance sizes and increasing embedding dimensions can build a more +effective representation, enhancing the model's ability to solve TSP. +Furthermore, in evaluating generalization across different distributions, we +first determine the hardness of various distributions and explore how different +hardnesses affect the final results. Our findings suggest that models trained +on harder instances exhibit better generalization capabilities, highlighting +the importance of selecting appropriate training instances in solving TSP using +Unsupervised Learning. + +
+
+
+
+
+ + ♻ ☆ Machine Learning Algorithms to Assess Site Closure Time Frames for Soil + and Groundwater Contamination + + +
+ Monitored Natural Attenuation (MNA) is gaining prominence as an effective +method for managing soil and groundwater contamination due to its +cost-efficiency and minimal environmental disruption. Despite its benefits, MNA +necessitates extensive groundwater monitoring to ensure that contaminant levels +decrease to meet safety standards. This study expands the capabilities of +PyLEnM, a Python package designed for long-term environmental monitoring, by +incorporating new algorithms to enhance its predictive and analytical +functionalities. We introduce methods to estimate the timeframe required for +contaminants like Sr-90 and I-129 to reach regulatory safety standards using +linear regression and to forecast future contaminant levels with the +Bidirectional Long Short-Term Memory (Bi-LSTM) networks. Additionally, Random +Forest regression is employed to identify factors influencing the time to reach +safety standards. Our methods are illustrated using data from the Savannah +River Site (SRS) F-Area, where preliminary findings reveal a notable downward +trend in contaminant levels, with variability linked to initial concentrations +and groundwater flow dynamics. The Bi-LSTM model effectively predicts +contaminant concentrations for the next four years, demonstrating the potential +of advanced time series analysis to improve MNA strategies and reduce reliance +on manual groundwater sampling. The code, along with its usage instructions, +validation, and requirements, is available at: +https://github.com/csplevuanh/pylenm_extension. + +
+
+ comment: The paper will be withdrawn to fix some work issues with the sections + on Bi-LSTM models +
+
+
+
+
+ + ♻ ☆ Automatic Classification of General Movements in Newborns ML4H + + +
+ General movements (GMs) are spontaneous, coordinated body movements in +infants that offer valuable insights into the developing nervous system. +Assessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors +for neurodevelopmental disorders. However, GMA requires specifically trained +clinicians, who are limited in number. To scale up newborn screening, there is +a need for an algorithm that can automatically classify GMs from infant video +recordings. This data poses challenges, including variability in recording +length, device type, and setting, with each video coarsely annotated for +overall movement quality. In this work, we introduce a tool for extracting +features from these recordings and explore various machine learning techniques +for automated GM classification. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages +
+
+
+
+
+ + ♻ ☆ MaIL: Improving Imitation Learning with Mamba + + +
+ This work presents Mamba Imitation Learning (MaIL), a novel imitation +learning (IL) architecture that provides an alternative to state-of-the-art +(SoTA) Transformer-based policies. MaIL leverages Mamba, a state-space model +designed to selectively focus on key features of the data. While Transformers +are highly effective in data-rich environments due to their dense attention +mechanisms, they can struggle with smaller datasets, often leading to +overfitting or suboptimal representation learning. In contrast, Mamba's +architecture enhances representation learning efficiency by focusing on key +features and reducing model complexity. This approach mitigates overfitting and +enhances generalization, even when working with limited data. Extensive +evaluations on the LIBERO benchmark demonstrate that MaIL consistently +outperforms Transformers on all LIBERO tasks with limited data and matches +their performance when the full dataset is available. Additionally, MaIL's +effectiveness is validated through its superior performance in three real robot +experiments. Our code is available at https://github.com/ALRhub/MaIL. + +
+
+
+
+
+ + ♻ ☆ Multistep Consistency Models + + +
+ Diffusion models are relatively easy to train but require many steps to +generate samples. Consistency models are far more difficult to train, but +generate samples in a single step. + In this paper we propose Multistep Consistency Models: A unification between +Consistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that +can interpolate between a consistency model and a diffusion model: a trade-off +between sampling speed and sampling quality. Specifically, a 1-step consistency +model is a conventional consistency model whereas an $\infty$-step consistency +model is a diffusion model. + Multistep Consistency Models work really well in practice. By increasing the +sample budget from a single step to 2-8 steps, we can train models more easily +that generate higher quality samples, while retaining much of the sampling +speed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 steps and 2.1 +FID on Imagenet 128 in 8 steps with consistency distillation, using simple +losses without adversarial training. We also show that our method scales to a +text-to-image diffusion model, generating samples that are close to the quality +of the original model. + +
+
+
+
+
+ + ♻ ☆ Robust Pareto Set Identification with Contaminated Bandit Feedback + + +
+ We consider the Pareto set identification (PSI) problem in multi-objective +multi-armed bandits (MO-MAB) with contaminated reward observations. At each arm +pull, with some fixed probability, the true reward samples are replaced with +the samples from an arbitrary contamination distribution chosen by an +adversary. We consider ({\alpha}, {\delta})-PAC PSI and propose a sample +median-based multi-objective adaptive elimination algorithm that returns an +({\alpha}, {\delta})-PAC Pareto set upon termination with a sample complexity +bound that depends on the contamination probability. As the contamination +probability decreases, we recover the well-known sample complexity results in +MO-MAB. We compare the proposed algorithm with a mean-based method from MO-MAB +literature, as well as an extended version that uses median estimators, on +several PSI problems under adversarial corruptions, including review bombing +and diabetes management. Our numerical results support our theoretical findings +and demonstrate that robust algorithm design is crucial for accurate PSI under +contaminated reward observations. + +
+
+
+
+
+ + ♻ ☆ Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction + in LBSN + + +
+ The fast development of location-based social networks (LBSNs) has led to +significant changes in society, resulting in popular studies of using LBSN data +for socioeconomic prediction, e.g., regional population and commercial activity +estimation. Existing studies design various graphs to model heterogeneous LBSN +data, and further apply graph representation learning methods for socioeconomic +prediction. However, these approaches heavily rely on heuristic ideas and +expertise to extract task-relevant knowledge from diverse data, which may not +be optimal for specific tasks. Additionally, they tend to overlook the inherent +relationships between different indicators, limiting the prediction accuracy. +Motivated by the remarkable abilities of large language models (LLMs) in +commonsense reasoning, embedding, and multi-agent collaboration, in this work, +we synergize LLM agents and knowledge graph for socioeconomic prediction. We +first construct a location-based knowledge graph (LBKG) to integrate +multi-sourced LBSN data. Then we leverage the reasoning power of LLM agent to +identify relevant meta-paths in the LBKG for each type of socioeconomic +prediction task, and design a semantic-guided attention module for knowledge +fusion with meta-paths. Moreover, we introduce a cross-task communication +mechanism to further enhance performance by enabling knowledge sharing across +tasks at both LLM agent and KG levels. On the one hand, the LLM agents for +different tasks collaborate to generate more diverse and comprehensive +meta-paths. On the other hand, the embeddings from different tasks are +adaptively merged for better socioeconomic prediction. Experiments on two +datasets demonstrate the effectiveness of the synergistic design between LLM +and KG, providing insights for information sharing across socioeconomic +prediction tasks. + +
+
+
+
+
+ + ♻ ☆ On the existence of minimizers in shallow residual ReLU neural network + optimization landscapes + + +
+ In this article, we show existence of minimizers in the loss landscape for +residual artificial neural networks (ANNs) with multi-dimensional input layer +and one hidden layer with ReLU activation. Our work contrasts earlier results +in [D. Gallon, A. Jentzen, and F. Lindner, preprint, arXiv:2211.15641, 2022] +and [P. Petersen, M. Raslan, and F. Voigtlaender, Found. Comput. Math., 21 +(2021), pp. 375-444] which showed that in many situations minimizers do not +exist for common smooth activation functions even in the case where the target +functions are polynomials. The proof of the existence property makes use of a +closure of the search space containing all functions generated by ANNs and +additional discontinuous generalized responses. As we will show, the additional +generalized responses in this larger space are suboptimal so that the minimum +is attained in the original function class. + +
+
+ comment: Author's Accepted Manuscript version. To appear in SINUM +
+
+
+
+
+ + ♻ ☆ Asymptotic and Non-Asymptotic Convergence of AdaGrad for Non-Convex + Optimization via Novel Stopping Time-based Analysis + + +
+ Adaptive optimizers have emerged as powerful tools in deep learning, +dynamically adjusting the learning rate based on iterative gradients. These +adaptive methods have significantly succeeded in various deep learning tasks, +outperforming stochastic gradient descent (SGD). However, despite AdaGrad's +status as a cornerstone of adaptive optimization, its theoretical analysis has +not adequately addressed key aspects such as asymptotic convergence and +non-asymptotic convergence rates in non-convex optimization scenarios. This +study aims to provide a comprehensive analysis of AdaGrad, filling the existing +gaps in the literature. We introduce an innovative stopping time technique from +probabilistic theory, which allows us to establish the stability of AdaGrad +under mild conditions for the first time. We further derive the asymptotically +almost sure and mean-square convergence for AdaGrad. In addition, we +demonstrate the near-optimal non-asymptotic convergence rate measured by the +average-squared gradients in expectation, which is stronger than the existing +high-probability results. The techniques developed in this work are potentially +of independent interest for future research on other adaptive stochastic +algorithms. + +
+
+ comment: 50 pages +
+
+
+
+
+ + ♻ ☆ Weak-to-Strong Search: Align Large Language Models via Searching over + Small Language Models NeurIPS 2024 + + +
+ Large language models are usually fine-tuned to align with human preferences. +However, fine-tuning a large language model can be challenging. In this work, +we introduce $\textit{weak-to-strong search}$, framing the alignment of a large +language model as a test-time greedy search to maximize the log-probability +difference between small tuned and untuned models while sampling from the +frozen large model. This method serves both as (1) a compute-efficient model +up-scaling strategy that avoids directly tuning the large model and as (2) an +instance of weak-to-strong generalization that enhances a strong model with +weak test-time guidance. Empirically, we demonstrate the flexibility of +weak-to-strong search across different tasks. In controlled-sentiment +generation and summarization, we use tuned and untuned $\texttt{gpt2}$s to +improve the alignment of large models without additional training. Crucially, +in a more difficult instruction-following benchmark, AlpacaEval 2.0, we show +that reusing off-the-shelf small models (e.g., $\texttt{zephyr-7b-beta}$ and +its untuned version) can improve the length-controlled win rates of both +white-box and black-box large models against $\texttt{gpt-4-turbo}$ (e.g., +$34.4\% \rightarrow 37.9\%$ for $\texttt{Llama-3-70B-Instruct}$ and $16.0\% +\rightarrow 20.1\%$ for $\texttt{gpt-3.5-turbo-instruct}$), despite the small +models' low win rates $\approx 10.0\%$. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Variational Bayesian Bow tie Neural Networks with Shrinkage + + +
+ Despite the dominant role of deep models in machine learning, limitations +persist, including overconfident predictions, susceptibility to adversarial +attacks, and underestimation of variability in predictions. The Bayesian +paradigm provides a natural framework to overcome such issues and has become +the gold standard for uncertainty estimation with deep models, also providing +improved accuracy and a framework for tuning critical hyperparameters. However, +exact Bayesian inference is challenging, typically involving variational +algorithms that impose strong independence and distributional assumptions. +Moreover, existing methods are sensitive to the architectural choice of the +network. We address these issues by constructing a relaxed version of the +standard feed-forward rectified neural network, and employing Polya-Gamma data +augmentation tricks to render a conditionally linear and Gaussian model. +Additionally, we use sparsity-promoting priors on the weights of the neural +network for data-driven architectural design. To approximate the posterior, we +derive a variational inference algorithm that avoids distributional assumptions +and independence across layers and is a faster alternative to the usual Markov +Chain Monte Carlo schemes. + +
+
+
+
+
+ + ♻ ☆ S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized + Variational Autoencoder for Video Prediction + + +
+ We address the video prediction task by putting forth a novel model that +combines (i) a novel hierarchical residual learning vector quantized +variational autoencoder (HR-VQVAE), and (ii) a novel autoregressive +spatiotemporal predictive model (AST-PM). We refer to this approach as a +sequential hierarchical residual learning vector quantized variational +autoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE +at modeling still images with a parsimonious representation, combined with the +AST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better +deal with major challenges in video prediction. These include learning +spatiotemporal information, handling high dimensional data, combating blurry +prediction, and implicit modeling of physical characteristics. Extensive +experimental results on four challenging tasks, namely KTH Human Action, +TrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably +against state-of-the-art video prediction techniques both in quantitative and +qualitative evaluations despite a much smaller model size. Finally, we boost +S-HR-VQVAE by proposing a novel training method to jointly estimate the +HR-VQVAE and AST-PM parameters. + +
+
+ comment: 12 pages, 6 figures, 5 tables. Accepted for publication on IEEE + Transactions on Multimedia on 2024-11-19 +
+
+
+
+
+ + ♻ ☆ Mixed-Output Gaussian Process Latent Variable Models + + +
+ This work develops a Bayesian non-parametric approach to signal separation +where the signals may vary according to latent variables. Our key contribution +is to augment Gaussian Process Latent Variable Models (GPLVMs) for the case +where each data point comprises the weighted sum of a known number of pure +component signals, observed across several input locations. Our framework +allows arbitrary non-linear variations in the signals while being able to +incorporate useful priors for the linear weights, such as summing-to-one. Our +contributions are particularly relevant to spectroscopy, where changing +conditions may cause the underlying pure component signals to vary from sample +to sample. To demonstrate the applicability to both spectroscopy and other +domains, we consider several applications: a near-infrared spectroscopy dataset +with varying temperatures, a simulated dataset for identifying flow +configuration through a pipe, and a dataset for determining the type of rock +from its reflectance. + +
+
+
+
+
+ + ♻ ☆ Wavelets Are All You Need for Autoregressive Image Generation + + +
+ In this paper, we take a new approach to autoregressive image generation that +is based on two main ingredients. The first is wavelet image coding, which +allows us to tokenize the visual details of an image from coarse to fine details +by ordering the information starting with the most significant bits of the most +significant wavelet coefficients. The second is a variant of a language +transformer whose architecture is re-designed and optimized for token sequences +in this 'wavelet language'. The transformer learns the significant statistical +correlations within a token sequence, which are the manifestations of +well-known correlations between the wavelet subbands at various resolutions. We +show experimental results with conditioning on the generation process. + +
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Rethinking cluster-conditioned diffusion models for label-free image + synthesis + + +
+ Diffusion-based image generation models can enhance image quality when +conditioned on ground truth labels. Here, we conduct a comprehensive +experimental study on image-level conditioning for diffusion models using +cluster assignments. We investigate how individual clustering determinants, +such as the number of clusters and the clustering method, impact image +synthesis across three different datasets. Given the optimal number of clusters +with respect to image synthesis, we show that cluster-conditioning can achieve +state-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for +CIFAR100, along with a strong increase in training sample efficiency. We +further propose a novel empirical method to estimate an upper bound for the +optimal number of clusters. Unlike existing approaches, we find no significant +association between clustering performance and the corresponding +cluster-conditional FID scores. The code is available at +https://github.com/HHU-MMBS/cedm-official-wavc2025. + +
+
+ comment: Accepted at WACV 2025 (21 pages, 15 figures). Code is available at + https://github.com/HHU-MMBS/cedm-official-wavc2025 +
+
+
+
+
+ + ♻ ☆ Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP + Model Evaluation + + +
+ With the development and proliferation of large, complex, black-box models +for solving many natural language processing (NLP) tasks, there is also an +increasing necessity of methods to stress-test these models and provide some +degree of interpretability or explainability. While counterfactual examples are +useful in this regard, automated generation of counterfactuals is a data and +resource intensive process. Such methods depend on models such as pre-trained +language models that are then fine-tuned on auxiliary, often task-specific +datasets, that may be infeasible to build in practice, especially for new tasks +and data domains. Therefore, in this work we explore the possibility of +leveraging large language models (LLMs) for zero-shot counterfactual generation +in order to stress-test NLP models. We propose a structured pipeline to +facilitate this generation, and we hypothesize that the instruction-following +and textual understanding capabilities of recent LLMs can be effectively +leveraged for generating high quality counterfactuals in a zero-shot manner, +without requiring any training or fine-tuning. Through comprehensive +experiments on a variety of proprietary and open-source LLMs, along with +various downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot +counterfactual generators in evaluating and explaining black-box NLP models. + +
+
+ comment: Longer version of short paper accepted at IEEE BigData 2024 (Main + Track) +
+
+
+
+
+ + ♻ ☆ Interpretable Fusion Analytics Framework for fMRI Connectivity: + Self-Attention Mechanism and Latent Space Item-Response Model + + +
+ There have been several attempts to use deep learning based on brain fMRI +signals to classify cognitive impairment diseases. However, deep learning is a +hidden black box model that makes it difficult to interpret the process of +classification. To address this issue, we propose a novel analytical framework +that interprets the classification result from deep learning processes. We +first derive the region of interest (ROI) functional connectivity network (FCN) +by embedding functions based on their similar signal patterns. Then, using the +self-attention equipped deep learning model, we classify diseases based on +their FCN. Finally, in order to interpret the classification results, we employ +a latent space item-response interaction network model to identify the +significant functions that exhibit distinct connectivity patterns when compared +to other diseases. The application of this proposed framework to the four types +of cognitive impairment shows that our approach is valid for determining the +significant ROI functions. + +
+
+ comment: This submission is a duplicate of another manuscript from our + research group [arXiv preprint arXiv:2401.09028] due to a misunderstanding in + communication among co-authors +
+
+
+
+
+ + ♻ ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ A-BDD: Leveraging Data Augmentations for Safe Autonomous Driving in + Adverse Weather and Lighting + + +
+ High-autonomy vehicle functions rely on machine learning (ML) algorithms to +understand the environment. Despite displaying remarkable performance in fair +weather scenarios, perception algorithms are heavily affected by adverse +weather and lighting conditions. To overcome these difficulties, ML engineers +mainly rely on comprehensive real-world datasets. However, the difficulties in +real-world data collection for critical areas of the operational design domain +(ODD) often mean synthetic data is required for perception training and safety +validation. Thus, we present A-BDD, a large set of over 60,000 synthetically +augmented images based on BDD100K that are equipped with semantic segmentation +and bounding box annotations (inherited from the BDD100K dataset). The dataset +contains augmented data for rain, fog, overcast and sunglare/shadow with +varying intensity levels. We further introduce novel strategies utilizing +feature-based image quality metrics like FID and CMMD, which help identify +useful augmented and real-world data for ML training and testing. By conducting +experiments on A-BDD, we provide evidence that data augmentations can play a +pivotal role in closing performance gaps in adverse weather and lighting +conditions. + +
+
+
+
+
+ + ♻ ☆ XLand-MiniGrid: Scalable Meta-Reinforcement Learning Environments in JAX NeurIPS 2024 + + +
+ Inspired by the diversity and depth of XLand and the simplicity and +minimalism of MiniGrid, we present XLand-MiniGrid, a suite of tools and +grid-world environments for meta-reinforcement learning research. Written in +JAX, XLand-MiniGrid is designed to be highly scalable and can potentially run +on GPU or TPU accelerators, democratizing large-scale experimentation with +limited resources. Along with the environments, XLand-MiniGrid provides +pre-sampled benchmarks with millions of unique tasks of varying difficulty and +easy-to-use baselines that allow users to quickly start training adaptive +agents. In addition, we have conducted a preliminary analysis of scaling and +generalization, showing that our baselines are capable of reaching millions of +steps per second during training and validating that the proposed benchmarks +are challenging. XLand-MiniGrid is open-source and available at +https://github.com/dunnolab/xland-minigrid. + +
+
+ comment: Neural Information Processing Systems (NeurIPS 2024) Track on + Datasets and Benchmarks. Source code at + https://github.com/dunnolab/xland-minigrid +
+
+
+
+
+ + ♻ ☆ Smoke and Mirrors in Causal Downstream Tasks + + +
+ Machine Learning and AI have the potential to transform data-driven +scientific discovery, enabling accurate predictions for several scientific +phenomena. As many scientific questions are inherently causal, this paper looks +at the causal inference task of treatment effect estimation, where the outcome +of interest is recorded in high-dimensional observations in a Randomized +Controlled Trial (RCT). Despite being the simplest possible causal setting and +a perfect fit for deep learning, we theoretically find that many common choices +in the literature may lead to biased estimates. To test the practical impact of +these considerations, we recorded ISTAnt, the first real-world benchmark for +causal inference downstream tasks on high-dimensional observations as an RCT +studying how garden ants (Lasius neglectus) respond to microparticles applied +onto their colony members by hygienic grooming. Comparing 6 480 models +fine-tuned from state-of-the-art visual backbones, we find that the sampling +and modeling choices significantly affect the accuracy of the causal estimate, +and that classification accuracy is not a proxy thereof. We further validated +the analysis, repeating it on a synthetically generated visual data set +controlling the causal model. Our results suggest that future benchmarks should +carefully consider real downstream scientific questions, especially causal +ones. Further, we highlight guidelines for representation learning methods to +help answer causal questions in the sciences. + +
+
+
+
+
+ + ♻ ☆ Diffusion-Based Semantic Segmentation of Lumbar Spine MRI Scans of Lower + Back Pain Patients ML4H + + +
+ This study introduces a diffusion-based framework for robust and accurate +segmentation of vertebrae, intervertebral discs (IVDs), and spinal canal from +Magnetic Resonance Imaging (MRI) scans of patients with low back pain (LBP), +regardless of whether the scans are T1w or T2-weighted. The results showed that +SpineSegDiff performed comparably to, or outperformed, non-diffusion state-of-the-art +models in the identification of degenerated IVDs. Our findings highlight the +potential of diffusion models to improve LBP diagnosis and management through +precise spine MRI analysis. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 5 pages +
+
+
+
+
+ + ♻ ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose CLIPFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, CLIPFit can +improve the performance of zero-shot CLIP by 7.27\% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at https://github.com/minglllli/CLIPFit. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ PAPAL: A Provable PArticle-based Primal-Dual ALgorithm for Mixed Nash + Equilibrium + + +
+ We consider the non-convex non-concave objective function in two-player +zero-sum continuous games. The existence of pure Nash equilibrium requires +stringent conditions, posing a major challenge for this problem. To circumvent +this difficulty, we examine the problem of identifying a mixed Nash +equilibrium, where strategies are randomized and characterized by probability +distributions over continuous domains. To this end, we propose PArticle-based +Primal-dual ALgorithm (PAPAL) tailored for a weakly entropy-regularized min-max +optimization over probability distributions. This algorithm employs the +stochastic movements of particles to represent the updates of random strategies +for the $\epsilon$-mixed Nash equilibrium. We offer a comprehensive convergence +analysis of the proposed algorithm, demonstrating its effectiveness. In +contrast to prior research that attempted to update particle importance without +movements, PAPAL is the first implementable particle-based algorithm +accompanied by non-asymptotic quantitative convergence results, running time, +and sample complexity guarantees. Our framework contributes novel insights into +the particle-based algorithms for continuous min-max optimization in the +general non-convex non-concave setting. + +
+
+ comment: Published in Journal of Machine Learning Research 25 (2024) 1-48 +
+
+
+
+
+ + ♻ ☆ Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to + Enhance Cell Segmentation + + +
+ Automated cell segmentation in microscopy images is essential for biomedical +research, yet conventional methods are labor-intensive and prone to error. +While deep learning-based approaches have proven effective, they often require +large annotated datasets, which are scarce due to the challenges of manual +annotation. To overcome this, we propose a novel framework for synthesizing +densely annotated 2D and 3D cell microscopy images using cascaded diffusion +models. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations +using multi-level diffusion models and NeuS, a 3D surface reconstruction +approach. Following that, a pretrained 2D Stable Diffusion model is finetuned +to generate realistic cell textures and the final outputs are combined to form +cell populations. We show that training a segmentation model with a combination +of our synthetic data and real data improves cell segmentation performance by +up to 9\% across multiple datasets. Additionally, the FID scores indicate that +the synthetic data closely resembles real data. The code for our proposed +approach will be available at +https://github.com/ruveydayilmaz0/cascaded_diffusion. + +
+
+
+
+
+ + ♻ ☆ GNN-Based Code Annotation Logic for Establishing Security Boundaries in + C Code + + +
+ Securing sensitive operations in today's interconnected software landscape is +crucial yet challenging. Modern platforms rely on Trusted Execution +Environments (TEEs), such as Intel SGX and ARM TrustZone, to isolate security +sensitive code from the main system, reducing the Trusted Computing Base (TCB) +and providing stronger assurances. However, identifying which code should +reside in TEEs is complex and requires specialized expertise, which is not +supported by current automated tools. Existing solutions often migrate entire +applications to TEEs, leading to suboptimal use and an increased TCB. To +address this gap, we propose Code Annotation Logic (CAL), a pioneering tool +that automatically identifies security sensitive components for TEE isolation. +CAL analyzes codebases, leveraging a graph-based approach with novel feature +construction and employing a custom graph neural network model to accurately +determine which parts of the code should be isolated. CAL effectively optimizes +TCB, reducing the burden of manual analysis and enhancing overall security. Our +contributions include the definition of security sensitive code, the +construction and labeling of a comprehensive dataset of source files, a feature +rich graph based data preparation pipeline, and the CAL model for TEE +integration. Evaluation results demonstrate CAL's efficacy in identifying +sensitive code with a recall of 86.05%, an F1 score of 81.56%, and an +identification rate of 91.59% for security sensitive functions. By enabling +efficient code isolation, CAL advances the secure development of applications +using TEEs, offering a practical solution for developers to reduce attack +vectors. + +
+
+ comment: Submitted +
+
+
+
+
+ + ♻ ☆ An embedding-based distance for temporal graphs + + +
+ Temporal graphs are commonly used to represent time-resolved relations +between entities in many natural and artificial systems. Many techniques were +devised to investigate the evolution of temporal graphs by comparing their +state at different time points. However, quantifying the similarity between +temporal graphs as a whole is an open problem. Here, we use embeddings based on +time-respecting random walks to introduce a new notion of distance between +temporal graphs. This distance is well-defined for pairs of temporal graphs +with different numbers of nodes and different time spans. We study the case of +a matched pair of graphs, when a known relation exists between their nodes, and +the case of unmatched graphs, when such a relation is unavailable and the +graphs may be of different sizes. We use empirical and synthetic temporal +network data to show that the distance we introduce discriminates graphs with +different topological and temporal properties. We provide an efficient +implementation of the distance computation suitable for large-scale temporal +graphs. + +
+
+
+
+
+ + ♻ ☆ Fair Generalized Linear Mixed Models + + +
+ When using machine learning for automated prediction, it is important to +account for fairness in the prediction. Fairness in machine learning aims to +ensure that biases in the data and model inaccuracies do not lead to +discriminatory decisions. E.g., predictions from fair machine learning models +should not discriminate against sensitive variables such as sexual orientation +and ethnicity. The training data is often obtained from social surveys. In +social surveys, oftentimes the data collection process is a strata sampling, +e.g. due to cost restrictions. In strata samples, the assumption of +independence between the observations is not fulfilled. Hence, if the machine +learning models do not account for the strata correlations, the results may be +biased. Especially high is the bias in cases where the strata assignment is +correlated to the variable of interest. We present in this paper an algorithm +that can handle both problems simultaneously, and we demonstrate the impact of +stratified sampling on the quality of fair machine learning predictions in a +reproducible simulation study. + +
+
+ comment: 25 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2405.06433 +
+
+
+
+
+ + ♻ ☆ TFG: Unified Training-Free Guidance for Diffusion Models + + +
+ Given an unconditional diffusion model and a predictor for a target property +of interest (e.g., a classifier), the goal of training-free guidance is to +generate samples with desirable target properties without additional training. +Existing methods, though effective in various individual applications, often +lack theoretical grounding and rigorous testing on extensive benchmarks. As a +result, they could even fail on simple tasks, and applying them to a new +problem becomes unavoidably difficult. This paper introduces a novel +algorithmic framework encompassing existing methods as special cases, unifying +the study of training-free guidance into the analysis of an algorithm-agnostic +design space. Via theoretical and empirical investigation, we propose an +efficient and effective hyper-parameter searching strategy that can be readily +applied to any downstream task. We systematically benchmark across 7 diffusion +models on 16 tasks with 40 targets, and improve performance by 8.5% on average. +Our framework and benchmark offer a solid foundation for conditional generation +in a training-free manner. + +
+
+
+
+
+ + ♻ ☆ Divide-or-Conquer? Which Part Should You Distill Your LLM? EMNLP 2024 + + +
+ Recent methods have demonstrated that Large Language Models (LLMs) can solve +reasoning tasks better when they are encouraged to solve subtasks of the main +task first. In this paper we devise a similar strategy that breaks down +reasoning tasks into a problem decomposition phase and a problem solving phase +and show that the strategy is able to outperform a single stage solution. +Further, we hypothesize that the decomposition should be easier to distill into +a smaller model compared to the problem solving because the latter requires +large amounts of domain knowledge while the former only requires learning +general problem solving strategies. We propose methods to distill these two +capabilities and evaluate their impact on reasoning outcomes and inference +cost. We find that we can distill the problem decomposition phase and at the +same time achieve good generalization across tasks, datasets, and models. +However, it is harder to distill the problem solving capability without losing +performance and the resulting distilled model struggles with generalization. +These results indicate that by using smaller, distilled problem decomposition +models in combination with problem solving LLMs we can achieve reasoning with +cost-efficient inference and local adaptation. + +
+
+ comment: Findings of the Association for Computational Linguistics: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ A semi-supervised learning using over-parameterized regression + + +
+ Semi-supervised learning (SSL) is an important theme in machine learning, in +which we have a few labeled samples and many unlabeled samples. In this paper, +for SSL in a regression problem, we consider a method of incorporating +information on unlabeled samples into kernel functions. As a typical +implementation, we employ Gaussian kernels whose centers are labeled and +unlabeled input samples. Since the number of coefficients is larger than the +number of labeled samples in this setting, this is an over-parameterized +regression problem. A ridge regression is a typical estimation method under this +setting. In this paper, alternatively, we consider applying the minimum norm +least squares (MNLS), which is known as a helpful tool for understanding deep +learning behavior while it may not be application oriented. Then, in applying +the MNLS for SSL, we established several methods based on feature +extraction/dimension reduction in the SVD (singular value decomposition) +representation of a Gram type matrix appearing in the over-parameterized +regression problem. The methods are thresholding according to singular value +magnitude with cross validation, hard-thresholding with cross validation, +universal thresholding and bridge thresholding methods. The first one is +equivalent to a method using a well-known low rank approximation of a Gram type +matrix. We refer to these methods as SVD regression methods. In the experiments +for real data, depending on datasets, clear superiority of the proposed SVD +regression methods over ridge regression methods was observed. And, depending +on datasets, incorporation of information on unlabeled input samples into +kernels was found to be clearly effective. + +
+
+
+
+
+ + ♻ ☆ Dense ReLU Neural Networks for Temporal-spatial Model + + +
+ In this paper, we focus on fully connected deep neural networks utilizing the +Rectified Linear Unit (ReLU) activation function for nonparametric estimation. +We derive non-asymptotic bounds that lead to convergence rates, addressing both +temporal and spatial dependence in the observed measurements. By accounting for +dependencies across time and space, our models better reflect the complexities +of real-world data, enhancing both predictive performance and theoretical +robustness. We also tackle the curse of dimensionality by modeling the data on +a manifold, exploring the intrinsic dimensionality of high-dimensional data. We +broaden existing theoretical findings of temporal-spatial analysis by applying +them to neural networks in more general contexts and demonstrate that our proof +techniques are effective for models with short-range dependence. Our empirical +simulations across various synthetic response functions underscore the superior +performance of our method, outperforming established approaches in the existing +literature. These findings provide valuable insights into the strong +capabilities of dense neural networks for temporal-spatial modeling across a +broad range of function classes. + +
+
+
+
+
+ + ♻ ☆ A Hybrid Data-Driven Multi-Stage Deep Learning Framework for Enhanced + Nuclear Reactor Power Prediction + + +
+ The accurate and efficient modeling of nuclear reactor transients is crucial +for ensuring safe and optimal reactor operation. Traditional physics-based +models, while valuable, can be computationally intensive and may not fully +capture the complexities of real-world reactor behavior. This paper introduces +a novel multi-stage deep learning framework that addresses these limitations, +offering a faster and more robust solution for predicting the final +steady-state power of reactor transients. By leveraging a combination of +feed-forward neural networks with both classification and regression stages, +and training on a unique dataset that integrates real-world measurements of +reactor power and controls state from the Missouri University of Science and +Technology Reactor (MSTR) with noise-enhanced simulated data, our approach +achieves remarkable accuracy (96% classification, 2.3% MAPE). The incorporation +of simulated data with noise significantly improves the model's generalization +capabilities, mitigating the risk of overfitting. This innovative solution not +only enables rapid and precise prediction of reactor behavior but also has the +potential to revolutionize nuclear reactor operations, facilitating enhanced +safety protocols, optimized performance, and streamlined decision-making +processes. + +
+
+
+
+
+ + ♻ ☆ Learning general Gaussian mixtures with efficient score matching + + +
+ We study the problem of learning mixtures of $k$ Gaussians in $d$ dimensions. +We make no separation assumptions on the underlying mixture components: we only +require that the covariance matrices have bounded condition number and that the +means and covariances lie in a ball of bounded radius. We give an algorithm +that draws $d^{\mathrm{poly}(k/\varepsilon)}$ samples from the target mixture, +runs in sample-polynomial time, and constructs a sampler whose output +distribution is $\varepsilon$-far from the unknown mixture in total variation. +Prior works for this problem either (i) required exponential runtime in the +dimension $d$, (ii) placed strong assumptions on the instance (e.g., spherical +covariances or clusterability), or (iii) had doubly exponential dependence on +the number of components $k$. + Our approach departs from commonly used techniques for this problem like the +method of moments. Instead, we leverage a recently developed reduction, based +on diffusion models, from distribution learning to a supervised learning task +called score matching. We give an algorithm for the latter by proving a +structural result showing that the score function of a Gaussian mixture can be +approximated by a piecewise-polynomial function, and there is an efficient +algorithm for finding it. To our knowledge, this is the first example of +diffusion models achieving a state-of-the-art theoretical guarantee for an +unsupervised learning task. + +
+
+ comment: 57 pages +
+
+
+
+
+ + ♻ ☆ Bullion: A Column Store for Machine Learning + + +
+ The past two decades have witnessed significant success in applying columnar +storage to data warehousing and analytics. However, the rapid growth of machine +learning poses new challenges. This paper presents Bullion, a columnar storage +system tailored for machine learning workloads. Bullion addresses the +complexities of data compliance, optimizes the encoding of long sequence sparse +features, efficiently manages wide-table projections, introduces feature +quantization in storage, enables quality-aware sequential reads for multimodal +training data, and provides a comprehensive cascading encoding framework that +unifies diverse encoding schemes through modular, composable interfaces. By +aligning with the evolving requirements of ML applications, Bullion facilitates +the application of columnar storage and processing to modern application +scenarios such as those within advertising, recommendation systems, and +Generative AI. + Preliminary experimental results and theoretical analysis demonstrate +Bullion's improved ability to deliver strong performance in the face of the +unique demands of machine learning workloads compared to existing columnar +storage solutions. Bullion significantly reduces I/O costs for deletion +compliance, achieves substantial storage savings with its optimized encoding +scheme for sparse features, and improves metadata parsing speed for wide-table +projections. These advancements enable Bullion to become an important component +in the future of machine learning infrastructure, enabling organizations to +efficiently manage and process the massive volumes of data required for +training and inference in modern AI applications. + +
+
+
+
+
+ + ♻ ☆ Taming Generative Diffusion Prior for Universal Blind Image Restoration + + +
+ Diffusion models have been widely utilized for image restoration. However, +previous blind image restoration methods still need to assume the type of +degradation model while leaving the parameters to be optimized, limiting their +real-world applications. Therefore, we aim to tame generative diffusion prior +for universal blind image restoration dubbed BIR-D, which utilizes an +optimizable convolutional kernel to simulate the degradation model and +dynamically update the parameters of the kernel in the diffusion steps, +enabling it to achieve blind image restoration results even in various complex +situations. Besides, based on mathematical reasoning, we have provided an +empirical formula for the choice of the adaptive guidance scale, eliminating the +need for a grid search for the optimal parameter. Experimentally, our BIR-D has +demonstrated superior practicality and versatility than off-the-shelf +unsupervised methods across various tasks both on real-world and synthetic +datasets, qualitatively and quantitatively. BIR-D is able to fulfill +multi-guidance blind image restoration. Moreover, BIR-D can also restore images +that undergo multiple and complicated degradations, demonstrating the practical +applications. + +
+
+ comment: 15 pages, 12 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Variational Graph Autoencoder for Heterogeneous Information Networks + with Missing and Inaccurate Attributes + + +
+ Heterogeneous Information Networks (HINs), which consist of various types of +nodes and edges, have recently demonstrated excellent performance in graph +mining. However, most existing heterogeneous graph neural networks (HGNNs) +ignore the problems of missing attributes, inaccurate attributes and scarce +labels for nodes, which limits their expressiveness. In this paper, we propose +a generative self-supervised model GraMI to address these issues +simultaneously. Specifically, GraMI first initializes all the nodes in the +graph with a low-dimensional representation matrix. After that, based on the +variational graph autoencoder framework, GraMI learns both node-level and +attribute-level embeddings in the encoder, which can provide fine-grained +semantic information to construct node attributes. In the decoder, GraMI +reconstructs both links and attributes. Instead of directly reconstructing raw +features for attributed nodes, GraMI generates the initial low-dimensional +representation matrix for all the nodes, based on which raw features of +attributed nodes are further reconstructed to leverage accurate attributes. In +this way, GraMI can not only complete informative features for non-attributed +nodes, but rectify inaccurate ones for attributed nodes. Finally, we conduct +extensive experiments to show the superiority of GraMI in tackling HINs with +missing and inaccurate attributes. + +
+
+ comment: Accepted by KDD 2025 +
+
+
+
+
+ + ♻ ☆ The Implicit Bias of Heterogeneity towards Invariance: A Study of + Multi-Environment Matrix Sensing + + +
+ Models are expected to engage in invariance learning, which involves +distinguishing the core relations that remain consistent across varying +environments to ensure the predictions are safe, robust and fair. While +existing works consider specific algorithms to realize invariance learning, we +show that model has the potential to learn invariance through standard training +procedures. In other words, this paper studies the implicit bias of Stochastic +Gradient Descent (SGD) over heterogeneous data and shows that the implicit bias +drives the model learning towards an invariant solution. We call the phenomenon +the implicit invariance learning. Specifically, we theoretically investigate +the multi-environment low-rank matrix sensing problem where in each +environment, the signal comprises (i) a lower-rank invariant part shared across +all environments; and (ii) a significantly varying environment-dependent +spurious component. The key insight is, through simply employing the large step +size large-batch SGD sequentially in each environment without any explicit +regularization, the oscillation caused by heterogeneity can provably prevent +model learning spurious signals. The model reaches the invariant solution after +certain iterations. In contrast, model learned using pooled SGD over all data +would simultaneously learn both the invariant and spurious signals. Overall, we +unveil another implicit bias that is a result of the symbiosis between the +heterogeneity of data and modern algorithms, which is, to the best of our +knowledge, first in the literature. + +
+
+
+
+
+ + ♻ ☆ Distributionally robust self-supervised learning for tabular data NeurIPS2024 + + +
+ Machine learning (ML) models trained using Empirical Risk Minimization (ERM) +often exhibit systematic errors on specific subpopulations of tabular data, +known as error slices. Learning robust representation in presence of error +slices is challenging, especially in self-supervised settings during the +feature reconstruction phase, due to high cardinality features and the +complexity of constructing error sets. Traditional robust representation +learning methods are largely focused on improving worst group performance in +supervised setting in computer vision, leaving a gap in approaches tailored for +tabular data. We address this gap by developing a framework to learn robust +representation in tabular data during self-supervised pre-training. Our +approach utilizes an encoder-decoder model trained with Masked Language +Modeling (MLM) loss to learn robust latent representations. This paper applies +the Just Train Twice (JTT) and Deep Feature Reweighting (DFR) methods during +the pre-training phase for tabular data. These methods fine-tune the ERM +pre-trained model by up-weighting error-prone samples or creating balanced +datasets for specific categorical features. This results in specialized models +for each feature, which are then used in an ensemble approach to enhance +downstream classification performance. This methodology improves robustness +across slices, thus enhancing overall generalization performance. Extensive +experiments across various datasets demonstrate the efficacy of our approach. +The code is available: +\url{https://github.com/amazon-science/distributionally-robust-self-supervised-learning-for-tabular-data}. + +
+
+ comment: TRL Workshop@NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ Gradient Normalization Provably Benefits Nonconvex SGD under + Heavy-Tailed Noise + + +
+ This paper investigates the roles of gradient normalization and clipping in +ensuring the convergence of Stochastic Gradient Descent (SGD) under +heavy-tailed noise. While existing approaches consider gradient clipping +indispensable for SGD convergence, we theoretically demonstrate that gradient +normalization alone without clipping is sufficient to ensure convergence. +Furthermore, we establish that combining gradient normalization with clipping +offers significantly improved convergence rates compared to using either +technique in isolation, notably as gradient noise diminishes. With these +results, our work provides the first theoretical evidence demonstrating the +benefits of gradient normalization in SGD under heavy-tailed noise. Finally, we +introduce an accelerated SGD variant incorporating gradient normalization and +clipping, further enhancing convergence rates under heavy-tailed noise. + +
+
+
+
+
+ + ♻ ☆ Adapting Amidst Degradation: Cross Domain Li-ion Battery Health + Estimation via Physics-Guided Test-Time Training + + +
+ Health modeling of lithium-ion batteries (LIBs) is crucial for safe and +efficient energy management and carries significant socio-economic +implications. Although Machine Learning (ML)-based State of Health (SOH) +estimation methods have made significant progress in accuracy, the scarcity of +high-quality LIB data remains a major obstacle. Existing transfer learning +methods for cross-domain LIB SOH estimation have significantly alleviated the +labeling burden of target LIB data, however, they still require sufficient +unlabeled target data (UTD) for effective adaptation to the target domain. +Collecting this UTD is challenging due to the time-consuming nature of +degradation experiments. To address this issue, we introduce a practical +Test-Time Training framework, BatteryTTT, which adapts the model continually +using each UTD collected amidst degradation, thereby significantly reducing +data collection time. To fully utilize each UTD, BatteryTTT integrates the +inherent physical laws of modern LIBs into self-supervised learning, termed +Physics-Guided Test-Time Training. Additionally, we explore the potential of +large language models (LLMs) in battery sequence modeling by evaluating their +performance in SOH estimation through model reprogramming and prefix prompt +adaptation. The combination of BatteryTTT and LLM modeling, termed GPT4Battery, +achieves state-of-the-art generalization results across current LIB benchmarks. +Furthermore, we demonstrate the practical value and scalability of our approach +by deploying it in our real-world battery management system (BMS) for 300Ah +large-scale energy storage LIBs. + +
+
+
+
+
+ + ♻ ☆ Refusal in LLMs is an Affine Function + + +
+ We propose affine concept editing (ACE) as an approach for steering language +models' behavior by intervening directly in activations. We begin with an +affine decomposition of model activation vectors and show that prior methods +for steering model behavior correspond to subsets of terms of this +decomposition. We then provide a derivation of ACE and use it to control +refusal behavior on ten different models, including Llama 3 70B. ACE combines +affine subspace projection and activation addition to reliably control the +model's refusal responses across prompt types. We evaluate the results using +LLM-based scoring on a collection of harmful and harmless prompts. Our +experiments demonstrate that ACE consistently achieves more precise control +over model behavior than existing methods and generalizes to models where +directional ablation via affine subspace projection alone produces incoherent +outputs. Code for reproducing our results is available at +https://github.com/EleutherAI/steering-llama3 . + +
+
+ comment: added plots for results from additional models +
+
+
+
+
+ + ♻ ☆ RELIEF: Reinforcement Learning Empowered Graph Feature Prompt Tuning + + +
+ The advent of the "pre-train, prompt" paradigm has recently extended its +generalization ability and data efficiency to graph representation learning, +following its achievements in Natural Language Processing (NLP). Initial graph +prompt tuning approaches tailored specialized prompting functions for Graph +Neural Network (GNN) models pre-trained with specific strategies, such as edge +prediction, thus limiting their applicability. In contrast, another pioneering +line of research has explored universal prompting via adding prompts to the +input graph's feature space, thereby removing the reliance on specific +pre-training strategies. However, the necessity to add feature prompts to all +nodes remains an open question. Motivated by findings from prompt tuning +research in the NLP domain, which suggest that highly capable pre-trained +models need less conditioning signal to achieve desired behaviors, we advocate +for strategically incorporating necessary and lightweight feature prompts to +certain graph nodes to enhance downstream task performance. This introduces a +combinatorial optimization problem, requiring a policy to decide 1) which nodes +to prompt and 2) what specific feature prompts to attach. We then address the +problem by framing the prompt incorporation process as a sequential +decision-making problem and propose our method, RELIEF, which employs +Reinforcement Learning (RL) to optimize it. At each step, the RL agent selects +a node (discrete action) and determines the prompt content (continuous action), +aiming to maximize cumulative performance gain. Extensive experiments on graph +and node-level tasks with various pre-training strategies in few-shot scenarios +demonstrate that our RELIEF outperforms fine-tuning and other prompt-based +approaches in classification performance and data efficiency. + +
+
+ comment: Accepted by SIGKDD 2025 +
+
+
+
+
+ + ♻ ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision ICPR + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. It includes 1,000 images with human +labels and 50,000 images with automatically generated weak labels. This dataset +could serve as a foundation for the research community to develop advanced +wrinkle detection algorithms. Second, we introduce a simple training strategy +utilizing texture maps, applicable to various segmentation models, to detect +wrinkles across the face. Our two-stage training strategy first pretrains models +on a large dataset with weak labels (N=50k), or masked texture maps generated +through computer vision techniques, without human intervention. We then +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. The network takes as input a combination of RGB and +masked texture map of the image, comprising four channels, in finetuning. We +effectively combine labels from multiple annotators to minimize subjectivity in +manual labeling. Our strategies demonstrate improved segmentation performance +in facial wrinkle segmentation both quantitatively and visually compared to +existing pretraining methods. The dataset is available at +https://github.com/labhai/ffhq-wrinkle-dataset. + +
+
+ comment: Accepted at International Conference on Pattern Recognition (ICPR), + 2024 +
+
+
+
+
+ + ♻ ☆ Freezing of Gait Detection Using Gramian Angular Fields and Federated + Learning from Wearable Sensors + + +
+ Freezing of gait (FOG) is a debilitating symptom of Parkinson's disease (PD) +that impairs mobility and safety. Traditional detection methods face challenges +due to intra and inter-patient variability, and most systems are tested in +controlled settings, limiting their real-world applicability. Addressing these +gaps, we present FOGSense, a novel FOG detection system designed for +uncontrolled, free-living conditions. It uses Gramian Angular Field (GAF) +transformations and federated deep learning to capture temporal and spatial +gait patterns missed by traditional methods. We evaluated our FOGSense system +using a public PD dataset, 'tdcsfog'. FOGSense improves accuracy by 10.4% over +a single-axis accelerometer, reduces failure points compared to multi-sensor +systems, and demonstrates robustness to missing values. The federated +architecture allows personalized model adaptation and efficient smartphone +synchronization during off-peak hours, making it effective for long-term +monitoring as symptoms evolve. Overall, FOGSense achieves a 22.2% improvement +in F1-score compared to state-of-the-art methods, along with enhanced +sensitivity for FOG episode detection. Code is available: +https://github.com/shovito66/FOGSense. + +
+
+
+
+
+ + ♻ ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. All +the code, models, demo and organized data have been open sourced on our Github +Repo. + +
+
+ comment: Camera Ready Version. Project Page: + https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data: + https://github.com/liming-ai/ControlNet_Plus_Plus +
+
+
+
+
+ + ♻ ☆ Lifelong Reinforcement Learning via Neuromodulation + + +
+ Navigating multiple tasks$\unicode{x2014}$for instance in succession as in +continual or lifelong learning, or in distributions as in meta or multi-task +learning$\unicode{x2014}$requires some notion of adaptation. Evolution over +timescales of millennia has imbued humans and other animals with highly +effective adaptive learning and decision-making strategies. Central to these +functions are so-called neuromodulatory systems. In this work we introduce an +abstract framework for integrating theories and evidence from neuroscience and +the cognitive sciences into the design of adaptive artificial reinforcement +learning algorithms. We give a concrete instance of this framework built on +literature surrounding the neuromodulators Acetylcholine (ACh) and +Noradrenaline (NA), and empirically validate the effectiveness of the resulting +adaptive algorithm in a non-stationary multi-armed bandit problem. We conclude +with a theory-based experiment proposal providing an avenue to link our +framework back to efforts in experimental neuroscience. + +
+
+
+
+
+ + ♻ ☆ Beyond Perceptual Distances: Rethinking Disparity Assessment for + Out-of-Distribution Detection with Diffusion Models + + +
+ Out-of-Distribution (OoD) detection aims to justify whether a given sample is +from the training distribution of the classifier-under-protection, i.e., +In-Distribution (InD), or from OoD. Diffusion Models (DMs) are recently +utilized in OoD detection by using the perceptual distances between the given +image and its DM generation. DM-based methods bring fresh insights to the +field, yet remain under-explored. + In this work, we point out two main limitations in DM-based OoD detection +methods: (i) the perceptual metrics on the disparities between the given sample +and its generation are devised only at human-perceived levels, ignoring the +abstract or high-level patterns that help better reflect the intrinsic +disparities in distribution; (ii) only the raw image contents are taken to +measure the disparities, while other representations, i.e., the features and +probabilities from the classifier-under-protection, are easy to access at hand +but are ignored. To this end, our proposed detection framework goes beyond the +perceptual distances and looks into the deep representations from the +classifier-under-protection with our novel metrics devised correspondingly, +leading to more informative disparity assessments between InD and OoD. An +anomaly-removal strategy is integrated to remove the abnormal OoD information +in the generation, further enhancing the distinctiveness of disparities. Our +work has demonstrated state-of-the-art detection performances among DM-based +methods in extensive experiments. + +
+
+
+
+
+ + ♻ ☆ Multi-LoRA Composition for Image Generation + + +
+ Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models +for the accurate rendition of specific elements like distinct characters or +unique styles in generated images. Nonetheless, existing methods face +challenges in effectively composing multiple LoRAs, especially as the number of +LoRAs to be integrated grows, thus hindering the creation of complex imagery. +In this paper, we study multi-LoRA composition through a decoding-centric +perspective. We present two training-free methods: LoRA Switch, which +alternates between different LoRAs at each denoising step, and LoRA Composite, +which simultaneously incorporates all LoRAs to guide more cohesive image +synthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new +comprehensive testbed as part of this research. It features a diverse range of +LoRA categories with 480 composition sets. Utilizing an evaluation framework +based on GPT-4V, our findings demonstrate a clear improvement in performance +with our methods over the prevalent baseline, particularly evident when +increasing the number of LoRAs in a composition. The code, benchmarks, LoRA +weights, and all evaluation details are available on our project website: +https://maszhongming.github.io/Multi-LoRA-Composition. + +
+
+ comment: Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+ + ♻ ☆ A universal approximation theorem for nonlinear resistive networks + + +
+ Resistor networks have recently attracted interest as analog computing +platforms for machine learning, particularly due to their compatibility with +the Equilibrium Propagation training framework. In this work, we explore the +computational capabilities of these networks. We prove that electrical networks +consisting of voltage sources, linear resistors, diodes, and voltage-controlled +voltage sources (VCVS) can approximate any continuous function to arbitrary +precision. Central to our proof is a method for translating a ReLU neural +network into an approximately equivalent electrical network comprising these +four elements. Our proof relies on two assumptions: (a) circuit elements are +ideal, and (b) variable resistor conductances and VCVS amplification factors +can take any value (arbitrarily small or large). Our findings provide insights +that could guide the development of universal self-learning electrical +networks. + +
+
+
+
+
+ + ♻ ☆ Different Horses for Different Courses: Comparing Bias Mitigation + Algorithms in ML NeurIPS 2024 + + +
+ With fairness concerns gaining significant attention in Machine Learning +(ML), several bias mitigation techniques have been proposed, often compared +against each other to find the best method. These benchmarking efforts tend to +use a common setup for evaluation under the assumption that providing a uniform +environment ensures a fair comparison. However, bias mitigation techniques are +sensitive to hyperparameter choices, random seeds, feature selection, etc., +meaning that comparison on just one setting can unfairly favour certain +algorithms. In this work, we show significant variance in fairness achieved by +several algorithms and the influence of the learning pipeline on fairness +scores. We highlight that most bias mitigation techniques can achieve +comparable performance, given the freedom to perform hyperparameter +optimization, suggesting that the choice of the evaluation parameters-rather +than the mitigation technique itself-can sometimes create the perceived +superiority of one method over another. We hope our work encourages future +research on how various choices in the lifecycle of developing an algorithm +impact fairness, and trends that guide the selection of appropriate algorithms. + +
+
+ comment: To appear at AFME@NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Diversity-Driven Synthesis: Enhancing Dataset Distillation through + Directed Weight Adjustment + + +
+ The sharp increase in data-related expenses has motivated research into +condensing datasets while retaining the most informative features. Dataset +distillation has thus recently come to the fore. This paradigm generates +synthetic datasets that are representative enough to replace the original +dataset in training a neural network. To avoid redundancy in these synthetic +datasets, it is crucial that each element contains unique features and remains +diverse from others during the synthesis stage. In this paper, we provide a +thorough theoretical and empirical analysis of diversity within synthesized +datasets. We argue that enhancing diversity can improve the parallelizable yet +isolated synthesizing approach. Specifically, we introduce a novel method that +employs dynamic and directed weight adjustment techniques to modulate the +synthesis process, thereby maximizing the representativeness and diversity of +each synthetic instance. Our method ensures that each batch of synthetic data +mirrors the characteristics of a large, varying subset of the original dataset. +Extensive experiments across multiple datasets, including CIFAR, Tiny-ImageNet, +and ImageNet-1K, demonstrate the superior performance of our method, +highlighting its effectiveness in producing diverse and representative +synthetic datasets with minimal computational expense. Our code is available at +https://github.com/AngusDujw/Diversity-Driven-Synthesis. + +
+
+
+
+
+ + ♻ ☆ Unveiling and Mitigating Bias in Large Language Model Recommendations: A + Path to Fairness + + +
+ Large Language Model (LLM)-based recommendation systems provide more +comprehensive recommendations than traditional systems by deeply analyzing +content and user behavior. However, these systems often exhibit biases, +favoring mainstream content while marginalizing non-traditional options due to +skewed training data. This study investigates the intricate relationship +between bias and LLM-based recommendation systems, with a focus on music, song, +and book recommendations across diverse demographic and cultural groups. +Through a comprehensive analysis conducted over different LLM-models, this +paper evaluates the impact of bias on recommendation outcomes. Our findings +highlight that biases are not only deeply embedded but also widely pervasive +across these systems, emphasizing the substantial and widespread nature of the +issue. Moreover, contextual information, such as socioeconomic status, further +amplifies these biases, demonstrating the complexity and depth of the challenges +faced in creating fair recommendations across different groups. + +
+
+
+
+
+ + ♻ ☆ Separable DeepONet: Breaking the Curse of Dimensionality in + Physics-Informed Machine Learning + + +
+ The deep operator network (DeepONet) is a popular neural operator +architecture that has shown promise in solving partial differential equations +(PDEs) by using deep neural networks to map between infinite-dimensional +function spaces. In the absence of labeled datasets, we utilize the PDE +residual loss to learn the physical system, an approach known as +physics-informed DeepONet. This method faces significant computational +challenges, primarily due to the curse of dimensionality, as the computational +cost increases exponentially with finer discretization. In this paper, we +introduce the Separable DeepONet framework to address these challenges and +improve scalability for high-dimensional PDEs. Our approach involves a +factorization technique where sub-networks handle individual one-dimensional +coordinates, thereby reducing the number of forward passes and the size of the +Jacobian matrix. By using forward-mode automatic differentiation, we further +optimize the computational cost related to the Jacobian matrix. As a result, +our modifications lead to a linear scaling of computational cost with +discretization density, making Separable DeepONet suitable for high-dimensional +PDEs. We validate the effectiveness of the separable architecture through three +benchmark PDE models: the viscous Burgers equation, Biot's consolidation +theory, and a parametrized heat equation. In all cases, our proposed framework +achieves comparable or improved accuracy while significantly reducing +computational time compared to conventional DeepONet. These results demonstrate +the potential of Separable DeepONet in efficiently solving complex, +high-dimensional PDEs, advancing the field of physics-informed machine +learning. + +
+
+ comment: 23 Pages, 9 Figures and 1 Table +
+
+
+
+
+
+
+
+ + Artificial Intelligence 110 + +
+
+
+ + ☆ ACING: Actor-Critic for Instruction Learning in Black-Box Large Language + Models + + +
+ The effectiveness of Large Language Models (LLMs) in solving tasks vastly +depends on the quality of the instructions, which often require fine-tuning +through extensive human effort. This highlights the need for automated +instruction optimization; however, this optimization is particularly +challenging when dealing with black-box LLMs, where model parameters and +gradients remain inaccessible. We propose ACING, a task-specific prompt +optimization approach framed as a stateless continuous-action Reinforcement +Learning (RL) problem, known as the continuum bandit setting. ACING leverages +an actor-critic-based method to optimize prompts, learning from +non-differentiable reward signals. We validate ACING by optimizing prompts for +ChatGPT on 30 instruction-based tasks. ACING consistently outperforms baseline +methods, achieving a median score improvement of 10 percentage points. +Furthermore, ACING not only recovers but also surpasses human-crafted expert +instructions, achieving up to a 39 percentage point improvement against human +benchmarks. + +
+
+
+
+
+ + ☆ Benchmarking Positional Encodings for GNNs and Graph Transformers + + +
+ Recent advances in Graph Neural Networks (GNNs) and Graph Transformers (GTs) +have been driven by innovations in architectures and Positional Encodings +(PEs), which are critical for augmenting node features and capturing graph +topology. PEs are essential for GTs, where topological information would +otherwise be lost without message-passing. However, PEs are often tested +alongside novel architectures, making it difficult to isolate their effect on +established models. To address this, we present a comprehensive benchmark of +PEs in a unified framework that includes both message-passing GNNs and GTs. We +also establish theoretical connections between MPNNs and GTs and introduce a +sparsified GRIT attention mechanism to examine the influence of global +connectivity. Our findings demonstrate that previously untested combinations of +GNN architectures and PEs can outperform existing methods and offer a more +comprehensive picture of the state-of-the-art. To support future research and +experimentation in our framework, we make the code publicly available. + +
+
+
+
+
+ + ☆ Heuristic-Free Multi-Teacher Learning + + +
+ We introduce Teacher2Task, a novel framework for multi-teacher learning that +eliminates the need for manual aggregation heuristics. Existing multi-teacher +methods typically rely on such heuristics to combine predictions from multiple +teachers, often resulting in sub-optimal aggregated labels and the propagation +of aggregation errors. Teacher2Task addresses these limitations by introducing +teacher-specific input tokens and reformulating the training process. Instead +of relying on aggregated labels, the framework transforms the training data, +consisting of ground truth labels and annotations from N teachers, into N+1 +distinct tasks: N auxiliary tasks that predict the labeling styles of the N +individual teachers, and one primary task that focuses on the ground truth +labels. This approach, drawing upon principles from multiple learning +paradigms, demonstrates strong empirical results across a range of +architectures, modalities, and tasks. + +
+
+
+
+
+ + ☆ CATCH: Complementary Adaptive Token-level Contrastive Decoding to + Mitigate Hallucinations in LVLMs + + +
+ Large Vision-Language Model (LVLM) systems have demonstrated impressive +vision-language reasoning capabilities but suffer from pervasive and severe +hallucination issues, posing significant risks in critical domains such as +healthcare and autonomous systems. Despite previous efforts to mitigate +hallucinations, a persistent issue remains: visual defect from vision-language +misalignment, creating a bottleneck in visual processing capacity. To address +this challenge, we develop Complementary Adaptive Token-level Contrastive +Decoding to Mitigate Hallucinations in LVLMs (CATCH), based on the Information +Bottleneck theory. CATCH introduces Complementary Visual Decoupling (CVD) for +visual information separation, Non-Visual Screening (NVS) for hallucination +detection, and Adaptive Token-level Contrastive Decoding (ATCD) for +hallucination mitigation. CATCH addresses issues related to visual defects that +cause diminished fine-grained feature perception and cumulative hallucinations +in open-ended scenarios. It is applicable to various visual question-answering +tasks without requiring any specific data or prior knowledge, and generalizes +robustly to new tasks without additional training, opening new possibilities +for advancing LVLM in various challenging applications. + +
+
+
+
+
+ + ☆ Enhancing Multi-Class Disease Classification: Neoplasms, Cardiovascular, + Nervous System, and Digestive Disorders Using Advanced LLMs + + +
+ In this research, we explored the improvement in terms of multi-class disease +classification via pre-trained language models over Medical-Abstracts-TC-Corpus +that spans five medical conditions. We excluded non-cancer conditions and +examined four specific diseases. We assessed four LLMs, BioBERT, XLNet, and +BERT, as well as a novel base model (LastBERT). BioBERT, which was pre-trained +on medical data, demonstrated superior performance in medical text +classification (97% accuracy). Surprisingly, XLNet followed closely (96% +accuracy), demonstrating its generalizability across domains even though it was +not pre-trained on medical data. LastBERT, a custom model based on the lighter +version of BERT, also proved competitive with 87.10% accuracy (just under +BERT's 89.33%). Our findings confirm the importance of specialized models such +as BioBERT and also support impressions around more general solutions like +XLNet and well-tuned transformer architectures with fewer parameters (in this +case, LastBERT) in medical domain tasks. + +
+
+ comment: 7 Pages, 4 tables and 11 figures. Under review in a IEEE conference +
+
+
+
+
+ + ☆ When Backdoors Speak: Understanding LLM Backdoor Attacks Through + Model-Generated Explanations + + +
+ Large Language Models (LLMs) are vulnerable to backdoor attacks, where hidden +triggers can maliciously manipulate model behavior. While several backdoor +attack methods have been proposed, the mechanisms by which backdoor functions +operate in LLMs remain underexplored. In this paper, we move beyond attacking +LLMs and investigate backdoor functionality through the novel lens of natural +language explanations. Specifically, we leverage LLMs' generative capabilities +to produce human-understandable explanations for their decisions, allowing us +to compare explanations for clean and poisoned samples. We explore various +backdoor attacks and embed the backdoor into LLaMA models for multiple tasks. +Our experiments show that backdoored models produce higher-quality explanations +for clean data compared to poisoned data, while generating significantly more +consistent explanations for poisoned data than for clean data. We further +analyze the explanation generation process, revealing that at the token level, +the explanation token of poisoned samples only appears in the final few +transformer layers of the LLM. At the sentence level, attention dynamics +indicate that poisoned inputs shift attention from the input context when +generating the explanation. These findings deepen our understanding of backdoor +attack mechanisms in LLMs and offer a framework for detecting such +vulnerabilities through explainability techniques, contributing to the +development of more secure LLMs. + +
+
+
+
+
+ + ☆ Attribute Inference Attacks for Federated Regression Tasks + + +
+ Federated Learning (FL) enables multiple clients, such as mobile phones and +IoT devices, to collaboratively train a global machine learning model while +keeping their data localized. However, recent studies have revealed that the +training phase of FL is vulnerable to reconstruction attacks, such as attribute +inference attacks (AIA), where adversaries exploit exchanged messages and +auxiliary public information to uncover sensitive attributes of targeted +clients. While these attacks have been extensively studied in the context of +classification tasks, their impact on regression tasks remains largely +unexplored. In this paper, we address this gap by proposing novel model-based +AIAs specifically designed for regression tasks in FL environments. Our +approach considers scenarios where adversaries can either eavesdrop on +exchanged messages or directly interfere with the training process. We +benchmark our proposed attacks against state-of-the-art methods using +real-world datasets. The results demonstrate a significant increase in +reconstruction accuracy, particularly in heterogeneous client datasets, a +common scenario in FL. The efficacy of our model-based AIAs makes them better +candidates for empirically quantifying privacy leakage for federated regression +tasks. + +
+
+
+
+
+ + ☆ AdaCM$^2$: On Understanding Extremely Long-Term Video with Adaptive + Cross-Modality Memory Reduction + + +
+ The advancements in large language models (LLMs) have propelled the +improvement of video understanding tasks by incorporating LLMs with visual +models. However, most existing LLM-based models (e.g., VideoLLaMA, VideoChat) +are constrained to processing short-duration videos. Recent works attempt to +understand long-term videos by extracting and compressing visual features into +a fixed memory size. Nevertheless, those methods leverage only visual modality +to merge video tokens and overlook the correlation between visual and textual +queries, leading to difficulties in effectively handling complex +question-answering tasks. To address the challenges of long videos and complex +prompts, we propose AdaCM$^2$, which, for the first time, introduces an +adaptive cross-modality memory reduction approach to video-text alignment in an +auto-regressive manner on video streams. Our extensive experiments on various +video understanding tasks, such as video captioning, video question answering, +and video classification, demonstrate that AdaCM$^2$ achieves state-of-the-art +performance across multiple datasets while significantly reducing memory usage. +Notably, it achieves a 4.5% improvement across multiple tasks in the LVU +dataset with a GPU memory consumption reduction of up to 65%. + +
+
+
+
+
+ + ☆ Enhanced Sign Language Translation between American Sign Language (ASL) + and Indian Sign Language (ISL) Using LLMs + + +
+ We have come up with a research that hopes to provide a bridge between the +users of American Sign Language and the users of spoken language and Indian +Sign Language (ISL). The research enabled us to create a novel framework that +we have developed for Learner Systems. Leveraging art of Large models to create +key features including: - Real-time translation between these two sign +languages in an efficient manner. Making LLM's capability available for +seamless translations to ISL. Here is the full study showing its implementation +in this paper. The core of the system is a sophisticated pipeline that begins +with reclassification and recognition of ASL gestures based on a strong Random +Forest Classifier. By recognizing the ASL, it is translated into text which can +be more easily processed. Highly evolved natural language NLP (Natural Language +Processing) techniques come in handy as they play a role in our LLM integration +where you then use LLMs to be able to convert the ASL text to ISL which +provides you with the intent of sentence or phrase. The final step is to +synthesize the translated text back into ISL gestures, creating an end-to-end +translation experience using RIFE-Net. This framework is tasked with key +challenges such as automatically dealing with gesture variability and +overcoming the linguistic differences between ASL and ISL. By automating the +translation process, we hope to vastly improve accessibility for sign language +users. No longer will the communication gap between ASL and ISL create +barriers; this totally cool innovation aims to bring our communities closer +together. And we believe, with full confidence in our framework, that we're +able to apply the same principles across a wide variety of sign language +dialects. + +
+
+
+
+
+ + ☆ AI Guided Early Screening of Cervical Cancer + + +
+ In order to support the creation of reliable machine learning models for +anomaly detection, this project focuses on preprocessing, enhancing, and +organizing a medical imaging dataset. There are two classifications in the +dataset: normal and abnormal, along with extra noise fluctuations. In order to +improve the photographs' quality, undesirable artifacts, including visible +medical equipment at the edges, were eliminated using central cropping. +Adjusting the brightness and contrast was one of the additional preprocessing +processes. Normalization was then performed to normalize the data. To make +classification jobs easier, the dataset was methodically handled by combining +several image subsets into two primary categories: normal and pathological. To +provide a strong training set that adapts well to real-world situations, +sophisticated picture preprocessing techniques were used, such as contrast +enhancement and real-time augmentation (including rotations, zooms, and +brightness modifications). To guarantee efficient model evaluation, the data +was subsequently divided into training and testing subsets. In order to create +precise and effective machine learning models for medical anomaly detection, +high-quality input data is ensured via this thorough approach. Because of the +project pipeline's flexible and scalable design, it can be easily integrated +with bigger clinical decision-support systems. + +
+
+
+
+
+ + ☆ Deep Learning-Driven Heat Map Analysis for Evaluating thickness of + Wounded Skin Layers + + +
+ Understanding the appropriate skin layer thickness in wounded sites is an +important tool to move forward on wound healing practices and treatment +protocols. Methods to measure depth often are invasive and less specific. This +paper introduces a novel method that is non-invasive with deep learning +techniques using classifying of skin layers that helps in measurement of wound +depth through heatmap analysis. A set of approximately 200 labeled images of +skin allows five classes to be distinguished: scars, wounds, and healthy skin, +among others. Each image has annotated key layers, namely the stratum corneum, +the epidermis, and the dermis, in the software Roboflow. In the preliminary +stage, the Heatmap generator VGG16 was used to enhance the visibility of tissue +layers, based upon which their annotated images were used to train ResNet18 +with early stopping techniques. It ended up at a very high accuracy rate of +97.67%. To do this, the comparison of the models ResNet18, VGG16, DenseNet121, +and EfficientNet has been done where both EfficientNet and ResNet18 have +attained accuracy rates of almost 95.35%. For further hyperparameter tuning, +EfficientNet and ResNet18 were trained at six different learning rates to +determine the best model configuration. It has been noted that the accuracy has +huge variations with different learning rates. In the case of EfficientNet, the +maximum achievable accuracy was 95.35% at the rate of 0.0001. The same was true +for ResNet18, which also attained its peak value of 95.35% at the same rate. +These facts indicate that the model can be applied and utilized in actual-time, +non-invasive wound assessment, which holds a great promise to improve clinical +diagnosis and treatment planning. + +
+
+
+
+
+ + ☆ Neurosymbolic Graph Enrichment for Grounded World Models + + +
+ The development of artificial intelligence systems capable of understanding +and reasoning about complex real-world scenarios is a significant challenge. In +this work we present a novel approach to enhance and exploit LLM reactive +capability to address complex problems and interpret deeply contextual +real-world meaning. We introduce a method and a tool for creating a multimodal, +knowledge-augmented formal representation of meaning that combines the +strengths of large language models with structured semantic representations. +Our method begins with an image input, utilizing state-of-the-art large +language models to generate a natural language description. This description is +then transformed into an Abstract Meaning Representation (AMR) graph, which is +formalized and enriched with logical design patterns, and layered semantics +derived from linguistic and factual knowledge bases. The resulting graph is +then fed back into the LLM to be extended with implicit knowledge activated by +complex heuristic learning, including semantic implicatures, moral values, +embodied cognition, and metaphorical representations. By bridging the gap +between unstructured language models and formal semantic structures, our method +opens new avenues for tackling intricate problems in natural language +understanding and reasoning. + +
+
+
+
+
+ + ☆ PoM: Efficient Image and Video Generation with the Polynomial Mixer + + +
+ Diffusion models based on Multi-Head Attention (MHA) have become ubiquitous +to generate high quality images and videos. However, encoding an image or a +video as a sequence of patches results in costly attention patterns, as the +requirements both in terms of memory and compute grow quadratically. To +alleviate this problem, we propose a drop-in replacement for MHA called the +Polynomial Mixer (PoM) that has the benefit of encoding the entire sequence +into an explicit state. PoM has a linear complexity with respect to the number +of tokens. This explicit state also allows us to generate frames in a +sequential fashion, minimizing memory and compute requirement, while still +being able to train in parallel. We show the Polynomial Mixer is a universal +sequence-to-sequence approximator, just like regular MHA. We adapt several +Diffusion Transformers (DiT) for generating images and videos with PoM +replacing MHA, and we obtain high quality samples while using less +computational resources. The code is available at +https://github.com/davidpicard/HoMM. + +
+
+
+
+
+ + ☆ Optimizing Airline Reservation Systems with Edge-Enabled Microservices: + A Framework for Real-Time Data Processing and Enhanced User Responsiveness + + +
+ The growing complexity of the operations of airline reservations requires a +smart solution for the adoption of novel approaches to the development of +quick, efficient, and adaptive reservation systems. This paper outlines in +detail a conceptual framework for the implementation of edge computing +microservices in order to address the shortcomings of traditional centralized +architectures. Specifically, edge computing allows for certain activities +such as seat inventory checks, booking processes and even confirmation to be +done nearer to the user, thus lessening the overall response time and improving +the performance of the system. In addition, the framework value should include +achieving the high performance of the system such as low latency, high +throughput and higher user experience. The major design components include +deployed distributed computing microservices orchestrated by Kubernetes, +real-time message processing system with Kafka and its elastic scaling. Other +operational components include Prometheus and Grafana, which are used to +monitor and manage resources, ensuring that all operational processes are +optimized. Although this research focuses on a design and theoretical scheming +of the framework, its use is foreseen to be more advantageous in facilitating a +transform in the provision of services in the airline industry by improving +customers' satisfaction, providing infrastructure which is cheap to install and +efficiently supporting technology changes such as artificial intelligence and +internet of things embedded systems. This research addresses the increasing +demand for new technologies with modern well-distributed and real-time-centric +systems and also provides a basis for future case implementation and testing. +As such, the proposed architecture offers a market-ready, extensible solution +to the problems posed by existing airline reservation systems. + +
+
+ comment: 22 pages, 11 figures +
+
+
+
+
+ + ☆ CodeXEmbed: A Generalist Embedding Model Family for Multilingual and + Multi-task Code Retrieval + + +
+ Despite the success of text retrieval in many NLP tasks, code retrieval +remains a largely underexplored area. Most text retrieval systems are tailored +for natural language queries, often neglecting the specific challenges of +retrieving code. This gap leaves existing models unable to effectively capture +the diversity of programming languages and tasks across different domains, +highlighting the need for more focused research in code retrieval. To address +this, we introduce CodeXEmbed, a family of large-scale code embedding models +ranging from 400M to 7B parameters. Our novel training pipeline unifies +multiple programming languages and transforms various code-related tasks into a +common retrieval framework, enhancing model generalizability and retrieval +performance. Our 7B model sets a new state-of-the-art (SOTA) in code retrieval, +outperforming the previous leading model, Voyage-Code, by over 20% on CoIR +benchmark. In addition to excelling in code retrieval, our models demonstrate +competitive performance on the widely adopted BeIR text retrieval benchmark, +offering versatility across domains. Experimental results demonstrate that +improving retrieval performance significantly enhances end-to-end +Retrieval-Augmented Generation (RAG) performance for code-related tasks. + +
+
+
+
+
+ + ☆ DLBacktrace: A Model Agnostic Explainability for any Deep Learning + Models + + +
+ The rapid advancement of artificial intelligence has led to increasingly +sophisticated deep learning models, which frequently operate as opaque 'black +boxes' with limited transparency in their decision-making processes. This lack +of interpretability presents considerable challenges, especially in high-stakes +applications where understanding the rationale behind a model's outputs is as +essential as the outputs themselves. This study addresses the pressing need for +interpretability in AI systems, emphasizing its role in fostering trust, +ensuring accountability, and promoting responsible deployment in +mission-critical fields. To address the interpretability challenge in deep +learning, we introduce DLBacktrace, an innovative technique developed by the +AryaXAI team to illuminate model decisions across a wide array of domains, +including simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks +(CNNs), Large Language Models (LLMs), Computer Vision Models, and more. + We provide a comprehensive overview of the DLBacktrace algorithm and present +benchmarking results, comparing its performance against established +interpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients, +SmoothGrad, and Attention Rollout, using diverse task-based metrics. The +proposed DLBacktrace technique is compatible with various model architectures +built in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP +architectures such as BERT and LSTMs, computer vision models like ResNet and +U-Net, as well as custom deep neural network (DNN) models for tabular data. +This flexibility underscores DLBacktrace's adaptability and effectiveness in +enhancing model transparency across a broad spectrum of applications. The +library is open-sourced and available at https://github.com/AryaXAI/DLBacktrace . + +
+
+
+
+
+ + ☆ Instant Policy: In-Context Imitation Learning via Graph Diffusion + + +
+ Following the impressive capabilities of in-context learning with large +transformers, In-Context Imitation Learning (ICIL) is a promising opportunity +for robotics. We introduce Instant Policy, which learns new tasks instantly +(without further training) from just one or two demonstrations, achieving ICIL +through two key components. First, we introduce inductive biases through a +graph representation and model ICIL as a graph generation problem with a +learned diffusion process, enabling structured reasoning over demonstrations, +observations, and actions. Second, we show that such a model can be trained +using pseudo-demonstrations - arbitrary trajectories generated in simulation - +as a virtually infinite pool of training data. Simulated and real experiments +show that Instant Policy enables rapid learning of various everyday robot +tasks. We also show how it can serve as a foundation for cross-embodiment and +zero-shot transfer to language-defined tasks. Code and videos are available at +https://www.robot-learning.uk/instant-policy. + +
+
+ comment: Code and videos are available on our project webpage at + https://www.robot-learning.uk/instant-policy +
+
+
+
+
+ + ☆ Estimating Dark Matter Halo Masses in Simulated Galaxy Clusters with + Graph Neural Networks NeurIPS + + +
+ Galaxies grow and evolve in dark matter halos. Because dark matter is not +visible, galaxies' halo masses ($\rm{M}_{\rm{halo}}$) must be inferred +indirectly. We present a graph neural network (GNN) model for predicting +$\rm{M}_{\rm{halo}}$ from stellar mass ($\rm{M}_{*}$) in simulated galaxy +clusters using data from the IllustrisTNG simulation suite. Unlike traditional +machine learning models like random forests, our GNN captures the +information-rich substructure of galaxy clusters by using spatial and kinematic +relationships between galaxy neighbours. A GNN model trained on the TNG-Cluster +dataset and independently tested on the TNG300 simulation achieves superior +predictive performance compared to other baseline models we tested. Future work +will extend this approach to different simulations and real observational +datasets to further validate the GNN model's ability to generalise. + +
+
+ comment: 9 pages, 4 figures, accepted at the NeurIPS ML4PS 2024 workshop +
+
+
+
+
+ + ☆ STREAM: A Universal State-Space Model for Sparse Geometric Data + + +
+ Handling sparse and unstructured geometric data, such as point clouds or +event-based vision, is a pressing challenge in the field of machine vision. +Recently, sequence models such as Transformers and state-space models entered +the domain of geometric data. These methods require specialized preprocessing +to create a sequential view of a set of points. Furthermore, prior works +involving sequence models iterate geometric data with either uniform or learned +step sizes, implicitly relying on the model to infer the underlying geometric +structure. In this work, we propose to encode geometric structure explicitly +into the parameterization of a state-space model. State-space models are based +on linear dynamics governed by a one-dimensional variable such as time or a +spatial coordinate. We exploit this dynamic variable to inject relative +differences of coordinates into the step size of the state-space model. The +resulting geometric operation computes interactions between all pairs of N +points in O(N) steps. Our model deploys the Mamba selective state-space model +with a modified CUDA kernel to efficiently map sparse geometric data to modern +hardware. The resulting sequence model, which we call STREAM, achieves +competitive results on a range of benchmarks from point-cloud classification to +event-based vision and audio classification. STREAM demonstrates a powerful +inductive bias for sparse geometric data by improving the PointMamba baseline +when trained from scratch on the ModelNet40 and ScanObjectNN point cloud +analysis datasets. It further achieves, for the first time, 100% test accuracy +on all 11 classes of the DVS128 Gestures dataset. + +
+
+
+
+
+ + ☆ Provable unlearning in topic modeling and downstream tasks + + +
+ Machine unlearning algorithms are increasingly important as legal concerns +arise around the provenance of training data, but verifying the success of +unlearning is often difficult. Provable guarantees for unlearning are often +limited to supervised learning settings. In this paper, we provide the first +theoretical guarantees for unlearning in the pre-training and fine-tuning +paradigm by studying topic models, simple bag-of-words language models that can +be adapted to solve downstream tasks like retrieval and classification. First, +we design a provably effective unlearning algorithm for topic models that +incurs a computational overhead independent of the size of the original +dataset. Our analysis additionally quantifies the deletion capacity of the +model -- i.e., the number of examples that can be unlearned without incurring a +significant cost in model performance. Finally, we formally extend our analyses +to account for adaptation to a given downstream task. In particular, we design +an efficient algorithm to perform unlearning after fine-tuning the topic model +via a linear head. Notably, we show that it is easier to unlearn pre-training +data from models that have been fine-tuned to a particular task, and one can +unlearn this data without modifying the base model. + +
+
+
+
+
+ + ☆ Whisper Finetuning on Nepali Language + + +
+ Despite the growing advancements in Automatic Speech Recognition (ASR) +models, the development of robust models for underrepresented languages, such +as Nepali, remains a challenge. This research focuses on making an exhaustive +and generalized dataset followed by fine-tuning OpenAI's Whisper models of +different sizes to improve transcription (speech-to-text) accuracy for the +Nepali language. We leverage publicly available ASR datasets and self-recorded +custom datasets with a diverse range of accents, dialects, and speaking styles +further enriched through augmentation. Our experimental results demonstrate +that fine-tuning Whisper models on our curated custom dataset substantially +reduces the Word Error Rate (WER) across all model sizes attributed to larger +data variations in terms of speaker's age, gender, and sentiment, acoustic +environment, dialect, denser audio segments (15-30 seconds) that are more +compatible with Whisper's input, and manual curation of audios and +transcriptions. Notably, our approach outperforms Whisper's baseline models +trained on Fleur's dataset, achieving WER reductions of up to 36.2% on the +small and 23.8% on medium models. Furthermore, we show that data augmentation +plays a significant role in enhancing model robustness. Our approach underlines +the importance of dataset quality, variation, and augmentation in the +adaptation of state-of-the-art models to underrepresented languages for +developing accurate ASR systems. + +
+
+
+
+
+ + ☆ Large Language Models for Combinatorial Optimization of Design Structure + Matrix + + +
+ Combinatorial optimization (CO) is essential for improving efficiency and +performance in engineering applications. As complexity increases with larger +problem sizes and more intricate dependencies, identifying the optimal solution +becomes challenging. When it comes to real-world engineering problems, +algorithms based on pure mathematical reasoning are limited and incapable of +capturing the contextual nuances necessary for optimization. This study explores +the potential of Large Language Models (LLMs) in solving engineering CO +problems by leveraging their reasoning power and contextual knowledge. We +propose a novel LLM-based framework that integrates network topology and domain +knowledge to optimize the sequencing of Design Structure Matrix (DSM)-a common +CO problem. Our experiments on various DSM cases demonstrate that the proposed +method achieves faster convergence and higher solution quality than benchmark +methods. Moreover, results show that incorporating contextual domain knowledge +significantly improves performance despite the choice of LLMs. These findings +highlight the potential of LLMs in tackling complex real-world CO problems by +combining semantic and mathematical reasoning. This approach paves the way for +a new paradigm in real-world combinatorial optimization. + +
+
+
+
+
+ + ☆ Topological Symmetry Enhanced Graph Convolution for Skeleton-Based + Action Recognition + + +
+ Skeleton-based action recognition has achieved remarkable performance with +the development of graph convolutional networks (GCNs). However, most of these +methods tend to construct complex topology learning mechanisms while neglecting +the inherent symmetry of the human body. Additionally, the use of temporal +convolutions with certain fixed receptive fields limits their capacity to +effectively capture dependencies in time sequences. To address the issues, we +(1) propose a novel Topological Symmetry Enhanced Graph Convolution (TSE-GC) to +enable distinct topology learning across different channel partitions while +incorporating topological symmetry awareness and (2) construct a Multi-Branch +Deformable Temporal Convolution (MBDTC) for skeleton-based action recognition. +The proposed TSE-GC emphasizes the inherent symmetry of the human body while +enabling efficient learning of dynamic topologies. Meanwhile, the design of +MBDTC introduces the concept of deformable modeling, leading to more flexible +receptive fields and stronger modeling capacity of temporal dependencies. +Combining TSE-GC with MBDTC, our final model, TSE-GCN, achieves competitive +performance with fewer parameters compared with state-of-the-art methods on +three large datasets, NTU RGB+D, NTU RGB+D 120, and NW-UCLA. On the +cross-subject and cross-set evaluations of NTU RGB+D 120, the accuracies of our +model reach 90.0\% and 91.1\%, with 1.1M parameters and 1.38 GFLOPS for one +stream. + +
+
+
+
+
+ + ☆ Recall and Refine: A Simple but Effective Source-free Open-set Domain + Adaptation Framework + + +
+ Open-set Domain Adaptation (OSDA) aims to adapt a model from a labeled source +domain to an unlabeled target domain, where novel classes - also referred to as +target-private unknown classes - are present. Source-free Open-set Domain +Adaptation (SF-OSDA) methods address OSDA without accessing labeled source +data, making them particularly relevant under privacy constraints. However, +SF-OSDA presents significant challenges due to distribution shifts and the +introduction of novel classes. Existing SF-OSDA methods typically rely on +thresholding the prediction entropy of a sample to identify it as either a +known or unknown class but fail to explicitly learn discriminative features for +the target-private unknown classes. We propose Recall and Refine (RRDA), a +novel SF-OSDA framework designed to address these limitations by explicitly +learning features for target-private unknown classes. RRDA employs a two-step +process. First, we enhance the model's capacity to recognize unknown classes by +training a target classifier with an additional decision boundary, guided by +synthetic samples generated from target domain features. This enables the +classifier to effectively separate known and unknown classes. In the second +step, we adapt the entire model to the target domain, addressing both domain +shifts and improving generalization to unknown classes. Any off-the-shelf +source-free domain adaptation method (e.g., SHOT, AaD) can be seamlessly +integrated into our framework at this stage. Extensive experiments on three +benchmark datasets demonstrate that RRDA significantly outperforms existing +SF-OSDA and OSDA methods. + +
+
+
+
+
+ + ☆ Predicting Customer Satisfaction by Replicating the Survey Response + Distribution + + +
+ For many call centers, customer satisfaction (CSAT) is a key performance +indicator (KPI). However, only a fraction of customers take the CSAT survey +after the call, leading to a biased and inaccurate average CSAT value, and +missed opportunities for coaching, follow-up, and rectification. Therefore, +call centers can benefit from a model predicting customer satisfaction on calls +where the customer did not complete the survey. Given that CSAT is a closely +monitored KPI, it is critical to minimize any bias in the average predicted +CSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT +(pCSAT) scores accurately replicate the distribution of survey CSAT responses +for every call center with sufficient data in a live production environment. +The method can be applied to many multiclass classification problems to improve +the class balance and minimize its changes upon model updates. + +
+
+
+
+
+ + ☆ Rethinking Top Probability from Multi-view for Distracted Driver + Behaviour Localization + + +
+ Naturalistic driving action localization task aims to recognize and +comprehend human behaviors and actions from video data captured during +real-world driving scenarios. Previous studies have shown great action +localization performance by applying a recognition model followed by +probability-based post-processing. Nevertheless, the probabilities provided by +the recognition model frequently contain confused information causing challenges +for post-processing. In this work, we adopt an action recognition model based +on self-supervised learning to detect distracted activities and give potential +action probabilities. Subsequently, a constraint ensemble strategy takes +advantage of multi-camera views to provide robust predictions. Finally, we +introduce a conditional post-processing operation to locate distracted +behaviours and action temporal boundaries precisely. Experimenting on test set +A2, our method obtains the sixth position on the public leaderboard of track 3 +of the 2024 AI City Challenge. + +
+
+ comment: Computer Vision and Pattern Recognition Workshop 2024 +
+
+
+
+
+ + ☆ The Hermeneutic Turn of AI: Is the Machine Capable of Interpreting? + + +
+ This article aims to demonstrate how the approach to computing is being +disrupted by deep learning (artificial neural networks), not only in terms of +techniques but also in our interactions with machines. It also addresses the +philosophical tradition of hermeneutics (Don Ihde, Wilhelm Dilthey) to +highlight a parallel with this movement and to demystify the idea of human-like +AI. + +
+
+ comment: 4 pages. +
+
+
+
+
+ + ☆ Transformer Neural Processes -- Kernel Regression + + +
+ Stochastic processes model various natural phenomena from disease +transmission to stock prices, but simulating and quantifying their uncertainty +can be computationally challenging. For example, modeling a Gaussian Process +with standard statistical methods incurs an $\mathcal{O}(n^3)$ penalty, and +even using state-of-the-art Neural Processes (NPs) incurs an $\mathcal{O}(n^2)$ +penalty due to the attention mechanism. We introduce the Transformer Neural +Process - Kernel Regression (TNP-KR), a new architecture that incorporates a +novel transformer block we call a Kernel Regression Block (KRBlock), which +reduces the computational complexity of attention in transformer-based Neural +Processes (TNPs) from $\mathcal{O}((n_C+n_T)^2)$ to $O(n_C^2+n_Cn_T)$ by +eliminating masked computations, where $n_C$ is the number of context, and +$n_T$ is the number of test points, respectively, and a fast attention variant +that further reduces all attention calculations to $\mathcal{O}(n_C)$ in space +and time complexity. In benchmarks spanning such tasks as meta-regression, +Bayesian optimization, and image completion, we demonstrate that the full +variant matches the performance of state-of-the-art methods while training +faster and scaling two orders of magnitude higher in number of test points, and +the fast variant nearly matches that performance while scaling to millions of +both test and context points on consumer hardware. + +
+
+
+
+
+ + ☆ Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic + Corpus NeurIPS 2024 + + +
+ Large language models (LLMs) are capable of solving a wide range of tasks, +yet they have struggled with reasoning. To address this, we propose +$\textbf{Additional Logic Training (ALT)}$, which aims to enhance LLMs' +reasoning capabilities by program-generated logical reasoning samples. We first +establish principles for designing high-quality samples by integrating symbolic +logic theory and previous empirical insights. Then, based on these principles, +we construct a synthetic corpus named $\textbf{Formal Logic Deduction Diverse}$ +($\textbf{FLD}$$^{\times 2}$), comprising numerous samples of multi-step +deduction with unknown facts, diverse reasoning rules, diverse linguistic +expressions, and challenging distractors. Finally, we empirically show that ALT +on FLD$^{\times2}$ substantially enhances the reasoning capabilities of +state-of-the-art LLMs, including LLaMA-3.1-70B. Improvements include gains of +up to 30 points on logical reasoning benchmarks, up to 10 points on math and +coding benchmarks, and 5 points on the benchmark suite BBH. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Analysing Explanation-Related Interactions in Collaborative + Perception-Cognition-Communication-Action + + +
+ Effective communication is essential in collaborative tasks, so AI-equipped +robots working alongside humans need to be able to explain their behaviour in +order to cooperate effectively and earn trust. We analyse and classify +communications among human participants collaborating to complete a simulated +emergency response task. The analysis identifies messages that relate to +various kinds of interactive explanations identified in the explainable AI +literature. This allows us to understand what type of explanations humans +expect from their teammates in such settings, and thus where AI-equipped robots +most need explanation capabilities. We find that most explanation-related +messages seek clarification in the decisions or actions taken. We also confirm +that messages have an impact on the performance of our simulated task. + +
+
+ comment: 4 pages, 3 figures, published as a Late Breaking Report in RO-MAN + 2024 +
+
+
+
+
+ + ☆ Comparing Prior and Learned Time Representations in Transformer Models + of Timeseries + + +
+ What sets timeseries analysis apart from other machine learning exercises is +that time representation becomes a primary aspect of the experiment setup, as +it must adequately represent the temporal relations that are relevant for the +application at hand. In the work described here we study two different +variations of the Transformer architecture: one where we use the fixed time +representation proposed in the literature and one where the time representation +is learned from the data. Our experiments use data from predicting the energy +output of solar panels, a task that exhibits known periodicities (daily and +seasonal) that are straight-forward to encode in the fixed time representation. +Our results indicate that even in an experiment where the phenomenon is +well-understood, it is difficult to encode prior knowledge due to side-effects +that are difficult to mitigate. We conclude that research work is needed to +work the human into the learning loop in ways that improve the robustness and +trust-worthiness of the network. + +
+
+ comment: Presented at the AI in Natural Sciences and Technology (AINST) track + of the 13th Conference on Artificial Intelligence (SETN 2024), 11-13 + September 2024, Piraeus, Greece +
+
+
+
+
+ + ☆ AI Flow at the Network Edge + + +
+ Recent advancements in large language models (LLMs) and their multimodal +variants have led to remarkable progress across various domains, demonstrating +impressive capabilities and unprecedented potential. In the era of ubiquitous +connectivity, leveraging communication networks to distribute intelligence is a +transformative concept, envisioning AI-powered services accessible at the +network edge. However, pushing large models from the cloud to +resource-constrained environments faces critical challenges. Model inference on +low-end devices leads to excessive latency and performance bottlenecks, while +raw data transmission over limited bandwidth networks causes high communication +overhead. This article presents AI Flow, a framework that streamlines the +inference process by jointly leveraging the heterogeneous resources available +across devices, edge nodes, and cloud servers, making intelligence flow across +networks. To facilitate cooperation among multiple computational nodes, the +proposed framework explores a paradigm shift in the design of communication +network systems from transmitting information flow to intelligence flow, where +the goal of communications is task-oriented and folded into the inference +process. Experimental results demonstrate the effectiveness of the proposed +framework through an image captioning use case, showcasing the ability to +reduce response latency while maintaining high-quality captions. This article +serves as a position paper for identifying the motivation, challenges, and +principles of AI Flow. + +
+
+
+
+
+ + ☆ Guide-to-Explain for Controllable Summarization + + +
+ Recently, large language models (LLMs) have demonstrated remarkable +performance in abstractive summarization tasks. However, controllable +summarization with LLMs remains underexplored, limiting their ability to +generate summaries that align with specific user preferences. In this paper, we +first investigate the capability of LLMs to control diverse attributes, +revealing that they encounter greater challenges with numerical attributes, +such as length and extractiveness, compared to linguistic attributes. To +address this challenge, we propose a guide-to-explain framework (GTE) for +controllable summarization. Our GTE framework enables the model to identify +misaligned attributes in the initial draft and guides it in explaining errors +in the previous output. Based on this reflection, the model generates a +well-adjusted summary. As a result, by allowing the model to reflect on its +misalignment, we generate summaries that satisfy the desired attributes in +surprisingly fewer iterations than other iterative methods solely using LLMs. + +
+
+
+
+
+ + ☆ Preference-Conditioned Gradient Variations for Multi-Objective + Quality-Diversity + + +
+ In a variety of domains, from robotics to finance, Quality-Diversity +algorithms have been used to generate collections of both diverse and +high-performing solutions. Multi-Objective Quality-Diversity algorithms have +emerged as a promising approach for applying these methods to complex, +multi-objective problems. However, existing methods are limited by their search +capabilities. For example, Multi-Objective Map-Elites depends on random genetic +variations which struggle in high-dimensional search spaces. Despite efforts to +enhance search efficiency with gradient-based mutation operators, existing +approaches consider updating solutions to improve on each objective separately +rather than achieving desired trade-offs. In this work, we address this +limitation by introducing Multi-Objective Map-Elites with +Preference-Conditioned Policy-Gradient and Crowding Mechanisms: a new +Multi-Objective Quality-Diversity algorithm that uses preference-conditioned +policy-gradient mutations to efficiently discover promising regions of the +objective space and crowding mechanisms to promote a uniform distribution of +solutions on the Pareto front. We evaluate our approach on six robotics +locomotion tasks and show that our method outperforms or matches all +state-of-the-art Multi-Objective Quality-Diversity methods in all six, +including two newly proposed tri-objective tasks. Importantly, our method also +achieves a smoother set of trade-offs, as measured by newly-proposed +sparsity-based metrics. This performance comes at a lower computational storage +cost compared to previous methods. + +
+
+
+
+
+ + ☆ Evaluating the Prompt Steerability of Large Language Models + + +
+ Building pluralistic AI requires designing models that are able to be shaped +to represent a wide range of value systems and cultures. Achieving this +requires first being able to evaluate the degree to which a given model is +capable of reflecting various personas. To this end, we propose a benchmark for +evaluating the steerability of model personas as a function of prompting. Our +design is based on a formal definition of prompt steerability, which analyzes +the degree to which a model's joint behavioral distribution can be shifted from +its baseline behavior. By defining steerability indices and inspecting how +these indices change as a function of steering effort, we can estimate the +steerability of a model across various persona dimensions and directions. Our +benchmark reveals that the steerability of many current models is limited -- +due to both a skew in their baseline behavior and an asymmetry in their +steerability across many persona dimensions. We release an implementation of +our benchmark at https://github.com/IBM/prompt-steering. + +
+
+
+
+
+ + ☆ Do LLMs Understand Ambiguity in Text? A Case Study in Open-world + Question Answering + + +
+ Ambiguity in natural language poses significant challenges to Large Language +Models (LLMs) used for open-domain question answering. LLMs often struggle with +the inherent uncertainties of human communication, leading to +misinterpretations, miscommunications, hallucinations, and biased responses. +This significantly weakens their ability to be used for tasks like +fact-checking, question answering, feature extraction, and sentiment analysis. +Using open-domain question answering as a test case, we compare off-the-shelf +and few-shot LLM performance, focusing on measuring the impact of explicit +disambiguation strategies. We demonstrate how simple, training-free, +token-level disambiguation methods may be effectively used to improve LLM +performance for ambiguous question answering tasks. We empirically show our +findings and discuss best practices and broader impacts regarding ambiguity in +LLMs. + +
+
+ comment: Accepted at the REU Symposium at IEEE BigData 2024 +
+
+
+
+
+ + ☆ A Layered Architecture for Developing and Enhancing Capabilities in + Large Language Model-based Software Systems + + +
+ Significant efforts have been made to expand the use of Large Language Models +(LLMs) beyond basic language tasks. While the generalizability and versatility +of LLMs have enabled widespread adoption, evolving demands in application +development often exceed their native capabilities. Meeting these demands may +involve a diverse set of methods, such as enhancing creativity through either +inference temperature adjustments or creativity-provoking prompts. Selecting +the right approach is critical, as different methods lead to trade-offs in +engineering complexity, scalability, and operational costs. This paper +introduces a layered architecture that organizes LLM software system +development into distinct layers, each characterized by specific attributes. By +aligning capabilities with these layers, the framework encourages the +systematic implementation of capabilities in effective and efficient ways that +ultimately support desired functionalities and qualities. Through practical +case studies, we illustrate the utility of the framework. This work offers +developers actionable insights for selecting suitable technologies in LLM-based +software system development, promoting robustness and scalability. + +
+
+
+
+
+ + ☆ DiM: $f$-Divergence Minimization Guided Sharpness-Aware Optimization for + Semi-supervised Medical Image Segmentation + + +
+ As a technique to alleviate the pressure of data annotation, semi-supervised +learning (SSL) has attracted widespread attention. In the specific domain of +medical image segmentation, semi-supervised methods (SSMIS) have become a +research hotspot due to their ability to reduce the need for large amounts of +precisely annotated data. SSMIS focuses on enhancing the model's generalization +performance by leveraging a small number of labeled samples and a large number +of unlabeled samples. The latest sharpness-aware optimization (SAM) technique, +which optimizes the model by reducing the sharpness of the loss function, has +shown significant success in SSMIS. However, SAM and its variants may not fully +account for the distribution differences between different datasets. To address +this issue, we propose a sharpness-aware optimization method based on +$f$-divergence minimization (DiM) for semi-supervised medical image +segmentation. This method enhances the model's stability by fine-tuning the +sensitivity of model parameters and improves the model's adaptability to +different datasets through the introduction of $f$-divergence. By reducing +$f$-divergence, the DiM method not only improves the performance balance +between the source and target datasets but also prevents performance +degradation due to overfitting on the source dataset. + +
+
+ comment: 8 pages
+
+
+
+
+ + ☆ CLIP Unreasonable Potential in Single-Shot Face Recognition + + +
+ Face recognition is a core task in computer vision designed to identify and +authenticate individuals by analyzing facial patterns and features. This field +intersects with artificial intelligence, image processing, and machine learning, +with applications in security, authentication, and personalization. Traditional +approaches in facial recognition focus on capturing facial features like the +eyes, nose and mouth and matching these against a database to verify identities. +However, challenges such as high false positive rates have persisted, often due +to the similarity among individuals' facial features. Recently, Contrastive +Language Image Pretraining (CLIP), a model developed by OpenAI, has shown +promising advancements by linking natural language processing with vision tasks, +allowing it to generalize across modalities. Using CLIP's vision language +correspondence and single-shot finetuning, the model can achieve lower false +positive rates upon deployment without the need of mass facial features +extraction. This integration demonstrates CLIP's potential to address +persistent issues in face recognition model performance without complicating +our training paradigm. + +
+
+
+
+
+ + ☆ SNN-Based Online Learning of Concepts and Action Laws in an Open World + + +
+ We present the architecture of a fully autonomous, bio-inspired cognitive +agent built around a spiking neural network (SNN) implementing the agent's +semantic memory. The agent explores its universe and learns concepts of +objects/situations and of its own actions in a one-shot manner. While +object/situation concepts are unary, action concepts are triples made up of an +initial situation, a motor activity, and an outcome. They embody the agent's +knowledge of its universe's action laws. Both kinds of concepts have different +degrees of generality. To make decisions the agent queries its semantic memory +for the expected outcomes of envisaged actions and chooses the action to take +on the basis of these predictions. Our experiments show that the agent handles +new situations by appealing to previously learned general concepts and rapidly +modifies its concepts to adapt to environment changes. + +
+
+
+
+
+ + ☆ Balancing Accuracy and Efficiency in Multi-Turn Intent Classification + for LLM-Powered Dialog Systems in Production + + +
+ Accurate multi-turn intent classification is essential for advancing +conversational AI systems. However, challenges such as the scarcity of +comprehensive datasets and the complexity of contextual dependencies across +dialogue turns hinder progress. This paper presents two novel approaches +leveraging Large Language Models (LLMs) to enhance scalability and reduce +latency in production dialogue systems. First, we introduce Symbol Tuning, +which simplifies intent labels to reduce task complexity and improve +performance in multi-turn dialogues. Second, we propose C-LARA +(Consistency-aware, Linguistics Adaptive Retrieval Augmentation), a framework +that employs LLMs for data augmentation and pseudo-labeling to generate +synthetic multi-turn dialogues. These enriched datasets are used to fine-tune a +small, efficient model suitable for deployment. Experiments conducted on +multilingual dialogue datasets demonstrate significant improvements in +classification accuracy and resource efficiency. Our methods enhance multi-turn +intent classification accuracy by 5.09%, reduce annotation costs by 40%, and +enable scalable deployment in low-resource multilingual industrial systems, +highlighting their practicality and impact. + +
+
+
+
+
+ + ☆ SSEditor: Controllable Mask-to-Scene Generation with Diffusion Model + + +
+ Recent advancements in 3D diffusion-based semantic scene generation have +gained attention. However, existing methods rely on unconditional generation +and require multiple resampling steps when editing scenes, which significantly +limits their controllability and flexibility. To this end, we propose SSEditor, +a controllable Semantic Scene Editor that can generate specified target +categories without multiple-step resampling. SSEditor employs a two-stage +diffusion-based framework: (1) a 3D scene autoencoder is trained to obtain +latent triplane features, and (2) a mask-conditional diffusion model is trained +for customizable 3D semantic scene generation. In the second stage, we +introduce a geometric-semantic fusion module that enhances the model's ability +to learn geometric and semantic information. This ensures that objects are +generated with correct positions, sizes, and categories. Extensive experiments +on SemanticKITTI and CarlaSC demonstrate that SSEditor outperforms previous +approaches in terms of controllability and flexibility in target generation, as +well as the quality of semantic scene generation and reconstruction. More +importantly, experiments on the unseen Occ-3D Waymo dataset show that SSEditor +is capable of generating novel urban scenes, enabling the rapid construction of +3D scenes. + +
+
+
+
+
+ + ☆ libcll: an Extendable Python Toolkit for Complementary-Label Learning + + +
+ Complementary-label learning (CLL) is a weakly supervised learning paradigm +for multiclass classification, where only complementary labels -- indicating +classes an instance does not belong to -- are provided to the learning +algorithm. Despite CLL's increasing popularity, previous studies highlight two +main challenges: (1) inconsistent results arising from varied assumptions on +complementary label generation, and (2) high barriers to entry due to the lack +of a standardized evaluation platform across datasets and algorithms. To +address these challenges, we introduce \texttt{libcll}, an extensible Python +toolkit for CLL research. \texttt{libcll} provides a universal interface that +supports a wide range of generation assumptions, both synthetic and real-world +datasets, and key CLL algorithms. The toolkit is designed to mitigate +inconsistencies and streamline the research process, with easy installation, +comprehensive usage guides, and quickstart tutorials that facilitate efficient +adoption and implementation of CLL techniques. Extensive ablation studies +conducted with \texttt{libcll} demonstrate its utility in generating valuable +insights to advance future CLL research. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Building Trust: Foundations of Security, Safety and Transparency in AI + + +
+ This paper explores the rapidly evolving ecosystem of publicly available AI +models, and their potential implications on the security and safety landscape. +As AI models become increasingly prevalent, understanding their potential risks +and vulnerabilities is crucial. We review the current security and safety +scenarios while highlighting challenges such as tracking issues, remediation, +and the apparent absence of AI model lifecycle and ownership processes. +Comprehensive strategies to enhance security and safety for both model +developers and end-users are proposed. This paper aims to provide some of the +foundational pieces for more standardized security, safety, and transparency in +the development and operation of AI models and the larger open ecosystems and +communities forming around them. + +
+
+
+
+
+ + ☆ Restructuring Tractable Probabilistic Circuits + + +
+ Probabilistic circuits (PCs) are a unifying representation for probabilistic +models that support tractable inference. Numerous applications of PCs like +controllable text generation depend on the ability to efficiently multiply two +circuits. Existing multiplication algorithms require that the circuits respect +the same structure, i.e. variable scopes decompose according to the same +vtree. In this work, we propose and study the task of restructuring +structured(-decomposable) PCs, that is, transforming a structured PC such that +it conforms to a target vtree. We propose a generic approach for this problem +and show that it leads to novel polynomial-time algorithms for multiplying +circuits respecting different vtrees, as well as a practical depth-reduction +algorithm that preserves structured decomposability. Our work opens up new +avenues for tractable PC inference, suggesting the possibility of training with +less restrictive PC structures while enabling efficient inference by changing +their structures at inference time. + +
+
+
+
+
+ + ☆ Error-Feedback Model for Output Correction in Bilateral Control-Based + Imitation Learning + + +
+ In recent years, imitation learning using neural networks has enabled robots +to perform flexible tasks. However, since neural networks operate in a +feedforward structure, they do not possess a mechanism to compensate for output +errors. To address this limitation, we developed a feedback mechanism to +correct these errors. By employing a hierarchical structure for neural networks +comprising lower and upper layers, the lower layer was controlled to follow the +upper layer. Additionally, using a multi-layer perceptron in the lower layer, +which lacks an internal state, enhanced the error feedback. In the +character-writing task, this model demonstrated improved accuracy in writing +previously untrained characters. +Through autonomous control with error feedback, we confirmed that the lower +layer could effectively track the output of the upper layer. This study +represents a promising step toward integrating neural networks with control +theories. + +
+
+
+
+
+ + ☆ Efficient Training in Multi-Agent Reinforcement Learning: A + Communication-Free Framework for the Box-Pushing Problem + + +
+ Self-organizing systems consist of autonomous agents that can perform complex +tasks and adapt to dynamic environments without a central controller. Prior +research often relies on reinforcement learning to enable agents to gain the +skills needed for task completion, such as in the box-pushing environment. +However, when agents push from opposing directions during exploration, they +tend to exert equal and opposite forces on the box, resulting in minimal +displacement and inefficient training. This paper proposes a model called +Shared Pool of Information (SPI), which enables information to be accessible to +all agents and facilitates coordination, reducing force conflicts among agents +and enhancing exploration efficiency. Through computer simulations, we +demonstrate that SPI not only expedites the training process but also requires +fewer steps per episode, significantly improving the agents' collaborative +effectiveness. + +
+
+ comment: 17 pages, 16 figures +
+
+
+
+
+ + ☆ Evaluating Tokenizer Performance of Large Language Models Across + Official Indian Languages + + +
+ Large Language Models (LLMs) based on transformer architectures have +revolutionized a variety of domains, with tokenization playing a pivotal role +in their pre-processing and fine-tuning stages. In multilingual models, +particularly those tailored for Indic languages, effective tokenization is +crucial for optimizing performance. This paper presents a comprehensive +evaluation of tokenizers used by 12 LLMs across all 22 official languages of +India, with a focus on comparing the efficiency of their tokenization +processes. We employed the Normalized Sequence Length (NSL) as a key metric in +our analysis. Our findings reveal that the SUTRA tokenizer outperforms all +other models, including several Indic-specific models, excelling in 14 +languages. Notable insights include the SUTRA tokenizer's superior handling of +Indic languages, GPT-4o's advancement over its predecessor GPT-4 in processing +Indian languages, and the limited performance of Project Indus in certain +languages. This study underscores the critical importance of developing +targeted tokenization strategies for multilingual and Indic-centric models, +laying the groundwork for future improvements in tokenizer design to enhance +linguistic coverage and model efficiency. + +
+
+
+
+
+ + ☆ Contrast Similarity-Aware Dual-Pathway Mamba for Multivariate Time + Series Node Classification + + +
+ Multivariate time series (MTS) data is generated through multiple sensors +across various domains such as engineering application, health monitoring, and +the internet of things, characterized by its temporal changes and high +dimensional characteristics. Over the past few years, many studies have +explored the long-range dependencies and similarities in MTS. However, +long-range dependencies are difficult to model due to their temporal changes +and high dimensionality makes it difficult to obtain similarities effectively +and efficiently. Thus, to address these issues, we propose contrast +similarity-aware dual-pathway Mamba for MTS node classification (CS-DPMamba). +Firstly, to obtain the dynamic similarity of each sample, we initially use +temporal contrast learning module to acquire MTS representations. And then we +construct a similarity matrix between MTS representations using Fast Dynamic +Time Warping (FastDTW). Secondly, we apply the DPMamba to consider the +bidirectional nature of MTS, allowing us to better capture long-range and +short-range dependencies within the data. Finally, we utilize the +Kolmogorov-Arnold Network enhanced Graph Isomorphism Network to complete the +information interaction in the matrix and MTS node classification task. By +comprehensively considering the long-range dependencies and dynamic similarity +features, we achieved precise MTS node classification. We conducted experiments +on multiple University of East Anglia (UEA) MTS datasets, which encompass +diverse application scenarios. Our results demonstrate the superiority of our +method through both supervised and semi-supervised experiments on the MTS +classification task. + +
+
+ comment: Submitted to Knowledge-Based Systems on Nov 17, 2024 +
+
+
+
+
+ + ☆ DeTrigger: A Gradient-Centric Approach to Backdoor Attack Mitigation in + Federated Learning + + +
+ Federated Learning (FL) enables collaborative model training across +distributed devices while preserving local data privacy, making it ideal for +mobile and embedded systems. However, the decentralized nature of FL also opens +vulnerabilities to model poisoning attacks, particularly backdoor attacks, +where adversaries implant trigger patterns to manipulate model predictions. In +this paper, we propose DeTrigger, a scalable and efficient backdoor-robust +federated learning framework that leverages insights from adversarial attack +methodologies. By employing gradient analysis with temperature scaling, +DeTrigger detects and isolates backdoor triggers, allowing for precise model +weight pruning of backdoor activations without sacrificing benign model +knowledge. Extensive evaluations across four widely used datasets demonstrate +that DeTrigger achieves up to 251x faster detection than traditional methods +and mitigates backdoor attacks by up to 98.9%, with minimal impact on global +model accuracy. Our findings establish DeTrigger as a robust and scalable +solution to protect federated learning environments against sophisticated +backdoor threats. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ CCIS-Diff: A Generative Model with Stable Diffusion Prior for Controlled + Colonoscopy Image Synthesis + + +
+ Colonoscopy is crucial for identifying adenomatous polyps and preventing +colorectal cancer. However, developing robust models for polyp detection is +challenged by the limited size and accessibility of existing colonoscopy +datasets. While previous efforts have attempted to synthesize colonoscopy +images, current methods suffer from instability and insufficient data +diversity. Moreover, these approaches lack precise control over the generation +process, resulting in images that fail to meet clinical quality standards. To +address these challenges, we propose CCIS-DIFF, a Controlled generative model +for high-quality Colonoscopy Image Synthesis based on a Diffusion architecture. +Our method offers precise control over both the spatial attributes (polyp +location and shape) and clinical characteristics of polyps that align with +clinical descriptions. Specifically, we introduce a blur mask weighting +strategy to seamlessly blend synthesized polyps with the colonic mucosa, and a +text-aware attention mechanism to guide the generated images to reflect +clinical characteristics. Notably, to achieve this, we construct a new +multi-modal colonoscopy dataset that integrates images, mask annotations, and +corresponding clinical text descriptions. Experimental results demonstrate that +our method generates high-quality, diverse colonoscopy images with fine control +over both spatial constraints and clinical consistency, offering valuable +support for downstream segmentation and diagnostic tasks. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + A More Advanced Group Polarization Measurement Approach Based on + LLM-Based Agents and Graphs + + +
+ Group polarization is an important research direction in social media content +analysis, attracting many researchers to explore this field. Therefore, how to +effectively measure group polarization has become a critical topic. Measuring +group polarization on social media presents several challenges that have not +yet been addressed by existing solutions. First, social media group +polarization measurement involves processing vast amounts of text, which poses +a significant challenge for information extraction. Second, social media texts +often contain hard-to-understand content, including sarcasm, memes, and +internet slang. Additionally, group polarization research focuses on holistic +analysis, while texts are typically fragmented. To address these challenges, we +designed a solution based on a multi-agent system and used a graph-structured +Community Sentiment Network (CSN) to represent polarization states. +Furthermore, we developed a metric called Community Opposition Index (COI) +based on the CSN to quantify polarization. Finally, we tested our multi-agent +system through a zero-shot stance detection task and achieved outstanding +results. In summary, the proposed approach has significant value in terms of +usability, accuracy, and interpretability. + +
+
+
+
+
+ + ☆ Testability of Instrumental Variables in Additive Nonlinear, + Non-Constant Effects Models + + +
+ We address the issue of the testability of instrumental variables derived +from observational data. Most existing testable implications are centered on +scenarios where the treatment is a discrete variable, e.g., instrumental +inequality (Pearl, 1995), or where the effect is assumed to be constant, e.g., +instrumental variables condition based on the principle of independent +mechanisms (Burauel, 2023). However, treatments can often be continuous +variables, such as drug dosages or nutritional content levels, and non-constant +effects may occur in many real-world scenarios. In this paper, we consider an +additive nonlinear, non-constant effects model with unmeasured confounders, in +which treatments can be either discrete or continuous, and propose an +Auxiliary-based Independence Test (AIT) condition to test whether a variable is +a valid instrument. We first show that if the candidate instrument is valid, +then the AIT condition holds. Moreover, we illustrate the implications of the +AIT condition and demonstrate that, in certain conditions, AIT conditions are +necessary and sufficient to detect all invalid IVs. We also extend the AIT +condition to include covariates and introduce a practical testing algorithm. +Experimental results on both synthetic and three different real-world datasets +show the effectiveness of our proposed condition. + +
+
+
+
+
+ + ☆ Diffusion-Inspired Cold Start with Sufficient Prior in Computerized + Adaptive Testing + + +
+ Computerized Adaptive Testing (CAT) aims to select the most appropriate +questions based on the examinee's ability and is widely used in online +education. However, existing CAT systems often lack initial understanding of +the examinee's ability, requiring random probing questions. This can lead to +poorly matched questions, extending the test duration and negatively impacting +the examinee's mindset, a phenomenon referred to as the Cold Start with +Insufficient Prior (CSIP) task. This issue occurs because CAT systems do not +effectively utilize the abundant prior information about the examinee available +from other courses on online platforms. These response records, due to the +commonality of cognitive states across different knowledge domains, can provide +valuable prior information for the target domain. However, no prior work has +explored solutions for the CSIP task. In response to this gap, we propose +Diffusion Cognitive States TransfeR Framework (DCSR), a novel domain transfer +framework based on Diffusion Models (DMs) to address the CSIP task. +Specifically, we construct a cognitive state transition bridge between domains, +guided by the common cognitive states of examinees, encouraging the model to +reconstruct the initial ability state in the target domain. To enrich the +expressive power of the generated data, we analyze the causal relationships in +the generation process from a causal perspective. Redundant and extraneous +cognitive states can lead to limited transfer and negative transfer effects. +Our DCSR can seamlessly apply the generated initial ability states in the +target domain to existing question selection algorithms, thus improving the +cold start performance of the CAT system. Extensive experiments conducted on +five real-world datasets demonstrate that DCSR significantly outperforms +existing baseline methods in addressing the CSIP task. + +
+
+ comment: Accepted by KDD2025 +
+
+
+
+
+ + ☆ Enhancing Low Dose Computed Tomography Images Using Consistency Training + Techniques + + +
+ Diffusion models have a significant impact on a wide range of generative tasks, +especially on image inpainting and restoration. Despite improvements +aimed at decreasing the number of function evaluations (NFE), the iterative +results are still computationally expensive. Consistency models, a new +family of generative models, enable single-step sampling of high quality data +without the need for adversarial training. In this paper, we introduce the beta +noise distribution, which provides flexibility in adjusting noise levels. This +is combined with a sinusoidal curriculum that enhances the learning of the +trajectory between the noise distribution and the posterior distribution of +interest, allowing High Noise Improved Consistency Training (HN-iCT) to be +trained in a supervised fashion. Additionally, the High Noise Improved Consistency +Training with Image Condition (HN-iCT-CN) architecture is introduced, which enables +taking Low Dose images as a condition for extracting significant features by +Weighted Attention Gates (WAG). Our results indicate that unconditional image +generation using HN-iCT significantly outperforms basic CT and iCT training +techniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our +image-conditioned model demonstrates exceptional performance in enhancing +low-dose (LD) CT scans. + +
+
+
+
+
+ + ☆ SkillTree: Explainable Skill-Based Deep Reinforcement Learning for + Long-Horizon Control Tasks + + +
+ Deep reinforcement learning (DRL) has achieved remarkable success in various +research domains. However, its reliance on neural networks results in a lack of +transparency, which limits its practical applications. To achieve +explainability, decision trees have emerged as a popular and promising +alternative to neural networks. Nonetheless, due to their limited +expressiveness, traditional decision trees struggle with high-dimensional +long-horizon continuous control tasks. In this paper, we propose SkillTree, a +novel framework that reduces complex continuous action spaces into discrete +skill spaces. Our hierarchical approach integrates a differentiable decision +tree within the high-level policy to generate skill embeddings, which +subsequently guide the low-level policy in executing skills. By making skill +decisions explainable, we achieve skill-level explainability, enhancing the +understanding of the decision-making process in complex tasks. Experimental +results demonstrate that our method achieves performance comparable to +skill-based neural networks in complex robotic arm control domains. +Furthermore, SkillTree offers explanations at the skill level, thereby +increasing the transparency of the decision-making process. + +
+
+
+
+
+ + ☆ UrbanDiT: A Foundation Model for Open-World Urban Spatio-Temporal + Learning + + +
+ The urban environment is characterized by complex spatio-temporal dynamics +arising from diverse human activities and interactions. Effectively modeling +these dynamics is essential for understanding and optimizing urban systems. In +this work, we introduce UrbanDiT, a foundation model for open-world urban +spatio-temporal learning that successfully scales up diffusion transformers in +this field. UrbanDiT pioneers a unified model that integrates diverse +spatio-temporal data sources and types while learning universal spatio-temporal +patterns across different cities and scenarios. This allows the model to unify +both multi-data and multi-task learning, and effectively support a wide range +of spatio-temporal applications. Its key innovation lies in the elaborated +prompt learning framework, which adaptively generates both data-driven and +task-specific prompts, guiding the model to deliver superior performance across +various urban applications. UrbanDiT offers three primary advantages: 1) It +unifies diverse data types, such as grid-based and graph-based data, into a +sequential format, allowing it to capture spatio-temporal dynamics across diverse +scenarios of different cities; 2) With masking strategies and task-specific +prompts, it supports a wide range of tasks, including bi-directional +spatio-temporal prediction, temporal interpolation, spatial extrapolation, and +spatio-temporal imputation; and 3) It generalizes effectively to open-world +scenarios, with its powerful zero-shot capabilities outperforming nearly all +baselines with training data. These features allow UrbanDiT to achieve +state-of-the-art performance in different domains such as transportation +traffic, crowd flows, taxi demand, bike usage, and cellular traffic, across +multiple cities and tasks. UrbanDiT sets up a new benchmark for foundation +models in the urban spatio-temporal domain. + +
+
+
+
+
+ + ☆ HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning + with Hard Negatives + + +
+ Unsupervised sentence representation learning remains a critical challenge in +modern natural language processing (NLP) research. Recently, contrastive +learning techniques have achieved significant success in addressing this issue +by effectively capturing textual semantics. Many such approaches prioritize the +optimization using negative samples. In fields such as computer vision, hard +negative samples (samples that are close to the decision boundary and thus more +difficult to distinguish) have been shown to enhance representation learning. +However, adapting hard negatives to contrastive sentence learning is complex +due to the intricate syntactic and semantic details of text. To address this +problem, we propose HNCSE, a novel contrastive learning framework that extends +the leading SimCSE approach. The hallmark of HNCSE is its innovative use of +hard negative samples to enhance the learning of both positive and negative +samples, thereby achieving a deeper semantic understanding. Empirical tests on +semantic textual similarity and transfer task datasets validate the superiority +of HNCSE. + +
+
+
+
+
+ + ☆ Reinforcement Learning with Action Sequence for Data-Efficient Robot + Learning + + +
+ Training reinforcement learning (RL) agents on robotic tasks typically +requires a large number of training samples. This is because training data +often consists of noisy trajectories, whether from exploration or +human-collected demonstrations, making it difficult to learn value functions +that understand the effect of taking each action. On the other hand, recent +behavior-cloning (BC) approaches have shown that predicting a sequence of +actions enables policies to effectively approximate noisy, multi-modal +distributions of expert demonstrations. Can we use a similar idea for improving +RL on robotic tasks? In this paper, we introduce a novel RL algorithm that +learns a critic network that outputs Q-values over a sequence of actions. By +explicitly training the value functions to learn the consequence of executing a +series of current and future actions, our algorithm allows for learning useful +value functions from noisy trajectories. We study our algorithm across various +setups with sparse and dense rewards, and with or without demonstrations, +spanning mobile bi-manual manipulation, whole-body control, and tabletop +manipulation tasks from BiGym, HumanoidBench, and RLBench. We find that, by +learning the critic network with action sequences, our algorithm outperforms +various RL and BC baselines, in particular on challenging humanoid control +tasks. + +
+
+ comment: 17 Pages. Website: https://younggyo.me/cqn-as/ +
+
+
+
+
+ + ☆ HEIGHT: Heterogeneous Interaction Graph Transformer for Robot Navigation + in Crowded and Constrained Environments + + +
+ We study the problem of robot navigation in dense and interactive crowds with +environmental constraints such as corridors and furniture. Previous methods +fail to consider all types of interactions among agents and obstacles, leading +to unsafe and inefficient robot paths. In this article, we leverage a +graph-based representation of crowded and constrained scenarios and propose a +structured framework to learn robot navigation policies with deep reinforcement +learning. We first split the representations of different components in the +environment and propose a heterogeneous spatio-temporal (st) graph to model +distinct interactions among humans, robots, and obstacles. Based on the +heterogeneous st-graph, we propose HEIGHT, a novel navigation policy network +architecture with different components to capture heterogeneous interactions +among entities through space and time. HEIGHT utilizes attention mechanisms to +prioritize important interactions and a recurrent network to track changes in +the dynamic scene over time, encouraging the robot to avoid collisions +adaptively. Through extensive simulation and real-world experiments, we +demonstrate that HEIGHT outperforms state-of-the-art baselines in terms of +success and efficiency in challenging navigation scenarios. Furthermore, we +demonstrate that our pipeline achieves better zero-shot generalization +capability than previous works when the densities of humans and obstacles +change. More videos are available at +https://sites.google.com/view/crowdnav-height/home. + +
+
+
+
+
+ + ☆ A Computational Method for Measuring "Open Codes" in Qualitative + Analysis + + +
+ Qualitative analysis is critical to understanding human datasets in many +social science disciplines. Open coding is an inductive qualitative process +that identifies and interprets "open codes" from datasets. Yet, meeting +methodological expectations (such as "as exhaustive as possible") can be +challenging. While many machine learning (ML)/generative AI (GAI) studies have +attempted to support open coding, few have systematically measured or evaluated +GAI outcomes, increasing potential bias risks. Building on Grounded Theory and +Thematic Analysis theories, we present a computational method to measure and +identify potential biases from "open codes" systematically. Instead of +operationalizing human expert results as the "ground truth," our method is +built upon a team-based approach between human and machine coders. We +experiment with two HCI datasets to establish this method's reliability by 1) +comparing it with human analysis, and 2) analyzing its output stability. We +present evidence-based suggestions and example workflows for ML/GAI to support +open coding. + +
+
+
+
+
+ + ☆ Visualizing Loss Functions as Topological Landscape Profiles + + +
+ In machine learning, a loss function measures the difference between model +predictions and ground-truth (or target) values. For neural network models, +visualizing how this loss changes as model parameters are varied can provide +insights into the local structure of the so-called loss landscape (e.g., +smoothness) as well as global properties of the underlying model (e.g., +generalization performance). While various methods for visualizing the loss +landscape have been proposed, many approaches limit sampling to just one or two +directions, ignoring potentially relevant information in this extremely +high-dimensional space. This paper introduces a new representation based on +topological data analysis that enables the visualization of higher-dimensional +loss landscapes. After describing this new topological landscape profile +representation, we show how the shape of loss landscapes can reveal new details +about model performance and learning dynamics, highlighting several use cases, +including image segmentation (e.g., UNet) and scientific machine learning +(e.g., physics-informed neural networks). Through these examples, we provide +new insights into how loss landscapes vary across distinct hyperparameter +spaces: we find that the topology of the loss landscape is simpler for +better-performing models; and we observe greater variation in the shape of loss +landscapes near transitions from low to high model performance. + +
+
+
+
+
+ + ♻ ☆ Regulating Chatbot Output via Inter-Informational Competition + + +
+ The advent of ChatGPT has sparked over a year of regulatory frenzy. However, +few existing studies have rigorously questioned the assumption that, if left +unregulated, AI chatbot's output would inflict tangible, severe real harm on +human affairs. Most researchers have overlooked the critical possibility that +the information market itself can effectively mitigate these risks and, as a +result, they tend to use regulatory tools to address the issue directly. This +Article develops a yardstick for reevaluating both AI-related content risks and +corresponding regulatory proposals by focusing on inter-informational +competition among various outlets. The decades-long history of regulating +information and communications technologies indicates that regulators tend to +err too much on the side of caution and to put forward excessive regulatory +measures when encountering the uncertainties brought about by new technologies. +In fact, a trove of empirical evidence has demonstrated that market competition +among information outlets can effectively mitigate most risks and that +overreliance on regulation is not only unnecessary but detrimental, as well. +This Article argues that sufficient competition among chatbots and other +information outlets in the information marketplace can sufficiently mitigate +and even resolve most content risks posed by generative AI technologies. This +renders certain loudly advocated regulatory strategies, like mandatory +prohibitions, licensure, curation of datasets, and notice-and-response regimes, +truly unnecessary and even toxic to desirable competition and innovation +throughout the AI industry. Ultimately, the ideas that I advance in this +Article should pour some much-needed cold water on the regulatory frenzy over +generative AI and steer the issue back to a rational track. + +
+
+ comment: 50-page legal Article, forthcoming in Northwestern Journal of + Technology and Intellectual Property +
+
+
+
+
+ + ♻ ☆ KTO: Model Alignment as Prospect Theoretic Optimization ICML 2024 + + +
+ Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive +random variables in a biased but well-defined manner (1992); for example, +humans are famously loss-averse. We show that objectives for aligning LLMs with +human feedback implicitly incorporate many of these biases -- the success of +these objectives (e.g., DPO) over cross-entropy minimization can partly be +ascribed to them belonging to a family of loss functions that we call +$\textit{human-aware losses}$ (HALOs). However, the utility functions these +methods attribute to humans still differ from those in the prospect theory +literature. Using a Kahneman-Tversky model of human utility, we propose a HALO +that directly maximizes the utility of generations instead of maximizing the +log-likelihood of preferences, as current methods do. We call this approach +KTO, and it matches or exceeds the performance of preference-based methods at +scales from 1B to 30B, despite only learning from a binary signal of whether an +output is desirable. More broadly, our work suggests that there is no one HALO +that is universally superior; the best loss depends on the inductive biases +most appropriate for a given setting, an oft-overlooked consideration. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Is Programming by Example solved by LLMs? + + +
+ Programming-by-Examples (PBE) aims to generate an algorithm from input-output +examples. Such systems are practically and theoretically important: from an +end-user perspective, they are deployed to millions of people, and from an AI +perspective, PBE corresponds to a very general form of few-shot inductive +inference. Given the success of Large Language Models (LLMs) in code-generation +tasks, we investigate here the extent to which LLMs can be said to have +"solved" PBE. We experiment on classic domains such as lists and strings, and +an uncommon graphics programming domain not well represented in typical +pretraining data. We find that pretrained models are not effective at PBE, but +that they can be fine-tuned for much higher performance, provided the test +problems are in-distribution. We analyze empirically what causes these models +to succeed and fail, and take steps toward understanding how to achieve better +out-of-distribution generalization. Collectively these results suggest that +LLMs make strong progress toward solving the typical suite of PBE tasks, +potentially increasing the flexibility and applicability of PBE systems, while +also identifying ways in which LLMs still fall short. + +
+
+
+
+
+ + ♻ ☆ VidComposition: Can MLLMs Analyze Compositions in Compiled Videos? + + +
+ The advancement of Multimodal Large Language Models (MLLMs) has enabled +significant progress in multimodal understanding, expanding their capacity to +analyze video content. However, existing evaluation benchmarks for MLLMs +primarily focus on abstract video comprehension, lacking a detailed assessment +of their ability to understand video compositions, the nuanced interpretation +of how visual elements combine and interact within highly compiled video +contexts. We introduce VidComposition, a new benchmark specifically designed to +evaluate the video composition understanding capabilities of MLLMs using +carefully curated compiled videos and cinematic-level annotations. +VidComposition includes 982 videos with 1706 multiple-choice questions, +covering various compositional aspects such as camera movement, angle, shot +size, narrative structure, character actions and emotions, etc. Our +comprehensive evaluation of 33 open-source and proprietary MLLMs reveals a +significant performance gap between human and model capabilities. This +highlights the limitations of current MLLMs in understanding complex, compiled +video compositions and offers insights into areas for further improvement. The +leaderboard and evaluation code are available at +https://yunlong10.github.io/VidComposition/. + +
+
+
+
+
+ + ♻ ☆ RLtools: A Fast, Portable Deep Reinforcement Learning Library for + Continuous Control + + +
+ Deep Reinforcement Learning (RL) can yield capable agents and control +policies in several domains but is commonly plagued by prohibitively long +training times. Additionally, in the case of continuous control problems, the +applicability of learned policies on real-world embedded devices is limited due +to the lack of real-time guarantees and portability of existing libraries. To +address these challenges, we present RLtools, a dependency-free, header-only, +pure C++ library for deep supervised and reinforcement learning. Its novel +architecture allows RLtools to be used on a wide variety of platforms, from HPC +clusters over workstations and laptops to smartphones, smartwatches, and +microcontrollers. Specifically, due to the tight integration of the RL +algorithms with simulation environments, RLtools can solve popular RL problems +up to 76 times faster than other popular RL frameworks. We also benchmark the +inference on a diverse set of microcontrollers and show that in most cases our +optimized implementation is by far the fastest. Finally, RLtools enables the +first-ever demonstration of training a deep RL algorithm directly on a +microcontroller, giving rise to the field of TinyRL. The source code as well as +documentation and live demos are available through our project page at +https://rl.tools. + +
+
+ comment: Project page: https://rl.tools +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC, a highly diverse dataset of abstract reasoning tasks. We train +neural models for induction (inferring latent functions) and transduction +(directly predicting the test output for a given test input). Our models are +trained on synthetic data generated by prompting LLMs to produce Python code +specifying a function to be inferred, plus a stochastic subroutine for +generating inputs to that function. We find inductive and transductive models +solve very different problems, despite training on the same problems, and +despite sharing the same neural architecture. + +
+
+
+
+
+ + ♻ ☆ log-RRIM: Yield Prediction via Local-to-global Reaction Representation + Learning and Interaction Modeling + + +
+ Accurate prediction of chemical reaction yields is crucial for optimizing +organic synthesis, potentially reducing time and resources spent on +experimentation. With the rise of artificial intelligence (AI), there is +growing interest in leveraging AI-based methods to accelerate yield predictions +without conducting in vitro experiments. We present log-RRIM, an innovative +graph transformer-based framework designed for predicting chemical reaction +yields. Our approach implements a unique local-to-global reaction +representation learning strategy. This approach initially captures detailed +molecule-level information and then models and aggregates intermolecular +interactions, ensuring that the impact of varying-sizes molecular fragments on +yield is accurately accounted for. Another key feature of log-RRIM is its +integration of a cross-attention mechanism that focuses on the interplay +between reagents and reaction centers. This design reflects a fundamental +principle in chemical reactions: the crucial role of reagents in influencing +bond-breaking and formation processes, which ultimately affect reaction yields. +log-RRIM outperforms existing methods in our experiments, especially for medium +to high-yielding reactions, proving its reliability as a predictor. Its +advanced modeling of reactant-reagent interactions and sensitivity to small +molecular fragments make it a valuable tool for reaction planning and +optimization in chemical synthesis. The data and codes of log-RRIM are +accessible through https://github.com/ninglab/Yield_log_RRIM. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Improving Multi-task Learning via Seeking Task-based Flat Regions + + +
+ Multi-Task Learning (MTL) is a widely-used and powerful learning paradigm for +training deep neural networks that allows learning more than one objective by a +single backbone. Compared to training tasks separately, MTL significantly +reduces computational costs, improves data efficiency, and potentially enhances +model performance by leveraging knowledge across tasks. Hence, it has been +adopted in a variety of applications, ranging from computer vision to natural +language processing and speech recognition. Among them, there is an emerging +line of work in MTL that focuses on manipulating the task gradient to derive an +ultimate gradient descent direction to benefit all tasks. Despite achieving +impressive results on many benchmarks, directly applying these approaches +without using appropriate regularization techniques might lead to suboptimal +solutions on real-world problems. In particular, standard training that +minimizes the empirical loss on the training data can easily suffer from +overfitting to low-resource tasks or be spoiled by noisy-labeled ones, which +can cause negative transfer between tasks and overall performance drop. To +alleviate such problems, we propose to leverage a recently introduced training +method, named Sharpness-aware Minimization, which can enhance model +generalization ability on single-task learning. Accordingly, we present a novel +MTL training methodology, encouraging the model to find task-based flat minima +for coherently improving its generalization capability on all tasks. Finally, +we conduct comprehensive experiments on a variety of applications to +demonstrate the merit of our proposed approach to existing gradient-based MTL +methods, as suggested by our developed theory. + +
+
+ comment: 35 pages, 17 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Can Agents Spontaneously Form a Society? Introducing a Novel + Architecture for Generative Multi-Agents to Elicit Social Emergence + + +
+ Generative agents have demonstrated impressive capabilities in specific +tasks, but most of these frameworks focus on independent tasks and lack +attention to social interactions. We introduce a generative agent architecture +called ITCMA-S, which includes a basic framework for individual agents and a +framework called LTRHA that supports social interactions among multi-agents. +This architecture enables agents to identify and filter out behaviors that are +detrimental to social interactions, guiding them to choose more favorable +actions. We designed a sandbox environment to simulate the natural evolution of +social relationships among multiple identity-less agents for experimental +evaluation. The results showed that ITCMA-S performed well on multiple +evaluation indicators, demonstrating its ability to actively explore the +environment, recognize new agents, and acquire new information through +continuous actions and dialogue. Observations show that as agents establish +connections with each other, they spontaneously form cliques with internal +hierarchies around a selected leader and organize collective activities. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and + Fine-Tuning + + +
+ In this work, we present BiSSL, a first-of-its-kind training framework that +introduces bilevel optimization to enhance the alignment between the pretext +pre-training and downstream fine-tuning stages in self-supervised learning. +BiSSL formulates the pretext and downstream task objectives as the lower- and +upper-level objectives in a bilevel optimization problem and serves as an +intermediate training stage within the self-supervised learning pipeline. By +more explicitly modeling the interdependence of these training stages, BiSSL +facilitates enhanced information sharing between them, ultimately leading to a +backbone parameter initialization that is better suited for the downstream +task. We propose a training algorithm that alternates between optimizing the +two objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with +SimCLR on the STL10 dataset, we demonstrate that our proposed framework +consistently achieves improved or competitive classification accuracies across +various downstream image classification datasets compared to the conventional +self-supervised learning pipeline. Qualitative analyses of the backbone +features further suggest that BiSSL enhances the alignment of downstream +features in the backbone prior to fine-tuning. + +
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by deliberative democracy, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ On Size and Hardness Generalization in Unsupervised Learning for the + Travelling Salesman Problem + + +
+ We study the generalization capability of Unsupervised Learning in solving +the Travelling Salesman Problem (TSP). We use a Graph Neural Network (GNN) +trained with a surrogate loss function to generate an embedding for each node. +We use these embeddings to construct a heat map that indicates the likelihood +of each edge being part of the optimal route. We then apply local search to +generate our final predictions. Our investigation explores how different +training instance sizes, embedding dimensions, and distributions influence the +outcomes of Unsupervised Learning methods. Our results show that training with +larger instance sizes and increasing embedding dimensions can build a more +effective representation, enhancing the model's ability to solve TSP. +Furthermore, in evaluating generalization across different distributions, we +first determine the hardness of various distributions and explore how different +hardnesses affect the final results. Our findings suggest that models trained +on harder instances exhibit better generalization capabilities, highlighting +the importance of selecting appropriate training instances in solving TSP using +Unsupervised Learning. + +
+
+
+
+
+ + ♻ ☆ Look Before You Decide: Prompting Active Deduction of MLLMs for + Assumptive Reasoning + + +
+ Recently, Multimodal Large Language Models (MLLMs) have achieved significant +success across multiple disciplines due to their exceptional +instruction-following capabilities and extensive world knowledge. However, +whether these MLLMs possess human-like compositional reasoning abilities +remains an open problem. To unveil their reasoning behaviors, we first curate a +\textbf{M}ultimodal \textbf{A}ssumptive \textbf{R}ea\textbf{s}oning Benchmark +(MARS-Bench) in this paper. Interestingly, we find that most prevalent MLLMs +can be easily fooled by the introduction of a presupposition into the question, +whereas such presuppositions appear naive to human reasoning. Besides, we also +propose a simple yet effective method, Active Deduction (AD), to encourage the +model to actively perform composite deduction before reaching a final decision. +Equipped with the proposed AD method, a MLLM demonstrates significant +improvements in assumptive reasoning abilities without compromising its +general-purpose question-answering performance. We also provide extensive +evaluations of both open-source and private MLLMs on MARS-Bench, along with +experimental analyses of the AD method. + +
+
+
+
+
+ + ♻ ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancements in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge the generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges. To address the intra-user generalization +challenge, this work introduces CRoP, a novel static personalization approach. +CRoP leverages off-the-shelf pre-trained models as generic starting points and +captures user-specific traits through adaptive pruning on a minimal sub-network +while preserving generic knowledge in the remaining parameters. CRoP +demonstrates superior personalization effectiveness and intra-user robustness +across four human-sensing datasets, including two from real-world health +domains, underscoring its practical and social impact. Additionally, to support +CRoP's generalization ability and design choices, we provide empirical +justification through gradient inner product analysis, ablation studies, and +comparisons against state-of-the-art baselines. + +
+
+ comment: 33 pages, 6 figures and 12 tables +
+
+
+
+
+ + ♻ ☆ IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis + + +
+ We present a new annotated microscopic cellular image dataset to improve the +effectiveness of machine learning methods for cellular image analysis. Cell +counting is an important step in cell analysis. Typically, domain experts +manually count cells in a microscopic image. Automated cell counting can +potentially eliminate this tedious, time-consuming process. However, a good, +labeled dataset is required for training an accurate machine learning model. +Our dataset includes microscopic images of cells, and for each image, the cell +count and the location of individual cells. The data were collected as part of +an ongoing study investigating the potential of electrical stimulation to +modulate stem cell differentiation and possible applications for neural repair. +Compared to existing publicly available datasets, our dataset has more images +of cells stained with more variety of antibodies (protein components of immune +responses against invaders) typically used for cell analysis. The experimental +results on this dataset indicate that none of the five existing models under +this study are able to achieve sufficiently accurate count to replace the +manual methods. The dataset is available at +https://figshare.com/articles/dataset/Dataset/21970604. + +
+
+
+
+
+ + ♻ ☆ Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction + in LBSN + + +
+ The fast development of location-based social networks (LBSNs) has led to +significant changes in society, resulting in popular studies of using LBSN data +for socioeconomic prediction, e.g., regional population and commercial activity +estimation. Existing studies design various graphs to model heterogeneous LBSN +data, and further apply graph representation learning methods for socioeconomic +prediction. However, these approaches heavily rely on heuristic ideas and +expertise to extract task-relevant knowledge from diverse data, which may not +be optimal for specific tasks. Additionally, they tend to overlook the inherent +relationships between different indicators, limiting the prediction accuracy. +Motivated by the remarkable abilities of large language models (LLMs) in +commonsense reasoning, embedding, and multi-agent collaboration, in this work, +we synergize LLM agents and knowledge graph for socioeconomic prediction. We +first construct a location-based knowledge graph (LBKG) to integrate +multi-sourced LBSN data. Then we leverage the reasoning power of LLM agent to +identify relevant meta-paths in the LBKG for each type of socioeconomic +prediction task, and design a semantic-guided attention module for knowledge +fusion with meta-paths. Moreover, we introduce a cross-task communication +mechanism to further enhance performance by enabling knowledge sharing across +tasks at both LLM agent and KG levels. On the one hand, the LLM agents for +different tasks collaborate to generate more diverse and comprehensive +meta-paths. On the other hand, the embeddings from different tasks are +adaptively merged for better socioeconomic prediction. Experiments on two +datasets demonstrate the effectiveness of the synergistic design between LLM +and KG, providing insights for information sharing across socioeconomic +prediction tasks. + +
+
+
+
+
+ + ♻ ☆ Survey on Emotion Recognition through Posture Detection and the + possibility of its application in Virtual Reality + + +
+ A survey is presented focused on using pose estimation techniques in +emotion recognition using various technologies, such as normal cameras and depth +cameras for real-time use, and the potential use of VR and inputs including images, +videos, and 3-dimensional poses described in vector space. We discussed 19 +research papers collected from selected journals and databases highlighting +their methodology, classification algorithm, and the used datasets that relate +to emotion recognition and pose estimation. A benchmark has been made according +to their accuracy as it was the most common performance measurement metric +used. We concluded that the multimodal approaches overall achieved the best +accuracy and then we mentioned futuristic concerns that can improve the +development of this research topic. + +
+
+
+
+
+ + ♻ ☆ Weak-to-Strong Search: Align Large Language Models via Searching over + Small Language Models NeurIPS 2024 + + +
+ Large language models are usually fine-tuned to align with human preferences. +However, fine-tuning a large language model can be challenging. In this work, +we introduce $\textit{weak-to-strong search}$, framing the alignment of a large +language model as a test-time greedy search to maximize the log-probability +difference between small tuned and untuned models while sampling from the +frozen large model. This method serves both as (1) a compute-efficient model +up-scaling strategy that avoids directly tuning the large model and as (2) an +instance of weak-to-strong generalization that enhances a strong model with +weak test-time guidance. Empirically, we demonstrate the flexibility of +weak-to-strong search across different tasks. In controlled-sentiment +generation and summarization, we use tuned and untuned $\texttt{gpt2}$s to +improve the alignment of large models without additional training. Crucially, +in a more difficult instruction-following benchmark, AlpacaEval 2.0, we show +that reusing off-the-shelf small models (e.g., $\texttt{zephyr-7b-beta}$ and +its untuned version) can improve the length-controlled win rates of both +white-box and black-box large models against $\texttt{gpt-4-turbo}$ (e.g., +$34.4\% \rightarrow 37.9\%$ for $\texttt{Llama-3-70B-Instruct}$ and $16.0\% +\rightarrow 20.1\%$ for $\texttt{gpt-3.5-turbo-instruct}$), despite the small +models' low win rates $\approx 10.0\%$. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Reference-free Hallucination Detection for Large Vision-Language Models + + +
+ Large vision-language models (LVLMs) have made significant progress in recent +years. While LVLMs exhibit excellent ability in language understanding, +question answering, and conversations of visual inputs, they are prone to +producing hallucinations. While several methods are proposed to evaluate the +hallucinations in LVLMs, most are reference-based and depend on external tools, +which complicates their practical application. To assess the viability of +alternative methods, it is critical to understand whether the reference-free +approaches, which do not rely on any external tools, can efficiently detect +hallucinations. Therefore, we initiate an exploratory study to demonstrate the +effectiveness of different reference-free solutions in detecting hallucinations +in LVLMs. In particular, we conduct an extensive study on three kinds of +techniques: uncertainty-based, consistency-based, and supervised uncertainty +quantification methods on four representative LVLMs across two different tasks. +The empirical results show that the reference-free approaches are capable of +effectively detecting non-factual responses in LVLMs, with the supervised +uncertainty quantification method outperforming the others, achieving the best +performance across different settings. + +
+
+
+
+
+ + ♻ ☆ S-HR-VQVAE: Sequential Hierarchical Residual Learning Vector Quantized + Variational Autoencoder for Video Prediction + + +
+ We address the video prediction task by putting forth a novel model that +combines (i) a novel hierarchical residual learning vector quantized +variational autoencoder (HR-VQVAE), and (ii) a novel autoregressive +spatiotemporal predictive model (AST-PM). We refer to this approach as a +sequential hierarchical residual learning vector quantized variational +autoencoder (S-HR-VQVAE). By leveraging the intrinsic capabilities of HR-VQVAE +at modeling still images with a parsimonious representation, combined with the +AST-PM's ability to handle spatiotemporal information, S-HR-VQVAE can better +deal with major challenges in video prediction. These include learning +spatiotemporal information, handling high dimensional data, combating blurry +prediction, and implicit modeling of physical characteristics. Extensive +experimental results on four challenging tasks, namely KTH Human Action, +TrafficBJ, Human3.6M, and Kitti, demonstrate that our model compares favorably +against state-of-the-art video prediction techniques both in quantitative and +qualitative evaluations despite a much smaller model size. Finally, we boost +S-HR-VQVAE by proposing a novel training method to jointly estimate the +HR-VQVAE and AST-PM parameters. + +
+
+ comment: 12 pages, 6 figures, 5 tables. Accepted for publication on IEEE + Transactions on Multimedia on 2024-11-19 +
+
+
+
+
+ + ♻ ☆ Key-Element-Informed sLLM Tuning for Document Summarization + + +
+ Remarkable advances in large language models (LLMs) have enabled high-quality +text summarization. However, this capability is currently accessible only +through LLMs of substantial size or proprietary LLMs with usage fees. In +response, smaller-scale LLMs (sLLMs) of easy accessibility and low costs have +been extensively studied, yet they often suffer from missing key information +and entities, i.e., low relevance, in particular, when input documents are +long. We hence propose a key-element-informed instruction tuning for +summarization, so-called KEITSum, which identifies key elements in documents +and instructs sLLM to generate summaries capturing these key elements. +Experimental results on dialogue and news datasets demonstrate that sLLM with +KEITSum indeed provides high-quality summarization with higher relevance and +fewer hallucinations, competitive with proprietary LLMs. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Wavelets Are All You Need for Autoregressive Image Generation + + +
+ In this paper, we take a new approach to autoregressive image generation that +is based on two main ingredients. The first is wavelet image coding, which +allows to tokenize the visual details of an image from coarse to fine details +by ordering the information starting with the most significant bits of the most +significant wavelet coefficients. The second is a variant of a language +transformer whose architecture is re-designed and optimized for token sequences +in this 'wavelet language'. The transformer learns the significant statistical +correlations within a token sequence, which are the manifestations of +well-known correlations between the wavelet subbands at various resolutions. We +show experimental results with conditioning on the generation process. + +
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Rethinking cluster-conditioned diffusion models for label-free image + synthesis + + +
+ Diffusion-based image generation models can enhance image quality when +conditioned on ground truth labels. Here, we conduct a comprehensive +experimental study on image-level conditioning for diffusion models using +cluster assignments. We investigate how individual clustering determinants, +such as the number of clusters and the clustering method, impact image +synthesis across three different datasets. Given the optimal number of clusters +with respect to image synthesis, we show that cluster-conditioning can achieve +state-of-the-art performance, with an FID of 1.67 for CIFAR10 and 2.17 for +CIFAR100, along with a strong increase in training sample efficiency. We +further propose a novel empirical method to estimate an upper bound for the +optimal number of clusters. Unlike existing approaches, we find no significant +association between clustering performance and the corresponding +cluster-conditional FID scores. The code is available at +https://github.com/HHU-MMBS/cedm-official-wavc2025. + +
+
+ comment: Accepted in WAVC2025 (21 pages, 15 figures). Code is available at + https://github.com/HHU-MMBS/cedm-official-wavc2025 +
+
+
+
+
+ + ♻ ☆ Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP + Model Evaluation + + +
+ With the development and proliferation of large, complex, black-box models +for solving many natural language processing (NLP) tasks, there is also an +increasing necessity of methods to stress-test these models and provide some +degree of interpretability or explainability. While counterfactual examples are +useful in this regard, automated generation of counterfactuals is a data and +resource intensive process. Such methods depend on models such as pre-trained +language models that are then fine-tuned on auxiliary, often task-specific +datasets, that may be infeasible to build in practice, especially for new tasks +and data domains. Therefore, in this work we explore the possibility of +leveraging large language models (LLMs) for zero-shot counterfactual generation +in order to stress-test NLP models. We propose a structured pipeline to +facilitate this generation, and we hypothesize that the instruction-following +and textual understanding capabilities of recent LLMs can be effectively +leveraged for generating high quality counterfactuals in a zero-shot manner, +without requiring any training or fine-tuning. Through comprehensive +experiments on a variety of proprietary and open-source LLMs, along with +various downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot +counterfactual generators in evaluating and explaining black-box NLP models. + +
+
+ comment: Longer version of short paper accepted at IEEE BigData 2024 (Main + Track) +
+
+
+
+
+ + ♻ ☆ SpikingNeRF: Making Bio-inspired Neural Networks See through the Real + World + + +
+ In this paper, we propose SpikingNeRF, which aligns the temporal dimension of +spiking neural networks (SNNs) with the radiance rays, to seamlessly +accommodate SNNs to the reconstruction of neural radiance fields (NeRF). Thus, +the computation turns into a spike-based, multiplication-free manner, reducing +energy consumption and making high-quality 3D rendering, for the first time, +accessible to neuromorphic hardware. In SpikingNeRF, each sampled point on the +ray is matched to a particular time step and represented in a hybrid manner +where the voxel grids are maintained as well. Based on the voxel grids, sampled +points are determined whether to be masked out for faster training and +inference. However, this masking operation also incurs irregular temporal +length, making it intractable for hardware processors, e.g., GPUs, to conduct +parallel training. To address this problem, we develop the temporal padding +strategy to tackle the masked samples to maintain regular temporal length, +i.e., regular tensors, and further propose the temporal condensing strategy to +form a denser data structure for hardware-friendly computation. Experiments on +various datasets demonstrate that our method can reduce energy consumption by +an average of 70.79\% and obtain comparable synthesis quality with the ANN +baseline. Verification on the neuromorphic hardware accelerator also shows that +SpikingNeRF can further benefit from neuromorphic computing over the ANN +baselines on energy efficiency. Codes and the appendix are in +\url{https://github.com/Ikarosy/SpikingNeRF-of-CASIA}. + +
+
+
+
+
+ + ♻ ☆ Interpretable Fusion Analytics Framework for fMRI Connectivity: + Self-Attention Mechanism and Latent Space Item-Response Model + + +
+ There have been several attempts to use deep learning based on brain fMRI +signals to classify cognitive impairment diseases. However, deep learning is a +hidden black box model that makes it difficult to interpret the process of +classification. To address this issue, we propose a novel analytical framework +that interprets the classification result from deep learning processes. We +first derive the region of interest (ROI) functional connectivity network (FCN) +by embedding functions based on their similar signal patterns. Then, using the +self-attention equipped deep learning model, we classify diseases based on +their FCN. Finally, in order to interpret the classification results, we employ +a latent space item-response interaction network model to identify the +significant functions that exhibit distinct connectivity patterns when compared +to other diseases. The application of this proposed framework to the four types +of cognitive impairment shows that our approach is valid for determining the +significant ROI functions. + +
+
+ comment: This submission is a duplicate of another manuscript from our + research group [arXiv preprint arXiv:2401.09028] due to a misunderstanding in + communication among co-authors +
+
+
+
+
+ + ♻ ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm, showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ♻ ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose ClipFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, ClipFit can +improve the performance of zero-shot CLIP by 7.27\% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at \url{https://github.com/minglllli/CLIPFit}. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Multi-Head RAG: Solving Multi-Aspect Problems with LLMs + + +
+ Retrieval Augmented Generation (RAG) enhances the abilities of Large Language +Models (LLMs) by enabling the retrieval of documents into the LLM context to +provide more accurate and relevant responses. Existing RAG solutions do not +focus on queries that may require fetching multiple documents with +substantially different contents. Such queries occur frequently, but are +challenging because the embeddings of these documents may be distant in the +embedding space, making it hard to retrieve them all. This paper introduces +Multi-Head RAG (MRAG), a novel scheme designed to address this gap with a +simple yet powerful idea: leveraging activations of Transformer's multi-head +attention layer, instead of the decoder layer, as keys for fetching +multi-aspect documents. The driving motivation is that different attention +heads can learn to capture different data aspects. Harnessing the corresponding +activations results in embeddings that represent various facets of data items +and queries, improving the retrieval accuracy for complex queries. We provide +an evaluation methodology and metrics, multi-aspect datasets that we release +online, and real-world use cases to demonstrate MRAG's effectiveness, showing +improvements of up to 20% in relevance over standard RAG baselines. MRAG can be +seamlessly integrated with existing RAG frameworks and benchmarking tools like +RAGAS as well as different classes of data stores. + +
+
+
+
+
+ + ♻ ☆ Xmodel-LM Technical Report + + +
+ We introduce Xmodel-LM, a compact and efficient 1.1B language model +pre-trained on around 2 trillion tokens. Trained on our self-built dataset +(Xdata), which balances Chinese and English corpora based on downstream task +optimization, Xmodel-LM exhibits remarkable performance despite its smaller +size. It notably surpasses existing open-source language models of similar +scale. Our model checkpoints and code are publicly accessible on GitHub at +https://github.com/XiaoduoAILab/XmodelLM. + +
+
+
+
+
+ + ♻ ☆ Domain Consistency Representation Learning for Lifelong Person + Re-Identification + + +
+ Lifelong person re-identification (LReID) exhibits a contradictory +relationship between intra-domain discrimination and inter-domain gaps when +learning from continuous data. Intra-domain discrimination focuses on +individual nuances (e.g. clothing type, accessories, etc.), while inter-domain +gaps emphasize domain consistency. Achieving a trade-off between maximizing +intra-domain discrimination and minimizing inter-domain gaps is a crucial +challenge for improving LReID performance. Most existing methods aim to reduce +inter-domain gaps through knowledge distillation to maintain domain +consistency. However, they often ignore intra-domain discrimination. To address +this challenge, we propose a novel domain consistency representation learning +(DCR) model that explores global and attribute-wise representations as a bridge +to balance intra-domain discrimination and inter-domain gaps. At the +intra-domain level, we explore the complementary relationship between global +and attribute-wise representations to improve discrimination among similar +identities. Excessive learning intra-domain discrimination can lead to +catastrophic forgetting. We further develop an attribute-oriented +anti-forgetting (AF) strategy that explores attribute-wise representations to +enhance inter-domain consistency, and propose a knowledge consolidation (KC) +strategy to facilitate knowledge transfer. Extensive experiments show that our +DCR model achieves superior performance compared to state-of-the-art LReID +methods. Our code will be available soon. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ TFG: Unified Training-Free Guidance for Diffusion Models + + +
+ Given an unconditional diffusion model and a predictor for a target property +of interest (e.g., a classifier), the goal of training-free guidance is to +generate samples with desirable target properties without additional training. +Existing methods, though effective in various individual applications, often +lack theoretical grounding and rigorous testing on extensive benchmarks. As a +result, they could even fail on simple tasks, and applying them to a new +problem becomes unavoidably difficult. This paper introduces a novel +algorithmic framework encompassing existing methods as special cases, unifying +the study of training-free guidance into the analysis of an algorithm-agnostic +design space. Via theoretical and empirical investigation, we propose an +efficient and effective hyper-parameter searching strategy that can be readily +applied to any downstream task. We systematically benchmark across 7 diffusion +models on 16 tasks with 40 targets, and improve performance by 8.5% on average. +Our framework and benchmark offer a solid foundation for conditional generation +in a training-free manner. + +
+
+
+
+
+ + ♻ ☆ Grading and Anomaly Detection for Automated Retinal Image Analysis using + Deep Learning + + +
+ The significant portion of diabetic patients was affected due to major +blindness caused by Diabetic retinopathy (DR). For diabetic retinopathy, lesion +segmentation, and detection the comprehensive examination is delved into the +deep learning techniques application. The study conducted a systematic +literature review using the PRISMA analysis and 62 articles have been +investigated in the research. By including CNN-based models for DR grading, and +feature fusion several deep-learning methodologies are explored during the +study. For enhancing effectiveness in classification accuracy and robustness +the data augmentation and ensemble learning strategies are scrutinized. By +demonstrating the superior performance compared to individual models the +efficacy of ensemble learning methods is investigated. The potential ensemble +approaches in DR diagnosis are shown by the integration of multiple pre-trained +networks with custom classifiers that yield high specificity. The diverse +deep-learning techniques that are employed for detecting DR lesions are +discussed within the diabetic retinopathy lesions segmentation and detection +section. By emphasizing the requirement for continued research and integration +into clinical practice deep learning shows promise for personalized healthcare +and early detection of diabetics. + +
+
+ comment: Diabetic retinopathy, segmentation, images on retinal fundus, + convolutional neural network +
+
+
+
+
+ + ♻ ☆ From Text to Multimodality: Exploring the Evolution and Impact of Large + Language Models in Medical Practice + + +
+ Large Language Models (LLMs) have rapidly evolved from text-based systems to +multimodal platforms, significantly impacting various sectors including +healthcare. This comprehensive review explores the progression of LLMs to +Multimodal Large Language Models (MLLMs) and their growing influence in medical +practice. We examine the current landscape of MLLMs in healthcare, analyzing +their applications across clinical decision support, medical imaging, patient +engagement, and research. The review highlights the unique capabilities of +MLLMs in integrating diverse data types, such as text, images, and audio, to +provide more comprehensive insights into patient health. We also address the +challenges facing MLLM implementation, including data limitations, technical +hurdles, and ethical considerations. By identifying key research gaps, this +paper aims to guide future investigations in areas such as dataset development, +modality alignment methods, and the establishment of ethical guidelines. As +MLLMs continue to shape the future of healthcare, understanding their potential +and limitations is crucial for their responsible and effective integration into +medical practice. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Structured Multi-Track Accompaniment Arrangement via Style Prior + Modelling NeurIPS 2024 + + +
+ In the realm of music AI, arranging rich and structured multi-track +accompaniments from a simple lead sheet presents significant challenges. Such +challenges include maintaining track cohesion, ensuring long-term coherence, +and optimizing computational efficiency. In this paper, we introduce a novel +system that leverages prior modelling over disentangled style factors to +address these challenges. Our method presents a two-stage process: initially, a +piano arrangement is derived from the lead sheet by retrieving piano texture +styles; subsequently, a multi-track orchestration is generated by infusing +orchestral function styles into the piano arrangement. Our key design is the +use of vector quantization and a unique multi-stream Transformer to model the +long-term flow of the orchestration style, which enables flexible, +controllable, and structured music generation. Experiments show that by +factorizing the arrangement task into interpretable sub-stages, our approach +enhances generative capacity while improving efficiency. Additionally, our +system supports a variety of music genres and provides style control at +different composition hierarchies. We further show that our system achieves +superior coherence, structure, and overall arrangement quality compared to +existing baselines. + +
+
+ comment: Accepted by NeurIPS 2024; significance test updated with Bonferroni + correction +
+
+
+
+
+ + ♻ ☆ FedDCT: A Dynamic Cross-Tier Federated Learning Framework in Wireless + Networks + + +
+ Federated Learning (FL), as a privacy-preserving machine learning paradigm, +trains a global model across devices without exposing local data. However, +resource heterogeneity and inevitable stragglers in wireless networks severely +impact the efficiency and accuracy of FL training. In this paper, we propose a +novel Dynamic Cross-Tier Federated Learning framework (FedDCT). Firstly, we +design a dynamic tiering strategy that dynamically partitions devices into +different tiers based on their response times and assigns specific timeout +thresholds to each tier to reduce single-round training time. Then, we propose +a cross-tier device selection algorithm that selects devices that respond +quickly and are conducive to model convergence to improve convergence +efficiency and accuracy. Experimental results demonstrate that the proposed +approach under wireless networks outperforms the baseline approach, with an +average reduction of 54.7\% in convergence time and an average improvement of +1.83\% in convergence accuracy. + +
+
+ comment: Published in WASA 2024 +
+
+
+
+
+ + ♻ ☆ Adapting Amidst Degradation: Cross Domain Li-ion Battery Health + Estimation via Physics-Guided Test-Time Training + + +
+ Health modeling of lithium-ion batteries (LIBs) is crucial for safe and +efficient energy management and carries significant socio-economic +implications. Although Machine Learning (ML)-based State of Health (SOH) +estimation methods have made significant progress in accuracy, the scarcity of +high-quality LIB data remains a major obstacle. Existing transfer learning +methods for cross-domain LIB SOH estimation have significantly alleviated the +labeling burden of target LIB data; however, they still require sufficient +unlabeled target data (UTD) for effective adaptation to the target domain. +Collecting this UTD is challenging due to the time-consuming nature of +degradation experiments. To address this issue, we introduce a practical +Test-Time Training framework, BatteryTTT, which adapts the model continually +using each UTD collected amidst degradation, thereby significantly reducing +data collection time. To fully utilize each UTD, BatteryTTT integrates the +inherent physical laws of modern LIBs into self-supervised learning, termed +Physics-Guided Test-Time Training. Additionally, we explore the potential of +large language models (LLMs) in battery sequence modeling by evaluating their +performance in SOH estimation through model reprogramming and prefix prompt +adaptation. The combination of BatteryTTT and LLM modeling, termed GPT4Battery, +achieves state-of-the-art generalization results across current LIB benchmarks. +Furthermore, we demonstrate the practical value and scalability of our approach +by deploying it in our real-world battery management system (BMS) for 300Ah +large-scale energy storage LIBs. + +
+
+
+
+
+ + ♻ ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision ICPR + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. It includes 1,000 images with human +labels and 50,000 images with automatically generated weak labels. This dataset +could serve as a foundation for the research community to develop advanced +wrinkle detection algorithms. Second, we introduce a simple training strategy +utilizing texture maps, applicable to various segmentation models, to detect +wrinkles across the face. Our two-stage training strategy first pretrains models +on a large dataset with weak labels (N=50k), or masked texture maps generated +through computer vision techniques, without human intervention. We then +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. The network takes as input a combination of RGB and +masked texture map of the image, comprising four channels, in finetuning. We +effectively combine labels from multiple annotators to minimize subjectivity in +manual labeling. Our strategies demonstrate improved segmentation performance +in facial wrinkle segmentation both quantitatively and visually compared to +existing pretraining methods. The dataset is available at +https://github.com/labhai/ffhq-wrinkle-dataset. + +
+
+ comment: Accepted at International Conference on Pattern Recognition (ICPR), + 2024 +
+
+
+
+
+ + ♻ ☆ AI's Spatial Intelligence: Evaluating AI's Understanding of Spatial + Transformations in PSVT:R and Augmented Reality + + +
+ Spatial intelligence is important in Architecture, Construction, Science, +Technology, Engineering, and Mathematics (STEM), and Medicine. Understanding +three-dimensional (3D) spatial rotations can involve verbal descriptions and +visual or interactive examples, illustrating how objects change orientation in +3D space. Recent studies show Artificial Intelligence (AI) with language and +vision capabilities still faces limitations in spatial reasoning. In this paper, +we have studied generative AI's spatial capabilities of understanding rotations +of objects utilizing its image and language processing features. We examined +the spatial intelligence of the GPT-4 model with vision in understanding +spatial rotation process with diagrams based on the Revised Purdue Spatial +Visualization Test: Visualization of Rotations (Revised PSVT:R). Next, we +incorporated a layer of coordinate system axes on Revised PSVT:R to study the +variations in GPT-4's performance. We also examined GPT-4's understanding of 3D +rotations in Augmented Reality (AR) scenes that visualize spatial rotations of +an object in 3D space and observed increased accuracy of GPT-4's understanding +of the rotations by adding supplementary textual information depicting the +rotation process or mathematical representations of the rotation (e.g., +matrices). The results indicate that while GPT-4 as a major current Generative +AI model lacks the understanding of a spatial rotation process, it has the +potential to understand the rotation process with additional information that +can be provided by methods such as AR. By combining the potentials in spatial +intelligence of AI with AR's interactive visualization abilities, we expect to +offer enhanced guidance for students' spatial learning activities. Such spatial +guidance can benefit understanding spatial transformations and additionally +support processes like assembly, fabrication, and manufacturing. + +
+
+
+
+
+ + ♻ ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. All +the code, models, demo and organized data have been open sourced on our Github +Repo. + +
+
+ comment: Camera Ready Version. Project Page: + https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data: + https://github.com/liming-ai/ControlNet_Plus_Plus +
+
+
+
+
+ + ♻ ☆ Multi-LoRA Composition for Image Generation + + +
+ Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models +for the accurate rendition of specific elements like distinct characters or +unique styles in generated images. Nonetheless, existing methods face +challenges in effectively composing multiple LoRAs, especially as the number of +LoRAs to be integrated grows, thus hindering the creation of complex imagery. +In this paper, we study multi-LoRA composition through a decoding-centric +perspective. We present two training-free methods: LoRA Switch, which +alternates between different LoRAs at each denoising step, and LoRA Composite, +which simultaneously incorporates all LoRAs to guide more cohesive image +synthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new +comprehensive testbed as part of this research. It features a diverse range of +LoRA categories with 480 composition sets. Utilizing an evaluation framework +based on GPT-4V, our findings demonstrate a clear improvement in performance +with our methods over the prevalent baseline, particularly evident when +increasing the number of LoRAs in a composition. The code, benchmarks, LoRA +weights, and all evaluation details are available on our project website: +https://maszhongming.github.io/Multi-LoRA-Composition. + +
+
+ comment: Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+ + ♻ ☆ Different Horses for Different Courses: Comparing Bias Mitigation + Algorithms in ML NeurIPS 2024 + + +
+ With fairness concerns gaining significant attention in Machine Learning +(ML), several bias mitigation techniques have been proposed, often compared +against each other to find the best method. These benchmarking efforts tend to +use a common setup for evaluation under the assumption that providing a uniform +environment ensures a fair comparison. However, bias mitigation techniques are +sensitive to hyperparameter choices, random seeds, feature selection, etc., +meaning that comparison on just one setting can unfairly favour certain +algorithms. In this work, we show significant variance in fairness achieved by +several algorithms and the influence of the learning pipeline on fairness +scores. We highlight that most bias mitigation techniques can achieve +comparable performance, given the freedom to perform hyperparameter +optimization, suggesting that the choice of the evaluation parameters-rather +than the mitigation technique itself-can sometimes create the perceived +superiority of one method over another. We hope our work encourages future +research on how various choices in the lifecycle of developing an algorithm +impact fairness, and trends that guide the selection of appropriate algorithms. + +
+
+ comment: To appear at AFME@NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting + + +
+ Previous text-to-4D methods have leveraged multiple Score Distillation +Sampling (SDS) techniques, combining motion priors from video-based diffusion +models (DMs) with geometric priors from multiview DMs to implicitly guide 4D +renderings. However, differences in these priors result in conflicting gradient +directions during optimization, causing trade-offs between motion fidelity and +geometry accuracy, and requiring substantial optimization time to reconcile the +models. In this paper, we introduce \textbf{P}ixel-\textbf{L}evel +\textbf{A}lignment for text-driven \textbf{4D} Gaussian splatting (PLA4D) to +resolve this motion-geometry conflict. PLA4D provides an anchor reference, +i.e., text-generated video, to align the rendering process conditioned by +different DMs in pixel space. For static alignment, our approach introduces a +focal alignment method and Gaussian-Mesh contrastive learning to iteratively +adjust focal lengths and provide explicit geometric priors at each timestep. At +the dynamic level, a motion alignment technique and T-MV refinement method are +employed to enforce both pose alignment and motion continuity across unknown +viewpoints, ensuring intrinsic geometric consistency across views. With such +pixel-level multi-DM alignment, our PLA4D framework is able to generate 4D +objects with superior geometric, motion, and semantic consistency. Fully +implemented with open-source tools, PLA4D offers an efficient and accessible +solution for high-quality 4D digital content creation with significantly +reduced generation time. + +
+
+
+
+
+ + ♻ ☆ Unveiling and Mitigating Bias in Large Language Model Recommendations: A + Path to Fairness + + +
+ Large Language Model (LLM)-based recommendation systems provide more +comprehensive recommendations than traditional systems by deeply analyzing +content and user behavior. However, these systems often exhibit biases, +favoring mainstream content while marginalizing non-traditional options due to +skewed training data. This study investigates the intricate relationship +between bias and LLM-based recommendation systems, with a focus on music, song, +and book recommendations across diverse demographic and cultural groups. +Through a comprehensive analysis conducted over different LLM-models, this +paper evaluates the impact of bias on recommendation outcomes. Our findings +highlight that biases are not only deeply embedded but also widely pervasive +across these systems, emphasizing the substantial and widespread nature of the +issue. Moreover, contextual information, such as socioeconomic status, further +amplifies these biases, demonstrating the complexity and depth of the challenges +faced in creating fair recommendations across different groups. + +
+
+
+
+
+ + ♻ ☆ Brain-inspired Computing Based on Deep Learning for Human-computer + Interaction: A Review + + +
+ The continuous development of artificial intelligence has a profound impact +on biomedicine and other fields, providing new research ideas and technical +methods. Brain-inspired computing is an important intersection between +multimodal technology and the biomedical field. Focusing on the application +scenarios of decoding text and speech from brain signals in human-computer +interaction, this paper presents a comprehensive review of the brain-inspired +computing models based on deep learning (DL), tracking its evolution, +application value, challenges and potential research trends. We first review +its basic concepts and development history, and divides its evolution into two +stages: recent machine learning and current deep learning, emphasizing the +importance of each stage in the research of brain-inspired computing for +human-computer interaction. In addition, the latest progress of deep learning +in different tasks of brain-inspired computing for human-computer interaction +is reviewed from five perspectives, including datasets and different brain +signals, and the application of key technologies in the model is elaborated in +detail. Despite significant advances in brain-inspired computational models, +challenges remain to fully exploit their capabilities, and we provide insights +into possible directions for future academic research. For more detailed +information, please visit our GitHub page: +https://github.com/ultracoolHub/brain-inspired-computing. + +
+
+ comment: 26pages, 8 figures and 4 tables +
+
+
+
+
+ + ♻ ☆ Designing Multi-layered Runtime Guardrails for Foundation Model Based + Agents: Swiss Cheese Model for AI Safety by Design + + +
+ Foundation Model (FM)-based agents are revolutionizing application +development across various domains. However, their rapidly growing capabilities +and autonomy have raised significant concerns about AI safety. Researchers are +exploring better ways to design guardrails to ensure that the runtime behavior +of FM-based agents remains within specific boundaries. Nevertheless, designing +effective runtime guardrails is challenging due to the agents' autonomous and +non-deterministic behavior. The involvement of multiple pipeline stages and +agent artifacts, such as goals, plans, tools, at runtime further complicates +these issues. Addressing these challenges at runtime requires multi-layered +guardrails that operate effectively at various levels of the agent +architecture. Thus, in this paper, we present a comprehensive taxonomy of +runtime guardrails for FM-based agents to identify the key quality attributes +for guardrails and design dimensions based on the results of a systematic +literature review. Inspired by the Swiss Cheese Model, we also propose a +reference architecture for designing multi-layered runtime guardrails for +FM-based agents, which includes three dimensions: quality attributes, +pipelines, and artifacts. The proposed taxonomy and reference architecture +provide concrete and robust guidance for researchers and practitioners to build +AI-safety-by-design from a software architecture perspective. + +
+
+ comment: 17 Pages +
+
+
+
+
+ + ♻ ☆ Sufficient Invariant Learning for Distribution Shift + + +
+ Learning robust models under distribution shifts between training and test +datasets is a fundamental challenge in machine learning. While learning +invariant features across environments is a popular approach, it often assumes +that these features are fully observed in both training and test sets-a +condition frequently violated in practice. When models rely on invariant +features absent in the test set, their robustness in new environments can +deteriorate. To tackle this problem, we introduce a novel learning principle +called the Sufficient Invariant Learning (SIL) framework, which focuses on +learning a sufficient subset of invariant features rather than relying on a +single feature. After demonstrating the limitation of existing invariant +learning methods, we propose a new algorithm, Adaptive Sharpness-aware Group +Distributionally Robust Optimization (ASGDRO), to learn diverse invariant +features by seeking common flat minima across the environments. We +theoretically demonstrate that finding a common flat minima enables robust +predictions based on diverse invariant features. Empirical evaluations on +multiple datasets, including our new benchmark, confirm ASGDRO's robustness +against distribution shifts, highlighting the limitations of existing methods. + +
+
+
+
+
+
+
+
+ + Computation and Language 72 + +
+
+
+ + ☆ ACING: Actor-Critic for Instruction Learning in Black-Box Large Language + Models + + +
+ The effectiveness of Large Language Models (LLMs) in solving tasks vastly +depends on the quality of the instructions, which often require fine-tuning +through extensive human effort. This highlights the need for automated +instruction optimization; however, this optimization is particularly +challenging when dealing with black-box LLMs, where model parameters and +gradients remain inaccessible. We propose ACING, a task-specific prompt +optimization approach framed as a stateless continuous-action Reinforcement +Learning (RL) problem, known as the continuum bandit setting. ACING leverages +an actor-critic-based method to optimize prompts, learning from +non-differentiable reward signals. We validate ACING by optimizing prompts for +ChatGPT on 30 instruction-based tasks. ACING consistently outperforms baseline +methods, achieving a median score improvement of 10 percentage points. +Furthermore, ACING not only recovers but also surpasses human-crafted expert +instructions, achieving up to a 39 percentage point improvement against human +benchmarks. + +
+
+
+
+
+ + ☆ Information Theory of Meaningful Communication + + +
+ In Shannon's seminal paper, entropy of printed English, treated as a +stationary stochastic process, was estimated to be roughly 1 bit per character. +However, considered as a means of communication, language differs considerably +from its printed form: (i) the units of information are not characters or even +words but clauses, i.e. shortest meaningful parts of speech; and (ii) what is +transmitted is principally the meaning of what is being said or written, while +the precise phrasing that was used to communicate the meaning is typically +ignored. In this study, we show that one can leverage recently developed large +language models to quantify information communicated in meaningful narratives +in terms of bits of meaning per clause. + +
+
+
+
+
+ + ☆ Scaling laws for nonlinear dynamical models of speech + + +
+ The addition of a nonlinear restoring force to dynamical models of the speech +gesture significantly improves the empirical accuracy of model predictions, but +nonlinearity introduces challenges in selecting appropriate parameters and +numerical stability, especially when modelling variation in empirical data. We +address this issue by introducing simple numerical methods for parameterization +of nonlinear task dynamic models. We first illustrate the problem and then +outline solutions in the form of power laws that scale nonlinear stiffness +terms. We apply the scaling laws to a cubic model and show how they facilitate +interpretable simulations of the nonlinear gestural dynamics underpinning +speech production. + +
+
+
+
+
+ + ☆ Rethinking MUSHRA: Addressing Modern Challenges in Text-to-Speech + Evaluation + + +
+ Despite rapid advancements in TTS models, a consistent and robust human +evaluation framework is still lacking. For example, MOS tests fail to +differentiate between similar models, and CMOS's pairwise comparisons are +time-intensive. The MUSHRA test is a promising alternative for evaluating +multiple TTS systems simultaneously, but in this work we show that its reliance +on matching human reference speech unduly penalises the scores of modern TTS +systems that can exceed human speech quality. More specifically, we conduct a +comprehensive assessment of the MUSHRA test, focusing on its sensitivity to +factors such as rater variability, listener fatigue, and reference bias. Based +on our extensive evaluation involving 471 human listeners across Hindi and +Tamil we identify two primary shortcomings: (i) reference-matching bias, where +raters are unduly influenced by the human reference, and (ii) judgement +ambiguity, arising from a lack of clear fine-grained guidelines. To address +these issues, we propose two refined variants of the MUSHRA test. The first +variant enables fairer ratings for synthesized samples that surpass human +reference quality. The second variant reduces ambiguity, as indicated by the +relatively lower variance across raters. By combining these approaches, we +achieve both more reliable and more fine-grained assessments. We also release +MANGO, a massive dataset of 47,100 human ratings, the first-of-its-kind +collection for Indian languages, aiding in analyzing human preferences and +developing automatic metrics for evaluating TTS systems. + +
+
+ comment: 19 pages, 12 Figures +
+
+
+
+
+ + ☆ Enhancing Multi-Class Disease Classification: Neoplasms, Cardiovascular, + Nervous System, and Digestive Disorders Using Advanced LLMs + + +
+ In this research, we explored the improvement in terms of multi-class disease +classification via pre-trained language models over Medical-Abstracts-TC-Corpus +that spans five medical conditions. We excluded non-cancer conditions and +examined four specific diseases. We assessed four LLMs, BioBERT, XLNet, and +BERT, as well as a novel base model (LastBERT). BioBERT, which was pre-trained +on medical data, demonstrated superior performance in medical text +classification (97% accuracy). Surprisingly, XLNet followed closely (96% +accuracy), demonstrating its generalizability across domains even though it was +not pre-trained on medical data. LastBERT, a custom model based on the lighter +version of BERT, also proved competitive with 87.10% accuracy (just under +BERT's 89.33%). Our findings confirm the importance of specialized models such +as BioBERT and also support impressions around more general solutions like +XLNet and well-tuned transformer architectures with fewer parameters (in this +case, LastBERT) in medical domain tasks. + +
+
+ comment: 7 Pages, 4 tables and 11 figures. Under review in a IEEE conference +
+
+
+
+
+ + ☆ Strengthening Fake News Detection: Leveraging SVM and Sophisticated Text + Vectorization Techniques. Defying BERT? + + +
+ The rapid spread of misinformation, particularly through online platforms, +underscores the urgent need for reliable detection systems. This study explores +the utilization of machine learning and natural language processing, +specifically Support Vector Machines (SVM) and BERT, to detect news that are +fake. We employ three distinct text vectorization methods for SVM: Term +Frequency Inverse Document Frequency (TF-IDF), Word2Vec, and Bag of Words (BoW) +evaluating their effectiveness in distinguishing between genuine and fake news. +Additionally, we compare these methods against the transformer large language +model, BERT. Our comprehensive approach includes detailed preprocessing steps, +rigorous model implementation, and thorough evaluation to determine the most +effective techniques. The results demonstrate that while BERT achieves superior +accuracy with 99.98% and an F1-score of 0.9998, the SVM model with a linear +kernel and BoW vectorization also performs exceptionally well, achieving 99.81% +accuracy and an F1-score of 0.9980. These findings highlight that, despite +BERT's superior performance, SVM models with BoW and TF-IDF vectorization +methods come remarkably close, offering highly competitive performance with the +advantage of lower computational requirements. + +
+
+ comment: 6 pages, 3 tables and 6 Figures. Submitted to a conference +
+
+
+
+
+ + ☆ Enhanced Sign Language Translation between American Sign Language (ASL) + and Indian Sign Language (ISL) Using LLMs + + +
+ We have come up with a research that hopes to provide a bridge between the +users of American Sign Language and the users of spoken language and Indian +Sign Language (ISL). The research enabled us to create a novel framework that +we have developed for Learner Systems. Leveraging art of Large models to create +key features including: - Real-time translation between these two sign +languages in an efficient manner. Making LLM's capability available for +seamless translations to ISL. Here is the full study showing its implementation +in this paper. The core of the system is a sophisticated pipeline that begins +with reclassification and recognition of ASL gestures based on a strong Random +Forest Classifier. By recognizing the ASL, it is translated into text which can +be more easily processed. Highly evolved natural language NLP (Natural Language +Processing) techniques come in handy as they play a role in our LLM integration +where you then use LLMs to be able to convert the ASL text to ISL which +provides you with the intent of sentence or phrase. The final step is to +synthesize the translated text back into ISL gestures, creating an end-to-end +translation experience using RIFE-Net. This framework is tasked with key +challenges such as automatically dealing with gesture variability and +overcoming the linguistic differences between ASL and ISL. By automating the +translation process, we hope to vastly improve accessibility for sign language +users. No longer will the communication gap between ASL and ISL create +barriers; this totally cool innovation aims to bring our communities closer +together. And we believe, with full confidence in our framework, that we're +able to apply the same principles across a wide variety of sign language +dialects. + +
+
+
+
+
+ + ☆ Neurosymbolic Graph Enrichment for Grounded World Models + + +
+ The development of artificial intelligence systems capable of understanding +and reasoning about complex real-world scenarios is a significant challenge. In +this work we present a novel approach to enhance and exploit LLM reactive +capability to address complex problems and interpret deeply contextual +real-world meaning. We introduce a method and a tool for creating a multimodal, +knowledge-augmented formal representation of meaning that combines the +strengths of large language models with structured semantic representations. +Our method begins with an image input, utilizing state-of-the-art large +language models to generate a natural language description. This description is +then transformed into an Abstract Meaning Representation (AMR) graph, which is +formalized and enriched with logical design patterns, and layered semantics +derived from linguistic and factual knowledge bases. The resulting graph is +then fed back into the LLM to be extended with implicit knowledge activated by +complex heuristic learning, including semantic implicatures, moral values, +embodied cognition, and metaphorical representations. By bridging the gap +between unstructured language models and formal semantic structures, our method +opens new avenues for tackling intricate problems in natural language +understanding and reasoning. + +
+
+
+
+
+ + ☆ Optimizing Airline Reservation Systems with Edge-Enabled Microservices: + A Framework for Real-Time Data Processing and Enhanced User Responsiveness + + +
+ The growing complexity of the operations of airline reservations requires a +smart solution for the adoption of novel approaches to the development of +quick, efficient, and adaptive reservation systems. This paper outlines in +detail a conceptual framework for the implementation of edge computing +microservices in order to address the shortcomings of traditional centralized +architectures. Specifically, edge computing allows for certain activities +such as seat inventory checks, booking processes and even confirmation to be +done nearer to the user, thus lessening the overall response time and improving +the performance of the system. In addition, the framework value should include +achieving the high performance of the system such as low latency, high +throughput and higher user experience. The major design components include +deployed distributed computing microservices orchestrated by Kubernetes, +real-time message processing system with Kafka and its elastic scaling. Other +operational components include Prometheus and Grafana, which are used to +monitor and manage resources, ensuring that all operational processes are +optimized. Although this research focuses on a design and theoretical scheming +of the framework, its use is foreseen to be more advantageous in facilitating a +transform in the provision of services in the airline industry by improving +customers' satisfaction, providing infrastructure which is cheap to install and +efficiently supporting technology changes such as artificial intelligence and +internet of things embedded systems. This research addresses the increasing +demand for new technologies with modern well-distributed and real-time-centric +systems and also provides a basis for future case implementation and testing. +As such, the proposed architecture offers a market-ready, extensible solution +to the problems posed by existing airline reservation systems. + +
+
+ comment: 22 pages, 11 figures +
+
+
+
+
+ + ☆ DLBacktrace: A Model Agnostic Explainability for any Deep Learning + Models + + +
+ The rapid advancement of artificial intelligence has led to increasingly +sophisticated deep learning models, which frequently operate as opaque 'black +boxes' with limited transparency in their decision-making processes. This lack +of interpretability presents considerable challenges, especially in high-stakes +applications where understanding the rationale behind a model's outputs is as +essential as the outputs themselves. This study addresses the pressing need for +interpretability in AI systems, emphasizing its role in fostering trust, +ensuring accountability, and promoting responsible deployment in +mission-critical fields. To address the interpretability challenge in deep +learning, we introduce DLBacktrace, an innovative technique developed by the +AryaXAI team to illuminate model decisions across a wide array of domains, +including simple Multi Layer Perceptron (MLPs), Convolutional Neural Networks +(CNNs), Large Language Models (LLMs), Computer Vision Models, and more. + We provide a comprehensive overview of the DLBacktrace algorithm and present +benchmarking results, comparing its performance against established +interpretability methods, such as SHAP, LIME, GradCAM, Integrated Gradients, +SmoothGrad, and Attention Rollout, using diverse task-based metrics. The +proposed DLBacktrace technique is compatible with various model architectures +built in PyTorch and TensorFlow, supporting models like Llama 3.2, other NLP +architectures such as BERT and LSTMs, computer vision models like ResNet and +U-Net, as well as custom deep neural network (DNN) models for tabular data. +This flexibility underscores DLBacktrace's adaptability and effectiveness in +enhancing model transparency across a broad spectrum of applications. The +library is open-sourced and available at https://github.com/AryaXAI/DLBacktrace . + +
+
+
+
+
+ + ☆ Leveraging Virtual Reality and AI Tutoring for Language Learning: A Case + Study of a Virtual Campus Environment with OpenAI GPT Integration with Unity + 3D + + +
+ This paper presents a new approach to multiple language learning, with Hindi +the language to be learnt in our case, by using the integration of virtual +reality environments and AI enabled tutoring systems using OpenAI's GPT API +calls. We have developed a scenario which has a virtual campus environment +using Unity which focuses on a detailed representation of our university +building's 11th floor, where most of the cultural and technological activities +take place. Within this virtual environment that we have created, we have an AI +tutor powered by OpenAI's GPT model which was called using an API which moves +around with the user. This provided language learning support in Hindi, as GPT +is able to take care of language translation. Our approach mainly involves +utilising speech to text, text to text conversion and text to speech +capabilities to facilitate real time interaction between users and the AI tutor +in the presence of internet. This research demonstrates the use of combining VR +technology with AI tutoring for immersive language learning experiences and +provides interaction. + +
+
+ comment: 5 pages, 2 tables, 8 figures +
+
+
+
+
+ + ☆ Whisper Finetuning on Nepali Language + + +
+ Despite the growing advancements in Automatic Speech Recognition (ASR) +models, the development of robust models for underrepresented languages, such +as Nepali, remains a challenge. This research focuses on making an exhaustive +and generalized dataset followed by fine-tuning OpenAI's Whisper models of +different sizes to improve transcription (speech-to-text) accuracy for the +Nepali language. We leverage publicly available ASR datasets and self-recorded +custom datasets with a diverse range of accents, dialects, and speaking styles +further enriched through augmentation. Our experimental results demonstrate +that fine-tuning Whisper models on our curated custom dataset substantially +reduces the Word Error Rate (WER) across all model sizes attributed to larger +data variations in terms of speaker's age, gender, and sentiment, acoustic +environment, dialect, denser audio segments (15-30 seconds) that are more +compatible with Whisper's input, and manual curation of audios and +transcriptions. Notably, our approach outperforms Whisper's baseline models +trained on Fleur's dataset, achieving WER reductions of up to 36.2% on the +small and 23.8% on medium models. Furthermore, we show that data augmentation +plays a significant role in enhancing model robustness. Our approach underlines +the importance of dataset quality, variation, and augmentation in the +adaptation of state-of-the-art models to underrepresented languages for +developing accurate ASR systems. + +
+
+
+
+
+ + ☆ Procedural Knowledge in Pretraining Drives Reasoning in Large Language + Models + + +
+ The capabilities and limitations of Large Language Models have been sketched +out in great detail in recent years, providing an intriguing yet conflicting +picture. On the one hand, LLMs demonstrate a general ability to solve problems. +On the other hand, they show surprising reasoning gaps when compared to humans, +casting doubt on the robustness of their generalisation strategies. The sheer +volume of data used in the design of LLMs has precluded us from applying the +method traditionally used to measure generalisation: train-test set separation. +To overcome this, we study what kind of generalisation strategies LLMs employ +when performing reasoning tasks by investigating the pretraining data they rely +on. For two models of different sizes (7B and 35B) and 2.5B of their +pretraining tokens, we identify what documents influence the model outputs for +three simple mathematical reasoning tasks and contrast this to the data that +are influential for answering factual questions. We find that, while the models +rely on mostly distinct sets of data for each factual question, a document +often has a similar influence across different reasoning questions within the +same task, indicating the presence of procedural knowledge. We further find +that the answers to factual questions often show up in the most influential +data. However, for reasoning questions the answers usually do not show up as +highly influential, nor do the answers to the intermediate reasoning steps. +When we characterise the top ranked documents for the reasoning questions +qualitatively, we confirm that the influential documents often contain +procedural knowledge, like demonstrating how to obtain a solution using +formulae or code. Our findings indicate that the approach to reasoning the +models use is unlike retrieval, and more like a generalisable strategy that +synthesises procedural knowledge from documents doing a similar form of +reasoning. + +
+
+
+
+
+ + ☆ Large Language Models for Combinatorial Optimization of Design Structure + Matrix + + +
+ Combinatorial optimization (CO) is essential for improving efficiency and +performance in engineering applications. As complexity increases with larger +problem sizes and more intricate dependencies, identifying the optimal solution +becomes challenging. When it comes to real-world engineering problems, +algorithms based on pure mathematical reasoning are limited and incapable of +capturing the contextual nuances necessary for optimization. This study explores +the potential of Large Language Models (LLMs) in solving engineering CO +problems by leveraging their reasoning power and contextual knowledge. We +propose a novel LLM-based framework that integrates network topology and domain +knowledge to optimize the sequencing of Design Structure Matrix (DSM)-a common +CO problem. Our experiments on various DSM cases demonstrate that the proposed +method achieves faster convergence and higher solution quality than benchmark +methods. Moreover, results show that incorporating contextual domain knowledge +significantly improves performance despite the choice of LLMs. These findings +highlight the potential of LLMs in tackling complex real-world CO problems by +combining semantic and mathematical reasoning. This approach paves the way for +a new paradigm in real-world combinatorial optimization. + +
+
+
+
+
+ + ☆ Predicting Customer Satisfaction by Replicating the Survey Response + Distribution + + +
+ For many call centers, customer satisfaction (CSAT) is a key performance +indicator (KPI). However, only a fraction of customers take the CSAT survey +after the call, leading to a biased and inaccurate average CSAT value, and +missed opportunities for coaching, follow-up, and rectification. Therefore, +call centers can benefit from a model predicting customer satisfaction on calls +where the customer did not complete the survey. Given that CSAT is a closely +monitored KPI, it is critical to minimize any bias in the average predicted +CSAT (pCSAT). In this paper, we introduce a method such that predicted CSAT +(pCSAT) scores accurately replicate the distribution of survey CSAT responses +for every call center with sufficient data in a live production environment. +The method can be applied to many multiclass classification problems to improve +the class balance and minimize its changes upon model updates. + +
+
+
+
+
+ + ☆ Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues + + +
+ Linear Recurrent Neural Networks (LRNNs) such as Mamba, RWKV, GLA, mLSTM, and +DeltaNet have emerged as efficient alternatives to Transformers in large +language modeling, offering linear scaling with sequence length and improved +training efficiency. However, LRNNs struggle to perform state-tracking which +may impair performance in tasks such as code evaluation or tracking a chess +game. Even parity, the simplest state-tracking task, which non-linear RNNs like +LSTM handle effectively, cannot be solved by current LRNNs. Recently, Sarrof et +al. (2024) demonstrated that the failure of LRNNs like Mamba to solve parity +stems from restricting the value range of their diagonal state-transition +matrices to $[0, 1]$ and that incorporating negative values can resolve this +issue. We extend this result to non-diagonal LRNNs, which have recently shown +promise in models such as DeltaNet. We prove that finite precision LRNNs with +state-transition matrices having only positive eigenvalues cannot solve parity, +while complex eigenvalues are needed to count modulo $3$. Notably, we also +prove that LRNNs can learn any regular language when their state-transition +matrices are products of identity minus vector outer product matrices, each +with eigenvalues in the range $[-1, 1]$. Our empirical results confirm that +extending the eigenvalue range of models like Mamba and DeltaNet to include +negative values not only enables them to solve parity but consistently improves +their performance on state-tracking tasks. Furthermore, pre-training LRNNs with +an extended eigenvalue range for language modeling achieves comparable +performance and stability while showing promise on code and math data. Our work +enhances the expressivity of modern LRNNs, broadening their applicability +without changing the cost of training or inference. + +
+
+
+
+
+ + ☆ Bias Free Sentiment Analysis + + +
+ This paper introduces the Semantic Propagation Graph Neural Network (SProp +GNN), a machine learning sentiment analysis (SA) architecture that relies +exclusively on syntactic structures and word-level emotional cues to predict +emotions in text. By semantically blinding the model to information about +specific words, it is robust to biases such as political or gender bias that +have been plaguing previous machine learning-based SA systems. The SProp GNN +shows performance superior to lexicon-based alternatives such as VADER and +EmoAtlas on two different prediction tasks, and across two languages. +Additionally, it approaches the accuracy of transformer-based models while +significantly reducing bias in emotion prediction tasks. By offering improved +explainability and reducing bias, the SProp GNN bridges the methodological gap +between interpretable lexicon approaches and powerful, yet often opaque, deep +learning models, offering a robust tool for fair and effective emotion analysis +in understanding human behavior through text. + +
+
+
+
+
+ + ☆ Regular-pattern-sensitive CRFs for Distant Label Interactions + + +
+ Linear-chain conditional random fields (CRFs) are a common model component +for sequence labeling tasks when modeling the interactions between different +labels is important. However, the Markov assumption limits linear-chain CRFs to +only directly modeling interactions between adjacent labels. Weighted +finite-state transducers (FSTs) are a related approach which can be made to +model distant label-label interactions, but exact label inference is +intractable for these models in the general case, and the task of selecting an +appropriate automaton structure for the desired interaction types poses a +practical challenge. In this work, we present regular-pattern-sensitive CRFs +(RPCRFs), a method of enriching standard linear-chain CRFs with the ability to +learn long-distance label interactions which occur in user-specified patterns. +This approach allows users to write regular-expression label patterns concisely +specifying which types of interactions the model should take into account, +allowing the model to learn from data whether and in which contexts these +patterns occur. The result can be interpreted alternatively as a CRF augmented +with additional, non-local potentials, or as a finite-state transducer whose +structure is defined by a set of easily-interpretable patterns. Critically, +unlike the general case for FSTs (and for non-chain CRFs), exact training and +inference are tractable for many pattern sets. In this work, we detail how a +RPCRF can be automatically constructed from a set of user-specified patterns, +and demonstrate the model's effectiveness on synthetic data, showing how +different types of patterns can capture different nonlocal dependency +structures in label sequences. + +
+
+
+
+
+ + ☆ Analysing Explanation-Related Interactions in Collaborative + Perception-Cognition-Communication-Action + + +
+ Effective communication is essential in collaborative tasks, so AI-equipped +robots working alongside humans need to be able to explain their behaviour in +order to cooperate effectively and earn trust. We analyse and classify +communications among human participants collaborating to complete a simulated +emergency response task. The analysis identifies messages that relate to +various kinds of interactive explanations identified in the explainable AI +literature. This allows us to understand what type of explanations humans +expect from their teammates in such settings, and thus where AI-equipped robots +most need explanation capabilities. We find that most explanation-related +messages seek clarification in the decisions or actions taken. We also confirm +that messages have an impact on the performance of our simulated task. + +
+
+ comment: 4 pages, 3 figures, published as a Late Breaking Report in RO-MAN + 2024 +
+
+
+
+
+ + ☆ NMT-Obfuscator Attack: Ignore a sentence in translation with only one + word + + +
+ Neural Machine Translation systems are used in diverse applications due to +their impressive performance. However, recent studies have shown that these +systems are vulnerable to carefully crafted small perturbations to their +inputs, known as adversarial attacks. In this paper, we propose a new type of +adversarial attack against NMT models. In this attack, we find a word to be +added between two sentences such that the second sentence is ignored and not +translated by the NMT model. The word added between the two sentences is such +that the whole adversarial text is natural in the source language. This type of +attack can be harmful in practical scenarios since the attacker can hide +malicious information in the automatic translation made by the target NMT +model. Our experiments show that different NMT models and translation tasks are +vulnerable to this type of attack. Our attack can successfully force the NMT +models to ignore the second part of the input in the translation for more than +50% of all cases while being able to maintain low perplexity for the whole +input. + +
+
+
+
+
+ + ☆ Guide-to-Explain for Controllable Summarization + + +
+ Recently, large language models (LLMs) have demonstrated remarkable +performance in abstractive summarization tasks. However, controllable +summarization with LLMs remains underexplored, limiting their ability to +generate summaries that align with specific user preferences. In this paper, we +first investigate the capability of LLMs to control diverse attributes, +revealing that they encounter greater challenges with numerical attributes, +such as length and extractiveness, compared to linguistic attributes. To +address this challenge, we propose a guide-to-explain framework (GTE) for +controllable summarization. Our GTE framework enables the model to identify +misaligned attributes in the initial draft and guides it in explaining errors +in the previous output. Based on this reflection, the model generates a +well-adjusted summary. As a result, by allowing the model to reflect on its +misalignment, we generate summaries that satisfy the desired attributes in +surprisingly fewer iterations than other iterative methods solely using LLMs. + +
+
+
+
+
+ + ☆ Variation between Credible and Non-Credible News Across Topics + + +
+ 'Fake News' continues to undermine trust in modern journalism and politics. +Despite continued efforts to study fake news, results have been conflicting. +Previous attempts to analyse and combat fake news have largely focused on +distinguishing fake news from truth, or differentiating between its various +sub-types (such as propaganda, satire, misinformation, etc.). This paper +conducts a linguistic and stylistic analysis of fake news, focusing on +variation between various news topics. It builds on related work identifying +features from discourse and linguistics in deception detection by analysing +five distinct news topics: Economy, Entertainment, Health, Science, and Sports. +The results emphasize that linguistic features vary between credible and +deceptive news in each domain and highlight the importance of adapting +classification tasks to accommodate variety-based stylistic and linguistic +differences in order to achieve better real-world performance. + +
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ NEON: News Entity-Interaction Extraction for Enhanced Question + Answering + +
+ Capturing fresh information in near real-time and using it to augment +existing large language models (LLMs) is essential to generate up-to-date, +grounded, and reliable output. This problem becomes particularly challenging +when LLMs are used for informational tasks in rapidly evolving fields, such as +Web search related to recent or unfolding events involving entities, where +generating temporally relevant responses requires access to up-to-the-hour news +sources. However, the information modeled by the parametric memory of LLMs is +often outdated, and Web results from prototypical retrieval systems may fail to +capture the latest relevant information and struggle to handle conflicting +reports in evolving news. To address this challenge, we present the NEON +framework, designed to extract emerging entity interactions -- such as events +or activities -- as described in news articles. NEON constructs an +entity-centric timestamped knowledge graph that captures such interactions, +thereby facilitating enhanced QA capabilities related to news events. Our +framework innovates by integrating open Information Extraction (openIE) style +tuples into LLMs to enable in-context retrieval-augmented generation. This +integration demonstrates substantial improvements in QA performance when +tackling temporal, entity-centric search queries. Through NEON, LLMs can +deliver more accurate, reliable, and up-to-date responses. + +
+
+
+
+
+ + ☆ Evaluating the Prompt Steerability of Large Language Models + + +
+ Building pluralistic AI requires designing models that are able to be shaped +to represent a wide range of value systems and cultures. Achieving this +requires first being able to evaluate the degree to which a given model is +capable of reflecting various personas. To this end, we propose a benchmark for +evaluating the steerability of model personas as a function of prompting. Our +design is based on a formal definition of prompt steerability, which analyzes +the degree to which a model's joint behavioral distribution can be shifted from +its baseline behavior. By defining steerability indices and inspecting how +these indices change as a function of steering effort, we can estimate the +steerability of a model across various persona dimensions and directions. Our +benchmark reveals that the steerability of many current models is limited -- +due to both a skew in their baseline behavior and an asymmetry in their +steerability across many persona dimensions. We release an implementation of +our benchmark at https://github.com/IBM/prompt-steering. + +
+
+
+
+
+ + ☆ Do LLMs Understand Ambiguity in Text? A Case Study in Open-world + Question Answering + + +
+ Ambiguity in natural language poses significant challenges to Large Language +Models (LLMs) used for open-domain question answering. LLMs often struggle with +the inherent uncertainties of human communication, leading to +misinterpretations, miscommunications, hallucinations, and biased responses. +This significantly weakens their ability to be used for tasks like +fact-checking, question answering, feature extraction, and sentiment analysis. +Using open-domain question answering as a test case, we compare off-the-shelf +and few-shot LLM performance, focusing on measuring the impact of explicit +disambiguation strategies. We demonstrate how simple, training-free, +token-level disambiguation methods may be effectively used to improve LLM +performance for ambiguous question answering tasks. We empirically show our +findings and discuss best practices and broader impacts regarding ambiguity in +LLMs. + +
+
+ comment: Accepted at the REU Symposium at IEEE BigData 2024 +
+
+
+
+
+ + ☆ RedPajama: an Open Dataset for Training Large Language Models NeurIPS + 2024 + + +
+ Large language models are increasingly becoming a cornerstone technology in +artificial intelligence, the sciences, and society as a whole, yet the optimal +strategies for dataset composition and filtering remain largely elusive. Many +of the top-performing models lack transparency in their dataset curation and +model development processes, posing an obstacle to the development of fully +open language models. In this paper, we identify three core data-related +challenges that must be addressed to advance open-source language models. These +include (1) transparency in model development, including the data curation +process, (2) access to large quantities of high-quality data, and (3) +availability of artifacts and metadata for dataset curation and analysis. To +address these challenges, we release RedPajama-V1, an open reproduction of the +LLaMA training dataset. In addition, we release RedPajama-V2, a massive +web-only dataset consisting of raw, unfiltered text data together with quality +signals and metadata. Together, the RedPajama datasets comprise over 100 +trillion tokens spanning multiple domains and with their quality signals +facilitate the filtering of data, aiming to inspire the development of numerous +new datasets. To date, these datasets have already been used in the training of +strong language models used in production, such as Snowflake Arctic, +Salesforce's XGen and AI2's OLMo. To provide insight into the quality of +RedPajama, we present a series of analyses and ablation studies with +decoder-only language models with up to 1.6B parameters. Our findings +demonstrate how quality signals for web data can be effectively leveraged to +curate high-quality subsets of the dataset, underscoring the potential of +RedPajama to advance the development of transparent and high-performing +language models at scale. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) Track on Datasets and Benchmarks +
+
+
+
+
+ + ☆ A Layered Architecture for Developing and Enhancing Capabilities in + Large Language Model-based Software Systems + + +
+ Significant efforts have been made to expand the use of Large Language Models +(LLMs) beyond basic language tasks. While the generalizability and versatility +of LLMs have enabled widespread adoption, evolving demands in application +development often exceed their native capabilities. Meeting these demands may +involve a diverse set of methods, such as enhancing creativity through either +inference temperature adjustments or creativity-provoking prompts. Selecting +the right approach is critical, as different methods lead to trade-offs in +engineering complexity, scalability, and operational costs. This paper +introduces a layered architecture that organizes LLM software system +development into distinct layers, each characterized by specific attributes. By +aligning capabilities with these layers, the framework encourages the +systematic implementation of capabilities in effective and efficient ways that +ultimately supports desired functionalities and qualities. Through practical +case studies, we illustrate the utility of the framework. This work offers +developers actionable insights for selecting suitable technologies in LLM-based +software system development, promoting robustness and scalability. + +
+
+
+
+
+ + ☆ Balancing Accuracy and Efficiency in Multi-Turn Intent Classification + for LLM-Powered Dialog Systems in Production + + +
+ Accurate multi-turn intent classification is essential for advancing +conversational AI systems. However, challenges such as the scarcity of +comprehensive datasets and the complexity of contextual dependencies across +dialogue turns hinder progress. This paper presents two novel approaches +leveraging Large Language Models (LLMs) to enhance scalability and reduce +latency in production dialogue systems. First, we introduce Symbol Tuning, +which simplifies intent labels to reduce task complexity and improve +performance in multi-turn dialogues. Second, we propose C-LARA +(Consistency-aware, Linguistics Adaptive Retrieval Augmentation), a framework +that employs LLMs for data augmentation and pseudo-labeling to generate +synthetic multi-turn dialogues. These enriched datasets are used to fine-tune a +small, efficient model suitable for deployment. Experiments conducted on +multilingual dialogue datasets demonstrate significant improvements in +classification accuracy and resource efficiency. Our methods enhance multi-turn +intent classification accuracy by 5.09%, reduce annotation costs by 40%, and +enable scalable deployment in low-resource multilingual industrial systems, +highlighting their practicality and impact. + +
+
+
+
+
+ + ☆ CUE-M: Contextual Understanding and Enhanced Search with Multimodal + Large Language Model + + +
+ The integration of Retrieval-Augmented Generation (RAG) with Multimodal Large +Language Models (MLLMs) has expanded the scope of multimodal query resolution. +However, current systems struggle with intent understanding, information +retrieval, and safety filtering, limiting their effectiveness. This paper +introduces Contextual Understanding and Enhanced Search with MLLM (CUE-M), a +novel multimodal search pipeline that addresses these challenges through a +multi-stage framework comprising image context enrichment, intent refinement, +contextual query generation, external API integration, and relevance-based +filtering. CUE-M incorporates a robust safety framework combining image-based, +text-based, and multimodal classifiers, dynamically adapting to instance- and +category-specific risks. Evaluations on a multimodal Q&A dataset and a public +safety benchmark demonstrate that CUE-M outperforms baselines in accuracy, +knowledge integration, and safety, advancing the capabilities of multimodal +retrieval systems. + +
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ Building Trust: Foundations of Security, Safety and Transparency in AI + + +
+ This paper explores the rapidly evolving ecosystem of publicly available AI +models, and their potential implications on the security and safety landscape. +As AI models become increasingly prevalent, understanding their potential risks +and vulnerabilities is crucial. We review the current security and safety +scenarios while highlighting challenges such as tracking issues, remediation, +and the apparent absence of AI model lifecycle and ownership processes. +Comprehensive strategies to enhance security and safety for both model +developers and end-users are proposed. This paper aims to provide some of the +foundational pieces for more standardized security, safety, and transparency in +the development and operation of AI models and the larger open ecosystems and +communities forming around them. + +
+
+
+
+
+ + ☆ Low-resource Machine Translation: what for? who for? An observational + study on a dedicated Tetun language translation service + + +
+ The impact of machine translation (MT) on low-resource languages remains +poorly understood. In particular, observational studies of actual usage +patterns are scarce. Such studies could provide valuable insights into user +needs and behaviours, complementing survey-based methods. Here we present an +observational analysis of real-world MT usage for Tetun, the lingua franca of +Timor-Leste, using server logs from a widely-used MT service with over $70,000$ +monthly active users. Our analysis of $100,000$ translation requests reveals +patterns that challenge assumptions based on existing corpora. We find that +users, many of them students on mobile devices, typically translate short texts +into Tetun across diverse domains including science, healthcare, and daily +life. This contrasts sharply with available Tetun corpora, which are dominated +by news articles covering government and social issues. Our results suggest +that MT systems for languages like Tetun should prioritise translating into the +low-resource language, handling brief inputs effectively, and covering a wide +range of domains relevant to educational contexts. More broadly, this study +demonstrates how observational analysis can inform low-resource language +technology development, by grounding research in practical community needs. + +
+
+
+
+
+ + ☆ Predicting User Intents and Musical Attributes from Music Discovery + Conversations + + +
+ Intent classification is a text understanding task that identifies user needs +from input text queries. While intent classification has been extensively +studied in various domains, it has not received much attention in the music +domain. In this paper, we investigate intent classification models for music +discovery conversation, focusing on pre-trained language models. Rather than +only predicting functional needs: intent classification, we also include a task +for classifying musical needs: musical attribute classification. Additionally, +we propose a method of concatenating previous chat history with just +single-turn user queries in the input text, allowing the model to understand +the overall conversation context better. Our proposed model significantly +improves the F1 score for both user intent and musical attribute +classification, and surpasses the zero-shot and few-shot performance of the +pretrained Llama 3 model. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Evaluating Tokenizer Performance of Large Language Models Across + Official Indian Languages + + +
+ Large Language Models (LLMs) based on transformer architectures have +revolutionized a variety of domains, with tokenization playing a pivotal role +in their pre-processing and fine-tuning stages. In multilingual models, +particularly those tailored for Indic languages, effective tokenization is +crucial for optimizing performance. This paper presents a comprehensive +evaluation of tokenizers used by 12 LLMs across all 22 official languages of +India, with a focus on comparing the efficiency of their tokenization +processes. We employed the Normalized Sequence Length (NSL) as a key metric in +our analysis. Our findings reveal that the SUTRA tokenizer outperforms all +other models, including several Indic-specific models, excelling in 14 +languages. Notable insights include the SUTRA tokenizer's superior handling of +Indic languages, GPT-4o's advancement over its predecessor GPT-4 in processing +Indian languages, and the limited performance of Project Indus in certain +languages. This study underscores the critical importance of developing +targeted tokenization strategies for multilingual and Indic-centric models, +laying the groundwork for future improvements in tokenizer design to enhance +linguistic coverage and model efficiency. + +
+
+
+
+
+ + ☆ BoolQuestions: Does Dense Retrieval Understand Boolean Logic in + Language? EMNLP 2024 + + +
+ Dense retrieval, which aims to encode the semantic information of arbitrary +text into dense vector representations or embeddings, has emerged as an +effective and efficient paradigm for text retrieval, consequently becoming an +essential component in various natural language processing systems. These +systems typically focus on optimizing the embedding space by attending to the +relevance of text pairs, while overlooking the Boolean logic inherent in +language, which may not be captured by current training objectives. In this +work, we first investigate whether current retrieval systems can comprehend the +Boolean logic implied in language. To answer this question, we formulate the +task of Boolean Dense Retrieval and collect a benchmark dataset, BoolQuestions, +which covers complex queries containing basic Boolean logic and corresponding +annotated passages. Through extensive experimental results on the proposed task +and benchmark dataset, we draw the conclusion that current dense retrieval +systems do not fully understand Boolean logic in language, and there is a long +way to go to improve our dense retrieval systems. Furthermore, to promote +further research on enhancing the understanding of Boolean logic for language +models, we explore Boolean operation on decomposed query and propose a +contrastive continual training method that serves as a strong baseline for the +research community. + +
+
+ comment: Findings of the Association for Computational Linguistics: EMNLP 2024 +
+
+
+
+
+ + ☆ Just KIDDIN: Knowledge Infusion and Distillation for Detection of + INdecent Memes + + +
+ Toxicity identification in online multimodal environments remains a +challenging task due to the complexity of contextual connections across +modalities (e.g., textual and visual). In this paper, we propose a novel +framework that integrates Knowledge Distillation (KD) from Large Visual +Language Models (LVLMs) and knowledge infusion to enhance the performance of +toxicity detection in hateful memes. Our approach extracts sub-knowledge graphs +from ConceptNet, a large-scale commonsense Knowledge Graph (KG) to be infused +within a compact VLM framework. The relational context between toxic phrases in +captions and memes, as well as visual concepts in memes enhance the model's +reasoning capabilities. Experimental results from our study on two hate speech +benchmark datasets demonstrate superior performance over the state-of-the-art +baselines across AU-ROC, F1, and Recall with improvements of 1.1%, 7%, and 35%, +respectively. Given the contextual complexity of the toxicity detection task, +our approach showcases the significance of learning from both explicit (i.e. +KG) as well as implicit (i.e. LVLMs) contextual cues incorporated through a +hybrid neurosymbolic approach. This is crucial for real-world applications +where accurate and scalable recognition of toxic content is critical for +creating safer online environments. + +
+
+
+
+
+ + ☆ A Combined Encoder and Transformer Approach for Coherent and + High-Quality Text Generation + + +
+ This research introduces a novel text generation model that combines BERT's +semantic interpretation strengths with GPT-4's generative capabilities, +establishing a high standard in generating coherent, contextually accurate +language. Through the combined architecture, the model enhances semantic depth +and maintains smooth, human-like text flow, overcoming limitations seen in +prior models. Experimental benchmarks reveal that BERT-GPT-4 surpasses +traditional models, including GPT-3, T5, BART, Transformer-XL, and CTRL, in key +metrics like Perplexity and BLEU, showcasing its superior natural language +generation performance. By fully utilizing contextual information, this hybrid +model generates text that is not only logically coherent but also aligns +closely with human language patterns, providing an advanced solution for text +generation tasks. This research highlights the potential of integrating +semantic understanding with advanced generative models, contributing new +insights for NLP, and setting a foundation for broader applications of +large-scale generative architectures in areas such as automated writing, +question-answer systems, and adaptive conversational agents. + +
+
+
+
+
+ + ☆ HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning + with Hard Negatives + + +
+ Unsupervised sentence representation learning remains a critical challenge in +modern natural language processing (NLP) research. Recently, contrastive +learning techniques have achieved significant success in addressing this issue +by effectively capturing textual semantics. Many such approaches prioritize the +optimization using negative samples. In fields such as computer vision, hard +negative samples (samples that are close to the decision boundary and thus more +difficult to distinguish) have been shown to enhance representation learning. +However, adapting hard negatives to contrastive sentence learning is complex +due to the intricate syntactic and semantic details of text. To address this +problem, we propose HNCSE, a novel contrastive learning framework that extends +the leading SimCSE approach. The hallmark of HNCSE is its innovative use of +hard negative samples to enhance the learning of both positive and negative +samples, thereby achieving a deeper semantic understanding. Empirical tests on +semantic textual similarity and transfer task datasets validate the superiority +of HNCSE. + +
+
+
+
+
+ + ☆ CoMeDi Shared Task: Models as Annotators in Lexical Semantics + Disagreements + + +
+ We present the results of our system for the CoMeDi Shared Task, which +predicts majority votes (Subtask 1) and annotator disagreements (Subtask 2). +Our approach combines model ensemble strategies with MLP-based and +threshold-based methods trained on pretrained language models. Treating +individual models as virtual annotators, we simulate the annotation process by +designing aggregation measures that incorporate continuous similarity scores +and discrete classification labels to capture both majority and disagreement. +Additionally, we employ anisotropy removal techniques to enhance performance. +Experimental results demonstrate the effectiveness of our methods, particularly +for Subtask 2. Notably, we find that continuous similarity scores, even within +the same model, align better with human disagreement patterns compared to +aggregated discrete labels. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ A Computational Method for Measuring "Open Codes" in Qualitative + Analysis + + +
+ Qualitative analysis is critical to understanding human datasets in many +social science disciplines. Open coding is an inductive qualitative process +that identifies and interprets "open codes" from datasets. Yet, meeting +methodological expectations (such as "as exhaustive as possible") can be +challenging. While many machine learning (ML)/generative AI (GAI) studies have +attempted to support open coding, few have systematically measured or evaluated +GAI outcomes, increasing potential bias risks. Building on Grounded Theory and +Thematic Analysis theories, we present a computational method to measure and +identify potential biases from "open codes" systematically. Instead of +operationalizing human expert results as the "ground truth," our method is +built upon a team-based approach between human and machine coders. We +experiment with two HCI datasets to establish this method's reliability by 1) +comparing it with human analysis, and 2) analyzing its output stability. We +present evidence-based suggestions and example workflows for ML/GAI to support +open coding. + +
+
+
+
+
+ + ♻ ☆ Is Programming by Example solved by LLMs? + + +
+ Programming-by-Examples (PBE) aims to generate an algorithm from input-output +examples. Such systems are practically and theoretically important: from an +end-user perspective, they are deployed to millions of people, and from an AI +perspective, PBE corresponds to a very general form of few-shot inductive +inference. Given the success of Large Language Models (LLMs) in code-generation +tasks, we investigate here the extent to which LLMs can be said to have +"solved" PBE. We experiment on classic domains such as lists and strings, and +an uncommon graphics programming domain not well represented in typical +pretraining data. We find that pretrained models are not effective at PBE, but +that they can be fine-tuned for much higher performance, provided the test +problems are in-distribution. We analyze empirically what causes these models +to succeed and fail, and take steps toward understanding how to achieve better +out-of-distribution generalization. Collectively these results suggest that +LLMs make strong progress toward solving the typical suite of PBE tasks, +potentially increasing the flexibility and applicability of PBE systems, while +also identifying ways in which LLMs still fall short. + +
+
+
+
+
+ + ♻ ☆ A Demonstration of Adaptive Collaboration of Large Language Models for + Medical Decision-Making ML4H 2024 + + +
+ Medical Decision-Making (MDM) is a multi-faceted process that requires +clinicians to assess complex multi-modal patient data, often +collaboratively. Large Language Models (LLMs) promise to streamline this +process by synthesizing vast medical knowledge and multi-modal health data. +However, single-agent systems are often ill-suited for nuanced medical contexts +requiring adaptable, collaborative problem-solving. Our MDAgents addresses this +need by dynamically assigning collaboration structures to LLMs based on task +complexity, mimicking real-world clinical collaboration and decision-making. +This framework improves diagnostic accuracy and supports adaptive responses in +complex, real-world medical scenarios, making it a valuable tool for clinicians +in various healthcare settings, and at the same time, being more efficient in +terms of computing cost than static multi-agent decision making methods. + +
+
+ comment: Under Review for ML4H 2024 +
+
+
+
+
+ + ♻ ☆ Realised Volatility Forecasting: Machine Learning via Financial Word + Embedding + + +
+ This study develops a financial word embedding using 15 years of business +news. Our results show that this specialised language model produces more +accurate results than general word embeddings, based on a financial benchmark +we established. As an application, we incorporate this word embedding into a +simple machine learning model to enhance the HAR model for forecasting realised +volatility. This approach statistically and economically outperforms +established econometric models. Using an explainable AI method, we also +identify key phrases in business news that contribute significantly to +volatility, offering insights into language patterns tied to market dynamics. + +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC, a highly diverse dataset of abstract reasoning tasks. We train +neural models for induction (inferring latent functions) and transduction +(directly predicting the test output for a given test input). Our models are +trained on synthetic data generated by prompting LLMs to produce Python code +specifying a function to be inferred, plus a stochastic subroutine for +generating inputs to that function. We find inductive and transductive models +solve very different problems, despite training on the same problems, and +despite sharing the same neural architecture. + +
+
+
+
+
+ + ♻ ☆ Information Extraction from Clinical Notes: Are We Ready to Switch to + Large Language Models? + + +
+ Background: Information extraction (IE) is critical in clinical natural +language processing (NLP). While large language models (LLMs) excel on +generative tasks, their performance on extractive tasks remains debated. +Methods: We investigated Named Entity Recognition (NER) and Relation Extraction +(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples, +MIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical +entities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3 +against BiomedBERT in terms of performance, generalizability, computational +resources, and throughput. Results: LLaMA models outperformed +BiomedBERT across datasets. With sufficient training data, LLaMA showed modest +improvements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited +training data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7% +(F1) on NER and 4% on RE. However, LLaMA models required more computing +resources and ran up to 28 times slower. We implemented "Kiwi," a clinical IE +package featuring both models, available at https://kiwi.clinicalnlp.org/. +Conclusion: This study is among the first to develop and evaluate a +comprehensive clinical IE system using open-source LLMs. Results indicate that +LLaMA models outperform BiomedBERT for clinical NER and RE but with higher +computational costs and lower throughputs. These findings highlight that +choosing between LLMs and traditional deep learning methods for clinical IE +applications should remain task-specific, taking into account both performance +metrics and practical considerations such as available computing resources and +the intended use case scenarios. + +
+
+
+
+
+ + ♻ ☆ HEARTS: A Holistic Framework for Explainable, Sustainable and Robust + Text Stereotype Detection NeurIPS 2024 + + +
+ Stereotypes are generalised assumptions about societal groups, and even +state-of-the-art LLMs using in-context learning struggle to identify them +accurately. Due to the subjective nature of stereotypes, where what constitutes +a stereotype can vary widely depending on cultural, social, and individual +perspectives, robust explainability is crucial. Explainable models ensure that +these nuanced judgments can be understood and validated by human users, +promoting trust and accountability. We address these challenges by introducing +HEARTS (Holistic Framework for Explainable, Sustainable, and Robust Text +Stereotype Detection), a framework that enhances model performance, minimises +carbon footprint, and provides transparent, interpretable explanations. We +establish the Expanded Multi-Grain Stereotype Dataset (EMGSD), comprising +57,201 labelled texts across six groups, including under-represented +demographics like LGBTQ+ and regional stereotypes. Ablation studies confirm +that BERT models fine-tuned on EMGSD outperform those trained on individual +components. We then analyse a fine-tuned, carbon-efficient ALBERT-V2 model +using SHAP to generate token-level importance values, ensuring alignment with +human understanding, and calculate explainability confidence scores by +comparing SHAP and LIME outputs... + +
+
+ comment: Accepted in NeurIPS 2024 SoLaR Workshop and Safety Gen AI Workshop +
+
+
+
+
+ + ♻ ☆ How to Choose How to Choose Your Chatbot: A Massively Multi-System + MultiReference Data Set for Dialog Metric Evaluation + + +
+ We release MMSMR, a Massively Multi-System MultiReference dataset to enable +future work on metrics and evaluation for dialog. Automatic metrics for +dialogue evaluation should be robust proxies for human judgments; however, the +verification of robustness is currently far from satisfactory. To quantify the +robustness correlation and understand what is necessary in a test set, we +create and release an 8-reference dialog dataset by extending single-reference +evaluation sets and introduce this new language learning conversation dataset. +We then train 1750 systems and evaluate them on our novel test set and the +DailyDialog dataset. We release the novel test set, and model hyper parameters, +inference outputs, and metric scores for each system on a variety of datasets. + +
+
+
+
+
+ + ♻ ☆ Plurals: A System for Guiding LLMs Via Simulated Social Ensembles + + +
+ Recent debates raised concerns that language models may favor certain +viewpoints. But what if the solution is not to aim for a 'view from nowhere' +but rather to leverage different viewpoints? We introduce Plurals, a system and +Python library for pluralistic AI deliberation. Plurals consists of Agents +(LLMs, optionally with personas) which deliberate within customizable +Structures, with Moderators overseeing deliberation. Plurals is a generator of +simulated social ensembles. Plurals integrates with government datasets to +create nationally representative personas, includes deliberation templates +inspired by deliberative democracy, and allows users to customize both +information-sharing structures and deliberation behavior within Structures. Six +case studies demonstrate fidelity to theoretical constructs and efficacy. Three +randomized experiments show simulated focus groups produced output resonant +with an online sample of the relevant audiences (chosen over zero-shot +generation in 75% of trials). Plurals is both a paradigm and a concrete system +for pluralistic AI. The Plurals library is available at +https://github.com/josh-ashkinaze/plurals and will be continually updated. + +
+
+
+
+
+ + ♻ ☆ Synergizing LLM Agents and Knowledge Graph for Socioeconomic Prediction + in LBSN + + +
+ The fast development of location-based social networks (LBSNs) has led to +significant changes in society, resulting in popular studies of using LBSN data +for socioeconomic prediction, e.g., regional population and commercial activity +estimation. Existing studies design various graphs to model heterogeneous LBSN +data, and further apply graph representation learning methods for socioeconomic +prediction. However, these approaches heavily rely on heuristic ideas and +expertise to extract task-relevant knowledge from diverse data, which may not +be optimal for specific tasks. Additionally, they tend to overlook the inherent +relationships between different indicators, limiting the prediction accuracy. +Motivated by the remarkable abilities of large language models (LLMs) in +commonsense reasoning, embedding, and multi-agent collaboration, in this work, +we synergize LLM agents and knowledge graph for socioeconomic prediction. We +first construct a location-based knowledge graph (LBKG) to integrate +multi-sourced LBSN data. Then we leverage the reasoning power of LLM agent to +identify relevant meta-paths in the LBKG for each type of socioeconomic +prediction task, and design a semantic-guided attention module for knowledge +fusion with meta-paths. Moreover, we introduce a cross-task communication +mechanism to further enhance performance by enabling knowledge sharing across +tasks at both LLM agent and KG levels. On the one hand, the LLM agents for +different tasks collaborate to generate more diverse and comprehensive +meta-paths. On the other hand, the embeddings from different tasks are +adaptively merged for better socioeconomic prediction. Experiments on two +datasets demonstrate the effectiveness of the synergistic design between LLM +and KG, providing insights for information sharing across socioeconomic +prediction tasks. + +
+
+
+
+
+ + ♻ ☆ Weak-to-Strong Search: Align Large Language Models via Searching over + Small Language Models NeurIPS 2024 + + +
+ Large language models are usually fine-tuned to align with human preferences. +However, fine-tuning a large language model can be challenging. In this work, +we introduce $\textit{weak-to-strong search}$, framing the alignment of a large +language model as a test-time greedy search to maximize the log-probability +difference between small tuned and untuned models while sampling from the +frozen large model. This method serves both as (1) a compute-efficient model +up-scaling strategy that avoids directly tuning the large model and as (2) an +instance of weak-to-strong generalization that enhances a strong model with +weak test-time guidance. Empirically, we demonstrate the flexibility of +weak-to-strong search across different tasks. In controlled-sentiment +generation and summarization, we use tuned and untuned $\texttt{gpt2}$s to +improve the alignment of large models without additional training. Crucially, +in a more difficult instruction-following benchmark, AlpacaEval 2.0, we show +that reusing off-the-shelf small models (e.g., $\texttt{zephyr-7b-beta}$ and +its untuned version) can improve the length-controlled win rates of both +white-box and black-box large models against $\texttt{gpt-4-turbo}$ (e.g., +$34.4\% \rightarrow 37.9\%$ for $\texttt{Llama-3-70B-Instruct}$ and $16.0\% +\rightarrow 20.1\%$ for $\texttt{gpt-3.5-turbo-instruct}$), despite the small +models' low win rates $\approx 10.0\%$. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Reference-free Hallucination Detection for Large Vision-Language Models + + +
+ Large vision-language models (LVLMs) have made significant progress in recent +years. While LVLMs exhibit excellent ability in language understanding, +question answering, and conversations of visual inputs, they are prone to +producing hallucinations. While several methods are proposed to evaluate the +hallucinations in LVLMs, most are reference-based and depend on external tools, +which complicates their practical application. To assess the viability of +alternative methods, it is critical to understand whether the reference-free +approaches, which do not rely on any external tools, can efficiently detect +hallucinations. Therefore, we initiate an exploratory study to demonstrate the +effectiveness of different reference-free solutions in detecting hallucinations +in LVLMs. In particular, we conduct an extensive study on three kinds of +techniques: uncertainty-based, consistency-based, and supervised uncertainty +quantification methods on four representative LVLMs across two different tasks. +The empirical results show that the reference-free approaches are capable of +effectively detecting non-factual responses in LVLMs, with the supervised +uncertainty quantification method outperforming the others, achieving the best +performance across different settings. + +
+
+
+
+
+ + ♻ ☆ A Survey on Hallucination in Large Language Models: Principles, + Taxonomy, Challenges, and Open Questions + + +
+ The emergence of large language models (LLMs) has marked a significant +breakthrough in natural language processing (NLP), fueling a paradigm shift in +information acquisition. Nevertheless, LLMs are prone to hallucination, +generating plausible yet nonfactual content. This phenomenon raises significant +concerns over the reliability of LLMs in real-world information retrieval (IR) +systems and has attracted intensive research to detect and mitigate such +hallucinations. Given the open-ended general-purpose attributes inherent to +LLMs, LLM hallucinations present distinct challenges that diverge from prior +task-specific models. This divergence highlights the urgency for a nuanced +understanding and comprehensive overview of recent advances in LLM +hallucinations. In this survey, we begin with an innovative taxonomy of +hallucination in the era of LLM and then delve into the factors contributing to +hallucinations. Subsequently, we present a thorough overview of hallucination +detection methods and benchmarks. Our discussion then transfers to +representative methodologies for mitigating LLM hallucinations. Additionally, +we delve into the current limitations faced by retrieval-augmented LLMs in +combating hallucinations, offering insights for developing more robust IR +systems. Finally, we highlight the promising research directions on LLM +hallucinations, including hallucination in large vision-language models and +understanding of knowledge boundaries in LLM hallucinations. + +
+
+ comment: Accepted by ACM Transactions on Information Systems (TOIS) +
+
+
+
+
+ + ♻ ☆ Findings of the First Workshop on Simulating Conversational Intelligence + in Chat + + +
+ The aim of the workshop was to bring together experts working on open-domain +dialogue research. In this speedily advancing research area many challenges +still exist, such as learning information from conversations, and engaging in a +realistic and convincing simulation of human intelligence and reasoning. +SCI-CHAT follows previous workshops on open domain dialogue but in contrast the +focus of the shared task is simulation of intelligent conversation as judged in +a live human evaluation. Models aim to include the ability to follow a +challenging topic over a multi-turn conversation, while positing, refuting and +reasoning over arguments. The workshop included both a research track and +shared task. The main goal of this paper is to provide an overview of the +shared task, and an in-depth analysis of the shared task results following +presentation at the workshop. The current paper is an extension of that made +available prior to presentation of results at the workshop at EACL Malta +(Graham et al., 2024). The data collected in the evaluation was made publicly +available to aid future research. The code was also made available for the +same purpose. + +
+
+
+
+
+ + ♻ ☆ Key-Element-Informed sLLM Tuning for Document Summarization + + +
+ Remarkable advances in large language models (LLMs) have enabled high-quality +text summarization. However, this capability is currently accessible only +through LLMs of substantial size or proprietary LLMs with usage fees. In +response, smaller-scale LLMs (sLLMs) of easy accessibility and low costs have +been extensively studied, yet they often suffer from missing key information +and entities, i.e., low relevance, in particular, when input documents are +long. We hence propose a key-element-informed instruction tuning for +summarization, so-called KEITSum, which identifies key elements in documents +and instructs sLLM to generate summaries capturing these key elements. +Experimental results on dialogue and news datasets demonstrate that sLLM with +KEITSum indeed provides high-quality summarization with higher relevance and +fewer hallucinations, competitive with proprietary LLMs. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Zero-shot LLM-guided Counterfactual Generation: A Case Study on NLP + Model Evaluation + + +
+ With the development and proliferation of large, complex, black-box models +for solving many natural language processing (NLP) tasks, there is also an +increasing necessity of methods to stress-test these models and provide some +degree of interpretability or explainability. While counterfactual examples are +useful in this regard, automated generation of counterfactuals is a data and +resource intensive process. Such methods depend on models such as pre-trained +language models that are then fine-tuned on auxiliary, often task-specific +datasets, that may be infeasible to build in practice, especially for new tasks +and data domains. Therefore, in this work we explore the possibility of +leveraging large language models (LLMs) for zero-shot counterfactual generation +in order to stress-test NLP models. We propose a structured pipeline to +facilitate this generation, and we hypothesize that the instruction-following +and textual understanding capabilities of recent LLMs can be effectively +leveraged for generating high quality counterfactuals in a zero-shot manner, +without requiring any training or fine-tuning. Through comprehensive +experiments on a variety of proprietary and open-source LLMs, along with +various downstream tasks in NLP, we explore the efficacy of LLMs as zero-shot +counterfactual generators in evaluating and explaining black-box NLP models. + +
+
+ comment: Longer version of short paper accepted at IEEE BigData 2024 (Main + Track) +
+
+
+
+
+ + ♻ ☆ Child Speech Recognition in Human-Robot Interaction: Problem Solved? + + +
+ Automated Speech Recognition shows superhuman performance for adult English +speech on a range of benchmarks, but disappoints when fed children's speech. +This has long sat in the way of child-robot interaction. Recent evolutions in +data-driven speech recognition, including the availability of Transformer +architectures and unprecedented volumes of training data, might mean a +breakthrough for child speech recognition and social robot applications aimed +at children. We revisit a study on child speech recognition from 2017 and show +that indeed performance has increased, with newcomer OpenAI Whisper doing +markedly better than leading commercial cloud services. Performance improves +even more in highly structured interactions when priming models with specific +phrases. While transcription is not perfect yet, the best model recognises +60.3% of sentences correctly barring small grammatical differences, with +sub-second transcription time running on a local GPU, showing potential for +usable autonomous child-robot speech interactions. + +
+
+ comment: Submitted to 2024 International Conference on Social Robotics +
+
+
+
+
+ + ♻ ☆ ChunkRAG: Novel LLM-Chunk Filtering Method for RAG Systems + + +
+ Retrieval-Augmented Generation (RAG) systems using large language models +(LLMs) often generate inaccurate responses due to the retrieval of irrelevant +or loosely related information. Existing methods, which operate at the document +level, fail to effectively filter out such content. We propose LLM-driven chunk +filtering, ChunkRAG, a framework that enhances RAG systems by evaluating and +filtering retrieved information at the chunk level. Our approach employs +semantic chunking to divide documents into coherent sections and utilizes +LLM-based relevance scoring to assess each chunk's alignment with the user's +query. By filtering out less pertinent chunks before the generation phase, we +significantly reduce hallucinations and improve factual accuracy. Experiments +show that our method outperforms existing RAG models, achieving higher accuracy +on tasks requiring precise information retrieval. This advancement enhances the +reliability of RAG systems, making them particularly beneficial for +applications like fact-checking and multi-hop reasoning. + +
+
+
+
+
+ + ♻ ☆ Multilingual large language models leak human stereotypes across + language boundaries + + +
+ Multilingual large language models have gained prominence for their +proficiency in processing and generating text across languages. Like their +monolingual counterparts, multilingual models are likely to pick up on +stereotypes and other social biases present in their training data. In this +paper, we study a phenomenon we term stereotype leakage, which refers to how +training a model multilingually may lead to stereotypes expressed in one +language showing up in the models' behaviour in another. We propose a +measurement framework for stereotype leakage and investigate its effect across +English, Russian, Chinese, and Hindi and with GPT-3.5, mT5, and mBERT. Our +findings show a noticeable leakage of positive, negative, and non-polar +associations across all languages. We find that of these models, GPT-3.5 +exhibits the most stereotype leakage, and Hindi is the most susceptible to +leakage effects. WARNING: This paper contains model outputs which could be +offensive in nature. + +
+
+
+
+
+ + ♻ ☆ Vision-Language Model Fine-Tuning via Simple Parameter-Efficient + Modification EMNLP 2024 + + +
+ Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed +the success of prompt tuning and adapter tuning, while the classic model +fine-tuning on inherent parameters seems to be overlooked. It is believed that +fine-tuning the parameters of VLMs with few-shot samples corrupts the +pre-trained knowledge since fine-tuning the CLIP model even degrades +performance. In this paper, we revisit this viewpoint, and propose a new +perspective: fine-tuning the specific parameters instead of all will uncover +the power of classic model fine-tuning on VLMs. Through our meticulous study, +we propose CLIPFit, a simple yet effective method to fine-tune CLIP without +introducing any overhead of extra parameters. We demonstrate that by only +fine-tuning the specific bias terms and normalization layers, CLIPFit can +improve the performance of zero-shot CLIP by 7.27% average harmonic mean +accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the +pre-trained models, we conducted extensive experimental analyses w.r.t. changes +in internal parameters and representations. We found that low-level text bias +layers and the first layer normalization layer change much more than other +layers. The code is available at https://github.com/minglllli/CLIPFit. + +
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Re-Reading Improves Reasoning in Large Language Models EMNLP 2024 + + +
+ To enhance the reasoning capabilities of off-the-shelf Large Language Models +(LLMs), we introduce a simple, yet general and effective prompting method, Re2, +i.e., Re-Reading the question as input. Unlike most +thought-eliciting prompting methods, such as Chain-of-Thought (CoT), which aim +to elicit the reasoning process in the output, Re2 shifts the focus to the +input by processing questions twice, thereby enhancing the understanding +process. Consequently, Re2 demonstrates strong generality and compatibility +with most thought-eliciting prompting methods, including CoT. Crucially, Re2 +facilitates a "bidirectional" encoding in unidirectional decoder-only LLMs +because the first pass could provide global information for the second pass. We +begin with a preliminary empirical study as the foundation of Re2, illustrating +its potential to enable "bidirectional" attention mechanisms. We then evaluate +Re2 on extensive reasoning benchmarks across 14 datasets, spanning 112 +experiments, to validate its effectiveness and generality. Our findings +indicate that, with the exception of a few scenarios on vanilla ChatGPT, Re2 +consistently enhances the reasoning performance of LLMs through a simple +re-reading strategy. Further analyses reveal Re2's adaptability, showing how it +can be effectively integrated with different LLMs, thought-eliciting prompting, +and ensemble strategies. Our code is available at +https://github.com/Tebmer/Rereading-LLM-Reasoning/. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Multi-Head RAG: Solving Multi-Aspect Problems with LLMs + + +
+ Retrieval Augmented Generation (RAG) enhances the abilities of Large Language +Models (LLMs) by enabling the retrieval of documents into the LLM context to +provide more accurate and relevant responses. Existing RAG solutions do not +focus on queries that may require fetching multiple documents with +substantially different contents. Such queries occur frequently, but are +challenging because the embeddings of these documents may be distant in the +embedding space, making it hard to retrieve them all. This paper introduces +Multi-Head RAG (MRAG), a novel scheme designed to address this gap with a +simple yet powerful idea: leveraging activations of Transformer's multi-head +attention layer, instead of the decoder layer, as keys for fetching +multi-aspect documents. The driving motivation is that different attention +heads can learn to capture different data aspects. Harnessing the corresponding +activations results in embeddings that represent various facets of data items +and queries, improving the retrieval accuracy for complex queries. We provide +an evaluation methodology and metrics, multi-aspect datasets that we release +online, and real-world use cases to demonstrate MRAG's effectiveness, showing +improvements of up to 20% in relevance over standard RAG baselines. MRAG can be +seamlessly integrated with existing RAG frameworks and benchmarking tools like +RAGAS as well as different classes of data stores. + +
+
+
+
+
+ + ♻ ☆ Xmodel-LM Technical Report + + +
+ We introduce Xmodel-LM, a compact and efficient 1.1B language model +pre-trained on around 2 trillion tokens. Trained on our self-built dataset +(Xdata), which balances Chinese and English corpora based on downstream task +optimization, Xmodel-LM exhibits remarkable performance despite its smaller +size. It notably surpasses existing open-source language models of similar +scale. Our model checkpoints and code are publicly accessible on GitHub at +https://github.com/XiaoduoAILab/XmodelLM. + +
+
+
+
+
+ + ♻ ☆ Enhancing Training Data Attribution for Large Language Models with + Fitting Error Consideration EMNLP 2024 + + +
+ The black-box nature of large language models (LLMs) poses challenges in +interpreting results, impacting issues such as data intellectual property +protection and hallucination tracing. Training data attribution (TDA) methods +are considered effective solutions to address these challenges. Most recent TDA +methods rely on influence functions, assuming the model achieves minimized +empirical risk. However, achieving this criterion is difficult, and sourcing +accuracy can be compromised by fitting errors during model training. In this +paper, we introduce a novel TDA method called Debias and Denoise Attribution +(DDA), which enhances influence functions by addressing fitting errors. +Specifically, the debias strategy seeks to improve the performance of influence +functions by eliminating the knowledge bias present in the base model before +fine-tuning, while the denoise strategy aims to reduce discrepancies in +influence scores arising from varying degrees of fitting during the training +process through smoothing techniques. Experimental results demonstrate that our +method significantly outperforms existing approaches, achieving an averaged AUC +of 91.64%. Moreover, DDA exhibits strong generality and scalability across +various sources and different-scale models like LLaMA2, QWEN2, and Mistral. + +
+
+ comment: Accepted to the EMNLP 2024 main +
+
+
+
+
+ + ♻ ☆ Divide-or-Conquer? Which Part Should You Distill Your LLM? EMNLP 2024 + + +
+ Recent methods have demonstrated that Large Language Models (LLMs) can solve +reasoning tasks better when they are encouraged to solve subtasks of the main +task first. In this paper we devise a similar strategy that breaks down +reasoning tasks into a problem decomposition phase and a problem solving phase +and show that the strategy is able to outperform a single stage solution. +Further, we hypothesize that the decomposition should be easier to distill into +a smaller model compared to the problem solving because the latter requires +large amounts of domain knowledge while the former only requires learning +general problem solving strategies. We propose methods to distill these two +capabilities and evaluate their impact on reasoning outcomes and inference +cost. We find that we can distill the problem decomposition phase and at the +same time achieve good generalization across tasks, datasets, and models. +However, it is harder to distill the problem solving capability without losing +performance and the resulting distilled model struggles with generalization. +These results indicate that by using smaller, distilled problem decomposition +models in combination with problem solving LLMs we can achieve reasoning with +cost-efficient inference and local adaptation. + +
+
+ comment: Findings of the Association for Computational Linguistics: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Multilingual Large Language Models: A Systematic Survey + + +
+ This paper provides a comprehensive survey of the latest research on +multilingual large language models (MLLMs). MLLMs not only are able to +understand and generate language across linguistic boundaries, but also +represent an important advancement in artificial intelligence. We first discuss +the architecture and pre-training objectives of MLLMs, highlighting the key +components and methodologies that contribute to their multilingual +capabilities. We then discuss the construction of multilingual pre-training and +alignment datasets, underscoring the importance of data quality and diversity +in enhancing MLLM performance. An important focus of this survey is on the +evaluation of MLLMs. We present a detailed taxonomy and roadmap covering the +assessment of MLLMs' cross-lingual knowledge, reasoning, alignment with human +values, safety, interpretability and specialized applications. Specifically, we +extensively discuss multilingual evaluation benchmarks and datasets, and +explore the use of LLMs themselves as multilingual evaluators. To enhance MLLMs +from black to white boxes, we also address the interpretability of multilingual +capabilities, cross-lingual transfer and language bias within these models. +Finally, we provide a comprehensive review of real-world applications of MLLMs +across diverse domains, including biology, medicine, computer science, +mathematics and law. We showcase how these models have driven innovation and +improvements in these specialized fields while also highlighting the challenges +and opportunities in deploying MLLMs within diverse language communities and +application scenarios. The papers covered in this survey are listed and publicly +available at https://github.com/tjunlp-lab/Awesome-Multilingual-LLMs-Papers. + +
+
+
+
+
+ + ♻ ☆ From Text to Multimodality: Exploring the Evolution and Impact of Large + Language Models in Medical Practice + + +
+ Large Language Models (LLMs) have rapidly evolved from text-based systems to +multimodal platforms, significantly impacting various sectors including +healthcare. This comprehensive review explores the progression of LLMs to +Multimodal Large Language Models (MLLMs) and their growing influence in medical +practice. We examine the current landscape of MLLMs in healthcare, analyzing +their applications across clinical decision support, medical imaging, patient +engagement, and research. The review highlights the unique capabilities of +MLLMs in integrating diverse data types, such as text, images, and audio, to +provide more comprehensive insights into patient health. We also address the +challenges facing MLLM implementation, including data limitations, technical +hurdles, and ethical considerations. By identifying key research gaps, this +paper aims to guide future investigations in areas such as dataset development, +modality alignment methods, and the establishment of ethical guidelines. As +MLLMs continue to shape the future of healthcare, understanding their potential +and limitations is crucial for their responsible and effective integration into +medical practice. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Generating bilingual example sentences with large language models as + lexicography assistants + + +
+ We present a study of LLMs' performance in generating and rating example +sentences for bilingual dictionaries across languages with varying resource +levels: French (high-resource), Indonesian (mid-resource), and Tetun +(low-resource), with English as the target language. We evaluate the quality of +LLM-generated examples against the GDEX (Good Dictionary EXample) criteria: +typicality, informativeness, and intelligibility. Our findings reveal that +while LLMs can generate reasonably good dictionary examples, their performance +degrades significantly for lower-resourced languages. We also observe high +variability in human preferences for example quality, reflected in low +inter-annotator agreement rates. To address this, we demonstrate that +in-context learning can successfully align LLMs with individual annotator +preferences. Additionally, we explore the use of pre-trained language models +for automated rating of examples, finding that sentence perplexity serves as a +good proxy for typicality and intelligibility in higher-resourced languages. +Our study also contributes a novel dataset of 600 ratings for LLM-generated +sentence pairs, and provides insights into the potential of LLMs in reducing +the cost of lexicographic work, particularly for low-resource languages. + +
+
+
+
+
+ + ♻ ☆ Investigating the Factual Knowledge Boundary of Large Language Models + with Retrieval Augmentation + + +
+ Large language models (LLMs) have shown impressive prowess in solving a wide +range of tasks with world knowledge. However, it remains unclear how well LLMs +are able to perceive their factual knowledge boundaries, particularly under +retrieval augmentation settings. In this study, we present the first analysis +on the factual knowledge boundaries of LLMs and how retrieval augmentation +affects LLMs on open-domain question answering (QA), with a bunch of important +findings. Specifically, we focus on three research questions and analyze them +by examining QA, priori judgement and posteriori judgement capabilities of +LLMs. We show evidence that LLMs possess unwavering confidence in their +knowledge and cannot handle the conflict between internal and external +knowledge well. Furthermore, retrieval augmentation proves to be an effective +approach in enhancing LLMs' awareness of knowledge boundaries. We further +conduct thorough experiments to examine how different factors affect LLMs and +propose a simple method to dynamically utilize supporting documents with our +judgement strategy. Additionally, we find that the relevance between the +supporting documents and the questions significantly impacts LLMs' QA and +judgemental capabilities. The code to reproduce this work is available at +https://github.com/RUCAIBox/LLM-Knowledge-Boundary. + +
+
+
+
+
+ + ♻ ☆ MLAN: Language-Based Instruction Tuning Improves Zero-Shot + Generalization of Multimodal Large Language Models + + +
+ We present a novel instruction tuning recipe to improve the zero-shot task +generalization of multimodal large language models. In contrast to existing +instruction tuning mechanisms that heavily rely on visual instructions, our +approach focuses on language-based instruction tuning, offering a distinct and +more training efficient path for multimodal instruction tuning. We evaluate the +performance of the proposed approach on 9 unseen datasets across both language +and vision modalities. Our results show that our language-only instruction +tuning is able to significantly improve the performance of two pretrained +multimodal models based on Llama 2 and Vicuna on those unseen datasets. +Interestingly, the language instruction following ability also helps unlock the +models to follow vision instructions without explicit training. Compared to the +state of the art multimodal instruction tuning approaches that are mainly based +on visual instructions, our language-based method not only achieves superior +performance but also significantly enhances training efficiency. For instance, +the language-only instruction tuning produces competitive average performance +across the evaluated datasets (with even better performance on language +datasets) with significant training efficiency improvements (on average 4x), +thanks to the striking reduction in the need for vision data. With a small +number of visual instructions, this emerging language instruction following +ability transfers well to the unseen vision datasets, outperforming the state +of the art with greater training efficiency. + +
+
+
+
+
+ + ♻ ☆ Refusal in LLMs is an Affine Function + + +
+ We propose affine concept editing (ACE) as an approach for steering language +models' behavior by intervening directly in activations. We begin with an +affine decomposition of model activation vectors and show that prior methods +for steering model behavior correspond to subsets of terms of this +decomposition. We then provide a derivation of ACE and use it to control +refusal behavior on ten different models, including Llama 3 70B. ACE combines +affine subspace projection and activation addition to reliably control the +model's refusal responses across prompt types. We evaluate the results using +LLM-based scoring on a collection of harmful and harmless prompts. Our +experiments demonstrate that ACE consistently achieves more precise control +over model behavior than existing methods and generalizes to models where +directional ablation via affine subspace projection alone produces incoherent +outputs. Code for reproducing our results is available at +https://github.com/EleutherAI/steering-llama3 . + +
+
+ comment: added plots for results from additional models +
+
+
+
+
+ + ♻ ☆ Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in + Vision-Language Alignment + + +
+ The recent advancements in large language models (LLMs) and pre-trained +vision models have accelerated the development of vision-language large models +(VLLMs), enhancing the interaction between visual and linguistic modalities. +Despite their notable success across various domains, VLLMs face challenges in +modality alignment, which can lead to issues like hallucinations and unsafe +content generation. Current alignment techniques often rely on coarse feedback +and external datasets, limiting scalability and performance. In this paper, we +propose FiSAO (Fine-Grained Self-Alignment Optimization), a novel +self-alignment method that utilizes the model's own visual encoder as a +fine-grained verifier to improve vision-language alignment without the need for +additional data. By leveraging token-level feedback from the vision encoder, +FiSAO significantly improves vision-language alignment, even surpassing +traditional preference tuning methods that require additional data. Through +both theoretical analysis and experimental validation, we demonstrate that +FiSAO effectively addresses the misalignment problem in VLLMs, marking the +first instance of token-level rewards being applied to such models. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Safe + Safe = Unsafe? Exploring How Safe Images Can Be Exploited to + Jailbreak Large Vision-Language Models + + +
+ Recent advances in Large Vision-Language Models (LVLMs) have showcased strong +reasoning abilities across multiple modalities, achieving significant +breakthroughs in various real-world applications. Despite this great success, +the safety guardrail of LVLMs may not cover the unforeseen domains introduced +by the visual modality. Existing studies primarily focus on eliciting LVLMs to +generate harmful responses via carefully crafted image-based jailbreaks +designed to bypass alignment defenses. In this study, we reveal that a safe +image can be exploited to achieve the same jailbreak consequence when combined +with additional safe images and prompts. This stems from two fundamental +properties of LVLMs: universal reasoning capabilities and safety snowball +effect. Building on these insights, we propose Safety Snowball Agent (SSA), a +novel agent-based framework leveraging agents' autonomous and tool-using +abilities to jailbreak LVLMs. SSA operates through two principal stages: (1) +initial response generation, where tools generate or retrieve jailbreak images +based on potential harmful intents, and (2) harmful snowballing, where refined +subsequent prompts induce progressively harmful outputs. Our experiments +demonstrate that SSA can use nearly any image to induce LVLMs to produce +unsafe content, achieving high success jailbreaking rates against the latest +LVLMs. Unlike prior works that exploit alignment flaws, SSA leverages the +inherent properties of LVLMs, presenting a profound challenge for enforcing +safety in generative multimodal systems. Our code is available at +https://github.com/gzcch/Safety_Snowball_Agent. + +
+
+
+
+
+ + ♻ ☆ Multi-LoRA Composition for Image Generation + + +
+ Low-Rank Adaptation (LoRA) is extensively utilized in text-to-image models +for the accurate rendition of specific elements like distinct characters or +unique styles in generated images. Nonetheless, existing methods face +challenges in effectively composing multiple LoRAs, especially as the number of +LoRAs to be integrated grows, thus hindering the creation of complex imagery. +In this paper, we study multi-LoRA composition through a decoding-centric +perspective. We present two training-free methods: LoRA Switch, which +alternates between different LoRAs at each denoising step, and LoRA Composite, +which simultaneously incorporates all LoRAs to guide more cohesive image +synthesis. To evaluate the proposed approaches, we establish ComposLoRA, a new +comprehensive testbed as part of this research. It features a diverse range of +LoRA categories with 480 composition sets. Utilizing an evaluation framework +based on GPT-4V, our findings demonstrate a clear improvement in performance +with our methods over the prevalent baseline, particularly evident when +increasing the number of LoRAs in a composition. The code, benchmarks, LoRA +weights, and all evaluation details are available on our project website: +https://maszhongming.github.io/Multi-LoRA-Composition. + +
+
+ comment: Transactions on Machine Learning Research (TMLR), 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 45 + +
+
+
+ + ☆ RoboGSim: A Real2Sim2Real Robotic Gaussian Splatting Simulator + + +
+ Efficient acquisition of real-world embodied data has been increasingly +critical. However, large-scale demonstrations captured by remote operation tend +to take extremely high costs and fail to scale up the data size in an efficient +manner. Sampling the episodes under a simulated environment is a promising way +for large-scale collection while existing simulators fail to achieve high-fidelity +modeling on texture and physics. To address these limitations, we introduce the +RoboGSim, a real2sim2real robotic simulator, powered by 3D Gaussian Splatting +and the physics engine. RoboGSim mainly includes four parts: Gaussian +Reconstructor, Digital Twins Builder, Scene Composer, and Interactive Engine. +It can synthesize the simulated data with novel views, objects, trajectories, +and scenes. RoboGSim also provides an online, reproducible, and safe evaluation +for different manipulation policies. The real2sim and sim2real transfer +experiments show a high consistency in the texture and physics. Moreover, the +effectiveness of synthetic data is validated under the real-world manipulated +tasks. We hope RoboGSim serves as a closed-loop simulator for fair comparison +on policy learning. More information can be found on our project page +https://robogsim.github.io/ . + +
+
+
+
+
+ + ☆ Differentiable GPU-Parallelized Task and Motion Planning CoRL 2024 + + +
+ We present a differentiable optimization-based framework for Task and Motion +Planning (TAMP) that is massively parallelizable on GPUs, enabling thousands of +sampled seeds to be optimized simultaneously. Existing sampling-based +approaches inherently disconnect the parameters by generating samples for each +independently and combining them through composition and rejection, while +optimization-based methods struggle with highly non-convex constraints and +local optima. Our method treats TAMP constraint satisfaction as optimizing a +batch of particles, each representing an assignment to a plan skeleton's +continuous parameters. We represent the plan skeleton's constraints using +differentiable cost functions, enabling us to compute the gradient of each +particle and update it toward satisfying solutions. Our use of GPU parallelism +better covers the parameter space through scale, increasing the likelihood of +finding the global optima by exploring multiple basins through global sampling. +We demonstrate that our algorithm can effectively solve a highly constrained +Tetris packing problem using a Franka arm in simulation and deploy our planner +on a real robot arm. Website: https://williamshen-nz.github.io/gpu-tamp + +
+
+ comment: 2-page paper presented at the CoRL 2024 Workshop on Differentiable + Optimization Everywhere +
+
+
+
+
+ + ☆ cHyRRT and cHySST: Two Motion Planning Tools for Hybrid Dynamical + Systems + + +
+ This paper describes two C++/Open Motion Planning Library implementations of +the recently developed motion planning algorithms HyRRT arXiv:2210.15082v1 +[cs.RO] and HySST arXiv:2305.18649v1 [cs.RO]. Specifically, cHyRRT, an +implementation of the HyRRT algorithm, is capable of generating a solution to a +motion planning problem for hybrid systems with probabilistic completeness, +while cHySST, an implementation of the asymptotically near-optimal HySST +algorithm, is capable of computing a trajectory to solve the optimal motion +planning problem for hybrid systems. cHyRRT is suitable for motion planning +problems where an optimal solution is not required, whereas cHySST is suitable +for such problems that prefer optimal solutions, within all feasible solutions. +The structure, components, and usage of the two tools are described. Examples +are included to illustrate the main capabilities of the toolbox. + +
+
+ comment: This paper has 26 pages and has been submitted to 28th ACM + International Conference on Hybrid Systems: Computation and Control +
+
+
+
+
+ + ☆ Enabling steep slope walking on Husky using reduced order modeling and + quadratic programming + + +
+ Wing-assisted inclined running (WAIR) observed in some young birds, is an +attractive maneuver that can be extended to legged aerial systems. This study +proposes a control method using a modified Variable Length Inverted Pendulum +(VLIP) by assuming a fixed zero moment point and thruster forces collocated at +the center of mass of the pendulum. A QP MPC is used to find the optimal ground +reaction forces and thruster forces to track a reference position and velocity +trajectory. Simulation results of this VLIP model on a slope of 40 degrees is +maintained and shows thruster forces that can be obtained through posture +manipulation. The simulation also provides insight to how the combined efforts +of the thrusters and the tractive forces from the legs make WAIR possible in +thruster-assisted legged systems. + +
+
+ comment: 6 pages, 8 figures, submitted to the Humanoids 2025 conference +
+
+
+
+
+ + ☆ Assistive Control of Knee Exoskeletons for Human Walking on Granular + Terrains + + +
+ Human walkers traverse diverse environments and demonstrate different gait +locomotion and energy cost on granular terrains compared to solid ground. We +present a stiffness-based model predictive control approach of knee exoskeleton +assistance on sand. The gait and locomotion comparison is first discussed for +human walkers on sand and solid ground. A machine learning-based estimation +scheme is then presented to predict the ground reaction forces (GRFs) for human +walkers on different terrains in real time. Built on the estimated GRFs and +human joint torques, a knee exoskeleton controller is designed to provide +assistive torque through a model predictive stiffness control scheme. We +conduct indoor and outdoor experiments to validate the modeling and control +design and their performance. The experiments demonstrate the major muscle +activation and metabolic reductions by respectively 15% and 3.7% under the +assistive exoskeleton control of human walking on sand. + +
+
+ comment: Eight pages, eleven figures, submitted to IEEE Robotics and + Automation Letters +
+
+
+
+
+ + ☆ High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous + Electric Vehicles + + +
+ Executing drift maneuvers during high-speed cornering presents significant +challenges for autonomous vehicles, yet offers the potential to minimize +turning time and enhance driving dynamics. While reinforcement learning (RL) +has shown promising results in simulated environments, discrepancies between +simulations and real-world conditions have limited its practical deployment. +This study introduces an innovative control framework that integrates +trajectory optimization with drift maneuvers, aiming to improve the algorithm's +adaptability for real-vehicle implementation. We leveraged Bezier-based +pre-trajectory optimization to enhance rewards and optimize the controller +through Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated +environment. For real-world deployment, we implement a hybrid RL-MPC fusion +mechanism, where TD3-derived maneuvers serve as primary inputs for a Model +Predictive Controller (MPC). This integration enables precise real-time +tracking of the optimal trajectory, with MPC providing corrective inputs to +bridge the gap between simulation and reality. The efficacy of this method is +validated through real-vehicle tests on consumer-grade electric vehicles, +focusing on drift U-turns and drift right-angle turns. The control outcomes of +these real-vehicle tests are thoroughly documented in the paper, supported by +supplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this +study is the first to deploy and apply an RL-based transient drift cornering +algorithm on consumer-grade electric vehicles. + +
+
+ comment: In the process of being submitted to the Journal of IEEE Transactions + on Industrial Electronics +
+
+
+
+
+ + ☆ Joint-Space Control of a Structurally Elastic Humanoid Robot + + +
+ In this work, the joint-control strategy is presented for the humanoid robot, +PANDORA, whose structural components are designed to be compliant. As opposed +to contemporary approaches which design the elasticity internal to the actuator +housing, PANDORA's structural components are designed to be compliant under +load or, in other words, structurally elastic. To maintain the rapid design +benefit of additive manufacturing, this joint control strategy employs a +disturbance observer (DOB) modeled from an ideal elastic actuator. This robust +controller treats the model variation from the structurally elastic components +as a disturbance and eliminates the need for system identification of the 3D +printed parts. This enables mechanical design engineers to iterate on the 3D +printed linkages without requiring consistent tuning from the joint controller. +Two sets of hardware results are presented for validating the controller. The +first set of results are conducted on an ideal elastic actuator testbed that +drives an unmodeled, 1 DoF weighted pendulum with a 10 kg mass. The results +support the claim that the DOB can handle significant model variation. The +second set of results is from a robust balancing experiment conducted on the 12 +DoF lower body of PANDORA. The robot maintains balance while an operator +applies 50 N pushes to the pelvis, where the actuator tracking results are +presented for the left leg. + +
+
+
+
+
+ + ☆ Integrating Active Sensing and Rearrangement Planning for Efficient + Object Retrieval from Unknown, Confined, Cluttered Environments + + +
+ Retrieving target objects from unknown, confined spaces remains a challenging +task that requires integrated, task-driven active sensing and rearrangement +planning. Previous approaches have independently addressed active sensing and +rearrangement planning, limiting their practicality in real-world scenarios. +This paper presents a new, integrated heuristic-based active sensing and +Monte-Carlo Tree Search (MCTS)-based retrieval planning approach. These +components provide feedback to one another to actively sense critical, +unobserved areas suitable for the retrieval planner to plan a sequence for +relocating path-blocking obstacles and a collision-free trajectory for +retrieving the target object. We demonstrate the effectiveness of our approach +using a robot arm equipped with an in-hand camera in both simulated and +real-world confined, cluttered scenarios. Our framework is compared against +various state-of-the-art methods. The results indicate that our proposed +approach outperforms baseline methods by a significant margin in terms of the +success rate, the object rearrangement planning time consumption and the number +of planning trials before successfully retrieving the target. Videos can be +found at https://youtu.be/tea7I-3RtV0. + +
+
+
+
+
+ + ☆ Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via + Skill Library and Tactile Representation + + +
+ Deploying robots in open-world environments involves complex tasks +characterized by long sequences and rich interactions, necessitating efficient +transfer of robotic skills across diverse and complex scenarios. To address +this challenge, we propose a skill library framework based on knowledge graphs, +which endows robots with high-level skill awareness and spatial semantic +understanding. The framework hierarchically organizes operational knowledge by +constructing a "task graph" and a "scene graph" to represent task and scene +semantic information, respectively. We introduce a "state graph" to facilitate +interaction between high-level task planning and low-level scene information. +Furthermore, we propose a hierarchical transfer framework for operational +skills. At the task level, the framework integrates contextual learning and +chain-of-thought prompting within a four-stage prompt paradigm, leveraging +large language models' (LLMs) reasoning and generalization capabilities to +achieve task-level subtask sequence transfer. At the motion level, an adaptive +trajectory transfer method is developed using the A* algorithm and the skill +library, enabling motion-level adaptive trajectory transfer. At the physical +level, we introduce an adaptive contour extraction and posture perception +method based on tactile perception. This method dynamically obtains +high-precision contour and posture information from visual-tactile texture data +and adjusts transferred skills, such as contact positions and postures, to +ensure effectiveness in new environments. Experimental results validate the +effectiveness of the proposed methods. Project +website: https://github.com/MingchaoQi/skill_transfer + +
+
+
+
+
+ + ☆ TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the + Physical World + + +
+ Robotic manipulation refers to the autonomous handling and interaction of +robots with objects using advanced techniques in robotics and artificial +intelligence. The advent of powerful tools such as large language models (LLMs) +and large vision-language models (LVLMs) has significantly enhanced the +capabilities of these robots in environmental perception and decision-making. +However, the introduction of these intelligent agents has led to security +threats such as jailbreak attacks and adversarial attacks. + In this research, we take a further step by proposing a backdoor attack +specifically targeting robotic manipulation and, for the first time, +implementing backdoor attack in the physical world. By embedding a backdoor +visual language model into the visual perception module within the robotic +system, we successfully mislead the robotic arm's operation in the physical +world, given the presence of common items as triggers. Experimental evaluations +in the physical world demonstrate the effectiveness of the proposed backdoor +attack. + +
+
+ comment: Initial version with preliminary results. We welcome any feedback or + suggestions +
+
+
+
+
+ + ☆ The ethical landscape of robot-assisted surgery. A systematic review + + +
+ Background: Robot-assisted surgery has been widely adopted in recent years. +However, compared to other health technologies operating in close proximity to +patients in a vulnerable state, ethical issues of robot-assisted surgery have +received less attention. Against the background of increasing automation that +is expected to raise new ethical issues, this systematic review aims to map +the state of the ethical debate in this field. + Methods: A protocol was registered in the international prospective register +of systematic reviews (PROSPERO CRD42023397951). Medline via PubMed, EMBASE, +CINAHL, Philosophers' Index, IEEE Xplore, Web of Science (Core Collection), +Scopus and Google Scholar were searched in January 2023. Screening, extraction, +and analysis were conducted independently by two authors. A qualitative +narrative synthesis was performed. + Results: Out of 1,723 records, 66 records were included in the final dataset. +Seven major strands of the ethical debate emerged during analysis. These +include questions of harms and benefits, responsibility and control, +professional-patient relationship, ethical issues in surgical training and +learning, justice, translational questions, and economic considerations. + Discussion: The identified themes testify to a broad range of different and +differing ethical issues requiring careful deliberation and integration into +the surgical ethos. Looking forward, we argue that a different perspective in +addressing robotic surgical devices might be helpful to consider upcoming +challenges of automation. + +
+
+ comment: 25 pages, 3 tables, 2 figures +
+
+
+
+
+ + ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ☆ VLN-Game: Vision-Language Equilibrium Search for Zero-Shot Semantic + Navigation + + +
+ Following human instructions to explore and search for a specified target in +an unfamiliar environment is a crucial skill for mobile service robots. Most of +the previous works on object goal navigation have typically focused on a single +input modality as the target, which may lead to limited consideration of +language descriptions containing detailed attributes and spatial relationships. +To address this limitation, we propose VLN-Game, a novel zero-shot framework +for visual target navigation that can process object names and descriptive +language targets effectively. To be more precise, our approach constructs a 3D +object-centric spatial map by integrating pre-trained visual-language features +with a 3D reconstruction of the physical environment. Then, the framework +identifies the most promising areas to explore in search of potential target +candidates. A game-theoretic vision language model is employed to determine +which target best matches the given language description. Experiments conducted +on the Habitat-Matterport 3D (HM3D) dataset demonstrate that the proposed +framework achieves state-of-the-art performance in both object goal navigation +and language-based navigation tasks. Moreover, we show that VLN-Game can be +easily deployed on real-world robots. The success of VLN-Game highlights the +promising potential of using game-theoretic methods with compact +vision-language models to advance decision-making capabilities in robotic +systems. The supplementary video and code can be accessed via the following +link: https://sites.google.com/view/vln-game. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ Performance evaluation of a ROS2 based Automated Driving System + + +
+ Automated driving is currently a prominent area of scientific work. In the +future, highly automated driving and new Advanced Driver Assistance Systems +will become reality. While Advanced Driver Assistance Systems and automated +driving functions for certain domains are already commercially available, +ubiquitous automated driving in complex scenarios remains a subject of ongoing +research. Contrarily to single-purpose Electronic Control Units, the software +for automated driving is often executed on high performance PCs. The Robot +Operating System 2 (ROS2) is commonly used to connect components in an +automated driving system. Due to the time critical nature of automated driving +systems, the performance of the framework is especially important. In this +paper, a thorough performance evaluation of ROS2 is conducted, both in terms of +timeliness and error rate. The results show that ROS2 is a suitable framework +for automated driving systems. + +
+
+ comment: Published and presented at VEHITS 2024, Proceedings of the 10th + International Conference on Vehicle Technology and Intelligent Transport + Systems - VEHITS; 2024 +
+
+
+
+
+ + ☆ Closed-loop multi-step planning with innate physics knowledge + + +
+ We present a hierarchical framework to solve robot planning as an input +control problem. At the lowest level are temporary closed control loops +("tasks"), each representing a behaviour, contingent on a specific sensory +input and therefore temporary. At the highest level, a supervising +"Configurator" directs task creation and termination. Here resides "core" +knowledge as a physics engine, where sequences of tasks can be simulated. The +Configurator encodes and interprets simulation results, based on which it can +choose a sequence of tasks as a plan. We implement this framework on a real +robot and test it in an overtaking scenario as proof-of-concept. + +
+
+
+
+
+ + ☆ Physics Encoded Blocks in Residual Neural Network Architectures for + Digital Twin Models + + +
+ Physics Informed Machine Learning has emerged as a popular approach in +modelling and simulation for digital twins to generate accurate models of +processes and behaviours of real-world systems. However, despite their success +in generating accurate and reliable models, the existing methods either use +simple regularizations in loss functions to offer limited physics integration +or are too specific in architectural definitions to be generalized to a wide +variety of physical systems. This paper presents a generic approach based on a +novel physics-encoded residual neural network architecture to combine +data-driven and physics-based analytical models to address these limitations. +Our method combines physics blocks as mathematical operators from physics-based +models with learning blocks comprising feed-forward layers. Intermediate +residual blocks are incorporated for stable gradient flow as they train on +physical system observation data. This way, the model learns to comply with the +geometric and kinematic aspects of the physical system. Compared to +conventional neural network-based methods, our method improves generalizability +with substantially low data requirements and model complexity in terms of +parameters, especially in scenarios where prior physics knowledge is either +elementary or incomplete. We investigate our approach in two application +domains. The first is a basic robotic motion model using Euler Lagrangian +equations of motion as physics prior. The second application is a complex +scenario of a steering model for a self-driving vehicle in a simulation. In +both applications, our method outperforms both conventional neural network +based approaches as well as state-of-the-art Physics Informed Machine Learning +methods. + +
+
+
+
+
+ + ☆ Robust State Estimation for Legged Robots with Dual Beta Kalman Filter + + +
+ Existing state estimation algorithms for legged robots that rely on +proprioceptive sensors often overlook foot slippage and leg deformation in the +physical world, leading to large estimation errors. To address this limitation, +we propose a comprehensive measurement model that accounts for both foot +slippage and variable leg length by analyzing the relative motion between foot +contact points and the robot's body center. We show that leg length is an +observable quantity, meaning that its value can be explicitly inferred by +designing an auxiliary filter. To this end, we introduce a dual estimation +framework that iteratively employs a parameter filter to estimate the leg +length parameters and a state filter to estimate the robot's state. To prevent +error accumulation in this iterative framework, we construct a partial +measurement model for the parameter filter using the leg static equation. This +approach ensures that leg length estimation relies solely on joint torques and +foot contact forces, avoiding the influence of state estimation errors on the +parameter estimation. Unlike leg length which can be directly estimated, foot +slippage cannot be measured directly with the current sensor configuration. +However, since foot slippage occurs at a low frequency, it can be treated as +outliers in the measurement data. To mitigate the impact of these outliers, we +propose the beta Kalman filter (beta KF), which redefines the estimation loss +in canonical Kalman filtering using beta divergence. This divergence can assign +low weights to outliers in an adaptive manner, thereby enhancing the robustness +of the estimation algorithm. These techniques together form the dual +beta-Kalman filter (Dual beta KF), a novel algorithm for robust state +estimation in legged robots. Experimental results on the Unitree GO2 robot +demonstrate that the Dual beta KF significantly outperforms state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Exploring Emerging Trends and Research Opportunities in Visual Place + Recognition ICRA + + +
+ Visual-based recognition, e.g., image classification, object detection, etc., +is a long-standing challenge in computer vision and robotics communities. +Concerning the roboticists, since the knowledge of the environment is a +prerequisite for complex navigation tasks, visual place recognition is vital +for most localization implementations or re-localization and loop closure +detection pipelines within simultaneous localization and mapping (SLAM). More +specifically, it corresponds to the system's ability to identify and match a +previously visited location using computer vision tools. Towards developing +novel techniques with enhanced accuracy and robustness, while motivated by the +success presented in natural language processing methods, researchers have +recently turned their attention to vision-language models, which integrate +visual and textual data. + +
+
+ comment: 2 pages, 1 figure. 40th Anniversary of the IEEE Conference on + Robotics and Automation (ICRA@40), Rotterdam, Netherlands, September 23-26, + 2024 +
+
+
+
+
+ + ☆ IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet + Videos NeurIPS 2024 + + +
+ Shape assembly is a ubiquitous task in daily life, integral for constructing +complex 3D structures like IKEA furniture. While significant progress has been +made in developing autonomous agents for shape assembly, existing datasets have +not yet tackled the 4D grounding of assembly instructions in videos, essential +for a holistic understanding of assembly in 3D space over time. We introduce +IKEA Video Manuals, a dataset that features 3D models of furniture parts, +instructional manuals, assembly videos from the Internet, and most importantly, +annotations of dense spatio-temporal alignments between these data modalities. +To demonstrate the utility of IKEA Video Manuals, we present five applications +essential for shape assembly: assembly plan generation, part-conditioned +segmentation, part-conditioned pose estimation, video object segmentation, and +furniture assembly based on instructional video manuals. For each application, +we provide evaluation metrics and baseline methods. Through experiments on our +annotated data, we highlight many challenges in grounding assembly instructions +in videos to improve shape assembly, including handling occlusions, varying +viewpoints, and extended assembly sequences. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Bridging the Resource Gap: Deploying Advanced Imitation Learning Models + onto Affordable Embedded Platforms + + +
+ Advanced imitation learning with structures like the transformer is +increasingly demonstrating its advantages in robotics. However, deploying these +large-scale models on embedded platforms remains a major challenge. In this +paper, we propose a pipeline that facilitates the migration of advanced +imitation learning algorithms to edge devices. The process is achieved via an +efficient model compression method and a practical asynchronous parallel method +Temporal Ensemble with Dropped Actions (TEDA) that enhances the smoothness of +operations. To show the efficiency of the proposed pipeline, large-scale +imitation learning models are trained on a server and deployed on an edge +device to complete various manipulation tasks. + +
+
+ comment: Accepted by the 2024 IEEE International Conference on Robotics and + Biomimetics (IEEE ROBIO 2024) +
+
+
+
+
+ + ☆ Extended Neural Contractive Dynamical Systems: On Multiple Tasks and + Riemannian Safety Regions + + +
+ Stability guarantees are crucial when ensuring that a fully autonomous robot +does not take undesirable or potentially harmful actions. We recently proposed +the Neural Contractive Dynamical Systems (NCDS), which is a neural network +architecture that guarantees contractive stability. With this, +learning-from-demonstrations approaches can trivially provide stability +guarantees. However, our early work left several unanswered questions, which we +here address. Beyond providing an in-depth explanation of NCDS, this paper +extends the framework with more careful regularization, a conditional variant +of the framework for handling multiple tasks, and an uncertainty-driven +approach to latent obstacle avoidance. Experiments verify that the developed +system has the flexibility of ordinary neural networks while providing the +stability guarantees needed for autonomous robotics. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.09352 +
+
+
+
+
+ + ☆ InstruGen: Automatic Instruction Generation for Vision-and-Language + Navigation Via Large Multimodal Models + + +
+ Recent research on Vision-and-Language Navigation (VLN) indicates that agents +suffer from poor generalization in unseen environments due to the lack of +realistic training environments and high-quality path-instruction pairs. Most +existing methods for constructing realistic navigation scenes have high costs, +and the extension of instructions mainly relies on predefined templates or +rules, lacking adaptability. To alleviate the issue, we propose InstruGen, a +VLN path-instruction pairs generation paradigm. Specifically, we use YouTube +house tour videos as realistic navigation scenes and leverage the powerful +visual understanding and generation abilities of large multimodal models (LMMs) +to automatically generate diverse and high-quality VLN path-instruction pairs. +Our method generates navigation instructions with different granularities and +achieves fine-grained alignment between instructions and visual observations, +which was difficult to achieve with previous methods. Additionally, we design a +multi-stage verification mechanism to reduce hallucinations and inconsistency +of LMMs. Experimental results demonstrate that agents trained with +path-instruction pairs generated by InstruGen achieve state-of-the-art +performance on the R2R and RxR benchmarks, particularly in unseen environments. +Code is available at https://github.com/yanyu0526/InstruGen. + +
+
+
+
+
+ + ☆ SayComply: Grounding Field Robotic Tasks in Operational Compliance + through Retrieval-Based Language Models + + +
+ This paper addresses the problem of task planning for robots that must comply +with operational manuals in real-world settings. Task planning under these +constraints is essential for enabling autonomous robot operation in domains +that require adherence to domain-specific knowledge. Current methods for +generating robot goals and plans rely on common sense knowledge encoded in +large language models. However, these models lack grounding of robot plans to +domain-specific knowledge and are not easily transferable between multiple +sites or customers with different compliance needs. In this work, we present +SayComply, which enables grounding robotic task planning with operational +compliance using retrieval-based language models. We design a hierarchical +database of operational, environment, and robot embodiment manuals and +procedures to enable efficient retrieval of the relevant context under the +limited context length of the LLMs. We then design a task planner using a +tree-based retrieval augmented generation (RAG) technique to generate robot +tasks that follow user instructions while simultaneously complying with the +domain knowledge in the database. We demonstrate the benefits of our approach +through simulations and hardware experiments in real-world scenarios that +require precise context retrieval across various types of context, +outperforming the standard RAG method. Our approach bridges the gap in +deploying robots that consistently adhere to operational protocols, offering a +scalable and edge-deployable solution for ensuring compliance across varied and +complex real-world environments. Project website: saycomply.github.io. + +
+
+
+
+
+ + ☆ Design a New Pulling Gear for the Automated Pant Bottom Hem Sewing + Machine + + +
+ Automated machinery design for garment manufacturing is essential for +improving productivity, consistency, and quality. This paper focuses on the +development of new pulling gear for automated pant bottom hem sewing machines. +Traditionally, these machines require manual intervention to guide the bottom +hem sewing process, which often leads to inconsistent stitch quality and +alignment. While twin-needle sewing machines can create twin lines for the +bottom hem, they typically lack sufficient pulling force to adequately handle +the fabric of the pants' bottom hem. The innovative design of the pulling gear +aims to address this issue by providing the necessary pulling force for the +bottom hem of eyelet pants. The research and design discussed in this article +seek to solve technical challenges, eliminate the need for skilled manual +operators, and enhance overall productivity. This improvement ensures smooth +and precise feeding of fabric pieces in the automated twin needle sewing +machine, ultimately improving the consistency and quality of the stitching. By +integrating this innovation, garment manufacturers can boost productivity, +reduce reliance on manual skilful labour, and optimize the output of the +production process, thereby reaping the benefits of automation in the garment +manufacturing industry. + +
+
+ comment: 9 pages,11 figures, preprint to International Research Journal of + Modernization in Engineering Technology and Science +
+
+
+
+
+ + ☆ DrivingSphere: Building a High-fidelity 4D World for Closed-loop + Simulation + + +
+ Autonomous driving evaluation requires simulation environments that closely +replicate actual road conditions, including real-world sensory data and +responsive feedback loops. However, many existing simulations need to predict +waypoints along fixed routes on public datasets or synthetic photorealistic +data, i.e., open-loop simulation usually lacks the ability to assess dynamic +decision-making. While the recent efforts of closed-loop simulation offer +feedback-driven environments, they cannot process visual sensor inputs or +produce outputs that differ from real-world data. To address these challenges, +we propose DrivingSphere, a realistic and closed-loop simulation framework. Its +core idea is to build a 4D world representation and generate real-life and +controllable driving scenarios. Specifically, our framework includes a Dynamic +Environment Composition module that constructs a detailed 4D driving world with +a format of occupancy equipping with static backgrounds and dynamic objects, +and a Visual Scene Synthesis module that transforms this data into +high-fidelity, multi-view video outputs, ensuring spatial and temporal +consistency. By providing a dynamic and realistic simulation environment, +DrivingSphere enables comprehensive testing and validation of autonomous +driving algorithms, ultimately advancing the development of more reliable +autonomous cars. The benchmark will be publicly released. + +
+
+ comment: https://yanty123.github.io/DrivingSphere/ +
+
+
+
+
+ + ☆ Conjugate Momentum-Based Estimation of External Forces for Bio-Inspired + Morphing Wing Flight + + +
+ Dynamic morphing wing flights present significant challenges in accurately +estimating external forces due to complex interactions between aerodynamics, +rapid wing movements, and external disturbances. Traditional force estimation +methods often struggle with unpredictable disturbances like wind gusts or +unmodeled impacts that can destabilize flight in real-world scenarios. This +paper addresses these challenges by implementing a Conjugate Momentum-based +Observer, which effectively estimates and manages unknown external forces +acting on the Aerobat, a bio-inspired robotic platform with dynamically +morphing wings. Through simulations, the observer demonstrates its capability +to accurately detect and quantify external forces, even in the presence of +Gaussian noise and abrupt impulse inputs. The results validate the robustness +of the method, showing improved stability and control of the Aerobat in dynamic +environments. This research contributes to advancements in bio-inspired +robotics by enhancing force estimation for flapping-wing systems, with +potential applications in autonomous aerial navigation and robust flight +control. + +
+
+
+
+
+ + ☆ Optimization free control and ground force estimation with momentum + observer for a multimodal legged aerial robot + + +
+ Legged-aerial multimodal robots can make the most of both legged and aerial +systems. In this paper, we propose a control framework that bypasses heavy +onboard computers by using an optimization-free Explicit Reference Governor +that incorporates external thruster forces from an attitude controller. Ground +reaction forces are maintained within friction cone constraints using costly +optimization solvers, but the ERG framework filters applied velocity references +that ensure no slippage at the foot end. We also propose a Conjugate momentum +observer, that is widely used in Disturbance Observation to estimate ground +reaction forces and compare its efficacy against a constrained model in +estimating ground reaction forces in a reduced-order simulation of Husky. + +
+
+ comment: 6 pages, 10 figures, submitted to American Control Conference 2025 +
+
+
+
+
+ + ☆ Operator Splitting Covariance Steering for Safe Stochastic Nonlinear + Control + + +
+ Most robotics applications are typically accompanied with safety restrictions +that need to be satisfied with a high degree of confidence even in environments +under uncertainty. Controlling the state distribution of a system and enforcing +such specifications as distribution constraints is a promising approach for +meeting such requirements. In this direction, covariance steering (CS) is an +increasingly popular stochastic optimal control (SOC) framework for designing +safe controllers via explicit constraints on the system covariance. +Nevertheless, a major challenge in applying CS methods to systems with the +nonlinear dynamics and chance constraints common in robotics is that the +approximations needed are conservative and highly sensitive to the point of +approximation. This can cause sequential convex programming methods to converge +to poor local minima or incorrectly report problems as infeasible due to +shifting constraints. This paper presents a novel algorithm for solving +chance-constrained nonlinear CS problems that directly addresses this +challenge. Specifically, we propose an operator-splitting approach that +temporarily separates the main problem into subproblems that can be solved in +parallel. The benefit of this relaxation lies in the fact that it does not +require all iterates to satisfy all constraints simultaneously prior to +convergence, thus enhancing the exploration capabilities of the algorithm for +finding better solutions. Simulation results verify the ability of the proposed +method to find higher quality solutions under stricter safety constraints than +standard methods on a variety of robotic systems. Finally, the applicability of +the algorithm on real systems is confirmed through hardware demonstrations. + +
+
+
+
+
+ + ☆ Simultaneous Ground Reaction Force and State Estimation via Constrained + Moving Horizon Estimation + + +
+ Accurate ground reaction force (GRF) estimation can significantly improve the +adaptability of legged robots in various real-world applications. For instance, +with estimated GRF and contact kinematics, the locomotion control and planning +assist the robot in overcoming uncertain terrains. The canonical momentum-based +methods, formulated as nonlinear observers, do not fully address the noisy +measurements and the dependence between floating base states and the +generalized momentum dynamics. In this paper, we present a simultaneous ground +reaction force and state estimation framework for legged robots, which +systematically addresses the sensor noise and the coupling between states and +dynamics. With the floating base orientation estimated separately, a +decentralized Moving Horizon Estimation (MHE) method is implemented to fuse the +robot dynamics, proprioceptive sensors, exteroceptive sensors, and +deterministic contact complementarity constraints in a convex windowed +optimization. The proposed method is shown to be capable of providing accurate +GRF and state estimation on several legged robots, including the open-source +educational planar bipedal robot STRIDE and quadrupedal robot Unitree Go1, with +a frequency of 200Hz and a past time window of 0.04s. + +
+
+
+
+
+ + ☆ Fast Convergence of Softmax Policy Mirror Ascent + + +
+ Natural policy gradient (NPG) is a common policy optimization algorithm and +can be viewed as mirror ascent in the space of probabilities. Recently, Vaswani +et al. [2021] introduced a policy gradient method that corresponds to mirror +ascent in the dual space of logits. We refine this algorithm, removing its need +for a normalization across actions and analyze the resulting method (referred +to as SPMA). For tabular MDPs, we prove that SPMA with a constant step-size +matches the linear convergence of NPG and achieves a faster convergence than +constant step-size (accelerated) softmax policy gradient. To handle large +state-action spaces, we extend SPMA to use a log-linear policy +parameterization. Unlike that for NPG, generalizing SPMA to the linear function +approximation (FA) setting does not require compatible function approximation. +Unlike MDPO, a practical generalization of NPG, SPMA with linear FA only +requires solving convex softmax classification problems. We prove that SPMA +achieves linear convergence to the neighbourhood of the optimal value function. +We extend SPMA to handle non-linear FA and evaluate its empirical performance +on the MuJoCo and Atari benchmarks. Our results demonstrate that SPMA +consistently achieves similar or better performance compared to MDPO, PPO and +TRPO. + +
+
+
+
+
+ + ☆ On-the-Go Path Planning and Repair in Static and Dynamic Scenarios + + +
+ Autonomous systems, including robots and drones, face significant challenges +when navigating through dynamic environments, particularly within urban +settings where obstacles, fluctuating traffic, and pedestrian activity are +constantly shifting. Although traditional motion planning algorithms like the +wavefront planner and gradient descent planner, which use potential functions, +work well in static environments, they fall short in situations where the +environment is continuously changing. This work proposes a dynamic, real-time +path planning approach specifically designed for autonomous systems, allowing +them to effectively avoid static and dynamic obstacles, thereby enhancing their +overall adaptability. The approach integrates the efficiency of conventional +planners with the ability to make rapid adjustments in response to moving +obstacles and environmental changes. The simulation results discussed in this +article demonstrate the effectiveness of the proposed method and its +suitability for robotic path planning in both known and unknown environments, +including those involving mobile objects, agents, or potential threats. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ HPA-MPC: Hybrid Perception-Aware Nonlinear Model Predictive Control for + Quadrotors with Suspended Loads + + +
+ Quadrotors equipped with cable-suspended loads represent a versatile, +low-cost, and energy efficient solution for aerial transportation, +construction, and manipulation tasks. However, their real-world deployment is +hindered by several challenges. The system is difficult to control because it +is nonlinear, underactuated, involves hybrid dynamics due to slack-taut cable +modes, and evolves on complex configuration spaces. Additionally, it is crucial +to estimate the full state and the cable's mode transitions in real-time using +on-board sensors and computation. To address these challenges, we present a +novel Hybrid Perception-Aware Nonlinear Model Predictive Control (HPA-MPC) +control approach for quadrotors with suspended loads. Our method considers the +complete hybrid system dynamics and includes a perception-aware cost to ensure +the payload remains visible in the robot's camera during navigation. +Furthermore, the full state and hybrid dynamics' transitions are estimated +using onboard sensors. Experimental results demonstrate that our approach +enables stable load tracking control, even during slack-taut transitions, and +operates entirely onboard. The experiments also show that the perception-aware +term effectively keeps the payload in the robot's camera field of view when a +human operator interacts with the load. + +
+
+ comment: Accepted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ Sequential Gaussian Variational Inference for Nonlinear State Estimation + and Its Application in Robot Navigation + + +
+ Probabilistic state estimation is essential for robots navigating uncertain +environments. Accurately and efficiently managing uncertainty in estimated +states is key to robust robotic operation. However, nonlinearities in robotic +platforms pose significant challenges that require advanced estimation +techniques. Gaussian variational inference (GVI) offers an optimization +perspective on the estimation problem, providing analytically tractable +solutions and efficiencies derived from the geometry of Gaussian space. We +propose a Sequential Gaussian Variational Inference (S-GVI) method to address +nonlinearity and provide efficient sequential inference processes. Our approach +integrates sequential Bayesian principles into the GVI framework, which are +addressed using statistical approximations and gradient updates on the +information geometry. Validations through simulations and real-world +experiments demonstrate significant improvements in state estimation over the +Maximum A Posteriori (MAP) estimation method. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Effective Virtual Reality Teleoperation of an Upper-body Humanoid with + Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision + Avoidance + + +
+ We present an approach for retargeting off-the-shelf Virtual Reality (VR) +trackers to effectively teleoperate an upper-body humanoid while ensuring +self-collision-free motions. Key to the effectiveness was the proper assignment +of trackers to joint sets via modified task Jacobians and relaxed barrier +functions for self-collision avoidance. The approach was validated on +Apptronik's Astro hardware by demonstrating manipulation capabilities on a +table-top environment with pick-and-place box packing and a two-handed box pick +up and handover task. + +
+
+ comment: First Prize Winner of Horizons of an extended robotics reality + Workshop at International Conference on Intelligent Robots and Systems, 2022 +
+
+
+
+
+ + ♻ ☆ RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual + Dexterous Robot Hands CoRL + + +
+ It has been a long-standing research goal to endow robot hands with +human-level dexterity. Bi-manual robot piano playing constitutes a task that +combines challenges from dynamic tasks, such as generating fast while precise +motions, with slower but contact-rich manipulation problems. Although +reinforcement learning based approaches have shown promising results in +single-task performance, these methods struggle in a multi-song setting. Our +work aims to close this gap and, thereby, enable imitation learning approaches +for robot piano playing at scale. To this end, we introduce the Robot Piano 1 +Million (RP1M) dataset, containing bi-manual robot piano playing motion data of +more than one million trajectories. We formulate finger placements as an +optimal transport problem, thus, enabling automatic annotation of vast amounts +of unlabeled songs. Benchmarking existing imitation learning approaches shows +that such approaches reach state-of-the-art robot piano playing performance by +leveraging RP1M. + +
+
+ comment: Accepted by Conference on Robot Learning (CoRL) 2024. Project + Website: https://rp1m.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning Spatial Bimanual Action Models Based on Affordance Regions and + Human Demonstrations + + +
+ In this paper, we present a novel approach for learning bimanual manipulation +actions from human demonstration by extracting spatial constraints between +affordance regions, termed affordance constraints, of the objects involved. +Affordance regions are defined as object parts that provide interaction +possibilities to an agent. For example, the bottom of a bottle affords the +object to be placed on a surface, while its spout affords the contained liquid +to be poured. We propose a novel approach to learn changes of affordance +constraints in human demonstration to construct spatial bimanual action models +representing object interactions. To exploit the information encoded in these +spatial bimanual action models, we formulate an optimization problem to +determine optimal object configurations across multiple execution keypoints +while taking into account the initial scene, the learned affordance +constraints, and the robot's kinematics. We evaluate the approach in simulation +with two example tasks (pouring drinks and rolling dough) and compare three +different definitions of affordance constraints: (i) component-wise distances +between affordance regions in Cartesian space, (ii) component-wise distances +between affordance regions in cylindrical space, and (iii) degrees of +satisfaction of manually defined symbolic spatial affordance constraints. + +
+
+ comment: 8 pages, accepted for publication at Humanoids 2024 - Copyright IEEE +
+
+
+
+
+ + ♻ ☆ Robotic Sensor Network: Achieving Mutual Communication Control + Assistance With Fast Cross-Layer Optimization + + +
+ Robotic sensor network (RSN) is an emerging paradigm that harvests data from +remote sensors adopting mobile robots. However, communication and control +functionalities in RSNs are interdependent, for which existing approaches +become inefficient, as they plan robot trajectories merely based on +unidirectional impact between communication and control. This paper proposes +the concept of mutual communication control assistance (MCCA), which leverages +a model predictive communication and control (MPC2) design for intertwined +optimization of motion-assisted communication and communication-assisted +collision avoidance. The MPC2 problem jointly optimizes the cross-layer +variables of sensor powers and robot actions, and is solved by alternating +optimization, strong duality, and cross-horizon minorization maximization in +real time. This approach contrasts with conventional communication control +co-design methods that calculate an offline non-executable trajectory. +Experiments in a high-fidelity RSN simulator demonstrate that the proposed MCCA +outperforms various benchmarks in terms of communication efficiency and +navigation time. + +
+
+ comment: 5 pages, 6 figures, to appear in IEEE Wireless Communications Letters +
+
+
+
+
+ + ♻ ☆ SEEK: Semantic Reasoning for Object Goal Navigation in Real World + Inspection Tasks + + +
+ This paper addresses the problem of object-goal navigation in autonomous +inspections in real-world environments. Object-goal navigation is crucial to +enable effective inspections in various settings, often requiring the robot to +identify the target object within a large search space. Current object +inspection methods fall short of human efficiency because they typically cannot +bootstrap prior and common sense knowledge as humans do. In this paper, we +introduce a framework that enables robots to use semantic knowledge from prior +spatial configurations of the environment and semantic common sense knowledge. +We propose SEEK (Semantic Reasoning for Object Inspection Tasks) that combines +semantic prior knowledge with the robot's observations to search for and +navigate toward target objects more efficiently. SEEK maintains two +representations: a Dynamic Scene Graph (DSG) and a Relational Semantic Network +(RSN). The RSN is a compact and practical model that estimates the probability +of finding the target object across spatial elements in the DSG. We propose a +novel probabilistic planning framework to search for the object using +relational semantic knowledge. Our simulation analyses demonstrate that SEEK +outperforms the classical planning and Large Language Models (LLMs)-based +methods that are examined in this study in terms of efficiency for object-goal +inspection tasks. We validated our approach on a physical legged robot in urban +environments, showcasing its practicality and effectiveness in real-world +inspection scenarios. + +
+
+
+
+
+ + ♻ ☆ Multi-modal Situated Reasoning in 3D Scenes NeurIPS 2024 + + +
+ Situation awareness is essential for understanding and reasoning about 3D +scenes in embodied AI agents. However, existing datasets and benchmarks for +situated understanding are limited in data modality, diversity, scale, and task +scope. To address these limitations, we propose Multi-modal Situated Question +Answering (MSQA), a large-scale multi-modal situated reasoning dataset, +scalably collected leveraging 3D scene graphs and vision-language models (VLMs) +across a diverse range of real-world 3D scenes. MSQA includes 251K situated +question-answering pairs across 9 distinct question categories, covering +complex scenarios within 3D scenes. We introduce a novel interleaved +multi-modal input setting in our benchmark to provide text, image, and point +cloud for situation and question description, resolving ambiguity in previous +single-modality convention (e.g., text). Additionally, we devise the +Multi-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models' +situated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN +highlight the limitations of existing vision-language models and underscore the +importance of handling multi-modal interleaved inputs and situation modeling. +Experiments on data scaling and cross-domain transfer further demonstrate the +efficacy of leveraging MSQA as a pre-training dataset for developing more +powerful situated reasoning models. + +
+
+ comment: Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page: + https://msr3d.github.io/ +
+
+
+
+
+ + ♻ ☆ Coverage Path Planning For Minimizing Expected Time to Search For an + Object With Continuous Sensing + + +
+ In this paper, we present several results of both theoretical and +practical interest. First, we propose the quota lawn mowing problem, an +extension of the classic lawn mowing problem in computational geometry, as +follows: given a quota of coverage, compute the shortest lawn mowing route to +achieve said quota. We give constant-factor approximations for the quota lawn +mowing problem. + Second, we investigate the expected detection time minimization problem in +geometric coverage path planning with local, continuous sensory information. We +provide the first approximation algorithm with provable error bounds with +pseudopolynomial running time. Our ideas also extend to another search +mechanism, namely visibility-based search, which is related to the watchman +route problem. We complement our theoretical analysis with some simple but +effective heuristics for finding an object in minimum expected time, on which +we provide simulation results. + +
+
+
+
+
+ + ♻ ☆ Autoregressive Action Sequence Learning for Robotic Manipulation + + +
+ Designing a universal policy architecture that performs well across diverse +robots and task configurations remains a key challenge. In this work, we +address this by representing robot actions as sequential data and generating +actions through autoregressive sequence modeling. Existing autoregressive +architectures generate end-effector waypoints sequentially as word tokens in +language modeling, which are limited to low-frequency control tasks. Unlike +language, robot actions are heterogeneous and often include continuous values +-- such as joint positions, 2D pixel coordinates, and end-effector poses -- +which are not easily suited for language-based modeling. Based on this insight, +we introduce a straightforward enhancement: we extend causal transformers' +single-token prediction to support predicting a variable number of tokens in a +single step through our Chunking Causal Transformer (CCT). This enhancement +enables robust performance across diverse tasks of various control frequencies, +greater efficiency by having fewer autoregression steps, and leads to a hybrid +action sequence design by mixing different types of actions and using a +different chunk size for each action type. Based on CCT, we propose the +Autoregressive Policy (ARP) architecture, which solves manipulation tasks by +generating hybrid action sequences. We evaluate ARP across diverse robotic +manipulation environments, including Push-T, ALOHA, and RLBench, and show that +ARP, as a universal architecture, outperforms the environment-specific +state-of-the-art in all tested benchmarks, while being more efficient in +computation and parameter sizes. Videos of our real robot demonstrations, all +source code and the pretrained models of ARP can be found at +http://github.com/mlzxy/arp. + +
+
+
+
+
+ + ♻ ☆ LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception + Network for Autonomous Driving + + +
+ LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR +perception has the largest body of literature after camera perception. However, +multi-task learning across tasks like detection, segmentation, and motion +estimation using LiDAR remains relatively unexplored, especially on +automotive-grade embedded platforms. We present a real-time multi-task +convolutional neural network for LiDAR-based object detection, semantics, and +motion segmentation. The unified architecture comprises a shared encoder and +task-specific decoders, enabling joint representation learning. We propose a +novel Semantic Weighting and Guidance (SWAG) module to transfer semantic +features for improved object detection selectively. Our heterogeneous training +scheme combines diverse datasets and exploits complementary cues between tasks. +The work provides the first embedded implementation unifying these key +perception tasks from LiDAR point clouds achieving 3ms latency on the embedded +NVIDIA Xavier platform. We achieve state-of-the-art results for two tasks, +semantic and motion segmentation, and close to state-of-the-art performance for +3D object detection. By maximizing hardware efficiency and leveraging +multi-task synergies, our method delivers an accurate and efficient solution +tailored for real-world automated driving deployment. Qualitative results can +be seen at https://youtu.be/H-hWRzv2lIY. + +
+
+ comment: Accepted for publication at IEEE Transactions on Intelligent + Transportation Systems +
+
+
+
+
+ + ♻ ☆ Homeostatic motion planning with innate physics knowledge + + +
+ Living organisms interact with their surroundings in a closed-loop fashion, +where sensory inputs dictate the initiation and termination of behaviours. Even +simple animals are able to develop and execute complex plans, which has not yet +been replicated in robotics using pure closed-loop input control. We propose a +solution to this problem by defining a set of discrete and temporary +closed-loop controllers, called "tasks", each representing a closed-loop +behaviour. We further introduce a supervisory module which has an innate +understanding of physics and causality, through which it can simulate the +execution of task sequences over time and store the results in a model of the +environment. On the basis of this model, plans can be made by chaining +temporary closed-loop controllers. The proposed framework was implemented for a +real robot and tested in two scenarios as proof of concept. + +
+
+
+
+
+ + ♻ ☆ Irrotational Contact Fields + + +
+ We present a framework for generating convex approximations of complex +contact models, incorporating experimentally validated models like Hunt & +Crossley coupled with Coulomb's law of friction alongside the principle of +maximum dissipation. Our approach is robust across a wide range of stiffness +values, making it suitable for both compliant surfaces and rigid +approximations. We evaluate these approximations across a wide variety of test +cases, detailing properties and limitations. We implement a fully +differentiable solution in the open-source robotics toolkit, Drake. Our novel +hybrid approach enables computation of gradients for complex geometric models +while reusing factorizations from contact resolution. We demonstrate robust +simulation of robotic tasks at interactive rates, with accurately resolved +stiction and contact transitions, supporting effective sim-to-real transfer. + +
+
+ comment: 16 pages, 26 figures. The supplemental video is available publicly at + https://youtu.be/FTUPYZ_8Xbk?si=MWndCUCGWMJsFnsO +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning-Based Model Matching to Reduce the Sim-Real Gap + in COBRA + + +
+ This paper employs a reinforcement learning-based model identification method +aimed at enhancing the accuracy of the dynamics for our snake robot, called +COBRA. Leveraging gradient information and iterative optimization, the proposed +approach refines the parameters of COBRA's dynamical model such as coefficient +of friction and actuator parameters using experimental and simulated data. +Experimental validation on the hardware platform demonstrates the efficacy of +the proposed approach, highlighting its potential to address sim-to-real gap in +robot implementation. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 136 + +
+
+
+ + ☆ UniHands: Unifying Various Wild-Collected Keypoints for Personalized + Hand Reconstruction + + +
+ Accurate hand motion capture and standardized 3D representation are essential +for various hand-related tasks. Collecting keypoints-only data, while efficient +and cost-effective, results in low-fidelity representations and lacks surface +information. Furthermore, data inconsistencies across sources challenge their +integration and use. We present UniHands, a novel method for creating +standardized yet personalized hand models from wild-collected keypoints from +diverse sources. Unlike existing neural implicit representation methods, +UniHands uses the widely-adopted parametric models MANO and NIMBLE, providing a +more scalable and versatile solution. It also derives unified hand joints from +the meshes, which facilitates seamless integration into various hand-related +tasks. Experiments on the FreiHAND and InterHand2.6M datasets demonstrate its +ability to precisely reconstruct hand mesh vertices and keypoints, effectively +capturing high-degree articulation motions. Empirical studies involving nine +participants show a clear preference for our unified joints over existing +configurations for accuracy and naturalism (p-value 0.016). + +
+
+
+
+
+ + ☆ Generative World Explorer + + +
+ Planning with partial observation is a central challenge in embodied AI. A +majority of prior works have tackled this challenge by developing agents that +physically explore their environment to update their beliefs about the world +state. In contrast, humans can $\textit{imagine}$ unseen parts of the world +through a mental exploration and $\textit{revise}$ their beliefs with imagined +observations. Such updated beliefs can allow them to make more informed +decisions, without necessitating the physical exploration of the world at all +times. To achieve this human-like ability, we introduce the $\textit{Generative +World Explorer (Genex)}$, an egocentric world exploration framework that allows +an agent to mentally explore a large-scale 3D world (e.g., urban scenes) and +acquire imagined observations to update its belief. This updated belief will +then help the agent to make a more informed decision at the current step. To +train $\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB. +Our experimental results demonstrate that (1) $\textit{Genex}$ can generate +high-quality and consistent observations during long-horizon exploration of a +large virtual physical world and (2) the beliefs updated with the generated +observations can inform an existing decision-making model (e.g., an LLM agent) +to make better plans. + +
+
+ comment: Website: generative-world-explorer.github.io +
+
+
+
+
+ + ☆ RoboGSim: A Real2Sim2Real Robotic Gaussian Splatting Simulator + + +
+ Efficient acquisition of real-world embodied data has been increasingly +critical. However, large-scale demonstrations captured by remote operation tend +to take extremely high costs and fail to scale up the data size in an efficient +manner. Sampling the episodes under a simulated environment is a promising way +for large-scale collection while existing simulators fail to achieve high-fidelity +modeling of texture and physics. To address these limitations, we introduce the +RoboGSim, a real2sim2real robotic simulator, powered by 3D Gaussian Splatting +and the physics engine. RoboGSim mainly includes four parts: Gaussian +Reconstructor, Digital Twins Builder, Scene Composer, and Interactive Engine. +It can synthesize the simulated data with novel views, objects, trajectories, +and scenes. RoboGSim also provides an online, reproducible, and safe evaluation +for different manipulation policies. The real2sim and sim2real transfer +experiments show a high consistency in the texture and physics. Moreover, the +effectiveness of synthetic data is validated under the real-world manipulated +tasks. We hope RoboGSim serves as a closed-loop simulator for fair comparison +on policy learning. More information can be found on our project page +https://robogsim.github.io/ . + +
+
+
+
+
+ + ☆ LightFFDNets: Lightweight Convolutional Neural Networks for Rapid Facial + Forgery Detection + + +
+ Accurate and fast recognition of forgeries is an issue of great importance in +the fields of artificial intelligence, image processing and object detection. +Recognition of forgeries of facial imagery is the process of classifying and +defining the faces in it by analyzing real-world facial images. This process is +usually accomplished by extracting features from an image, using classifier +algorithms, and correctly interpreting the results. Recognizing forgeries of +facial imagery correctly can encounter many different challenges. For example, +factors such as changing lighting conditions, viewing faces from different +angles can affect recognition performance, and background complexity and +perspective changes in facial images can make accurate recognition difficult. +Despite these difficulties, significant progress has been made in the field of +forgery detection. Deep learning algorithms, especially Convolutional Neural +Networks (CNNs), have significantly improved forgery detection performance. + This study focuses on image processing-based forgery detection using +Fake-Vs-Real-Faces (Hard) [10] and 140k Real and Fake Faces [61] data sets. +Both data sets consist of two classes containing real and fake facial images. +In our study, two lightweight deep learning models are proposed to conduct +forgery detection using these images. Additionally, 8 different pretrained CNN +architectures were tested on both data sets and the results were compared with +newly developed lightweight CNN models. It's shown that the proposed +lightweight deep learning models have minimum number of layers. It's also shown +that the proposed lightweight deep learning models detect forgeries of facial +imagery accurately, and computationally efficiently. Although the data set +consists only of face images, the developed models can also be used in other +two-class object recognition problems. + +
+
+ comment: 13 pages, 6 figures, 10 tables +
+
+
+
+
+ + ☆ Equivariant spatio-hemispherical networks for diffusion MRI + deconvolution NeurIPS 2024 + + +
+ Each voxel in a diffusion MRI (dMRI) image contains a spherical signal +corresponding to the direction and strength of water diffusion in the brain. +This paper advances the analysis of such spatio-spherical data by developing +convolutional network layers that are equivariant to the $\mathbf{E(3) \times +SO(3)}$ group and account for the physical symmetries of dMRI including +rotations, translations, and reflections of space alongside voxel-wise +rotations. Further, neuronal fibers are typically antipodally symmetric, a fact +we leverage to construct highly efficient spatio-hemispherical graph +convolutions to accelerate the analysis of high-dimensional dMRI data. In the +context of sparse spherical fiber deconvolution to recover white matter +microstructure, our proposed equivariant network layers yield substantial +performance and efficiency gains, leading to better and more practical +resolution of crossing neuronal fibers and fiber tractography. These gains are +experimentally consistent across both simulation and in vivo human datasets. + +
+
+ comment: Accepted to NeurIPS 2024. 24 pages with 13 figures. Code available at + https://github.com/AxelElaldi/fast-equivariant-deconv +
+
+
+
+
+ + ☆ Edge-Enhanced Dilated Residual Attention Network for Multimodal Medical + Image Fusion + + +
+ Multimodal medical image fusion is a crucial task that combines complementary +information from different imaging modalities into a unified representation, +thereby enhancing diagnostic accuracy and treatment planning. While deep +learning methods, particularly Convolutional Neural Networks (CNNs) and +Transformers, have significantly advanced fusion performance, some of the +existing CNN-based methods fall short in capturing fine-grained multiscale and +edge features, leading to suboptimal feature integration. Transformer-based +models, on the other hand, are computationally intensive in both the training +and fusion stages, making them impractical for real-time clinical use. +Moreover, the clinical application of fused images remains unexplored. In this +paper, we propose a novel CNN-based architecture that addresses these +limitations by introducing a Dilated Residual Attention Network Module for +effective multiscale feature extraction, coupled with a gradient operator to +enhance edge detail learning. To ensure fast and efficient fusion, we present a +parameter-free fusion strategy based on the weighted nuclear norm of softmax, +which requires no additional computations during training or inference. +Extensive experiments, including a downstream brain tumor classification task, +demonstrate that our approach outperforms various baseline methods in terms of +visual quality, texture preservation, and fusion speed, making it a possible +practical solution for real-world clinical applications. The code will be +released at https://github.com/simonZhou86/en_dran. + +
+
+ comment: An extended version of the paper accepted at IEEE BIBM 2024 +
+
+
+
+
+ + ☆ Exploring adversarial robustness of JPEG AI: methodology, comparison and + new methods + + +
+ Adversarial robustness of neural networks is an increasingly important area +of research, combining studies on computer vision models, large language models +(LLMs), and others. With the release of JPEG AI - the first standard for +end-to-end neural image compression (NIC) methods - the question of its +robustness has become critically significant. JPEG AI is among the first +international, real-world applications of neural-network-based models to be +embedded in consumer devices. However, research on NIC robustness has been +limited to open-source codecs and a narrow range of attacks. This paper +proposes a new methodology for measuring NIC robustness to adversarial attacks. +We present the first large-scale evaluation of JPEG AI's robustness, comparing +it with other NIC models. Our evaluation results and code are publicly +available online (link is hidden for a blind review). + +
+
+
+
+
+ + ☆ The Power of Many: Multi-Agent Multimodal Models for Cultural Image + Captioning + + +
+ Large Multimodal Models (LMMs) exhibit impressive performance across various +multimodal tasks. However, their effectiveness in cross-cultural contexts +remains limited due to the predominantly Western-centric nature of most data +and models. Conversely, multi-agent models have shown significant capability in +solving complex tasks. Our study evaluates the collective performance of LMMs +in a multi-agent interaction setting for the novel task of cultural image +captioning. Our contributions are as follows: (1) We introduce MosAIC, a +Multi-Agent framework to enhance cross-cultural Image Captioning using LMMs +with distinct cultural personas; (2) We provide a dataset of culturally +enriched image captions in English for images from China, India, and Romania +across three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable +metric for evaluating cultural information within image captions; and (4) We +show that the multi-agent interaction outperforms single-agent models across +different metrics, and offer valuable insights for future research. Our dataset +and models can be accessed at https://github.com/MichiganNLP/MosAIC. + +
+
+
+
+
+ + ☆ Revitalizing Electoral Trust: Enhancing Transparency and Efficiency + through Automated Voter Counting with Machine Learning + + +
+ In order to address issues with manual vote counting during election +procedures, this study intends to examine the viability of using advanced image +processing techniques for automated voter counting. The study aims to shed +light on how automated systems that utilize cutting-edge technologies like +OpenCV, CVZone, and the MOG2 algorithm could greatly increase the effectiveness +and openness of electoral operations. The empirical findings demonstrate how +automated voter counting can enhance voting processes and rebuild public +confidence in election outcomes, particularly in places where trust is low. The +study also emphasizes how rigorous metrics, such as the F1 score, should be +used to systematically compare the accuracy of automated systems against manual +counting methods. This methodology enables a detailed comprehension of the +differences in performance between automated and human counting techniques by +providing a nuanced assessment. The incorporation of said measures serves to +reinforce an extensive assessment structure, guaranteeing the legitimacy and +dependability of automated voting systems inside the electoral sphere. + +
+
+ comment: 13 Pages, 4 Figures +
+
+
+
+
+ + ☆ WoodYOLO: A Novel Object Detector for Wood Species Detection in + Microscopic Images + + +
+ Wood species identification plays a crucial role in various industries, from +ensuring the legality of timber products to advancing ecological conservation +efforts. This paper introduces WoodYOLO, a novel object detection algorithm +specifically designed for microscopic wood fiber analysis. Our approach adapts +the YOLO architecture to address the challenges posed by large, high-resolution +microscopy images and the need for high recall in localization of the cell type +of interest (vessel elements). Our results show that WoodYOLO significantly +outperforms state-of-the-art models, achieving performance gains of 12.9% and +6.5% in F2 score over YOLOv10 and YOLOv7, respectively. This improvement in +automated wood cell type localization capabilities contributes to enhancing +regulatory compliance, supporting sustainable forestry practices, and promoting +biodiversity conservation efforts globally. + +
+
+
+
+
+ + ☆ Aligning Few-Step Diffusion Models with Dense Reward Difference Learning + + +
+ Aligning diffusion models with downstream objectives is essential for their +practical applications. However, standard alignment methods often struggle with +step generalization when directly applied to few-step diffusion models, leading +to inconsistent performance across different denoising step scenarios. To +address this, we introduce Stepwise Diffusion Policy Optimization (SDPO), a +novel alignment method tailored for few-step diffusion models. Unlike prior +approaches that rely on a single sparse reward from only the final step of each +denoising trajectory for trajectory-level optimization, SDPO incorporates dense +reward feedback at every intermediate step. By learning the differences in +dense rewards between paired samples, SDPO facilitates stepwise optimization of +few-step diffusion models, ensuring consistent alignment across all denoising +steps. To promote stable and efficient training, SDPO introduces an online +reinforcement learning framework featuring several novel strategies designed to +effectively exploit the stepwise granularity of dense rewards. Experimental +results demonstrate that SDPO consistently outperforms prior methods in +reward-based alignment across diverse step configurations, underscoring its +robust step generalization capabilities. Code is available at +https://github.com/ZiyiZhang27/sdpo. + +
+
+
+
+
+ + ☆ RAWMamba: Unified sRGB-to-RAW De-rendering With State Space Model + + +
+ Recent advancements in sRGB-to-RAW de-rendering have increasingly emphasized +metadata-driven approaches to reconstruct RAW data from sRGB images, +supplemented by partial RAW information. In image-based de-rendering, metadata +is commonly obtained through sampling, whereas in video tasks, it is typically +derived from the initial frame. The distinct metadata requirements necessitate +specialized network architectures, leading to architectural incompatibilities +that increase deployment complexity. In this paper, we propose RAWMamba, a +Mamba-based unified framework developed for sRGB-to-RAW de-rendering across +both image and video domains. The core of RAWMamba is the Unified Metadata +Embedding (UME) module, which harmonizes diverse metadata types into a unified +representation. In detail, a multi-perspective affinity modeling method is +proposed to promote the extraction of reference information. In addition, we +introduce the Local Tone-Aware Mamba (LTA-Mamba) module, which captures +long-range dependencies to enable effective global propagation of metadata. +Experimental results demonstrate that the proposed RAWMamba achieves +state-of-the-art performance, yielding high-quality RAW data reconstruction. + +
+
+
+
+
+ + ☆ MC-LLaVA: Multi-Concept Personalized Vision-Language Model + + +
+ Current vision-language models (VLMs) show exceptional abilities across +diverse tasks including visual question answering. To enhance user experience +in practical applications, recent studies investigate VLM personalization to +understand user-provided concepts. However, existing studies mainly focus on +single-concept personalization, neglecting the existence and interplay of +multiple concepts, which limits the real-world applicability of personalized +VLMs. In this paper, we propose the first multi-concept personalization method +named MC-LLaVA along with a high-quality multi-concept personalization dataset. +Specifically, MC-LLaVA uses a joint training strategy incorporating multiple +concepts in a single training step, allowing VLMs to perform accurately in +multi-concept personalization. To reduce the cost of joint training, MC-LLaVA +leverages visual token information for concept token initialization, yielding +improved concept representation and accelerating joint training. To advance +multi-concept personalization research, we further contribute a high-quality +dataset. We carefully collect images from various movies that contain multiple +characters and manually generate the multi-concept question-answer samples. Our +dataset features diverse movie types and question-answer types. We conduct +comprehensive qualitative and quantitative experiments to demonstrate that +MC-LLaVA can achieve impressive multi-concept personalized responses, paving +the way for VLMs to become better user-specific assistants. The code and +dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA. + +
+
+
+
+
+ + ☆ From Spectra to Geography: Intelligent Mapping of RRUFF Mineral Data + + +
+ Accurately determining the geographic origin of mineral samples is pivotal +for applications in geology, mineralogy, and material science. Leveraging the +comprehensive Raman spectral data from the RRUFF database, this study +introduces a novel machine learning framework aimed at geolocating mineral +specimens at the country level. We employ a one-dimensional ConvNeXt1D neural +network architecture to classify mineral spectra based solely on their spectral +signatures. The processed dataset comprises over 32,900 mineral samples, +predominantly natural, spanning 101 countries. Through five-fold +cross-validation, the ConvNeXt1D model achieved an impressive average +classification accuracy of 93%, demonstrating its efficacy in capturing +geospatial patterns inherent in Raman spectra. + +
+
+
+
+
+ + ☆ Towards Degradation-Robust Reconstruction in Generalizable NeRF + + +
+ Generalizable Neural Radiance Field (GNeRF) across scenes has been proven to +be an effective way to avoid per-scene optimization by representing a scene +with deep image features of source images. However, despite its potential for +real-world applications, there has been limited research on the robustness of +GNeRFs to different types of degradation present in the source images. The lack +of such research is primarily attributed to the absence of a large-scale +dataset fit for training a degradation-robust generalizable NeRF model. To +address this gap and facilitate investigations into the degradation robustness +of 3D reconstruction tasks, we construct the Objaverse Blur Dataset, comprising +50,000 images from over 1000 settings featuring multiple levels of blur +degradation. In addition, we design a simple and model-agnostic module for +enhancing the degradation robustness of GNeRFs. Specifically, by extracting +3D-aware features through a lightweight depth estimator and denoiser, the +proposed module shows improvement on different popular methods in GNeRFs in +terms of both quantitative and visual quality over varying degradation types +and levels. Our dataset and code will be made publicly available. + +
+
+
+
+
+ + ☆ Dissecting Misalignment of Multimodal Large Language Models via + Influence Function + + +
+ Multi-modal Large Language models (MLLMs) are always trained on data from +diverse and unreliable sources, which may contain misaligned or mislabeled +text-image pairs. This frequently causes robustness issues and hallucinations, +leading to performance degradation. Data valuation is an efficient way to +detect and trace these misalignments. Nevertheless, existing methods are +computationally expensive for MLLMs. While computationally efficient, the +classical influence functions are inadequate for contrastive learning models +because they were originally designed for pointwise loss. Additionally, +contrastive learning involves minimizing the distance between the modalities of +positive samples and maximizing the distance between the modalities of negative +samples. This requires us to evaluate the influence of samples from both +perspectives. To tackle these challenges, we introduce the Extended Influence +Function for Contrastive Loss (ECIF), an influence function crafted for +contrastive loss. ECIF considers both positive and negative samples and +provides a closed-form approximation of contrastive learning models, +eliminating the need for retraining. Building upon ECIF, we develop a series of +algorithms for data evaluation in MLLM, misalignment detection, and +misprediction trace-back tasks. Experimental results demonstrate our ECIF +advances the transparency and interpretability of MLLMs by offering a more +accurate assessment of data impact and model alignment compared to traditional +baseline methods. + +
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ SP${ }^3$ : Superpixel-propagated pseudo-label learning for weakly + semi-supervised medical image segmentation + + +
+ Deep learning-based medical image segmentation helps assist diagnosis and +accelerate the treatment process while the model training usually requires +large-scale dense annotation datasets. Weakly semi-supervised medical image +segmentation is an essential application because it only requires a small +amount of scribbles and a large number of unlabeled data to train the model, +which greatly reduces the clinician's effort to fully annotate images. To +handle the inadequate supervisory information challenge in weakly +semi-supervised segmentation (WSSS), a SuperPixel-Propagated Pseudo-label +(SP${}^3$) learning method is proposed, using the structural information +contained in superpixel for supplemental information. Specifically, the +annotation of scribbles is propagated to superpixels and thus obtains a dense +annotation for supervised training. Since the quality of pseudo-labels is +limited by the low-quality annotation, the beneficial superpixels selected by +dynamic thresholding are used to refine pseudo-labels. Furthermore, aiming to +alleviate the negative impact of noise in pseudo-label, superpixel-level +uncertainty is incorporated to guide the pseudo-label supervision for stable +learning. Our method achieves state-of-the-art performance on both tumor and +organ segmentation datasets under the WSSS setting, using only 3\% of the +annotation workload compared to fully supervised methods and attaining +approximately 80\% Dice score. Additionally, our method outperforms eight +weakly and semi-supervised methods under both weakly supervised and +semi-supervised settings. Results of extensive experiments validate the +effectiveness and annotation efficiency of our weakly semi-supervised +segmentation, which can assist clinicians in achieving automated segmentation +for organs or tumors quickly and ultimately benefit patients. + +
+
+ comment: 10 pages, 7 figures. Under Review +
+
+
+
+
+ + ☆ FERT: Real-Time Facial Expression Recognition with Short-Range FMCW + Radar + + +
+ This study proposes a novel approach for real-time facial expression +recognition utilizing short-range Frequency-Modulated Continuous-Wave (FMCW) +radar equipped with one transmit (Tx), and three receive (Rx) antennas. The +system leverages four distinct modalities simultaneously: Range-Doppler images +(RDIs), micro range-Doppler Images (micro-RDIs), range azimuth images (RAIs), +and range elevation images (REIs). Our innovative architecture integrates +feature extractor blocks, intermediate feature extractor blocks, and a ResNet +block to accurately classify facial expressions into smile, anger, neutral, and +no-face classes. Our model achieves an average classification accuracy of +98.91% on the dataset collected using a 60 GHz short-range FMCW radar. The +proposed solution operates in real-time in a person-independent manner, which +shows the potential use of low-cost FMCW radars for effective facial expression +recognition in various applications. + +
+
+ comment: Accepted at IEEE SENSORS 2024 +
+
+
+
+
+ + ☆ Leveraging Computational Pathology AI for Noninvasive Optical Imaging + Analysis Without Retraining + + +
+ Noninvasive optical imaging modalities can probe patient's tissue in 3D and +over time generate gigabytes of clinically relevant data per sample. There is a +need for AI models to analyze this data and assist clinical workflow. The lack +of expert labelers and the large dataset required (>100,000 images) for model +training and tuning are the main hurdles in creating foundation models. In this +paper we introduce FoundationShift, a method to apply any AI model from +computational pathology without retraining. We show our method is more accurate +than state of the art models (SAM, MedSAM, SAM-Med2D, CellProfiler, Hover-Net, +PLIP, UNI and ChatGPT), with multiple imaging modalities (OCT and RCM). This is +achieved without the need for model retraining or fine-tuning. Applying our +method to noninvasive in vivo images could enable physicians to readily +incorporate optical imaging modalities into their clinical practice, providing +real time tissue analysis and improving patient care. + +
+
+
+
+
+ + ☆ MSSIDD: A Benchmark for Multi-Sensor Denoising + + +
+ The cameras equipped on mobile terminals employ different sensors in +different photograph modes, and the transferability of raw domain denoising +models between these sensors is significant but remains insufficiently explored. +Industrial solutions either develop distinct training strategies and models for +different sensors or ignore the differences between sensors and simply extend +existing models to new sensors, which leads to tedious training or +unsatisfactory performance. In this paper, we introduce a new benchmark, the +Multi-Sensor SIDD (MSSIDD) dataset, which is the first raw-domain dataset +designed to evaluate the sensor transferability of denoising models. The MSSIDD +dataset consists of 60,000 raw images of six distinct sensors, derived through +the degeneration of sRGB images via different camera sensor parameters. +Furthermore, we propose a sensor consistency training framework that enables +denoising models to learn the sensor-invariant features, thereby facilitating +the generalization of the consistent model to unseen sensors. We evaluate +previous arts on the newly proposed MSSIDD dataset, and the experimental +results validate the effectiveness of our proposed method. Our dataset is +available at https://www.kaggle.com/datasets/sjtuwh/mssidd. + +
+
+ comment: 15 pages,7 figures +
+
+
+
+
+ + ☆ Real-Time Fitness Exercise Classification and Counting from Video Frames + + +
+ This paper introduces a novel method for real-time exercise classification +using a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing +exercise recognition approaches often rely on synthetic datasets, raw +coordinate inputs sensitive to user and camera variations, and fail to fully +exploit the temporal dependencies in exercise movements. These issues limit +their generalizability and robustness in real-world conditions, where lighting, +camera angles, and user body types vary. + To address these challenges, we propose a BiLSTM-based model that leverages +invariant features, such as joint angles, alongside raw coordinates. By using +both angles and (x, y, z) coordinates, the model adapts to changes in +perspective, user positioning, and body differences, improving generalization. +Training on 30-frame sequences enables the BiLSTM to capture the temporal +context of exercises and recognize patterns evolving over time. + We compiled a dataset combining synthetic data from the InfiniteRep dataset +and real-world videos from Kaggle and other sources. This dataset includes four +common exercises: squat, push-up, shoulder press, and bicep curl. The model was +trained and validated on these diverse datasets, achieving an accuracy of over +99% on the test set. To assess generalizability, the model was tested on 2 +separate test sets representative of typical usage conditions. Comparisons with +the previous approach from the literature are presented in the results section, +showing that the proposed model is the best-performing one. + The classifier is integrated into a web application providing real-time +exercise classification and repetition counting without manual exercise +selection. + Demo and datasets are available at the following GitHub Repository: +https://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting. + +
+
+
+
+
+ + ☆ Enhancing Vision-Language Model Safety through Progressive + Concept-Bottleneck-Driven Alignment + + +
+ Benefiting from the powerful capabilities of Large Language Models (LLMs), +pre-trained visual encoder models connected to LLMs form Vision Language Models +(VLMs). However, recent research shows that the visual modality in VLMs is +highly vulnerable, allowing attackers to bypass safety alignment in LLMs +through visually transmitted content, launching harmful attacks. To address +this challenge, we propose a progressive concept-based alignment strategy, +PSA-VLM, which incorporates safety modules as concept bottlenecks to enhance +visual modality safety alignment. By aligning model predictions with specific +safety concepts, we improve defenses against risky images, enhancing +explainability and controllability while minimally impacting general +performance. Our method is obtained through two-stage training. The low +computational cost of the first stage brings very effective performance +improvement, and the fine-tuning of the language model in the second stage +further improves the safety performance. Our method achieves state-of-the-art +results on popular VLM safety benchmark. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.13581 +
+
+
+
+
+ + ☆ Reliable Poisoned Sample Detection against Backdoor Attacks Enhanced by + Sharpness Aware Minimization + + +
+ Backdoor attack has been considered as a serious security threat to deep +neural networks (DNNs). Poisoned sample detection (PSD) that aims at filtering +out poisoned samples from an untrustworthy training dataset has shown very +promising performance for defending against data poisoning based backdoor +attacks. However, we observe that the detection performance of many advanced +methods is likely to be unstable when facing weak backdoor attacks, such as low +poisoning ratio or weak trigger strength. To further verify this observation, +we make a statistical investigation among various backdoor attacks and poisoned +sample detections, showing a positive correlation between backdoor effect and +detection performance. It inspires us to strengthen the backdoor effect to +enhance detection performance. Since we cannot achieve that goal via directly +manipulating poisoning ratio or trigger strength, we propose to train one model +using the Sharpness-Aware Minimization (SAM) algorithm, rather than the vanilla +training algorithm. We also provide both empirical and theoretical analysis +about how SAM training strengthens the backdoor effect. Then, this SAM trained +model can be seamlessly integrated with any off-the-shelf PSD method that +extracts discriminative features from the trained model for detection, called +SAM-enhanced PSD. Extensive experiments on several benchmark datasets show the +reliable detection performance of the proposed method against both weak and +strong backdoor attacks, with significant improvements against various attacks +($+34.38\%$ TPR on average), over the conventional PSD methods (i.e., without +SAM enhancement). Overall, this work provides new insights about PSD and +proposes a novel approach that can complement existing detection methods, which +may inspire more in-depth explorations in this field. + +
+
+
+
+
+ + ☆ Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to + Enhance Cell Segmentation + + +
+ Automated cell segmentation in microscopy images is essential for biomedical +research, yet conventional methods are labor-intensive and prone to error. +While deep learning-based approaches have proven effective, they often require +large annotated datasets, which are scarce due to the challenges of manual +annotation. To overcome this, we propose a novel framework for synthesizing +densely annotated 2D and 3D cell microscopy images using cascaded diffusion +models. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations +using multi-level diffusion models and NeuS, a 3D surface reconstruction +approach. Following that, a pretrained 2D Stable Diffusion model is finetuned +to generate realistic cell textures and the final outputs are combined to form +cell populations. We show that training a segmentation model with a combination +of our synthetic data and real data improves cell segmentation performance by +up to 9% across multiple datasets. Additionally, the FID scores indicate that +the synthetic data closely resembles real data. The code for our proposed +approach will be available at +https://github.com/ruveydayilmaz0/cascaded_diffusion. + +
+
+
+
+
+ + ☆ Learning a Neural Association Network for Self-supervised Multi-Object + Tracking + + +
+ This paper introduces a novel framework to learn data association for +multi-object tracking in a self-supervised manner. Fully-supervised learning +methods are known to achieve excellent tracking performances, but acquiring +identity-level annotations is tedious and time-consuming. Motivated by the fact +that in real-world scenarios object motion can be usually represented by a +Markov process, we present a novel expectation maximization (EM) algorithm that +trains a neural network to associate detections for tracking, without requiring +prior knowledge of their temporal correspondences. At the core of our method +lies a neural Kalman filter, with an observation model conditioned on +associations of detections parameterized by a neural network. Given a batch of +frames as input, data associations between detections from adjacent frames are +predicted by a neural network followed by a Sinkhorn normalization that +determines the assignment probabilities of detections to states. Kalman +smoothing is then used to obtain the marginal probability of observations given +the inferred states, producing a training objective to maximize this marginal +probability using gradient descent. The proposed framework is fully +differentiable, allowing the underlying neural model to be trained end-to-end. +We evaluate our approach on the challenging MOT17 and MOT20 datasets and +achieve state-of-the-art results in comparison to self-supervised trackers +using public detections. We furthermore demonstrate the capability of the +learned model to generalize across datasets. + +
+
+
+
+
+ + ☆ SignEye: Traffic Sign Interpretation from Vehicle First-Person View + + +
+ Traffic signs play a key role in assisting autonomous driving systems (ADS) +by enabling the assessment of vehicle behavior in compliance with traffic +regulations and providing navigation instructions. However, current works are +limited to basic sign understanding without considering the egocentric +vehicle's spatial position, which fails to support further regulation +assessment and direction navigation. Following the above issues, we introduce a +new task: traffic sign interpretation from the vehicle's first-person view, +referred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant +(TGA) scenario application to re-explore the role of traffic signs in ADS as a +complement to popular autonomous technologies (such as obstacle perception). +Notably, TGA is not a replacement for electronic map navigation; rather, TGA +can be an automatic tool for updating it and complementing it in situations +such as offline conditions or temporary sign adjustments. Lastly, a spatial and +semantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to +achieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN) +is built. Experiments show that TSI-FPV and TGA are achievable via our SignEye +trained on Traffic-CN. The results also demonstrate that the TGA can provide +complementary information to ADS beyond existing popular autonomous +technologies. + +
+
+
+
+
+ + ☆ LaVin-DiT: Large Vision Diffusion Transformer + + +
+ This paper presents the Large Vision Diffusion Transformer (LaVin-DiT), a +scalable and unified foundation model designed to tackle over 20 computer +vision tasks in a generative framework. Unlike existing large vision models +directly adapted from natural language processing architectures, which rely on +less efficient autoregressive techniques and disrupt spatial relationships +essential for vision data, LaVin-DiT introduces key innovations to optimize +generative performance for vision tasks. First, to address the high +dimensionality of visual data, we incorporate a spatial-temporal variational +autoencoder that encodes data into a continuous latent space. Second, for +generative modeling, we develop a joint diffusion transformer that +progressively produces vision outputs. Third, for unified multi-task training, +in-context learning is implemented. Input-target pairs serve as task context, +which guides the diffusion transformer to align outputs with specific tasks +within the latent space. During inference, a task-specific context set and test +data as queries allow LaVin-DiT to generalize across tasks without fine-tuning. +Trained on extensive vision datasets, the model is scaled from 0.1B to 3.4B +parameters, demonstrating substantial scalability and state-of-the-art +performance across diverse vision tasks. This work introduces a novel pathway +for large vision foundation models, underscoring the promising potential of +diffusion transformers. The code and models will be open-sourced. + +
+
+ comment: 11 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Look a Group at Once: Multi-Slide Modeling for Survival Prediction + + +
+ Survival prediction is a critical task in pathology. In clinical practice, +pathologists often examine multiple cases, leveraging a broader spectrum of +cancer phenotypes to enhance pathological assessment. Despite significant +advancements in deep learning, current solutions typically model each slide as +a sample, struggling to effectively capture comparable and slide-agnostic +pathological features. In this paper, we introduce GroupMIL, a novel framework +inspired by the clinical practice of collective analysis, which models multiple +slides as a single sample and organizes groups of patches and slides +sequentially to capture cross-slide prognostic features. We also present +GPAMamba, a model designed to facilitate intra- and inter-slide feature +interactions, effectively capturing local micro-environmental characteristics +within slide-level graphs while uncovering essential prognostic patterns across +an extended patch sequence within the group framework. Furthermore, we develop +a dual-head predictor that delivers comprehensive survival risk and probability +assessments for each patient. Extensive empirical evaluations demonstrate that +our model significantly outperforms state-of-the-art approaches across five +datasets from The Cancer Genome Atlas. + +
+
+
+
+
+ + ☆ Exploring Emerging Trends and Research Opportunities in Visual Place + Recognition ICRA + + +
+ Visual-based recognition, e.g., image classification, object detection, etc., +is a long-standing challenge in computer vision and robotics communities. +Concerning the roboticists, since the knowledge of the environment is a +prerequisite for complex navigation tasks, visual place recognition is vital +for most localization implementations or re-localization and loop closure +detection pipelines within simultaneous localization and mapping (SLAM). More +specifically, it corresponds to the system's ability to identify and match a +previously visited location using computer vision tools. Towards developing +novel techniques with enhanced accuracy and robustness, while motivated by the +success presented in natural language processing methods, researchers have +recently turned their attention to vision-language models, which integrate +visual and textual data. + +
+
+ comment: 2 pages, 1 figure. 40th Anniversary of the IEEE Conference on + Robotics and Automation (ICRA@40), Rotterdam, Netherlands, September 23-26, + 2024 +
+
+
+
+
+ + ☆ SL-YOLO: A Stronger and Lighter Drone Target Detection Model + + +
+ Detecting small objects in complex scenes, such as those captured by drones, +is a daunting challenge due to the difficulty in capturing the complex features +of small targets. While the YOLO family has achieved great success in large +target detection, its performance is less than satisfactory when faced with +small targets. Because of this, this paper proposes a revolutionary model +SL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small +target detection. We propose the Hierarchical Extended Path Aggregation Network +(HEPAN), a pioneering cross-scale feature fusion method that can ensure +unparalleled detection accuracy even in the most challenging environments. At +the same time, without sacrificing detection capabilities, we design the C2fDCB +lightweight module and add the SCDown downsampling module to greatly reduce the +model's parameters and computational complexity. Our experimental results on +the VisDrone2019 dataset reveal a significant improvement in performance, with +mAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to +28.9%. At the same time, the model parameters are reduced from 11.1M to 9.6M, +and the FPS can reach 132, making it an ideal solution for real-time small +object detection in resource-constrained environments. + +
+
+
+
+
+ + ☆ MVLight: Relightable Text-to-3D Generation via Light-conditioned + Multi-View Diffusion + + +
+ Recent advancements in text-to-3D generation, building on the success of +high-performance text-to-image generative models, have made it possible to +create imaginative and richly textured 3D objects from textual descriptions. +However, a key challenge remains in effectively decoupling light-independent +and lighting-dependent components to enhance the quality of generated 3D models +and their relighting performance. In this paper, we present MVLight, a novel +light-conditioned multi-view diffusion model that explicitly integrates +lighting conditions directly into the generation process. This enables the +model to synthesize high-quality images that faithfully reflect the specified +lighting environment across multiple camera views. By leveraging this +capability to Score Distillation Sampling (SDS), we can effectively synthesize +3D models with improved geometric precision and relighting capabilities. We +validate the effectiveness of MVLight through extensive experiments and a user +study. + +
+
+
+
+
+ + ☆ Generalizable Person Re-identification via Balancing Alignment and + Uniformity NeurIPS 2024 + + +
+ Domain generalizable person re-identification (DG re-ID) aims to learn +discriminative representations that are robust to distributional shifts. While +data augmentation is a straightforward solution to improve generalization, +certain augmentations exhibit a polarized effect in this task, enhancing +in-distribution performance while deteriorating out-of-distribution +performance. In this paper, we investigate this phenomenon and reveal that it +leads to sparse representation spaces with reduced uniformity. To address this +issue, we propose a novel framework, Balancing Alignment and Uniformity (BAU), +which effectively mitigates this effect by maintaining a balance between +alignment and uniformity. Specifically, BAU incorporates alignment and +uniformity losses applied to both original and augmented images and integrates +a weighting strategy to assess the reliability of augmented samples, further +improving the alignment loss. Additionally, we introduce a domain-specific +uniformity loss that promotes uniformity within each source domain, thereby +enhancing the learning of domain-invariant features. Extensive experimental +results demonstrate that BAU effectively exploits the advantages of data +augmentation, which previous studies could not fully utilize, and achieves +state-of-the-art performance without requiring complex training procedures. The +code is available at \url{https://github.com/yoonkicho/BAU}. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ MGNiceNet: Unified Monocular Geometric Scene Understanding ACCV 2024 + + +
+ Monocular geometric scene understanding combines panoptic segmentation and +self-supervised depth estimation, focusing on real-time application in +autonomous vehicles. We introduce MGNiceNet, a unified approach that uses a +linked kernel formulation for panoptic segmentation and self-supervised depth +estimation. MGNiceNet is based on the state-of-the-art real-time panoptic +segmentation method RT-K-Net and extends the architecture to cover both +panoptic segmentation and self-supervised monocular depth estimation. To this +end, we introduce a tightly coupled self-supervised depth estimation predictor +that explicitly uses information from the panoptic path for depth prediction. +Furthermore, we introduce a panoptic-guided motion masking method to improve +depth estimation without relying on video panoptic segmentation annotations. We +evaluate our method on two popular autonomous driving datasets, Cityscapes and +KITTI. Our model shows state-of-the-art results compared to other real-time +methods and closes the gap to computationally more demanding methods. Source +code and trained models are available at +https://github.com/markusschoen/MGNiceNet. + +
+
+ comment: Accepted for ACCV 2024 +
+
+
+
+
+ + ☆ HistoEncoder: a digital pathology foundation model for prostate cancer + + +
+ Foundation models are trained on massive amounts of data to distinguish +complex patterns and can be adapted to a wide range of downstream tasks with +minimal computational resources. Here, we develop a foundation model for +prostate cancer digital pathology called HistoEncoder by pre-training on 48 +million prostate tissue tile images. We demonstrate that HistoEncoder features +extracted from tile images with similar histological patterns map closely +together in the feature space. HistoEncoder outperforms models pre-trained with +natural images, even without fine-tuning or with 1000 times less training data. +We describe two use cases that leverage the capabilities of HistoEncoder by +fine-tuning the model with a limited amount of data and computational +resources. First, we show how HistoEncoder can be used to automatically +annotate large-scale datasets with high accuracy. Second, we combine histomics +with commonly used clinical nomograms, significantly improving prostate +cancer-specific death survival models. Foundation models such as HistoEncoder +can allow organizations with limited resources to build effective clinical +software tools without needing extensive datasets or significant amounts of +computing. + +
+
+
+
+
+ + ☆ The ADUULM-360 Dataset -- A Multi-Modal Dataset for Depth Estimation in + Adverse Weather + + +
+ Depth estimation is an essential task toward full scene understanding since +it allows the projection of rich semantic information captured by cameras into +3D space. While the field has gained much attention recently, datasets for +depth estimation lack scene diversity or sensor modalities. This work presents +the ADUULM-360 dataset, a novel multi-modal dataset for depth estimation. The +ADUULM-360 dataset covers all established autonomous driving sensor modalities, +cameras, lidars, and radars. It covers a frontal-facing stereo setup, six +surround cameras covering the full 360-degree, two high-resolution long-range +lidar sensors, and five long-range radar sensors. It is also the first depth +estimation dataset that contains diverse scenes in good and adverse weather +conditions. We conduct extensive experiments using state-of-the-art +self-supervised depth estimation methods under different training tasks, such +as monocular training, stereo training, and full surround training. Discussing +these results, we demonstrate common limitations of state-of-the-art methods, +especially in adverse weather conditions, which hopefully will inspire future +research in this area. Our dataset, development kit, and trained baselines are +available at https://github.com/uulm-mrm/aduulm_360_dataset. + +
+
+ comment: 2024 IEEE International Conference on Intelligent Transportation + Systems (ITSC) +
+
+
+
+
+ + ☆ Relevance-guided Audio Visual Fusion for Video Saliency Prediction + + +
+ Audio data, often synchronized with video frames, plays a crucial role in +guiding the audience's visual attention. Incorporating audio information into +video saliency prediction tasks can enhance the prediction of human visual +behavior. However, existing audio-visual saliency prediction methods often +directly fuse audio and visual features, which ignore the possibility of +inconsistency between the two modalities, such as when the audio serves as +background music. To address this issue, we propose a novel relevance-guided +audio-visual saliency prediction network dubbed AVRSP. Specifically, the +Relevance-guided Audio-Visual feature Fusion module (RAVF) dynamically adjusts +the retention of audio features based on the semantic relevance between audio +and visual elements, thereby refining the integration process with visual +features. Furthermore, the Multi-scale feature Synergy (MS) module integrates +visual features from different encoding stages, enhancing the network's ability +to represent objects at various scales. The Multi-scale Regulator Gate (MRG) +could transfer crucial fusion information to visual features, thus optimizing +the utilization of multi-scale visual features. Extensive experiments on six +audio-visual eye movement datasets have demonstrated that our AVRSP network +achieves competitive performance in audio-visual saliency prediction. + +
+
+
+
+
+ + ☆ GLDesigner: Leveraging Multi-Modal LLMs as Designer for Enhanced + Aesthetic Text Glyph Layouts + + +
+ Text logo design heavily relies on the creativity and expertise of +professional designers, in which arranging element layouts is one of the most +important procedures. However, little attention has been paid to this specific +task which needs to take precise textural details and user constraints into +consideration, but only on the broader tasks such as document/poster layout +generation. In this paper, we propose a VLM-based framework that generates +content-aware text logo layouts by integrating multi-modal inputs with user +constraints, supporting a more flexible and stable layout design in real-world +applications. We introduce two model techniques to reduce the computation for +processing multiple glyph images simultaneously, without incurring +performance degradation. To support instruction-tuning of our model, we +construct two extensive text logo datasets, which are 5x larger than the +existing public dataset. In addition to the geometric annotations (e.g. text masks +and character recognition), we also complement them with comprehensive layout +descriptions in natural language format, for more effective training to have +reasoning ability when dealing with complex layouts and custom user +constraints. Experimental studies demonstrate the effectiveness of our proposed +model and datasets, when comparing with previous methods in various benchmarks +to evaluate geometric aesthetics and human preferences. The code and datasets +will be publicly available. + +
+
+
+
+
+ + ☆ Towards fast DBSCAN via Spectrum-Preserving Data Compression + + +
+ This paper introduces a novel method to significantly accelerate DBSCAN by +employing spectral data compression. The proposed approach reduces the size of +the data set by a factor of five while preserving the essential clustering +characteristics through an innovative spectral compression technique. This +enables DBSCAN to run substantially faster without any loss of accuracy. +Experiments on real-world data sets, such as USPS, demonstrate the method's +capability to achieve this dramatic reduction in data size while maintaining +clustering performance. + +
+
+
+
+
+ + ☆ IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet + Videos NeurIPS 2024 + + +
+ Shape assembly is a ubiquitous task in daily life, integral for constructing +complex 3D structures like IKEA furniture. While significant progress has been +made in developing autonomous agents for shape assembly, existing datasets have +not yet tackled the 4D grounding of assembly instructions in videos, essential +for a holistic understanding of assembly in 3D space over time. We introduce +IKEA Video Manuals, a dataset that features 3D models of furniture parts, +instructional manuals, assembly videos from the Internet, and most importantly, +annotations of dense spatio-temporal alignments between these data modalities. +To demonstrate the utility of IKEA Video Manuals, we present five applications +essential for shape assembly: assembly plan generation, part-conditioned +segmentation, part-conditioned pose estimation, video object segmentation, and +furniture assembly based on instructional video manuals. For each application, +we provide evaluation metrics and baseline methods. Through experiments on our +annotated data, we highlight many challenges in grounding assembly instructions +in videos to improve shape assembly, including handling occlusions, varying +viewpoints, and extended assembly sequences. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face + Forgery Detection + + +
+ The rapid advancement of face forgery techniques has introduced a growing +variety of forgeries. Incremental Face Forgery Detection (IFFD), involving +gradually adding new forgery data to fine-tune the previously trained model, +has been introduced as a promising strategy to deal with evolving forgery +methods. However, a naively trained IFFD model is prone to catastrophic +forgetting when new forgeries are integrated, as treating all forgeries as a +single "Fake" class in the Real/Fake classification can cause different +forgery types to override one another, thereby resulting in the forgetting of +unique characteristics from earlier tasks and limiting the model's +effectiveness in learning forgery specificity and generality. In this paper, we +propose to stack the latent feature distributions of previous and new tasks +brick by brick, $\textit{i.e.}$, achieving $\textbf{aligned feature +isolation}$. In this manner, we aim to preserve learned forgery information and +accumulate new knowledge by minimizing distribution overriding, thereby +mitigating catastrophic forgetting. To achieve this, we first introduce Sparse +Uniform Replay (SUR) to obtain the representative subsets that could be treated +as the uniformly sparse versions of the previous global distributions. We then +propose a Latent-space Incremental Detector (LID) that leverages SUR data to +isolate and align distributions. For evaluation, we construct a more advanced +and comprehensive benchmark tailored for IFFD. The leading experimental results +validate the superiority of our method. + +
+
+
+
+
+ + ☆ Lung Disease Detection with Vision Transformers: A Comparative Study of + Machine Learning Methods + + +
+ Recent advancements in medical image analysis have predominantly relied on +Convolutional Neural Networks (CNNs), achieving impressive performance in chest +X-ray classification tasks, such as the 92% AUC reported by AutoThorax-Net and +the 88% AUC achieved by ChexNet in classification tasks. However, in the medical +field, even small improvements in accuracy can have significant clinical +implications. This study explores the application of Vision Transformers (ViT), +a state-of-the-art architecture in machine learning, to chest X-ray analysis, +aiming to push the boundaries of diagnostic accuracy. I present a comparative +analysis of two ViT-based approaches: one utilizing full chest X-ray images and +another focusing on segmented lung regions. Experiments demonstrate that both +methods surpass the performance of traditional CNN-based models, with the +full-image ViT achieving up to 97.83% accuracy and the lung-segmented ViT +reaching 96.58% accuracy in classification of diseases on three labels and an AUC of +94.54% when label numbers are increased to eight. Notably, the full-image +approach showed superior performance across all metrics, including precision, +recall, F1 score, and AUC-ROC. These findings suggest that Vision Transformers +can effectively capture relevant features from chest X-rays without the need +for explicit lung segmentation, potentially simplifying the preprocessing +pipeline while maintaining high accuracy. This research contributes to the +growing body of evidence supporting the efficacy of transformer-based +architectures in medical image analysis and highlights their potential to +enhance diagnostic precision in clinical settings. + +
+
+
+
+
+ + ☆ LeC$^2$O-NeRF: Learning Continuous and Compact Large-Scale Occupancy for + Urban Scenes + + +
+ In NeRF, a critical problem is to effectively estimate the occupancy to guide +empty-space skipping and point sampling. Grid-based methods work well for +small-scale scenes. However, on large-scale scenes, they are limited by +predefined bounding boxes, grid resolutions, and high memory usage for grid +updates, and thus struggle to speed up training for large-scale, irregularly +bounded and complex urban scenes without sacrificing accuracy. In this paper, +we propose to learn a continuous and compact large-scale occupancy network, +which can classify 3D points as occupied or unoccupied points. We train this +occupancy network end-to-end together with the radiance field in a +self-supervised manner by three designs. First, we propose a novel imbalanced +occupancy loss to regularize the occupancy network. It makes the occupancy +network effectively control the ratio of unoccupied and occupied points, +motivated by the prior that most of 3D scene points are unoccupied. Second, we +design an imbalanced architecture containing a large scene network and a small +empty space network to separately encode occupied and unoccupied points +classified by the occupancy network. This imbalanced structure can effectively +model the imbalanced nature of occupied and unoccupied regions. Third, we +design an explicit density loss to guide the occupancy network, making the +density of unoccupied points smaller. As far as we know, we are the first to +learn a continuous and compact occupancy of large-scale NeRF by a network. In +our experiments, our occupancy network can quickly learn more compact, accurate +and smooth occupancy compared to the occupancy grid. With our learned occupancy +as guidance for empty space skipping on challenging large-scale benchmarks, our +method consistently obtains higher accuracy compared to the occupancy grid, and +our method can speed up state-of-the-art NeRF methods without sacrificing +accuracy. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ TL-CLIP: A Power-specific Multimodal Pre-trained Visual Foundation Model + for Transmission Line Defect Recognition + + +
+ Transmission line defect recognition models have traditionally used general +pre-trained weights as the initial basis for their training. These models often +suffer weak generalization capability due to the lack of domain knowledge in +the pre-training dataset. To address this issue, we propose a two-stage +transmission-line-oriented contrastive language-image pre-training (TL-CLIP) +framework, which lays a more effective foundation for transmission line defect +recognition. The pre-training process employs a novel power-specific multimodal +algorithm assisted with two power-specific pre-training tasks for better +modeling the power-related semantic knowledge contained in the inspection data. +To fine-tune the pre-trained model, we develop a transfer learning strategy, +namely fine-tuning with pre-training objective (FTP), to alleviate the +overfitting problem caused by limited inspection data. Experimental results +demonstrate that the proposed method significantly improves the performance of +transmission line defect recognition in both classification and detection +tasks, indicating clear advantages over traditional pre-trained models in the +scene of transmission line inspection. + +
+
+
+
+
+ + ☆ GPS-Gaussian+: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-Time Human-Scene Rendering from Sparse Views CVPR 2024 + + +
+ Differentiable rendering techniques have recently shown promising results for +free-viewpoint video synthesis of characters. However, such methods, either +Gaussian Splatting or neural implicit rendering, typically necessitate +per-subject optimization which does not meet the requirement of real-time +rendering in an interactive application. We propose a generalizable Gaussian +Splatting approach for high-resolution image rendering under a sparse-view +camera setting. To this end, we introduce Gaussian parameter maps defined on +the source views and directly regress Gaussian properties for instant novel +view synthesis without any fine-tuning or optimization. We train our Gaussian +parameter regression module on human-only data or human-scene data, jointly +with a depth estimation module to lift 2D parameter maps to 3D space. The +proposed framework is fully differentiable with both depth and rendering +supervision or with only rendering supervision. We further introduce a +regularization term and an epipolar attention mechanism to preserve geometry +consistency between two source views, especially when neglecting depth +supervision. Experiments on several datasets demonstrate that our method +outperforms state-of-the-art methods while achieving an exceeding rendering +speed. + +
+
+ comment: Journal extension of CVPR 2024, Project + page: https://yaourtb.github.io/GPS-Gaussian+ +
+
+
+
+
+ + ☆ MAIRA-Seg: Enhancing Radiology Report Generation with Segmentation-Aware + Multimodal Large Language Models ML4H 2024 + + +
+ There is growing interest in applying AI to radiology report generation, +particularly for chest X-rays (CXRs). This paper investigates whether +incorporating pixel-level information through segmentation masks can improve +fine-grained image interpretation of multimodal large language models (MLLMs) +for radiology report generation. We introduce MAIRA-Seg, a segmentation-aware +MLLM framework designed to utilize semantic segmentation masks alongside CXRs +for generating radiology reports. We train expert segmentation models to obtain +mask pseudolabels for radiology-specific structures in CXRs. Subsequently, +building on the architectures of MAIRA, a CXR-specialised model for report +generation, we integrate a trainable segmentation tokens extractor that +leverages these mask pseudolabels, and employ mask-aware prompting to generate +draft radiology reports. Our experiments on the publicly available MIMIC-CXR +dataset show that MAIRA-Seg outperforms non-segmentation baselines. We also +investigate set-of-marks prompting with MAIRA and find that MAIRA-Seg +consistently demonstrates comparable or superior performance. The results +confirm that using segmentation masks enhances the nuanced reasoning of MLLMs, +potentially contributing to better clinical outcomes. + +
+
+ comment: Accepted as Proceedings Paper at ML4H 2024 +
+
+
+
+
+ + ☆ Scalable Autoregressive Monocular Depth Estimation + + +
+ This paper proposes a new autoregressive model as an effective and scalable +monocular depth estimator. Our idea is simple: We tackle the monocular depth +estimation (MDE) task with an autoregressive prediction paradigm, based on two +core designs. First, our depth autoregressive model (DAR) treats the depth map +of different resolutions as a set of tokens, and conducts the low-to-high +resolution autoregressive objective with a patch-wise causal mask. Second, our +DAR recursively discretizes the entire depth range into more compact intervals, +and attains the coarse-to-fine granularity autoregressive objective in an +ordinal-regression manner. By coupling these two autoregressive objectives, our +DAR establishes new state-of-the-art (SOTA) on KITTI and NYU Depth v2 by clear +margins. Further, our scalable approach allows us to scale the model up to 2.0B +and achieve the best RMSE of 1.799 on the KITTI dataset (5% improvement) +compared to 1.896 by the current SOTA (Depth Anything). DAR further showcases +zero-shot generalization ability on unseen datasets. These results suggest that +DAR yields superior performance with an autoregressive prediction paradigm, +providing a promising approach to equip modern autoregressive large models +(e.g., GPT-4o) with depth estimation capabilities. + +
+
+
+
+
+ + ☆ CCExpert: Advancing MLLM Capability in Remote Sensing Change Captioning + with Difference-Aware Integration and a Foundational Dataset + + +
+ Remote Sensing Image Change Captioning (RSICC) aims to generate natural +language descriptions of surface changes between multi-temporal remote sensing +images, detailing the categories, locations, and dynamics of changed objects +(e.g., additions or disappearances). Many current methods attempt to leverage +the long-sequence understanding and reasoning capabilities of multimodal large +language models (MLLMs) for this task. However, without comprehensive data +support, these approaches often alter the essential feature transmission +pathways of MLLMs, disrupting the intrinsic knowledge within the models and +limiting their potential in RSICC. In this paper, we propose a novel model, +CCExpert, based on a new, advanced multimodal large model framework. Firstly, +we design a difference-aware integration module to capture multi-scale +differences between bi-temporal images and incorporate them into the original +image context, thereby enhancing the signal-to-noise ratio of differential +features. Secondly, we constructed a high-quality, diversified dataset called +CC-Foundation, containing 200,000 image pairs and 1.2 million captions, to +provide substantial data support for continue pretraining in this domain. +Lastly, we employed a three-stage progressive training process to ensure the +deep integration of the difference-aware integration module with the pretrained +MLLM. CCExpert achieved a notable performance of $S^*_m=81.80$ on the LEVIR-CC +benchmark, significantly surpassing previous state-of-the-art methods. The code +and part of the dataset will soon be open-sourced at +https://github.com/Meize0729/CCExpert. + +
+
+
+
+
+ + ☆ Text-guided Zero-Shot Object Localization + + +
+ Object localization is a hot topic in the computer vision area, which aims to +identify and determine the precise location of specific objects from images or +videos. Most existing object localization methods heavily rely on extensive +labeled data, which are costly to annotate and constrain their applicability. +Therefore, we propose a new Zero-Shot Object Localization (ZSOL) framework for +addressing the aforementioned challenges. In the proposed framework, we +introduce the Contrastive Language Image Pre-training (CLIP) module which could +integrate visual and linguistic information effectively. Furthermore, we design +a Text Self-Similarity Matching (TSSM) module, which could improve the +localization accuracy by enhancing the representation of text features +extracted by the CLIP module. Hence, the proposed framework can be guided by prompt +words to identify and locate specific objects in an image in the absence of +labeled samples. The results of extensive experiments demonstrate that the +proposed method could improve the localization performance significantly and +establish an effective benchmark for further research. + +
+
+
+
+
+ + ☆ Superpixel-informed Implicit Neural Representation for Multi-Dimensional + Data ECCV 2024 + + +
+ Recently, implicit neural representations (INRs) have attracted increasing +attention for multi-dimensional data recovery. However, INRs simply map +coordinates via a multi-layer perceptron (MLP) to corresponding values, +ignoring the inherent semantic information of the data. To leverage semantic +priors from the data, we propose a novel Superpixel-informed INR (S-INR). +Specifically, we suggest utilizing generalized superpixel instead of pixel as +an alternative basic unit of INR for multi-dimensional data (e.g., images and +weather data). The coordinates of generalized superpixels are first fed into +exclusive attention-based MLPs, and then the intermediate results interact with +a shared dictionary matrix. The elaborately designed modules in S-INR allow us +to ingeniously exploit the semantic information within and across generalized +superpixels. Extensive experiments on various applications validate the +effectiveness and efficacy of our S-INR compared to state-of-the-art INR +methods. + +
+
+ comment: Accepted at ECCV 2024, 18 pages, 7 figures +
+
+
+
+
+ + ☆ A comprehensive survey of oracle character recognition: challenges, + benchmarks, and beyond + + +
+ Oracle character recognition-an analysis of ancient Chinese inscriptions +found on oracle bones-has become a pivotal field intersecting archaeology, +paleography, and historical cultural studies. Traditional methods of oracle +character recognition have relied heavily on manual interpretation by experts, +which is not only labor-intensive but also limits broader accessibility to the +general public. With recent breakthroughs in pattern recognition and deep +learning, there is a growing movement towards the automation of oracle +character recognition (OrCR), showing considerable promise in tackling the +challenges inherent to these ancient scripts. However, a comprehensive +understanding of OrCR still remains elusive. Therefore, this paper presents a +systematic and structured survey of the current landscape of OrCR research. We +commence by identifying and analyzing the key challenges of OrCR. Then, we +provide an overview of the primary benchmark datasets and digital resources +available for OrCR. A review of contemporary research methodologies follows, in +which their respective efficacies, limitations, and applicability to the +complex nature of oracle characters are critically highlighted and examined. +Additionally, our review extends to ancillary tasks associated with OrCR across +diverse disciplines, providing a broad-spectrum analysis of its applications. +We conclude with a forward-looking perspective, proposing potential avenues for +future investigations that could yield significant advancements in the field. + +
+
+
+
+
+ + ☆ Visual-Semantic Graph Matching Net for Zero-Shot Learning + + +
+ Zero-shot learning (ZSL) aims to leverage additional semantic information to +recognize unseen classes. To transfer knowledge from seen to unseen classes, +most ZSL methods often learn a shared embedding space by simply aligning visual +embeddings with semantic prototypes. However, methods trained under this +paradigm often struggle to learn robust embedding space because they align the +two modalities in an isolated manner among classes, which ignores the crucial +class relationship during the alignment process. To address the aforementioned +challenges, this paper proposes a Visual-Semantic Graph Matching Net, termed as +VSGMN, which leverages semantic relationships among classes to aid in +visual-semantic embedding. VSGMN employs a Graph Build Network (GBN) and a +Graph Matching Network (GMN) to achieve two-stage visual-semantic alignment. +Specifically, GBN first utilizes an embedding-based approach to build visual +and semantic graphs in the semantic space and align the embedding with its +prototype for first-stage alignment. Additionally, to supplement unseen class +relations in these graphs, GBN also builds the unseen class nodes based on +semantic relationships. In the second stage, GMN continuously integrates +neighbor and cross-graph information into the constructed graph nodes, and +aligns the node relationships between the two graphs under the class +relationship constraint. Extensive experiments on three benchmark datasets +demonstrate that VSGMN achieves superior performance in both conventional and +generalized ZSL scenarios. The implementation of our VSGMN and experimental +results are available at github: https://github.com/dbwfd/VSGMN + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ Teaching Video Diffusion Model with Latent Physical Phenomenon Knowledge + + +
+ Video diffusion models have exhibited tremendous progress in various video +generation tasks. However, existing models struggle to capture latent physical +knowledge, failing to infer physical phenomena that are challenging to +articulate with natural language. Generating videos following the fundamental +physical laws is still an open challenge. To address this challenge, we +propose a novel method to teach video diffusion models with latent physical +phenomenon knowledge, enabling the accurate generation of physically informed +phenomena. Specifically, we first pretrain Masked Autoencoders (MAE) to +reconstruct the physical phenomena, resulting in output embeddings that +encapsulate latent physical phenomenon knowledge. Leveraging these embeddings, +we could generate the pseudo-language prompt features based on the aligned +spatial relationships between CLIP vision and language encoders. Particularly, +given that diffusion models typically use CLIP's language encoder for text +prompt embeddings, our approach integrates the CLIP visual features informed by +latent physical knowledge into a quaternion hidden space. This enables the +modeling of spatial relationships to produce physical knowledge-informed +pseudo-language prompts. By incorporating these prompt features and fine-tuning +the video diffusion model in a parameter-efficient manner, the physical +knowledge-informed videos are successfully generated. We validate our method +extensively through both numerical simulations and real-world observations of +physical phenomena, demonstrating its remarkable performance across diverse +scenarios. + +
+
+ comment: 7 figures, 14 pages +
+
+
+
+
+ + ☆ Video-to-Task Learning via Motion-Guided Attention for Few-Shot Action + Recognition + + +
+ In recent years, few-shot action recognition has achieved remarkable +performance through spatio-temporal relation modeling. Although a wide range of +spatial and temporal alignment modules have been proposed, they primarily +address spatial or temporal misalignments at the video level, while the +spatio-temporal relationships across different videos at the task level remain +underexplored. Recent studies utilize class prototypes to learn task-specific +features but overlook the spatio-temporal relationships across different videos +at the task level, especially in the spatial dimension, where these +relationships provide rich information. In this paper, we propose a novel Dual +Motion-Guided Attention Learning method (called DMGAL) for few-shot action +recognition, aiming to learn the spatio-temporal relationships from the +video-specific to the task-specific level. To achieve this, we propose a +carefully designed Motion-Guided Attention (MGA) method to identify and +correlate motion-related region features from the video level to the task +level. Specifically, the Self Motion-Guided Attention module (S-MGA) achieves +spatio-temporal relation modeling at the video level by identifying and +correlating motion-related region features between different frames within a +video. The Cross Motion-Guided Attention module (C-MGA) identifies and +correlates motion-related region features between frames of different videos +within a specific task to achieve spatio-temporal relationships at the task +level. This approach enables the model to construct class prototypes that fully +incorporate spatio-temporal relationships from the video-specific level to the +task-specific level. We validate the effectiveness of our DMGAL method by +employing both fully fine-tuning and adapter-tuning paradigms. The models +developed using these paradigms are termed DMGAL-FT and DMGAL-Adapter, +respectively. + +
+
+
+
+
+ + ☆ Color-Oriented Redundancy Reduction in Dataset Distillation NeurIPS + 2024 + + +
+ Dataset Distillation (DD) is designed to generate condensed representations +of extensive image datasets, enhancing training efficiency. Despite recent +advances, there remains considerable potential for improvement, particularly in +addressing the notable redundancy within the color space of distilled images. +In this paper, we propose AutoPalette, a framework that minimizes color +redundancy at the individual image and overall dataset levels, respectively. At +the image level, we employ a palette network, a specialized neural network, to +dynamically allocate colors from a reduced color space to each pixel. The +palette network identifies essential areas in synthetic images for model +training and consequently assigns more unique colors to them. At the dataset +level, we develop a color-guided initialization strategy to minimize redundancy +among images. Representative images with the least replicated color patterns +are selected based on the information gain. A comprehensive performance study +involving various datasets and evaluation scenarios is conducted, demonstrating +the superior performance of our proposed color-aware DD compared to existing DD +methods. The code is available at +https://github.com/KeViNYuAn0314/AutoPalette. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ☆ TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation + + +
+ The advancement of medical image segmentation techniques has been propelled +by the adoption of deep learning techniques, particularly UNet-based +approaches, which exploit semantic information to improve the accuracy of +segmentations. However, the order of organs in scanned images has been +disregarded by current medical image segmentation approaches based on UNet. +Furthermore, the inherent network structure of UNet does not provide direct +capabilities for integrating temporal information. To efficiently integrate +temporal information, we propose TP-UNet that utilizes temporal prompts, +encompassing organ-construction relationships, to guide the segmentation UNet +model. Specifically, our framework is featured with cross-attention and +semantic alignment based on unsupervised contrastive learning to combine +temporal prompts and image features effectively. Extensive evaluations on two +medical image segmentation datasets demonstrate the state-of-the-art +performance of TP-UNet. Our implementation will be open-sourced after +acceptance. + +
+
+
+
+
+ + ☆ Performance Evaluation of Geospatial Images based on Zarr and Tiff + + +
+ This study evaluates the performance of geospatial image processing using two +distinct data storage formats: Zarr and TIFF. Geospatial images are used in +numerous applications like environmental monitoring, urban planning, and +disaster management. The traditional Tagged Image File Format (TIFF) is widely used +because it is simple and compatible, but it may suffer from performance limitations +when working with large datasets. Zarr is a newer format designed for cloud +systems that offers scalability and efficient storage through data chunking and +compression techniques. This study compares the two formats in terms of storage +efficiency, access speed, and computational performance during typical +geospatial processing tasks. Through analysis on a range of geospatial +datasets, this study provides details about the practical advantages and limitations +of each format, helping users to select the appropriate format based on their +specific needs and constraints. + +
+
+
+
+
+ + ☆ Neuron: Learning Context-Aware Evolving Representations for Zero-Shot + Skeleton Action Recognition + + +
+ Zero-shot skeleton action recognition is a non-trivial task that requires +robust unseen generalization with prior knowledge from only seen classes and +shared semantics. Existing methods typically build the skeleton-semantics +interactions by uncontrollable mappings and conspicuous representations, +and thereby can hardly capture the intricate and fine-grained relationship for +effective cross-modal transferability. To address these issues, we propose a +novel dyNamically Evolving dUal skeleton-semantic syneRgistic framework with +the guidance of cOntext-aware side informatioN (dubbed Neuron), to explore more +fine-grained cross-modal correspondence from micro to macro perspectives at +both spatial and temporal levels, respectively. Concretely, 1) we first +construct the spatial-temporal evolving micro-prototypes and integrate dynamic +context-aware side information to capture the intricate and synergistic +skeleton-semantic correlations step-by-step, progressively refining cross-modal +alignment; and 2) we introduce the spatial compression and temporal memory +mechanisms to guide the growth of spatial-temporal micro-prototypes, enabling +them to absorb structure-related spatial representations and +regularity-dependent temporal patterns. Notably, such processes are analogous +to the learning and growth of neurons, equipping the framework with the +capacity to generalize to novel unseen action categories. Extensive experiments +on various benchmark datasets demonstrated the superiority of the proposed +method. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Reducing Label Dependency for Underwater Scene Understanding: A Survey + of Datasets, Techniques and Applications + + +
+ Underwater surveys provide long-term data for informing management +strategies, monitoring coral reef health, and estimating blue carbon stocks. +Advances in broad-scale survey methods, such as robotic underwater vehicles, +have increased the range of marine surveys but generate large volumes of +imagery requiring analysis. Computer vision methods such as semantic +segmentation aid automated image analysis, but typically rely on fully +supervised training with extensive labelled data. While ground truth label +masks for tasks like street scene segmentation can be quickly and affordably +generated by non-experts through crowdsourcing services like Amazon Mechanical +Turk, ecology presents greater challenges. The complexity of underwater images, +coupled with the specialist expertise needed to accurately identify species at +the pixel level, makes this process costly, time-consuming, and heavily +dependent on domain experts. In recent years, some works have performed +automated analysis of underwater imagery, and a smaller number of studies have +focused on weakly supervised approaches which aim to reduce the expert-provided +labelled data required. This survey focuses on approaches which reduce +dependency on human expert input, while reviewing the prior and related +approaches to position these works in the wider field of underwater perception. +Further, we offer an overview of coastal ecosystems and the challenges of +underwater imagery. We provide background on weakly and self-supervised deep +learning and integrate these elements into a taxonomy that centres on the +intersection of underwater monitoring, computer vision, and deep learning, +while motivating approaches for weakly supervised deep learning with reduced +dependency on domain expert data annotations. Lastly, the survey examines +available datasets and platforms, and identifies gaps, barriers, and +opportunities for automating underwater surveys. + +
+
+ comment: 70 pages, 20 figures +
+
+
+
+
+ + ☆ Zero-Shot Automatic Annotation and Instance Segmentation using + LLM-Generated Datasets: Eliminating Field Imaging and Manual Annotation for + Deep Learning Model Development + + +
+ Currently, deep learning-based instance segmentation for various applications +(e.g., Agriculture) is predominantly performed using a labor-intensive process +involving extensive field data collection using sophisticated sensors, followed +by careful manual annotation of images, presenting significant logistical and +financial challenges to researchers and organizations. The process also slows +down the model development and training process. In this study, we presented a +novel method for deep learning-based instance segmentation of apples in +commercial orchards that eliminates the need for labor-intensive field data +collection and manual annotation. Utilizing a Large Language Model (LLM), we +synthetically generated orchard images and automatically annotated them using +the Segment Anything Model (SAM) integrated with a YOLO11 base model. This +method significantly reduces reliance on physical sensors and manual data +processing, presenting a major advancement in "Agricultural AI". The synthetic, +auto-annotated dataset was used to train the YOLO11 model for Apple instance +segmentation, which was then validated on real orchard images. The results +showed that the automatically generated annotations achieved a Dice Coefficient +of 0.9513 and an IoU of 0.9303, validating the accuracy and overlap of the mask +annotations. All YOLO11 configurations, trained solely on these synthetic +datasets with automated annotations, accurately recognized and delineated +apples, highlighting the method's efficacy. Specifically, the YOLO11m-seg +configuration achieved a mask precision of 0.902 and a mask mAP@50 of 0.833 on +test images collected from a commercial orchard. Additionally, the YOLO11l-seg +configuration outperformed other models in validation on 40 LLM-generated +images, achieving the highest mask precision and mAP@50 metrics. + Keywords: YOLO, SAM, SAMv2, YOLO11, YOLOv11, Segment Anything, YOLO-SAM + +
+
+
+
+
+ + ☆ Continuous K-space Recovery Network with Image Guidance for Fast MRI + Reconstruction + + +
+ Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis +while facing the challenge of long scanning time. To reduce the acquisition +time, fast MRI reconstruction aims to restore high-quality images from the +undersampled k-space. Existing methods typically train deep learning models to +map the undersampled data to artifact-free MRI images. However, these studies +often overlook the unique properties of k-space and directly apply general +networks designed for image processing to k-space recovery, leaving the precise +learning of k-space largely underexplored. In this work, we propose a +continuous k-space recovery network from a new perspective of implicit neural +representation with image domain guidance, which boosts the performance of MRI +reconstruction. Specifically, (1) an implicit neural representation based +encoder-decoder structure is customized to continuously query unsampled +k-values. (2) an image guidance module is designed to mine the semantic +information from the low-quality MRI images to further guide the k-space +recovery. (3) a multi-stage training strategy is proposed to recover dense +k-space progressively. Extensive experiments conducted on CC359, fastMRI, and +IXI datasets demonstrate the effectiveness of our method and its superiority +over other competitors. + +
+
+
+
+
+ + ☆ Towards Open-Vocabulary Audio-Visual Event Localization + + +
+ The Audio-Visual Event Localization (AVEL) task aims to temporally locate and +classify video events that are both audible and visible. Most research in this +field assumes a closed-set setting, which restricts these models' ability to +handle test data containing event categories absent (unseen) during training. +Recently, a few studies have explored AVEL in an open-set setting, enabling the +recognition of unseen events as "unknown", but without providing +category-specific semantics. In this paper, we advance the field by introducing +the Open-Vocabulary Audio-Visual Event Localization (OV-AVEL) problem, which +requires localizing audio-visual events and predicting explicit categories for +both seen and unseen data at inference. To address this new task, we propose +the OV-AVEBench dataset, comprising 24,800 videos across 67 real-life +audio-visual scenes (seen:unseen = 46:21), each with manual segment-level +annotation. We also establish three evaluation metrics for this task. Moreover, +we investigate two baseline approaches, one training-free and one using a +further fine-tuning paradigm. Specifically, we utilize the unified multimodal +space from the pretrained ImageBind model to extract audio, visual, and textual +(event classes) features. The training-free baseline then determines +predictions by comparing the consistency of audio-text and visual-text feature +similarities. The fine-tuning baseline incorporates lightweight temporal layers +to encode temporal relations within the audio and visual modalities, using +OV-AVEBench training data for model fine-tuning. We evaluate these baselines on +the proposed OV-AVEBench dataset and discuss potential directions for future +work in this new field. + +
+
+ comment: Project page: https://github.com/jasongief/OV-AVEL +
+
+
+
+
+ + ☆ Cross-Patient Pseudo Bags Generation and Curriculum Contrastive Learning + for Imbalanced Multiclassification of Whole Slide Image + + +
+ Pathology computing has dramatically improved pathologists' workflow and +diagnostic decision-making processes. Although computer-aided diagnostic +systems have shown considerable value in whole slide image (WSI) analysis, the +problem of multi-classification under sample imbalance remains an intractable +challenge. To address this, we propose learning fine-grained information by +generating sub-bags with feature distributions similar to the original WSIs. +Additionally, we utilize a pseudo-bag generation algorithm to further leverage +the abundant and redundant information in WSIs, allowing efficient training in +unbalanced-sample multi-classification tasks. Furthermore, we introduce an +affinity-based sample selection and curriculum contrastive learning strategy to +enhance the stability of model representation learning. Unlike previous +approaches, our framework transitions from learning bag-level representations +to understanding and exploiting the feature distribution of multi-instance +bags. Our method demonstrates significant performance improvements on three +datasets, including tumor classification and lymph node metastasis. On average, +it achieves a 4.39-point improvement in F1 score compared to the second-best +method across the three tasks, underscoring its superior performance. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Semantic or Covariate? A Study on the Intractable Case of + Out-of-Distribution Detection + + +
+ The primary goal of out-of-distribution (OOD) detection tasks is to identify +inputs with semantic shifts, i.e., if samples from novel classes are absent in +the in-distribution (ID) dataset used for training, we should reject these OOD +samples rather than misclassifying them into existing ID classes. However, we +find the current definition of "semantic shift" is ambiguous, which renders +certain OOD testing protocols intractable for the post-hoc OOD detection +methods based on a classifier trained on the ID dataset. In this paper, we +offer a more precise definition of the Semantic Space and the Covariate Space +for the ID distribution, allowing us to theoretically analyze which types of +OOD distributions make the detection task intractable. To avoid the flaw in the +existing OOD settings, we further define the "Tractable OOD" setting which +ensures the distinguishability of OOD and ID distributions for the post-hoc OOD +detection methods. Finally, we conduct several experiments to demonstrate the +necessity of our definitions and validate the correctness of our theorems. + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ DrivingSphere: Building a High-fidelity 4D World for Closed-loop + Simulation + + +
+ Autonomous driving evaluation requires simulation environments that closely +replicate actual road conditions, including real-world sensory data and +responsive feedback loops. However, many existing simulations need to predict +waypoints along fixed routes on public datasets or synthetic photorealistic +data, i.e., open-loop simulation usually lacks the ability to assess dynamic +decision-making. While the recent efforts of closed-loop simulation offer +feedback-driven environments, they cannot process visual sensor inputs or +produce outputs that differ from real-world data. To address these challenges, +we propose DrivingSphere, a realistic and closed-loop simulation framework. Its +core idea is to build a 4D world representation and generate real-life and +controllable driving scenarios. Specifically, our framework includes a Dynamic +Environment Composition module that constructs a detailed 4D driving world with +a format of occupancy equipping with static backgrounds and dynamic objects, +and a Visual Scene Synthesis module that transforms this data into +high-fidelity, multi-view video outputs, ensuring spatial and temporal +consistency. By providing a dynamic and realistic simulation environment, +DrivingSphere enables comprehensive testing and validation of autonomous +driving algorithms, ultimately advancing the development of more reliable +autonomous cars. The benchmark will be publicly released. + +
+
+ comment: https://yanty123.github.io/DrivingSphere/ +
+
+
+
+
+ + ☆ Noise Filtering Benchmark for Neuromorphic Satellites Observations + + +
+ Event cameras capture sparse, asynchronous brightness changes which offer +high temporal resolution, high dynamic range, low power consumption, and sparse +data output. These advantages make them ideal for Space Situational Awareness, +particularly in detecting resident space objects moving within a telescope's +field of view. However, the output from event cameras often includes +substantial background activity noise, which is known to be more prevalent in +low-light conditions. This noise can overwhelm the sparse events generated by +satellite signals, making detection and tracking more challenging. Existing +noise-filtering algorithms struggle in these scenarios because they are +typically designed for denser scenes, where losing some signal is acceptable. +This limitation hinders the application of event cameras in complex, real-world +environments where signals are extremely sparse. In this paper, we propose new +event-driven noise-filtering algorithms specifically designed for very sparse +scenes. We categorise the algorithms into logical-based and learning-based +approaches and benchmark their performance against 11 state-of-the-art +noise-filtering algorithms, evaluating how effectively they remove noise and +hot pixels while preserving the signal. Their performance was quantified by +measuring signal retention and noise removal accuracy, with results reported +using ROC curves across the parameter space. Additionally, we introduce a new +high-resolution satellite dataset with ground truth from a real-world platform +under various noise conditions, which we have made publicly available. Code, +dataset, and trained weights are available at +https://github.com/samiarja/dvs_sparse_filter. + +
+
+ comment: 17 pages, 8 figures, 1 table +
+
+
+
+
+ + ☆ BeautyBank: Encoding Facial Makeup in Latent Space + + +
+ The advancement of makeup transfer, editing, and image encoding has +demonstrated their effectiveness and superior quality. However, existing makeup +works primarily focus on low-dimensional features such as color distributions +and patterns, limiting their versatility across a wide range of makeup +applications. Furthermore, existing high-dimensional latent encoding methods +mainly target global features such as structure and style, and are less +effective for tasks that require detailed attention to local color and pattern +features of makeup. To overcome these limitations, we propose BeautyBank, a +novel makeup encoder that disentangles pattern features of bare and makeup +faces. Our method encodes makeup features into a high-dimensional space, +preserving essential details necessary for makeup reconstruction and broadening +the scope of potential makeup research applications. We also propose a +Progressive Makeup Tuning (PMT) strategy, specifically designed to enhance the +preservation of detailed makeup features while preventing the inclusion of +irrelevant attributes. We further explore novel makeup applications, including +facial image generation with makeup injection and makeup similarity measure. +Extensive empirical experiments validate that our method offers superior task +adaptability and holds significant potential for widespread application in +various makeup-related fields. Furthermore, to address the lack of large-scale, +high-quality paired makeup datasets in the field, we constructed the +Bare-Makeup Synthesis Dataset (BMS), comprising 324,000 pairs of 512x512 pixel +images of bare and makeup-enhanced faces. + +
+
+
+
+
+ + ☆ Efficient Transfer Learning for Video-language Foundation Models + + +
+ Pre-trained vision-language models provide a robust foundation for efficient +transfer learning across various downstream tasks. In the field of video action +recognition, mainstream approaches often introduce additional parameter modules +to capture temporal information. While the increased model capacity brought by +these additional parameters helps better fit the video-specific inductive +biases, existing methods require learning a large number of parameters and are +prone to catastrophic forgetting of the original generalizable knowledge. In +this paper, we propose a simple yet effective Multi-modal Spatio-Temporal +Adapter (MSTA) to improve the alignment between representations in the text and +vision branches, achieving a balance between general knowledge and +task-specific knowledge. Furthermore, to mitigate over-fitting and enhance +generalizability, we introduce a spatio-temporal description-guided consistency +constraint. This constraint involves feeding template inputs (i.e., "a video +of {cls}") into the trainable language branch, while +LLM-generated spatio-temporal descriptions are input into the pre-trained +language branch, enforcing consistency between the outputs of the two branches. +This mechanism prevents over-fitting to downstream tasks and improves the +distinguishability of the trainable branch within the spatio-temporal semantic +space. We evaluate the effectiveness of our approach across four tasks: +zero-shot transfer, few-shot learning, base-to-novel generalization, and +fully-supervised learning. Compared to many state-of-the-art methods, our MSTA +achieves outstanding performance across all evaluations, while using only 2-7% +of the trainable parameters in the original model. Code will be available at +https://github.com/chenhaoxing/ETL4Video. + +
+
+
+
+
+ + ☆ The Sound of Water: Inferring Physical Properties from Pouring Liquids + + +
+ We study the connection between audio-visual observations and the underlying +physics of a mundane yet intriguing everyday activity: pouring liquids. Given +only the sound of liquid pouring into a container, our objective is to +automatically infer physical properties such as the liquid level, the shape and +size of the container, the pouring rate and the time to fill. To this end, we: +(i) show in theory that these properties can be determined from the fundamental +frequency (pitch); (ii) train a pitch detection model with supervision from +simulated data and visual data with a physics-inspired objective; (iii) +introduce a new large dataset of real pouring videos for a systematic study; +(iv) show that the trained model can indeed infer these physical properties for +real data; and finally, (v) we demonstrate strong generalization to various +container shapes, other datasets, and in-the-wild YouTube videos. Our work +presents a keen understanding of a narrow yet rich problem at the intersection +of acoustics, physics, and learning. It opens up applications to enhance +multisensory perception in robotic pouring. + +
+
+ comment: 25 pages, 17 figures. Project page at + https://bpiyush.github.io/pouring-water-website +
+
+
+
+
+ + ☆ Relational Contrastive Learning and Masked Image Modeling for Scene Text + Recognition + + +
+ Context-aware methods have achieved remarkable advancements in supervised +scene text recognition by leveraging semantic priors from words. Considering +the heterogeneity of text and background in STR, we propose that such +contextual priors can be reinterpreted as the relations between textual +elements, serving as effective self-supervised labels for representation +learning. However, textual relations are restricted to the finite size of the +dataset due to lexical dependencies, which causes an over-fitting problem, thus +compromising the representation quality. To address this, our work introduces a +unified framework of Relational Contrastive Learning and Masked Image Modeling +for STR (RCMSTR), which explicitly models the enriched textual relations. For +the RCL branch, we first introduce the relational rearrangement module to +cultivate new relations on the fly. Based on this, we further conduct +relational contrastive learning to model the intra- and inter-hierarchical +relations for frames, sub-words and words. On the other hand, MIM can naturally +boost the context information via masking, where we find that the block masking +strategy is more effective for STR. For the effective integration of RCL and +MIM, we also introduce a novel decoupling design aimed at mitigating the impact +of masked images on contrastive learning. Additionally, to enhance the +compatibility of MIM with CNNs, we propose the adoption of sparse convolutions +and directly sharing the weights with dense convolutions in training. The +proposed RCMSTR demonstrates superior performance in various evaluation +protocols for different STR-related downstream tasks, outperforming the +existing state-of-the-art self-supervised STR techniques. Ablation studies and +qualitative experimental results further validate the effectiveness of our +method. The code and pre-trained models will be available at +https://github.com/ThunderVVV/RCMSTR . + +
+
+
+
+
+ + ☆ DeforHMR: Vision Transformer with Deformable Cross-Attention for 3D + Human Mesh Recovery 3DV2025 + + +
+ Human Mesh Recovery (HMR) is an important yet challenging problem with +applications across various domains including motion capture, augmented +reality, and biomechanics. Accurately predicting human pose parameters from a +single image remains a challenging 3D computer vision task. In this work, we +introduce DeforHMR, a novel regression-based monocular HMR framework designed +to enhance the prediction of human pose parameters using deformable attention +transformers. DeforHMR leverages a novel query-agnostic deformable +cross-attention mechanism within the transformer decoder to effectively regress +the visual features extracted from a frozen pretrained vision transformer (ViT) +encoder. The proposed deformable cross-attention mechanism allows the model to +attend to relevant spatial features more flexibly and in a data-dependent +manner. Equipped with a transformer decoder capable of spatially-nuanced +attention, DeforHMR achieves state-of-the-art performance for single-frame +regression-based methods on the widely used 3D HMR benchmarks 3DPW and RICH. By +pushing the boundary on the field of 3D human mesh recovery through deformable +attention, we introduce a new, effective paradigm for decoding local spatial +information from large pretrained vision encoders in computer vision. + +
+
+ comment: 11 pages, 5 figures, 3DV2025 +
+
+
+
+
+ + ☆ Distill the Best, Ignore the Rest: Improving Dataset Distillation with + Loss-Value-Based Pruning + + +
+ Dataset distillation has gained significant interest in recent years, yet +existing approaches typically distill from the entire dataset, potentially +including non-beneficial samples. We introduce a novel "Prune First, Distill +After" framework that systematically prunes datasets via loss-based sampling +prior to distillation. By leveraging pruning before classical distillation +techniques and generative priors, we create a representative core-set that +leads to enhanced generalization for unseen architectures - a significant +challenge of current distillation methods. More specifically, our proposed +framework significantly boosts distilled quality, achieving up to a 5.2 +percentage points accuracy increase even with substantial dataset pruning, +i.e., removing 80% of the original dataset prior to distillation. Overall, our +experimental results highlight the advantages of our easy-sample prioritization +and cross-architecture robustness, paving the way for more effective and +high-quality dataset distillation. + +
+
+
+
+
+ + ☆ FruitNinja: 3D Object Interior Texture Generation with Gaussian + Splatting + + +
+ In the real world, objects reveal internal textures when sliced or cut, yet +this behavior is not well-studied in 3D generation tasks today. For example, +slicing a virtual 3D watermelon should reveal flesh and seeds. Given that no +available dataset captures an object's full internal structure and collecting +data from all slices is impractical, generative methods become the obvious +approach. However, current 3D generation and inpainting methods often focus on +visible appearance and overlook internal textures. To bridge this gap, we +introduce FruitNinja, the first method to generate internal textures for 3D +objects undergoing geometric and topological changes. Our approach produces +objects via 3D Gaussian Splatting (3DGS) with both surface and interior +textures synthesized, enabling real-time slicing and rendering without +additional optimization. FruitNinja leverages a pre-trained diffusion model to +progressively inpaint cross-sectional views and applies voxel-grid-based +smoothing to achieve cohesive textures throughout the object. Our OpaqueAtom GS +strategy overcomes 3DGS limitations by employing densely distributed opaque +Gaussians, avoiding biases toward larger particles that destabilize training +and sharp color transitions for fine-grained textures. Experimental results +show that FruitNinja substantially outperforms existing approaches, showcasing +unmatched visual quality in real-time rendered internal views across arbitrary +geometry manipulations. + +
+
+
+
+
+ + ☆ Just Leaf It: Accelerating Diffusion Classifiers with Hierarchical Class + Pruning + + +
+ Diffusion models, known for their generative capabilities, have recently +shown unexpected potential in image classification tasks by using Bayes' +theorem. However, most diffusion classifiers require evaluating all class +labels for a single classification, leading to significant computational costs +that can hinder their application in large-scale scenarios. To address this, we +present a Hierarchical Diffusion Classifier (HDC) that exploits the inherent +hierarchical label structure of a dataset. By progressively pruning irrelevant +high-level categories and refining predictions only within relevant +subcategories, i.e., leaf nodes, HDC reduces the total number of class +evaluations. As a result, HDC can accelerate inference by up to 60% while +maintaining and, in some cases, improving classification accuracy. Our work +enables a new control mechanism of the trade-off between speed and precision, +making diffusion-based classification more viable for real-world applications, +particularly in large-scale image classification tasks. + +
+
+
+
+
+ + ☆ Zoomed In, Diffused Out: Towards Local Degradation-Aware Multi-Diffusion + for Extreme Image Super-Resolution + + +
+ Large-scale, pre-trained Text-to-Image (T2I) diffusion models have gained +significant popularity in image generation tasks and have shown unexpected +potential in image Super-Resolution (SR). However, most existing T2I diffusion +models are trained with a resolution limit of 512x512, making scaling beyond +this resolution an unresolved but necessary challenge for image SR. In this +work, we introduce a novel approach that, for the first time, enables these +models to generate 2K, 4K, and even 8K images without any additional training. +Our method leverages MultiDiffusion, which distributes the generation across +multiple diffusion paths to ensure global coherence at larger scales, and local +degradation-aware prompt extraction, which guides the T2I model to reconstruct +fine local structures according to its low-resolution input. These innovations +unlock higher resolutions, allowing T2I diffusion models to be applied to image +SR tasks without limitation on resolution. + +
+
+
+
+
+ + ☆ Autoassociative Learning of Structural Representations for Modeling and + Classification in Medical Imaging + + +
+ Deep learning architectures based on convolutional neural networks tend to +rely on continuous, smooth features. While this characteristic provides +significant robustness and proves useful in many real-world tasks, it is +strikingly incompatible with the physical characteristic of the world, which, +at the scale in which humans operate, comprises crisp objects, typically +representing well-defined categories. This study proposes a class of +neurosymbolic systems that learn by reconstructing the observed images in terms +of visual primitives and are thus forced to form high-level, structural +explanations of them. When applied to the task of diagnosing abnormalities in +histological imaging, the method proved superior to a conventional deep +learning architecture in terms of classification accuracy, while being more +transparent. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ ITACLIP: Boosting Training-Free Semantic Segmentation with Image, Text, + and Architectural Enhancements + + +
+ Recent advances in foundational Vision Language Models (VLMs) have reshaped +the evaluation paradigm in computer vision tasks. These foundational models, +especially CLIP, have accelerated research in open-vocabulary computer vision +tasks, including Open-Vocabulary Semantic Segmentation (OVSS). Although the +initial results are promising, the dense prediction capabilities of VLMs still +require further improvement. In this study, we enhance the semantic +segmentation performance of CLIP by introducing new modules and modifications: +1) architectural changes in the last layer of ViT and the incorporation of +attention maps from the middle layers with the last layer, 2) Image +Engineering: applying data augmentations to enrich input image representations, +and 3) using Large Language Models (LLMs) to generate definitions and synonyms +for each class name to leverage CLIP's open-vocabulary capabilities. Our +training-free method, ITACLIP, outperforms current state-of-the-art approaches +on segmentation benchmarks such as COCO-Stuff, COCO-Object, Pascal Context, and +Pascal VOC. Our code is available at https://github.com/m-arda-aydn/ITACLIP. + +
+
+
+
+
+ + ☆ In-Situ Melt Pool Characterization via Thermal Imaging for Defect + Detection in Directed Energy Deposition Using Vision Transformers + + +
+ Directed Energy Deposition (DED) offers significant potential for +manufacturing complex and multi-material parts. However, internal defects such +as porosity and cracks can compromise mechanical properties and overall +performance. This study focuses on in-situ monitoring and characterization of +melt pools associated with porosity, aiming to improve defect detection and +quality control in DED-printed parts. Traditional machine learning approaches +for defect identification rely on extensive labeled datasets, often scarce and +expensive to generate in real-world manufacturing. To address this, our +framework employs self-supervised learning on unlabeled melt pool data using a +Vision Transformer-based Masked Autoencoder (MAE) to produce highly +representative embeddings. These fine-tuned embeddings are leveraged via +transfer learning to train classifiers on a limited labeled dataset, enabling +the effective identification of melt pool anomalies. We evaluate two +classifiers: (1) a Vision Transformer (ViT) classifier utilizing the fine-tuned +MAE Encoder's parameters and (2) the fine-tuned MAE Encoder combined with an +MLP classifier head. Our framework achieves overall accuracy ranging from +95.44% to 99.17% and an average F1 score exceeding 80%, with the ViT Classifier +slightly outperforming the MAE Encoder Classifier. This demonstrates the +scalability and cost-effectiveness of our approach for automated quality +control in DED, effectively detecting defects with minimal labeled data. + +
+
+
+
+
+ + ♻ ☆ A Review of Digital Pixel Sensors + + +
+ Digital pixel sensor (DPS) has evolved as a pivotal component in modern +imaging systems and has the potential to revolutionize various fields such as +medical imaging, astronomy, surveillance, IoT devices, etc. Compared to analog +pixel sensors, the DPS offers high speed and good image quality. However, the +introduced intrinsic complexity within each pixel, primarily attributed to the +accommodation of the ADC circuit, engenders a substantial increase in the pixel +pitch. Unfortunately, such a pronounced escalation in pixel pitch drastically +undermines the feasibility of achieving high-density integration, which is an +obstacle that significantly narrows down the field of potential applications. +Nonetheless, designing compact conversion circuits along with strategic +integration of 3D architectural paradigms can be a potential remedy to the +prevailing situation. This review article presents a comprehensive overview of +the vast area of DPS technology. The operating principles, advantages, and +challenges of different types of DPS circuits have been analyzed. We categorize +the schemes into several categories based on ADC operation. A comparative study +based on different performance metrics has also been showcased for a +well-rounded understanding. + +
+
+
+
+
+ + ♻ ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies have deployed watermark-based detection to identify +AI-generated content. However, attribution--the ability to trace back to the +user of a generative AI (GenAI) service who created a given piece of +AI-generated content--remains largely unexplored despite its growing +importance. In this work, we aim to bridge this gap by conducting the first +systematic study on watermark-based, user-level attribution of AI-generated +content. Our key idea is to assign a unique watermark to each user of the GenAI +service and embed this watermark into the AI-generated content created by that +user. Attribution is then performed by identifying the user whose watermark +best matches the one extracted from the given content. This approach, however, +faces a key challenge: How should watermarks be selected for users to maximize +attribution performance? To address the challenge, we first theoretically +derive lower bounds on detection and attribution performance through rigorous +probabilistic analysis for any given set of user watermarks. Then, we select +watermarks for users to maximize these lower bounds, thereby optimizing +detection and attribution performance. Our theoretical and empirical results +show that watermark-based attribution inherits both the accuracy and +(non-)robustness properties of the underlying watermark. Specifically, +attribution remains highly accurate when the watermarked AI-generated content +is either not post-processed or subjected to common post-processing such as +JPEG compression, as well as black-box adversarial post-processing with limited +query budgets. + +
+
+
+
+
+ + ♻ ☆ MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation + Framework + + +
+ Medical imaging segmentation is a highly active area of research, with deep +learning-based methods achieving state-of-the-art results in several +benchmarks. However, the lack of standardized tools for training, testing, and +evaluating new methods makes the comparison of methods difficult. To address +this, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple, +modular, and end-to-end medical imaging segmentation framework designed to +facilitate consistent training, testing, and evaluation of deep learning-based +medical imaging segmentation methods. MIST standardizes data analysis, +preprocessing, and evaluation pipelines, accommodating multiple architectures +and loss functions. This standardization ensures reproducible and fair +comparisons across different methods. We detail MIST's data format +requirements, pipelines, and auxiliary features and demonstrate its efficacy +using the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results +highlight MIST's ability to produce accurate segmentation masks and its +scalability across multiple GPUs, showcasing its potential as a powerful tool +for future medical imaging research and development. + +
+
+ comment: Submitted to BraTS 2024 +
+
+
+
+
+ + ♻ ☆ MV2Cyl: Reconstructing 3D Extrusion Cylinders from Multi-View Images NeurIPS 2024 + + +
+ We present MV2Cyl, a novel method for reconstructing 3D from 2D multi-view +images, not merely as a field or raw geometry but as a sketch-extrude CAD +model. Extracting extrusion cylinders from raw 3D geometry has been extensively +researched in computer vision, while the processing of 3D data through neural +networks has remained a bottleneck. Since 3D scans are generally accompanied by +multi-view images, leveraging 2D convolutional neural networks allows these +images to be exploited as a rich source for extracting extrusion cylinder +information. However, we observe that extracting only the surface information +of the extrudes and utilizing it results in suboptimal outcomes due to the +challenges in the occlusion and surface segmentation. By synergizing with the +extracted base curve information, we achieve the optimal reconstruction result +with the best accuracy in 2D sketch and extrude parameter estimation. Our +experiments, comparing our method with previous work that takes a raw 3D point +cloud as input, demonstrate the effectiveness of our approach by taking +advantage of multi-view images. Our project page can be found at +http://mv2cyl.github.io . + +
+
+ comment: NeurIPS 2024. Project page: http://mv2cyl.github.io +
+
+
+
+
+ + ♻ ☆ Learning to mask: Towards generalized face forgery detection + + +
+ Generalizability to unseen forgery types is crucial for face forgery +detectors. Recent works have made significant progress in terms of +generalization by synthetic forgery data augmentation. In this work, we explore +another path for improving the generalization. Our goal is to reduce the +features that are easy to learn in the training phase, so as to reduce the risk +of overfitting on specific forgery types. Specifically, in our method, a +teacher network takes as input the face images and generates an attention map +of the deep features by a diverse multihead attention ViT. The attention map is +used to guide a student network to focus on the low-attended features by +reducing the highly-attended deep features. A deep feature mixup strategy is +also proposed to synthesize forgeries in the feature domain. Experiments +demonstrate that, without data augmentation, our method is able to achieve +promising performances on unseen forgeries and highly compressed data. + +
+
+ comment: Incorrect experimental setting +
+
+
+
+
+ + ♻ ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
+
+
+
+
+ + ♻ ☆ Eidos: Efficient, Imperceptible Adversarial 3D Point Clouds + + +
+ Classification of 3D point clouds is a challenging machine learning (ML) task +with important real-world applications in a spectrum from autonomous driving +and robot-assisted surgery to earth observation from low orbit. As with other +ML tasks, classification models are notoriously brittle in the presence of +adversarial attacks. These are rooted in imperceptible changes to inputs with +the effect that a seemingly well-trained model ends up misclassifying the +input. This paper adds to the understanding of adversarial attacks by +presenting Eidos, a framework providing Efficient Imperceptible aDversarial +attacks on 3D pOint cloudS. Eidos supports a diverse set of imperceptibility +metrics. It employs an iterative, two-step procedure to identify optimal +adversarial examples, thereby enabling a runtime-imperceptibility trade-off. We +provide empirical evidence relative to several popular 3D point cloud +classification models and several established 3D attack methods, showing Eidos' +superiority with respect to efficiency as well as imperceptibility. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset + + +
+ Multimodal Large Language Models (MLLMs) hallucinate, resulting in an +emerging topic of visual hallucination evaluation (VHE). This paper contributes +a ChatGPT-Prompted visual hallucination evaluation Dataset (PhD) for objective +VHE at a large scale. The essence of VHE is to ask an MLLM questions about +specific images to assess its susceptibility to hallucination. Depending on +what to ask (objects, attributes, sentiment, etc.) and how the questions are +asked, we structure PhD along two dimensions, i.e., task and mode. Five visual +recognition tasks, ranging from low-level (object / attribute recognition) to +middle-level (sentiment / position recognition and counting), are considered. +Besides a normal visual QA mode, which we term PhD-base, PhD also asks +questions with inaccurate context (PhD-iac) or with incorrect context +(PhD-icc), or with AI-generated counter common sense images (PhD-ccs). We +construct PhD by a ChatGPT-assisted semi-automated pipeline, encompassing four +pivotal modules: task-specific hallucinatory item (hitem) selection, +hitem-embedded question generation, inaccurate / incorrect context generation, +and counter-common-sense (CCS) image generation. With over 14k daily images, +750 CCS images and 102k VQA triplets in total, PhD reveals considerable +variability in MLLMs' performance across various modes and tasks, offering +valuable insights into the nature of hallucination. As such, PhD stands as a +potent tool not only for VHE but may also play a significant role in the +refinement of MLLMs. + +
+
+
+
+
+ + ♻ ☆ V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with + Denoising Diffusion + + +
+ Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D +object detection using LiDAR and camera data. However, these methods suffer +from performance degradation in adverse weather conditions. The weather-robust +4D radar provides Doppler and additional geometric information, raising the +possibility of addressing this challenge. To this end, we present V2X-R, the +first simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R +contains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point +clouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes. +Subsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for +3D object detection and implement it with various fusion strategies. To achieve +weather-robust detection, we additionally propose a Multi-modal Denoising +Diffusion (MDD) module in our fusion pipeline. MDD utilizes weather-robust 4D +radar feature as a condition to prompt the diffusion model to denoise noisy +LiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline +demonstrates superior performance in the V2X-R dataset. Over and above this, +our MDD module further improved the performance of basic fusion model by up to +5.73%/6.70% in foggy/snowy conditions while barely disrupting normal +performance. The dataset and code will be publicly available at: +https://github.com/ylwhxht/V2X-R. + +
+
+
+
+
+ + ♻ ☆ Unmasking Parkinson's Disease with Smile: An AI-enabled Screening + Framework + + +
+ We present an efficient and accessible PD screening method by leveraging +AI-driven models enabled by the largest video dataset of facial expressions +from 1,059 unique participants. This dataset includes 256 individuals with PD, +165 clinically diagnosed, and 91 self-reported. Participants used webcams to +record themselves mimicking three facial expressions (smile, disgust, and +surprise) from diverse sources encompassing their homes across multiple +countries, a US clinic, and a PD wellness center in the US. Facial landmarks +are automatically tracked from the recordings to extract features related to +hypomimia, a prominent PD symptom characterized by reduced facial expressions. +Machine learning algorithms are trained on these features to distinguish +between individuals with and without PD. The model was tested for +generalizability on external (unseen during training) test videos collected +from a US clinic and Bangladesh. An ensemble of machine learning models trained +on smile videos achieved an accuracy of 87.9+-0.1% (95% Confidence Interval) +with an AUROC of 89.3+-0.3% as evaluated on held-out data (using k-fold +cross-validation). In external test settings, the ensemble model achieved +79.8+-0.6% accuracy with 81.9+-0.3% AUROC on the clinical test set and +84.9+-0.4% accuracy with 81.2+-0.6% AUROC on participants from Bangladesh. In +every setting, the model was free from detectable bias across sex and ethnic +subgroups, except in the cohorts from Bangladesh, where the model performed +significantly better for female participants than males. Smiling videos can +effectively differentiate between individuals with and without PD, offering a +potentially easy, accessible, and cost-efficient way to screen for PD, +especially when a clinical diagnosis is difficult to access. + +
+
+
+
+
+ + ♻ ☆ SynArtifact: Classifying and Alleviating Artifacts in Synthetic Images + via Vision-Language Model + + +
+ In the rapidly evolving area of image synthesis, a serious challenge is the +presence of complex artifacts that compromise perceptual realism of synthetic +images. To alleviate artifacts and improve quality of synthetic images, we +fine-tune Vision-Language Model (VLM) as artifact classifier to automatically +identify and classify a wide range of artifacts and provide supervision for +further optimizing generative models. Specifically, we develop a comprehensive +artifact taxonomy and construct a dataset of synthetic images with artifact +annotations for fine-tuning VLM, named SynArtifact-1K. The fine-tuned VLM +exhibits superior ability of identifying artifacts and outperforms the baseline +by 25.66%. To our knowledge, this is the first time such end-to-end artifact +classification task and solution have been proposed. Finally, we leverage the +output of VLM as feedback to refine the generative model for alleviating +artifacts. Visualization results and user study demonstrate that the quality of +images synthesized by the refined diffusion model has been obviously improved. + +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with domain + feedback for molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize +desired properties of a given molecule through chemical modification. Despite +Large Language Models (LLMs) holding the potential to efficiently simulate this +task by using natural language to direct the optimization, straightforwardly +utilizing them shows limited performance. In this work, we facilitate utilizing +LLMs in an iterative paradigm by proposing a simple yet highly effective domain +feedback provider, namely $\text{Re}^3$DF. In detail, $\text{Re}^3$DF harnesses +an external toolkit, RDKit, to handle the molecule hallucination, if the +modified molecule is chemically invalid. Otherwise, its desired properties are +computed and compared to the original one, establishing reliable domain +feedback with correct direction and distance towards the objective, followed by +a retrieved example, to guide the LLM to refine the modified molecule. We +conduct experiments across both single- and multi-property objectives with 2 +thresholds, where $\text{Re}^3$DF shows significant improvements. Particularly, +for 20 single-property objectives, $\text{Re}^3$DF enhances Hit ratio by 16.95% +and 20.76% under loose (\texttt{l}) and strict (\texttt{s}) thresholds, +respectively. For 32 multi-property objectives, $\text{Re}^3$DF enhances Hit +ratio by 6.04% and 5.25%. + +
+
+
+
+
+ + ♻ ☆ MagicStick: Controllable Video Editing via Control Handle + Transformations WACV 2025 + + +
+ Text-based video editing has recently attracted considerable interest in +changing the style or replacing the objects with a similar structure. Beyond +this, we demonstrate that properties such as shape, size, location, motion, +etc., can also be edited in videos. Our key insight is that the keyframe +transformations of the specific internal feature (e.g., edge maps of objects or +human pose), can easily propagate to other frames to provide generation +guidance. We thus propose MagicStick, a controllable video editing method that +edits the video properties by utilizing the transformation on the extracted +internal control signals. In detail, to keep the appearance, we inflate both +the pretrained image diffusion model and ControlNet to the temporal dimension +and train low-rank adaptions (LORA) layers to fit the specific scenes. Then, in +editing, we perform an inversion and editing framework. Differently, finetuned +ControlNet is introduced in both inversion and generation for attention +guidance with the proposed attention remix between the spatial attention maps +of inversion and editing. Yet succinct, our method is the first method to show +the ability of video property editing from the pre-trained text-to-image model. +We present experiments on numerous examples within our unified framework. We +also compare with shape-aware text-based editing and handcrafted motion video +generation, demonstrating our superior temporal consistency and editing +capability than previous works. The code and models are available on +https://github.com/mayuelala/MagicStick. + +
+
+ comment: Accepted by WACV 2025, Project page: + https://magic-stick-edit.github.io/ Github repository: + https://github.com/mayuelala/MagicStick +
+
+
+
+
+ + ♻ ☆ A Recipe for CAC: Mosaic-based Generalized Loss for Improved + Class-Agnostic Counting ACCV 2024 + + +
+ Class agnostic counting (CAC) is a vision task that can be used to count the +total occurrence number of any given reference objects in the query image. The +task is usually formulated as a density map estimation problem through +similarity computation among a few image samples of the reference object and +the query image. In this paper, we point out a severe issue of the existing CAC +framework: Given a multi-class setting, models don't consider reference images +and instead blindly match all dominant objects in the query image. Moreover, +the current evaluation metrics and dataset cannot be used to faithfully assess +the model's generalization performance and robustness. To this end, we discover +that the combination of mosaic augmentation with generalized loss is essential +for addressing the aforementioned issue of CAC models to count objects of +majority (i.e. dominant objects) regardless of the references. Furthermore, we +introduce a new evaluation protocol and metrics for resolving the problem +behind the existing CAC evaluation scheme and better benchmarking CAC models in +a more fair manner. Besides, extensive evaluation results demonstrate that our +proposed recipe can consistently improve the performance of different CAC +models. The code is available at https://github.com/littlepenguin89106/MGCAC. + +
+
+ comment: Accepted by ACCV 2024 +
+
+
+
+
+ + ♻ ☆ Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer + from Text to Image via CLIP Inversion WACV 2025 + + +
+ We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary +Image Classifier that uses an autoregressive transformer to generatively output +classification labels as language. Leveraging the extensive knowledge of CLIP +models, NOVIC harnesses the embedding space to enable zero-shot transfer from +pure text to images. Traditional CLIP models, despite their ability for open +vocabulary classification, require an exhaustive prompt of potential class +labels, restricting their application to images of known content or context. To +address this, we propose an "object decoder" model that is trained on a +large-scale 92M-target dataset of templated object noun sets and LLM-generated +captions to always output the object noun in question. This effectively inverts +the CLIP text encoder and allows textual object labels from essentially the +entire English language to be generated directly from image-derived embedding +vectors, without requiring any a priori knowledge of the potential content of +an image, and without any label biases. The trained decoders are tested on a +mix of manually and web-curated datasets, as well as standard image +classification benchmarks, and achieve fine-grained prompt-free prediction +scores of up to 87.5%, a strong result considering the model must work for any +conceivable image and without any contextual clues. + +
+
+ comment: Published at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Partial Scene Text Retrieval + + +
+ The task of partial scene text retrieval involves localizing and searching +for text instances that are the same or similar to a given query text from an +image gallery. However, existing methods can only handle text-line instances, +leaving the problem of searching for partial patches within these text-line +instances unsolved due to a lack of patch annotations in the training data. To +address this issue, we propose a network that can simultaneously retrieve both +text-line instances and their partial patches. Our method embeds the two types +of data (query text and scene text instances) into a shared feature space and +measures their cross-modal similarities. To handle partial patches, our +proposed approach adopts a Multiple Instance Learning (MIL) approach to learn +their similarities with query text, without requiring extra annotations. +However, constructing bags, which is a standard step of conventional MIL +approaches, can introduce numerous noisy samples for training, and lower +inference speed. To address this issue, we propose a Ranking MIL (RankMIL) +approach to adaptively filter those noisy samples. Additionally, we present a +Dynamic Partial Match Algorithm (DPMA) that can directly search for the target +partial patch from a text-line instance during the inference stage, without +requiring bags. This greatly improves the search efficiency and the performance +of retrieving partial patches. The source code and dataset are available at +https://github.com/lanfeng4659/PSTR. + +
+
+ comment: Accepted on TPAMI +
+
+
+
+
+ + ♻ ☆ BrightDreamer: Generic 3D Gaussian Generative Framework for Fast + Text-to-3D Synthesis + + +
+ Text-to-3D synthesis has recently seen intriguing advances by combining the +text-to-image priors with 3D representation methods, e.g., 3D Gaussian +Splatting (3D GS), via Score Distillation Sampling (SDS). However, a hurdle of +existing methods is the low efficiency, per-prompt optimization for a single 3D +object. Therefore, it is imperative for a paradigm shift from per-prompt +optimization to feed-forward generation for any unseen text prompts, which yet +remains challenging. An obstacle is how to directly generate a set of millions +of 3D Gaussians to represent a 3D object. This paper presents BrightDreamer, an +end-to-end feed-forward approach that can achieve generalizable and fast (77 +ms) text-to-3D generation. Our key idea is to formulate the generation process +as estimating the 3D deformation from an anchor shape with predefined +positions. For this, we first propose a Text-guided Shape Deformation (TSD) +network to predict the deformed shape and its new positions, used as the +centers (one attribute) of 3D Gaussians. To estimate the other four attributes +(i.e., scaling, rotation, opacity, and SH), we then design a novel Text-guided +Triplane Generator (TTG) to generate a triplane representation for a 3D object. +The center of each Gaussian enables us to transform the spatial feature into +the four attributes. The generated 3D Gaussians can be finally rendered at 705 +frames per second. Extensive experiments demonstrate the superiority of our +method over existing methods. Also, BrightDreamer possesses a strong semantic +understanding capability even for complex text prompts. The code is available +in the project page. + +
+
+
+
+
+ + ♻ ☆ Structural-Based Uncertainty in Deep Learning Across Anatomical Scales: + Analysis in White Matter Lesion Segmentation + + +
+ This paper explores uncertainty quantification (UQ) as an indicator of the +trustworthiness of automated deep-learning (DL) tools in the context of white +matter lesion (WML) segmentation from magnetic resonance imaging (MRI) scans of +multiple sclerosis (MS) patients. Our study focuses on two principal aspects of +uncertainty in structured output segmentation tasks. First, we postulate that a +reliable uncertainty measure should indicate predictions likely to be incorrect +with high uncertainty values. Second, we investigate the merit of quantifying +uncertainty at different anatomical scales (voxel, lesion, or patient). We +hypothesize that uncertainty at each scale is related to specific types of +errors. Our study aims to confirm this relationship by conducting separate +analyses for in-domain and out-of-domain settings. Our primary methodological +contributions are (i) the development of novel measures for quantifying +uncertainty at lesion and patient scales, derived from structural prediction +discrepancies, and (ii) the extension of an error retention curve analysis +framework to facilitate the evaluation of UQ performance at both lesion and +patient scales. The results from a multi-centric MRI dataset of 444 patients +demonstrate that our proposed measures more effectively capture model errors at +the lesion and patient scales compared to measures that average voxel-scale +uncertainty values. We provide the UQ protocols code at +https://github.com/Medical-Image-Analysis-Laboratory/MS_WML_uncs. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual + Learning + + +
+ Forgetting refers to the loss or deterioration of previously acquired +knowledge. While existing surveys on forgetting have primarily focused on +continual learning, forgetting is a prevalent phenomenon observed in various +other research domains within deep learning. Forgetting manifests in research +fields such as generative models due to generator shifts, and federated +learning due to heterogeneous data distributions across clients. Addressing +forgetting encompasses several challenges, including balancing the retention of +old task knowledge with fast learning of new task, managing task interference +with conflicting goals, and preventing privacy leakage, etc. Moreover, most +existing surveys on continual learning implicitly assume that forgetting is +always harmful. In contrast, our survey argues that forgetting is a +double-edged sword and can be beneficial and desirable in certain cases, such +as privacy-preserving scenarios. By exploring forgetting in a broader context, +we present a more nuanced understanding of this phenomenon and highlight its +potential advantages. Through this comprehensive survey, we aspire to uncover +potential solutions by drawing upon ideas and approaches from various fields +that have dealt with forgetting. By examining forgetting beyond its +conventional boundaries, we hope to encourage the development of novel +strategies for mitigating, harnessing, or even embracing forgetting in real +applications. A comprehensive list of papers about forgetting in various +research fields is available at +\url{https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning}. + +
+
+ comment: accepted at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in + the Wild + + +
+ Large language models have evolved data-efficient generalists, benefiting +from the universal language interface and large-scale pre-training. However, +constructing a data-efficient generalist for dense visual prediction presents a +distinct challenge due to the variation in label structures across different +tasks. Consequently, generalization to unseen dense prediction tasks in the +low-data regime is not straightforward and has received less attention from +previous vision generalists. In this study, we explore a universal model that +can flexibly adapt to unseen dense label structures with a few examples, +enabling it to serve as a data-efficient vision generalist in diverse +real-world scenarios. To this end, we base our method on a powerful +meta-learning framework and explore several axes to improve its performance and +versatility for real-world problems, such as flexible adaptation mechanisms and +scalability. We evaluate our model across a spectrum of unseen real-world +scenarios where low-shot learning is desirable, including video, 3D, medical, +biological, and user-interactive tasks. Equipped with a generic architecture +and an effective adaptation mechanism, our model flexibly adapts to all of +these tasks with at most 50 labeled images, showcasing a significant +advancement over existing data-efficient generalist approaches. Codes are +available at https://github.com/GitGyun/chameleon. + +
+
+
+
+
+ + ♻ ☆ Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in + Vision-Language Alignment + + +
+ The recent advancements in large language models (LLMs) and pre-trained +vision models have accelerated the development of vision-language large models +(VLLMs), enhancing the interaction between visual and linguistic modalities. +Despite their notable success across various domains, VLLMs face challenges in +modality alignment, which can lead to issues like hallucinations and unsafe +content generation. Current alignment techniques often rely on coarse feedback +and external datasets, limiting scalability and performance. In this paper, we +propose FiSAO (Fine-Grained Self-Alignment Optimization), a novel +self-alignment method that utilizes the model's own visual encoder as a +fine-grained verifier to improve vision-language alignment without the need for +additional data. By leveraging token-level feedback from the vision encoder, +FiSAO significantly improves vision-language alignment, even surpassing +traditional preference tuning methods that require additional data. Through +both theoretical analysis and experimental validation, we demonstrate that +FiSAO effectively addresses the misalignment problem in VLLMs, marking the +first instance of token-level rewards being applied to such models. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ A Hybrid Approach for COVID-19 Detection: Combining Wasserstein GAN with + Transfer Learning + + +
+ COVID-19 is extremely contagious and its rapid growth has drawn attention +towards its early diagnosis. Early diagnosis of COVID-19 enables healthcare +professionals and government authorities to break the chain of transition and +flatten the epidemic curve. With the number of cases accelerating across the +developed world, COVID-19 induced Viral Pneumonia cases are a big challenge. +Overlapping of COVID-19 cases with Viral Pneumonia and other lung infections +with limited dataset and long training hours is a serious problem to cater. +Limited amount of data often results in over-fitting models and due to this +reason, the model does not predict generalized results. To fill this gap, we +proposed a GAN-based approach to synthesize images which are later fed into the deep +learning models to classify images of COVID-19, Normal, and Viral Pneumonia. +Specifically, customized Wasserstein GAN is proposed to generate 19% more Chest +X-ray images as compared to the real images. This expanded dataset is then used +to train four proposed deep learning models: VGG-16, ResNet-50, GoogLeNet and +MNAST. The result showed that expanded dataset utilized deep learning models to +deliver high classification accuracies. In particular, VGG-16 achieved highest +accuracy of 99.17% among all four proposed schemes. Rest of the models like +ResNet-50, GoogLeNet and MNAST delivered 93.9%, 94.49% and 97.75% testing +accuracies respectively. Later, the efficiency of these models is compared with +the state-of-the-art models on the basis of accuracy. Further, our proposed models +can be applied to address the issue of scant datasets for any problem of image +analysis. + +
+
+
+
+
+ + ♻ ☆ Formal Verification of Deep Neural Networks for Object Detection + + +
+ Deep neural networks (DNNs) are widely used in real-world applications, yet +they remain vulnerable to errors and adversarial attacks. Formal verification +offers a systematic approach to identify and mitigate these vulnerabilities, +enhancing model robustness and reliability. While most existing verification +methods focus on image classification models, this work extends formal +verification to the more complex domain of \emph{object detection} models. We +propose a formulation for verifying the robustness of such models and +demonstrate how state-of-the-art verification tools, originally developed for +classification, can be adapted for this purpose. Our experiments, conducted on +various datasets and networks, highlight the ability of formal verification to +uncover vulnerabilities in object detection models, underscoring the need to +extend verification efforts to this domain. This work lays the foundation for +further research into formal verification across a broader range of computer +vision applications. + +
+
+
+
+
+ + ♻ ☆ ArtWeaver: Advanced Dynamic Style Integration via Diffusion Model + + +
+ Stylized Text-to-Image Generation (STIG) aims to generate images from text +prompts and style reference images. In this paper, we present ArtWeaver, a +novel framework that leverages pretrained Stable Diffusion (SD) to address +challenges such as misinterpreted styles and inconsistent semantics. Our +approach introduces two innovative modules: the mixed style descriptor and the +dynamic attention adapter. The mixed style descriptor enhances SD by combining +content-aware and frequency-disentangled embeddings from CLIP with additional +sources that capture global statistics and textual information, thus providing +a richer blend of style-related and semantic-related knowledge. To achieve a +better balance between adapter capacity and semantic control, the dynamic +attention adapter is integrated into the diffusion UNet, dynamically +calculating adaptation weights based on the style descriptors. Additionally, we +introduce two objective functions to optimize the model alongside the denoising +loss, further enhancing semantic and style consistency. Extensive experiments +demonstrate the superiority of ArtWeaver over existing methods, producing +images with diverse target styles while maintaining the semantic integrity of +the text prompts. + +
+
+
+
+
+ + ♻ ☆ Frame Interpolation with Consecutive Brownian Bridge Diffusion + + +
+ Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a +diffusion-based conditional image generation problem, synthesizing the +intermediate frame given a random noise and neighboring frames. Due to the +relatively high resolution of videos, Latent Diffusion Models (LDMs) are +employed as the conditional generation model, where the autoencoder compresses +images into latent representations for diffusion and then reconstructs images +from these latent representations. Such a formulation poses a crucial +challenge: VFI expects that the output is deterministically equal to the ground +truth intermediate frame, but LDMs randomly generate a diverse set of different +images when the model runs multiple times. The reason for the diverse +generation is that the cumulative variance (variance accumulated at each step +of generation) of generated latent representations in LDMs is large. This makes +the sampling trajectory random, resulting in diverse rather than deterministic +generations. To address this problem, we propose our unique solution: Frame +Interpolation with Consecutive Brownian Bridge Diffusion. Specifically, we +propose consecutive Brownian Bridge diffusion that takes a deterministic +initial value as input, resulting in a much smaller cumulative variance of +generated latent representations. Our experiments suggest that our method can +improve together with the improvement of the autoencoder and achieve +state-of-the-art performance in VFI, leaving strong potential for further +enhancement. + +
+
+ comment: Formatting +
+
+
+
+
+ + ♻ ☆ Grounded 3D-LLM with Referent Tokens + + +
+ Prior studies on 3D scene understanding have primarily developed specialized +models for specific tasks or required task-specific fine-tuning. In this study, +we propose Grounded 3D-LLM, which explores the potential of 3D large +multi-modal models (3D LMMs) to consolidate various 3D vision tasks within a +unified generative framework. The model uses scene referent tokens as special +noun phrases to reference 3D scenes, enabling it to handle sequences that +interleave 3D and textual data. Per-task instruction-following templates are +employed to ensure naturalness and diversity in translating 3D vision tasks into +language formats. To facilitate the use of referent tokens in subsequent +language modeling, we provide a large-scale, automatically curated grounded +scene-text dataset with over 1 million phrase-to-region correspondences and +introduce Contrastive Language-Scene Pre-training (CLASP) to perform +phrase-level scene-text alignment using this data. Our comprehensive evaluation +covers open-ended tasks like dense captioning and 3D question answering, +alongside close-ended tasks such as object detection and language grounding. +Experiments across multiple 3D benchmarks reveal the leading performance and +the broad applicability of Grounded 3D-LLM. Code and datasets are available at +https://groundedscenellm.github.io/grounded_3d-llm.github.io. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Masked Autoencoders are Parameter-Efficient Federated Continual Learners + + +
+ Federated learning is a specific distributed learning paradigm in which a +central server aggregates updates from multiple clients' local models, thereby +enabling the server to learn without requiring clients to upload their private +data, maintaining data privacy. While existing federated learning methods are +primarily designed for static data, real-world applications often require +clients to learn new categories over time. This challenge necessitates the +integration of continual learning techniques, leading to federated continual +learning (FCL). To address both catastrophic forgetting and non-IID issues, we +propose to use masked autoencoders (MAEs) as parameter-efficient federated +continual learners, called pMAE. pMAE learns reconstructive prompt on the +client side through image reconstruction using MAE. On the server side, it +reconstructs the uploaded restore information to capture the data distribution +across previous tasks and different clients, using these reconstructed images +to finetune discriminative prompt and classifier parameters tailored for +classification, thereby alleviating catastrophic forgetting and non-IID issues +on a global scale. Experimental results demonstrate that pMAE achieves +performance comparable to existing prompt-based methods and can enhance their +effectiveness, particularly when using self-supervised pre-trained transformers +as the backbone. Code is available at: https://github.com/ycheoo/pMAE. + +
+
+
+
+
+ + ♻ ☆ An Open-Source Tool for Mapping War Destruction at Scale in Ukraine + using Sentinel-1 Time Series + + +
+ Access to detailed war impact assessments is crucial for humanitarian +organizations to effectively assist populations most affected by armed +conflicts. However, maintaining a comprehensive understanding of the situation +on the ground is challenging, especially in conflicts that cover vast +territories and extend over long periods. This study presents a scalable and +transferable method for estimating war-induced damage to buildings. We first +train a machine learning model to output pixel-wise probability of destruction +from Synthetic Aperture Radar (SAR) satellite image time series, leveraging +existing, manual damage assessments as ground truth and cloud-based geospatial +analysis tools for large-scale inference. We further post-process these +assessments using open building footprints to obtain a final damage estimate +per building. We introduce an accessible, open-source tool that allows users to +adjust the confidence interval based on their specific requirements and use +cases. Our approach enables humanitarian organizations and other actors to +rapidly screen large geographic regions for war impacts. We provide two +publicly accessible dashboards: a Ukraine Damage Explorer to dynamically view +our pre-computed estimates, and a Rapid Damage Mapping Tool to easily run our +method and produce custom maps. + +
+
+
+
+
+ + ♻ ☆ DemMamba: Alignment-free Raw Video Demoireing with Frequency-assisted + Spatio-Temporal Mamba + + +
+ Moire patterns, resulting from the interference of two similar repetitive +patterns, are frequently observed during the capture of images or videos on +screens. These patterns vary in color, shape, and location across video frames, +posing challenges in extracting information from adjacent frames and preserving +temporal consistency throughout the restoration process. Existing deep learning +methods often depend on well-designed alignment modules, such as optical flow +estimation, deformable convolution, and cross-frame self-attention layers, +incurring high computational costs. Recent studies indicate that utilizing raw +data as input can significantly improve the effectiveness of video demoireing +by providing the pristine degradation information and more detailed content. +However, previous works fail to design both efficient and effective raw video +demoireing methods that can maintain temporal consistency and prevent +degradation of color and spatial details. This paper introduces a novel +alignment-free raw video demoireing network with frequency-assisted +spatio-temporal Mamba (DemMamba). It features sequentially arranged Spatial +Mamba Blocks (SMB) and Temporal Mamba Blocks (TMB) to effectively model the +inter- and intra-relationships in raw videos affected by moire patterns. An +Adaptive Frequency Block (AFB) within the SMB facilitates demoireing in the +frequency domain, while a Channel Attention Block (CAB) in the TMB enhances the +temporal information interactions by leveraging inter-channel relationships +among features. Extensive experiments demonstrate that our proposed DemMamba +surpasses state-of-the-art methods by 1.3 dB in PSNR, and also provides a +satisfactory visual experience. + +
+
+
+
+
+ + ♻ ☆ Image Demoireing in RAW and sRGB Domains ECCV'24 + + +
+ Moire patterns frequently appear when capturing screens with smartphones or +cameras, potentially compromising image quality. Previous studies suggest that +moire pattern elimination in the RAW domain offers greater effectiveness +compared to demoireing in the sRGB domain. Nevertheless, relying solely on RAW +data for image demoireing is insufficient in mitigating the color cast due to +the absence of essential information required for the color correction by the +image signal processor (ISP). In this paper, we propose to jointly utilize both +RAW and sRGB data for image demoireing (RRID), which are readily accessible in +modern smartphones and DSLR cameras. We develop Skip-Connection-based +Demoireing Module (SCDM) with Gated Feedback Module (GFM) and Frequency +Selection Module (FSM) embedded in skip-connections for the efficient and +effective demoireing of RAW and sRGB features, respectively. Subsequently, we +design a RGB Guided ISP (RGISP) to learn a device-dependent ISP, assisting the +process of color recovery. Extensive experiments demonstrate that our RRID +outperforms state-of-the-art approaches, in terms of the performance in moire +pattern removal and color cast correction by 0.62dB in PSNR and 0.003 in SSIM. + +
+
+ comment: Accepted in ECCV'24 +
+
+
+
+
+ + ♻ ☆ Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse + Tensor-based Transformer + + +
+ The evolution of 3D visualization techniques has fundamentally transformed +how we interact with digital content. At the forefront of this change is point +cloud technology, offering an immersive experience that surpasses traditional +2D representations. However, the massive data size of point clouds presents +significant challenges in data compression. Current methods for lossy point +cloud attribute compression (PCAC) generally focus on reconstructing the +original point clouds with minimal error. However, for point cloud +visualization scenarios, the reconstructed point clouds with distortion still +need to undergo a complex rendering process, which affects the final +user-perceived quality. In this paper, we propose an end-to-end deep learning +framework that seamlessly integrates PCAC with differentiable rendering, +denoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of +rendered multiview images for viewing. In a differentiable manner, the impact +of the rendering process on the reconstructed point clouds is taken into +account. Moreover, we characterize point clouds as sparse tensors and propose a +sparse tensor-based transformer, called SP-Trans. By aligning with the local +density of the point cloud and utilizing an enhanced local attention mechanism, +SP-Trans captures the intricate relationships within the point cloud, further +improving feature analysis and synthesis within the framework. Extensive +experiments demonstrate that the proposed RO-PCAC achieves state-of-the-art +compression performance, compared to existing reconstruction-oriented methods, +including traditional, learning-based, and hybrid methods. + +
+
+
+
+
+ + ♻ ☆ Task Adaptive Feature Distribution Based Network for Few-shot + Fine-grained Target Classification + + +
+ Metric-based few-shot fine-grained classification has shown promise due to +its simplicity and efficiency. However, existing methods often overlook +task-level special cases and struggle with accurate category description and +irrelevant sample information. To tackle these, we propose TAFD-Net: a task +adaptive feature distribution network. It features a task-adaptive component +for embedding to capture task-level nuances, an asymmetric metric for +calculating feature distribution similarities between query samples and support +categories, and a contrastive measure strategy to boost performance. Extensive +experiments have been conducted on three datasets and the experimental results +show that our proposed algorithm outperforms recent incremental learning +algorithms. + +
+
+ comment: The presentation logic of the algorithm section in the paper is + unclear, and there are errors in the experimental part that need to be + corrected, along with additional experiments to be conducted +
+
+
+
+
+ + ♻ ☆ MegaFusion: Extend Diffusion Models towards Higher-resolution Image + Generation without Further Tuning WACV 2025 + + +
+ Diffusion models have emerged as frontrunners in text-to-image generation, +but their fixed image resolution during training often leads to challenges in +high-resolution image generation, such as semantic deviations and object +replication. This paper introduces MegaFusion, a novel approach that extends +existing diffusion-based text-to-image models towards efficient +higher-resolution generation without additional fine-tuning or adaptation. +Specifically, we employ an innovative truncate and relay strategy to bridge the +denoising processes across different resolutions, allowing for high-resolution +image generation in a coarse-to-fine manner. Moreover, by integrating dilated +convolutions and noise re-scheduling, we further adapt the model's priors for +higher resolution. The versatility and efficacy of MegaFusion make it +universally applicable to both latent-space and pixel-space diffusion models, +along with other derivative models. Extensive experiments confirm that +MegaFusion significantly boosts the capability of existing models to produce +images of megapixels and various aspect ratios, while only requiring about 40% +of the original computational cost. + +
+
+ comment: Accepted by WACV 2025. Project Page: + https://haoningwu3639.github.io/MegaFusion/ +
+
+
+
+
+ + ♻ ☆ Machine Vision-Based Assessment of Fall Color Changes and its + Relationship with Leaf Nitrogen Concentration + + +
+ Apple(\textit{Malus domestica} Borkh.) trees are deciduous, shedding leaves +each year. This process is preceded by a gradual change in leaf color from +green to yellow as chlorophyll is degraded prior to abscission. The initiation +and rate of this color change are affected by many factors including leaf +nitrogen (N) concentration. We predict that leaf color during this transition +may be indicative of the nitrogen status of apple trees. This study assesses a +machine vision-based system for quantifying the change in leaf color and its +correlation with leaf nitrogen content. An image dataset was collected in color +and 3D over five weeks in the fall of 2021 and 2023 at a commercial orchard +using a ground vehicle-based stereovision sensor. Trees in the foreground were +segmented from the point cloud using color and depth thresholding methods. +Then, to estimate the proportion of yellow leaves per canopy, the color +information of the segmented canopy area was quantified using a custom-defined +metric, \textit{yellowness index} (a normalized ratio of yellow to green +foliage in the tree) that varied from -1 to +1 (-1 being completely green and ++1 being completely yellow). Both K-means-based methods and gradient boosting +methods were used to estimate the \textit{yellowness index}. The gradient +boosting based method proposed in this study was better than the K-means-based +method (both in terms of computational time and accuracy), achieving an $R^2$ +of 0.72 in estimating the \textit{yellowness index}. The metric was able to +capture the gradual color transition from green to yellow over the study +duration. Trees with lower leaf nitrogen showed the color transition to yellow +earlier than the trees with higher nitrogen. + Keywords: Fruit Tree Nitrogen Management, Machine Vision, Point Cloud +Segmentation, Precision Nitrogen Management + +
+
+
+
+
+ + ♻ ☆ A Scalable Training Strategy for Blind Multi-Distribution Noise Removal + + +
+ Despite recent advances, developing general-purpose universal denoising and +artifact-removal networks remains largely an open problem: Given fixed network +weights, one inherently trades-off specialization at one task (e.g.,~removing +Poisson noise) for performance at another (e.g.,~removing speckle noise). In +addition, training such a network is challenging due to the curse of +dimensionality: As one increases the dimensions of the specification-space +(i.e.,~the number of parameters needed to describe the noise distribution) the +number of unique specifications one needs to train for grows exponentially. +Uniformly sampling this space will result in a network that does well at very +challenging problem specifications but poorly at easy problem specifications, +where even large errors will have a small effect on the overall mean squared +error. + In this work we propose training denoising networks using an +adaptive-sampling/active-learning strategy. Our work improves upon a recently +proposed universal denoiser training strategy by extending these results to +higher dimensions and by incorporating a polynomial approximation of the true +specification-loss landscape. This approximation allows us to reduce training +times by almost two orders of magnitude. We test our method on simulated joint +Poisson-Gaussian-Speckle noise and demonstrate that with our proposed training +strategy, a single blind, generalist denoiser network can achieve peak +signal-to-noise ratios within a uniform bound of specialized denoiser networks +across a large range of operating conditions. We also capture a small dataset +of images with varying amounts of joint Poisson-Gaussian-Speckle noise and +demonstrate that a universal denoiser trained using our adaptive-sampling +strategy outperforms uniformly trained baselines. + +
+
+ comment: IEEE TIP 2024 +
+
+
+
+
+ + ♻ ☆ CerviXpert: A Multi-Structural Convolutional Neural Network for + Predicting Cervix Type and Cervical Cell Abnormalities + + +
+ Cervical cancer is a major cause of cancer-related mortality among women +worldwide, and its survival rate improves significantly with early detection. +Traditional diagnostic methods such as Pap smears and cervical biopsies rely +heavily on cytologist expertise, making the process prone to human error. This +study introduces CerviXpert, a multi-structural convolutional neural network +model designed to efficiently classify cervix types and detect cervical cell +abnormalities. CerviXpert is built as a computationally efficient model that +classifies cervical cancer using images from the publicly available SiPaKMeD +dataset. The model architecture emphasizes simplicity, using a limited number +of convolutional layers followed by max pooling and dense layers, trained from +scratch. + We assessed the performance of CerviXpert against other state of the art +convolutional neural network models including ResNet50, VGG16, MobileNetV2, and +InceptionV3, evaluating them on accuracy, computational efficiency, and +robustness using five fold cross validation. CerviXpert achieved an accuracy of +98.04 percent in classifying cervical cell abnormalities into three classes and +98.60 percent for five class cervix type classification, outperforming +MobileNetV2 and InceptionV3 in both accuracy and computational requirements. It +showed comparable results to ResNet50 and VGG16 while reducing computational +complexity and resource needs. + CerviXpert provides an effective solution for cervical cancer screening and +diagnosis, balancing accuracy with computational efficiency. Its streamlined +design enables deployment in resource constrained environments, potentially +enhancing early detection and management of cervical cancer. + +
+
+ comment: 11 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ MatchTime: Towards Automatic Soccer Game Commentary Generation EMNLP 2024 + + +
+ Soccer is a globally popular sport with a vast audience. In this paper, we +consider constructing an automatic soccer game commentary model to improve the +audiences' viewing experience. In general, we make the following contributions: +First, observing the prevalent video-text misalignment in existing datasets, we +manually annotate timestamps for 49 matches, establishing a more robust +benchmark for soccer game commentary generation, termed as +SN-Caption-test-align; Second, we propose a multi-modal temporal alignment +pipeline to automatically correct and filter the existing dataset at scale, +creating a higher-quality soccer game commentary dataset for training, denoted +as MatchTime; Third, based on our curated dataset, we train an automatic +commentary generation model, named MatchVoice. Extensive experiments and +ablation studies have demonstrated the effectiveness of our alignment pipeline, +and training the model on the curated dataset achieves state-of-the-art performance +for commentary generation, showcasing that better alignment can lead to +significant performance improvements in downstream tasks. + +
+
+ comment: Accepted by EMNLP 2024 (Oral Presentation); Project Page: + https://haoningwu3639.github.io/MatchTime/ +
+
+
+
+
+ + ♻ ☆ DreamText: High Fidelity Scene Text Synthesis + + +
+ Scene text synthesis involves rendering specified texts onto arbitrary +images. Current methods typically formulate this task in an end-to-end manner +but lack effective character-level guidance during training. Besides, their +text encoders, pre-trained on a single font type, struggle to adapt to the +diverse font styles encountered in practical applications. Consequently, these +methods suffer from character distortion, repetition, and absence, particularly +in polystylistic scenarios. To this end, this paper proposes DreamText for +high-fidelity scene text synthesis. Our key idea is to reconstruct the +diffusion training process, introducing more refined guidance tailored to this +task, to expose and rectify the model's attention at the character level and +strengthen its learning of text regions. This transformation poses a hybrid +optimization challenge, involving both discrete and continuous variables. To +effectively tackle this challenge, we employ a heuristic alternate optimization +strategy. Meanwhile, we jointly train the text encoder and generator to +comprehensively learn and utilize the diverse font present in the training +dataset. This joint training is seamlessly integrated into the alternate +optimization process, fostering a synergistic relationship between learning +character embedding and re-estimating character attention. Specifically, in +each step, we first encode potential character-generated position information +from cross-attention maps into latent character masks. These masks are then +utilized to update the representation of specific characters in the current +step, which, in turn, enables the generator to correct the character's +attention in the subsequent steps. Both qualitative and quantitative results +demonstrate the superiority of our method to the state of the art. + +
+
+ comment: Code: https://github.com/CodeGoat24/DreamText, Project page: + https://codegoat24.github.io/DreamText/ +
+
+
+
+
+ + ♻ ☆ MagicFace: Training-free Universal-Style Human Image Customized + Synthesis + + +
+ Current human image customization methods leverage Stable Diffusion (SD) for +its rich semantic prior. However, since SD is not specifically designed for +human-oriented generation, these methods often require extensive fine-tuning on +large-scale datasets, which renders them susceptible to overfitting and hinders +their ability to personalize individuals with previously unseen styles. +Moreover, these methods extensively focus on single-concept human image +synthesis and lack the flexibility to customize individuals using multiple +given concepts, thereby impeding their broader practical application. This +paper proposes MagicFace, a novel training-free method for multi-concept +universal-style human image personalized synthesis. Our core idea is to +simulate how humans create images given specific concepts, i.e., first +establish a semantic layout considering factors such as concepts' shape and +posture, then optimize details by comparing with concepts at the pixel level. +To implement this process, we introduce a coarse-to-fine generation pipeline, +involving two sequential stages: semantic layout construction and concept +feature injection. This is achieved by our Reference-aware Self-Attention (RSA) +and Region-grouped Blend Attention (RBA) mechanisms. In the first stage, RSA +enables the latent image to query features from all reference concepts +simultaneously, extracting the overall semantic understanding to facilitate the +initial semantic layout establishment. In the second stage, we employ an +attention-based semantic segmentation method to pinpoint the latent generated +regions of all concepts at each step. Following this, RBA divides the pixels of +the latent image into semantic groups, with each group querying fine-grained +features from the corresponding reference concept. Extensive experiments +demonstrate the superiority of our MagicFace. + +
+
+ comment: project page: https://codegoat24.github.io/MagicFace +
+
+
+
+
+ + ♻ ☆ Vision-guided and Mask-enhanced Adaptive Denoising for Prompt-based + Image Editing + + +
+ Text-to-image diffusion models have demonstrated remarkable progress in +synthesizing high-quality images from text prompts, which boosts research on +prompt-based image editing that edits a source image according to a target +prompt. Despite their advances, existing methods still encounter three key +issues: 1) limited capacity of the text prompt in guiding target image +generation, 2) insufficient mining of word-to-patch and patch-to-patch +relationships for grounding editing areas, and 3) unified editing strength for +all regions during each denoising step. To address these issues, we present a +Vision-guided and Mask-enhanced Adaptive Editing (ViMAEdit) method with three +key novel designs. First, we propose to leverage image embeddings as explicit +guidance to enhance the conventional textual prompt-based denoising process, +where a CLIP-based target image embedding estimation strategy is introduced. +Second, we devise a self-attention-guided iterative editing area grounding +strategy, which iteratively exploits patch-to-patch relationships conveyed by +self-attention maps to refine those word-to-patch relationships contained in +cross-attention maps. Last, we present a spatially adaptive variance-guided +sampling, which highlights sampling variances for critical image regions to +promote the editing capability. Experimental results demonstrate the superior +editing capacity of ViMAEdit over all existing methods. + +
+
+
+
+
+ + ♻ ☆ CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for + Adversarial Defense NeurIPS 2024 + + +
+ Despite ongoing efforts to defend neural classifiers from adversarial +attacks, they remain vulnerable, especially to unseen attacks. In contrast, +humans are difficult to deceive with subtle manipulations, since we make +judgments only based on essential factors. Inspired by this observation, we +attempt to model label generation with essential label-causative factors and +incorporate label-non-causative factors to assist data generation. For an +adversarial example, we aim to discriminate the perturbations as non-causative +factors and make predictions only based on the label-causative factors. +Concretely, we propose a causal diffusion model (CausalDiff) that adapts +diffusion models for conditional data generation and disentangles the two types +of causal factors by learning towards a novel causal information bottleneck +objective. Empirically, CausalDiff has significantly outperformed +state-of-the-art defense methods on various unseen attacks, achieving an +average robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on +CIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition +Benchmark). The code is available at +https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff + +
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ObjectNLQ @ Ego4D Episodic Memory Challenge 2024 CVPR + + +
+ In this report, we present our approach for the Natural Language Query track +and Goal Step track of the Ego4D Episodic Memory Benchmark at CVPR 2024. Both +challenges require the localization of actions within long video sequences +using textual queries. To enhance localization accuracy, our method not only +processes the temporal information of videos but also identifies fine-grained +objects spatially within the frames. To this end, we introduce a novel +approach, termed ObjectNLQ, which incorporates an object branch to augment the +video representation with detailed object information, thereby improving +grounding efficiency. ObjectNLQ achieves a mean R@1 of 23.15, ranking 2nd in +the Natural Language Queries Challenge, and gains 33.00 in terms of the metric +R@1, IoU=0.3, ranking 3rd in the Goal Step Challenge. Our code will be released +at https://github.com/Yisen-Feng/ObjectNLQ. + +
+
+ comment: The solution for the Natural Language Query track and Goal Step track + at CVPR EgoVis Workshop 2024 +
+
+
+
+
+ + ♻ ☆ 3D microstructural generation from 2D images of cement paste using + generative adversarial networks + + +
+ Establishing a realistic three-dimensional (3D) microstructure is a crucial +step for studying microstructure development of hardened cement pastes. +However, acquiring 3D microstructural images for cement often involves high +costs and quality compromises. This paper proposes a generative adversarial +networks-based method for generating 3D microstructures from a single +two-dimensional (2D) image, capable of producing high-quality and realistic 3D +images at low cost. In the method, a framework (CEM3DMG) is designed to +synthesize 3D images by learning microstructural information from a 2D +cross-sectional image. Experimental results show that CEM3DMG can generate +realistic 3D images of large size. Visual observation confirms that the +generated 3D images exhibit similar microstructural features to the 2D images, +including similar pore distribution and particle morphology. Furthermore, +quantitative analysis reveals that reconstructed 3D microstructures closely +match the real 2D microstructure in terms of gray level histogram, phase +proportions, and pore size distribution. The source code for CEM3DMG is +available in the GitHub repository at: https://github.com/NBICLAB/CEM3DMG. + +
+
+
+
+
+ + ♻ ☆ Activating Self-Attention for Multi-Scene Absolute Pose Regression NeurIPS 2024 + + +
+ Multi-scene absolute pose regression addresses the demand for fast and +memory-efficient camera pose estimation across various real-world environments. +Nowadays, transformer-based models have been devised to regress the camera pose +directly across multiple scenes. Despite their potential, transformer encoders are +underutilized due to the collapsed self-attention map, having low +representation capacity. This work highlights the problem and investigates it +from a new perspective: distortion of query-key embedding space. Based on the +statistical analysis, we reveal that queries and keys are mapped in completely +different spaces while only a few keys are blended into the query region. This +leads to the collapse of the self-attention map as all queries are considered +similar to those few keys. Therefore, we propose simple but effective solutions +to activate self-attention. Concretely, we present an auxiliary loss that +aligns queries and keys, preventing the distortion of query-key space and +encouraging the model to find global relations by self-attention. In addition, +the fixed sinusoidal positional encoding is adopted instead of an undertrained +learnable one to reflect appropriate positional clues into the inputs of +self-attention. As a result, our approach resolves the aforementioned problem +effectively, thus outperforming existing methods in both outdoor and indoor +scenes. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Multi-modal Situated Reasoning in 3D Scenes NeurIPS 2024 + + +
+ Situation awareness is essential for understanding and reasoning about 3D +scenes in embodied AI agents. However, existing datasets and benchmarks for +situated understanding are limited in data modality, diversity, scale, and task +scope. To address these limitations, we propose Multi-modal Situated Question +Answering (MSQA), a large-scale multi-modal situated reasoning dataset, +scalably collected leveraging 3D scene graphs and vision-language models (VLMs) +across a diverse range of real-world 3D scenes. MSQA includes 251K situated +question-answering pairs across 9 distinct question categories, covering +complex scenarios within 3D scenes. We introduce a novel interleaved +multi-modal input setting in our benchmark to provide text, image, and point +cloud for situation and question description, resolving ambiguity in previous +single-modality convention (e.g., text). Additionally, we devise the +Multi-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models' +situated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN +highlight the limitations of existing vision-language models and underscore the +importance of handling multi-modal interleaved inputs and situation modeling. +Experiments on data scaling and cross-domain transfer further demonstrate the +efficacy of leveraging MSQA as a pre-training dataset for developing more +powerful situated reasoning models. + +
+
+ comment: Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page: + https://msr3d.github.io/ +
+
+
+
+
+ + ♻ ☆ Uncovering Hidden Connections: Iterative Search and Reasoning for + Video-grounded Dialog + + +
+ In contrast to conventional visual question answering, video-grounded dialog +necessitates a profound understanding of both dialog history and video content +for accurate response generation. Despite commendable progress made by existing +approaches, they still face the challenges of incrementally understanding +complex dialog history and assimilating video information. In response to these +challenges, we present an iterative search and reasoning framework, which +consists of a textual encoder, a visual encoder, and a generator. Specifically, +we devise a path search and aggregation strategy in the textual encoder, mining +core cues from dialog history that are pivotal to understanding the posed +questions. Concurrently, our visual encoder harnesses an iterative reasoning +network to extract and emphasize critical visual markers from videos, enhancing +the depth of visual comprehension. Finally, we utilize the pre-trained GPT-2 +model as our answer generator to decode the mined hidden clues into coherent +and contextualized answers. Extensive experiments on three public datasets +demonstrate the effectiveness and generalizability of our proposed framework. + +
+
+
+
+
+ + ♻ ☆ Searching for internal symbols underlying deep learning + + +
+ Deep learning (DL) enables deep neural networks (DNNs) to automatically learn +complex tasks or rules from given examples without instructions or guiding +principles. As we do not engineer DNNs' functions, it is extremely difficult to +diagnose their decisions, and multiple lines of studies proposed to explain the +principles of their operations. Notably, one line of studies suggests that DNNs +may learn concepts, the high level features that are recognizable to humans. In +this study, we extend this line of studies and hypothesize that DNNs can +develop abstract codes that can be used to augment DNNs' decision-making. To +address this hypothesis, we combine foundation segmentation models and +unsupervised learning to extract internal codes and identify potential use of +abstract codes to make DL's decision-making more reliable and safer. + +
+
+ comment: 16 pages, 10 figures, 5 tables and 1 supplementary table +
+
+
+
+
+ + ♻ ☆ MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation + + +
+ Segmentation of anatomical structures and pathological regions in medical +images is essential for modern clinical diagnosis, disease research, and +treatment planning. While significant advancements have been made in deep +learning-based segmentation techniques, many of these methods still suffer from +limitations in data efficiency, generalizability, and interactivity. As a +result, developing precise segmentation methods that require fewer labeled +datasets remains a critical challenge in medical image analysis. Recently, the +introduction of foundation models like CLIP and Segment-Anything-Model (SAM), +with robust cross-domain representations, has paved the way for interactive and +universal image segmentation. However, further exploration of these models for +data-efficient segmentation in medical imaging is still needed and highly +relevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that +integrates the CLIP and SAM models to perform segmentation on clinical scans +using text prompts, in both zero-shot and weakly supervised settings. Our +approach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard +Negative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the +Multi-modal Information Bottleneck (M2IB) to create visual prompts for +generating segmentation masks from SAM in the zero-shot setting. We also +investigate using zero-shot segmentation labels within a weakly supervised +paradigm to enhance segmentation quality further. Extensive testing across four +diverse segmentation tasks and medical imaging modalities (breast tumor +ultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high +accuracy of our proposed framework. Our code is available at +https://github.com/HealthX-Lab/MedCLIP-SAMv2. + +
+
+ comment: 10 pages, 2 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ GazeGen: Gaze-Driven User Interaction for Visual Content Generation + + +
+ We present GazeGen, a user interaction system that generates visual content +(images and videos) for locations indicated by the user's eye gaze. GazeGen +allows intuitive manipulation of visual content by targeting regions of +interest with gaze. Using advanced techniques in object detection and +generative AI, GazeGen performs gaze-controlled image adding/deleting, +repositioning, and surface style changes of image objects, and converts static +images into videos. Central to GazeGen is the DFT Gaze (Distilled and +Fine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters, +performing accurate real-time gaze predictions tailored to individual users' +eyes on small edge devices. GazeGen is the first system to combine visual +content generation with real-time gaze estimation, made possible exclusively by +DFT Gaze. This real-time gaze estimation enables various visual content +generation tasks, all controlled by the user's gaze. The input for DFT Gaze is +the user's eye images, while the inputs for visual content generation are the +user's view and the predicted gaze point from DFT Gaze. To achieve efficient +gaze predictions, we derive the small model from a large model (10x larger) via +novel knowledge distillation and personal adaptation techniques. We integrate +knowledge distillation with a masked autoencoder, developing a compact yet +powerful gaze estimation model. This model is further fine-tuned with Adapters, +enabling highly accurate and personalized gaze predictions with minimal user +input. DFT Gaze ensures low-latency and precise gaze tracking, supporting a +wide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA +and OpenEDS2020 benchmarks, demonstrating low angular gaze error and low +latency on the edge device (Raspberry Pi 4). Furthermore, we describe +applications of GazeGen, illustrating its versatility and effectiveness in +various usage scenarios. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. All +the code, models, demo and organized data have been open sourced on our Github +Repo. + +
+
+ comment: Camera Ready Version. Project Page: + https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data: + https://github.com/liming-ai/ControlNet_Plus_Plus +
+
+
+
+
+ + ♻ ☆ FastVideoEdit: Leveraging Consistency Models for Efficient Text-to-Video + Editing WACV 2025 + + +
+ Diffusion models have demonstrated remarkable capabilities in text-to-image +and text-to-video generation, opening up possibilities for video editing based +on textual input. However, the computational cost associated with sequential +sampling in diffusion models poses challenges for efficient video editing. +Existing approaches relying on image generation models for video editing suffer +from time-consuming one-shot fine-tuning, additional condition extraction, or +DDIM inversion, making real-time applications impractical. In this work, we +propose FastVideoEdit, an efficient zero-shot video editing approach inspired +by Consistency Models (CMs). By leveraging the self-consistency property of +CMs, we eliminate the need for time-consuming inversion or additional condition +extraction, reducing editing time. Our method enables direct mapping from +source video to target video with strong preservation ability utilizing a +special variance schedule. This results in improved speed advantages, as fewer +sampling steps can be used while maintaining comparable generation quality. +Experimental results validate the state-of-the-art performance and speed +advantages of FastVideoEdit across evaluation metrics encompassing editing +speed, temporal consistency, and text-video alignment. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper critically examines the fundamental distinctions between gradient +methods applied to non-differentiable functions (NGDMs) and classical gradient +descents (GDs) for differentiable functions, revealing significant gaps in +current deep learning optimization theory. We demonstrate that NGDMs exhibit +markedly different convergence properties compared to GDs, strongly challenging +the applicability of extensive neural network convergence literature based on +$L$-smoothness to non-smooth neural networks. Our analysis reveals paradoxical +behavior of NGDM solutions for $L_{1}$-regularized problems, where increasing +regularization counterintuitively leads to larger $L_{1}$ norms of optimal +solutions. This finding calls into question widely adopted $L_{1}$ penalization +techniques for network pruning. We further challenge the common assumption that +optimization algorithms like RMSProp behave similarly in differentiable and +non-differentiable contexts. Expanding on the Edge of Stability phenomenon, we +demonstrate its occurrence in a broader class of functions, including Lipschitz +continuous convex differentiable functions. This finding raises important +questions about its relevance and interpretation in non-convex, +non-differentiable neural networks, particularly those using ReLU activations. +Our work identifies critical misunderstandings of NGDMs in influential +literature, stemming from an overreliance on strong smoothness assumptions. +These findings necessitate a reevaluation of optimization dynamics in deep +learning, emphasizing the crucial need for more nuanced theoretical foundations +in analyzing these complex systems. + +
+
+
+
+
+ + ♻ ☆ SatDiffMoE: A Mixture of Estimation Method for Satellite Image + Super-resolution with Latent Diffusion Models ICML 2024 + + +
+ During the acquisition of satellite images, there is generally a trade-off +between spatial resolution and temporal resolution (acquisition frequency) due +to the onboard sensors of satellite imaging systems. High-resolution satellite +images are very important for land crop monitoring, urban planning, wildfire +management and a variety of applications. It is a significant yet challenging +task to achieve high spatial-temporal resolution in satellite imaging. With the +advent of diffusion models, we can now learn strong generative priors to +generate realistic satellite images with high resolution, which can be utilized +to promote the super-resolution task as well. In this work, we propose a novel +diffusion-based fusion algorithm called SatDiffMoE that can take an +arbitrary number of sequential low-resolution satellite images at the same +location as inputs, and fuse them into one high-resolution reconstructed image +with more fine details, by leveraging and fusing the complementary information +from different time points. Our algorithm is highly flexible and allows +training and inference on an arbitrary number of low-resolution images. +Experimental results show that our proposed SatDiffMoE method not only achieves +superior performance for the satellite image super-resolution tasks on a +variety of datasets, but also gets an improved computational efficiency with +reduced model parameters, compared with previous methods. + +
+
+ comment: Accepted by ICML 2024 Workshop on Advancing Neural Network Training + (WANT): Computational Efficiency, Scalability, and Resource Optimization +
+
+
+
+
+ + ♻ ☆ LiDAR-BEVMTN: Real-Time LiDAR Bird's-Eye View Multi-Task Perception + Network for Autonomous Driving + + +
+ LiDAR is crucial for robust 3D scene perception in autonomous driving. LiDAR +perception has the largest body of literature after camera perception. However, +multi-task learning across tasks like detection, segmentation, and motion +estimation using LiDAR remains relatively unexplored, especially on +automotive-grade embedded platforms. We present a real-time multi-task +convolutional neural network for LiDAR-based object detection, semantics, and +motion segmentation. The unified architecture comprises a shared encoder and +task-specific decoders, enabling joint representation learning. We propose a +novel Semantic Weighting and Guidance (SWAG) module to transfer semantic +features for improved object detection selectively. Our heterogeneous training +scheme combines diverse datasets and exploits complementary cues between tasks. +The work provides the first embedded implementation unifying these key +perception tasks from LiDAR point clouds achieving 3ms latency on the embedded +NVIDIA Xavier platform. We achieve state-of-the-art results for two tasks, +semantic and motion segmentation, and close to state-of-the-art performance for +3D object detection. By maximizing hardware efficiency and leveraging +multi-task synergies, our method delivers an accurate and efficient solution +tailored for real-world automated driving deployment. Qualitative results can +be seen at https://youtu.be/H-hWRzv2lIY. + +
+
+ comment: Accepted for publication at IEEE Transactions on Intelligent + Transportation Systems +
+
+
+
+
+ + ♻ ☆ Multi-scale Restoration of Missing Data in Optical Time-series Images + with Masked Spatial-Temporal Attention Network + + +
+ Remote sensing images often suffer from substantial data loss due to factors +such as thick cloud cover and sensor limitations. Existing methods for imputing +missing values in remote sensing images fail to fully exploit spatiotemporal +auxiliary information, which restricts the accuracy of their reconstructions. +To address this issue, this paper proposes a novel deep learning-based approach +called MS2TAN (Multi-Scale Masked Spatial-Temporal Attention Network) for +reconstructing time-series remote sensing images. First, we introduce an +efficient spatiotemporal feature extractor based on Masked Spatial-Temporal +Attention (MSTA) to capture high-quality representations of spatiotemporal +neighborhood features surrounding missing regions while significantly reducing +the computational complexity of the attention mechanism. Second, a Multi-Scale +Restoration Network composed of MSTA-based Feature Extractors is designed to +progressively refine missing values by exploring spatiotemporal neighborhood +features at different scales. Third, we propose a "Pixel-Structure-Perception" +Multi-Objective Joint Optimization method to enhance the visual quality of the +reconstructed results from multiple perspectives and to preserve more texture +structures. Finally, quantitative experimental results under multi-temporal +inputs on two public datasets demonstrate that the proposed method outperforms +competitive approaches, achieving a 9.76%/9.30% reduction in Mean Absolute +Error (MAE) and a 0.56 dB/0.62 dB increase in Peak Signal-to-Noise Ratio +(PSNR), along with stronger texture and structural consistency. Ablation +experiments further validate the contribution of the core innovations to +imputation accuracy. + +
+
+
+
+
+ + ♻ ☆ OpenCap markerless motion capture estimation of lower extremity + kinematics and dynamics in cycling + + +
+ Markerless motion capture offers several benefits over traditional +marker-based systems by eliminating the need for physical markers, which are +prone to misplacement and artifacts. Utilizing computer vision and deep +learning algorithms, markerless systems can directly detect human body +landmarks, reducing manual processing and errors associated with marker +placement. These systems are adaptable, able to track user-defined features, +and practical for real-world applications using consumer-grade devices such as +smartphone cameras. This study compares the performance of OpenCap, a +markerless motion capture system, with traditional marker-based systems in +assessing cycling biomechanics. Ten healthy adults participated in experiments +to capture sagittal hip, knee, and ankle kinematics and dynamics using both +methods. OpenCap used videos from smartphones and integrated computer vision +and musculoskeletal simulations to estimate 3D kinematics. Results showed high +agreement between the two systems, with no significant differences in kinematic +and kinetic measurements for the hip, knee, and ankle. The correlation +coefficients exceeded 0.98, indicating very strong consistency. Errors were +minimal, with kinematic errors under 4 degrees and kinetic errors below 5 Nm. +This study concludes that OpenCap is a viable alternative to marker-based +motion capture, offering comparable precision without extensive setup for hip +(flexion/extension), knee (flexion/extension), and ankle +(dorsiflexion/plantarflexion) joints. Future work should aim to enhance the +accuracy of ankle joint measurements and extend analyses to 3D kinematics and +kinetics for comprehensive biomechanical assessments. + +
+
+
+
+
+ + ♻ ☆ Deep Generative Classification of Blood Cell Morphology + + +
+ Accurate classification of haematological cells is critical for diagnosing +blood disorders, but presents significant challenges for machine automation +owing to the complexity of cell morphology, heterogeneities of biological, +pathological, and imaging characteristics, and the imbalance of cell type +frequencies. We introduce CytoDiffusion, a diffusion-based classifier that +effectively models blood cell morphology, combining accurate classification +with robust anomaly detection, resistance to distributional shifts, +interpretability, data efficiency, and superhuman uncertainty quantification. +Our approach outperforms state-of-the-art discriminative models in anomaly +detection (AUC 0.990 vs. 0.918), resistance to domain shifts (85.85% vs. 74.38% +balanced accuracy), and performance in low-data regimes (95.88% vs. 94.95% +balanced accuracy). Notably, our model generates synthetic blood cell images +that are nearly indistinguishable from real images, as demonstrated by an +authenticity test in which expert haematologists achieved only 52.3% accuracy +(95% CI: [50.5%, 54.2%]) in distinguishing real from generated images. +Furthermore, we enhance model explainability through the generation of directly +interpretable counterfactual heatmaps. Our comprehensive evaluation framework, +encompassing these multiple performance dimensions, establishes a new benchmark +for medical image analysis in haematology, ultimately enabling improved +diagnostic accuracy in clinical settings. Our code is available at +https://github.com/CambridgeCIA/CytoDiffusion. + +
+
+
+
+
+ + ♻ ☆ N-DriverMotion: Driver motion learning and prediction using an + event-based camera and directly trained spiking neural networks on Loihi 2 + + +
+ Driver motion recognition is a principal factor in ensuring the safety of +driving systems. This paper presents a novel system for learning and predicting +driver motions and an event-based high-resolution (1280x720) dataset, +N-DriverMotion, newly collected to train on a neuromorphic vision system. The +system comprises an event-based camera that generates the first high-resolution +driver motion dataset representing spike inputs and efficient spiking neural +networks (SNNs) that are effective in training and predicting the driver's +gestures. The event dataset consists of 13 driver motion categories classified +by direction (front, side), illumination (bright, moderate, dark), and +participant. A novel simplified four-layer convolutional spiking neural network +(CSNN) that we proposed was directly trained using the high-resolution dataset +without any time-consuming preprocessing. This enables efficient adaptation to +on-device SNNs for real-time inference on high-resolution event-based streams. +Compared with recent gesture recognition systems adopting neural networks for +vision processing, the proposed neuromorphic vision system achieves comparable +accuracy, 94.04%, in recognizing driver motions with the CSNN architecture. +Our proposed CSNN and the dataset can be used to develop safer and more +efficient driver monitoring systems for autonomous vehicles or edge devices +requiring an efficient neural network architecture. + +
+
+ comment: Accepted for publication in IEEE Open Journal of Vehicular Technology + (OJVT) on 18 November 2024 +
+
+
+
+
+ + ♻ ☆ SportsNGEN: Sustained Generation of Realistic Multi-player Sports + Gameplay + + +
+ We present a transformer decoder based sports simulation engine, SportsNGEN, +trained on sports player and ball tracking sequences, that is capable of +generating sustained gameplay and accurately mimicking the decision making of +real players. By training on a large database of professional tennis tracking +data, we demonstrate that simulations produced by SportsNGEN can be used to +predict the outcomes of rallies, determine the best shot choices at any point, +and evaluate counterfactual or what if scenarios to inform coaching decisions +and elevate broadcast coverage. By combining the generated simulations with a +shot classifier and logic to start and end rallies, the system is capable of +simulating an entire tennis match. We evaluate SportsNGEN by comparing +statistics of the simulations with those of real matches between the same +players. We show that the model output sampling parameters are crucial to +simulation realism and that SportsNGEN is probabilistically well-calibrated to +real data. In addition, a generic version of SportsNGEN can be customized to a +specific player by fine-tuning on the subset of match data that includes that +player. Finally, we show qualitative results indicating the same approach works +for football. + +
+
+
+
+
+
+
+
+ + Systems and Control 36 + +
+
+
+ + ☆ Scalable control synthesis for stochastic systems via structural IMDP + abstractions + + +
+ This paper introduces a novel abstraction-based framework for controller +synthesis of nonlinear discrete-time stochastic systems. The focus is on +probabilistic reach-avoid specifications. The framework is based on abstracting +a stochastic system into a new class of robust Markov models, called +orthogonally decoupled Interval Markov Decision Processes (odIMDPs). +Specifically, odIMDPs are a class of robust Markov processes, where the +transition probabilities between each pair of states are uncertain and have the +product form. We show that such a specific form in the transition probabilities +allows one to build compositional abstractions of stochastic systems that, for +each state, are only required to store the marginal probability bounds of the +original system. This leads to improved memory complexity for our approach +compared to commonly employed abstraction-based approaches. Furthermore, we +show that an optimal control strategy for an odIMDP can be computed by solving +a set of linear problems. When the resulting strategy is mapped back to the +original system, it is guaranteed to lead to reduced conservatism compared to +existing approaches. To test our theoretical framework, we perform an extensive +empirical comparison of our methods against Interval Markov Decision Process- +and Markov Decision Process-based approaches on various benchmarks including 7D +systems. Our empirical analysis shows that our approach substantially +outperforms state-of-the-art approaches in terms of both memory requirements +and the conservatism of the results. + +
+
+
+
+
+ + ☆ Machine Learning-Assisted Distribution System Network Reconfiguration + Problem + + +
+ High penetration from volatile renewable energy resources in the grid and the +varying nature of loads raise the need for frequent line switching to ensure +the efficient operation of electrical distribution networks. Operators must +ensure maximum load delivery, reduced losses, and the operation between voltage +limits. However, computations to decide the optimal feeder configuration are +often computationally expensive and intractable, making it unfavorable for +real-time operations. This is mainly due to the existence of binary variables +in the network reconfiguration optimization problem. To tackle this issue, we +have devised an approach that leverages machine learning techniques to reshape +distribution networks featuring multiple substations. This involves predicting +the substation responsible for serving each part of the network. Hence, it +leaves simple and more tractable Optimal Power Flow problems to be solved. This +method can produce accurate results in a significantly faster time, as +demonstrated using the IEEE 37-bus distribution feeder. Compared to the +traditional optimization-based approaches, a feasible solution is achieved +approximately ten times faster for all the tested scenarios. + +
+
+
+
+
+ + ☆ Enabling steep slope walking on Husky using reduced order modeling and + quadratic programming + + +
+ Wing-assisted inclined running (WAIR), observed in some young birds, is an +attractive maneuver that can be extended to legged aerial systems. This study +proposes a control method using a modified Variable Length Inverted Pendulum +(VLIP) by assuming a fixed zero moment point and thruster forces collocated at +the center of mass of the pendulum. A QP MPC is used to find the optimal ground +reaction forces and thruster forces to track a reference position and velocity +trajectory. In simulation, this VLIP model maintains walking on a slope of 40 +degrees, and the results show thruster forces that can be obtained through +posture manipulation. The simulation also provides insight into how the combined +efforts of the thrusters and the tractive forces from the legs make WAIR possible +in thruster-assisted legged systems. + +
+
+ comment: 6 pages, 8 figures, submitted to the Humanoids 2025 conference +
+
+
+
+
+ + ☆ Design And Optimization Of Multi-rendezvous Manoeuvres Based On + Reinforcement Learning And Convex Optimization + + +
+ Optimizing space vehicle routing is crucial for critical applications such as +on-orbit servicing, constellation deployment, and space debris de-orbiting. +Multi-target Rendezvous presents a significant challenge in this domain. This +problem involves determining the optimal sequence in which to visit a set of +targets, and the corresponding optimal trajectories: this results in a +demanding NP-hard problem. We introduce a framework for the design and +refinement of multi-rendezvous trajectories based on heuristic combinatorial +optimization and Sequential Convex Programming. Our framework is both highly +modular and capable of leveraging candidate solutions obtained with advanced +approaches and handcrafted heuristics. We demonstrate this flexibility by +integrating an Attention-based routing policy trained with Reinforcement +Learning to improve the performance of the combinatorial optimization process. +We show that Reinforcement Learning approaches for combinatorial optimization +can be effectively applied to spacecraft routing problems. We apply the +proposed framework to the UARX Space OSSIE mission: we are able to thoroughly +explore the mission design space, finding optimal tours and trajectories for a +wide variety of mission scenarios. + +
+
+ comment: 18 pages, 12 figures, 5 tables +
+
+
+
+
+ + ☆ High-Speed Cornering Control and Real-Vehicle Deployment for Autonomous + Electric Vehicles + + +
+ Executing drift maneuvers during high-speed cornering presents significant +challenges for autonomous vehicles, yet offers the potential to minimize +turning time and enhance driving dynamics. While reinforcement learning (RL) +has shown promising results in simulated environments, discrepancies between +simulations and real-world conditions have limited its practical deployment. +This study introduces an innovative control framework that integrates +trajectory optimization with drift maneuvers, aiming to improve the algorithm's +adaptability for real-vehicle implementation. We leveraged Bezier-based +pre-trajectory optimization to enhance rewards and optimize the controller +through Twin Delayed Deep Deterministic Policy Gradient (TD3) in a simulated +environment. For real-world deployment, we implement a hybrid RL-MPC fusion +mechanism, where TD3-derived maneuvers serve as primary inputs for a Model +Predictive Controller (MPC). This integration enables precise real-time +tracking of the optimal trajectory, with MPC providing corrective inputs to +bridge the gap between simulation and reality. The efficacy of this method is +validated through real-vehicle tests on consumer-grade electric vehicles, +focusing on drift U-turns and drift right-angle turns. The control outcomes of +these real-vehicle tests are thoroughly documented in the paper, supported by +supplementary video evidence (https://youtu.be/5wp67FcpfL8). Notably, this +study is the first to deploy and apply an RL-based transient drift cornering +algorithm on consumer-grade electric vehicles. + +
+
+ comment: In the process of being submitted to the Journal of IEEE Transactions + on Industrial Electronics +
+
+
+
+
+ + ☆ A New Finite-Horizon Dynamic Programming Analysis of Nonanticipative + Rate-Distortion Function for Markov Sources + + +
+ This paper deals with the computation of a non-asymptotic lower bound by +means of the nonanticipative rate-distortion function (NRDF) on the +discrete-time zero-delay variable-rate lossy compression problem for discrete +Markov sources with per-stage, single-letter distortion. First, we derive a new +information structure of the NRDF for Markov sources and single-letter +distortions. Second, we derive new convexity results on the NRDF, which +facilitate the use of Lagrange duality theorem to cast the problem as an +unconstrained partially observable finite-time horizon stochastic dynamic +programming (DP) algorithm subject to a probabilistic state (belief state) that +summarizes the past information about the reproduction symbols and takes values +in a continuous state space. Instead of approximating the DP algorithm +directly, we use Karush-Kuhn-Tucker (KKT) conditions to find an implicit +closed-form expression of the optimal control policy of the stochastic DP +(i.e., the minimizing distribution of the NRDF) and approximate the control +policy and the cost-to-go function (a function of the rate) stage-wise, via a +novel dynamic alternating minimization (AM) approach, that is realized by an +offline algorithm operating using backward recursions, with provable +convergence guarantees. We obtain the clean values of the aforementioned +quantities using an online (forward) algorithm operating for any finite-time +horizon. Our methodology provides an approximate solution to the exact NRDF +solution, which becomes near-optimal as the search space of the belief state +becomes sufficiently large at each time stage. We corroborate our theoretical +findings with simulation studies where we apply our algorithms assuming +time-varying and time-invariant binary Markov processes. + +
+
+
+
+
+ + ☆ Coevolution of Opinion Dynamics and Recommendation System: Modeling + Analysis and Reinforcement Learning Based Manipulation + + +
+ In this work, we develop an analytical framework that integrates opinion +dynamics with a recommendation system. By incorporating elements such as +collaborative filtering, we provide a precise characterization of how +recommendation systems shape interpersonal interactions and influence opinion +formation. Moreover, the property of the coevolution of both opinion dynamics +and recommendation systems is also shown. Specifically, the convergence of this +coevolutionary system is theoretically proved, and the mechanisms behind filter +bubble formation are elucidated. Our analysis of the maximum number of opinion +clusters shows how recommendation system parameters affect opinion grouping and +polarization. Additionally, we incorporate the influence of propagators into +our model and propose a reinforcement learning-based solution. The analysis and +the propagation solution are demonstrated in simulation using the Yelp data +set. + +
+
+
+
+
+ + ☆ On the Incorporation of Stability Constraints into Sequential + Operational Scheduling + + +
+ With the increasing penetration of Inverter-Based Resources (IBRs), power +system stability constraints must be incorporated into the operational +framework, transforming it into stability-constrained optimization. Currently, +there exist parallel research efforts on developing the stability constraints +within DC power flow-based unit commitment (UC) and AC Optimal Power Flow +(OPF). However, few studies discuss how including such constraints can interact +with each other and eventually impact grid stability. In this context, this +work simulates a realistic power system decision making framework and provides +a thorough analysis on the necessity of incorporating frequency nadir and small +signal stability constraints into these sequentially connected two operation +stages. The simulation results demonstrate that including both stability +constraints in the UC is essential to maintain power system stability, while +the inclusion in AC OPF can further improve the stability index. + +
+
+
+
+
+ + ☆ Approximate predictive control barrier function for discrete-time + systems + + +
+ We propose integrating an explicit approximation of a predictive control +barrier function (PCBF) in a safety filter framework. The approximated PCBF is +implicitly defined through an optimal control problem and allows guaranteeing +invariance of an implicitly defined safe set as well as stability of this safe +set within a larger domain of attraction. By extending existing theoretical +analysis of the PCBF, we establish inherent robustness of the original +algorithm and translate the guarantees to input-to-state stability of the +proposed algorithm with respect to possible approximation errors, recovering +the same guarantees in the absence of approximation errors. The proposed +algorithm allows certifying inputs with respect to state constraint +satisfaction through a single function evaluation and filtering unsafe inputs +through a control barrier function based safety filter, which is independent of +the time horizon of the original predictive optimisation problem, resulting in +significant online computational benefits. We demonstrate the stability +properties of the proposed algorithm on a linear system example as well as its +use as a fast safety filter for miniature race cars in simulation. + +
+
+
+
+
+ + ☆ Carleman-Fourier Linearization of Complex Dynamical Systems: Convergence + and Explicit Error Bounds + + +
+ This paper presents a Carleman-Fourier linearization method for nonlinear +dynamical systems with periodic vector fields involving multiple fundamental +frequencies. By employing Fourier basis functions, the nonlinear dynamical +system is transformed into a linear model on an infinite-dimensional space. The +proposed approach yields accurate approximations over extended regions around +equilibria and for longer time horizons, compared to traditional Carleman +linearization with monomials. Additionally, we develop a finite-section +approximation for the resulting infinite-dimensional system and provide +explicit error bounds that demonstrate exponential convergence to the original +system's solution as the truncation length increases. For specific classes of +dynamical systems, exponential convergence is achieved across the entire time +horizon. The practical significance of these results lies in guiding the +selection of suitable truncation lengths for applications such as model +predictive control, safety verification through reachability analysis, and +efficient quantum computing algorithms. The theoretical findings are validated +through illustrative simulations. + +
+
+
+
+
+ + ☆ Integrating and Comparing Radiality Constraints for Optimized + Distribution System Reconfiguration + + +
+ The reconfiguration of electrical power distribution systems is a crucial +optimization problem aimed at minimizing power losses by altering the system +topology through the operation of interconnection switches. This problem, +typically modelled as a mixed-integer nonlinear program, demands high +computational resources for large-scale networks and requires specialized +radiality constraints for maintaining the tree-like structure of distribution +networks. This paper presents a comprehensive analysis that integrates and +compares the computational burden associated with different radiality +constraint formulations proposed in the specialized literature for the +reconfiguration of distribution systems. By using consistent hardware and +software setups, we evaluate the performance of these constraints across +several well-known test cases. Our findings reveal significant differences in +computational efficiency depending on the chosen set of radiality constraints, +providing valuable insights for optimizing reconfiguration strategies in +practical distribution networks. + +
+
+
+
+
+ + ☆ A Linear Differential Inclusion for Contraction Analysis to Known + Trajectories + + +
+ Infinitesimal contraction analysis provides exponential convergence rates +between arbitrary pairs of trajectories of a system by studying the system's +linearization. An essentially equivalent viewpoint arises through stability +analysis of a linear differential inclusion (LDI) encompassing the incremental +behavior of the system. In this note, we study contraction of a system to a +particular known trajectory, deriving a new LDI characterizing the error +between arbitrary trajectories and this known trajectory. As with classical +contraction analysis, this new inclusion is constructed via first partial +derivatives of the system's vector field, and contraction rates are obtained +with familiar tools: uniform bounding of the logarithmic norm and LMI-based +Lyapunov conditions. Our LDI is guaranteed to outperform a usual contraction +analysis in two special circumstances: i) when the bound on the logarithmic +norm arises from an interval overapproximation of the Jacobian matrix, and ii) +when the norm considered is the $\ell_1$ norm. Finally, we demonstrate how the +proposed approach strictly improves an existing framework for ellipsoidal +reachable set computation. + +
+
+
+
+
+ + ☆ Exploring LLMs for Verifying Technical System Specifications Against + Requirements + + +
+ Requirements engineering is a knowledge intensive process and crucial for the +success of engineering projects. The field of knowledge-based requirements +engineering (KBRE) aims to support engineers by providing knowledge to assist +in the elicitation, validation, and management of system requirements. The +advent of large language models (LLMs) opens new opportunities in the field of +KBRE. This work experimentally investigates the potential of LLMs in +requirements verification. Therein, LLMs are provided with a set of +requirements and a textual system specification and are prompted to assess +which requirements are fulfilled by the system specification. Different +experimental variables such as system specification complexity, the number of +requirements, and prompting strategies were analyzed. Formal rule-based systems +serve as a benchmark to compare LLM performance to. Requirements and system +specifications are derived from the smart-grid domain. Results show that +advanced LLMs, like GPT-4o and Claude 3.5 Sonnet, achieved f1-scores between 79 +% and 94 % in identifying non-fulfilled requirements, indicating potential for +LLMs to be leveraged for requirements verification. + +
+
+ comment: Submitted to 3rd IEEE Industrial Electronics Society Annual Online + Conference (ONCON) +
+
+
+
+
+ + ☆ Reduced Network Cumulative Constraint Violation for Distributed Bandit + Convex Optimization under Slater Condition + + +
+ This paper studies the distributed bandit convex optimization problem with +time-varying inequality constraints, where the goal is to minimize network +regret and cumulative constraint violation. To calculate network cumulative +constraint violation, existing distributed bandit online algorithms solving +this problem directly use the clipped constraint function to replace its +original constraint function. However, the use of the clipping operation +renders Slater condition (i.e., there exists a point that strictly satisfies the +inequality constraints at all iterations) ineffective to achieve reduced +network cumulative constraint violation. To tackle this challenge, we propose a +new distributed bandit online primal-dual algorithm. If local loss functions +are convex, we show that the proposed algorithm establishes sublinear network +regret and cumulative constraint violation bounds. When Slater condition holds, +the network cumulative constraint violation bound is reduced. In addition, if +local loss functions are strongly convex, for the case where strongly convex +parameters are unknown, the network regret bound is reduced. For the case where +strongly convex parameters are known, the network regret and cumulative +constraint violation bounds are further reduced. To the best of our knowledge, +this paper is among the first to establish reduced (network) cumulative +constraint violation bounds for (distributed) bandit convex optimization with +time-varying constraints under Slater condition. Finally, a numerical example +is provided to verify the theoretical results. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.14060, + arXiv:2306.00149 +
+
+
+
+
+ + ☆ Sound Value Iteration for Simple Stochastic Games + + +
+ Algorithmic analysis of Markov decision processes (MDP) and stochastic games +(SG) in practice relies on value-iteration (VI) algorithms. Since the basic +version of VI does not provide guarantees on the precision of the result, +variants of VI have been proposed that offer such guarantees. In particular, +sound value iteration (SVI) not only provides precise lower and upper bounds on +the result, but also converges faster in the presence of probabilistic cycles. +Unfortunately, it is neither applicable to SG, nor to MDP with end components. +In this paper, we extend SVI and cover both cases. The technical challenge +consists mainly in proper treatment of end components, which require different +handling than in the literature. Moreover, we provide several optimizations of +SVI. Finally, we also evaluate our prototype implementation experimentally to +confirm its advantages on systems with probabilistic cycles. + +
+
+ comment: Preprint. Under Review +
+
+
+
+
+ + ☆ Data-Driven Structured Robust Control of Linear Systems + + +
+ Static structured control refers to the task of designing a state-feedback +controller such that the control gain satisfies a subspace constraint. +Structured control has applications in control of communication-inhibited +dynamical systems, such as systems in networked environments. This work +performs $H_2$-suboptimal regulation under a common structured state-feedback +controller for a class of data-consistent plants. The certification of +$H_2$-performance is attained through a combination of standard $H_2$ LMIs, +convex sufficient conditions for structured control, and a matrix S-lemma for +set-membership. The resulting convex optimization problems are linear matrix +inequalities whose size scales independently of the number of data samples +collected. Data-driven structured $H_2$-regulation control is demonstrated on +example systems. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Closed-loop multi-step planning with innate physics knowledge + + +
+ We present a hierarchical framework to solve robot planning as an input +control problem. At the lowest level are temporary closed control loops +("tasks"), each representing a behaviour, contingent on a specific sensory +input and therefore temporary. At the highest level, a supervising +"Configurator" directs task creation and termination. Here resides "core" +knowledge as a physics engine, where sequences of tasks can be simulated. The +Configurator encodes and interprets simulation results, based on which it can +choose a sequence of tasks as a plan. We implement this framework on a real +robot and test it in an overtaking scenario as proof-of-concept. + +
+
+
+
+
+ + ☆ Distributed Learning with Partial Information Sharing + + +
+ This work studies the distributed learning process on a network of agents. +Agents make partial observation about an unknown hypothesis and iteratively +share their beliefs over a set of possible hypotheses with their neighbors to +learn the true hypothesis. We present and analyze a distributed learning +algorithm in which agents share belief on only one randomly chosen hypothesis +at a time. Agents estimate the beliefs on missed hypotheses using previously +shared beliefs. We show that agents learn the true hypothesis almost surely +under standard network connectivity and observation model assumptions if belief +on each hypothesis is shared with positive probability at every time. We also +present a memory-efficient variant of the learning algorithm with partial +belief sharing and present simulation results to compare rate of convergence of +full and partial information sharing algorithms. + +
+
+
+
+
+ + ☆ Towards Mitigating Sim2Real Gaps: A Formal Quantitative Approach + + +
+ In this paper, we introduce the notion of simulation-gap functions to +formally quantify the potential gap between an approximate nominal mathematical +model and the high-fidelity simulator representation of a real system. Given a +nominal mathematical model alongside a quantified simulation gap, the system +can be conceptualized as one characterized by bounded states and +input-dependent disturbances. This allows us to leverage the existing powerful +model-based control algorithms effectively, ensuring the enforcement of desired +specifications while guaranteeing a seamless transition from simulation to +real-world application. To provide a formal guarantee for quantifying the +simulation gap, we develop a data-driven approach. In particular, we collect +data using high-fidelity simulators, leveraging recent advancements in +Real-to-Sim transfer to ensure close alignment with reality. We demonstrate the +effectiveness of the proposed method through experiments conducted on a +nonlinear pendulum system and a nonlinear Turtlebot model in simulators. + +
+
+
+
+
+ + ☆ Network-Security Informed Offer-Making of Aggregator with Utility-Owned + Storage Lease Opportunity: Stochastic Stackelberg Game and Distributed + Solution Methods + + +
+ Aggregators of distributed energy resources are increasingly encouraged to +participate in wholesale market bidding. However, the delivery of the power +they are awarded can result in over-voltage or congestion issues within the +distribution network (DN). The opportunity to lease energy storage from the +utility that manages the DN provides the aggregator with a means to mitigate +these issues, while also benefiting the utility in terms of additional lease +revenue. Nevertheless, this leasing opportunity considerably complicates the +aggregator's offer-making process, as it requires the consideration of market +uncertainties, uncertain power injection at DN buses, and the strategic +interactions between the aggregator and the utility. This paper presents a +stochastic Stackelberg game model that effectively captures the interactions +between the aggregator and the utility, ensuring DN security across all +potential uncertainty scenarios. Furthermore, in light of the privacy concerns +of both the aggregator and the utility, two distributed solution methods are +proposed. The first method follows a traditional predict-then-optimize +framework and has been validated to achieve the game equilibrium. The second +method employs an end-to-end framework, which has been empirically shown to +yield superior economic results. Case studies conducted on 69 and 533-bus DNs +illustrate the efficacy of the proposed methods. + +
+
+
+
+
+ + ☆ Data Driven Automatic Electrical Machine Preliminary Design with + Artificial Intelligence Expert Guidance + + +
+ This paper presents a data-driven electrical machine design (EMD) framework +using wound-rotor synchronous generator (WRSG) as a design example. Unlike +traditional preliminary EMD processes that heavily rely on expertise, this +framework leverages an artificial-intelligence based expert database, to +provide preliminary designs directly from user specifications. Initial data is +generated using 2D finite element (FE) machine models by sweeping fundamental +design variables including machine length and diameter, enabling scalable +machine geometry, with machine performance recorded for each design. This +data trains a Metamodel of Optimal Prognosis (MOP)-based surrogate model, which +maps design variables to key performance indicators (KPIs). Once trained, +guided by metaheuristic algorithms, the surrogate model can generate thousands +of geometric scalable designs, covering a wide power range, forming an AI +expert database to guide future preliminary design. The framework is validated +with a 30kVA WRSG design case. A prebuilt WRSG database, covering power from 10 +to 60kVA, is validated by FE simulation. Design No.1138 is selected from the +database and compared with conventional design. Results show No.1138 achieves a +higher power density of 2.21 kVA/kg in just 5 seconds, compared to 2.02 kVA/kg +obtained using traditional method, which takes several days. The developed AI +expert database also serves as a high-quality data source for further +developing AI models for automatic electrical machine design. + +
+
+
+
+
+ + ☆ Conjugate Momentum-Based Estimation of External Forces for Bio-Inspired + Morphing Wing Flight + + +
+ Dynamic morphing wing flights present significant challenges in accurately +estimating external forces due to complex interactions between aerodynamics, +rapid wing movements, and external disturbances. Traditional force estimation +methods often struggle with unpredictable disturbances like wind gusts or +unmodeled impacts that can destabilize flight in real-world scenarios. This +paper addresses these challenges by implementing a Conjugate Momentum-based +Observer, which effectively estimates and manages unknown external forces +acting on the Aerobat, a bio-inspired robotic platform with dynamically +morphing wings. Through simulations, the observer demonstrates its capability +to accurately detect and quantify external forces, even in the presence of +Gaussian noise and abrupt impulse inputs. The results validate the robustness +of the method, showing improved stability and control of the Aerobat in dynamic +environments. This research contributes to advancements in bio-inspired +robotics by enhancing force estimation for flapping-wing systems, with +potential applications in autonomous aerial navigation and robust flight +control. + +
+
+
+
+
+ + ☆ Optimization free control and ground force estimation with momentum + observer for a multimodal legged aerial robot + + +
+ Legged-aerial multimodal robots can make the most of both legged and aerial +systems. In this paper, we propose a control framework that bypasses heavy +onboard computers by using an optimization-free Explicit Reference Governor +that incorporates external thruster forces from an attitude controller. Ground +reaction forces are maintained within friction cone constraints using costly +optimization solvers, but the ERG framework filters applied velocity references +that ensure no slippage at the foot end. We also propose a Conjugate momentum +observer, that is widely used in Disturbance Observation to estimate ground +reaction forces and compare its efficacy against a constrained model in +estimating ground reaction forces in a reduced-order simulation of Husky. + +
+
+ comment: 6 pages, 10 figures, submitted to American Control Conference 2025 +
+
+
+
+
+ + ☆ Is Locational Marginal Price All You Need for Locational Marginal + Emission? + + +
+ Growing concerns over climate change call for improved techniques for +estimating and quantifying the greenhouse gas emissions associated with +electricity generation and transmission. Among the emission metrics designated +for power grids, locational marginal emission (LME) can provide system +operators and electricity market participants with valuable information on the +emissions associated with electricity usage at various locations in the power +network. In this paper, by investigating the operating patterns and physical +interpretations of marginal emissions and costs in the security-constrained +economic dispatch (SCED) problem, we identify and draw the exact connection +between locational marginal price (LMP) and LME. Such interpretation helps +instantly derive LME given nodal demand vectors or LMP, and also reveals the +interplay between network congestion and nodal emission pattern. Our proposed +approach helps reduce the computation time of LME by an order of magnitude +compared to analytical approaches, while it can also serve as a plug-and-play +module accompanied by an off-the-shelf market clearing and LMP calculation +process. + +
+
+ comment: 8 pages, 5 figures, in submission +
+
+
+
+
+ + ☆ Stability and Performance Analysis on Self-dual Cones + + +
+ In this paper, we consider nonsymmetric solutions to certain Lyapunov and +Riccati equations and inequalities with coefficient matrices corresponding to +cone-preserving dynamical systems. Most results presented here appear to be +novel even in the special case of positive systems. First, we provide a simple +eigenvalue criterion for a Sylvester equation to admit a cone-preserving +solution. For a single system preserving a self-dual cone, this reduces to +stability. Further, we provide a set of conditions equivalent to testing a +given H-infinity norm bound, as in the bounded real lemma. These feature the +stability of a coefficient matrix similar to the Hamiltonian, a solution to two +conic inequalities, and a stabilizing cone-preserving solution to a +nonsymmetric Riccati equation. Finally, we show that the H-infinity norm is +attained at zero frequency. + +
+
+
+
+
+ + ☆ On-the-Go Path Planning and Repair in Static and Dynamic Scenarios + + +
+ Autonomous systems, including robots and drones, face significant challenges +when navigating through dynamic environments, particularly within urban +settings where obstacles, fluctuating traffic, and pedestrian activity are +constantly shifting. Although traditional motion planning algorithms like the +wavefront planner and gradient descent planner, which use potential functions, +work well in static environments, they fall short in situations where the +environment is continuously changing. This work proposes a dynamic, real-time +path planning approach specifically designed for autonomous systems, allowing +them to effectively avoid static and dynamic obstacles, thereby enhancing their +overall adaptability. The approach integrates the efficiency of conventional +planners with the ability to make rapid adjustments in response to moving +obstacles and environmental changes. The simulation results discussed in this +article demonstrate the effectiveness of the proposed method, showing its +suitability for robotic path planning in both known and unknown environments, +including those involving mobile objects, agents, or potential threats. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ A Robust Solver for Phasor-Domain Short-Circuit Analysis with + Inverter-Based Resources + + +
+ The integration of Inverter-Based Resource (IBR) model into phasor-domain +short circuit (SC) solvers challenges their numerical stability. To address the +challenge, this paper proposes a solver that improves numerical stability by +employing the Newton-Raphson iterative method. The solver can integrate the +latest implementation of IBR SC model in industry-standard fault analysis +programs including the voltage controlled current source tabular model as well +as vendor-specific black-box and white-box equation-based models. The superior +numerical stability of the proposed solver has been mathematically +demonstrated, with identified convergence conditions. An algorithm for the +implementation of the proposed solver in fault analysis programs has been +developed. The objective is to improve the capability of the industry to +accurately represent IBRs in SC studies and ensure system protection +reliability in an IBR-dominated future. + +
+
+
+
+
+ + ☆ Uncertainty Propagation and Minimization for Channel Estimation in + UAV-mounted RIS Systems + + +
+ Reconfigurable Intelligent Surfaces (RIS) are emerging as a key technology +for sixth-generation (6G) wireless networks, leveraging adjustable reflecting +elements to dynamically control electromagnetic wave propagation and optimize +wireless connectivity. By positioning the RIS on an unmanned aerial vehicle +(UAV), it can maintain line-of-sight and proximity to both the transmitter and +receiver, critical factors that mitigate path loss and enhance signal strength. +The lightweight, power-efficient nature of RIS makes UAV integration feasible, +yet the setup faces significant disturbances from UAV motion, which can degrade +RIS alignment and link performance. In this study, we address these challenges +using both experimental measurements and analytical methods. Using an extended +Kalman filter (EKF), we estimate the UAV's orientation in real time during +experimental flights to capture real disturbance effects. The resulting +orientation uncertainty is then propagated to the RIS's channel estimates by +applying the Guide to the Expression of Uncertainty in Measurement (GUM) +framework as well as complex-valued propagation techniques to accurately assess +and minimize the impact of UAV orientation uncertainties on RIS performance. +This method enables us to systematically trace and quantify how orientation +uncertainties affect channel gain and phase stability in real-time. Through +numerical simulations, we find that the uncertainty of the RIS channel link is +influenced by the RIS's configuration. Furthermore, our results demonstrate +that the uncertainty area is most accurately represented by an annular section, +enabling a 58% reduction in the uncertainty area while maintaining a 95% +coverage probability. + +
+
+ comment: 6 pages, 3 figures, submitted to IEEE International Conference on + Communications 2025 +
+
+
+
+
+ + ☆ Transmission Line Outage Probability Prediction Under Extreme Events + Using Peter-Clark Bayesian Structural Learning + + +
+ Recent years have seen a notable increase in the frequency and intensity of +extreme weather events. With a rising number of power outages caused by these +events, accurate prediction of power line outages is essential for safe and +reliable operation of power grids. The Bayesian network is a probabilistic +model that is very effective for predicting line outages under weather-related +uncertainties. However, most existing studies in this area offer general risk +assessments, but fall short of providing specific outage probabilities. In this +work, we introduce a novel approach for predicting transmission line outage +probabilities using a Bayesian network combined with Peter-Clark (PC) +structural learning. Our approach not only enables precise outage probability +calculations, but also demonstrates better scalability and robust performance, +even with limited data. Case studies using data from BPA and NOAA show the +effectiveness of this approach, while comparisons with several existing methods +further highlight its advantages. + +
+
+
+
+
+ + ♻ ☆ Flexibility of Integrated Power and Gas Systems: Gas Flow Modeling and + Solution Choices Matter + + +
+ Due to their slow gas flow dynamics, natural gas pipelines function as +short-term storage, the so-called linepack. By efficiently utilizing linepack, +the natural gas system can provide flexibility to the power system through the +flexible operation of gas-fired power plants. This requires accurately +representing the gas flow physics governed by partial differential equations. +Although several modeling and solution choices have been proposed in the +literature, their impact on the flexibility provision of gas networks to power +systems has not been thoroughly analyzed and compared. This paper bridges this +gap by first developing a unified framework. We harmonize existing approaches +and demonstrate their derivation from and application to the partial +differential equations. Secondly, based on the proposed framework, we +numerically analyze the implications of various modeling and solution choices +on the flexibility provision from gas networks to power systems. One key +conclusion is that relaxation-based approaches allow charging and discharging +the linepack at physically infeasible high rates, ultimately overestimating the +flexibility. + +
+
+
+
+
+ + ♻ ☆ Scalable spectral representations for multi-agent reinforcement learning + in network MDPs + + +
+ Network Markov Decision Processes (MDPs), a popular model for multi-agent +control, pose a significant challenge to efficient learning due to the +exponential growth of the global state-action space with the number of agents. +In this work, utilizing the exponential decay property of network dynamics, we +first derive scalable spectral local representations for network MDPs, which +induces a network linear subspace for the local $Q$-function of each agent. +Building on these local spectral representations, we design a scalable +algorithmic framework for continuous state-action network MDPs, and provide +end-to-end guarantees for the convergence of our algorithm. Empirically, we +validate the effectiveness of our scalable representation-based approach on two +benchmark problems, and demonstrate the advantages of our approach over generic +function approximation approaches to representing the local $Q$-functions. + +
+
+ comment: Updated title, corrected an issue with an author's name +
+
+
+
+
+ + ♻ ☆ Orthogonal Mode Decomposition for Finite Discrete Signals + + +
+ In this paper, an orthogonal mode decomposition method is proposed to +decompose finite length real signals on both the real and imaginary axes of the +complex plane. The interpolation function space of finite length discrete +signal is constructed, and the relationship between the dimensionality of the +interpolation function space and its subspaces and the band width of the +interpolation function is analyzed. It is proved that the intrinsic mode is +actually the narrow band signal whose intrinsic instantaneous frequency is +always positive (or always negative). Thus, the eigenmode decomposition problem +is transformed into the orthogonal projection problem of interpolation function +space to its low frequency subspace or narrow band subspace. Different from the +existing mode decomposition methods, the orthogonal modal decomposition is a +local time-frequency domain algorithm. Each operation extracts a specific mode. +The global decomposition results obtained under the precise definition of +eigenmodes have uniqueness and orthogonality. The computational complexity of +the orthogonal mode decomposition method is also much smaller than that of the +existing mode decomposition methods. + +
+
+
+
+
+ + ♻ ☆ Design of Distributed Controller for Discrete-Time Systems Via the + Integration of Extended LMI and Clique-Wise Decomposition + + +
+ This study addresses a design of distributed controllers for discrete-time +systems using linear matrix inequalities (LMIs). Sparsity constraints on +control gains of distributed controllers result in conservatism via the +convexification of the existing methods such as the extended LMI method. In +order to mitigate the conservatism, we introduce a novel LMI formulation for +this problem, utilizing the clique-wise decomposition method from our previous +work on continuous-time systems. By reformulating the sparsity constraint on +the gain matrix within cliques, this method achieves a broader solution set. +Also, the analytical superiority of our method is confirmed through numerical +examples. + +
+
+
+
+
+ + ♻ ☆ Improved Tangential Interpolation-based Multi-input Multi-output Modal + Analysis of a Full Aircraft + + +
+ In the field of Structural Dynamics, modal analysis is the foundation of +System Identification and vibration-based inspection. However, despite their +widespread use, current state-of-the-art methods for extracting modal +parameters from multi-input multi-output (MIMO) frequency domain data are still +affected by many technical limitations. Mainly, they can be computationally +cumbersome and/or negatively affected by close-in-frequency modes. The Loewner +Framework (LF) was recently proposed to alleviate these problems with the +limitation of working with single-input data only. This work proposes a +computationally improved version of the LF, or iLF, to extract modal parameters +more efficiently. Also, the proposed implementation is extended in order to +handle MIMO data in the frequency domain. This new implementation is compared +to state-of-the-art methods such as the frequency domain implementations of the +Least Square Complex Exponential method and the Numerical Algorithm for +Subspace State Space System Identification on numerical and experimental +datasets. More specifically, a finite element model of a 3D Euler-Bernoulli +beam is used for the baseline comparison and the noise robustness verification +of the proposed MIMO iLF algorithm. Then, an experimental dataset from MIMO +ground vibration tests of a trainer jet aircraft with over 91 accelerometer +channels is chosen for the algorithm validation on a real-life application. Its +validation is carried out with known results from a single-input multi-output +dataset of the starboard wing of the same aircraft. Excellent results are +achieved in terms of accuracy, robustness to noise, and computational +performance by the proposed improved MIMO method, both on the numerical and the +experimental datasets. The MIMO iLF MATLAB implementation is shared in the work +supplementary material. + +
+
+
+
+
+ + ♻ ☆ Homeostatic motion planning with innate physics knowledge + + +
+ Living organisms interact with their surroundings in a closed-loop fashion, +where sensory inputs dictate the initiation and termination of behaviours. Even +simple animals are able to develop and execute complex plans, which has not yet +been replicated in robotics using pure closed-loop input control. We propose a +solution to this problem by defining a set of discrete and temporary +closed-loop controllers, called "tasks", each representing a closed-loop +behaviour. We further introduce a supervisory module which has an innate +understanding of physics and causality, through which it can simulate the +execution of task sequences over time and store the results in a model of the +environment. On the basis of this model, plans can be made by chaining +temporary closed-loop controllers. The proposed framework was implemented for a +real robot and tested in two scenarios as proof of concept. + +
+
+
+
+
+ + ♻ ☆ Information-Theoretic Opacity-Enforcement in Markov Decision Processes + + +
+ The paper studies information-theoretic opacity, an information-flow privacy +property, in a setting involving two agents: A planning agent who controls a +stochastic system and an observer who partially observes the system states. The +goal of the observer is to infer some secret, represented by a random variable, +from its partial observations, while the goal of the planning agent is to make +the secret maximally opaque to the observer while achieving a satisfactory +total return. Modeling the stochastic system using a Markov decision process, +two classes of opacity properties are considered -- Last-state opacity is to +ensure that the observer is uncertain if the last state is in a specific set +and initial-state opacity is to ensure that the observer is unsure of the +realization of the initial state. As the measure of opacity, we employ the +Shannon conditional entropy capturing the information about the secret revealed +by the observable. Then, we develop primal-dual policy gradient methods for +opacity-enforcement planning subject to constraints on total returns. We +propose novel algorithms to compute the policy gradient of entropy for each +observation, leveraging message passing within the hidden Markov models. This +gradient computation enables us to have stable and fast convergence. We +demonstrate our solution of opacity-enforcement control through a grid world +example. + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Pairwise Markov Chains for Volatility Forecasting + + +
+ The Pairwise Markov Chain (PMC) is a probabilistic graphical model extending +the well-known Hidden Markov Model. This model, although highly effective for +many tasks, has been scarcely utilized for continuous value prediction. This is +mainly due to the issue of modeling observations inherent in generative +probabilistic models. In this paper, we introduce a new algorithm for +prediction with the PMC. On the one hand, this algorithm allows circumventing +the feature problem, thus fully exploiting the capabilities of the PMC. On the +other hand, it enables the PMC to extend any predictive model by introducing +hidden states, updated at each time step, and allowing the introduction of +non-stationarity for any model. We apply the PMC with its new algorithm for +volatility forecasting, which we compare to the highly popular GARCH(1,1) and +feedforward neural models across numerous pairs. This is particularly relevant +given the regime changes that we can observe in volatility. For each scenario, +our algorithm enhances the performance of the extended model, demonstrating the +value of our approach. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Tackling prediction tasks in relational databases with LLMs + + +
+ Though large language models (LLMs) have demonstrated exceptional performance +across numerous problems, their application to predictive tasks in relational +databases remains largely unexplored. In this work, we address the notion that +LLMs cannot yield satisfactory results on relational databases due to their +interconnected tables, complex relationships, and heterogeneous data types. +Using the recently introduced RelBench benchmark, we demonstrate that even a +straightforward application of LLMs achieves competitive performance on these +tasks. These findings establish LLMs as a promising new baseline for ML on +relational databases and encourage further research in this direction. + +
+
+
+
+
+ + ☆ KAN/MultKAN with Physics-Informed Spline fitting (KAN-PISF) for + ordinary/partial differential equation discovery of nonlinear dynamic systems + + +
+ Machine learning for scientific discovery is increasingly becoming popular +because of its ability to extract and recognize the nonlinear characteristics +from the data. The black-box nature of deep learning methods poses difficulties +in interpreting the identified model. There is a dire need to interpret the +machine learning models to develop a physical understanding of dynamic systems. +An interpretable form of neural network called Kolmogorov-Arnold networks (KAN) +or Multiplicative KAN (MultKAN) offers critical features that help recognize +the nonlinearities in the governing ordinary/partial differential equations +(ODE/PDE) of various dynamic systems and find their equation structures. In +this study, an equation discovery framework is proposed that includes i) +sequentially regularized derivatives for denoising (SRDD) algorithm to denoise +the measured data to obtain accurate derivatives, ii) KAN to identify the +equation structure and suggest relevant nonlinear functions that are used to +create a small overcomplete library of functions, and iii) physics-informed +spline fitting (PISF) algorithm to filter the excess functions from the library +and converge to the correct equation. The framework was tested on the forced +Duffing oscillator, Van der Pol oscillator (stiff ODE), Burgers' equation, and +Bouc-Wen model (coupled ODE). The proposed method converged to the true +equation for the first three systems. It provided an approximate model for the +Bouc-Wen model that could acceptably capture the hysteresis response. Using KAN +maintains low complexity, which helps the user interpret the results throughout +the process and avoid the black-box-type nature of machine learning methods. + +
+
+
+
+
+ + ☆ Competing Bandits in Decentralized Large Contextual Matching Markets + + +
+ Sequential learning in a multi-agent resource constrained matching market has +received significant interest in the past few years. We study decentralized +learning in two-sided matching markets where the demand side (aka players or +agents) competes for a `large' supply side (aka arms) with potentially +time-varying preferences, to obtain a stable match. Despite a long line of work +in the recent past, existing learning algorithms such as Explore-Then-Commit or +Upper-Confidence-Bound remain inefficient for this problem. In particular, the +per-agent regret achieved by these algorithms scales linearly with the number +of arms, $K$. Motivated by the linear contextual bandit framework, we assume +that for each agent an arm-mean can be represented by a linear function of a +known feature vector and an unknown (agent-specific) parameter. + Moreover, our setup captures the essence of a dynamic (non-stationary) +matching market where the preferences over arms change over time. Our proposed +algorithms achieve instance-dependent logarithmic regret, scaling independently +of the number of arms, $K$. + +
+
+
+
+
+ + ☆ A Potential Game Perspective in Federated Learning + + +
+ Federated learning (FL) is an emerging paradigm for training machine learning +models across distributed clients. Traditionally, in FL settings, a central +server assigns training efforts (or strategies) to clients. However, from a +market-oriented perspective, clients may independently choose their training +efforts based on rational self-interest. To explore this, we propose a +potential game framework where each client's payoff is determined by their +individual efforts and the rewards provided by the server. The rewards are +influenced by the collective efforts of all clients and can be modulated +through a reward factor. Our study begins by establishing the existence of Nash +equilibria (NEs), followed by an investigation of uniqueness in homogeneous +settings. We demonstrate a significant improvement in clients' training efforts +at a critical reward factor, identifying it as the optimal choice for the +server. Furthermore, we prove the convergence of the best-response algorithm to +compute NEs for our FL game. Finally, we apply the training efforts derived +from specific NEs to a real-world FL scenario, validating the effectiveness of +the identified optimal reward factor. + +
+
+
+
+
+ + ☆ Parallelly Tempered Generative Adversarial Networks + + +
+ A generative adversarial network (GAN) has been a representative backbone +model in generative artificial intelligence (AI) because of its powerful +performance in capturing intricate data-generating processes. However, the GAN +training is well-known for its notorious training instability, usually +characterized by the occurrence of mode collapse. Through the lens of +gradients' variance, this work particularly analyzes the training instability +and inefficiency in the presence of mode collapse by linking it to +multimodality in the target distribution. To ease the raised training issues +from severe multimodality, we introduce a novel GAN training framework that +leverages a series of tempered distributions produced via convex interpolation. +With our newly developed GAN objective function, the generator can learn all +the tempered distributions simultaneously, conceptually resonating with the +parallel tempering in Statistics. Our simulation studies demonstrate the +superiority of our approach over existing popular training strategies in both +image and tabular data synthesis. We theoretically analyze that such +significant improvement can arise from reducing the variance of gradient +estimates by using the tempered distributions. Finally, we further develop a +variant of the proposed framework aimed at generating fair synthetic data which +is one of the growing interests in the field of trustworthy AI. + +
+
+
+
+
+ + ☆ LLM-IE: A Python Package for Generative Information Extraction with + Large Language Models + + +
+ Objectives: Despite the recent adoption of large language models (LLMs) for +biomedical information extraction, challenges in prompt engineering and +algorithms persist, with no dedicated software available. To address this, we +developed LLM-IE: a Python package for building complete information extraction +pipelines. Our key innovation is an interactive LLM agent to support schema +definition and prompt design. + Materials and Methods: The LLM-IE supports named entity recognition, entity +attribute extraction, and relation extraction tasks. We benchmarked on the i2b2 +datasets and conducted a system evaluation. + Results: The sentence-based prompting algorithm resulted in the best +performance while requiring a longer inference time. System evaluation provided +intuitive visualization. + Discussion: LLM-IE was designed from practical NLP experience in healthcare +and has been adopted in internal projects. It should hold great value to the +biomedical NLP community. + Conclusion: We developed a Python package, LLM-IE, that provides building +blocks for robust information extraction pipeline construction. + +
+
+
+
+
+ + ☆ Drowning in Documents: Consequences of Scaling Reranker Inference + + +
+ Rerankers, typically cross-encoders, are often used to re-score the documents +retrieved by cheaper initial IR systems. This is because, though expensive, +rerankers are assumed to be more effective. We challenge this assumption by +measuring reranker performance for full retrieval, not just re-scoring +first-stage retrieval. Our experiments reveal a surprising trend: the best +existing rerankers provide diminishing returns when scoring progressively more +documents and actually degrade quality beyond a certain limit. In fact, in this +setting, rerankers can frequently assign high scores to documents with no +lexical or semantic overlap with the query. We hope that our findings will spur +future research to improve reranking. + +
+
+
+
+
+ + ☆ Freezing of Gait Detection Using Gramian Angular Fields and Federated + Learning from Wearable Sensors + + +
+ Freezing of gait (FOG) is a debilitating symptom of Parkinson's disease (PD) +that impairs mobility and safety. Traditional detection methods face challenges +due to intra and inter-patient variability, and most systems are tested in +controlled settings, limiting their real-world applicability. Addressing these +gaps, we present FOGSense, a novel FOG detection system designed for +uncontrolled, free-living conditions. It uses Gramian Angular Field (GAF) +transformations and federated deep learning to capture temporal and spatial +gait patterns missed by traditional methods. We evaluated our FOGSense system +using a public PD dataset, 'tdcsfog'. FOGSense improves accuracy by 10.4% over +a single-axis accelerometer, reduces failure points compared to multi-sensor +systems, and demonstrates robustness to missing values. The federated +architecture allows personalized model adaptation and efficient smartphone +synchronization during off-peak hours, making it effective for long-term +monitoring as symptoms evolve. Overall, FOGSense achieves a 22.2% improvement +in F1-score compared to state-of-the-art methods, along with enhanced +sensitivity for FOG episode detection. Code is available: +https://github.com/shovito66/FOGSense. + +
+
+
+
+
+ + ☆ Mapping out the Space of Human Feedback for Reinforcement Learning: A + Conceptual Framework + + +
+ Reinforcement Learning from Human feedback (RLHF) has become a powerful tool +to fine-tune or train agentic machine learning models. Similar to how humans +interact in social contexts, we can use many types of feedback to communicate +our preferences, intentions, and knowledge to an RL agent. However, +applications of human feedback in RL are often limited in scope and disregard +human factors. In this work, we bridge the gap between machine learning and +human-computer interaction efforts by developing a shared understanding of +human feedback in interactive learning scenarios. We first introduce a taxonomy +of feedback types for reward-based learning from human feedback based on nine +key dimensions. Our taxonomy allows for unifying human-centered, +interface-centered, and model-centered aspects. In addition, we identify seven +quality metrics of human feedback influencing both the human ability to express +feedback and the agent's ability to learn from the feedback. Based on the +feedback taxonomy and quality criteria, we derive requirements and design +choices for systems learning from human feedback. We relate these requirements +and design choices to existing work in interactive machine learning. In the +process, we identify gaps in existing work and future research opportunities. +We call for interdisciplinary collaboration to harness the full potential of +reinforcement learning with data-driven co-adaptive modeling and varied +interaction mechanics. + +
+
+
+
+
+ + ☆ Debiased Regression for Root-N-Consistent Conditional Mean Estimation + + +
+ This study introduces a debiasing method for regression estimators, including +high-dimensional and nonparametric regression estimators. For example, +nonparametric regression methods allow for the estimation of regression +functions in a data-driven manner with minimal assumptions; however, these +methods typically fail to achieve $\sqrt{n}$-consistency in their convergence +rates, and many, including those in machine learning, lack guarantees that +their estimators asymptotically follow a normal distribution. To address these +challenges, we propose a debiasing technique for nonparametric estimators by +adding a bias-correction term to the original estimators, extending the +conventional one-step estimator used in semiparametric analysis. Specifically, +for each data point, we estimate the conditional expected residual of the +original nonparametric estimator, which can, for instance, be computed using +kernel (Nadaraya-Watson) regression, and incorporate it as a bias-reduction +term. Our theoretical analysis demonstrates that the proposed estimator +achieves $\sqrt{n}$-consistency and asymptotic normality under a mild +convergence rate condition for both the original nonparametric estimator and +the conditional expected residual estimator. Notably, this approach remains +model-free as long as the original estimator and the conditional expected +residual estimator satisfy the convergence rate condition. The proposed method +offers several advantages, including improved estimation accuracy and +simplified construction of confidence intervals. + +
+
+
+
+
+ + ☆ BitMoD: Bit-serial Mixture-of-Datatype LLM Acceleration + + +
+ Large language models (LLMs) have demonstrated remarkable performance across +various machine learning tasks. Yet the substantial memory footprint of LLMs +significantly hinders their deployment. In this paper, we improve the +accessibility of LLMs through BitMoD, an algorithm-hardware co-design solution +that enables efficient LLM acceleration at low weight precision. On the +algorithm side, BitMoD introduces fine-grained data type adaptation that uses a +different numerical data type to quantize a group of (e.g., 128) weights. +Through the careful design of these new data types, BitMoD is able to quantize +LLM weights to very low precision (e.g., 4 bits and 3 bits) while maintaining +high accuracy. On the hardware side, BitMoD employs a bit-serial processing +element to easily support multiple numerical precisions and data types; our +hardware design includes two key innovations: First, it employs a unified +representation to process different weight data types, thus reducing the +hardware cost. Second, it adopts a bit-serial dequantization unit to rescale +the per-group partial sum with minimal hardware overhead. Our evaluation on six +representative LLMs demonstrates that BitMoD significantly outperforms +state-of-the-art LLM quantization and acceleration methods. For discriminative +tasks, BitMoD can quantize LLM weights to 4-bit with $<\!0.5\%$ accuracy loss +on average. For generative tasks, BitMoD is able to quantize LLM weights to +3-bit while achieving better perplexity than prior LLM quantization schemes. +Combining the superior model performance with an efficient accelerator design, +BitMoD achieves an average of $1.69\times$ and $1.48\times$ speedups compared +to prior LLM accelerators ANT and OliVe, respectively. + +
+
+ comment: HPCA 2025 +
+
+
+
+
+ + ☆ Revitalizing Electoral Trust: Enhancing Transparency and Efficiency + through Automated Voter Counting with Machine Learning + + +
+ In order to address issues with manual vote counting during election +procedures, this study intends to examine the viability of using advanced image +processing techniques for automated voter counting. The study aims to shed +light on how automated systems that utilize cutting-edge technologies like +OpenCV, CVZone, and the MOG2 algorithm could greatly increase the effectiveness +and openness of electoral operations. The empirical findings demonstrate how +automated voter counting can enhance voting processes and rebuild public +confidence in election outcomes, particularly in places where trust is low. The +study also emphasizes how rigorous metrics, such as the F1 score, should be +used to systematically compare the accuracy of automated systems against manual +counting methods. This methodology enables a detailed comprehension of the +differences in performance between automated and human counting techniques by +providing a nuanced assessment. The incorporation of said measures serves to +reinforce an extensive assessment structure, guaranteeing the legitimacy and +dependability of automated voting systems inside the electoral sphere. + +
+
+ comment: 13 Pages, 4 Figures +
+
+
+
+
+ + ☆ Lifted Model Construction without Normalisation: A Vectorised Approach + to Exploit Symmetries in Factor Graphs + + +
+ Lifted probabilistic inference exploits symmetries in a probabilistic model +to allow for tractable probabilistic inference with respect to domain sizes of +logical variables. We found that the current state-of-the-art algorithm to +construct a lifted representation in form of a parametric factor graph misses +symmetries between factors that are exchangeable but scaled differently, +thereby leading to a less compact representation. In this paper, we propose a +generalisation of the advanced colour passing (ACP) algorithm, which is the +state of the art to construct a parametric factor graph. Our proposed algorithm +allows for potentials of factors to be scaled arbitrarily and efficiently +detects more symmetries than the original ACP algorithm. By detecting strictly +more symmetries than ACP, our algorithm significantly reduces online query +times for probabilistic inference when the resulting model is applied, which we +also confirm in our experiments. + +
+
+ comment: Accepted to the Proceedings of the 3rd Learning on Graphs Conference + (LoG 2024) +
+
+
+
+
+ + ☆ Aligning Few-Step Diffusion Models with Dense Reward Difference Learning + + +
+ Aligning diffusion models with downstream objectives is essential for their +practical applications. However, standard alignment methods often struggle with +step generalization when directly applied to few-step diffusion models, leading +to inconsistent performance across different denoising step scenarios. To +address this, we introduce Stepwise Diffusion Policy Optimization (SDPO), a +novel alignment method tailored for few-step diffusion models. Unlike prior +approaches that rely on a single sparse reward from only the final step of each +denoising trajectory for trajectory-level optimization, SDPO incorporates dense +reward feedback at every intermediate step. By learning the differences in +dense rewards between paired samples, SDPO facilitates stepwise optimization of +few-step diffusion models, ensuring consistent alignment across all denoising +steps. To promote stable and efficient training, SDPO introduces an online +reinforcement learning framework featuring several novel strategies designed to +effectively exploit the stepwise granularity of dense rewards. Experimental +results demonstrate that SDPO consistently outperforms prior methods in +reward-based alignment across diverse step configurations, underscoring its +robust step generalization capabilities. Code is available at +https://github.com/ZiyiZhang27/sdpo. + +
+
+
+
+
+ + ☆ FLMarket: Enabling Privacy-preserved Pre-training Data Pricing for + Federated Learning + + +
+ Federated Learning (FL), as a mainstream privacy-preserving machine learning +paradigm, offers promising solutions for privacy-critical domains such as +healthcare and finance. Although extensive efforts have been dedicated from +both academia and industry to improve the vanilla FL, little work focuses on +the data pricing mechanism. In contrast to the straightforward in/post-training +pricing techniques, we study a more difficult problem of pre-training pricing +without direct information from the learning process. We propose FLMarket that +integrates a two-stage, auction-based pricing mechanism with a security +protocol to address the utility-privacy conflict. Through comprehensive +experiments, we show that the client selection according to FLMarket can +achieve more than 10% higher accuracy in subsequent FL training compared to +state-of-the-art methods. In addition, it outperforms the in-training baseline +with more than 2% accuracy increase and 3x run-time speedup. + +
+
+
+
+
+ + ☆ Robust Reinforcement Learning under Diffusion Models for Data with Jumps + + +
+ Reinforcement Learning (RL) has proven effective in solving complex +decision-making tasks across various domains, but challenges remain in +continuous-time settings, particularly when state dynamics are governed by +stochastic differential equations (SDEs) with jump components. In this paper, +we address this challenge by introducing the Mean-Square Bipower Variation +Error (MSBVE) algorithm, which enhances robustness and convergence in scenarios +involving significant stochastic noise and jumps. We first revisit the +Mean-Square TD Error (MSTDE) algorithm, commonly used in continuous-time RL, +and highlight its limitations in handling jumps in state dynamics. The proposed +MSBVE algorithm minimizes the mean-square quadratic variation error, offering +improved performance over MSTDE in environments characterized by SDEs with +jumps. Simulations and formal proofs demonstrate that the MSBVE algorithm +reliably estimates the value function in complex settings, surpassing MSTDE's +performance when faced with jump processes. These findings underscore the +importance of alternative error metrics to improve the resilience and +effectiveness of RL algorithms in continuous-time frameworks. + +
+
+
+
+
+ + ☆ Learning Differentiable Surrogate Losses for Structured Prediction + + +
+ Structured prediction involves learning to predict complex structures rather +than simple scalar values. The main challenge arises from the non-Euclidean +nature of the output space, which generally requires relaxing the problem +formulation. Surrogate methods build on kernel-induced losses or more +generally, loss functions admitting an Implicit Loss Embedding, and convert the +original problem into a regression task followed by a decoding step. However, +designing effective losses for objects with complex structures presents +significant challenges and often requires domain-specific expertise. In this +work, we introduce a novel framework in which a structured loss function, +parameterized by neural networks, is learned directly from output training data +through Contrastive Learning, prior to addressing the supervised surrogate +regression problem. As a result, the differentiable loss not only enables the +learning of neural networks due to the finite dimension of the surrogate space +but also allows for the prediction of new structures of the output data via a +decoding strategy based on gradient descent. Numerical experiments on +supervised graph prediction problems show that our approach achieves similar or +even better performance than methods based on a pre-defined kernel. + +
+
+
+
+
+ + ☆ PSPO*: An Effective Process-supervised Policy Optimization for Reasoning + Alignment + + +
+ Process supervision enhances the performance of large language models in +reasoning tasks by providing feedback at each step of chain-of-thought +reasoning. However, due to the lack of effective process supervision methods, +even advanced large language models are prone to logical errors and redundant +reasoning. We claim that the effectiveness of process supervision significantly +depends on both the accuracy and the length of reasoning chains. Moreover, we +identify that these factors exhibit a nonlinear relationship with the overall +reward score of the reasoning process. Inspired by these insights, we propose a +novel process supervision paradigm, PSPO*, which systematically outlines the +workflow from reward model training to policy optimization, and highlights the +importance of nonlinear rewards in process supervision. Based on PSPO*, we +develop the PSPO-WRS, which considers the number of reasoning steps in +determining reward scores and utilizes an adjusted Weibull distribution for +nonlinear reward shaping. Experimental results on six mathematical reasoning +datasets demonstrate that PSPO-WRS consistently outperforms current mainstream +models. + +
+
+
+
+
+ + ☆ Analysis of Hardware Synthesis Strategies for Machine Learning in + Collider Trigger and Data Acquisition + + +
+ To fully exploit the physics potential of current and future high energy +particle colliders, machine learning (ML) can be implemented in detector +electronics for intelligent data processing and acquisition. The implementation +of ML in real-time at colliders requires very low latencies that are +unachievable with a software-based approach, requiring optimization and +synthesis of ML algorithms for deployment on hardware. An analysis of neural +network inference efficiency is presented, focusing on the application of +collider trigger algorithms in field programmable gate arrays (FPGAs). +Trade-offs are evaluated between two frameworks, the SLAC Neural Network +Library (SNL) and hls4ml, in terms of resources and latency for different model +sizes. Results highlight the strengths and limitations of each approach, +offering valuable insights for optimizing real-time neural network deployments +at colliders. This work aims to guide researchers and engineers in selecting +the most suitable hardware and software configurations for real-time, +resource-constrained environments. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Few-shot Model Extraction Attacks against Sequential Recommender Systems + + +
+ Among adversarial attacks against sequential recommender systems, model +extraction attacks represent a method to attack sequential recommendation +models without prior knowledge. Existing research has primarily concentrated on +the adversary's execution of black-box attacks through data-free model +extraction. However, a significant gap remains in the literature concerning the +development of surrogate models by adversaries with access to few-shot raw data +(10\% or even less). That is, the challenge of how to construct a surrogate model +with high functional similarity within the context of few-shot data scenarios +remains an issue that requires resolution. This study addresses this gap by +introducing a novel few-shot model extraction framework against sequential +recommenders, which is designed to construct a superior surrogate model with +the utilization of few-shot data. The proposed few-shot model extraction +framework is comprised of two components: an autoregressive augmentation +generation strategy and a bidirectional repair loss-facilitated model +distillation procedure. Specifically, to generate synthetic data that closely +approximate the distribution of raw data, autoregressive augmentation +generation strategy integrates a probabilistic interaction sampler to extract +inherent dependencies and a synthesis determinant signal module to characterize +user behavioral patterns. Subsequently, bidirectional repair loss, which targets +the discrepancies between the recommendation lists, is designed as an auxiliary +loss to rectify erroneous predictions from surrogate models, transferring +knowledge from the victim model to the surrogate model effectively. Experiments +on three datasets show that the proposed few-shot model extraction framework +yields superior surrogate models. + +
+
+
+
+
+ + ☆ Artificial Scientific Discovery + + +
+ Rooted in the explosion of deep learning over the past decade, this thesis +spans from AlphaGo to ChatGPT to empirically examine the fundamental concepts +needed to realize the vision of an artificial scientist: a machine with the +capacity to autonomously generate original research and contribute to the +expansion of human knowledge. The investigation begins with {\sc Olivaw}, an +AlphaGo Zero-like agent that discovers Othello knowledge from scratch but is +unable to communicate it. This realization leads to the development of the +Explanatory Learning (EL) framework, a formalization of the problem faced by a +scientist when trying to explain a new phenomenon to their peers. The effective +EL prescriptions allow us to crack Zendo, a board game simulating the +scientific endeavor. This success comes with a fundamental insight: an +artificial scientist must develop its own interpretation of the language used +to explain its findings. This perspective then leads us to see modern +multimodal models as interpreters, and to devise a new way to build +interpretable and cost-effective CLIP-like models: by coupling two unimodal +models using little multimodal data and no further training. Finally, we +discuss what ChatGPT and its siblings are still missing to become artificial +scientists, and introduce Odeen, a benchmark about interpreting explanations +that sees LLMs going no further than random chance while being instead fully +solved by humans. + +
+
+ comment: PhD thesis, 123 pages +
+
+
+
+
+ + ☆ Efficient and Robust Continual Graph Learning for Graph Classification + in Biology + + +
+ Graph classification is essential for understanding complex biological +systems, where molecular structures and interactions are naturally represented +as graphs. Traditional graph neural networks (GNNs) perform well on static +tasks but struggle in dynamic settings due to catastrophic forgetting. We +present Perturbed and Sparsified Continual Graph Learning (PSCGL), a robust and +efficient continual graph learning framework for graph data classification, +specifically targeting biological datasets. We introduce a perturbed sampling +strategy to identify critical data points that contribute to model learning and +a motif-based graph sparsification technique to reduce storage needs while +maintaining performance. Additionally, our PSCGL framework inherently defends +against graph backdoor attacks, which is crucial for applications in sensitive +biological contexts. Extensive experiments on biological datasets demonstrate +that PSCGL not only retains knowledge across tasks but also enhances the +efficiency and robustness of graph classification models in biology. + +
+
+
+
+
+ + ☆ Dissecting Misalignment of Multimodal Large Language Models via + Influence Function + + +
+ Multi-modal Large Language models (MLLMs) are always trained on data from +diverse and unreliable sources, which may contain misaligned or mislabeled +text-image pairs. This frequently causes robustness issues and hallucinations, +leading to performance degradation. Data valuation is an efficient way to +detect and trace these misalignments. Nevertheless, existing methods are +computationally expensive for MLLMs. While computationally efficient, the +classical influence functions are inadequate for contrastive learning models +because they were originally designed for pointwise loss. Additionally, +contrastive learning involves minimizing the distance between the modalities of +positive samples and maximizing the distance between the modalities of negative +samples. This requires us to evaluate the influence of samples from both +perspectives. To tackle these challenges, we introduce the Extended Influence +Function for Contrastive Loss (ECIF), an influence function crafted for +contrastive loss. ECIF considers both positive and negative samples and +provides a closed-form approximation of contrastive learning models, +eliminating the need for retraining. Building upon ECIF, we develop a series of +algorithms for data evaluation in MLLM, misalignment detection, and +misprediction trace-back tasks. Experimental results demonstrate our ECIF +advances the transparency and interpretability of MLLMs by offering a more +accurate assessment of data impact and model alignment compared to traditional +baseline methods. + +
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ No-regret Exploration in Shuffle Private Reinforcement Learning + + +
+ Differential privacy (DP) has recently been introduced into episodic +reinforcement learning (RL) to formally address user privacy concerns in +personalized services. Previous work mainly focuses on two trust models of DP: +the central model, where a central agent is responsible for protecting users' +sensitive data, and the (stronger) local model, where the protection occurs +directly on the user side. However, they either require a trusted central agent +or incur a significantly higher privacy cost, making it unsuitable for many +scenarios. This work introduces a trust model stronger than the central model +but with a lower privacy cost than the local model, leveraging the emerging +\emph{shuffle} model of privacy. We present the first generic algorithm for +episodic RL under the shuffle model, where a trusted shuffler randomly permutes +a batch of users' data before sending it to the central agent. We then +instantiate the algorithm using our proposed shuffle Privatizer, relying on a +shuffle private binary summation mechanism. Our analysis shows that the +algorithm achieves a near-optimal regret bound comparable to that of the +centralized model and significantly outperforms the local model in terms of +privacy cost. + +
+
+
+
+
+ + ☆ TSINR: Capturing Temporal Continuity via Implicit Neural Representations + for Time Series Anomaly Detection + + +
+ Time series anomaly detection aims to identify unusual patterns in data or +deviations from systems' expected behavior. The reconstruction-based methods +are the mainstream in this task, which learn point-wise representation via +unsupervised learning. However, the unlabeled anomaly points in training data +may cause these reconstruction-based methods to learn and reconstruct anomalous +data, resulting in the challenge of capturing normal patterns. In this paper, +we propose a time series anomaly detection method based on implicit neural +representation (INR) reconstruction, named TSINR, to address this challenge. +Due to the property of spectral bias, TSINR enables prioritizing low-frequency +signals and exhibiting poorer performance on high-frequency abnormal data. +Specifically, we adopt INR to parameterize time series data as a continuous +function and employ a transformer-based architecture to predict the INR of +given data. As a result, the proposed TSINR method achieves the advantage of +capturing the temporal continuity and thus is more sensitive to discontinuous +anomaly data. In addition, we further design a novel form of INR continuous +function to learn inter- and intra-channel information, and leverage a +pre-trained large language model to amplify the intense fluctuations in +anomalies. Extensive experiments demonstrate that TSINR achieves superior +overall performance on both univariate and multivariate time series anomaly +detection benchmarks compared to other state-of-the-art reconstruction-based +methods. Our codes are available. + +
+
+ comment: Accepted by SIGKDD 2025 +
+
+
+
+
+ + ☆ ST-Tree with Interpretability for Multivariate Time Series + Classification + + +
+ Multivariate time series classification is of great importance in practical +applications and is a challenging task. However, deep neural network models +such as Transformers exhibit high accuracy in multivariate time series +classification but lack interpretability and fail to provide insights into the +decision-making process. On the other hand, traditional approaches based on +decision tree classifiers offer clear decision processes but relatively lower +accuracy. Swin Transformer (ST) addresses these issues by leveraging +self-attention mechanisms to capture both fine-grained local patterns and +global patterns. It can also model multi-scale feature representation learning, +thereby providing a more comprehensive representation of time series features. +To tackle the aforementioned challenges, we propose ST-Tree with +interpretability for multivariate time series classification. Specifically, the +ST-Tree model combines ST as the backbone network with an additional neural +tree model. This integration allows us to fully leverage the advantages of ST +in learning time series context while providing interpretable decision +processes through the neural tree. This enables researchers to gain clear +insights into the model's decision-making process and extract meaningful +interpretations. Through experimental evaluations on 10 UEA datasets, we +demonstrate that the ST-Tree model improves accuracy in multivariate time +series classification tasks and provides interpretability through visualizing +the decision-making process across different datasets. + +
+
+ comment: Submitted on May 15, 2024, major revisions on Aug 31, 2024 +
+
+
+
+
+ + ☆ FERT: Real-Time Facial Expression Recognition with Short-Range FMCW + Radar + + +
+ This study proposes a novel approach for real-time facial expression +recognition utilizing short-range Frequency-Modulated Continuous-Wave (FMCW) +radar equipped with one transmit (Tx), and three receive (Rx) antennas. The +system leverages four distinct modalities simultaneously: Range-Doppler images +(RDIs), micro range-Doppler Images (micro-RDIs), range azimuth images (RAIs), +and range elevation images (REIs). Our innovative architecture integrates +feature extractor blocks, intermediate feature extractor blocks, and a ResNet +block to accurately classify facial expressions into smile, anger, neutral, and +no-face classes. Our model achieves an average classification accuracy of +98.91% on the dataset collected using a 60 GHz short-range FMCW radar. The +proposed solution operates in real-time in a person-independent manner, which +shows the potential use of low-cost FMCW radars for effective facial expression +recognition in various applications. + +
+
+ comment: Accepted at IEEE SENSORS 2024 +
+
+
+
+
+ + ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ☆ On the physics of nested Markov models: a generalized probabilistic + theory perspective + + +
+ Determining potential probability distributions with a given causal graph is +vital for causality studies. To bypass the difficulty in characterizing latent +variables in a Bayesian network, the nested Markov model provides an elegant +algebraic approach by listing exactly all the equality constraints on the +observed variables. However, this algebraically motivated causal model +comprises distributions outside Bayesian networks, and its physical +interpretation remains vague. In this work, we inspect the nested Markov model +through the lens of generalized probabilistic theory, an axiomatic framework to +describe general physical theories. We prove that all the equality constraints +defining the nested Markov model hold valid theory-independently. Yet, we show +this model generally contains distributions not implementable even within such +relaxed physical theories subjected to merely the relativity principles and +mild probabilistic rules. To interpret the origin of such a gap, we establish a +new causal model that defines valid distributions as projected from a +high-dimensional Bell-type causal structure. The new model unveils inequality +constraints induced by relativity principles, or equivalently high-dimensional +conditional independences, which are absent in the nested Markov model. +Nevertheless, we also notice that the restrictions on states and measurements +introduced by the generalized probabilistic theory framework can pose +additional inequality constraints beyond the new causal model. As a by-product, +we discover a new causal structure exhibiting strict gaps between the +distribution sets of a Bayesian network, generalized probabilistic theories, +and the nested Markov model. We anticipate our results will enlighten further +explorations on the unification of algebraic and physical perspectives of +causality. + +
+
+ comment: 21 pages, 5 figures, 5 tables; Comments are welcome! +
+
+
+
+
+ + ☆ Feature Selection for Network Intrusion Detection + + +
+ Network Intrusion Detection (NID) remains a key area of research within the +information security community, while also being relevant to Machine Learning +(ML) practitioners. The latter generally aim to detect attacks using network +features, which have been extracted from raw network data typically using +dimensionality reduction methods, such as principal component analysis (PCA). +However, PCA is not able to assess the relevance of features for the task at +hand. Consequently, the features available are of varying quality, with some +being entirely non-informative. From this, two major drawbacks arise. Firstly, +trained and deployed models have to process large amounts of unnecessary data, +therefore draining potentially costly resources. Secondly, the noise caused by +the presence of irrelevant features can, in some cases, impede a model's +ability to detect an attack. In order to deal with these challenges, we present +Feature Selection for Network Intrusion Detection (FSNID), a novel +information-theoretic method that facilitates the exclusion of non-informative +features when detecting network intrusions. The proposed method is based on +function approximation using a neural network, which enables a version of our +approach that incorporates a recurrent layer. Consequently, this version +uniquely enables the integration of temporal dependencies. Through an extensive +set of experiments, we demonstrate that the proposed method selects a +significantly reduced feature set, while maintaining NID performance. Code will +be made available upon publication. + +
+
+
+
+
+ + ☆ Generative Spatio-temporal GraphNet for Transonic Wing Pressure + Distribution Forecasting + + +
+ This study presents a framework for predicting unsteady transonic wing +pressure distributions, integrating an autoencoder architecture with graph +convolutional networks and graph-based temporal layers to model time +dependencies. The framework compresses high-dimensional pressure distribution +data into a lower-dimensional latent space using an autoencoder, ensuring +efficient data representation while preserving essential features. Within this +latent space, graph-based temporal layers are employed to predict future wing +pressures based on past data, effectively capturing temporal dependencies and +improving predictive accuracy. This combined approach leverages the strengths +of autoencoders for dimensionality reduction, graph convolutional networks for +handling unstructured grid data, and temporal layers for modeling time-based +sequences. The effectiveness of the proposed framework is validated through its +application to the Benchmark Super Critical Wing test case, achieving accuracy +comparable to computational fluid dynamics, while significantly reducing +prediction time. This framework offers a scalable, computationally efficient +solution for the aerodynamic analysis of unsteady phenomena. + +
+
+
+
+
+ + ☆ Robust Causal Analysis of Linear Cyclic Systems With Hidden Confounders + + +
+ We live in a world full of complex systems which we need to improve our +understanding of. To accomplish this, purely probabilistic investigations are +often not enough. They are only the first step and must be followed by learning +the system's underlying mechanisms. This is what the discipline of causality is +concerned with. Many of those complex systems contain feedback loops which +means that our methods have to allow for cyclic causal relations. Furthermore, +systems are rarely sufficiently isolated, which means that there are usually +hidden confounders, i.e., unmeasured variables that each causally affects more +than one measured variable. Finally, data is often distorted by contaminating +processes, and we need to apply methods that are robust against such +distortions. That's why we consider the robustness of LLC, see \cite{llc}, one +of the few causal analysis methods that can deal with cyclic models with hidden +confounders. Following a theoretical analysis of LLC's robustness properties, +we also provide robust extensions of LLC. To facilitate reproducibility and +further research in this field, we make the source code publicly available. + +
+
+ comment: 18 pages, 2 figures +
+
+
+
+
+ + ☆ Hybrid Data-Driven SSM for Interpretable and Label-Free mmWave Channel + Prediction + + +
+ Accurate prediction of mmWave time-varying channels is essential for +mitigating the issue of channel aging in complex scenarios owing to high user +mobility. Existing channel prediction methods have limitations: classical +model-based methods often struggle to track highly nonlinear channel dynamics +due to limited expert knowledge, while emerging data-driven methods typically +require substantial labeled data for effective training and often lack +interpretability. To address these issues, this paper proposes a novel hybrid +method that integrates a data-driven neural network into a conventional +model-based workflow based on a state-space model (SSM), implicitly tracking +complex channel dynamics from data without requiring precise expert knowledge. +Additionally, a novel unsupervised learning strategy is developed to train the +embedded neural network solely with unlabeled data. Theoretical analyses and +ablation studies are conducted to interpret the enhanced benefits gained from +the hybrid integration. Numerical simulations based on the 3GPP mmWave channel +model corroborate the superior prediction accuracy of the proposed method, +compared to state-of-the-art methods that are either purely model-based or +data-driven. Furthermore, extensive experiments validate its robustness against +various challenging factors, including among others severe channel variations +and high noise levels. + +
+
+
+
+
+ + ☆ GNN-Based Code Annotation Logic for Establishing Security Boundaries in + C Code + + +
+ Securing sensitive operations in today's interconnected software landscape is +crucial yet challenging. Modern platforms rely on Trusted Execution +Environments (TEEs), such as Intel SGX and ARM TrustZone, to isolate security +sensitive code from the main system, reducing the Trusted Computing Base (TCB) +and providing stronger assurances. However, identifying which code should +reside in TEEs is complex and requires specialized expertise, which is not +supported by current automated tools. Existing solutions often migrate entire +applications to TEEs, leading to suboptimal use and an increased TCB. To +address this gap, we propose Code Annotation Logic (CAL), a pioneering tool +that automatically identifies security sensitive components for TEE isolation. +CAL analyzes codebases, leveraging a graph-based approach with novel feature +construction and employing a custom graph neural network model to accurately +determine which parts of the code should be isolated. CAL effectively optimizes +TCB, reducing the burden of manual analysis and enhancing overall security. Our +contributions include the definition of security sensitive code, the +construction and labeling of a comprehensive dataset of source files, a feature +rich graph based data preparation pipeline, and the CAL model for TEE +integration. Evaluation results demonstrate CAL's efficacy in identifying +sensitive code with a recall of 86.05%, an F1 score of 81.56%, and an +identification rate of 91.59% for security sensitive functions. By enabling +efficient code isolation, CAL advances the secure development of applications +using TEEs, offering a practical solution for developers to reduce attack +vectors. + +
+
+ comment: Submitted to the IEEE Symposium on Security and Privacy 2025 +
+
+
+
+
+ + ☆ Data-driven model reconstruction for nonlinear wave dynamics + + +
+ The use of machine learning to predict wave dynamics is a topic of growing +interest, but commonly-used deep learning approaches suffer from a lack of +interpretability of the trained models. Here we present an interpretable +machine learning framework for analyzing the nonlinear evolution dynamics of +optical wavepackets in complex wave media. We use sparse regression to reduce +microscopic discrete lattice models to simpler effective continuum models which +can accurately describe the dynamics of the wavepacket envelope. We apply our +approach to valley-Hall domain walls in honeycomb photonic lattices of +laser-written waveguides with Kerr-type nonlinearity and different boundary +shapes. The reconstructed equations accurately reproduce the linear dispersion +and nonlinear effects including self-steepening and self-focusing. This scheme +is proven free of the a priori limitations imposed by the underlying hierarchy +of scales traditionally employed in asymptotic analytical methods. It +represents a powerful interpretable machine learning technique of interest for +advancing design capabilities in photonics and framing the complex +interaction-driven dynamics in various topological materials. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Real-Time Fitness Exercise Classification and Counting from Video Frames + + +
+ This paper introduces a novel method for real-time exercise classification +using a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing +exercise recognition approaches often rely on synthetic datasets, raw +coordinate inputs sensitive to user and camera variations, and fail to fully +exploit the temporal dependencies in exercise movements. These issues limit +their generalizability and robustness in real-world conditions, where lighting, +camera angles, and user body types vary. + To address these challenges, we propose a BiLSTM-based model that leverages +invariant features, such as joint angles, alongside raw coordinates. By using +both angles and (x, y, z) coordinates, the model adapts to changes in +perspective, user positioning, and body differences, improving generalization. +Training on 30-frame sequences enables the BiLSTM to capture the temporal +context of exercises and recognize patterns evolving over time. + We compiled a dataset combining synthetic data from the InfiniteRep dataset +and real-world videos from Kaggle and other sources. This dataset includes four +common exercises: squat, push-up, shoulder press, and bicep curl. The model was +trained and validated on these diverse datasets, achieving an accuracy of over +99% on the test set. To assess generalizability, the model was tested on 2 +separate test sets representative of typical usage conditions. Comparisons with +the previous approach from the literature are present in the result section +showing that the proposed model is the best-performing one. + The classifier is integrated into a web application providing real-time +exercise classification and repetition counting without manual exercise +selection. + Demo and datasets are available at the following GitHub Repository: +https://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting. + +
+
+
+
+
+ + ☆ Hierarchical-Graph-Structured Edge Partition Models for Learning + Evolving Community Structure + + +
+ We propose a novel dynamic network model to capture evolving latent +communities within temporal networks. To achieve this, we decompose each +observed dynamic edge between vertices using a Poisson-gamma edge partition +model, assigning each vertex to one or more latent communities through +\emph{nonnegative} vertex-community memberships. Specifically, hierarchical +transition kernels are employed to model the interactions between these latent +communities in the observed temporal network. A hierarchical graph prior is +placed on the transition structure of the latent communities, allowing us to +model how they evolve and interact over time. Consequently, our dynamic network +enables the inferred community structure to merge, split, and interact with one +another, providing a comprehensive understanding of complex network dynamics. +Experiments on various real-world network datasets demonstrate that the +proposed model not only effectively uncovers interpretable latent structures +but also surpasses other state-of-the-art dynamic network models in the tasks +of link prediction and community detection. + +
+
+
+
+
+ + ☆ SeqProFT: Applying LoRA Finetuning for Sequence-only Protein Property + Predictions + + +
+ Protein language models (PLMs) are capable of learning the relationships +between protein sequences and functions by treating amino acid sequences as +textual data in a self-supervised manner. However, fine-tuning these models +typically demands substantial computational resources and time, with results +that may not always be optimized for specific tasks. To overcome these +challenges, this study employs the LoRA method to perform end-to-end +fine-tuning of the ESM-2 model specifically for protein property prediction +tasks, utilizing only sequence information. Additionally, a multi-head +attention mechanism is integrated into the downstream network to combine +sequence features with contact map information, thereby enhancing the model's +comprehension of protein sequences. Experimental results of extensive +classification and regression tasks demonstrate that the fine-tuned model +achieves strong performance and faster convergence across multiple regression +and classification tasks. + +
+
+
+
+
+ + ☆ Preempting Text Sanitization Utility in Resource-Constrained + Privacy-Preserving LLM Interactions + + +
+ Individuals have been increasingly interacting with online Large Language +Models (LLMs), both in their work and personal lives. These interactions raise +privacy issues as the LLMs are typically hosted by third-parties who can gather +a variety of sensitive information about users and their companies. Text +Sanitization techniques have been proposed in the literature and can be used to +sanitize user prompts before sending them to the LLM. However, sanitization has +an impact on the downstream task performed by the LLM, and often to such an +extent that it leads to unacceptable results for the user. This is not just a +minor annoyance, with clear monetary consequences as LLM services charge on a +per use basis as well as a great amount of computing resources wasted. We propose +an architecture leveraging a Small Language Model (SLM) at the user-side to +help estimate the impact of sanitization on a prompt before it is sent to the +LLM, thus preventing resource losses. + Our evaluation of this architecture revealed a significant problem with text +sanitization based on Differential Privacy, to which we want to draw the +attention of the community for further investigation. + +
+
+
+
+
+ + ☆ A Pre-Trained Graph-Based Model for Adaptive Sequencing of Educational + Documents NeurIPS 2024 + + +
+ Massive Open Online Courses (MOOCs) have greatly contributed to making +education more accessible. However, many MOOCs maintain a rigid, +one-size-fits-all structure that fails to address the diverse needs and +backgrounds of individual learners. Learning path personalization aims to +address this limitation, by tailoring sequences of educational content to +optimize individual student learning outcomes. Existing approaches, however, +often require either massive student interaction data or extensive expert +annotation, limiting their broad application. In this study, we introduce a +novel data-efficient framework for learning path personalization that operates +without expert annotation. Our method employs a flexible recommender system +pre-trained with reinforcement learning on a dataset of raw course +materials. Through experiments on semi-synthetic data, we show that this +pre-training stage substantially improves data-efficiency in a range of +adaptive learning scenarios featuring new educational materials. This opens up +new perspectives for the design of foundation models for adaptive learning. + +
+
+ comment: NeurIPS 2024 Workshop on Large Foundation Models for Educational + Assessment (FM-Assess), Dec 2024, Vancouver, Canada +
+
+
+
+
+ + ☆ Efficient Sample-optimal Learning of Gaussian Tree Models via + Sample-optimal Testing of Gaussian Mutual Information + + +
+ Learning high-dimensional distributions is a significant challenge in machine +learning and statistics. Classical research has mostly concentrated on +asymptotic analysis of such data under suitable assumptions. While existing +works [Bhattacharyya et al.: SICOMP 2023, Daskalakis et al.: STOC 2021, Choo et +al.: ALT 2024] focus on discrete distributions, the current work addresses the +tree structure learning problem for Gaussian distributions, providing efficient +algorithms with solid theoretical guarantees. This is crucial as real-world +distributions are often continuous and differ from the discrete scenarios +studied in prior works. + In this work, we design a conditional mutual information tester for Gaussian +random variables that can test whether two Gaussian random variables are +independent, or their conditional mutual information is at least $\varepsilon$, +for some parameter $\varepsilon \in (0,1)$ using +$\mathcal{O}(\varepsilon^{-1})$ samples which we show to be near-optimal. In +contrast, an additive estimation would require $\Omega(\varepsilon^{-2})$ +samples. Our upper bound technique uses linear regression on a pair of suitably +transformed random variables. Importantly, we show that the chain rule of +conditional mutual information continues to hold for the estimated +(conditional) mutual information. As an application of such a mutual +information tester, we give an efficient $\varepsilon$-approximate +structure-learning algorithm for an $n$-variate Gaussian tree model that takes +$\widetilde{\Theta}(n\varepsilon^{-1})$ samples which we again show to be +near-optimal. In contrast, when the underlying Gaussian model is not known to +be tree-structured, we show that $\widetilde{{{\Theta}}}(n^2\varepsilon^{-2})$ +samples are necessary and sufficient to output an $\varepsilon$-approximate +tree structure. We perform extensive experiments that corroborate our +theoretical convergence bounds. + +
+
+ comment: 47 pages, 16 figures, abstract shortened as per arXiv criteria +
+
+
+
+
+ + ☆ Cascaded Diffusion Models for 2D and 3D Microscopy Image Synthesis to + Enhance Cell Segmentation + + +
+ Automated cell segmentation in microscopy images is essential for biomedical +research, yet conventional methods are labor-intensive and prone to error. +While deep learning-based approaches have proven effective, they often require +large annotated datasets, which are scarce due to the challenges of manual +annotation. To overcome this, we propose a novel framework for synthesizing +densely annotated 2D and 3D cell microscopy images using cascaded diffusion +models. Our method synthesizes 2D and 3D cell masks from sparse 2D annotations +using multi-level diffusion models and NeuS, a 3D surface reconstruction +approach. Following that, a pretrained 2D Stable Diffusion model is finetuned +to generate realistic cell textures and the final outputs are combined to form +cell populations. We show that training a segmentation model with a combination +of our synthetic data and real data improves cell segmentation performance by +up to 9\% across multiple datasets. Additionally, the FID scores indicate that +the synthetic data closely resembles real data. The code for our proposed +approach will be available at +https://github.com/ruveydayilmaz0/cascaded\_diffusion. + +
+
+
+
+
+ + ☆ A Modular Open Source Framework for Genomic Variant Calling + + +
+ Variant calling is a fundamental task in genomic research, essential for +detecting genetic variations such as single nucleotide polymorphisms (SNPs) and +insertions or deletions (indels). This paper presents an enhancement to +DeepChem, a widely used open-source drug discovery framework, through the +integration of DeepVariant. In particular, we introduce a variant calling +pipeline that leverages DeepVariant's convolutional neural network (CNN) +architecture to improve the accuracy and reliability of variant detection. The +implemented pipeline includes stages for realignment of sequencing reads, +candidate variant detection, and pileup image generation, followed by variant +classification using a modified Inception v3 model. Our work adds a modular and +extensible variant calling framework to the DeepChem framework and enables +future work integrating DeepChem's drug discovery infrastructure more tightly +with bioinformatics pipelines. + +
+
+
+
+
+ + ☆ Structure learning with Temporal Gaussian Mixture for model-based + Reinforcement Learning + + +
+ Model-based reinforcement learning refers to a set of approaches capable of +sample-efficient decision making, which create an explicit model of the +environment. This model can subsequently be used for learning optimal policies. +In this paper, we propose a temporal Gaussian Mixture Model composed of a +perception model and a transition model. The perception model extracts discrete +(latent) states from continuous observations using a variational Gaussian +mixture likelihood. Importantly, our model constantly monitors the collected +data searching for new Gaussian components, i.e., the perception model performs +a form of structure learning (Smith et al., 2020; Friston et al., 2018; Neacsu +et al., 2022) as it learns the number of Gaussian components in the mixture. +Additionally, the transition model learns the temporal transition between +consecutive time steps by taking advantage of the Dirichlet-categorical +conjugacy. Both the perception and transition models are able to forget part of +the data points, while integrating the information they provide within the +prior, which ensures fast variational inference. Finally, decision making is +performed with a variant of Q-learning which is able to learn Q-values from +beliefs over states. Empirically, we have demonstrated the model's ability to +learn the structure of several mazes: the model discovered the number of states +and the transition probabilities between these states. Moreover, using its +learned Q-values, the agent was able to successfully navigate from the starting +position to the maze's exit. + +
+
+
+
+
+ + ☆ Physics Encoded Blocks in Residual Neural Network Architectures for + Digital Twin Models + + +
+ Physics Informed Machine Learning has emerged as a popular approach in +modelling and simulation for digital twins to generate accurate models of +processes and behaviours of real-world systems. However, despite their success +in generating accurate and reliable models, the existing methods either use +simple regularizations in loss functions to offer limited physics integration +or are too specific in architectural definitions to be generalized to a wide +variety of physical systems. This paper presents a generic approach based on a +novel physics-encoded residual neural network architecture to combine +data-driven and physics-based analytical models to address these limitations. +Our method combines physics blocks as mathematical operators from physics-based +models with learning blocks comprising feed-forward layers. Intermediate +residual blocks are incorporated for stable gradient flow as they train on +physical system observation data. This way, the model learns to comply with the +geometric and kinematic aspects of the physical system. Compared to +conventional neural network-based methods, our method improves generalizability +with substantially low data requirements and model complexity in terms of +parameters, especially in scenarios where prior physics knowledge is either +elementary or incomplete. We investigate our approach in two application +domains. The first is a basic robotic motion model using Euler Lagrangian +equations of motion as physics prior. The second application is a complex +scenario of a steering model for a self-driving vehicle in a simulation. In +both applications, our method outperforms both conventional neural network +based approaches as well as state-of-the-art Physics Informed Machine Learning +methods. + +
+
+
+
+
+ + ☆ Alien Recombination: Exploring Concept Blends Beyond Human Cognitive + Availability in Visual Art NeurIPS 2024 + + +
+ While AI models have demonstrated remarkable capabilities in constrained +domains like game strategy, their potential for genuine creativity in +open-ended domains like art remains debated. We explore this question by +examining how AI can transcend human cognitive limitations in visual art +creation. Our research hypothesizes that visual art contains a vast unexplored +space of conceptual combinations, constrained not by inherent incompatibility, +but by cognitive limitations imposed by artists' cultural, temporal, +geographical and social contexts. + To test this hypothesis, we present the Alien Recombination method, a novel +approach utilizing fine-tuned large language models to identify and generate +concept combinations that lie beyond human cognitive availability. The system +models and deliberately counteracts human availability bias, the tendency to +rely on immediately accessible examples, to discover novel artistic +combinations. + This system not only produces combinations that have never been attempted +before within our dataset but also identifies and generates combinations that +are cognitively unavailable to all artists in the domain. Furthermore, we +translate these combinations into visual representations, enabling the +exploration of subjective perceptions of novelty. Our findings suggest that +cognitive unavailability is a promising metric for optimizing artistic novelty, +outperforming merely temperature scaling without additional evaluation +criteria. This approach uses generative models to connect previously +unconnected ideas, providing new insight into the potential of framing +AI-driven creativity as a combinatorial problem. + +
+
+ comment: NeurIPS 2024 Workshop on Creativity & Generative AI, 13 pages, 11 + figures +
+
+
+
+
+ + ☆ Graph Artificial Intelligence for Quantifying Compatibility Mechanisms + in Traditional Chinese Medicine + + +
+ Traditional Chinese Medicine (TCM) involves complex compatibility mechanisms +characterized by multi-component and multi-target interactions, which are +challenging to quantify. To address this challenge, we applied graph artificial +intelligence to develop a TCM multi-dimensional knowledge graph that bridges +traditional TCM theory and modern biomedical science +(https://zenodo.org/records/13763953 ). Using feature engineering and +embedding, we processed key TCM terminology and Chinese herbal pieces (CHP), +introducing medicinal properties as virtual nodes and employing graph neural +networks with attention mechanisms to model and analyze 6,080 Chinese herbal +formulas (CHF). Our method quantitatively assessed the roles of CHP within CHF +and was validated using 215 CHF designed for COVID-19 management. With +interpretable models, open-source data, and code +(https://github.com/ZENGJingqi/GraphAI-for-TCM ), this study provides robust +tools for advancing TCM theory and drug discovery. + +
+
+ comment: 10 pages, 5 figures. Includes open-source dataset and code for + reproducibility +
+
+
+
+
+ + ☆ Physics meets Topology: Physics-informed topological neural networks for + learning rigid body dynamics + + +
+ Rigid body interactions are fundamental to numerous scientific disciplines, +but remain challenging to simulate due to their abrupt nonlinear nature and +sensitivity to complex, often unknown environmental factors. These challenges +call for adaptable learning-based methods capable of capturing complex +interactions beyond explicit physical models and simulations. While graph +neural networks can handle simple scenarios, they struggle with complex scenes +and long-term predictions. We introduce a novel framework for modeling rigid +body dynamics and learning collision interactions, addressing key limitations +of existing graph-based methods. Our approach extends the traditional +representation of meshes by incorporating higher-order topology complexes, +offering a physically consistent representation. Additionally, we propose a +physics-informed message-passing neural architecture, embedding physical laws +directly in the model. Our method demonstrates superior accuracy, even during +long rollouts, and exhibits strong generalization to unseen scenarios. +Importantly, this work addresses the challenge of multi-entity dynamic +interactions, with applications spanning diverse scientific and engineering +domains. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Re-examining learning linear functions in context + + +
+ In context learning (ICL) is an attractive method of solving a wide range of +problems. Inspired by Garg et al. (2022), we look closely at ICL in a variety +of train and test settings for several transformer models of different sizes +trained from scratch. Our study complements prior work by pointing out several +systematic failures of these models to generalize to data not in the training +distribution, thereby showing some limitations of ICL. We find that models +adopt a strategy for this task that is very different from standard solutions. + +
+
+
+
+
+ + ☆ PALMS: Parallel Adaptive Lasso with Multi-directional Signals for Latent + Networks Reconstruction + + +
+ Large-scale networks exist in many fields and play an important role in +real-world dynamics. However, the networks are usually latent and expensive to +detect, which becomes the main challenge for many applications and empirical +analysis. Several statistical methods were proposed to infer the edges, but the +complexity of the algorithms makes them hard to apply to large-scale networks. +In this paper, we propose a general distributed and parallel computing +framework for network reconstruction methods via compressive sensing techniques, +to make them feasible for inferring super-large networks in practice. +Combined with CALMS, for estimators that enjoy additional +theoretical properties, such as consistency and asymptotic normality, we +prove that the approximate estimation utilizing the distributed algorithm can +preserve these theoretical results. + +
+
+ comment: 48 pages +
+
+
+
+
+ + ☆ Upside-Down Reinforcement Learning for More Interpretable Optimal + Control + + +
+ Model-Free Reinforcement Learning (RL) algorithms either learn how to map +states to expected rewards or search for policies that can maximize a certain +performance function. Model-Based algorithms instead, aim to learn an +approximation of the underlying model of the RL environment and then use it in +combination with planning algorithms. Upside-Down Reinforcement Learning (UDRL) +is a novel learning paradigm that aims to learn how to predict actions from +states and desired commands. This task is formulated as a Supervised Learning +problem and has successfully been tackled by Neural Networks (NNs). In this +paper, we investigate whether function approximation algorithms other than NNs +can also be used within a UDRL framework. Our experiments, performed over +several popular optimal control benchmarks, show that tree-based methods like +Random Forests and Extremely Randomized Trees can perform just as well as NNs +with the significant benefit of resulting in policies that are inherently more +interpretable than NNs, therefore paving the way for more transparent, safe, +and robust RL. + +
+
+
+
+
+ + ☆ Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting + + +
+ Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown +significant promise in traffic forecasting by effectively modeling temporal and +spatial correlations. However, rapid urbanization in recent years has led to +dynamic shifts in traffic patterns and travel demand, posing major challenges +for accurate long-term traffic prediction. The generalization capability of +ST-GNNs in extended temporal scenarios and cross-city applications remains +largely unexplored. In this study, we evaluate state-of-the-art models on an +extended traffic benchmark and observe substantial performance degradation in +existing ST-GNNs over time, which we attribute to their limited inductive +capabilities. Our analysis reveals that this degradation stems from an +inability to adapt to evolving spatial relationships within urban environments. +To address this limitation, we reconsider the design of adaptive embeddings and +propose a Principal Component Analysis (PCA) embedding approach that enables +models to adapt to new scenarios without retraining. We incorporate PCA +embeddings into existing ST-GNN and Transformer architectures, achieving marked +improvements in performance. Notably, PCA embeddings allow for flexibility in +graph structures between training and testing, enabling models trained on one +city to perform zero-shot predictions on other cities. This adaptability +demonstrates the potential of PCA embeddings in enhancing the robustness and +generalization of spatiotemporal models. + +
+
+
+
+
+ + ☆ Implicit Regularization for Multi-label Feature Selection + + +
+ In this paper, we address the problem of feature selection in the context of +multi-label learning, by using a new estimator based on implicit regularization +and label embedding. Unlike the sparse feature selection methods that use a +penalized estimator with explicit regularization terms such as $l_{2,1}$-norm, +MCP or SCAD, we propose a simple alternative method via Hadamard product +parameterization. In order to guide the feature selection process, a latent +semantic of multi-label information method is adopted, as a label embedding. +Experimental results on some known benchmark datasets suggest that the proposed +estimator suffers much less from extra bias, and may lead to benign +overfitting. + +
+
+ comment: 11 pages, 7 figures, My paper is currently under review at TPAMI + journal +
+
+
+
+
+ + ☆ Temporal and Spatial Reservoir Ensembling Techniques for Liquid State + Machines + + +
+ Reservoir computing (RC) is a class of computational methods, such as Echo +State Networks (ESN) and Liquid State Machines (LSM), that describe a generic method +to perform pattern recognition and temporal analysis with any non-linear +system. This is enabled by Reservoir Computing being a shallow network model +with only Input, Reservoir, and Readout layers where input and reservoir +weights are not learned (only the readout layer is trained). LSM is a special +case of Reservoir computing inspired by the organization of neurons in the +brain and generally refers to spike-based Reservoir computing approaches. LSMs +have been successfully used to showcase decent performance on some neuromorphic +vision and speech datasets but a common problem associated with LSMs is that +since the model is more-or-less fixed, the main way to improve the performance +is by scaling up the Reservoir size, but that only gives diminishing rewards +despite a tremendous increase in model size and computation. In this paper, we +propose two approaches for effectively ensembling LSM models - Multi-Length +Scale Reservoir Ensemble (MuLRE) and Temporal Excitation Partitioned Reservoir +Ensemble (TEPRE) and benchmark them on Neuromorphic-MNIST (N-MNIST), Spiking +Heidelberg Digits (SHD), and DVSGesture datasets, which are standard +neuromorphic benchmarks. We achieve 98.1% test accuracy on N-MNIST with a +3600-neuron LSM model which is higher than any prior LSM-based approach and +77.8% test accuracy on the SHD dataset which is on par with a standard +Recurrent Spiking Neural Network trained by Backprop Through Time (BPTT). We +also propose receptive field-based input weights to the Reservoir to work +alongside the Multi-Length Scale Reservoir ensemble model for vision tasks. +Thus, we introduce effective means of scaling up the performance of LSM models +and evaluate them against relevant neuromorphic benchmarks. + +
+
+
+
+
+ + ☆ IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet + Videos NeurIPS 2024 + + +
+ Shape assembly is a ubiquitous task in daily life, integral for constructing +complex 3D structures like IKEA furniture. While significant progress has been +made in developing autonomous agents for shape assembly, existing datasets have +not yet tackled the 4D grounding of assembly instructions in videos, essential +for a holistic understanding of assembly in 3D space over time. We introduce +IKEA Video Manuals, a dataset that features 3D models of furniture parts, +instructional manuals, assembly videos from the Internet, and most importantly, +annotations of dense spatio-temporal alignments between these data modalities. +To demonstrate the utility of IKEA Video Manuals, we present five applications +essential for shape assembly: assembly plan generation, part-conditioned +segmentation, part-conditioned pose estimation, video object segmentation, and +furniture assembly based on instructional video manuals. For each application, +we provide evaluation metrics and baseline methods. Through experiments on our +annotated data, we highlight many challenges in grounding assembly instructions +in videos to improve shape assembly, including handling occlusions, varying +viewpoints, and extended assembly sequences. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ The Dark Side of Trust: Authority Citation-Driven Jailbreak Attacks on + Large Language Models + + +
+ The widespread deployment of large language models (LLMs) across various +domains has showcased their immense potential while exposing significant safety +vulnerabilities. A major concern is ensuring that LLM-generated content aligns +with human values. Existing jailbreak techniques reveal how this alignment can +be compromised through specific prompts or adversarial suffixes. In this study, +we introduce a new threat: LLMs' bias toward authority. While this inherent +bias can improve the quality of outputs generated by LLMs, it also introduces a +potential vulnerability, increasing the risk of producing harmful content. +Notably, the bias in LLMs lies in the varying levels of trust given to different +types of authoritative information in harmful queries. For example, malware +development often favors trusting GitHub. To better reveal the risks with LLMs, we +propose DarkCite, an adaptive authority citation matcher and generator designed +for a black-box setting. DarkCite matches optimal citation types to specific +risk types and generates authoritative citations relevant to harmful +instructions, enabling more effective jailbreak attacks on aligned LLMs. Our +experiments show that DarkCite achieves a higher attack success rate (e.g., +LLama-2 at 76% versus 68%) than previous methods. To counter this risk, we +propose an authenticity and harm verification defense strategy, raising the +average defense pass rate (DPR) from 11% to 74%. More importantly, the ability +to link citations to the content they encompass has become a foundational +function in LLMs, amplifying the influence of LLMs' bias toward authority. + +
+
+
+
+
+ + ☆ Bridging the Resource Gap: Deploying Advanced Imitation Learning Models + onto Affordable Embedded Platforms + + +
+ Advanced imitation learning with structures like the transformer is +increasingly demonstrating its advantages in robotics. However, deploying these +large-scale models on embedded platforms remains a major challenge. In this +paper, we propose a pipeline that facilitates the migration of advanced +imitation learning algorithms to edge devices. The process is achieved via an +efficient model compression method and a practical asynchronous parallel method +Temporal Ensemble with Dropped Actions (TEDA) that enhances the smoothness of +operations. To show the efficiency of the proposed pipeline, large-scale +imitation learning models are trained on a server and deployed on an edge +device to complete various manipulation tasks. + +
+
+ comment: Accepted by the 2024 IEEE International Conference on Robotics and + Biomimetics (IEEE ROBIO 2024) +
+
+
+
+
+ + ☆ Extended Neural Contractive Dynamical Systems: On Multiple Tasks and + Riemannian Safety Regions + + +
+ Stability guarantees are crucial when ensuring that a fully autonomous robot +does not take undesirable or potentially harmful actions. We recently proposed +the Neural Contractive Dynamical Systems (NCDS), which is a neural network +architecture that guarantees contractive stability. With this, +learning-from-demonstrations approaches can trivially provide stability +guarantees. However, our early work left several unanswered questions, which we +here address. Beyond providing an in-depth explanation of NCDS, this paper +extends the framework with more careful regularization, a conditional variant +of the framework for handling multiple tasks, and an uncertainty-driven +approach to latent obstacle avoidance. Experiments verify that the developed +system has the flexibility of ordinary neural networks while providing the +stability guarantees needed for autonomous robotics. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.09352 +
+
+
+
+
+ + ☆ The GECo algorithm for Graph Neural Networks Explanation + + +
+ Graph Neural Networks (GNNs) are powerful models that can manage complex data +sources and their interconnection links. One of GNNs' main drawbacks is their +lack of interpretability, which limits their application in sensitive fields. +In this paper, we introduce a new methodology involving graph communities to +address the interpretability of graph classification problems. The proposed +method, called GECo, exploits the idea that if a community is a subset of graph +nodes densely connected, this property should play a role in graph +classification. This is reasonable, especially if we consider the +message-passing mechanism, which is the basic mechanism of GNNs. GECo analyzes +the contribution to the classification result of the communities in the graph, +building a mask that highlights graph-relevant structures. GECo is tested for +Graph Convolutional Networks on six artificial and four real-world graph +datasets and is compared to the main explainability methods such as +PGMExplainer, PGExplainer, GNNExplainer, and SubgraphX using four different +metrics. The obtained results outperform the other methods for artificial graph +datasets and most real-world datasets. + +
+
+
+
+
+ + ☆ Graph Neural Networks on Graph Databases + + +
+ Training graph neural networks on large datasets has long been a challenge. +Traditional approaches include efficiently representing the whole graph +in-memory, designing parameter efficient and sampling-based models, and graph +partitioning in a distributed setup. Separately, graph databases with native +graph storage and query engines have been developed, which enable time and +resource efficient graph analytics workloads. We show how to directly train a +GNN on a graph DB, by retrieving minimal data into memory and sampling using +the query engine. Our experiments show resource advantages for single-machine +and distributed training. Our approach opens up a new way of scaling GNNs as +well as a new application area for graph DBs. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Rethinking Thinking Tokens: Understanding Why They Underperform in + Practice + + +
+ Thinking Tokens (TT) have been proposed as an unsupervised method to +facilitate reasoning in language models. However, despite their conceptual +appeal, our findings show that TTs marginally improve performance and +consistently underperform compared to Chain-of-Thought (CoT) reasoning across +multiple benchmarks. We hypothesize that this underperformance stems from the +reliance on a single embedding for TTs, which results in inconsistent learning +signals and introduces noisy gradients. This paper provides a comprehensive +empirical analysis to validate this hypothesis and discusses the implications +for future research on unsupervised reasoning in LLMs. + +
+
+
+
+
+ + ☆ Continual Task Learning through Adaptive Policy Self-Composition + + +
+ Training a generalizable agent to continually learn a sequence of tasks from +offline trajectories is a natural requirement for long-lived agents, yet +remains a significant challenge for current offline reinforcement learning (RL) +algorithms. Specifically, an agent must be able to rapidly adapt to new tasks +using newly collected trajectories (plasticity), while retaining knowledge from +previously learned tasks (stability). However, systematic analyses of this +setting are scarce, and it remains unclear whether conventional continual +learning (CL) methods are effective in continual offline RL (CORL) scenarios. +In this study, we develop the Offline Continual World benchmark and demonstrate +that traditional CL methods struggle with catastrophic forgetting, primarily +due to the unique distribution shifts inherent to CORL scenarios. To address +this challenge, we introduce CompoFormer, a structure-based continual +transformer model that adaptively composes previous policies via a meta-policy +network. Upon encountering a new task, CompoFormer leverages semantic +correlations to selectively integrate relevant prior policies alongside newly +trained parameters, thereby enhancing knowledge sharing and accelerating the +learning process. Our experiments reveal that CompoFormer outperforms +conventional CL methods, particularly in longer task sequences, showcasing a +promising balance between plasticity and stability. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ☆ Zero-Shot Load Forecasting with Large Language Models + + +
+ Deep learning models have shown strong performance in load forecasting, but +they generally require large amounts of data for model training before being +applied to new scenarios, which limits their effectiveness in data-scarce +scenarios. Inspired by the great success of pre-trained language models (LLMs) +in natural language processing, this paper proposes a zero-shot load +forecasting approach using an advanced LLM framework denoted as the Chronos +model. By utilizing its extensive pre-trained knowledge, the Chronos model +enables accurate load forecasting in data-scarce scenarios without the need for +extensive data-specific training. Simulation results across five real-world +datasets demonstrate that the Chronos model significantly outperforms nine +popular baseline models for both deterministic and probabilistic load +forecasting with various forecast horizons (e.g., 1 to 48 hours), even though +the Chronos model is neither tailored nor fine-tuned to these specific load +datasets. Notably, Chronos reduces root mean squared error (RMSE), continuous +ranked probability score (CRPS), and quantile score (QS) by approximately +7.34%-84.30%, 19.63%-60.06%, and 22.83%-54.49%, respectively, compared to +baseline models. These results highlight the superiority and flexibility of the +Chronos model, positioning it as an effective solution in data-scarce +scenarios. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Modeling Multivariable High-resolution 3D Urban Microclimate Using + Localized Fourier Neural Operator + + +
+ Accurate urban microclimate analysis with wind velocity and temperature is +vital for energy-efficient urban planning, supporting carbon reduction, +enhancing public health and comfort, and advancing the low-altitude economy. +However, traditional computational fluid dynamics (CFD) simulations that couple +velocity and temperature are computationally expensive. Recent machine learning +advancements offer promising alternatives for accelerating urban microclimate +simulations. The Fourier neural operator (FNO) has shown efficiency and +accuracy in predicting single-variable velocity magnitudes in urban wind +fields. Yet, for multivariable high-resolution 3D urban microclimate +prediction, FNO faces three key limitations: blurry output quality, high GPU +memory demand, and substantial data requirements. To address these issues, we +propose a novel localized Fourier neural operator (Local-FNO) model that +employs local training, geometry encoding, and patch overlapping. Local-FNO +provides accurate predictions for rapidly changing turbulence in urban +microclimate over 60 seconds, four times the average turbulence integral time +scale, with an average error of 0.35 m/s in velocity and 0.30 °C in +temperature. It also accurately captures turbulent heat flux represented by the +velocity-temperature correlation. In a 2 km by 2 km domain, Local-FNO resolves +turbulence patterns down to a 10 m resolution. It provides high-resolution +predictions with 150 million feature dimensions on a single 32 GB GPU at nearly +50 times the speed of a CFD solver. Compared to FNO, Local-FNO achieves a 23.9% +reduction in prediction error and a 47.3% improvement in turbulent fluctuation +correlation. + +
+
+
+
+
+ + ☆ A Hybrid Loss Framework for Decomposition-based Time Series Forecasting + Methods: Balancing Global and Component Errors + + +
+ Accurate time series forecasting, predicting future values based on past +data, is crucial for diverse industries. Many current time series methods +decompose time series into multiple sub-series, applying different model +architectures and training with an end-to-end overall loss for forecasting. +However, this raises a question: does this overall loss prioritize the +importance of critical sub-series within the decomposition for the better +performance? To investigate this, we conduct a study on the impact of overall +loss on existing time series methods with sequence decomposition. Our findings +reveal that overall loss may introduce bias in model learning, hindering the +learning of the prioritization of more significant sub-series and limiting the +forecasting performance. To address this, we propose a hybrid loss framework +combining the global and component losses. This framework introduces component +losses for each sub-series alongside the original overall loss. It employs a +dual min-max algorithm to dynamically adjust weights between the overall loss +and component losses, and within component losses. This enables the model to +achieve better performance of current time series methods by focusing on more +critical sub-series while still maintaining a low overall loss. We integrate +our loss framework into several time series methods and evaluate the +performance on multiple datasets. Results show an average improvement of 0.5-2% +over existing methods without any modifications to the model architectures. + +
+
+
+
+
+ + ☆ Enhancing Decision Transformer with Diffusion-Based Trajectory Branch + Generation + + +
+ Decision Transformer (DT) can learn effective policy from offline datasets by +converting the offline reinforcement learning (RL) into a supervised sequence +modeling task, where the trajectory elements are generated auto-regressively +conditioned on the return-to-go (RTG). However, the sequence modeling learning +approach tends to learn policies that converge on the sub-optimal trajectories +within the dataset, for lack of bridging data to move to better trajectories, +even if the condition is set to the highest RTG. To address this issue, we +introduce Diffusion-Based Trajectory Branch Generation (BG), which expands the +trajectories of the dataset with branches generated by a diffusion model. The +trajectory branch is generated based on the segment of the trajectory within +the dataset, and leads to trajectories with higher returns. We concatenate the +generated branch with the trajectory segment as an expansion of the +trajectory. After expanding, DT has more opportunities to learn policies to move +to better trajectories, preventing it from converging to the sub-optimal +trajectories. Empirically, after processing with BG, DT outperforms +state-of-the-art sequence modeling methods on D4RL benchmark, demonstrating the +effectiveness of adding branches to the dataset without further modifications. + +
+
+
+
+
+ + ☆ Cuvis.Ai: An Open-Source, Low-Code Software Ecosystem for Hyperspectral + Processing and Classification + + +
+ Machine learning is an important tool for analyzing high-dimension +hyperspectral data; however, existing software solutions are either +closed-source or inextensible research products. In this paper, we present +cuvis.ai, an open-source and low-code software ecosystem for data acquisition, +preprocessing, and model training. The package is written in Python and +provides wrappers around common machine learning libraries, allowing both +classical and deep learning models to be trained on hyperspectral data. The +codebase abstracts processing interconnections and data dependencies between +operations to minimize code complexity for users. This software package +instantiates nodes in a directed acyclic graph to handle all stages of a +machine learning ecosystem, from data acquisition, including live or static +data sources, to final class assignment or property prediction. User-created +models contain convenient serialization methods to ensure portability and +increase sharing within the research community. All code and data are available +online: https://github.com/cubert-hyperspectral/cuvis.ai + +
+
+ comment: 5 pages, 2024 14th Workshop on Hyperspectral Imaging and Signal + Processing: Evolution in Remote Sensing (WHISPERS) +
+
+
+
+
+ + ☆ A Review on Machine Unlearning + + +
+ Recently, an increasing number of laws have governed the useability of users' +privacy. For example, Article 17 of the General Data Protection Regulation +(GDPR), the right to be forgotten, requires machine learning applications to +remove a portion of data from a dataset and retrain it if the user makes such a +request. Furthermore, from the security perspective, training data for machine +learning models, i.e., data that may contain user privacy, should be +effectively protected, including appropriate erasure. Therefore, researchers +propose various privacy-preserving methods to deal with such issues as machine +unlearning. This paper provides an in-depth review of the security and privacy +concerns in machine learning models. First, we present how machine learning can +use users' private data in daily life and the role that the GDPR plays in this +problem. Then, we introduce the concept of machine unlearning by describing the +security threats in machine learning models and how to protect users' privacy +from being violated using machine learning platforms. As the core content of +the paper, we introduce and analyze current machine unlearning approaches and +several representative research results and discuss them in the context of the +data lineage. Furthermore, we also discuss the future research challenges in +this field. + +
+
+
+
+
+ + ☆ Toward Personalized Federated Node Classification in One-shot + Communication + + +
+ Federated Graph Learning (FGL) has become a promising paradigm for +collaborative training with distributed and private graph data. One-shot +Federated Learning (OFL) enables collaboration in a single communication round +to largely reduce communication costs and potential security concerns. However, +existing OFL methods are not designed for graph data and existing FGL methods +are ineffective within one communication round under both data and model +heterogeneity. To mitigate this gap, we are the first to propose a one-shot +personalized federated graph learning method for node classification, which is +also compatible with the Secure Aggregation scheme. We estimate and aggregate +the statistics of class-wise feature distribution to generate a global +pseudo-graph on the server, which could be used to train a global graph model. +Furthermore, we reveal the under-explored problem of existing personalized FGL +methods that their personalized models are biased and neglect the ability to +generalize to minorities. To achieve better personalization and generalization +simultaneously, we propose a two-stage personalized training to adaptively +utilize the personal information from local data and global information from +the global pseudo-graph. Comprehensive experiments on 8 multi-scale graph +datasets under different partitions with various settings demonstrate our +superior performance over state-of-the-art baselines. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Recurrent Stochastic Configuration Networks with Incremental Blocks + + +
+ Recurrent stochastic configuration networks (RSCNs) have shown promise in +modelling nonlinear dynamic systems with order uncertainty due to their +advantages of easy implementation, less human intervention, and strong +approximation capability. This paper develops the original RSCNs with block +increments, termed block RSCNs (BRSCNs), to further enhance the learning +capacity and efficiency of the network. BRSCNs can simultaneously add multiple +reservoir nodes (subreservoirs) during the construction. Each subreservoir is +configured with a unique structure in the light of a supervisory mechanism, +ensuring the universal approximation property. The reservoir feedback matrix is +appropriately scaled to guarantee the echo state property of the network. +Furthermore, the output weights are updated online using a projection +algorithm, and the persistent excitation conditions that facilitate parameter +convergence are also established. Numerical results over a time series +prediction, a nonlinear system identification task, and two industrial data +predictive analyses demonstrate that the proposed BRSCN performs favourably in +terms of modelling efficiency, learning, and generalization performance, +highlighting their significant potential for coping with complex dynamics. + +
+
+
+
+
+ + ☆ Accelerating spherical K-means clustering for large-scale sparse + document data + + +
+ This paper presents an accelerated spherical K-means clustering algorithm for +large-scale and high-dimensional sparse document data sets. We design an +algorithm working in an architecture-friendly manner (AFM), which is a +procedure of suppressing performance-degradation factors such as the numbers of +instructions, branch mispredictions, and cache misses in CPUs of a modern +computer system. For the AFM operation, we leverage unique universal +characteristics (UCs) of a data-object and a cluster's mean set, which are +skewed distributions on data relationships such as Zipf's law and a +feature-value concentration phenomenon. The UCs indicate that the most part of +the number of multiplications for similarity calculations is executed regarding +terms with high document frequencies (df) and the most part of a similarity +between an object- and a mean-feature vector is obtained by the multiplications +regarding a few high mean-feature values. Our proposed algorithm applies an +inverted-index data structure to a mean set, extracts the specific region with +high-df terms and high mean-feature values in the mean-inverted index by newly +introduced two structural parameters, and exploits the index divided into three +parts for efficient pruning. The algorithm determines the two structural +parameters by minimizing the approximate number of multiplications related to +that of instructions, reduces the branch mispredictions by sharing the index +structure including the two parameters with all the objects, and suppresses +the cache misses by keeping in the caches the frequently used data in the +foregoing specific region, resulting in working in the AFM. We experimentally +demonstrate that our algorithm efficiently achieves superior speed performance +in large-scale documents compared with algorithms using the state-of-the-art +techniques. + +
+
+ comment: 28 pages, 23 figures +
+
+
+
+
+ + ☆ Steering Language Model Refusal with Sparse Autoencoders + + +
+ Responsible practices for deploying language models include guiding models to +recognize and refuse answering prompts that are considered unsafe, while +complying with safe prompts. Achieving such behavior typically requires +updating model weights, which is costly and inflexible. We explore +opportunities to steer model activations at inference time, which does not +require updating weights. Using sparse autoencoders, we identify and steer +features in Phi-3 Mini that mediate refusal behavior. We find that feature +steering can improve Phi-3 Mini's robustness to jailbreak attempts across +various harms, including challenging multi-turn attacks. However, we discover +that feature steering can adversely affect overall performance on benchmarks. +These results suggest that identifying steerable mechanisms for refusal via +sparse autoencoders is a promising approach for enhancing language model +safety, but that more research is needed to mitigate feature steering's adverse +effects on performance. + +
+
+
+
+
+ + ☆ SADDE: Semi-supervised Anomaly Detection with Dependable Explanations + + +
+ Semi-supervised learning holds a pivotal position in anomaly detection +applications, yet identifying anomaly patterns with a limited number of labeled +samples poses a significant challenge. Furthermore, the absence of +interpretability poses major obstacles to the practical adoption of +semi-supervised frameworks. The majority of existing interpretation techniques +are tailored for supervised/unsupervised frameworks or non-security domains, +falling short in providing dependable interpretations. In this research paper, +we introduce SADDE, a general framework designed to accomplish two primary +objectives: (1) to render the anomaly detection process interpretable and +enhance the credibility of interpretation outcomes, and (2) to assign +high-confidence pseudo labels to unlabeled samples, thereby boosting the +performance of anomaly detection systems when supervised data is scarce. To +achieve the first objective, we devise a cutting-edge interpretation method +that utilizes both global and local interpreters to furnish trustworthy +explanations. For the second objective, we conceptualize a novel two-stage +semi-supervised learning framework tailored for network anomaly detection, +ensuring that the model predictions of both stages align with specific +constraints. We apply SADDE to two illustrative network anomaly detection tasks +and conduct extensive evaluations in comparison with notable prior works. The +experimental findings underscore that SADDE is capable of delivering precise +detection results alongside dependable interpretations for semi-supervised +network anomaly detection systems. The source code for SADDE is accessible at: +https://github.com/M-Code-Space/SADDE. + +
+
+
+
+
+ + ☆ Dual-Frequency Filtering Self-aware Graph Neural Networks for Homophilic + and Heterophilic Graphs + + +
+ Graph Neural Networks (GNNs) have excelled in handling graph-structured data, +attracting significant research interest. However, two primary challenges have +emerged: interference between topology and attributes distorting node +representations, and the low-pass filtering nature of most GNNs leading to the +oversight of valuable high-frequency information in graph signals. These issues +are particularly pronounced in heterophilic graphs. To address these +challenges, we propose Dual-Frequency Filtering Self-aware Graph Neural +Networks (DFGNN). DFGNN integrates low-pass and high-pass filters to extract +smooth and detailed topological features, using frequency-specific constraints +to minimize noise and redundancy in the respective frequency bands. The model +dynamically adjusts filtering ratios to accommodate both homophilic and +heterophilic graphs. Furthermore, DFGNN mitigates interference by aligning +topological and attribute representations through dynamic correspondences +between their respective frequency bands, enhancing overall model performance +and expressiveness. Extensive experiments conducted on benchmark datasets +demonstrate that DFGNN outperforms state-of-the-art methods in classification +performance, highlighting its effectiveness in handling both homophilic and +heterophilic graphs. + +
+
+ comment: 11 pages, 17 figures +
+
+
+
+
+ + ☆ Multi-Hyperbolic Space-based Heterogeneous Graph Attention Network + + +
+ To leverage the complex structures within heterogeneous graphs, recent +studies on heterogeneous graph embedding use a hyperbolic space, characterized +by a constant negative curvature and exponentially increasing space, which +aligns with the structural properties of heterogeneous graphs. However, despite +heterogeneous graphs inherently possessing diverse power-law structures, most +hyperbolic heterogeneous graph embedding models use a single hyperbolic space +for the entire heterogeneous graph, which may not effectively capture the +diverse power-law structures within the heterogeneous graph. To address this +limitation, we propose Multi-hyperbolic Space-based heterogeneous Graph +Attention Network (MSGAT), which uses multiple hyperbolic spaces to effectively +capture diverse power-law structures within heterogeneous graphs. We conduct +comprehensive experiments to evaluate the effectiveness of MSGAT. The +experimental results demonstrate that MSGAT outperforms state-of-the-art +baselines in various graph machine learning tasks, effectively capturing the +complex structures of heterogeneous graphs. + +
+
+ comment: Accepted in IEEE ICDM 2024 +
+
+
+
+
+ + ☆ Coupled Integral PINN for conservation law + + +
+ The Physics-Informed Neural Network (PINN) is an innovative approach to solve +a diverse array of partial differential equations (PDEs) leveraging the power +of neural networks. This is achieved by minimizing the residual loss associated +with the explicit physical information, usually coupled with data derived from +initial and boundary conditions. However, a challenge arises in the context of +nonlinear conservation laws where derivatives are undefined at shocks, leading +to solutions that deviate from the true physical phenomena. To solve this +issue, the physical solution must be extracted from the weak formulation of the +PDE and is typically further bounded by entropy conditions. Within the +numerical framework, finite volume methods (FVM) are employed to address +conservation laws. These methods resolve the integral form of conservation laws +and delineate the shock characteristics. Inspired by the principles underlying +FVM, this paper introduces a novel Coupled Integrated PINN methodology that +involves fitting the integral solutions of equations using additional neural +networks. This technique not only augments the conventional PINN's capability +in modeling shock waves, but also eliminates the need for spatial and temporal +discretization. As such, it bypasses the complexities of numerical integration +and reconstruction associated with non-convex fluxes. Finally, we show that the +proposed new Integrated PINN performs well in conservative law and outperforms +the vanilla PINN when tackling the challenging shock problems using examples of +Burgers' equation, Buckley-Leverett Equation and Euler System. + +
+
+
+
+
+ + ☆ Effective Predictive Modeling for Emergency Department Visits and + Evaluating Exogenous Variables Impact: Using Explainable Meta-learning + Gradient Boosting + + +
+ Over an extensive duration, administrators and clinicians have endeavoured to +predict Emergency Department (ED) visits with precision, aiming to optimise +resource distribution. Despite the proliferation of diverse AI-driven models +tailored for precise prognostication, this task persists as a formidable +challenge, besieged by constraints such as restrained generalisability, +susceptibility to overfitting and underfitting, scalability issues, and complex +fine-tuning hyper-parameters. In this study, we introduce a novel Meta-learning +Gradient Booster (Meta-ED) approach for precisely forecasting daily ED visits +and leveraging a comprehensive dataset of exogenous variables, including +socio-demographic characteristics, healthcare service use, chronic diseases, +diagnosis, and climate parameters spanning 23 years from Canberra Hospital in +ACT, Australia. The proposed Meta-ED consists of four foundational +learners -- Catboost, Random Forest, Extra Tree, and lightGBoost -- alongside a +dependable top-level learner, Multi-Layer Perceptron (MLP), by combining the +unique capabilities of varied base models (sub-learners). Our study assesses +the efficacy of the Meta-ED model through an extensive comparative analysis +involving 23 models. The evaluation outcomes reveal a notable superiority of +Meta-ED over the other models in accuracy at 85.7% (95% CI: 85.4%, 86.0%) and +across a spectrum of 10 evaluation metrics. Notably, when compared with +prominent techniques, XGBoost, Random Forest (RF), AdaBoost, LightGBoost, and +Extra Tree (ExT), Meta-ED showcases substantial accuracy enhancements of 58.6%, +106.3%, 22.3%, 7.0%, and 15.7%, respectively. Furthermore, incorporating +weather-related features demonstrates a 3.25% improvement in the prediction +accuracy of visitors' numbers. The encouraging outcomes of our study underscore +Meta-ED as a foundation model for the precise prediction of daily ED visitors. + +
+
+
+
+
+ + ☆ ACE2: Accurately learning subseasonal to decadal atmospheric variability + and forced responses + + +
+ Existing machine learning models of weather variability are not formulated to +enable assessment of their response to varying external boundary conditions +such as sea surface temperature and greenhouse gases. Here we present ACE2 (Ai2 +Climate Emulator version 2) and its application to reproducing atmospheric +variability over the past 80 years on timescales from days to decades. ACE2 is +a 450M-parameter autoregressive machine learning emulator, operating with +6-hour temporal resolution, 1° horizontal resolution and eight vertical +layers. It exactly conserves global dry air mass and moisture and can be +stepped forward stably for arbitrarily many steps with a throughput of about +1500 simulated years per wall clock day. ACE2 generates emergent phenomena such +as tropical cyclones, the Madden Julian Oscillation, and sudden stratospheric +warmings. Furthermore, it accurately reproduces the atmospheric response to El +Niño variability and global trends of temperature over the past 80 years. +However, its sensitivities to separately changing sea surface temperature and +carbon dioxide are not entirely realistic. + +
+
+ comment: 31 pages, 23 figures +
+
+
+
+
+ + ☆ GROOT: Effective Design of Biological Sequences with Limited + Experimental Data + + +
+ Latent space optimization (LSO) is a powerful method for designing discrete, +high-dimensional biological sequences that maximize expensive black-box +functions, such as wet lab experiments. This is accomplished by learning a +latent space from available data and using a surrogate model to guide +optimization algorithms toward optimal outputs. However, existing methods +struggle when labeled data is limited, as training the surrogate model with few +labeled data points can lead to subpar outputs, offering no advantage over the +training data itself. We address this challenge by introducing GROOT, a +Graph-based Latent Smoothing for Biological Sequence Optimization. In +particular, GROOT generates pseudo-labels for neighbors sampled around the +training latent embeddings. These pseudo-labels are then refined and smoothed +by Label Propagation. Additionally, we theoretically and empirically justify +our approach, demonstrate GROOT's ability to extrapolate to regions beyond the +training set while maintaining reliability within an upper bound of their +expected distances from the training regions. We evaluate GROOT on various +biological sequence design tasks, including protein optimization (GFP and AAV) +and three tasks with exact oracles from Design-Bench. The results demonstrate +that GROOT equalizes and surpasses existing methods without requiring access to +black-box oracles or vast amounts of labeled data, highlighting its +practicality and effectiveness. We release our code at +https://anonymous.4open.science/r/GROOT-D554 + +
+
+
+
+
+ + ☆ Graph Retention Networks for Dynamic Graphs + + +
+ In this work, we propose Graph Retention Network as a unified architecture +for deep learning on dynamic graphs. The GRN extends the core computational +manner of retention to dynamic graph data as graph retention, which empowers +the model with three key computational paradigms that enable training +parallelism, $O(1)$ low-cost inference, and long-term batch training. This +architecture achieves an optimal balance of effectiveness, efficiency, and +scalability. Extensive experiments conducted on benchmark datasets present the +superior performance of the GRN in both edge-level prediction and node-level +classification tasks. Our architecture achieves cutting-edge results while +maintaining lower training latency, reduced GPU memory consumption, and up to +an 86.7x improvement in inference throughput compared to baseline models. The +GRNs have demonstrated strong potential to become a widely adopted architecture +for dynamic graph learning tasks. Code will be available at +https://github.com/Chandler-Q/GraphRetentionNet. + +
+
+
+
+
+ + ☆ Progressive Generalization Risk Reduction for Data-Efficient Causal + Effect Estimation + + +
+ Causal effect estimation (CEE) provides a crucial tool for predicting the +unobserved counterfactual outcome for an entity. As CEE relaxes the requirement +for "perfect" counterfactual samples (e.g., patients with identical +attributes and only differ in treatments received) that are impractical to +obtain and can instead operate on observational data, it is usually used in +high-stake domains like medical treatment effect prediction. Nevertheless, in +those high-stake domains, gathering a decently sized, fully labelled +observational dataset remains challenging due to hurdles associated with costs, +ethics, expertise and time needed, etc., of which medical treatment surveys are +a typical example. Consequently, if the training dataset is small in scale, low +generalization risks can hardly be achieved on any CEE algorithms. + Unlike existing CEE methods that assume the constant availability of a +dataset with abundant samples, in this paper, we study a more realistic CEE +setting where the labelled data samples are scarce at the beginning, while more +can be gradually acquired over the course of training -- assuredly under a +limited budget considering their expensive nature. Then, the problem naturally +comes down to actively selecting the best possible samples to be labelled, +e.g., identifying the next subset of patients to conduct the treatment survey. +However, acquiring quality data for reducing the CEE risk under limited +labelling budgets remains under-explored until now. To fill the gap, we +theoretically analyse the generalization risk from an intriguing perspective of +progressively shrinking its upper bound, and develop a principled label +acquisition pipeline exclusively for CEE tasks. With our analysis, we propose +the Model Agnostic Causal Active Learning (MACAL) algorithm for batch-wise +label acquisition, which aims to reduce both the CEE model's uncertainty and +the post-acquisition ... + +
+
+ comment: Accepted by KDD'25 +
+
+
+
+
+ + ☆ EXCON: Extreme Instance-based Contrastive Representation Learning of + Severely Imbalanced Multivariate Time Series for Solar Flare Prediction + + +
+ In heliophysics research, predicting solar flares is crucial due to their +potential to impact both space-based systems and Earth's infrastructure +substantially. Magnetic field data from solar active regions, recorded by solar +imaging observatories, are transformed into multivariate time series to enable +solar flare prediction using temporal window-based analysis. In the realm of +multivariate time series-driven solar flare prediction, addressing severe class +imbalance with effective strategies for multivariate time series representation +learning is key to developing robust predictive models. Traditional methods +often struggle with overfitting to the majority class in prediction tasks where +major solar flares are infrequent. This work presents EXCON, a contrastive +representation learning framework designed to enhance classification +performance amidst such imbalances. EXCON operates through four stages: +obtaining core features from multivariate time series data; selecting +distinctive contrastive representations for each class to maximize inter-class +separation; training a temporal feature embedding module with a custom extreme +reconstruction loss to minimize intra-class variation; and applying a +classifier to the learned embeddings for robust classification. The proposed +method leverages contrastive learning principles to map similar instances +closer in the feature space while distancing dissimilar ones, a strategy not +extensively explored in solar flare prediction tasks. This approach not only +addresses class imbalance but also offers a versatile solution applicable to +univariate and multivariate time series across binary and multiclass +classification problems. Experimental results, including evaluations on the +benchmark solar flare dataset and multiple time series archive datasets with +binary and multiclass labels, demonstrate EXCON's efficacy in enhancing +classification performance. + +
+
+ comment: This work has been accepted at the 2024 IEEE International Conference + on Big Data (IEEE BigData 2024) on October 27, 2024, as a main conference + paper +
+
+
+
+
+ + ☆ Mirror Descent on Reproducing Kernel Banach Spaces + + +
+ Recent advances in machine learning have led to increased interest in +reproducing kernel Banach spaces (RKBS) as a more general framework that +extends beyond reproducing kernel Hilbert spaces (RKHS). These works have +resulted in the formulation of representer theorems under several regularized +learning schemes. However, little is known about an optimization method that +encompasses these results in this setting. This paper addresses a learning +problem on Banach spaces endowed with a reproducing kernel, focusing on +efficient optimization within RKBS. To tackle this challenge, we propose an +algorithm based on mirror descent (MDA). Our approach involves an iterative +method that employs gradient steps in the dual space of the Banach space using +the reproducing kernel. + We analyze the convergence properties of our algorithm under various +assumptions and establish two types of results: first, we identify conditions +under which a linear convergence rate is achievable, akin to optimization in +the Euclidean setting, and provide a proof of the linear rate; second, we +demonstrate a standard convergence rate in a constrained setting. Moreover, to +instantiate this algorithm in practice, we introduce a novel family of RKBSs +with $p$-norm ($p \neq 2$), characterized by both an explicit dual map and a +kernel. + +
+
+ comment: 42 pages, 3 figures +
+
+
+
+
+ + ☆ Reliable Learning of Halfspaces under Gaussian Marginals + + +
+ We study the problem of PAC learning halfspaces in the reliable agnostic +model of Kalai et al. (2012). The reliable PAC model captures learning +scenarios where one type of error is costlier than the others. Our main +positive result is a new algorithm for reliable learning of Gaussian halfspaces +on $\mathbb{R}^d$ with sample and computational complexity $$d^{O(\log +(\min\{1/\alpha, 1/\epsilon\}))}\min (2^{\log(1/\epsilon)^{O(\log +(1/\alpha))}},2^{\mathrm{poly}(1/\epsilon)})\;,$$ where $\epsilon$ is the +excess error and $\alpha$ is the bias of the optimal halfspace. We complement +our upper bound with a Statistical Query lower bound suggesting that the +$d^{\Omega(\log (1/\alpha))}$ dependence is best possible. Conceptually, our +results imply a strong computational separation between reliable agnostic +learning and standard agnostic learning of halfspaces in the Gaussian setting. + +
+
+
+
+
+ + ♻ ☆ What Do Learning Dynamics Reveal About Generalization in LLM Reasoning? + + +
+ Despite the remarkable capabilities of modern large language models (LLMs), +the mechanisms behind their problem-solving abilities remain elusive. In this +work, we aim to better understand how the learning dynamics of LLM finetuning +shapes downstream generalization. Our analysis focuses on reasoning tasks, +whose problem structure allows us to distinguish between memorization (the +exact replication of reasoning steps from the training data) and performance +(the correctness of the final solution). We find that a model's generalization +behavior can be effectively characterized by a training metric we call +pre-memorization train accuracy: the accuracy of model samples on training +queries before they begin to copy the exact reasoning steps from the training +set. On the dataset level, this metric is able to reliably predict test +accuracy, achieving $R^2$ of around or exceeding 0.9 across various models +(Llama3 8B, Gemma2 9B), datasets (GSM8k, MATH), and training configurations. On +a per-example level, this metric is also indicative of whether individual model +predictions are robust to perturbations in the training query. By connecting a +model's learning behavior to its generalization, pre-memorization train +accuracy can guide targeted improvements to training strategies. We focus on +data curation as an example, and show that prioritizing examples with low +pre-memorization accuracy leads to 1.5-2x improvements in data efficiency +compared to i.i.d. data scaling, and outperforms other standard data curation +techniques. + +
+
+
+
+
+ + ♻ ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies have deployed watermark-based detection to identify +AI-generated content. However, attribution--the ability to trace back to the +user of a generative AI (GenAI) service who created a given piece of +AI-generated content--remains largely unexplored despite its growing +importance. In this work, we aim to bridge this gap by conducting the first +systematic study on watermark-based, user-level attribution of AI-generated +content. Our key idea is to assign a unique watermark to each user of the GenAI +service and embed this watermark into the AI-generated content created by that +user. Attribution is then performed by identifying the user whose watermark +best matches the one extracted from the given content. This approach, however, +faces a key challenge: How should watermarks be selected for users to maximize +attribution performance? To address the challenge, we first theoretically +derive lower bounds on detection and attribution performance through rigorous +probabilistic analysis for any given set of user watermarks. Then, we select +watermarks for users to maximize these lower bounds, thereby optimizing +detection and attribution performance. Our theoretical and empirical results +show that watermark-based attribution inherits both the accuracy and +(non-)robustness properties of the underlying watermark. Specifically, +attribution remains highly accurate when the watermarked AI-generated content +is either not post-processed or subjected to common post-processing such as +JPEG compression, as well as black-box adversarial post-processing with limited +query budgets. + +
+
+
+
+
+ + ♻ ☆ MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation + Framework + + +
+ Medical imaging segmentation is a highly active area of research, with deep +learning-based methods achieving state-of-the-art results in several +benchmarks. However, the lack of standardized tools for training, testing, and +evaluating new methods makes the comparison of methods difficult. To address +this, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple, +modular, and end-to-end medical imaging segmentation framework designed to +facilitate consistent training, testing, and evaluation of deep learning-based +medical imaging segmentation methods. MIST standardizes data analysis, +preprocessing, and evaluation pipelines, accommodating multiple architectures +and loss functions. This standardization ensures reproducible and fair +comparisons across different methods. We detail MIST's data format +requirements, pipelines, and auxiliary features and demonstrate its efficacy +using the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results +highlight MIST's ability to produce accurate segmentation masks and its +scalability across multiple GPUs, showcasing its potential as a powerful tool +for future medical imaging research and development. + +
+
+ comment: Submitted to BraTS 2024 +
+
+
+
+
+ + ♻ ☆ Learning-Based Pricing and Matching for Two-Sided Queues + + +
+ We consider a dynamic system with multiple types of customers and servers. +Each type of waiting customer or server joins a separate queue, forming a +bipartite graph with customer-side queues and server-side queues. The platform +can match the servers and customers if their types are compatible. The matched +pairs then leave the system. The platform will charge a customer a price +according to their type when they arrive and will pay a server a price +according to their type. The arrival rate of each queue is determined by the +price according to some unknown demand or supply functions. Our goal is to +design pricing and matching algorithms to maximize the profit of the platform +with unknown demand and supply functions, while keeping queue lengths of both +customers and servers below a predetermined threshold. This system can be used +to model two-sided markets such as ride-sharing markets with passengers and +drivers. The difficulties of the problem include simultaneous learning and +decision making, and the tradeoff between maximizing profit and minimizing +queue length. We use a longest-queue-first matching algorithm and propose a +learning-based pricing algorithm, which combines gradient-free stochastic +projected gradient ascent with bisection search. We prove that our proposed +algorithm yields a sublinear regret $\tilde{O}(T^{5/6})$ and anytime +queue-length bound $\tilde{O}(T^{1/6})$, where $T$ is the time horizon. We +further establish a tradeoff between the regret bound and the queue-length +bound: $\tilde{O}(T^{1-\gamma})$ versus $\tilde{O}(T^{\gamma})$ for $\gamma \in +(0, 1/6].$ + +
+
+ comment: 60 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Effective Virtual Reality Teleoperation of an Upper-body Humanoid with + Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision + Avoidance + + +
+ We present an approach for retargeting off-the-shelf Virtual Reality (VR) +trackers to effectively teleoperate an upper-body humanoid while ensuring +self-collision-free motions. Key to the effectiveness was the proper assignment +of trackers to joint sets via modified task Jacobians and relaxed barrier +functions for self-collision avoidance. The approach was validated on +Apptronik's Astro hardware by demonstrating manipulation capabilities on a +table-top environment with pick-and-place box packing and a two-handed box pick +up and handover task. + +
+
+ comment: First Prize Winner of Horizons of an extended robotics reality + Workshop at International Conference on Intelligent Robots and Systems, 2022 +
+
+
+
+
+ + ♻ ☆ Backdoor defense, learnability and obfuscation + + +
+ We introduce a formal notion of defendability against backdoors using a game +between an attacker and a defender. In this game, the attacker modifies a +function to behave differently on a particular input known as the "trigger", +while behaving the same almost everywhere else. The defender then attempts to +detect the trigger at evaluation time. If the defender succeeds with high +enough probability, then the function class is said to be defendable. The key +constraint on the attacker that makes defense possible is that the attacker's +strategy must work for a randomly-chosen trigger. + Our definition is simple and does not explicitly mention learning, yet we +demonstrate that it is closely connected to learnability. In the +computationally unbounded setting, we use a voting algorithm of Hanneke et al. +(2022) to show that defendability is essentially determined by the VC dimension +of the function class, in much the same way as PAC learnability. In the +computationally bounded setting, we use a similar argument to show that +efficient PAC learnability implies efficient defendability, but not conversely. +On the other hand, we use indistinguishability obfuscation to show that the +class of polynomial size circuits is not efficiently defendable. Finally, we +present polynomial size decision trees as a natural example for which defense +is strictly easier than learning. Thus, we identify efficient defendability as +a notable intermediate concept in between efficient learnability and +obfuscation. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Robust Subgraph Learning by Monitoring Early Training Representations + + +
+ Graph neural networks (GNNs) have attracted significant attention for their +outstanding performance in graph learning and node classification tasks. +However, their vulnerability to adversarial attacks, particularly through +susceptible nodes, poses a challenge in decision-making. The need for robust +graph summarization is evident in adversarial challenges resulting from the +propagation of attacks throughout the entire graph. In this paper, we address +both performance and adversarial robustness in graph input by introducing the +novel technique SHERD (Subgraph Learning Hale through Early Training +Representation Distances). SHERD leverages information from layers of a +partially trained graph convolutional network (GCN) to detect susceptible nodes +during adversarial attacks using standard distance metrics. The method +identifies "vulnerable (bad)" nodes and removes such nodes to form a robust +subgraph while maintaining node classification performance. Through our +experiments, we demonstrate the increased performance of SHERD in enhancing +robustness by comparing the network's performance on original and subgraph +inputs against various baselines alongside existing adversarial attacks. Our +experiments across multiple datasets, including citation datasets such as Cora, +Citeseer, and Pubmed, as well as microanatomical tissue structures of cell +graphs in the placenta, highlight that SHERD not only achieves substantial +improvement in robust performance but also outperforms several baselines in +terms of node classification accuracy and computational complexity. + +
+
+
+
+
+ + ♻ ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
+
+
+
+
+ + ♻ ☆ Fine-Tuning a Time Series Foundation Model with Wasserstein Loss + + +
+ Inspired by recent advancements in large language models (LLMs) for Natural +Language Processing (NLP), there has been a surge in research focused on +developing foundational models for time series forecasting. One approach +involves training LLM architectures on tokenized time series data using +cross-entropy loss. Although this method has demonstrated promising results, +cross-entropy loss is primarily designed for classification tasks and does not +account for the distance between classes. To address this limitation, we +propose using the Wasserstein loss for such architectures. To validate our +approach, we fine-tuned a foundational time series model on $22$ zero-shot +datasets, comparing the performance of cross-entropy loss with that of +Wasserstein loss. Our results demonstrate that replacing cross-entropy loss +with Wasserstein loss significantly improves point estimation. + +
+
+ comment: 4 main pages; 2 figures +
+
+
+
+
+ + ♻ ☆ Unmasking Parkinson's Disease with Smile: An AI-enabled Screening + Framework + + +
+ We present an efficient and accessible PD screening method by leveraging +AI-driven models enabled by the largest video dataset of facial expressions +from 1,059 unique participants. This dataset includes 256 individuals with PD, +165 clinically diagnosed, and 91 self-reported. Participants used webcams to +record themselves mimicking three facial expressions (smile, disgust, and +surprise) from diverse sources encompassing their homes across multiple +countries, a US clinic, and a PD wellness center in the US. Facial landmarks +are automatically tracked from the recordings to extract features related to +hypomimia, a prominent PD symptom characterized by reduced facial expressions. +Machine learning algorithms are trained on these features to distinguish +between individuals with and without PD. The model was tested for +generalizability on external (unseen during training) test videos collected +from a US clinic and Bangladesh. An ensemble of machine learning models trained +on smile videos achieved an accuracy of 87.9+-0.1% (95% Confidence Interval) +with an AUROC of 89.3+-0.3% as evaluated on held-out data (using k-fold +cross-validation). In external test settings, the ensemble model achieved +79.8+-0.6% accuracy with 81.9+-0.3% AUROC on the clinical test set and +84.9+-0.4% accuracy with 81.2+-0.6% AUROC on participants from Bangladesh. In +every setting, the model was free from detectable bias across sex and ethnic +subgroups, except in the cohorts from Bangladesh, where the model performed +significantly better for female participants than males. Smiling videos can +effectively differentiate between individuals with and without PD, offering a +potentially easy, accessible, and cost-efficient way to screen for PD, +especially when a clinical diagnosis is difficult to access. + +
+
+
+
+
+ + ♻ ☆ DARNet: Dual Attention Refinement Network with Spatiotemporal + Construction for Auditory Attention Detection + + +
+ At a cocktail party, humans exhibit an impressive ability to direct their +attention. The auditory attention detection (AAD) approach seeks to identify +the attended speaker by analyzing brain signals, such as EEG signals. However, +current AAD algorithms overlook the spatial distribution information within EEG +signals and lack the ability to capture long-range latent dependencies, +limiting the model's ability to decode brain activity. To address these issues, +this paper proposes a dual attention refinement network with spatiotemporal +construction for AAD, named DARNet, which consists of the spatiotemporal +construction module, dual attention refinement module, and feature fusion \& +classifier module. Specifically, the spatiotemporal construction module aims to +construct more expressive spatiotemporal feature representations, by capturing +the spatial distribution characteristics of EEG signals. The dual attention +refinement module aims to extract different levels of temporal patterns in EEG +signals and enhance the model's ability to capture long-range latent +dependencies. The feature fusion \& classifier module aims to aggregate +temporal patterns and dependencies from different levels and obtain the final +classification results. The experimental results indicate that compared to the +state-of-the-art models, DARNet achieves an average classification accuracy +improvement of 5.9\% for 0.1s, 4.6\% for 1s, and 3.9\% for 2s on the DTU +dataset. While maintaining excellent classification performance, DARNet +significantly reduces the number of required parameters. Compared to the +state-of-the-art models, DARNet reduces the parameter count by 91\%. Code is +available at: https://github.com/fchest/DARNet.git. + +
+
+
+
+
+ + ♻ ☆ Partial Information Decomposition for Data Interpretability and Feature + Selection + + +
+ In this paper, we introduce Partial Information Decomposition of Features +(PIDF), a new paradigm for simultaneous data interpretability and feature +selection. Contrary to traditional methods that assign a single importance +value, our approach is based on three metrics per feature: the mutual +information shared with the target variable, the feature's contribution to +synergistic information, and the amount of this information that is redundant. +In particular, we develop a novel procedure based on these three metrics, which +reveals not only how features are correlated with the target but also the +additional and overlapping information provided by considering them in +combination with other features. We extensively evaluate PIDF using both +synthetic and real-world data, demonstrating its potential applications and +effectiveness, by considering case studies from genetics and neuroscience. + +
+
+
+
+
+ + ♻ ☆ Modulating Language Model Experiences through Frictions NeurIPS + + +
+ Language models are transforming the ways that their users engage with the +world. Despite impressive capabilities, over-consumption of language model +outputs risks propagating unchecked errors in the short-term and damaging human +capabilities for critical thinking in the long-term. How can we develop +scaffolding around language models to curate more appropriate use? We propose +selective frictions for language model experiences, inspired by behavioral +science interventions, to dampen misuse. Frictions involve small modifications +to a user's experience, e.g., the addition of a button impeding model access +and reminding a user of their expertise relative to the model. Through a user +study with real humans, we observe shifts in user behavior from the imposition +of a friction over LLMs in the context of a multi-topic question-answering task +as a representative task that people may use LLMs for, e.g., in education and +information retrieval. We find that frictions modulate over-reliance by driving +down users' click rates while minimally affecting accuracy for those topics. +Yet, frictions may have unintended effects. We find marked differences in +users' click behaviors even on topics where frictions were not provisioned. Our +contributions motivate further study of human-AI behavioral interaction to +inform more effective and appropriate LLM use. + +
+
+ comment: NeurIPS Workshop on Behavioral ML; non-archival +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with domain + feedback for molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize +desired properties of a given molecule through chemical modification. Despite +Large Language Models (LLMs) holding the potential to efficiently simulate this +task by using natural language to direct the optimization, straightforwardly +utilizing them shows limited performance. In this work, we facilitate utilizing +LLMs in an iterative paradigm by proposing a simple yet highly effective domain +feedback provider, namely $\text{Re}^3$DF. In detail, $\text{Re}^3$DF harnesses +an external toolkit, RDKit, to handle the molecule hallucination, if the +modified molecule is chemically invalid. Otherwise, its desired properties are +computed and compared to the original one, establishing reliable domain +feedback with correct direction and distance towards the objective, followed by +a retrieved example, to guide the LLM to refine the modified molecule. We +conduct experiments across both single- and multi-property objectives with 2 +thresholds, where $\text{Re}^3$DF shows significant improvements. Particularly, +for 20 single-property objectives, $\text{Re}^3$DF enhances Hit ratio by 16.95% +and 20.76% under loose (\texttt{l}) and strict (\texttt{s}) thresholds, +respectively. For 32 multi-property objectives, $\text{Re}^3$DF enhances Hit +ratio by 6.04% and 5.25%. + +
+
+
+
+
+ + ♻ ☆ Straightness of Rectified Flow: A Theoretical Insight into Wasserstein + Convergence + + +
+ Diffusion models have emerged as a powerful tool for image generation and +denoising. Typically, generative models learn a trajectory between the starting +noise distribution and the target data distribution. Recently Liu et al. +(2023b) designed a novel alternative generative model Rectified Flow (RF), +which aims to learn straight flow trajectories from noise to data using a +sequence of convex optimization problems with close ties to optimal transport. +If the trajectory is curved, one must use many Euler discretization steps or +novel strategies, such as exponential integrators, to achieve a satisfactory +generation quality. In contrast, RF has been shown to theoretically straighten +the trajectory through successive rectifications, reducing the number of +function evaluations (NFEs) while sampling. It has also been shown empirically +that RF may improve the straightness in two rectifications if one can solve the +underlying optimization problem within a sufficiently small error. In this +paper, we make two key theoretical contributions: 1) we provide the first +theoretical analysis of the Wasserstein distance between the sampling +distribution of RF and the target distribution. Our error rate is characterized +by the number of discretization steps and a new formulation of straightness +stronger than that in the original work. 2) under a mild regularity assumption, +we show that for a rectified flow from a Gaussian to any general target +distribution with finite first moment (e.g. mixture of Gaussians), two +rectifications are sufficient to achieve a straight flow, which is in line with +the previous empirical findings. Additionally, we also present empirical +results on both simulated and real datasets to validate our theoretical +findings. + +
+
+
+
+
+ + ♻ ☆ Read to Play (R2-Play): Decision Transformer with Multimodal Game + Instruction + + +
+ Developing a generalist agent is a longstanding objective in artificial +intelligence. Previous efforts utilizing extensive offline datasets from +various tasks demonstrate remarkable performance in multitasking scenarios +within Reinforcement Learning. However, these works encounter challenges in +extending their capabilities to new tasks. Recent approaches integrate textual +guidance or visual trajectory into decision networks to provide task-specific +contextual cues, representing a promising direction. However, it is observed +that relying solely on textual guidance or visual trajectory is insufficient +for accurately conveying the contextual information of tasks. This paper +explores enhanced forms of task guidance for agents, enabling them to +comprehend gameplay instructions, thereby facilitating a "read-to-play" +capability. Drawing inspiration from the success of multimodal instruction +tuning in visual tasks, we treat the visual-based RL task as a long-horizon +vision task and construct a set of multimodal game instructions to incorporate +instruction tuning into a decision transformer. Experimental results +demonstrate that incorporating multimodal game instructions significantly +enhances the decision transformer's multitasking and generalization +capabilities. + +
+
+
+
+
+ + ♻ ☆ Feature-wise and Sample-wise Adaptive Transfer Learning for + High-dimensional Linear Regression + + +
+ We consider the transfer learning problem in the high dimensional linear +regression setting, where the feature dimension is larger than the sample size. +To learn transferable information, which may vary across features or the source +samples, we propose an adaptive transfer learning method that can detect and +aggregate the feature-wise (F-AdaTrans) or sample-wise (S-AdaTrans) +transferable structures. We achieve this by employing a fused-penalty, coupled +with weights that can adapt according to the transferable structure. To choose +the weight, we propose a theoretically informed, data-driven procedure, +enabling F-AdaTrans to selectively fuse the transferable signals with the +target while filtering out non-transferable signals, and S-AdaTrans to obtain +the optimal combination of information transferred from each source sample. We +show that, with appropriately chosen weights, F-AdaTrans achieves a convergence +rate close to that of an oracle estimator with a known transferable structure, +and S-AdaTrans recovers existing near-minimax optimal rates as a special case. +The effectiveness of the proposed method is validated using both simulation and +real data, demonstrating favorable performance compared to the existing +methods. + +
+
+
+
+
+ + ♻ ☆ Scalable spectral representations for multi-agent reinforcement learning + in network MDPs + + +
+ Network Markov Decision Processes (MDPs), a popular model for multi-agent +control, pose a significant challenge to efficient learning due to the +exponential growth of the global state-action space with the number of agents. +In this work, utilizing the exponential decay property of network dynamics, we +first derive scalable spectral local representations for network MDPs, which +induces a network linear subspace for the local $Q$-function of each agent. +Building on these local spectral representations, we design a scalable +algorithmic framework for continuous state-action network MDPs, and provide +end-to-end guarantees for the convergence of our algorithm. Empirically, we +validate the effectiveness of our scalable representation-based approach on two +benchmark problems, and demonstrate the advantages of our approach over generic +function approximation approaches to representing the local $Q$-functions. + +
+
+ comment: Updated title, corrected an issue with an author's name +
+
+
+
+
+ + ♻ ☆ Thermodynamic Transferability in Coarse-Grained Force Fields using Graph + Neural Networks + + +
+ Coarse-graining is a molecular modeling technique in which an atomistic +system is represented in a simplified fashion that retains the most significant +system features that contribute to a target output, while removing the degrees +of freedom that are less relevant. This reduction in model complexity allows +coarse-grained molecular simulations to reach increased spatial and temporal +scales compared to corresponding all-atom models. A core challenge in +coarse-graining is to construct a force field that represents the interactions +in the new representation in a way that preserves the atomistic-level +properties. Many approaches to building coarse-grained force fields have +limited transferability between different thermodynamic conditions as a result +of averaging over internal fluctuations at a specific thermodynamic state +point. Here, we use a graph-convolutional neural network architecture, the +Hierarchically Interacting Particle Neural Network with Tensor Sensitivity +(HIP-NN-TS), to develop a highly automated training pipeline for coarse grained +force fields which allows for studying the transferability of coarse-grained +models based on the force-matching approach. We show that this approach not +only yields highly accurate force fields, but also that these force fields are +more transferable through a variety of thermodynamic conditions. These results +illustrate the potential of machine learning techniques such as graph neural +networks to improve the construction of transferable coarse-grained force +fields. + +
+
+ comment: Post-referee revisions. Accepted by Journal of Chemical Theory and + Computation (JCTC). 46 pages, 10 figures + TOC figure + SI (19 pages, 6 + figures) +
+
+
+
+
+ + ♻ ☆ Statistical-Computational Trade-offs for Recursive Adaptive Partitioning + Estimators + + +
+ Models based on recursive adaptive partitioning such as decision trees and +their ensembles are popular for high-dimensional regression as they can +potentially avoid the curse of dimensionality. Because empirical risk +minimization (ERM) is computationally infeasible, these models are typically +trained using greedy algorithms. Although effective in many cases, these +algorithms have been empirically observed to get stuck at local optima. We +explore this phenomenon in the context of learning sparse regression functions +over $d$ binary features, showing that when the true regression function $f^*$ +does not satisfy Abbe et al. (2022)'s Merged Staircase Property (MSP), greedy +training requires $\exp(\Omega(d))$ samples to achieve low estimation error. +Conversely, when $f^*$ does satisfy MSP, greedy training can attain small +estimation error with only $O(\log d)$ samples. This dichotomy mirrors that of +two-layer neural networks trained with stochastic gradient descent (SGD) in the +mean-field regime, thereby establishing a head-to-head comparison between +SGD-trained neural networks and greedy recursive partitioning estimators. +Furthermore, ERM-trained recursive partitioning estimators achieve low +estimation error with $O(\log d)$ samples irrespective of whether $f^*$ +satisfies MSP, thereby demonstrating a statistical-computational trade-off for +greedy training. Our proofs are based on a novel interpretation of greedy +recursive partitioning using stochastic process theory and a coupling technique +that may be of independent interest. + +
+
+
+
+
+ + ♻ ☆ DEFT: Efficient Fine-Tuning of Diffusion Models by Learning the + Generalised $h$-transform + + +
+ Generative modelling paradigms based on denoising diffusion processes have +emerged as a leading candidate for conditional sampling in inverse problems. In +many real-world applications, we often have access to large, expensively +trained unconditional diffusion models, which we aim to exploit for improving +conditional sampling. Most recent approaches are motivated heuristically and +lack a unifying framework, obscuring connections between them. Further, they +often suffer from issues such as being very sensitive to hyperparameters, being +expensive to train or needing access to weights hidden behind a closed API. In +this work, we unify conditional training and sampling using the mathematically +well-understood Doob's h-transform. This new perspective allows us to unify +many existing methods under a common umbrella. Under this framework, we propose +DEFT (Doob's h-transform Efficient FineTuning), a new approach for conditional +generation that simply fine-tunes a very small network to quickly learn the +conditional $h$-transform, while keeping the larger unconditional network +unchanged. DEFT is much faster than existing baselines while achieving +state-of-the-art performance across a variety of linear and non-linear +benchmarks. On image reconstruction tasks, we achieve speedups of up to +1.6$\times$, while having the best perceptual quality on natural images and +reconstruction performance on medical images. Further, we also provide initial +experiments on protein motif scaffolding and outperform reconstruction guidance +methods. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.09236 +
+
+
+
+
+ + ♻ ☆ BertaQA: How Much Do Language Models Know About Local Culture? + + +
+ Large Language Models (LLMs) exhibit extensive knowledge about the world, but +most evaluations have been limited to global or anglocentric subjects. This +raises the question of how well these models perform on topics relevant to +other cultures, whose presence on the web is not that prominent. To address +this gap, we introduce BertaQA, a multiple-choice trivia dataset that is +parallel in English and Basque. The dataset consists of a local subset with +questions pertinent to the Basque culture, and a global subset with questions +of broader interest. We find that state-of-the-art LLMs struggle with local +cultural knowledge, even as they excel on global topics. However, we show that +continued pre-training in Basque significantly improves the models' performance +on Basque culture, even when queried in English. To our knowledge, this is the +first solid evidence of knowledge transfer from a low-resource to a +high-resource language. Our analysis sheds light on the complex interplay +between language and knowledge, and reveals that some prior findings do not +fully hold when reassessed on local topics. Our dataset and evaluation code are +available under open licenses at https://github.com/juletx/BertaQA. + +
+
+ comment: NEURIPS Datasets & Benchmarks 2024 +
+
+
+
+
+ + ♻ ☆ RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual + Dexterous Robot Hands CoRL + + +
+ It has been a long-standing research goal to endow robot hands with +human-level dexterity. Bi-manual robot piano playing constitutes a task that +combines challenges from dynamic tasks, such as generating fast while precise +motions, with slower but contact-rich manipulation problems. Although +reinforcement learning based approaches have shown promising results in +single-task performance, these methods struggle in a multi-song setting. Our +work aims to close this gap and, thereby, enable imitation learning approaches +for robot piano playing at scale. To this end, we introduce the Robot Piano 1 +Million (RP1M) dataset, containing bi-manual robot piano playing motion data of +more than one million trajectories. We formulate finger placements as an +optimal transport problem, thus, enabling automatic annotation of vast amounts +of unlabeled songs. Benchmarking existing imitation learning approaches shows +that such approaches reach state-of-the-art robot piano playing performance by +leveraging RP1M. + +
+
+ comment: Accepted by Conference on Robot Learning (CoRL) 2024. Project + Website: https://rp1m.github.io/ +
+
+
+
+
+ + ♻ ☆ PEAC: Unsupervised Pre-training for Cross-Embodiment Reinforcement + Learning NeurIPS24 + + +
+ Designing generalizable agents capable of adapting to diverse embodiments has +achieved significant attention in Reinforcement Learning (RL), which is +critical for deploying RL agents in various real-world applications. Previous +Cross-Embodiment RL approaches have focused on transferring knowledge across +embodiments within specific tasks. These methods often result in knowledge +tightly coupled with those tasks and fail to adequately capture the distinct +characteristics of different embodiments. To address this limitation, we +introduce the notion of Cross-Embodiment Unsupervised RL (CEURL), which +leverages unsupervised learning to enable agents to acquire embodiment-aware +and task-agnostic knowledge through online interactions within reward-free +environments. We formulate CEURL as a novel Controlled Embodiment Markov +Decision Process (CE-MDP) and systematically analyze CEURL's pre-training +objectives under CE-MDP. Based on these analyses, we develop a novel algorithm +Pre-trained Embodiment-Aware Control (PEAC) for handling CEURL, incorporating +an intrinsic reward function specifically designed for cross-embodiment +pre-training. PEAC not only provides an intuitive optimization strategy for +cross-embodiment pre-training but also can integrate flexibly with existing +unsupervised RL methods, facilitating cross-embodiment exploration and skill +discovery. Extensive experiments in both simulated (e.g., DMC and Robosuite) +and real-world environments (e.g., legged locomotion) demonstrate that PEAC +significantly improves adaptation performance and cross-embodiment +generalization, demonstrating its effectiveness in overcoming the unique +challenges of CEURL. The project page and code are in +https://yingchengyang.github.io/ceurl. + +
+
+ comment: NeurIPS24 +
+
+
+
+
+ + ♻ ☆ PyGim: An Efficient Graph Neural Network Library for Real + Processing-In-Memory Architectures + + +
+ Graph Neural Networks (GNNs) are emerging ML models to analyze +graph-structure data. Graph Neural Network (GNN) execution involves both +compute-intensive and memory-intensive kernels, the latter dominates the total +time, being significantly bottlenecked by data movement between memory and +processors. Processing-In-Memory (PIM) systems can alleviate this data movement +bottleneck by placing simple processors near or inside memory arrays. In +this work, we introduce PyGim, an efficient ML library that accelerates GNNs on +real PIM systems. We propose intelligent parallelization techniques for +memory-intensive kernels of GNNs tailored for real PIM systems, and develop +a handy Python API for them. We provide hybrid GNN execution, in which the +compute-intensive and memory-intensive kernels are executed in +processor-centric and memory-centric computing systems, respectively. We +extensively evaluate PyGim on a real-world PIM system with 1992 PIM cores using +emerging GNN models, and demonstrate that it outperforms its state-of-the-art +CPU counterpart on Intel Xeon by on average 3.04x, and achieves higher resource +utilization than CPU and GPU systems. Our work provides useful recommendations +for software, system and hardware designers. PyGim is publicly available at +https://github.com/CMU-SAFARI/PyGim. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual + Learning + + +
+ Forgetting refers to the loss or deterioration of previously acquired +knowledge. While existing surveys on forgetting have primarily focused on +continual learning, forgetting is a prevalent phenomenon observed in various +other research domains within deep learning. Forgetting manifests in research +fields such as generative models due to generator shifts, and federated +learning due to heterogeneous data distributions across clients. Addressing +forgetting encompasses several challenges, including balancing the retention of +old task knowledge with fast learning of new tasks, managing task interference +with conflicting goals, and preventing privacy leakage, etc. Moreover, most +existing surveys on continual learning implicitly assume that forgetting is +always harmful. In contrast, our survey argues that forgetting is a +double-edged sword and can be beneficial and desirable in certain cases, such +as privacy-preserving scenarios. By exploring forgetting in a broader context, +we present a more nuanced understanding of this phenomenon and highlight its +potential advantages. Through this comprehensive survey, we aspire to uncover +potential solutions by drawing upon ideas and approaches from various fields +that have dealt with forgetting. By examining forgetting beyond its +conventional boundaries, we hope to encourage the development of novel +strategies for mitigating, harnessing, or even embracing forgetting in real +applications. A comprehensive list of papers about forgetting in various +research fields is available at +https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning. + +
+
+ comment: accepted at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Variational Graph Autoencoder for Heterogeneous Information Networks + with Missing and Inaccurate Attributes + + +
+ Heterogeneous Information Networks (HINs), which consist of various types of +nodes and edges, have recently demonstrated excellent performance in graph +mining. However, most existing heterogeneous graph neural networks (HGNNs) +ignore the problems of missing attributes, inaccurate attributes and scarce +labels for nodes, which limits their expressiveness. In this paper, we propose +a generative self-supervised model GraMI to address these issues +simultaneously. Specifically, GraMI first initializes all the nodes in the +graph with a low-dimensional representation matrix. After that, based on the +variational graph autoencoder framework, GraMI learns both node-level and +attribute-level embeddings in the encoder, which can provide fine-grained +semantic information to construct node attributes. In the decoder, GraMI +reconstructs both links and attributes. Instead of directly reconstructing raw +features for attributed nodes, GraMI generates the initial low-dimensional +representation matrix for all the nodes, based on which raw features of +attributed nodes are further reconstructed to leverage accurate attributes. In +this way, GraMI can not only complete informative features for non-attributed +nodes, but rectify inaccurate ones for attributed nodes. Finally, we conduct +extensive experiments to show the superiority of GraMI in tackling HINs with +missing and inaccurate attributes. + +
+
+ comment: Accepted by KDD 2025 +
+
+
+
+
+ + ♻ ☆ Unpicking Data at the Seams: VAEs, Disentanglement and Independent + Components + + +
+ Disentanglement, or identifying salient statistically independent factors of +the data, is of interest in many areas of machine learning and statistics, with +relevance to synthetic data generation with controlled properties, robust +classification of features, parsimonious encoding, and a greater understanding +of the generative process underlying the data. Disentanglement arises in +several generative paradigms, including Variational Autoencoders (VAEs), +Generative Adversarial Networks and diffusion models. Particular progress has +recently been made in understanding disentanglement in VAEs, where the choice +of diagonal posterior covariance matrices is suggested to promote mutual +orthogonality between columns of the decoder's Jacobian. We continue this +thread to show how this linear independence translates to statistical +independence, completing the chain in understanding how the VAE's objective +identifies independent components of, or disentangles, the data. + +
+
+
+
+
+ + ♻ ☆ A survey and taxonomy of loss functions in machine learning + + +
+ Most state-of-the-art machine learning techniques revolve around the +optimisation of loss functions. Defining appropriate loss functions is +therefore critical to successfully solving problems in this field. In this +survey, we present a comprehensive overview of the most widely used loss +functions across key applications, including regression, classification, +generative modeling, ranking, and energy-based modeling. We introduce 43 +distinct loss functions, structured within an intuitive taxonomy that clarifies +their theoretical foundations, properties, and optimal application contexts. +This survey is intended as a resource for undergraduate, graduate, and Ph.D. +students, as well as researchers seeking a deeper understanding of loss +functions. + +
+
+
+
+
+ + ♻ ☆ Not Eliminate but Aggregate: Post-Hoc Control over Mixture-of-Experts to + Address Shortcut Shifts in Natural Language Understanding + + +
+ Recent models for natural language understanding are inclined to exploit +simple patterns in datasets, commonly known as shortcuts. These shortcuts hinge +on spurious correlations between labels and latent features existing in the +training data. At inference time, shortcut-dependent models are likely to +generate erroneous predictions under distribution shifts, particularly when +some latent features are no longer correlated with the labels. To avoid this, +previous studies have trained models to eliminate the reliance on shortcuts. In +this study, we explore a different direction: pessimistically aggregating the +predictions of a mixture-of-experts, assuming each expert captures relatively +different latent features. The experimental results demonstrate that our +post-hoc control over the experts significantly enhances the model's robustness +to the distribution shift in shortcuts. Besides, we show that our approach has +some practical advantages. We also analyze our model and provide results to +support the assumption. + +
+
+ comment: 21 pages, 5 figures (the layout differs from the MIT Press + publication version) +
+
+
+
+
+ + ♻ ☆ Exploring Context Window of Large Language Models via Decomposed + Positional Vectors + + +
+ Transformer-based large language models (LLMs) typically have a limited +context window, resulting in significant performance degradation when +processing text beyond the length of the context window. Extensive studies have +been proposed to extend the context window and achieve length extrapolation of +LLMs, but there is still a lack of in-depth interpretation of these approaches. +In this study, we explore the positional information within and beyond the +context window for deciphering the underlying mechanism of LLMs. By using a +mean-based decomposition method, we disentangle positional vectors from hidden +states of LLMs and analyze their formation and effect on attention. +Furthermore, when texts exceed the context window, we analyze the change of +positional vectors in two settings, i.e., direct extrapolation and context +window extension. Based on our findings, we design two training-free context +window extension methods, positional vector replacement and attention window +extension. Experimental results show that our methods can effectively extend +the context window length. + +
+
+ comment: Accepted by NeurIPS 2024 as a spotlight +
+
+
+
+
+ + ♻ ☆ Pursuing Overall Welfare in Federated Learning through Sequential + Decision Making ICML 2024 + + +
+ In traditional federated learning, a single global model cannot perform +equally well for all clients. Therefore, the need to achieve the client-level +fairness in federated system has been emphasized, which can be realized by +modifying the static aggregation scheme for updating the global model to an +adaptive one, in response to the local signals of the participating clients. +Our work reveals that existing fairness-aware aggregation strategies can be +unified into an online convex optimization framework, in other words, a central +server's sequential decision making process. To enhance the decision making +capability, we propose simple and intuitive improvements for suboptimal designs +within existing methods, presenting AAggFF. Considering practical requirements, +we further subdivide our method tailored for the cross-device and the +cross-silo settings, respectively. Theoretical analyses guarantee sublinear +regret upper bounds for both settings: $\mathcal{O}(\sqrt{T \log{K}})$ for the +cross-device setting, and $\mathcal{O}(K \log{T})$ for the cross-silo setting, +with $K$ clients and $T$ federation rounds. Extensive experiments demonstrate +that the federated system equipped with AAggFF achieves better degree of +client-level fairness than existing methods in both practical settings. Code is +available at https://github.com/vaseline555/AAggFF + +
+
+ comment: Accepted at ICML 2024; added missing but important references, fixed + typos +
+
+
+
+
+ + ♻ ☆ ARNN: Attentive Recurrent Neural Network for Multi-channel EEG Signals + to Identify Epileptic Seizures + + +
+ Electroencephalography (EEG) is a widely used tool for diagnosing brain +disorders due to its high temporal resolution, non-invasive nature, and +affordability. Manual analysis of EEG is labor-intensive and requires +expertise, making automatic EEG interpretation crucial for reducing workload +and accurately assessing seizures. In epilepsy diagnosis, prolonged EEG +monitoring generates extensive data, often spanning hours, days, or even weeks. +While machine learning techniques for automatic EEG interpretation have +advanced significantly in recent decades, there remains a gap in its ability to +efficiently analyze large datasets with a balance of accuracy and computational +efficiency. To address the challenges mentioned above, an Attention Recurrent +Neural Network (ARNN) is proposed that can process a large amount of data +efficiently and accurately. This ARNN cell recurrently applies attention layers +along a sequence and has linear complexity with the sequence length and +leverages parallel computation by processing multi-channel EEG signals rather +than single-channel signals. In this architecture, the attention layer is a +computational unit that efficiently applies self-attention and cross-attention +mechanisms to compute a recurrent function over a wide number of state vectors +and input signals. This framework is inspired in part by the attention layer +and long short-term memory (LSTM) cells, but it scales this typical cell up by +several orders to parallelize for multi-channel EEG signals. It inherits the +advantages of attention layers and LSTM gate while avoiding their respective +drawbacks. The model's effectiveness is evaluated through extensive experiments +with heterogeneous datasets, including the CHB-MIT and UPenn and Mayo's Clinic +datasets. + +
+
+ comment: 11 pages, 7 figures, Journal Paper +
+
+
+
+
+ + ♻ ☆ Evaluating Synthetic Activations composed of SAE Latents in GPT-2 NeurIPS 2024 + + +
+ Sparse Auto-Encoders (SAEs) are commonly employed in mechanistic +interpretability to decompose the residual stream into monosemantic SAE +latents. Recent work demonstrates that perturbing a model's activations at an +early layer results in a step-function-like change in the model's final layer +activations. Furthermore, the model's sensitivity to this perturbation differs +between model-generated (real) activations and random activations. In our +study, we assess model sensitivity in order to compare real activations to +synthetic activations composed of SAE latents. Our findings indicate that +synthetic activations closely resemble real activations when we control for the +sparsity and cosine similarity of the constituent SAE latents. This suggests +that real activations cannot be explained by a simple "bag of SAE latents" +lacking internal structure, and instead suggests that SAE latents possess +significant geometric and statistical properties. Notably, we observe that our +synthetic activations exhibit less pronounced activation plateaus compared to +those typically surrounding real activations. + +
+
+ comment: Presented at the Attributing Model Behavior at Scale (ATTRIB) + workshop at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Characterizing stable regions in the residual stream of LLMs NeurIPS 2024 + + +
+ We identify stable regions in the residual stream of Transformers, where the +model's output remains insensitive to small activation changes, but exhibits +high sensitivity at region boundaries. These regions emerge during training and +become more defined as training progresses or model size increases. The regions +appear to be much larger than previously studied polytopes. Our analysis +suggests that these stable regions align with semantic distinctions, where +similar prompts cluster within regions, and activations from the same region +lead to similar next token predictions. This work provides a promising research +direction for understanding the complexity of neural networks, shedding light +on training dynamics, and advancing interpretability. + +
+
+ comment: Presented at the Scientific Methods for Understanding Deep Learning + (SciForDL) workshop at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Investigating Sensitive Directions in GPT-2: An Improved Baseline and + Comparative Analysis of SAEs NeurIPS 2024 + + +
+ Sensitive directions experiments attempt to understand the computational +features of Language Models (LMs) by measuring how much the next token +prediction probabilities change by perturbing activations along specific +directions. We extend the sensitive directions work by introducing an improved +baseline for perturbation directions. We demonstrate that KL divergence for +Sparse Autoencoder (SAE) reconstruction errors are no longer pathologically +high compared to the improved baseline. We also show that feature directions +uncovered by SAEs have varying impacts on model outputs depending on the SAE's +sparsity, with lower L0 SAE feature directions exerting a greater influence. +Additionally, we find that end-to-end SAE features do not exhibit stronger +effects on model outputs compared to traditional SAEs. + +
+
+ comment: Presented at the Attributing Model Behavior at Scale (ATTRIB) and + Scientific Methods for Understanding Deep Learning (SciForDL) workshops at + NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ FuXi Weather: A data-to-forecast machine learning system for global + weather + + +
+ Weather forecasting traditionally relies on numerical weather prediction +(NWP) systems that integrate global observational systems, data assimilation +(DA), and forecasting models. Despite steady improvements in forecast accuracy +over recent decades, further advances are increasingly constrained by high +computational costs, the underutilization of vast observational datasets, and +the challenges of obtaining finer resolution. These limitations, alongside the +uneven distribution of observational networks, result in global disparities in +forecast accuracy, leaving some regions vulnerable to extreme weather. Recent +advances in machine learning present a promising alternative, providing more +efficient and accurate forecasts using the same initial conditions as NWP. +However, current machine learning models still depend on the initial conditions +generated by NWP systems, which require extensive computational resources and +expertise. Here we introduce FuXi Weather, a machine learning weather +forecasting system that assimilates data from multiple satellites. Operating on +a 6-hourly DA and forecast cycle, FuXi Weather generates reliable and accurate +10-day global weather forecasts at a spatial resolution of $0.25^\circ$. FuXi +Weather is the first system to achieve all-grid, all-surface, all-channel, and +all-sky DA and forecasting, extending skillful forecast lead times beyond those +of the European Centre for Medium-range Weather Forecasts (ECMWF) +high-resolution forecasts (HRES) while using significantly fewer observations. +FuXi Weather consistently outperforms ECMWF HRES in observation-sparse regions, +such as central Africa, demonstrating its potential to improve forecasts where +observational infrastructure is limited. + +
+
+ comment: 73 pages +
+
+
+
+
+ + ♻ ☆ Bayesian optimization of atomic structures with prior probabilities from + universal interatomic potentials + + +
+ The optimization of atomic structures plays a pivotal role in understanding +and designing materials with desired properties. However, conventional +computational methods often struggle with the formidable task of navigating the +vast potential energy surface, especially in high-dimensional spaces with +numerous local minima. Recent advancements in machine learning-driven surrogate +models offer a promising avenue for alleviating this computational burden. In +this study, we propose a novel approach that combines the strengths of +universal machine learning potentials with a Bayesian approach using Gaussian +processes. By using the machine learning potentials as priors for the Gaussian +process, the Gaussian process has to learn only the difference between the +machine learning potential and the target energy surface calculated for example +by density functional theory. This turns out to improve the speed by which the +global optimal structure is identified across diverse systems for a +well-behaved machine learning potential. The approach is tested on periodic +bulk materials, surface structures, and a cluster. + +
+
+
+
+
+ + ♻ ☆ BONE: a unifying framework for Bayesian online learning in + non-stationary environments + + +
+ We propose a unifying framework for methods that perform Bayesian online +learning in non-stationary environments. We call the framework BONE, which +stands for (B)ayesian (O)nline learning in (N)on-stationary (E)nvironments. +BONE provides a common structure to tackle a variety of problems, including +online continual learning, prequential forecasting, and contextual bandits. The +framework requires specifying three modelling choices: (i) a model for +measurements (e.g., a neural network), (ii) an auxiliary process to model +non-stationarity (e.g., the time since the last changepoint), and (iii) a +conditional prior over model parameters (e.g., a multivariate Gaussian). The +framework also requires two algorithmic choices, which we use to carry out +approximate inference under this framework: (i) an algorithm to estimate +beliefs (posterior distribution) about the model parameters given the auxiliary +variable, and (ii) an algorithm to estimate beliefs about the auxiliary +variable. We show how this modularity allows us to write many different +existing methods as instances of BONE; we also use this framework to propose a +new method. We then experimentally compare existing methods with our proposed +new method on several datasets; we provide insights into the situations that +make one method more suitable than another for a given task. + +
+
+
+
+
+ + ♻ ☆ Interpretable Machine Learning for Survival Analysis + + +
+ With the spread and rapid advancement of black box machine learning models, +the field of interpretable machine learning (IML) or explainable artificial +intelligence (XAI) has become increasingly important over the last decade. This +is particularly relevant for survival analysis, where the adoption of IML +techniques promotes transparency, accountability and fairness in sensitive +areas, such as clinical decision making processes, the development of targeted +therapies, interventions or in other medical or healthcare related contexts. +More specifically, explainability can uncover a survival model's potential +biases and limitations and provide more mathematically sound ways to understand +how and which features are influential for prediction or constitute risk +factors. However, the lack of readily available IML methods may have deterred +medical practitioners and policy makers in public health from leveraging the +full potential of machine learning for predicting time-to-event data. We +present a comprehensive review of the limited existing amount of work on IML +methods for survival analysis within the context of the general IML taxonomy. +In addition, we formally detail how commonly used IML methods, such as +individual conditional expectation (ICE), partial dependence plots (PDP), +accumulated local effects (ALE), different feature importance measures or +Friedman's H-interaction statistics can be adapted to survival outcomes. An +application of several IML methods to real data on under-5 year +mortality of Ghanaian children from the Demographic and Health Surveys (DHS) +Program serves as a tutorial or guide for researchers, on how to utilize the +techniques in practice to facilitate understanding of model decisions or +predictions. + +
+
+
+
+
+ + ♻ ☆ Non-convex Stochastic Composite Optimization with Polyak Momentum + + +
+ The stochastic proximal gradient method is a powerful generalization of the +widely used stochastic gradient descent (SGD) method and has found numerous +applications in Machine Learning. However, it is notoriously known that this +method fails to converge in non-convex settings where the stochastic noise is +significant (i.e. when only small or bounded batch sizes are used). In this +paper, we focus on the stochastic proximal gradient method with Polyak +momentum. We prove this method attains an optimal convergence rate for +non-convex composite optimization problems, regardless of batch size. +Additionally, we rigorously analyze the variance reduction effect of the Polyak +momentum in the composite optimization setting and we show the method also +converges when the proximal step can only be solved inexactly. Finally, we +provide numerical experiments to validate our theoretical results. + +
+
+
+
+
+ + ♻ ☆ Centaur: a foundation model of human cognition + + +
+ Establishing a unified theory of cognition has been a major goal of +psychology. While there have been previous attempts to instantiate such +theories by building computational models, we currently do not have one model +that captures the human mind in its entirety. Here we introduce Centaur, a +computational model that can predict and simulate human behavior in any +experiment expressible in natural language. We derived Centaur by finetuning a +state-of-the-art language model on a novel, large-scale data set called +Psych-101. Psych-101 reaches an unprecedented scale, covering trial-by-trial +data from over 60,000 participants performing over 10,000,000 choices in 160 +experiments. Centaur not only captures the behavior of held-out participants +better than existing cognitive models, but also generalizes to new cover +stories, structural task modifications, and entirely new domains. Furthermore, +we find that the model's internal representations become more aligned with +human neural activity after finetuning. Taken together, Centaur is the first +real candidate for a unified model of human cognition. We anticipate that it +will have a disruptive impact on the cognitive sciences, challenging the +existing paradigm for developing computational models. + +
+
+
+
+
+ + ♻ ☆ Integrating GNN and Neural ODEs for Estimating Non-Reciprocal Two-Body + Interactions in Mixed-Species Collective Motion NeurIPS 2024 + + +
+ Analyzing the motion of multiple biological agents, be it cells or individual +animals, is pivotal for the understanding of complex collective behaviors. With +the advent of advanced microscopy, detailed images of complex tissue formations +involving multiple cell types have become more accessible in recent years. +However, deciphering the underlying rules that govern cell movements is far +from trivial. Here, we present a novel deep learning framework for estimating +the underlying equations of motion from observed trajectories, a pivotal step +in decoding such complex dynamics. Our framework integrates graph neural +networks with neural differential equations, enabling effective prediction of +two-body interactions based on the states of the interacting entities. We +demonstrate the efficacy of our approach through two numerical experiments. +First, we used simulated data from a toy model to tune the hyperparameters. +Based on the obtained hyperparameters, we then applied this approach to a more +complex model with non-reciprocal forces that mimic the collective dynamics of +the cells of slime molds. Our results show that the proposed method can +accurately estimate the functional forms of two-body interactions -- even when +they are nonreciprocal -- thereby precisely replicating both individual and +collective behaviors within these systems. + +
+
+ comment: Accepted at NeurIPS 2024. Some contents are omitted due to arXiv's + storage limit. Please refer to the full paper at OpenReview (NeurIPS 2024) or + https://github.com/MasahitoUWAMICHI/collectiveMotionNN +
+
+
+
+
+ + ♻ ☆ Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form + Medical Question Answering Applications and Beyond + + +
+ Uncertainty estimation is crucial for the reliability of safety-critical +human and artificial intelligence (AI) interaction systems, particularly in the +domain of healthcare engineering. However, a robust and general uncertainty +measure for free-form answers has not been well-established in open-ended +medical question-answering (QA) tasks, where generative inequality introduces a +large number of irrelevant words and sequences within the generated set for +uncertainty quantification (UQ), which can lead to biases. This paper +introduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at +both the word and sequence levels, considering semantic relevance. WSE +quantifies uncertainty in a way that is more closely aligned with the +reliability of LLMs during uncertainty quantification (UQ). We compare WSE with +six baseline methods on five free-form medical QA datasets, utilizing seven +popular large language models (LLMs). Experimental results demonstrate that WSE +exhibits superior performance in UQ under two standard criteria for correctness +evaluation. Additionally, in terms of real-world medical QA applications, the +performance of LLMs is significantly enhanced (e.g., a 6.36% improvement in +model accuracy on the COVID-QA dataset) by employing responses with lower +uncertainty that are identified by WSE as final answers, without any additional +task-specific fine-tuning or architectural modifications. + +
+
+ comment: Accepted by Engineering Applications of Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ ConU: Conformal Uncertainty in Large Language Models with Correctness + Coverage Guarantees EMNLP 2024 + + +
+ Uncertainty quantification (UQ) in natural language generation (NLG) tasks +remains an open challenge, exacerbated by the closed-source nature of the +latest large language models (LLMs). This study investigates applying conformal +prediction (CP), which can transform any heuristic uncertainty notion into +rigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We +introduce a novel uncertainty measure based on self-consistency theory, and +then develop a conformal uncertainty criterion by integrating the uncertainty +condition aligned with correctness into the CP algorithm. Empirical evaluations +indicate that our uncertainty measure outperforms prior state-of-the-art +methods. Furthermore, we achieve strict control over the correctness coverage +rate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning +general-purpose and medical scenarios. Additionally, the calibrated prediction +sets with small size further highlight the efficiency of our method in +providing trustworthy guarantees for practical open-ended NLG applications. + +
+
+ comment: Accepted by EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Jump Model for Urban Thermal Comfort Monitoring + + +
+ Thermal comfort is essential for well-being in urban spaces, especially as +cities face increasing heat from urbanization and climate change. Existing +thermal comfort models usually overlook temporal dynamics alongside spatial +dependencies. We address this problem by introducing a spatio-temporal jump +model that clusters data with persistence across both spatial and temporal +dimensions. This framework enhances interpretability, minimizes abrupt state +changes, and easily handles missing data. We validate our approach through +extensive simulations, demonstrating its accuracy in recovering the true +underlying partition. When applied to hourly environmental data gathered from a +set of weather stations located across the city of Singapore, our proposal +identifies meaningful thermal comfort regimes, demonstrating its effectiveness +in dynamic urban settings and suitability for real-world monitoring. The +comparison of these regimes with feedback on thermal preference indicates the +potential of an unsupervised approach to avoid extensive surveys. + +
+
+
+
+
+ + ♻ ☆ Fair Generalized Linear Mixed Models + + +
+ When using machine learning for automated prediction, it is important to +account for fairness in the prediction. Fairness in machine learning aims to +ensure that biases in the data and model inaccuracies do not lead to +discriminatory decisions. E.g., predictions from fair machine learning models +should not discriminate against sensitive variables such as sexual orientation +and ethnicity. The training data is often obtained from social surveys. In +social surveys, oftentimes the data collection process is a strata sampling, +e.g. due to cost restrictions. In strata samples, the assumption of +independence between the observation is not fulfilled. Hence, if the machine +learning models do not account for the strata correlations, the results may be +biased. Especially high is the bias in cases where the strata assignment is +correlated to the variable of interest. We present in this paper an algorithm +that can handle both problems simultaneously, and we demonstrate the impact of +stratified sampling on the quality of fair machine learning predictions in a +reproducible simulation study. + +
+
+ comment: 25 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2405.06433 +
+
+
+
+
+ + ♻ ☆ SAD-TIME: a Spatiotemporal-fused network for depression detection with + Automated multi-scale Depth-wise and TIME-interval-related common feature + extractor + + +
+ Background and Objective: Depression is a severe mental disorder, and +accurate diagnosis is pivotal to the cure and rehabilitation of people with +depression. However, the current questionnaire-based diagnostic methods could +bring subjective biases and may be denied by subjects. In search of a more +objective means of diagnosis, researchers have begun to experiment with deep +learning-based methods for identifying depressive disorders in recent years. +Methods: In this study, a novel Spatiotemporal-fused network with Automated +multi-scale Depth-wise and TIME-interval-related common feature extractor +(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common +features extractor (CFE), a spatial sector (SpS), a modified temporal sector +(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale +depth-wise 1D-convolutional neural network and a time-interval embedding +generator, where the unique information of each channel is preserved. The SpS +fuses the functional connectivity with the distance-based connectivity +containing spatial position of EEG electrodes. A multi-head-attention graph +convolutional network is also applied in the SpS to fuse the features from +different EEG channels. The TeS is based on long short-term memory and graph +transformer networks, where the temporal information of different time-windows +is fused. Moreover, the DAL is used after the SpS to obtain the +domain-invariant feature. Results: Experimental results under tenfold +cross-validation show that the proposed SAD-TIME method achieves 92.00% and +94.00% depression classification accuracies on two datasets, respectively, in +cross-subject mode. Conclusion: SAD-TIME is a robust depression detection +model, where the automatedly-generated features, the SpS and the TeS assist the +classification performance with the fusion of the innate spatiotemporal +information in the EEG signals. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Pre-training Tensor-Train Networks Facilitates Machine Learning with + Variational Quantum Circuits + + +
+ Variational quantum circuits (VQCs) hold promise for quantum machine learning +on noisy intermediate-scale quantum (NISQ) devices. While tensor-train networks +(TTNs) can enhance VQC representation and generalization, the resulting hybrid +model, TTN-VQC, faces optimization challenges due to the Polyak-Lojasiewicz +(PL) condition. To mitigate this challenge, we introduce Pre+TTN-VQC, a +pre-trained TTN model combined with a VQC. Our theoretical analysis, grounded +in two-stage empirical risk minimization, provides an upper bound on the +transfer learning risk. It demonstrates the approach's advantages in overcoming +the optimization challenge while maintaining TTN-VQC's generalization +capability. We validate our findings through experiments on quantum dot and +handwritten digit classification using simulated and actual NISQ environments. + +
+
+ comment: In submission +
+
+
+
+
+ + ♻ ☆ Federated Graph Condensation with Information Bottleneck Principles + + +
+ Graph condensation, which reduces the size of a large-scale graph by +synthesizing a small-scale condensed graph as its substitution, has immediately +benefited various graph learning tasks. However, existing graph condensation +methods rely on centralized data storage, which is unfeasible for real-world +decentralized data distribution, and overlook data holders' privacy-preserving +requirements. To bridge the gap, we propose and study the novel problem of +federated graph condensation for graph neural networks (GNNs). Specifically, we +first propose a general framework for federated graph condensation, in which we +decouple the typical gradient matching process for graph condensation into +client-side gradient calculation and server-side gradient matching. In this +way, the burdensome computation cost in client-side is largely alleviated. +Besides, our empirical studies show that under the federated setting, the +condensed graph will consistently leak data membership privacy, i.e., the +condensed graph during the federated training can be utilized to steal the +training data under the membership inference attacks (MIA). To tackle this +issue, we innovatively incorporate information bottleneck principles into the +federated graph condensation, which only needs to extract partial node features +in one local pre-training step and utilize the features during federated +training. Extensive experiments on real-world datasets demonstrate that our +framework can consistently protect membership privacy during training. +Meanwhile, it also achieves comparable and even superior performance against +existing centralized graph condensation and federated graph learning methods. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Adaptive AI-Driven Material Synthesis: Towards Autonomous 2D Materials + Growth + + +
+ Two-dimensional (2D) materials are poised to revolutionize current +solid-state technology with their extraordinary properties. Yet, the primary +challenge remains their scalable production. While there have been significant +advancements, much of the scientific progress has depended on the exfoliation +of materials, a method that poses severe challenges for large-scale +applications. With the advent of artificial intelligence (AI) in materials +science, innovative synthesis methodologies are now on the horizon. This study +explores the forefront of autonomous materials synthesis using an artificial +neural network (ANN) trained by evolutionary methods, focusing on the efficient +production of graphene. Our approach demonstrates that a neural network can +iteratively and autonomously learn a time-dependent protocol for the efficient +growth of graphene, without requiring pretraining on what constitutes an +effective recipe. Evaluation criteria are based on the proximity of the Raman +signature to that of monolayer graphene: higher scores are granted to outcomes +whose spectrum more closely resembles that of an ideal continuous monolayer +structure. This feedback mechanism allows for iterative refinement of the ANN's +time-dependent synthesis protocols, progressively improving sample quality. +Through the advancement and application of AI methodologies, this work makes a +substantial contribution to the field of materials engineering, fostering a new +era of innovation and efficiency in the synthesis process. + +
+
+
+
+
+ + ♻ ☆ A Framework for Leveraging Partially-Labeled Data for Product + Attribute-Value Identification + + +
+ In the e-commerce domain, the accurate extraction of attribute-value pairs +(e.g., Brand: Apple) from product titles and user search queries is crucial for +enhancing search and recommendation systems. A major challenge with neural +models for this task is the lack of high-quality training data, as the +annotations for attribute-value pairs in the available datasets are often +incomplete. To address this, we introduce GenToC, a model designed for training +directly with partially-labeled data, eliminating the necessity for a fully +annotated dataset. GenToC employs a marker-augmented generative model to +identify potential attributes, followed by a token classification model that +determines the associated values for each attribute. GenToC outperforms +existing state-of-the-art models, exhibiting up to 56.3% increase in the number +of accurate extractions. Furthermore, we utilize GenToC to regenerate the +training dataset to expand attribute-value annotations. This bootstrapping +substantially improves the data quality for training other standard NER models, +which are typically faster but less capable in handling partially-labeled data, +enabling them to achieve comparable performance to GenToC. Our results +demonstrate GenToC's unique ability to learn from a limited set of +partially-labeled data and improve the training of more efficient models, +advancing the automated extraction of attribute-value pairs. Finally, our model +has been successfully integrated into IndiaMART, India's largest B2B e-commerce +platform, achieving a significant increase of 20.2% in the number of correctly +identified attribute-value pairs over the existing deployed system while +achieving a high precision of 89.5%. + +
+
+ comment: Accepted to KDD 2025 ADS Track +
+
+
+
+
+ + ♻ ☆ Machine Vision-Based Assessment of Fall Color Changes and its + Relationship with Leaf Nitrogen Concentration + + +
+ Apple (\textit{Malus domestica} Borkh.) trees are deciduous, shedding leaves +each year. This process is preceded by a gradual change in leaf color from +green to yellow as chlorophyll is degraded prior to abscission. The initiation +and rate of this color change are affected by many factors including leaf +nitrogen (N) concentration. We predict that leaf color during this transition +may be indicative of the nitrogen status of apple trees. This study assesses a +machine vision-based system for quantifying the change in leaf color and its +correlation with leaf nitrogen content. An image dataset was collected in color +and 3D over five weeks in the fall of 2021 and 2023 at a commercial orchard +using a ground vehicle-based stereovision sensor. Trees in the foreground were +segmented from the point cloud using color and depth thresholding methods. +Then, to estimate the proportion of yellow leaves per canopy, the color +information of the segmented canopy area was quantified using a custom-defined +metric, \textit{yellowness index} (a normalized ratio of yellow to green +foliage in the tree) that varied from -1 to +1 (-1 being completely green and ++1 being completely yellow). Both K-means-based methods and gradient boosting +methods were used to estimate the \textit{yellowness index}. The gradient +boosting based method proposed in this study was better than the K-means-based +method (both in terms of computational time and accuracy), achieving an $R^2$ +of 0.72 in estimating the \textit{yellowness index}. The metric was able to +capture the gradual color transition from green to yellow over the study +duration. Trees with lower leaf nitrogen showed the color transition to yellow +earlier than the trees with higher nitrogen. + Keywords: Fruit Tree Nitrogen Management, Machine Vision, Point Cloud +Segmentation, Precision Nitrogen Management + +
+
+
+
+
+ + ♻ ☆ Optimized Feature Generation for Tabular Data via LLMs with Decision + Tree Reasoning NeurIPS 2024 + + +
+ In tabular prediction tasks, tree-based models combined with automated +feature engineering methods often outperform deep learning approaches that rely +on learned representations. While these feature engineering techniques are +effective, they typically depend on a pre-defined search space and primarily +use validation scores for feature selection, thereby missing valuable insights +from previous experiments. To address these limitations, we propose a novel +tabular learning framework that utilizes large language models (LLMs), termed +Optimizing Column feature generator with decision Tree reasoning (OCTree). Our +key idea is to leverage the reasoning capabilities of LLMs to identify +effective feature generation rules without manually specifying the search space +and provide language-based reasoning information highlighting past experiments +as feedback for iterative rule improvements. We use decision trees to convey +this reasoning information, as they can be easily represented in natural +language, effectively providing knowledge from prior experiments (i.e., the +impact of the generated features on performance) to the LLMs. Our empirical +results demonstrate that OCTree consistently enhances the performance of +various prediction models across diverse benchmarks, outperforming competing +automated feature engineering methods. Code is available at +https://github.com/jaehyun513/OCTree. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Understanding the Role of Textual Prompts in LLM for Time Series + Forecasting: an Adapter View + + +
+ In the burgeoning domain of Large Language Models (LLMs), there is a growing +interest in applying LLM to time series forecasting, with multiple studies +focused on leveraging textual prompts to further enhance the predictive +prowess. This study aims to understand how and why the integration of textual +prompts into LLM can effectively improve the prediction accuracy of time +series, which is not obvious at first glance, given the significant domain gap +between texts and time series. Our extensive examination leads us to believe +that (a) adding text prompts is roughly equivalent to introducing additional +adapters, and (b) it is the introduction of learnable parameters rather than +textual information that aligns the LLM with the time series forecasting task, +ultimately enhancing prediction accuracy. Inspired by this discovery, we +developed four adapters that explicitly address the gap between LLM and time +series, and further improve the prediction accuracy. Overall, our work +highlights how textual prompts enhance LLM accuracy in time series forecasting +and suggests new avenues for continually improving LLM-based time series +analysis. + +
+
+
+
+
+ + ♻ ☆ A Scalable Training Strategy for Blind Multi-Distribution Noise Removal + + +
+ Despite recent advances, developing general-purpose universal denoising and +artifact-removal networks remains largely an open problem: Given fixed network +weights, one inherently trades off specialization at one task (e.g., removing +Poisson noise) for performance at another (e.g., removing speckle noise). In +addition, training such a network is challenging due to the curse of +dimensionality: As one increases the dimensions of the specification-space +(i.e., the number of parameters needed to describe the noise distribution) the +number of unique specifications one needs to train for grows exponentially. +Uniformly sampling this space will result in a network that does well at very +challenging problem specifications but poorly at easy problem specifications, +where even large errors will have a small effect on the overall mean squared +error. + In this work we propose training denoising networks using an +adaptive-sampling/active-learning strategy. Our work improves upon a recently +proposed universal denoiser training strategy by extending these results to +higher dimensions and by incorporating a polynomial approximation of the true +specification-loss landscape. This approximation allows us to reduce training +times by almost two orders of magnitude. We test our method on simulated joint +Poisson-Gaussian-Speckle noise and demonstrate that with our proposed training +strategy, a single blind, generalist denoiser network can achieve peak +signal-to-noise ratios within a uniform bound of specialized denoiser networks +across a large range of operating conditions. We also capture a small dataset +of images with varying amounts of joint Poisson-Gaussian-Speckle noise and +demonstrate that a universal denoiser trained using our adaptive-sampling +strategy outperforms uniformly trained baselines. + +
+
+ comment: IEEE TIP 2024 +
+
+
+
+
+ + ♻ ☆ Incorporating Arbitrary Matrix Group Equivariance into KANs + + +
+ Kolmogorov-Arnold Networks (KANs) have seen great success in scientific +domains thanks to spline activation functions, becoming an alternative to +Multi-Layer Perceptrons (MLPs). However, spline functions may not respect +symmetry in tasks, which is crucial prior knowledge in machine learning. +Previously, equivariant networks embed symmetry into their architectures, +achieving better performance in specific applications. Among these, Equivariant +Multi-Layer Perceptrons (EMLP) introduce arbitrary matrix group equivariance +into MLPs, providing a general framework for constructing equivariant networks +layer by layer. In this paper, we propose Equivariant Kolmogorov-Arnold +Networks (EKAN), a method for incorporating matrix group equivariance into +KANs, aiming to broaden their applicability to more fields. First, we construct +gated spline basis functions, which form the EKAN layer together with +equivariant linear weights. We then define a lift layer to align the input +space of EKAN with the feature space of the dataset, thereby building the +entire EKAN architecture. Compared with baseline models, EKAN achieves higher +accuracy with smaller datasets or fewer parameters on symmetry-related tasks, +such as particle scattering and the three-body problem, often reducing test MSE +by several orders of magnitude. Even in non-symbolic formula scenarios, such as +top quark tagging with three jet constituents, EKAN achieves comparable results +with EMLP using only $26\%$ of the parameters, while KANs do not outperform +MLPs as expected. + +
+
+
+
+
+ + ♻ ☆ Optimal and Fair Encouragement Policy Evaluation and Learning + + +
+ In consequential domains, it is often impossible to compel individuals to +take treatment, so that optimal policy rules are merely suggestions in the +presence of human non-adherence to treatment recommendations. Under +heterogeneity, covariates may predict take-up of treatment and final outcome, +but differently. While optimal treatment rules optimize causal outcomes across +the population, access parity constraints or other fairness considerations on +who receives treatment can be important. For example, in social services, a +persistent puzzle is the gap in take-up of beneficial services among those who +may benefit from them the most. We study causal identification and robust +estimation of optimal treatment rules, including under potential violations of +positivity. We consider fairness constraints such as demographic parity in +treatment take-up, and other constraints, via constrained optimization. Our +framework can be extended to handle algorithmic recommendations under an +often-reasonable covariate-conditional exclusion restriction, using our +robustness checks for lack of positivity in the recommendation. We develop a +two-stage algorithm for solving over parametrized policy classes under general +constraints to obtain variance-sensitive regret bounds. We illustrate the +methods in three case studies based on data from reminders of SNAP benefits +recertification, randomized encouragement to enroll in insurance, and from +pretrial supervised release with electronic monitoring. While the specific +remedy to inequities in algorithmic allocation is context-specific, it requires +studying both take-up of decisions and downstream outcomes of them. + +
+
+ comment: Updated with major new case study on SNAP recertification benefits +
+
+
+
+
+ + ♻ ☆ DecoR: Deconfounding Time Series with Robust Regression + + +
+ Causal inference on time series data is a challenging problem, especially in +the presence of unobserved confounders. This work focuses on estimating the +causal effect between two time series that are confounded by a third, +unobserved time series. Assuming spectral sparsity of the confounder, we show +how in the frequency domain this problem can be framed as an adversarial +outlier problem. We introduce Deconfounding by Robust regression (DecoR), a +novel approach that estimates the causal effect using robust linear regression +in the frequency domain. Considering two different robust regression +techniques, we first improve existing bounds on the estimation error for such +techniques. Crucially, our results do not require distributional assumptions on +the covariates. We can therefore use them in time series settings. Applying +these results to DecoR, we prove, under suitable assumptions, upper bounds for +the estimation error of DecoR that imply consistency. We demonstrate DecoR's +effectiveness through experiments on both synthetic and real-world data from +Earth system science. The simulation experiments furthermore suggest that DecoR +is robust with respect to model misspecification. + +
+
+ comment: 27 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Towards Empirical Interpretation of Internal Circuits and Properties in + Grokked Transformers on Modular Polynomials + + +
+ Grokking has been actively explored to reveal the mystery of delayed +generalization and identifying interpretable representations and algorithms +inside the grokked models is a suggestive hint to understanding its mechanism. +Grokking on modular addition has been known to implement Fourier representation +and its calculation circuits with trigonometric identities in Transformers. +Considering the periodicity in modular arithmetic, the natural question is to +what extent these explanations and interpretations hold for the grokking on +other modular operations beyond addition. For a closer look, we first +hypothesize that any modular operations can be characterized with distinctive +Fourier representation or internal circuits, grokked models obtain common +features transferable among similar operations, and mixing datasets with +similar operations promotes grokking. Then, we extensively examine them by +learning Transformers on complex modular arithmetic tasks, including +polynomials. Our Fourier analysis and novel progress measure for modular +arithmetic, Fourier Frequency Density and Fourier Coefficient Ratio, +characterize distinctive internal representations of grokked models per modular +operation; for instance, polynomials often result in the superposition of the +Fourier components seen in elementary arithmetic, but clear patterns do not +emerge in challenging non-factorizable polynomials. In contrast, our ablation +study on the pre-grokked models reveals that the transferability among the +models grokked with each operation can be only limited to specific +combinations, such as from elementary arithmetic to linear expressions. +Moreover, some multi-task mixtures may lead to co-grokking -- where grokking +simultaneously happens for all the tasks -- and accelerate generalization, +while others may not find optimal solutions. We provide empirical steps towards +the interpretability of internal circuits. + +
+
+ comment: Published at Transactions on Machine Learning Research (TMLR), Code: + https://github.com/frt03/grok_mod_poly +
+
+
+
+
+ + ♻ ☆ A Fair Loss Function for Network Pruning NeurIPS 2022 + + +
+ Model pruning can enable the deployment of neural networks in environments +with resource constraints. While pruning may have a small effect on the overall +performance of the model, it can exacerbate existing biases into the model such +that subsets of samples see significantly degraded performance. In this paper, +we introduce the performance weighted loss function, a simple modified +cross-entropy loss function that can be used to limit the introduction of +biases during pruning. Experiments using the CelebA, Fitzpatrick17k and +CIFAR-10 datasets demonstrate that the proposed method is a simple and +effective tool that can enable existing pruning methods to be used in fairness +sensitive contexts. Code used to produce all experiments contained in this +paper can be found at https://github.com/robbiemeyer/pw_loss_pruning. + +
+
+ comment: [v1] Trustworthy and Socially Responsible Machine Learning (TSRML + 2022) workshop co-located with NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ A Theoretical Understanding of Self-Correction through In-context + Alignment NeurIPS 2024 + + +
+ Going beyond mimicking limited human experiences, recent studies show initial +evidence that, like humans, large language models (LLMs) are capable of +improving their abilities purely by self-correction, i.e., correcting previous +responses through self-examination, in certain circumstances. Nevertheless, +little is known about how such capabilities arise. In this work, based on a +simplified setup akin to an alignment task, we theoretically analyze +self-correction from an in-context learning perspective, showing that when LLMs +give relatively accurate self-examinations as rewards, they are capable of +refining responses in an in-context way. Notably, going beyond previous +theories on over-simplified linear transformers, our theoretical construction +underpins the roles of several key designs of realistic transformers for +self-correction: softmax attention, multi-head attention, and the MLP block. We +validate these findings extensively on synthetic datasets. Inspired by these +findings, we also illustrate novel applications of self-correction, such as +defending against LLM jailbreaks, where a simple self-correction step does make +a large difference. We believe that these findings will inspire further +research on understanding, exploiting, and enhancing self-correction for +building better foundation models. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SOFTS: Efficient Multivariate Time Series Forecasting with Series-Core + Fusion NeurIPS 2024 + + +
+ Multivariate time series forecasting plays a crucial role in various fields +such as finance, traffic management, energy, and healthcare. Recent studies +have highlighted the advantages of channel independence to resist distribution +drift but neglect channel correlations, limiting further enhancements. Several +methods utilize mechanisms like attention or mixer to address this by capturing +channel correlations, but they either introduce excessive complexity or rely +too heavily on the correlation to achieve satisfactory results under +distribution drifts, particularly with a large number of channels. Addressing +this gap, this paper presents an efficient MLP-based model, the Series-cOre +Fused Time Series forecaster (SOFTS), which incorporates a novel STar +Aggregate-Redistribute (STAR) module. Unlike traditional approaches that manage +channel interactions through distributed structures, \textit{e.g.}, attention, +STAR employs a centralized strategy to improve efficiency and reduce reliance +on the quality of each channel. It aggregates all series to form a global core +representation, which is then dispatched and fused with individual series +representations to facilitate channel interactions effectively. SOFTS achieves +superior performance over existing state-of-the-art methods with only linear +complexity. The broad applicability of the STAR module across different +forecasting models is also demonstrated empirically. For further research and +development, we have made our code publicly available at +https://github.com/Secilia-Cxy/SOFTS. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Physics-informed Machine Learning for Battery Pack Thermal Management + + +
+ With the popularity of electric vehicles, the demand for lithium-ion +batteries is increasing. Temperature significantly influences the performance +and safety of batteries. Battery thermal management systems can effectively +control the temperature of batteries; therefore, the performance and safety can +be ensured. However, the development process of battery thermal management +systems is time-consuming and costly due to the extensive training dataset +needed by data-driven models requiring enormous computational costs for finite +element analysis. Therefore, a new approach to constructing surrogate models is +needed in the era of AI. Physics-informed machine learning enforces the +physical laws in surrogate models, making it the perfect candidate for +estimating battery pack temperature distribution. In this study, we first +developed a 21700 battery pack indirect liquid cooling system with cold plates +on the top and bottom with thermal paste surrounding the battery cells. Then, +the simplified finite element model was built based on experiment results. Due +to the high coolant flow rate, the cold plates can be considered as constant +temperature boundaries, while battery cells are the heat sources. The +physics-informed convolutional neural network served as a surrogate model to +estimate the temperature distribution of the battery pack. The loss function +was constructed considering the heat conduction equation based on the finite +difference method. The physics-informed loss function helped the convergence of +the training process with less data. As a result, the physics-informed +convolutional neural network showed more than 15 percent improvement in +accuracy compared to the data-driven method with the same training data. + +
+
+
+
+
+ + ♻ ☆ Autoregressive Action Sequence Learning for Robotic Manipulation + + +
+ Designing a universal policy architecture that performs well across diverse +robots and task configurations remains a key challenge. In this work, we +address this by representing robot actions as sequential data and generating +actions through autoregressive sequence modeling. Existing autoregressive +architectures generate end-effector waypoints sequentially as word tokens in +language modeling, which are limited to low-frequency control tasks. Unlike +language, robot actions are heterogeneous and often include continuous values +-- such as joint positions, 2D pixel coordinates, and end-effector poses -- +which are not easily suited for language-based modeling. Based on this insight, +we introduce a straightforward enhancement: we extend causal transformers' +single-token prediction to support predicting a variable number of tokens in a +single step through our Chunking Causal Transformer (CCT). This enhancement +enables robust performance across diverse tasks of various control frequencies, +greater efficiency by having fewer autoregression steps, and leads to a hybrid +action sequence design by mixing different types of actions and using a +different chunk size for each action type. Based on CCT, we propose the +Autoregressive Policy (ARP) architecture, which solves manipulation tasks by +generating hybrid action sequences. We evaluate ARP across diverse robotic +manipulation environments, including Push-T, ALOHA, and RLBench, and show that +ARP, as a universal architecture, outperforms the environment-specific +state-of-the-art in all tested benchmarks, while being more efficient in +computation and parameter sizes. Videos of our real robot demonstrations, all +source code and the pretrained models of ARP can be found at +http://github.com/mlzxy/arp. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 146 + +
+
+
+ + ☆ Bi-Mamba: Towards Accurate 1-Bit State Space Models + + +
+ The typical selective state-space model (SSM) of Mamba addresses several +limitations of Transformers, such as quadratic computational complexity with +sequence length and significant inference-time memory requirements due to the +key-value cache. However, the growing size of Mamba models continues to pose +training and deployment challenges and raises environmental concerns due to +considerable energy consumption. In this work, we introduce Bi-Mamba, a +scalable and powerful 1-bit Mamba architecture designed for more efficient +large language models with multiple sizes across 780M, 1.3B, and 2.7B. Bi-Mamba +models are trained from scratch on data volume as regular LLM pretraining using +an autoregressive distillation loss. Extensive experimental results on language +modeling demonstrate that Bi-Mamba achieves performance comparable to its +full-precision counterparts (e.g., FP16 or BF16) and much better accuracy than +post-training-binarization (PTB) Mamba baselines, while significantly reducing +memory footprint and energy consumption compared to the original Mamba model. +Our study pioneers a new linear computational complexity LLM framework under +low-bit representation and facilitates the future design of specialized +hardware tailored for efficient 1-bit Mamba-based LLMs. + +
+
+
+
+
+ + ☆ LightFFDNets: Lightweight Convolutional Neural Networks for Rapid Facial + Forgery Detection + + +
+ Accurate and fast recognition of forgeries is an issue of great importance in +the fields of artificial intelligence, image processing and object detection. +Recognition of forgeries of facial imagery is the process of classifying and +defining the faces in it by analyzing real-world facial images. This process is +usually accomplished by extracting features from an image, using classifier +algorithms, and correctly interpreting the results. Recognizing forgeries of +facial imagery correctly can encounter many different challenges. For example, +factors such as changing lighting conditions, viewing faces from different +angles can affect recognition performance, and background complexity and +perspective changes in facial images can make accurate recognition difficult. +Despite these difficulties, significant progress has been made in the field of +forgery detection. Deep learning algorithms, especially Convolutional Neural +Networks (CNNs), have significantly improved forgery detection performance. + This study focuses on image processing-based forgery detection using +Fake-Vs-Real-Faces (Hard) [10] and 140k Real and Fake Faces [61] data sets. +Both data sets consist of two classes containing real and fake facial images. +In our study, two lightweight deep learning models are proposed to conduct +forgery detection using these images. Additionally, 8 different pretrained CNN +architectures were tested on both data sets and the results were compared with +newly developed lightweight CNN models. It's shown that the proposed +lightweight deep learning models have minimum number of layers. It's also shown +that the proposed lightweight deep learning models detect forgeries of facial +imagery accurately, and computationally efficiently. Although the data set +consists only of face images, the developed models can also be used in other +two-class object recognition problems. + +
+
+ comment: 13 pages, 6 figures, 10 tables +
+
+
+
+
+ + ☆ Edge-Enhanced Dilated Residual Attention Network for Multimodal Medical + Image Fusion + + +
+ Multimodal medical image fusion is a crucial task that combines complementary +information from different imaging modalities into a unified representation, +thereby enhancing diagnostic accuracy and treatment planning. While deep +learning methods, particularly Convolutional Neural Networks (CNNs) and +Transformers, have significantly advanced fusion performance, some of the +existing CNN-based methods fall short in capturing fine-grained multiscale and +edge features, leading to suboptimal feature integration. Transformer-based +models, on the other hand, are computationally intensive in both the training +and fusion stages, making them impractical for real-time clinical use. +Moreover, the clinical application of fused images remains unexplored. In this +paper, we propose a novel CNN-based architecture that addresses these +limitations by introducing a Dilated Residual Attention Network Module for +effective multiscale feature extraction, coupled with a gradient operator to +enhance edge detail learning. To ensure fast and efficient fusion, we present a +parameter-free fusion strategy based on the weighted nuclear norm of softmax, +which requires no additional computations during training or inference. +Extensive experiments, including a downstream brain tumor classification task, +demonstrate that our approach outperforms various baseline methods in terms of +visual quality, texture preservation, and fusion speed, making it a possible +practical solution for real-world clinical applications. The code will be +released at https://github.com/simonZhou86/en_dran. + +
+
+ comment: An extended version of the paper accepted at IEEE BIBM 2024 +
+
+
+
+
+ + ☆ Exploring adversarial robustness of JPEG AI: methodology, comparison and + new methods + + +
+ Adversarial robustness of neural networks is an increasingly important area +of research, combining studies on computer vision models, large language models +(LLMs), and others. With the release of JPEG AI - the first standard for +end-to-end neural image compression (NIC) methods - the question of its +robustness has become critically significant. JPEG AI is among the first +international, real-world applications of neural-network-based models to be +embedded in consumer devices. However, research on NIC robustness has been +limited to open-source codecs and a narrow range of attacks. This paper +proposes a new methodology for measuring NIC robustness to adversarial attacks. +We present the first large-scale evaluation of JPEG AI's robustness, comparing +it with other NIC models. Our evaluation results and code are publicly +available online (link is hidden for a blind review). + +
+
+
+
+
+ + ☆ Exploring the Requirements of Clinicians for Explainable AI Decision + Support Systems in Intensive Care + + +
+ There is a growing need to understand how digital systems can support +clinical decision-making, particularly as artificial intelligence (AI) models +become increasingly complex and less human-interpretable. This complexity +raises concerns about trustworthiness, impacting safe and effective adoption of +such technologies. Improved understanding of decision-making processes and +requirements for explanations coming from decision support tools is a vital +component in providing effective explainable solutions. This is particularly +relevant in the data-intensive, fast-paced environments of intensive care units +(ICUs). To explore these issues, group interviews were conducted with seven ICU +clinicians, representing various roles and experience levels. Thematic analysis +revealed three core themes: (T1) ICU decision-making relies on a wide range of +factors, (T2) the complexity of patient state is challenging for shared +decision-making, and (T3) requirements and capabilities of AI decision support +systems. We include design recommendations from clinical input, providing +insights to inform future AI systems for intensive care. + +
+
+
+
+
+ + ☆ CNMBert: A Model For Hanyu Pinyin Abbreviation to Character Conversion + Task + + +
+ The task of converting Hanyu Pinyin abbreviations to Chinese characters +represents a significant branch within the domain of Chinese Spelling +Correction (CSC). This task is typically one of text-length alignment; however, +due to the limited informational content in pinyin abbreviations, achieving +accurate conversion is challenging. In this paper, we propose CNMBert, which +stands for zh-CN Pinyin Multi-mask Bert Model, as a solution to this issue. +CNMBert surpasses few-shot GPT models, achieving a 59.63% MRR on a +10,424-sample Hanyu Pinyin abbreviation test dataset. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ AdaptLIL: A Gaze-Adaptive Visualization for Ontology Mapping + + +
+ This paper showcases AdaptLIL, a real-time adaptive link-indented list +ontology mapping visualization that uses eye gaze as the primary input source. +Through a multimodal combination of real-time systems, deep learning, and web +development applications, this system uniquely curtails graphical overlays +(adaptations) to pairwise mappings of link-indented list ontology +visualizations for individual users based solely on their eye gaze. + +
+
+
+
+
+ + ☆ The Power of Many: Multi-Agent Multimodal Models for Cultural Image + Captioning + + +
+ Large Multimodal Models (LMMs) exhibit impressive performance across various +multimodal tasks. However, their effectiveness in cross-cultural contexts +remains limited due to the predominantly Western-centric nature of most data +and models. Conversely, multi-agent models have shown significant capability in +solving complex tasks. Our study evaluates the collective performance of LMMs +in a multi-agent interaction setting for the novel task of cultural image +captioning. Our contributions are as follows: (1) We introduce MosAIC, a +Multi-Agent framework to enhance cross-cultural Image Captioning using LMMs +with distinct cultural personas; (2) We provide a dataset of culturally +enriched image captions in English for images from China, India, and Romania +across three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable +metric for evaluating cultural information within image captions; and (4) We +show that the multi-agent interaction outperforms single-agent models across +different metrics, and offer valuable insights for future research. Our dataset +and models can be accessed at https://github.com/MichiganNLP/MosAIC. + +
+
+
+
+
+ + ☆ QARM: Quantitative Alignment Multi-Modal Recommendation at Kuaishou + + +
+ In recent years, with the significant evolution of multi-modal large models, +many recommender researchers realized the potential of multi-modal information +for user interest modeling. In industry, a widely used modeling architecture is a +cascading paradigm: (1) first pre-training a multi-modal model to provide +omnipotent representations for downstream services; (2) The downstream +recommendation model takes the multi-modal representation as additional input +to fit real user-item behaviours. Although such a paradigm achieves remarkable +improvements, there still exist two problems that limit model +performance: (1) Representation Unmatching: The pre-trained multi-modal model +is always supervised by the classic NLP/CV tasks, while the recommendation +models are supervised by real user-item interaction. As a result, the two +fundamentally different tasks' goals were relatively separate, and there was a +lack of consistent objective on their representations; (2) Representation +Unlearning: The generated multi-modal representations are always stored in +cache store and serve as extra fixed input of recommendation model, thus could +not be updated by recommendation model gradient, further unfriendly for +downstream training. Motivated by these two challenges in downstream +task usage, we introduce a quantitative multi-modal framework to customize the +specialized and trainable multi-modal information for different downstream +models. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ WoodYOLO: A Novel Object Detector for Wood Species Detection in + Microscopic Images + + +
+ Wood species identification plays a crucial role in various industries, from +ensuring the legality of timber products to advancing ecological conservation +efforts. This paper introduces WoodYOLO, a novel object detection algorithm +specifically designed for microscopic wood fiber analysis. Our approach adapts +the YOLO architecture to address the challenges posed by large, high-resolution +microscopy images and the need for high recall in localization of the cell type +of interest (vessel elements). Our results show that WoodYOLO significantly +outperforms state-of-the-art models, achieving performance gains of 12.9% and +6.5% in F2 score over YOLOv10 and YOLOv7, respectively. This improvement in +automated wood cell type localization capabilities contributes to enhancing +regulatory compliance, supporting sustainable forestry practices, and promoting +biodiversity conservation efforts globally. + +
+
+
+
+
+ + ☆ Moral Persuasion in Large Language Models: Evaluating Susceptibility and + Ethical Alignment + + +
+ We explore how large language models (LLMs) can be influenced by prompting +them to alter their initial decisions and align them with established ethical +frameworks. Our study is based on two experiments designed to assess the +susceptibility of LLMs to moral persuasion. In the first experiment, we examine +the susceptibility to moral ambiguity by evaluating a Base Agent LLM on morally +ambiguous scenarios and observing how a Persuader Agent attempts to modify the +Base Agent's initial decisions. The second experiment evaluates the +susceptibility of LLMs to align with predefined ethical frameworks by prompting +them to adopt specific value alignments rooted in established philosophical +theories. The results demonstrate that LLMs can indeed be persuaded in morally +charged scenarios, with the success of persuasion depending on factors such as +the model used, the complexity of the scenario, and the conversation length. +Notably, LLMs of distinct sizes but from the same company produced markedly +different outcomes, highlighting the variability in their susceptibility to +ethical persuasion. + +
+
+
+
+
+ + ☆ Lifted Model Construction without Normalisation: A Vectorised Approach + to Exploit Symmetries in Factor Graphs + + +
+ Lifted probabilistic inference exploits symmetries in a probabilistic model +to allow for tractable probabilistic inference with respect to domain sizes of +logical variables. We found that the current state-of-the-art algorithm to +construct a lifted representation in form of a parametric factor graph misses +symmetries between factors that are exchangeable but scaled differently, +thereby leading to a less compact representation. In this paper, we propose a +generalisation of the advanced colour passing (ACP) algorithm, which is the +state of the art to construct a parametric factor graph. Our proposed algorithm +allows for potentials of factors to be scaled arbitrarily and efficiently +detects more symmetries than the original ACP algorithm. By detecting strictly +more symmetries than ACP, our algorithm significantly reduces online query +times for probabilistic inference when the resulting model is applied, which we +also confirm in our experiments. + +
+
+ comment: Accepted to the Proceedings of the 3rd Learning on Graphs Conference + (LoG 2024) +
+
+
+
+
+ + ☆ Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via + Skill Library and Tactile Representation + + +
+ Deploying robots in open-world environments involves complex tasks +characterized by long sequences and rich interactions, necessitating efficient +transfer of robotic skills across diverse and complex scenarios. To address +this challenge, we propose a skill library framework based on knowledge graphs, +which endows robots with high-level skill awareness and spatial semantic +understanding. The framework hierarchically organizes operational knowledge by +constructing a "task graph" and a "scene graph" to represent task and scene +semantic information, respectively. We introduce a "state graph" to facilitate +interaction between high-level task planning and low-level scene information. +Furthermore, we propose a hierarchical transfer framework for operational +skills. At the task level, the framework integrates contextual learning and +chain-of-thought prompting within a four-stage prompt paradigm, leveraging +large language models' (LLMs) reasoning and generalization capabilities to +achieve task-level subtask sequence transfer. At the motion level, an adaptive +trajectory transfer method is developed using the A* algorithm and the skill +library, enabling motion-level adaptive trajectory transfer. At the physical +level, we introduce an adaptive contour extraction and posture perception +method based on tactile perception. This method dynamically obtains +high-precision contour and posture information from visual-tactile texture data +and adjusts transferred skills, such as contact positions and postures, to +ensure effectiveness in new environments. Experimental results validate the +effectiveness of the proposed methods. Project +website: https://github.com/MingchaoQi/skill_transfer + +
+
+
+
+
+ + ☆ FedCoLLM: A Parameter-Efficient Federated Co-tuning Framework for Large + and Small Language Models + + +
+ By adapting Large Language Models (LLMs) to domain-specific tasks or +enriching them with domain-specific knowledge, we can fully harness the +capabilities of LLMs. Nonetheless, a gap persists in achieving simultaneous +mutual enhancement between the server's LLM and the downstream clients' Small +Language Models (SLMs). To address this, we propose FedCoLLM, a novel and +parameter-efficient federated framework designed for co-tuning LLMs and SLMs. +This approach is aimed at adaptively transferring server-side LLMs' knowledge to +clients' SLMs while simultaneously enriching the LLMs with domain insights from +the clients. To accomplish this, FedCoLLM utilizes lightweight adapters in +conjunction with SLMs, facilitating knowledge exchange between server and +clients in a manner that respects data privacy while also minimizing +computational and communication overhead. Our evaluation of FedCoLLM, utilizing +various public LLMs and SLMs across a range of NLP text generation tasks, +reveals that the performance of clients' SLMs experiences notable improvements +with the assistance of the LLMs. Simultaneously, the LLMs enhanced via FedCoLLM +achieve comparable performance to that obtained through direct fine-tuning on +clients' data. + +
+
+
+
+
+ + ☆ MC-LLaVA: Multi-Concept Personalized Vision-Language Model + + +
+ Current vision-language models (VLMs) show exceptional abilities across +diverse tasks including visual question answering. To enhance user experience +in practical applications, recent studies investigate VLM personalization to +understand user-provided concepts. However, existing studies mainly focus on +single-concept personalization, neglecting the existence and interplay of +multiple concepts, which limits the real-world applicability of personalized +VLMs. In this paper, we propose the first multi-concept personalization method +named MC-LLaVA along with a high-quality multi-concept personalization dataset. +Specifically, MC-LLaVA uses a joint training strategy incorporating multiple +concepts in a single training step, allowing VLMs to perform accurately in +multi-concept personalization. To reduce the cost of joint training, MC-LLaVA +leverages visual token information for concept token initialization, yielding +improved concept representation and accelerating joint training. To advance +multi-concept personalization research, we further contribute a high-quality +dataset. We carefully collect images from various movies that contain multiple +characters and manually generate the multi-concept question-answer samples. Our +dataset features diverse movie types and question-answer types. We conduct +comprehensive qualitative and quantitative experiments to demonstrate that +MC-LLaVA can achieve impressive multi-concept personalized responses, paving +the way for VLMs to become better user-specific assistants. The code and +dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA. + +
+
+
+
+
+ + ☆ Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search + + +
+ Recently, test-time scaling has garnered significant attention from the +research community, largely due to the substantial advancements of the o1 model +released by OpenAI. By allocating more computational resources during the +inference phase, large language models (LLMs) can extensively explore the +solution space by generating more thought tokens or diverse solutions, thereby +producing more accurate responses. However, developing an o1-like reasoning +approach is challenging, and researchers have been making various attempts to +advance this open area of research. In this paper, we present a preliminary +exploration into enhancing the reasoning abilities of LLMs through +reward-guided tree search algorithms. This framework is implemented by +integrating the policy model, reward model, and search algorithm. It is +primarily constructed around a tree search algorithm, where the policy model +navigates a dynamically expanding tree guided by a specially trained reward +model. We thoroughly explore various design considerations necessary for +implementing this framework and provide a detailed report of the technical +aspects. To assess the effectiveness of our approach, we focus on mathematical +reasoning tasks and conduct extensive evaluations on four challenging datasets, +significantly enhancing the reasoning abilities of LLMs. + +
+
+ comment: LLM;Complex Reasoning;Math +
+
+
+
+
+ + ☆ Conceptwm: A Diffusion Model Watermark for Concept Protection + + +
+ The personalization techniques of diffusion models succeed in generating +specific concepts but also pose threats to copyright protection and illegal +use. Model Watermarking is an effective method to prevent the unauthorized use +of subject-driven or style-driven image generation, safeguarding concept +copyrights. However, under the goal of concept-oriented protection, current +watermarking schemes typically add watermarks to all images rather than +applying them in a refined manner targeted at specific concepts. Additionally, +the personalization techniques of diffusion models can easily remove +watermarks. Existing watermarking methods struggle to achieve fine-grained +watermark embedding with a few images of specific concept and prevent removal +of watermarks through personalized fine-tuning. Therefore, we introduce a novel +concept-oriented watermarking framework that seamlessly embeds imperceptible +watermarks into the concept of diffusion models. We conduct extensive +experiments and ablation studies to verify our framework. Our code is available +at https://anonymous.4open.science/r/Conceptwm-4EB3/. + +
+
+
+
+
+ + ☆ TrojanRobot: Backdoor Attacks Against Robotic Manipulation in the + Physical World + + +
+ Robotic manipulation refers to the autonomous handling and interaction of +robots with objects using advanced techniques in robotics and artificial +intelligence. The advent of powerful tools such as large language models (LLMs) +and large vision-language models (LVLMs) has significantly enhanced the +capabilities of these robots in environmental perception and decision-making. +However, the introduction of these intelligent agents has led to security +threats such as jailbreak attacks and adversarial attacks. + In this research, we take a further step by proposing a backdoor attack +specifically targeting robotic manipulation and, for the first time, +implementing backdoor attack in the physical world. By embedding a backdoor +visual language model into the visual perception module within the robotic +system, we successfully mislead the robotic arm's operation in the physical +world, given the presence of common items as triggers. Experimental evaluations +in the physical world demonstrate the effectiveness of the proposed backdoor +attack. + +
+
+ comment: Initial version with preliminary results. We welcome any feedback or + suggestions +
+
+
+
+
+ + ☆ PSPO*: An Effective Process-supervised Policy Optimization for Reasoning + Alignment + + +
+ Process supervision enhances the performance of large language models in +reasoning tasks by providing feedback at each step of chain-of-thought +reasoning. However, due to the lack of effective process supervision methods, +even advanced large language models are prone to logical errors and redundant +reasoning. We claim that the effectiveness of process supervision significantly +depends on both the accuracy and the length of reasoning chains. Moreover, we +identify that these factors exhibit a nonlinear relationship with the overall +reward score of the reasoning process. Inspired by these insights, we propose a +novel process supervision paradigm, PSPO*, which systematically outlines the +workflow from reward model training to policy optimization, and highlights the +importance of nonlinear rewards in process supervision. Based on PSPO*, we +develop the PSPO-WRS, which considers the number of reasoning steps in +determining reward scores and utilizes an adjusted Weibull distribution for +nonlinear reward shaping. Experimental results on six mathematical reasoning +datasets demonstrate that PSPO-WRS consistently outperforms current mainstream +models. + +
+
+
+
+
+ + ☆ Artificial Scientific Discovery + + +
+ Rooted in the explosion of deep learning over the past decade, this thesis +spans from AlphaGo to ChatGPT to empirically examine the fundamental concepts +needed to realize the vision of an artificial scientist: a machine with the +capacity to autonomously generate original research and contribute to the +expansion of human knowledge. The investigation begins with Olivaw, an +AlphaGo Zero-like agent that discovers Othello knowledge from scratch but is +unable to communicate it. This realization leads to the development of the +Explanatory Learning (EL) framework, a formalization of the problem faced by a +scientist when trying to explain a new phenomenon to their peers. The effective +EL prescriptions allow us to crack Zendo, a board game simulating the +scientific endeavor. This success comes with a fundamental insight: an +artificial scientist must develop its own interpretation of the language used +to explain its findings. This perspective then leads us to see modern +multimodal models as interpreters, and to devise a new way to build +interpretable and cost-effective CLIP-like models: by coupling two unimodal +models using little multimodal data and no further training. Finally, we +discuss what ChatGPT and its siblings are still missing to become artificial +scientists, and introduce Odeen, a benchmark about interpreting explanations +that sees LLMs going no further than random chance while being instead fully +solved by humans. + +
+
+ comment: PhD thesis, 123 pages +
+
+
+
+
+ + ☆ Dissecting Misalignment of Multimodal Large Language Models via + Influence Function + + +
+ Multi-modal Large Language models (MLLMs) are always trained on data from +diverse and unreliable sources, which may contain misaligned or mislabeled +text-image pairs. This frequently causes robustness issues and hallucinations, +leading to performance degradation. Data valuation is an efficient way to +detect and trace these misalignments. Nevertheless, existing methods are +computationally expensive for MLLMs. While computationally efficient, the +classical influence functions are inadequate for contrastive learning models +because they were originally designed for pointwise loss. Additionally, +contrastive learning involves minimizing the distance between the modalities of +positive samples and maximizing the distance between the modalities of negative +samples. This requires us to evaluate the influence of samples from both +perspectives. To tackle these challenges, we introduce the Extended Influence +Function for Contrastive Loss (ECIF), an influence function crafted for +contrastive loss. ECIF considers both positive and negative samples and +provides a closed-form approximation of contrastive learning models, +eliminating the need for retraining. Building upon ECIF, we develop a series of +algorithms for data evaluation in MLLM, misalignment detection, and +misprediction trace-back tasks. Experimental results demonstrate our ECIF +advances the transparency and interpretability of MLLMs by offering a more +accurate assessment of data impact and model alignment compared to traditional +baseline methods. + +
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ No-regret Exploration in Shuffle Private Reinforcement Learning + + +
+ Differential privacy (DP) has recently been introduced into episodic +reinforcement learning (RL) to formally address user privacy concerns in +personalized services. Previous work mainly focuses on two trust models of DP: +the central model, where a central agent is responsible for protecting users' +sensitive data, and the (stronger) local model, where the protection occurs +directly on the user side. However, they either require a trusted central agent +or incur a significantly higher privacy cost, making them unsuitable for many +scenarios. This work introduces a trust model stronger than the central model +but with a lower privacy cost than the local model, leveraging the emerging +shuffle model of privacy. We present the first generic algorithm for +episodic RL under the shuffle model, where a trusted shuffler randomly permutes +a batch of users' data before sending it to the central agent. We then +instantiate the algorithm using our proposed shuffle Privatizer, relying on a +shuffle private binary summation mechanism. Our analysis shows that the +algorithm achieves a near-optimal regret bound comparable to that of the +centralized model and significantly outperforms the local model in terms of +privacy cost. + +
+
+
+
+
+ + ☆ TSINR: Capturing Temporal Continuity via Implicit Neural Representations + for Time Series Anomaly Detection + + +
+ Time series anomaly detection aims to identify unusual patterns in data or +deviations from systems' expected behavior. The reconstruction-based methods +are the mainstream in this task, which learn point-wise representation via +unsupervised learning. However, the unlabeled anomaly points in training data +may cause these reconstruction-based methods to learn and reconstruct anomalous +data, resulting in the challenge of capturing normal patterns. In this paper, +we propose a time series anomaly detection method based on implicit neural +representation (INR) reconstruction, named TSINR, to address this challenge. +Due to the property of spectral bias, TSINR enables prioritizing low-frequency +signals and exhibiting poorer performance on high-frequency abnormal data. +Specifically, we adopt INR to parameterize time series data as a continuous +function and employ a transformer-based architecture to predict the INR of +given data. As a result, the proposed TSINR method achieves the advantage of +capturing the temporal continuity and thus is more sensitive to discontinuous +anomaly data. In addition, we further design a novel form of INR continuous +function to learn inter- and intra-channel information, and leverage a +pre-trained large language model to amplify the intense fluctuations in +anomalies. Extensive experiments demonstrate that TSINR achieves superior +overall performance on both univariate and multivariate time series anomaly +detection benchmarks compared to other state-of-the-art reconstruction-based +methods. Our codes are available. + +
+
+ comment: Accepted by SIGKDD 2025 +
+
+
+
+
+ + ☆ SP${ }^3$ : Superpixel-propagated pseudo-label learning for weakly + semi-supervised medical image segmentation + + +
+ Deep learning-based medical image segmentation helps assist diagnosis and +accelerate the treatment process while the model training usually requires +large-scale dense annotation datasets. Weakly semi-supervised medical image +segmentation is an essential application because it only requires a small +amount of scribbles and a large number of unlabeled data to train the model, +which greatly reduces the clinician's effort to fully annotate images. To +handle the inadequate supervisory information challenge in weakly +semi-supervised segmentation (WSSS), a SuperPixel-Propagated Pseudo-label +(SP${}^3$) learning method is proposed, using the structural information +contained in superpixel for supplemental information. Specifically, the +annotation of scribbles is propagated to superpixels and thus obtains a dense +annotation for supervised training. Since the quality of pseudo-labels is +limited by the low-quality annotation, the beneficial superpixels selected by +dynamic thresholding are used to refine pseudo-labels. Furthermore, aiming to +alleviate the negative impact of noise in pseudo-label, superpixel-level +uncertainty is incorporated to guide the pseudo-label supervision for stable +learning. Our method achieves state-of-the-art performance on both tumor and +organ segmentation datasets under the WSSS setting, using only 3% of the +annotation workload compared to fully supervised methods and attaining +approximately 80% Dice score. Additionally, our method outperforms eight +weakly and semi-supervised methods under both weakly supervised and +semi-supervised settings. Results of extensive experiments validate the +effectiveness and annotation efficiency of our weakly semi-supervised +segmentation, which can assist clinicians in achieving automated segmentation +for organs or tumors quickly and ultimately benefit patients. + +
+
+ comment: 10 pages, 7 figures. Under Review +
+
+
+
+
+ + ☆ Chapter 7 Review of Data-Driven Generative AI Models for Knowledge + Extraction from Scientific Literature in Healthcare + + +
+ This review examines the development of abstractive NLP-based text +summarization approaches and compares them to existing techniques for +extractive summarization. A brief history of text summarization from the 1950s +to the introduction of pre-trained language models such as Bidirectional +Encoder Representations from Transformer (BERT) and Generative Pre-training +Transformers (GPT) is presented. In total, 60 studies were identified in +PubMed and Web of Science, of which 29 were excluded and 24 were read and +evaluated for eligibility, resulting in the use of seven studies for further +analysis. This chapter also includes a section with examples including an +example of a comparison between GPT-3 and state-of-the-art GPT-4 solutions in +scientific text summarisation. Natural language processing has not yet reached +its full potential in the generation of brief textual summaries. As there are +acknowledged concerns that must be addressed, we can expect gradual +introduction of such models in practice. + +
+
+ comment: 16 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ ST-Tree with Interpretability for Multivariate Time Series + Classification + + +
+ Multivariate time series classification is of great importance in practical +applications and is a challenging task. However, deep neural network models +such as Transformers exhibit high accuracy in multivariate time series +classification but lack interpretability and fail to provide insights into the +decision-making process. On the other hand, traditional approaches based on +decision tree classifiers offer clear decision processes but relatively lower +accuracy. Swin Transformer (ST) addresses these issues by leveraging +self-attention mechanisms to capture both fine-grained local patterns and +global patterns. It can also model multi-scale feature representation learning, +thereby providing a more comprehensive representation of time series features. +To tackle the aforementioned challenges, we propose ST-Tree with +interpretability for multivariate time series classification. Specifically, the +ST-Tree model combines ST as the backbone network with an additional neural +tree model. This integration allows us to fully leverage the advantages of ST +in learning time series context while providing interpretable decision +processes through the neural tree. This enables researchers to gain clear +insights into the model's decision-making process and extract meaningful +interpretations. Through experimental evaluations on 10 UEA datasets, we +demonstrate that the ST-Tree model improves accuracy in multivariate time +series classification tasks and provides interpretability through visualizing +the decision-making process across different datasets. + +
+
+ comment: Submitted on May 15, 2024, major revisions on Aug 31, 2024 +
+
+
+
+
+ + ☆ Signaling and Social Learning in Swarms of Robots + + +
+ This paper investigates the role of communication in improving coordination +within robot swarms, focusing on a paradigm where learning and execution occur +simultaneously in a decentralized manner. We highlight the role communication +can play in addressing the credit assignment problem (individual contribution +to the overall performance), and how it can be influenced by it. We propose a +taxonomy of existing and future works on communication, focusing on information +selection and physical abstraction as principal axes for classification: from +low-level lossless compression with raw signal extraction and processing to +high-level lossy compression with structured communication models. The paper +reviews current research from evolutionary robotics, multi-agent (deep) +reinforcement learning, language models, and biophysics models to outline the +challenges and opportunities of communication in a collective of robots that +continuously learn from one another through local message exchanges, +illustrating a form of social learning. + +
+
+ comment: 17 pages, 3 Figures +
+
+
+
+
+ + ☆ Hybrid Data-Driven SSM for Interpretable and Label-Free mmWave Channel + Prediction + + +
+ Accurate prediction of mmWave time-varying channels is essential for +mitigating the issue of channel aging in complex scenarios owing to high user +mobility. Existing channel prediction methods have limitations: classical +model-based methods often struggle to track highly nonlinear channel dynamics +due to limited expert knowledge, while emerging data-driven methods typically +require substantial labeled data for effective training and often lack +interpretability. To address these issues, this paper proposes a novel hybrid +method that integrates a data-driven neural network into a conventional +model-based workflow based on a state-space model (SSM), implicitly tracking +complex channel dynamics from data without requiring precise expert knowledge. +Additionally, a novel unsupervised learning strategy is developed to train the +embedded neural network solely with unlabeled data. Theoretical analyses and +ablation studies are conducted to interpret the enhanced benefits gained from +the hybrid integration. Numerical simulations based on the 3GPP mmWave channel +model corroborate the superior prediction accuracy of the proposed method, +compared to state-of-the-art methods that are either purely model-based or +data-driven. Furthermore, extensive experiments validate its robustness against +various challenging factors, including among others severe channel variations +and high noise levels. + +
+
+
+
+
+ + ☆ Topology-aware Preemptive Scheduling for Co-located LLM Workloads + + +
+ Hosting diverse large language model workloads in a unified resource pool +through co-location is cost-effective. For example, long-running chat services +generally follow diurnal traffic patterns, which inspire co-location of batch +jobs to fulfill resource valleys between successive peaks, and thus to saturate +resource allocation in cluster-wide scope. These heterogeneous workloads often +have different business priorities, and therefore preemption can be leveraged +for resource elasticity. However, workloads often have distinct topology +preferences as well. The resources released by lower-priority instances may +fail to meet the requirements of high-priority online services which are +usually latency-sensitive. The root cause behind such mismatch is a lack of +topology awareness in the resource scheduler, especially during preemption. To +bridge this gap, we develop a fine-grained topology-aware method for preemptive +scheduling of hybrid workloads. The method ensures that the resources freed by +preempted tasks adhere to the topological affinity needs of high-priority +preemptors in a guaranteed or best-effort manner. This dynamic alignment +significantly increases the efficiency of preemption and improves overall +scheduled performance for LLM workloads by $55\%$. + +
+
+ comment: 17 Pages, 11 Figures, 5 Tables +
+
+
+
+
+ + ☆ Real-Time Fitness Exercise Classification and Counting from Video Frames + + +
+ This paper introduces a novel method for real-time exercise classification +using a Bidirectional Long Short-Term Memory (BiLSTM) neural network. Existing +exercise recognition approaches often rely on synthetic datasets, raw +coordinate inputs sensitive to user and camera variations, and fail to fully +exploit the temporal dependencies in exercise movements. These issues limit +their generalizability and robustness in real-world conditions, where lighting, +camera angles, and user body types vary. + To address these challenges, we propose a BiLSTM-based model that leverages +invariant features, such as joint angles, alongside raw coordinates. By using +both angles and (x, y, z) coordinates, the model adapts to changes in +perspective, user positioning, and body differences, improving generalization. +Training on 30-frame sequences enables the BiLSTM to capture the temporal +context of exercises and recognize patterns evolving over time. + We compiled a dataset combining synthetic data from the InfiniteRep dataset +and real-world videos from Kaggle and other sources. This dataset includes four +common exercises: squat, push-up, shoulder press, and bicep curl. The model was +trained and validated on these diverse datasets, achieving an accuracy of over +99% on the test set. To assess generalizability, the model was tested on 2 +separate test sets representative of typical usage conditions. Comparisons with +the previous approach from the literature are presented in the results section, +showing that the proposed model is the best-performing one. + The classifier is integrated into a web application providing real-time +exercise classification and repetition counting without manual exercise +selection. + Demo and datasets are available at the following GitHub Repository: +https://github.com/RiccardoRiccio/Fitness-AI-Trainer-With-Automatic-Exercise-Recognition-and-Counting. + +
+
+
+
+
+ + ☆ Enhancing Vision-Language Model Safety through Progressive + Concept-Bottleneck-Driven Alignment + + +
+ Benefiting from the powerful capabilities of Large Language Models (LLMs), +pre-trained visual encoder models connected to LLMs form Vision Language Models +(VLMs). However, recent research shows that the visual modality in VLMs is +highly vulnerable, allowing attackers to bypass safety alignment in LLMs +through visually transmitted content, launching harmful attacks. To address +this challenge, we propose a progressive concept-based alignment strategy, +PSA-VLM, which incorporates safety modules as concept bottlenecks to enhance +visual modality safety alignment. By aligning model predictions with specific +safety concepts, we improve defenses against risky images, enhancing +explainability and controllability while minimally impacting general +performance. Our method is obtained through two-stage training. The low +computational cost of the first stage brings very effective performance +improvement, and the fine-tuning of the language model in the second stage +further improves the safety performance. Our method achieves state-of-the-art +results on popular VLM safety benchmarks. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.13581 +
+
+
+
+
+ + ☆ Addressing Hallucinations in Language Models with Knowledge Graph + Embeddings as an Additional Modality + + +
+ In this paper we present an approach to reduce hallucinations in Large +Language Models (LLMs) by incorporating Knowledge Graphs (KGs) as an additional +modality. Our method involves transforming input text into a set of KG +embeddings and using an adapter to integrate these embeddings into the language +model space, without relying on external retrieval processes. + To facilitate this, we created WikiEntities, a dataset containing over 3 +million Wikipedia texts annotated with entities from Wikidata and their +corresponding embeddings from PyTorch-BigGraph. This dataset serves as a +valuable resource for training Entity Linking models and adapting the described +method to various LLMs using specialized adapters. + Our method does not require fine-tuning of the language models themselves; +instead, we only train the adapter. This ensures that the model's performance +on other tasks is not affected. We trained an adapter for the Mistral 7B, LLaMA +2-7B (chat), and LLaMA 3-8B (instruct) models using this dataset and +demonstrated that our approach improves performance on the HaluEval, True-False +benchmarks and FEVER dataset. The results indicate that incorporating KGs as a +new modality can effectively reduce hallucinations and improve the factual +accuracy of language models, all without the need for external retrieval. + +
+
+
+
+
+ + ☆ A Pre-Trained Graph-Based Model for Adaptive Sequencing of Educational + Documents NeurIPS 2024 + + +
+ Massive Open Online Courses (MOOCs) have greatly contributed to making +education more accessible. However, many MOOCs maintain a rigid, +one-size-fits-all structure that fails to address the diverse needs and +backgrounds of individual learners. Learning path personalization aims to +address this limitation, by tailoring sequences of educational content to +optimize individual student learning outcomes. Existing approaches, however, +often require either massive student interaction data or extensive expert +annotation, limiting their broad application. In this study, we introduce a +novel data-efficient framework for learning path personalization that operates +without expert annotation. Our method employs a flexible recommender system +pre-trained with reinforcement learning on a dataset of raw course +materials. Through experiments on semi-synthetic data, we show that this +pre-training stage substantially improves data-efficiency in a range of +adaptive learning scenarios featuring new educational materials. This opens up +new perspectives for the design of foundation models for adaptive learning. + +
+
+ comment: NeurIPS 2024 Workshop on Large Foundation Models for Educational + Assessment (FM-Assess), Dec 2024, Vancouver, Canada +
+
+
+
+
+ + ☆ Structure learning with Temporal Gaussian Mixture for model-based + Reinforcement Learning + + +
+ Model-based reinforcement learning refers to a set of approaches capable of +sample-efficient decision making, which create an explicit model of the +environment. This model can subsequently be used for learning optimal policies. +In this paper, we propose a temporal Gaussian Mixture Model composed of a +perception model and a transition model. The perception model extracts discrete +(latent) states from continuous observations using a variational Gaussian +mixture likelihood. Importantly, our model constantly monitors the collected +data searching for new Gaussian components, i.e., the perception model performs +a form of structure learning (Smith et al., 2020; Friston et al., 2018; Neacsu +et al., 2022) as it learns the number of Gaussian components in the mixture. +Additionally, the transition model learns the temporal transition between +consecutive time steps by taking advantage of the Dirichlet-categorical +conjugacy. Both the perception and transition models are able to forget part of +the data points, while integrating the information they provide within the +prior, which ensures fast variational inference. Finally, decision making is +performed with a variant of Q-learning which is able to learn Q-values from +beliefs over states. Empirically, we have demonstrated the model's ability to +learn the structure of several mazes: the model discovered the number of states +and the transition probabilities between these states. Moreover, using its +learned Q-values, the agent was able to successfully navigate from the starting +position to the maze's exit. + +
+
+
+
+
+ + ☆ Closed-loop multi-step planning with innate physics knowledge + + +
+ We present a hierarchical framework to solve robot planning as an input +control problem. At the lowest level are temporary closed control loops +("tasks"), each representing a behaviour, contingent on a specific sensory +input and therefore temporary. At the highest level, a supervising +"Configurator" directs task creation and termination. Here resides "core" +knowledge as a physics engine, where sequences of tasks can be simulated. The +Configurator encodes and interprets simulation results, based on which it can +choose a sequence of tasks as a plan. We implement this framework on a real +robot and test it in an overtaking scenario as proof-of-concept. + +
+
+
+
+
+ + ☆ Search, Verify and Feedback: Towards Next Generation Post-training + Paradigm of Foundation Models via Verifier Engineering + + +
+ The evolution of machine learning has increasingly prioritized the +development of powerful models and more scalable supervision signals. However, +the emergence of foundation models presents significant challenges in providing +effective supervision signals necessary for further enhancing their +capabilities. Consequently, there is an urgent need to explore novel +supervision signals and technical approaches. In this paper, we propose +verifier engineering, a novel post-training paradigm specifically designed for +the era of foundation models. The core of verifier engineering involves +leveraging a suite of automated verifiers to perform verification tasks and +deliver meaningful feedback to foundation models. We systematically categorize +the verifier engineering process into three essential stages: search, verify, +and feedback, and provide a comprehensive review of state-of-the-art research +developments within each stage. We believe that verifier engineering +constitutes a fundamental pathway toward achieving Artificial General +Intelligence. + +
+
+
+
+
+ + ☆ Alien Recombination: Exploring Concept Blends Beyond Human Cognitive + Availability in Visual Art NeurIPS 2024 + + +
+ While AI models have demonstrated remarkable capabilities in constrained +domains like game strategy, their potential for genuine creativity in +open-ended domains like art remains debated. We explore this question by +examining how AI can transcend human cognitive limitations in visual art +creation. Our research hypothesizes that visual art contains a vast unexplored +space of conceptual combinations, constrained not by inherent incompatibility, +but by cognitive limitations imposed by artists' cultural, temporal, +geographical and social contexts. + To test this hypothesis, we present the Alien Recombination method, a novel +approach utilizing fine-tuned large language models to identify and generate +concept combinations that lie beyond human cognitive availability. The system +models and deliberately counteracts human availability bias, the tendency to +rely on immediately accessible examples, to discover novel artistic +combinations. + This system not only produces combinations that have never been attempted +before within our dataset but also identifies and generates combinations that +are cognitively unavailable to all artists in the domain. Furthermore, we +translate these combinations into visual representations, enabling the +exploration of subjective perceptions of novelty. Our findings suggest that +cognitive unavailability is a promising metric for optimizing artistic novelty, +outperforming merely temperature scaling without additional evaluation +criteria. This approach uses generative models to connect previously +unconnected ideas, providing new insight into the potential of framing +AI-driven creativity as a combinatorial problem. + +
+
+ comment: NeurIPS 2024 Workshop on Creativity & Generative AI, 13 pages, 11 + figures +
+
+
+
+
+ + ☆ HistoEncoder: a digital pathology foundation model for prostate cancer + + +
+ Foundation models are trained on massive amounts of data to distinguish +complex patterns and can be adapted to a wide range of downstream tasks with +minimal computational resources. Here, we develop a foundation model for +prostate cancer digital pathology called HistoEncoder by pre-training on 48 +million prostate tissue tile images. We demonstrate that HistoEncoder features +extracted from tile images with similar histological patterns map closely +together in the feature space. HistoEncoder outperforms models pre-trained with +natural images, even without fine-tuning or with 1000 times less training data. +We describe two use cases that leverage the capabilities of HistoEncoder by +fine-tuning the model with a limited amount of data and computational +resources. First, we show how HistoEncoder can be used to automatically +annotate large-scale datasets with high accuracy. Second, we combine histomics +with commonly used clinical nomograms, significantly improving prostate +cancer-specific death survival models. Foundation models such as HistoEncoder +can allow organizations with limited resources to build effective clinical +software tools without needing extensive datasets or significant amounts of +computing. + +
+
+
+
+
+ + ☆ Robust Markov Decision Processes: A Place Where AI and Formal Methods + Meet + + +
+ Markov decision processes (MDPs) are a standard model for sequential +decision-making problems and are widely used across many scientific areas, +including formal methods and artificial intelligence (AI). MDPs do, however, +come with the restrictive assumption that the transition probabilities need to +be precisely known. Robust MDPs (RMDPs) overcome this assumption by instead +defining the transition probabilities to belong to some uncertainty set. We +present a gentle survey on RMDPs, providing a tutorial covering their +fundamentals. In particular, we discuss RMDP semantics and how to solve them by +extending standard MDP methods such as value iteration and policy iteration. We +also discuss how RMDPs relate to other models and how they are used in several +contexts, including reinforcement learning and abstraction techniques. We +conclude with some challenges for future work on RMDPs. + +
+
+
+
+
+ + ☆ Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting + + +
+ Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown +significant promise in traffic forecasting by effectively modeling temporal and +spatial correlations. However, rapid urbanization in recent years has led to +dynamic shifts in traffic patterns and travel demand, posing major challenges +for accurate long-term traffic prediction. The generalization capability of +ST-GNNs in extended temporal scenarios and cross-city applications remains +largely unexplored. In this study, we evaluate state-of-the-art models on an +extended traffic benchmark and observe substantial performance degradation in +existing ST-GNNs over time, which we attribute to their limited inductive +capabilities. Our analysis reveals that this degradation stems from an +inability to adapt to evolving spatial relationships within urban environments. +To address this limitation, we reconsider the design of adaptive embeddings and +propose a Principal Component Analysis (PCA) embedding approach that enables +models to adapt to new scenarios without retraining. We incorporate PCA +embeddings into existing ST-GNN and Transformer architectures, achieving marked +improvements in performance. Notably, PCA embeddings allow for flexibility in +graph structures between training and testing, enabling models trained on one +city to perform zero-shot predictions on other cities. This adaptability +demonstrates the potential of PCA embeddings in enhancing the robustness and +generalization of spatiotemporal models. + +
+
+
+
+
+ + ☆ Implicit Regularization for Multi-label Feature Selection + + +
+ In this paper, we address the problem of feature selection in the context of +multi-label learning, by using a new estimator based on implicit regularization +and label embedding. Unlike the sparse feature selection methods that use a +penalized estimator with explicit regularization terms such as $l_{2,1}$-norm, +MCP or SCAD, we propose a simple alternative method via Hadamard product +parameterization. In order to guide the feature selection process, a latent +semantic of multi-label information method is adopted, as a label embedding. +Experimental results on some known benchmark datasets suggest that the proposed +estimator suffers much less from extra bias, and may lead to benign +overfitting. + +
+
+ comment: 11 pages, 7 figures, My paper is currently under review at TPAMI + journal +
+
+
+
+
+ + ☆ IKEA Manuals at Work: 4D Grounding of Assembly Instructions on Internet + Videos NeurIPS 2024 + + +
+ Shape assembly is a ubiquitous task in daily life, integral for constructing +complex 3D structures like IKEA furniture. While significant progress has been +made in developing autonomous agents for shape assembly, existing datasets have +not yet tackled the 4D grounding of assembly instructions in videos, essential +for a holistic understanding of assembly in 3D space over time. We introduce +IKEA Video Manuals, a dataset that features 3D models of furniture parts, +instructional manuals, assembly videos from the Internet, and most importantly, +annotations of dense spatio-temporal alignments between these data modalities. +To demonstrate the utility of IKEA Video Manuals, we present five applications +essential for shape assembly: assembly plan generation, part-conditioned +segmentation, part-conditioned pose estimation, video object segmentation, and +furniture assembly based on instructional video manuals. For each application, +we provide evaluation metrics and baseline methods. Through experiments on our +annotated data, we highlight many challenges in grounding assembly instructions +in videos to improve shape assembly, including handling occlusions, varying +viewpoints, and extended assembly sequences. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ The GECo algorithm for Graph Neural Networks Explanation + + +
+ Graph Neural Networks (GNNs) are powerful models that can manage complex data +sources and their interconnection links. One of GNNs' main drawbacks is their +lack of interpretability, which limits their application in sensitive fields. +In this paper, we introduce a new methodology involving graph communities to +address the interpretability of graph classification problems. The proposed +method, called GECo, exploits the idea that if a community is a subset of graph +nodes densely connected, this property should play a role in graph +classification. This is reasonable, especially if we consider the +message-passing mechanism, which is the basic mechanism of GNNs. GECo analyzes +the contribution to the classification result of the communities in the graph, +building a mask that highlights graph-relevant structures. GECo is tested for +Graph Convolutional Networks on six artificial and four real-world graph +datasets and is compared to the main explainability methods such as +PGMExplainer, PGExplainer, GNNExplainer, and SubgraphX using four different +metrics. The obtained results outperform the other methods for artificial graph +datasets and most real-world datasets. + +
+
+
+
+
+ + ☆ Continual Task Learning through Adaptive Policy Self-Composition + + +
+ Training a generalizable agent to continually learn a sequence of tasks from +offline trajectories is a natural requirement for long-lived agents, yet +remains a significant challenge for current offline reinforcement learning (RL) +algorithms. Specifically, an agent must be able to rapidly adapt to new tasks +using newly collected trajectories (plasticity), while retaining knowledge from +previously learned tasks (stability). However, systematic analyses of this +setting are scarce, and it remains unclear whether conventional continual +learning (CL) methods are effective in continual offline RL (CORL) scenarios. +In this study, we develop the Offline Continual World benchmark and demonstrate +that traditional CL methods struggle with catastrophic forgetting, primarily +due to the unique distribution shifts inherent to CORL scenarios. To address +this challenge, we introduce CompoFormer, a structure-based continual +transformer model that adaptively composes previous policies via a meta-policy +network. Upon encountering a new task, CompoFormer leverages semantic +correlations to selectively integrate relevant prior policies alongside newly +trained parameters, thereby enhancing knowledge sharing and accelerating the +learning process. Our experiments reveal that CompoFormer outperforms +conventional CL methods, particularly in longer task sequences, showcasing a +promising balance between plasticity and stability. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ☆ A comprehensive survey of oracle character recognition: challenges, + benchmarks, and beyond + + +
+ Oracle character recognition -- an analysis of ancient Chinese inscriptions +found on oracle bones -- has become a pivotal field intersecting archaeology, +paleography, and historical cultural studies. Traditional methods of oracle +character recognition have relied heavily on manual interpretation by experts, +which is not only labor-intensive but also limits broader accessibility to the +general public. With recent breakthroughs in pattern recognition and deep +learning, there is a growing movement towards the automation of oracle +character recognition (OrCR), showing considerable promise in tackling the +challenges inherent to these ancient scripts. However, a comprehensive +understanding of OrCR still remains elusive. Therefore, this paper presents a +systematic and structured survey of the current landscape of OrCR research. We +commence by identifying and analyzing the key challenges of OrCR. Then, we +provide an overview of the primary benchmark datasets and digital resources +available for OrCR. A review of contemporary research methodologies follows, in +which their respective efficacies, limitations, and applicability to the +complex nature of oracle characters are critically highlighted and examined. +Additionally, our review extends to ancillary tasks associated with OrCR across +diverse disciplines, providing a broad-spectrum analysis of its applications. +We conclude with a forward-looking perspective, proposing potential avenues for +future investigations that could yield significant advancements in the field. + +
+
+
+
+
+ + ☆ Mitigating Knowledge Conflicts in Language Model-Driven Question + Answering + + +
+ Knowledge-aware sequence to sequence generation tasks such as document +question answering and abstract summarization typically require two types of +knowledge: encoded parametric knowledge and retrieved contextual information. +Previous work shows that improper correlation between parametric knowledge and +answers in the training set could cause the model to ignore input information at +test time, resulting in undesirable model behaviour such as over-stability and +hallucination. In this work, we argue that hallucination could be mitigated via +explicit correlation between input source and generated content. We focus on a +typical example of hallucination, entity-based knowledge conflicts in question +answering, where correlation of entities and their description at training time +hinders model behaviour during inference. + +
+
+
+
+
+ + ☆ Syllabus: Portable Curricula for Reinforcement Learning Agents + + +
+ Curriculum learning has been a quiet yet crucial component of many of the +high-profile successes of reinforcement learning. Despite this, none of the +major reinforcement learning libraries directly support curriculum learning or +include curriculum learning implementations. These methods can improve the +capabilities and robustness of RL agents, but often require significant, +complex changes to agent training code. We introduce Syllabus, a library for +training RL agents with curriculum learning, as a solution to this problem. +Syllabus provides a universal API for curriculum learning algorithms, +implementations of popular curriculum learning methods, and infrastructure for +easily integrating them with distributed training code written in nearly any RL +library. Syllabus provides a minimal API for each of the core components of +curriculum learning, dramatically simplifying the process of designing new +algorithms and applying existing algorithms to new environments. We demonstrate +that the same Syllabus code can be used to train agents written in multiple +different RL libraries on numerous domains. In doing so, we present the first +examples of curriculum learning in NetHack and Neural MMO, two of the premier +challenges for single-agent and multi-agent RL respectively, achieving strong +results compared to state of the art baselines. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Study of the Performance of CEEMDAN in Underdetermined Speech Separation + + +
+ The CEEMDAN algorithm is one of the modern methods used in the analysis of +non-stationary signals. This research presents a study of the effectiveness of +this method in audio source separation to know the limits of its work. It +concluded two conditions related to frequencies and amplitudes of mixed signals +to be separated by CEEMDAN. The performance of the algorithm in separating +noise from speech and separating speech signals from each other is studied. The +research reached a conclusion that CEEMDAN can remove some types of noise from +speech (speech improvement), and it cannot separate speech signals from each +other (cocktail party). Simulation is done using Matlab environment and Noizeus +database. + +
+
+ comment: in Arabic language +
+
+
+
+
+ + ☆ TP-UNet: Temporal Prompt Guided UNet for Medical Image Segmentation + + +
+ The advancement of medical image segmentation techniques has been propelled +by the adoption of deep learning techniques, particularly UNet-based +approaches, which exploit semantic information to improve the accuracy of +segmentations. However, the order of organs in scanned images has been +disregarded by current medical image segmentation approaches based on UNet. +Furthermore, the inherent network structure of UNet does not provide direct +capabilities for integrating temporal information. To efficiently integrate +temporal information, we propose TP-UNet that utilizes temporal prompts, +encompassing organ-construction relationships, to guide the segmentation UNet +model. Specifically, our framework is featured with cross-attention and +semantic alignment based on unsupervised contrastive learning to combine +temporal prompts and image features effectively. Extensive evaluations on two +medical image segmentation datasets demonstrate the state-of-the-art +performance of TP-UNet. Our implementation will be open-sourced after +acceptance. + +
+
+
+
+
+ + ☆ Recurrent Stochastic Configuration Networks with Incremental Blocks + + +
+ Recurrent stochastic configuration networks (RSCNs) have shown promise in +modelling nonlinear dynamic systems with order uncertainty due to their +advantages of easy implementation, less human intervention, and strong +approximation capability. This paper develops the original RSCNs with block +increments, termed block RSCNs (BRSCNs), to further enhance the learning +capacity and efficiency of the network. BRSCNs can simultaneously add multiple +reservoir nodes (subreservoirs) during the construction. Each subreservoir is +configured with a unique structure in the light of a supervisory mechanism, +ensuring the universal approximation property. The reservoir feedback matrix is +appropriately scaled to guarantee the echo state property of the network. +Furthermore, the output weights are updated online using a projection +algorithm, and the persistent excitation conditions that facilitate parameter +convergence are also established. Numerical results over a time series +prediction, a nonlinear system identification task, and two industrial data +predictive analyses demonstrate that the proposed BRSCN performs favourably in +terms of modelling efficiency, learning, and generalization performance, +highlighting their significant potential for coping with complex dynamics. + +
+
+
+
+
+ + ☆ Towards Personalized Brain-Computer Interface Application Based on + Endogenous EEG Paradigms + + +
+ In this paper, we propose a conceptual framework for personalized +brain-computer interface (BCI) applications, which can offer an enhanced user +experience by customizing services to individual preferences and needs, based +on endogenous electroencephalography (EEG) paradigms including motor imagery +(MI), speech imagery (SI), and visual imagery. The framework includes two +essential components: user identification and intention classification, which +enable personalized services by identifying individual users and recognizing +their intended actions through EEG signals. We validate the feasibility of our +framework using a private EEG dataset collected from eight subjects, employing +the ShallowConvNet architecture to decode EEG features. The experimental +results demonstrate that user identification achieved an average classification +accuracy of 0.995, while intention classification achieved 0.47 accuracy across +all paradigms, with MI demonstrating the best performance. These findings +indicate that EEG signals can effectively support personalized BCI +applications, offering robust identification and reliable intention decoding, +especially for MI and SI. + +
+
+ comment: Submission version for IEEE International BCI Winter Conference 2025 +
+
+
+
+
+ + ☆ Transcending Language Boundaries: Harnessing LLMs for Low-Resource + Language Translation + + +
+ Large Language Models (LLMs) have demonstrated remarkable success across a +wide range of tasks and domains. However, their performance in low-resource +language translation, particularly when translating into these languages, +remains underexplored. This gap poses significant challenges, as linguistic +barriers hinder the cultural preservation and development of minority +communities. To address this issue, this paper introduces a novel +retrieval-based method that enhances translation quality for low-resource +languages by focusing on key terms, which involves translating keywords and +retrieving corresponding examples from existing data. To evaluate the +effectiveness of this method, we conducted experiments translating from English +into three low-resource languages: Cherokee, a critically endangered indigenous +language of North America; Tibetan, a historically and culturally significant +language in Asia; and Manchu, a language with few remaining speakers. Our +comparison with the zero-shot performance of GPT-4o and LLaMA 3.1 405B, +highlights the significant challenges these models face when translating into +low-resource languages. In contrast, our retrieval-based method shows promise +in improving both word-level accuracy and overall semantic understanding by +leveraging existing resources more effectively. + +
+
+
+
+
+ + ☆ LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large + Language Models + + +
+ Creating high-quality, large-scale datasets for large language models (LLMs) +often relies on resource-intensive, GPU-accelerated models for quality +filtering, making the process time-consuming and costly. This dependence on +GPUs limits accessibility for organizations lacking significant computational +infrastructure. To address this issue, we introduce the Lightweight, +Purpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs +to streamline the processes of dataset extraction, filtering, and curation. +Based on our four core principles, the LP Data Pipeline significantly reduces +preparation time and cost while maintaining high data quality. Importantly, our +pipeline enables the creation of purpose-driven datasets tailored to specific +domains and languages, enhancing the applicability of LLMs in specialized +contexts. We anticipate that our pipeline will lower the barriers to LLM +development, enabling a wide range of organizations to access LLMs more easily. + +
+
+
+
+
+ + ☆ Zero-Shot Automatic Annotation and Instance Segmentation using + LLM-Generated Datasets: Eliminating Field Imaging and Manual Annotation for + Deep Learning Model Development + + +
+ Currently, deep learning-based instance segmentation for various applications +(e.g., Agriculture) is predominantly performed using a labor-intensive process +involving extensive field data collection using sophisticated sensors, followed +by careful manual annotation of images, presenting significant logistical and +financial challenges to researchers and organizations. The process also slows +down the model development and training process. In this study, we presented a +novel method for deep learning-based instance segmentation of apples in +commercial orchards that eliminates the need for labor-intensive field data +collection and manual annotation. Utilizing a Large Language Model (LLM), we +synthetically generated orchard images and automatically annotated them using +the Segment Anything Model (SAM) integrated with a YOLO11 base model. This +method significantly reduces reliance on physical sensors and manual data +processing, presenting a major advancement in "Agricultural AI". The synthetic, +auto-annotated dataset was used to train the YOLO11 model for Apple instance +segmentation, which was then validated on real orchard images. The results +showed that the automatically generated annotations achieved a Dice Coefficient +of 0.9513 and an IoU of 0.9303, validating the accuracy and overlap of the mask +annotations. All YOLO11 configurations, trained solely on these synthetic +datasets with automated annotations, accurately recognized and delineated +apples, highlighting the method's efficacy. Specifically, the YOLO11m-seg +configuration achieved a mask precision of 0.902 and a mask mAP@50 of 0.833 on +test images collected from a commercial orchard. Additionally, the YOLO11l-seg +configuration outperformed other models in validation on 40 LLM-generated +images, achieving the highest mask precision and mAP@50 metrics. + Keywords: YOLO, SAM, SAMv2, YOLO11, YOLOv11, Segment Anything, YOLO-SAM + +
+
+
+
+
+ + ☆ Multi-Hyperbolic Space-based Heterogeneous Graph Attention Network + + +
+ To leverage the complex structures within heterogeneous graphs, recent +studies on heterogeneous graph embedding use a hyperbolic space, characterized +by a constant negative curvature and exponentially increasing space, which +aligns with the structural properties of heterogeneous graphs. However, despite +heterogeneous graphs inherently possessing diverse power-law structures, most +hyperbolic heterogeneous graph embedding models use a single hyperbolic space +for the entire heterogeneous graph, which may not effectively capture the +diverse power-law structures within the heterogeneous graph. To address this +limitation, we propose Multi-hyperbolic Space-based heterogeneous Graph +Attention Network (MSGAT), which uses multiple hyperbolic spaces to effectively +capture diverse power-law structures within heterogeneous graphs. We conduct +comprehensive experiments to evaluate the effectiveness of MSGAT. The +experimental results demonstrate that MSGAT outperforms state-of-the-art +baselines in various graph machine learning tasks, effectively capturing the +complex structures of heterogeneous graphs. + +
+
+ comment: Accepted in IEEE ICDM 2024 +
+
+
+
+
+ + ☆ Continuous K-space Recovery Network with Image Guidance for Fast MRI + Reconstruction + + +
+ Magnetic resonance imaging (MRI) is a crucial tool for clinical diagnosis +while facing the challenge of long scanning time. To reduce the acquisition +time, fast MRI reconstruction aims to restore high-quality images from the +undersampled k-space. Existing methods typically train deep learning models to +map the undersampled data to artifact-free MRI images. However, these studies +often overlook the unique properties of k-space and directly apply general +networks designed for image processing to k-space recovery, leaving the precise +learning of k-space largely underexplored. In this work, we propose a +continuous k-space recovery network from a new perspective of implicit neural +representation with image domain guidance, which boosts the performance of MRI +reconstruction. Specifically, (1) an implicit neural representation based +encoder-decoder structure is customized to continuously query unsampled +k-values. (2) an image guidance module is designed to mine the semantic +information from the low-quality MRI images to further guide the k-space +recovery. (3) a multi-stage training strategy is proposed to recover dense +k-space progressively. Extensive experiments conducted on CC359, fastMRI, and +IXI datasets demonstrate the effectiveness of our method and its superiority +over other competitors. + +
+
+
+
+
+ + ☆ Cross-Patient Pseudo Bags Generation and Curriculum Contrastive Learning + for Imbalanced Multiclassification of Whole Slide Image + + +
+ Pathology computing has dramatically improved pathologists' workflow and +diagnostic decision-making processes. Although computer-aided diagnostic +systems have shown considerable value in whole slide image (WSI) analysis, the +problem of multi-classification under sample imbalance remains an intractable +challenge. To address this, we propose learning fine-grained information by +generating sub-bags with feature distributions similar to the original WSIs. +Additionally, we utilize a pseudo-bag generation algorithm to further leverage +the abundant and redundant information in WSIs, allowing efficient training in +unbalanced-sample multi-classification tasks. Furthermore, we introduce an +affinity-based sample selection and curriculum contrastive learning strategy to +enhance the stability of model representation learning. Unlike previous +approaches, our framework transitions from learning bag-level representations +to understanding and exploiting the feature distribution of multi-instance +bags. Our method demonstrates significant performance improvements on three +datasets, including tumor classification and lymph node metastasis. On average, +it achieves a 4.39-point improvement in F1 score compared to the second-best +method across the three tasks, underscoring its superior performance. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ EXCON: Extreme Instance-based Contrastive Representation Learning of + Severely Imbalanced Multivariate Time Series for Solar Flare Prediction + + +
+ In heliophysics research, predicting solar flares is crucial due to their +potential to impact both space-based systems and Earth's infrastructure +substantially. Magnetic field data from solar active regions, recorded by solar +imaging observatories, are transformed into multivariate time series to enable +solar flare prediction using temporal window-based analysis. In the realm of +multivariate time series-driven solar flare prediction, addressing severe class +imbalance with effective strategies for multivariate time series representation +learning is key to developing robust predictive models. Traditional methods +often struggle with overfitting to the majority class in prediction tasks where +major solar flares are infrequent. This work presents EXCON, a contrastive +representation learning framework designed to enhance classification +performance amidst such imbalances. EXCON operates through four stages: +obtaining core features from multivariate time series data; selecting +distinctive contrastive representations for each class to maximize inter-class +separation; training a temporal feature embedding module with a custom extreme +reconstruction loss to minimize intra-class variation; and applying a +classifier to the learned embeddings for robust classification. The proposed +method leverages contrastive learning principles to map similar instances +closer in the feature space while distancing dissimilar ones, a strategy not +extensively explored in solar flare prediction tasks. This approach not only +addresses class imbalance but also offers a versatile solution applicable to +univariate and multivariate time series across binary and multiclass +classification problems. Experimental results, including evaluations on the +benchmark solar flare dataset and multiple time series archive datasets with +binary and multiclass labels, demonstrate EXCON's efficacy in enhancing +classification performance. + +
+
+ comment: This work has been accepted at the 2024 IEEE International Conference + on Big Data (IEEE BigData 2024) on October 27, 2024, as a main conference + paper +
+
+
+
+
+ + ☆ ZeFaV: Boosting Large Language Models for Zero-shot Fact Verification PRICAI 2024 + + +
+ In this paper, we propose ZeFaV - a zero-shot based fact-checking +verification framework to enhance the performance on fact verification task of +large language models by leveraging the in-context learning ability of large +language models to extract the relations among the entities within a claim, +re-organize the information from the evidence in a relationally logical form, +and combine the above information with the original evidence to generate the +context from which our fact-checking model provides verdicts for the input +claims. We conducted empirical experiments to evaluate our approach on two +multi-hop fact-checking datasets including HoVer and FEVEROUS, and achieved +potential results comparable to other state-of-the-art fact +verification task methods. + +
+
+ comment: This pre-print has been published in PRICAI 2024: Trends in + Artificial Intelligence. The published version is available at + https://doi.org/10.1007/978-981-96-0119-6_28 +
+
+
+
+
+ + ☆ MEMO-Bench: A Multiple Benchmark for Text-to-Image and Multimodal Large + Language Models on Human Emotion Analysis + + +
+ Artificial Intelligence (AI) has demonstrated significant capabilities in +various fields, and in areas such as human-computer interaction (HCI), embodied +intelligence, and the design and animation of virtual digital humans, both +practitioners and users are increasingly concerned with AI's ability to +understand and express emotion. Consequently, the question of whether AI can +accurately interpret human emotions remains a critical challenge. To date, two +primary classes of AI models have been involved in human emotion analysis: +generative models and Multimodal Large Language Models (MLLMs). To assess the +emotional capabilities of these two classes of models, this study introduces +MEMO-Bench, a comprehensive benchmark consisting of 7,145 portraits, each +depicting one of six different emotions, generated by 12 Text-to-Image (T2I) +models. Unlike previous works, MEMO-Bench provides a framework for evaluating +both T2I models and MLLMs in the context of sentiment analysis. Additionally, a +progressive evaluation approach is employed, moving from coarse-grained to +fine-grained metrics, to offer a more detailed and comprehensive assessment of +the sentiment analysis capabilities of MLLMs. The experimental results +demonstrate that existing T2I models are more effective at generating positive +emotions than negative ones. Meanwhile, although MLLMs show a certain degree of +effectiveness in distinguishing and recognizing human emotions, they fall short +of human-level accuracy, particularly in fine-grained emotion analysis. The +MEMO-Bench will be made publicly available to support further research in this +area. + +
+
+
+
+
+ + ☆ MoE-Lightning: High-Throughput MoE Inference on Memory-constrained GPUs + + +
+ Efficient deployment of large language models, particularly Mixture of +Experts (MoE), on resource-constrained platforms presents significant +challenges, especially in terms of computational efficiency and memory +utilization. The MoE architecture, renowned for its ability to increase model +capacity without a proportional increase in inference cost, greatly reduces the +token generation latency compared with dense models. However, the large model +size makes MoE models inaccessible to individuals without high-end GPUs. In +this paper, we propose a high-throughput MoE batch inference system, that +significantly outperforms past work. MoE-Lightning introduces a novel +CPU-GPU-I/O pipelining schedule, CGOPipe, with paged weights to achieve high +resource utilization, and a performance model, HRM, based on a Hierarchical +Roofline Model we introduce to help find policies with higher throughput than +existing systems. MoE-Lightning can achieve up to 10.3x higher throughput than +state-of-the-art offloading-enabled LLM inference systems for Mixtral 8x7B on a +single T4 GPU (16GB). When the theoretical system throughput is bounded by the +GPU memory, MoE-Lightning can reach the throughput upper bound with 2-3x less +CPU memory, significantly increasing resource utilization. MoE-Lightning also +supports efficient batch inference for much larger MoEs (e.g., Mixtral 8x22B +and DBRX) on multiple low-cost GPUs (e.g., 2-4 T4). + +
+
+
+
+
+ + ☆ Making Sigmoid-MSE Great Again: Output Reset Challenges Softmax + Cross-Entropy in Neural Network Classification + + +
+ This study presents a comparative analysis of two objective functions, Mean +Squared Error (MSE) and Softmax Cross-Entropy (SCE) for neural network +classification tasks. While SCE combined with softmax activation is the +conventional choice for transforming network outputs into class probabilities, +we explore an alternative approach using MSE with sigmoid activation. We +introduce the Output Reset algorithm, which reduces inconsistent errors and +enhances classifier robustness. Through extensive experiments on benchmark +datasets (MNIST, CIFAR-10, and Fashion-MNIST), we demonstrate that MSE with +sigmoid activation achieves comparable accuracy and convergence rates to SCE, +while exhibiting superior performance in scenarios with noisy data. Our +findings indicate that MSE, despite its traditional association with regression +tasks, serves as a viable alternative for classification problems, challenging +conventional wisdom about neural network training strategies. + +
+
+
+
+
+ + ☆ The Role of Accuracy and Validation Effectiveness in Conversational + Business Analytics + + +
+ This study examines conversational business analytics, an approach that +utilizes AI to address the technical competency gaps that hindered end users +from effectively using traditional self-service analytics. By facilitating +natural language interactions, conversational business analytics aims to enable +end users to independently retrieve data and generate insights. The analysis +focuses on Text-to-SQL as a representative technology for translating natural +language requests into SQL statements. Using models grounded in expected +utility theory, the study identifies conditions under which conversational +business analytics, through partial or full support, can outperform delegation +to human experts. The results indicate that partial support, which focuses +solely on information generation by AI, is viable when the accuracy of +AI-generated SQL queries exceeds a defined threshold. In contrast, full support +includes not only information generation but also validation through +explanations provided by the AI, and requires sufficiently high validation +effectiveness to be reliable. However, user-based validation presents +challenges, such as misjudgment and rejection of valid SQL queries, which may +limit the effectiveness of conversational business analytics. These challenges +underscore the need for robust validation mechanisms, including improved user +support, automated processes, and methods for assessing quality independently +of end users' technical competencies. + +
+
+
+
+
+ + ☆ Distill the Best, Ignore the Rest: Improving Dataset Distillation with + Loss-Value-Based Pruning + + +
+ Dataset distillation has gained significant interest in recent years, yet +existing approaches typically distill from the entire dataset, potentially +including non-beneficial samples. We introduce a novel "Prune First, Distill +After" framework that systematically prunes datasets via loss-based sampling +prior to distillation. By leveraging pruning before classical distillation +techniques and generative priors, we create a representative core-set that +leads to enhanced generalization for unseen architectures - a significant +challenge of current distillation methods. More specifically, our proposed +framework significantly boosts distilled quality, achieving up to a 5.2 +percentage points accuracy increase even with substantial dataset pruning, +i.e., removing 80% of the original dataset prior to distillation. Overall, our +experimental results highlight the advantages of our easy-sample prioritization +and cross-architecture robustness, paving the way for more effective and +high-quality dataset distillation. + +
+
+
+
+
+ + ☆ Just Leaf It: Accelerating Diffusion Classifiers with Hierarchical Class + Pruning + + +
+ Diffusion models, known for their generative capabilities, have recently +shown unexpected potential in image classification tasks by using Bayes' +theorem. However, most diffusion classifiers require evaluating all class +labels for a single classification, leading to significant computational costs +that can hinder their application in large-scale scenarios. To address this, we +present a Hierarchical Diffusion Classifier (HDC) that exploits the inherent +hierarchical label structure of a dataset. By progressively pruning irrelevant +high-level categories and refining predictions only within relevant +subcategories, i.e., leaf nodes, HDC reduces the total number of class +evaluations. As a result, HDC can accelerate inference by up to 60% while +maintaining and, in some cases, improving classification accuracy. Our work +enables a new control mechanism of the trade-off between speed and precision, +making diffusion-based classification more viable for real-world applications, +particularly in large-scale image classification tasks. + +
+
+
+
+
+ + ☆ Zoomed In, Diffused Out: Towards Local Degradation-Aware Multi-Diffusion + for Extreme Image Super-Resolution + + +
+ Large-scale, pre-trained Text-to-Image (T2I) diffusion models have gained +significant popularity in image generation tasks and have shown unexpected +potential in image Super-Resolution (SR). However, most existing T2I diffusion +models are trained with a resolution limit of 512x512, making scaling beyond +this resolution an unresolved but necessary challenge for image SR. In this +work, we introduce a novel approach that, for the first time, enables these +models to generate 2K, 4K, and even 8K images without any additional training. +Our method leverages MultiDiffusion, which distributes the generation across +multiple diffusion paths to ensure global coherence at larger scales, and local +degradation-aware prompt extraction, which guides the T2I model to reconstruct +fine local structures according to its low-resolution input. These innovations +unlock higher resolutions, allowing T2I diffusion models to be applied to image +SR tasks without limitation on resolution. + +
+
+
+
+
+ + ☆ TSPRank: Bridging Pairwise and Listwise Methods with a Bilinear + Travelling Salesman Model + + +
+ Traditional Learning-To-Rank (LETOR) approaches, including pairwise methods +like RankNet and LambdaMART, often fall short by solely focusing on pairwise +comparisons, leading to sub-optimal global rankings. Conversely, deep learning +based listwise methods, while aiming to optimise entire lists, require complex +tuning and yield only marginal improvements over robust pairwise models. To +overcome these limitations, we introduce Travelling Salesman Problem Rank +(TSPRank), a hybrid pairwise-listwise ranking method. TSPRank reframes the +ranking problem as a Travelling Salesman Problem (TSP), a well-known +combinatorial optimisation challenge that has been extensively studied for its +numerous solution algorithms and applications. This approach enables the +modelling of pairwise relationships and leverages combinatorial optimisation to +determine the listwise ranking. This approach can be directly integrated as an +additional component into embeddings generated by existing backbone models to +enhance ranking performance. Our extensive experiments across three backbone +models on diverse tasks, including stock ranking, information retrieval, and +historical events ordering, demonstrate that TSPRank significantly outperforms +both pure pairwise and listwise methods. Our qualitative analysis reveals that +TSPRank's main advantage over existing methods is its ability to harness global +information better while ranking. TSPRank's robustness and superior performance +across different domains highlight its potential as a versatile and effective +LETOR solution. The code and preprocessed data are available at +https://github.com/waylonli/TSPRank-KDD2025. + +
+
+ comment: Accepted to ACM SIGKDD 2025 Research Track +
+
+
+
+
+ + ☆ Benchmarking pre-trained text embedding models in aligning built asset + information + + +
+ Accurate mapping of the built asset information to established data +classification systems and taxonomies is crucial for effective asset +management, whether for compliance at project handover or ad-hoc data +integration scenarios. Due to the complex nature of built asset data, which +predominantly comprises technical text elements, this process remains largely +manual and reliant on domain expert input. Recent breakthroughs in contextual +text representation learning (text embedding), particularly through pre-trained +large language models, offer promising approaches that can facilitate the +automation of cross-mapping of the built asset data. However, no comprehensive +evaluation has yet been conducted to assess these models' ability to +effectively represent the complex semantics specific to built asset technical +terminology. This study presents a comparative benchmark of state-of-the-art +text embedding models to evaluate their effectiveness in aligning built asset +information with domain-specific technical concepts. Our proposed datasets are +derived from two renowned built asset data classification dictionaries. The +results of our benchmarking across six proposed datasets, covering three tasks +of clustering, retrieval, and reranking, highlight the need for future research +on domain adaptation techniques. The benchmarking resources are published as an +open-source library, which will be maintained and extended to support future +evaluations in this field. + +
+
+
+
+
+ + ☆ Fingerprinting and Tracing Shadows: The Development and Impact of + Browser Fingerprinting on Digital Privacy + + +
+ Browser fingerprinting is a growing technique for identifying and tracking +users online without traditional methods like cookies. This paper gives an +overview by examining the various fingerprinting techniques and analyzes the +entropy and uniqueness of the collected data. The analysis highlights that +browser fingerprinting poses a complex challenge from both technical and +privacy perspectives, as users often have no control over the collection and +use of their data. In addition, it raises significant privacy concerns as users +are often tracked without their knowledge or consent. + +
+
+ comment: SECURWARE 2024, France, Nice +
+
+
+
+
+ + ☆ Fast Convergence of Softmax Policy Mirror Ascent + + +
+ Natural policy gradient (NPG) is a common policy optimization algorithm and +can be viewed as mirror ascent in the space of probabilities. Recently, Vaswani +et al. [2021] introduced a policy gradient method that corresponds to mirror +ascent in the dual space of logits. We refine this algorithm, removing its need +for a normalization across actions and analyze the resulting method (referred +to as SPMA). For tabular MDPs, we prove that SPMA with a constant step-size +matches the linear convergence of NPG and achieves a faster convergence than +constant step-size (accelerated) softmax policy gradient. To handle large +state-action spaces, we extend SPMA to use a log-linear policy +parameterization. Unlike that for NPG, generalizing SPMA to the linear function +approximation (FA) setting does not require compatible function approximation. +Unlike MDPO, a practical generalization of NPG, SPMA with linear FA only +requires solving convex softmax classification problems. We prove that SPMA +achieves linear convergence to the neighbourhood of the optimal value function. +We extend SPMA to handle non-linear FA and evaluate its empirical performance +on the MuJoCo and Atari benchmarks. Our results demonstrate that SPMA +consistently achieves similar or better performance compared to MDPO, PPO and +TRPO. + +
+
+
+
+
+ + ☆ Scaling Deep Learning Research with Kubernetes on the NRP Nautilus + HyperCluster + + +
+ Throughout the scientific computing space, deep learning algorithms have +shown excellent performance in a wide range of applications. As these deep +neural networks (DNNs) continue to mature, the necessary compute required to +train them has continued to grow. Today, modern DNNs require millions of FLOPs +and days to weeks of training to generate a well-trained model. The training +times required for DNNs are oftentimes a bottleneck in DNN research for a +variety of deep learning applications, and as such, accelerating and scaling +DNN training enables more robust and accelerated research. To that end, in this +work, we explore utilizing the NRP Nautilus HyperCluster to automate and scale +deep learning model training for three separate applications of DNNs, including +overhead object detection, burned area segmentation, and deforestation +detection. In total, 234 deep neural models are trained on Nautilus, for a +total time of 4,040 hours. + +
+
+
+
+
+ + ☆ Regret-Free Reinforcement Learning for LTL Specifications + + +
+ Reinforcement learning (RL) is a promising method to learn optimal control +policies for systems with unknown dynamics. In particular, synthesizing +controllers for safety-critical systems based on high-level specifications, +such as those expressed in temporal languages like linear temporal logic (LTL), +presents a significant challenge in control systems research. Current RL-based +methods designed for LTL tasks typically offer only asymptotic guarantees, +which provide no insight into the transient performance during the learning +phase. While running an RL algorithm, it is crucial to assess how close we are +to achieving optimal behavior if we stop learning. + In this paper, we present the first regret-free online algorithm for learning +a controller that addresses the general class of LTL specifications over Markov +decision processes (MDPs) with a finite set of states and actions. We begin by +proposing a regret-free learning algorithm to solve infinite-horizon +reach-avoid problems. For general LTL specifications, we show that the +synthesis problem can be reduced to a reach-avoid problem when the graph +structure is known. Additionally, we provide an algorithm for learning the +graph structure, assuming knowledge of a minimum transition probability, which +operates independently of the main regret-free algorithm. + +
+
+
+
+
+ + ☆ ByteScience: Bridging Unstructured Scientific Literature and Structured + Data with Auto Fine-tuned Large Language Model in Token Granularity + + +
+ Natural Language Processing (NLP) is widely used to supply summarization +ability from long context to structured information. However, extracting +structured knowledge from scientific text by NLP models remains a challenge +because of its domain-specific nature to complex data preprocessing and the +granularity of multi-layered device-level information. To address this, we +introduce ByteScience, a non-profit cloud-based auto fine-tuned Large Language +Model (LLM) platform, which is designed to extract structured scientific data +and synthesize new scientific knowledge from vast scientific corpora. The +platform capitalizes on DARWIN, an open-source, fine-tuned LLM dedicated to +natural science. The platform was built on Amazon Web Services (AWS) and +provides an automated, user-friendly workflow for custom model development and +data extraction. The platform achieves remarkable accuracy with only a small +amount of well-annotated articles. This innovative tool streamlines the +transition from the science literature to structured knowledge and data and +benefits the advancements in natural informatics. + +
+
+
+
+
+ + ☆ Understanding Chain-of-Thought in LLMs through Information Theory + + +
+ Large Language Models (LLMs) have shown impressive performance in complex +reasoning tasks through Chain-of-Thought (CoT) reasoning, allowing models to +break down problems into manageable sub-tasks. However, existing CoT evaluation +techniques either require annotated CoT data or fall short in accurately +assessing intermediate reasoning steps, leading to high rates of false +positives. In this paper, we formalize CoT reasoning in LLMs through an +information-theoretic lens. Specifically, our framework quantifies the +`information gain' at each reasoning step, enabling the identification of +failure modes in LLMs without the need for expensive annotated datasets. We +demonstrate the efficacy of our approach through extensive experiments on toy +and GSM-8K data, where it significantly outperforms existing outcome-based +methods by providing more accurate insights into model performance on +individual tasks. + +
+
+
+
+
+ + ☆ Medical Video Generation for Disease Progression Simulation + + +
+ Modeling disease progression is crucial for improving the quality and +efficacy of clinical diagnosis and prognosis, but it is often hindered by a +lack of longitudinal medical image monitoring for individual patients. To +address this challenge, we propose the first Medical Video Generation (MVG) +framework that enables controlled manipulation of disease-related image and +video features, allowing precise, realistic, and personalized simulations of +disease progression. Our approach begins by leveraging large language models +(LLMs) to recaption prompt for disease trajectory. Next, a controllable +multi-round diffusion model simulates the disease progression state for each +patient, creating a realistic intermediate disease state sequence. Finally, a +diffusion-based video transition generation model interpolates disease +progression between these states. We validate our framework across three +medical imaging domains: chest X-ray, fundus photography, and skin image. Our +results demonstrate that MVG significantly outperforms baseline models in +generating coherent and clinically plausible disease trajectories. Two user +studies by veteran physicians provide further validation and insights into the +clinical utility of the generated sequences. MVG has the potential to assist +healthcare providers in modeling disease trajectories, interpolating missing +medical image data, and enhancing medical education through realistic, dynamic +visualizations of disease progression. + +
+
+ comment: Tech Report. The appendix will be released soon. arXiv admin note: text + overlap with arXiv:2309.11745 +
+
+
+
+
+ + ☆ Variable Rate Neural Compression for Sparse Detector Data + + +
+ High-energy large-scale particle colliders generate data at extraordinary +rates. Developing real-time high-throughput data compression algorithms to +reduce data volume and meet the bandwidth requirement for storage has become +increasingly critical. Deep learning is a promising technology that can address +this challenging topic. At the newly constructed sPHENIX experiment at the +Relativistic Heavy Ion Collider, a Time Projection Chamber (TPC) serves as the +main tracking detector, which records three-dimensional particle trajectories +in a volume of a gas-filled cylinder. In terms of occupancy, the resulting data +flow can be very sparse reaching $10^{-3}$ for proton-proton collisions. Such +sparsity presents a challenge to conventional learning-free lossy compression +algorithms, such as SZ, ZFP, and MGARD. In contrast, emerging deep +learning-based models, particularly those utilizing convolutional neural +networks for compression, have outperformed these conventional methods in terms +of compression ratios and reconstruction accuracy. However, research on the +efficacy of these deep learning models in handling sparse datasets, like those +produced in particle colliders, remains limited. Furthermore, most deep +learning models do not adapt their processing speeds to data sparsity, which +affects efficiency. To address this issue, we propose a novel approach for TPC +data compression via key-point identification facilitated by sparse +convolution. Our proposed algorithm, BCAE-VS, achieves a $75\%$ improvement in +reconstruction accuracy with a $10\%$ increase in compression ratio over the +previous state-of-the-art model. Additionally, BCAE-VS manages to achieve these +results with a model size over two orders of magnitude smaller. Lastly, we have +experimentally verified that as sparsity increases, so does the model's +throughput. + +
+
+ comment: 37 pages, 12 figures, submitted to Journal of Computational Physics +
+
+
+
+
+ + ♻ ☆ A Perspective for Adapting Generalist AI to Specialized Medical AI + Applications and Their Challenges + + +
+ The integration of Large Language Models (LLMs) into medical applications has +sparked widespread interest across the healthcare industry, from drug discovery +and development to clinical decision support, assisting telemedicine, medical +devices, and healthcare insurance applications. This perspective paper aims to +discuss the inner workings of building LLM-powered medical AI applications and +introduces a comprehensive framework for their development. We review existing +literature and outline the unique challenges of applying LLMs in specialized +medical contexts. Additionally, we introduce a three-step framework to organize +medical LLM research activities: 1) Modeling: breaking down complex medical +workflows into manageable steps for developing medical-specific models; 2) +Optimization: optimizing the model performance with crafted prompts and +integrating external knowledge and tools, and 3) System engineering: +decomposing complex tasks into subtasks and leveraging human expertise for +building medical AI applications. Furthermore, we offer a detailed use case +playbook that describes various LLM-powered medical AI applications, such as +optimizing clinical trial design, enhancing clinical decision support, and +advancing medical imaging analysis. Finally, we discuss various challenges and +considerations for building medical AI applications with LLMs, such as handling +hallucination issues, data ownership and compliance, privacy, intellectual +property considerations, compute cost, sustainability issues, and responsible +AI requirements. + +
+
+
+
+
+ + ♻ ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies have deployed watermark-based detection to identify +AI-generated content. However, attribution--the ability to trace back to the +user of a generative AI (GenAI) service who created a given piece of +AI-generated content--remains largely unexplored despite its growing +importance. In this work, we aim to bridge this gap by conducting the first +systematic study on watermark-based, user-level attribution of AI-generated +content. Our key idea is to assign a unique watermark to each user of the GenAI +service and embed this watermark into the AI-generated content created by that +user. Attribution is then performed by identifying the user whose watermark +best matches the one extracted from the given content. This approach, however, +faces a key challenge: How should watermarks be selected for users to maximize +attribution performance? To address the challenge, we first theoretically +derive lower bounds on detection and attribution performance through rigorous +probabilistic analysis for any given set of user watermarks. Then, we select +watermarks for users to maximize these lower bounds, thereby optimizing +detection and attribution performance. Our theoretical and empirical results +show that watermark-based attribution inherits both the accuracy and +(non-)robustness properties of the underlying watermark. Specifically, +attribution remains highly accurate when the watermarked AI-generated content +is either not post-processed or subjected to common post-processing such as +JPEG compression, as well as black-box adversarial post-processing with limited +query budgets. + +
+
+
+
+
+ + ♻ ☆ A Multimodal Adaptive Graph-based Intelligent Classification Model for + Fake News + + +
+ Numerous studies have been proposed to detect fake news focusing on +multi-modalities based on machine and/or deep learning. However, studies +focusing on graph-based structures using geometric deep learning are lacking. +To address this challenge, we introduce the Multimodal Adaptive Graph-based +Intelligent Classification (aptly referred to as MAGIC) for fake news +detection. Specifically, the Encoder Representations from Transformers was used +for text vectorization whilst ResNet50 was used for images. A comprehensive +information interaction graph was built using the adaptive Graph Attention +Network before classifying the multimodal input through the Softmax function. +MAGIC was trained and tested on two fake news datasets, that is, Fakeddit +(English) and Multimodal Fake News Detection (Chinese), with the model +achieving an accuracy of 98.8\% and 86.3\%, respectively. Ablation experiments +also revealed MAGIC to yield superior performance across both the datasets. +Findings show that a graph-based deep learning adaptive model is effective in +detecting multimodal fake news, surpassing state-of-the-art methods. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ CRoP: Context-wise Robust Static Human-Sensing Personalization + + +
+ The advancement in deep learning and internet-of-things have led to diverse +human sensing applications. However, distinct patterns in human sensing, +influenced by various factors or contexts, challenge the generic neural network +model's performance due to natural distribution shifts. To address this, +personalization tailors models to individual users. Yet most personalization +studies overlook intra-user heterogeneity across contexts in sensory data, +limiting intra-user generalizability. This limitation is especially critical in +clinical applications, where limited data availability hampers both +generalizability and personalization. Notably, intra-user sensing attributes +are expected to change due to external factors such as treatment progression, +further complicating the challenges. To address the intra-user generalization +challenge, this work introduces CRoP, a novel static personalization approach. +CRoP leverages off-the-shelf pre-trained models as generic starting points and +captures user-specific traits through adaptive pruning on a minimal sub-network +while preserving generic knowledge in the remaining parameters. CRoP +demonstrates superior personalization effectiveness and intra-user robustness +across four human-sensing datasets, including two from real-world health +domains, underscoring its practical and social impact. Additionally, to support +CRoP's generalization ability and design choices, we provide empirical +justification through gradient inner product analysis, ablation studies, and +comparisons against state-of-the-art baselines. + +
+
+ comment: 33 pages, 6 figures and 12 tables +
+
+
+
+
+ + ♻ ☆ MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation + Framework + + +
+ Medical imaging segmentation is a highly active area of research, with deep +learning-based methods achieving state-of-the-art results in several +benchmarks. However, the lack of standardized tools for training, testing, and +evaluating new methods makes the comparison of methods difficult. To address +this, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple, +modular, and end-to-end medical imaging segmentation framework designed to +facilitate consistent training, testing, and evaluation of deep learning-based +medical imaging segmentation methods. MIST standardizes data analysis, +preprocessing, and evaluation pipelines, accommodating multiple architectures +and loss functions. This standardization ensures reproducible and fair +comparisons across different methods. We detail MIST's data format +requirements, pipelines, and auxiliary features and demonstrate its efficacy +using the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results +highlight MIST's ability to produce accurate segmentation masks and its +scalability across multiple GPUs, showcasing its potential as a powerful tool +for future medical imaging research and development. + +
+
+ comment: Submitted to BraTS 2024 +
+
+
+
+
+ + ♻ ☆ Backdoor defense, learnability and obfuscation + + +
+ We introduce a formal notion of defendability against backdoors using a game +between an attacker and a defender. In this game, the attacker modifies a +function to behave differently on a particular input known as the "trigger", +while behaving the same almost everywhere else. The defender then attempts to +detect the trigger at evaluation time. If the defender succeeds with high +enough probability, then the function class is said to be defendable. The key +constraint on the attacker that makes defense possible is that the attacker's +strategy must work for a randomly-chosen trigger. + Our definition is simple and does not explicitly mention learning, yet we +demonstrate that it is closely connected to learnability. In the +computationally unbounded setting, we use a voting algorithm of Hanneke et al. +(2022) to show that defendability is essentially determined by the VC dimension +of the function class, in much the same way as PAC learnability. In the +computationally bounded setting, we use a similar argument to show that +efficient PAC learnability implies efficient defendability, but not conversely. +On the other hand, we use indistinguishability obfuscation to show that the +class of polynomial size circuits is not efficiently defendable. Finally, we +present polynomial size decision trees as a natural example for which defense +is strictly easier than learning. Thus, we identify efficient defendability as +a notable intermediate concept in between efficient learnability and +obfuscation. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ Identifying and Addressing Delusions for Target-Directed Decision-Making + + +
+ Target-directed agents utilize self-generated targets, to guide their +behaviors for better generalization. These agents are prone to blindly chasing +problematic targets, resulting in worse generalization and safety catastrophes. +We show that these behaviors can be results of delusions, stemming from +improper designs around training: the agent may naturally come to hold false +beliefs about certain targets. We identify delusions via intuitive examples in +controlled environments, and investigate their causes and mitigations. With the +insights, we demonstrate how we can make agents address delusions preemptively +and autonomously. We validate empirically the effectiveness of the proposed +strategies in correcting delusional behaviors and improving out-of-distribution +generalization. + +
+
+ comment: 20241118 12h40: incorporated changes of rebuttal +
+
+
+
+
+ + ♻ ☆ DAWN: Designing Distributed Agents in a Worldwide Network + + +
+ The rapid evolution of Large Language Models (LLMs) has transformed them from +basic conversational tools into sophisticated entities capable of complex +reasoning and decision-making. These advancements have led to the development +of specialized LLM-based agents designed for diverse tasks such as coding and +web browsing. As these agents become more capable, the need for a robust +framework that facilitates global communication and collaboration among them +towards advanced objectives has become increasingly critical. Distributed +Agents in a Worldwide Network (DAWN) addresses this need by offering a +versatile framework that integrates LLM-based agents with traditional software +systems, enabling the creation of agentic applications suited for a wide range +of use cases. DAWN enables distributed agents worldwide to register and be +easily discovered through Gateway Agents. Collaborations among these agents are +coordinated by a Principal Agent equipped with reasoning strategies. DAWN +offers three operational modes: No-LLM Mode for deterministic tasks, Copilot +for augmented decision-making, and LLM Agent for autonomous operations. +Additionally, DAWN ensures the safety and security of agent collaborations +globally through a dedicated safety, security, and compliance layer, protecting +the network against attackers and adhering to stringent security and compliance +standards. These features make DAWN a robust network for deploying agent-based +applications across various industries. + +
+
+
+
+
+ + ♻ ☆ Fine-Tuning a Time Series Foundation Model with Wasserstein Loss + + +
+ Inspired by recent advancements in large language models (LLMs) for Natural +Language Processing (NLP), there has been a surge in research focused on +developing foundational models for time series forecasting. One approach +involves training LLM architectures on tokenized time series data using +cross-entropy loss. Although this method has demonstrated promising results, +cross-entropy loss is primarily designed for classification tasks and does not +account for the distance between classes. To address this limitation, we +propose using the Wasserstein loss for such architectures. To validate our +approach, we fine-tuned a foundational time series model on $22$ zero-shot +datasets, comparing the performance of cross-entropy loss with that of +Wasserstein loss. Our results demonstrate that replacing cross-entropy loss +with Wasserstein loss significantly improves point estimation. + +
+
+ comment: 4 main pages; 2 figures +
+
+
+
+
+ + ♻ ☆ PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset + + +
+ Multimodal Large Language Models (MLLMs) hallucinate, resulting in an +emerging topic of visual hallucination evaluation (VHE). This paper contributes +a ChatGPT-Prompted visual hallucination evaluation Dataset (PhD) for objective +VHE at a large scale. The essence of VHE is to ask an MLLM questions about +specific images to assess its susceptibility to hallucination. Depending on +what to ask (objects, attributes, sentiment, etc.) and how the questions are +asked, we structure PhD along two dimensions, i.e., task and mode. Five visual +recognition tasks, ranging from low-level (object / attribute recognition) to +middle-level (sentiment / position recognition and counting), are considered. +Besides a normal visual QA mode, which we term PhD-base, PhD also asks +questions with inaccurate context (PhD-iac) or with incorrect context +(PhD-icc), or with AI-generated counter common sense images (PhD-ccs). We +construct PhD by a ChatGPT-assisted semi-automated pipeline, encompassing four +pivotal modules: task-specific hallucinatory item (hitem) selection, +hitem-embedded question generation, inaccurate / incorrect context generation, +and counter-common-sense (CCS) image generation. With over 14k daily images, +750 CCS images and 102k VQA triplets in total, PhD reveals considerable +variability in MLLMs' performance across various modes and tasks, offering +valuable insights into the nature of hallucination. As such, PhD stands as a +potent tool not only for VHE but may also play a significant role in the +refinement of MLLMs. + +
+
+
+
+
+ + ♻ ☆ DARNet: Dual Attention Refinement Network with Spatiotemporal + Construction for Auditory Attention Detection + + +
+ At a cocktail party, humans exhibit an impressive ability to direct their +attention. The auditory attention detection (AAD) approach seeks to identify +the attended speaker by analyzing brain signals, such as EEG signals. However, +current AAD algorithms overlook the spatial distribution information within EEG +signals and lack the ability to capture long-range latent dependencies, +limiting the model's ability to decode brain activity. To address these issues, +this paper proposes a dual attention refinement network with spatiotemporal +construction for AAD, named DARNet, which consists of the spatiotemporal +construction module, dual attention refinement module, and feature fusion \& +classifier module. Specifically, the spatiotemporal construction module aims to +construct more expressive spatiotemporal feature representations, by capturing +the spatial distribution characteristics of EEG signals. The dual attention +refinement module aims to extract different levels of temporal patterns in EEG +signals and enhance the model's ability to capture long-range latent +dependencies. The feature fusion \& classifier module aims to aggregate +temporal patterns and dependencies from different levels and obtain the final +classification results. The experimental results indicate that compared to the +state-of-the-art models, DARNet achieves an average classification accuracy +improvement of 5.9\% for 0.1s, 4.6\% for 1s, and 3.9\% for 2s on the DTU +dataset. While maintaining excellent classification performance, DARNet +significantly reduces the number of required parameters. Compared to the +state-of-the-art models, DARNet reduces the parameter count by 91\%. Code is +available at: https://github.com/fchest/DARNet.git. + +
+
+
+
+
+ + ♻ ☆ Partial Information Decomposition for Data Interpretability and Feature + Selection + + +
+ In this paper, we introduce Partial Information Decomposition of Features +(PIDF), a new paradigm for simultaneous data interpretability and feature +selection. Contrary to traditional methods that assign a single importance +value, our approach is based on three metrics per feature: the mutual +information shared with the target variable, the feature's contribution to +synergistic information, and the amount of this information that is redundant. +In particular, we develop a novel procedure based on these three metrics, which +reveals not only how features are correlated with the target but also the +additional and overlapping information provided by considering them in +combination with other features. We extensively evaluate PIDF using both +synthetic and real-world data, demonstrating its potential applications and +effectiveness, by considering case studies from genetics and neuroscience. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Structure Enhances the Convergence and Generalizability of + Linear Molecular Representation + + +
+ Language models demonstrate fundamental abilities in syntax, semantics, and +reasoning, though their performance often depends significantly on the inputs +they process. This study introduces TSIS (Simplified TSID) and its +variants: TSISD (TSIS with Depth-First Search), TSISO (TSIS in Order), and TSISR +(TSIS in Random), as integral components of the t-SMILES framework. These +additions complete the framework's design, providing diverse approaches to +molecular representation. Through comprehensive analysis and experiments +employing deep generative models, including GPT, diffusion models, and +reinforcement learning, the findings reveal that the hierarchical structure of +t-SMILES is more straightforward to parse than initially anticipated. +Furthermore, t-SMILES consistently outperforms other linear representations +such as SMILES, SELFIES, and SAFE, demonstrating superior convergence speed and +enhanced generalization capabilities. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Modulating Language Model Experiences through Frictions NeurIPS + + +
+ Language models are transforming the ways that their users engage with the +world. Despite impressive capabilities, over-consumption of language model +outputs risks propagating unchecked errors in the short-term and damaging human +capabilities for critical thinking in the long-term. How can we develop +scaffolding around language models to curate more appropriate use? We propose +selective frictions for language model experiences, inspired by behavioral +science interventions, to dampen misuse. Frictions involve small modifications +to a user's experience, e.g., the addition of a button impeding model access +and reminding a user of their expertise relative to the model. Through a user +study with real humans, we observe shifts in user behavior from the imposition +of a friction over LLMs in the context of a multi-topic question-answering task +as a representative task that people may use LLMs for, e.g., in education and +information retrieval. We find that frictions modulate over-reliance by driving +down users' click rates while minimally affecting accuracy for those topics. +Yet, frictions may have unintended effects. We find marked differences in +users' click behaviors even on topics where frictions were not provisioned. Our +contributions motivate further study of human-AI behavioral interaction to +inform more effective and appropriate LLM use. + +
+
+ comment: NeurIPS Workshop on Behavioral ML; non-archival +
+
+
+
+
+ + ♻ ☆ Utilizing Large Language Models in an iterative paradigm with domain + feedback for molecule optimization + + +
+ Molecule optimization is a critical task in drug discovery to optimize +desired properties of a given molecule through chemical modification. Despite +Large Language Models (LLMs) holding the potential to efficiently simulate this +task by using natural language to direct the optimization, straightforwardly +utilizing them shows limited performance. In this work, we facilitate utilizing +LLMs in an iterative paradigm by proposing a simple yet highly effective domain +feedback provider, namely $\text{Re}^3$DF. In detail, $\text{Re}^3$DF harnesses +an external toolkit, RDKit, to handle the molecule hallucination, if the +modified molecule is chemically invalid. Otherwise, its desired properties are +computed and compared to the original one, establishing reliable domain +feedback with correct direction and distance towards the objective, followed by +a retrieved example, to guide the LLM to refine the modified molecule. We +conduct experiments across both single- and multi-property objectives with 2 +thresholds, where $\text{Re}^3$DF shows significant improvements. Particularly, +for 20 single-property objectives, $\text{Re}^3$DF enhances Hit ratio by 16.95% +and 20.76% under loose (\texttt{l}) and strict (\texttt{s}) thresholds, +respectively. For 32 multi-property objectives, $\text{Re}^3$DF enhances Hit +ratio by 6.04% and 5.25%. + +
+
+
+
+
+ + ♻ ☆ Read to Play (R2-Play): Decision Transformer with Multimodal Game + Instruction + + +
+ Developing a generalist agent is a longstanding objective in artificial +intelligence. Previous efforts utilizing extensive offline datasets from +various tasks demonstrate remarkable performance in multitasking scenarios +within Reinforcement Learning. However, these works encounter challenges in +extending their capabilities to new tasks. Recent approaches integrate textual +guidance or visual trajectory into decision networks to provide task-specific +contextual cues, representing a promising direction. However, it is observed +that relying solely on textual guidance or visual trajectory is insufficient +for accurately conveying the contextual information of tasks. This paper +explores enhanced forms of task guidance for agents, enabling them to +comprehend gameplay instructions, thereby facilitating a "read-to-play" +capability. Drawing inspiration from the success of multimodal instruction +tuning in visual tasks, we treat the visual-based RL task as a long-horizon +vision task and construct a set of multimodal game instructions to incorporate +instruction tuning into a decision transformer. Experimental results +demonstrate that incorporating multimodal game instructions significantly +enhances the decision transformer's multitasking and generalization +capabilities. + +
+
+
+
+
+ + ♻ ☆ Parsing altered brain connectivity in neurodevelopmental disorders by + integrating graph-based normative modeling and deep generative networks + + +
+ Divergent brain connectivity is thought to underlie the behavioral and +cognitive symptoms observed in many neurodevelopmental disorders. Quantifying +divergence from neurotypical connectivity patterns offers a promising pathway +to inform diagnosis and therapeutic interventions. While advanced neuroimaging +techniques, such as diffusion MRI (dMRI), have facilitated the mapping of +brain's structural connectome, the challenge lies in accurately modeling +developmental trajectories within these complex networked structures to create +robust neurodivergence markers. In this work, we present the Brain +Representation via Individualized Deep Generative Embedding (BRIDGE) framework, +which integrates normative modeling with a bio-inspired deep generative model +to create a reference trajectory of connectivity transformation as part of +neurotypical development. This will enable the assessment of neurodivergence by +comparing individuals to the established neurotypical trajectory. BRIDGE +provides a global neurodivergence score based on the difference between +connectivity-based brain age and chronological age, along with region-wise +neurodivergence maps that highlight localized connectivity differences. +Application of BRIDGE to a large cohort of children with autism spectrum +disorder demonstrates that the global neurodivergence score correlates with +clinical assessments in autism, and the regional map offers insights into the +heterogeneity at the individual level in neurodevelopmental disorders. +Together, the neurodivergence score and map form powerful tools for quantifying +developmental divergence in connectivity patterns, advancing the development of +imaging markers for personalized diagnosis and intervention in various clinical +contexts. + +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ Cooperative Evolutionary Pressure and Diminishing Returns Might Explain + the Fermi Paradox: On What Super-AIs Are Like + + +
+ With an evolutionary approach, the basis of morality can be explained as +adaptations to problems of cooperation. With 'evolution' taken in a broad +sense, AIs that satisfy the conditions for evolution to apply will be subject +to the same cooperative evolutionary pressure as biological entities. Here the +adaptiveness of increased cooperation as material safety and wealth increase is +discussed -- for humans, for other societies, and for AIs. Diminishing +beneficial returns from increased access to material resources also suggests +the possibility that, on the whole, there will be no incentive to for instance +colonize entire galaxies, thus providing a possible explanation of the Fermi +paradox, wondering where everybody is. It is further argued that old societies +could engender, give way to, super-AIs, since it is likely that super-AIs are +feasible, and fitter. Closing is an aside on effective ways for morals and +goals to affect life and society, emphasizing environments, cultures, and laws, +and exemplified by how to eat. + Appended are an algorithm for colonizing for example a galaxy quickly, models +of the evolution of cooperation and fairness under diminishing returns, and +software for simulating signaling development. It is also noted that there can +be no exponential colonization or reproduction, for mathematical reasons, as +each entity takes up a certain amount of space. 'Diminishing returns' is +defined, as less than roots. + +
+
+ comment: 32 pages, 3 figures. Added definition, clarifications, expansions, + references +
+
+
+
+
+ + ♻ ☆ Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer + from Text to Image via CLIP Inversion WACV 2025 + + +
+ We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary +Image Classifier that uses an autoregressive transformer to generatively output +classification labels as language. Leveraging the extensive knowledge of CLIP +models, NOVIC harnesses the embedding space to enable zero-shot transfer from +pure text to images. Traditional CLIP models, despite their ability for open +vocabulary classification, require an exhaustive prompt of potential class +labels, restricting their application to images of known content or context. To +address this, we propose an "object decoder" model that is trained on a +large-scale 92M-target dataset of templated object noun sets and LLM-generated +captions to always output the object noun in question. This effectively inverts +the CLIP text encoder and allows textual object labels from essentially the +entire English language to be generated directly from image-derived embedding +vectors, without requiring any a priori knowledge of the potential content of +an image, and without any label biases. The trained decoders are tested on a +mix of manually and web-curated datasets, as well as standard image +classification benchmarks, and achieve fine-grained prompt-free prediction +scores of up to 87.5%, a strong result considering the model must work for any +conceivable image and without any contextual clues. + +
+
+ comment: Published at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Separating Tongue from Thought: Activation Patching Reveals + Language-Agnostic Concept Representations in Transformers ICML 2024 + + +
+ A central question in multilingual language modeling is whether large +language models (LLMs) develop a universal concept representation, disentangled +from specific languages. In this paper, we address this question by analyzing +latent representations (latents) during a word translation task in +transformer-based LLMs. We strategically extract latents from a source +translation prompt and insert them into the forward pass on a target +translation prompt. By doing so, we find that the output language is encoded in +the latent at an earlier layer than the concept to be translated. Building on +this insight, we conduct two key experiments. First, we demonstrate that we can +change the concept without changing the language and vice versa through +activation patching alone. Second, we show that patching with the mean over +latents across different languages does not impair and instead improves the +models' performance in translating the concept. Our results provide evidence +for the existence of language-agnostic concept representations within the +investigated models. + +
+
+ comment: 12 pages, 10 figures, previous version published under the title "How + Do Llamas Process Multilingual Text? A Latent Exploration through Activation + Patching" at the ICML 2024 mechanistic interpretability workshop at + https://openreview.net/forum?id=0ku2hIm4BS +
+
+
+
+
+ + ♻ ☆ BertaQA: How Much Do Language Models Know About Local Culture? + + +
+ Large Language Models (LLMs) exhibit extensive knowledge about the world, but +most evaluations have been limited to global or anglocentric subjects. This +raises the question of how well these models perform on topics relevant to +other cultures, whose presence on the web is not that prominent. To address +this gap, we introduce BertaQA, a multiple-choice trivia dataset that is +parallel in English and Basque. The dataset consists of a local subset with +questions pertinent to the Basque culture, and a global subset with questions +of broader interest. We find that state-of-the-art LLMs struggle with local +cultural knowledge, even as they excel on global topics. However, we show that +continued pre-training in Basque significantly improves the models' performance +on Basque culture, even when queried in English. To our knowledge, this is the +first solid evidence of knowledge transfer from a low-resource to a +high-resource language. Our analysis sheds light on the complex interplay +between language and knowledge, and reveals that some prior findings do not +fully hold when reassessed on local topics. Our dataset and evaluation code are +available under open licenses at https://github.com/juletx/BertaQA. + +
+
+ comment: NEURIPS Datasets & Benchmarks 2024 +
+
+
+
+
+ + ♻ ☆ Specification Overfitting in Artificial Intelligence + + +
+ Machine learning (ML) and artificial intelligence (AI) approaches are often +criticized for their inherent bias and for their lack of control, +accountability, and transparency. Consequently, regulatory bodies struggle with +containing this technology's potential negative side effects. High-level +requirements such as fairness and robustness need to be formalized into +concrete specification metrics, imperfect proxies that capture isolated aspects +of the underlying requirements. Given possible trade-offs between different +metrics and their vulnerability to over-optimization, integrating specification +metrics in system development processes is not trivial. This paper defines +specification overfitting, a scenario where systems focus excessively on +specified metrics to the detriment of high-level requirements and task +performance. We present an extensive literature survey to categorize how +researchers propose, measure, and optimize specification metrics in several AI +fields (e.g., natural language processing, computer vision, reinforcement +learning). Using a keyword-based search on papers from major AI conferences and +journals between 2018 and mid-2023, we identify and analyze 74 papers that +propose or optimize specification metrics. We find that although most papers +implicitly address specification overfitting (e.g., by reporting more than one +specification metric), they rarely discuss which role specification metrics +should play in system development or explicitly define the scope and +assumptions behind metric formulations. + +
+
+ comment: 41 pages, 2 figures. Accepted at Artificial Intelligence Review +
+
+
+
+
+ + ♻ ☆ RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual + Dexterous Robot Hands CoRL + + +
+ It has been a long-standing research goal to endow robot hands with +human-level dexterity. Bi-manual robot piano playing constitutes a task that +combines challenges from dynamic tasks, such as generating fast while precise +motions, with slower but contact-rich manipulation problems. Although +reinforcement learning based approaches have shown promising results in +single-task performance, these methods struggle in a multi-song setting. Our +work aims to close this gap and, thereby, enable imitation learning approaches +for robot piano playing at scale. To this end, we introduce the Robot Piano 1 +Million (RP1M) dataset, containing bi-manual robot piano playing motion data of +more than one million trajectories. We formulate finger placements as an +optimal transport problem, thus, enabling automatic annotation of vast amounts +of unlabeled songs. Benchmarking existing imitation learning approaches shows +that such approaches reach state-of-the-art robot piano playing performance by +leveraging RP1M. + +
+
+ comment: Accepted by Conference on Robot Learning (CoRL) 2024. Project + Website: https://rp1m.github.io/ +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Forgetting in Deep Learning Beyond Continual + Learning + + +
+ Forgetting refers to the loss or deterioration of previously acquired +knowledge. While existing surveys on forgetting have primarily focused on +continual learning, forgetting is a prevalent phenomenon observed in various +other research domains within deep learning. Forgetting manifests in research +fields such as generative models due to generator shifts, and federated +learning due to heterogeneous data distributions across clients. Addressing +forgetting encompasses several challenges, including balancing the retention of +old task knowledge with fast learning of new tasks, managing task interference +with conflicting goals, and preventing privacy leakage, etc. Moreover, most +existing surveys on continual learning implicitly assume that forgetting is +always harmful. In contrast, our survey argues that forgetting is a +double-edged sword and can be beneficial and desirable in certain cases, such +as privacy-preserving scenarios. By exploring forgetting in a broader context, +we present a more nuanced understanding of this phenomenon and highlight its +potential advantages. Through this comprehensive survey, we aspire to uncover +potential solutions by drawing upon ideas and approaches from various fields +that have dealt with forgetting. By examining forgetting beyond its +conventional boundaries, we hope to encourage the development of novel +strategies for mitigating, harnessing, or even embracing forgetting in real +applications. A comprehensive list of papers about forgetting in various +research fields is available at +https://github.com/EnnengYang/Awesome-Forgetting-in-Deep-Learning. + +
+
+ comment: accepted at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ A Complete Survey on LLM-based AI Chatbots + + +
+ The past few decades have witnessed an upsurge in data, forming the +foundation for data-hungry, learning-based AI technology. Conversational +agents, often referred to as AI chatbots, rely heavily on such data to train +large language models (LLMs) and generate new content (knowledge) in response +to user prompts. With the advent of OpenAI's ChatGPT, LLM-based chatbots have +set new standards in the AI community. This paper presents a complete survey of +the evolution and deployment of LLM-based chatbots in various sectors. We first +summarize the development of foundational chatbots, followed by the evolution +of LLMs, and then provide an overview of LLM-based chatbots currently in use +and those in the development phase. Recognizing AI chatbots as tools for +generating new knowledge, we explore their diverse applications across various +industries. We then discuss the open challenges, considering how the data used +to train the LLMs and the misuse of the generated knowledge can cause several +issues. Finally, we explore the future outlook to augment their efficiency and +reliability in numerous applications. By addressing key milestones and the +present-day context of LLM-based chatbots, our survey invites readers to delve +deeper into this realm, reflecting on how their next generation will reshape +conversational AI. + +
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Unpicking Data at the Seams: VAEs, Disentanglement and Independent + Components + + +
+ Disentanglement, or identifying salient statistically independent factors of +the data, is of interest in many areas of machine learning and statistics, with +relevance to synthetic data generation with controlled properties, robust +classification of features, parsimonious encoding, and a greater understanding +of the generative process underlying the data. Disentanglement arises in +several generative paradigms, including Variational Autoencoders (VAEs), +Generative Adversarial Networks and diffusion models. Particular progress has +recently been made in understanding disentanglement in VAEs, where the choice +of diagonal posterior covariance matrices is suggested to promote mutual +orthogonality between columns of the decoder's Jacobian. We continue this +thread to show how this linear independence translates to statistical +independence, completing the chain in understanding how the VAE's objective +identifies independent components of, or disentangles, the data. + +
+
+
+
+
+ + ♻ ☆ Character is Destiny: Can Role-Playing Language Agents Make + Persona-Driven Decisions? + + +
+ Can Large Language Models (LLMs) simulate humans in making important +decisions? Recent research has unveiled the potential of using LLMs to develop +role-playing language agents (RPLAs), mimicking mainly the knowledge and tones +of various characters. However, imitative decision-making necessitates a more +nuanced understanding of personas. In this paper, we benchmark the ability of +LLMs in persona-driven decision-making. Specifically, we investigate whether +LLMs can predict characters' decisions provided by the preceding stories in +high-quality novels. Leveraging character analyses written by literary experts, +we construct a dataset LIFECHOICE comprising 1,462 characters' decision points +from 388 books. Then, we conduct comprehensive experiments on LIFECHOICE, with +various LLMs and RPLA methodologies. The results demonstrate that +state-of-the-art LLMs exhibit promising capabilities in this task, yet +substantial room for improvement remains. Hence, we further propose the CHARMAP +method, which adopts persona-based memory retrieval and significantly advances +RPLAs on this task, achieving 5.03% increase in accuracy. + +
+
+
+
+
+ + ♻ ☆ ARNN: Attentive Recurrent Neural Network for Multi-channel EEG Signals + to Identify Epileptic Seizures + + +
+ Electroencephalography (EEG) is a widely used tool for diagnosing brain +disorders due to its high temporal resolution, non-invasive nature, and +affordability. Manual analysis of EEG is labor-intensive and requires +expertise, making automatic EEG interpretation crucial for reducing workload +and accurately assessing seizures. In epilepsy diagnosis, prolonged EEG +monitoring generates extensive data, often spanning hours, days, or even weeks. +While machine learning techniques for automatic EEG interpretation have +advanced significantly in recent decades, there remains a gap in its ability to +efficiently analyze large datasets with a balance of accuracy and computational +efficiency. To address the challenges mentioned above, an Attention Recurrent +Neural Network (ARNN) is proposed that can process a large amount of data +efficiently and accurately. This ARNN cell recurrently applies attention layers +along a sequence and has linear complexity with the sequence length and +leverages parallel computation by processing multi-channel EEG signals rather +than single-channel signals. In this architecture, the attention layer is a +computational unit that efficiently applies self-attention and cross-attention +mechanisms to compute a recurrent function over a wide number of state vectors +and input signals. This framework is inspired in part by the attention layer +and long short-term memory (LSTM) cells, but it scales this typical cell up by +several orders to parallelize for multi-channel EEG signals. It inherits the +advantages of attention layers and LSTM gate while avoiding their respective +drawbacks. The model's effectiveness is evaluated through extensive experiments +with heterogeneous datasets, including the CHB-MIT and UPenn and Mayo's Clinic +datasets. + +
+
+ comment: 11 pages, 7 figures, Journal Paper +
+
+
+
+
+ + ♻ ☆ LLMs and Memorization: On Quality and Specificity of Copyright + Compliance + + +
+ Memorization in large language models (LLMs) is a growing concern. LLMs have +been shown to easily reproduce parts of their training data, including +copyrighted work. This is an important problem to solve, as it may violate +existing copyright laws as well as the European AI Act. In this work, we +propose a systematic analysis to quantify the extent of potential copyright +infringements in LLMs using European law as an example. Unlike previous work, +we evaluate instruction-finetuned models in a realistic end-user scenario. Our +analysis builds on a proposed threshold of 160 characters, which we borrow from +the German Copyright Service Provider Act and a fuzzy text matching algorithm +to identify potentially copyright-infringing textual reproductions. The +specificity of countermeasures against copyright infringement is analyzed by +comparing model behavior on copyrighted and public domain data. We investigate +what behaviors models show instead of producing protected text (such as refusal +or hallucination) and provide a first legal assessment of these behaviors. We +find that there are huge differences in copyright compliance, specificity, and +appropriate refusal among popular LLMs. Alpaca, GPT 4, GPT 3.5, and Luminous +perform best in our comparison, with OpenGPT-X, Alpaca, and Luminous producing +a particularly low absolute number of potential copyright violations. Code can +be found at https://github.com/felixbmuller/llms-memorization-copyright. + +
+
+ comment: 10 pages, 3 figures, AIES 2024 conference +
+
+
+
+
+ + ♻ ☆ Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form + Medical Question Answering Applications and Beyond + + +
+ Uncertainty estimation is crucial for the reliability of safety-critical +human and artificial intelligence (AI) interaction systems, particularly in the +domain of healthcare engineering. However, a robust and general uncertainty +measure for free-form answers has not been well-established in open-ended +medical question-answering (QA) tasks, where generative inequality introduces a +large number of irrelevant words and sequences within the generated set for +uncertainty quantification (UQ), which can lead to biases. This paper +introduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at +both the word and sequence levels, considering semantic relevance. WSE +quantifies uncertainty in a way that is more closely aligned with the +reliability of LLMs during uncertainty quantification (UQ). We compare WSE with +six baseline methods on five free-form medical QA datasets, utilizing seven +popular large language models (LLMs). Experimental results demonstrate that WSE +exhibits superior performance in UQ under two standard criteria for correctness +evaluation. Additionally, in terms of real-world medical QA applications, the +performance of LLMs is significantly enhanced (e.g., a 6.36% improvement in +model accuracy on the COVID-QA dataset) by employing responses with lower +uncertainty that are identified by WSE as final answers, without any additional +task-specific fine-tuning or architectural modifications. + +
+
+ comment: Accepted by Engineering Applications of Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Hacking Back the AI-Hacker: Prompt Injection as a Defense Against + LLM-driven Cyberattacks + + +
+ Large language models (LLMs) are increasingly being harnessed to automate +cyberattacks, making sophisticated exploits more accessible and scalable. In +response, we propose a new defense strategy tailored to counter LLM-driven +cyberattacks. We introduce Mantis, a defensive framework that exploits LLMs' +susceptibility to adversarial inputs to undermine malicious operations. Upon +detecting an automated cyberattack, Mantis plants carefully crafted inputs into +system responses, leading the attacker's LLM to disrupt their own operations +(passive defense) or even compromise the attacker's machine (active defense). +By deploying purposefully vulnerable decoy services to attract the attacker and +using dynamic prompt injections for the attacker's LLM, Mantis can autonomously +hack back the attacker. In our experiments, Mantis consistently achieved over +95% effectiveness against automated LLM-driven attacks. To foster further +research and collaboration, Mantis is available as an open-source tool: +https://github.com/pasquini-dario/project_mantis + +
+
+ comment: v0.2 (evaluated on more agents) +
+
+
+
+
+ + ♻ ☆ ConU: Conformal Uncertainty in Large Language Models with Correctness + Coverage Guarantees EMNLP 2024 + + +
+ Uncertainty quantification (UQ) in natural language generation (NLG) tasks +remains an open challenge, exacerbated by the closed-source nature of the +latest large language models (LLMs). This study investigates applying conformal +prediction (CP), which can transform any heuristic uncertainty notion into +rigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We +introduce a novel uncertainty measure based on self-consistency theory, and +then develop a conformal uncertainty criterion by integrating the uncertainty +condition aligned with correctness into the CP algorithm. Empirical evaluations +indicate that our uncertainty measure outperforms prior state-of-the-art +methods. Furthermore, we achieve strict control over the correctness coverage +rate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning +general-purpose and medical scenarios. Additionally, the calibrated prediction +sets with small size further highlight the efficiency of our method in +providing trustworthy guarantees for practical open-ended NLG applications. + +
+
+ comment: Accepted by EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ AI's Spatial Intelligence: Evaluating AI's Understanding of Spatial + Transformations in PSVT:R and Augmented Reality + + +
+ Spatial intelligence is important in Architecture, Construction, Science, +Technology, Engineering, and Mathematics (STEM), and Medicine. Understanding +three-dimensional (3D) spatial rotations can involve verbal descriptions and +visual or interactive examples, illustrating how objects change orientation in +3D space. Recent studies show Artificial Intelligence (AI) with language and +vision capabilities still face limitations in spatial reasoning. In this paper, +we have studied generative AI's spatial capabilities of understanding rotations +of objects utilizing its image and language processing features. We examined +the spatial intelligence of the GPT-4 model with vision in understanding +spatial rotation process with diagrams based on the Revised Purdue Spatial +Visualization Test: Visualization of Rotations (Revised PSVT:R). Next, we +incorporated a layer of coordinate system axes on Revised PSVT:R to study the +variations in GPT-4's performance. We also examined GPT-4's understanding of 3D +rotations in Augmented Reality (AR) scenes that visualize spatial rotations of +an object in 3D space and observed increased accuracy of GPT-4's understanding +of the rotations by adding supplementary textual information depicting the +rotation process or mathematical representations of the rotation (e.g., +matrices). The results indicate that while GPT-4 as a major current Generative +AI model lacks the understanding of a spatial rotation process, it has the +potential to understand the rotation process with additional information that +can be provided by methods such as AR. By combining the potentials in spatial +intelligence of AI with AR's interactive visualization abilities, we expect to +offer enhanced guidance for students' spatial learning activities. Such spatial +guidance can benefit understanding spatial transformations and additionally +support processes like assembly, fabrication, and manufacturing. + +
+
+
+
+
+ + ♻ ☆ Semantic Operators: A Declarative Model for Rich, AI-based Analytics + Over Text Data + + +
+ The semantic capabilities of language models (LMs) have the potential to +enable rich analytics and reasoning over vast knowledge corpora. Unfortunately, +existing systems lack high-level abstractions to perform bulk semantic queries +across large corpora. We introduce semantic operators, a declarative +programming interface that extends the relational model with composable +AI-based operations for bulk semantic queries (e.g., filtering, sorting, +joining or aggregating records using natural language criteria). Each operator +can be implemented and optimized in multiple ways, opening a rich space for +execution plans similar to relational operators. We implement our operators in +LOTUS, an open source query engine with a DataFrame API. Furthermore, we +develop several novel optimizations that take advantage of the declarative +nature of semantic operators to accelerate semantic filtering, clustering and +join operators by up to $400\times$ while offering statistical accuracy +guarantees. We demonstrate LOTUS' effectiveness on real AI applications +including fact-checking, extreme multi-label classification, and search. We +show that the semantic operator model is expressive, capturing state-of-the-art +AI pipelines in a few operator calls, and making it easy to express new +pipelines that achieve up to $180\%$ higher quality. Overall, LOTUS queries +match or exceed the accuracy of state-of-the-art AI pipelines for each task +while running up to 28$\times$ faster. LOTUS is publicly available at +https://github.com/stanford-futuredata/lotus. + +
+
+
+
+
+ + ♻ ☆ The why, what, and how of AI-based coding in scientific research + + +
+ Computer programming (coding) is indispensable for researchers across +disciplines, yet it remains challenging to learn and time-consuming to carry +out. Generative AI, particularly large language models (LLMs), has the +potential to transform coding into intuitive conversations, but best practices +and effective workflows are only emerging. We dissect AI-based coding through +three key lenses: the nature and role of LLMs in coding (why), six types of +coding assistance they provide (what), and a five-step workflow in action with +practical implementation strategies (how). Additionally, we address the +limitations and future outlook of AI in coding. By offering actionable +insights, this framework helps to guide researchers in effectively leveraging +AI to enhance coding practices and education, accelerating scientific progress. + +
+
+ comment: 23 pages, 7 figures, 3 boxes +
+
+
+
+
+ + ♻ ☆ SAD-TIME: a Spatiotemporal-fused network for depression detection with + Automated multi-scale Depth-wise and TIME-interval-related common feature + extractor + + +
+ Background and Objective: Depression is a severe mental disorder, and +accurate diagnosis is pivotal to the cure and rehabilitation of people with +depression. However, the current questionnaire-based diagnostic methods could +bring subjective biases and may be denied by subjects. In search of a more +objective means of diagnosis, researchers have begun to experiment with deep +learning-based methods for identifying depressive disorders in recent years. +Methods: In this study, a novel Spatiotemporal-fused network with Automated +multi-scale Depth-wise and TIME-interval-related common feature extractor +(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common +features extractor (CFE), a spatial sector (SpS), a modified temporal sector +(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale +depth-wise 1D-convolutional neural network and a time-interval embedding +generator, where the unique information of each channel is preserved. The SpS +fuses the functional connectivity with the distance-based connectivity +containing spatial position of EEG electrodes. A multi-head-attention graph +convolutional network is also applied in the SpS to fuse the features from +different EEG channels. The TeS is based on long short-term memory and graph +transformer networks, where the temporal information of different time-windows +is fused. Moreover, the DAL is used after the SpS to obtain the +domain-invariant feature. Results: Experimental results under tenfold +cross-validation show that the proposed SAD-TIME method achieves 92.00% and +94.00% depression classification accuracies on two datasets, respectively, in +cross-subject mode. Conclusion: SAD-TIME is a robust depression detection +model, where the automatedly-generated features, the SpS and the TeS assist the +classification performance with the fusion of the innate spatiotemporal +information in the EEG signals. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Federated Graph Condensation with Information Bottleneck Principles + + +
+ Graph condensation, which reduces the size of a large-scale graph by +synthesizing a small-scale condensed graph as its substitution, has immediately +benefited various graph learning tasks. However, existing graph condensation +methods rely on centralized data storage, which is unfeasible for real-world +decentralized data distribution, and overlook data holders' privacy-preserving +requirements. To bridge the gap, we propose and study the novel problem of +federated graph condensation for graph neural networks (GNNs). Specifically, we +first propose a general framework for federated graph condensation, in which we +decouple the typical gradient matching process for graph condensation into +client-side gradient calculation and server-side gradient matching. In this +way, the burdensome computation cost in client-side is largely alleviated. +Besides, our empirical studies show that under the federated setting, the +condensed graph will consistently leak data membership privacy, i.e., the +condensed graph during the federated training can be utilized to steal the +training data under the membership inference attacks (MIA). To tackle this +issue, we innovatively incorporate information bottleneck principles into the +federated graph condensation, which only needs to extract partial node features +in one local pre-training step and utilize the features during federated +training. Extensive experiments on real-world datasets demonstrate that our +framework can consistently protect membership privacy during training. +Meanwhile, it also achieves comparable and even superior performance against +existing centralized graph condensation and federated graph learning methods. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ A Framework for Leveraging Partially-Labeled Data for Product + Attribute-Value Identification + + +
+ In the e-commerce domain, the accurate extraction of attribute-value pairs +(e.g., Brand: Apple) from product titles and user search queries is crucial for +enhancing search and recommendation systems. A major challenge with neural +models for this task is the lack of high-quality training data, as the +annotations for attribute-value pairs in the available datasets are often +incomplete. To address this, we introduce GenToC, a model designed for training +directly with partially-labeled data, eliminating the necessity for a fully +annotated dataset. GenToC employs a marker-augmented generative model to +identify potential attributes, followed by a token classification model that +determines the associated values for each attribute. GenToC outperforms +existing state-of-the-art models, exhibiting up to a 56.3% increase in the number +of accurate extractions. Furthermore, we utilize GenToC to regenerate the +training dataset to expand attribute-value annotations. This bootstrapping +substantially improves the data quality for training other standard NER models, +which are typically faster but less capable in handling partially-labeled data, +enabling them to achieve comparable performance to GenToC. Our results +demonstrate GenToC's unique ability to learn from a limited set of +partially-labeled data and improve the training of more efficient models, +advancing the automated extraction of attribute-value pairs. Finally, our model +has been successfully integrated into IndiaMART, India's largest B2B e-commerce +platform, achieving a significant increase of 20.2% in the number of correctly +identified attribute-value pairs over the existing deployed system while +achieving a high precision of 89.5%. + +
+
+ comment: Accepted to KDD 2025 ADS Track +
+
+
+
+
+ + ♻ ☆ Automating Autograding: Large Language Models as Test Suite Generators + for Introductory Programming + + +
+ Automatically graded programming assignments provide instant feedback to +students and significantly reduce manual grading time for instructors. However, +creating comprehensive suites of test cases for programming problems within +automatic graders can be time-consuming and complex. The effort needed to +define test suites may deter some instructors from creating additional problems +or lead to inadequate test coverage, potentially resulting in misleading +feedback on student solutions. Such limitations may reduce student access to +the well-documented benefits of timely feedback when learning programming. + In this work, we evaluate the effectiveness of using Large Language Models +(LLMs), as part of a larger workflow, to automatically generate test suites for +CS1-level programming problems. Each problem's statement and reference solution +are provided to GPT-4 to produce a test suite that can be used by an +autograder. We evaluate our proposed approach using a sample of 26 problems, +and more than 25,000 attempted solutions to those problems, submitted by +students in an introductory programming course. We compare the performance of +the LLM-generated test suites against the instructor-created test suites for +each problem. Our findings reveal that LLM-generated test suites can correctly +identify most valid solutions, and for most problems are at least as +comprehensive as the instructor test suites. Additionally, the LLM-generated +test suites exposed ambiguities in some problem statements, underscoring their +potential to improve both autograding and instructional design. + +
+
+ comment: Submitted to Journal of Computer Assisted Learning; updated table + refs +
+
+
+
+
+ + ♻ ☆ Optimized Feature Generation for Tabular Data via LLMs with Decision + Tree Reasoning NeurIPS 2024 + + +
+ In tabular prediction tasks, tree-based models combined with automated +feature engineering methods often outperform deep learning approaches that rely +on learned representations. While these feature engineering techniques are +effective, they typically depend on a pre-defined search space and primarily +use validation scores for feature selection, thereby missing valuable insights +from previous experiments. To address these limitations, we propose a novel +tabular learning framework that utilizes large language models (LLMs), termed +Optimizing Column feature generator with decision Tree reasoning (OCTree). Our +key idea is to leverage the reasoning capabilities of LLMs to identify +effective feature generation rules without manually specifying the search space +and provide language-based reasoning information highlighting past experiments +as feedback for iterative rule improvements. We use decision trees to convey +this reasoning information, as they can be easily represented in natural +language, effectively providing knowledge from prior experiments (i.e., the +impact of the generated features on performance) to the LLMs. Our empirical +results demonstrate that OCTree consistently enhances the performance of +various prediction models across diverse benchmarks, outperforming competing +automated feature engineering methods. Code is available at +https://github.com/jaehyun513/OCTree. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Open Domain Question Answering with Conflicting Contexts + + +
+ Open domain question answering systems frequently rely on information +retrieved from large collections of text (such as the Web) to answer questions. +However, such collections of text often contain conflicting information, and +indiscriminately depending on this information may result in untruthful and +inaccurate answers. To understand the gravity of this problem, we collect a +human-annotated dataset, Question Answering with Conflicting Contexts (QACC), +and find that as much as 25% of unambiguous, open domain questions can lead to +conflicting contexts when retrieved using Google Search. We evaluate and +benchmark three powerful Large Language Models (LLMs) with our dataset QACC and +demonstrate their limitations in effectively addressing questions with +conflicting information. To explore how humans reason through conflicting +contexts, we request our annotators to provide explanations for their +selections of correct answers. We demonstrate that by finetuning LLMs to +explain their answers, we can introduce richer information into their training +that guides them through the process of reasoning with conflicting contexts. + +
+
+
+
+
+ + ♻ ☆ LibreLog: Accurate and Efficient Unsupervised Log Parsing Using + Open-Source Large Language Models + + +
+ Log parsing is a critical step that transforms unstructured log data into +structured formats, facilitating subsequent log-based analysis. Traditional +syntax-based log parsers are efficient and effective, but they often experience +decreased accuracy when processing logs that deviate from the predefined rules. +Recently, large language models (LLM) based log parsers have shown superior +parsing accuracy. However, existing LLM-based parsers face three main +challenges: 1) time-consuming and labor-intensive manual labeling for +fine-tuning or in-context learning, 2) increased parsing costs due to the vast +volume of log data and limited context size of LLMs, and 3) privacy risks from +using commercial models like ChatGPT with sensitive log information. To +overcome these limitations, this paper introduces LibreLog, an unsupervised log +parsing approach that leverages open-source LLMs (i.e., Llama3-8B) to enhance +privacy and reduce operational costs while achieving state-of-the-art parsing +accuracy. LibreLog first groups logs with similar static text but varying +dynamic variables using a fixed-depth grouping tree. It then parses logs within +these groups using three components: i) similarity scoring-based retrieval +augmented generation: selects diverse logs within each group based on Jaccard +similarity, helping the LLM distinguish between static text and dynamic +variables; ii) self-reflection: iteratively query LLMs to refine log templates +to improve parsing accuracy; and iii) log template memory: stores parsed +templates to reduce LLM queries for improved parsing efficiency. Our evaluation +on LogHub-2.0 shows that LibreLog achieves 25% higher parsing accuracy and +processes logs 2.7 times faster compared to state-of-the-art LLM-based parsers. +In short, LibreLog addresses privacy and cost concerns of using commercial LLMs +while achieving state-of-the-art parsing efficiency and accuracy. + +
+
+
+
+
+ + ♻ ☆ CerviXpert: A Multi-Structural Convolutional Neural Network for + Predicting Cervix Type and Cervical Cell Abnormalities + + +
+ Cervical cancer is a major cause of cancer-related mortality among women +worldwide, and its survival rate improves significantly with early detection. +Traditional diagnostic methods such as Pap smears and cervical biopsies rely +heavily on cytologist expertise, making the process prone to human error. This +study introduces CerviXpert, a multi-structural convolutional neural network +model designed to efficiently classify cervix types and detect cervical cell +abnormalities. CerviXpert is built as a computationally efficient model that +classifies cervical cancer using images from the publicly available SiPaKMeD +dataset. The model architecture emphasizes simplicity, using a limited number +of convolutional layers followed by max pooling and dense layers, trained from +scratch. + We assessed the performance of CerviXpert against other state of the art +convolutional neural network models including ResNet50, VGG16, MobileNetV2, and +InceptionV3, evaluating them on accuracy, computational efficiency, and +robustness using five fold cross validation. CerviXpert achieved an accuracy of +98.04 percent in classifying cervical cell abnormalities into three classes and +98.60 percent for five class cervix type classification, outperforming +MobileNetV2 and InceptionV3 in both accuracy and computational requirements. It +showed comparable results to ResNet50 and VGG16 while reducing computational +complexity and resource needs. + CerviXpert provides an effective solution for cervical cancer screening and +diagnosis, balancing accuracy with computational efficiency. Its streamlined +design enables deployment in resource constrained environments, potentially +enhancing early detection and management of cervical cancer. + +
+
+ comment: 11 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Matching Patients to Clinical Trials with Large Language Models + + +
+ Patient recruitment is challenging for clinical trials. We introduce +TrialGPT, an end-to-end framework for zero-shot patient-to-trial matching with +large language models. TrialGPT comprises three modules: it first performs +large-scale filtering to retrieve candidate trials (TrialGPT-Retrieval); then +predicts criterion-level patient eligibility (TrialGPT-Matching); and finally +generates trial-level scores (TrialGPT-Ranking). We evaluate TrialGPT on three +cohorts of 183 synthetic patients with over 75,000 trial annotations. +TrialGPT-Retrieval can recall over 90% of relevant trials using less than 6% of +the initial collection. Manual evaluations on 1,015 patient-criterion pairs +show that TrialGPT-Matching achieves an accuracy of 87.3% with faithful +explanations, close to the expert performance. The TrialGPT-Ranking scores are +highly correlated with human judgments and outperform the best-competing models +by 43.8% in ranking and excluding trials. Furthermore, our user study reveals +that TrialGPT can reduce the screening time by 42.6% in patient recruitment. +Overall, these results have demonstrated promising opportunities for +patient-to-trial matching with TrialGPT. + +
+
+ comment: Nature Communications +
+
+
+
+
+ + ♻ ☆ MEEG and AT-DGNN: Improving EEG Emotion Recognition with Music + Introducing and Graph-based Learning + + +
+ We present the MEEG dataset, a multi-modal collection of music-induced +electroencephalogram (EEG) recordings designed to capture emotional responses +to various musical stimuli across different valence and arousal levels. This +public dataset facilitates an in-depth examination of brainwave patterns within +musical contexts, providing a robust foundation for studying brain network +topology during emotional processing. Leveraging the MEEG dataset, we introduce +the Attention-based Temporal Learner with Dynamic Graph Neural Network +(AT-DGNN), a novel framework for EEG-based emotion recognition. This model +combines an attention mechanism with a dynamic graph neural network (DGNN) to +capture intricate EEG dynamics. The AT-DGNN achieves state-of-the-art (SOTA) +performance with an accuracy of 83.74% in arousal recognition and 86.01% in +valence recognition, outperforming existing SOTA methods. Comparative analysis +with traditional datasets, such as DEAP, further validates the model's +effectiveness and underscores the potency of music as an emotional stimulus. +This study advances graph-based learning methodology in brain-computer +interfaces (BCI), significantly improving the accuracy of EEG-based emotion +recognition. The MEEG dataset and source code are publicly available at +https://github.com/xmh1011/AT-DGNN. + +
+
+
+
+
+ + ♻ ☆ DreamText: High Fidelity Scene Text Synthesis + + +
+ Scene text synthesis involves rendering specified texts onto arbitrary +images. Current methods typically formulate this task in an end-to-end manner +but lack effective character-level guidance during training. Besides, their +text encoders, pre-trained on a single font type, struggle to adapt to the +diverse font styles encountered in practical applications. Consequently, these +methods suffer from character distortion, repetition, and absence, particularly +in polystylistic scenarios. To this end, this paper proposes DreamText for +high-fidelity scene text synthesis. Our key idea is to reconstruct the +diffusion training process, introducing more refined guidance tailored to this +task, to expose and rectify the model's attention at the character level and +strengthen its learning of text regions. This transformation poses a hybrid +optimization challenge, involving both discrete and continuous variables. To +effectively tackle this challenge, we employ a heuristic alternate optimization +strategy. Meanwhile, we jointly train the text encoder and generator to +comprehensively learn and utilize the diverse font present in the training +dataset. This joint training is seamlessly integrated into the alternate +optimization process, fostering a synergistic relationship between learning +character embedding and re-estimating character attention. Specifically, in +each step, we first encode potential character-generated position information +from cross-attention maps into latent character masks. These masks are then +utilized to update the representation of specific characters in the current +step, which, in turn, enables the generator to correct the character's +attention in the subsequent steps. Both qualitative and quantitative results +demonstrate the superiority of our method to the state of the art. + +
+
+ comment: Code: https://github.com/CodeGoat24/DreamText, Project page: + https://codegoat24.github.io/DreamText/ +
+
+
+
+
+ + ♻ ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs' +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ MagicFace: Training-free Universal-Style Human Image Customized + Synthesis + + +
+ Current human image customization methods leverage Stable Diffusion (SD) for +its rich semantic prior. However, since SD is not specifically designed for +human-oriented generation, these methods often require extensive fine-tuning on +large-scale datasets, which renders them susceptible to overfitting and hinders +their ability to personalize individuals with previously unseen styles. +Moreover, these methods extensively focus on single-concept human image +synthesis and lack the flexibility to customize individuals using multiple +given concepts, thereby impeding their broader practical application. This +paper proposes MagicFace, a novel training-free method for multi-concept +universal-style human image personalized synthesis. Our core idea is to +simulate how humans create images given specific concepts, i.e., first +establish a semantic layout considering factors such as concepts' shape and +posture, then optimize details by comparing with concepts at the pixel level. +To implement this process, we introduce a coarse-to-fine generation pipeline, +involving two sequential stages: semantic layout construction and concept +feature injection. This is achieved by our Reference-aware Self-Attention (RSA) +and Region-grouped Blend Attention (RBA) mechanisms. In the first stage, RSA +enables the latent image to query features from all reference concepts +simultaneously, extracting the overall semantic understanding to facilitate the +initial semantic layout establishment. In the second stage, we employ an +attention-based semantic segmentation method to pinpoint the latent generated +regions of all concepts at each step. Following this, RBA divides the pixels of +the latent image into semantic groups, with each group querying fine-grained +features from the corresponding reference concept. Extensive experiments +demonstrate the superiority of our MagicFace. + +
+
+ comment: project page: https://codegoat24.github.io/MagicFace +
+
+
+
+
+ + ♻ ☆ ObjectNLQ @ Ego4D Episodic Memory Challenge 2024 CVPR + + +
+ In this report, we present our approach for the Natural Language Query track +and Goal Step track of the Ego4D Episodic Memory Benchmark at CVPR 2024. Both +challenges require the localization of actions within long video sequences +using textual queries. To enhance localization accuracy, our method not only +processes the temporal information of videos but also identifies fine-grained +objects spatially within the frames. To this end, we introduce a novel +approach, termed ObjectNLQ, which incorporates an object branch to augment the +video representation with detailed object information, thereby improving +grounding efficiency. ObjectNLQ achieves a mean R@1 of 23.15, ranking 2nd in +the Natural Language Queries Challenge, and gains 33.00 in terms of the metric +R@1, IoU=0.3, ranking 3rd in the Goal Step Challenge. Our code will be released +at https://github.com/Yisen-Feng/ObjectNLQ. + +
+
+ comment: The solution for the Natural Language Query track and Goal Step track + at CVPR EgoVis Workshop 2024 +
+
+
+
+
+ + ♻ ☆ Towards Empirical Interpretation of Internal Circuits and Properties in + Grokked Transformers on Modular Polynomials + + +
+ Grokking has been actively explored to reveal the mystery of delayed +generalization and identifying interpretable representations and algorithms +inside the grokked models is a suggestive hint to understanding its mechanism. +Grokking on modular addition has been known to implement Fourier representation +and its calculation circuits with trigonometric identities in Transformers. +Considering the periodicity in modular arithmetic, the natural question is to +what extent these explanations and interpretations hold for the grokking on +other modular operations beyond addition. For a closer look, we first +hypothesize that any modular operations can be characterized with distinctive +Fourier representation or internal circuits, grokked models obtain common +features transferable among similar operations, and mixing datasets with +similar operations promotes grokking. Then, we extensively examine them by +learning Transformers on complex modular arithmetic tasks, including +polynomials. Our Fourier analysis and novel progress measure for modular +arithmetic, Fourier Frequency Density and Fourier Coefficient Ratio, +characterize distinctive internal representations of grokked models per modular +operation; for instance, polynomials often result in the superposition of the +Fourier components seen in elementary arithmetic, but clear patterns do not +emerge in challenging non-factorizable polynomials. In contrast, our ablation +study on the pre-grokked models reveals that the transferability among the +models grokked with each operation can be only limited to specific +combinations, such as from elementary arithmetic to linear expressions. +Moreover, some multi-task mixtures may lead to co-grokking -- where grokking +simultaneously happens for all the tasks -- and accelerate generalization, +while others may not find optimal solutions. We provide empirical steps towards +the interpretability of internal circuits. + +
+
+ comment: Published at Transactions on Machine Learning Research (TMLR), Code: + https://github.com/frt03/grok_mod_poly +
+
+
+
+
+ + ♻ ☆ Multi-modal Situated Reasoning in 3D Scenes NeurIPS 2024 + + +
+ Situation awareness is essential for understanding and reasoning about 3D +scenes in embodied AI agents. However, existing datasets and benchmarks for +situated understanding are limited in data modality, diversity, scale, and task +scope. To address these limitations, we propose Multi-modal Situated Question +Answering (MSQA), a large-scale multi-modal situated reasoning dataset, +scalably collected leveraging 3D scene graphs and vision-language models (VLMs) +across a diverse range of real-world 3D scenes. MSQA includes 251K situated +question-answering pairs across 9 distinct question categories, covering +complex scenarios within 3D scenes. We introduce a novel interleaved +multi-modal input setting in our benchmark to provide text, image, and point +cloud for situation and question description, resolving ambiguity in previous +single-modality convention (e.g., text). Additionally, we devise the +Multi-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models' +situated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN +highlight the limitations of existing vision-language models and underscore the +importance of handling multi-modal interleaved inputs and situation modeling. +Experiments on data scaling and cross-domain transfer further demonstrate the +efficacy of leveraging MSQA as a pre-training dataset for developing more +powerful situated reasoning models. + +
+
+ comment: Accepted by NeurIPS 2024 Datasets and Benchmarks Track. Project page: + https://msr3d.github.io/ +
+
+
+
+
+ + ♻ ☆ ptt5-v2: A Closer Look at Continued Pretraining of T5 Models for the + Portuguese Language + + +
+ Despite advancements in Natural Language Processing (NLP) and the growing +availability of pretrained models, the English language remains the primary +focus of model development. Continued pretraining on language-specific corpora +provides a practical solution for adapting models to other languages. However, +the impact of different pretraining settings on downstream tasks remains +underexplored. This work introduces $\texttt{ptt5-v2}$, investigating the +continued pretraining of T5 models for Portuguese. We first develop a baseline +set of settings and pretrain models with sizes up to 3B parameters. Finetuning +on three Portuguese downstream tasks (assin2 STS, assin2 RTE, and TweetSentBR) +yields SOTA results on the latter two. We then explore the effects of different +pretraining configurations, including pretraining data quality, optimization +strategies, and multi-epoch pretraining. Perhaps surprisingly, their impact +remains subtle compared to our baseline. We release $\texttt{ptt5-v2}$ +pretrained checkpoints and their MonoT5-based finetuned $\texttt{MonoPTT5}$ +rerankers on HuggingFace in their respective collections at +https://huggingface.co/unicamp-dl. + +
+
+
+
+
+ + ♻ ☆ Uncovering Hidden Connections: Iterative Search and Reasoning for + Video-grounded Dialog + + +
+ In contrast to conventional visual question answering, video-grounded dialog +necessitates a profound understanding of both dialog history and video content +for accurate response generation. Despite commendable progress made by existing +approaches, they still face the challenges of incrementally understanding +complex dialog history and assimilating video information. In response to these +challenges, we present an iterative search and reasoning framework, which +consists of a textual encoder, a visual encoder, and a generator. Specifically, +we devise a path search and aggregation strategy in the textual encoder, mining +core cues from dialog history that are pivotal to understanding the posed +questions. Concurrently, our visual encoder harnesses an iterative reasoning +network to extract and emphasize critical visual markers from videos, enhancing +the depth of visual comprehension. Finally, we utilize the pre-trained GPT-2 +model as our answer generator to decode the mined hidden clues into coherent +and contextualized answers. Extensive experiments on three public datasets +demonstrate the effectiveness and generalizability of our proposed framework. + +
+
+
+
+
+ + ♻ ☆ Redefining Proactivity for Information Seeking Dialogue + + +
+ Information-Seeking Dialogue (ISD) agents aim to provide accurate responses +to user queries. While proficient in directly addressing user queries, these +agents, as well as LLMs in general, predominantly exhibit reactive behavior, +lacking the ability to generate proactive responses that actively engage users +in sustained conversations. However, existing definitions of proactive dialogue +in this context do not focus on how each response actively engages the user and +sustains the conversation. Hence, we present a new definition of proactivity +that focuses on enhancing the `proactiveness' of each generated response via +the introduction of new information related to the initial query. To this end, +we construct a proactive dialogue dataset comprising 2,000 single-turn +conversations, and introduce several automatic metrics to evaluate response +`proactiveness' which achieved high correlation with human annotation. +Additionally, we introduce two innovative Chain-of-Thought (CoT) prompts, the +3-step CoT and the 3-in-1 CoT prompts, which consistently outperform standard +prompts by up to 90% in the zero-shot setting. + +
+
+
+
+
+ + ♻ ☆ Autoregressive Action Sequence Learning for Robotic Manipulation + + +
+ Designing a universal policy architecture that performs well across diverse +robots and task configurations remains a key challenge. In this work, we +address this by representing robot actions as sequential data and generating +actions through autoregressive sequence modeling. Existing autoregressive +architectures generate end-effector waypoints sequentially as word tokens in +language modeling, which are limited to low-frequency control tasks. Unlike +language, robot actions are heterogeneous and often include continuous values +-- such as joint positions, 2D pixel coordinates, and end-effector poses -- +which are not easily suited for language-based modeling. Based on this insight, +we introduce a straightforward enhancement: we extend causal transformers' +single-token prediction to support predicting a variable number of tokens in a +single step through our Chunking Causal Transformer (CCT). This enhancement +enables robust performance across diverse tasks of various control frequencies, +greater efficiency by having fewer autoregression steps, and leads to a hybrid +action sequence design by mixing different types of actions and using a +different chunk size for each action type. Based on CCT, we propose the +Autoregressive Policy (ARP) architecture, which solves manipulation tasks by +generating hybrid action sequences. We evaluate ARP across diverse robotic +manipulation environments, including Push-T, ALOHA, and RLBench, and show that +ARP, as a universal architecture, outperforms the environment-specific +state-of-the-art in all tested benchmarks, while being more efficient in +computation and parameter sizes. Videos of our real robot demonstrations, all +source code and the pretrained models of ARP can be found at +http://github.com/mlzxy/arp. + +
+
+
+
+
+ + ♻ ☆ Searching for internal symbols underlying deep learning + + +
+ Deep learning (DL) enables deep neural networks (DNNs) to automatically learn +complex tasks or rules from given examples without instructions or guiding +principles. As we do not engineer DNNs' functions, it is extremely difficult to +diagnose their decisions, and multiple lines of studies proposed to explain the +principles of their operations. Notably, one line of studies suggests that DNNs +may learn concepts, the high level features that are recognizable to humans. In +this study, we extend this line of studies and hypothesize that DNNs can +develop abstract codes that can be used to augment DNNs' decision-making. To +address this hypothesis, we combine foundation segmentation models and +unsupervised learning to extract internal codes and identify potential use of +abstract codes to make DL's decision-making more reliable and safer. + +
+
+ comment: 16 pages, 10 figures, 5 tables and 1 supplementary table +
+
+
+
+
+ + ♻ ☆ To what extent can ASV systems naturally defend against spoofing + attacks? + + +
+ The current automatic speaker verification (ASV) task involves making binary +decisions on two types of trials: target and non-target. However, emerging +advancements in speech generation technology pose significant threats to the +reliability of ASV systems. This study investigates whether ASV effortlessly +acquires robustness against spoofing attacks (i.e., zero-shot capability) by +systematically exploring diverse ASV systems and spoofing attacks, ranging from +traditional to cutting-edge techniques. Through extensive analyses conducted on +eight distinct ASV systems and 29 spoofing attack systems, we demonstrate that +the evolution of ASV inherently incorporates defense mechanisms against +spoofing attacks. Nevertheless, our findings also underscore that the +advancement of spoofing attacks far outpaces that of ASV systems, hence +necessitating further research on spoofing-robust ASV methodologies. + +
+
+ comment: 5 pages, 3 figures, 3 tables, Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Introducing Spectral Attention for Long-Range Dependency in Time Series + Forecasting NeurIPS + 2024 + + +
+ Sequence modeling faces challenges in capturing long-range dependencies +across diverse tasks. Recent linear and transformer-based forecasters have +shown superior performance in time series forecasting. However, they are +constrained by their inherent inability to effectively address long-range +dependencies in time series data, primarily due to using fixed-size inputs for +prediction. Furthermore, they typically sacrifice essential temporal +correlation among consecutive training samples by shuffling them into +mini-batches. To overcome these limitations, we introduce a fast and effective +Spectral Attention mechanism, which preserves temporal correlations among +samples and facilitates the handling of long-range information while +maintaining the base model structure. Spectral Attention preserves long-period +trends through a low-pass filter and facilitates gradient to flow between +samples. Spectral Attention can be seamlessly integrated into most sequence +models, allowing models with fixed-sized look-back windows to capture +long-range dependencies over thousands of steps. Through extensive experiments +on 11 real-world time series datasets using 7 recent forecasting models, we +consistently demonstrate the efficacy of our Spectral Attention mechanism, +achieving state-of-the-art results. + +
+
+ comment: Co-first Author: Bong Gyun Kang, Dongjun Lee. Accepted to NeurIPS + 2024 +
+
+
+
+
+ + ♻ ☆ ControlNet++: Improving Conditional Controls with Efficient Consistency + Feedback + + +
+ To enhance the controllability of text-to-image diffusion models, existing +efforts like ControlNet incorporated image-based conditional controls. In this +paper, we reveal that existing methods still face significant challenges in +generating images that align with the image conditional controls. To this end, +we propose ControlNet++, a novel approach that improves controllable generation +by explicitly optimizing pixel-level cycle consistency between generated images +and conditional controls. Specifically, for an input conditional control, we +use a pre-trained discriminative reward model to extract the corresponding +condition of the generated images, and then optimize the consistency loss +between the input conditional control and extracted condition. A +straightforward implementation would be generating images from random noises +and then calculating the consistency loss, but such an approach requires +storing gradients for multiple sampling timesteps, leading to considerable time +and memory costs. To address this, we introduce an efficient reward strategy +that deliberately disturbs the input images by adding noise, and then uses the +single-step denoised images for reward fine-tuning. This avoids the extensive +costs associated with image sampling, allowing for more efficient reward +fine-tuning. Extensive experiments show that ControlNet++ significantly +improves controllability under various conditional controls. For example, it +achieves improvements over ControlNet by 11.1% mIoU, 13.4% SSIM, and 7.6% RMSE, +respectively, for segmentation mask, line-art edge, and depth conditions. All +the code, models, demo and organized data have been open sourced on our Github +Repo. + +
+
+ comment: Camera Ready Version. Project Page: + https://liming-ai.github.io/ControlNet_Plus_Plus Code & Data: + https://github.com/liming-ai/ControlNet_Plus_Plus +
+
+
+
+
+ + ♻ ☆ Ergonomic Design of Computer Laboratory Furniture: Mismatch Analysis + Utilizing Anthropometric Data of University Students + + +
+ Many studies have shown how ergonomically designed furniture improves +productivity and well-being. As computers have become a part of students' +academic lives, they will grow further in the future. We propose +anthropometric-based furniture dimensions suitable for university students to +improve computer laboratory ergonomics. We collected data from 380 participants +and analyzed 11 anthropometric measurements, correlating them to 11 furniture +dimensions. Two types of furniture were studied: a non-adjustable chair with a +non-adjustable table and an adjustable chair with a non-adjustable table. The +mismatch calculation showed a significant difference between furniture +dimensions and anthropometric measurements. The one-way ANOVA test with a +significance level of 5% also showed a significant difference between proposed +and existing furniture dimensions. The proposed dimensions were found to be +more compatible and reduced mismatch percentages for both males and females +compared to existing furniture. The proposed dimensions of the furniture set +with adjustable seat height showed slightly improved results compared to the +non-adjustable furniture set. This suggests that the proposed dimensions can +improve comfort levels and reduce the risk of musculoskeletal disorders among +students. Further studies on the implementation and long-term effects of these +proposed dimensions in real-world computer laboratory settings are recommended. + +
+
+
+
+
+ + ♻ ☆ Solving Generalized Grouping Problems in Cellular Manufacturing Systems + Using a Network Flow Model + + +
+ This paper focuses on the generalized grouping problem in the context of +cellular manufacturing systems (CMS), where parts may have more than one +process route. A process route lists the machines corresponding to each part of +the operation. Inspired by the extensive and widespread use of network flow +algorithms, this research formulates the process route family formation for +generalized grouping as a unit capacity minimum cost network flow model. The +objective is to minimize dissimilarity (based on the machines required) among +the process routes within a family. The proposed model optimally solves the +process route family formation problem without pre-specifying the number of +part families to be formed. The process route of family formation is the first +stage in a hierarchical procedure. For the second stage (machine cell +formation), two procedures, a quadratic assignment programming (QAP) +formulation, and a heuristic procedure, are proposed. The QAP simultaneously +assigns process route families and machines to a pre-specified number of cells +in such a way that total machine utilization is maximized. The heuristic +procedure for machine cell formation is hierarchical in nature. Computational +results for some test problems show that the QAP and the heuristic procedure +yield the same results. + +
+
+
+
+
+ + ♻ ☆ Backpropagation-Free Multi-modal On-Device Model Adaptation via + Cloud-Device Collaboration + + +
+ In our increasingly interconnected world, where intelligent devices +continually amass copious personalized multi-modal data, a pressing need arises +to deliver high-quality, personalized device-aware services. However, this +endeavor presents a multifaceted challenge to prevailing artificial +intelligence (AI) systems primarily rooted in the cloud. As these systems +grapple with shifting data distributions between the cloud and devices, the +traditional approach of fine-tuning-based adaptation (FTA) suffers from the following +issues: the costly and time-consuming data annotation required by FTA and the +looming risk of model overfitting. To surmount these challenges, we introduce a +Universal On-Device Multi-modal Model Adaptation Framework, revolutionizing +on-device model adaptation by striking a balance between efficiency and +effectiveness. The framework features the Fast Domain Adaptor (FDA) hosted in +the cloud, providing tailored parameters for the Lightweight Multi-modal Model +on devices. To enhance adaptability across multi-modal tasks, the AnchorFrame +Distribution Reasoner (ADR) minimizes communication costs. Our contributions, +encapsulated in the Cloud-Device Collaboration Multi-modal Parameter Generation +(CDC-MMPG) framework, represent a pioneering solution for on-Device Multi-modal +Model Adaptation (DMMA). Extensive experiments validate the efficiency and +effectiveness of our method, particularly in video question answering and +retrieval tasks, driving forward the integration of intelligent devices into +our daily lives. + +
+
+
+
+
+ + ♻ ☆ T-GAE: Transferable Graph Autoencoder for Network Alignment + + +
+ Network alignment is the task of establishing one-to-one correspondences +between the nodes of different graphs. Although finding a plethora of +applications in high-impact domains, this task is known to be NP-hard in its +general form. Existing optimization algorithms do not scale up as the size of +the graphs increases. While being able to reduce the matching complexity, +current GNN approaches fit a deep neural network on each graph and require +retraining on unseen samples, which is time- and memory-inefficient. To tackle +both challenges, we propose T-GAE, a transferable graph autoencoder framework +that leverages transferability and stability of GNNs to achieve efficient +network alignment on out-of-distribution graphs without retraining. We prove +that GNN-generated embeddings can achieve more accurate alignment compared to +classical spectral methods. Our experiments on real-world benchmarks +demonstrate that T-GAE outperforms the state-of-the-art optimization method and +the best GNN approach by up to 38.7% and 50.8%, respectively, while being able +to reduce 90% of the training time when matching out-of-distribution large +scale networks. We conduct ablation studies to highlight the effectiveness of +the proposed encoder architecture and training objective in enhancing the +expressiveness of GNNs to match perturbed graphs. T-GAE is also proved to be +flexible to utilize matching algorithms of different complexities. Our code is +available at https://github.com/Jason-Tree/T-GAE. + +
+
+
+
+
+ + ♻ ☆ Contextual Combinatorial Bandits with Probabilistically Triggered Arms ICML + + +
+ We study contextual combinatorial bandits with probabilistically triggered +arms (C$^2$MAB-T) under a variety of smoothness conditions that capture a wide +range of applications, such as contextual cascading bandits and contextual +influence maximization bandits. Under the triggering probability modulated +(TPM) condition, we devise the C$^2$-UCB-T algorithm and propose a novel +analysis that achieves an $\tilde{O}(d\sqrt{KT})$ regret bound, removing a +potentially exponentially large factor $O(1/p_{\min})$, where $d$ is the +dimension of contexts, $p_{\min}$ is the minimum positive probability that any +arm can be triggered, and batch-size $K$ is the maximum number of arms that can +be triggered per round. Under the variance modulated (VM) or triggering +probability and variance modulated (TPVM) conditions, we propose a new +variance-adaptive algorithm VAC$^2$-UCB and derive a regret bound +$\tilde{O}(d\sqrt{T})$, which is independent of the batch-size $K$. As a +valuable by-product, our analysis technique and variance-adaptive algorithm can +be applied to the CMAB-T and C$^2$MAB setting, improving existing results there +as well. We also include experiments that demonstrate the improved performance +of our algorithms compared with benchmark algorithms on synthetic and +real-world datasets. + +
+
+ comment: The 40th International Conference on Machine Learning (ICML), 2023 +
+
+
+
+
+ + ♻ ☆ Homeostatic motion planning with innate physics knowledge + + +
+ Living organisms interact with their surroundings in a closed-loop fashion, +where sensory inputs dictate the initiation and termination of behaviours. Even +simple animals are able to develop and execute complex plans, which has not yet +been replicated in robotics using pure closed-loop input control. We propose a +solution to this problem by defining a set of discrete and temporary +closed-loop controllers, called "tasks", each representing a closed-loop +behaviour. We further introduce a supervisory module which has an innate +understanding of physics and causality, through which it can simulate the +execution of task sequences over time and store the results in a model of the +environment. On the basis of this model, plans can be made by chaining +temporary closed-loop controllers. The proposed framework was implemented for a +real robot and tested in two scenarios as proof of concept. + +
+
+
+
+
+ + ♻ ☆ N-DriverMotion: Driver motion learning and prediction using an + event-based camera and directly trained spiking neural networks on Loihi 2 + + +
+ Driver motion recognition is a principal factor in ensuring the safety of +driving systems. This paper presents a novel system for learning and predicting +driver motions and an event-based high-resolution (1280x720) dataset, +N-DriverMotion, newly collected to train on a neuromorphic vision system. The +system comprises an event-based camera that generates the first high-resolution +driver motion dataset representing spike inputs and efficient spiking neural +networks (SNNs) that are effective in training and predicting the driver's +gestures. The event dataset consists of 13 driver motion categories classified +by direction (front, side), illumination (bright, moderate, dark), and +participant. A novel simplified four-layer convolutional spiking neural network +(CSNN) that we proposed was directly trained using the high-resolution dataset +without any time-consuming preprocessing. This enables efficient adaptation to +on-device SNNs for real-time inference on high-resolution event-based streams. +Compared with recent gesture recognition systems adopting neural networks for +vision processing, the proposed neuromorphic vision system achieves comparable +accuracy, 94.04\%, in recognizing driver motions with the CSNN architecture. +Our proposed CSNN and the dataset can be used to develop safer and more +efficient driver monitoring systems for autonomous vehicles or edge devices +requiring an efficient neural network architecture. + +
+
+ comment: Accepted for publication in IEEE Open Journal of Vehicular Technology + (OJVT) on 18 November 2024 +
+
+
+
+
+ + ♻ ☆ Scideator: Human-LLM Scientific Idea Generation Grounded in + Research-Paper Facet Recombination + + +
+ The scientific ideation process often involves blending salient aspects of +existing papers to create new ideas. To see if large language models (LLMs) can +assist this process, we contribute Scideator, a novel mixed-initiative tool for +scientific ideation. Starting from a user-provided set of papers, Scideator +extracts key facets (purposes, mechanisms, and evaluations) from these and +relevant papers, allowing users to explore the idea space by interactively +recombining facets to synthesize inventive ideas. Scideator also helps users to +gauge idea novelty by searching the literature for potential overlaps and +showing automated novelty assessments and explanations. To support these tasks, +Scideator introduces four LLM-powered retrieval-augmented generation (RAG) +modules: Analogous Paper Facet Finder, Faceted Idea Generator, Idea Novelty +Checker, and Idea Novelty Iterator. In a within-subjects user study, 19 +computer-science researchers identified significantly more interesting ideas +using Scideator compared to a strong baseline combining a scientific search +engine with LLM interaction. + +
+
+ comment: Revised TextGRAD results after noting inaccuracies in their reporting +
+
+
+
+
+ + ♻ ☆ ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training + Multiplication-Less Reparameterization NeurIPS 2024 + + +
+ Large language models (LLMs) have shown impressive performance on language +tasks but face challenges when deployed on resource-constrained devices due to +their extensive parameters and reliance on dense multiplications, resulting in +high memory demands and latency bottlenecks. Shift-and-add reparameterization +offers a promising solution by replacing costly multiplications with +hardware-friendly primitives in both the attention and multi-layer perceptron +(MLP) layers of an LLM. However, current reparameterization techniques require +training from scratch or full parameter fine-tuning to restore accuracy, which +is resource-intensive for LLMs. To address this, we propose accelerating +pretrained LLMs through post-training shift-and-add reparameterization, +creating efficient multiplication-free models, dubbed ShiftAddLLM. +Specifically, we quantize each weight matrix into binary matrices paired with +group-wise scaling factors. The associated multiplications are reparameterized +into (1) shifts between activations and scaling factors and (2) queries and +adds according to the binary matrices. To reduce accuracy loss, we present a +multi-objective optimization method to minimize both weight and output +activation reparameterization errors. Additionally, based on varying +sensitivity across layers to reparameterization, we develop an automated bit +allocation strategy to further reduce memory usage and latency. Experiments on +five LLM families and eight tasks consistently validate the effectiveness of +ShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points +at comparable or lower latency compared to the most competitive quantized LLMs +at 3 and 2 bits, respectively, and more than 80% memory and energy reductions +over the original LLMs. Codes and models are available at +https://github.com/GATECH-EIC/ShiftAddLLM. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Batch-Size Independent Regret Bounds for Combinatorial Semi-Bandits with + Probabilistically Triggered Arms or Independent Arms + + +
+ In this paper, we study the combinatorial semi-bandits (CMAB) and focus on +reducing the dependency of the batch-size $K$ in the regret bound, where $K$ is +the total number of arms that can be pulled or triggered in each round. First, +for the setting of CMAB with probabilistically triggered arms (CMAB-T), we +discover a novel (directional) triggering probability and variance modulated +(TPVM) condition that can replace the previously-used smoothness condition for +various applications, such as cascading bandits, online network exploration and +online influence maximization. Under this new condition, we propose a BCUCB-T +algorithm with variance-aware confidence intervals and conduct regret analysis +which reduces the $O(K)$ factor to $O(\log K)$ or $O(\log^2 K)$ in the regret +bound, significantly improving the regret bounds for the above applications. +Second, for the setting of non-triggering CMAB with independent arms, we +propose a SESCB algorithm which leverages on the non-triggering version of the +TPVM condition and completely removes the dependency on $K$ in the leading +regret. As a valuable by-product, the regret analysis used in this paper can +improve several existing results by a factor of $O(\log K)$. Finally, +experimental evaluations show our superior performance compared with benchmark +algorithms in different applications. + +
+
+
+
+
+
+
+
+ + Computation and Language 73 + +
+
+
+ + ☆ Bi-Mamba: Towards Accurate 1-Bit State Space Models + + +
+ The typical selective state-space model (SSM) of Mamba addresses several +limitations of Transformers, such as quadratic computational complexity with +sequence length and significant inference-time memory requirements due to the +key-value cache. However, the growing size of Mamba models continues to pose +training and deployment challenges and raises environmental concerns due to +considerable energy consumption. In this work, we introduce Bi-Mamba, a +scalable and powerful 1-bit Mamba architecture designed for more efficient +large language models with multiple sizes across 780M, 1.3B, and 2.7B. Bi-Mamba +models are trained from scratch on the same data volume as regular LLM pretraining using +an autoregressive distillation loss. Extensive experimental results on language +modeling demonstrate that Bi-Mamba achieves performance comparable to its +full-precision counterparts (e.g., FP16 or BF16) and much better accuracy than +post-training-binarization (PTB) Mamba baselines, while significantly reducing +memory footprint and energy consumption compared to the original Mamba model. +Our study pioneers a new linear computational complexity LLM framework under +low-bit representation and facilitates the future design of specialized +hardware tailored for efficient 1-bit Mamba-based LLMs. + +
+
+
+
+
+ + ☆ Tackling prediction tasks in relational databases with LLMs + + +
+ Though large language models (LLMs) have demonstrated exceptional performance +across numerous problems, their application to predictive tasks in relational +databases remains largely unexplored. In this work, we address the notion that +LLMs cannot yield satisfactory results on relational databases due to their +interconnected tables, complex relationships, and heterogeneous data types. +Using the recently introduced RelBench benchmark, we demonstrate that even a +straightforward application of LLMs achieves competitive performance on these +tasks. These findings establish LLMs as a promising new baseline for ML on +relational databases and encourage further research in this direction. + +
+
+
+
+
+ + ☆ CNMBert: A Model For Hanyu Pinyin Abbreviation to Character Conversion + Task + + +
+ The task of converting Hanyu Pinyin abbreviations to Chinese characters +represents a significant branch within the domain of Chinese Spelling +Correction (CSC). This task is typically one of text-length alignment, however, +due to the limited informational content in pinyin abbreviations, achieving +accurate conversion is challenging. In this paper, we propose CNMBert which +stands for zh-CN Pinyin Multi-mask Bert Model as a solution to this issue. +CNMBert surpasses few-shot GPT models, achieving a 59.63% MRR on a +10,424-sample Hanyu Pinyin abbreviation test dataset. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ Drowning in Documents: Consequences of Scaling Reranker Inference + + +
+ Rerankers, typically cross-encoders, are often used to re-score the documents +retrieved by cheaper initial IR systems. This is because, though expensive, +rerankers are assumed to be more effective. We challenge this assumption by +measuring reranker performance for full retrieval, not just re-scoring +first-stage retrieval. Our experiments reveal a surprising trend: the best +existing rerankers provide diminishing returns when scoring progressively more +documents and actually degrade quality beyond a certain limit. In fact, in this +setting, rerankers can frequently assign high scores to documents with no +lexical or semantic overlap with the query. We hope that our findings will spur +future research to improve reranking. + +
+
+
+
+
+ + ☆ The Power of Many: Multi-Agent Multimodal Models for Cultural Image + Captioning + + +
+ Large Multimodal Models (LMMs) exhibit impressive performance across various +multimodal tasks. However, their effectiveness in cross-cultural contexts +remains limited due to the predominantly Western-centric nature of most data +and models. Conversely, multi-agent models have shown significant capability in +solving complex tasks. Our study evaluates the collective performance of LMMs +in a multi-agent interaction setting for the novel task of cultural image +captioning. Our contributions are as follows: (1) We introduce MosAIC, a +Multi-Agent framework to enhance cross-cultural Image Captioning using LMMs +with distinct cultural personas; (2) We provide a dataset of culturally +enriched image captions in English for images from China, India, and Romania +across three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable +metric for evaluating cultural information within image captions; and (4) We +show that the multi-agent interaction outperforms single-agent models across +different metrics, and offer valuable insights for future research. Our dataset +and models can be accessed at https://github.com/MichiganNLP/MosAIC. + +
+
+
+
+
+ + ☆ Advacheck at GenAI Detection Task 1: AI Detection Powered by + Domain-Aware Multi-Tasking + + +
+ The paper describes a system designed by Advacheck team to recognise +machine-generated and human-written texts in the monolingual subtask of GenAI +Detection Task 1 competition. Our developed system is a multi-task architecture +with shared Transformer Encoder between several classification heads. One head +is responsible for binary classification between human-written and +machine-generated texts, while the other heads are auxiliary multiclass +classifiers for texts of different domains from particular datasets. As +multiclass heads were trained to distinguish the domains presented in the data, +they provide a better understanding of the samples. This approach led us to +achieve the first place in the official ranking with 83.07% macro F1-score on +the test set and bypass the baseline by 10%. We further study obtained system +through ablation, error and representation analyses, finding that multi-task +learning outperforms single-task mode and simultaneous tasks form a cluster +structure in embeddings space. + +
+
+
+
+
+ + ☆ Moral Persuasion in Large Language Models: Evaluating Susceptibility and + Ethical Alignment + + +
+ We explore how large language models (LLMs) can be influenced by prompting +them to alter their initial decisions and align them with established ethical +frameworks. Our study is based on two experiments designed to assess the +susceptibility of LLMs to moral persuasion. In the first experiment, we examine +the susceptibility to moral ambiguity by evaluating a Base Agent LLM on morally +ambiguous scenarios and observing how a Persuader Agent attempts to modify the +Base Agent's initial decisions. The second experiment evaluates the +susceptibility of LLMs to align with predefined ethical frameworks by prompting +them to adopt specific value alignments rooted in established philosophical +theories. The results demonstrate that LLMs can indeed be persuaded in morally +charged scenarios, with the success of persuasion depending on factors such as +the model used, the complexity of the scenario, and the conversation length. +Notably, LLMs of distinct sizes but from the same company produced markedly +different outcomes, highlighting the variability in their susceptibility to +ethical persuasion. + +
+
+
+
+
+ + ☆ FedCoLLM: A Parameter-Efficient Federated Co-tuning Framework for Large + and Small Language Models + + +
+ By adapting Large Language Models (LLMs) to domain-specific tasks or +enriching them with domain-specific knowledge, we can fully harness the +capabilities of LLMs. Nonetheless, a gap persists in achieving simultaneous +mutual enhancement between the server's LLM and the downstream clients' Small +Language Models (SLMs). To address this, we propose FedCoLLM, a novel and +parameter-efficient federated framework designed for co-tuning LLMs and SLMs. +This approach is aimed at adaptively transferring server-side LLMs' knowledge to +clients' SLMs while simultaneously enriching the LLMs with domain insights from +the clients. To accomplish this, FedCoLLM utilizes lightweight adapters in +conjunction with SLMs, facilitating knowledge exchange between server and +clients in a manner that respects data privacy while also minimizing +computational and communication overhead. Our evaluation of FedCoLLM, utilizing +various public LLMs and SLMs across a range of NLP text generation tasks, +reveals that the performance of clients' SLMs experiences notable improvements +with the assistance of the LLMs. Simultaneously, the LLMs enhanced via FedCoLLM +achieve comparable performance to that obtained through direct fine-tuning on +clients' data. + +
+
+
+
+
+ + ☆ Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search + + +
+ Recently, test-time scaling has garnered significant attention from the +research community, largely due to the substantial advancements of the o1 model +released by OpenAI. By allocating more computational resources during the +inference phase, large language models (LLMs) can extensively explore the +solution space by generating more thought tokens or diverse solutions, thereby +producing more accurate responses. However, developing an o1-like reasoning +approach is challenging, and researchers have been making various attempts to +advance this open area of research. In this paper, we present a preliminary +exploration into enhancing the reasoning abilities of LLMs through +reward-guided tree search algorithms. This framework is implemented by +integrating the policy model, reward model, and search algorithm. It is +primarily constructed around a tree search algorithm, where the policy model +navigates a dynamically expanding tree guided by a specially trained reward +model. We thoroughly explore various design considerations necessary for +implementing this framework and provide a detailed report of the technical +aspects. To assess the effectiveness of our approach, we focus on mathematical +reasoning tasks and conduct extensive evaluations on four challenging datasets, +significantly enhancing the reasoning abilities of LLMs. + +
+
+ comment: LLM;Complex Reasoning;Math +
+
+
+
+
+ + ☆ Chapter 7 Review of Data-Driven Generative AI Models for Knowledge + Extraction from Scientific Literature in Healthcare + + +
+ This review examines the development of abstractive NLP-based text +summarization approaches and compares them to existing techniques for +extractive summarization. A brief history of text summarization from the 1950s +to the introduction of pre-trained language models such as Bidirectional +Encoder Representations from Transformer (BERT) and Generative Pre-training +Transformers (GPT) is presented. In total, 60 studies were identified in +PubMed and Web of Science, of which 29 were excluded and 24 were read and +evaluated for eligibility, resulting in the use of seven studies for further +analysis. This chapter also includes a section with examples including an +example of a comparison between GPT-3 and state-of-the-art GPT-4 solutions in +scientific text summarisation. Natural language processing has not yet reached +its full potential in the generation of brief textual summaries. As there are +acknowledged concerns that must be addressed, we can expect gradual +introduction of such models in practice. + +
+
+ comment: 16 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Federated Incremental Named Entity Recognition + + +
+ Federated Named Entity Recognition (FNER) boosts model training within each +local client by aggregating the model updates of decentralized local clients, +without sharing their private data. However, existing FNER methods assume fixed +entity types and local clients in advance, leading to their ineffectiveness in +practical applications. In a more realistic scenario, local clients receive new +entity types continuously, while new local clients collecting novel data may +irregularly join the global FNER training. This challenging setup, referred to +here as Federated Incremental NER, renders the global model suffering from +heterogeneous forgetting of old entity types from both intra-client and +inter-client perspectives. To overcome these challenges, we propose a +Local-Global Forgetting Defense (LGFD) model. Specifically, to address +intra-client forgetting, we develop a structural knowledge distillation loss to +retain the latent space's feature structure and a pseudo-label-guided +inter-type contrastive loss to enhance discriminative capability over different +entity types, effectively preserving previously learned knowledge within local +clients. To tackle inter-client forgetting, we propose a task switching monitor +that can automatically identify new entity types under privacy protection and +store the latest old global model for knowledge distillation and +pseudo-labeling. Experiments demonstrate significant improvement of our LGFD +model over comparison methods. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ OASIS: Open Agents Social Interaction Simulations on One Million Agents + + +
+ There has been a growing interest in enhancing rule-based agent-based models +(ABMs) for social media platforms (i.e., X, Reddit) with more realistic +large language model (LLM) agents, thereby allowing for a more nuanced study of +complex systems. As a result, several LLM-based ABMs have been proposed in the +past year. While they hold promise, each simulator is specifically designed to +study a particular scenario, making it time-consuming and resource-intensive to +explore other phenomena using the same ABM. Additionally, these models simulate +only a limited number of agents, whereas real-world social media platforms +involve millions of users. To this end, we propose OASIS, a generalizable and +scalable social media simulator. OASIS is designed based on real-world social +media platforms, incorporating dynamically updated environments (i.e., +dynamic social networks and post information), diverse action spaces +(i.e., following, commenting), and recommendation systems (i.e., +interest-based and hot-score-based). Additionally, OASIS supports large-scale +user simulations, capable of modeling up to one million users. With these +features, OASIS can be easily extended to different social media platforms to +study large-scale group phenomena and behaviors. We replicate various social +phenomena, including information spreading, group polarization, and herd +effects across X and Reddit platforms. Moreover, we provide observations of +social phenomena at different agent group scales. We observe that the larger +agent group scale leads to more enhanced group dynamics and more diverse and +helpful agents' opinions. These findings demonstrate OASIS's potential as a +powerful tool for studying complex systems in digital environments. + +
+
+
+
+
+ + ☆ Addressing Hallucinations in Language Models with Knowledge Graph + Embeddings as an Additional Modality + + +
+ In this paper we present an approach to reduce hallucinations in Large +Language Models (LLMs) by incorporating Knowledge Graphs (KGs) as an additional +modality. Our method involves transforming input text into a set of KG +embeddings and using an adapter to integrate these embeddings into the language +model space, without relying on external retrieval processes. + To facilitate this, we created WikiEntities, a dataset containing over 3 +million Wikipedia texts annotated with entities from Wikidata and their +corresponding embeddings from PyTorch-BigGraph. This dataset serves as a +valuable resource for training Entity Linking models and adapting the described +method to various LLMs using specialized adapters. + Our method does not require fine-tuning of the language models themselves; +instead, we only train the adapter. This ensures that the model's performance +on other tasks is not affected. We trained an adapter for the Mistral 7B, LLaMA +2-7B (chat), and LLaMA 3-8B (instruct) models using this dataset and +demonstrated that our approach improves performance on the HaluEval, True-False +benchmarks and FEVER dataset. The results indicate that incorporating KGs as a +new modality can effectively reduce hallucinations and improve the factual +accuracy of language models, all without the need for external retrieval. + +
+
+
+
+
+ + ☆ Search, Verify and Feedback: Towards Next Generation Post-training + Paradigm of Foundation Models via Verifier Engineering + + +
+ The evolution of machine learning has increasingly prioritized the +development of powerful models and more scalable supervision signals. However, +the emergence of foundation models presents significant challenges in providing +effective supervision signals necessary for further enhancing their +capabilities. Consequently, there is an urgent need to explore novel +supervision signals and technical approaches. In this paper, we propose +verifier engineering, a novel post-training paradigm specifically designed for +the era of foundation models. The core of verifier engineering involves +leveraging a suite of automated verifiers to perform verification tasks and +deliver meaningful feedback to foundation models. We systematically categorize +the verifier engineering process into three essential stages: search, verify, +and feedback, and provide a comprehensive review of state-of-the-art research +developments within each stage. We believe that verifier engineering +constitutes a fundamental pathway toward achieving Artificial General +Intelligence. + +
+
+
+
+
+ + ☆ Safe + Safe = Unsafe? Exploring How Safe Images Can Be Exploited to + Jailbreak Large Vision-Language Models + + +
+ Recent advances in Large Vision-Language Models (LVLMs) have showcased strong +reasoning abilities across multiple modalities, achieving significant +breakthroughs in various real-world applications. Despite this great success, +the safety guardrail of LVLMs may not cover the unforeseen domains introduced +by the visual modality. Existing studies primarily focus on eliciting LVLMs to +generate harmful responses via carefully crafted image-based jailbreaks +designed to bypass alignment defenses. In this study, we reveal that a safe +image can be exploited to achieve the same jailbreak consequence when combined +with additional safe images and prompts. This stems from two fundamental +properties of LVLMs: universal reasoning capabilities and safety snowball +effect. Building on these insights, we propose Safety Snowball Agent (SSA), a +novel agent-based framework leveraging agents' autonomous and tool-using +abilities to jailbreak LVLMs. SSA operates through two principal stages: (1) +initial response generation, where tools generate or retrieve jailbreak images +based on potential harmful intents, and (2) harmful snowballing, where refined +subsequent prompts induce progressively harmful outputs. Our experiments +demonstrate that SSA can use nearly any image to induce LVLMs to produce +unsafe content, achieving high jailbreak success rates against the latest +LVLMs. Unlike prior works that exploit alignment flaws, SSA leverages the +inherent properties of LVLMs, presenting a profound challenge for enforcing +safety in generative multimodal systems. Our code is available at +https://github.com/gzcch/Safety_Snowball_Agent. + +
+
+
+
+
+ + ☆ Quantifying Preferences of Vision-Language Models via Value + Decomposition in Social Media Contexts + + +
+ The rapid advancement of Vision-Language Models (VLMs) has expanded +multimodal applications, yet evaluations often focus on basic tasks like object +recognition, overlooking abstract aspects such as personalities and values. To +address this gap, we introduce Value-Spectrum, a visual question-answering +benchmark aimed at assessing VLMs based on Schwartz's value dimensions, which +capture core values guiding people's beliefs and actions across cultures. We +constructed a vectorized database of over 50,000 short videos sourced from +TikTok, YouTube Shorts, and Instagram Reels, covering multiple months and a +wide array of topics such as family, health, hobbies, society, and technology. +We also developed a VLM agent pipeline to automate video browsing and analysis. +Benchmarking representative VLMs on Value-Spectrum reveals significant +differences in their responses to value-oriented content, with most models +exhibiting a preference for hedonistic topics. Beyond identifying natural +preferences, we explored the ability of VLM agents to adopt specific personas +when explicitly prompted, revealing insights into the models' adaptability in +role-playing scenarios. These findings highlight the potential of +Value-Spectrum as a comprehensive evaluation set for tracking VLM advancements +in value-based tasks and for developing more sophisticated role-playing AI +agents. + +
+
+
+
+
+ + ☆ Re-examining learning linear functions in context + + +
+ In context learning (ICL) is an attractive method of solving a wide range of +problems. Inspired by Garg et al. (2022), we look closely at ICL in a variety +of train and test settings for several transformer models of different sizes +trained from scratch. Our study complements prior work by pointing out several +systematic failures of these models to generalize to data not in the training +distribution, thereby showing some limitations of ICL. We find that models +adopt a strategy for this task that is very different from standard solutions. + +
+
+
+
+
+ + ☆ Causal Effect of Group Diversity on Redundancy and Coverage in + Peer-Reviewing + + +
+ A large host of scientific journals and conferences solicit peer reviews from +multiple reviewers for the same submission, aiming to gather a broader range of +perspectives and mitigate individual biases. In this work, we reflect on the +role of diversity in the slate of reviewers assigned to evaluate a submitted +paper as a factor in diversifying perspectives and improving the utility of the +peer-review process. We propose two measures for assessing review utility: +review coverage -- reviews should cover most contents of the paper -- and +review redundancy -- reviews should add information not already present in +other reviews. We hypothesize that reviews from diverse reviewers will exhibit +high coverage and low redundancy. We conduct a causal study of different +measures of reviewer diversity on review coverage and redundancy using +observational data from a peer-reviewed conference with approximately 5,000 +submitted papers. Our study reveals disparate effects of different diversity +measures on review coverage and redundancy. Our study finds that assigning a +group of reviewers that are topically diverse, have different seniority levels, +or have distinct publication networks leads to broader coverage of the paper or +review criteria, but we find no evidence of an increase in coverage for +reviewer slates with reviewers from diverse organizations or geographical +locations. Reviewers from different organizations, seniority levels, topics, or +publications networks (all except geographical diversity) lead to a decrease in +redundancy in reviews. Furthermore, publication network-based diversity alone +also helps bring in varying perspectives (that is, low redundancy), even within +specific review criteria. Our study adopts a group decision-making perspective +for reviewer assignments in peer review and suggests dimensions of diversity +that can help guide the reviewer assignment process. + +
+
+
+
+
+ + ☆ Membership Inference Attack against Long-Context Large Language Models + + +
+ Recent advances in Large Language Models (LLMs) have enabled them to overcome +their context window limitations, and demonstrate exceptional retrieval and +reasoning capacities on longer context. Question-answering systems augmented +with Long-Context Language Models (LCLMs) can automatically search massive +external data and incorporate it into their contexts, enabling faithful +predictions and reducing issues such as hallucinations and knowledge staleness. +Existing studies targeting LCLMs mainly concentrate on addressing the so-called +lost-in-the-middle problem or improving the inference efficiency, leaving their +privacy risks largely unexplored. In this paper, we aim to bridge this gap and +argue that integrating all information into the long context makes it a +repository of sensitive information, which often contains private data such as +medical records or personal identities. We further investigate the membership +privacy within LCLMs' external context, with the aim of determining whether a +given document or sequence is included in the LCLMs' context. Our basic idea is +that if a document lies in the context, it will exhibit a low generation loss +or a high degree of semantic similarity to the contents generated by LCLMs. We +for the first time propose six membership inference attack (MIA) strategies +tailored for LCLMs and conduct extensive experiments on various popular models. +Empirical results demonstrate that our attacks can accurately infer membership +status in most cases, e.g., 90.66% attack F1-score on Multi-document QA +datasets with LongChat-7b-v1.5-32k, highlighting significant risks of +membership leakage within LCLMs' input contexts. Furthermore, we examine the +underlying reasons why LCLMs are susceptible to revealing such membership +information. + +
+
+
+
+
+ + ☆ Rethinking Thinking Tokens: Understanding Why They Underperform in + Practice + + +
+ Thinking Tokens (TTs) have been proposed as an unsupervised method to +facilitate reasoning in language models. However, despite their conceptual +appeal, our findings show that TTs only marginally improve performance and +consistently underperform compared to Chain-of-Thought (CoT) reasoning across +multiple benchmarks. We hypothesize that this underperformance stems from the +reliance on a single embedding for TTs, which results in inconsistent learning +signals and introduces noisy gradients. This paper provides a comprehensive +empirical analysis to validate this hypothesis and discusses the implications +for future research on unsupervised reasoning in LLMs. + +
+
+
+
+
+ + ☆ MAIRA-Seg: Enhancing Radiology Report Generation with Segmentation-Aware + Multimodal Large Language Models ML4H 2024 + + +
+ There is growing interest in applying AI to radiology report generation, +particularly for chest X-rays (CXRs). This paper investigates whether +incorporating pixel-level information through segmentation masks can improve +fine-grained image interpretation of multimodal large language models (MLLMs) +for radiology report generation. We introduce MAIRA-Seg, a segmentation-aware +MLLM framework designed to utilize semantic segmentation masks alongside CXRs +for generating radiology reports. We train expert segmentation models to obtain +mask pseudolabels for radiology-specific structures in CXRs. Subsequently, +building on the architectures of MAIRA, a CXR-specialised model for report +generation, we integrate a trainable segmentation tokens extractor that +leverages these mask pseudolabels, and employ mask-aware prompting to generate +draft radiology reports. Our experiments on the publicly available MIMIC-CXR +dataset show that MAIRA-Seg outperforms non-segmentation baselines. We also +investigate set-of-marks prompting with MAIRA and find that MAIRA-Seg +consistently demonstrates comparable or superior performance. The results +confirm that using segmentation masks enhances the nuanced reasoning of MLLMs, +potentially contributing to better clinical outcomes. + +
+
+ comment: Accepted as Proceedings Paper at ML4H 2024 +
+
+
+
+
+ + ☆ Mitigating Knowledge Conflicts in Language Model-Driven Question + Answering + + +
+ Knowledge-aware sequence to sequence generation tasks such as document +question answering and abstract summarization typically require two types of +knowledge: encoded parametric knowledge and retrieved contextual information. +Previous work shows that improper correlation between parametric knowledge and +answers in the training set could cause the model to ignore input information at +test time, resulting in undesirable model behaviour such as over-stability and +hallucination. In this work, we argue that hallucination could be mitigated via +explicit correlation between input source and generated content. We focus on a +typical example of hallucination, entity-based knowledge conflicts in question +answering, where correlation of entities and their description at training time +hinders model behaviour during inference. + +
+
+
+
+
+ + ☆ Transcending Language Boundaries: Harnessing LLMs for Low-Resource + Language Translation + + +
+ Large Language Models (LLMs) have demonstrated remarkable success across a +wide range of tasks and domains. However, their performance in low-resource +language translation, particularly when translating into these languages, +remains underexplored. This gap poses significant challenges, as linguistic +barriers hinder the cultural preservation and development of minority +communities. To address this issue, this paper introduces a novel +retrieval-based method that enhances translation quality for low-resource +languages by focusing on key terms, which involves translating keywords and +retrieving corresponding examples from existing data. To evaluate the +effectiveness of this method, we conducted experiments translating from English +into three low-resource languages: Cherokee, a critically endangered indigenous +language of North America; Tibetan, a historically and culturally significant +language in Asia; and Manchu, a language with few remaining speakers. Our +comparison with the zero-shot performance of GPT-4o and LLaMA 3.1 405B, +highlights the significant challenges these models face when translating into +low-resource languages. In contrast, our retrieval-based method shows promise +in improving both word-level accuracy and overall semantic understanding by +leveraging existing resources more effectively. + +
+
+
+
+
+ + ☆ LP Data Pipeline: Lightweight, Purpose-driven Data Pipeline for Large + Language Models + + +
+ Creating high-quality, large-scale datasets for large language models (LLMs) +often relies on resource-intensive, GPU-accelerated models for quality +filtering, making the process time-consuming and costly. This dependence on +GPUs limits accessibility for organizations lacking significant computational +infrastructure. To address this issue, we introduce the Lightweight, +Purpose-driven (LP) Data Pipeline, a framework that operates entirely on CPUs +to streamline the processes of dataset extraction, filtering, and curation. +Based on our four core principles, the LP Data Pipeline significantly reduces +preparation time and cost while maintaining high data quality. Importantly, our +pipeline enables the creation of purpose-driven datasets tailored to specific +domains and languages, enhancing the applicability of LLMs in specialized +contexts. We anticipate that our pipeline will lower the barriers to LLM +development, enabling a wide range of organizations to access LLMs more easily. + +
+
+
+
+
+ + ☆ VersaTune: Fine-Tuning Multi-Ability LLMs Efficiently + + +
+ Large Language Models (LLMs) exhibit remarkable capabilities in handling +multiple tasks across domains due to their emergent properties. These +capabilities are further augmented during the Supervised Fine-Tuning (SFT) +phase. Despite their potential, existing work mainly focuses on domain-specific +enhancements during fine-tuning, the challenge of which lies in catastrophic +forgetting of knowledge across other domains. In this study, we introduce +VersaTune, a novel data composition framework designed for enhancing LLMs' +overall multi-ability performances during fine-tuning. We categorize knowledge +into distinct domains including law, medicine, finance, science, code. We begin +with detecting the distribution of domain-specific knowledge within the base +model, followed by the composition of training data that aligns with the +model's existing knowledge distribution. During the fine-tuning process, +weights of different domains are dynamically adjusted based on their learnable +potential and forgetting degree. Experimental results demonstrate that +VersaTune achieves significant improvements in multi-domain performance, with a +35.21% enhancement in comprehensive multi-domain tasks. Additionally, in +scenarios where specific domain optimization is required, VersaTune reduces the +degradation of performance in other domains by 38.77%, without compromising the +target domain's training efficacy. + +
+
+
+
+
+ + ☆ Large corpora and large language models: a replicable method for + automating grammatical annotation + + +
+ Much linguistic research relies on annotated datasets of features extracted +from text corpora, but the rapid quantitative growth of these corpora has +created practical difficulties for linguists to manually annotate large data +samples. In this paper, we present a replicable, supervised method that +leverages large language models for assisting the linguist in grammatical +annotation through prompt engineering, training, and evaluation. We introduce a +methodological pipeline applied to the case study of formal variation in the +English evaluative verb construction 'consider X (as) (to be) Y', based on the +large language model Claude 3.5 Sonnet and corpus data from Davies' NOW and +EnTenTen21 (SketchEngine). Overall, we reach a model accuracy of over 90% on +our held-out test samples with only a small amount of training data, validating +the method for the annotation of very large quantities of tokens of the +construction in the future. We discuss the generalisability of our results for +a wider range of case studies of grammatical constructions and grammatical +variation and change, underlining the value of AI copilots as tools for future +linguistic research. + +
+
+
+
+
+ + ☆ ZeFaV: Boosting Large Language Models for Zero-shot Fact Verification PRICAI 2024 + + +
+ In this paper, we propose ZeFaV - a zero-shot based fact-checking +verification framework to enhance the performance on fact verification task of +large language models by leveraging the in-context learning ability of large +language models to extract the relations among the entities within a claim, +re-organize the information from the evidence in a relationally logical form, +and combine the above information with the original evidence to generate the +context from which our fact-checking model provides verdicts for the input +claims. We conducted empirical experiments to evaluate our approach on two +multi-hop fact-checking datasets including HoVer and FEVEROUS, and achieved +potential results comparable to other state-of-the-art fact +verification task methods. + +
+
+ comment: This pre-print has been published in PRICAI 2024: Trends in + Artificial Intelligence. The published version is available at + https://doi.org/10.1007/978-981-96-0119-6_28 +
+
+
+
+
+ + ☆ MEMO-Bench: A Multiple Benchmark for Text-to-Image and Multimodal Large + Language Models on Human Emotion Analysis + + +
+ Artificial Intelligence (AI) has demonstrated significant capabilities in +various fields, and in areas such as human-computer interaction (HCI), embodied +intelligence, and the design and animation of virtual digital humans, both +practitioners and users are increasingly concerned with AI's ability to +understand and express emotion. Consequently, the question of whether AI can +accurately interpret human emotions remains a critical challenge. To date, two +primary classes of AI models have been involved in human emotion analysis: +generative models and Multimodal Large Language Models (MLLMs). To assess the +emotional capabilities of these two classes of models, this study introduces +MEMO-Bench, a comprehensive benchmark consisting of 7,145 portraits, each +depicting one of six different emotions, generated by 12 Text-to-Image (T2I) +models. Unlike previous works, MEMO-Bench provides a framework for evaluating +both T2I models and MLLMs in the context of sentiment analysis. Additionally, a +progressive evaluation approach is employed, moving from coarse-grained to +fine-grained metrics, to offer a more detailed and comprehensive assessment of +the sentiment analysis capabilities of MLLMs. The experimental results +demonstrate that existing T2I models are more effective at generating positive +emotions than negative ones. Meanwhile, although MLLMs show a certain degree of +effectiveness in distinguishing and recognizing human emotions, they fall short +of human-level accuracy, particularly in fine-grained emotion analysis. The +MEMO-Bench will be made publicly available to support further research in this +area. + +
+
+
+
+
+ + ☆ Does Unlearning Truly Unlearn? A Black Box Evaluation of LLM Unlearning + Methods + + +
+ Large language model unlearning aims to remove harmful information that LLMs +have learnt to prevent their use for malicious purposes. LLMU and RMU have been +proposed as two methods for LLM unlearning, achieving impressive results on +unlearning benchmarks. We study in detail the efficacy of these methods by +evaluating their impact on general model capabilities on the WMDP benchmark as +well as a biology benchmark we create. Our experiments show that RMU generally +leads to better preservation of model capabilities, for similar or better +unlearning. We further test the robustness of these methods and find that doing +5-shot prompting or rephrasing the question in simple ways can lead to an over +ten-fold increase in accuracy on unlearning benchmarks. Finally, we show that +training on unrelated data can almost completely recover pre-unlearning +performance, demonstrating that these methods fail at truly unlearning. The +code is available at +$\href{https://github.com/JaiDoshi/Knowledge-Erasure}{this\, https\, URL}$. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ Mitigating Gender Bias in Contextual Word Embeddings + + +
+ Word embeddings have been shown to produce remarkable results in tackling a +vast majority of NLP related tasks. Unfortunately, word embeddings also capture +the stereotypical biases that are prevalent in society, affecting the +predictive performance of the embeddings when used in downstream tasks. While +various techniques have been proposed \cite{bolukbasi2016man, zhao2018learning} +and criticized \cite{gonen2019lipstick} for static embeddings, very little work +has focused on mitigating bias in contextual embeddings. In this paper, we +propose a novel objective function for MLM (Masked-Language Modeling) which +largely mitigates the gender bias in contextual embeddings and also preserves +the performance for downstream tasks. Since previous works on measuring bias in +contextual embeddings lack in normative reasoning, we also propose novel +evaluation metrics that are straight-forward and aligned with our motivations +in debiasing. We also propose new methods for debiasing static embeddings and +provide empirical proof via extensive analysis and experiments, as to why the +main source of bias in static embeddings stems from the presence of +stereotypical names rather than gendered words themselves. All experiments and +embeddings studied are in English, unless otherwise +specified \citep{bender2011achieving}. + +
+
+
+
+
+ + ☆ Benchmarking pre-trained text embedding models in aligning built asset + information + + +
+ Accurate mapping of the built asset information to established data +classification systems and taxonomies is crucial for effective asset +management, whether for compliance at project handover or ad-hoc data +integration scenarios. Due to the complex nature of built asset data, which +predominantly comprises technical text elements, this process remains largely +manual and reliant on domain expert input. Recent breakthroughs in contextual +text representation learning (text embedding), particularly through pre-trained +large language models, offer promising approaches that can facilitate the +automation of cross-mapping of the built asset data. However, no comprehensive +evaluation has yet been conducted to assess these models' ability to +effectively represent the complex semantics specific to built asset technical +terminology. This study presents a comparative benchmark of state-of-the-art +text embedding models to evaluate their effectiveness in aligning built asset +information with domain-specific technical concepts. Our proposed datasets are +derived from two renowned built asset data classification dictionaries. The +results of our benchmarking across six proposed datasets, covering three tasks +of clustering, retrieval, and reranking, highlight the need for future research +on domain adaptation techniques. The benchmarking resources are published as an +open-source library, which will be maintained and extended to support future +evaluations in this field. + +
+
+
+
+
+ + ☆ ByteScience: Bridging Unstructured Scientific Literature and Structured + Data with Auto Fine-tuned Large Language Model in Token Granularity + + +
+ Natural Language Processing (NLP) is widely used to supply summarization +ability from long context to structured information. However, extracting +structured knowledge from scientific text by NLP models remains a challenge +because of its domain-specific nature to complex data preprocessing and the +granularity of multi-layered device-level information. To address this, we +introduce ByteScience, a non-profit cloud-based auto fine-tuned Large Language +Model (LLM) platform, which is designed to extract structured scientific data +and synthesize new scientific knowledge from vast scientific corpora. The +platform capitalizes on DARWIN, an open-source, fine-tuned LLM dedicated to +natural science. The platform was built on Amazon Web Services (AWS) and +provides an automated, user-friendly workflow for custom model development and +data extraction. The platform achieves remarkable accuracy with only a small +amount of well-annotated articles. This innovative tool streamlines the +transition from the science literature to structured knowledge and data and +benefits the advancements in natural informatics. + +
+
+
+
+
+ + ☆ Understanding Chain-of-Thought in LLMs through Information Theory + + +
+ Large Language Models (LLMs) have shown impressive performance in complex +reasoning tasks through Chain-of-Thought (CoT) reasoning, allowing models to +break down problems into manageable sub-tasks. However, existing CoT evaluation +techniques either require annotated CoT data or fall short in accurately +assessing intermediate reasoning steps, leading to high rates of false +positives. In this paper, we formalize CoT reasoning in LLMs through an +information-theoretic lens. Specifically, our framework quantifies the +`information gain' at each reasoning step, enabling the identification of +failure modes in LLMs without the need for expensive annotated datasets. We +demonstrate the efficacy of our approach through extensive experiments on toy +and GSM-8K data, where it significantly outperforms existing outcome-based +methods by providing more accurate insights into model performance on +individual tasks. + +
+
+
+
+
+ + ☆ Reviving Dormant Memories: Investigating Catastrophic Forgetting in + Language Models through Rationale-Guidance Difficulty + + +
+ Although substantial efforts have been made to mitigate catastrophic +forgetting in continual learning, the intrinsic mechanisms are not well +understood. In this paper, we discover that when a forgetting model passively +receives an externally provided partial appropriate rationale, its performance +on the forgotten task can be restored. Furthermore, by simply adding a +task-agnostic prefix to the original instruction, the forgetting model can +actively generate an appropriate rationale to reach the correct answer. These +findings suggest that the model does not actually ``forget'' the task +knowledge; instead, the degraded performance can be attributed to the failure +of the original instructions in guiding the model to generate the appropriate +rationales. Based on this insight, we propose the Rationale-Guidance Difficulty +metric to evaluate how effectively a given instruction guides the model in +generating appropriate rationales. We apply this metric to optimize the +allocation of replay data in replay-based continual learning algorithm. +Experimental results demonstrate that our data allocation method effectively +mitigates catastrophic forgetting and maintains better model plasticity +simultaneously across models. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Toxicity of the Commons: Curating Open-Source Pre-Training Data + + +
+ Open-source large language models are becoming increasingly available and +popular among researchers and practitioners. While significant progress has +been made on open-weight models, open training data is a practice yet to be +adopted by the leading open-weight model creators. At the same time, +researchers are working to make language models safer. We propose a data +curation pipeline to reduce harmful outputs by models trained on public domain +data. There are unique challenges to working with public domain data, as these +sources differ from web text in both form and content. Many sources are +historical documents and are the result of Optical Character Recognition (OCR). +Consequently, current state-of-the-art approaches to toxicity filtering are +often infeasible or inappropriate for open data models. In this paper, we +introduce a new fully open-source pipeline for open-data toxicity filtering. +Our contributions are threefold. We create a custom training dataset, +ToxicCommons, which is composed of texts which have been classified across five +different dimensions (racial/origin-based, gender/sex-based, religious, +ability-based discrimination, and violence). We use this dataset to train a +custom classifier, Celadon, that can be used to detect toxic content in open +data more efficiently at a larger scale. Finally, we describe the balanced +approach to content filtration that optimizes safety filtering with respect to +the filtered data available for training. + +
+
+
+
+
+ + ♻ ☆ A Perspective for Adapting Generalist AI to Specialized Medical AI + Applications and Their Challenges + + +
+ The integration of Large Language Models (LLMs) into medical applications has +sparked widespread interest across the healthcare industry, from drug discovery +and development to clinical decision support, assisting telemedicine, medical +devices, and healthcare insurance applications. This perspective paper aims to +discuss the inner workings of building LLM-powered medical AI applications and +introduces a comprehensive framework for their development. We review existing +literature and outline the unique challenges of applying LLMs in specialized +medical contexts. Additionally, we introduce a three-step framework to organize +medical LLM research activities: 1) Modeling: breaking down complex medical +workflows into manageable steps for developing medical-specific models; 2) +Optimization: optimizing the model performance with crafted prompts and +integrating external knowledge and tools, and 3) System engineering: +decomposing complex tasks into subtasks and leveraging human expertise for +building medical AI applications. Furthermore, we offer a detailed use case +playbook that describes various LLM-powered medical AI applications, such as +optimizing clinical trial design, enhancing clinical decision support, and +advancing medical imaging analysis. Finally, we discuss various challenges and +considerations for building medical AI applications with LLMs, such as handling +hallucination issues, data ownership and compliance, privacy, intellectual +property considerations, compute cost, sustainability issues, and responsible +AI requirements. + +
+
+
+
+
+ + ♻ ☆ Watermark-based Detection and Attribution of AI-Generated Content + + +
+ Several companies have deployed watermark-based detection to identify +AI-generated content. However, attribution--the ability to trace back to the +user of a generative AI (GenAI) service who created a given piece of +AI-generated content--remains largely unexplored despite its growing +importance. In this work, we aim to bridge this gap by conducting the first +systematic study on watermark-based, user-level attribution of AI-generated +content. Our key idea is to assign a unique watermark to each user of the GenAI +service and embed this watermark into the AI-generated content created by that +user. Attribution is then performed by identifying the user whose watermark +best matches the one extracted from the given content. This approach, however, +faces a key challenge: How should watermarks be selected for users to maximize +attribution performance? To address the challenge, we first theoretically +derive lower bounds on detection and attribution performance through rigorous +probabilistic analysis for any given set of user watermarks. Then, we select +watermarks for users to maximize these lower bounds, thereby optimizing +detection and attribution performance. Our theoretical and empirical results +show that watermark-based attribution inherits both the accuracy and +(non-)robustness properties of the underlying watermark. Specifically, +attribution remains highly accurate when the watermarked AI-generated content +is either not post-processed or subjected to common post-processing such as +JPEG compression, as well as black-box adversarial post-processing with limited +query budgets. + +
+
+
+
+
+ + ♻ ☆ AgentSquare: Automatic LLM Agent Search in Modular Design Space + + +
+ Recent advancements in Large Language Models (LLMs) have led to a rapid +growth of agentic systems capable of handling a wide range of complex tasks. +However, current research largely relies on manual, task-specific design, +limiting their adaptability to novel tasks. In this paper, we introduce a new +research problem: Modularized LLM Agent Search (MoLAS). We propose a modular +design space that abstracts existing LLM agent designs into four fundamental +modules with uniform IO interface: Planning, Reasoning, Tool Use, and Memory. +Building on this design space, we present a novel LLM agent search framework +called AgentSquare, which introduces two core mechanisms, i.e., module +evolution and recombination, to efficiently search for optimized LLM agents. To +further accelerate the process, we design a performance predictor that uses +in-context surrogate models to skip unpromising agent designs. Extensive +experiments across six benchmarks, covering the diverse scenarios of web, +embodied, tool use and game applications, show that AgentSquare substantially +outperforms hand-crafted agents, achieving an average performance gain of 17.2% +against best-known human designs. Moreover, AgentSquare can generate +interpretable design insights, enabling a deeper understanding of agentic +architecture and its impact on task performance. We believe that the modular +design space and AgentSquare search framework offer a platform for fully +exploiting the potential of prior successful designs and consolidating the +collective efforts of research community. Code repo is available at +https://github.com/tsinghua-fib-lab/AgentSquare. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Fine-Tuning a Time Series Foundation Model with Wasserstein Loss + + +
+ Inspired by recent advancements in large language models (LLMs) for Natural +Language Processing (NLP), there has been a surge in research focused on +developing foundational models for time series forecasting. One approach +involves training LLM architectures on tokenized time series data using +cross-entropy loss. Although this method has demonstrated promising results, +cross-entropy loss is primarily designed for classification tasks and does not +account for the distance between classes. To address this limitation, we +propose using the Wasserstein loss for such architectures. To validate our +approach, we fine-tuned a foundational time series model on $22$ zero-shot +datasets, comparing the performance of cross-entropy loss with that of +Wasserstein loss. Our results demonstrate that replacing cross-entropy loss +with Wasserstein loss significantly improves point estimation. + +
+
+ comment: 4 main pages; 2 figures +
+
+
+
+
+ + ♻ ☆ Investigating OCR-Sensitive Neurons to Improve Entity Recognition in + Historical Documents + + +
+ This paper investigates the presence of OCR-sensitive neurons within the +Transformer architecture and their influence on named entity recognition (NER) +performance on historical documents. By analysing neuron activation patterns in +response to clean and noisy text inputs, we identify and then neutralise +OCR-sensitive neurons to improve model performance. Based on two open access +large language models (Llama2 and Mistral), experiments demonstrate the +existence of OCR-sensitive regions and show improvements in NER performance on +historical newspapers and classical commentaries, highlighting the potential of +targeted neuron modulation to improve models' performance on noisy text. + +
+
+
+
+
+ + ♻ ☆ BeeManc at the PLABA Track of TAC-2024: RoBERTa for task 1 -- LLaMA3.1 + and GPT-4o for task 2 + + +
+ This report is the system description of the BeeManc team for shared task +Plain Language Adaptation of Biomedical Abstracts (PLABA) 2024. This report +contains two sections corresponding to the two sub-tasks in PLABA 2024. In task +one, we applied fine-tuned RoBERTa-Base models to identify and classify the +difficult terms, jargon and acronyms in the biomedical abstracts and reported +the F1 score. Due to time constraints, we didn't finish the replacement task. +In task two, we leveraged Llama3.1-70B-Instruct and GPT-4o with the one-shot +prompts to complete the abstract adaptation and reported the scores in BLEU, +SARI, BERTScore, LENS, and SALSA. From the official Evaluation from PLABA-2024 +on Task 1A and 1B, our \textbf{much smaller fine-tuned RoBERTa-Base} model +ranked 3rd and 2nd respectively on the two sub-tasks, and the \textbf{1st on +averaged F1 scores across the two tasks} from 9 evaluated systems. Our +LLaMA-3.1-70B-instructed model achieved the \textbf{highest Completeness} score +for Task-2. We share our fine-tuned models and related resources at +\url{https://github.com/HECTA-UoM/PLABA2024} + +
+
+ comment: ongoing work - system report +
+
+
+
+
+ + ♻ ☆ Unconstrained Open Vocabulary Image Classification: Zero-Shot Transfer + from Text to Image via CLIP Inversion WACV 2025 + + +
+ We introduce NOVIC, an innovative real-time uNconstrained Open Vocabulary +Image Classifier that uses an autoregressive transformer to generatively output +classification labels as language. Leveraging the extensive knowledge of CLIP +models, NOVIC harnesses the embedding space to enable zero-shot transfer from +pure text to images. Traditional CLIP models, despite their ability for open +vocabulary classification, require an exhaustive prompt of potential class +labels, restricting their application to images of known content or context. To +address this, we propose an "object decoder" model that is trained on a +large-scale 92M-target dataset of templated object noun sets and LLM-generated +captions to always output the object noun in question. This effectively inverts +the CLIP text encoder and allows textual object labels from essentially the +entire English language to be generated directly from image-derived embedding +vectors, without requiring any a priori knowledge of the potential content of +an image, and without any label biases. The trained decoders are tested on a +mix of manually and web-curated datasets, as well as standard image +classification benchmarks, and achieve fine-grained prompt-free prediction +scores of up to 87.5%, a strong result considering the model must work for any +conceivable image and without any contextual clues. + +
+
+ comment: Published at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Separating Tongue from Thought: Activation Patching Reveals + Language-Agnostic Concept Representations in Transformers ICML 2024 + + +
+ A central question in multilingual language modeling is whether large +language models (LLMs) develop a universal concept representation, disentangled +from specific languages. In this paper, we address this question by analyzing +latent representations (latents) during a word translation task in +transformer-based LLMs. We strategically extract latents from a source +translation prompt and insert them into the forward pass on a target +translation prompt. By doing so, we find that the output language is encoded in +the latent at an earlier layer than the concept to be translated. Building on +this insight, we conduct two key experiments. First, we demonstrate that we can +change the concept without changing the language and vice versa through +activation patching alone. Second, we show that patching with the mean over +latents across different languages does not impair and instead improves the +models' performance in translating the concept. Our results provide evidence +for the existence of language-agnostic concept representations within the +investigated models. + +
+
+ comment: 12 pages, 10 figures, previous version published under the title "How + Do Llamas Process Multilingual Text? A Latent Exploration through Activation + Patching" at the ICML 2024 mechanistic interpretability workshop at + https://openreview.net/forum?id=0ku2hIm4BS +
+
+
+
+
+ + ♻ ☆ BertaQA: How Much Do Language Models Know About Local Culture? + + +
+ Large Language Models (LLMs) exhibit extensive knowledge about the world, but +most evaluations have been limited to global or anglocentric subjects. This +raises the question of how well these models perform on topics relevant to +other cultures, whose presence on the web is not that prominent. To address +this gap, we introduce BertaQA, a multiple-choice trivia dataset that is +parallel in English and Basque. The dataset consists of a local subset with +questions pertinent to the Basque culture, and a global subset with questions +of broader interest. We find that state-of-the-art LLMs struggle with local +cultural knowledge, even as they excel on global topics. However, we show that +continued pre-training in Basque significantly improves the models' performance +on Basque culture, even when queried in English. To our knowledge, this is the +first solid evidence of knowledge transfer from a low-resource to a +high-resource language. Our analysis sheds light on the complex interplay +between language and knowledge, and reveals that some prior findings do not +fully hold when reassessed on local topics. Our dataset and evaluation code are +available under open licenses at https://github.com/juletx/BertaQA. + +
+
+ comment: NEURIPS Datasets & Benchmarks 2024 +
+
+
+
+
+ + ♻ ☆ Estimating the Influence of Sequentially Correlated Literary Properties + in Textual Classification: A Data-Centric Hypothesis-Testing Approach + + +
+ Stylometry aims to distinguish authors by analyzing literary traits assumed +to reflect semi-conscious choices distinct from elements like genre or theme. +However, these components often overlap, complicating text classification based +solely on feature distributions. While some literary properties, such as +thematic content, are likely to manifest as correlations between adjacent text +units, others, like authorial style, may be independent thereof. We introduce a +hypothesis-testing approach to evaluate the influence of sequentially +correlated literary properties on text classification, aiming to determine when +these correlations drive classification. Using a multivariate binary +distribution, our method models sequential correlations between text units as a +stochastic process, assessing the likelihood of clustering across varying +adjacency scales. This enables us to examine whether classification is +dominated by sequentially correlated properties or remains independent. In +experiments on a diverse English prose corpus, our analysis integrates +traditional and neural embeddings within supervised and unsupervised +frameworks. Results demonstrate that our approach effectively identifies when +textual classification is not primarily influenced by sequentially correlated +literary properties, particularly in cases where texts differ in authorial +style or genre rather than by a single author within a similar genre. + +
+
+
+
+
+ + ♻ ☆ Utilize the Flow before Stepping into the Same River Twice: Certainty + Represented Knowledge Flow for Refusal-Aware Instruction Tuning + + +
+ Refusal-Aware Instruction Tuning (RAIT) enables Large Language Models (LLMs) +to refuse to answer unknown questions. By modifying responses of unknown +questions in the training data to refusal responses such as "I don't know", +RAIT enhances the reliability of LLMs and reduces their hallucination. +Generally, RAIT modifies training samples based on the correctness of the +initial LLM's response. However, this crude approach can cause LLMs to +excessively refuse answering questions they could have correctly answered, the +problem we call over-refusal. In this paper, we explore two primary causes of +over-refusal: Static conflict occurs when similar samples within the LLM's +feature space receive differing supervision signals (original vs. modified "I +don't know"). Dynamic conflict, on the other hand, emerges as the LLM's +knowledge evolves during SFT, allowing it to answer questions that were +previously unanswerable. Yet, these now-answerable training samples still +retain the original "I don't know" supervision signals based on the initial LLM +state, resulting in inconsistencies. These conflicts cause the trained LLM to +misclassify known questions as unknown, resulting in over-refusal. To address +this issue, we introduce Certainty Represented Knowledge Flow for Refusal-Aware +Instruction Tuning (CRaFT). CRaFT centers on two main contributions: First, we +additionally incorporate response certainty to selectively filter and modify +data, reducing static conflicts. Second, we implement preliminary rehearsal +training to characterize changes in the LLM's knowledge state, which helps +mitigate dynamic conflicts during the fine-tuning process. We conducted +extensive experiments on open-ended question answering and multiple-choice +question tasks. Experimental results show that CRaFT can improve LLMs' overall +performance during the RAIT process. Source code and training data will be +released on GitHub. + +
+
+ comment: Equal contribution: Runchuan Zhu, Zhipeng Ma, Jiang Wu; Corresponding + author: Conghui He +
+
+
+
+
+ + ♻ ☆ A Complete Survey on LLM-based AI Chatbots + + +
+ The past few decades have witnessed an upsurge in data, forming the +foundation for data-hungry, learning-based AI technology. Conversational +agents, often referred to as AI chatbots, rely heavily on such data to train +large language models (LLMs) and generate new content (knowledge) in response +to user prompts. With the advent of OpenAI's ChatGPT, LLM-based chatbots have +set new standards in the AI community. This paper presents a complete survey of +the evolution and deployment of LLM-based chatbots in various sectors. We first +summarize the development of foundational chatbots, followed by the evolution +of LLMs, and then provide an overview of LLM-based chatbots currently in use +and those in the development phase. Recognizing AI chatbots as tools for +generating new knowledge, we explore their diverse applications across various +industries. We then discuss the open challenges, considering how the data used +to train the LLMs and the misuse of the generated knowledge can cause several +issues. Finally, we explore the future outlook to augment their efficiency and +reliability in numerous applications. By addressing key milestones and the +present-day context of LLM-based chatbots, our survey invites readers to delve +deeper into this realm, reflecting on how their next generation will reshape +conversational AI. + +
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Fine-Grained Verifiers: Preference Modeling as Next-token Prediction in + Vision-Language Alignment + + +
+ The recent advancements in large language models (LLMs) and pre-trained +vision models have accelerated the development of vision-language large models +(VLLMs), enhancing the interaction between visual and linguistic modalities. +Despite their notable success across various domains, VLLMs face challenges in +modality alignment, which can lead to issues like hallucinations and unsafe +content generation. Current alignment techniques often rely on coarse feedback +and external datasets, limiting scalability and performance. In this paper, we +propose FiSAO (Fine-Grained Self-Alignment Optimization), a novel +self-alignment method that utilizes the model's own visual encoder as a +fine-grained verifier to improve vision-language alignment without the need for +additional data. By leveraging token-level feedback from the vision encoder, +FiSAO significantly improves vision-language alignment, even surpassing +traditional preference tuning methods that require additional data. Through +both theoretical analysis and experimental validation, we demonstrate that +FiSAO effectively addresses the misalignment problem in VLLMs, marking the +first instance of token-level rewards being applied to such models. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Not Eliminate but Aggregate: Post-Hoc Control over Mixture-of-Experts to + Address Shortcut Shifts in Natural Language Understanding + + +
+ Recent models for natural language understanding are inclined to exploit +simple patterns in datasets, commonly known as shortcuts. These shortcuts hinge +on spurious correlations between labels and latent features existing in the +training data. At inference time, shortcut-dependent models are likely to +generate erroneous predictions under distribution shifts, particularly when +some latent features are no longer correlated with the labels. To avoid this, +previous studies have trained models to eliminate the reliance on shortcuts. In +this study, we explore a different direction: pessimistically aggregating the +predictions of a mixture-of-experts, assuming each expert captures relatively +different latent features. The experimental results demonstrate that our +post-hoc control over the experts significantly enhances the model's robustness +to the distribution shift in shortcuts. Besides, we show that our approach has +some practical advantages. We also analyze our model and provide results to +support the assumption. + +
+
+ comment: 21 pages, 5 figures (the layout differs from the MIT Press + publication version) +
+
+
+
+
+ + ♻ ☆ Exploring Context Window of Large Language Models via Decomposed + Positional Vectors + + +
+ Transformer-based large language models (LLMs) typically have a limited +context window, resulting in significant performance degradation when +processing text beyond the length of the context window. Extensive studies have +been proposed to extend the context window and achieve length extrapolation of +LLMs, but there is still a lack of in-depth interpretation of these approaches. +In this study, we explore the positional information within and beyond the +context window for deciphering the underlying mechanism of LLMs. By using a +mean-based decomposition method, we disentangle positional vectors from hidden +states of LLMs and analyze their formation and effect on attention. +Furthermore, when texts exceed the context window, we analyze the change of +positional vectors in two settings, i.e., direct extrapolation and context +window extension. Based on our findings, we design two training-free context +window extension methods, positional vector replacement and attention window +extension. Experimental results show that our methods can effectively extend +the context window length. + +
+
+ comment: Accepted by NeurIPS 2024 as a spotlight +
+
+
+
+
+ + ♻ ☆ Towards Evaluating Large Language Models for Graph Query Generation + + +
+ Large Language Models (LLMs) are revolutionizing the landscape of Generative +Artificial Intelligence (GenAI), with innovative LLM-backed solutions emerging +rapidly. However, when applied to database technologies, specifically query +generation for graph databases and Knowledge Graphs (KGs), LLMs still face +significant challenges. While research on LLM-driven query generation for +Structured Query Language (SQL) exists, similar systems for graph databases +remain underdeveloped. This paper presents a comparative study addressing the +challenge of generating queries in Cypher, a powerful language for interacting with +graph databases, using open-access LLMs. We rigorously evaluate several LLM +agents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a +locally deployed Llama 3.1 8B) using a designed few-shot learning prompt and +Retrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT) +reasoning. Our empirical analysis of query generation accuracy reveals that +Claude Sonnet 3.5 outperforms its counterparts in this specific domain. +Further, we highlight promising future research directions to address the +identified limitations and advance LLM-driven query generation for graph +databases. + +
+
+ comment: Paper accepted; it will be presented at CSCI2024 in December 2024 and + will later be published in Springer LNCS +
+
+
+
+
+ + ♻ ☆ Python is Not Always the Best Choice: Embracing Multilingual Program of + Thoughts EMNLP 2024 + + +
+ Program of Thoughts (PoT) is an approach characterized by its executable +intermediate steps, which ensure the accuracy of the logical calculations in +the reasoning process. Currently, PoT primarily uses Python. However, relying +solely on a single language may result in suboptimal solutions and overlook the +potential benefits of other programming languages. In this paper, we conduct +comprehensive experiments on the programming languages used in PoT and find +that no single language consistently delivers optimal performance across all +tasks and models. The effectiveness of each language varies depending on the +specific scenarios. Inspired by this, we propose a task and model agnostic +approach called MultiPoT, which harnesses strength and diversity from various +languages. Experimental results reveal that it significantly outperforms Python +Self-Consistency. Furthermore, it achieves comparable or superior performance +compared to the best monolingual PoT in almost all tasks across all models. In +particular, MultiPoT achieves more than 4.6% improvement on average on ChatGPT +(gpt-3.5-turbo-0701). + +
+
+ comment: Accepted by EMNLP 2024. Code and data are released at + https://github.com/Luowaterbi/MultiPoT +
+
+
+
+
+ + ♻ ☆ LLMs and Memorization: On Quality and Specificity of Copyright + Compliance + + +
+ Memorization in large language models (LLMs) is a growing concern. LLMs have +been shown to easily reproduce parts of their training data, including +copyrighted work. This is an important problem to solve, as it may violate +existing copyright laws as well as the European AI Act. In this work, we +propose a systematic analysis to quantify the extent of potential copyright +infringements in LLMs using European law as an example. Unlike previous work, +we evaluate instruction-finetuned models in a realistic end-user scenario. Our +analysis builds on a proposed threshold of 160 characters, which we borrow from +the German Copyright Service Provider Act and a fuzzy text matching algorithm +to identify potentially copyright-infringing textual reproductions. The +specificity of countermeasures against copyright infringement is analyzed by +comparing model behavior on copyrighted and public domain data. We investigate +what behaviors models show instead of producing protected text (such as refusal +or hallucination) and provide a first legal assessment of these behaviors. We +find that there are huge differences in copyright compliance, specificity, and +appropriate refusal among popular LLMs. Alpaca, GPT 4, GPT 3.5, and Luminous +perform best in our comparison, with OpenGPT-X, Alpaca, and Luminous producing +a particularly low absolute number of potential copyright violations. Code can +be found at https://github.com/felixbmuller/llms-memorization-copyright. + +
+
+ comment: 10 pages, 3 figures, AIES 2024 conference +
+
+
+
+
+ + ♻ ☆ Clustering and Ranking: Diversity-preserved Instruction Selection + through Expert-aligned Quality Estimation EMNLP2024 + + +
+ With contributions from the open-source community, a vast amount of +instruction tuning (IT) data has emerged. Given the significant resource +allocation required for training and evaluating models, it is advantageous to +have an efficient method for selecting high-quality IT data. However, existing +methods for instruction data selection have limitations such as relying on +fragile external APIs, being affected by biases in GPT models, or reducing the +diversity of the selected instruction dataset. In this paper, we propose an +industrial-friendly, expert-aligned and diversity-preserved instruction data +selection method: Clustering and Ranking (CaR). CaR employs a two-step process: +first, it ranks instruction pairs using a high-accuracy (84.25%) scoring model +aligned with expert preferences; second, it preserves dataset diversity through +clustering. In our experiment, CaR efficiently selected a mere 1.96% of +Alpaca's IT data, yet the resulting AlpaCaR model surpassed Alpaca's +performance by an average of 32.1% in GPT-4 evaluations. Moreover, we find that +data selection is a consistent paradigm whether the pre-trained model is more +capable or the model parameters are scaled up. Our approach employs compact models +with 550M parameters and incurs just 11.2% of the financial outlay of current +methods, enhancing its industrial deployability. + +
+
+ comment: Accepted by EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Word-Sequence Entropy: Towards Uncertainty Estimation in Free-Form + Medical Question Answering Applications and Beyond + + +
+ Uncertainty estimation is crucial for the reliability of safety-critical +human and artificial intelligence (AI) interaction systems, particularly in the +domain of healthcare engineering. However, a robust and general uncertainty +measure for free-form answers has not been well-established in open-ended +medical question-answering (QA) tasks, where generative inequality introduces a +large number of irrelevant words and sequences within the generated set for +uncertainty quantification (UQ), which can lead to biases. This paper +introduces Word-Sequence Entropy (WSE), a method that calibrates uncertainty at +both the word and sequence levels, considering semantic relevance. WSE +quantifies uncertainty in a way that is more closely aligned with the +reliability of LLMs during uncertainty quantification (UQ). We compare WSE with +six baseline methods on five free-form medical QA datasets, utilizing seven +popular large language models (LLMs). Experimental results demonstrate that WSE +exhibits superior performance in UQ under two standard criteria for correctness +evaluation. Additionally, in terms of real-world medical QA applications, the +performance of LLMs is significantly enhanced (e.g., a 6.36% improvement in +model accuracy on the COVID-QA dataset) by employing responses with lower +uncertainty that are identified by WSE as final answers, without any additional +task-specific fine-tuning or architectural modifications. + +
+
+ comment: Accepted by Engineering Applications of Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ ConU: Conformal Uncertainty in Large Language Models with Correctness + Coverage Guarantees EMNLP 2024 + + +
+ Uncertainty quantification (UQ) in natural language generation (NLG) tasks +remains an open challenge, exacerbated by the closed-source nature of the +latest large language models (LLMs). This study investigates applying conformal +prediction (CP), which can transform any heuristic uncertainty notion into +rigorous prediction sets, to black-box LLMs in open-ended NLG tasks. We +introduce a novel uncertainty measure based on self-consistency theory, and +then develop a conformal uncertainty criterion by integrating the uncertainty +condition aligned with correctness into the CP algorithm. Empirical evaluations +indicate that our uncertainty measure outperforms prior state-of-the-art +methods. Furthermore, we achieve strict control over the correctness coverage +rate utilizing 7 popular LLMs on 4 free-form NLG datasets, spanning +general-purpose and medical scenarios. Additionally, the calibrated prediction +sets with small size further highlight the efficiency of our method in +providing trustworthy guarantees for practical open-ended NLG applications. + +
+
+ comment: Accepted by EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Semantic Operators: A Declarative Model for Rich, AI-based Analytics + Over Text Data + + +
+ The semantic capabilities of language models (LMs) have the potential to +enable rich analytics and reasoning over vast knowledge corpora. Unfortunately, +existing systems lack high-level abstractions to perform bulk semantic queries +across large corpora. We introduce semantic operators, a declarative +programming interface that extends the relational model with composable +AI-based operations for bulk semantic queries (e.g., filtering, sorting, +joining or aggregating records using natural language criteria). Each operator +can be implemented and optimized in multiple ways, opening a rich space for +execution plans similar to relational operators. We implement our operators in +LOTUS, an open source query engine with a DataFrame API. Furthermore, we +develop several novel optimizations that take advantage of the declarative +nature of semantic operators to accelerate semantic filtering, clustering and +join operators by up to $400\times$ while offering statistical accuracy +guarantees. We demonstrate LOTUS' effectiveness on real AI applications +including fact-checking, extreme multi-label classification, and search. We +show that the semantic operator model is expressive, capturing state-of-the-art +AI pipelines in a few operator calls, and making it easy to express new +pipelines that achieve up to $180\%$ higher quality. Overall, LOTUS queries +match or exceed the accuracy of state-of-the-art AI pipelines for each task +while running up to 28$\times$ faster. LOTUS is publicly available at +https://github.com/stanford-futuredata/lotus. + +
+
+
+
+
+ + ♻ ☆ The why, what, and how of AI-based coding in scientific research + + +
+ Computer programming (coding) is indispensable for researchers across +disciplines, yet it remains challenging to learn and time-consuming to carry +out. Generative AI, particularly large language models (LLMs), has the +potential to transform coding into intuitive conversations, but best practices +and effective workflows are only emerging. We dissect AI-based coding through +three key lenses: the nature and role of LLMs in coding (why), six types of +coding assistance they provide (what), and a five-step workflow in action with +practical implementation strategies (how). Additionally, we address the +limitations and future outlook of AI in coding. By offering actionable +insights, this framework helps to guide researchers in effectively leveraging +AI to enhance coding practices and education, accelerating scientific progress. + +
+
+ comment: 23 pages, 7 figures, 3 boxes +
+
+
+
+
+ + ♻ ☆ Targeted Efficient Fine-tuning: Optimizing Parameter Updates with + Data-Driven Sample Selection + + +
+ Fine-tuning all parameters of Large Language Models (LLMs) is computationally +expensive. Parameter-Efficient Fine-Tuning (PEFT) methods address this by +selectively fine-tuning specific parameters. Most of the parameter efficient +fine-tuning (PEFT) methods center on selecting or introducing a set of +parameters to be fine-tuned. However, there are few methods that consider the +impact of data samples on parameter selection. Representative data-driven +methods include the FISH Mask based method, which randomly selects a portion of +data samples as a basis when selecting parameters. However, this random data +sample selection method cannot select optimal parameters for unstable data +distribution. In this work, we introduce a data-centric approach and propose +the Iterative Range Decreasing (IRD) algorithm to optimize the sample-parameter +pair selection in FISH Mask. IRD iteratively refines the selection by +identifying subsets of samples and parameters exhibiting higher Fisher +information. We demonstrate the effectiveness and rationality of the proposed +strategy by conducting experiments on the GLUE benchmark. Experimental results show +our strategy optimizes the parameter selection and achieves preferable +performance over some typical baseline methods. + +
+
+
+
+
+ + ♻ ☆ A Framework for Leveraging Partially-Labeled Data for Product + Attribute-Value Identification + + +
+ In the e-commerce domain, the accurate extraction of attribute-value pairs +(e.g., Brand: Apple) from product titles and user search queries is crucial for +enhancing search and recommendation systems. A major challenge with neural +models for this task is the lack of high-quality training data, as the +annotations for attribute-value pairs in the available datasets are often +incomplete. To address this, we introduce GenToC, a model designed for training +directly with partially-labeled data, eliminating the necessity for a fully +annotated dataset. GenToC employs a marker-augmented generative model to +identify potential attributes, followed by a token classification model that +determines the associated values for each attribute. GenToC outperforms +existing state-of-the-art models, exhibiting up to a 56.3% increase in the number +of accurate extractions. Furthermore, we utilize GenToC to regenerate the +training dataset to expand attribute-value annotations. This bootstrapping +substantially improves the data quality for training other standard NER models, +which are typically faster but less capable in handling partially-labeled data, +enabling them to achieve comparable performance to GenToC. Our results +demonstrate GenToC's unique ability to learn from a limited set of +partially-labeled data and improve the training of more efficient models, +advancing the automated extraction of attribute-value pairs. Finally, our model +has been successfully integrated into IndiaMART, India's largest B2B e-commerce +platform, achieving a significant increase of 20.2% in the number of correctly +identified attribute-value pairs over the existing deployed system while +achieving a high precision of 89.5%. + +
+
+ comment: Accepted to KDD 2025 ADS Track +
+
+
+
+
+ + ♻ ☆ Enhancing High-order Interaction Awareness in LLM-based Recommender + Model EMNLP 2024 + + +
+ Large language models (LLMs) have demonstrated prominent reasoning +capabilities in recommendation tasks by transforming them into text-generation +tasks. However, existing approaches either disregard or ineffectively model the +user-item high-order interactions. To this end, this paper presents an enhanced +LLM-based recommender (ELMRec). We enhance whole-word embeddings to +substantially enhance LLMs' interpretation of graph-constructed interactions +for recommendations, without requiring graph pre-training. This finding may +inspire endeavors to incorporate rich knowledge graphs into LLM-based +recommenders via whole-word embedding. We also found that LLMs often recommend +items based on users' earlier interactions rather than recent ones, and present +a reranking solution. Our ELMRec outperforms state-of-the-art (SOTA) methods in +both direct and sequential recommendations. + +
+
+ comment: Long paper accepted to EMNLP 2024 Main. 16 pages +
+
+
+
+
+ + ♻ ☆ Information Extraction from Clinical Notes: Are We Ready to Switch to + Large Language Models? + + +
+ Background: Information extraction (IE) is critical in clinical natural +language processing (NLP). While large language models (LLMs) excel on +generative tasks, their performance on extractive tasks remains debated. +Methods: We investigated Named Entity Recognition (NER) and Relation Extraction +(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples, +MIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical +entities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3 +against BiomedBERT in terms of performance, generalizability, computational +resources, and throughput. Results: LLaMA models outperformed +BiomedBERT across datasets. With sufficient training data, LLaMA showed modest +improvements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited +training data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7% +(F1) on NER and 4% on RE. However, LLaMA models required more computing +resources and ran up to 28 times slower. We implemented "Kiwi," a clinical IE +package featuring both models, available at https://kiwi.clinicalnlp.org/. +Conclusion: This study is among the first to develop and evaluate a +comprehensive clinical IE system using open-source LLMs. Results indicate that +LLaMA models outperform BiomedBERT for clinical NER and RE but with higher +computational costs and lower throughputs. These findings highlight that +choosing between LLMs and traditional deep learning methods for clinical IE +applications should remain task-specific, taking into account both performance +metrics and practical considerations such as available computing resources and +the intended use case scenarios. + +
+
+
+
+
+ + ♻ ☆ ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search NeurIPS 2024 + + +
+ Recent methodologies in LLM self-training mostly rely on LLM generating +responses and filtering those with correct output answers as training data. +This approach often yields a low-quality fine-tuning training set (e.g., +incorrect plans or intermediate reasoning). In this paper, we develop a +reinforced self-training approach, called ReST-MCTS*, based on integrating +process reward guidance with tree search MCTS* for collecting higher-quality +reasoning traces as well as per-step value to train policy and reward models. +ReST-MCTS* circumvents the per-step manual annotation typically used to train +process rewards by tree-search-based reinforcement learning: Given oracle final +correct answers, ReST-MCTS* is able to infer the correct process rewards by +estimating the probability this step can help lead to the correct answer. These +inferred rewards serve dual purposes: they act as value targets for further +refining the process reward model and also facilitate the selection of +high-quality traces for policy model self-training. We first show that the +tree-search policy in ReST-MCTS* achieves higher accuracy compared with prior +LLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same +search budget. We then show that by using traces searched by this tree-search +policy as training data, we can continuously enhance the three language models +for multiple iterations, and outperform other self-training algorithms such as +ReST$^\text{EM}$ and Self-Rewarding LM. We release all code at +https://github.com/THUDM/ReST-MCTS. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SciInstruct: a Self-Reflective Instruction Annotated Dataset for + Training Scientific Language Models NeurIPS + + +
+ Large Language Models (LLMs) have shown promise in assisting scientific +discovery. However, such applications are currently limited by LLMs' +deficiencies in understanding intricate scientific concepts, deriving symbolic +equations, and solving advanced numerical calculations. To bridge these gaps, +we introduce SciInstruct, a suite of scientific instructions for training +scientific language models capable of college-level scientific reasoning. +Central to our approach is a novel self-reflective instruction annotation +framework to address the data scarcity challenge in the science domain. This +framework leverages existing LLMs to generate step-by-step reasoning for +unlabelled scientific questions, followed by a process of self-reflective +critic-and-revise. Applying this framework, we curated a diverse and +high-quality dataset encompassing physics, chemistry, math, and formal proofs. +We analyze the curated SciInstruct from multiple interesting perspectives +(e.g., domain, scale, source, question type, answer length, etc.). To verify +the effectiveness of SciInstruct, we fine-tuned different language models with +SciInstruct, i.e., ChatGLM3 (6B and 32B), Llama3-8B-Instruct, and Mistral-7B: +MetaMath, enhancing their scientific and mathematical reasoning capabilities, +without sacrificing the language understanding capabilities of the base model. +We release all codes and SciInstruct at https://github.com/THUDM/SciGLM. + +
+
+ comment: Accepted to NeurIPS D&B Track 2024 +
+
+
+
+
+ + ♻ ☆ Open Domain Question Answering with Conflicting Contexts + + +
+ Open domain question answering systems frequently rely on information +retrieved from large collections of text (such as the Web) to answer questions. +However, such collections of text often contain conflicting information, and +indiscriminately depending on this information may result in untruthful and +inaccurate answers. To understand the gravity of this problem, we collect a +human-annotated dataset, Question Answering with Conflicting Contexts (QACC), +and find that as much as 25% of unambiguous, open domain questions can lead to +conflicting contexts when retrieved using Google Search. We evaluate and +benchmark three powerful Large Language Models (LLMs) with our dataset QACC and +demonstrate their limitations in effectively addressing questions with +conflicting information. To explore how humans reason through conflicting +contexts, we request our annotators to provide explanations for their +selections of correct answers. We demonstrate that by finetuning LLMs to +explain their answers, we can introduce richer information into their training +that guides them through the process of reasoning with conflicting contexts. + +
+
+
+
+
+ + ♻ ☆ Matching Patients to Clinical Trials with Large Language Models + + +
+ Patient recruitment is challenging for clinical trials. We introduce +TrialGPT, an end-to-end framework for zero-shot patient-to-trial matching with +large language models. TrialGPT comprises three modules: it first performs +large-scale filtering to retrieve candidate trials (TrialGPT-Retrieval); then +predicts criterion-level patient eligibility (TrialGPT-Matching); and finally +generates trial-level scores (TrialGPT-Ranking). We evaluate TrialGPT on three +cohorts of 183 synthetic patients with over 75,000 trial annotations. +TrialGPT-Retrieval can recall over 90% of relevant trials using less than 6% of +the initial collection. Manual evaluations on 1,015 patient-criterion pairs +show that TrialGPT-Matching achieves an accuracy of 87.3% with faithful +explanations, close to the expert performance. The TrialGPT-Ranking scores are +highly correlated with human judgments and outperform the best-competing models +by 43.8% in ranking and excluding trials. Furthermore, our user study reveals +that TrialGPT can reduce the screening time by 42.6% in patient recruitment. +Overall, these results have demonstrated promising opportunities for +patient-to-trial matching with TrialGPT. + +
+
+ comment: Nature Communications +
+
+
+
+
+ + ♻ ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs' +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ A Theoretical Understanding of Self-Correction through In-context + Alignment NeurIPS 2024 + + +
+ Going beyond mimicking limited human experiences, recent studies show initial +evidence that, like humans, large language models (LLMs) are capable of +improving their abilities purely by self-correction, i.e., correcting previous +responses through self-examination, in certain circumstances. Nevertheless, +little is known about how such capabilities arise. In this work, based on a +simplified setup akin to an alignment task, we theoretically analyze +self-correction from an in-context learning perspective, showing that when LLMs +give relatively accurate self-examinations as rewards, they are capable of +refining responses in an in-context way. Notably, going beyond previous +theories on over-simplified linear transformers, our theoretical construction +underpins the roles of several key designs of realistic transformers for +self-correction: softmax attention, multi-head attention, and the MLP block. We +validate these findings extensively on synthetic datasets. Inspired by these +findings, we also illustrate novel applications of self-correction, such as +defending against LLM jailbreaks, where a simple self-correction step does make +a large difference. We believe that these findings will inspire further +research on understanding, exploiting, and enhancing self-correction for +building better foundation models. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ptt5-v2: A Closer Look at Continued Pretraining of T5 Models for the + Portuguese Language + + +
+ Despite advancements in Natural Language Processing (NLP) and the growing +availability of pretrained models, the English language remains the primary +focus of model development. Continued pretraining on language-specific corpora +provides a practical solution for adapting models to other languages. However, +the impact of different pretraining settings on downstream tasks remains +underexplored. This work introduces $\texttt{ptt5-v2}$, investigating the +continued pretraining of T5 models for Portuguese. We first develop a baseline +set of settings and pretrain models with sizes up to 3B parameters. Finetuning +on three Portuguese downstream tasks (assin2 STS, assin2 RTE, and TweetSentBR) +yields SOTA results on the latter two. We then explore the effects of different +pretraining configurations, including pretraining data quality, optimization +strategies, and multi-epoch pretraining. Perhaps surprisingly, their impact +remains subtle compared to our baseline. We release $\texttt{ptt5-v2}$ +pretrained checkpoints and their MonoT5-based finetuned $\texttt{MonoPTT5}$ +rerankers on HuggingFace in their respective collections at +https://huggingface.co/unicamp-dl. + +
+
+
+
+
+ + ♻ ☆ Redefining Proactivity for Information Seeking Dialogue + + +
+ Information-Seeking Dialogue (ISD) agents aim to provide accurate responses +to user queries. While proficient in directly addressing user queries, these +agents, as well as LLMs in general, predominantly exhibit reactive behavior, +lacking the ability to generate proactive responses that actively engage users +in sustained conversations. However, existing definitions of proactive dialogue +in this context do not focus on how each response actively engages the user and +sustains the conversation. Hence, we present a new definition of proactivity +that focuses on enhancing the `proactiveness' of each generated response via +the introduction of new information related to the initial query. To this end, +we construct a proactive dialogue dataset comprising 2,000 single-turn +conversations, and introduce several automatic metrics to evaluate response +`proactiveness' which achieved high correlation with human annotation. +Additionally, we introduce two innovative Chain-of-Thought (CoT) prompts, the +3-step CoT and the 3-in-1 CoT prompts, which consistently outperform standard +prompts by up to 90% in the zero-shot setting. + +
+
+
+
+
+ + ♻ ☆ MedCLIP-SAMv2: Towards Universal Text-Driven Medical Image Segmentation + + +
+ Segmentation of anatomical structures and pathological regions in medical +images is essential for modern clinical diagnosis, disease research, and +treatment planning. While significant advancements have been made in deep +learning-based segmentation techniques, many of these methods still suffer from +limitations in data efficiency, generalizability, and interactivity. As a +result, developing precise segmentation methods that require fewer labeled +datasets remains a critical challenge in medical image analysis. Recently, the +introduction of foundation models like CLIP and Segment-Anything-Model (SAM), +with robust cross-domain representations, has paved the way for interactive and +universal image segmentation. However, further exploration of these models for +data-efficient segmentation in medical imaging is still needed and highly +relevant. In this paper, we introduce MedCLIP-SAMv2, a novel framework that +integrates the CLIP and SAM models to perform segmentation on clinical scans +using text prompts, in both zero-shot and weakly supervised settings. Our +approach includes fine-tuning the BiomedCLIP model with a new Decoupled Hard +Negative Noise Contrastive Estimation (DHN-NCE) loss, and leveraging the +Multi-modal Information Bottleneck (M2IB) to create visual prompts for +generating segmentation masks from SAM in the zero-shot setting. We also +investigate using zero-shot segmentation labels within a weakly supervised +paradigm to enhance segmentation quality further. Extensive testing across four +diverse segmentation tasks and medical imaging modalities (breast tumor +ultrasound, brain tumor MRI, lung X-ray, and lung CT) demonstrates the high +accuracy of our proposed framework. Our code is available at +https://github.com/HealthX-Lab/MedCLIP-SAMv2. + +
+
+ comment: 10 pages, 2 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training + Multiplication-Less Reparameterization NeurIPS 2024 + + +
+ Large language models (LLMs) have shown impressive performance on language +tasks but face challenges when deployed on resource-constrained devices due to +their extensive parameters and reliance on dense multiplications, resulting in +high memory demands and latency bottlenecks. Shift-and-add reparameterization +offers a promising solution by replacing costly multiplications with +hardware-friendly primitives in both the attention and multi-layer perceptron +(MLP) layers of an LLM. However, current reparameterization techniques require +training from scratch or full parameter fine-tuning to restore accuracy, which +is resource-intensive for LLMs. To address this, we propose accelerating +pretrained LLMs through post-training shift-and-add reparameterization, +creating efficient multiplication-free models, dubbed ShiftAddLLM. +Specifically, we quantize each weight matrix into binary matrices paired with +group-wise scaling factors. The associated multiplications are reparameterized +into (1) shifts between activations and scaling factors and (2) queries and +adds according to the binary matrices. To reduce accuracy loss, we present a +multi-objective optimization method to minimize both weight and output +activation reparameterization errors. Additionally, based on varying +sensitivity across layers to reparameterization, we develop an automated bit +allocation strategy to further reduce memory usage and latency. Experiments on +five LLM families and eight tasks consistently validate the effectiveness of +ShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points +at comparable or lower latency compared to the most competitive quantized LLMs +at 3 and 2 bits, respectively, and more than 80% memory and energy reductions +over the original LLMs. Codes and models are available at +https://github.com/GATECH-EIC/ShiftAddLLM. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Retrieval-Augmented Personalization for Multimodal Large Language Models + + +
+ The development of large language models (LLMs) has significantly enhanced +the capabilities of multimodal LLMs (MLLMs) as general assistants. However, +lack of user-specific knowledge still restricts their application in human's +daily life. In this paper, we introduce the Retrieval Augmented Personalization +(RAP) framework for MLLMs' personalization. Starting from a general MLLM, we +turn it into a personalized assistant in three steps. (a) Remember: We design a +key-value database to store user-related information, e.g., user's name, avatar +and other attributes. (b) Retrieve: When the user initiates a conversation, RAP +will retrieve relevant information from the database using a multimodal +retriever. (c) Generate: The input query and retrieved concepts' information +are fed into MLLMs to generate personalized, knowledge-augmented responses. +Unlike previous methods, RAP allows real-time concept editing via updating the +external database. To further improve generation quality and alignment with +user-specific information, we design a pipeline for data collection and create +a specialized dataset for personalized training of MLLMs. Based on the dataset, +we train a series of MLLMs as personalized multimodal assistants. By +pretraining on large-scale dataset, RAP-MLLMs can generalize to infinite visual +concepts without additional finetuning. Our models demonstrate outstanding +flexibility and generation quality across a variety of tasks, such as +personalized image captioning, question answering and visual recognition. The +code, data and models are available at https://github.com/Hoar012/RAP-MLLM. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 19 + +
+
+
+ + ☆ PickScan: Object discovery and reconstruction from handheld interactions IROS 2024 + + +
+ Reconstructing compositional 3D representations of scenes, where each object +is represented with its own 3D model, is a highly desirable capability in +robotics and augmented reality. However, most existing methods rely heavily on +strong appearance priors for object discovery, therefore only working on those +classes of objects on which the method has been trained, or do not allow for +object manipulation, which is necessary to scan objects fully and to guide +object discovery in challenging scenarios. We address these limitations with a +novel interaction-guided and class-agnostic method based on object +displacements that allows a user to move around a scene with an RGB-D camera, +hold up objects, and finally outputs one 3D model per held-up object. Our main +contribution to this end is a novel approach to detecting user-object +interactions and extracting the masks of manipulated objects. On a +custom-captured dataset, our pipeline discovers manipulated objects with 78.3% +precision at 100% recall and reconstructs them with a mean chamfer distance of +0.90cm. Compared to Co-Fusion, the only comparable interaction-based and +class-agnostic baseline, this corresponds to a reduction in chamfer distance of +73% while detecting 99% fewer false positives. + +
+
+ comment: 7 pages, 8 figures, published in the 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Robot Metabolism: Towards machines that can grow by consuming other + machines + + +
+ Biological lifeforms can heal, grow, adapt, and reproduce -- abilities +essential for sustained survival and development. In contrast, robots today are +primarily monolithic machines with limited ability to self-repair, physically +develop, or incorporate material from their environments. A key challenge to +such physical adaptation has been that while robot minds are rapidly evolving +new behaviors through AI, their bodies remain closed systems, unable to +systematically integrate new material to grow or heal. We argue that open-ended +physical adaptation is only possible when robots are designed using only a +small repertoire of simple modules. This allows machines to mechanically adapt +by consuming parts from other machines or their surroundings and shedding +broken components. We demonstrate this principle using a truss modular robot +platform composed of one-dimensional actuated bars. We show how robots in this +space can grow bigger, faster, and more capable by consuming materials from +their environment and from other robots. We suggest that machine metabolic +processes akin to the one demonstrated here will be an essential part of any +sustained future robot ecology. + +
+
+ comment: Manuscript combined with Supplementary Materials File for arXiv + submission. Submitting to Journal and will update external DOI once available +
+
+
+
+
+ + ☆ Improving User Experience in Preference-Based Optimization of Reward + Functions for Assistive Robots + + +
+ Assistive robots interact with humans and must adapt to different users' +preferences to be effective. An easy and effective technique to learn +non-expert users' preferences is through rankings of robot behaviors, for +example, robot movement trajectories or gestures. Existing techniques focus on +generating trajectories for users to rank that maximize the outcome of the +preference learning process. However, the generated trajectories do not appear +to reflect the user's preference over repeated interactions. In this work, we +design an algorithm to generate trajectories for users to rank that we call +Covariance Matrix Adaptation Evolution Strategies with Information Gain +(CMA-ES-IG). CMA-ES-IG prioritizes the user's experience of the preference +learning process. We show that users find our algorithm more intuitive and +easier to use than previous approaches across both physical and social robot +tasks. This project's code is hosted at github.com/interaction-lab/CMA-ES-IG + +
+
+ comment: Accepted to ISRR +
+
+
+
+
+ + ☆ Person Segmentation and Action Classification for Multi-Channel + Hemisphere Field of View LiDAR Sensors + + +
+ Robots need to perceive persons in their surroundings for safety and to +interact with them. In this paper, we present a person segmentation and action +classification approach that operates on 3D scans of hemisphere field of view +LiDAR sensors. We recorded a data set with an Ouster OSDome-64 sensor +consisting of scenes where persons perform three different actions and +annotated it. We propose a method based on a MaskDINO model to detect and +segment persons and to recognize their actions from combined spherical +projected multi-channel representations of the LiDAR data with an additional +positional encoding. Our approach demonstrates good performance for the person +segmentation task and further performs well for the estimation of the person +action states walking, waving, and sitting. An ablation study provides insights +about the individual channel contributions for the person segmentation task. +The trained models, code and dataset are made publicly available. + +
+
+ comment: 6 pages, 9 figures, 4 tables, accepted for publication at IEEE/SICE + International Symposium on System Integration (SII), Munich, Germany, January + 2025 +
+
+
+
+
+ + ☆ Emergent Structure in Multi-agent Systems Using Geometric Embeddings + + +
+ This work investigates the self-organization of multi-agent systems into +closed trajectories, a common requirement in unmanned aerial vehicle (UAV) +surveillance tasks. In such scenarios, smooth, unbiased control signals save +energy and mitigate mechanical strain. We propose a decentralized control +system architecture that produces a globally stable emergent structure from +local observations only; there is no requirement for agents to share a global +plan or follow prescribed trajectories. Central to our approach is the +formulation of an injective virtual embedding induced by rotations from the +actual agent positions. This embedding serves as a structure-preserving map +around which all agents stabilize their relative positions and permits the use +of well-established linear control techniques. We construct the embedding such +that it is topologically equivalent to the desired trajectory (i.e., a +homeomorphism), thereby preserving the stability characteristics. We +demonstrate the versatility of this approach through implementation on a swarm +of Quanser QDrone quadcopters. Results demonstrate the quadcopters +self-organize into the desired trajectory while maintaining even separation. + +
+
+
+
+
+ + ☆ EROAM: Event-based Camera Rotational Odometry and Mapping in Real-time + + +
+ This paper presents EROAM, a novel event-based rotational odometry and +mapping system that achieves real-time, accurate camera rotation estimation. +Unlike existing approaches that rely on event generation models or contrast +maximization, EROAM employs a spherical event representation by projecting +events onto a unit sphere and introduces Event Spherical Iterative Closest +Point (ES-ICP), a novel geometric optimization framework designed specifically +for event camera data. The spherical representation simplifies rotational +motion formulation while enabling continuous mapping for enhanced spatial +resolution. Combined with parallel point-to-line optimization, EROAM achieves +efficient computation without compromising accuracy. Extensive experiments on +both synthetic and real-world datasets show that EROAM significantly +outperforms state-of-the-art methods in terms of accuracy, robustness, and +computational efficiency. Our method maintains consistent performance under +challenging conditions, including high angular velocities and extended +sequences, where other methods often fail or show significant drift. +Additionally, EROAM produces high-quality panoramic reconstructions with +preserved fine structural details. + +
+
+
+
+
+ + ☆ Modulating Reservoir Dynamics via Reinforcement Learning for Efficient + Robot Skill Synthesis + + +
+ A random recurrent neural network, called a reservoir, can be used to learn +robot movements conditioned on context inputs that encode task goals. The +learning is achieved by mapping the random dynamics of the reservoir modulated +by context to desired trajectories via linear regression. This makes the +reservoir computing (RC) approach computationally efficient as no iterative +gradient descent learning is needed. In this work, we propose a novel RC-based +Learning from Demonstration (LfD) framework that not only learns to generate +the demonstrated movements but also allows online modulation of the reservoir +dynamics to generate movement trajectories that are not covered by the initial +demonstration set. This is made possible by using a Reinforcement Learning (RL) +module that learns a policy to output context as its actions based on the robot +state. Considering that the context dimension is typically low, learning with +the RL module is very efficient. We show the validity of the proposed model +with systematic experiments on a 2 degrees-of-freedom (DOF) simulated robot +that is taught to reach targets, encoded as context, with and without obstacle +avoidance constraint. The initial data set includes a set of reaching +demonstrations which are learned by the reservoir system. To enable reaching +out-of-distribution targets, the RL module is engaged in learning a policy to +generate dynamic contexts so that the generated trajectory achieves the desired +goal without any learning in the reservoir system. Overall, the proposed model +uses an initial learned motor primitive set to efficiently generate diverse +motor behaviors guided by the designed reward function. Thus the model can be +used as a flexible and effective LfD system where the action repertoire can be +extended without new data collection. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ CropNav: a Framework for Autonomous Navigation in Real Farms ICRA + + +
+ Small robots that can operate under the plant canopy can enable new +possibilities in agriculture. However, unlike larger autonomous tractors, +autonomous navigation for such under-canopy robots remains an open challenge +because Global Navigation Satellite System (GNSS) is unreliable under the plant +canopy. We present a hybrid navigation system that autonomously switches +between different sets of sensing modalities to enable full field navigation, +both inside and outside of crop. By choosing the appropriate path reference +source, the robot can accommodate for loss of GNSS signal quality and leverage +row-crop structure to autonomously navigate. However, such switching can be +tricky and difficult to execute over scale. Our system provides a solution by +automatically switching between an exteroceptive sensing based system, such as +Light Detection And Ranging (LiDAR) row-following navigation and waypoints path +tracking. In addition, we show how our system can detect when the navigation +fails and recover automatically, extending the autonomous time and mitigating +the necessity of human intervention. Our system shows an improvement of about +750 m per intervention over GNSS-based navigation and 500 m over row following +navigation. + +
+
+ comment: Presented in the 2023 IEEE International Conference on Robotics and + Automation (ICRA) +
+
+
+
+
+ + ☆ Avian-Inspired High-Precision Tracking Control for Aerial Manipulators + + +
+ Aerial manipulators, composed of multirotors and robotic arms, have a +structure and function highly reminiscent of avian species. This paper studies +the tracking control problem for aerial manipulators. We propose an avian-inspired +aerial manipulation system, which includes an avian-inspired robotic arm +design, a Recursive Newton-Euler (RNE) method-based nonlinear flight +controller, and a coordinated controller with two modes. Compared to existing +methods, our proposed approach offers several attractive features. First, the +morphological characteristics of avian species are used to determine the size +proportion of the multirotor and the robotic arm in the aerial manipulator. +Second, the dynamic coupling of the aerial manipulator is addressed by the +RNE-based flight controller and a dual-mode coordinated controller. +Specifically, under our proposed algorithm, the aerial manipulator can +stabilize the end-effector's pose, similar to avian head stabilization. The +proposed approach is verified through three numerical experiments. The results +show that even when the quadcopter is disturbed by different forces, the +position error of the end-effector achieves millimeter-level accuracy, and the +attitude error remains within 1 degree. The limitation of this work is not +considering aggressive manipulation like that seen in birds. Addressing this +through future studies that explore real-world experiments will be a key +direction for research. + +
+
+
+
+
+ + ☆ Efficient Estimation of Relaxed Model Parameters for Robust UAV + Trajectory Optimization + + +
+ Online trajectory optimization and optimal control methods are crucial for +enabling sustainable unmanned aerial vehicle (UAV) services, such as +agriculture, environmental monitoring, and transportation, where available +actuation and energy are limited. However, optimal controllers are highly +sensitive to model mismatch, which can occur due to loaded equipment, packages +to be delivered, or pre-existing variability in fundamental structural and +thrust-related parameters. To circumvent this problem, optimal controllers can +be paired with parameter estimators to improve their trajectory planning +performance and perform adaptive control. However, UAV platforms are limited in +terms of onboard processing power, oftentimes making nonlinear parameter +estimation too computationally expensive to consider. To address these issues, +we propose a relaxed, affine-in-parameters multirotor model along with an +efficient optimal parameter estimator. We convexify the nominal Moving Horizon +Parameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via +an affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast +quadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC) +in real time. We compare this approach to the equivalent nonlinear estimator in +Monte Carlo simulations, demonstrating a decrease in average solve time and +trajectory optimality cost by 98.2% and 23.9-56.2%, respectively. + +
+
+ comment: 8 pages, 5 figures, submitted to IEEE Sustech 2025 +
+
+
+
+
+ + ☆ Exciting Contact Modes in Differentiable Simulations for Robot Learning + + +
+ In this paper, we explore an approach to actively plan and excite contact +modes in differentiable simulators as a means to tighten the sim-to-real gap. +We propose an optimal experimental design approach derived from +information-theoretic methods to identify and search for information-rich +contact modes through the use of contact-implicit optimization. We demonstrate +our approach on a robot parameter estimation problem with unknown inertial and +kinematic parameters which actively seeks contacts with a nearby surface. We +show that our approach improves the identification of unknown parameter +estimates over experimental runs by an estimate error reduction of at least +$\sim 84\%$ when compared to a random sampling baseline, with significantly +higher information gains. + +
+
+
+
+
+ + ☆ On-Board Vision-Language Models for Personalized Autonomous Vehicle + Motion Control: System Design and Real-World Validation + + +
+ Personalized driving refers to an autonomous vehicle's ability to adapt its +driving behavior or control strategies to match individual users' preferences +and driving styles while maintaining safety and comfort standards. However, +existing works either fail to capture every individual preference precisely or +become computationally inefficient as the user base expands. Vision-Language +Models (VLMs) offer promising solutions to this front through their natural +language understanding and scene reasoning capabilities. In this work, we +propose a lightweight yet effective on-board VLM framework that provides +low-latency personalized driving performance while maintaining strong reasoning +capabilities. Our solution incorporates a Retrieval-Augmented Generation +(RAG)-based memory module that enables continuous learning of individual +driving preferences through human feedback. Through comprehensive real-world +vehicle deployment and experiments, our system has demonstrated the ability to +provide safe, comfortable, and personalized driving experiences across various +scenarios and significantly reduce takeover rates by up to 76.9%. To the best +of our knowledge, this work represents the first end-to-end VLM-based motion +control system in real-world autonomous vehicles. + +
+
+
+
+
+ + ☆ ModeSeq: Taming Sparse Multimodal Motion Prediction with Sequential Mode + Modeling + + +
+ Anticipating the multimodality of future events lays the foundation for safe +autonomous driving. However, multimodal motion prediction for traffic agents +has been clouded by the lack of multimodal ground truth. Existing works +predominantly adopt the winner-take-all training strategy to tackle this +challenge, yet still suffer from limited trajectory diversity and misaligned +mode confidence. While some approaches address these limitations by generating +excessive trajectory candidates, they necessitate a post-processing stage to +identify the most representative modes, a process lacking universal principles +and compromising trajectory accuracy. We are thus motivated to introduce +ModeSeq, a new multimodal prediction paradigm that models modes as sequences. +Unlike the common practice of decoding multiple plausible trajectories in one +shot, ModeSeq requires motion decoders to infer the next mode step by step, +thereby more explicitly capturing the correlation between modes and +significantly enhancing the ability to reason about multimodality. Leveraging +the inductive bias of sequential mode prediction, we also propose the +Early-Match-Take-All (EMTA) training strategy to diversify the trajectories +further. Without relying on dense mode prediction or rule-based trajectory +selection, ModeSeq considerably improves the diversity of multimodal output +while attaining satisfactory trajectory accuracy, resulting in balanced +performance on motion prediction benchmarks. Moreover, ModeSeq naturally +emerges with the capability of mode extrapolation, which supports forecasting +more behavior modes when the future is highly uncertain. + +
+
+
+
+
+ + ♻ ☆ Motion Before Action: Diffusing Object Motion as Manipulation Condition + + +
+ Inferring object motion representations from observations enhances the +performance of robotic manipulation tasks. This paper introduces a new paradigm +for robot imitation learning that generates action sequences by reasoning about +object motion from visual observations. We propose MBA (Motion Before Action), +a novel module that employs two cascaded diffusion processes for object motion +generation and robot action generation under object motion guidance. MBA first +predicts the future pose sequence of the object based on observations, then +uses this sequence as a condition to guide robot action generation. Designed as +a plug-and-play component, MBA can be flexibly integrated into existing robotic +manipulation policies with diffusion action heads. Extensive experiments in +both simulated and real-world environments demonstrate that our approach +substantially improves the performance of existing policies across a wide range +of manipulation tasks. Project page: https://selen-suyue.github.io/MBApage/ + +
+
+
+
+
+ + ♻ ☆ Collaborative Goal Tracking of Multiple Mobile Robots Based on Geometric + Graph Neural Network + + +
+ Multiple mobile robots play a significant role in various spatially +distributed tasks. In unfamiliar and non-repetitive scenarios, reconstructing +the global map is time-inefficient and sometimes unrealistic. Hence, research +has focused on achieving real-time collaborative planning by utilizing sensor +data from multiple robots located at different positions, all without relying +on a global map. This paper introduces a Multi-Robot collaborative Path Planning +method based on Geometric Graph Neural Network (MRPP-GeoGNN). We extract the +features of each neighboring robot's sensory data and integrate the relative +positions of neighboring robots into each interaction layer to incorporate +obstacle information along with location details using geometric feature +encoders. After that, an MLP layer is used to map the amalgamated local features +to multiple forward directions for the robot's actual movement. We generated +expert data in ROS to train the network and carried out both simulations and +physical experiments to validate the effectiveness of the proposed method. +Simulation results demonstrate an approximate 5% improvement in accuracy +compared to the model based solely on CNN on expert datasets. The success rate +is enhanced by about 4% compared to CNN, and the flowtime increase is reduced +by approximately 18% in the ROS test, surpassing other GNN models. Besides, the +proposed method is able to leverage neighbors' information and greatly improves +path efficiency in real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ Gathering on a Circle with Limited Visibility by Anonymous Oblivious + Robots + + +
+ A swarm of anonymous oblivious mobile robots, operating in deterministic +Look-Compute-Move cycles, is confined within a circular track. All robots agree +on the clockwise direction (chirality), they are activated by an adversarial +semi-synchronous scheduler (SSYNCH), and an active robot always reaches the +destination point it computes (rigidity). Robots have limited visibility: each +robot can see only the points on the circle that have an angular distance +strictly smaller than a constant $\vartheta$ from the robot's current location, +where $0<\vartheta\leq\pi$ (angles are expressed in radians). + We study the Gathering problem for such a swarm of robots: that is, all +robots are initially in distinct locations on the circle, and their task is to +reach the same point on the circle in a finite number of turns, regardless of +the way they are activated by the scheduler. Note that, due to the anonymity of +the robots, this task is impossible if the initial configuration is +rotationally symmetric; hence, we have to make the assumption that the initial +configuration be rotationally asymmetric. + We prove that, if $\vartheta=\pi$ (i.e., each robot can see the entire circle +except its antipodal point), there is a distributed algorithm that solves the +Gathering problem for swarms of any size. By contrast, we also prove that, if +$\vartheta\leq \pi/2$, no distributed algorithm solves the Gathering problem, +regardless of the size of the swarm, even under the assumption that the initial +configuration is rotationally asymmetric and the visibility graph of the robots +is connected. + The latter impossibility result relies on a probabilistic technique based on +random perturbations, which is novel in the context of anonymous mobile robots. +Such a technique is of independent interest, and immediately applies to other +Pattern-Formation problems. + +
+
+ comment: 34 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Adverse Weather-Immune Semantic Segmentation with Unfolded + Regularization and Foundation Model Knowledge Distillation for Autonomous + Driving + + +
+ Various adverse weather conditions pose a significant challenge to autonomous +driving (AD) street scene semantic understanding (segmentation). A common +strategy is to minimize the disparity between images captured in clear and +adverse weather conditions. However, this technique typically relies on +utilizing a clear image as a reference, which is challenging to obtain in +practice. Furthermore, this method typically targets a single adverse +condition, and thus performs poorly when confronting a mixture of multiple +adverse weather conditions. To address these issues, we introduce a +reference-free and Adverse weather-Immune scheme (called AdvImmu) that +leverages the invariance of weather conditions over short periods (seconds). +Specifically, AdvImmu includes three components: Locally Sequential Mechanism +(LSM), Globally Shuffled Mechanism (GSM), and Unfolded Regularizers (URs). LSM +leverages temporal correlations between adjacent frames to enhance model +performance. GSM is proposed to shuffle LSM segments to prevent overfitting of +temporal patterns. URs are the deep unfolding implementation of two proposed +regularizers to penalize the model complexity to enhance across-weather +generalization. In addition, to overcome the over-reliance on consecutive +frame-wise annotations in the training of AdvImmu (typically unavailable in AD +scenarios), we incorporate a foundation model named Segment Anything Model +(SAM) to assist in annotating frames, and additionally propose a cluster +algorithm (denoted as SBICAC) to surmount SAM's category-agnostic issue to +generate pseudo-labels. Extensive experiments demonstrate that the proposed +AdvImmu outperforms existing state-of-the-art methods by 88.56% in mean +Intersection over Union (mIoU). + +
+
+ comment: 16 Pages +
+
+
+
+
+ + ♻ ☆ Designs for Enabling Collaboration in Human-Machine Teaming via + Interactive and Explainable Systems + + +
+ Collaborative robots and machine learning-based virtual agents are +increasingly entering the human workspace with the aim of increasing +productivity and enhancing safety. Despite this, we show in a ubiquitous +experimental domain, Overcooked-AI, that state-of-the-art techniques for +human-machine teaming (HMT), which rely on imitation or reinforcement learning, +are brittle and result in a machine agent that aims to decouple the machine and +human's actions to act independently rather than in a synergistic fashion. To +remedy this deficiency, we develop HMT approaches that enable iterative, +mixed-initiative team development allowing end-users to interactively reprogram +interpretable AI teammates. Our 50-subject study provides several findings that +we summarize into guidelines. While all approaches underperform a simple +collaborative heuristic (a critical, negative result for learning-based +methods), we find that white-box approaches supported by interactive +modification can lead to significant team development, outperforming white-box +approaches alone, and that black-box approaches are easier to train and result +in better HMT performance highlighting a tradeoff between explainability and +interactivity versus ease-of-training. Together, these findings present three +important future research directions: 1) Improving the ability to generate +collaborative agents with white-box models, 2) Better learning methods to +facilitate collaboration rather than individualized coordination, and 3) +Mixed-initiative interfaces that enable users, who may vary in ability, to +improve collaboration. + +
+
+
+
+
+ + ♻ ☆ NeuPAN: Direct Point Robot Navigation with End-to-End Model-based + Learning + + +
+ Navigating a nonholonomic robot in a cluttered environment requires extremely +accurate perception and locomotion for collision avoidance. This paper presents +NeuPAN: a real-time, highly-accurate, map-free, robot-agnostic, and +environment-invariant robot navigation solution. Leveraging a tightly-coupled +perception-locomotion framework, NeuPAN has two key innovations compared to +existing approaches: 1) it directly maps raw points to a learned multi-frame +distance space, avoiding error propagation from perception to control; 2) it is +interpretable from an end-to-end model-based learning perspective, enabling +provable convergence. The crux of NeuPAN is to solve a high-dimensional +end-to-end mathematical model with various point-level constraints using the +plug-and-play (PnP) proximal alternating-minimization network (PAN) with +neurons in the loop. This allows NeuPAN to generate real-time, end-to-end, +physically-interpretable motions directly from point clouds, which seamlessly +integrates data- and knowledge-engines, where its network parameters are +adjusted via back propagation. We evaluate NeuPAN on car-like robot, +wheel-legged robot, and passenger autonomous vehicle, in both simulated and +real-world environments. Experiments demonstrate that NeuPAN outperforms +various benchmarks, in terms of accuracy, efficiency, robustness, and +generalization capability across various environments, including the cluttered +sandbox, office, corridor, and parking lot. We show that NeuPAN works well in +unstructured environments with arbitrary-shape undetectable objects, making +impassable ways passable. + +
+
+ comment: revision in TRO; project website: + https://hanruihua.github.io/neupan_project/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 30 + +
+
+
+ + ☆ BVI-CR: A Multi-View Human Dataset for Volumetric Video Compression + + +
+ The advances in immersive technologies and 3D reconstruction have enabled the +creation of digital replicas of real-world objects and environments with fine +details. These processes generate vast amounts of 3D data, requiring more +efficient compression methods to satisfy the memory and bandwidth constraints +associated with data storage and transmission. However, the development and +validation of efficient 3D data compression methods are constrained by the lack +of comprehensive and high-quality volumetric video datasets, which typically +require much more effort to acquire and consume increased resources compared to +2D image and video databases. To bridge this gap, we present an open multi-view +volumetric human dataset, denoted BVI-CR, which contains 18 multi-view RGB-D +captures and their corresponding textured polygonal meshes, depicting a range +of diverse human actions. Each video sequence contains 10 views in 1080p +resolution with durations between 10-15 seconds at 30FPS. Using BVI-CR, we +benchmarked three conventional and neural coordinate-based multi-view video +compression methods, following the MPEG MIV Common Test Conditions, and +reported their rate quality performance based on various quality metrics. The +results show the great potential of neural representation based methods in +volumetric video compression compared to conventional video coding methods +(with an up to 38% average coding gain in PSNR). This dataset provides a +development and validation platform for a variety of tasks including volumetric +reconstruction, compression, and quality assessment. The database will be +shared publicly at https://github.com/fan-aaron-zhang/bvi-cr. + +
+
+
+
+
+ + ☆ PickScan: Object discovery and reconstruction from handheld interactions IROS 2024 + + +
+ Reconstructing compositional 3D representations of scenes, where each object +is represented with its own 3D model, is a highly desirable capability in +robotics and augmented reality. However, most existing methods rely heavily on +strong appearance priors for object discovery, therefore only working on those +classes of objects on which the method has been trained, or do not allow for +object manipulation, which is necessary to scan objects fully and to guide +object discovery in challenging scenarios. We address these limitations with a +novel interaction-guided and class-agnostic method based on object +displacements that allows a user to move around a scene with an RGB-D camera, +hold up objects, and finally outputs one 3D model per held-up object. Our main +contribution to this end is a novel approach to detecting user-object +interactions and extracting the masks of manipulated objects. On a +custom-captured dataset, our pipeline discovers manipulated objects with 78.3% +precision at 100% recall and reconstructs them with a mean chamfer distance of +0.90cm. Compared to Co-Fusion, the only comparable interaction-based and +class-agnostic baseline, this corresponds to a reduction in chamfer distance of +73% while detecting 99% fewer false positives. + +
+
+ comment: 7 pages, 8 figures, published in the 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ DeepSPV: An Interpretable Deep Learning Pipeline for 3D Spleen Volume + Estimation from 2D Ultrasound Images + + +
+ Splenomegaly, the enlargement of the spleen, is an important clinical +indicator for various associated medical conditions, such as sickle cell +disease (SCD). Spleen length measured from 2D ultrasound is the most widely +used metric for characterising spleen size. However, it is still considered a +surrogate measure, and spleen volume remains the gold standard for assessing +spleen size. Accurate spleen volume measurement typically requires 3D imaging +modalities, such as computed tomography or magnetic resonance imaging, but +these are not widely available, especially in the Global South which has a high +prevalence of SCD. In this work, we introduce a deep learning pipeline, +DeepSPV, for precise spleen volume estimation from single or dual 2D ultrasound +images. The pipeline involves a segmentation network and a variational +autoencoder for learning low-dimensional representations from the estimated +segmentations. We investigate three approaches for spleen volume estimation and +our best model achieves 86.62%/92.5% mean relative volume accuracy (MRVA) under +single-view/dual-view settings, surpassing the performance of human experts. In +addition, the pipeline can provide confidence intervals for the volume +estimates as well as offering benefits in terms of interpretability, which +further support clinicians in decision-making when identifying splenomegaly. We +evaluate the full pipeline using a highly realistic synthetic dataset generated +by a diffusion model, achieving an overall MRVA of 83.0% from a single 2D +ultrasound image. Our proposed DeepSPV is the first work to use deep learning +to estimate 3D spleen volume from 2D ultrasound images and can be seamlessly +integrated into the current clinical workflow for spleen assessment. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.08038 +
+
+
+
+
+ + ☆ Freqformer: Frequency-Domain Transformer for 3-D Visualization and + Quantification of Human Retinal Circulation + + +
+ We introduce Freqformer, a novel Transformer-based architecture designed for +3-D, high-definition visualization of human retinal circulation from a single +scan in commercial optical coherence tomography angiography (OCTA). Freqformer +addresses the challenge of limited signal-to-noise ratio in OCTA volume by +utilizing a complex-valued frequency-domain module (CFDM) and a simplified +multi-head attention (Sim-MHA) mechanism. Using merged volumes as ground truth, +Freqformer enables accurate reconstruction of retinal vasculature across the +depth planes, allowing for 3-D quantification of capillary segments (count, +density, and length). Our method outperforms state-of-the-art convolutional +neural networks (CNNs) and several Transformer-based models, with superior +performance in peak signal-to-noise ratio (PSNR), structural similarity index +measure (SSIM), and learned perceptual image patch similarity (LPIPS). +Furthermore, Freqformer demonstrates excellent generalizability across lower +scanning density, effectively enhancing OCTA scans with larger fields of view +(from 3$\times$3 $mm^{2}$ to 6$\times$6 $mm^{2}$ and 12$\times$12 $mm^{2}$). +These results suggest that Freqformer can significantly improve the +understanding and characterization of retinal circulation, offering potential +clinical applications in diagnosing and managing retinal vascular diseases. + +
+
+
+
+
+ + ☆ Enhanced Anime Image Generation Using USE-CMHSA-GAN + + +
+ With the growing popularity of ACG (Anime, Comics, and Games) culture, +generating high-quality anime character images has become an important research +topic. This paper introduces a novel Generative Adversarial Network model, +USE-CMHSA-GAN, designed to produce high-quality anime character images. The +model builds upon the traditional DCGAN framework, incorporating USE and CMHSA +modules to enhance feature extraction capabilities for anime character images. +Experiments were conducted on the anime-face-dataset, and the results +demonstrate that USE-CMHSA-GAN outperforms other benchmark models, including +DCGAN, VAE-GAN, and WGAN, in terms of FID and IS scores, indicating superior +image quality. These findings suggest that USE-CMHSA-GAN is highly effective +for anime character image generation and provides new insights for further +improving the quality of generative models. + +
+
+
+
+
+ + ☆ RPN 2: On Interdependence Function Learning Towards Unifying and + Advancing CNN, RNN, GNN, and Transformer + + +
+ This paper builds upon our previous work on the Reconciled Polynomial Network +(RPN). The original RPN model was designed under the assumption of input data +independence, presuming the independence among both individual instances within +data batches and attributes in each data instance. However, this assumption +often proves invalid for function learning tasks involving complex, +interdependent data such as language, images, time series, and graphs. Ignoring +such data interdependence may inevitably lead to significant performance +degradation. + To overcome these limitations, we introduce the new Reconciled Polynomial +Network (version 2), namely RPN 2, in this paper. By incorporating data and +structural interdependence functions, RPN 2 explicitly models data +interdependence via new component functions in its architecture. + This enhancement not only significantly improves RPN 2's learning performance +but also substantially expands its unifying potential, enabling it to encompass +a broader range of contemporary dominant backbone models within its canonical +representation. These backbones include, but are not limited to, convolutional +neural networks (CNNs), recurrent neural networks (RNNs), graph neural networks +(GNNs), and Transformers. Our analysis reveals that the fundamental +distinctions among these backbone models primarily stem from their diverse +approaches to defining the interdependence functions. Furthermore, this unified +representation opens up new opportunities for designing innovative +architectures with the potential to surpass the performance of these dominant +backbones. + +
+
+ comment: 105 pages, 37 figures, 6 tables, preprint version +
+
+
+
+
+ + ☆ Person Segmentation and Action Classification for Multi-Channel + Hemisphere Field of View LiDAR Sensors + + +
+ Robots need to perceive persons in their surroundings for safety and to +interact with them. In this paper, we present a person segmentation and action +classification approach that operates on 3D scans of hemisphere field of view +LiDAR sensors. We recorded a data set with an Ouster OSDome-64 sensor +consisting of scenes where persons perform three different actions and +annotated it. We propose a method based on a MaskDINO model to detect and +segment persons and to recognize their actions from combined spherical +projected multi-channel representations of the LiDAR data with an additional +positional encoding. Our approach demonstrates good performance for the person +segmentation task and further performs well for the estimation of the person +action states walking, waving, and sitting. An ablation study provides insights +about the individual channel contributions for the person segmentation task. +The trained models, code and dataset are made publicly available. + +
+
+ comment: 6 pages, 9 figures, 4 tables, accepted for publication at IEEE/SICE + International Symposium on System Integration (SII), Munich, Germany, January + 2025 +
+
+
+
+
+ + ☆ A Comprehensive Survey on Visual Question Answering Datasets and + Algorithms + + +
+ Visual question answering (VQA) refers to the problem where, given an image +and a natural language question about the image, a correct natural language +answer has to be generated. A VQA model has to demonstrate both the visual +understanding of the image and the semantic understanding of the question, +demonstrating reasoning capability. Since the inception of this field, a +plethora of VQA datasets and models have been published. In this article, we +meticulously analyze the current state of VQA datasets and models, while +cleanly dividing them into distinct categories and then summarizing the +methodologies and characteristics of each category. We divide VQA datasets into +four categories: (1) available datasets that contain a rich collection of +authentic images, (2) synthetic datasets that contain only synthetic images +produced through artificial means, (3) diagnostic datasets that are specially +designed to test model performance in a particular area, e.g., understanding +the scene text, and (4) KB (Knowledge-Based) datasets that are designed to +measure a model's ability to utilize outside knowledge. Concurrently, we +explore six main paradigms of VQA models: fusion, where we discuss different +methods of fusing information between visual and textual modalities; attention, +the technique of using information from one modality to filter information from +another; external knowledge base, where we discuss different models utilizing +outside information; composition or reasoning, where we analyze techniques to +answer advanced questions that require complex reasoning steps; explanation, +which is the process of generating visual and textual descriptions to verify +sound reasoning; and graph models, which encode and manipulate relationships +through nodes in a graph. We also discuss some miscellaneous topics, such as +scene text understanding, counting, and bias reduction. + +
+
+
+
+
+ + ☆ Oscillation Inversion: Understand the structure of Large Flow Model + through the Lens of Inversion Method + + +
+ We explore the oscillatory behavior observed in inversion methods applied to +large-scale text-to-image diffusion models, with a focus on the "Flux" model. +By employing a fixed-point-inspired iterative approach to invert real-world +images, we observe that the solution does not achieve convergence, instead +oscillating between distinct clusters. Through both toy experiments and +real-world diffusion models, we demonstrate that these oscillating clusters +exhibit notable semantic coherence. We offer theoretical insights, showing that +this behavior arises from oscillatory dynamics in rectified flow models. +Building on this understanding, we introduce a simple and fast distribution +transfer technique that facilitates image enhancement, stroke-based recoloring, +as well as visual prompt-guided image editing. Furthermore, we provide +quantitative results demonstrating the effectiveness of our method for tasks +such as image enhancement, makeup transfer, reconstruction quality, and guided +sampling quality. Higher-quality examples of videos and images are available at +https://yanyanzheng96.github.io/oscillation_inversion/. + +
+
+
+
+
+ + ☆ DBF-Net: A Dual-Branch Network with Feature Fusion for Ultrasound Image + Segmentation + + +
+ Accurately segmenting lesions in ultrasound images is challenging due to the +difficulty in distinguishing boundaries between lesions and surrounding +tissues. While deep learning has improved segmentation accuracy, there is +limited focus on boundary quality and its relationship with body structures. To +address this, we introduce UBBS-Net, a dual-branch deep neural network that +learns the relationship between body and boundary for improved segmentation. We +also propose a feature fusion module to integrate body and boundary +information. Evaluated on three public datasets, UBBS-Net outperforms existing +methods, achieving Dice Similarity Coefficients of 81.05% for breast cancer, +76.41% for brachial plexus nerves, and 87.75% for infantile hemangioma +segmentation. Our results demonstrate the effectiveness of UBBS-Net for +ultrasound image segmentation. The code is available at +https://github.com/apple1986/DBF-Net. + +
+
+
+
+
+ + ☆ Retinal Vessel Segmentation via Neuron Programming + + +
+ The accurate segmentation of retinal blood vessels plays a crucial role in +the early diagnosis and treatment of various ophthalmic diseases. Designing a +network model for this task requires meticulous tuning and extensive +experimentation to handle the tiny and intertwined morphology of retinal blood +vessels. To tackle this challenge, Neural Architecture Search (NAS) methods are +developed to fully explore the space of potential network architectures and go +after the most powerful one. Inspired by neuronal diversity which is the +biological foundation of all kinds of intelligent behaviors in our brain, this +paper introduces a novel and foundational approach to neural network design, +termed "neuron programming", to automatically search neuronal types into a +network to enhance a network's representation ability at the neuronal level, +which is complementary to architecture-level enhancement done by NAS. +Additionally, to mitigate the time and computational intensity of neuron +programming, we develop a hypernetwork that leverages the search-derived +architectural information to predict optimal neuronal configurations. +Comprehensive experiments validate that neuron programming can achieve +competitive performance in retinal blood vessel segmentation, demonstrating the strong +potential of neuronal diversity in medical image analysis. + +
+
+
+
+
+ + ☆ Label Sharing Incremental Learning Framework for Independent Multi-Label + Segmentation Tasks + + +
+ In a setting where segmentation models have to be built for multiple +datasets, each with its own corresponding label set, a straightforward way is +to learn one model for every dataset and its labels. Alternatively, multi-task +architectures with shared encoders and multiple segmentation heads or shared +weights with compound labels can also be made use of. This work proposes a +novel label sharing framework where a shared common label space is constructed +and each of the individual label sets is systematically mapped to the common +labels. This transforms multiple datasets with disparate label sets into a +single large dataset with shared labels, and therefore all the segmentation +tasks can be addressed by learning a single model. This eliminates the need for +task specific adaptations in network architectures and also results in +parameter and data efficient models. Furthermore, the label sharing framework is +naturally amenable to incremental learning where segmentations for new +datasets can be easily learnt. We experimentally validate our method on various +medical image segmentation datasets, each involving multi-label segmentation. +Furthermore, we demonstrate the efficacy of the proposed method in terms of +performance and incremental learning ability vis-a-vis alternative methods. + +
+
+
+
+
+ + ☆ MolParser: End-to-end Visual Recognition of Molecule Structures in the + Wild + + +
+ In recent decades, chemistry publications and patents have increased rapidly. +A significant portion of key information is embedded in molecular structure +figures, complicating large-scale literature searches and limiting the +application of large language models in fields such as biology, chemistry, and +pharmaceuticals. The automatic extraction of precise chemical structures is of +critical importance. However, the presence of numerous Markush structures in +real-world documents, along with variations in molecular image quality, drawing +styles, and noise, significantly limits the performance of existing optical +chemical structure recognition (OCSR) methods. We present MolParser, a novel +end-to-end OCSR method that efficiently and accurately recognizes chemical +structures from real-world documents, including difficult Markush structures. We +use an extended SMILES encoding rule to annotate our training dataset. Under +this rule, we build MolParser-7M, the largest annotated molecular image dataset +to our knowledge. While utilizing a large amount of synthetic data, we employed +active learning methods to incorporate substantial in-the-wild data, +specifically samples cropped from real patents and scientific literature, into +the training process. We trained an end-to-end molecular image captioning +model, MolParser, using a curriculum learning approach. MolParser significantly +outperforms classical and learning-based methods across most scenarios, with +potential for broader downstream applications. The dataset is publicly +available. + +
+
+
+
+
+ + ☆ D-Cube: Exploiting Hyper-Features of Diffusion Model for Robust Medical + Classification + + +
+ The integration of deep learning technologies in medical imaging aims to +enhance the efficiency and accuracy of cancer diagnosis, particularly for +pancreatic and breast cancers, which present significant diagnostic challenges +due to their high mortality rates and complex imaging characteristics. This +paper introduces Diffusion-Driven Diagnosis (D-Cube), a novel approach that +leverages hyper-features from a diffusion model combined with contrastive +learning to improve cancer diagnosis. D-Cube employs advanced feature selection +techniques that utilize the robust representational capabilities of diffusion +models, enhancing classification performance on medical datasets under +challenging conditions such as data imbalance and limited sample availability. +The feature selection process optimizes the extraction of clinically relevant +features, significantly improving classification accuracy and demonstrating +resilience in imbalanced and limited data scenarios. Experimental results +validate the effectiveness of D-Cube across multiple medical imaging +modalities, including CT, MRI, and X-ray, showing superior performance compared +to existing baseline models. D-Cube represents a new strategy in cancer +detection, employing advanced deep learning techniques to achieve +state-of-the-art diagnostic accuracy and efficiency. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ STOP: Spatiotemporal Orthogonal Propagation for Weight-Threshold-Leakage + Synergistic Training of Deep Spiking Neural Networks + + +
+ The prevalence of artificial intelligence-of-things calls for higher +energy-efficient edge computing paradigms, such as neuromorphic agents +leveraging brain-inspired spiking neural network (SNN) models based on +spatiotemporally sparse binary activations. However, the lack of efficient and +high-accuracy deep SNN learning algorithms prevents them from practical edge +deployments with a strictly bounded cost. In this paper, we propose a +spatiotemporal orthogonal propagation (STOP) algorithm to tackle this challenge. +Our algorithm enables fully synergistic learning of synaptic weights as well as +firing thresholds and leakage factors in spiking neurons to improve SNN +accuracy, while under a unified temporally-forward trace-based framework to +mitigate the huge memory requirement for storing neural states of all +time-steps in the forward pass. Characteristically, the spatially-backward +neuronal errors and temporally-forward traces propagate orthogonally to and +independently of each other, substantially reducing computational overhead. Our +STOP algorithm obtained high recognition accuracies of 99.53%, 94.84%, 74.92%, +98.26% and 77.10% on the MNIST, CIFAR-10, CIFAR-100, DVS-Gesture and +DVS-CIFAR10 datasets with adequate SNNs of intermediate scales from LeNet-5 to +ResNet-18. Compared with other deep SNN training works, our method is more +plausible for edge intelligent scenarios where resources are limited but +high-accuracy in-situ learning is desired. + +
+
+ comment: 13 pages (exclude supplementary), 5 figures +
+
+
+
+
+ + ☆ Electrostatic Force Regularization for Neural Structured Pruning + + +
+ The demand for deploying deep convolutional neural networks (DCNNs) on +resource-constrained devices for real-time applications remains substantial. +However, existing state-of-the-art structured pruning methods often involve +intricate implementations, require modifications to the original network +architectures, and necessitate an extensive fine-tuning phase. To overcome +these challenges, we propose a novel method that, for the first time, +incorporates the concepts of charge and electrostatic force from physics into +the training process of DCNNs. The magnitude of this force is directly +proportional to the product of the charges of the convolution filter and the +source filter, and inversely proportional to the square of the distance between +them. We applied this electrostatic-like force to the convolution filters, +either attracting filters with opposite charges toward non-zero weights or +repelling filters with like charges toward zero weights. Consequently, filters +subject to repulsive forces have their weights reduced to zero, enabling their +removal, while the attractive forces preserve filters with significant weights +that retain information. Unlike conventional methods, our approach is +straightforward to implement, does not require any architectural modifications, +and simultaneously optimizes weights and ranks filter importance, all without +the need for extensive fine-tuning. We validated the efficacy of our method on +modern DCNN architectures using the MNIST, CIFAR, and ImageNet datasets, +achieving competitive performance compared to existing structured pruning +approaches. + +
+
+
+
+
+ + ☆ Skeleton-Guided Spatial-Temporal Feature Learning for Video-Based + Visible-Infrared Person Re-Identification + + +
+ Video-based visible-infrared person re-identification (VVI-ReID) is +challenging due to significant modality feature discrepancies. Spatial-temporal +information in videos is crucial, but the accuracy of spatial-temporal +information is often influenced by issues like low quality and occlusions in +videos. Existing methods mainly focus on reducing modality differences, but pay +limited attention to improving spatial-temporal features, particularly for +infrared videos. To address this, we propose a novel Skeleton-guided +spatial-Temporal feAture leaRning (STAR) method for VVI-ReID. By using skeleton +information, which is robust to issues such as poor image quality and +occlusions, STAR improves the accuracy of spatial-temporal features in videos +of both modalities. Specifically, STAR employs two levels of skeleton-guided +strategies: frame level and sequence level. At the frame level, the robust +structured skeleton information is used to refine the visual features of +individual frames. At the sequence level, we design a feature aggregation +mechanism based on skeleton key points graph, which learns the contribution of +different body parts to spatial-temporal features, further enhancing the +accuracy of global features. Experiments on benchmark datasets demonstrate that +STAR outperforms state-of-the-art methods. Code will be open source soon. + +
+
+
+
+
+ + ☆ TS-LLaVA: Constructing Visual Tokens through Thumbnail-and-Sampling for + Training-Free Video Large Language Models + + +
+ Recent advances in multimodal Large Language Models (LLMs) have shown great +success in understanding multi-modal contents. For video understanding tasks, +training-based video LLMs are difficult to build due to the scarcity of +high-quality, curated video-text paired data. In contrast, paired image-text +data are much easier to obtain, and there is substantial similarity between +images and videos. Consequently, extending image LLMs for video understanding +tasks presents an appealing alternative. Developing effective strategies for +compressing visual tokens from multiple frames is a promising way to leverage +the powerful pre-trained image LLM. In this work, we explore the limitations of +the existing compression strategies for building a training-free video LLM. The +findings lead to our method TS-LLaVA, which constructs visual tokens through a +Thumbnail-and-Sampling strategy. Given a video, we select few equidistant +frames from all input frames to construct a Thumbnail image as a detailed +visual cue, complemented by Sampled visual tokens from all input frames. Our +method establishes the new state-of-the-art performance among training-free +video LLMs on various benchmarks. Notably, our 34B model outperforms GPT-4V on +the MVBench benchmark, and achieves performance comparable to the 72B +training-based video LLM, Video-LLaMA2, on the challenging MLVU benchmark. Code +is available at https://github.com/tingyu215/TS-LLaVA. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Learned Scalable Video Coding For Humans and Machines + + +
+ Video coding has traditionally been developed to support services such as +video streaming, videoconferencing, digital TV, and so on. The main intent was +to enable human viewing of the encoded content. However, with the advances in +deep neural networks (DNNs), encoded video is increasingly being used for +automatic video analytics performed by machines. In applications such as +automatic traffic monitoring, analytics such as vehicle detection, tracking and +counting, would run continuously, while human viewing could be required +occasionally to review potential incidents. To support such applications, a new +paradigm for video coding is needed that will facilitate efficient +representation and compression of video for both machine and human use in a +scalable manner. In this manuscript, we introduce an end-to-end learnable video +codec that supports a machine vision task in its base layer, while its +enhancement layer, together with the base layer, supports input reconstruction +for human viewing. The proposed system is constructed based on the concept of +conditional coding to achieve better compression gains. Comprehensive +experimental evaluations conducted on four standard video datasets demonstrate +that our framework outperforms both state-of-the-art learned and conventional +video codecs in its base layer, while maintaining comparable performance on the +human vision task in its enhancement layer. Implementation of the proposed +system is available at https://github.com/hadipardis/svc + +
+
+ comment: 18 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ OOD-SEG: Out-Of-Distribution detection for image SEGmentation with + sparse multi-class positive-only annotations + + +
+ Despite significant advancements, segmentation based on deep neural networks +in medical and surgical imaging faces several challenges, two of which we aim +to address in this work. First, acquiring complete pixel-level segmentation +labels for medical images is time-consuming and requires domain expertise. +Second, typical segmentation pipelines cannot detect out-of-distribution (OOD) +pixels, leaving them prone to spurious outputs during deployment. In this work, +we propose a novel segmentation approach exploiting OOD detection that learns +only from sparsely annotated pixels from multiple positive-only classes. These +multi-class positive annotations naturally fall within the in-distribution (ID) +set. Unlabelled pixels may contain positive classes but also negative ones, +including what is typically referred to as \emph{background} in standard +segmentation formulations. Here, we forgo the need for background annotation +and consider these together with any other unseen classes as part of the OOD +set. Our framework can integrate, at a pixel-level, any OOD detection +approaches designed for classification tasks. To address the lack of existing +OOD datasets and established evaluation metric for medical image segmentation, +we propose a cross-validation strategy that treats held-out labelled classes as +OOD. Extensive experiments on both multi-class hyperspectral and RGB surgical +imaging datasets demonstrate the robustness and generalisation capability of +our proposed framework. + +
+
+
+
+
+ + ♻ ☆ Evaluating Representations with Readout Model Switching + + +
+ Although much of the success of Deep Learning builds on learning good +representations, a rigorous method to evaluate their quality is lacking. In +this paper, we treat the evaluation of representations as a model selection +problem and propose to use the Minimum Description Length (MDL) principle to +devise an evaluation metric. Contrary to the established practice of limiting +the capacity of the readout model, we design a hybrid discrete and +continuous-valued model space for the readout models and employ a switching +strategy to combine their predictions. The MDL score takes model complexity, as +well as data efficiency into account. As a result, the most appropriate model +for the specific task and representation will be chosen, making it a unified +measure for comparison. The proposed metric can be efficiently computed with an +online method and we present results for pre-trained vision encoders of various +architectures (ResNet and ViT) and objective functions (supervised and +self-supervised) on a range of downstream tasks. We compare our methods with +accuracy-based approaches and show that the latter are inconsistent when +multiple readout models are used. Finally, we discuss important properties +revealed by our evaluations such as model scaling, preferred readout model, and +data efficiency. + +
+
+
+
+
+ + ♻ ☆ Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success + using Knowledge-infused Learning + + +
+ The digital landscape continually evolves with multimodality, enriching the +online experience for users. Creators and marketers aim to weave subtle +contextual cues from various modalities into congruent content to engage users +with a harmonious message. This interplay of multimodal cues is often a crucial +factor in attracting users' attention. However, this richness of multimodality +presents a challenge to computational modeling, as the semantic contextual cues +spanning across modalities need to be unified to capture the true holistic +meaning of the multimodal content. This contextual meaning is critical in +attracting user engagement as it conveys the intended message of the brand or +the organization. In this work, we incorporate external commonsense knowledge +from knowledge graphs to enhance the representation of multimodal data using +compact Visual Language Models (VLMs) and predict the success of multi-modal +crowdfunding campaigns. Our results show that external knowledge commonsense +bridges the semantic gap between text and image modalities, and the enhanced +knowledge-infused representations improve the predictive performance of models +for campaign success upon the baselines without knowledge. Our findings +highlight the significance of contextual congruence in online multimodal +content for engaging and successful crowdfunding campaigns. + +
+
+ comment: Accepted at IEEE International Conference on Big Data 2024 (IEEE + BigData 2024) +
+
+
+
+
+ + ♻ ☆ ConvMixFormer- A Resource-efficient Convolution Mixer for + Transformer-based Dynamic Hand Gesture Recognition + + +
+ Transformer models have demonstrated remarkable success in many domains such +as natural language processing (NLP) and computer vision. With the growing +interest in transformer-based architectures, they are now utilized for gesture +recognition. So, we also explore and devise a novel ConvMixFormer architecture +for dynamic hand gestures. The transformers use quadratic scaling of the +attention features with the sequential data, due to which these models are +computationally complex and heavy. We have considered this drawback of the +transformer and designed a resource-efficient model that replaces the +self-attention in the transformer with the simple convolutional layer-based +token mixer. The computational cost and the parameters used for the +convolution-based mixer are comparatively less than the quadratic +self-attention. Convolution-mixer helps the model capture the local spatial +features that self-attention struggles to capture due to their sequential +processing nature. Further, an efficient gate mechanism is employed instead of +a conventional feed-forward network in the transformer to help the model +control the flow of features within different stages of the proposed model. +This design uses fewer learnable parameters which is nearly half the vanilla +transformer that helps in fast and efficient training. The proposed method is +evaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has +achieved state-of-the-art results on single and multimodal inputs. We have also +shown the parameter efficiency of the proposed ConvMixFormer model compared to +other methods. The source code is available at +https://github.com/mallikagarg/ConvMixFormer. + +
+
+
+
+
+ + ♻ ☆ Leveraging Bi-Focal Perspectives and Granular Feature Integration for + Accurate Reliable Early Alzheimer's Detection + + +
+ Alzheimer's disease (AD) is the most common neurodegeneration, annually +diagnosed in millions of patients. The present medicine scenario still finds +challenges in the exact diagnosis and classification of AD through neuroimaging +data. Traditional CNNs can extract a good amount of low-level information in an +image but fail to extract high-level minuscule particles, which is a +significant challenge in detecting AD from MRI scans. To overcome this, we +propose a novel Granular Feature Integration method to combine information +extraction at different scales combined with an efficient information flow, +enabling the model to capture both broad and fine-grained features +simultaneously. We also propose a Bi-Focal Perspective mechanism to highlight +the subtle neurofibrillary tangles and amyloid plaques in the MRI scans, +ensuring that critical pathological markers are accurately identified. Our +model achieved an F1-Score of 99.31%, precision of 99.24%, and recall of +99.51%. These scores prove that our model is significantly better than the +state-of-the-art (SOTA) CNNs in existence. + +
+
+ comment: 14 pages, 12 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Efficient Large Multi-modal Models via Visual Context Compression NeurIPS 2024 + + +
+ While significant advancements have been made in compressed representations +for text embeddings in large language models (LLMs), the compression of visual +tokens in multi-modal LLMs (MLLMs) has remained a largely overlooked area. In +this work, we present the study on the analysis of redundancy concerning visual +tokens and efficient training within these models. Our initial experiments show +that eliminating up to 70% of visual tokens at the testing stage by simply +average pooling only leads to a minimal 3% reduction in visual question +answering accuracy on the GQA benchmark, indicating significant redundancy in +visual context. Addressing this, we introduce Visual Context Compressor, which +reduces the number of visual tokens to enhance training and inference +efficiency without sacrificing performance. To minimize information loss caused +by the compression on visual tokens while maintaining training efficiency, we +develop LLaVolta as a light and staged training scheme that incorporates +stage-wise visual context compression to progressively compress the visual +tokens from heavy to light compression during training, yielding no loss of +information when testing. Extensive experiments demonstrate that our approach +enhances the performance of MLLMs in both image-language and video-language +understanding, while also significantly cutting training costs and improving +inference efficiency. + +
+
+ comment: NeurIPS 2024 Camera Ready; Code is available at + https://github.com/Beckschen/LLaVolta +
+
+
+
+
+ + ♻ ☆ Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework + for Multimodal LLMs NeurIPS 2024 + + +
+ Multimodal large language models (MLLMs) have shown impressive success across +modalities such as image, video, and audio in a variety of understanding and +generation tasks. However, current MLLMs are surprisingly poor at understanding +webpage screenshots and generating their corresponding HTML code. To address +this problem, we propose $\texttt{Web2Code}$, a benchmark consisting of a new +large-scale webpage-to-code dataset for instruction tuning and an evaluation +framework for the webpage understanding and HTML code translation abilities of +MLLMs. For dataset construction, we leverage pretrained LLMs to enhance +existing webpage-to-code datasets as well as generate a diverse pool of new +webpages rendered into images. Specifically, the inputs are webpage images and +instructions, while the responses are the webpage's HTML code. We further +include diverse natural language QA pairs about the webpage content in the +responses to enable a more comprehensive understanding of the web content. To +evaluate model performance in these tasks, we develop an evaluation framework +for testing MLLMs' abilities in webpage understanding and web-to-code +generation. Extensive experiments show that our proposed dataset is beneficial +not only to our proposed tasks but also in the general visual domain. We hope +our work will contribute to the development of general MLLMs suitable for +web-based content generation and task automation. Our data and code are +available at https://github.com/MBZUAI-LLM/web2code. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at + https://mbzuai-llm.github.io/webpage2code/ +
+
+
+
+
+ + ♻ ☆ Exploring the Adversarial Frontier: Quantifying Robustness via + Adversarial Hypervolume + + +
+ The escalating threat of adversarial attacks on deep learning models, +particularly in security-critical fields, has underscored the need for robust +deep learning systems. Conventional robustness evaluations have relied on +adversarial accuracy, which measures a model's performance under a specific +perturbation intensity. However, this singular metric does not fully +encapsulate the overall resilience of a model against varying degrees of +perturbation. To address this gap, we propose a new metric termed adversarial +hypervolume, assessing the robustness of deep learning models comprehensively +over a range of perturbation intensities from a multi-objective optimization +standpoint. This metric allows for an in-depth comparison of defense mechanisms +and recognizes the trivial improvements in robustness afforded by less potent +defensive strategies. Additionally, we adopt a novel training algorithm that +enhances adversarial robustness uniformly across various perturbation +intensities, in contrast to methods narrowly focused on optimizing adversarial +accuracy. Our extensive empirical studies validate the effectiveness of the +adversarial hypervolume metric, demonstrating its ability to reveal subtle +differences in robustness that adversarial accuracy overlooks. This research +contributes a new measure of robustness and establishes a standard for +assessing and benchmarking the resilience of current and future defensive +models against adversarial threats. + +
+
+
+
+
+ + ♻ ☆ Gaze-Assisted Medical Image Segmentation NeurIPS'24 + + +
+ The annotation of patient organs is a crucial part of various diagnostic and +treatment procedures, such as radiotherapy planning. Manual annotation is +extremely time-consuming, while its automation using modern image analysis +techniques has not yet reached levels sufficient for clinical adoption. This +paper investigates the idea of semi-supervised medical image segmentation using +human gaze as interactive input for segmentation correction. In particular, we +fine-tuned the Segment Anything Model in Medical Images (MedSAM), a public +solution that uses various prompt types as additional input for semi-automated +segmentation correction. We used human gaze data from reading abdominal images +as a prompt for fine-tuning MedSAM. The model was validated on a public WORD +database, which consists of 120 CT scans of 16 abdominal organs. The results of +the gaze-assisted MedSAM were shown to be superior to the results of the +state-of-the-art segmentation models. In particular, the average Dice +coefficient for 16 abdominal organs was 85.8%, 86.7%, 81.7%, and 90.5% for +nnUNetV2, ResUNet, original MedSAM, and our gaze-assisted MedSAM model, +respectively. + +
+
+ comment: 16 pages, 4 figures, Accepted to AIM-FM Workshop @ NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ Stimulating Diffusion Model for Image Denoising via Adaptive Embedding + and Ensembling + + +
+ Image denoising is a fundamental problem in computational photography, where +achieving high perception with low distortion is highly demanding. Current +methods either struggle with perceptual quality or suffer from significant +distortion. Recently, the emerging diffusion model has achieved +state-of-the-art performance in various tasks and demonstrates great potential +for image denoising. However, stimulating diffusion models for image denoising +is not straightforward and requires solving several critical problems. For one +thing, the input inconsistency hinders the connection between diffusion models +and image denoising. For another, the content inconsistency between the +generated image and the desired denoised image introduces distortion. To tackle +these problems, we present a novel strategy called the Diffusion Model for +Image Denoising (DMID) by understanding and rethinking the diffusion model from +a denoising perspective. Our DMID strategy includes an adaptive embedding +method that embeds the noisy image into a pre-trained unconditional diffusion +model and an adaptive ensembling method that reduces distortion in the denoised +image. Our DMID strategy achieves state-of-the-art performance on both +distortion-based and perception-based metrics, for both Gaussian and real-world +image denoising. The code is available at https://github.com/Li-Tong-621/DMID. + +
+
+ comment: 18 pages,15 figures +
+
+
+
+
+ + ♻ ☆ MDA: An Interpretable and Scalable Multi-Modal Fusion under Missing + Modalities and Intrinsic Noise Conditions + + +
+ Multi-modal learning has shown exceptional performance in various tasks, +especially in medical applications, where it integrates diverse medical +information for comprehensive diagnostic evidence. However, there still are +several challenges in multi-modal learning, 1. Heterogeneity between +modalities, 2. uncertainty in missing modalities, 3. influence of intrinsic +noise, and 4. interpretability for fusion result. This paper introduces the +Modal-Domain Attention (MDA) model to address the above challenges. MDA +constructs linear relationships between modalities through continuous +attention, due to its ability to adaptively allocate dynamic attention to +different modalities, MDA can reduce attention to low-correlation data, missing +modalities, or modalities with inherent noise, thereby maintaining SOTA +performance across various tasks on multiple public datasets. Furthermore, our +observations on the contribution of different modalities indicate that MDA +aligns with established clinical diagnostic imaging gold standards and holds +promise as a reference for pathologies where these standards are not yet +clearly defined. The code and dataset will be available. + +
+
+
+
+
+
+
+
+ + Systems and Control 15 + +
+
+
+ + ☆ Robot Metabolism: Towards machines that can grow by consuming other + machines + + +
+ Biological lifeforms can heal, grow, adapt, and reproduce -- abilities +essential for sustained survival and development. In contrast, robots today are +primarily monolithic machines with limited ability to self-repair, physically +develop, or incorporate material from their environments. A key challenge to +such physical adaptation has been that while robot minds are rapidly evolving +new behaviors through AI, their bodies remain closed systems, unable to +systematically integrate new material to grow or heal. We argue that open-ended +physical adaptation is only possible when robots are designed using only a +small repertoire of simple modules. This allows machines to mechanically adapt +by consuming parts from other machines or their surroundings and shedding +broken components. We demonstrate this principle using a truss modular robot +platform composed of one-dimensional actuated bars. We show how robots in this +space can grow bigger, faster, and more capable by consuming materials from +their environment and from other robots. We suggest that machine metabolic +processes akin to the one demonstrated here will be an essential part of any +sustained future robot ecology. + +
+
+ comment: Manuscript combined with Supplementary Materials File for arXiv + submission. Submitting to Journal and will update external DOI once available +
+
+
+
+
+ + ☆ Robust Defense Against Extreme Grid Events Using Dual-Policy + Reinforcement Learning Agents + + +
+ Reinforcement learning (RL) agents are powerful tools for managing power +grids. They use large amounts of data to inform their actions and receive +rewards or penalties as feedback to learn favorable responses for the system. +Once trained, these agents can efficiently make decisions that would be too +computationally complex for a human operator. This ability is especially +valuable in decarbonizing power networks, where the demand for RL agents is +increasing. These agents are well suited to control grid actions since the +action space is constantly growing due to uncertainties in renewable +generation, microgrid integration, and cybersecurity threats. To assess the +efficacy of RL agents in response to an adverse grid event, we use the Grid2Op +platform for agent training. We employ a proximal policy optimization (PPO) +algorithm in conjunction with graph neural networks (GNNs). By simulating +agents' responses to grid events, we assess their performance in avoiding grid +failure for as long as possible. The performance of an agent is expressed +concisely through its reward function, which helps the agent learn the most +optimal ways to reconfigure a grid's topology amidst certain events. To model +multi-actor scenarios that threaten modern power networks, particularly those +resulting from cyberattacks, we integrate an opponent that acts iteratively +against a given agent. This interplay between the RL agent and opponent is +utilized in N-k contingency screening, providing a novel alternative to the +traditional security assessment. + +
+
+ comment: 6 pages, 5 figures, submitted to the 2025 Texas Power and Energy + Conference (TPEC) +
+
+
+
+
+ + ☆ Emergent Structure in Multi-agent Systems Using Geometric Embeddings + + +
+ This work investigates the self-organization of multi-agent systems into +closed trajectories, a common requirement in unmanned aerial vehicle (UAV) +surveillance tasks. In such scenarios, smooth, unbiased control signals save +energy and mitigate mechanical strain. We propose a decentralized control +system architecture that produces a globally stable emergent structure from +local observations only; there is no requirement for agents to share a global +plan or follow prescribed trajectories. Central to our approach is the +formulation of an injective virtual embedding induced by rotations from the +actual agent positions. This embedding serves as a structure-preserving map +around which all agents stabilize their relative positions and permits the use +of well-established linear control techniques. We construct the embedding such +that it is topologically equivalent to the desired trajectory (i.e., a +homeomorphism), thereby preserving the stability characteristics. We +demonstrate the versatility of this approach through implementation on a swarm +of Quanser QDrone quadcopters. Results demonstrate the quadcopters +self-organize into the desired trajectory while maintaining even separation. + +
+
+
+
+
+ + ☆ Leveraging Bitcoin Mining Machines in Demand-Response Mechanisms to + Mitigate Ramping-Induced Transients + + +
+ We propose an extended demand response program, based on ancillary service +for supplying flexible electricity demand. In our proposed scheme, we suggest a +broader management model to control the scheduling and power consumption of +Bitcoin mining machines. The main aspect that we focus on is suppressing the +power ramping and related transient effects. We extend previous works on the +subject, that study the impact of incorporating cryptocurrency mining machines +into existing power grid, and explore the potential profit of exploiting this +flexible load in the Israeli electricity market. We analyze a trend based on +historical data, of increasing electricity prices and ramping costs due to the +increasing penetration of renewable energy sources. We suggest an extension to +the unit commitment problem from which we obtain the scheduling scheme of the +Bitcoin mining machines. We use simulation and the real-world data acquired +from the "Noga" grid operator to verify the proposed ancillary service and test +its practical limits for reducing the ramping costs, under changing ratio of +energy production from renewable sources. Our results suggest that the machine +price and ratio of production from renewable sources play a significant role +in determining the profitability of the proposed demand-response program. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Iterative Learning Control for Ramp Metering on Service Station On-ramps + + +
+ Congestion on highways has become a significant social problem due to the +increasing number of vehicles, leading to considerable waste of time and +pollution. Regulating the outflow from the Service Station can help alleviate +this congestion. Notably, traffic flows follow recurring patterns over days and +weeks, allowing for the application of Iterative Learning Control (ILC). +Building on these insights, we propose an ILC approach based on the Cell +Transmission Model with service stations (CTM-s). It is shown that ILC can +effectively compensate for potential inaccuracies in model parameter estimates +by leveraging historical data. + +
+
+
+
+
+ + ☆ Dynamic Dimensioning of Frequency Containment Reserves: The Case of the + Nordic Grid + + +
+ One of the main responsibilities of a Transmission System Operator (TSO) +operating an electric grid is to maintain a designated frequency (e.g., 50 Hz +in Europe). To achieve this, TSOs have created several products called +frequency-supporting ancillary services. The Frequency Containment Reserve +(FCR) is one of these ancillary service products. This article focuses on the +TSO problem of determining the volume procured for FCR. Specifically, we +investigate the potential benefits and impact on grid security when +transitioning from a traditionally static procurement method to a dynamic +strategy for FCR volume. We take the Nordic synchronous area in Europe as a +case study and use a diffusion model to capture its frequency development. We +introduce a controlled mean reversal parameter to assess changes in FCR +obligations, in particular for the Nordic FCR-N ancillary service product. We +establish closed-form expressions for exceedance probabilities and use +historical frequency data as input to calibrate the model. We show that a +dynamic dimensioning approach for FCR has the potential to significantly reduce +the exceedance probabilities (up to 37%) while keeping the total yearly +procured FCR volume the same as compared to the current static approach. + +
+
+ comment: 10 pages, 10 figures, submitted to IEEE Transactions on Power Systems +
+
+
+
+
+ + ☆ Immersion of General Nonlinear Systems Into State-Affine Ones for the + Design of Generalized Parameter Estimation-Based Observers: A Simple + Algebraic Procedure + + +
+ Generalized parameter estimation-based observers have proven very successful +to deal with systems described in state-affine form. In this paper, we enlarge +the domain of applicability of this method proposing an algebraic procedure to +immerse an $n$-dimensional general nonlinear system into an $n_z$-dimensional +system in state-affine form, with $n_z>n$. First, we recall the necessary and +sufficient condition for the solution of the general problem, which requires +the solution of a partial differential equation that, moreover, has to satisfy +a restrictive injectivity condition. Given the complexity of this task we +propose an alternative simple algebraic method to identify the required dynamic +extension and coordinate transformation, a procedure that, as shown in the +paper, is rather natural for physical systems. We illustrate the method with +some academic benchmark examples from observer theory literature -- that, in +spite of their apparent simplicity, are difficult to solve with the existing +methods -- as well as several practically relevant physical examples. + +
+
+
+
+
+ + ☆ Efficient Estimation of Relaxed Model Parameters for Robust UAV + Trajectory Optimization + + +
+ Online trajectory optimization and optimal control methods are crucial for +enabling sustainable unmanned aerial vehicle (UAV) services, such as +agriculture, environmental monitoring, and transportation, where available +actuation and energy are limited. However, optimal controllers are highly +sensitive to model mismatch, which can occur due to loaded equipment, packages +to be delivered, or pre-existing variability in fundamental structural and +thrust-related parameters. To circumvent this problem, optimal controllers can +be paired with parameter estimators to improve their trajectory planning +performance and perform adaptive control. However, UAV platforms are limited in +terms of onboard processing power, oftentimes making nonlinear parameter +estimation too computationally expensive to consider. To address these issues, +we propose a relaxed, affine-in-parameters multirotor model along with an +efficient optimal parameter estimator. We convexify the nominal Moving Horizon +Parameter Estimation (MHPE) problem into a linear-quadratic form (LQ-MHPE) via +an affine-in-parameter relaxation on the nonlinear dynamics, resulting in fast +quadratic programs (QPs) that facilitate adaptive Model Predictive Control (MPC) +in real time. We compare this approach to the equivalent nonlinear estimator in +Monte Carlo simulations, demonstrating a decrease in average solve time and +trajectory optimality cost by 98.2% and 23.9-56.2%, respectively. + +
+
+ comment: 8 pages, 5 figures, submitted to IEEE Sustech 2025 +
+
+
+
+
+ + ☆ Wildfire Risk Metric Impact on Public Safety Power Shut-off Cost Savings + + +
+ Public Safety Power Shutoffs (PSPS) are a proactive strategy to mitigate fire +hazards from power system infrastructure failures. System operators employ PSPS +to deactivate portions of the electric grid with heightened wildfire risks to +prevent wildfire ignition and redispatch generators to minimize load shedding. +A measure of vegetation flammability, called the Wildland Fire Potential Index +(WFPI), has been widely used to evaluate the risk of nearby wildfires to power +system operation. However, the WFPI does not correlate as strongly to +historically observed wildfire ignition probabilities (OWIP) as the WFPI-based +Large Fire Probability (WLFP). Prior work chose not to incorporate +wildfire-driven failure probabilities, such as the WLFP, because constraints +with Bernoulli random variables to represent wildfire ignitions could require +non-linear or non-convex constraints. This paper uses a deterministic +equivalent of an otherwise complicating line de-energization constraint by +quantifying the wildfire risk of operating transmission lines as a sum of each +energized line's wildfire ignition log probability (log(WIP)) rather than as a +sum of each energized line's WFPI. A day-ahead unit commitment and line +de-energization PSPS framework is used to assess the cost differences driven by +the choice between the WFPI and WLFP risk metrics. Training the optimization on +scenarios developed by mapping WLFP to log(WIP) rather than mapping the WFPI to +log(WIP) leads to reductions in the total real-time costs. For the IEEE RTS +24-bus test system, mapping transmission line WLFP values to log(WIP) resulted +in a 14.8% (on average) decrease in expected real-time costs. + +
+
+ comment: 10 pages, 9 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Tunable Sub-THz and THz lasing effect using FETs at room temperature + + +
+ I report on the first observed self-amplification by stimulated emission of +0.2THz and 1.63THz radiation using an InGaAs/GaAs HEMT operating in the deep +saturation regime at room temperature. I demonstrate both theoretically and +experimentally that the Sub-THz and THz FET response is due to rectification +of the nonlinear dependence of the device current-voltage characteristics. FETs +do operate as nonlinear THz mixers and rectifiers, and their open-drain +responsivity is given by a similar expression to that of zero-bias Schottky +diode detector. However, operating FETs deep in the saturation regime does +allow the accurate tuning of the device to the resonance condition or the +negative resistance mode at room temperature, hence FETs can be tuned in the +deep saturation regime to enable sub-THz and THz lasing effect. This observed +sub-THz and THz laser phenomenon using FETs will revolutionize human technology +in all fields of life in the near future. + +
+
+ comment: 5 pages, 5 figures, to be submitted in Journal +
+
+
+
+
+ + ♻ ☆ Social Equity Based Optimal Power Flow Framework to Hedge Against Price + Events + + +
+ With the increasing frequency of high impact low probability events, +electricity markets are experiencing significant price spikes more often. This +paper proposes a novel social equity driven optimal power flow framework to +mitigate the adverse effects of price events that lead to such price spikes. +The framework integrates social welfare optimization with socioeconomic +considerations by including a socioeconomic score that quantifies the energy +burden and socioeconomic status of consumers. By incorporating both supply cost +and consumer satisfaction, the model aims to achieve a balanced and fair +distribution of resources during price events, while considering resource +scarcity and possible load curtailment. The proposed framework is tested for +convergence on modified versions of the PJM 5-bus system and IEEE 24-bus +reliability test system, discussing its potential effectiveness in enhancing +social equity and optimizing power flow under system security constraints. +Sensitivity analysis further highlights the impact of socioeconomic score on +social welfare, providing insights for future improvements. + +
+
+ comment: Published in proceedings of the 2024 56th North American Power + Symposium (NAPS) +
+
+
+
+
+ + ♻ ☆ Sliding Mode Roll Control of Active Suspension Electric Vehicles + + +
+ Vehicle roll control has been a well studied problem. One of the ubiquitous +methods to mitigate vehicle rollover in the automobile industry is via a +mechanical anti-roll bar. However, with the advent of electric vehicles, +rollover mitigation can be pursued using electric actuation. In this work, we +study a roll control algorithm using sliding mode control for active suspension +vehicles, where the actuation for the roll control signal is generated by +electric motors independently at the four corners of the vehicle. This +technology precludes the need for any mechanical actuation which is often +slower as well as any anti-roll bar to mitigate vehicle rollover situations. We +provide an implementation of the proposed algorithm and conduct numerical +experiments to validate the functionality and effectiveness. Specifically, we +perform Slalom and J-turn maneuvering tests on an active suspension electric +vehicle with sliding mode roll control and it is shown to mitigate rollover by +at least 50% compared to passive suspension vehicles, while simultaneously +maintaining rider comfort. + +
+
+
+
+
+ + ♻ ☆ Low-Complexity Control for a Class of Uncertain MIMO Nonlinear Systems + under Generalized Time-Varying Output Constraints + + +
+ This paper introduces a novel control framework to address the satisfaction +of multiple time-varying output constraints in uncertain high-order MIMO +nonlinear control systems. Unlike existing methods, which often assume that the +constraints are always decoupled and feasible, our approach can handle coupled +time-varying constraints even in the presence of potential infeasibilities. +First, it is shown that satisfying multiple constraints essentially boils down +to ensuring the positivity of a scalar variable, representing the signed +distance from the boundary of the time-varying output-constrained set. To +achieve this, a single consolidating constraint is designed that, when +satisfied, guarantees convergence to and invariance of the time-varying +output-constrained set within a user-defined finite time. Next, a novel robust +and low-complexity feedback controller is proposed to ensure the satisfaction +of the consolidating constraint. Additionally, we provide a mechanism for +online modification of the consolidating constraint to find a least violating +solution when the constraints become mutually infeasible for some time. +Finally, simulation examples of trajectory and region tracking for a mobile +robot validate the proposed approach. + +
+
+ comment: extended version, 21 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Sparse Representations of Dynamical Networks: A Coprime Factorization + Approach + + +
+ We study a class of dynamical networks modeled by linear and time-invariant +systems which are described by state-space realizations. For these networks, we +investigate the relations between various types of factorizations which +preserve the structure of their component subsystems' interconnection. In doing +so, we provide tractable means of shifting between different types of +sparsity-preserving representations and we show how to employ these +factorizations to obtain distributed implementations for stabilizing and +possibly stable controllers. By formulating all these results for both +discrete- and continuous-time systems, we develop specialized distributed +implementations that, up to this point, were only available for networks +modeled as discrete-time systems. + +
+
+ comment: 35 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Optimal decentralized wavelength control in light sources for + lithography + + +
+ Pulsed light sources are a critical component of modern lithography, with +fine light beam wavelength control paramount for wafer etching accuracy. We +study optimal wavelength control by casting it as a decentralized linear +quadratic Gaussian (LQG) problem in presence of time-delays. In particular, we +consider the multi-optics module (optics and actuators) used for generating the +requisite wavelength in light sources as cooperatively interacting systems +defined over a directed acyclic graph (DAG). We show that any measurement and +other continuous time-delays can be exactly compensated, and the resulting +optimal controller implementation at the individual optics-level outperforms +any existing wavelength control techniques. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 28 + +
+
+
+ + ☆ Capturing Sparks of Abstraction for the ARC Challenge + + +
+ Excellent progress has been made recently in solving ARC Challenge problems. +However, it seems that new techniques may be required to push beyond 60% +accuracy. Even commercial Large Language Models (LLMs) struggle to 'understand' +many of the problems (when given the input and output grids), which makes +discovering solutions by LLM-led program search somewhat futile. + In this work, LLM 'understanding' is attempted from a stronger starting +position: an LLM is given complete solutions to tasks in code, and then asked +to explain how the task is being solved at various levels of abstraction. +Specifically, the LLM was given code solutions implemented in arc-dsl-llm (an +LLM-legible version of Hodel's arc-dsl) to obtain: (a) commented code; (b) code +refactored into reusable functional chunks; (c) problem solution steps; and (d) +high-level problem-solving tactics. + We demonstrate that 'Sparks of Abstraction' can be extracted from the LLM +output - in a form that could be used in downstream tasks with Local LLMs +eligible to enter the ARC Prize. + Both the arc-dsl-llm DSL framework (with the re-engineered solutions) and the +Gemini LLM-generated data (along with the generation code) are made Open +Source. + +
+
+ comment: Submitted as a paper entry for the 2024 ARC Prize +
+
+
+
+
+ + ☆ PickScan: Object discovery and reconstruction from handheld interactions IROS 2024 + + +
+ Reconstructing compositional 3D representations of scenes, where each object +is represented with its own 3D model, is a highly desirable capability in +robotics and augmented reality. However, most existing methods rely heavily on +strong appearance priors for object discovery, therefore only working on those +classes of objects on which the method has been trained, or do not allow for +object manipulation, which is necessary to scan objects fully and to guide +object discovery in challenging scenarios. We address these limitations with a +novel interaction-guided and class-agnostic method based on object +displacements that allows a user to move around a scene with an RGB-D camera, +hold up objects, and finally outputs one 3D model per held-up object. Our main +contribution to this end is a novel approach to detecting user-object +interactions and extracting the masks of manipulated objects. On a +custom-captured dataset, our pipeline discovers manipulated objects with 78.3% +precision at 100% recall and reconstructs them with a mean chamfer distance of +0.90cm. Compared to Co-Fusion, the only comparable interaction-based and +class-agnostic baseline, this corresponds to a reduction in chamfer distance of +73% while detecting 99% fewer false positives. + +
+
+ comment: 7 pages, 8 figures, published in the 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ Improving User Experience in Preference-Based Optimization of Reward + Functions for Assistive Robots + + +
+ Assistive robots interact with humans and must adapt to different users' +preferences to be effective. An easy and effective technique to learn +non-expert users' preferences is through rankings of robot behaviors, for +example, robot movement trajectories or gestures. Existing techniques focus on +generating trajectories for users to rank that maximize the outcome of the +preference learning process. However, the generated trajectories do not appear +to reflect the user's preference over repeated interactions. In this work, we +design an algorithm to generate trajectories for users to rank that we call +Covariance Matrix Adaptation Evolution Strategies with Information Gain +(CMA-ES-IG). CMA-ES-IG prioritizes the user's experience of the preference +learning process. We show that users find our algorithm more intuitive and +easier to use than previous approaches across both physical and social robot +tasks. This project's code is hosted at github.com/interaction-lab/CMA-ES-IG + +
+
+ comment: Accepted to ISRR +
+
+
+
+
+ + ☆ Enhanced Anime Image Generation Using USE-CMHSA-GAN + + +
+ With the growing popularity of ACG (Anime, Comics, and Games) culture, +generating high-quality anime character images has become an important research +topic. This paper introduces a novel Generative Adversarial Network model, +USE-CMHSA-GAN, designed to produce high-quality anime character images. The +model builds upon the traditional DCGAN framework, incorporating USE and CMHSA +modules to enhance feature extraction capabilities for anime character images. +Experiments were conducted on the anime-face-dataset, and the results +demonstrate that USE-CMHSA-GAN outperforms other benchmark models, including +DCGAN, VAE-GAN, and WGAN, in terms of FID and IS scores, indicating superior +image quality. These findings suggest that USE-CMHSA-GAN is highly effective +for anime character image generation and provides new insights for further +improving the quality of generative models. + +
+
+
+
+
+ + ☆ LLäMmlein: Compact and Competitive German-Only Language Models from + Scratch + + +
+ We create two German-only decoder models, LLäMmlein 120M and 1B, +transparently from scratch and publish them, along with the training data, for +the German NLP research community to use. The model training involved several +key steps, including extensive data preprocessing, the creation of a custom +German tokenizer, the training itself, as well as the evaluation of the final +models on various benchmarks. Throughout the training process, multiple +checkpoints were saved and analyzed using the SuperGLEBer benchmark to monitor +the models' learning dynamics. Compared to state-of-the-art models on the +SuperGLEBer benchmark, both LLäMmlein models performed competitively, +consistently matching or surpassing models with similar parameter sizes. The +results show that the models' quality scales with size as expected, but +performance improvements on some tasks plateaued early, offering valuable +insights into resource allocation for future model development. + +
+
+ comment: first draft; + https://www.informatik.uni-wuerzburg.de/datascience/projects/nlp/llammlein/ +
+
+
+
+
+ + ☆ RPN 2: On Interdependence Function Learning Towards Unifying and + Advancing CNN, RNN, GNN, and Transformer + + +
+ This paper builds upon our previous work on the Reconciled Polynomial Network +(RPN). The original RPN model was designed under the assumption of input data +independence, presuming the independence among both individual instances within +data batches and attributes in each data instance. However, this assumption +often proves invalid for function learning tasks involving complex, +interdependent data such as language, images, time series, and graphs. Ignoring +such data interdependence may inevitably lead to significant performance +degradation. + To overcome these limitations, we introduce the new Reconciled Polynomial +Network (version 2), namely RPN 2, in this paper. By incorporating data and +structural interdependence functions, RPN 2 explicitly models data +interdependence via new component functions in its architecture. + This enhancement not only significantly improves RPN 2's learning performance +but also substantially expands its unifying potential, enabling it to encompass +a broader range of contemporary dominant backbone models within its canonical +representation. These backbones include, but are not limited to, convolutional +neural networks (CNNs), recurrent neural networks (RNNs), graph neural networks +(GNNs), and Transformers. Our analysis reveals that the fundamental +distinctions among these backbone models primarily stem from their diverse +approaches to defining the interdependence functions. Furthermore, this unified +representation opens up new opportunities for designing innovative +architectures with the potential to surpass the performance of these dominant +backbones. + +
+
+ comment: 105 pages, 37 figures, 6 tables, preprint version +
+
+
+
+
+ + ☆ MPLite: Multi-Aspect Pretraining for Mining Clinical Health Records + + +
+ The adoption of digital systems in healthcare has resulted in the +accumulation of vast electronic health records (EHRs), offering valuable data +for machine learning methods to predict patient health outcomes. However, +single-visit records of patients are often neglected in the training process +due to the lack of annotations of next-visit information, thereby limiting the +predictive and expressive power of machine learning models. In this paper, we +present a novel framework MPLite that utilizes Multi-aspect Pretraining with +Lab results through a light-weight neural network to enhance medical concept +representation and predict future health outcomes of individuals. By +incorporating both structured medical data and additional information from lab +results, our approach fully leverages patient admission records. We design a +pretraining module that predicts medical codes based on lab results, ensuring +robust prediction by fusing multiple aspects of features. Our experimental +evaluation using both MIMIC-III and MIMIC-IV datasets demonstrates improvements +over existing models in diagnosis prediction and heart failure prediction +tasks, achieving a higher weighted-F1 and recall with MPLite. This work reveals +the potential of integrating diverse aspects of data to advance predictive +modeling in healthcare. + +
+
+
+
+
+ + ☆ TabDeco: A Comprehensive Contrastive Framework for Decoupled + Representations in Tabular Data + + +
+ Representation learning is a fundamental aspect of modern artificial +intelligence, driving substantial improvements across diverse applications. +While self-supervised contrastive learning has led to significant advancements +in fields like computer vision and natural language processing, its adaptation +to tabular data presents unique challenges. Traditional approaches often +prioritize optimizing model architecture and loss functions but may overlook +the crucial task of constructing meaningful positive and negative sample pairs +from various perspectives like feature interactions, instance-level patterns +and batch-specific contexts. To address these challenges, we introduce TabDeco, +a novel method that leverages attention-based encoding strategies across both +rows and columns and employs a contrastive learning framework to effectively +disentangle feature representations at multiple levels, including features, +instances and data batches. With the innovative feature decoupling hierarchies, +TabDeco consistently surpasses existing deep learning methods and leading +gradient boosting algorithms, including XGBoost, CatBoost, and LightGBM, +across various benchmark tasks, underscoring its effectiveness in advancing +tabular data representation learning. + +
+
+
+
+
+ + ☆ CLMIA: Membership Inference Attacks via Unsupervised Contrastive + Learning + + +
+ Since a machine learning model is often trained on a limited data set, the +model is trained multiple times on the same data sample, which causes the model +to memorize most of the training set data. Membership Inference Attacks (MIAs) +exploit this feature to determine whether a data sample is used for training a +machine learning model. However, in realistic scenarios, it is difficult for +the adversary to obtain enough qualified samples that mark accurate identity +information, especially since most samples are non-members in real world +applications. To address this limitation, in this paper, we propose a new +attack method called CLMIA, which uses unsupervised contrastive learning to +train an attack model without using extra membership status information. +Meanwhile, in CLMIA, we require only a small amount of data with known +membership status to fine-tune the attack model. Experimental results +demonstrate that CLMIA performs better than existing attack methods for +different datasets and model structures, especially on data with less marked +identity information. In addition, we experimentally find that the attack +performs differently for different proportions of labeled identity information +for member and non-member data. More analysis proves that our attack method +performs better with less labeled identity information, which applies to more +realistic scenarios. + +
+
+
+
+
+ + ☆ Label Sharing Incremental Learning Framework for Independent Multi-Label + Segmentation Tasks + + +
+ In a setting where segmentation models have to be built for multiple +datasets, each with its own corresponding label set, a straightforward way is +to learn one model for every dataset and its labels. Alternatively, multi-task +architectures with shared encoders and multiple segmentation heads or shared +weights with compound labels can also be made use of. This work proposes a +novel label sharing framework where a shared common label space is constructed +and each of the individual label sets is systematically mapped to the common +labels. This transforms multiple datasets with disparate label sets into a +single large dataset with shared labels, and therefore all the segmentation +tasks can be addressed by learning a single model. This eliminates the need for +task specific adaptations in network architectures and also results in +parameter and data efficient models. Furthermore, the label sharing framework is +naturally amenable to incremental learning where segmentations for new +datasets can be easily learnt. We experimentally validate our method on various +medical image segmentation datasets, each involving multi-label segmentation. +Furthermore, we demonstrate the efficacy of the proposed method in terms of +performance and incremental learning ability vis-a-vis alternative methods. + +
+
+
+
+
+ + ☆ Different Horses for Different Courses: Comparing Bias Mitigation + Algorithms in ML NeurIPS 2024 + + +
+ With fairness concerns gaining significant attention in Machine Learning +(ML), several bias mitigation techniques have been proposed, often compared +against each other to find the best method. These benchmarking efforts tend to +use a common setup for evaluation under the assumption that providing a uniform +environment ensures a fair comparison. However, bias mitigation techniques are +sensitive to hyperparameter choices, random seeds, feature selection, etc., +meaning that comparison on just one setting can unfairly favour certain +algorithms. In this work, we show significant variance in fairness achieved by +several algorithms and the influence of the learning pipeline on fairness +scores. We highlight that most bias mitigation techniques can achieve +comparable performance, given the freedom to perform hyperparameter +optimization, suggesting that the choice of the evaluation parameters -- rather +than the mitigation technique itself -- can sometimes create the perceived +superiority of one method over another. We hope our work encourages future +research on how various choices in the lifecycle of developing an algorithm +impact fairness, and trends that guide the selection of appropriate algorithms. + +
+
+ comment: To appear at AFME@NeurIPS 2024 +
+
+
+
+
+ + ☆ Mitigating Relative Over-Generalization in Multi-Agent Reinforcement + Learning + + +
+ In decentralized multi-agent reinforcement learning, agents learning in +isolation can lead to relative over-generalization (RO), where optimal joint +actions are undervalued in favor of suboptimal ones. This hinders effective +coordination in cooperative tasks, as agents tend to choose actions that are +individually rational but collectively suboptimal. To address this issue, we +introduce MaxMax Q-Learning (MMQ), which employs an iterative process of +sampling and evaluating potential next states, selecting those with maximal +Q-values for learning. This approach refines approximations of ideal state +transitions, aligning more closely with the optimal joint policy of +collaborating agents. We provide theoretical analysis supporting MMQ's +potential and present empirical evaluations across various environments +susceptible to RO. Our results demonstrate that MMQ frequently outperforms +existing baselines, exhibiting enhanced convergence and sample efficiency. + +
+
+ comment: Published in Transactions on Machine Learning Research (11/2024) +
+
+
+
+
+ + ☆ Reinforcing Competitive Multi-Agents for Playing So Long Sucker + + +
+ This paper examines the use of classical deep reinforcement learning (DRL) +algorithms, DQN, DDQN, and Dueling DQN, in the strategy game So Long Sucker +(SLS), a diplomacy-driven game defined by coalition-building and strategic +betrayal. SLS poses unique challenges due to its blend of cooperative and +adversarial dynamics, making it an ideal platform for studying multi-agent +learning and game theory. The study's primary goal is to teach autonomous +agents the game's rules and strategies using classical DRL methods. To support +this effort, the authors developed a novel, publicly available implementation +of SLS, featuring a graphical user interface (GUI) and benchmarking tools for +DRL algorithms. Experimental results reveal that while considered basic by +modern DRL standards, DQN, DDQN, and Dueling DQN agents achieved roughly 50% of +the maximum possible game reward. This suggests a baseline understanding of the +game's mechanics, with agents favoring legal moves over illegal ones. However, +a significant limitation was the extensive training required, around 2000 +games, for agents to reach peak performance, compared to human players who +grasp the game within a few rounds. Even after prolonged training, agents +occasionally made illegal moves, highlighting both the potential and +limitations of these classical DRL methods in semi-complex, socially driven +games. The findings establish a foundational benchmark for training agents in +SLS and similar negotiation-based environments while underscoring the need for +advanced or hybrid DRL approaches to improve learning efficiency and +adaptability. Future research could incorporate game-theoretic strategies to +enhance agent decision-making in dynamic multi-agent contexts. + +
+
+
+
+
+ + ☆ SRA-MCTS: Self-driven Reasoning Augmentation with Monte Carlo Tree + Search for Enhanced Code Generation + + +
+ Large language models demonstrate exceptional performance in simple code +generation tasks but still face challenges in tackling complex problems. These +challenges may stem from insufficient reasoning and problem decomposition +capabilities. To address this issue, we propose a reasoning-augmented data +generation process, SRA-MCTS, which guides the model to autonomously generate +high-quality intermediate reasoning paths. This creates a positive feedback +loop, enabling continuous improvement. Our method operates entirely through the +model itself without requiring additional supervision. By synthesizing natural +language reasoning paths and translating them into executable code, the +approach ensures analytical accuracy and enhances the success rate in solving +complex tasks. Experimental results show that, even without additional +supervisory signals, our method achieves performance improvements across +different model scales, demonstrating the significant potential of +self-improvement in small models. Furthermore, the method remains robust when +traditional Chain-of-Thought (CoT) approaches exhibit performance degradation, +with notable improvements observed in diversity metrics such as pass@10. We +encourage further exploration of reasoning processes within training data to +enhance the ability of language models to address complex problems. + +
+
+
+
+
+ + ☆ Knowledge-enhanced Transformer for Multivariate Long Sequence + Time-series Forecasting + + +
+ Multivariate Long Sequence Time-series Forecasting (LSTF) has been a critical +task across various real-world applications. Recent advancements focus on the +application of transformer architectures attributable to their ability to +capture temporal patterns effectively over extended periods. However, these +approaches often overlook the inherent relationships and interactions between +the input variables that could be drawn from their characteristic properties. +In this paper, we aim to bridge this gap by integrating information-rich +Knowledge Graph Embeddings (KGE) with state-of-the-art transformer-based +architectures. We introduce a novel approach that encapsulates conceptual +relationships among variables within a well-defined knowledge graph, forming +dynamic and learnable KGEs for seamless integration into the transformer +architecture. We investigate the influence of this integration into seminal +architectures such as PatchTST, Autoformer, Informer, and Vanilla Transformer. +Furthermore, we thoroughly investigate the performance of these +knowledge-enhanced architectures along with their original implementations for +long forecasting horizons and demonstrate significant improvement in the +benchmark results. This enhancement empowers transformer-based architectures to +address the inherent structural relation between variables. Our +knowledge-enhanced approach improves the accuracy of multivariate LSTF by +capturing complex temporal and relational dynamics across multiple domains. To +substantiate the validity of our model, we conduct comprehensive experiments +using Weather and Electric Transformer Temperature (ETT) datasets. + +
+
+ comment: 9 pages, 4 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Blockchain for Large Language Model Security and Safety: A Holistic + Survey + + +
+ With the growing development and deployment of large language models (LLMs) +in both industrial and academic fields, their security and safety concerns have +become increasingly critical. However, recent studies indicate that LLMs face +numerous vulnerabilities, including data poisoning, prompt injections, and +unauthorized data exposure, which conventional methods have struggled to +address fully. In parallel, blockchain technology, known for its data +immutability and decentralized structure, offers a promising foundation for +safeguarding LLMs. In this survey, we aim to comprehensively assess how to +leverage blockchain technology to enhance LLMs' security and safety. Besides, +we propose a new taxonomy of blockchain for large language models (BC4LLMs) to +systematically categorize related works in this emerging field. Our analysis +includes novel frameworks and definitions to delineate security and safety in +the context of BC4LLMs, highlighting potential research directions and +challenges at this intersection. Through this study, we aim to stimulate +targeted advancements in blockchain-integrated LLM security. + +
+
+ comment: Accepted to SIGKDD Explorations, to appear Dec 2024 +
+
+
+
+
+ + ♻ ☆ Feature learning as alignment: a structural property of gradient descent + in non-linear neural networks + + +
+ Understanding the mechanisms through which neural networks extract statistics +from input-label pairs through feature learning is one of the most important +unsolved problems in supervised learning. Prior works demonstrated that the +gram matrices of the weights (the neural feature matrices, NFM) and the average +gradient outer products (AGOP) become correlated during training, in a +statement known as the neural feature ansatz (NFA). Through the NFA, the +authors introduce mapping with the AGOP as a general mechanism for neural +feature learning. However, these works do not provide a theoretical explanation +for this correlation or its origins. In this work, we further clarify the +nature of this correlation, and explain its emergence. We show that this +correlation is equivalent to alignment between the left singular structure of +the weight matrices and the newly defined pre-activation tangent features at +each layer. We further establish that the alignment is driven by the +interaction of weight changes induced by SGD with the pre-activation features, +and analyze the resulting dynamics analytically at early times in terms of +simple statistics of the inputs and labels. We prove the derivative alignment +occurs almost surely in specific high dimensional settings. Finally, we +introduce a simple optimization rule motivated by our analysis of the centered +correlation which dramatically increases the NFA correlations at any given +layer and improves the quality of features learned. + +
+
+
+
+
+ + ♻ ☆ Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success + using Knowledge-infused Learning + + +
+ The digital landscape continually evolves with multimodality, enriching the +online experience for users. Creators and marketers aim to weave subtle +contextual cues from various modalities into congruent content to engage users +with a harmonious message. This interplay of multimodal cues is often a crucial +factor in attracting users' attention. However, this richness of multimodality +presents a challenge to computational modeling, as the semantic contextual cues +spanning across modalities need to be unified to capture the true holistic +meaning of the multimodal content. This contextual meaning is critical in +attracting user engagement as it conveys the intended message of the brand or +the organization. In this work, we incorporate external commonsense knowledge +from knowledge graphs to enhance the representation of multimodal data using +compact Visual Language Models (VLMs) and predict the success of multi-modal +crowdfunding campaigns. Our results show that external knowledge commonsense +bridges the semantic gap between text and image modalities, and the enhanced +knowledge-infused representations improve the predictive performance of models +for campaign success upon the baselines without knowledge. Our findings +highlight the significance of contextual congruence in online multimodal +content for engaging and successful crowdfunding campaigns. + +
+
+ comment: Accepted at IEEE International Conference on Big Data 2024 (IEEE + BigData 2024) +
+
+
+
+
+ + ♻ ☆ Instruct-Tuning Pretrained Causal Language Models for Ancient Greek + Papyrology and Epigraphy + + +
+ This article presents an experiment in fine-tuning a pretrained causal +language model (Meta's Llama 3.1 8B Instruct) to assist with restoring missing +or illegible characters in ancient Greek inscriptions and documentary papyri. +Utilizing a straightforward instruction-based approach and a 95%/5% train/test +split, the papyrus restoration model achieved a character error rate (CER) of +14.9%, a top-1 accuracy of 73.5%, and a top-20 accuracy of 86.0% for sequences +up to 10 characters. A model was also fine-tuned for geographic attribution, +reaching a top-1 accuracy of 66.4% and a top-3 accuracy of 79.9%. In +chronological attribution, it demonstrated an average deviation of 21.7 years +from the actual terminus post/ante quem, with a median deviation of 0 years. +For inscriptions, the restoration model achieved a CER of 20.5%, a top-1 +accuracy of 63.7%, and a top-20 accuracy of 83.0% for sequences up to 10 +characters. In geographic attribution, it attained a top-1 accuracy of 75.0% +and a top-3 accuracy of 83.7%, while in dating, it had an average deviation of +37.1 years and a median deviation of 3 years from the actual date range. +Benchmarked against the state-of-the-art model (Ithaca) on a shared test set +and on recently edited inscriptions, the instruction-tuned models excelled in +text restoration, while also offering the practical advantage of ignoring +spaces during reconstruction, which aligns with the scriptio continua of +ancient textual artifacts. However, their performance in geographic and +chronological attribution was lower than Ithaca's. To evaluate the approach in +a more even setup, the instruction model was retrained with an 80%/10%/10% +train-validation-test split, and still outperformed Ithaca in text restoration. +The results suggest that fine-tuning larger pretrained causal language models +using instruction templates for emendations and conjectures to ancient texts +holds promise. + +
+
+ comment: 9 pages, 1 table. To be submitted +
+
+
+
+
+ + ♻ ☆ Learning-Augmented Priority Queues NeurIPS 2024 + + +
+ Priority queues are one of the most fundamental and widely used data +structures in computer science. Their primary objective is to efficiently +support the insertion of new elements with assigned priorities and the +extraction of the highest priority element. In this study, we investigate the +design of priority queues within the learning-augmented framework, where +algorithms use potentially inaccurate predictions to enhance their worst-case +performance. We examine three prediction models spanning different use cases, +and show how the predictions can be leveraged to enhance the performance of +priority queue operations. Moreover, we demonstrate the optimality of our +solution and discuss some possible applications. + +
+
+ comment: Accepted as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Improving LLM Classification of Logical Errors by Integrating Error + Relationship into Prompts + + +
+ LLMs trained in the understanding of programming syntax are now providing +effective assistance to developers and are being used in programming education +such as in generation of coding problem examples or providing code +explanations. A key aspect of programming education is understanding and +dealing with error messages. However, 'logical errors' in which the program +operates against the programmer's intentions do not receive error messages from +the compiler. In this study, building on existing research on programming +errors, we first define the types of logical errors that can occur in +programming in general. Based on the definition, we propose an effective +approach for detecting logical errors with LLMs that makes use of relations +among error types in the Chain-of-Thought and Tree-of-Thought prompts. The +experimental results indicate that when such logical error descriptions in the +prompt are used, the average classification performance is about 21% higher than +the ones without them. We also conducted an experiment for exploiting the +relations among errors in generating a new logical error dataset using LLMs. As +datasets for logical errors are very limited, such a benchmark dataset can be +very useful for various programming related applications. We expect that our +work can assist novice programmers in identifying the causes of code errors and +correcting them more effectively. + +
+
+ comment: Published in ITS 2024 (Best Paper Award) +
+
+
+
+
+ + ♻ ☆ Smooth Non-Stationary Bandits ICML 2023 + + +
+ In many applications of online decision making, the environment is +non-stationary and it is therefore crucial to use bandit algorithms that handle +changes. Most existing approaches are designed to protect against non-smooth +changes, constrained only by total variation or Lipschitzness over time. +However, in practice, environments often change {\em smoothly}, so such +algorithms may incur higher-than-necessary regret. We study a non-stationary +bandits problem where each arm's mean reward sequence can be embedded into a +$\beta$-H\"older function, i.e., a function that is $(\beta-1)$-times +Lipschitz-continuously differentiable. The non-stationarity becomes more smooth +as $\beta$ increases. When $\beta=1$, this corresponds to the non-smooth +regime, where \cite{besbes2014stochastic} established a minimax regret of +$\tilde \Theta(T^{2/3})$. We show the first separation between the smooth +(i.e., $\beta\ge 2$) and non-smooth (i.e., $\beta=1$) regimes by presenting a +policy with $\tilde O(k^{4/5} T^{3/5})$ regret on any $k$-armed, $2$-H\"older +instance. We complement this result by showing that the minimax regret on the +$\beta$-H\"older family of instances is $\Omega(T^{(\beta+1)/(2\beta+1)})$ for +any integer $\beta\ge 1$. This matches our upper bound for $\beta=2$ up to +logarithmic factors. Furthermore, we validated the effectiveness of our policy +through a comprehensive numerical study using real-world click-through rate +data. + +
+
+ comment: Accepted by ICML 2023 +
+
+
+
+
+ + ♻ ☆ Narrative-of-Thought: Improving Temporal Reasoning of Large Language + Models via Recounted Narratives EMNLP'24 + + +
+ Reasoning about time and temporal relations is an integral aspect of human +cognition, essential for perceiving the world and navigating our experiences. +Though large language models (LLMs) have demonstrated impressive performance in +many reasoning tasks, temporal reasoning remains challenging due to its +intrinsic complexity. In this work, we first study an essential task of +temporal reasoning -- temporal graph generation, to unveil LLMs' inherent, +global reasoning capabilities. We show that this task presents great challenges +even for the most powerful LLMs, such as GPT-3.5/4. We also notice a +significant performance gap by small models (<10B) that lag behind LLMs by 50%. +Next, we study how to close this gap with a budget constraint, e.g., not using +model finetuning. We propose a new prompting technique tailored for temporal +reasoning, Narrative-of-Thought (NoT), that first converts the events set to a +Python class, then prompts a small model to generate a temporally grounded +narrative, guiding the final generation of a temporal graph. Extensive +experiments showcase the efficacy of NoT in improving various metrics. Notably, +NoT attains the highest F1 on the Schema-11 evaluation set, while securing an +overall F1 on par with GPT-3.5. NoT also achieves the best structural +similarity across the board, even compared with GPT-3.5/4. Our code is +available at https://github.com/launchnlp/NoT. + +
+
+ comment: EMNLP'24 Findings +
+
+
+
+
+ + ♻ ☆ Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework + for Multimodal LLMs NeurIPS 2024 + + +
+ Multimodal large language models (MLLMs) have shown impressive success across +modalities such as image, video, and audio in a variety of understanding and +generation tasks. However, current MLLMs are surprisingly poor at understanding +webpage screenshots and generating their corresponding HTML code. To address +this problem, we propose $\texttt{Web2Code}$, a benchmark consisting of a new +large-scale webpage-to-code dataset for instruction tuning and an evaluation +framework for the webpage understanding and HTML code translation abilities of +MLLMs. For dataset construction, we leverage pretrained LLMs to enhance +existing webpage-to-code datasets as well as generate a diverse pool of new +webpages rendered into images. Specifically, the inputs are webpage images and +instructions, while the responses are the webpage's HTML code. We further +include diverse natural language QA pairs about the webpage content in the +responses to enable a more comprehensive understanding of the web content. To +evaluate model performance in these tasks, we develop an evaluation framework +for testing MLLMs' abilities in webpage understanding and web-to-code +generation. Extensive experiments show that our proposed dataset is beneficial +not only to our proposed tasks but also in the general visual domain. We hope +our work will contribute to the development of general MLLMs suitable for +web-based content generation and task automation. Our data and code are +available at https://github.com/MBZUAI-LLM/web2code. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at + https://mbzuai-llm.github.io/webpage2code/ +
+
+
+
+
+ + ♻ ☆ Taming the Long Tail in Human Mobility Prediction NeurIPS 2024 + + +
+ With the popularity of location-based services, human mobility prediction +plays a key role in enhancing personalized navigation, optimizing +recommendation systems, and facilitating urban mobility and planning. This +involves predicting a user's next POI (point-of-interest) visit using their +past visit history. However, the uneven distribution of visitations over time +and space, namely the long-tail problem in spatial distribution, makes it +difficult for AI models to predict those POIs that are less visited by humans. +In light of this issue, we propose the Long-Tail Adjusted Next POI Prediction +(LoTNext) framework for mobility prediction, combining a Long-Tailed Graph +Adjustment module to reduce the impact of the long-tailed nodes in the user-POI +interaction graph and a novel Long-Tailed Loss Adjustment module to adjust loss +by logit score and sample weight adjustment strategy. Also, we employ the +auxiliary prediction task to enhance generalization and accuracy. Our +experiments with two real-world trajectory datasets demonstrate that LoTNext +significantly surpasses existing state-of-the-art works. Our code is available +at https://github.com/Yukayo/LoTNext. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Exploring the Adversarial Frontier: Quantifying Robustness via + Adversarial Hypervolume + + +
+ The escalating threat of adversarial attacks on deep learning models, +particularly in security-critical fields, has underscored the need for robust +deep learning systems. Conventional robustness evaluations have relied on +adversarial accuracy, which measures a model's performance under a specific +perturbation intensity. However, this singular metric does not fully +encapsulate the overall resilience of a model against varying degrees of +perturbation. To address this gap, we propose a new metric termed adversarial +hypervolume, assessing the robustness of deep learning models comprehensively +over a range of perturbation intensities from a multi-objective optimization +standpoint. This metric allows for an in-depth comparison of defense mechanisms +and recognizes the trivial improvements in robustness afforded by less potent +defensive strategies. Additionally, we adopt a novel training algorithm that +enhances adversarial robustness uniformly across various perturbation +intensities, in contrast to methods narrowly focused on optimizing adversarial +accuracy. Our extensive empirical studies validate the effectiveness of the +adversarial hypervolume metric, demonstrating its ability to reveal subtle +differences in robustness that adversarial accuracy overlooks. This research +contributes a new measure of robustness and establishes a standard for +assessing and benchmarking the resilience of current and future defensive +models against adversarial threats. + +
+
+
+
+
+ + ♻ ☆ When Your AIs Deceive You: Challenges of Partial Observability in + Reinforcement Learning from Human Feedback NeurIPS 2024 + + +
+ Past analyses of reinforcement learning from human feedback (RLHF) assume +that the human evaluators fully observe the environment. What happens when +human feedback is based only on partial observations? We formally define two +failure cases: deceptive inflation and overjustification. Modeling the human as +Boltzmann-rational w.r.t. a belief over trajectories, we prove conditions under +which RLHF is guaranteed to result in policies that deceptively inflate their +performance, overjustify their behavior to make an impression, or both. Under +the new assumption that the human's partial observability is known and +accounted for, we then analyze how much information the feedback process +provides about the return function. We show that sometimes, the human's +feedback determines the return function uniquely up to an additive constant, +but in other realistic cases, there is irreducible ambiguity. We propose +exploratory research directions to help tackle these challenges, experimentally +validate both the theoretical concerns and potential mitigations, and caution +against blindly applying RLHF in partially observable settings. + +
+
+ comment: Advances in Neural Information Processing Systems 37 (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Privacy and Copyright Protection in Generative AI: A Lifecycle + Perspective + + +
+ The advent of Generative AI has marked a significant milestone in artificial +intelligence, demonstrating remarkable capabilities in generating realistic +images, texts, and data patterns. However, these advancements come with +heightened concerns over data privacy and copyright infringement, primarily due +to the reliance on vast datasets for model training. Traditional approaches +like differential privacy, machine unlearning, and data poisoning only offer +fragmented solutions to these complex issues. Our paper delves into the +multifaceted challenges of privacy and copyright protection within the data +lifecycle. We advocate for integrated approaches that combine technical +innovation with ethical foresight, holistically addressing these concerns by +investigating and devising solutions that are informed by the lifecycle +perspective. This work aims to catalyze a broader discussion and inspire +concerted efforts towards data privacy and copyright integrity in Generative +AI. + +
+
+ comment: Accepted by 2024 IEEE/ACM 3rd International Conference on AI + Engineering - Software Engineering for AI (CAIN) +
+
+
+
+
+
+
+
+ + Computation and Language 35 + +
+
+
+ + ☆ Capturing Sparks of Abstraction for the ARC Challenge + + +
+ Excellent progress has been made recently in solving ARC Challenge problems. +However, it seems that new techniques may be required to push beyond 60% +accuracy. Even commercial Large Language Models (LLMs) struggle to 'understand' +many of the problems (when given the input and output grids), which makes +discovering solutions by LLM-led program search somewhat futile. + In this work, LLM 'understanding' is attempted from a stronger starting +position: an LLM is given complete solutions to tasks in code, and then asked +to explain how the task is being solved at various levels of abstraction. +Specifically, the LLM was given code solutions implemented in arc-dsl-llm (an +LLM-legible version of Hodel's arc-dsl) to obtain: (a) commented code; (b) code +refactored into reusable functional chunks; (c) problem solution steps; and (d) +high-level problem-solving tactics. + We demonstrate that 'Sparks of Abstraction' can be extracted from the LLM +output - in a form that could be used in downstream tasks with Local LLMs +eligible to enter the ARC Prize. + Both the arc-dsl-llm DSL framework (with the re-engineered solutions) and the +Gemini LLM-generated data (along with the generation code) are made Open +Source. + +
+
+ comment: Submitted as a paper entry for the 2024 ARC Prize +
+
+
+
+
+ + ☆ Debiasing Watermarks for Large Language Models via Maximal Coupling + + +
+ Watermarking language models is essential for distinguishing between human +and machine-generated text and thus maintaining the integrity and +trustworthiness of digital communication. We present a novel green/red list +watermarking approach that partitions the token set into ``green'' and ``red'' +lists, subtly increasing the generation probability for green tokens. To +correct token distribution bias, our method employs maximal coupling, using a +uniform coin flip to decide whether to apply bias correction, with the result +embedded as a pseudorandom watermark signal. Theoretical analysis confirms this +approach's unbiased nature and robust detection capabilities. Experimental +results show that it outperforms prior techniques by preserving text quality +while maintaining high detectability, and it demonstrates resilience to +targeted modifications aimed at improving text quality. This research provides +a promising watermarking solution for language models, balancing effective +detection with minimal impact on text quality. + +
+
+
+
+
+ + ☆ LLäMmlein: Compact and Competitive German-Only Language Models from + Scratch + + +
+ We create two German-only decoder models, LL\"aMmlein 120M and 1B, +transparently from scratch and publish them, along with the training data, for +the German NLP research community to use. The model training involved several +key steps, including extensive data preprocessing, the creation of a custom +German tokenizer, the training itself, as well as the evaluation of the final +models on various benchmarks. Throughout the training process, multiple +checkpoints were saved and analyzed using the SuperGLEBer benchmark to monitor +the models' learning dynamics. Compared to state-of-the-art models on the +SuperGLEBer benchmark, both LL\"aMmlein models performed competitively, +consistently matching or surpassing models with similar parameter sizes. The +results show that the models' quality scales with size as expected, but +performance improvements on some tasks plateaued early, offering valuable +insights into resource allocation for future model development. + +
+
+ comment: first draft; + https://www.informatik.uni-wuerzburg.de/datascience/projects/nlp/llammlein/ +
+
+
+
+
+ + ☆ The Promises and Pitfalls of LLM Annotations in Dataset Labeling: a Case + Study on Media Bias Detection + + +
+ High annotation costs from hiring or crowdsourcing complicate the creation of +large, high-quality datasets needed for training reliable text classifiers. +Recent research suggests using Large Language Models (LLMs) to automate the +annotation process, reducing these costs while maintaining data quality. LLMs +have shown promising results in annotating downstream tasks like hate speech +detection and political framing. Building on the success in these areas, this +study investigates whether LLMs are viable for annotating the complex task of +media bias detection and whether a downstream media bias classifier can be +trained on such data. We create annolexical, the first large-scale dataset for +media bias classification with over 48000 synthetically annotated examples. Our +classifier, fine-tuned on this dataset, surpasses all of the annotator LLMs by +5-9 percent in Matthews Correlation Coefficient (MCC) and performs close to or +outperforms the model trained on human-labeled data when evaluated on two media +bias benchmark datasets (BABE and BASIL). This study demonstrates how our +approach significantly reduces the cost of dataset creation in the media bias +domain and, by extension, the development of classifiers, while our subsequent +behavioral stress-testing reveals some of its current limitations and +trade-offs. + +
+
+
+
+
+ + ☆ Multilingual Large Language Models: A Systematic Survey + + +
+ This paper provides a comprehensive survey of the latest research on +multilingual large language models (MLLMs). MLLMs not only are able to +understand and generate language across linguistic boundaries, but also +represent an important advancement in artificial intelligence. We first discuss +the architecture and pre-training objectives of MLLMs, highlighting the key +components and methodologies that contribute to their multilingual +capabilities. We then discuss the construction of multilingual pre-training and +alignment datasets, underscoring the importance of data quality and diversity +in enhancing MLLM performance. An important focus of this survey is on the +evaluation of MLLMs. We present a detailed taxonomy and roadmap covering the +assessment of MLLMs' cross-lingual knowledge, reasoning, alignment with human +values, safety, interpretability and specialized applications. Specifically, we +extensively discuss multilingual evaluation benchmarks and datasets, and +explore the use of LLMs themselves as multilingual evaluators. To enhance MLLMs +from black to white boxes, we also address the interpretability of multilingual +capabilities, cross-lingual transfer and language bias within these models. +Finally, we provide a comprehensive review of real-world applications of MLLMs +across diverse domains, including biology, medicine, computer science, +mathematics and law. We showcase how these models have driven innovation and +improvements in these specialized fields while also highlighting the challenges +and opportunities in deploying MLLMs within diverse language communities and +application scenarios. The papers covered in this survey are listed and publicly +available at https://github.com/tjunlp-lab/Awesome-Multilingual-LLMs-Papers. + +
+
+
+
+
+ + ☆ Beyond Human-Like Processing: Large Language Models Perform Equivalently + on Forward and Backward Scientific Text + + +
+ The impressive performance of large language models (LLMs) has led to their +consideration as models of human language processing. Instead, we suggest that +the success of LLMs arises from the flexibility of the transformer learning +architecture. To evaluate this conjecture, we trained LLMs on scientific texts +that were either in a forward or backward format. Despite backward text being +inconsistent with the structure of human languages, we found that LLMs +performed equally well in either format on a neuroscience benchmark, eclipsing +human expert performance for both forward and backward orders. Our results are +consistent with the success of transformers across diverse domains, such as +weather prediction and protein design. This widespread success is attributable +to LLMs' ability to extract predictive patterns from any sufficiently +structured input. Given their generality, we suggest caution in interpreting +LLMs' success in linguistic tasks as evidence for human-like mechanisms. + +
+
+
+
+
+ + ☆ FastDraft: How to Train Your Draft NeurIPS + + +
+ Speculative Decoding has gained popularity as an effective technique for +accelerating the auto-regressive inference process of Large Language Models +(LLMs). However, Speculative Decoding entirely relies on the availability of +efficient draft models, which are often lacking for many existing language +models due to a stringent constraint of vocabulary incompatibility. In this +work we introduce FastDraft, a novel and efficient approach for pre-training +and aligning a draft model to any large language model by incorporating +efficient pre-training, followed by fine-tuning over synthetic datasets +generated by the target model. We demonstrate FastDraft by training two highly +parameter efficient drafts for the popular Phi-3-mini and Llama-3.1-8B models. +Using FastDraft, we were able to produce a draft with approximately 10 billion +tokens on a single server with 8 Intel$^\circledR$ Gaudi$^\circledR$ 2 +accelerators in under 24 hours. Our results show that the draft model achieves +impressive results in key metrics of acceptance rate, block efficiency and up +to 3x memory bound speed up when evaluated on code completion and up to 2x in +summarization, text completion and instruction tasks. We validate our +theoretical findings through benchmarking on the latest Intel$^\circledR$ +Core$^{\tiny \text{TM}}$ Ultra, achieving a wall-clock time speedup of up to +2x, indicating a significant reduction in runtime. Due to its high quality, +FastDraft unlocks large language models inference on AI-PC and other +edge-devices. + +
+
+ comment: ENLSP NeurIPS Workshop 2024 +
+
+
+
+
+ + ☆ SRA-MCTS: Self-driven Reasoning Augmentation with Monte Carlo Tree + Search for Enhanced Code Generation + + +
+ Large language models demonstrate exceptional performance in simple code +generation tasks but still face challenges in tackling complex problems. These +challenges may stem from insufficient reasoning and problem decomposition +capabilities. To address this issue, we propose a reasoning-augmented data +generation process, SRA-MCTS, which guides the model to autonomously generate +high-quality intermediate reasoning paths. This creates a positive feedback +loop, enabling continuous improvement. Our method operates entirely through the +model itself without requiring additional supervision. By synthesizing natural +language reasoning paths and translating them into executable code, the +approach ensures analytical accuracy and enhances the success rate in solving +complex tasks. Experimental results show that, even without additional +supervisory signals, our method achieves performance improvements across +different model scales, demonstrating the significant potential of +self-improvement in small models. Furthermore, the method remains robust when +traditional Chain-of-Thought (CoT) approaches exhibit performance degradation, +with notable improvements observed in diversity metrics such as pass@10. We +encourage further exploration of reasoning processes within training data to +enhance the ability of language models to address complex problems. + +
+
+
+
+
+ + ☆ BianCang: A Traditional Chinese Medicine Large Language Model + + +
+ The rise of large language models (LLMs) has driven significant progress in +medical applications, including traditional Chinese medicine (TCM). However, +current medical LLMs struggle with TCM diagnosis and syndrome differentiation +due to substantial differences between TCM and modern medical theory, and the +scarcity of specialized, high-quality corpora. This paper addresses these +challenges by proposing BianCang, a TCM-specific LLM, using a two-stage +training process that first injects domain-specific knowledge and then aligns +it through targeted stimulation. To enhance diagnostic and differentiation +capabilities, we constructed pre-training corpora, instruction-aligned datasets +based on real hospital records, and the ChP-TCM dataset derived from the +Pharmacopoeia of the People's Republic of China. We compiled extensive TCM and +medical corpora for continuous pre-training and supervised fine-tuning, +building a comprehensive dataset to refine the model's understanding of TCM. +Evaluations across 11 test sets involving 29 models and 4 tasks demonstrate the +effectiveness of BianCang, offering valuable insights for future research. +Code, datasets, and models are available at +https://github.com/QLU-NLP/BianCang. + +
+
+
+
+
+ + ☆ A Topic-aware Comparable Corpus of Chinese Variations + + +
+ This study aims to fill the gap by constructing a topic-aware comparable +corpus of Mainland Chinese Mandarin and Taiwanese Mandarin from the social +media in Mainland China and Taiwan, respectively. Using Dcard for Taiwanese +Mandarin and Sina Weibo for Mainland Chinese, we create a comparable corpus +that updates regularly and reflects modern language use on social media. + +
+
+ comment: 4 pages, 4 figures, presented at APCLC2018: ASIA-PACIFIC CORPUS + LINGUISTICS CONFERENCE 2018 +
+
+
+
+
+ + ☆ Dialectal Toxicity Detection: Evaluating LLM-as-a-Judge Consistency + Across Language Varieties + + +
+ There has been little systematic study on how dialectal differences affect +toxicity detection by modern LLMs. Furthermore, although using LLMs as +evaluators ("LLM-as-a-judge") is a growing research area, their sensitivity to +dialectal nuances is still underexplored and requires more focused attention. +In this paper, we address these gaps through a comprehensive toxicity +evaluation of LLMs across diverse dialects. We create a multi-dialect dataset +through synthetic transformations and human-assisted translations, covering 10 +language clusters and 60 varieties. We then evaluated three LLMs on their +ability to assess toxicity across multilingual, dialectal, and LLM-human +consistency. Our findings show that LLMs are sensitive in handling both +multilingual and dialectal variations. However, if we have to rank the +consistency, the weakest area is LLM-human agreement, followed by dialectal +consistency. Code repository: +\url{https://github.com/ffaisal93/dialect_toxicity_llm_judge} + +
+
+
+
+
+ + ☆ Understanding Multimodal LLMs: the Mechanistic Interpretability of Llava + in Visual Question Answering + + +
+ Understanding the mechanisms behind Large Language Models (LLMs) is crucial +for designing improved models and strategies. While recent studies have yielded +valuable insights into the mechanisms of textual LLMs, the mechanisms of +Multi-modal Large Language Models (MLLMs) remain underexplored. In this paper, +we apply mechanistic interpretability methods to analyze the visual question +answering (VQA) mechanisms in the first MLLM, Llava. We compare the mechanisms +between VQA and textual QA (TQA) in color answering tasks and find that: a) VQA +exhibits a mechanism similar to the in-context learning mechanism observed in +TQA; b) the visual features exhibit significant interpretability when +projecting the visual embeddings into the embedding space; and c) Llava +enhances the existing capabilities of the corresponding textual LLM Vicuna +during visual instruction tuning. Based on these findings, we develop an +interpretability tool to help users and researchers identify important visual +locations for final predictions, aiding in the understanding of visual +hallucination. Our method demonstrates faster and more effective results +compared to existing interpretability approaches. Code: +\url{https://github.com/zepingyu0512/llava-mechanism} + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Memory-Augmented Multimodal LLMs for Surgical VQA via Self-Contained + Inquiry + + +
+ Comprehensively understanding surgical scenes in Surgical Visual Question +Answering (Surgical VQA) requires reasoning over multiple objects. Previous +approaches address this task using cross-modal fusion strategies to enhance +reasoning ability. However, these methods often struggle with limited scene +understanding and question comprehension, and some rely on external resources +(e.g., pre-extracted object features), which can introduce errors and +generalize poorly across diverse surgical environments. To address these +challenges, we propose SCAN, a simple yet effective memory-augmented framework +that leverages Multimodal LLMs to improve surgical context comprehension via +Self-Contained Inquiry. SCAN operates autonomously, generating two types of +memory for context augmentation: Direct Memory (DM), which provides multiple +candidates (or hints) to the final answer, and Indirect Memory (IM), which +consists of self-contained question-hint pairs to capture broader scene +context. DM directly assists in answering the question, while IM enhances +understanding of the surgical scene beyond the immediate query. Reasoning over +these object-aware memories enables the model to accurately interpret images +and respond to questions. Extensive experiments on three publicly available +Surgical VQA datasets demonstrate that SCAN achieves state-of-the-art +performance, offering improved accuracy and robustness across various surgical +scenarios. + +
+
+
+
+
+ + ☆ Analyzing Pokémon and Mario Streamers' Twitch Chat with LLM-based User + Embeddings + + +
+ We present a novel digital humanities method for representing our Twitch +chatters as user embeddings created by a large language model (LLM). We cluster +these embeddings automatically using affinity propagation and further narrow +this clustering down through manual analysis. We analyze the chat of one stream +by each Twitch streamer: SmallAnt, DougDoug and PointCrow. Our findings suggest +that each streamer has their own type of chatters, however two categories +emerge for all of the streamers: supportive viewers and emoji and reaction +senders. Repetitive message spammers is a shared chatter category for two of +the streamers. + +
+
+ comment: NLP4DH 2024 +
+
+
+
+
+ + ☆ Learn from Downstream and Be Yourself in Multimodal Large Language Model + Fine-Tuning + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated strong +generalization capabilities across diverse distributions and tasks, largely due +to extensive pre-training datasets. Fine-tuning MLLM has become a common +practice to improve performance on specific downstream tasks. However, during +fine-tuning, MLLM often faces the risk of forgetting knowledge acquired during +pre-training, which can result in a decline in generalization abilities. To +balance the trade-off between generalization and specialization, we propose +measuring the parameter importance for both pre-trained and fine-tuning +distributions, based on frozen pre-trained weight magnitude and accumulated +fine-tuning gradient values. We further apply an importance-aware weight +allocation strategy, selectively updating relatively important parameters for +downstream tasks. We conduct empirical evaluations on both image captioning and +visual question-answering tasks using various MLLM architectures. The +comprehensive experimental analysis demonstrates the effectiveness of the +proposed solution, highlighting the efficiency of the crucial modules in +enhancing downstream specialization performance while mitigating generalization +degradation in MLLM Fine-Tuning. + +
+
+
+
+
+ + ☆ Inter-linguistic Phonetic Composition (IPC): A Theoretical and + Computational Approach to Enhance Second Language Pronunciation ACL + + +
+ Learners of a second language (L2) often unconsciously substitute unfamiliar +L2 phonemes with similar phonemes from their native language (L1), even though +native speakers of the L2 perceive these sounds as distinct and +non-interchangeable. This phonemic substitution leads to deviations from the +standard phonological patterns of the L2, creating challenges for learners in +acquiring accurate L2 pronunciation. To address this, we propose +Inter-linguistic Phonetic Composition (IPC), a novel computational method +designed to minimize incorrect phonological transfer by reconstructing L2 +phonemes as composite sounds derived from multiple L1 phonemes. Tests with two +automatic speech recognition models demonstrated that when L2 speakers produced +IPC-generated composite sounds, the recognition rate of target L2 phonemes +improved by 20% compared to when their pronunciation was influenced by original +phonological transfer patterns. The improvement was observed within a +relatively shorter time frame, demonstrating rapid acquisition of the composite +sound. + +
+
+ comment: 10 pages, 6 Figures, submitted to ACL ARR October 2024 for NAACL 2025 +
+
+
+
+
+ + ♻ ☆ FG-PRM: Fine-grained Hallucination Detection and Mitigation in Language + Model Mathematical Reasoning + + +
+ Hallucinations in large language models (LLMs) pose significant challenges in +tasks requiring complex multi-step reasoning, such as mathematical +problem-solving. Existing approaches primarily detect the presence of +hallucinations but lack a nuanced understanding of their types and +manifestations. In this paper, we first introduce a comprehensive taxonomy that +categorizes the common hallucinations in mathematical reasoning tasks into six +types: fabrication, factual inconsistency, context inconsistency, instruction +inconsistency, logical inconsistency, and logical error. We then propose FG-PRM +(Fine-Grained Process Reward Model), an augmented model designed to detect and +mitigate hallucinations in a fine-grained, step-level manner. To address the +limitations of manually labeling training data, we propose an automated method +for generating fine-grained hallucination data using LLMs. By injecting +hallucinations into reasoning steps of correct solutions, we create a diverse +and balanced synthetic dataset for training FG-PRM, which consists of six +specialized Process Reward Models (PRMs), each tailored to detect a specific +hallucination type. Our FG-PRM demonstrates superior performance across two key +tasks: 1) Fine-grained hallucination detection: classifying hallucination types +for each reasoning step; and 2) Verification: ranking multiple LLM-generated +outputs to select the most accurate solution, mitigating reasoning +hallucinations. Our experiments show that FG-PRM outperforms ChatGPT-3.5 and +Claude-3 on fine-grained hallucination detection and substantially boosts the +performance of LLMs on GSM8K and MATH benchmarks. + +
+
+
+
+
+ + ♻ ☆ You can remove GPT2's LayerNorm by fine-tuning NeurIPS 2024 + + +
+ The LayerNorm (LN) layer in GPT-style transformer models has long been a +hindrance to mechanistic interpretability. LN is a crucial component required +to stabilize the training of large language models, and LN or the similar +RMSNorm have been used in practically all large language models based on the +transformer architecture. The non-linear nature of the LN layers is a hindrance +for mechanistic interpretability as it hinders interpretation of the residual +stream, and makes it difficult to decompose the model into circuits. Some +researchers have gone so far as to name "reasons interpretability researchers +hate layer norm." + In this paper we show that it is possible to remove the LN layers from a +pre-trained GPT2-small model by fine-tuning on a fraction (500M tokens) of the +training data. We demonstrate that this LN-free model achieves similar +performance to the original model on the OpenWebText and ThePile datasets +(-0.05 cross-entropy loss), and the Hellaswag benchmark (-0.5% accuracy). We +provide our implementation at https://github.com/ApolloResearch/gpt2_noLN, and +fine-tuned GPT2-small models at +https://huggingface.co/apollo-research/gpt2_noLN. + Our work not only provides a simplified model for mechanistic +interpretability research, but also provides evidence that the LN layers, at +inference time, do not play a crucial role in transformer models. + +
+
+ comment: Presented at the Attributing Model Behavior at Scale (ATTRIB) and + Interpretable AI: Past, Present, and Future workshops at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Cross-Modal Contextual Congruence for Crowdfunding Success + using Knowledge-infused Learning + + +
+ The digital landscape continually evolves with multimodality, enriching the +online experience for users. Creators and marketers aim to weave subtle +contextual cues from various modalities into congruent content to engage users +with a harmonious message. This interplay of multimodal cues is often a crucial +factor in attracting users' attention. However, this richness of multimodality +presents a challenge to computational modeling, as the semantic contextual cues +spanning across modalities need to be unified to capture the true holistic +meaning of the multimodal content. This contextual meaning is critical in +attracting user engagement as it conveys the intended message of the brand or +the organization. In this work, we incorporate external commonsense knowledge +from knowledge graphs to enhance the representation of multimodal data using +compact Visual Language Models (VLMs) and predict the success of multi-modal +crowdfunding campaigns. Our results show that external knowledge commonsense +bridges the semantic gap between text and image modalities, and the enhanced +knowledge-infused representations improve the predictive performance of models +for campaign success upon the baselines without knowledge. Our findings +highlight the significance of contextual congruence in online multimodal +content for engaging and successful crowdfunding campaigns. + +
+
+ comment: Accepted at IEEE International Conference on Big Data 2024 (IEEE + BigData 2024) +
+
+
+
+
+ + ♻ ☆ Instruct-Tuning Pretrained Causal Language Models for Ancient Greek + Papyrology and Epigraphy + + +
+ This article presents an experiment in fine-tuning a pretrained causal +language model (Meta's Llama 3.1 8B Instruct) to assist with restoring missing +or illegible characters in ancient Greek inscriptions and documentary papyri. +Utilizing a straightforward instruction-based approach and a 95%/5% train/test +split, the papyrus restoration model achieved a character error rate (CER) of +14.9%, a top-1 accuracy of 73.5%, and a top-20 accuracy of 86.0% for sequences +up to 10 characters. A model was also fine-tuned for geographic attribution, +reaching a top-1 accuracy of 66.4% and a top-3 accuracy of 79.9%. In +chronological attribution, it demonstrated an average deviation of 21.7 years +from the actual terminus post/ante quem, with a median deviation of 0 years. +For inscriptions, the restoration model achieved a CER of 20.5%, a top-1 +accuracy of 63.7%, and a top-20 accuracy of 83.0% for sequences up to 10 +characters. In geographic attribution, it attained a top-1 accuracy of 75.0% +and a top-3 accuracy of 83.7%, while in dating, it had an average deviation of +37.1 years and a median deviation of 3 years from the actual date range. +Benchmarked against the state-of-the-art model (Ithaca) on a shared test set +and on recently edited inscriptions, the instruction-tuned models excelled in +text restoration, while also offering the practical advantage of ignoring +spaces during reconstruction, which aligns with the scriptio continua of +ancient textual artifacts. However, their performance in geographic and +chronological attribution was lower than Ithaca's. To evaluate the approach in +a more even setup, the instruction model was retrained with an 80%/10%/10% +train-validation-test split, and still outperformed Ithaca in text restoration. +The results suggest that fine-tuning larger pretrained causal language models +using instruction templates for emendations and conjectures to ancient texts +holds promise. + +
+
+ comment: 9 pages, 1 table. To be submitted +
+
+
+
+
+ + ♻ ☆ DocNet: Semantic Structure in Inductive Bias Detection Models + + +
+ News will have biases so long as people have opinions. It is increasingly +important for informed citizens to be able to identify bias as social media +becomes the primary entry point for news and partisan differences increase. If +people know the biases of the news they are consuming, they will be able to +take action to avoid polarizing echo chambers. In this paper, we explore an +often overlooked aspect of bias detection in documents: the semantic structure +of news articles. We present DocNet, a novel, inductive, and low-resource +document embedding and bias detection model that outperforms large language +models. We also demonstrate that the semantic structure of news articles from +opposing partisan sides, as represented in document-level graph embeddings, +have significant similarities. These results can be used to advance bias +detection in low-resource environments. Our code, data, and the corresponding +datasheet are made available at: https://anonymous.4open.science/r/DocNet/. + +
+
+
+
+
+ + ♻ ☆ ReasoningRank: Teaching Student Models to Rank through Reasoning-Based + Knowledge Distillation + + +
+ Reranking documents based on their relevance to a given query is a critical +task in information retrieval. Traditional reranking methods often lack +transparency and rely on proprietary models, hindering reproducibility and +interpretability. We propose Reason-to-Rank (R2R), a novel open-source +reranking approach that enhances transparency by generating two types of +reasoning: direct relevance reasoning, which explains how a document addresses +the query, and comparison reasoning, which justifies the relevance of one +document over another. We leverage large language models (LLMs) as teacher +models to generate these explanations and distill this knowledge into smaller, +openly available student models. Our student models are trained to generate +meaningful reasoning and rerank documents, achieving competitive performance +across multiple datasets, including MSMARCO and BRIGHT. Experiments demonstrate +that R2R not only improves reranking accuracy but also provides valuable +insights into the decision-making process. By offering a structured and +interpretable solution with openly accessible resources, R2R aims to bridge the +gap between effectiveness and transparency in information retrieval, fostering +reproducibility and further research in the field. + +
+
+
+
+
+ + ♻ ☆ Narrative-of-Thought: Improving Temporal Reasoning of Large Language + Models via Recounted Narratives EMNLP'24 + + +
+ Reasoning about time and temporal relations is an integral aspect of human +cognition, essential for perceiving the world and navigating our experiences. +Though large language models (LLMs) have demonstrated impressive performance in +many reasoning tasks, temporal reasoning remains challenging due to its +intrinsic complexity. In this work, we first study an essential task of +temporal reasoning -- temporal graph generation, to unveil LLMs' inherent, +global reasoning capabilities. We show that this task presents great challenges +even for the most powerful LLMs, such as GPT-3.5/4. We also notice a +significant performance gap by small models (<10B) that lag behind LLMs by 50%. +Next, we study how to close this gap with a budget constraint, e.g., not using +model finetuning. We propose a new prompting technique tailored for temporal +reasoning, Narrative-of-Thought (NoT), that first converts the events set to a +Python class, then prompts a small model to generate a temporally grounded +narrative, guiding the final generation of a temporal graph. Extensive +experiments showcase the efficacy of NoT in improving various metrics. Notably, +NoT attains the highest F1 on the Schema-11 evaluation set, while securing an +overall F1 on par with GPT-3.5. NoT also achieves the best structural +similarity across the board, even compared with GPT-3.5/4. Our code is +available at https://github.com/launchnlp/NoT. + +
+
+ comment: EMNLP'24 Findings +
+
+
+
+
+ + ♻ ☆ Web2Code: A Large-scale Webpage-to-Code Dataset and Evaluation Framework + for Multimodal LLMs NeurIPS 2024 + + +
+ Multimodal large language models (MLLMs) have shown impressive success across +modalities such as image, video, and audio in a variety of understanding and +generation tasks. However, current MLLMs are surprisingly poor at understanding +webpage screenshots and generating their corresponding HTML code. To address +this problem, we propose $\texttt{Web2Code}$, a benchmark consisting of a new +large-scale webpage-to-code dataset for instruction tuning and an evaluation +framework for the webpage understanding and HTML code translation abilities of +MLLMs. For dataset construction, we leverage pretrained LLMs to enhance +existing webpage-to-code datasets as well as generate a diverse pool of new +webpages rendered into images. Specifically, the inputs are webpage images and +instructions, while the responses are the webpage's HTML code. We further +include diverse natural language QA pairs about the webpage content in the +responses to enable a more comprehensive understanding of the web content. To +evaluate model performance in these tasks, we develop an evaluation framework +for testing MLLMs' abilities in webpage understanding and web-to-code +generation. Extensive experiments show that our proposed dataset is beneficial +not only to our proposed tasks but also in the general visual domain. We hope +our work will contribute to the development of general MLLMs suitable for +web-based content generation and task automation. Our data and code are +available at https://github.com/MBZUAI-LLM/web2code. + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Camera-ready Version. Website at + https://mbzuai-llm.github.io/webpage2code/ +
+
+
+
+
+ + ♻ ☆ PrExMe! Large Scale Prompt Exploration of Open Source LLMs for Machine + Translation and Summarization Evaluation EMNLP 2024 + + +
+ Large language models (LLMs) have revolutionized NLP research. Notably, +in-context learning enables their use as evaluation metrics for natural +language generation, making them particularly advantageous in low-resource +scenarios and time-restricted applications. In this work, we introduce PrExMe, +a large-scale Prompt Exploration for Metrics, where we evaluate more than 720 +prompt templates for open-source LLM-based metrics on machine translation (MT) +and summarization datasets, totalling over 6.6M evaluations. This extensive +comparison (1) benchmarks recent open-source LLMs as metrics and (2) explores +the stability and variability of different prompting strategies. We discover +that, on the one hand, there are scenarios for which prompts are stable. For +instance, some LLMs show idiosyncratic preferences and favor to grade generated +texts with textual labels while others prefer to return numeric scores. On the +other hand, the stability of prompts and model rankings can be susceptible to +seemingly innocuous changes. For example, changing the requested output format +from "0 to 100" to "-1 to +1" can strongly affect the rankings in our +evaluation. Our study contributes to understanding the impact of different +prompting approaches on LLM-based metrics for MT and summarization evaluation, +highlighting the most stable prompting patterns and potential limitations. + +
+
+ comment: EMNLP 2024 main; camera-ready +
+
+
+
+
+ + ♻ ☆ Towards Explainable Evaluation Metrics for Machine Translation + + +
+ Unlike classical lexical overlap metrics such as BLEU, most current +evaluation metrics for machine translation (for example, COMET or BERTScore) +are based on black-box large language models. They often achieve strong +correlations with human judgments, but recent research indicates that the +lower-quality classical metrics remain dominant, one of the potential reasons +being that their decision processes are more transparent. To foster more +widespread acceptance of novel high-quality metrics, explainability thus +becomes crucial. In this concept paper, we identify key properties as well as +key goals of explainable machine translation metrics and provide a +comprehensive synthesis of recent techniques, relating them to our established +goals and properties. In this context, we also discuss the latest +state-of-the-art approaches to explainable metrics based on generative models +such as ChatGPT and GPT4. Finally, we contribute a vision of next-generation +approaches, including natural language explanations. We hope that our work can +help catalyze and guide future research on explainable evaluation metrics and, +mediately, also contribute to better and more transparent machine translation +systems. + +
+
+ comment: Published at JMLR 3/24. We released an earlier preprint of this paper + under a different title (arXiv:2203.11131) +
+
+
+
+
+ + ♻ ☆ PEneo: Unifying Line Extraction, Line Grouping, and Entity Linking for + End-to-end Document Pair Extraction ACM MM 2024 + + +
+ Document pair extraction aims to identify key and value entities as well as +their relationships from visually-rich documents. Most existing methods divide +it into two separate tasks: semantic entity recognition (SER) and relation +extraction (RE). However, simply concatenating SER and RE serially can lead to +severe error propagation, and it fails to handle cases like multi-line entities +in real scenarios. To address these issues, this paper introduces a novel +framework, PEneo (Pair Extraction new decoder option), which performs document +pair extraction in a unified pipeline, incorporating three concurrent +sub-tasks: line extraction, line grouping, and entity linking. This approach +alleviates the error accumulation problem and can handle the case of multi-line +entities. Furthermore, to better evaluate the model's performance and to +facilitate future research on pair extraction, we introduce RFUND, a +re-annotated version of the commonly used FUNSD and XFUND datasets, to make +them more accurate and cover realistic situations. Experiments on various +benchmarks demonstrate PEneo's superiority over previous pipelines, boosting +the performance by a large margin (e.g., 19.89%-22.91% F1 score on RFUND-EN) +when combined with various backbones like LiLT and LayoutLMv3, showing its +effectiveness and generality. Codes and the new annotations are available at +https://github.com/ZeningLin/PEneo. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC, a highly diverse dataset of abstract reasoning tasks. We train +neural models for induction (inferring latent functions) and transduction +(directly predicting the test output for a given test input). Our models are +trained on synthetic data generated by prompting LLMs to produce Python code +specifying a function to be inferred, plus a stochastic subroutine for +generating inputs to that function. We find inductive and transductive models +solve very different problems, despite training on the same problems, and +despite sharing the same neural architecture. + +
+
+
+
+
+ + ♻ ☆ Safely Learning with Private Data: A Federated Learning Framework for + Large Language Model + + +
+ Private data, being larger and of higher quality than public data, can greatly +improve large language models (LLM). However, due to privacy concerns, this +data is often dispersed in multiple silos, making its secure utilization for +LLM training a challenge. Federated learning (FL) is an ideal solution for +training models with distributed private data, but traditional frameworks like +FedAvg are unsuitable for LLM due to their high computational demands on +clients. An alternative, split learning, offloads most training parameters to +the server while training embedding and output layers locally, making it more +suitable for LLM. Nonetheless, it faces significant challenges in security and +efficiency. Firstly, the gradients of embeddings are prone to attacks, leading +to potential reverse engineering of private data. Furthermore, the server's +limitation of handling only one client's training request at a time hinders +parallel training, severely impacting training efficiency. In this paper, we +propose a Federated Learning framework for LLM, named FL-GLM, which prevents +data leakage caused by both server-side and peer-client attacks while improving +training efficiency. Specifically, we first place the input block and output +block on the local client to prevent embedding gradient attacks from the server. +Secondly, we employ key-encryption during client-server communication to +prevent reverse engineering attacks from peer-clients. Lastly, we employ +optimization methods like client-batching or server-hierarchical, adopting +different acceleration methods based on the actual computational capabilities +of the server. Experimental results on NLU and generation tasks demonstrate +that FL-GLM achieves comparable metrics to centralized chatGLM model, +validating the effectiveness of our federated learning framework. + +
+
+
+
+
+ + ♻ ☆ FiSTECH: Financial Style Transfer to Enhance Creativity without + Hallucinations in LLMs + + +
+ Recent trends in Generative AI have emerged towards fine-tuning foundational +large language models (LLMs) to create domain-specific LLMs for automation and +chatbot-like applications. Specialized applications for analytics-heavy domains +such as Financial report generation require specific writing styles that +comprise compound and creative sentences with minimized hallucinations. In this +work, we explore the self-corrective auto-regressive qualities of LLMs to learn +creativity in writing styles with minimal prompting. We propose a novel +two-stage fine-tuning (FT) strategy wherein in the first stage public domain +financial reports are used to train for writing styles while allowing the LLM +to hallucinate. In the second stage the examples of hallucinations are manually +corrected and further used to fine-tune the LLM. The finally trained LLM learns +to generate specific financial report sections using minimal instructions and +tabular data inputs while ensuring low fine-tuning costs. Our proposed +two-stage fine-tuning boosts the accuracy of financial question answering +twofold while reducing hallucinations by over 50%. Also, the fine-tuned model +has lower perplexity, improved ROUGE, TER and BLEU scores, higher creativity +and knowledge density with lower uncertainty and cross entropy than base LLMs. +Thus, the proposed framework can be generalized to train creativity in LLMs by +first allowing them to hallucinate. + +
+
+ comment: 10 pages, 14 figures, 5 tables, conference +
+
+
+
+
+ + ♻ ☆ A Comprehensive Study of Knowledge Editing for Large Language Models + + +
+ Large Language Models (LLMs) have shown extraordinary capabilities in +understanding and generating text that closely mirrors human communication. +However, a primary limitation lies in the significant computational demands +during training, arising from their extensive parameterization. This challenge +is further intensified by the dynamic nature of the world, necessitating +frequent updates to LLMs to correct outdated information or integrate new +knowledge, thereby ensuring their continued relevance. Note that many +applications demand continual model adjustments post-training to address +deficiencies or undesirable behaviors. There is an increasing interest in +efficient, lightweight methods for on-the-fly model modifications. To this end, +recent years have seen a burgeoning in the techniques of knowledge editing for +LLMs, which aim to efficiently modify LLMs' behaviors within specific domains +while preserving overall performance across various inputs. In this paper, we +first define the knowledge editing problem and then provide a comprehensive +review of cutting-edge approaches. Drawing inspiration from educational and +cognitive research theories, we propose a unified categorization criterion that +classifies knowledge editing methods into three groups: resorting to external +knowledge, merging knowledge into the model, and editing intrinsic knowledge. +Furthermore, we introduce a new benchmark, KnowEdit, for a comprehensive +empirical evaluation of representative knowledge editing approaches. +Additionally, we provide an in-depth analysis of knowledge location, which can +give a deeper understanding of the knowledge structures inherent within LLMs. +Finally, we discuss several potential applications of knowledge editing, +outlining its broad and impactful implications. + +
+
+ comment: Ongoing work (v5): we have updated the Table 4 results after + optimizing certain methods (related to AdaLoRA) and fixing computational bugs + (related to ROME and MEMIT) in the EasyEdit. These improvements have led to + better results than before. We will continue updating this paper and welcome + everyone to discuss and exchange ideas +
+
+
+
+
+ + ♻ ☆ Fox-1 Technical Report + + +
+ We present Fox-1, a series of small language models (SLMs) consisting of +Fox-1-1.6B and Fox-1-1.6B-Instruct-v0.1. These models are pre-trained on 3 +trillion tokens of web-scraped document data and fine-tuned with 5 billion +tokens of instruction-following and multi-turn conversation data. Aiming to +improve the pre-training efficiency, Fox-1-1.6B model introduces a novel +3-stage data curriculum across all the training data with 2K-8K sequence +length. In architecture design, Fox-1 features a deeper layer structure, an +expanded vocabulary, and utilizes Grouped Query Attention (GQA), offering a +performant and efficient architecture compared to other SLMs. Fox-1 achieves +better or on-par performance in various benchmarks compared to StableLM-2-1.6B, +Gemma-2B, Qwen1.5-1.8B, and OpenELM1.1B, with competitive inference speed and +throughput. The model weights have been released under the Apache 2.0 license, +where we aim to promote the democratization of LLMs and make them fully +accessible to the whole open-source community. + +
+
+ comment: Base model is available at + https://huggingface.co/tensoropera/Fox-1-1.6B and the instruction-tuned + version is available at + https://huggingface.co/tensoropera/Fox-1-1.6B-Instruct-v0.1 +
+
+
+
+
+ + ♻ ☆ OpenOmni: A Collaborative Open Source Tool for Building Future-Ready + Multimodal Conversational Agents EMNLP 2024 + + +
+ Multimodal conversational agents are highly desirable because they offer
+natural and human-like interaction. However, there is a lack of comprehensive
+end-to-end solutions to support collaborative development and benchmarking.
+While proprietary systems like GPT-4o and Gemini demonstrate impressive
+integration of audio, video, and text with response times of 200-250ms,
+challenges remain in balancing latency, accuracy, cost, and data privacy. To
+better understand and quantify these issues, we developed OpenOmni, an
+open-source, end-to-end pipeline benchmarking tool that integrates advanced
+technologies such as Speech-to-Text, Emotion Detection, Retrieval Augmented
+Generation, Large Language Models, along with the ability to integrate
+customized models. OpenOmni supports local and cloud deployment, ensuring data
+privacy and supporting latency and accuracy benchmarking. This flexible
+framework allows researchers to customize the pipeline, focusing on real
+bottlenecks and facilitating rapid proof-of-concept development. OpenOmni can
+significantly enhance applications like indoor assistance for visually impaired
+individuals, advancing human-computer interaction. Our demonstration video is
+available at https://www.youtube.com/watch?v=zaSiT3clWqY, demo is available via
+https://openomni.ai4wa.com, code is available via
+https://github.com/AI4WA/OpenOmniFramework.
+
+
+
+ comment: Published in Proceedings of the 2024 Conference on Empirical Methods + in Natural Language Processing: System Demonstrations (EMNLP 2024) Best Demo + Paper Award at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Improving Math Problem Solving in Large Language Models Through + Categorization and Strategy Tailoring + + +
+ In this paper, we investigate how to harness large language models (LLMs) to +solve mathematical problems both quickly and accurately. Specifically, we +demonstrate the effectiveness of classifying problems into distinct categories +and applying category-specific problem-solving strategies to enhance the math +performance of LLMs. We develop a straightforward machine learning model for +problem categorization and show that its accuracy can be significantly improved +through the creation of well-designed training datasets. We believe that our +approach works by helping reduce hallucinations in LLMs, which is a critical +step toward unlocking their potential to tackle advanced mathematical problems. + +
+
+
+
+
+ + ♻ ☆ MAGNET: Improving the Multilingual Fairness of Language Models with + Adaptive Gradient-Based Tokenization + + +
+ In multilingual settings, non-Latin scripts and low-resource languages are +usually disadvantaged in terms of language models' utility, efficiency, and +cost. Specifically, previous studies have reported multiple modeling biases +that the current tokenization algorithms introduce to non-Latin script +languages, the main one being over-segmentation. In this work, we propose +MAGNET; multilingual adaptive gradient-based tokenization to reduce +over-segmentation via adaptive gradient-based subword tokenization. MAGNET +learns to predict segment boundaries between byte tokens in a sequence via +sub-modules within the model, which act as internal boundary predictors +(tokenizers). Previous gradient-based tokenization methods aimed for uniform +compression across sequences by integrating a single boundary predictor during +training and optimizing it end-to-end through stochastic reparameterization +alongside the next token prediction objective. However, this approach still +results in over-segmentation for non-Latin script languages in multilingual +settings. In contrast, MAGNET offers a customizable architecture where +byte-level sequences are routed through language-script-specific predictors, +each optimized for its respective language script. This modularity enforces +equitable segmentation granularity across different language scripts compared +to previous methods. Through extensive experiments, we demonstrate that in +addition to reducing segmentation disparities, MAGNET also enables faster +language modelling and improves downstream utility. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 14 + +
+
+
+ + ☆ Planning for Tabletop Object Rearrangement + + +
+ Finding a high-quality solution for the tabletop object rearrangement
+planning is a challenging problem. Compared to determining a goal arrangement,
+rearrangement planning is challenging due to the dependencies between objects
+and the buffer capacity available to hold objects. Although orla* has proposed
+an A* based searching strategy with lazy evaluation for the high-quality
+solution, it is not scalable, with the success rate decreasing as the number of
+objects increases. To overcome this limitation, we propose an enhanced A*-based
+algorithm that improves state representation and employs incremental goal
+attempts with lazy evaluation at each iteration. This approach aims to enhance
+scalability while maintaining solution quality. Our evaluation demonstrates
+that our algorithm can provide superior solutions compared to orla*, in a
+shorter time, for both stationary and mobile robots.
+
+
+
+
+
+
+ + ☆ MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric + Depth Estimation + + +
+ Recovering metric depth from a single image remains a fundamental challenge +in computer vision, requiring both scene understanding and accurate scaling. +While deep learning has advanced monocular depth estimation, current models +often struggle with unfamiliar scenes and layouts, particularly in zero-shot +scenarios and when predicting scale-ergodic metric depth. We present +MetricGold, a novel approach that harnesses generative diffusion model's rich +priors to improve metric depth estimation. Building upon recent advances in +MariGold, DDVM and Depth Anything V2 respectively, our method combines latent +diffusion, log-scaled metric depth representation, and synthetic data training. +MetricGold achieves efficient training on a single RTX 3090 within two days +using photo-realistic synthetic data from HyperSIM, VirtualKitti, and +TartanAir. Our experiments demonstrate robust generalization across diverse +datasets, producing sharper and higher quality metric depth estimates compared +to existing approaches. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.02145 by + other authors +
+
+
+
+
+ + ☆ Experimental study of fish-like bodies with passive tail and tunable + stiffness + + +
+ Scombrid fishes and tuna are efficient swimmers capable of maximizing +performance to escape predators and save energy during long journeys. A key +aspect in achieving these goals is the flexibility of the tail, which the fish +optimizes during swimming. Though, the robotic counterparts, although highly +efficient, have partially investigated the importance of flexibility. We have +designed and tested a fish-like robotic platform (of 30 cm in length) to +quantify performance with a tail made flexible through a torsional spring +placed at the peduncle. Body kinematics, forces, and power have been measured +and compared with real fish. The platform can vary its frequency between 1 and +3 Hz, reaching self-propulsion conditions with speed over 1 BL/s and Strouhal +number in the optimal range. We show that changing the frequency of the robot +can influence the thrust and power achieved by the fish-like robot. +Furthermore, by using appropriately tuned stiffness, the robot deforms in +accordance with the travelling wave mechanism, which has been revealed to be +the actual motion of real fish. These findings demonstrate the potential of +tuning the stiffness in fish swimming and offer a basis for investigating +fish-like flexibility in bio-inspired underwater vehicles. + +
+
+ comment: Conference Paper submitted to the 15th International Conference on + Hydrodynamics (ICHD 2024) +
+
+
+
+
+ + ☆ DGS-SLAM: Gaussian Splatting SLAM in Dynamic Environment + + +
+ We introduce Dynamic Gaussian Splatting SLAM (DGS-SLAM), the first dynamic +SLAM framework built on the foundation of Gaussian Splatting. While recent +advancements in dense SLAM have leveraged Gaussian Splatting to enhance scene +representation, most approaches assume a static environment, making them +vulnerable to photometric and geometric inconsistencies caused by dynamic +objects. To address these challenges, we integrate Gaussian Splatting SLAM with +a robust filtering process to handle dynamic objects throughout the entire +pipeline, including Gaussian insertion and keyframe selection. Within this +framework, to further improve the accuracy of dynamic object removal, we +introduce a robust mask generation method that enforces photometric consistency +across keyframes, reducing noise from inaccurate segmentation and artifacts +such as shadows. Additionally, we propose the loop-aware window selection +mechanism, which utilizes unique keyframe IDs of 3D Gaussians to detect loops +between the current and past frames, facilitating joint optimization of the +current camera poses and the Gaussian map. DGS-SLAM achieves state-of-the-art +performance in both camera tracking and novel view synthesis on various dynamic +SLAM benchmarks, proving its effectiveness in handling real-world dynamic +scenes. + +
+
+ comment: Preprint, Under review +
+
+
+
+
+ + ☆ Hierarchical Adaptive Motion Planning with Nonlinear Model Predictive + Control for Safety-Critical Collaborative Loco-Manipulation + + +
+ As legged robots take on roles in industrial and autonomous construction,
+collaborative loco-manipulation is crucial for handling large and heavy objects
+that exceed the capabilities of a single robot. However, ensuring the safety of
+these multi-robot tasks is essential to prevent accidents and guarantee
+reliable operation. This paper presents a hierarchical control system for
+object manipulation using a team of quadrupedal robots. The combination of the
+motion planner and the decentralized locomotion controller in a hierarchical
+structure enables safe, adaptive planning for teams in complex scenarios. A
+high-level nonlinear model predictive control planner generates collision-free
+paths by incorporating control barrier functions, accounting for static and
+dynamic obstacles. This process involves calculating contact points and forces
+while adapting to unknown objects and terrain properties. The decentralized
+loco-manipulation controller then ensures each robot maintains stable
+locomotion and manipulation based on the planner's guidance. The effectiveness
+of our method is carefully examined in simulations under various conditions and
+validated in real-life setups with robot hardware. By modifying the object's
+configuration, the robot team can maneuver unknown objects through an
+environment containing both static and dynamic obstacles. We have made our code
+publicly available in an open-source repository at
+https://github.com/DRCL-USC/collaborative_loco_manipulation.
+
+
+
+
+
+
+ + ♻ ☆ Comparison of Middlewares in Edge-to-Edge and Edge-to-Cloud + Communication for Distributed ROS2 Systems + + +
+ The increased data transmission and number of devices involved in
+communications among distributed systems make it challenging yet significantly
+necessary to have an efficient and reliable networking middleware. In robotics
+and autonomous systems, the wide application of ROS 2 brings the possibility
+of utilizing various networking middlewares together with DDS in ROS 2 for
+better communication among edge devices or between edge devices and the cloud.
+However, there is a lack of comprehensive communication performance comparison
+of integrating these networking middlewares with ROS 2. In this study, we
+provide a quantitative analysis for the communication performance of utilized
+networking middlewares including MQTT and Zenoh alongside DDS in ROS 2 among a
+multiple host system. For a complete and reliable comparison, we calculate the
+latency and throughput of these middlewares by sending distinct amounts and
+types of data through different network setups including Ethernet, Wi-Fi, and
+4G. To further extend the evaluation to real-world application scenarios, we
+assess the drift error (the position changes) over time caused by these
+networking middlewares with the robot moving in an identical square-shaped
+path. Our results show that CycloneDDS performs better under Ethernet while
+Zenoh performs better under Wi-Fi and 4G. In the actual robot test, the robot
+moving trajectory drift error over time (96 s) via Zenoh is the smallest. It
+is worth noting we have a discussion of the CPU utilization of these networking
+middlewares and the performance impact caused by enabling the security feature
+in ROS 2 at the end of the paper.
+
+
+
+ comment: Accepted by the Journal of Intelligent & Robotic Systems +
+
+
+
+
+ + ♻ ☆ A SysML-based language for evaluating digital twin software reusability + in cyber-physical system structure + + +
+ Evaluating early design concepts is crucial as it impacts quality and cost. +This process is often hindered by vague and uncertain design information. This +article introduces the SysML-based Simulated-Physical Systems Modeling Language +(SPSysML). It is a Domain-Specification Language for evaluating component +reusability in Cyber-Physical Systems incorporating Digital Twins and other +simulated parts. The proposed factors assess the design quantitatively. SPSysML +uses a requirement-based system structuring method to couple simulated and +physical parts with requirements. SPSysML enables DTs to perceive exogenous +actions in the simulated world. + SPSysML validation is survey- and application-based. First, we develop a +robotic system for an assisted living project. As a result of the SPSysML +application, we observed an integrity improvement between the simulated and +physical parts of the system. Thus, more system components are shared between +the simulated and physical setups. The system was deployed on the physical +robot and two simulators based on ROS and ROS2. Additionally, we share a +questionnaire for SPSysML assessment. The feedback that we already received is +published in this article. + +
+
+ comment: This work has been submitted to the Elsevier Robotics and Autonomous + Systems Journal +
+
+
+
+
+ + ♻ ☆ Psycho Gundam: Electroencephalography based real-time robotic control + system with deep learning + + +
+ The Psycho Frame, a sophisticated system primarily used in Universal Century
+(U.C.) series mobile suits for NEWTYPE pilots, has evolved as an integral
+component in harnessing the latent potential of mental energy. Its ability to
+amplify and resonate with the pilot's psyche enables real-time mental control,
+creating unique applications such as psychomagnetic fields and sensory-based
+weaponry. This paper presents the development of a novel robotic control system
+inspired by the Psycho Frame, combining electroencephalography (EEG) and deep
+learning for real-time control of robotic systems. By capturing and
+interpreting brainwave data through EEG, the system extends human cognitive
+commands to robotic actions, reflecting the seamless synchronization of thought
+and machine, much like the Psycho Frame's integration with a Newtype pilot's
+mental faculties. This research demonstrates how modern AI techniques can
+expand the limits of human-machine interaction, potentially transcending
+traditional input methods and enabling a deeper, more intuitive control of
+complex robotic systems.
+
+
+
+
+
+
+ + ♻ ☆ Towards Physically-Realizable Adversarial Attacks in Embodied Vision + Navigation ICRA + + +
+ The deployment of embodied navigation agents in safety-critical environments +raises concerns about their vulnerability to adversarial attacks on deep neural +networks. However, current attack methods often lack practicality due to +challenges in transitioning from the digital to the physical world, while +existing physical attacks for object detection fail to achieve both multi-view +effectiveness and naturalness. To address this, we propose a practical attack +method for embodied navigation by attaching adversarial patches with learnable +textures and opacity to objects. Specifically, to ensure effectiveness across +varying viewpoints, we employ a multi-view optimization strategy based on +object-aware sampling, which uses feedback from the navigation model to +optimize the patch's texture. To make the patch inconspicuous to human +observers, we introduce a two-stage opacity optimization mechanism, where +opacity is refined after texture optimization. Experimental results show our +adversarial patches reduce navigation success rates by about 40%, outperforming +previous methods in practicality, effectiveness, and naturalness. Code is +available at: +[https://github.com/chen37058/Physical-Attacks-in-Embodied-Navigation]. + +
+
+ comment: 8 pages, 6 figures, submitted to the 2025 IEEE International + Conference on Robotics & Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence + Modeling + + +
+ In the endeavor to make autonomous robots take actions, task planning is a +major challenge that requires translating high-level task descriptions into +long-horizon action sequences. Despite recent advances in language model +agents, they remain prone to planning errors and limited in their ability to +plan ahead. To address these limitations in robotic planning, we advocate a +self-refining scheme that iteratively refines a draft plan until an equilibrium +is reached. Remarkably, this process can be optimized end-to-end from an +analytical perspective without the need to curate additional verifiers or +reward models, allowing us to train self-refining planners in a simple +supervised learning fashion. Meanwhile, a nested equilibrium sequence modeling +procedure is devised for efficient closed-loop planning that incorporates +useful feedback from the environment (or an internal world model). Our method +is evaluated on the VirtualHome-Env benchmark, showing advanced performance +with better scaling for inference computation. Code is available at +https://github.com/Singularity0104/equilibrium-planner. + +
+
+
+
+
+ + ♻ ☆ Software-Hardware Co-Design For Embodied AI Robots + + +
+ Embodied AI robots have the potential to fundamentally improve the way human +beings live and manufacture. Continued progress in the burgeoning field of +using large language models to control robots depends critically on an +efficient computing substrate. In particular, today's computing systems for +embodied AI robots are designed purely based on the interest of algorithm +developers, where robot actions are divided into a discrete frame-basis. Such +an execution pipeline creates high latency and energy consumption. This paper +proposes Corki, an algorithm-architecture co-design framework for real-time +embodied AI robot control. Our idea is to decouple LLM inference, robotic +control and data communication in the embodied AI robots compute pipeline. +Instead of predicting action for one single frame, Corki predicts the +trajectory for the near future to reduce the frequency of LLM inference. The +algorithm is coupled with a hardware that accelerates transforming trajectory +into actual torque signals used to control robots and an execution pipeline +that parallels data communication with computation. Corki largely reduces LLM +inference frequency by up to 8.0x, resulting in up to 3.6x speed up. The +success rate improvement can be up to 17.3%. Code is provided for +re-implementation. https://github.com/hyy0613/Corki + +
+
+
+
+
+ + ♻ ☆ AIC MLLM: Autonomous Interactive Correction MLLM for Robust Robotic + Manipulation + + +
+ The ability to reflect on and correct failures is crucial for robotic systems
+to interact stably with real-life objects. Observing the generalization and
+reasoning capabilities of Multimodal Large Language Models (MLLMs), previous
+approaches have aimed to utilize these models to enhance robotic systems
+accordingly. However, these methods typically focus on high-level planning
+corrections using an additional MLLM, with limited utilization of failed
+samples to correct low-level contact poses which is particularly prone to occur
+during articulated object manipulation. To address this gap, we propose an
+Autonomous Interactive Correction (AIC) MLLM, which makes use of previous
+low-level interaction experiences to correct SE(3) pose predictions for
+articulated objects. Specifically, AIC MLLM is initially fine-tuned to acquire
+both pose prediction and feedback prompt comprehension abilities. We design two
+types of prompt instructions for interactions with objects: 1) visual masks to
+highlight unmovable parts for position correction, and 2) textual descriptions
+to indicate potential directions for rotation correction. During inference, a
+Feedback Information Extraction module is introduced to recognize the failure
+cause, allowing AIC MLLM to adaptively correct the pose prediction using the
+corresponding prompts. To further enhance manipulation stability, we devise a
+Test Time Adaptation strategy that enables AIC MLLM to better adapt to the
+current scene configuration. Finally, extensive experiments are conducted in
+both simulated and real-world environments to evaluate the proposed method. The
+results demonstrate that our AIC MLLM can efficiently correct failure samples
+by leveraging interaction experience prompts. Our project website is
+https://sites.google.com/view/aic-mllm.
+
+
+
+
+
+
+ + ♻ ☆ A Simple Multi-agent Joint Prediction Method for Autonomous Driving + + +
+ Predicting future motions of road participants is an important task for +driving autonomously. Most existing models excel at predicting the marginal +trajectory of a single agent, but predicting joint trajectories for multiple +agents that are consistent within a scene remains a challenge. Previous +research has often focused on marginal predictions, but the importance of joint +predictions has become increasingly apparent. Joint prediction aims to generate +trajectories that are consistent across the entire scene. Our research builds +upon the SIMPL baseline to explore methods for generating scene-consistent +trajectories. We tested our algorithm on the Argoverse 2 dataset, and +experimental results demonstrate that our approach can generate +scene-consistent trajectories. Compared to the SIMPL baseline, our method +significantly reduces the collision rate of joint trajectories within the +scene. + +
+
+
+
+
+ + ♻ ☆ RINO: Accurate, Robust Radar-Inertial Odometry with Non-Iterative + Estimation + + +
+ Precise localization and mapping are critical for achieving autonomous +navigation in self-driving vehicles. However, ego-motion estimation still faces +significant challenges, particularly when GNSS failures occur or under extreme +weather conditions (e.g., fog, rain, and snow). In recent years, scanning radar +has emerged as an effective solution due to its strong penetration +capabilities. Nevertheless, scanning radar data inherently contains high levels +of noise, necessitating hundreds to thousands of iterations of optimization to +estimate a reliable transformation from the noisy data. Such iterative solving +is time-consuming, unstable, and prone to failure. To address these challenges, +we propose an accurate and robust Radar-Inertial Odometry system, RINO, which +employs a non-iterative solving approach. Our method decouples rotation and +translation estimation and applies an adaptive voting scheme for 2D rotation +estimation, enhancing efficiency while ensuring consistent solving time. +Additionally, the approach implements a loosely coupled system between the +scanning radar and an inertial measurement unit (IMU), leveraging Error-State +Kalman Filtering (ESKF). Notably, we successfully estimated the uncertainty of +the pose estimation from the scanning radar, incorporating this into the +filter's Maximum A Posteriori estimation, a consideration that has been +previously overlooked. Validation on publicly available datasets demonstrates +that RINO outperforms state-of-the-art methods and baselines in both accuracy +and robustness. Our code is available at https://github.com/yangsc4063/rino. + +
+
+
+
+
+
+
+
+ + Systems and Control 10 + +
+
+
+ + ☆ Adaptive Soft Actor-Critic Framework for RIS-Assisted and UAV-Aided + Communication + + +
+ In this work, we explore UAV-assisted reconfigurable intelligent surface +(RIS) technology to enhance downlink communications in wireless networks. By +integrating RIS on both UAVs and ground infrastructure, we aim to boost network +coverage, fairness, and resilience against challenges such as UAV jitter. To +maximize the minimum achievable user rate, we formulate a joint optimization +problem involving beamforming, phase shifts, and UAV trajectory. To address +this problem, we propose an adaptive soft actor-critic (ASAC) framework. In +this approach, agents are built using adaptive sparse transformers with +attentive feature refinement (ASTAFER), enabling dynamic feature processing +that adapts to real-time network conditions. The ASAC model learns optimal +solutions to the coupled subproblems in real time, delivering an end-to-end +solution without relying on iterative or relaxation-based methods. Simulation +results demonstrate that our ASAC-based approach achieves better performance +compared to the conventional SAC. This makes it a robust, adaptable solution +for real-time, fair, and efficient downlink communication in UAV-RIS networks. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Molecular Dynamics Study of Liquid Condensation on Nano-structured + Sinusoidal Hybrid Wetting Surfaces + + +
+ Although real surfaces exhibit intricate topologies at the nanoscale, rough +surface consideration is often overlooked in nanoscale heat transfer studies. +Superimposed sinusoidal functions effectively model the complexity of these +surfaces. This study investigates the impact of sinusoidal roughness on liquid +argon condensation over a functional gradient wetting (FGW) surface with 84% +hydrophilic content using molecular dynamics simulations. Argon atoms are +confined between two platinum substrates: a flat lower substrate heated to 130K +and a rough upper substrate at 90K. Key metrics of the nanoscale condensation +process, such as nucleation, surface heat flux, and total energy per atom, are +analyzed. Rough surfaces significantly enhance nucleation, nearly doubling +cluster counts compared to smooth surfaces and achieving a more extended atomic +density profile with a peak of approximately and improved heat flux. Stronger +atom-surface interactions also lead to more efficient energy dissipation. These +findings underscore the importance of surface roughness in optimizing +condensation and heat transfer, offering a more accurate representation of +surface textures and a basis for designing surfaces that achieve superior heat +transfer performance. + +
+
+ comment: 9 pages, 7 figures, conference +
+
+
+
+
+ + ☆ Existence of $ε$-Nash Equilibria in Nonzero-Sum Borel Stochastic + Games and Equilibria of Quantized Models + + +
+ Establishing the existence of exact or near Markov or stationary perfect Nash +equilibria in nonzero-sum Markov games over Borel spaces remains a challenging +problem, with few positive results to date. In this paper, we establish the +existence of approximate Markov and stationary Nash equilibria for nonzero-sum +stochastic games over Borel spaces, assuming only mild regularity conditions on +the model. Our approach involves analyzing a quantized version of the game, for +which we provide an explicit construction under both finite-horizon and +discounted cost criteria. This work has significant implications for emerging +applications such as multi-agent learning. Our results apply to both compact +and non-compact state spaces. For the compact state space case, we first +approximate the standard Borel model with a finite state-action model. Using +the existence of Markov and stationary perfect Nash equilibria for these finite +models under finite-horizon and discounted cost criteria, we demonstrate that +these joint policies constitute approximate Markov and stationary perfect +equilibria under mild continuity conditions on the one-stage costs and +transition probabilities. For the non-compact state space case, we achieve +similar results by first approximating the model with a compact-state model. +Compared with previous results in the literature, which we comprehensively +review, we provide more general and complementary conditions, along with +explicit approximation models whose equilibria are $\epsilon$-equilibria for +the original model. + +
+
+
+
+
+ + ☆ Demonstrating Remote Synchronization: An Experimental Approach with + Nonlinear Oscillators + + +
+ This study investigates remote synchronization in arbitrary network clusters +of coupled nonlinear oscillators, a phenomenon inspired by neural +synchronization in the brain. Employing a multi-faceted approach encompassing +analytical, numerical, and experimental methodologies, we leverage the Master +Stability Function (MSF) to analyze network stability. We provide experimental +evidence of remote synchronization between two clusters of nonlinear +oscillators, where oscillators within each cluster are also remotely connected. +This observation parallels the thalamus-mediated synchronization of neuronal +populations in the brain. An electronic circuit testbed, supported by nonlinear +ODE modeling and LT Spice simulation, was developed to validate our theoretical +predictions. Future work will extend this investigation to encompass diverse +network topologies and explore potential applications in neuroscience, +communication networks, and power systems. + +
+
+
+
+
+ + ☆ A Wearable Gait Monitoring System for 17 Gait Parameters Based on + Computer Vision + + +
+ We developed a shoe-mounted gait monitoring system capable of tracking up to +17 gait parameters, including gait length, step time, stride velocity, and +others. The system employs a stereo camera mounted on one shoe to track a +marker placed on the opposite shoe, enabling the estimation of spatial gait +parameters. Additionally, a Force Sensitive Resistor (FSR) affixed to the heel +of the shoe, combined with a custom-designed algorithm, is utilized to measure +temporal gait parameters. Through testing on multiple participants and +comparison with the gait mat, the proposed gait monitoring system exhibited +notable performance, with the accuracy of all measured gait parameters +exceeding 93.61%. The system also demonstrated a low drift of 4.89% during +long-distance walking. A gait identification task conducted on participants +using a trained Transformer model achieved 95.7% accuracy on the dataset +collected by the proposed system, demonstrating that our hardware has the +potential to collect long-sequence gait data suitable for integration with +current Large Language Models (LLMs). The system is cost-effective, +user-friendly, and well-suited for real-life measurements. + +
+
+ comment: 13 pages, 14 figures. This paper was submitted for publication to the + IEEE Transactions on Instrumentation and Measurement +
+
+
+
+
+ + ☆ Self-Triggered Control in Artificial Pancreas + + +
+ The management of type 1 diabetes has been revolutionized by the artificial +pancreas system (APS), which automates insulin delivery based on a continuous +glucose monitor (CGM). Conventional closed-loop systems rely on CGM data, +which leads to higher energy consumption at the sensors and increased data +redundancy in the underlying communication network. In contrast, this paper +proposes a self-triggered control mechanism that can potentially achieve lower +latency and energy efficiency. The model for the APS consists of a state and +input-constrained dynamical system affected by exogenous meal disturbances. Our +self-triggered mechanism relies on restricting the state evolution within the +robust control invariant of such a system at all times. To that end, using +tools from reachability, we associate a safe time interval with such invariant +sets, which denotes the maximum time for which the invariant set remains +invariant, even without transmission of CGM data at all times. + +
+
+
+
+
+ + ☆ Wireless Resource Allocation with Collaborative Distributed and + Centralized DRL under Control Channel Attacks + + +
+ In this paper, we consider a wireless resource allocation problem in a +cyber-physical system (CPS) where the control channel, carrying resource +allocation commands, is subjected to denial-of-service (DoS) attacks. We +propose a novel concept of collaborative distributed and centralized (CDC) +resource allocation to effectively mitigate the impact of these attacks. To +optimize the CDC resource allocation policy, we develop a new CDC-deep +reinforcement learning (DRL) algorithm, whereas existing DRL frameworks only +formulate either centralized or distributed decision-making problems. +Simulation results demonstrate that the CDC-DRL algorithm significantly +outperforms state-of-the-art DRL benchmarks, showcasing its ability to address +resource allocation problems in large-scale CPSs under control channel attacks. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Game-Theoretic Neyman-Pearson Detection to Combat Strategic Evasion + + +
+ The security in networked systems depends greatly on recognizing and +identifying adversarial behaviors. Traditional detection methods focus on +specific categories of attacks and have become inadequate for increasingly +stealthy and deceptive attacks that are designed to bypass detection +strategically. This work aims to develop a holistic theory to countermeasure +such evasive attacks. We focus on extending a fundamental class of +statistical-based detection methods based on Neyman-Pearson's (NP) hypothesis +testing formulation. We propose game-theoretic frameworks to capture the +conflicting relationship between a strategic evasive attacker and an +evasion-aware NP detector. By analyzing both the equilibrium behaviors of the +attacker and the NP detector, we characterize their performance using +Equilibrium Receiver-Operational-Characteristic (EROC) curves. We show that the +evasion-aware NP detectors outperform the passive ones in the way that the +former can act strategically against the attacker's behavior and adaptively +modify their decision rules based on the received messages. In addition, we +extend our framework to a sequential setting where the user sends out +identically distributed messages. We corroborate the analytical results with a +case study of anomaly detection. + +
+
+
+
+
+ + ♻ ☆ A SysML-based language for evaluating digital twin software reusability + in cyber-physical system structure + + +
+ Evaluating early design concepts is crucial as it impacts quality and cost. +This process is often hindered by vague and uncertain design information. This +article introduces the SysML-based Simulated-Physical Systems Modeling Language +(SPSysML). It is a Domain-Specification Language for evaluating component +reusability in Cyber-Physical Systems incorporating Digital Twins and other +simulated parts. The proposed factors assess the design quantitatively. SPSysML +uses a requirement-based system structuring method to couple simulated and +physical parts with requirements. SPSysML enables DTs to perceive exogenous +actions in the simulated world. + SPSysML validation is survey- and application-based. First, we develop a +robotic system for an assisted living project. As a result of the SPSysML +application, we observed an integrity improvement between the simulated and +physical parts of the system. Thus, more system components are shared between +the simulated and physical setups. The system was deployed on the physical +robot and two simulators based on ROS and ROS2. Additionally, we share a +questionnaire for SPSysML assessment. The feedback that we already received is +published in this article. + +
+
+ comment: This work has been submitted to the Elsevier Robotics and Autonomous + Systems Journal +
+
+
+
+
+ + ♻ ☆ SupplyGraph: A Benchmark Dataset for Supply Chain Planning using Graph + Neural Networks AAAI 2024 + + +
+ Graph Neural Networks (GNNs) have gained traction across different domains +such as transportation, bio-informatics, language processing, and computer +vision. However, there is a noticeable absence of research on applying GNNs to +supply chain networks. Supply chain networks are inherently graph-like in +structure, making them prime candidates for applying GNN methodologies. This +opens up a world of possibilities for optimizing, predicting, and solving even +the most complex supply chain problems. A major setback in this approach lies +in the absence of real-world benchmark datasets to facilitate the research and +resolution of supply chain problems using GNNs. To address the issue, we +present a real-world benchmark dataset for temporal tasks, obtained from one of +the leading FMCG companies in Bangladesh, focusing on supply chain planning for +production purposes. The dataset includes temporal data as node features to +enable sales predictions, production planning, and the identification of +factory issues. By utilizing this dataset, researchers can employ GNNs to +address numerous supply chain problems, thereby advancing the field of supply +chain analytics and planning. Source: https://github.com/CIOL-SUST/SupplyGraph + +
+
+ comment: Accepted to 4th workshop on Graphs and more Complex structures for + Learning and Reasoning, colocated with AAAI 2024. Extended journal version + with experiments is available here: arXiv:2411.08550 +
+
+
+
+
+
+
+
+ + Computation and Language 38 + +
+
+
+ + ☆ Bias in Large Language Models: Origin, Evaluation, and Mitigation + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +but their susceptibility to biases poses significant challenges. This +comprehensive review examines the landscape of bias in LLMs, from its origins +to current mitigation strategies. We categorize biases as intrinsic and +extrinsic, analyzing their manifestations in various NLP tasks. The review +critically assesses a range of bias evaluation methods, including data-level, +model-level, and output-level approaches, providing researchers with a robust +toolkit for bias detection. We further explore mitigation strategies, +categorizing them into pre-model, intra-model, and post-model techniques, +highlighting their effectiveness and limitations. Ethical and legal +implications of biased LLMs are discussed, emphasizing potential harms in +real-world applications such as healthcare and criminal justice. By +synthesizing current knowledge on bias in LLMs, this review contributes to the +ongoing effort to develop fair and responsible AI systems. Our work serves as a +comprehensive resource for researchers and practitioners working towards +understanding, evaluating, and mitigating bias in LLMs, fostering the +development of more equitable AI technologies. + +
+
+
+
+
+ + ☆ BPO: Towards Balanced Preference Optimization between Knowledge Breadth + and Depth in Alignment + + +
+ Reinforcement Learning with Human Feedback (RLHF) is the key to the success +of large language models (LLMs) in recent years. In this work, we first +introduce the concepts of knowledge breadth and knowledge depth, which measure +the comprehensiveness and depth of an LLM or knowledge source respectively. We +reveal that the imbalance in the number of prompts and responses can lead to a +potential disparity in breadth and depth learning within alignment tuning +datasets by showing that even a simple uniform method for balancing the number +of instructions and responses can lead to significant improvements. Building on +this, we further propose Balanced Preference Optimization (BPO), designed to +dynamically augment the knowledge depth of each sample. BPO is motivated by the +observation that the usefulness of knowledge varies across samples, +necessitating tailored learning of knowledge depth. To achieve this, we +introduce gradient-based clustering, estimating the knowledge informativeness +and usefulness of each augmented sample based on the model's optimization +direction. Our experimental results across various benchmarks demonstrate that +BPO outperforms other baseline methods in alignment tuning while maintaining +training efficiency. Furthermore, we conduct a detailed analysis of each +component of BPO, providing guidelines for future research in preference data +optimization. + +
+
+
+
+
+ + ☆ SPICA: Retrieving Scenarios for Pluralistic In-Context Alignment + + +
+ Alignment of large language models (LLMs) to societal values should account +for pluralistic values from diverse groups. One technique uses in-context +learning for inference-time alignment, but only considers similarity when +drawing few-shot examples, not accounting for cross-group differences in value +prioritization. We propose SPICA, a framework for pluralistic alignment that +accounts for group-level differences during in-context example retrieval. SPICA +introduces three designs to facilitate pluralistic alignment: scenario banks, +group-informed metrics, and in-context alignment prompts. From an evaluation of +SPICA on an alignment task collecting inputs from four demographic groups ($n = +544$), our metrics retrieve in-context examples that more closely match +observed preferences, with the best prompt configuration using multiple +contrastive responses to demonstrate examples. In an end-to-end evaluation ($n += 80$), we observe that SPICA-aligned models are higher rated than a baseline +similarity-only retrieval approach, with groups seeing up to a +0.16 point +improvement on a 5 point scale. Additionally, gains from SPICA were more +uniform, with all groups benefiting from alignment rather than only some. +Finally, we find that while a group-agnostic approach can effectively align to +aggregated values, it is not most suited for aligning to divergent groups. + +
+
+
+
+
+ + ☆ BanglaDialecto: An End-to-End AI-Powered Regional Speech Standardization + + +
+ This study focuses on recognizing Bangladeshi dialects and converting diverse +Bengali accents into standardized formal Bengali speech. Dialects, often +referred to as regional languages, are distinctive variations of a language +spoken in a particular location and are identified by their phonetics, +pronunciations, and lexicon. Subtle changes in pronunciation and intonation are +also influenced by geographic location, educational attainment, and +socioeconomic status. Dialect standardization is needed to ensure effective +communication, educational consistency, access to technology, economic +opportunities, and the preservation of linguistic resources while respecting +cultural diversity. Being the fifth most spoken language with around 55 +distinct dialects spoken by 160 million people, addressing Bangla dialects is +crucial for developing inclusive communication tools. However, limited research +exists due to a lack of comprehensive datasets and the challenges of handling +diverse dialects. With the advancement in multilingual Large Language Models +(mLLMs), emerging possibilities have been created to address the challenges of +dialectal Automated Speech Recognition (ASR) and Machine Translation (MT). This +study presents an end-to-end pipeline for converting dialectal Noakhali speech +to standard Bangla speech. This investigation includes constructing a +large-scale diverse dataset with dialectal speech signals that tailored the +fine-tuning process in ASR and LLM for transcribing the dialect speech to +dialect text and translating the dialect text to standard Bangla text. Our +experiments demonstrated that fine-tuning the Whisper ASR model achieved a CER +of 0.8% and WER of 1.5%, while the BanglaT5 model attained a BLEU score of +41.6% for dialect-to-standard text translation. + +
+
+ comment: Accepted in 2024 IEEE International Conference on Big Data (IEEE + BigData) +
+
+
+
+
+ + ☆ Empowering Meta-Analysis: Leveraging Large Language Models for + Scientific Synthesis + + +
+ This study investigates the automation of meta-analysis in scientific +documents using large language models (LLMs). Meta-analysis is a robust +statistical method that synthesizes the findings of multiple studies support +articles to provide a comprehensive understanding. We know that a meta-article +provides a structured analysis of several articles. However, conducting +meta-analysis by hand is labor-intensive, time-consuming, and susceptible to +human error, highlighting the need for automated pipelines to streamline the +process. Our research introduces a novel approach that fine-tunes the LLM on +extensive scientific datasets to address challenges in big data handling and +structured data extraction. We automate and optimize the meta-analysis process +by integrating Retrieval Augmented Generation (RAG). Tailored through prompt +engineering and a new loss metric, Inverse Cosine Distance (ICD), designed for +fine-tuning on large contextual datasets, LLMs efficiently generate structured +meta-analysis content. Human evaluation then assesses relevance and provides +information on model performance in key metrics. This research demonstrates +that fine-tuned models outperform non-fine-tuned models, with fine-tuned LLMs +generating 87.6% relevant meta-analysis abstracts. The relevance of the +context, based on human evaluation, shows a reduction in irrelevancy from 4.56% +to 1.9%. These experiments were conducted in a low-resource environment, +highlighting the study's contribution to enhancing the efficiency and +reliability of meta-analysis automation. + +
+
+ comment: Accepted in 2024 IEEE International Conference on Big Data (IEEE + BigData) +
+
+
+
+
+ + ☆ Large Language Models (LLMs) as Traffic Control Systems at Urban + Intersections: A New Paradigm + + +
+ This study introduces a novel approach for traffic control systems by using +Large Language Models (LLMs) as traffic controllers. The study utilizes their +logical reasoning, scene understanding, and decision-making capabilities to +optimize throughput and provide feedback based on traffic conditions in +real-time. LLMs centralize traditionally disconnected traffic control processes +and can integrate traffic data from diverse sources to provide context-aware +decisions. LLMs can also deliver tailored outputs using various means such as +wireless signals and visuals to drivers, infrastructures, and autonomous +vehicles. To evaluate LLMs' ability as traffic controllers, this study proposed +a four-stage methodology. The methodology includes data creation and +environment initialization, prompt engineering, conflict identification, and +fine-tuning. We simulated multi-lane four-leg intersection scenarios and +generated detailed datasets to enable conflict detection using LLMs and Python +simulation as a ground truth. We used chain-of-thought prompts to lead LLMs in +understanding the context, detecting conflicts, resolving them using traffic +rules, and delivering context-sensitive traffic management solutions. We +evaluated the performance of GPT-mini, Gemini, and Llama as traffic controllers. +Results showed that the fine-tuned GPT-mini achieved 83% accuracy and an +F1-score of 0.84. The GPT-mini model exhibited a promising performance in +generating actionable traffic management insights, with high ROUGE-L scores +across conflict identification of 0.95, decision-making of 0.91, priority +assignment of 0.94, and waiting time optimization of 0.92. We demonstrated that +LLMs can offer precise recommendations to drivers in real-time including +yielding, slowing, or stopping based on vehicle dynamics. + +
+
+ comment: The data and code that support the findings of this study are openly + available in Zenodo at https://doi.org/10.5281/zenodo.14171745, reference + number 14171745 +
+
+
+
+
+ + ☆ Large Vision-Language Models for Remote Sensing Visual Question + Answering + + +
+ Remote Sensing Visual Question Answering (RSVQA) is a challenging task that +involves interpreting complex satellite imagery to answer natural language +questions. Traditional approaches often rely on separate visual feature +extractors and language processing models, which can be computationally +intensive and limited in their ability to handle open-ended questions. In this +paper, we propose a novel method that leverages a generative Large +Vision-Language Model (LVLM) to streamline the RSVQA process. Our approach +consists of a two-step training strategy: domain-adaptive pretraining and +prompt-based finetuning. This method enables the LVLM to generate natural +language answers by conditioning on both visual and textual inputs, without the +need for predefined answer categories. We evaluate our model on the RSVQAxBEN +dataset, demonstrating superior performance compared to state-of-the-art +baselines. Additionally, a human evaluation study shows that our method +produces answers that are more accurate, relevant, and fluent. The results +highlight the potential of generative LVLMs in advancing the field of remote +sensing analysis. + +
+
+
+
+
+ + ☆ Bilingual Text-dependent Speaker Verification with Pre-trained Models + for TdSV Challenge 2024 + + +
+ This paper presents our submissions to the Iranian division of the +Text-dependent Speaker Verification Challenge (TdSV) 2024. TdSV aims to +determine if a specific phrase was spoken by a target speaker. We developed two +independent subsystems based on pre-trained models: For phrase verification, a +phrase classifier rejected incorrect phrases, while for speaker verification, a +pre-trained ResNet293 with domain adaptation extracted speaker embeddings for +computing cosine similarity scores. In addition, we evaluated Whisper-PMFA, a +pre-trained ASR model adapted for speaker verification, and found that, +although it outperforms randomly initialized ResNets, it falls short of the +performance of pre-trained ResNets, highlighting the importance of large-scale +pre-training. The results also demonstrate that achieving competitive +performance on TdSV without joint modeling of speaker and text is possible. Our +best system achieved a MinDCF of 0.0358 on the evaluation subset and won the +challenge. + +
+
+ comment: 5 pages, no figures +
+
+
+
+
+ + ☆ Information Anxiety in Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated strong performance as +knowledge repositories, enabling models to understand user queries and generate +accurate and context-aware responses. Extensive evaluation setups have +corroborated the positive correlation between the retrieval capability of LLMs +and the frequency of entities in their pretraining corpus. We take the +investigation further by conducting a comprehensive analysis of the internal +reasoning and retrieval mechanisms of LLMs. Our work focuses on three critical +dimensions - the impact of entity popularity, the models' sensitivity to +lexical variations in query formulation, and the progression of hidden state +representations across LLM layers. Our preliminary findings reveal that popular +questions facilitate early convergence of internal states toward the correct +answer. However, as the popularity of a query increases, retrieved attributes +across lexical variations become increasingly dissimilar and less accurate. +Interestingly, we find that LLMs struggle to disentangle facts, grounded in +distinct relations, from their parametric memory when dealing with highly +popular subjects. Through a case study, we explore these latent strains within +LLMs when processing highly popular queries, a phenomenon we term information +anxiety. The emergence of information anxiety in LLMs underscores the +adversarial injection in the form of linguistic variations and calls for a more +holistic evaluation of frequently occurring entities. + +
+
+
+
+
+ + ☆ Can Generic LLMs Help Analyze Child-adult Interactions Involving + Children with Autism in Clinical Observation? NeurIPS 2024 + + +
+ Large Language Models (LLMs) have shown significant potential in +understanding human communication and interaction. However, their performance +in the domain of child-inclusive interactions, including in clinical settings, +remains less explored. In this work, we evaluate generic LLMs' ability to +analyze child-adult dyadic interactions in a clinically relevant context +involving children with ASD. Specifically, we explore LLMs in performing four +tasks: classifying child-adult utterances, predicting engaged activities, +recognizing language skills and understanding traits that are clinically +relevant. Our evaluation shows that generic LLMs are highly capable of +analyzing long and complex conversations in clinical observation sessions, +often surpassing the performance of non-expert human evaluators. The results +show their potential to segment interactions of interest, assist in language +skills evaluation, identify engaged activities, and offer clinical-relevant +context for assessments. + +
+
+ comment: GenAI for Health Workshop, NeurIPS 2024 +
+
+
+
+
+ + ☆ Chain-of-Programming (CoP) : Empowering Large Language Models for + Geospatial Code Generation + + +
+ With the rapid growth of interdisciplinary demands for geospatial modeling +and the rise of large language models (LLMs), geospatial code generation +technology has seen significant advancements. However, existing LLMs often face +challenges in the geospatial code generation process due to incomplete or +unclear user requirements and insufficient knowledge of specific platform +syntax rules, leading to the generation of non-executable code, a phenomenon +known as "code hallucination." To address this issue, this paper proposes a +Chain of Programming (CoP) framework, which decomposes the code generation +process into five steps: requirement analysis, algorithm design, code +implementation, code debugging, and code annotation. The framework incorporates +a shared information pool, knowledge base retrieval, and user feedback +mechanisms, forming an end-to-end code generation flow from requirements to +code without the need for model fine-tuning. Based on a geospatial problem +classification framework and evaluation benchmarks, the CoP strategy +significantly improves the logical clarity, syntactical correctness, and +executability of the generated code, with improvements ranging from 3.0% to +48.8%. Comparative and ablation experiments further validate the superiority of +the CoP strategy over other optimization approaches and confirm the rationality +and necessity of its key components. Through case studies on building data +visualization and fire data analysis, this paper demonstrates the application +and effectiveness of CoP in various geospatial scenarios. The CoP framework +offers a systematic, step-by-step approach to LLM-based geospatial code +generation tasks, significantly enhancing code generation performance in +geospatial tasks and providing valuable insights for code generation in other +vertical domains. + +
+
+
+
+
+ + ☆ Comparison of Multilingual and Bilingual Models for Satirical News + Detection of Arabic and English ALT + + +
+ Satirical news is real news combined with a humorous comment or exaggerated +content, and it often mimics the format and style of real news. However, +satirical news is often misunderstood as misinformation, especially by +individuals from different cultural and social backgrounds. This research +addresses the challenge of distinguishing satire from truthful news by +leveraging multilingual satire detection methods in English and Arabic. We +explore both zero-shot and chain-of-thought (CoT) prompting using two language +models, Jais-chat(13B) and LLaMA-2-chat(7B). Our results show that CoT +prompting offers a significant advantage for the Jais-chat model over the +LLaMA-2-chat model. Specifically, Jais-chat achieved the best performance, with +an F1-score of 80\% in English when using CoT prompting. These results +highlight the importance of structured reasoning in CoT, which enhances +contextual understanding and is vital for complex tasks like satire detection. + +
+
+ comment: ALTA 2024 (Selected for publication) +
+
+
+
+
+ + ☆ HJ-Ky-0.1: an Evaluation Dataset for Kyrgyz Word Embeddings + + +
+ One of the key tasks in modern applied computational linguistics is +constructing word vector representations (word embeddings), which are widely +used to address natural language processing tasks such as sentiment analysis, +information extraction, and more. To choose an appropriate method for +generating these word embeddings, quality assessment techniques are often +necessary. A standard approach involves calculating distances between vectors +for words with expert-assessed 'similarity'. This work introduces the first +'silver standard' dataset for such tasks in the Kyrgyz language, alongside +training corresponding models and validating the dataset's suitability through +quality evaluation metrics. + +
+
+
+
+
+ + ☆ A Regularized LSTM Method for Detecting Fake News Articles + + +
+ Nowadays, the rapid diffusion of fake news poses a significant problem, as it +can spread misinformation and confusion. This paper aims to develop an advanced +machine learning solution for detecting fake news articles. Leveraging a +comprehensive dataset of news articles, including 23,502 fake news articles and +21,417 accurate news articles, we implemented and evaluated three +machine-learning models. Our dataset, curated from diverse sources, provides +rich textual content categorized into title, text, subject, and Date features. +These features are essential for training robust classification models to +distinguish between fake and authentic news articles. The initial model +employed a Long Short-Term Memory (LSTM) network, achieving an accuracy of 94%. +The second model improved upon this by incorporating additional regularization +techniques and fine-tuning hyperparameters, resulting in a 97% accuracy. The +final model combined the strengths of previous architectures with advanced +optimization strategies, achieving a peak accuracy of 98%. These results +demonstrate the effectiveness of our approach in identifying fake news with +high precision. Implementing these models showcases significant advancements in +natural language processing and machine learning techniques, contributing +valuable tools for combating misinformation. Our work highlights the potential +for deploying such models in real-world applications, providing a reliable +method for automated fake news detection and enhancing the credibility of news +dissemination. + +
+
+ comment: 6 pages, 7 figures, 2024 IEEE International Conference on Signal + Processing, Information, Communication and Systems (SPICSCON) +
+
+
+
+
+ + ☆ Structured Dialogue System for Mental Health: An LLM Chatbot Leveraging + the PM+ Guidelines + + +
+ The Structured Dialogue System, referred to as SuDoSys, is an innovative +Large Language Model (LLM)-based chatbot designed to provide psychological +counseling. SuDoSys leverages the World Health Organization (WHO)'s Problem +Management Plus (PM+) guidelines to deliver stage-aware multi-turn dialogues. +Existing methods for employing an LLM in multi-turn psychological counseling +typically involve direct fine-tuning using generated dialogues, often +neglecting the dynamic stage shifts of counseling sessions. Unlike previous +approaches, SuDoSys considers the different stages of counseling and stores +essential information throughout the counseling process, ensuring coherent and +directed conversations. The system employs an LLM, a stage-aware instruction +generator, a response unpacker, a topic database, and a stage controller to +maintain dialogue flow. In addition, we propose a novel technique that +simulates counseling clients to interact with the evaluated system and evaluate +its performance automatically. When assessed using both objective and +subjective evaluations, SuDoSys demonstrates its effectiveness in generating +logically coherent responses. The system's code and program scripts for +evaluation are open-sourced. + +
+
+ comment: Accepted to the 16th International Conference on Social Robotic (ICSR + 2024) +
+
+
+
+
+ + ☆ IntentGPT: Few-shot Intent Discovery with Large Language Models ICLR 2024 + + +
+ In today's digitally driven world, dialogue systems play a pivotal role in +enhancing user interactions, from customer service to virtual assistants. In +these dialogues, it is important to identify user's goals automatically to +resolve their needs promptly. This has necessitated the integration of models +that perform Intent Detection. However, users' intents are diverse and dynamic, +making it challenging to maintain a fixed set of predefined intents. As a +result, a more practical approach is to develop a model capable of identifying +new intents as they emerge. We address the challenge of Intent Discovery, an +area that has drawn significant attention in recent research efforts. Existing +methods need to train on a substantial amount of data for correctly identifying +new intents, demanding significant human effort. To overcome this, we introduce +IntentGPT, a novel training-free method that effectively prompts Large Language +Models (LLMs) such as GPT-4 to discover new intents with minimal labeled data. +IntentGPT comprises an \textit{In-Context Prompt Generator}, which generates +informative prompts for In-Context Learning, an \textit{Intent Predictor} for +classifying and discovering user intents from utterances, and a +\textit{Semantic Few-Shot Sampler} that selects relevant few-shot examples and +a set of known intents to be injected into the prompt. Our experiments show +that IntentGPT outperforms previous methods that require extensive +domain-specific data and fine-tuning, in popular benchmarks, including CLINC +and BANKING, among others. + +
+
+ comment: ICLR 2024 Workshop on LLM Agents +
+
+
+
+
+ + ☆ SAM Decoding: Speculative Decoding via Suffix Automaton + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +by unifying tasks into text generation, yet their large parameter sizes and +autoregressive nature limit inference speed. SAM-Decoding addresses this by +introducing a novel retrieval-based speculative decoding method that uses a +suffix automaton for efficient and accurate draft generation. Unlike n-gram +matching used by the existing method, SAM-Decoding finds the longest suffix +match in generating text and text corpora, achieving an average time complexity +of $O(1)$ per generation step. SAM-Decoding constructs static and dynamic +suffix automatons for the text corpus and input prompts, respectively, enabling +fast and precise draft generation. Meanwhile, it is designed as an approach +that can be combined with existing methods, allowing SAM-Decoding to adaptively +select a draft generation strategy based on the matching length, thus +increasing the inference speed of the LLM. When combined with Token Recycling, +evaluations show SAM-Decoding outperforms existing model-free methods, +achieving a speedup of $2.27\times$ over autoregressive decoding on Spec-Bench. +When combined with EAGLE2, it reaches a speedup of $2.49\times$, surpassing all +current approaches. Our code is available at +https://github.com/hyx1999/SAM-Decoding. + +
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ BlueLM-V-3B: Algorithm and System Co-Design for Multimodal Large + Language Models on Mobile Devices + + +
+ The emergence and growing popularity of multimodal large language models +(MLLMs) have significant potential to enhance various aspects of daily life, +from improving communication to facilitating learning and problem-solving. +Mobile phones, as essential daily companions, represent the most effective and +accessible deployment platform for MLLMs, enabling seamless integration into +everyday tasks. However, deploying MLLMs on mobile phones presents challenges +due to limitations in memory size and computational capability, making it +difficult to achieve smooth and real-time processing without extensive +optimization. In this paper, we present BlueLM-V-3B, an algorithm and system +co-design approach specifically tailored for the efficient deployment of MLLMs +on mobile platforms. To be specific, we redesign the dynamic resolution scheme +adopted by mainstream MLLMs and implement system optimization for +hardware-aware deployment to optimize model inference on mobile phones. +BlueLM-V-3B boasts the following key highlights: (1) Small Size: BlueLM-V-3B +features a language model with 2.7B parameters and a vision encoder with 400M +parameters. (2) Fast Speed: BlueLM-V-3B achieves a generation speed of 24.4 +token/s on the MediaTek Dimensity 9300 processor with 4-bit LLM weight +quantization. (3) Strong Performance: BlueLM-V-3B has attained the highest +average score of 66.1 on the OpenCompass benchmark among models with $\leq$ 4B +parameters and surpassed a series of models with much larger parameter sizes +(e.g., MiniCPM-V-2.6, InternVL2-8B). + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ MTA: Multimodal Task Alignment for BEV Perception and Captioning + + +
+ Bird's eye view (BEV)-based 3D perception plays a crucial role in autonomous +driving applications. The rise of large language models has spurred interest in +BEV-based captioning to understand object behavior in the surrounding +environment. However, existing approaches treat perception and captioning as +separate tasks, focusing on the performance of only one of the tasks and +overlooking the potential benefits of multimodal alignment. To bridge this gap +between modalities, we introduce MTA, a novel multimodal task alignment +framework that boosts both BEV perception and captioning. MTA consists of two +key components: (1) BEV-Language Alignment (BLA), a contextual learning +mechanism that aligns the BEV scene representations with ground-truth language +representations, and (2) Detection-Captioning Alignment (DCA), a cross-modal +prompting mechanism that aligns detection and captioning outputs. MTA +integrates into state-of-the-art baselines during training, adding no extra +computational complexity at runtime. Extensive experiments on the nuScenes and +TOD3Cap datasets show that MTA significantly outperforms state-of-the-art +baselines, achieving a 4.9% improvement in perception and a 9.2% improvement in +captioning. These results underscore the effectiveness of unified alignment in +reconciling BEV-based perception and captioning. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Gender Bias Mitigation for Bangla Classification Tasks + + +
+ In this study, we investigate gender bias in Bangla pretrained language +models, a largely underexplored area in low-resource languages. To assess this +bias, we applied gender-name swapping techniques to existing datasets, creating +four manually annotated, task-specific datasets for sentiment analysis, +toxicity detection, hate speech detection, and sarcasm detection. By altering +names and gender-specific terms, we ensured these datasets were suitable for +detecting and mitigating gender bias. We then proposed a joint loss +optimization technique to mitigate gender bias across task-specific pretrained +models. Our approach was evaluated against existing bias mitigation methods, +with results showing that our technique not only effectively reduces bias but +also maintains competitive accuracy compared to other baseline approaches. To +promote further research, we have made both our implementation and datasets +publicly available at +https://github.com/sajib-kumar/Gender-Bias-Mitigation-From-Bangla-PLM + +
+
+
+
+
+ + ♻ ☆ PipeInfer: Accelerating LLM Inference using Asynchronous Pipelined + Speculation + + +
+ Inference of Large Language Models (LLMs) across computer clusters has become +a focal point of research in recent times, with many acceleration techniques +taking inspiration from CPU speculative execution. These techniques reduce +bottlenecks associated with memory bandwidth, but also increase end-to-end +latency per inference run, requiring high speculation acceptance rates to +improve performance. Combined with a variable rate of acceptance across tasks, +speculative inference techniques can result in reduced performance. +Additionally, pipeline-parallel designs require many user requests to maintain +maximum utilization. As a remedy, we propose PipeInfer, a pipelined speculative +acceleration technique to reduce inter-token latency and improve system +utilization for single-request scenarios while also improving tolerance to low +speculation acceptance rates and low-bandwidth interconnects. PipeInfer +exhibits up to a 2.15$\times$ improvement in generation speed over standard +speculative inference. PipeInfer achieves its improvement through Continuous +Asynchronous Speculation and Early Inference Cancellation, the former improving +latency and generation speed by running single-token inference simultaneously +with several speculative runs, while the latter improves speed and latency by +skipping the computation of invalidated runs, even in the middle of inference. + +
+
+ comment: 11 pages, submitted to SC24 conference +
+
+
+
+
+ + ♻ ☆ Self-Attention Limits Working Memory Capacity of Transformer-Based + Models + + +
+ Recent work on Transformer-based large language models (LLMs) has revealed +striking limits in their working memory capacity, similar to what has been +found in human behavioral studies. Specifically, these models' performance +drops significantly on N-back tasks as N increases. However, there is still a +lack of mechanistic interpretability as to why this phenomenon would arise. +Inspired by the executive attention theory from behavioral sciences, we +hypothesize that the self-attention mechanism within Transformer-based models +might be responsible for their working memory capacity limits. To test this +hypothesis, we train vanilla decoder-only transformers to perform N-back tasks +and find that attention scores gradually aggregate to the N-back positions over +training, suggesting that the model masters the task by learning a strategy to +pay attention to the relationship between the current position and the N-back +position. Critically, we find that the total entropy of the attention score +matrix increases as N increases, suggesting that the dispersion of attention +scores might be the cause of the capacity limit observed in N-back tasks. Our +findings thus offer insights into the shared role of attention in both human +and artificial intelligence. Moreover, the limitations of the self-attention +mechanism revealed in the current study could inform future efforts to design +more powerful model architectures with enhanced working memory capacity and +cognitive capabilities. + +
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Efficient Encoder-Decoder Transformer Decoding for Decomposable Tasks + + +
+ Transformer-based NLP models are powerful but have high computational costs +that limit deployment. Finetuned encoder-decoder models are popular in +specialized domains and can outperform larger more generalized decoder-only +models, such as GPT-4. We introduce a new configuration for encoder-decoder +models that improves efficiency on structured output and decomposable tasks +where multiple outputs are required for a single shared input. Our method, +prompt-in-decoder (PiD), encodes the input once and decodes the output in +parallel, boosting both training and inference efficiency by avoiding duplicate +input encoding and increasing the operational intensity (ratio of numbers of +arithmetic operation to memory access) of decoding process by sharing the input +key-value cache. We achieve computation reduction that roughly scales with the +number of subtasks, gaining up to 4.6x speed-up over state-of-the-art models +for dialogue state tracking, summarization, and question-answering tasks, with +comparable or better performance. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ How (un)ethical are instruction-centric responses of LLMs? Unveiling the + vulnerabilities of safety guardrails to harmful queries AAAI + + +
+ In this study, we tackle a growing concern around the safety and ethical use +of large language models (LLMs). Despite their potential, these models can be +tricked into producing harmful or unethical content through various +sophisticated methods, including 'jailbreaking' techniques and targeted +manipulation. Our work zeroes in on a specific issue: to what extent LLMs can +be led astray by asking them to generate responses that are instruction-centric +such as a pseudocode, a program or a software snippet as opposed to vanilla +text. To investigate this question, we introduce TechHazardQA, a dataset +containing complex queries which should be answered in both text and +instruction-centric formats (e.g., pseudocodes), aimed at identifying triggers +for unethical responses. We query a series of LLMs -- Llama-2-13b, Llama-2-7b, +Mistral-V2 and Mistral 8X7B -- and ask them to generate both text and +instruction-centric responses. For evaluation we report the harmfulness score +metric as well as judgements from GPT-4 and humans. Overall, we observe that +asking LLMs to produce instruction-centric responses enhances the unethical +response generation by ~2-38% across the models. As an additional objective, we +investigate the impact of model editing using the ROME technique, which further +increases the propensity for generating undesirable content. In particular, +asking edited LLMs to generate instruction-centric responses further increases +the unethical response generation by ~3-16% across the different models. + +
+
+ comment: Accepted at AAAI Conference on Web and Social Media (ICWSM) 2025. + [Dataset](https://huggingface.co/datasets/SoftMINER-Group/TechHazardQA) +
+
+
+
+
+ + ♻ ☆ ProverbEval: Exploring LLM Evaluation Challenges for Low-resource + Language Understanding + + +
+ With the rapid development of evaluation datasets to assess LLMs' +understanding across a wide range of subjects and domains, identifying a +suitable language understanding benchmark has become increasingly challenging. +In this work, we explore LLM evaluation challenges for low-resource language +understanding and introduce ProverbEval, an LLM evaluation benchmark for +low-resource languages based on proverbs to focus on low-resource language +understanding in culture-specific scenarios. We benchmark various LLMs and +explore factors that create variability in the benchmarking process. We +observed performance variances of up to 50%, depending on the order in which +answer choices were presented in multiple-choice tasks. Native language proverb +descriptions significantly improve tasks such as proverb generation, +contributing to improved outcomes. Additionally, monolingual evaluations +consistently outperformed their cross-lingual counterparts. We argue special +attention must be given to the order of choices, choice of prompt language, +task variability, and generation tasks when creating LLM evaluation benchmarks. + +
+
+
+
+
+ + ♻ ☆ Investigating Annotator Bias in Large Language Models for Hate Speech + Detection NeurIPS + + +
+ Data annotation, the practice of assigning descriptive labels to raw data, is +pivotal in optimizing the performance of machine learning models. However, it +is a resource-intensive process susceptible to biases introduced by annotators. +The emergence of sophisticated Large Language Models (LLMs) presents a unique +opportunity to modernize and streamline this complex procedure. While existing +research extensively evaluates the efficacy of LLMs, as annotators, this paper +delves into the biases present in LLMs when annotating hate speech data. Our +research contributes to understanding biases in four key categories: gender, +race, religion, and disability with four LLMs: GPT-3.5, GPT-4o, Llama-3.1 and +Gemma-2. Specifically targeting highly vulnerable groups within these +categories, we analyze annotator biases. Furthermore, we conduct a +comprehensive examination of potential factors contributing to these biases by +scrutinizing the annotated data. We introduce our custom hate speech detection +dataset, HateBiasNet, to conduct this research. Additionally, we perform the +same experiments on the ETHOS (Mollas et al. 2022) dataset also for comparative +analysis. This paper serves as a crucial resource, guiding researchers and +practitioners in harnessing the potential of LLMs for data annotation, thereby +fostering advancements in this critical field. + +
+
+ comment: Accepted at NeurIPS Safe Generative AI Workshop, 2024 +
+
+
+
+
+ + ♻ ☆ MiCEval: Unveiling Multimodal Chain of Thought's Quality via Image + Description and Reasoning Steps + + +
+ Multimodal Chain of Thought (MCoT) is a popular prompting strategy for +improving the performance of multimodal large language models (MLLMs) across a +range of complex reasoning tasks. Despite its popularity, there is a notable +absence of automated methods for evaluating the quality of reasoning steps in +MCoT. To address this gap, we propose Multimodal Chain-of-Thought Evaluation +(MiCEval), a framework designed to assess the correctness of reasoning chains +by evaluating the quality of both the description and each reasoning step. The +evaluation of the description component focuses on the accuracy of the image +descriptions, while the reasoning step evaluates the quality of each step as it +is conditionally generated based on the preceding steps. MiCEval is built upon +a fine-grained dataset with annotations that rate each step according to +correctness, relevance, and informativeness. Extensive experiments on four +state-of-the-art MLLMs show that step-wise evaluations using MiCEval align more +closely with human judgments compared to existing methods based on cosine +similarity or fine-tuning approaches. MiCEval datasets and code can be found in +https://github.com/alenai97/MiCEval. + +
+
+ comment: 41 pages +
+
+
+
+
+ + ♻ ☆ Influence of Solution Efficiency and Valence of Instruction on Additive + and Subtractive Solution Strategies in Humans and GPT-4 + + +
+ Generative artificial intelligences, particularly large language models +(LLMs), play an increasingly prominent role in human decision-making contexts, +necessitating transparency about their capabilities. While prior studies have +shown addition biases in humans (Adams et al., 2021) and OpenAI's GPT-3 (Winter +et al., 2023), this study extends the research by comparing human and GPT-4 +problem-solving across both spatial and linguistic tasks, with variations in +solution efficiency and valence of task instruction. Four preregistered +experiments with 588 participants from the U.S. and 680 GPT-4 iterations +revealed a stronger tendency towards additive transformations in GPT-4 than in +humans. Human participants were less likely to use additive strategies when +subtraction was relatively more efficient than when addition and subtraction +were equally efficient. GPT-4 exhibited the opposite behavior, with a strong +addition bias when subtraction was more efficient. In terms of valence of task +instruction, GPT-4's use of additive strategies increased when instructed to +"improve" (positive) rather than "edit" (neutral). These findings demonstrate +that biases in human problem-solving are amplified in GPT-4, and that LLM +behavior differs from human efficiency-based strategies. This highlights the +limitations of LLMs and the need for caution when using them in real-world +applications. + +
+
+ comment: 29 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ PreAlign: Boosting Cross-Lingual Transfer by Early Establishment of + Multilingual Alignment + + +
+ Large language models demonstrate reasonable multilingual abilities, despite +predominantly English-centric pretraining. However, the spontaneous +multilingual alignment in these models is shown to be weak, leading to +unsatisfactory cross-lingual transfer and knowledge sharing. Previous works +attempt to address this issue by explicitly injecting multilingual alignment +information during or after pretraining. Thus for the early stage in +pretraining, the alignment is weak for sharing information or knowledge across +languages. In this paper, we propose PreAlign, a framework that establishes +multilingual alignment prior to language model pretraining. PreAlign injects +multilingual alignment by initializing the model to generate similar +representations of aligned words and preserves this alignment using a +code-switching strategy during pretraining. Extensive experiments in a +synthetic English to English-Clone setting demonstrate that PreAlign +significantly outperforms standard multilingual joint training in language +modeling, zero-shot cross-lingual transfer, and cross-lingual knowledge +application. Further experiments in real-world scenarios further validate +PreAlign's effectiveness across various model sizes. + +
+
+
+
+
+ + ♻ ☆ Towards Operationalizing Right to Data Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Transformers Learn to Achieve Second-Order Convergence Rates for + In-Context Linear Regression NeurIPS 2024 + + +
+ Transformers excel at in-context learning (ICL) -- learning from +demonstrations without parameter updates -- but how they do so remains a +mystery. Recent work suggests that Transformers may internally run Gradient +Descent (GD), a first-order optimization method, to perform ICL. In this paper, +we instead demonstrate that Transformers learn to approximate second-order +optimization methods for ICL. For in-context linear regression, Transformers +share a similar convergence rate as Iterative Newton's Method, both +exponentially faster than GD. Empirically, predictions from successive +Transformer layers closely match different iterations of Newton's Method +linearly, with each middle layer roughly computing 3 iterations; thus, +Transformers and Newton's method converge at roughly the same rate. In +contrast, Gradient Descent converges exponentially more slowly. We also show +that Transformers can learn in-context on ill-conditioned data, a setting where +Gradient Descent struggles but Iterative Newton succeeds. Finally, to +corroborate our empirical findings, we prove that Transformers can implement +$k$ iterations of Newton's method with $k + \mathcal{O}(1)$ layers. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Towards Robust Text Classification: Mitigating Spurious Correlations + with Causal Learning + + +
+ In text classification tasks, models often rely on spurious correlations for +predictions, incorrectly associating irrelevant features with the target +labels. This issue limits the robustness and generalization of models, +especially when faced with out-of-distribution data where such spurious +correlations no longer hold. To address this challenge, we propose the Causally +Calibrated Robust Classifier (CCR), which aims to reduce models' reliance on +spurious correlations and improve model robustness. Our approach integrates a +causal feature selection method based on counterfactual reasoning, along with +an unbiased inverse propensity weighting (IPW) loss function. By focusing on +selecting causal features, we ensure that the model relies less on spurious +features during prediction. We theoretically justify our approach and +empirically show that CCR achieves state-of-the-art performance among methods +without group labels, and in some cases, it can compete with the models that +utilize group labels. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are Null-Shot Learners EMNLP 2024 + + +
+ This paper presents null-shot prompting. Null-shot prompting exploits +hallucination in large language models (LLMs) by instructing LLMs to utilize +information from the "Examples" section that never exists within the provided +context to perform a task. While reducing hallucination is crucial and +non-negligible for daily and critical uses of LLMs, we propose that in the +current landscape in which these LLMs still hallucinate, it is possible, in +fact, to exploit hallucination to increase performance in performing tasks +compared to standard zero-shot prompting. Experiments with eight LLMs show +improvements in performance across the majority of eight datasets, including +reading comprehension, arithmetic reasoning, and closed-book question +answering. The observed inconsistency in increased relative performance across +the LLMs also potentially indicates a different degree of inherent +hallucination in each model. These differences show that it is possible to +utilize null-shot prompting as a way to detect degrees of hallucination in LLMs +using existing benchmarking datasets. We also perform ablation studies, +including experimenting with a modified version of null-shot prompting that +incorporates ideas from zero-shot chain-of-thought prompting, which shows +different trends of results. + +
+
+ comment: 28 pages; v2: added Gemini Pro results, error analysis, and a + discussion on confabulation; v3: see its extended version, an EMNLP 2024 + paper, at https://aclanthology.org/2024.emnlp-main.740/ +
+
+
+
+
+ + ♻ ☆ KyrgyzNLP: Challenges, Progress, and Future + + +
+ Large language models (LLMs) have excelled in numerous benchmarks, advancing +AI applications in both linguistic and non-linguistic tasks. However, this has +primarily benefited well-resourced languages, leaving less-resourced ones +(LRLs) at a disadvantage. In this paper, we highlight the current state of the +NLP field in the specific LRL: kyrgyz tili. + Human evaluation, including annotated datasets created by native speakers, +remains an irreplaceable component of reliable NLP performance, especially for +LRLs where automatic evaluations can fall short. In recent assessments of the +resources for Turkic languages, Kyrgyz is labeled with the status 'Scraping +By', a severely under-resourced language spoken by millions. This is concerning +given the growing importance of the language, not only in Kyrgyzstan but also +among diaspora communities where it holds no official status. + We review prior efforts in the field, noting that many of the publicly +available resources have only recently been developed, with few exceptions +beyond dictionaries (the processed data used for the analysis is presented at +https://kyrgyznlp.github.io/). While recent papers have made some headway, much +more remains to be done. Despite interest and support from both business and +government sectors in the Kyrgyz Republic, the situation for Kyrgyz language +resources remains challenging. We stress the importance of community-driven +efforts to build these resources, ensuring the future advancement +sustainability. We then share our view of the most pressing challenges in +Kyrgyz NLP. Finally, we propose a roadmap for future development in terms of +research topics and language resources. + +
+
+ comment: Keynote talk at the 12th International Conference on Analysis of + Images, Social Networks and Texts (AIST-2024) +
+
+
+
+
+ + ♻ ☆ MATES: Model-Aware Data Selection for Efficient Pretraining with Data + Influence Models NeurIPS 2024 + + +
+ Pretraining data selection has the potential to improve language model +pretraining efficiency by utilizing higher-quality data from massive web data +corpora. Current data selection methods, which rely on either hand-crafted +rules or larger reference models, are conducted statically and do not capture +the evolving data preferences during pretraining. In this paper, we introduce +model-aware data selection with data influence models (MATES), where a data +influence model continuously adapts to the evolving data preferences of the +pretraining model and then selects the data most effective for the current +pretraining progress. Specifically, we collect oracle data influence by locally +probing the pretraining model and fine-tune a small data influence model to +approximate it accurately. The data influence model then predicts data +influence over the whole pretraining corpus and selects the most influential +data for the next pretraining stage. Experiments of pretraining 410M and 1B +models on the C4 dataset demonstrate that MATES significantly outperforms +random data selection on extensive downstream tasks. It doubles the gains +achieved by the state-of-the-art data selection approach that leverages larger +reference models and reduces the total FLOPs required to reach certain +performances by half. Further analyses validate the effectiveness of the +locally probed oracle data influence and the approximation with data influence +models. Our code is open-sourced at https://github.com/cxcscmu/MATES. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ IQA-EVAL: Automatic Evaluation of Human-Model Interactive Question + Answering NeurIPS 2024 + + +
+ To evaluate Large Language Models (LLMs) for question answering (QA), +traditional methods typically focus on assessing single-turn responses to given +questions. However, this approach doesn't capture the dynamic nature of +human-AI interactions, where humans actively seek information through +conversation. Recent works in human-computer interaction (HCI) have employed +human evaluators to conduct interactions and evaluations, but they are often +prohibitively expensive and time-consuming to scale. We introduce an automatic +evaluation framework IQA-EVAL to achieve Interactive Question Answering +Evaluations. More specifically, we introduce an LLM-based Evaluation Agent (LEA) +that can: (1) simulate human behaviors to generate interactions with IQA +models; (2) automatically evaluate the generated interactions. Moreover, we +propose assigning personas to LEAs to better simulate groups of real human +evaluators. We show that: (1) our evaluation framework with GPT-4 (or Claude) +as the backbone model achieves a high correlation with human evaluations on the +IQA task; (2) assigning personas to LEA to better represent the crowd further +significantly improves correlations. Finally, we use our automatic metric to +evaluate five recent representative LLMs with over 1000 questions from complex +and ambiguous question answering tasks, which comes with a substantial cost of +$5k if evaluated by humans. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Stereotype Detection in LLMs: A Multiclass, Explainable, and + Benchmark-Driven Approach + + +
+ Stereotype detection is a challenging and subjective task, as certain +statements, such as "Black people like to play basketball," may not appear +overtly toxic but still reinforce racial stereotypes. With the increasing +prevalence of large language models (LLMs) in human-facing artificial +intelligence (AI) applications, detecting these types of biases is essential. +However, LLMs risk perpetuating and amplifying stereotypical outputs derived +from their training data. A reliable stereotype detector is crucial for +benchmarking bias, monitoring model input and output, filtering training data, +and ensuring fairer model behavior in downstream applications. This paper +introduces the Multi-Grain Stereotype (MGS) dataset, consisting of 51,867 +instances across gender, race, profession, religion, and other stereotypes, +curated from multiple existing datasets. We evaluate various machine learning +approaches to establish baselines and fine-tune language models of different +architectures and sizes, presenting a suite of stereotype multiclass +classifiers trained on the MGS dataset. Given the subjectivity of stereotypes, +explainability is essential to align model learning with human understanding of +stereotypes. We employ explainable AI (XAI) tools, including SHAP, LIME, and +BertViz, to assess whether the model's learned patterns align with human +intuitions about stereotypes. Additionally, we develop stereotype elicitation +prompts and benchmark the presence of stereotypes in text generation tasks +using popular LLMs, employing the best-performing stereotype classifiers. + +
+
+ comment: Under review as a conference paper at ARR October 2024 +
+
+
+
+
+ + ♻ ☆ SAGED: A Holistic Bias-Benchmarking Pipeline for Language Models with + Customisable Fairness Calibration COLING 2025 + + +
+ The development of unbiased large language models is widely recognized as +crucial, yet existing benchmarks fall short in detecting biases due to limited +scope, contamination, and lack of a fairness baseline. SAGED(-Bias) is the +first holistic benchmarking pipeline to address these problems. The pipeline +encompasses five core stages: scraping materials, assembling benchmarks, +generating responses, extracting numeric features, and diagnosing with +disparity metrics. SAGED includes metrics for max disparity, such as impact +ratio, and bias concentration, such as Max Z-scores. Noticing that assessment +tool bias and contextual bias in prompts can distort evaluation, SAGED +implements counterfactual branching and baseline calibration for mitigation. +For demonstration, we use SAGED on G20 Countries with popular 8b-level models +including Gemma2, Llama3.1, Mistral, and Qwen2. With sentiment analysis, we +find that while Mistral and Qwen2 show lower max disparity and higher bias +concentration than Gemma2 and Llama3.1, all models are notably biased against +countries like Russia and (except for Qwen2) China. With further experiments to +have models role-playing U.S. (vice-/former-) presidents, we see bias amplifies +and shifts in heterogeneous directions. Moreover, we see Qwen2 and Mistral not +engage in role-playing, while Llama3.1 and Gemma2 role-play Trump notably more +intensively than Biden and Harris, indicating role-playing performance bias in +these models. + +
+
+ comment: Submitted to COLING 2025 Main Conference +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 37 + +
+
+
+ + ☆ VeriGraph: Scene Graphs for Execution Verifiable Robot Planning + + +
+ Recent advancements in vision-language models (VLMs) offer potential for +robot task planning, but challenges remain due to VLMs' tendency to generate +incorrect action sequences. To address these limitations, we propose VeriGraph, +a novel framework that integrates VLMs for robotic planning while verifying +action feasibility. VeriGraph employs scene graphs as an intermediate +representation, capturing key objects and spatial relationships to improve plan +verification and refinement. The system generates a scene graph from input +images and uses it to iteratively check and correct action sequences generated +by an LLM-based task planner, ensuring constraints are respected and actions +are executable. Our approach significantly enhances task completion rates +across diverse manipulation scenarios, outperforming baseline methods by 58% +for language-based tasks and 30% for image-based tasks. + +
+
+
+
+
+ + ☆ BMP: Bridging the Gap between B-Spline and Movement Primitives + + +
+ This work introduces B-spline Movement Primitives (BMPs), a new Movement +Primitive (MP) variant that leverages B-splines for motion representation. +B-splines are a well-known concept in motion planning due to their ability to +generate complex, smooth trajectories with only a few control points while +satisfying boundary conditions, i.e., passing through a specified desired +position with desired velocity. However, current usages of B-splines tend to +ignore the higher-order statistics in trajectory distributions, which limits +their usage in imitation learning (IL) and reinforcement learning (RL), where +modeling trajectory distribution is essential. In contrast, MPs are commonly +used in IL and RL for their capacity to capture trajectory likelihoods and +correlations. However, MPs are constrained by their abilities to satisfy +boundary conditions and usually need extra terms in learning objectives to +satisfy velocity constraints. By reformulating B-splines as MPs, represented +through basis functions and weight parameters, BMPs combine the strengths of +both approaches, allowing B-splines to capture higher-order statistics while +retaining their ability to satisfy boundary conditions. Empirical results in IL +and RL demonstrate that BMPs broaden the applicability of B-splines in robot +learning and offer greater expressiveness compared to existing MP variants. + +
+
+
+
+
+ + ☆ M3TR: Generalist HD Map Construction with Variable Map Priors + + +
+ Autonomous vehicles require road information for their operation, usually in +form of HD maps. Since offline maps eventually become outdated or may only be +partially available, online HD map construction methods have been proposed to +infer map information from live sensor data. A key issue remains how to exploit +such partial or outdated map information as a prior. We introduce M3TR +(Multi-Masking Map Transformer), a generalist approach for HD map construction +both with and without map priors. We address shortcomings in ground truth +generation for Argoverse 2 and nuScenes and propose the first realistic +scenarios with semantically diverse map priors. Examining various query +designs, we use an improved method for integrating prior map elements into a HD +map construction model, increasing performance by +4.3 mAP. Finally, we show +that training across all prior scenarios yields a single Generalist model, +whose performance is on par with previous Expert models that can handle only +one specific type of map prior. M3TR thus is the first model capable of +leveraging variable map priors, making it suitable for real-world deployment. +Code is available at https://github.com/immel-f/m3tr + +
+
+
+
+
+ + ☆ Moving Forward: A Review of Autonomous Driving Software and Hardware + Systems + + +
+ With their potential to significantly reduce traffic accidents, enhance road +safety, optimize traffic flow, and decrease congestion, autonomous driving +systems are a major focus of research and development in recent years. Beyond +these immediate benefits, they offer long-term advantages in promoting +sustainable transportation by reducing emissions and fuel consumption. +Achieving a high level of autonomy across diverse conditions requires a +comprehensive understanding of the environment. This is accomplished by +processing data from sensors such as cameras, radars, and LiDARs through a +software stack that relies heavily on machine learning algorithms. These ML +models demand significant computational resources and involve large-scale data +movement, presenting challenges for hardware to execute them efficiently and at +high speed. In this survey, we first outline and highlight the key components +of self-driving systems, covering input sensors, commonly used datasets, +simulation platforms, and the software architecture. We then explore the +underlying hardware platforms that support the execution of these software +systems. By presenting a comprehensive view of autonomous driving systems and +their increasing demands, particularly for higher levels of autonomy, we +analyze the performance and efficiency of scaled-up off-the-shelf GPU/CPU-based +systems, emphasizing the challenges within the computational components. +Through examples showcasing the diverse computational and memory requirements +in the software stack, we demonstrate how more specialized hardware and +processing closer to memory can enable more efficient execution with lower +latency. Finally, based on current trends and future demands, we conclude by +speculating what a future hardware platform for autonomous driving might look +like. + +
+
+
+
+
+ + ☆ Learning Generalizable 3D Manipulation With 10 Demonstrations + + +
+ Learning robust and generalizable manipulation skills from demonstrations +remains a key challenge in robotics, with broad applications in industrial +automation and service robotics. While recent imitation learning methods have +achieved impressive results, they often require large amounts of demonstration +data and struggle to generalize across different spatial variants. In this +work, we present a novel framework that learns manipulation skills from as few +as 10 demonstrations, yet still generalizes to spatial variants such as +different initial object positions and camera viewpoints. Our framework +consists of two key modules: Semantic Guided Perception (SGP), which constructs +task-focused, spatially aware 3D point cloud representations from RGB-D inputs; +and Spatial Generalized Decision (SGD), an efficient diffusion-based +decision-making module that generates actions via denoising. To effectively +learn generalization ability from limited data, we introduce a critical +spatially equivariant training strategy that captures the spatial knowledge +embedded in expert demonstrations. We validate our framework through extensive +experiments on both simulation benchmarks and real-world robotic systems. Our +method demonstrates a 60 percent improvement in success rates over +state-of-the-art approaches on a series of challenging tasks, even with +substantial variations in object poses and camera viewpoints. This work shows +significant potential for advancing efficient, generalizable manipulation skill +learning in real-world applications. + +
+
+
+
+
+ + ☆ BEV-ODOM: Reducing Scale Drift in Monocular Visual Odometry with BEV + Representation + + +
+ Monocular visual odometry (MVO) is vital in autonomous navigation and +robotics, providing a cost-effective and flexible motion tracking solution, but +the inherent scale ambiguity in monocular setups often leads to cumulative +errors over time. In this paper, we present BEV-ODOM, a novel MVO framework +leveraging the Bird's Eye View (BEV) Representation to address scale drift. +Unlike existing approaches, BEV-ODOM integrates a depth-based perspective-view +(PV) to BEV encoder, a correlation feature extraction neck, and a CNN-MLP-based +decoder, enabling it to estimate motion across three degrees of freedom without +the need for depth supervision or complex optimization techniques. Our +framework reduces scale drift in long-term sequences and achieves accurate +motion estimation across various datasets, including NCLT, Oxford, and KITTI. +The results indicate that BEV-ODOM outperforms current MVO methods, +demonstrating reduced scale drift and higher accuracy. + +
+
+
+
+
+ + ☆ Let people fail! Exploring the influence of explainable virtual and + robotic agents in learning-by-doing tasks + + +
+ Collaborative decision-making with artificial intelligence (AI) agents +presents opportunities and challenges. While human-AI performance often +surpasses that of individuals, the impact of such technology on human behavior +remains insufficiently understood, primarily when AI agents can provide +justifiable explanations for their suggestions. This study compares the effects +of classic vs. partner-aware explanations on human behavior and performance +during a learning-by-doing task. Three participant groups were involved: one +interacting with a computer, another with a humanoid robot, and a third one +without assistance. Results indicated that partner-aware explanations +influenced participants differently based on the type of artificial agents +involved. With the computer, participants enhanced their task completion times. +At the same time, those interacting with the humanoid robot were more inclined +to follow its suggestions, although they did not reduce their timing. +Interestingly, participants autonomously performing the learning-by-doing task +demonstrated superior knowledge acquisition than those assisted by explainable +AI (XAI). These findings raise profound questions and have significant +implications for automated tutoring and human-AI collaboration. + +
+
+
+
+
+ + ☆ Imagine-2-Drive: High-Fidelity World Modeling in CARLA for Autonomous + Vehicles ICRA 2025 + + +
+ In autonomous driving with image based state space, accurate prediction of +future events and modeling diverse behavioral modes are essential for safety +and effective decision-making. World model-based Reinforcement Learning (WMRL) +approaches offer a promising solution by simulating future states from current +state and actions. However, the utility of world models is often limited by typical +RL policies being limited to deterministic or single Gaussian distributions. This +failure to capture the full spectrum of possible actions reduces their +adaptability in complex, dynamic environments. In this work, we introduce +Imagine-2-Drive, a framework that consists of two components, VISTAPlan, a +high-fidelity world model for accurate future prediction and Diffusion Policy +Actor (DPA), a diffusion based policy to model multi-modal behaviors for +trajectory prediction. We use VISTAPlan to simulate and evaluate trajectories +from DPA and use Denoising Diffusion Policy Optimization (DDPO) to train DPA to +maximize the cumulative sum of rewards over the trajectories. We analyze the +benefits of each component and the framework as a whole in CARLA with standard +driving metrics. As a consequence of our twin novelties, VISTAPlan and DPA, we +significantly outperform the state of the art (SOTA) world models on standard +driving metrics by 15% and 20% on Route Completion and Success Rate +respectively. + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Better Safe Than Sorry: Enhancing Arbitration Graphs for Safe and Robust + Autonomous Decision-Making ICRA + 2025 + + +
+ This paper introduces an extension to the arbitration graph framework +designed to enhance the safety and robustness of autonomous systems in complex, +dynamic environments. Building on the flexibility and scalability of +arbitration graphs, the proposed method incorporates a verification step and +structured fallback layers in the decision-making process. This ensures that +only verified and safe commands are executed while enabling graceful +degradation in the presence of unexpected faults or bugs. The approach is +demonstrated using a Pac-Man simulation and further validated in the context of +autonomous driving, where it shows significant reductions in accident risk and +improvements in overall system safety. The bottom-up design of arbitration +graphs allows for an incremental integration of new behavior components. The +extension presented in this work enables the integration of experimental or +immature behavior components while maintaining system safety by clearly and +precisely defining the conditions under which behaviors are considered safe. +The proposed method is implemented as a ready to use header-only C++ library, +published under the MIT License. Together with the Pac-Man demo, it is +available at github.com/KIT-MRT/arbitration_graphs. + +
+
+ comment: 7 pages, 5 figures, handed in for possible publication at IEEE ICRA + 2025, source code available at github.com/KIT-MRT/arbitration_graphs +
+
+
+
+
+ + ☆ Evaluating Text-to-Image Diffusion Models for Texturing Synthetic Data + + +
+ Building generic robotic manipulation systems often requires large amounts of +real-world data, which can be difficult to collect. Synthetic data generation +offers a promising alternative, but limiting the sim-to-real gap requires +significant engineering efforts. To reduce this engineering effort, we +investigate the use of pretrained text-to-image diffusion models for texturing +synthetic images and compare this approach with using random textures, a common +domain randomization technique in synthetic data generation. We focus on +generating object-centric representations, such as keypoints and segmentation +masks, which are important for robotic manipulation and require precise +annotations. We evaluate the efficacy of the texturing methods by training +models on the synthetic data and measuring their performance on real-world +datasets for three object categories: shoes, T-shirts, and mugs. Surprisingly, +we find that texturing using a diffusion model performs on par with random +textures, despite generating seemingly more realistic images. Our results +suggest that, for now, using diffusion models for texturing does not benefit +synthetic data generation for robotics. The code, data and trained models are +available at https://github.com/tlpss/diffusing-synthetic-data.git. + +
+
+ comment: Submitted to RA-L +
+
+
+
+
+ + ☆ Multi-UAV Search and Rescue in Wilderness Using Smart Agent-Based + Probability Models + + +
+ The application of Multiple Unmanned Aerial Vehicles (Multi-UAV) in +Wilderness Search and Rescue (WiSAR) significantly enhances mission success due +to their rapid coverage of search areas from high altitudes and their +adaptability to complex terrains. This capability is particularly crucial +because time is a critical factor in searching for a lost person in the +wilderness; as time passes, survival rates decrease and the search area +expands. The probability of success in such searches can be further improved if +UAVs leverage terrain features to predict the lost person's position. In this +paper, we aim to enhance search missions by proposing a smart agent-based +probability model that combines Monte Carlo simulations with an agent strategy +list, mimicking the behavior of a lost person in wilderness areas. +Furthermore, we develop a distributed Multi-UAV receding horizon search +strategy with dynamic partitioning, utilizing the generated probability density +model as prior information to prioritize locations where the lost person is +most likely to be found. Simulated search experiments across different terrains +have been conducted to validate the search efficiency of the proposed methods +compared to other benchmark methods. + +
+
+
+
+
+ + ☆ SPLIT: SE(3)-diffusion via Local Geometry-based Score Prediction for 3D + Scene-to-Pose-Set Matching Problems + + +
+ To enable versatile robot manipulation, robots must detect task-relevant +poses for different purposes from raw scenes. Currently, many perception +algorithms are designed for specific purposes, which limits the flexibility of +the perception module. We present a general problem formulation called 3D +scene-to-pose-set matching, which directly matches the corresponding poses from +the scene without relying on task-specific heuristics. To address this, we +introduce SPLIT, an SE(3)-diffusion model for generating pose samples from a +scene. The model's efficiency comes from predicting scores based on local +geometry with respect to the sample pose. Moreover, leveraging the conditioned +generation capability of diffusion models, we demonstrate that SPLIT can +generate the multi-purpose poses, required to complete both the mug +reorientation and hanging manipulation within a single model. + +
+
+
+
+
+ + ☆ Remote Life Support Robot Interface System for Global Task Planning and + Local Action Expansion Using Foundation Models + + +
+ Robot systems capable of executing tasks based on language instructions have +been actively researched. It is challenging to convey uncertain information +that can only be determined on-site with a single language instruction to the +robot. In this study, we propose a system that includes ambiguous parts as +template variables in language instructions to communicate the information to +be collected and the options to be presented to the robot for predictable +uncertain events. This study implements prompt generation for each robot action +function based on template variables to collect information, and a feedback +system for presenting and selecting options based on template variables for +user-to-robot communication. The effectiveness of the proposed system was +demonstrated through its application to real-life support tasks performed by +the robot. + +
+
+ comment: Accepted to 2024 IEEE-RAS International Conference on Humanoid + Robots (Humanoids 2024) +
+
+
+
+
+ + ☆ 'What did the Robot do in my Absence?' Video Foundation Models to + Enhance Intermittent Supervision + + +
+ This paper investigates the application of Video Foundation Models (ViFMs) +for generating robot data summaries to enhance intermittent human supervision +of robot teams. We propose a novel framework that produces both generic and +query-driven summaries of long-duration robot vision data in three modalities: +storyboards, short videos, and text. Through a user study involving 30 +participants, we evaluate the efficacy of these summary methods in allowing +operators to accurately retrieve the observations and actions that occurred +while the robot was operating without supervision over an extended duration (40 +min). Our findings reveal that query-driven summaries significantly improve +retrieval accuracy compared to generic summaries or raw data, albeit with +increased task duration. Storyboards are found to be the most effective +presentation modality, especially for object-related queries. This work +represents, to our knowledge, the first zero-shot application of ViFMs for +generating multi-modal robot-to-human communication in intermittent supervision +contexts, demonstrating both the promise and limitations of these models in +human-robot interaction (HRI) scenarios. + +
+
+ comment: This work has been submitted to the IEEE RAL for possible publication +
+
+
+
+
+ + ☆ Express Yourself: Enabling large-scale public events involving + multi-human-swarm interaction for social applications with MOSAIX + + +
+ Robot swarms have the potential to help groups of people with social tasks, +given their ability to scale to large numbers of robots and users. Developing +multi-human-swarm interaction is therefore crucial to support multiple people +interacting with the swarm simultaneously - which is an area that is scarcely +researched, unlike single-human, single-robot or single-human, multi-robot +interaction. Moreover, most robots are still confined to laboratory settings. +In this paper, we present our work with MOSAIX, a swarm of robot Tiles, that +facilitated ideation at a science museum. 63 robots were used as a swarm of +smart sticky notes, collecting input from the public and aggregating it based +on themes, providing an evolving visualization tool that engaged visitors and +fostered their participation. Our contribution lies in creating a large-scale +(63 robots and 294 attendees) public event, with a completely decentralized +swarm system in real-life settings. We also discuss learnings we obtained that +might help future researchers create multi-human-swarm interaction with the +public. + +
+
+
+
+
+ + ☆ Explanation for Trajectory Planning using Multi-modal Large Language + Model for Autonomous Driving ECCV 2024 + + +
+ End-to-end style autonomous driving models have been developed recently. +These models lack interpretability of decision-making process from perception +to control of the ego vehicle, resulting in anxiety for passengers. To +alleviate it, it is effective to build a model which outputs captions +describing future behaviors of the ego vehicle and their reason. However, the +existing approaches generate reasoning text that inadequately reflects the +future plans of the ego vehicle, because they train models to output captions +using momentary control signals as inputs. In this study, we propose a +reasoning model that takes future planning trajectories of the ego vehicle as +inputs to solve this limitation with the dataset newly collected. + +
+
+ comment: Accepted and presented at ECCV 2024 2nd Workshop on Vision-Centric + Autonomous Driving (VCAD) on September 30, 2024. 13 pages, 5 figures +
+
+
+
+
+ + ☆ Brain-inspired Action Generation with Spiking Transformer Diffusion + Policy Model + + +
+ Spiking Neural Networks (SNNs) have the ability to extract spatio-temporal +features due to their spiking sequence. Previous research has primarily +focused on image classification and reinforcement learning. In our paper, +we put forward a novel diffusion policy model based on Spiking Transformer Neural +Networks and Denoising Diffusion Probabilistic Model (DDPM): Spiking +Transformer Modulate Diffusion Policy Model (STMDP), a new brain-inspired model +for generating robot action trajectories. In order to improve the performance +of this model, we develop a novel decoder module: Spiking Modulate Decoder +(SMD), which replaces the traditional Decoder module within the Transformer +architecture. Additionally, we explored the substitution of DDPM with Denoising +Diffusion Implicit Models (DDIM) in our framework. We conducted experiments +across four robotic manipulation tasks and performed ablation studies on the +modulate block. Our model consistently outperforms existing Transformer-based +diffusion policy methods. In particular, on the Can task, we achieved an improvement of +8%. The proposed STMDP method integrates SNNs, diffusion models and the Transformer +architecture, which offers new perspectives and promising directions for +exploration in brain-inspired robotics. + +
+
+ comment: 10 pages, 4 figures and 2 tables, conference submission +
+
+
+
+
+ + ☆ ALPHA-$α$ and Bi-ACT Are All You Need: Importance of Position and + Force Information/Control for Imitation Learning of Unimanual and Bimanual + Robotic Manipulation with Low-Cost System + + +
+ Autonomous manipulation in everyday tasks requires flexible action generation +to handle complex, diverse real-world environments, such as objects with +varying hardness and softness. Imitation Learning (IL) enables robots to learn +complex tasks from expert demonstrations. However, a lot of existing methods +rely on position/unilateral control, leaving challenges in tasks that require +force information/control, like carefully grasping fragile or varying-hardness +objects. As the need for diverse controls increases, there is demand for +low-cost bimanual robots that consider various motor inputs. To address these +challenges, we introduce Bilateral Control-Based Imitation Learning via Action +Chunking with Transformers (Bi-ACT) and "A" "L"ow-cost "P"hysical "Ha"rdware +Considering Diverse Motor Control Modes for Research in Everyday Bimanual +Robotic Manipulation (ALPHA-$\alpha$). Bi-ACT leverages bilateral control to +utilize both position and force information, enhancing the robot's adaptability +to object characteristics such as hardness, shape, and weight. The concept of +ALPHA-$\alpha$ is affordability, ease of use, repairability, ease of assembly, +and diverse control modes (position, velocity, torque), allowing +researchers/developers to freely build control systems using ALPHA-$\alpha$. In +our experiments, we conducted a detailed analysis of Bi-ACT in unimanual +manipulation tasks, confirming its superior performance and adaptability +compared to Bi-ACT without force control. Based on these results, we applied +Bi-ACT to bimanual manipulation tasks. Experimental results demonstrated high +success rates in coordinated bimanual operations across multiple tasks. The +effectiveness of the Bi-ACT and ALPHA-$\alpha$ can be seen through +comprehensive real-world experiments. Video available at: +https://mertcookimg.github.io/alpha-biact/ + +
+
+
+
+
+ + ☆ Whole-Body Impedance Coordinative Control of Wheel-Legged Robot on + Uncertain Terrain + + +
+ This article proposes a whole-body impedance coordinative control framework +for a wheel-legged humanoid robot to achieve adaptability on complex terrains +while maintaining robot upper body stability. The framework contains a bi-level +control strategy. The outer level is a variable damping impedance controller, +which optimizes the damping parameters to ensure the stability of the upper +body while holding an object. The inner level employs Whole-Body Control (WBC) +optimization that integrates real-time terrain estimation based on wheel-foot +position and force data. It generates motor torques while accounting for +dynamic constraints, joint limits, friction cones, real-time terrain updates, +and a model-free friction compensation strategy. The proposed whole-body +coordinative control method has been tested on a recently developed quadruped +humanoid robot. The results demonstrate that the proposed algorithm effectively +controls the robot, maintaining upper body stability to successfully complete a +water-carrying task while adapting to varying terrains. + +
+
+
+
+
+ + ☆ Autonomous Robotic Pepper Harvesting: Imitation Learning in Unstructured + Agricultural Environments + + +
+ Automating tasks in outdoor agricultural fields poses significant challenges +due to environmental variability, unstructured terrain, and diverse crop +characteristics. We present a robotic system for autonomous pepper harvesting +designed to operate in these unprotected, complex settings. Utilizing a custom +handheld shear-gripper, we collected 300 demonstrations to train a visuomotor +policy, enabling the system to adapt to varying field conditions and crop +diversity. We achieved a success rate of 28.95% with a cycle time of 31.71 +seconds, comparable to existing systems tested under more controlled conditions +like greenhouses. Our system demonstrates the feasibility and effectiveness of +leveraging imitation learning for automated harvesting in unstructured +agricultural environments. This work aims to advance scalable, automated +robotic solutions for agriculture in natural settings. + +
+
+ comment: 8 pages, 11 figures +
+
+
+
+
+ + ☆ Self-Supervised Learning of Grasping Arbitrary Objects On-the-Move + + +
+ Mobile grasping enhances manipulation efficiency by utilizing robots' +mobility. This study aims to enable a commercial off-the-shelf robot for mobile +grasping, requiring precise timing and pose adjustments. Self-supervised +learning can develop a generalizable policy to adjust the robot's velocity and +determine grasp position and orientation based on the target object's shape and +pose. Due to mobile grasping's complexity, action primitivization and +step-by-step learning are crucial to avoid data sparsity in learning from trial +and error. This study simplifies mobile grasping into two grasp action +primitives and a moving action primitive, which can be operated with limited +degrees of freedom for the manipulator. This study introduces three fully +convolutional neural network (FCN) models to predict static grasp primitive, +dynamic grasp primitive, and residual moving velocity error from visual inputs. +A two-stage grasp learning approach facilitates seamless FCN model learning. +The ablation study demonstrated that the proposed method achieved the highest +grasping accuracy and pick-and-place efficiency. Furthermore, randomizing +object shapes and environments in the simulation effectively achieved +generalizable mobile grasping. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Deep learning robotics using self-supervised spatial differentiation + drive autonomous contact-based semiconductor characterization + + +
+ Integrating autonomous contact-based robotic characterization into +self-driving laboratories can enhance measurement quality, reliability, and +throughput. While deep learning models support robust autonomy, current methods +lack pixel-precision positioning and require extensive labeled data. To +overcome these challenges, we propose a self-supervised convolutional neural +network with a spatially differentiable loss function, incorporating shape +priors to refine predictions of optimal robot contact poses for semiconductor +characterization. This network improves valid pose generation by 20.0%, +relative to existing models. We demonstrate our network's performance by +driving a 4-degree-of-freedom robot to characterize photoconductivity at 3,025 +predicted poses across a gradient of perovskite compositions, achieving +throughputs over 125 measurements per hour. Spatially mapping photoconductivity +onto each drop-casted film reveals regions of inhomogeneity. With this +self-supervised deep learning-driven robotic system, we enable high-precision +and reliable automation of contact-based characterization techniques at high +throughputs, thereby allowing the measurement of previously inaccessible yet +important semiconductor properties for self-driving laboratories. + +
+
+
+
+
+ + ☆ Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward + Augmented Imitation + + +
+ Training a policy in a source domain for deployment in the target domain +under a dynamics shift can be challenging, often resulting in performance +degradation. Previous work tackles this challenge by training on the source +domain with modified rewards derived by matching distributions between the +source and the target optimal trajectories. However, pure modified rewards only +ensure the behavior of the learned policy in the source domain resembles +trajectories produced by the target optimal policies, which does not guarantee +optimal performance when the learned policy is actually deployed to the target +domain. In this work, we propose to utilize imitation learning to transfer the +policy learned from the reward modification to the target domain so that the +new policy can generate the same trajectories in the target domain. Our +approach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL), +utilizes the reward modification for domain adaptation and follows the general +framework of generative adversarial imitation learning from observation (GAIfO) +by applying a reward augmented estimator for the policy optimization step. +Theoretically, we present an error bound for our method under a mild assumption +regarding the dynamics shift to justify the motivation of our method. +Empirically, our method outperforms the pure modified reward method without +imitation learning and also outperforms other baselines in benchmark +off-dynamics environments. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + Planning by Simulation: Motion Planning with Learning-based Parallel + Scenario Prediction for Autonomous Driving + + +
+ Planning safe trajectories for autonomous vehicles is essential for +operational safety but remains extremely challenging due to the complex +interactions among traffic participants. Recent autonomous driving frameworks +have focused on improving prediction accuracy to explicitly model these +interactions. However, some methods overlook the significant influence of the +ego vehicle's planning on the possible trajectories of other agents, which can +alter prediction accuracy and lead to unsafe planning decisions. In this paper, +we propose a novel motion Planning approach by Simulation with learning-based +parallel scenario prediction (PS). PS deduces predictions iteratively based on +Monte Carlo Tree Search (MCTS), jointly inferring scenarios that cooperate with +the ego vehicle's planning set. Our method simulates possible scenes and +calculates their costs after the ego vehicle executes potential actions. To +balance and prune unreasonable actions and scenarios, we adopt MCTS as the +foundation to explore possible future interactions encoded within the +prediction network. Moreover, the query-centric trajectory prediction +streamlines our scene generation, enabling a sophisticated framework that +captures the mutual influence between other agents' predictions and the ego +vehicle's planning. We evaluate our framework on the Argoverse 2 dataset, and +the results demonstrate that our approach effectively achieves parallel ego +vehicle planning. + +
+
+
+
+
+ + ☆ Impact-Aware Control using Time-Invariant Reference Spreading + + +
+ With the goal of increasing the speed and efficiency in robotic manipulation, +a control approach is presented that aims to utilize intentional simultaneous +impacts to its advantage. This approach exploits the concept of the +time-invariant reference spreading framework, in which partly-overlapping ante- +and post-impact reference vector fields are used. These vector fields are +coupled via an impact model in proximity of the expected impact area, +minimizing the otherwise large impact-induced velocity errors and control +efforts. We show how a nonsmooth physics engine can be used to construct this +impact model for complex scenarios, which warrants applicability to a large +range of possible impact states without requiring contact stiffness and damping +parameters. In addition, a novel interim-impact control mode provides +robustness in the execution against the inevitable lack of exact impact +simultaneity and the corresponding unreliable velocity error during the time +when contact is only partially established. This interim mode uses a position +feedback signal that is derived from the ante-impact velocity reference to +promote contact completion, and smoothly transitions into the post-impact mode. +An experimental validation of time-invariant reference spreading control is +presented for the first time through a set of 600 robotic hit-and-push and +dual-arm grabbing experiments. + +
+
+ comment: 15 pages, 10 figures. Submitted to IEEE Transactions on Robotics + (T-RO) +
+
+
+
+
+ + ☆ A Novel MLLM-based Approach for Autonomous Driving in Different Weather + Conditions + + +
+ Autonomous driving (AD) technology promises to revolutionize daily +transportation by making it safer, more efficient, and more comfortable. Its +role in reducing traffic accidents and improving mobility will be vital to the +future of intelligent transportation systems. Autonomous driving in harsh +environmental conditions presents significant challenges that demand robust and +adaptive solutions and require more investigation. In this context, we present +in this paper a comprehensive performance analysis of an autonomous driving +agent leveraging the capabilities of a Multi-modal Large Language Model (MLLM) +using GPT-4o within the LimSim++ framework that offers closed-loop interaction +with the CARLA driving simulator. We call it MLLM-AD-4o. Our study evaluates +the agent's decision-making, perception, and control under adverse conditions, +including bad weather, poor visibility, and complex traffic scenarios. Our +results demonstrate the AD agent's ability to maintain high levels of safety +and efficiency, even in challenging environments, underscoring the potential of +GPT-4o to enhance autonomous driving systems (ADS) under any environmental +condition. Moreover, we evaluate the performance of MLLM-AD-4o when different +perception entities are used, including front cameras only, front and +rear cameras, and cameras combined with LiDAR. The results of this work provide +valuable insights into integrating MLLMs with AD frameworks, paving the way for +future advancements in this field. + +
+
+ comment: 9 pages, 6 figures; Submitted to IEEE Transactions on Intelligent + Transportation Systems +
+
+
+
+
+ + ☆ Autonomous Sensor Exchange and Calibration for Cornstalk Nitrate + Monitoring Robot + + +
+ Interactive sensors are an important component of robotic systems but often +require manual replacement due to wear and tear. Automating this process can +enhance system autonomy and facilitate long-term deployment. We developed an +autonomous sensor exchange and calibration system for an agriculture crop +monitoring robot that inserts a nitrate sensor into cornstalks. A novel gripper +and replacement mechanism, featuring a reliable funneling design, were +developed to enable efficient and reliable sensor exchanges. To maintain +consistent nitrate sensor measurement, an on-board sensor calibration station +was integrated to provide in-field sensor cleaning and calibration. The system +was deployed at the Ames Curtis Farm in June 2024, where it successfully +inserted nitrate sensors with high accuracy into 30 cornstalks with a 77$\%$ +success rate. + +
+
+
+
+
+ + ☆ The Oxford Spires Dataset: Benchmarking Large-Scale LiDAR-Visual + Localisation, Reconstruction and Radiance Field Methods + + +
+ This paper introduces a large-scale multi-modal dataset captured in and +around well-known landmarks in Oxford using a custom-built multi-sensor +perception unit as well as a millimetre-accurate map from a Terrestrial LiDAR +Scanner (TLS). The perception unit includes three synchronised global shutter +colour cameras, an automotive 3D LiDAR scanner, and an inertial sensor - all +precisely calibrated. We also establish benchmarks for tasks involving +localisation, reconstruction, and novel-view synthesis, which enable the +evaluation of Simultaneous Localisation and Mapping (SLAM) methods, +Structure-from-Motion (SfM) and Multi-view Stereo (MVS) methods as well as +radiance field methods such as Neural Radiance Fields (NeRF) and 3D Gaussian +Splatting. To evaluate 3D reconstruction the TLS 3D models are used as ground +truth. Localisation ground truth is computed by registering the mobile LiDAR +scans to the TLS 3D models. Radiance field methods are evaluated not only with +poses sampled from the input trajectory, but also from viewpoints that are from +trajectories which are distant from the training poses. Our evaluation +demonstrates a key limitation of state-of-the-art radiance field methods: we +show that they tend to overfit to the training poses/images and do not +generalise well to out-of-sequence poses. They also underperform in 3D +reconstruction compared to MVS systems using the same visual inputs. Our +dataset and benchmarks are intended to facilitate better integration of +radiance field methods and SLAM systems. The raw and processed data, along with +software for parsing and evaluation, can be accessed at +https://dynamic.robots.ox.ac.uk/datasets/oxford-spires/. + +
+
+ comment: Website: https://dynamic.robots.ox.ac.uk/datasets/oxford-spires/ +
+
+
+
+
+ + ☆ Advancing Autonomous Driving Perception: Analysis of Sensor Fusion and + Computer Vision Techniques + + +
+ In autonomous driving, perception systems are pivotal as they interpret +sensory data to understand the environment, which is essential for +decision-making and planning. + Ensuring the safety of these perception systems is fundamental + for achieving high-level autonomy, allowing us to confidently + delegate driving and monitoring tasks to machines. This report aims to +enhance the safety of perception systems by + examining and summarizing the latest advancements in vision-based + systems, and metrics for perception tasks in autonomous + driving. The report also underscores significant achievements and + recognized challenges faced by current research in this field. This + project focuses on enhancing the understanding and navigation + capabilities of self-driving robots through depth-based perception + and computer vision techniques. Specifically, it explores how we + can perform better navigation in an unknown 2D map with + existing detection and tracking algorithms and, on top of that, how + depth-based perception can enhance the navigation capabilities of + wheel-based bots to improve autonomous driving perception. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Denoising Diffusion Planner: Learning Complex Paths from Low-Quality + Demonstrations + + +
+ Denoising Diffusion Probabilistic Models (DDPMs) are powerful generative deep +learning models that have been very successful at image generation, and, very +recently, in path planning and control. In this paper, we investigate how to +leverage the generalization and conditional sampling capabilities of DDPMs to +generate complex paths for a robotic end effector. We show that training a DDPM +with synthetic and low-quality demonstrations is sufficient for generating +nontrivial paths reaching arbitrary targets and avoiding obstacles. +Additionally, we investigate different strategies for conditional sampling +combining classifier-free and classifier-guided approaches. Eventually, we +deploy the DDPM in a receding-horizon control scheme to enhance its planning +capabilities. The Denoising Diffusion Planner is experimentally validated +through various experiments on a Franka Emika Panda robot. + +
+
+
+
+
+ + ♻ ☆ Safe Navigation in Unmapped Environments for Robotic Systems with Input + Constraints + + +
+ This paper presents an approach for navigation and control in unmapped +environments under input and state constraints using a composite control +barrier function (CBF). We consider the scenario where real-time perception +feedback (e.g., LiDAR) is used online to construct a local CBF that models +local state constraints (e.g., local safety constraints such as obstacles) in +the a priori unmapped environment. The approach employs a soft-maximum function +to synthesize a single time-varying CBF from the N most recently obtained local +CBFs. Next, the input constraints are transformed into controller-state +constraints through the use of control dynamics. Then, we use a soft-minimum +function to compose the input constraints with the time-varying CBF that models +the a priori unmapped environment. This composition yields a single relaxed +CBF, which is used in a constrained optimization to obtain an optimal control +that satisfies the state and input constraints. The approach is validated +through simulations of a nonholonomic ground robot that is equipped with LiDAR +and navigates an unmapped environment. The robot successfully navigates the +environment while avoiding the a priori unmapped obstacles and satisfying both +speed and input constraints. + +
+
+ comment: Preprint submitted to 2025 American Control Conference (ACC). arXiv + admin note: substantial text overlap with arXiv:2409.01458 +
+
+
+
+
+ + ♻ ☆ Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a + Hybrid Zonotope Constraint Representation + + +
+ Uncrewed aerial systems have tightly coupled energy and motion dynamics which +must be accounted for by onboard planning algorithms. This work proposes a +strategy for coupled motion and energy planning using model predictive control +(MPC). A reduced-order linear time-invariant model of coupled energy and motion +dynamics is presented. Constrained zonotopes are used to represent state and +input constraints, and hybrid zonotopes are used to represent non-convex +constraints tied to a map of the environment. The structures of these +constraint representations are exploited within a mixed-integer quadratic +program solver tailored to MPC motion planning problems. Results apply the +proposed methodology to coupled motion and energy utilization planning problems +for 1) a hybrid-electric vehicle that must restrict engine usage when flying +over regions with noise restrictions, and 2) an electric package delivery drone +that must track waysets with both position and battery state of charge +requirements. By leveraging the structure-exploiting solver, the proposed +mixed-integer MPC formulations can be implemented in real time. + +
+
+
+
+
+ + ♻ ☆ A Dense Subframe-based SLAM Framework with Side-scan Sonar + + +
+ Side-scan sonar (SSS) is a lightweight acoustic sensor that is commonly +deployed on autonomous underwater vehicles (AUVs) to provide high-resolution +seafloor images. However, leveraging side-scan images for simultaneous +localization and mapping (SLAM) presents a notable challenge, primarily due to +the difficulty of establishing a sufficient number of accurate correspondences +between these images. To address this, we introduce a novel subframe-based +dense SLAM framework utilizing side-scan sonar data, enabling effective dense +matching in overlapping regions of paired side-scan images. With each image +being evenly divided into subframes, we propose a robust estimation pipeline to +estimate the relative pose between each pair of subframes, by using a good +inlier set identified from dense correspondences. These relative poses are then +integrated as edge constraints in a factor graph to optimize the AUV pose +trajectory. + The proposed framework is evaluated on three real datasets collected by a +Hugin AUV. One of them includes manually-annotated keypoint +correspondences as ground truth and is used for evaluation of pose trajectory. +We also present a feasible way of evaluating mapping quality against multi-beam +echosounder (MBES) data without the influence of pose. Experimental results +demonstrate that our approach effectively mitigates drift from the +dead-reckoning (DR) system and enables quasi-dense bathymetry reconstruction. +An open-source implementation of this work is available. + +
+
+ comment: 13 pages, 15 figures. Preprint version of manuscript accepted to IEEE + Journal of Ocean Engineering. arXiv admin note: text overlap with + arXiv:2304.01854 +
+
+
+
+
+ + ♻ ☆ UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for + Egocentric Hand Object Interaction Videos + + +
+ Egocentric Hand Object Interaction (HOI) videos provide valuable insights +into human interactions with the physical world, attracting growing interest +from the computer vision and robotics communities. A key task in fully +understanding the geometry and dynamics of HOI scenes is dense pointclouds +sequence reconstruction. However, the inherent motion of both hands and the +camera makes this challenging. Current methods often rely on time-consuming +test-time optimization, making them impractical for reconstructing +internet-scale videos. To address this, we introduce UniHOI, a model that +unifies the estimation of all variables necessary for dense 4D reconstruction, +including camera intrinsic, camera poses, and video depth, for egocentric HOI +scene in a fast feed-forward manner. We end-to-end optimize all these variables +to improve their consistency in 3D space. Furthermore, our model could be +trained solely on large-scale monocular video dataset, overcoming the +limitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain +and zero-shot generalization setting, surpassing all baselines in pointclouds +sequence reconstruction and long-term 3D scene flow recovery. UniHOI is the +first approach to offer fast, dense, and generalizable monocular egocentric HOI +scene reconstruction in the presence of motion. Code and trained model will be +released in the future. + +
+
+
+
+
+ + ♻ ☆ Towards Safe and Robust Autonomous Vehicle Platooning: A Self-Organizing + Cooperative Control Framework + + +
+ In hybrid traffic environments where human-driven vehicles (HDVs) and +autonomous vehicles (AVs) coexist, achieving safe and robust decision-making +for AV platooning remains a complex challenge. Existing platooning systems +often struggle with dynamic formation management and adaptability, especially +in unpredictable, mixed-traffic conditions. To enhance autonomous vehicle +platooning within these hybrid environments, this paper presents TriCoD, a +twin-world safety-enhanced Data-Model-Knowledge Triple-Driven Cooperative +Decision-making Framework. This framework integrates deep reinforcement +learning (DRL) with model-driven approaches, enabling dynamic formation +dissolution and reconfiguration through a safety-prioritized twin-world +deduction mechanism. The DRL component augments traditional model-driven +methods, enhancing both safety and operational efficiency, especially under +emergency conditions. Additionally, an adaptive switching mechanism allows the +system to seamlessly shift between data-driven and model-driven strategies +based on real-time traffic demands, thereby optimizing decision-making ability +and adaptability. Simulation experiments and hardware-in-the-loop tests +demonstrate that the proposed framework significantly improves safety, +robustness, and flexibility. A detailed account of the validation results for +the model can be found in +\href{https://perfectxu88.github.io/towardssafeandrobust.github.io/}{Our +Website}. + +
+
+
+
+
+ + ♻ ☆ GSORB-SLAM: Gaussian Splatting SLAM benefits from ORB features and + Transmittance information + + +
+ The emergence of 3D Gaussian Splatting (3DGS) has recently sparked a renewed +wave of dense visual SLAM research. However, current methods face challenges +such as sensitivity to artifacts and noise, sub-optimal selection of training +viewpoints, and a lack of light global optimization. In this paper, we propose +a dense SLAM system that tightly couples 3DGS with ORB features. We design a +joint optimization approach for robust tracking and effectively reducing the +impact of noise and artifacts. This involves combining novel geometric +observations, derived from accumulated transmittance, with ORB features +extracted from pixel data. Furthermore, to improve mapping quality, we propose +an adaptive Gaussian expansion and regularization method that enables Gaussian +primitives to represent the scene compactly. This is coupled with a viewpoint +selection strategy based on the hybrid graph to mitigate over-fitting effects +and enhance convergence quality. Finally, our approach achieves compact and +high-quality scene representations and accurate localization. GSORB-SLAM has +been evaluated on different datasets, demonstrating outstanding performance. +The code will be available. + +
+
+
+
+
+ + ♻ ☆ Sequential Gaussian Variational Inference for Nonlinear State Estimation + and Its Application in Robot Navigation + + +
+ Probabilistic state estimation is essential for robots navigating uncertain +environments. Accurately and efficiently managing uncertainty in estimated +states is key to robust robotic operation. However, nonlinearities in robotic +platforms pose significant challenges that require advanced estimation +techniques. Gaussian variational inference (GVI) offers an optimization +perspective on the estimation problem, providing analytically tractable +solutions and efficiencies derived from the geometry of Gaussian space. We +propose a Sequential Gaussian Variational Inference (S-GVI) method to address +nonlinearity and provide efficient sequential inference processes. Our approach +integrates sequential Bayesian principles into the GVI framework, which are +addressed using statistical approximations and gradient updates on the +information geometry. Validations through simulations and real-world +experiments demonstrate significant improvements in state estimation over the +Maximum A Posteriori (MAP) estimation method. + +
+
+ comment: 8 pages +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 130 + +
+
+
+ + ☆ Enhancing the Reasoning Ability of Multimodal Large Language Models via + Mixed Preference Optimization + + +
+ Existing open-source multimodal large language models (MLLMs) generally +follow a training process involving pre-training and supervised fine-tuning. +However, these models suffer from distribution shifts, which limit their +multimodal reasoning, particularly in the Chain-of-Thought (CoT) performance. +To address this, we introduce a preference optimization (PO) process to enhance +the multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data +side, we design an automated preference data construction pipeline to create +MMPR, a high-quality, large-scale multimodal reasoning preference dataset, and +(2) on the model side, we explore integrating PO with MLLMs, developing a +simple yet effective method, termed Mixed Preference Optimization (MPO), which +boosts multimodal CoT performance. Our approach demonstrates improved +performance across multiple benchmarks, particularly in multimodal reasoning +tasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on +MathVista, outperforming InternVL2-8B by 8.7 points and achieving performance +comparable to the 10x larger InternVL2-76B. We hope this study could inspire +further advancements in MLLMs. Code, data, and model shall be publicly +released. + +
+
+
+
+
+ + ☆ LLaVA-o1: Let Vision Language Models Reason Step-by-Step + + +
+ Large language models have demonstrated substantial advancements in reasoning +capabilities, particularly through inference-time scaling, as illustrated by +models such as OpenAI's o1. However, current Vision-Language Models (VLMs) +often struggle to perform systematic and structured reasoning, especially when +handling complex visual question-answering tasks. In this work, we introduce +LLaVA-o1, a novel VLM designed to conduct autonomous multistage reasoning. +Unlike chain-of-thought prompting, LLaVA-o1 independently engages in sequential +stages of summarization, visual interpretation, logical reasoning, and +conclusion generation. This structured approach enables LLaVA-o1 to achieve +marked improvements in precision on reasoning-intensive tasks. To accomplish +this, we compile the LLaVA-o1-100k dataset, integrating samples from various +visual question answering sources and providing structured reasoning +annotations. Besides, we propose an inference-time stage-level beam search +method, which enables effective inference-time scaling. Remarkably, with only +100k training samples and a simple yet effective inference time scaling method, +LLaVA-o1 not only outperforms its base model by 8.9% on a wide range of +multimodal reasoning benchmarks, but also surpasses the performance of larger +and even closed-source models, such as Gemini-1.5-pro, GPT-4o-mini, and +Llama-3.2-90B-Vision-Instruct. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Mitigating Hallucination in Multimodal Large Language Model via + Hallucination-targeted Direct Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) are known to hallucinate, which +limits their practical applications. Recent works have attempted to apply +Direct Preference Optimization (DPO) to enhance the performance of MLLMs, but +have shown inconsistent improvements in mitigating hallucinations. To address +this issue more effectively, we introduce Hallucination-targeted Direct +Preference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike +previous approaches, our method tackles hallucinations from their diverse forms +and causes. Specifically, we develop three types of preference pair data +targeting the following causes of MLLM hallucinations: (1) insufficient visual +capabilities, (2) long context generation, and (3) multimodal conflicts. +Experimental results demonstrate that our method achieves superior performance +across multiple hallucination evaluation datasets, surpassing most +state-of-the-art (SOTA) methods and highlighting the potential of our approach. +Ablation studies and in-depth analyses further confirm the effectiveness of our +method and suggest the potential for further improvements through scaling up. + +
+
+
+
+
+ + ☆ M-VAR: Decoupled Scale-wise Autoregressive Modeling for High-Quality + Image Generation + + +
+ There exists recent work in computer vision, named VAR, that proposes a new +autoregressive paradigm for image generation. Diverging from the vanilla +next-token prediction, VAR structurally reformulates the image generation into +a coarse-to-fine next-scale prediction. In this paper, we show that this +scale-wise autoregressive framework can be effectively decoupled into +\textit{intra-scale modeling}, which captures local spatial dependencies within +each scale, and \textit{inter-scale modeling}, which models cross-scale +relationships progressively from coarse-to-fine scales. This decoupling +structure allows us to rebuild VAR in a more computationally efficient manner. +Specifically, for intra-scale modeling -- crucial for generating high-fidelity +images -- we retain the original bidirectional self-attention design to ensure +comprehensive modeling; for inter-scale modeling, which semantically connects +different scales but is computationally intensive, we apply linear-complexity +mechanisms like Mamba to substantially reduce computational overhead. We term +this new framework M-VAR. Extensive experiments demonstrate that our method +outperforms existing models in both image quality and generation speed. For +example, our 1.5B model, with fewer parameters and faster inference speed, +outperforms the largest VAR-d30-2B. Moreover, our largest model M-VAR-d32 +impressively registers 1.78 FID on ImageNet 256$\times$256 and outperforms the +prior-art autoregressive models LlamaGen/VAR by 0.4/0.19 and popular diffusion +models LDM/DiT by 1.82/0.49, respectively. Code is available at +\url{https://github.com/OliverRensu/MVAR}. + +
+
+
+
+
+ + ☆ Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding + Conversations + + +
+ We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for +human-AI conversations that involves image understanding: it can be used to +safeguard content for both multimodal LLM inputs (prompt classification) and +outputs (response classification). Unlike the previous text-only Llama Guard +versions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed +to support image reasoning use cases and is optimized to detect harmful +multimodal (text and image) prompts and text responses to these prompts. Llama +Guard 3 Vision is fine-tuned on Llama 3.2-Vision and demonstrates strong +performance on the internal benchmarks using the MLCommons taxonomy. We also +test its robustness against adversarial attacks. We believe that Llama Guard 3 +Vision serves as a good starting point to build more capable and robust content +moderation tools for human-AI conversation with multimodal capabilities. + +
+
+
+
+
+ + ☆ Repurposing Stable Diffusion Attention for Training-Free Unsupervised + Interactive Segmentation + + +
+ Recent progress in interactive point prompt based Image Segmentation allows +to significantly reduce the manual effort to obtain high quality semantic +labels. State-of-the-art unsupervised methods use self-supervised pre-trained +models to obtain pseudo-labels which are used in training a prompt-based +segmentation model. In this paper, we propose a novel unsupervised and +training-free approach based solely on the self-attention of Stable Diffusion. +We interpret the self-attention tensor as a Markov transition operator, which +enables us to iteratively construct a Markov chain. Pixel-wise counting of the +required number of iterations along the Markov-chain to reach a relative +probability threshold yields a Markov-iteration-map, which we simply call a +Markov-map. Compared to the raw attention maps, we show that our proposed +Markov-map has less noise, sharper semantic boundaries and more uniform values +within semantically similar regions. We integrate the Markov-map in a simple +yet effective truncated nearest neighbor framework to obtain interactive point +prompt based segmentation. Despite being training-free, we experimentally show +that our approach yields excellent results in terms of Number of Clicks (NoC), +even outperforming state-of-the-art training based unsupervised methods in most +of the datasets. + +
+
+
+
+
+ + ☆ On the Foundation Model for Cardiac MRI Reconstruction MICCAI + + +
+ In recent years, machine learning (ML) based reconstruction has been widely +investigated and employed in cardiac magnetic resonance (CMR) imaging. ML-based +reconstructions can deliver clinically acceptable image quality under +substantially accelerated scans. ML-based reconstruction, however, also +requires substantial data and computational time to train the neural network, +which is often optimized for a fixed acceleration rate or image contrast. In +practice, imaging parameters are often tuned to best suit the diagnosis, which +may differ from the training data. This can result in degraded image quality, +and multiple trained networks are needed to fulfill the clinical demands. In +this study, we propose a foundation model that uses adaptive unrolling, +channel-shifting, and Pattern and Contrast-Prompt-UNet (PCP-UNet) to tackle the +problem. In particular, the undersampled data goes through a different number +of unrolled iterations according to its acceleration rate. Channel-shifting +improves reconstructed data quality. The PCP-UNet is equipped with an image +contrast and sampling pattern prompt. In vivo CMR experiments were performed +using mixed combinations of image contrasts, acceleration rates, and +(under)sampling patterns. The proposed foundation model has significantly +improved image quality for a wide range of CMR protocols and outperforms the +conventional ML-based method. + +
+
+ comment: For MICCAI CMRxRecon Challenge 2024 team CardiAxs +
+
+
+
+
+ + ☆ Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets + Using Key Point Localization + + +
+ Internal crack detection has been a subject of focus in structural health +monitoring. By focusing on crack detection in structural datasets, it is +demonstrated that deep learning (DL) methods can effectively analyze seismic +wave fields interacting with micro-scale cracks, which are beyond the +resolution of conventional visual inspection. This work explores a novel +application of DL-based key point detection technique, where cracks are +localized by predicting the coordinates of four key points that define a +bounding region of the crack. The study not only opens new research directions +for non-visual applications but also effectively mitigates the impact of +imbalanced data which poses a challenge for previous DL models, as it can be +biased toward predicting the majority class (non-crack regions). Popular DL +techniques, such as the Inception blocks, are used and investigated. The model +shows an overall reduction in loss when applied to micro-scale crack detection +and is reflected in the lower average deviation between the location of actual +and predicted cracks, with an average Intersection over Union (IoU) being 0.511 +for all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro +cracks (greater than 4 micrometers). + +
+
+
+
+
+ + ☆ Generation of synthetic gait data: application to multiple sclerosis + patients' gait patterns + + +
+ Multiple sclerosis (MS) is the leading cause of severe non-traumatic +disability in young adults and its incidence is increasing worldwide. The +variability of gait impairment in MS necessitates the development of a +non-invasive, sensitive, and cost-effective tool for quantitative gait +evaluation. The eGait movement sensor, designed to characterize human gait +through unit quaternion time series (QTS) representing hip rotations, is a +promising approach. However, the small sample sizes typical of clinical studies +pose challenges for the stability of gait data analysis tools. To address these +challenges, this article presents two key scientific contributions. First, a +comprehensive framework is proposed for transforming QTS data into a form that +preserves the essential geometric properties of gait while enabling the use of +any tabular synthetic data generation method. Second, a synthetic data +generation method is introduced, based on nearest neighbors weighting, which +produces high-fidelity synthetic QTS data suitable for small datasets and +private data environments. The effectiveness of the proposed method is +demonstrated through its application to MS gait data, showing very good +fidelity and respect for the initial geometry of the data. Thanks to this work, +we are able to produce synthetic data sets and work on the stability of +clustering methods. + +
+
+
+
+
+ + ☆ Towards High-Fidelity 3D Portrait Generation with Rich Details by + Cross-View Prior-Aware Diffusion + + +
+ Recent diffusion-based single-image 3D portrait generation methods typically +employ 2D diffusion models to provide multi-view knowledge, which is then +distilled into 3D representations. However, these methods usually struggle to +produce high-fidelity 3D models, frequently yielding excessively blurred +textures. We attribute this issue to the insufficient consideration of +cross-view consistency during the diffusion process, resulting in significant +disparities between different views and ultimately leading to blurred 3D +representations. In this paper, we address this issue by comprehensively +exploiting multi-view priors in both the conditioning and diffusion procedures +to produce consistent, detail-rich portraits. From the conditioning standpoint, +we propose a Hybrid Priors Diffusion model, which explicitly and implicitly +incorporates multi-view priors as conditions to enhance the status consistency +of the generated multi-view portraits. From the diffusion perspective, +considering the significant impact of the diffusion noise distribution on +detailed texture generation, we propose a Multi-View Noise Resampling Strategy +integrated within the optimization process leveraging cross-view priors to +enhance representation consistency. Extensive experiments demonstrate that our +method can produce 3D portraits with accurate geometry and rich details from a +single image. The project page is at +https://haoran-wei.github.io/Portrait-Diffusion. + +
+
+
+
+
+ + ☆ Mechanisms of Generative Image-to-Image Translation Networks + + +
+ Generative Adversarial Networks (GANs) are a class of neural networks that +have been widely used in the field of image-to-image translation. In this +paper, we propose a streamlined image-to-image translation network with a +simpler architecture compared to existing models. We investigate the +relationship between GANs and autoencoders and provide an explanation for the +efficacy of employing only the GAN component for tasks involving image +translation. We show that adversarial for GAN models yields results comparable +to those of existing methods without additional complex loss penalties. +Subsequently, we elucidate the rationale behind this phenomenon. We also +incorporate experimental results to demonstrate the validity of our findings. + +
+
+
+
+
+ + ☆ Interactive Image-Based Aphid Counting in Yellow Water Traps under + Stirring Actions + + +
+ The current vision-based aphid counting methods in water traps suffer from +undercounts caused by occlusions and low visibility arising from dense +aggregation of insects and other objects. To address this problem, we propose a +novel aphid counting method through interactive stirring actions. We use +interactive stirring to alter the distribution of aphids in the yellow water +trap and capture a sequence of images which are then used for aphid detection +and counting through an optimized small object detection network based on +Yolov5. We also propose a counting confidence evaluation system to evaluate the +confidence of counting results. The final counting result is a weighted sum of +the counting results from all sequence images based on the counting confidence. +Experimental results show that our proposed aphid detection network +significantly outperforms the original Yolov5, with improvements of 33.9% in +AP@0.5 and 26.9% in AP@[0.5:0.95] on the aphid test set. In addition, the aphid +counting test results using our proposed counting confidence evaluation system +show significant improvements over the static counting method, closely aligning +with manual counting results. + +
+
+
+
+
+ + BiDense: Binarization for Dense Prediction + + +
+ Dense prediction is a critical task in computer vision. However, previous +methods often require extensive computational resources, which hinders their +real-world application. In this paper, we propose BiDense, a generalized binary +neural network (BNN) designed for efficient and accurate dense prediction +tasks. BiDense incorporates two key techniques: the Distribution-adaptive +Binarizer (DAB) and the Channel-adaptive Full-precision Bypass (CFB). The DAB +adaptively calculates thresholds and scaling factors for binarization, +effectively retaining more information within BNNs. Meanwhile, the CFB +facilitates full-precision bypassing for binary convolutional layers undergoing +various channel size transformations, which enhances the propagation of +real-valued signals and minimizes information loss. By leveraging these +techniques, BiDense preserves more real-valued information, enabling more +accurate and detailed dense predictions in BNNs. Extensive experiments +demonstrate that our framework achieves performance levels comparable to +full-precision models while significantly reducing memory usage and +computational costs. + +
+
+
+
+
+ + ☆ Comparative Analysis of Machine Learning Approaches for Bone Age + Assessment: A Comprehensive Study on Three Distinct Models + + +
+ Radiologists and doctors make use of X-ray images of the non-dominant hands +of children and infants to assess the possibility of genetic conditions and +growth abnormalities. This is done by assessing the difference between the +actual extent of growth found using the X-rays and the chronological age of the +subject. The assessment was done conventionally using The Greulich Pyle (GP) or +Tanner Whitehouse (TW) approach. These approaches require a high level of +expertise and may often lead to observer bias. Hence, to automate the process +of assessing the X-rays, and to increase its accuracy and efficiency, several +machine learning models have been developed. These machine-learning models have +several differences in their accuracy and efficiencies, leading to an unclear +choice for the suitable model depending on their needs and available resources. +Methods: In this study, we have analyzed the 3 most widely used models for the +automation of bone age prediction, which are the Xception model, VGG model and +CNN model. These models were trained on the preprocessed dataset and the +accuracy was measured using the MAE in terms of months for each model. Using +this, the comparison between the models was done. Results: The 3 models, +Xception, VGG, and CNN models have been tested for accuracy and other relevant +factors. + +
+
+
+
+
+ + ☆ Y-MAP-Net: Real-time depth, normals, segmentation, multi-label + captioning and 2D human pose in RGB images + + +
+ We present Y-MAP-Net, a Y-shaped neural network architecture designed for +real-time multi-task learning on RGB images. Y-MAP-Net simultaneously predicts +depth, surface normals, human pose, semantic segmentation and generates +multi-label captions, all from a single network evaluation. To achieve this, we +adopt a multi-teacher, single-student training paradigm, where task-specific +foundation models supervise the network's learning, enabling it to distill +their capabilities into a lightweight architecture suitable for real-time +applications. Y-MAP-Net exhibits strong generalization, simplicity and +computational efficiency, making it ideal for robotics and other practical +scenarios. To support future research, we will release our code publicly. + +
+
+ comment: 8 page paper, 6 Figures, 3 Tables +
+
+
+
+
+ + ☆ Number it: Temporal Grounding Videos like Flipping Manga + + +
+ Video Large Language Models (Vid-LLMs) have made remarkable advancements in +comprehending video content for QA dialogue. However, they struggle to extend +this visual understanding to tasks requiring precise temporal localization, +known as Video Temporal Grounding (VTG). To address this gap, we introduce +Number-Prompt (NumPro), a novel method that empowers Vid-LLMs to bridge visual +comprehension with temporal grounding by adding unique numerical identifiers to +each video frame. Treating a video as a sequence of numbered frame images, +NumPro transforms VTG into an intuitive process: flipping through manga panels +in sequence. This allows Vid-LLMs to "read" event timelines, accurately linking +visual content with corresponding temporal information. Our experiments +demonstrate that NumPro significantly boosts VTG performance of top-tier +Vid-LLMs without additional computational cost. Furthermore, fine-tuning on a +NumPro-enhanced dataset defines a new state-of-the-art for VTG, surpassing +previous top-performing methods by up to 6.9% in mIoU for moment retrieval and +8.5% in mAP for highlight detection. The code will be available at +https://github.com/yongliang-wu/NumPro. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ CNN-Based Classification of Persian Miniature Paintings from Five + Renowned Schools + + +
+ This article addresses the gap in computational painting analysis focused on +Persian miniature painting, a rich cultural and artistic heritage. It +introduces a novel approach using Convolutional Neural Networks (CNN) to +classify Persian miniatures from five schools: Herat, Tabriz-e Avval, Shiraz-e +Avval, Tabriz-e Dovvom, and Qajar. The method achieves an average accuracy of +over 91%. A meticulously curated dataset captures the distinct features of each +school, with a patch-based CNN approach classifying image segments +independently before merging results for enhanced accuracy. This research +contributes significantly to digital art analysis, providing detailed insights +into the dataset, CNN architecture, training, and validation processes. It +highlights the potential for future advancements in automated art analysis, +bridging machine learning, art history, and digital humanities, thereby aiding +the preservation and understanding of Persian cultural heritage. + +
+
+ comment: 20 pages, submitted to journal +
+
+
+
+
+ + The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer + Use + + +
+ The recently released model, Claude 3.5 Computer Use, stands out as the first +frontier AI model to offer computer use in public beta as a graphical user +interface (GUI) agent. As an early beta, its capability in the real-world +complex environment remains unknown. In this case study to explore Claude 3.5 +Computer Use, we curate and organize a collection of carefully designed tasks +spanning a variety of domains and software. Observations from these cases +demonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end +language to desktop actions. Along with this study, we provide an +out-of-the-box agent framework for deploying API-based GUI automation models +with easy implementation. Our case studies aim to showcase a groundwork of +capabilities and limitations of Claude 3.5 Computer Use with detailed analyses +and bring to the fore questions about planning, action, and critic, which must +be considered for future improvement. We hope this preliminary exploration will +inspire future research into the GUI agent community. All the test cases in the +paper can be tried through the project: +https://github.com/showlab/computer_use_ootb. + +
+
+ comment: 40 pages, 21 figures, preprint +
+
+
+
+
+ + ☆ Melanoma Detection with Uncertainty Quantification + + +
+ Early detection of melanoma is crucial for improving survival rates. Current +detection tools often utilize data-driven machine learning methods but often +overlook the full integration of multiple datasets. We combine publicly +available datasets to enhance data diversity, allowing numerous experiments to +train and evaluate various classifiers. We then calibrate them to minimize +misdiagnoses by incorporating uncertainty quantification. Our experiments on +benchmark datasets show accuracies of up to 93.2% before and 97.8% after +applying uncertainty-based rejection, leading to a reduction in misdiagnoses by +over 40.5%. Our code and data are publicly available, and a web-based interface +for quick melanoma detection of user-supplied images is also provided. + +
+
+ comment: 5 pages, 5 figures, 3 tables, submitted to ISBI2025 +
+
+
+
+
+ + ☆ Probabilistic Prior Driven Attention Mechanism Based on Diffusion Model + for Imaging Through Atmospheric Turbulence + + +
+ Atmospheric turbulence introduces severe spatial and geometric distortions, +challenging traditional image restoration methods. We propose the Probabilistic +Prior Turbulence Removal Network (PPTRN), which combines probabilistic +diffusion-based prior modeling with Transformer-driven feature extraction to +address this issue. PPTRN employs a two-stage approach: first, a latent encoder +and Transformer are jointly trained on clear images to establish robust feature +representations. Then, a Denoising Diffusion Probabilistic Model (DDPM) models +prior distributions over latent vectors, guiding the Transformer in capturing +diverse feature variations essential for restoration. A key innovation in PPTRN +is the Probabilistic Prior Driven Cross Attention mechanism, which integrates +the DDPM-generated prior with feature embeddings to reduce artifacts and +enhance spatial coherence. Extensive experiments validate that PPTRN +significantly improves restoration quality on turbulence-degraded images, +setting a new benchmark in clarity and structural fidelity. + +
+
+
+
+
+ + ☆ M3TR: Generalist HD Map Construction with Variable Map Priors + + +
+ Autonomous vehicles require road information for their operation, usually in +the form of HD maps. Since offline maps eventually become outdated or may only be +partially available, online HD map construction methods have been proposed to +infer map information from live sensor data. A key issue remains how to exploit +such partial or outdated map information as a prior. We introduce M3TR +(Multi-Masking Map Transformer), a generalist approach for HD map construction +both with and without map priors. We address shortcomings in ground truth +generation for Argoverse 2 and nuScenes and propose the first realistic +scenarios with semantically diverse map priors. Examining various query +designs, we use an improved method for integrating prior map elements into an HD +map construction model, increasing performance by +4.3 mAP. Finally, we show +that training across all prior scenarios yields a single Generalist model, +whose performance is on par with previous Expert models that can handle only +one specific type of map prior. M3TR thus is the first model capable of +leveraging variable map priors, making it suitable for real-world deployment. +Code is available at https://github.com/immel-f/m3tr + +
+
+
+
+
+ + ☆ Modification Takes Courage: Seamless Image Stitching via + Reference-Driven Inpainting + + +
+ Current image stitching methods often produce noticeable seams in challenging +scenarios such as uneven hue and large parallax. To tackle this problem, we +propose the Reference-Driven Inpainting Stitcher (RDIStitcher), which +reformulates the image fusion and rectangling as a reference-based inpainting +model, incorporating a larger modification fusion area and stronger +modification intensity than previous methods. Furthermore, we introduce a +self-supervised model training method, which enables the implementation of +RDIStitcher without requiring labeled data by fine-tuning a Text-to-Image (T2I) +diffusion model. Recognizing difficulties in assessing the quality of stitched +images, we present the Multimodal Large Language Models (MLLMs)-based metrics, +offering a new perspective on evaluating stitched image quality. Compared to +the state-of-the-art (SOTA) method, extensive experiments demonstrate that our +method significantly enhances content coherence and seamless transitions in the +stitched images. Especially in the zero-shot experiments, our method exhibits +strong generalization capabilities. Code: +https://github.com/yayoyo66/RDIStitcher + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ A Realistic Collimated X-Ray Image Simulation Pipeline + + +
+ Collimator detection remains a challenging task in X-ray systems with +unreliable or non-available information about the detector's position relative +to the source. This paper presents a physically motivated image processing +pipeline for simulating the characteristics of collimator shadows in X-ray +images. By generating randomized labels for collimator shapes and locations, +incorporating scattered radiation simulation, and including Poisson noise, the +pipeline enables the expansion of limited datasets for training deep neural +networks. We validate the proposed pipeline by a qualitative and quantitative +comparison against real collimator shadows. Furthermore, it is demonstrated +that utilizing simulated data within our deep learning framework not only +serves as a suitable substitute for actual collimators but also enhances the +generalization performance when applied to real-world data. + +
+
+
+
+
+ + ☆ RETR: Multi-View Radar Detection Transformer for Indoor Perception NeurIPS 2024 + + +
+ Indoor radar perception has seen rising interest due to affordable costs +driven by emerging automotive imaging radar developments and the benefits of +reduced privacy concerns and reliability under hazardous conditions (e.g., fire +and smoke). However, existing radar perception pipelines fail to account for +distinctive characteristics of the multi-view radar setting. In this paper, we +propose Radar dEtection TRansformer (RETR), an extension of the popular DETR +architecture, tailored for multi-view radar perception. RETR inherits the +advantages of DETR, eliminating the need for hand-crafted components for object +detection and segmentation in the image plane. More importantly, RETR +incorporates carefully designed modifications such as 1) depth-prioritized +feature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss +from both radar and camera coordinates; and 3) a learnable radar-to-camera +transformation via reparameterization, to account for the unique multi-view +radar setting. Evaluated on two indoor radar perception datasets, our approach +outperforms existing state-of-the-art methods by a margin of 15.38+ AP for +object detection and 11.77+ IoU for instance segmentation, respectively. + +
+
+ comment: 24 pages, Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Multidimensional Byte Pair Encoding: Shortened Sequences for Improved + Visual Data Generation + + +
+ In language processing, transformers benefit greatly from text being +condensed. This is achieved through a larger vocabulary that captures word +fragments instead of plain characters. This is often done with Byte Pair +Encoding. In the context of images, tokenisation of visual data is usually +limited to regular grids obtained from quantisation methods, without global +content awareness. Our work improves tokenisation of visual data by bringing +Byte Pair Encoding from 1D to multiple dimensions, as a complementary add-on to +existing compression. We achieve this through counting constellations of token +pairs and replacing the most frequent token pair with a newly introduced token. +The multidimensionality only increases the computation time by a factor of 2 +for images, making it applicable even to large datasets like ImageNet within +minutes on consumer hardware. This is a lossless preprocessing step. Our +evaluation shows improved training and inference performance of transformers on +visual data achieved by compressing frequent constellations of tokens: The +resulting sequences are shorter, with more uniformly distributed information +content, e.g. condensing empty regions in an image into single tokens. As our +experiments show, these condensed sequences are easier to process. We +additionally introduce a strategy to amplify this compression further by +clustering the vocabulary. + +
+
+
+
+
+ + ☆ 4DPV: 4D Pet from Videos by Coarse-to-Fine Non-Rigid Radiance Fields ACCV 2024 + + +
+ We present a coarse-to-fine neural deformation model to simultaneously +recover the camera pose and the 4D reconstruction of an unknown object from +multiple RGB sequences in the wild. To that end, our approach does not consider +any pre-built 3D template nor 3D training data as well as controlled +illumination conditions, and can sort out the problem in a self-supervised +manner. Our model exploits canonical and image-variant spaces where both coarse +and fine components are considered. We introduce a neural local quadratic model +with spatio-temporal consistency to encode fine details that is combined with +canonical embeddings in order to establish correspondences across sequences. We +thoroughly validate the method on challenging scenarios with complex and +real-world deformations, providing both quantitative and qualitative +evaluations, an ablation study and a comparison with respect to competing +approaches. Our project is available at https://github.com/smontode24/4DPV. + +
+
+ comment: 17th Asian Conference on Computer Vision (ACCV 2024) +
+
+
+
+
+ + ☆ Fill in the blanks: Rethinking Interpretability in vision + + +
+ Model interpretability is a key challenge that has yet to align with the +advancements observed in contemporary state-of-the-art deep learning models. In +particular, deep learning aided vision tasks require interpretability, in order +for their adoption in more specialized domains such as medical imaging. +Although the field of explainable AI (XAI) developed methods for interpreting +vision models along with early convolutional neural networks, recent XAI +research has mainly focused on assigning attributes via saliency maps. As such, +these methods are restricted to providing explanations at a sample level, and +many explainability methods suffer from low adaptability across a wide range of +vision models. In our work, we re-think vision-model explainability from a +novel perspective, to probe the general input structure that a model has learnt +during its training. To this end, we ask the question: "How would a vision +model fill in a masked image?" Experiments on standard vision datasets and +pre-trained models reveal consistent patterns, and could be integrated as an +additional model-agnostic explainability tool in modern machine-learning +platforms. The code will be available at +https://github.com/BoTZ-TND/FillingTheBlanks.git + +
+
+
+
+
+ + ☆ Partial Scene Text Retrieval + + +
+ The task of partial scene text retrieval involves localizing and searching +for text instances that are the same or similar to a given query text from an +image gallery. However, existing methods can only handle text-line instances, +leaving the problem of searching for partial patches within these text-line +instances unsolved due to a lack of patch annotations in the training data. To +address this issue, we propose a network that can simultaneously retrieve both +text-line instances and their partial patches. Our method embeds the two types +of data (query text and scene text instances) into a shared feature space and +measures their cross-modal similarities. To handle partial patches, our +proposed approach adopts a Multiple Instance Learning (MIL) approach to learn +their similarities with query text, without requiring extra annotations. +However, constructing bags, which is a standard step of conventional MIL +approaches, can introduce numerous noisy samples for training, and lower +inference speed. To address this issue, we propose a Ranking MIL (RankMIL) +approach to adaptively filter those noisy samples. Additionally, we present a +Dynamic Partial Match Algorithm (DPMA) that can directly search for the target +partial patch from a text-line instance during the inference stage, without +requiring bags. This greatly improves the search efficiency and the performance +of retrieving partial patches. The source code and dataset are available at +https://github.com/lanfeng4659/PSTR. + +
+
+ comment: Accepted on TPAMI +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Guidance for Diffusion Models + + +
+ Guidance is an error-correcting technique used to improve the perceptual +quality of images generated by diffusion models. Typically, the correction is +achieved by linear extrapolation, using an auxiliary diffusion model that has +lower performance than the primary model. Using a 2D toy example, we show that +it is highly beneficial when the auxiliary model exhibits similar errors as the +primary one but stronger. We verify this finding in higher dimensions, where we +show that competitive generative performance to state-of-the-art guidance +methods can be achieved when the auxiliary model differs from the primary one +only by having stronger weight regularization. As an independent contribution, +we investigate whether upweighting long-range spatial dependencies improves +visual fidelity. The result is a novel guidance method, which we call sliding +window guidance (SWG), that guides the primary model with itself by +constraining its receptive field. Intriguingly, SWG aligns better with human +preferences than state-of-the-art guidance methods while requiring neither +training, architectural modifications, nor class conditioning. The code will be +released. + +
+
+ comment: Preprint. 19 pages, 14 figures in total, including references and + appendix +
+
+
+
+
+ + ☆ Visual-Linguistic Agent: Towards Collaborative Contextual Object + Reasoning + + +
+ Multimodal Large Language Models (MLLMs) excel at descriptive tasks within +images but often struggle with precise object localization, a critical element +for reliable visual interpretation. In contrast, traditional object detection +models provide high localization accuracy but frequently generate detections +lacking contextual coherence due to limited modeling of inter-object +relationships. To address this fundamental limitation, we introduce the +Visual-Linguistic Agent (VLA), a collaborative framework that combines +the relational reasoning strengths of MLLMs with the precise localization +capabilities of traditional object detectors. In the VLA paradigm, the MLLM +serves as a central Linguistic Agent, working collaboratively with specialized +Vision Agents for object detection and classification. The Linguistic Agent +evaluates and refines detections by reasoning over spatial and contextual +relationships among objects, while the classification Vision Agent offers +corrective feedback to improve classification accuracy. This collaborative +approach enables VLA to significantly enhance both spatial reasoning and object +localization, addressing key challenges in multimodal understanding. Extensive +evaluations on the COCO dataset demonstrate substantial performance +improvements across multiple detection models, highlighting VLA's potential to +set a new benchmark in accurate and contextually coherent object detection. + +
+
+
+
+
+ + ☆ Morpho-Aware Global Attention for Image Matting + + +
+ Vision Transformers (ViTs) and Convolutional Neural Networks (CNNs) face +inherent challenges in image matting, particularly in preserving fine +structural details. ViTs, with their global receptive field enabled by the +self-attention mechanism, often lose local details such as hair strands. +Conversely, CNNs, constrained by their local receptive field, rely on deeper +layers to approximate global context but struggle to retain fine structures at +greater depths. + To overcome these limitations, we propose a novel Morpho-Aware Global +Attention (MAGA) mechanism, designed to effectively capture the morphology of +fine structures. MAGA employs Tetris-like convolutional patterns to align the +local shapes of fine structures, ensuring optimal local correspondence while +maintaining sensitivity to morphological details. The extracted local +morphology information is used as query embeddings, which are projected onto +global key embeddings to emphasize local details in a broader context. +Subsequently, by projecting onto value embeddings, MAGA seamlessly integrates +these emphasized morphological details into a unified global structure. + This approach enables MAGA to simultaneously focus on local morphology and +unify these details into a coherent whole, effectively preserving fine +structures. Extensive experiments show that our MAGA-based ViT achieves +significant performance gains, outperforming state-of-the-art methods across +two benchmarks with average improvements of 4.3% in SAD and 39.5% in MSE. + +
+
+
+
+
+ + ☆ ScribbleVS: Scribble-Supervised Medical Image Segmentation via Dynamic + Competitive Pseudo Label Selection + + +
+ In clinical medicine, precise image segmentation can provide substantial +support to clinicians. However, achieving such precision often requires a large +amount of finely annotated data, which can be costly. Scribble annotation +presents a more efficient alternative, boosting labeling efficiency. However, +utilizing such minimal supervision for medical image segmentation training, +especially with scribble annotations, poses significant challenges. To address +these challenges, we introduce ScribbleVS, a novel framework that leverages +scribble annotations. We introduce a Regional Pseudo Labels Diffusion Module to +expand the scope of supervision and reduce the impact of noise present in +pseudo labels. Additionally, we propose a Dynamic Competitive Selection module +for enhanced refinement in selecting pseudo labels. Experiments conducted on +the ACDC and MSCMRseg datasets have demonstrated promising results, achieving +performance levels that even exceed those of fully supervised methodologies. +The codes of this study are available at +https://github.com/ortonwang/ScribbleVS. + +
+
+
+
+
+ + ☆ ColorEdit: Training-free Image-Guided Color editing with diffusion model + + +
+ Text-to-image (T2I) diffusion models, with their impressive generative +capabilities, have been adopted for image editing tasks, demonstrating +remarkable efficacy. However, due to attention leakage and collision between +the cross-attention map of the object and the new color attribute from the text +prompt, text-guided image editing methods may fail to change the color of an +object, resulting in a misalignment between the resulting image and the text +prompt. In this paper, we conduct an in-depth analysis on the process of +text-guided image synthesizing and what semantic information different +cross-attention blocks have learned. We observe that the visual representation +of an object is determined in the up-block of the diffusion model in the early +stage of the denoising process, and color adjustment can be achieved through +value matrices alignment in the cross-attention layer. Based on our findings, +we propose a straightforward, yet stable, and effective image-guided method to +modify the color of an object without requiring any additional fine-tuning or +training. Lastly, we present a benchmark dataset called COLORBENCH, the first +benchmark to evaluate the performance of color change methods. Extensive +experiments validate the effectiveness of our method in object-level color +editing and surpass the performance of popular text-guided image editing +approaches in both synthesized and real images. + +
+
+
+
+
+ + ☆ A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image + Super-Resolution with Transformers and TaylorShift + + +
+ Transformer-based Super-Resolution (SR) models have recently advanced image +reconstruction quality, yet challenges remain due to computational complexity +and an over-reliance on large patch sizes, which constrain fine-grained detail +enhancement. In this work, we propose TaylorIR to address these limitations by +utilizing a patch size of 1x1, enabling pixel-level processing in any +transformer-based SR model. To address the significant computational demands +under the traditional self-attention mechanism, we employ the TaylorShift +attention mechanism, a memory-efficient alternative based on Taylor series +expansion, achieving full token-to-token interactions with linear complexity. +Experimental results demonstrate that our approach achieves new +state-of-the-art SR performance while reducing memory consumption by up to 60% +compared to traditional self-attention-based transformers. + +
+
+
+
+
+ + ☆ MCL: Multi-view Enhanced Contrastive Learning for Chest X-ray Report + Generation + + +
+ Radiology reports are crucial for planning treatment strategies and enhancing +doctor-patient communication, yet manually writing these reports is burdensome +for radiologists. While automatic report generation offers a solution, existing +methods often rely on single-view radiographs, limiting diagnostic accuracy. To +address this problem, we propose MCL, a Multi-view enhanced Contrastive +Learning method for chest X-ray report generation. Specifically, we first +introduce multi-view enhanced contrastive learning for visual representation by +maximizing agreements between multi-view radiographs and their corresponding +report. Subsequently, to fully exploit patient-specific indications (e.g., +patient's symptoms) for report generation, we add a transitional "bridge" for +missing indications to reduce embedding space discrepancies caused by their +presence or absence. Additionally, we construct Multi-view CXR and Two-view CXR +datasets from public sources to support research on multi-view report +generation. Our proposed MCL surpasses recent state-of-the-art methods across +multiple datasets, achieving a 5.0% F1 RadGraph improvement on MIMIC-CXR, a +7.3% BLEU-1 improvement on MIMIC-ABN, a 3.1% BLEU-4 improvement on Multi-view +CXR, and an 8.2% F1 CheXbert improvement on Two-view CXR. + +
+
+ comment: https://github.com/mk-runner/MCL +
+
+
+
+
+ + ☆ Learning Generalizable 3D Manipulation With 10 Demonstrations + + +
+ Learning robust and generalizable manipulation skills from demonstrations +remains a key challenge in robotics, with broad applications in industrial +automation and service robotics. While recent imitation learning methods have +achieved impressive results, they often require large amounts of demonstration +data and struggle to generalize across different spatial variants. In this +work, we present a novel framework that learns manipulation skills from as few +as 10 demonstrations, yet still generalizes to spatial variants such as +different initial object positions and camera viewpoints. Our framework +consists of two key modules: Semantic Guided Perception (SGP), which constructs +task-focused, spatially aware 3D point cloud representations from RGB-D inputs; +and Spatial Generalized Decision (SGD), an efficient diffusion-based +decision-making module that generates actions via denoising. To effectively +learn generalization ability from limited data, we introduce a critical +spatially equivariant training strategy that captures the spatial knowledge +embedded in expert demonstrations. We validate our framework through extensive +experiments on both simulation benchmarks and real-world robotic systems. Our +method demonstrates a 60 percent improvement in success rates over +state-of-the-art approaches on a series of challenging tasks, even with +substantial variations in object poses and camera viewpoints. This work shows +significant potential for advancing efficient, generalizable manipulation skill +learning in real-world applications. + +
+
+
+
+
+ + ☆ Block based Adaptive Compressive Sensing with Sampling Rate Control + + +
+ Compressive sensing (CS), acquiring and reconstructing signals below the +Nyquist rate, has great potential in image and video acquisition to exploit +data redundancy and greatly reduce the amount of sampled data. To further +reduce the sampled data while keeping the video quality, this paper explores +the temporal redundancy in video CS and proposes a block based adaptive +compressive sensing framework with a sampling rate (SR) control strategy. To +avoid redundant compression of non-moving regions, we first incorporate moving +block detection between consecutive frames, and only transmit the measurements +of moving blocks. The non-moving regions are reconstructed from the previous +frame. In addition, we propose a block storage system and a dynamic threshold +to achieve adaptive SR allocation to each frame based on the area of moving +regions and target SR for controlling the average SR within the target SR. +Finally, to reduce blocking artifacts and improve reconstruction quality, we +adopt a cooperative reconstruction of the moving and non-moving blocks by +referring to the measurements of the non-moving blocks from the previous frame. +Extensive experiments have demonstrated that this work is able to control SR +and obtain better performance than existing works. + +
+
+ comment: Accepted to MMAsia2024 +
+
+
+
+
+ + ☆ STLight: a Fully Convolutional Approach for Efficient Predictive + Learning by Spatio-Temporal joint Processing WACV 2025 + + +
+ Spatio-Temporal predictive Learning is a self-supervised learning paradigm +that enables models to identify spatial and temporal patterns by predicting +future frames based on past frames. Traditional methods, which use recurrent +neural networks to capture temporal patterns, have proven their effectiveness +but come with high system complexity and computational demand. Convolutions +could offer a more efficient alternative but are limited by their +characteristic of treating all previous frames equally, resulting in poor +temporal characterization, and by their local receptive field, limiting the +capacity to capture distant correlations among frames. In this paper, we +propose STLight, a novel method for spatio-temporal learning that relies solely +on channel-wise and depth-wise convolutions as learnable layers. STLight +overcomes the limitations of traditional convolutional approaches by +rearranging spatial and temporal dimensions together, using a single +convolution to mix both types of features into a comprehensive spatio-temporal +patch representation. This representation is then processed in a purely +convolutional framework, capable of focusing simultaneously on the interaction +among near and distant patches, and subsequently allowing for efficient +reconstruction of the predicted frames. Our architecture achieves +state-of-the-art performance on STL benchmarks across different datasets and +settings, while significantly improving computational efficiency in terms of +parameters and computational FLOPs. The code is publicly available + +
+
+ comment: Accepted at WACV 2025 conference +
+
+
+
+
+ + ☆ DiMoDif: Discourse Modality-information Differentiation for Audio-visual + Deepfake Detection and Localization + + +
+ Deepfake technology has rapidly advanced, posing significant threats to +information integrity and societal trust. While significant progress has been +made in detecting deepfakes, the simultaneous manipulation of audio and visual +modalities, sometimes at small parts but still altering the meaning, presents a +more challenging detection scenario. We present a novel audio-visual deepfake +detection framework that leverages the inter-modality differences in machine +perception of speech, based on the assumption that in real samples - in +contrast to deepfakes - visual and audio signals coincide in terms of +information. Our framework leverages features from deep networks that +specialize in video and audio speech recognition to spot frame-level +cross-modal incongruities, and in that way to temporally localize the deepfake +forgery. To this end, DiMoDif employs a Transformer encoder-based architecture +with a feature pyramid scheme and local attention, and optimizes the detection +model through a composite loss function accounting for frame-level detections +and fake intervals localization. DiMoDif outperforms the state-of-the-art on +the Temporal Forgery Localization task by +47.88% AP@0.75 on AV-Deepfake1M, and +performs on-par on LAV-DF. On the Deepfake Detection task, it outperforms the +state-of-the-art by +30.5% AUC on AV-Deepfake1M, +2.8% AUC on FakeAVCeleb, and +performs on-par on LAV-DF. Code available at +https://github.com/mever-team/dimodif. + +
+
+
+
+
+ + ☆ NeISF++: Neural Incident Stokes Field for Polarized Inverse Rendering of + Conductors and Dielectrics + + +
+ Recent inverse rendering methods have greatly improved shape, material, and +illumination reconstruction by utilizing polarization cues. However, existing +methods only support dielectrics, ignoring conductors that are found everywhere +in life. Since conductors and dielectrics have different reflection properties, +using previous conductor methods will lead to obvious errors. In addition, +conductors are glossy, which may cause strong specular reflection and is hard +to reconstruct. To solve the above issues, we propose NeISF++, an inverse +rendering pipeline that supports conductors and dielectrics. The key ingredient +for our proposal is a general pBRDF that describes both conductors and +dielectrics. As for the strong specular reflection problem, we propose a novel +geometry initialization method using DoLP images. This physical cue is +invariant to intensities and thus robust to strong specular reflections. +Experimental results on our synthetic and real datasets show that our method +surpasses the existing polarized inverse rendering methods for geometry and +material decomposition as well as downstream tasks like relighting. + +
+
+
+
+
+ + ☆ Try-On-Adapter: A Simple and Flexible Try-On Paradigm + + +
+ Image-based virtual try-on, widely used in online shopping, aims to generate +images of a naturally dressed person conditioned on certain garments, providing +significant research and commercial potential. A key challenge of try-on is to +generate realistic images of the model wearing the garments while preserving +the details of the garments. Previous methods focus on masking certain parts of +the original model's standing image, and then inpainting on masked areas to +generate realistic images of the model wearing corresponding reference +garments, which treat the try-on task as an inpainting task. However, such +implementations require the user to provide a complete, high-quality standing image, +which is user-unfriendly in practical applications. In this paper, we propose +Try-On-Adapter (TOA), an outpainting paradigm that differs from the existing +inpainting paradigm. Our TOA can preserve the given face and garment, naturally +imagine the remaining parts of the image, and provide flexible control ability with +various conditions, e.g., garment properties and human pose. In the +experiments, TOA shows excellent performance on the virtual try-on task even +given relatively low-quality face and garment images in qualitative +comparisons. Additionally, TOA achieves the state-of-the-art performance of FID +scores 5.56 and 7.23 for paired and unpaired on the VITON-HD dataset in +quantitative comparisons. + +
+
+ comment: Image virtual try-on, 7 pages, 3 figures +
+
+
+
+
+ + ☆ Efficient Progressive Image Compression with Variance-aware Masking WACV 2025 + + +
+ Learned progressive image compression is gaining momentum as it allows +improved image reconstruction as more bits are decoded at the receiver. We +propose a progressive image compression method in which an image is first +represented as a pair of base-quality and top-quality latent representations. +Next, a residual latent representation is encoded as the element-wise +difference between the top and base representations. Our scheme enables +progressive image compression with element-wise granularity by introducing a +masking system that ranks each element of the residual latent representation +from most to least important, dividing it into complementary components, which +can be transmitted separately to the decoder in order to obtain different +reconstruction quality. The masking system does not add further parameters nor +complexity. At the receiver, any elements of the top latent representation +excluded from the transmitted components can be independently replaced with the +mean predicted by the hyperprior architecture, ensuring reliable +reconstructions at any intermediate quality level. We also introduced Rate +Enhancement Modules (REMs), which refine the estimation of entropy parameters +using already decoded components. We obtain results competitive with +state-of-the-art competitors, while significantly reducing computational +complexity, decoding time, and number of parameters. + +
+
+ comment: 10 pages. Accepted at WACV 2025 +
+
+
+
+
+ + ☆ Visual question answering based evaluation metrics for text-to-image + generation + + +
+ Text-to-image generation and text-guided image manipulation have received +considerable attention in the field of image generation tasks. However, the +mainstream evaluation methods for these tasks have difficulty in evaluating +whether all the information from the input text is accurately reflected in the +generated images, and they mainly focus on evaluating the overall alignment +between the input text and the generated images. This paper proposes new +evaluation metrics that assess the alignment between input text and generated +images for every individual object. Firstly, according to the input text, +ChatGPT is utilized to produce questions for the generated images. After that, +we use Visual Question Answering (VQA) to measure the relevance of the generated +images to the input text, which allows for a more detailed evaluation of the +alignment compared to existing methods. In addition, we use Non-Reference Image +Quality Assessment (NR-IQA) to evaluate not only the text-image alignment but +also the quality of the generated images. Experimental results show that our +proposed evaluation approach is the superior metric that can simultaneously +assess finer text-image alignment and image quality while allowing for the +adjustment of these ratios. + +
+
+ comment: Accepted to ISCAS2024 +
+
+
+
+
+ + ☆ CART: Compositional Auto-Regressive Transformer for Image Generation CVPR 2025 + + +
+ In recent years, image synthesis has achieved remarkable advancements, +enabling diverse applications in content creation, virtual reality, and beyond. +We introduce a novel approach to image generation using Auto-Regressive (AR) +modeling, which leverages a next-detail prediction strategy for enhanced +fidelity and scalability. While AR models have achieved transformative success +in language modeling, replicating this success in vision tasks has presented +unique challenges due to the inherent spatial dependencies in images. Our +proposed method addresses these challenges by iteratively adding finer details +to an image compositionally, constructing it as a hierarchical combination of +base and detail image factors. This strategy is shown to be more effective than +the conventional next-token prediction and even surpasses the state-of-the-art +next-scale prediction approaches. A key advantage of this method is its +scalability to higher resolutions without requiring full model retraining, +making it a versatile solution for high-resolution image generation. + +
+
+ comment: under review at CVPR 2025 +
+
+
+
+
+ + ☆ The Surprising Ineffectiveness of Pre-Trained Visual Representations for + Model-Based Reinforcement Learning NeurIPS 2024 + + +
+ Visual Reinforcement Learning (RL) methods often require extensive amounts of +data. As opposed to model-free RL, model-based RL (MBRL) offers a potential +solution with efficient data utilization through planning. Additionally, RL +lacks generalization capabilities for real-world tasks. Prior work has shown +that incorporating pre-trained visual representations (PVRs) enhances sample +efficiency and generalization. While PVRs have been extensively studied in the +context of model-free RL, their potential in MBRL remains largely unexplored. +In this paper, we benchmark a set of PVRs on challenging control tasks in a +model-based RL setting. We investigate the data efficiency, generalization +capabilities, and the impact of different properties of PVRs on the performance +of model-based agents. Our results, perhaps surprisingly, reveal that for MBRL +current PVRs are not more sample efficient than learning representations from +scratch, and that they do not generalize better to out-of-distribution (OOD) +settings. To explain this, we analyze the quality of the trained dynamics +model. Furthermore, we show that data diversity and network architecture are +the most important contributors to OOD generalization performance. + +
+
+ comment: Published at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/ +
+
+
+
+
+ + ☆ SEAGULL: No-reference Image Quality Assessment for Regions of Interest + via Vision-Language Instruction Tuning + + +
+ Existing Image Quality Assessment (IQA) methods achieve remarkable success in +analyzing quality for the overall image, but few works explore quality analysis for +Regions of Interest (ROIs). The quality analysis of ROIs can provide +fine-grained guidance for image quality improvement and is crucial for +scenarios focusing on region-level quality. This paper proposes a novel +network, SEAGULL, which can SEe and Assess ROIs quality with GUidance from a +Large vision-Language model. SEAGULL incorporates a vision-language model +(VLM), masks generated by Segment Anything Model (SAM) to specify ROIs, and a +meticulously designed Mask-based Feature Extractor (MFE) to extract global and +local tokens for specified ROIs, enabling accurate fine-grained IQA for ROIs. +Moreover, this paper constructs two ROI-based IQA datasets, SEAGULL-100w and +SEAGULL-3k, for training and evaluating ROI-based IQA. SEAGULL-100w comprises +about 100w synthetic distortion images with 33 million ROIs for pre-training to +improve the model's ability of regional quality perception, and SEAGULL-3k +contains about 3k authentic distortion ROIs to enhance the model's ability to +perceive real world distortions. After pre-training on SEAGULL-100w and +fine-tuning on SEAGULL-3k, SEAGULL shows remarkable performance on fine-grained +ROI quality assessment. Code and datasets are publicly available at +https://github.com/chencn2020/Seagull. + +
+
+
+
+
+ + ☆ Outliers resistant image classification by anomaly detection + + +
+ Various technologies, including computer vision models, are employed for the +automatic monitoring of manual assembly processes in production. These models +detect and classify events such as the presence of components in an assembly +area or the connection of components. A major challenge with detection and +classification algorithms is their susceptibility to variations in +environmental conditions and unpredictable behavior when processing objects +that are not included in the training dataset. As it is impractical to add all +possible subjects in the training sample, an alternative solution is necessary. +This study proposes a model that simultaneously performs classification and +anomaly detection, employing metric learning to generate vector representations +of images in a multidimensional space, followed by classification using +cross-entropy. For experimentation, a dataset of over 327,000 images was +prepared. Experiments were conducted with various computer vision model +architectures, and the outcomes of each approach were compared. + +
+
+ comment: 19 pages, in Russian +
+
+
+
+
+ + ☆ Matrix-Valued LogSumExp Approximation for Colour Morphology + + +
+ Mathematical morphology is a part of image processing that uses a window that +moves across the image to change certain pixels according to certain +operations. The concepts of supremum and infimum play a crucial role here, but +it proves challenging to define them generally for higher-dimensional data, +such as colour representations. Numerous approaches have therefore been taken +to solve this problem with certain compromises. In this paper we will analyse +the construction of a new approach, which we have already presented +experimentally in paper [Kahra, M., Breuß, M., Kleefeld, A., Welk, M., DGMM +2024, pp. 325-337]. This is based on a method by Burgeth and Kleefeld [Burgeth, +B., Kleefeld, A., ISMM 2013, pp. 243-254], who regard the colours as symmetric +$2\times2$ matrices and compare them by means of the Loewner order in a bi-cone +through different suprema. However, we will replace the supremum with the +LogExp approximation for the maximum instead. This allows us to transfer the +associativity of the dilation from the one-dimensional case to the +higher-dimensional case. In addition, we will investigate the minimality +property and specify a relaxation to ensure that our approach is continuously +dependent on the input data. + +
+
+ comment: 42 pages, 10 figures, to be submitted in JMIV +
+
+
+
+
+ + ☆ CoSAM: Self-Correcting SAM for Domain Generalization in 2D Medical Image + Segmentation + + +
+ Medical images often exhibit distribution shifts due to variations in imaging +protocols and scanners across different medical centers. Domain Generalization +(DG) methods aim to train models on source domains that can generalize to +unseen target domains. Recently, the segment anything model (SAM) has +demonstrated strong generalization capabilities due to its prompt-based design, +and has gained significant attention in image segmentation tasks. Existing +SAM-based approaches attempt to address the need for manual prompts by +introducing prompt generators that automatically generate these prompts. +However, we argue that auto-generated prompts may not be sufficiently accurate +under distribution shifts, potentially leading to incorrect predictions that +still require manual verification and correction by clinicians. To address this +challenge, we propose a method for 2D medical image segmentation called +Self-Correcting SAM (CoSAM). Our approach begins by generating coarse masks +using SAM in a prompt-free manner, providing prior prompts for the subsequent +stages, and eliminating the need for prompt generators. To automatically refine +these coarse masks, we introduce a generalized error decoder that simulates the +correction process typically performed by clinicians. Furthermore, we generate +diverse prompts as feedback based on the corrected masks, which are used to +iteratively refine the predictions within a self-correcting loop, enhancing the +generalization performance of our model. Extensive experiments on two medical +image segmentation benchmarks across multiple scenarios demonstrate the +superiority of CoSAM over state-of-the-art SAM-based methods. + +
+
+
+
+
+ + ☆ Efficient Density Control for 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) excels in novel view synthesis, balancing +advanced rendering quality with real-time performance. However, in trained +scenes, a large number of Gaussians with low opacity significantly increase +rendering costs. This issue arises due to flaws in the split and clone +operations during the densification process, which lead to extensive Gaussian +overlap and subsequent opacity reduction. To enhance the efficiency of Gaussian +utilization, we improve the adaptive density control of 3DGS. First, we +introduce a more efficient long-axis split operation to replace the original +clone and split, which mitigates Gaussian overlap and improves densification +efficiency. Second, we propose a simple adaptive pruning technique to reduce the +number of low-opacity Gaussians. Finally, by dynamically lowering the splitting +threshold and applying importance weighting, the efficiency of Gaussian +utilization is further improved. We evaluate our proposed method on various +challenging real-world datasets. Experimental results show that our Efficient +Density Control (EDC) can enhance both the rendering speed and quality. + +
+
+
+
+
+ + ☆ Towards Multi-View Consistent Style Transfer with One-Step Diffusion via + Vision Conditioning ECCV 2024 + + +
+ The stylization of 3D scenes is an increasingly attractive topic in 3D +vision. Although image style transfer has been extensively researched with +promising results, directly applying 2D style transfer methods to 3D scenes +often fails to preserve the structural and multi-view properties of 3D +environments, resulting in unpleasant distortions in images from different +viewpoints. To address these issues, we leverage the remarkable generative +prior of diffusion-based models and propose a novel style transfer method, +OSDiffST, based on a pre-trained one-step diffusion model (i.e., SD-Turbo) for +rendering diverse styles in multi-view images of 3D scenes. To efficiently +adapt the pre-trained model for multi-view style transfer on small datasets, we +introduce a vision condition module to extract style information from the +reference style image to serve as conditional input for the diffusion model and +employ LoRA in diffusion model for adaptation. Additionally, we consider color +distribution alignment and structural similarity between the stylized and +content images using two specific loss functions. As a result, our method +effectively preserves the structural information and multi-view consistency in +stylized images without any 3D information. Experiments show that our method +surpasses other promising style transfer methods in synthesizing various styles +for multi-view images of 3D scenes. Stylized images from different viewpoints +generated by our method achieve superior visual quality, with better structural +integrity and less distortion. The source code is available at +https://github.com/YushenZuo/OSDiffST. + +
+
+ comment: Accepted by ECCV 2024 AI for Visual Arts Workshop and Challenges, 18 + pages, 7 figures +
+
+
+
+
+ + ☆ Multi-Task Adversarial Variational Autoencoder for Estimating Biological + Brain Age with Multimodal Neuroimaging + + +
+ Despite advances in deep learning for estimating brain age from structural +MRI data, incorporating functional MRI data is challenging due to its complex +structure and the noisy nature of functional connectivity measurements. To +address this, we present the Multitask Adversarial Variational Autoencoder, a +custom deep learning framework designed to improve brain age predictions +through multimodal MRI data integration. This model separates latent variables +into generic and unique codes, isolating shared and modality-specific features. +By integrating multitask learning with sex classification as an additional +task, the model captures sex-specific aging patterns. Evaluated on the OpenBHB +dataset, a large multisite brain MRI collection, the model achieves a mean +absolute error of 2.77 years, outperforming traditional methods. This success +positions M-AVAE as a powerful tool for metaverse-based healthcare applications +in brain age estimation. + +
+
+
+
+
+ + ☆ CorrCLIP: Reconstructing Correlations in CLIP with Off-the-Shelf + Foundation Models for Open-Vocabulary Semantic Segmentation + + +
+ Open-vocabulary semantic segmentation aims to assign semantic labels to each +pixel without relying on a predefined set of categories. Contrastive +Language-Image Pre-training (CLIP) demonstrates outstanding zero-shot +classification capabilities but struggles with the pixel-wise segmentation task +as the captured inter-patch correlations correspond to no specific visual +concepts. Despite previous CLIP-based works improving inter-patch correlations +by self-self attention, they still face the inherent limitation that image +patches tend to have high similarity to outlier ones. In this work, we +introduce CorrCLIP, a training-free approach for open-vocabulary semantic +segmentation, which reconstructs significantly coherent inter-patch +correlations utilizing foundation models. Specifically, it employs the Segment +Anything Model (SAM) to define the scope of patch interactions, ensuring that +patches interact only with semantically similar ones. Furthermore, CorrCLIP +obtains an understanding of an image's semantic layout via self-supervised +models to determine concrete similarity values between image patches, which +addresses the similarity irregularity problem caused by the aforementioned +restricted patch interaction regime. Finally, CorrCLIP reuses the region masks +produced by SAM to update the segmentation map. As a training-free method, +CorrCLIP achieves a notable improvement across eight challenging benchmarks +regarding the averaged mean Intersection over Union, boosting it from 44.4% to +51.0%. + +
+
+
+
+
+ + ☆ Influence of Depth Camera Noise Models on Respiration Estimation + + +
+ Depth cameras are an interesting modality for capturing vital signs such as +respiratory rate. Plenty of approaches exist to extract vital signs in a +controlled setting, but in order to apply them more flexibly, for example in +multi-camera settings, a simulated environment is needed to generate enough +data for training and testing of new algorithms. We show first results of a +3D-rendering simulation pipeline that focuses on different noise models in +order to generate realistic, depth-camera based respiratory signals using both +synthetic and real respiratory signals as a baseline. While most noise can be +accurately modelled as Gaussian in this context, we can show that as soon as +the available image resolution is too low, the differences between different +noise models surface. + +
+
+ comment: Poster Prague 2023 Conference, 4 pages +
+
+
+
+
+ + ☆ Uncertainty-Weighted Mutual Distillation for Multi-View Fusion + + +
+ Multi-view learning often faces challenges in effectively leveraging images +captured from different angles and locations. This challenge is particularly +pronounced when addressing inconsistencies and uncertainties between views. In +this paper, we propose a novel Multi-View Uncertainty-Weighted Mutual +Distillation (MV-UWMD) method. Our method enhances prediction consistency by +performing hierarchical mutual distillation across all possible view +combinations, including single-view, partial multi-view, and full multi-view +predictions. This introduces an uncertainty-based weighting mechanism through +mutual distillation, allowing effective exploitation of unique information from +each view while mitigating the impact of uncertain predictions. We extend a +CNN-Transformer hybrid architecture to facilitate robust feature learning and +integration across multiple view combinations. We conducted extensive +experiments using a large, unstructured dataset captured from diverse, +non-fixed viewpoints. The results demonstrate that MV-UWMD improves prediction +accuracy and consistency compared to existing multi-view learning approaches. + +
+
+
+
+
+ + ☆ Improving the accuracy of automated labeling of specimen images datasets + via a confidence-based process + + +
+ The digitization of natural history collections over the past three decades +has unlocked a treasure trove of specimen imagery and metadata. There is great +interest in making this data more useful by further labeling it with additional +trait data, and modern deep learning machine learning techniques utilizing +convolutional neural nets (CNNs) and similar networks show particular promise +to reduce the amount of required manual labeling by human experts, making the +process much faster and less expensive. However, in most cases, the accuracy of +these approaches is too low for reliable utilization of the automatic labeling, +typically in the range of 80-85% accuracy. In this paper, we present and +validate an approach that can greatly improve this accuracy, essentially by +examining the confidence that the network has in the generated label as well as +utilizing a user-defined threshold to reject labels that fall below a chosen +level. We demonstrate that a naive model that produced 86% initial accuracy can +achieve improved performance - over 95% accuracy (rejecting about 40% of the +labels) or over 99% accuracy (rejecting about 65%) by selecting higher +confidence thresholds. This gives flexibility to adapt existing models to the +statistical requirements of various types of research and has the potential to +move these automatic labeling approaches from being unusably inaccurate to +being an invaluable new tool. After validating the approach in a number of +ways, we annotate the reproductive state of a large dataset of over 600,000 +herbarium specimens. The analysis of the results points at under-investigated +correlations as well as general alignment with known trends. By sharing this +new dataset alongside this work, we want to allow ecologists to gather insights +for their own research questions, at their chosen point of accuracy/coverage +trade-off. + +
+
+
+
+
+ + ☆ Real-Time AI-Driven People Tracking and Counting Using Overhead Cameras + + +
+ Accurate people counting in smart buildings and intelligent transportation +systems is crucial for energy management, safety protocols, and resource +allocation. This is especially critical during emergencies, where precise +occupant counts are vital for safe evacuation. Existing methods struggle with +large crowds, often losing accuracy with even a few additional people. To +address this limitation, this study proposes a novel approach combining a new +object tracking algorithm, a novel counting algorithm, and a fine-tuned object +detection model. This method achieves 97% accuracy in real-time people counting +with a frame rate of 20-27 FPS on a low-power edge computer. + +
+
+ comment: This paper is accepted to IEEE Region 10 conference (TENCON) 2024 +
+
+
+
+
+ + ☆ Evidential Federated Learning for Skin Lesion Image Classification ICPR 2024 + + +
+ We introduce FedEvPrompt, a federated learning approach that integrates +principles of evidential deep learning, prompt tuning, and knowledge +distillation for distributed skin lesion classification. FedEvPrompt leverages +two sets of prompts: b-prompts (for low-level basic visual knowledge) and +t-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision +Transformer (ViT) models trained in an evidential learning framework to +maximize class evidences. Crucially, knowledge sharing across federation +clients is achieved only through knowledge distillation on attention maps +generated by the local ViT models, ensuring enhanced privacy preservation +compared to traditional parameter or synthetic image sharing methodologies. +FedEvPrompt is optimized within a round-based learning paradigm, where each +round involves training local models followed by attention maps sharing with +all federation clients. Experimental validation conducted in a real distributed +setting, on the ISIC2019 dataset, demonstrates the superior performance of +FedEvPrompt against baseline federated learning algorithms and knowledge +distillation methods, without sharing model parameters. In conclusion, +FedEvPrompt offers a promising approach for federated learning, effectively +addressing challenges such as data heterogeneity, imbalance, privacy +preservation, and knowledge sharing. + +
+
+ comment: Published as a conference paper at ICPR 2024 +
+
+
+
+
+ + ☆ Step-wise Distribution Alignment Guided Style Prompt Tuning for + Source-free Cross-domain Few-shot Learning + + +
+ Existing cross-domain few-shot learning (CDFSL) methods, which develop +source-domain training strategies to enhance model transferability, face +challenges with large-scale pre-trained models (LMs) due to inaccessible source +data and training strategies. Moreover, fine-tuning LMs for CDFSL demands +substantial computational resources, limiting practicality. This paper +addresses the source-free CDFSL (SF-CDFSL) problem, tackling few-shot learning +(FSL) in the target domain using only pre-trained models and a few target +samples without source data or strategies. To overcome the challenge of +inaccessible source data, this paper introduces Step-wise Distribution +Alignment Guided Style Prompt Tuning (StepSPT), which implicitly narrows domain +gaps through prediction distribution optimization. StepSPT proposes a style +prompt to align target samples with the desired distribution and adopts a +dual-phase optimization process. In the external process, a step-wise +distribution alignment strategy factorizes prediction distribution optimization +into a multi-step alignment problem to tune the style prompt. In the internal +process, the classifier is updated using standard cross-entropy loss. +Evaluations on five datasets demonstrate that StepSPT outperforms existing +prompt tuning-based methods and SOTAs. Ablation studies further verify its +effectiveness. Code will be made publicly available at +https://github.com/xuhuali-mxj/StepSPT. + +
+
+ comment: 15 pages, 12 figures, 7 tables +
+
+
+
+
+ + ☆ Diachronic Document Dataset for Semantic Layout Analysis + + +
+ We present a novel, open-access dataset designed for semantic layout +analysis, built to support document recreation workflows through mapping with +the Text Encoding Initiative (TEI) standard. This dataset includes 7,254 +annotated pages spanning a large temporal range (1600-2024) of digitised and +born-digital materials across diverse document types (magazines, papers from +sciences and humanities, PhD theses, monographs, plays, administrative reports, +etc.) sorted into modular subsets. By incorporating content from different +periods and genres, it addresses varying layout complexities and historical +changes in document structure. The modular design allows domain-specific +configurations. We evaluate object detection models on this dataset, examining +the impact of input size and subset-based training. Results show that a +1280-pixel input size for YOLO is optimal and that training on subsets +generally benefits from incorporating them into a generic model rather than +fine-tuning pre-trained weights. + +
+
+
+
+
+ + ☆ Federated Domain Generalization via Prompt Learning and Aggregation + + +
+ Federated domain generalization (FedDG) aims to improve the global model +generalization in unseen domains by addressing data heterogeneity under +privacy-preserving constraints. A common strategy in existing FedDG studies +involves sharing domain-specific knowledge among clients, such as spectrum +information, class prototypes, and data styles. However, this knowledge is +extracted directly from local client samples, and sharing such sensitive +information poses a potential risk of data leakage, which might not fully meet +the requirements of FedDG. In this paper, we introduce prompt learning to adapt +pre-trained vision-language models (VLMs) in the FedDG scenario, and leverage +locally learned prompts as a more secure bridge to facilitate knowledge +transfer among clients. Specifically, we propose a novel FedDG framework +through Prompt Learning and AggregatioN (PLAN), which comprises two training +stages to collaboratively generate local prompts and global prompts at each +federated round. First, each client performs both text and visual prompt +learning using their own data, with local prompts indirectly synchronized by +regarding the global prompts as a common reference. Second, all domain-specific +local prompts are exchanged among clients and selectively aggregated into the +global prompts using lightweight attention-based aggregators. The global +prompts are finally applied to adapt VLMs to unseen target domains. As our PLAN +framework requires training only a limited number of prompts and lightweight +aggregators, it offers notable advantages in computational and communication +efficiency for FedDG. Extensive experiments demonstrate the superior +generalization ability of PLAN across four benchmark datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation + + +
+ Recent work on human animation usually involves audio, pose, or movement maps +conditions, thereby achieving vivid animation quality. However, these methods +often face practical challenges due to extra control conditions, cumbersome +condition injection modules, or limitation to head region driving. Hence, we +ask if it is possible to achieve striking half-body human animation while +simplifying unnecessary conditions. To this end, we propose a half-body human +animation method, dubbed EchoMimicV2, that leverages a novel Audio-Pose Dynamic +Harmonization strategy, including Pose Sampling and Audio Diffusion, to enhance +half-body details, facial and gestural expressiveness, and meanwhile reduce +conditions redundancy. To compensate for the scarcity of half-body data, we +utilize Head Partial Attention to seamlessly accommodate headshot data into our +training framework, which can be omitted during inference, providing a free +lunch for animation. Furthermore, we design the Phase-specific Denoising Loss +to guide motion, detail, and low-level quality for animation in specific +phases, respectively. Besides, we also present a novel benchmark for evaluating +the effectiveness of half-body human animation. Extensive experiments and +analyses demonstrate that EchoMimicV2 surpasses existing methods in both +quantitative and qualitative evaluations. + +
+
+
+
+
+ + ☆ Rethinking Normalization Strategies and Convolutional Kernels for + Multimodal Image Fusion + + +
+ Multimodal image fusion (MMIF) aims to integrate information from different +modalities to obtain a comprehensive image, aiding downstream tasks. However, +existing methods tend to prioritize natural image fusion and focus on +information complementary and network training strategies. They ignore the +essential distinction between natural and medical image fusion and the +influence of underlying components. This paper dissects the significant +differences between the two tasks regarding fusion goals, statistical +properties, and data distribution. Based on this, we rethink the suitability of +the normalization strategy and convolutional kernels for end-to-end +MMIF. Specifically, this paper proposes a mixture of instance normalization and +group normalization to preserve sample independence and reinforce intrinsic +feature correlation. This strategy promotes the potential of enriching feature +maps, thus boosting fusion performance. To this end, we further introduce the +large kernel convolution, effectively expanding receptive fields and enhancing +the preservation of image detail. Moreover, the proposed multipath adaptive +fusion module recalibrates the decoder input with features of various scales +and receptive fields, ensuring the transmission of crucial information. +Extensive experiments demonstrate that our method exhibits state-of-the-art +performance in multiple fusion tasks and significantly improves downstream +applications. The code is available at https://github.com/HeDan-11/LKC-FUNet. + +
+
+
+
+
+ + ☆ GSEditPro: 3D Gaussian Splatting Editing with Attention-based + Progressive Localization + + +
+ With the emergence of large-scale Text-to-Image(T2I) models and implicit 3D +representations like Neural Radiance Fields (NeRF), many text-driven generative +editing methods based on NeRF have appeared. However, the implicit encoding of +geometric and textural information poses challenges in accurately locating and +controlling objects during editing. Recently, significant advancements have +been made in the editing methods of 3D Gaussian Splatting, a real-time +rendering technology that relies on explicit representation. However, these +methods still suffer from issues including inaccurate localization and limited +manipulation over editing. To tackle these challenges, we propose GSEditPro, a +novel 3D scene editing framework which allows users to perform various creative +and precise editing using text prompts only. Leveraging the explicit nature of +the 3D Gaussian distribution, we introduce an attention-based progressive +localization module to add semantic labels to each Gaussian during rendering. +This enables precise localization on editing areas by classifying Gaussians +based on their relevance to the editing prompts derived from cross-attention +layers of the T2I model. Furthermore, we present an innovative editing +optimization method based on 3D Gaussian Splatting, obtaining stable and +refined editing results through the guidance of Score Distillation Sampling and +pseudo ground truth. We prove the efficacy of our method through extensive +experiments. + +
+
+ comment: Pacific Graphics 2024 +
+
+
+
+
+ + ☆ VMID: A Multimodal Fusion LLM Framework for Detecting and Identifying + Misinformation of Short Videos + + +
+ Short video platforms have become important channels for news dissemination, +offering a highly engaging and immediate way for users to access current events +and share information. However, these platforms have also emerged as +significant conduits for the rapid spread of misinformation, as fake news and +rumors can leverage the visual appeal and wide reach of short videos to +circulate extensively among audiences. Existing fake news detection methods +mainly rely on single-modal information, such as text or images, or apply only +basic fusion techniques, limiting their ability to handle the complex, +multi-layered information inherent in short videos. To address these +limitations, this paper presents a novel fake news detection method based on +multimodal information, designed to identify misinformation through a +multi-level analysis of video content. This approach effectively utilizes +different modal representations to generate a unified textual description, +which is then fed into a large language model for comprehensive evaluation. The +proposed framework successfully integrates multimodal features within videos, +significantly enhancing the accuracy and reliability of fake news detection. +Experimental results demonstrate that the proposed approach outperforms +existing models in terms of accuracy, robustness, and utilization of multimodal +information, achieving an accuracy of 90.93%, which is significantly higher +than the best baseline model (SV-FEND) at 81.05%. Furthermore, case studies +provide additional evidence of the effectiveness of the approach in accurately +distinguishing between fake news, debunking content, and real incidents, +highlighting its reliability and robustness in real-world applications. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2211.10973 by other authors +
+
+
+
+
+ + ☆ Toward Robust and Accurate Adversarial Camouflage Generation against + Vehicle Detectors + + +
+ Adversarial camouflage is a widely used physical attack against vehicle +detectors for its superiority in multi-view attack performance. One promising +approach involves using differentiable neural renderers to facilitate +adversarial camouflage optimization through gradient back-propagation. However, +existing methods often struggle to capture environmental characteristics during +the rendering process or produce adversarial textures that can precisely map to +the target vehicle. Moreover, these approaches neglect diverse weather +conditions, reducing the efficacy of generated camouflage across varying +weather scenarios. To tackle these challenges, we propose a robust and accurate +camouflage generation method, namely RAUCA. The core of RAUCA is a novel neural +rendering component, End-to-End Neural Renderer Plus (E2E-NRP), which can +accurately optimize and project vehicle textures and render images with +environmental characteristics such as lighting and weather. In addition, we +integrate a multi-weather dataset for camouflage generation, leveraging the +E2E-NRP to enhance the attack robustness. Experimental results on six popular +object detectors show that RAUCA-final outperforms existing methods in both +simulation and real-world settings. + +
+
+ comment: 14 pages. arXiv admin note: substantial text overlap with + arXiv:2402.15853 +
+
+
+
+
+ + ☆ MOT_FCG++: Enhanced Representation of Motion and Appearance Features + + +
+ The goal of multi-object tracking (MOT) is to detect and track all objects in +a scene across frames, while maintaining a unique identity for each object. +Most existing methods rely on the spatial motion features and appearance +embedding features of the detected objects in consecutive frames. Effectively +and robustly representing the spatial and appearance features of long +trajectories has become a critical factor affecting the performance of MOT. We +propose a novel approach for appearance and spatial feature representation, +improving upon the clustering association method MOT_FCG. For spatial motion +features, we propose Diagonal Modulated GIoU, which more accurately represents +the relationship between the position and shape of the objects. For appearance +features, we utilize a dynamic appearance representation that incorporates +confidence information, enabling the trajectory appearance features to be more +robust and global. Based on the baseline model MOT_FCG, we achieved 76.1 HOTA, +80.4 MOTA and 81.3 IDF1 on the MOT17 validation set, and also achieved +competitive performance on the MOT20 and DanceTrack validation sets. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Towards Utilising a Range of Neural Activations for Comprehending + Representational Associations + + +
+ Recent efforts to understand intermediate representations in deep neural +networks have commonly attempted to label individual neurons and combinations +of neurons that make up linear directions in the latent space by examining +extremal neuron activations and the highest direction projections. In this +paper, we show that this approach, although yielding a good approximation for +many purposes, fails to capture valuable information about the behaviour of a +representation. Neural network activations are generally dense, and so a more +complex, but realistic scenario is that linear directions encode information at +various levels of stimulation. We hypothesise that non-extremal level +activations contain complex information worth investigating, such as +statistical associations, and thus may be used to locate confounding human +interpretable concepts. We explore the value of studying a range of neuron +activations by taking the case of mid-level output neuron activations and +demonstrate on a synthetic dataset how they can inform us about aspects of +representations in the penultimate layer not evident through analysing maximal +activations alone. We use our findings to develop a method to curate data from +mid-range logit samples for retraining to mitigate spurious correlations, or +confounding concepts in the penultimate layer, on real benchmark datasets. The +success of our method exemplifies the utility of inspecting non-maximal +activations to extract complex relationships learned by models. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field + Analysis Using Deep Neural Networks through Feature Visualization + + +
+ Micro Crack detection using deep neural networks (DNNs) through an automated +pipeline using wave fields interacting with the damaged areas is highly sought +after. These high-dimensional spatio-temporal crack data are limited, and these +datasets have large dimensions in the temporal domain. The dataset presents a +substantial class imbalance, with crack pixels constituting an average of only +5% of the total pixels per sample. This extreme class imbalance poses a +challenge for deep learning models with the different micro-scale cracks, as +the network can be biased toward predicting the majority class, generally +leading to poor detection accuracy. This study builds upon the previous +benchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack +detection. The impact of various activation and loss functions was examined +through feature space visualization using the manifold discovery and analysis +(MDA) algorithm. The optimized architecture and training methodology achieved +an accuracy of 86.85%. + +
+
+
+
+
+ + ☆ Efficient Depth Estimation for Unstable Stereo Camera Systems on AR + Glasses + + +
+ Stereo depth estimation is a fundamental component in augmented reality (AR) +applications. Although AR applications require very low latency for their +real-time applications, traditional depth estimation models often rely on +time-consuming preprocessing steps such as rectification to achieve high +accuracy. Also, non-standard ML operator based algorithms such as cost volume +also require significant latency, which is aggravated on compute +resource-constrained mobile platforms. Therefore, we develop hardware-friendly +alternatives to the costly cost volume and preprocessing and design two new +models based on them, MultiHeadDepth and HomoDepth. Our approach for cost +volume is to replace it with a new group-pointwise convolution-based operator +and an approximation of cosine similarity based on layernorm and dot product. For +online stereo rectification (preprocessing), we introduce a homography matrix +prediction network with a rectification positional encoding (RPE), which +delivers both low latency and robustness to unrectified images, which +eliminates the need for preprocessing. Our MultiHeadDepth, which includes +optimized cost volume, provides 11.8-30.3% improvements in accuracy and +22.9-25.2% reduction in latency compared to a state-of-the-art depth estimation +model for AR glasses from industry. Our HomoDepth, which includes optimized +preprocessing (Homography + RPE) upon MultiHeadDepth, can process unrectified +images and reduce the end-to-end latency by 44.5%. We adopt a multi-task +learning framework to handle misaligned stereo inputs on HomoDepth, which +reduces the AbsRel error by 10.0-24.3%. The results demonstrate the efficacy of +our approaches in achieving both high model performance with low latency, which +makes a step forward toward practical depth estimation on future AR devices. + +
+
+
+
+
+ + ☆ EyeDiff: text-to-image diffusion model improves rare eye disease + diagnosis + + +
+ The rising prevalence of vision-threatening retinal diseases poses a +significant burden on the global healthcare systems. Deep learning (DL) offers +a promising solution for automatic disease screening but demands substantial +data. Collecting and labeling large volumes of ophthalmic images across various +modalities encounters several real-world challenges, especially for rare +diseases. Here, we introduce EyeDiff, a text-to-image model designed to +generate multimodal ophthalmic images from natural language prompts and +evaluate its applicability in diagnosing common and rare diseases. EyeDiff is +trained on eight large-scale datasets using the advanced latent diffusion +model, covering 14 ophthalmic image modalities and over 80 ocular diseases, and +is adapted to ten multi-country external datasets. The generated images +accurately capture essential lesional characteristics, achieving high alignment +with text prompts as evaluated by objective metrics and human experts. +Furthermore, integrating generated images significantly enhances the accuracy +of detecting minority classes and rare eye diseases, surpassing traditional +oversampling methods in addressing data imbalance. EyeDiff effectively tackles +the issue of data imbalance and insufficiency typically encountered in rare +diseases and addresses the challenges of collecting large-scale annotated +images, offering a transformative solution to enhance the development of +expert-level diseases diagnosis models in ophthalmic field. + +
+
+ comment: 28 pages, 2 figures +
+
+
+
+
+ + ☆ Adaptive Non-Uniform Timestep Sampling for Diffusion Model Training + + +
+ As a highly expressive generative model, diffusion models have demonstrated +exceptional success across various domains, including image generation, natural +language processing, and combinatorial optimization. However, as data +distributions grow more complex, training these models to convergence becomes +increasingly computationally intensive. While diffusion models are typically +trained using uniform timestep sampling, our research shows that the variance +in stochastic gradients varies significantly across timesteps, with +high-variance timesteps becoming bottlenecks that hinder faster convergence. To +address this issue, we introduce a non-uniform timestep sampling method that +prioritizes these more critical timesteps. Our method tracks the impact of +gradient updates on the objective for each timestep, adaptively selecting those +most likely to minimize the objective effectively. Experimental results +demonstrate that this approach not only accelerates the training process, but +also leads to improved performance at convergence. Furthermore, our method +shows robust performance across various datasets, scheduling strategies, and +diffusion architectures, outperforming previously proposed timestep sampling +and weighting heuristics that lack this degree of robustness. + +
+
+
+
+
+ + ☆ Unlocking Transfer Learning for Open-World Few-Shot Recognition + + +
+ Few-Shot Open-Set Recognition (FSOSR) targets a critical real-world +challenge, aiming to categorize inputs into known categories, termed closed-set +classes, while identifying open-set inputs that fall outside these classes. +Although transfer learning where a model is tuned to a given few-shot task has +become a prominent paradigm in closed-world, we observe that it fails to expand +to open-world. To unlock this challenge, we propose a two-stage method which +consists of open-set aware meta-learning with open-set free transfer learning. +In the open-set aware meta-learning stage, a model is trained to establish a +metric space that serves as a beneficial starting point for the subsequent +stage. During the open-set free transfer learning stage, the model is further +adapted to a specific target task through transfer learning. Additionally, we +introduce a strategy to simulate open-set examples by modifying the training +dataset or generating pseudo open-set examples. The proposed method achieves +state-of-the-art performance on two widely recognized benchmarks, miniImageNet +and tieredImageNet, with only a 1.5% increase in training effort. Our work +demonstrates the effectiveness of transfer learning in FSOSR. + +
+
+
+
+
+ + ☆ Explanation for Trajectory Planning using Multi-modal Large Language + Model for Autonomous Driving ECCV 2024 + + +
+ End-to-end style autonomous driving models have been developed recently. +These models lack interpretability of decision-making process from perception +to control of the ego vehicle, resulting in anxiety for passengers. To +alleviate it, it is effective to build a model which outputs captions +describing future behaviors of the ego vehicle and their reason. However, the +existing approaches generate reasoning text that inadequately reflects the +future plans of the ego vehicle, because they train models to output captions +using momentary control signals as inputs. In this study, we propose a +reasoning model that takes future planning trajectories of the ego vehicle as +inputs to solve this limitation with the dataset newly collected. + +
+
+ comment: Accepted and presented at ECCV 2024 2nd Workshop on Vision-Centric + Autonomous Driving (VCAD) on September 30, 2024. 13 pages, 5 figures +
+
+
+
+
+ + ☆ Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate + Hallucination in LVLMs + + +
+ The hallucination problem in multimodal large language models (MLLMs) remains +a common issue. Although image tokens occupy a majority of the input sequence +of MLLMs, there is limited research to explore the relationship between image +tokens and hallucinations. In this paper, we analyze the distribution of +attention scores for image tokens across each layer and head of the model, +revealing an intriguing and common phenomenon: most hallucinations are closely +linked to the pattern of attention sinks in the self-attention matrix of image +tokens, where shallow layers exhibit dense attention sinks and deeper layers +show sparse attention sinks. We further analyze the attention heads of +different layers and find that heads with high-density attention sink in the +image part play a positive role in alleviating hallucinations. In this paper, +we propose a training-free method named Enhancing +Attention Heads (EAH), an +approach designed to enhance the convergence of image tokens attention sinks in +the shallow layers. EAH identifies the attention head that shows the vision +sink in a shallow layer and extracts its attention matrix. This attention map +is then broadcast to other heads in the layer, thereby strengthening the layer +to pay more attention to the image itself. With extensive experiments, EAH +shows significant hallucination-mitigating performance on different MLLMs and +metrics, proving its effectiveness and generality. + +
+
+
+
+
+ + ☆ Instruction-Guided Editing Controls for Images and Multimedia: A Survey + in LLM era + + +
+ The rapid advancement of large language models (LLMs) and multimodal learning +has transformed digital content creation and manipulation. Traditional visual +editing tools require significant expertise, limiting accessibility. Recent +strides in instruction-based editing have enabled intuitive interaction with +visual content, using natural language as a bridge between user intent and +complex editing operations. This survey provides an overview of these +techniques, focusing on how LLMs and multimodal models empower users to achieve +precise visual modifications without deep technical knowledge. By synthesizing +over 100 publications, we explore methods from generative adversarial networks +to diffusion models, examining multimodal integration for fine-grained content +control. We discuss practical applications across domains such as fashion, 3D +scene manipulation, and video synthesis, highlighting increased accessibility +and alignment with human intuition. Our survey compares existing literature, +emphasizing LLM-empowered editing, and identifies key challenges to stimulate +further research. We aim to democratize powerful visual editing across various +industries, from entertainment to education. Interested readers are encouraged +to access our repository at +https://github.com/tamlhp/awesome-instruction-editing. + +
+
+
+
+
+ + ☆ GGAvatar: Reconstructing Garment-Separated 3D Gaussian Splatting Avatars + from Monocular Video + + +
+ Avatar modelling has broad applications in human animation and virtual +try-ons. Recent advancements in this field have focused on high-quality and +comprehensive human reconstruction but often overlook the separation of +clothing from the body. To bridge this gap, this paper introduces GGAvatar +(Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular +videos. Through advanced parameterized templates and unique phased training, +this model effectively achieves decoupled, editable, and realistic +reconstruction of clothed humans. Comparative evaluations with other costly +models confirm GGAvatar's superior quality and efficiency in modelling both +clothed humans and separable garments. The paper also showcases applications in +clothing editing, as illustrated in Figure 1, highlighting the model's benefits +and the advantages of effective disentanglement. The code is available at +https://github.com/J-X-Chen/GGAvatar/. + +
+
+ comment: MMAsia'24 Accepted +
+
+
+
+
+ + ☆ JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by + Evolutionary Optimization of Model Merging NeurIPS'24 + + +
+ With the rapid advancement of large language models (LLMs), foundational +models (FMs) have seen significant advancements. Healthcare is one of the most +crucial application areas for these FMs, given the significant time and effort +required for physicians to analyze large volumes of patient data. Recent +efforts have focused on adapting multimodal FMs to the medical domain through +techniques like instruction-tuning, leading to the development of medical +foundation models (MFMs). However, these approaches typically require large +amounts of training data to effectively adapt models to the medical field. +Moreover, most existing models are trained on English datasets, limiting their +practicality in non-English-speaking regions where healthcare professionals and +patients are not always fluent in English. The need for translation introduces +additional costs and inefficiencies. To address these challenges, we propose a +Japanese Radiology report generation model enhanced by +Evolutionary optimization of model merging (JRadiEvo). This is the +first attempt to extend a non-medical vision-language foundation model to the +medical domain through evolutionary optimization of model merging. We +successfully created a model that generates accurate Japanese reports from +X-ray images using only 50 translated samples from publicly available data. +This model, developed with highly efficient use of limited data, outperformed +leading models from recent research trained on much larger datasets. +Additionally, with only 8 billion parameters, this relatively compact +foundation model can be deployed locally within hospitals, making it a +practical solution for environments where APIs and other external services +cannot be used due to strict privacy and security requirements. + +
+
+ comment: Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical + Foundation Models: Explainability, Robustness, Security, and Beyond +
+
+
+
+
+ + ☆ A Polarization Image Dehazing Method Based on the Principle of Physical + Diffusion + + +
+ Computer vision is increasingly used in areas such as unmanned vehicles, +surveillance systems and remote sensing. However, in foggy scenarios, image +degradation leads to loss of target details, which seriously affects the +accuracy and effectiveness of these vision tasks. Polarized light, due to the +fact that its electromagnetic waves vibrate in a specific direction, is able to +resist scattering and refraction effects in complex media more effectively +compared to unpolarized light. As a result, polarized light has a greater +ability to maintain its polarization characteristics in complex transmission +media and under long-distance imaging conditions. This property makes polarized +imaging especially suitable for complex scenes such as outdoor and underwater, +especially in foggy environments, where higher quality images can be obtained. +Based on this advantage, we propose an innovative semi-physical polarization +dehazing method that does not rely on an external light source. The method +simulates the diffusion process of fog and designs a diffusion kernel that +corresponds to the image blurriness caused by this diffusion. By employing +spatiotemporal Fourier transforms and deconvolution operations, the method +recovers the state of fog droplets prior to diffusion and the light inversion +distribution of objects. This approach effectively achieves dehazing and detail +enhancement of the scene. + +
+
+
+
+
+ + ☆ Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at + Pixel Level + + +
+ In this paper, we introduce Motion-Grounded Video Reasoning, a new motion +understanding task that requires generating visual answers (video segmentation +masks) according to the input question, and hence needs implicit spatiotemporal +reasoning and grounding. This task extends existing spatiotemporal grounding +work focusing on explicit action/motion grounding, to a more general format by +enabling implicit reasoning via questions. To facilitate the development of the +new task, we collect a large-scale dataset called GROUNDMORE, which comprises +1,715 video clips, 249K object masks that are deliberately designed with 4 +question types (Causal, Sequential, Counterfactual, and Descriptive) for +benchmarking deep and comprehensive motion reasoning abilities. GROUNDMORE +uniquely requires models to generate visual answers, providing a more concrete +and visually interpretable response than plain texts. It evaluates models on +both spatiotemporal grounding and reasoning, helping to address complex +challenges in motion-related video reasoning, temporal perception, and +pixel-level understanding. Furthermore, we introduce a novel baseline model +named Motion-Grounded Video Reasoning Assistant (MORA). MORA incorporates the +multimodal reasoning ability from the Multimodal LLM, the pixel-level +perception capability from the grounding model (SAM), and the temporal +perception ability from a lightweight localization head. MORA achieves +respectable performance on GROUNDMORE, outperforming the best existing visual +grounding baseline model by a relative average of 21.5%. We hope this novel +and challenging task will pave the way for future advancements in robust and +general motion understanding via video reasoning segmentation. + +
+
+
+
+
+ + ☆ mmSpyVR: Exploiting mmWave Radar for Penetrating Obstacles to Uncover + Privacy Vulnerability of Virtual Reality + + +
+ Virtual reality (VR), while enhancing user experiences, introduces +significant privacy risks. This paper reveals a novel vulnerability in VR +systems that allows attackers to capture VR privacy through obstacles utilizing +millimeter-wave (mmWave) signals without physical intrusion and virtual +connection with the VR devices. We propose mmSpyVR, a novel attack on VR user's +privacy via mmWave radar. The mmSpyVR framework encompasses two main parts: (i) +A transfer learning-based feature extraction model to achieve VR feature +extraction from mmWave signal. (ii) An attention-based VR privacy spying module +to spy VR privacy information from the extracted feature. The mmSpyVR +demonstrates the capability to extract critical VR privacy from the mmWave +signals that have penetrated through obstacles. We evaluate mmSpyVR through +IRB-approved user studies. Across 22 participants engaged in four experimental +scenes utilizing VR devices from three different manufacturers, our system +achieves an application recognition accuracy of 98.5\% and keystroke +recognition accuracy of 92.6\%. This newly discovered vulnerability has +implications across various domains, such as cybersecurity, privacy protection, +and VR technology development. We also engage with VR manufacturer Meta to +discuss and explore potential mitigation strategies. Data and code are publicly +available for scrutiny and research at https://github.com/luoyumei1-a/mmSpyVR/ + +
+
+
+
+
+ + ☆ DiffFNO: Diffusion Fourier Neural Operator + + +
+ We introduce DiffFNO, a novel diffusion framework for arbitrary-scale +super-resolution strengthened by a Weighted Fourier Neural Operator (WFNO). +Mode Re-balancing in WFNO effectively captures critical frequency components, +significantly improving the reconstruction of high-frequency image details that +are crucial for super-resolution tasks. Gated Fusion Mechanism (GFM) adaptively +complements WFNO's spectral features with spatial features from an +Attention-based Neural Operator (AttnNO). This enhances the network's +capability to capture both global structures and local details. Adaptive +Time-Step (ATS) ODE solver, a deterministic sampling strategy, accelerates +inference without sacrificing output quality by dynamically adjusting +integration step sizes. Extensive experiments demonstrate that DiffFNO +achieves state-of-the-art (SOTA) results, outperforming existing methods across +various scaling factors by a margin of 2 to 4 dB in PSNR, including those +beyond the training distribution. It also achieves this at lower inference +time. Our approach sets a new standard in super-resolution, delivering both +superior accuracy and computational efficiency. + +
+
+
+
+
+ + ☆ Free Lunch in Pathology Foundation Model: Task-specific Model Adaptation + with Concept-Guided Feature Enhancement + + +
+ Whole slide image (WSI) analysis is gaining prominence within the medical +imaging field. Recent advances in pathology foundation models have shown the +potential to extract powerful feature representations from WSIs for downstream +tasks. However, these foundation models are usually designed for +general-purpose pathology image analysis and may not be optimal for specific +downstream tasks or cancer types. In this work, we present Concept +Anchor-guided Task-specific Feature Enhancement (CATE), an adaptable paradigm +that can boost the expressivity and discriminativeness of pathology foundation +models for specific downstream tasks. Based on a set of task-specific concepts +derived from the pathology vision-language model with expert-designed prompts, +we introduce two interconnected modules to dynamically calibrate the generic +image features extracted by foundation models for certain tasks or cancer +types. Specifically, we design a Concept-guided Information Bottleneck module +to enhance task-relevant characteristics by maximizing the mutual information +between image features and concept anchors while suppressing superfluous +information. Moreover, a Concept-Feature Interference module is proposed to +utilize the similarity between calibrated features and concept anchors to +further generate discriminative task-specific features. The extensive +experiments on public WSI datasets demonstrate that CATE significantly enhances +the performance and generalizability of MIL models. Additionally, heatmap and +umap visualization results also reveal the effectiveness and interpretability +of CATE. The source code is available at https://github.com/HKU-MedAI/CATE. + +
+
+
+
+
+ + ☆ Memory Proxy Maps for Visual Navigation + + +
+ Visual navigation takes inspiration from humans, who navigate in previously +unseen environments using vision without detailed environment maps. Inspired by +this, we introduce a novel no-RL, no-graph, no-odometry approach to visual +navigation using feudal learning to build a three tiered agent. Key to our +approach is a memory proxy map (MPM), an intermediate representation of the +environment learned in a self-supervised manner by the high-level manager agent +that serves as a simplified memory, approximating what the agent has seen. We +demonstrate that recording observations in this learned latent space is an +effective and efficient memory proxy that can remove the need for graphs and +odometry in visual navigation tasks. For the mid-level manager agent, we +develop a waypoint network (WayNet) that outputs intermediate subgoals, or +waypoints, imitating human waypoint selection during local navigation. For the +low-level worker agent, we learn a classifier over a discrete action space that +avoids local obstacles and moves the agent towards the WayNet waypoint. The +resulting feudal navigation network offers a novel approach with no RL, no +graph, no odometry, and no metric map; all while achieving SOTA results on the +image goal navigation task. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2402.12498 +
+
+
+
+
+ + ☆ Content-Aware Preserving Image Generation + + +
+ Remarkable progress has been achieved in image generation with the +introduction of generative models. However, precisely controlling the content +in generated images remains a challenging task due to their fundamental +training objective. This paper addresses this challenge by proposing a novel +image generation framework explicitly designed to incorporate desired content +in output images. The framework utilizes advanced encoding techniques, +integrating subnetworks called content fusion and frequency encoding modules. +The frequency encoding module first captures features and structures of +reference images by exclusively focusing on selected frequency components. +Subsequently, the content fusion module generates a content-guiding vector that +encapsulates desired content features. During the image generation process, +content-guiding vectors from real images are fused with projected noise +vectors. This ensures the production of generated images that not only maintain +consistent content from guiding images but also exhibit diverse stylistic +variations. To validate the effectiveness of the proposed framework in +preserving content attributes, extensive experiments are conducted on widely +used benchmark datasets, including Flickr-Faces-High Quality, Animal Faces High +Quality, and Large-scale Scene Understanding datasets. + +
+
+ comment: 35 pages, 12 figures, 1 table, journal +
+
+
+
+
+ + ☆ Face De-identification: State-of-the-art Methods and Comparative Studies + + +
+ The widespread use of image acquisition technologies, along with advances in +facial recognition, has raised serious privacy concerns. Face de-identification +usually refers to the process of concealing or replacing personal identifiers, +which is regarded as an effective means to protect the privacy of facial +images. A significant number of methods for face de-identification have been +proposed in recent years. In this survey, we provide a comprehensive review of +state-of-the-art face de-identification methods, categorized into three levels: +pixel-level, representation-level, and semantic-level techniques. We +systematically evaluate these methods based on two key criteria, the +effectiveness of privacy protection and preservation of image utility, +highlighting their advantages and limitations. Our analysis includes +qualitative and quantitative comparisons of the main algorithms, demonstrating +that deep learning-based approaches, particularly those using Generative +Adversarial Networks (GANs) and diffusion models, have achieved significant +advancements in balancing privacy and utility. Experimental results reveal that +while recent methods demonstrate strong privacy protection, trade-offs remain +in visual fidelity and computational complexity. This survey not only +summarizes the current landscape but also identifies key challenges and future +research directions in face de-identification. + +
+
+
+
+
+ + ☆ Masked Image Contrastive Learning for Efficient Visual Conceptual + Pre-training + + +
+ This paper proposes a scalable and straightforward pre-training paradigm for +efficient visual conceptual representation called masked image contrastive +learning (MiCL). Our MiCL approach is simple: we randomly mask patches to +generate different views within an image and contrast them among a mini-batch +of images. The core idea behind MiCL consists of two designs. First, masked +tokens have the potential to significantly diminish the conceptual redundancy +inherent in images, and create distinct views with substantial fine-grained +differences on the semantic concept level instead of the instance level. +Second, contrastive learning is adept at extracting high-level semantic +conceptual features during the pre-training, circumventing the high-frequency +interference and additional costs associated with image reconstruction. +Importantly, MiCL learns highly semantic conceptual representations efficiently +without relying on hand-crafted data augmentations or additional auxiliary +modules. Empirically, MiCL demonstrates high scalability with Vision +Transformers, as the ViT-L/16 can complete pre-training in 133 hours using only +4 A100 GPUs, achieving 85.8% accuracy in downstream fine-tuning tasks. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Enhancing Diffusion Posterior Sampling for Inverse Problems by + Integrating Crafted Measurements + + +
+ Diffusion models have emerged as a powerful foundation model for visual +generation. With an appropriate sampling process, they can effectively serve as a +generative prior to solve general inverse problems. Current posterior sampling +based methods take the measurement (i.e., degraded image sample) into the +posterior sampling to infer the distribution of the target data (i.e., clean +image sample). However, in this manner, we show that high-frequency information +can be prematurely introduced during the early stages, which could induce +larger posterior estimate errors during the restoration sampling. To address +this issue, we first reveal that forming the log posterior gradient with the +noisy measurement (i.e., samples from a diffusion forward process) instead of +the clean one can benefit the reverse process. Consequently, we propose a novel +diffusion posterior sampling method DPS-CM, which incorporates a Crafted +Measurement (i.e., samples generated by a reverse denoising process, compared +to random sampling with noise in standard methods) to form the posterior +estimate. This integration aims to mitigate the misalignment with the diffusion +prior caused by cumulative posterior estimate errors. Experimental results +demonstrate that our approach significantly improves the overall capacity to +solve general and noisy inverse problems, such as Gaussian deblurring, +super-resolution, inpainting, nonlinear deblurring, and tasks with Poisson +noise, relative to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Treat Visual Tokens as Text? But Your MLLM Only Needs Fewer Efforts to + See + + +
+ By treating visual tokens from visual encoders as text tokens, Multimodal +Large Language Models (MLLMs) have achieved remarkable progress across diverse +visual understanding tasks, leveraging the robust architectures of Large +Language Models (LLMs). However, as token counts grow, the quadratic scaling of +computation in LLMs introduces a significant efficiency bottleneck, impeding +further scalability. Although recent approaches have explored pruning visual +tokens or employing lighter LLM architectures, the computational overhead from +an increasing number of visual tokens remains a substantial challenge. + In this study, we investigate the redundancy in visual computation at both +the parameter and computational pattern levels within LLaVA, a representative +MLLM, and introduce a suite of streamlined strategies to enhance efficiency. +These include neighbor-aware visual token attention, pruning of inactive visual +attention heads, and selective layer dropping for visual computations. By +implementing these strategies in LLaVA, we achieve a reduction in computational +demands of 88% while maintaining model performance across key benchmarks. +Additionally, we validate the existence of visual computational redundancy in +other MLLMs, such as Qwen2-VL-7B and InternVL-2.0-4B/8B/26B. These results +present a novel pathway for MLLMs to handle dense visual tokens with minimal +computational costs. Code and model checkpoints will be released to support +further research. + +
+
+
+
+
+ + ♻ ☆ Image Matching Filtering and Refinement by Planes and Beyond + + +
+ This paper introduces a modular, non-deep learning method for filtering and +refining sparse correspondences in image matching. Assuming that motion flow +within the scene can be approximated by local homography transformations, +matches are aggregated into overlapping clusters corresponding to virtual +planes using an iterative RANSAC-based approach, with non-conforming +correspondences discarded. Moreover, the underlying planar structural design +provides an explicit map between local patches associated with the matches, +enabling optional refinement of keypoint positions through cross-correlation +template matching after patch reprojection. Finally, to enhance robustness and +fault-tolerance against violations of the piece-wise planar approximation +assumption, a further strategy is designed for minimizing relative patch +distortion in the plane reprojection by introducing an intermediate homography +that projects both patches into a common plane. The proposed method is +extensively evaluated on standard datasets and image matching pipelines, and +compared with state-of-the-art approaches. Unlike other current comparisons, +the proposed benchmark also takes into account the more general, real, and +practical cases where camera intrinsics are unavailable. Experimental results +demonstrate that our proposed non-deep learning, geometry-based approach +achieves performances that are either superior to or on par with recent +state-of-the-art deep learning methods. Finally, this study suggests that there +is still development potential in actual image matching solutions in the +considered research direction, which could in the future be incorporated into +novel deep image matching architectures. + +
+
+ comment: project page: https://github.com/fb82/MiHo +
+
+
+
+
+ + ♻ ☆ ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric + Thermal Images + + +
+ Designing egocentric 3D hand pose estimation systems that can perform +reliably in complex, real-world scenarios is crucial for downstream +applications. Previous approaches using RGB or NIR imagery struggle in +challenging conditions: RGB methods are susceptible to lighting variations and +obstructions like handwear, while NIR techniques can be disrupted by sunlight +or interference from other NIR-equipped devices. To address these limitations, +we present ThermoHands, the first benchmark focused on thermal image-based +egocentric 3D hand pose estimation, demonstrating the potential of thermal +imaging to achieve robust performance under these conditions. The benchmark +includes a multi-view and multi-spectral dataset collected from 28 subjects +performing hand-object and hand-virtual interactions under diverse scenarios, +accurately annotated with 3D hand poses through an automated process. We +introduce a new baseline method, TherFormer, utilizing dual transformer modules +for effective egocentric 3D hand pose estimation in thermal imagery. Our +experimental results highlight TherFormer's leading performance and affirm +thermal imaging's effectiveness in enabling robust 3D hand pose estimation in +adverse conditions. + +
+
+ comment: 15 pages, 9 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Pretrained ViTs Yield Versatile Representations For Medical Images ICCV + 2021 + + +
+ Convolutional Neural Networks (CNNs) have reigned for a decade as the de +facto approach to automated medical image diagnosis, pushing the +state-of-the-art in classification, detection and segmentation tasks. Over the +last years, vision transformers (ViTs) have appeared as a competitive +alternative to CNNs, yielding impressive levels of performance in the natural +image domain, while possessing several interesting properties that could prove +beneficial for medical imaging tasks. In this work, we explore the benefits and +drawbacks of transformer-based models for medical image classification. We +conduct a series of experiments on several standard 2D medical image benchmark +datasets and tasks. Our findings show that, while CNNs perform better if +trained from scratch, off-the-shelf vision transformers can perform on par with +CNNs when pretrained on ImageNet, both in a supervised and self-supervised +setting, rendering them as a viable alternative to CNNs. + +
+
+ comment: Extended version of arXiv:2108.09038 originally published at the ICCV + 2021 Workshop on Computer Vision for Automated Medical Diagnosis +
+
+
+
+
+ + ♻ ☆ CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for + Optimized Learning Fusion + + +
+ State-of-the-art pre-trained image models predominantly adopt a two-stage +approach: initial unsupervised pre-training on large-scale datasets followed by +task-specific fine-tuning using Cross-Entropy loss~(CE). However, it has been +demonstrated that CE can compromise model generalization and stability. While +recent works employing contrastive learning address some of these limitations +by enhancing the quality of embeddings and producing better decision +boundaries, they often overlook the importance of hard negative mining and rely +on resource intensive and slow training using large sample batches. To counter +these issues, we introduce a novel approach named CLCE, which integrates +Label-Aware Contrastive Learning with CE. Our approach not only maintains the +strengths of both loss functions but also leverages hard negative mining in a +synergistic way to enhance performance. Experimental results demonstrate that +CLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks, +achieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in +transfer learning settings with the BEiT-3 model. Importantly, our proposed +CLCE approach effectively mitigates the dependency of contrastive learning on +large batch sizes such as 4096 samples per batch, a limitation that has +previously constrained the application of contrastive learning in +budget-limited hardware environments. + +
+
+
+
+
+ + ♻ ☆ DCD: Discriminative and Consistent Representation Distillation + + +
+ Knowledge Distillation (KD) aims to transfer knowledge from a large teacher +model to a smaller student model. While contrastive learning has shown promise +in self-supervised learning by creating discriminative representations, its +application in knowledge distillation remains limited and focuses primarily on +discrimination, neglecting the structural relationships captured by the teacher +model. To address this limitation, we propose Discriminative and Consistent +Distillation (DCD), which employs a contrastive loss along with a consistency +regularization to minimize the discrepancy between the distributions of teacher +and student representations. Our method introduces learnable temperature and +bias parameters that adapt during training to balance these complementary +objectives, replacing the fixed hyperparameters commonly used in contrastive +learning approaches. Through extensive experiments on CIFAR-100 and ImageNet +ILSVRC-2012, we demonstrate that DCD achieves state-of-the-art performance, +with the student model sometimes surpassing the teacher's accuracy. +Furthermore, we show that DCD's learned representations exhibit superior +cross-dataset generalization when transferred to Tiny ImageNet and STL-10. Code +is available at https://github.com/giakoumoglou/distillers. + +
+
+ comment: 11 pages, 3 figures, 6 tables. The paper's title has been changed, + again +
+
+
+
+
+ + ♻ ☆ Q-VLM: Post-training Quantization for Large Vision-Language Models + + +
+ In this paper, we propose a post-training quantization framework of large +vision-language models (LVLMs) for efficient multi-modal inference. +Conventional quantization methods sequentially search the layer-wise rounding +functions by minimizing activation discretization errors, which fails to +acquire the optimal quantization strategy without considering cross-layer +dependency. On the contrary, we mine the cross-layer dependency that +significantly influences discretization errors of the entire vision-language +model, and embed this dependency into optimal quantization strategy searching +with low search cost. Specifically, we observe the strong correlation between +the activation entropy and the cross-layer dependency concerning output +discretization errors. Therefore, we employ the entropy as the proxy to +partition blocks optimally, which aims to achieve satisfying trade-offs between +discretization errors and the search cost. Moreover, we optimize the visual +encoder to disentangle the cross-layer dependency for fine-grained +decomposition of search space, so that the search cost is further reduced +without harming the quantization accuracy. Experimental results demonstrate +that our method compresses the memory by 2.78x and increases generation speed by +1.44x on the 13B LLaVA model without performance degradation on diverse +multi-modal reasoning tasks. Code is available at +https://github.com/ChangyuanWang17/QVLM. + +
+
+
+
+
+ + ♻ ☆ SINETRA: a Versatile Framework for Evaluating Single Neuron Tracking in + Behaving Animals + + +
+ Accurately tracking neuronal activity in behaving animals presents +significant challenges due to complex motions and background noise. The lack of +annotated datasets limits the evaluation and improvement of such tracking +algorithms. To address this, we developed SINETRA, a versatile simulator that +generates synthetic tracking data for particles on a deformable background, +closely mimicking live animal recordings. This simulator produces annotated 2D +and 3D videos that reflect the intricate movements seen in behaving animals +like Hydra Vulgaris. We evaluated four state-of-the-art tracking algorithms +highlighting the current limitations of these methods in challenging scenarios +and paving the way for improved cell tracking techniques in dynamic biological +systems. + +
+
+ comment: 5 pages, 3 figures, submitted at 2025 IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ♻ ☆ UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for + Egocentric Hand Object Interaction Videos + + +
+ Egocentric Hand Object Interaction (HOI) videos provide valuable insights +into human interactions with the physical world, attracting growing interest +from the computer vision and robotics communities. A key task in fully +understanding the geometry and dynamics of HOI scenes is dense pointclouds +sequence reconstruction. However, the inherent motion of both hands and the +camera makes this challenging. Current methods often rely on time-consuming +test-time optimization, making them impractical for reconstructing +internet-scale videos. To address this, we introduce UniHOI, a model that +unifies the estimation of all variables necessary for dense 4D reconstruction, +including camera intrinsics, camera poses, and video depth, for egocentric HOI +scenes in a fast feed-forward manner. We end-to-end optimize all these variables +to improve their consistency in 3D space. Furthermore, our model could be +trained solely on large-scale monocular video datasets, overcoming the +limitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain +and zero-shot generalization settings, surpassing all baselines in pointclouds +sequence reconstruction and long-term 3D scene flow recovery. UniHOI is the +first approach to offer fast, dense, and generalizable monocular egocentric HOI +scene reconstruction in the presence of motion. Code and trained model will be +released in the future. + +
+
+
+
+
+ + ♻ ☆ GSFusion: Online RGB-D Mapping Where Gaussian Splatting Meets TSDF + Fusion + + +
+ Traditional volumetric fusion algorithms preserve the spatial structure of 3D +scenes, which is beneficial for many tasks in computer vision and robotics. +However, they often lack realism in terms of visualization. Emerging 3D +Gaussian splatting bridges this gap, but existing Gaussian-based reconstruction +methods often suffer from artifacts and inconsistencies with the underlying 3D +structure, and struggle with real-time optimization, unable to provide users +with immediate feedback in high quality. One of the bottlenecks arises from the +massive amount of Gaussian parameters that need to be updated during +optimization. Instead of using 3D Gaussian as a standalone map representation, +we incorporate it into a volumetric mapping system to take advantage of +geometric information and propose to use a quadtree data structure on images to +drastically reduce the number of splats initialized. In this way, we +simultaneously generate a compact 3D Gaussian map with fewer artifacts and a +volumetric map on the fly. Our method, GSFusion, significantly enhances +computational efficiency without sacrificing rendering quality, as demonstrated +on both synthetic and real datasets. Code will be available at +https://github.com/goldoak/GSFusion. + +
+
+
+
+
+ + ♻ ☆ Self-eXplainable AI for Medical Image Analysis: A Survey and New + Outlooks + + +
+ The increasing demand for transparent and reliable models, particularly in +high-stakes decision-making areas such as medical image analysis, has led to +the emergence of eXplainable Artificial Intelligence (XAI). Post-hoc XAI +techniques, which aim to explain black-box models after training, have raised +concerns about their fidelity to model predictions. In contrast, +Self-eXplainable AI (S-XAI) offers a compelling alternative by incorporating +explainability directly into the training process of deep learning models. This +approach allows models to generate inherent explanations that are closely +aligned with their internal decision-making processes, enhancing transparency +and supporting the trustworthiness, robustness, and accountability of AI +systems in real-world medical applications. To facilitate the development of +S-XAI methods for medical image analysis, this survey presents a comprehensive +review across various image modalities and clinical applications. It covers +more than 200 papers from three key perspectives: 1) input explainability +through the integration of explainable feature engineering and knowledge graph, +2) model explainability via attention-based learning, concept-based learning, +and prototype-based learning, and 3) output explainability by providing textual +and counterfactual explanations. This paper also outlines desired +characteristics of explainability and evaluation methods for assessing +explanation quality, while discussing major challenges and future research +directions in developing S-XAI for medical image analysis. + +
+
+
+
+
+ + ♻ ☆ I2I-Mamba: Multi-modal medical image synthesis via selective state space + modeling + + +
+ In recent years, deep learning models comprising transformer components have +pushed the performance envelope in medical image synthesis tasks. Contrary to +convolutional neural networks (CNNs) that use static, local filters, +transformers use self-attention mechanisms to permit adaptive, non-local +filtering to sensitively capture long-range context. However, this sensitivity +comes at the expense of substantial model complexity, which can compromise +learning efficacy particularly on relatively modest-sized imaging datasets. +Here, we propose a novel adversarial model for multi-modal medical image +synthesis, I2I-Mamba, that leverages selective state space modeling (SSM) to +efficiently capture long-range context while maintaining local precision. To do +this, I2I-Mamba injects channel-mixed Mamba (cmMamba) blocks in the bottleneck +of a convolutional backbone. In cmMamba blocks, SSM layers are used to learn +context across the spatial dimension and channel-mixing layers are used to +learn context across the channel dimension of feature maps. Comprehensive +demonstrations are reported for imputing missing images in multi-contrast MRI +and MRI-CT protocols. Our results indicate that I2I-Mamba offers superior +performance against state-of-the-art CNN- and transformer-based methods in +synthesizing target-modality images. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ BOP-Distrib: Revisiting 6D Pose Estimation Benchmark for Better + Evaluation under Visual Ambiguities + + +
+ 6D pose estimation aims at determining the pose of the object that best +explains the camera observation. The unique solution for a non-symmetrical +object can turn into a multi-modal pose distribution for a symmetrical object +or when occlusions of symmetry-breaking elements happen, depending on the +viewpoint. Currently, 6D pose estimation methods are benchmarked on datasets +that consider, for their ground truth annotations, visual ambiguities as only +related to global object symmetries, whereas they should be defined per-image +to account for the camera viewpoint. We thus first propose an automatic method +to re-annotate those datasets with a 6D pose distribution specific to each +image, taking into account the visibility of the object surface in the image to +correctly determine the visual ambiguities. Second, given this improved ground +truth, we re-evaluate the state-of-the-art single pose methods and show that +this greatly modifies the ranking of these methods. Third, as some recent works +focus on estimating the complete set of solutions, we derive a precision/recall +formulation to evaluate them against our image-wise distribution ground truth, +making it the first benchmark for pose distribution methods on real images. We +will make our annotations for the T-LESS dataset and our code publicly +available. + +
+
+
+
+
+ + ♻ ☆ Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast + Computed Tomography Images for Enhanced Treatment and Prognosis MICCAI + + +
+ Stroke is the second leading cause of death worldwide, and is increasingly +prevalent in low- and middle-income countries (LMICs). Timely interventions can +significantly influence stroke survivability and the quality of life after +treatment. However, the standard and most widely available imaging method for +confirming strokes and their sub-types, the NCCT, is more challenging and +time-consuming to employ in cases of ischemic stroke. For this reason, we +developed an automated method for ischemic stroke lesion segmentation in NCCTs +using the nnU-Net framework, aimed at enhancing early treatment and improving +the prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and +Intersection over Union (IoU) scores of 0.501 on the sampled dataset. After +adjusting for outliers, these scores improved to 0.752 for the Dice score and +0.643 for the IoU. Proper delineation of the region of infarction can help +clinicians better assess the potential impact of the infarction, and guide +treatment procedures. + +
+
+ comment: 7 pages, 3 figures, MICCAI Meets Africa Workshop +
+
+
+
+
+ + ♻ ☆ Advancing Prompt Learning through an External Layer + + +
+ Prompt learning represents a promising method for adapting pre-trained +vision-language models (VLMs) to various downstream tasks by learning a set of +text embeddings. One challenge inherent to these methods is the poor +generalization performance due to the invalidity of the learned text embeddings +for unseen tasks. A straightforward approach to bridge this gap is to freeze +the text embeddings in prompts, which results in a lack of capacity to adapt +VLMs for downstream tasks. To address this dilemma, we propose a paradigm +called EnPrompt with a novel External Layer (EnLa). Specifically, we propose a +textual external layer and learnable visual embeddings for adapting VLMs to +downstream tasks. The learnable external layer is built upon valid embeddings +of pre-trained CLIP. This design considers the balance of learning capabilities +between the two branches. To align the textual and visual features, we propose +a novel two-pronged approach: i) we introduce the optimal transport as the +discrepancy metric to align the vision and text modalities, and ii) we +introduce a novel strengthening feature to enhance the interaction between +these two modalities. Four representative experiments (i.e., base-to-novel +generalization, few-shot learning, cross-dataset generalization, domain shifts +generalization) across 15 datasets demonstrate that our method outperforms the +existing prompt learning method. + +
+
+
+
+
+ + ♻ ☆ Calibration of ordinal regression networks + + +
+ Recent studies have shown that deep neural networks are not well-calibrated +and often produce over-confident predictions. The miscalibration issue +primarily stems from using cross-entropy in classifications, which aims to +align predicted softmax probabilities with one-hot labels. In ordinal +regression tasks, this problem is compounded by an additional challenge: the +expectation that softmax probabilities should exhibit unimodal distribution is +not met with cross-entropy. The ordinal regression literature has focused on +learning orders and overlooked calibration. To address both issues, we propose +a novel loss function that introduces order-aware calibration, ensuring that +prediction confidence adheres to ordinal relationships between classes. It +incorporates soft ordinal encoding and order-aware regularization to enforce +both calibration and unimodality. Extensive experiments across three popular +ordinal regression benchmarks demonstrate that our approach achieves +state-of-the-art calibration without compromising accuracy. + +
+
+
+
+
+ + ♻ ☆ A Joint Representation Using Continuous and Discrete Features for + Cardiovascular Diseases Risk Prediction on Chest CT Scans + + +
+ Cardiovascular diseases (CVD) remain a leading health concern and contribute +significantly to global mortality rates. While clinical advancements have led +to a decline in CVD mortality, accurately identifying individuals who could +benefit from preventive interventions remains an unsolved challenge in +preventive cardiology. Current CVD risk prediction models, recommended by +guidelines, are based on limited traditional risk factors or use CT imaging to +acquire quantitative biomarkers, and still have limitations in predictive +accuracy and applicability. On the other hand, end-to-end trained CVD risk +prediction methods leveraging deep learning on CT images often fail to provide +transparent and explainable decision grounds for assisting physicians. In this +work, we proposed a novel joint representation that integrates discrete +quantitative biomarkers and continuous deep features extracted from chest CT +scans. Our approach initiated with a deep CVD risk classification model by +capturing comprehensive continuous deep learning features while jointly +obtaining currently clinical-established quantitative biomarkers via +segmentation models. In the feature joint representation stage, we use an +instance-wise feature-gated mechanism to align the continuous and discrete +features, followed by a soft instance-wise feature interaction mechanism +fostering independent and effective feature interaction for the final CVD risk +prediction. Our method substantially improves CVD risk predictive performance +and offers individual contribution analysis of each biomarker, which is +important in assisting physicians' decision-making processes. We validated our +method on a public chest low-dose CT dataset and a private external chest +standard-dose CT patient cohort of 17,207 CT volumes from 6,393 unique +subjects, and demonstrated superior predictive performance, achieving AUCs of +0.875 and 0.843, respectively. + +
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Low-rank Matching Attention based Cross-modal Feature Fusion Method + for Conversational Emotion Recognition + + +
+ Conversational emotion recognition (CER) is an important research topic in +human-computer interactions. Although recent advancements in transformer-based +cross-modal fusion methods have shown promise in CER tasks, they tend to +overlook the crucial intra-modal and inter-modal emotional interaction or +suffer from high computational complexity. To address this, we introduce a +novel and lightweight cross-modal feature fusion method called Low-Rank +Matching Attention Method (LMAM). LMAM effectively captures contextual +emotional semantic information in conversations while mitigating the quadratic +complexity issue caused by the self-attention mechanism. Specifically, by +setting a matching weight and calculating inter-modal features attention scores +row by row, LMAM requires only one-third of the parameters of self-attention +methods. We also employ the low-rank decomposition method on the weights to +further reduce the number of parameters in LMAM. As a result, LMAM offers a +lightweight model while avoiding overfitting problems caused by a large number +of parameters. Moreover, LMAM is able to fully exploit the intra-modal +emotional contextual information within each modality and integrates +complementary emotional semantic information across modalities by computing and +fusing similarities of intra-modal and inter-modal features simultaneously. +Experimental results verify the superiority of LMAM compared with other popular +cross-modal fusion methods on the premise of being more lightweight. Also, LMAM +can be embedded into any existing state-of-the-art CER methods in a +plug-and-play manner, and can be applied to other multi-modal recognition +tasks, e.g., session recommendation and humour detection, demonstrating its +remarkable generalization ability. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with + Diffusion + + +
+ Personalized text-to-image generation has gained significant attention for +its capability to generate high-fidelity portraits of specific identities +conditioned on user-defined prompts. Existing methods typically involve +test-time fine-tuning or incorporating an additional pre-trained branch. +However, these approaches struggle to simultaneously address efficiency, +identity fidelity, and the preservation of the model's original generative +capabilities. In this paper, we propose DiffLoRA, an efficient method that +leverages the diffusion model as a hypernetwork to predict personalized +Low-Rank Adaptation (LoRA) weights based on the reference images. By +incorporating these LoRA weights into the off-the-shelf text-to-image model, +DiffLoRA enables zero-shot personalization during inference, eliminating the +need for post-processing optimization. Moreover, we introduce a novel +identity-oriented LoRA weights construction pipeline to facilitate the training +process of DiffLoRA. The dataset generated through this pipeline enables +DiffLoRA to produce consistently high-quality LoRA weights. Notably, the +distinctive properties of the diffusion model enhance the generation of +superior weights by employing probabilistic modeling to capture intricate +structural patterns and thoroughly explore the weight space. Comprehensive +experimental results demonstrate that DiffLoRA outperforms existing +personalization approaches across multiple benchmarks, achieving both time +efficiency and maintaining identity fidelity throughout the personalization +process. + +
+
+ comment: 9 pages,8 figures +
+
+
+
+
+ + ♻ ☆ Tissue Concepts: supervised foundation models in computational pathology + + +
+ Due to the increasing workload of pathologists, the need for automation to +support diagnostic tasks and quantitative biomarker evaluation is becoming more +and more apparent. Foundation models have the potential to improve +generalizability within and across centers and serve as starting points for +data efficient development of specialized yet robust AI models. However, the +training of foundation models themselves is usually very expensive in terms of +data, computation, and time. This paper proposes a supervised training method +that drastically reduces these expenses. The proposed method is based on +multi-task learning to train a joint encoder, by combining 16 different +classification, segmentation, and detection tasks on a total of 912,000 +patches. Since the encoder is capable of capturing the properties of the +samples, we term it the Tissue Concepts encoder. To evaluate the performance +and generalizability of the Tissue Concepts encoder across centers, +classification of whole slide images from four of the most prevalent solid +cancers - breast, colon, lung, and prostate - was used. The experiments show +that the Tissue Concepts model achieves comparable performance to models trained +with self-supervision, while requiring only 6% of the amount of training +patches. Furthermore, the Tissue Concepts encoder outperforms an ImageNet +pre-trained encoder on both in-domain and out-of-domain data. + +
+
+ comment: 22 Pages, 3 Figures, submitted to and under revision at Computers in + Biology and Medicine +
+
+
+
+
+ + ♻ ☆ Flow Priors for Linear Inverse Problems via Iterative Corrupted + Trajectory Matching NeurIPS 2024 + + +
+ Generative models based on flow matching have attracted significant attention +for their simplicity and superior performance in high-resolution image +synthesis. By leveraging the instantaneous change-of-variables formula, one can +directly compute image likelihoods from a learned flow, making them enticing +candidates as priors for downstream tasks such as inverse problems. In +particular, a natural approach would be to incorporate such image probabilities +in a maximum-a-posteriori (MAP) estimation problem. A major obstacle, however, +lies in the slow computation of the log-likelihood, as it requires +backpropagating through an ODE solver, which can be prohibitively slow for +high-dimensional problems. In this work, we propose an iterative algorithm to +approximate the MAP estimator efficiently to solve a variety of linear inverse +problems. Our algorithm is mathematically justified by the observation that the +MAP objective can be approximated by a sum of $N$ ``local MAP'' objectives, +where $N$ is the number of function evaluations. By leveraging Tweedie's +formula, we show that we can perform gradient steps to sequentially optimize +these objectives. We validate our approach for various linear inverse problems, +such as super-resolution, deblurring, inpainting, and compressed sensing, and +demonstrate that we can outperform other methods based on flow matching. Code +is available at https://github.com/YasminZhang/ICTM. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Bridging The Gap between Low-rank and Orthogonal Adaptation via + Householder Reflection Adaptation + + +
+ While following different technical routes, both low-rank and orthogonal +adaptation techniques can efficiently adapt large-scale pre-training models in +specific tasks or domains based on a small piece of trainable parameters. In +this study, we bridge the gap between these two techniques, proposing a simple +but effective adaptation method based on Householder reflections. Given a +pre-trained model, our method fine-tunes its layers by multiplying each frozen +weight matrix with an orthogonal matrix constructed by a chain of learnable +Householder reflections (HRs). This HR-based orthogonal fine-tuning is +equivalent to an adaptive low-rank adaptation. Moreover, we show that the +orthogonality of the reflection planes corresponding to the HRs impacts the +model capacity and regularity. The analysis motivates us to regularize the +orthogonality of the HRs, leading to different implementations of the proposed +Householder reflection adaptation (HRA) method. Compared with state-of-the-art +methods, HRA achieves superior performance with fewer learnable parameters when +adapting large language models and conditional image generators. The code of +the experiments is available at https://github.com/DaShenZi721/HRA, and +the method has been merged into the +PEFT package (https://github.com/huggingface/peft). + +
+
+
+
+
+ + ♻ ☆ VLEU: a Method for Automatic Evaluation for Generalizability of + Text-to-Image Models EMNLP2024 + + +
+ Progress in Text-to-Image (T2I) models has significantly improved the +generation of images from textual descriptions. However, existing evaluation +metrics do not adequately assess the models' ability to handle a diverse range +of textual prompts, which is crucial for their generalizability. To address +this, we introduce a new metric called Visual Language Evaluation Understudy +(VLEU). VLEU uses large language models to sample from the visual text domain, +the set of all possible input texts for T2I models, to generate a wide variety +of prompts. The images generated from these prompts are evaluated based on +their alignment with the input text using the CLIP model. VLEU quantifies a +model's generalizability by computing the Kullback-Leibler divergence between +the marginal distribution of the visual text and the conditional distribution +of the images generated by the model. This metric provides a quantitative way +to compare different T2I models and track improvements during model finetuning. +Our experiments demonstrate the effectiveness of VLEU in evaluating the +generalization capability of various T2I models, positioning it as an essential +metric for future research in text-to-image synthesis. + +
+
+ comment: accepted by EMNLP2024(long paper,main conference) +
+
+
+
+
+ + ♻ ☆ Constraint Learning for Parametric Point Cloud + + +
+ Parametric point clouds are sampled from CAD shapes, and have become +increasingly prevalent in industrial manufacturing. However, most existing +point cloud learning methods focus on the geometric features, such as +developing efficient convolution operations, overlooking the important +attribute of constraints inherent in CAD shapes, which limits these methods' +ability to comprehend CAD shapes fully. To address this issue, we analyzed the +effect of constraints, and proposed its deep learning-friendly representation, +after that, the Constraint Feature Learning Network (CstNet) was developed to +extract and leverage constraints. Our CstNet includes two stages. Stage 1 +extracts constraints from B-Rep data or point cloud. Stage 2 leverages +coordinates and constraints to enhance the comprehension of CAD shapes. +Additionally, we built up the Parametric 20,000 Multi-modal Dataset for the +scarcity of labeled B-Rep datasets. Experiments demonstrate that our CstNet +achieved state-of-the-art performance on both public and proposed CAD shape +datasets. To the best of our knowledge, CstNet is the first constraint-based +learning method tailored for CAD shape analysis. + +
+
+
+
+
+ + ♻ ☆ MANTIS: Interleaved Multi-Image Instruction Tuning + + +
+ Large multimodal models (LMMs) have shown great results in single-image +vision language tasks. However, their ability to solve multi-image visual +language tasks is yet to be improved. The existing LMMs like OpenFlamingo, +Emu2, and Idefics gain their multi-image ability through pre-training on +hundreds of millions of noisy interleaved image-text data from the web, which +is neither efficient nor effective. In this paper, we aim to build strong +multi-image LMMs via instruction tuning with academic-level resources. +Therefore, we meticulously construct Mantis-Instruct containing 721K +multi-image instruction data to train a family of Mantis models. The +instruction tuning empowers Mantis with different multi-image skills like +co-reference, comparison, reasoning, and temporal understanding. We evaluate +Mantis on 8 multi-image benchmarks and 6 single-image benchmarks. +Mantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and +beat the strongest multi-image baseline, Idefics2-8B by an average of 13 +absolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved +multi-image data, which is 200x larger than Mantis-Instruct. We observe that +Mantis performs equivalently well on the held-in and held-out benchmarks, which +shows its generalization ability. We further evaluate Mantis on single-image +benchmarks and demonstrate that Mantis also maintains a strong single-image +performance on par with CogVLM and Emu2. Our results show that multi-image +abilities are not necessarily gained through massive pre-training, instead, +they can be gained by low-cost instruction tuning. The training and evaluation +of Mantis have paved the road for future work to improve LMMs' multi-image +abilities. + +
+
+ comment: 13 pages, 3 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for + Certified Robustness + + +
+ The remarkable advances in deep learning have led to the emergence of many +off-the-shelf classifiers, e.g., large pre-trained models. However, since they +are typically trained on clean data, they remain vulnerable to adversarial +attacks. Despite this vulnerability, their superior performance and +transferability make off-the-shelf classifiers still valuable in practice, +demanding further work to provide adversarial robustness for them in a post-hoc +manner. A recently proposed method, denoised smoothing, leverages a denoiser +model in front of the classifier to obtain provable robustness without +additional training. However, the denoiser often creates hallucination, i.e., +images that have lost the semantics of their originally assigned class, leading +to a drop in robustness. Furthermore, its noise-and-denoise procedure +introduces a significant distribution shift from the original distribution, +causing the denoised smoothing framework to achieve sub-optimal robustness. In +this paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image +Selection (FT-CADIS), a novel fine-tuning scheme to enhance the certified +robustness of off-the-shelf classifiers. FT-CADIS is inspired by the +observation that the confidence of off-the-shelf classifiers can effectively +identify hallucinated images during denoised smoothing. Based on this, we +develop a confidence-aware training objective to handle such hallucinated +images and improve the stability of fine-tuning from denoised images. In this +way, the classifier can be fine-tuned using only images that are beneficial for +adversarial robustness. We also find that such a fine-tuning can be done by +updating a small fraction of parameters of the classifier. Extensive +experiments demonstrate that FT-CADIS has established the state-of-the-art +certified robustness among denoised smoothing methods across all +$\ell_2$-adversary radius in various benchmarks. + +
+
+ comment: 26 pages; TMLR 2024; Code is available at + https://github.com/suhyeok24/FT-CADIS +
+
+
+
+
+ + ♻ ☆ Enhancing Robustness to Noise Corruption for Point Cloud Recognition via + Spatial Sorting and Set-Mixing Aggregation Module ACCV2024 + + +
+ Current models for point cloud recognition demonstrate promising performance +on synthetic datasets. However, real-world point cloud data inevitably contains +noise, impacting model robustness. While recent efforts focus on enhancing +robustness through various strategies, there still remains a gap in +comprehensive analyses from the standpoint of network architecture design. +Unlike traditional methods that rely on generic techniques, our approach +optimizes model robustness to noise corruption through network architecture +design. Inspired by the token-mixing technique applied in 2D images, we propose +Set-Mixer, a noise-robust aggregation module which facilitates communication +among all points to extract geometric shape information and mitigate the +influence of individual noise points. A sorting strategy is designed to enable +our module to be invariant to point permutation, which also tackles the +unordered structure of point cloud and introduces consistent relative spatial +information. Experiments conducted on ModelNet40-C indicate that Set-Mixer +significantly enhances the model performance on noisy point clouds, +underscoring its potential to advance real-world applicability in 3D +recognition and perception tasks. + +
+
+ comment: Accepted by ACCV2024 +
+
+
+
+
+ + ♻ ☆ QuST: QuPath Extension for Integrative Whole Slide Image and Spatial + Transcriptomics Analysis + + +
+ The integration of AI in digital pathology, particularly in whole slide image +(WSI) and spatial transcriptomics (ST) analysis, holds immense potential for +enhancing our understanding of diseases. Despite challenges such as training +pattern preparation and resolution disparities, the convergence of these +technologies can unlock new insights. We introduce QuST, a tool that bridges +the gap between WSI and ST, underscoring the transformative power of this +integrated approach in disease biology. + +
+
+ comment: 18 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion + Field Alignment + + +
+ Optical flow estimation is a fundamental and long-standing visual task. In +this work, we present a novel method, dubbed HMAFlow, to improve optical flow +estimation in challenging scenes, particularly those involving small objects. +The proposed model mainly consists of two core components: a Hierarchical +Motion Field Alignment (HMA) module and a Correlation Self-Attention (CSA) +module. In addition, we rebuild 4D cost volumes by employing a Multi-Scale +Correlation Search (MCS) layer and replacing average pooling in common cost +volumes with a search strategy utilizing multiple search ranges. Experimental +results demonstrate that our model achieves the best generalization performance +compared to other state-of-the-art methods. Specifically, compared with RAFT, +our method achieves relative error reductions of 14.2% and 3.4% on the clean +pass and final pass of the Sintel online benchmark, respectively. On the KITTI +test benchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by relative +margins of 6.8% and 7.7%, respectively. To facilitate future research, our code +will be made available at https://github.com/BooTurbo/HMAFlow. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ SC3D: Label-Efficient Outdoor 3D Object Detection via Single Click + Annotation + + +
+ LiDAR-based outdoor 3D object detection has received widespread attention. +However, training 3D detectors from the LiDAR point cloud typically relies on +expensive bounding box annotations. This paper presents SC3D, an innovative +label-efficient method requiring only a single coarse click on the bird's eye +view of the 3D point cloud for each frame. A key challenge here is the absence +of complete geometric descriptions of the target objects from such simple click +annotations. To address this issue, our proposed SC3D adopts a progressive +pipeline. Initially, we design a mixed pseudo-label generation module that +expands limited click annotations into a mixture of bounding box and semantic +mask supervision. Next, we propose a mix-supervised teacher model, enabling the +detector to learn mixed supervision information. Finally, we introduce a +mixed-supervised student network that leverages the teacher model's +generalization ability to learn unclicked instances. Experimental results on the +widely used nuScenes and KITTI datasets demonstrate that our SC3D with only +coarse clicks, which requires only 0.2% annotation cost, achieves +state-of-the-art performance compared to weakly-supervised 3D detection +methods. The code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Networking Systems for Video Anomaly Detection: A Tutorial and Survey + + +
+ The increasing utilization of surveillance cameras in smart cities, coupled +with the surge of online video applications, has heightened concerns regarding +public security and privacy protection, which propelled automated Video Anomaly +Detection (VAD) into a fundamental research task within the Artificial +Intelligence (AI) community. With the advancements in deep learning and edge +computing, VAD has made significant progress and advances synergized with +emerging applications in smart cities and video internet, which has moved +beyond the conventional research scope of algorithm engineering to deployable +Networking Systems for VAD (NSVAD), a practical hotspot for intersection +exploration in the AI, IoVT, and computing fields. In this article, we +delineate the foundational assumptions, learning frameworks, and applicable +scenarios of various deep learning-driven VAD routes, offering an exhaustive +tutorial for novices in NSVAD. This article elucidates core concepts by +reviewing recent advances and typical solutions and aggregating available +research resources accessible at https://github.com/fdjingliu/NSVAD. +Additionally, we showcase our latest NSVAD research in industrial IoT and smart +cities, along with an end-cloud collaborative architecture for deployable +NSVAD. Lastly, this article projects future development trends and discusses +how the integration of AI and computing technologies can address existing +research challenges and promote open opportunities, serving as an insightful +guide for prospective researchers and engineers. + +
+
+ comment: Revised to ACM Computing Surveys, under review, for more information + and supplementary material, please see https://github.com/fdjingliu/NSVAD +
+
+
+
+
+ + ♻ ☆ Quanta Video Restoration ECCV + + +
+ The proliferation of single-photon image sensors has opened the door to a +plethora of high-speed and low-light imaging applications. However, data +collected by these sensors are often 1-bit or few-bit, and corrupted by noise +and strong motion. Conventional video restoration methods are not designed to +handle this situation, while specialized quanta burst algorithms have limited +performance when the number of input frames is low. In this paper, we introduce +Quanta Video Restoration (QUIVER), an end-to-end trainable network built on the +core ideas of classical quanta restoration methods, i.e., pre-filtering, flow +estimation, fusion, and refinement. We also collect and publish I2-2000FPS, a +high-speed video dataset with the highest temporal resolution of 2000 +frames-per-second, for training and testing. On simulated and real data, QUIVER +outperforms existing quanta restoration methods by a significant margin. Code +and dataset available at +https://github.com/chennuriprateek/Quanta_Video_Restoration-QUIVER- + +
+
+ comment: Accepted at European Conference on Computer Vision (ECCV) 2024, + Milano, Italy, Sept 29 - Oct 4, 2024, Part XL, LNCS 15098 +
+
+
+
+
+ + ♻ ☆ Co-Fix3D: Enhancing 3D Object Detection with Collaborative Refinement + + +
+ 3D object detection in driving scenarios faces the challenge of complex road +environments, which can lead to the loss or incompleteness of key features, +thereby affecting perception performance. To address this issue, we propose an +advanced detection framework called Co-Fix3D. Co-Fix3D integrates Local and +Global Enhancement (LGE) modules to refine Bird's Eye View (BEV) features. The +LGE module uses Discrete Wavelet Transform (DWT) for pixel-level local +optimization and incorporates an attention mechanism for global optimization. +To handle varying detection difficulties, we adopt multi-head LGE modules, +enabling each module to focus on targets with different levels of detection +complexity, thus further enhancing overall perception capability. Experimental +results show that on the nuScenes dataset's LiDAR benchmark, Co-Fix3D achieves +69.4% mAP and 73.5% NDS, while on the multimodal benchmark, it achieves +72.3% mAP and 74.7% NDS. The source code is publicly available at +https://github.com/rubbish001/Co-Fix3d. + +
+
+
+
+
+ + ♻ ☆ GSGAN: Adversarial Learning for Hierarchical Generation of 3D Gaussian + Splats NeurIPS 2024 + + +
+ Most advances in 3D Generative Adversarial Networks (3D GANs) largely depend +on ray casting-based volume rendering, which incurs demanding rendering costs. +One promising alternative is rasterization-based 3D Gaussian Splatting (3D-GS), +providing a much faster rendering speed and explicit 3D representation. In this +paper, we exploit Gaussian as a 3D representation for 3D GANs by leveraging its +efficient and explicit characteristics. However, in an adversarial framework, +we observe that a naïve generator architecture suffers from training +instability and lacks the capability to adjust the scale of Gaussians. This +leads to model divergence and visual artifacts due to the absence of proper +guidance for initialized positions of Gaussians and densification to manage +their scales adaptively. To address these issues, we introduce a generator +architecture with a hierarchical multi-scale Gaussian representation that +effectively regularizes the position and scale of generated Gaussians. +Specifically, we design a hierarchy of Gaussians where finer-level Gaussians +are parameterized by their coarser-level counterparts; the position of +finer-level Gaussians would be located near their coarser-level counterparts, +and the scale would monotonically decrease as the level becomes finer, modeling +both coarse and fine details of the 3D scene. Experimental results demonstrate +that ours achieves a significantly faster rendering speed (x100) compared to +state-of-the-art 3D consistent GANs with comparable 3D generation capability. +Project page: https://hse1032.github.io/gsgan. + +
+
+ comment: NeurIPS 2024 / Project page: https://hse1032.github.io/gsgan +
+
+
+
+
+ + ♻ ☆ How Does Vision-Language Adaptation Impact the Safety of Vision Language + Models? + + +
+ Vision-Language adaptation (VL adaptation) transforms Large Language Models +(LLMs) into Large Vision-Language Models (LVLMs) for multimodal tasks, but this +process often compromises the inherent safety capabilities embedded in the +original LLMs. Despite potential harmfulness due to weakened safety measures, +in-depth analysis on the effects of VL adaptation on safety remains +under-explored. This study examines how VL adaptation influences safety and +evaluates the impact of safety fine-tuning methods. Our analysis reveals that +safety degradation occurs during VL adaptation, even when the training data is +safe. While safety tuning techniques like supervised fine-tuning with safety +datasets or reinforcement learning from human feedback mitigate some risks, +they still lead to safety degradation and a reduction in helpfulness due to +over-rejection issues. Further analysis of internal model weights suggests that +VL adaptation may impact certain safety-related layers, potentially lowering +overall safety levels. Additionally, our findings demonstrate that the +objectives of VL adaptation and safety tuning are divergent, which often +results in their simultaneous application being suboptimal. To address this, we +suggest the weight merging approach as an optimal solution effectively reducing +safety degradation while maintaining helpfulness. These insights help guide the +development of more reliable and secure LVLMs for real-world applications. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Feature Extraction Reimagined: Achieving Superior Accuracy in Camera + Calibration + + +
+ Camera calibration is crucial for 3D vision applications. This paper focuses +on improving the accuracy of feature extraction, which is a key step in +calibration. We address the aliasing problem of star-shaped patterns by +introducing a novel dynamic calibration target that synthesizes multiple +checkerboard patterns at different angles around the pattern center, which +significantly improves feature refinement accuracy. Additionally, we propose a +novel cost function for feature refinement that accounts for the defocus effect, +offering a more physically realistic model compared to existing symmetry-based +methods. Experiments on a large dataset demonstrate significant improvements in +calibration accuracy with reduced computation time. Our code is available from +https://github.com/spdfghi/Feature-Extraction-Reimagined-Achieving-Superior-Accuracy-in-Camera-Calibration.git. + +
+
+
+
+
+ + ♻ ☆ CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for + Backdoor Defense in Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained Text Alignment Cleaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02% and 63.88%, +respectively. + +
+
+
+
+
+ + ♻ ☆ SOWA: Adapting Hierarchical Frozen Window Self-Attention to + Visual-Language Models for Better Anomaly Detection + + +
+ Visual anomaly detection is essential in industrial manufacturing, yet +traditional methods often rely heavily on extensive normal datasets and +task-specific models, limiting their scalability. Recent advancements in +large-scale vision-language models have significantly enhanced zero- and +few-shot anomaly detection. However, these approaches may not fully leverage +hierarchical features, potentially overlooking nuanced details crucial for +accurate detection. To address this, we introduce a novel window self-attention +mechanism based on the CLIP model, augmented with learnable prompts to process +multi-level features within a Soldier-Officer Window Self-Attention (SOWA) +framework. Our method has been rigorously evaluated on five benchmark datasets, +achieving superior performance by leading in 18 out of 20 metrics, setting a +new standard against existing state-of-the-art techniques. + +
+
+ comment: 8 pages, 9 figures, conference +
+
+
+
+
+ + ♻ ☆ Similarity-aware Syncretic Latent Diffusion Model for Medical Image + Translation with Representation Learning + + +
+ Non-contrast CT (NCCT) imaging may reduce image contrast and anatomical +visibility, potentially increasing diagnostic uncertainty. In contrast, +contrast-enhanced CT (CECT) facilitates the observation of regions of interest +(ROI). Leading generative models, especially the conditional diffusion model, +demonstrate remarkable capabilities in medical image modality transformation. +Typical conditional diffusion models commonly generate images with guidance of +segmentation labels for medical modal transformation. Limited access to +authentic guidance and its low cardinality can pose challenges to the practical +clinical application of conditional diffusion models. To achieve an equilibrium +of generative quality and clinical practices, we propose a novel Syncretic +generative model based on the latent diffusion model for medical image +translation (S$^2$LDM), which can realize high-fidelity reconstruction without +requiring additional conditions during inference. S$^2$LDM enhances the +similarity in distinct modal images via syncretic encoding and diffusing, +promoting amalgamated information in the latent space and generating medical +images with more details in contrast-enhanced regions. However, syncretic +latent spaces in the frequency domain tend to favor lower frequencies, commonly +located in identical anatomic structures. Thus, S$^2$LDM applies adaptive +similarity loss and dynamic similarity to guide the generation and supplements +the shortfall in high-frequency details throughout the training process. +Quantitative experiments confirm the effectiveness of our approach in medical +image translation. Our code will be released later. + +
+
+ comment: We decide to modify the majority of the content +
+
+
+
+
+ + ♻ ☆ RSHazeDiff: A Unified Fourier-aware Diffusion Model for Remote Sensing + Image Dehazing + + +
+ Haze severely degrades the visual quality of remote sensing images and +hampers the performance of road extraction, vehicle detection, and traffic flow +monitoring. The emerging denoising diffusion probabilistic model (DDPM) +exhibits the significant potential for dense haze removal with its strong +generation ability. Since remote sensing images contain extensive small-scale +texture structures, it is important to effectively restore image details from +hazy images. However, current wisdom of DDPM fails to preserve image details +and color fidelity well, limiting its dehazing capacity for remote sensing +images. In this paper, we propose a novel unified Fourier-aware diffusion model +for remote sensing image dehazing, termed RSHazeDiff. From a new perspective, +RSHazeDiff explores the conditional DDPM to improve image quality in dense hazy +scenarios, and it makes three key contributions. First, RSHazeDiff refines the +training phase of diffusion process by performing noise estimation and +reconstruction constraints in a coarse-to-fine fashion. Thus, it remedies the +unpleasing results caused by the simple noise estimation constraint in DDPM. +Second, by taking the frequency information as important prior knowledge during +iterative sampling steps, RSHazeDiff can preserve more texture details and +color fidelity in dehazed images. Third, we design a global compensated +learning module to utilize the Fourier transform to capture the global +dependency features of input images, which can effectively mitigate the effects +of boundary artifacts when processing fixed-size patches. Experiments on both +synthetic and real-world benchmarks validate the favorable performance of +RSHazeDiff over state-of-the-art methods. Source code will be released at +https://github.com/jm-xiong/RSHazeDiff. + +
+
+ comment: IEEE TITS; 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Semantic Segmentation by Semantic Proportions + + +
+ Semantic segmentation is a critical task in computer vision aiming to +identify and classify individual pixels in an image, with numerous applications +in for example autonomous driving and medical image analysis. However, semantic +segmentation can be highly challenging particularly due to the need for large +amounts of annotated data. Annotating images is a time-consuming and costly +process, often requiring expert knowledge and significant effort; moreover, +saving the annotated images could dramatically increase the storage space. In +this paper, we propose a novel approach for semantic segmentation, requiring +the rough information of individual semantic class proportions, shortened as +semantic proportions, rather than the necessity of ground-truth segmentation +maps. This greatly simplifies the data annotation process and thus will +significantly reduce the annotation time, cost and storage space, opening up +new possibilities for semantic segmentation tasks where obtaining the full +ground-truth segmentation maps may not be feasible or practical. Our proposed +method of utilising semantic proportions can (i) further be utilised as a +booster in the presence of ground-truth segmentation maps to gain performance +without extra data and model complexity, and (ii) also be seen as a +parameter-free plug-and-play module, which can be attached to existing deep +neural networks designed for semantic segmentation. Extensive experimental +results demonstrate the good performance of our method compared to benchmark +methods that rely on ground-truth segmentation maps. Utilising semantic +proportions suggested in this work offers a promising direction for future +semantic segmentation research. + +
+
+
+
+
+ + ♻ ☆ From Isolation to Collaboration: Federated Class-Heterogeneous Learning + for Chest X-Ray Classification + + +
+ Federated learning (FL) is a promising paradigm to collaboratively train a +global chest x-ray (CXR) classification model using distributed datasets while +preserving patient privacy. A significant, yet relatively underexplored, +challenge in FL is class-heterogeneity, where clients have different sets of +classes. We propose surgical aggregation, a FL method that uses selective +aggregation to collaboratively train a global model using distributed, +class-heterogeneous datasets. Unlike other methods, our method does not rely on +the assumption that clients share the same classes as other clients, know the +classes of other clients, or have access to a fully annotated dataset. We +evaluate surgical aggregation using class-heterogeneous CXR datasets across IID +and non-IID settings. Our results show that our method outperforms current +methods and has better generalizability. + +
+
+
+
+
+
+
+
+ + Systems and Control 33 + +
+
+
+ + ☆ Balancing Passenger Transport and Power Distribution: A Distributed + Dispatch Policy for Shared Autonomous Electric Vehicles + + +
+ Shared autonomous electric vehicles can provide on-demand transportation for +passengers while also interacting extensively with the electric distribution +system. This interaction is especially beneficial after a disaster when the +large battery capacity of the fleet can be used to restore critical electric +loads. We develop a dispatch policy that balances the need to continue serving +passengers (especially critical workers) and the ability to transfer energy +across the network. The model predictive control policy tracks both passenger +and energy flows and provides maximum passenger throughput if any policy can. +The resulting mixed integer linear programming problem is difficult to solve +for large-scale problems, so a distributed solution approach is developed to +improve scalability, privacy, and resilience. We demonstrate that the proposed +heuristic, based on the alternating direction method of multipliers, is +effective in achieving near-optimal solutions quickly. The dispatch policy is +examined in simulation to demonstrate the ability of vehicles to balance these +competing objectives with benefits to both systems. Finally, we compare several +dispatch behaviors, demonstrating the importance of including operational +constraints and objectives from both the transportation and electric systems in +the model. + +
+
+
+
+
+ + ☆ Mitigating Parameter Degeneracy using Joint Conditional Diffusion Model + for WECC Composite Load Model in Power Systems + + +
+ Data-driven modeling for dynamic systems has gained widespread attention in +recent years. Its inverse formulation, parameter estimation, aims to infer the +inherent model parameters from observations. However, parameter degeneracy, +where different combinations of parameters yield the same observable output, +poses a critical barrier to accurately and uniquely identifying model +parameters. In the context of WECC composite load model (CLM) in power systems, +utility practitioners have observed that CLM parameters carefully selected for +one fault event may not perform satisfactorily in another fault. Here, we +innovate a joint conditional diffusion model-based inverse problem solver +(JCDI), that incorporates a joint conditioning architecture with simultaneous +inputs of multi-event observations to improve parameter generalizability. +Simulation studies on the WECC CLM show that the proposed JCDI effectively +reduces uncertainties of degenerate parameters, thus the parameter estimation +error is decreased by 42.1% compared to a single-event learning scheme. This +enables the model to achieve high accuracy in predicting power trajectories +under different fault events, including electronic load tripping and motor +stalling, outperforming standard deep reinforcement learning and supervised +learning approaches. We anticipate this work will contribute to mitigating +parameter degeneracy in system dynamics, providing a general parameter +estimation framework across various scientific domains. + +
+
+
+
+
+ + ☆ Koopman-based control of nonlinear systems with closed-loop guarantees + + +
+ In this paper, we provide a tutorial overview and an extension of a recently +developed framework for data-driven control of unknown nonlinear systems with +rigorous closed-loop guarantees. The proposed approach relies on the Koopman +operator representation of the nonlinear system, for which a bilinear surrogate +model is estimated based on data. In contrast to existing Koopman-based +estimation procedures, we state guaranteed bounds on the approximation error +using the stability- and certificate-oriented extended dynamic mode +decomposition (SafEDMD) framework. The resulting surrogate model and the +uncertainty bounds allow us to design controllers via robust control theory and +sum-of-squares optimization, guaranteeing desirable properties for the +closed-loop system. We present results on stabilization both in discrete and +continuous time, and we derive a method for controller design with performance +objectives. The benefits of the presented framework over established approaches +are demonstrated with a numerical example. + +
+
+
+
+
+ + ☆ Observer-Based Safety Monitoring of Nonlinear Dynamical Systems with + Neural Networks via Quadratic Constraint Approach + + +
+ The safety monitoring for nonlinear dynamical systems with embedded neural +network components is addressed in this paper. The interval-observer-based +safety monitor is developed consisting of two auxiliary neural networks derived +from the neural network components of the dynamical system. Due to the presence +of nonlinear activation functions in neural networks, we use quadratic +constraints on the global sector to abstract the nonlinear activation functions +in neural networks. By combining a quadratic constraint approach for the +activation function with Lyapunov theory, the interval observer design problem +is transformed into a series of quadratic and linear programming feasibility +problems to make the interval observer operate with the ability to correctly +estimate the system state with estimation errors within acceptable limits. The +applicability of the proposed method is verified by simulation of the lateral +vehicle control system. + +
+
+
+
+
+ + ☆ Data-Driven Decentralized Control Design for Discrete-Time Large-Scale + Systems + + +
+ In this paper, a data-driven approach is developed for controller design for +a class of discrete-time large-scale systems, where a large-scale system can be +expressed in an equivalent data-driven form and the decentralized controllers +can be parameterized by the data collected from its subsystems, i.e., system +state, control input, and interconnection input. Based on the developed +data-driven method and the Lyapunov approach, a data-driven semi-definite +programming problem is constructed to obtain decentralized stabilizing +controllers. The proposed approach has been validated on a mass-spring chain +model, with the significant advantage of avoiding extensive modeling processes. + +
+
+
+
+
+ + ☆ Efficient Neural Hybrid System Learning and Transition System + Abstraction for Dynamical Systems + + +
+ This paper proposes a neural network hybrid modeling framework for dynamics +learning to promote an interpretable, computationally efficient way of dynamics +learning and system identification. First, a low-level model will be trained to +learn the system dynamics, which utilizes multiple simple neural networks to +approximate the local dynamics generated from data-driven partitions. Then, +based on the low-level model, a high-level model will be trained to abstract +the low-level neural hybrid system model into a transition system that allows +Computational Tree Logic Verification to promote the model's ability with human +interaction and verification efficiency. + +
+
+
+
+
+ + ☆ Two-Stage Robust Optimal Operation of Distribution Networks using + Confidence Level Based Distributionally Information Gap Decision + + +
+ This paper presents a confidence level-based distributionally information gap +decision theory (CL-DIGDT) framework for the two-stage robust optimal operation +of distribution networks, aiming at deriving an optimal operational scheme +capable of addressing uncertainties related to renewable energy and load +demands. Building on conventional IGDT, the proposed framework utilizes the +confidence level to capture the asymmetric characteristics of uncertainties and +maximize the risk-averse capability of the solution in a probabilistic manner. +To account for the probabilistic consideration, the imprecise Dirichlet model +is employed to construct the ambiguity sets of uncertainties, reducing reliance +on precise probability distributions. Consequently, a two-stage robust optimal +operation model for distribution networks using CL-DIGDT is developed. An +iterative method is proposed to solve the model and determine the upper and +lower bounds of the objective function. Case study demonstrates that the +proposed approach yields a more robust and statistically optimized solution +with required accuracy compared to existing method, contributing to a reduction +in first-stage cost by 0.84%, second-stage average cost by 6.7%, and +significantly increasing the reliability of the solution by 8%. + +
+
+
+
+
+ + ☆ Neural Port-Hamiltonian Models for Nonlinear Distributed Control: An + Unconstrained Parametrization Approach + + +
+ The control of large-scale cyber-physical systems requires optimal +distributed policies relying solely on limited communication with neighboring +agents. However, computing stabilizing controllers for nonlinear systems while +optimizing complex costs remains a significant challenge. Neural Networks +(NNs), known for their expressivity, can be leveraged to parametrize control +policies that yield good performance. However, NNs' sensitivity to small input +changes poses a risk of destabilizing the closed-loop system. Many existing +approaches enforce constraints on the controllers' parameter space to guarantee +closed-loop stability, leading to computationally expensive optimization +procedures. To address these problems, we leverage the framework of +port-Hamiltonian systems to design continuous-time distributed control policies +for nonlinear systems that guarantee closed-loop stability and finite +$\mathcal{L}_2$ or incremental $\mathcal{L}_2$ gains, independent of the +optimization parameters of the controllers. This eliminates the need to +constrain parameters during optimization, allowing the use of standard +techniques such as gradient-based methods. Additionally, we discuss +discretization schemes that preserve the dissipation properties of these +controllers for implementation on embedded systems. The effectiveness of the +proposed distributed controllers is demonstrated through consensus control of +non-holonomic mobile robots subject to collision avoidance and averaged voltage +regulation with weighted power sharing in DC microgrids. + +
+
+ comment: The paper has 15 pages, and has been submitted for a possible + publication. arXiv admin note: text overlap with arXiv:2403.17785 +
+
+
+
+
+ + ☆ Unsupervised Congestion Status Identification Using LMP Data + + +
+ Having a better understanding of how locational marginal prices (LMPs) change +helps in price forecasting and market strategy making. This paper investigates +the fundamental distribution of the congestion part of LMPs in high-dimensional +Euclidean space using an unsupervised approach. LMP models based on the +lossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the +overlapping subspace property of the LMP data. The congestion part of LMPs is +spanned by certain row vectors of the power transfer distribution factor (PTDF) +matrix, and the subspace attributes of an LMP vector uniquely are found to +reflect the instantaneous congestion status of all the transmission lines. The +proposed method searches for the basis vectors that span the subspaces of +congestion LMP data in hierarchical ways. In the bottom-up search, the data +belonging to 1-dimensional subspaces are detected, and other data are projected +on the orthogonal subspaces. This procedure is repeated until all the basis +vectors are found or the basis gap appears. Top-down searching is used to +address the basis gap by hyperplane detection with outliers. Once all the basis +vectors are detected, the congestion status can be identified. Numerical +experiments based on the IEEE 30-bus system, IEEE 118-bus system, Illinois +200-bus system, and Southwest Power Pool are conducted to show the performance +of the proposed method. + +
+
+ comment: Paper accepted for IEEE Transactions on Smart Grid. Personal use of + this material is permitted. Permission from IEEE must be obtained for all + other uses +
+
+
+
+
+ + ☆ Enforcing Cooperative Safety for Reinforcement Learning-based + Mixed-Autonomy Platoon Control + + +
+ It is recognized that the control of mixed-autonomy platoons comprising +connected and automated vehicles (CAVs) and human-driven vehicles (HDVs) can +enhance traffic flow. Among existing methods, Multi-Agent Reinforcement +Learning (MARL) appears to be a promising control strategy because it can +manage complex scenarios in real time. However, current research on MARL-based +mixed-autonomy platoon control suffers from several limitations. First, +existing MARL approaches address safety by penalizing safety violations in the +reward function, thus lacking theoretical safety guarantees due to the +black-box nature of RL. Second, few studies have explored the cooperative +safety of multi-CAV platoons, where CAVs can be coordinated to further enhance +the system-level safety involving the safety of both CAVs and HDVs. Third, +existing work tends to make an unrealistic assumption that the behavior of HDVs +and CAVs is publicly known and rational. To bridge the research gaps, we +propose a safe MARL framework for mixed-autonomy platoons. Specifically, this +framework (i) characterizes cooperative safety by designing a cooperative +Control Barrier Function (CBF), enabling CAVs to collaboratively improve the +safety of the entire platoon, (ii) provides a safety guarantee to the +MARL-based controller by integrating the CBF-based safety constraints into MARL +through a differentiable quadratic programming (QP) layer, and (iii) +incorporates a conformal prediction module that enables each CAV to estimate +the unknown behaviors of the surrounding vehicles with uncertainty +quantification. Simulation results show that our proposed control strategy can +effectively enhance the system-level safety through CAV cooperation of a +mixed-autonomy platoon with a minimal impact on control performance. + +
+
+
+
+
+ + ☆ Exploring the Influence of Residential Electric Vehicle Charging on + Distribution System Hosting Capacity -- A Case-Study in Arizona + + +
+ The installation of high-capacity fast chargers for electric vehicles (EVs) +is posing a significant risk to the distribution grid as the increased demand +from widespread residential EV charging could exceed the technical limits of +the distribution system. Addressing this issue is critical, given that current +infrastructure upgrades to enhance EV hosting capacity are both costly and +time-consuming. Moreover, the inherent uncertainties associated with EV +charging parameters make it challenging for power utilities to accurately +assess the impact of EVs added to specific locations. To address these +knowledge gaps, this study (a) introduces an algorithm to coordinate +residential EV charging, and (b) proposes a comprehensive framework that +evaluates all transformers within a feeder. The proposed method is applied to a +real-world feeder, which includes 120 transformers of varying capacities. The +results demonstrate that this approach effectively manages a substantial number +of EVs without overloading any of the transformers, while also pinpointing +locations that must be prioritized for future upgrades. This framework can +serve as a valuable reference for utilities when conducting distribution system +evaluations for supporting the growing EV penetration. + +
+
+
+
+
+ + ☆ A Secure Estimator with Gaussian Bernoulli Mixture Model + + +
+ The implementation of cyber-physical systems in real-world applications is +challenged by safety requirements in the presence of sensor threats. Most +cyber-physical systems, in particular the vulnerable multi-sensor systems, +struggle to detect the attack in observation signals. In this paper, we tackle +this issue by proposing a Gaussian-Bernoulli Secure (GBS) estimator, which +effectively transforms the assessment of sensor status into an optimal +estimation problem concerning the system state and observation indicators. It +encompasses two theoretical sub-problems: sequential state estimation with +partial observations and estimation updates with disordered new observations. +Within the framework of Kalman filter, we derive closed-form solutions for +these two issues. However, due to their computational inefficiency, we propose +the iterative approach employing proximal gradient descent to accelerate the +estimation update. We conduct comprehensive experiments from three +perspectives: computational efficiency, detection and estimation performance, +and characterization of observation error. Our GBS estimator shows the +improvements compared to other methods. + +
+
+
+
+
+ + ☆ Reaching Resilient Leader-Follower Consensus in Time-Varying Networks + via Multi-Hop Relays + + +
+ We study resilient leader-follower consensus of multi-agent systems (MASs) in +the presence of adversarial agents, where agents' communication is modeled by +time-varying topologies. The objective is to develop distributed algorithms for +the nonfaulty/normal followers to track an arbitrary reference value propagated +by a set of leaders while they are in interaction with the unknown adversarial +agents. Our approaches are based on the weighted mean subsequence reduced +(W-MSR) algorithms with agents being capable of communicating with multi-hop +neighbors. Our algorithms can handle agents possessing first-order and +second-order dynamics. Moreover, we characterize necessary and sufficient graph +conditions for our algorithms to succeed by the novel notion of jointly robust +following graphs. Our graph condition is tighter than the sufficient conditions +in the literature when agents use only one-hop communication (without relays). +Using multi-hop relays, we can enhance robustness of leader-follower networks +without increasing communication links and obtain further relaxed graph +requirements for our algorithms to succeed. Numerical examples are given to +verify the efficacy of our algorithms. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ A Graph-based Strategic Sensor Deployment Approach for k-coverage in WSN + + +
+ This paper studies a graph-based sensor deployment approach in wireless +sensor networks (WSNs). Specifically, in today's world, where sensors are +everywhere, detecting various attributes like temperature and movement, their +deteriorating lifetime is indeed a very concerning issue. In many scenarios, +these sensors are placed in extremely remote areas, where maintenance becomes +challenging. As a result, it is not very wise to depend on a single sensor to +obtain data from a particular terrain or place. Hence, multiple sensors are +deployed in these places, such that no problem arises if one or a few of them +fail. In this work, this problem of intelligent placement of sensors is +modelled from the graph theoretic point of view. We propose a new sensor +deployment approach here, which results in lower sensor density per unit area +and fewer sensors compared to the existing benchmark schemes. +Finally, the numerical results also support our claims and provide insights +regarding the selection of parameters that enhance the system performance. + +
+
+ comment: Submitted for a possible publication +
+
+
+
+
+ + ☆ A Survey of Machine Learning-based Physical-Layer Authentication in + Wireless Communications + + +
+ To ensure secure and reliable communication in wireless systems, +authenticating the identities of numerous nodes is imperative. Traditional +cryptography-based authentication methods suffer from issues such as low +compatibility, reliability, and high complexity. Physical-Layer Authentication +(PLA) is emerging as a promising complement due to its exploitation of unique +properties in wireless environments. Recently, Machine Learning (ML)-based PLA +has gained attention for its intelligence, adaptability, universality, and +scalability compared to non-ML approaches. However, a comprehensive overview of +state-of-the-art ML-based PLA and its foundational aspects is lacking. This +paper presents a comprehensive survey of characteristics and technologies that +can be used in the ML-based PLA. We categorize existing ML-based PLA schemes +into two main types: multi-device identification and attack detection schemes. +In deep learning-based multi-device identification schemes, Deep Neural +Networks are employed to train models, avoiding complex processing and expert +feature transformation. Deep learning-based multi-device identification schemes +are further subdivided, with schemes based on Convolutional Neural Networks +being extensively researched. In ML-based attack detection schemes, receivers +utilize intelligent ML techniques to set detection thresholds automatically, +eliminating the need for manual calculation or knowledge of channel models. +ML-based attack detection schemes are categorized into three sub-types: +Supervised Learning, Unsupervised Learning, and Reinforcement Learning. +Additionally, we summarize open-source datasets used for PLA, encompassing +Radio Frequency fingerprints and channel fingerprints. Finally, this paper +outlines future research directions to guide researchers in related fields. + +
+
+ comment: 111 pages, 9 figures +
+
+
+
+
+ + ☆ Regulating Stability Margins in Symbiotic Control: A Low-Pass Filter + Approach + + +
+ Symbiotic control synergistically integrates fixed-gain control and adaptive +learning architectures to mitigate system uncertainties more predictably than +adaptive learning alone and without requiring prior knowledge of uncertainty +bounds as compared to fixed-gain control alone. Specifically, increasing the +fixed-gain control parameter achieves a desired level of closed-loop system +performance while the adaptive law simultaneously learns and suppresses the +system uncertainties. However, stability margins can be reduced when this +parameter is large and this paper aims to address this practical challenge. To +this end, we propose a new fixed-gain control architecture predicated on a +low-pass filter approach to regulate stability margins in the symbiotic control +framework. In addition to the presented system-theoretical results focusing on +the stability of the closed-loop system, we provide two illustrative numerical +examples to demonstrate how the low-pass filter parameters are chosen for the +stability margin regulation problem without significantly compromising the +closed-loop system performance. + +
+
+
+
+
+ + ☆ A Novel MLLM-based Approach for Autonomous Driving in Different Weather + Conditions + + +
+ Autonomous driving (AD) technology promises to revolutionize daily +transportation by making it safer, more efficient, and more comfortable. Its +role in reducing traffic accidents and improving mobility will be vital to the +future of intelligent transportation systems. Autonomous driving in harsh +environmental conditions presents significant challenges that demand robust and +adaptive solutions and require more investigation. In this context, we present +in this paper a comprehensive performance analysis of an autonomous driving +agent leveraging the capabilities of a Multi-modal Large Language Model (MLLM) +using GPT-4o within the LimSim++ framework that offers closed-loop interaction +with the CARLA driving simulator. We call it MLLM-AD-4o. Our study evaluates +the agent's decision-making, perception, and control under adverse conditions, +including bad weather, poor visibility, and complex traffic scenarios. Our +results demonstrate the AD agent's ability to maintain high levels of safety +and efficiency, even in challenging environments, underscoring the potential of +GPT-4o to enhance autonomous driving systems (ADS) in any environment +condition. Moreover, we evaluate the performance of MLLM-AD-4o when different +perception entities are used, including front cameras only, front and +rear cameras, and cameras combined with LiDAR. The results of this work provide +valuable insights into integrating MLLMs with AD frameworks, paving the way for +future advancements in this field. + +
+
+ comment: 9 pages, 6 figures; Submitted to IEEE Transactions on Intelligent + Transportation Systems +
+
+
+
+
+ + ☆ A Systematic LMI Approach to Design Multivariable Sliding Mode + Controllers + + +
+ This paper deals with sliding mode control for multivariable polytopic +uncertain systems. We provide systematic procedures to design variable +structure controllers (VSCs) and unit-vector controllers (UVCs). Based on +suitable representations for the closed-loop system, we derive sufficient +conditions in the form of linear matrix inequalities (LMIs) to design the +robust sliding mode controllers such that the origin of the closed-loop system +is globally stable in finite time. Moreover, by noticing that the reaching time +depends on the initial condition and the decay rate, we provide convex +optimization problems to design robust controllers by considering the +minimization of the reaching time associated with a given set of initial +conditions. Two examples illustrate the effectiveness of the proposed +approaches. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Gradient-Based Stochastic Extremum-Seeking Control for Multivariable + Systems with Distinct Input Delays + + +
+ This paper addresses the design and analysis of a multivariable +gradient-based stochastic extremum-seeking control method for multi-input +systems with arbitrary input delays. The approach accommodates systems with +distinct time delays across input channels and achieves local exponential +stability of the closed-loop system, guaranteeing convergence to a small +neighborhood around the extremum point. By incorporating phase compensation for +dither signals and a novel predictor-feedback mechanism with averaging-based +estimates of the unknown gradient and Hessian, the proposed method overcomes +traditional challenges associated with arbitrary, distinct input delays. Unlike +previous work on deterministic multiparameter extremum-seeking with distinct +input delays, this stability analysis is achieved without using backstepping +transformations, simplifying the predictor design and enabling a more +straightforward implementation. Specifically, the direct application of +Artstein's reduction approach results in delay- and +system-dimension-independent convergence rates, enhancing practical +applicability. A numerical example illustrates the robust performance and +advantages of the proposed delay-compensated stochastic extremum-seeking +method. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ AC-Informed DC Optimal Transmission Switching Problems via Parameter + Optimization + + +
+ Optimal Transmission Switching (OTS) problems minimize operational costs +while treating both the transmission line energization statuses and generator +setpoints as decision variables. The combination of nonlinearities from an AC +power flow model and discrete variables associated with line statuses makes +AC-OTS a computationally challenging Mixed-Integer Nonlinear Program (MINLP). +To address these challenges, the DC power flow approximation is often used to +obtain a DC-OTS formulation expressed as a Mixed-Integer Linear Program (MILP). +However, this approximation often leads to suboptimal or infeasible switching +decisions when evaluated with an AC power flow model. This paper proposes an +enhanced DC-OTS formulation that leverages techniques for training machine +learning models to optimize the DC power flow model's parameters. By optimally +selecting parameter values that align flows in the DC power flow model with +apparent power flows -- incorporating both real and reactive components -- from +AC Optimal Power Flow (OPF) solutions, our method more accurately captures line +congestion behavior. Integrating these optimized parameters into the DC-OTS +formulation significantly improves the accuracy of switching decisions and +reduces discrepancies between DC-OTS and AC-OTS solutions. We compare our +optimized DC-OTS model against traditional OTS approaches, including DC-OTS, +Linear Programming AC (LPAC)-OTS, and Quadratic Convex (QC)-OTS. Numeric +results show that switching decisions from our model yield better performance +when evaluated using an AC power flow model, with up to $44\%$ cost reductions +in some cases. + +
+
+
+
+
+ + ♻ ☆ Long-term Hydrothermal Bid-based Market Simulator + + +
+ Simulating long-term hydrothermal bid-based markets considering strategic +agents is a challenging task. The representation of strategic agents +considering intertemporal constraints within a stochastic framework brings +additional complexity to the already difficult single-period bilevel, thus, +non-convex, optimal bidding problem. Thus, we propose a simulation methodology +that effectively addresses these challenges for large-scale hydrothermal power +systems. We demonstrate the effectiveness of the framework through a case study +with real data from the large-scale Brazilian power system. In the case +studies, we show the effects of market concentration in power systems and how +contracts can be used to mitigate them. In particular, we show how market power +might affect the current setting in Brazil. The developed method can strongly +benefit policymakers, market monitors, and market designers as simulations can +be used to understand existing power systems and experiment with alternative +designs. + +
+
+
+
+
+ + ♻ ☆ Safe Navigation in Unmapped Environments for Robotic Systems with Input + Constraints + + +
+ This paper presents an approach for navigation and control in unmapped +environments under input and state constraints using a composite control +barrier function (CBF). We consider the scenario where real-time perception +feedback (e.g., LiDAR) is used online to construct a local CBF that models +local state constraints (e.g., local safety constraints such as obstacles) in +the a priori unmapped environment. The approach employs a soft-maximum function +to synthesize a single time-varying CBF from the N most recently obtained local +CBFs. Next, the input constraints are transformed into controller-state +constraints through the use of control dynamics. Then, we use a soft-minimum +function to compose the input constraints with the time-varying CBF that models +the a priori unmapped environment. This composition yields a single relaxed +CBF, which is used in a constrained optimization to obtain an optimal control +that satisfies the state and input constraints. The approach is validated +through simulations of a nonholonomic ground robot that is equipped with LiDAR +and navigates an unmapped environment. The robot successfully navigates the +environment while avoiding the a priori unmapped obstacles and satisfying both +speed and input constraints. + +
+
+ comment: Preprint submitted to 2025 American Control Conference (ACC). arXiv + admin note: substantial text overlap with arXiv:2409.01458 +
+
+
+
+
+ + ♻ ☆ Energy-Aware Predictive Motion Planning for Autonomous Vehicles Using a + Hybrid Zonotope Constraint Representation + + +
+ Uncrewed aerial systems have tightly coupled energy and motion dynamics which +must be accounted for by onboard planning algorithms. This work proposes a +strategy for coupled motion and energy planning using model predictive control +(MPC). A reduced-order linear time-invariant model of coupled energy and motion +dynamics is presented. Constrained zonotopes are used to represent state and +input constraints, and hybrid zonotopes are used to represent non-convex +constraints tied to a map of the environment. The structures of these +constraint representations are exploited within a mixed-integer quadratic +program solver tailored to MPC motion planning problems. Results apply the +proposed methodology to coupled motion and energy utilization planning problems +for 1) a hybrid-electric vehicle that must restrict engine usage when flying +over regions with noise restrictions, and 2) an electric package delivery drone +that must track waysets with both position and battery state of charge +requirements. By leveraging the structure-exploiting solver, the proposed +mixed-integer MPC formulations can be implemented in real time. + +
+
+
+
+
+ + ♻ ☆ Analyzing electric vehicle, load and photovoltaic generation uncertainty + using publicly available datasets + + +
+ This paper aims to analyze three publicly available datasets for quantifying +seasonal and annual uncertainty for efficient scenario creation. The datasets +from Elaad, Elia and Fluvius are utilized to statistically analyze electric +vehicle charging, normalized solar generation and low-voltage consumer load +profiles, respectively. Frameworks for scenario generation are also provided +for these datasets. The datasets for load profiles and solar generation +analyzed are for the year 2022, thus embedding seasonal information. An online +repository is created for the wider applicability of this work. Finally, the +extreme load week(s) are identified and linked to the weather data measured at +EnergyVille in Belgium. + +
+
+
+
+
+ + ♻ ☆ An Ontology-based Approach Towards Traceable Behavior Specifications in + Automated Driving + + +
+ Vehicles in public traffic that are equipped with Automated Driving Systems +are subject to a number of expectations: Among other aspects, their behavior +should be safe, conforming to the rules of the road and provide mobility to +their users. This poses challenges for the developers of such systems: +Developers are responsible for specifying this behavior, for example, in terms +of requirements at system design time. As we will discuss in the article, this +specification always involves the need for assumptions and trade-offs. As a +result, insufficiencies in such a behavior specification can occur that can +potentially lead to unsafe system behavior. In order to support the +identification of specification insufficiencies, requirements and respective +assumptions need to be made explicit. In this article, we propose the Semantic +Norm Behavior Analysis as an ontology-based approach to specify the behavior +for an Automated Driving System equipped vehicle. We use ontologies to formally +represent specified behavior for a targeted operational environment, and to +establish traceability between specified behavior and the addressed stakeholder +needs. Furthermore, we illustrate the application of the Semantic Norm Behavior +Analysis in a German legal context with two example scenarios and evaluate our +results. Our evaluation shows that the explicit documentation of assumptions in +the behavior specification supports both the identification of specification +insufficiencies and their treatment. Therefore, this article provides +requirements, terminology and an according methodology to facilitate +ontology-based behavior specifications in automated driving. + +
+
+ comment: 24 pages, 12 figures, submitted for publication +
+
+
+
+
+ + ♻ ☆ DEEP-IoT: Downlink-Enhanced Efficient-Power Internet of Things + + +
+ At the heart of the Internet of Things (IoT) -- a domain witnessing explosive +growth -- the imperative for energy efficiency and the extension of device +lifespans has never been more pressing. This paper presents DEEP-IoT, an +innovative communication paradigm poised to redefine how IoT devices +communicate. Through a pioneering feedback channel coding strategy, DEEP-IoT +challenges and transforms the traditional transmitter (IoT devices)-centric +communication model to one where the receiver (the access point) plays a pivotal +role, thereby cutting down energy use and boosting device longevity. We not +only conceptualize DEEP-IoT but also actualize it by integrating deep +learning-enhanced feedback channel codes within a narrow-band system. +Simulation results show a significant enhancement in the operational lifespan +of IoT cells -- surpassing traditional systems using Turbo and Polar codes by +up to 52.71%. This leap signifies a paradigm shift in IoT communications, +setting the stage for a future where IoT devices boast unprecedented efficiency +and durability. + +
+
+
+
+
+ + ♻ ☆ A Control Theoretical Approach to Online Constrained Optimization + + +
+ In this paper we focus on the solution of online problems with time-varying, +linear equality and inequality constraints. Our approach is to design a novel +online algorithm by leveraging the tools of control theory. In particular, for +the case of equality constraints only, using robust control we design an online +algorithm with asymptotic convergence to the optimal trajectory, differently +from the alternatives that achieve non-zero tracking error. When also +inequality constraints are present, we show how to modify the proposed +algorithm to account for the wind-up induced by the nonnegativity constraints +on the dual variables. We report numerical results that corroborate the +theoretical analysis, and show how the proposed approach outperforms +state-of-the-art algorithms both with equality and inequality constraints. + +
+
+ comment: To appear in Automatica +
+
+
+
+
+ + ♻ ☆ Distributed Solvers for Network Linear Equations with Scalarized + Compression + + +
+ Distributed computing is fundamental to multi-agent systems, with solving +distributed linear equations as a typical example. In this paper, we study +distributed solvers for network linear equations over a network with +node-to-node communication messages compressed as scalar values. Our key idea +lies in a dimension compression scheme that includes a dimension-compressing +vector and a data unfolding step. The compression vector applies to individual +node states as an inner product to generate a real-valued message for node +communication. In the unfolding step, such scalar message is then plotted along +the subspace generated by the compression vector for the local computations. We +first present a compressed consensus flow that relies only on such scalarized +communication, and show that linear convergence can be achieved with well +excited signals for the compression vector. We then employ such a compressed +consensus flow as a fundamental consensus subroutine to develop distributed +continuous-time and discrete-time solvers for network linear equations, and +prove their linear convergence properties under scalar node communications. +With scalar communications, a direct benefit would be the reduced node-to-node +communication channel burden for distributed computing. Numerical examples are +presented to illustrate the effectiveness of the established theoretical +results. + +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Communication Compression for Distributed Prime-Dual + Optimization + + +
+ Several data compressors have been proposed in distributed optimization +frameworks of network systems to reduce communication overhead in large-scale +applications. In this paper, we demonstrate that effective information +compression may occur over time or space during sequences of node +communications in distributed algorithms, leading to the concept of +spatio-temporal compressors. This abstraction classifies existing compressors +as spatio-temporal compressors, with their effectiveness described by +constructive stability criteria from nonlinear system theory. Subsequently, we +apply these spatio-temporal compressors to standard continuous-time consensus +flows and distributed primal-dual flows, establishing conditions ensuring +convergence. Additionally, we introduce a novel observer-based distributed +primal-dual continuous flow integrated with spatio-temporal compressors, which +provides broader convergence conditions. These continuous flows achieve +exponential convergence to the global optimum when the objective function is +strongly convex and can be discretized using Euler approximations. Finally, +numerical simulations illustrate the versatility of the proposed +spatio-temporal compressors and verify the convergence of algorithms. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2408.02332 +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Communication Compression in Distributed Prime-Dual + Flows + + +
+ In this paper, we study distributed primal-dual flows for multi-agent +optimization with spatio-temporal compressions. The central aim of multi-agent +optimization is for a network of agents to collaboratively solve a system-level +optimization problem with local objective functions and node-to-node +communication by distributed algorithms. The scalability of such algorithms +crucially depends on the complexity of the communication messages, and a number +of communication compressors for distributed optimization have recently been +proposed in the literature. First of all, we introduce a general +spatio-temporal compressor characterized by the stability of the resulting +dynamical system along the vector field of the compressor. We show that several +important distributed optimization compressors such as the greedy sparsifier, +the uniform quantizer, and the scalarizer all fall into the category of this +spatio-temporal compressor. Next, we propose two distributed primal-dual flows +with the spatio-temporal compressors being applied to local node states and +local error states, respectively, and prove (exponential) convergence of the +node trajectories to the global optimizer for (strongly) convex cost functions. +Finally, a few numerical examples are presented to illustrate our theoretical +results. + +
+
+
+
+
+ + ♻ ☆ A Multi-Granularity Supervised Contrastive Framework for Remaining + Useful Life Prediction of Aero-engines + + +
+ Accurate remaining useful life (RUL) predictions are critical to the safe +operation of aero-engines. Currently, the RUL prediction task is mainly a +regression paradigm with only mean square error as the loss function and lacks +research on feature space structure, the latter of which has shown excellent +performance in a large number of studies. This paper develops a +multi-granularity supervised contrastive (MGSC) framework from plain intuition +that samples with the same RUL label should be aligned in the feature space, +and addresses the problems of too large minibatch size and unbalanced samples in +the implementation. The RUL prediction with MGSC is implemented using the +proposed multi-phase training strategy. This paper also demonstrates a simple +and scalable basic network structure and validates the proposed MGSC strategy +on the C-MAPSS dataset using a convolutional long short-term memory network as a +baseline, which effectively improves the accuracy of RUL prediction. + +
+
+
+
+
+ + ♻ ☆ Pricing for Multi-modal Pickup and Delivery Problems with Heterogeneous + Users + + +
+ In this paper, we study the pickup and delivery problem with multiple +transportation modalities, and address the challenge of efficiently allocating +transportation resources while price matching users with their desired delivery +modes. More precisely, we consider that orders are demanded by a heterogeneous +population of users with varying trade-offs between price and latency. To +capture how prices affect the behavior of heterogeneous selfish users choosing +between multiple delivery modes, we construct a congestion game taking place +over a form of star network, where each source-sink pair is composed of +parallel links connecting users with their preferred delivery method. Using the +unique geometry of this network, we prove that one can set prices explicitly to +induce any desired network flow, i.e., given a desired allocation strategy, we +have a closed-form solution for the delivery prices. We conclude by performing +a case study on a meal delivery problem with multiple courier modalities using +data from real world instances. + +
+
+
+
+
+ + ♻ ☆ Extremum Seeking is Stable for Scalar Maps that are Strictly but Not + Strongly Convex + + +
+ For a map that is strictly but not strongly convex, model-based gradient +extremum seeking has an eigenvalue of zero at the extremum, i.e., it fails at +exponential convergence. Interestingly, perturbation-based model-free extremum +seeking has a negative Jacobian, in the average, meaning that its (practical) +convergence is exponential, even though the map's Hessian is zero at the +extremum. While these observations for the gradient algorithm are not trivial, +we focus in this paper on an even more nontrivial study of the same phenomenon +for Newton-based extremum seeking control (NESC). + NESC is a second-order method which corrects for the unknown Hessian of the +unknown map, not only in order to speed up parameter convergence, but also (1) +to make the convergence rate user-assignable in spite of the unknown Hessian, +and (2) to equalize the convergence rates in different directions for +multivariable maps. Previous NESC work established stability only for maps +whose Hessians are strictly positive definite everywhere, so the Hessian is +invertible everywhere. For a scalar map, we establish the rather unexpected +property that, even when the map behind is strictly convex but not strongly +convex, i.e., when the Hessian may be zero, NESC guarantees practical +asymptotic stability, semiglobally. While a model-based Newton-based algorithm +would run into non-invertibility of the Hessian, the perturbation-based NESC, +surprisingly, avoids this challenge by leveraging the fact that the average of +the perturbation-based Hessian estimate is always positive, even though the +actual Hessian may be zero. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+
+
+
+ + Machine Learning 132 + +
+
+
+ + ☆ MARS: Unleashing the Power of Variance Reduction for Training Large + Models + + +
+ Training deep neural networks--and more recently, large models--demands +efficient and scalable optimizers. Adaptive gradient algorithms like Adam, +AdamW, and their variants have been central to this task. Despite the +development of numerous variance reduction algorithms in the past decade aimed +at accelerating stochastic optimization in both convex and nonconvex settings, +variance reduction has not found widespread success in training deep neural +networks or large language models. Consequently, it has remained a less favored +approach in modern AI. In this paper, to unleash the power of variance +reduction for efficient training of large models, we propose a unified +optimization framework, MARS (Make vAriance Reduction Shine), which reconciles +preconditioned gradient methods with variance reduction via a scaled stochastic +recursive momentum technique. Within our framework, we introduce three +instances of MARS that leverage preconditioned gradient updates based on AdamW, +Lion, and Shampoo, respectively. We also draw a connection between our +algorithms and existing optimizers. Experimental results on training GPT-2 +models indicate that MARS consistently outperforms AdamW by a large margin. + +
+
+ comment: 23 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ The Spatial Complexity of Optical Computing and How to Reduce It + + +
+ Similar to algorithms, which consume time and memory to run, hardware +requires resources to function. For devices processing physical waves, +implementing operations needs sufficient "space," as dictated by wave physics. +How much space is needed to perform a certain function is a fundamental +question in optics, with recent research addressing it for given mathematical +operations, but not for more general computing tasks, e.g., classification. +Inspired by computational complexity theory, we study the "spatial complexity" +of optical computing systems in terms of scaling laws - specifically, how their +physical dimensions must scale as the dimension of the mathematical operation +increases - and propose a new paradigm for designing optical computing systems: +space-efficient neuromorphic optics, based on structural sparsity constraints +and neural pruning methods motivated by wave physics (notably, the concept of +"overlapping nonlocality"). On two mainstream platforms, free-space optics and +on-chip integrated photonics, our methods demonstrate substantial size +reductions (to 1%-10% the size of conventional designs) with minimal compromise +on performance. Our theoretical and computational results reveal a trend of +diminishing returns on accuracy as structure dimensions increase, providing a +new perspective for interpreting and approaching the ultimate limits of optical +computing - a balanced trade-off between device size and accuracy. + +
+
+
+
+
+ + ☆ Private Counterfactual Retrieval With Immutable Features + + +
+ In a classification task, counterfactual explanations provide the minimum +change needed for an input to be classified into a favorable class. We consider +the problem of privately retrieving the exact closest counterfactual from a +database of accepted samples while enforcing that certain features of the input +sample cannot be changed, i.e., they are \emph{immutable}. An applicant (user) +whose feature vector is rejected by a machine learning model wants to retrieve +the sample closest to them in the database without altering a private subset of +their features, which constitutes the immutable set. While doing this, the user +should keep their feature vector, immutable set and the resulting +counterfactual index information-theoretically private from the institution. We +refer to this as immutable private counterfactual retrieval (I-PCR) problem +which generalizes PCR to a more practical setting. In this paper, we propose +two I-PCR schemes by leveraging techniques from private information retrieval +(PIR) and characterize their communication costs. Further, we quantify the +information that the user learns about the database and compare it for the +proposed schemes. + +
+
+
+
+
+ + ☆ Back to Supervision: Boosting Word Boundary Detection through Frame + Classification + + +
+ Speech segmentation at both word and phoneme levels is crucial for various +speech processing tasks. It significantly aids in extracting meaningful units +from an utterance, thus enabling the generation of discrete elements. In this +work we propose a model-agnostic framework to perform word boundary detection +in a supervised manner also employing a label augmentation technique and an +output-frame selection strategy. We trained and tested on the Buckeye dataset +and only tested on the TIMIT dataset, using state-of-the-art encoder models, including +pre-trained solutions (Wav2Vec 2.0 and HuBERT), as well as convolutional and +convolutional recurrent networks. Our method, with the HuBERT encoder, +surpasses the performance of other state-of-the-art architectures, whether +trained in supervised or self-supervised settings on the same datasets. +Specifically, we achieved F-values of 0.8427 on the Buckeye dataset and 0.7436 +on the TIMIT dataset, along with R-values of 0.8489 and 0.7807, respectively. +These results establish a new state-of-the-art for both datasets. Beyond the +immediate task, our approach offers a robust and efficient preprocessing method +for future research in audio tokenization. + +
+
+
+
+
+ + ☆ Multiscale Dubuc: A New Similarity Measure for Time Series + + +
+ Quantifying similarities between time series in a meaningful way remains a +challenge in time series analysis, despite many advances in the field. Most +real-world solutions still rely on a few popular measures, such as Euclidean +Distance (EuD), Longest Common Subsequence (LCSS), and Dynamic Time Warping +(DTW). The strengths and weaknesses of these measures have been studied +extensively, and incremental improvements have been proposed. In this study, +however, we present a different similarity measure that fuses the notion of +Dubuc's variation from fractal analysis with the Intersection-over-Union (IoU) +measure which is widely used in object recognition (also known as the Jaccard +Index). In this proof-of-concept paper, we introduce the Multiscale Dubuc +Distance (MDD) measure and prove that it is a metric, possessing desirable +properties such as the triangle inequality. We use 95 datasets from the UCR +Time Series Classification Archive to compare MDD's performance with EuD, LCSS, +and DTW. Our experiments show that MDD's overall success, without any +case-specific customization, is comparable to DTW with optimized window sizes +per dataset. We also highlight several datasets where MDD's performance +improves significantly when its single parameter is customized. This +customization serves as a powerful tool for gauging MDD's sensitivity to noise. +Lastly, we show that MDD's running time is linear in the length of the time +series, which is crucial for real-world applications involving very large +datasets. + +
+
+ comment: 6 pages, 3 figures, IEEE Big Data 2024 +
+
+
+
+
+ + ☆ Features that Make a Difference: Leveraging Gradients for Improved + Dictionary Learning NAACL 2025 + + +
+ Sparse Autoencoders (SAEs) are a promising approach for extracting neural +network representations by learning a sparse and overcomplete decomposition of +the network's internal activations. However, SAEs are traditionally trained +considering only activation values and not the effect those activations have on +downstream computations. This limits the information available to learn +features, and biases the autoencoder towards neglecting features which are +represented with small activation values but strongly influence model outputs. +To address this, we introduce Gradient SAEs (g-SAEs), which modify the +$k$-sparse autoencoder architecture by augmenting the TopK activation function +to rely on the gradients of the input activation when selecting the $k$ +elements. For a given sparsity level, g-SAEs produce reconstructions that are +more faithful to original network performance when propagated through the +network. Additionally, we find evidence that g-SAEs learn latents that are on +average more effective at steering models in arbitrary contexts. By considering +the downstream effects of activations, our approach leverages the dual nature +of neural network features as both $\textit{representations}$, retrospectively, +and $\textit{actions}$, prospectively. While previous methods have approached +the problem of feature discovery primarily focused on the former aspect, g-SAEs +represent a step towards accounting for the latter as well. + +
+
+ comment: 9 pages, 8 figures. Submitted to NAACL 2025 +
+
+
+
+
+ + ☆ Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets + Using Key Point Localization + + +
+ Internal crack detection has been a subject of focus in structural health +monitoring. By focusing on crack detection in structural datasets, it is +demonstrated that deep learning (DL) methods can effectively analyze seismic +wave fields interacting with micro-scale cracks, which are beyond the +resolution of conventional visual inspection. This work explores a novel +application of DL-based key point detection technique, where cracks are +localized by predicting the coordinates of four key points that define a +bounding region of the crack. The study not only opens new research directions +for non-visual applications but also effectively mitigates the impact of +imbalanced data which poses a challenge for previous DL models, as it can be +biased toward predicting the majority class (non-crack regions). Popular DL +techniques, such as the Inception blocks, are used and investigated. The model +shows an overall reduction in loss when applied to micro-scale crack detection +and is reflected in the lower average deviation between the location of actual +and predicted cracks, with an average Intersection over Union (IoU) being 0.511 +for all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro +cracks (greater than 4 micrometers). + +
+
+
+
+
+ + ☆ Low-Latency Task-Oriented Communications with Multi-Round, Multi-Task + Deep Learning + + +
+ In this paper, we address task-oriented (or goal-oriented) communications +where an encoder at the transmitter learns compressed latent representations of +data, which are then transmitted over a wireless channel. At the receiver, a +decoder performs a machine learning task, specifically for classifying the +received signals. The deep neural networks corresponding to the encoder-decoder +pair are jointly trained, taking both channel and data characteristics into +account. Our objective is to achieve high accuracy in completing the underlying +task while minimizing the number of channel uses determined by the encoder's +output size. To this end, we propose a multi-round, multi-task learning (MRMTL) +approach for the dynamic update of channel uses in multi-round transmissions. +The transmitter incrementally sends an increasing number of encoded samples +over the channel based on the feedback from the receiver, and the receiver +utilizes the signals from a previous round to enhance the task performance, +rather than only considering the latest transmission. This approach employs +multi-task learning to jointly optimize accuracy across varying number of +channel uses, treating each configuration as a distinct task. By evaluating the +confidence of the receiver in task decisions, MRMTL decides on whether to +allocate additional channel uses in multiple rounds. We characterize both the +accuracy and the delay (total number of channel uses) of MRMTL, demonstrating +that it achieves the accuracy close to that of conventional methods requiring +large numbers of channel uses, but with reduced delay by incorporating signals +from a prior round. We consider the CIFAR-10 dataset, convolutional neural +network architectures, and AWGN and Rayleigh channel models for performance +evaluation. We show that MRMTL significantly improves the efficiency of +task-oriented communications, balancing accuracy and latency effectively. + +
+
+
+
+
+ + ☆ Framework for Co-distillation Driven Federated Learning to Address Class + Imbalance in Healthcare + + +
+ Federated Learning (FL) is a pioneering approach in distributed machine +learning, enabling collaborative model training across multiple clients while +retaining data privacy. However, the inherent heterogeneity due to imbalanced +resource representations across multiple clients poses significant challenges, +often introducing bias towards the majority class. This issue is particularly +prevalent in healthcare settings, where hospitals acting as clients share +medical images. To address class imbalance and reduce bias, we propose a +co-distillation driven framework in a federated healthcare setting. Unlike +traditional federated setups with a designated server client, our framework +promotes knowledge sharing among clients to collectively improve learning +outcomes. Our experiments demonstrate that in a federated healthcare setting, +co-distillation outperforms other federated methods in handling class +imbalance. Additionally, we demonstrate that our framework has the least +standard deviation with increasing imbalance while outperforming other +baselines, signifying the robustness of our framework for FL in healthcare. + +
+
+ comment: Accepted at CODS COMAD'24 and to be published in the Discover Data + Journal (https://link.springer.com/journal/44248) +
+
+
+
+
+ + ☆ Continual Adversarial Reinforcement Learning (CARL) of False Data + Injection detection: forgetting and explainability + + +
+ False data injection attacks (FDIAs) on smart inverters are a growing concern +linked to increased renewable energy production. While data-based FDIA +detection methods are also actively developed, we show that they remain +vulnerable to impactful and stealthy adversarial examples that can be crafted +using Reinforcement Learning (RL). We propose to include such adversarial +examples in data-based detection training procedure via a continual adversarial +RL (CARL) approach. This way, one can pinpoint the deficiencies of data-based +detection, thereby offering explainability during their incremental +improvement. We show that a continual learning implementation is subject to +catastrophic forgetting, and additionally show that forgetting can be addressed +by employing a joint training strategy on all generated FDIA scenarios. + +
+
+
+
+
+ + ☆ Weakly-Supervised Multimodal Learning on MIMIC-CXR ML4H + + +
+ Multimodal data integration and label scarcity pose significant challenges +for machine learning in medical settings. To address these issues, we conduct +an in-depth evaluation of the newly proposed Multimodal Variational +Mixture-of-Experts (MMVM) VAE on the challenging MIMIC-CXR dataset. Our +analysis demonstrates that the MMVM VAE consistently outperforms other +multimodal VAEs and fully supervised approaches, highlighting its strong +potential for real-world medical applications. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 13 pages. arXiv + admin note: text overlap with arXiv:2403.05300 +
+
+
+
+
+ + ☆ Comparative Analysis of Machine Learning Approaches for Bone Age + Assessment: A Comprehensive Study on Three Distinct Models + + +
+ Radiologists and doctors make use of X-ray images of the non-dominant hands +of children and infants to assess the possibility of genetic conditions and +growth abnormalities. This is done by assessing the difference between the +actual extent of growth found using the X-rays and the chronological age of the +subject. The assessment was done conventionally using The Greulich Pyle (GP) or +Tanner Whitehouse (TW) approach. These approaches require a high level of +expertise and may often lead to observer bias. Hence, to automate the process +of assessing the X-rays, and to increase its accuracy and efficiency, several +machine learning models have been developed. These machine-learning models have +several differences in their accuracy and efficiencies, leading to an unclear +choice for the suitable model depending on their needs and available resources. +Methods: In this study, we have analyzed the 3 most widely used models for the +automation of bone age prediction, which are the Xception model, VGG model and +CNN model. These models were trained on the preprocessed dataset and the +accuracy was measured using the MAE in terms of months for each model. Using +this, the comparison between the models was done. Results: The 3 models, +Xception, VGG, and CNN models have been tested for accuracy and other relevant +factors. + +
+
+
+
+
+ + ☆ On the Cost of Model-Serving Frameworks: An Experimental Evaluation + + +
+ In machine learning (ML), the inference phase is the process of applying +pre-trained models to new, unseen data with the objective of making +predictions. During the inference phase, end-users interact with ML services to +gain insights, recommendations, or actions based on the input data. For this +reason, serving strategies are nowadays crucial for deploying and managing +models in production environments effectively. These strategies ensure that +models are available, scalable, reliable, and performant for real-world +applications, such as time series forecasting, image classification, natural +language processing, and so on. In this paper, we evaluate the performances of +five widely-used model serving frameworks (TensorFlow Serving, TorchServe, +MLServer, MLflow, and BentoML) under four different scenarios (malware +detection, cryptocoin prices forecasting, image classification, and sentiment +analysis). We demonstrate that TensorFlow Serving is able to outperform all the +other frameworks in serving deep learning (DL) models. Moreover, we show that +DL-specific frameworks (TensorFlow Serving and TorchServe) display +significantly lower latencies than the three general-purpose ML frameworks +(BentoML, MLFlow, and MLServer). + +
+
+
+
+
+ + ☆ Bitcoin Research with a Transaction Graph Dataset + + +
+ Bitcoin, launched in 2008 by Satoshi Nakamoto, established a new digital +economy where value can be stored and transferred in a fully decentralized +manner - alleviating the need for a central authority. This paper introduces a +large scale dataset in the form of a transactions graph representing +transactions between Bitcoin users along with a set of tasks and baselines. The +graph includes 252 million nodes and 785 million edges, covering a time span of +nearly 13 years and 670 million transactions. Each node and edge is +timestamped. As for supervised tasks, we provide two labeled sets: (i) 33,000 +nodes labeled by entity type and (ii) nearly 100,000 Bitcoin addresses labeled +with an entity name and an entity type. This is the largest publicly available +data set of bitcoin transactions designed to facilitate advanced research and +exploration in this domain, overcoming the limitations of existing datasets. +Various graph neural network models are trained to predict node labels, +establishing a baseline for future research. In addition, several use cases are +presented to demonstrate the dataset's applicability beyond Bitcoin analysis. +Finally, all data and source code are made publicly available to enable +reproducibility of the results. + +
+
+
+
+
+ + ☆ RETR: Multi-View Radar Detection Transformer for Indoor Perception NeurIPS 2024 + + +
+ Indoor radar perception has seen rising interest due to affordable costs +driven by emerging automotive imaging radar developments and the benefits of +reduced privacy concerns and reliability under hazardous conditions (e.g., fire +and smoke). However, existing radar perception pipelines fail to account for +distinctive characteristics of the multi-view radar setting. In this paper, we +propose Radar dEtection TRansformer (RETR), an extension of the popular DETR +architecture, tailored for multi-view radar perception. RETR inherits the +advantages of DETR, eliminating the need for hand-crafted components for object +detection and segmentation in the image plane. More importantly, RETR +incorporates carefully designed modifications such as 1) depth-prioritized +feature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss +from both radar and camera coordinates; and 3) a learnable radar-to-camera +transformation via reparameterization, to account for the unique multi-view +radar setting. Evaluated on two indoor radar perception datasets, our approach +outperforms existing state-of-the-art methods by a margin of 15.38+ AP for +object detection and 11.77+ IoU for instance segmentation, respectively. + +
+
+ comment: 24 pages, Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ The ParClusterers Benchmark Suite (PCBS): A Fine-Grained Analysis of + Scalable Graph Clustering + + +
+ We introduce the ParClusterers Benchmark Suite (PCBS) -- a collection of +highly scalable parallel graph clustering algorithms and benchmarking tools +that streamline comparing different graph clustering algorithms and +implementations. + The benchmark includes clustering algorithms that target a wide range of +modern clustering use cases, including community detection, classification, and +dense subgraph mining. + The benchmark toolkit makes it easy to run and evaluate multiple instances of +different clustering algorithms, which can be useful for fine-tuning the +performance of clustering on a given task, and for comparing different +clustering algorithms based on different metrics of interest, including +clustering quality and running time. + Using PCBS, we evaluate a broad collection of real-world graph clustering +datasets. Somewhat surprisingly, we find that the best quality results are +obtained by algorithms that are not included in many popular graph clustering +toolkits. The PCBS provides a standardized way to evaluate and judge the +quality-performance tradeoffs of the active research area of scalable graph +clustering algorithms. We believe it will help enable fair, accurate, and +nuanced evaluation of graph clustering algorithms in the future. + +
+
+ comment: This is a preliminary version of a paper that will appear at VLDB'25 +
+
+
+
+
+ + ☆ Multidimensional Byte Pair Encoding: Shortened Sequences for Improved + Visual Data Generation + + +
+ In language processing, transformers benefit greatly from text being +condensed. This is achieved through a larger vocabulary that captures word +fragments instead of plain characters. This is often done with Byte Pair +Encoding. In the context of images, tokenisation of visual data is usually +limited to regular grids obtained from quantisation methods, without global +content awareness. Our work improves tokenisation of visual data by bringing +Byte Pair Encoding from 1D to multiple dimensions, as a complementary add-on to +existing compression. We achieve this through counting constellations of token +pairs and replacing the most frequent token pair with a newly introduced token. +The multidimensionality only increases the computation time by a factor of 2 +for images, making it applicable even to large datasets like ImageNet within +minutes on consumer hardware. This is a lossless preprocessing step. Our +evaluation shows improved training and inference performance of transformers on +visual data achieved by compressing frequent constellations of tokens: The +resulting sequences are shorter, with more uniformly distributed information +content, e.g. condensing empty regions in an image into single tokens. As our +experiments show, these condensed sequences are easier to process. We +additionally introduce a strategy to amplify this compression further by +clustering the vocabulary. + +
+
+
+
+
+ + ☆ Scaling Law for Post-training after Model Pruning + + +
+ Large language models (LLMs) based on the Transformer architecture are widely +employed across various domains and tasks. However, their increasing size +imposes significant hardware demands, limiting practical deployment. To +mitigate this, model pruning techniques have been developed to create more +efficient models while maintaining high performance. Despite this, +post-training after pruning is crucial for performance recovery and can be +resource-intensive. This paper investigates the post-training requirements of +pruned LLMs and introduces a scaling law to determine the optimal amount of +post-training data. Post-training experiments with the Llama-3 and Qwen-2.5 +series models, pruned using depth pruning, width pruning, and 2:4 +semi-structured pruning, show that higher pruning ratios necessitate more +post-training data for performance recovery, whereas larger LLMs require less. +The proposed scaling law predicts a model's loss based on its parameter counts +before and after pruning, as well as the post-training token counts. +Furthermore, we find that the scaling law established from smaller LLMs can be +reliably extrapolated to larger LLMs. This work provides valuable insights into +the post-training of pruned LLMs and offers a practical scaling law for +optimizing post-training data usage. + +
+
+
+
+
+ + ☆ Towards Sample-Efficiency and Generalization of Transfer and Inverse + Reinforcement Learning: A Comprehensive Literature Review + + +
+ Reinforcement learning (RL) is a sub-domain of machine learning, mainly +concerned with solving sequential decision-making problems by a learning agent +that interacts with the decision environment to improve its behavior through +the reward it receives from the environment. This learning paradigm is, +however, well-known for being time-consuming due to the necessity of collecting +a large amount of data, making RL suffer from sample inefficiency and difficult +generalization. Furthermore, the construction of an explicit reward function +that accounts for the trade-off between multiple desiderata of a decision +problem is often a laborious task. These challenges have been recently +addressed utilizing transfer and inverse reinforcement learning (T-IRL). In +this regard, this paper is devoted to a comprehensive review of realizing the +sample efficiency and generalization of RL algorithms through T-IRL. Following +a brief introduction to RL, the fundamental T-IRL methods are presented and the +most recent advancements in each research field have been extensively reviewed. +Our findings denote that a majority of recent research works have dealt with +the aforementioned challenges by utilizing human-in-the-loop and sim-to-real +strategies for the efficient transfer of knowledge from source domains to the +target domain under the transfer learning scheme. Under the IRL structure, +training schemes that require a low number of experience transitions and +extension of such frameworks to multi-agent and multi-intention problems have +been the priority of researchers in recent years. + +
+
+
+
+
+ + ☆ MDHP-Net: Detecting Injection Attacks on In-vehicle Network using + Multi-Dimensional Hawkes Process and Temporal Model + + +
+ The integration of intelligent and connected technologies in modern vehicles, +while offering enhanced functionalities through Electronic Control Units and +interfaces like OBD-II and telematics, also exposes the vehicle's in-vehicle +network (IVN) to potential cyberattacks. In this paper, we consider a specific +type of cyberattack known as the injection attack. As demonstrated by empirical +data from real-world cybersecurity adversarial competitions (available at +https://mimic2024.xctf.org.cn/race/qwmimic2024 ), these injection attacks have +an excitation effect over time, gradually manipulating network traffic and +disrupting the vehicle's normal functioning, ultimately compromising both its +stability and safety. To profile the abnormal behavior of attackers, we propose +a novel injection attack detector to extract long-term features of attack +behavior. Specifically, we first provide a theoretical analysis of modeling the +time-excitation effects of the attack using Multi-Dimensional Hawkes Process +(MDHP). A gradient descent solver specifically tailored for MDHP, MDHP-GDS, is +developed to accurately estimate optimal MDHP parameters. We then propose an +injection attack detector, MDHP-Net, which integrates optimal MDHP parameters +with MDHP-LSTM blocks to enhance temporal feature extraction. By introducing +MDHP parameters, MDHP-Net captures complex temporal features that standard Long +Short-Term Memory (LSTM) cannot, enriching temporal dependencies within our +customized structure. Extensive evaluations demonstrate the effectiveness of +our proposed detection approach. + +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Guidance for Diffusion Models + + +
+ Guidance is an error-correcting technique used to improve the perceptual +quality of images generated by diffusion models. Typically, the correction is +achieved by linear extrapolation, using an auxiliary diffusion model that has +lower performance than the primary model. Using a 2D toy example, we show that +it is highly beneficial when the auxiliary model exhibits similar errors as the +primary one but stronger. We verify this finding in higher dimensions, where we +show that competitive generative performance to state-of-the-art guidance +methods can be achieved when the auxiliary model differs from the primary one +only by having stronger weight regularization. As an independent contribution, +we investigate whether upweighting long-range spatial dependencies improves +visual fidelity. The result is a novel guidance method, which we call sliding +window guidance (SWG), that guides the primary model with itself by +constraining its receptive field. Intriguingly, SWG aligns better with human +preferences than state-of-the-art guidance methods while requiring neither +training, architectural modifications, nor class conditioning. The code will be +released. + +
+
+ comment: Preprint. 19 pages, 14 figures in total, including references and + appendix +
+
+
+
+
+ + ☆ Uncertainty in Supply Chain Digital Twins: A Quantum-Classical Hybrid + Approach + + +
+ This study investigates uncertainty quantification (UQ) using +quantum-classical hybrid machine learning (ML) models for applications in +complex and dynamic fields, such as attaining resiliency in supply chain +digital twins and financial risk assessment. Although quantum feature +transformations have been integrated into ML models for complex data tasks, a +gap exists in determining their impact on UQ within their hybrid architectures +(quantum-classical approach). This work applies existing UQ techniques for +different models within a hybrid framework, examining how quantum feature +transformation affects uncertainty propagation. Increasing qubits from 4 to 16 +shows varied model responsiveness to outlier detection (OD) samples, which is a +critical factor for resilient decision-making in dynamic environments. This +work shows how quantum computing techniques can transform data features for UQ, +particularly when combined with traditional methods. + +
+
+
+
+
+ + ☆ Measuring Non-Adversarial Reproduction of Training Data in Large + Language Models + + +
+ Large language models memorize parts of their training data. Memorizing short +snippets and facts is required to answer questions about the world and to be +fluent in any language. But models have also been shown to reproduce long +verbatim sequences of memorized text when prompted by a motivated adversary. In +this work, we investigate an intermediate regime of memorization that we call +non-adversarial reproduction, where we quantify the overlap between model +responses and pretraining data when responding to natural and benign prompts. +For a variety of innocuous prompt categories (e.g., writing a letter or a +tutorial), we show that up to 15% of the text output by popular conversational +language models overlaps with snippets from the Internet. In worst cases, we +find generations where 100% of the content can be found exactly online. For the +same tasks, we find that human-written text has far less overlap with Internet +data. We further study whether prompting strategies can close this reproduction +gap between models and humans. While appropriate prompting can reduce +non-adversarial reproduction on average, we find that mitigating worst-case +reproduction of training data requires stronger defenses -- even for benign +interactions. + +
+
+
+
+
+ + ☆ Efficient Neural Hybrid System Learning and Transition System + Abstraction for Dynamical Systems + + +
+ This paper proposes a neural network hybrid modeling framework for dynamics +learning to promote an interpretable, computationally efficient way of dynamics +learning and system identification. First, a low-level model will be trained to +learn the system dynamics, which utilizes multiple simple neural networks to +approximate the local dynamics generated from data-driven partitions. Then, +based on the low-level model, a high-level model will be trained to abstract +the low-level neural hybrid system model into a transition system that allows +Computational Tree Logic Verification to promote the model's ability with human +interaction and verification efficiency. + +
+
+
+
+
+ + ☆ A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image + Super-Resolution with Transformers and TaylorShift + + +
+ Transformer-based Super-Resolution (SR) models have recently advanced image +reconstruction quality, yet challenges remain due to computational complexity +and an over-reliance on large patch sizes, which constrain fine-grained detail +enhancement. In this work, we propose TaylorIR to address these limitations by +utilizing a patch size of 1x1, enabling pixel-level processing in any +transformer-based SR model. To address the significant computational demands +under the traditional self-attention mechanism, we employ the TaylorShift +attention mechanism, a memory-efficient alternative based on Taylor series +expansion, achieving full token-to-token interactions with linear complexity. +Experimental results demonstrate that our approach achieves new +state-of-the-art SR performance while reducing memory consumption by up to 60% +compared to traditional self-attention-based transformers. + +
+
+
+
+
+ + ☆ Machine Learning Algorithms to Assess Site Closure Time Frames for Soil + and Groundwater Contamination + + +
+ Monitored Natural Attenuation (MNA) is gaining prominence as an effective +method for managing soil and groundwater contamination due to its +cost-efficiency and minimal environmental disruption. Despite its benefits, MNA +necessitates extensive groundwater monitoring to ensure that contaminant levels +decrease to meet safety standards. This study expands the capabilities of +PyLEnM, a Python package designed for long-term environmental monitoring, by +incorporating new algorithms to enhance its predictive and analytical +functionalities. We introduce methods to estimate the timeframe required for +contaminants like Sr-90 and I-129 to reach regulatory safety standards using +linear regression and to forecast future contaminant levels with the +Bidirectional Long Short-Term Memory (Bi-LSTM) networks. Additionally, Random +Forest regression is employed to identify factors influencing the time to reach +safety standards. Our methods are illustrated using data from the Savannah +River Site (SRS) F-Area, where preliminary findings reveal a notable downward +trend in contaminant levels, with variability linked to initial concentrations +and groundwater flow dynamics. The Bi-LSTM model effectively predicts +contaminant concentrations for the next four years, demonstrating the potential +of advanced time series analysis to improve MNA strategies and reduce reliance +on manual groundwater sampling. The code, along with its usage instructions, +validation, and requirements, is available at: +https://github.com/csplevuanh/pylenm_extension. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Embedding Byzantine Fault Tolerance into Federated Learning via Virtual + Data-Driven Consistency Scoring Plugin + + +
+ Given sufficient data from multiple edge devices, federated learning (FL) +enables training a shared model without transmitting private data to a central +server. However, FL is generally vulnerable to Byzantine attacks from +compromised edge devices, which can significantly degrade the model +performance. In this paper, we propose an intuitive plugin that can be +integrated into existing FL techniques to achieve Byzantine-Resilience. The key +idea is to generate virtual data samples and evaluate model consistency scores +across local updates to effectively filter out compromised edge devices. By +utilizing this scoring mechanism before the aggregation phase, the proposed +plugin enables existing FL techniques to become robust against Byzantine +attacks while maintaining their original benefits. Numerical results on a medical +image classification task validate that plugging the proposed approach into +representative FL algorithms effectively achieves Byzantine resilience. +Furthermore, the proposed plugin maintains the original convergence properties +of the base FL algorithms when no Byzantine attacks are present. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Fused Gromov-Wasserstein Variance Decomposition with Linear Optimal + Transport + + +
+ Wasserstein distances form a family of metrics on spaces of probability +measures that have recently seen many applications. However, statistical +analysis in these spaces is complex due to the nonlinearity of Wasserstein +spaces. One potential solution to this problem is Linear Optimal Transport +(LOT). This method allows one to find a Euclidean embedding, called LOT +embedding, of measures in some Wasserstein spaces, but some information is lost +in this embedding. So, to understand whether statistical analysis relying on +LOT embeddings can make valid inferences about original data, it is helpful to +quantify how well these embeddings describe that data. To answer this question, +we present a decomposition of the Fr\'echet variance of a set of measures in +the 2-Wasserstein space, which allows one to compute the percentage of variance +explained by LOT embeddings of those measures. We then extend this +decomposition to the Fused Gromov-Wasserstein setting. We also present several +experiments that explore the relationship between the dimension of the LOT +embedding, the percentage of variance explained by the embedding, and the +classification accuracy of machine learning classifiers built on the embedded +data. We use the MNIST handwritten digits dataset, IMDB-50000 dataset, and +Diffusion Tensor MRI images for these experiments. Our results illustrate the +effectiveness of low dimensional LOT embeddings in terms of the percentage of +variance explained and the classification accuracy of models built on the +embedded data. + +
+
+
+
+
+ + ☆ FengWu-W2S: A deep learning model for seamless weather-to-subseasonal + forecast of global atmosphere + + +
+ Seamless forecasting that produces warning information at continuum +timescales based on only one system is a long-standing pursuit for +weather-climate service. While the rapid advancement of deep learning has +induced revolutionary changes in classical forecasting field, current efforts +are still focused on building separate AI models for weather and climate +forecasts. To explore the seamless forecasting ability based on one AI model, +we propose FengWu-Weather to Subseasonal (FengWu-W2S), which builds on the +FengWu global weather forecast model and incorporates an ocean-atmosphere-land +coupling structure along with a diverse perturbation strategy. FengWu-W2S can +generate 6-hourly atmosphere forecasts extending up to 42 days through an +autoregressive and seamless manner. Our hindcast results demonstrate that +FengWu-W2S reliably predicts atmospheric conditions out to 3-6 weeks ahead, +enhancing predictive capabilities for global surface air temperature, +precipitation, geopotential height and intraseasonal signals such as the +Madden-Julian Oscillation (MJO) and North Atlantic Oscillation (NAO). Moreover, +our ablation experiments on forecast error growth from daily to seasonal +timescales reveal potential pathways for developing AI-based integrated system +for seamless weather-climate forecasting in the future. + +
+
+ comment: 23 pages, 8 figures +
+
+
+
+
+ + ☆ CART: Compositional Auto-Regressive Transformer for Image Generation CVPR 2025 + + +
+ In recent years, image synthesis has achieved remarkable advancements, +enabling diverse applications in content creation, virtual reality, and beyond. +We introduce a novel approach to image generation using Auto-Regressive (AR) +modeling, which leverages a next-detail prediction strategy for enhanced +fidelity and scalability. While AR models have achieved transformative success +in language modeling, replicating this success in vision tasks has presented +unique challenges due to the inherent spatial dependencies in images. Our +proposed method addresses these challenges by iteratively adding finer details +to an image compositionally, constructing it as a hierarchical combination of +base and detail image factors. This strategy is shown to be more effective than +the conventional next-token prediction and even surpasses the state-of-the-art +next-scale prediction approaches. A key advantage of this method is its +scalability to higher resolutions without requiring full model retraining, +making it a versatile solution for high-resolution image generation. + +
+
+ comment: under review at CVPR 2025 +
+
+
+
+
+ + ☆ The Surprising Ineffectiveness of Pre-Trained Visual Representations for + Model-Based Reinforcement Learning NeurIPS 2024 + + +
+ Visual Reinforcement Learning (RL) methods often require extensive amounts of +data. As opposed to model-free RL, model-based RL (MBRL) offers a potential +solution with efficient data utilization through planning. Additionally, RL +lacks generalization capabilities for real-world tasks. Prior work has shown +that incorporating pre-trained visual representations (PVRs) enhances sample +efficiency and generalization. While PVRs have been extensively studied in the +context of model-free RL, their potential in MBRL remains largely unexplored. +In this paper, we benchmark a set of PVRs on challenging control tasks in a +model-based RL setting. We investigate the data efficiency, generalization +capabilities, and the impact of different properties of PVRs on the performance +of model-based agents. Our results, perhaps surprisingly, reveal that for MBRL +current PVRs are not more sample efficient than learning representations from +scratch, and that they do not generalize better to out-of-distribution (OOD) +settings. To explain this, we analyze the quality of the trained dynamics +model. Furthermore, we show that data diversity and network architecture are +the most important contributors to OOD generalization performance. + +
+
+ comment: Published at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/ +
+
+
+
+
+ + ☆ Continuous Bayesian Model Selection for Multivariate Causal Discovery + + +
+ Current causal discovery approaches require restrictive model assumptions or +assume access to interventional data to ensure structure identifiability. These +assumptions often do not hold in real-world applications leading to a loss of +guarantees and poor accuracy in practice. Recent work has shown that, in the +bivariate case, Bayesian model selection can greatly improve accuracy by +exchanging restrictive modelling for more flexible assumptions, at the cost of +a small probability of error. We extend the Bayesian model selection approach +to the important multivariate setting by making the large discrete selection +problem scalable through a continuous relaxation. We demonstrate how for our +choice of Bayesian non-parametric model, the Causal Gaussian Process +Conditional Density Estimator (CGP-CDE), an adjacency matrix can be constructed +from the model hyperparameters. This adjacency matrix is then optimised using +the marginal likelihood and an acyclicity regulariser, outputting the maximum a +posteriori causal graph. We demonstrate the competitiveness of our approach on +both synthetic and real-world datasets, showing it is possible to perform +multivariate causal discovery without infeasible assumptions using Bayesian +model selection. + +
+
+
+
+
+ + ☆ BONE: a unifying framework for Bayesian online learning in + non-stationary environments + + +
+ We propose a unifying framework for methods that perform Bayesian online +learning in non-stationary environments. We call the framework BONE, which +stands for (B)ayesian (O)nline learning in (N)on-stationary (E)nvironments. +BONE provides a common structure to tackle a variety of problems, including +online continual learning, prequential forecasting, and contextual bandits. The +framework requires specifying three modelling choices: (i) a model for +measurements (e.g., a neural network), (ii) an auxiliary process to model +non-stationarity (e.g., the time since the last changepoint), and (iii) a +conditional prior over model parameters (e.g., a multivariate Gaussian). The +framework also requires two algorithmic choices, which we use to carry out +approximate inference under this framework: (i) an algorithm to estimate +beliefs (posterior distribution) about the model parameters given the auxiliary +variable, and (ii) an algorithm to estimate beliefs about the auxiliary +variable. We show how this modularity allows us to write many different +existing methods as instances of BONE; we also use this framework to propose a +new method. We then experimentally compare existing methods with our proposed +new method on several datasets; we provide insights into the situations that +make one method more suitable than another for a given task. + +
+
+
+
+
+ + ☆ Causal Time-Series Synchronization for Multi-Dimensional Forecasting + + +
+ The process industry's high expectations for Digital Twins require modeling +approaches that can generalize across tasks and diverse domains with +potentially different data dimensions and distributional shifts i.e., +Foundational Models. Despite success in natural language processing and +computer vision, transfer learning with (self-) supervised signals for +pre-training general-purpose models is largely unexplored in the context of +Digital Twins in the process industry due to challenges posed by +multi-dimensional time-series data, lagged cause-effect dependencies, complex +causal structures, and varying number of (exogenous) variables. We propose a +novel channel-dependent pre-training strategy that leverages synchronized +cause-effect pairs to overcome these challenges by breaking down the +multi-dimensional time-series data into pairs of cause-effect variables. Our +approach focuses on: (i) identifying highly lagged causal relationships using +data-driven methods, (ii) synchronizing cause-effect pairs to generate training +samples for channel-dependent pre-training, and (iii) evaluating the +effectiveness of this approach in channel-dependent forecasting. Our +experimental results demonstrate significant improvements in forecasting +accuracy and generalization capability compared to traditional training +methods. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ DaYu: Data-Driven Model for Geostationary Satellite Observed Cloud + Images Forecasting + + +
+ In the past few years, Artificial Intelligence (AI)-based weather forecasting +methods have widely demonstrated strong competitiveness among the weather +forecasting systems. However, these methods are insufficient for +high-spatial-resolution short-term nowcasting within 6 hours, which is crucial +for warning short-duration, mesoscale and small-scale weather events. +Geostationary satellite remote sensing provides detailed, high spatio-temporal +and all-day observations, which can address the above limitations of existing +methods. Therefore, this paper proposed an advanced data-driven thermal +infrared cloud images forecasting model, "DaYu." Unlike existing data-driven +weather forecasting models, DaYu is specifically designed for geostationary +satellite observations, with a temporal resolution of 0.5 hours and a spatial +resolution of ${0.05}^\circ$ $\times$ ${0.05}^\circ$. DaYu is based on a +large-scale transformer architecture, which enables it to capture fine-grained +cloud structures and learn fast-changing spatio-temporal evolution features +effectively. Moreover, its attention mechanism design achieves a balance in +computational complexity, making it practical for applications. DaYu not only +achieves accurate forecasts up to 3 hours with a correlation coefficient higher +than 0.9, 6 hours higher than 0.8, and 12 hours higher than 0.7, but also +detects short-duration, mesoscale, and small-scale weather events with enhanced +detail, effectively addressing the shortcomings of existing methods in +providing detailed short-term nowcasting within 6 hours. Furthermore, DaYu has +significant potential in short-term climate disaster prevention and mitigation. + +
+
+
+
+
+ + ☆ Prompting and Fine-tuning Large Language Models for Automated Code + Review Comment Generation + + +
+ Generating accurate code review comments remains a significant challenge due +to the inherently diverse and non-unique nature of the task output. Large +language models pretrained on both programming and natural language data tend +to perform well in code-oriented tasks. However, large-scale pretraining is not +always feasible due to its environmental impact and project-specific +generalizability issues. In this work, first we fine-tune open-source Large +language models (LLM) in parameter-efficient, quantized low-rank (QLoRA) +fashion on consumer-grade hardware to improve review comment generation. Recent +studies demonstrate the efficacy of augmenting semantic metadata information +into prompts to boost performance in other code-related tasks. To explore this +in code review activities, we also prompt proprietary, closed-source LLMs +augmenting the input code patch with function call graphs and code summaries. +Both of our strategies improve the review comment generation performance, with +function call graph augmented few-shot prompting on the GPT-3.5 model +surpassing the pretrained baseline by around 90% BLEU-4 score on the +CodeReviewer dataset. Moreover, few-shot prompted Gemini-1.0 Pro, QLoRA +fine-tuned Code Llama and Llama 3.1 models achieve competitive results (ranging +from 25% to 83% performance improvement) on this task. An additional human +evaluation study further validates our experimental findings, reflecting +real-world developers' perceptions of LLM-generated code review comments based +on relevant qualitative metrics. + +
+
+
+
+
+ + ☆ On the Universal Statistical Consistency of Expansive Hyperbolic Deep + Convolutional Neural Networks + + +
+ The emergence of Deep Convolutional Neural Networks (DCNNs) has been a +pervasive tool for accomplishing widespread applications in computer vision. +Despite its potential capability to capture intricate patterns inside the data, +the underlying embedding space remains Euclidean and primarily pursues +contractive convolution. Several instances can serve as a precedent for the +exacerbating performance of DCNNs. The recent advancement of neural networks in +the hyperbolic spaces gained traction, incentivizing the development of +convolutional deep neural networks in the hyperbolic space. In this work, we +propose Hyperbolic DCNN based on the Poincar\'{e} Disc. The work predominantly +revolves around analyzing the nature of expansive convolution in the context of +the non-Euclidean domain. We further offer extensive theoretical insights +pertaining to the universal consistency of the expansive convolution in the +hyperbolic space. Several simulations were performed not only on the synthetic +datasets but also on some real-world datasets. The experimental results reveal +that the hyperbolic convolutional architecture outperforms the Euclidean ones +by a commendable margin. + +
+
+
+
+
+ + ☆ Energy-GNoME: A Living Database of Selected Materials for Energy + Applications + + +
+ Artificial Intelligence (AI) in materials science is driving significant +advancements in the discovery of advanced materials for energy applications. +The recent GNoME protocol identifies over 380,000 novel stable crystals. From +this, we identify over 33,000 materials with potential as energy materials +forming the Energy-GNoME database. Leveraging Machine Learning (ML) and Deep +Learning (DL) tools, our protocol mitigates cross-domain data bias using +feature spaces to identify potential candidates for thermoelectric materials, +novel battery cathodes, and novel perovskites. Classifiers with both structural +and compositional features identify domains of applicability, where we expect +enhanced accuracy of the regressors. Such regressors are trained to predict key +materials properties like, thermoelectric figure of merit (zT), band gap (Eg), +and cathode voltage ($\Delta V_c$). This method significantly narrows the pool +of potential candidates, serving as an efficient guide for experimental and +computational chemistry investigations and accelerating the discovery of +materials suited for electricity generation, energy storage and conversion. + +
+
+ comment: 60 pages, 16 figures +
+
+
+
+
+ + ☆ Generative Agent Simulations of 1,000 People + + +
+ The promise of human behavioral simulation--general-purpose computational +agents that replicate human behavior across domains--could enable broad +applications in policymaking and social science. We present a novel agent +architecture that simulates the attitudes and behaviors of 1,052 real +individuals--applying large language models to qualitative interviews about +their lives, then measuring how well these agents replicate the attitudes and +behaviors of the individuals that they represent. The generative agents +replicate participants' responses on the General Social Survey 85% as +accurately as participants replicate their own answers two weeks later, and +perform comparably in predicting personality traits and outcomes in +experimental replications. Our architecture reduces accuracy biases across +racial and ideological groups compared to agents given demographic +descriptions. This work provides a foundation for new tools that can help +investigate individual and collective behavior. + +
+
+
+
+
+ + ☆ Recent Advances on Machine Learning-aided DSP for Short-reach and + Long-haul Optical Communications + + +
+ In this paper, we highlight recent advances in the use of machine learning +for implementing equalizers for optical communications. We highlight both +algorithmic advances as well as implementation aspects using conventional and +neuromorphic hardware. + +
+
+ comment: paper accompanying an invited presentation at OFC 2025 +
+
+
+
+
+ + ☆ Neural Port-Hamiltonian Models for Nonlinear Distributed Control: An + Unconstrained Parametrization Approach + + +
+ The control of large-scale cyber-physical systems requires optimal +distributed policies relying solely on limited communication with neighboring +agents. However, computing stabilizing controllers for nonlinear systems while +optimizing complex costs remains a significant challenge. Neural Networks +(NNs), known for their expressivity, can be leveraged to parametrize control +policies that yield good performance. However, NNs' sensitivity to small input +changes poses a risk of destabilizing the closed-loop system. Many existing +approaches enforce constraints on the controllers' parameter space to guarantee +closed-loop stability, leading to computationally expensive optimization +procedures. To address these problems, we leverage the framework of +port-Hamiltonian systems to design continuous-time distributed control policies +for nonlinear systems that guarantee closed-loop stability and finite +$\mathcal{L}_2$ or incremental $\mathcal{L}_2$ gains, independent of the +optimization parameters of the controllers. This eliminates the need to +constrain parameters during optimization, allowing the use of standard +techniques such as gradient-based methods. Additionally, we discuss +discretization schemes that preserve the dissipation properties of these +controllers for implementation on embedded systems. The effectiveness of the +proposed distributed controllers is demonstrated through consensus control of +non-holonomic mobile robots subject to collision avoidance and averaged voltage +regulation with weighted power sharing in DC microgrids. + +
+
+ comment: The paper has 15 pages, and has been submitted for a possible + publication. arXiv admin note: text overlap with arXiv:2403.17785 +
+
+
+
+
+ + ☆ PFML: Self-Supervised Learning of Time-Series Data Without + Representation Collapse + + +
+ Self-supervised learning (SSL) is a data-driven learning approach that +utilizes the innate structure of the data to guide the learning process. In +contrast to supervised learning, which depends on external labels, SSL utilizes +the inherent characteristics of the data to produce its own supervisory signal. +However, one frequent issue with SSL methods is representation collapse, where +the model outputs a constant input-invariant feature representation. This issue +hinders the potential application of SSL methods to new data modalities, as +trying to avoid representation collapse wastes researchers' time and effort. +This paper introduces a novel SSL algorithm for time-series data called +Prediction of Functionals from Masked Latents (PFML). Instead of predicting +masked input signals or their latent representations directly, PFML operates by +predicting statistical functionals of the input signal corresponding to masked +embeddings, given a sequence of unmasked embeddings. The algorithm is designed +to avoid representation collapse, rendering it straightforwardly applicable to +different time-series data domains, such as novel sensor modalities in clinical +data. We demonstrate the effectiveness of PFML through complex, real-life +classification tasks across three different data modalities: infant posture and +movement classification from multi-sensor inertial measurement unit data, +emotion recognition from speech data, and sleep stage classification from EEG +data. The results show that PFML is superior to a conceptually similar +pre-existing SSL method and competitive against the current state-of-the-art +SSL method, while also being conceptually simpler and without suffering from +representation collapse. + +
+
+
+
+
+ + ☆ Evidential Federated Learning for Skin Lesion Image Classification ICPR 2024 + + +
+ We introduce FedEvPrompt, a federated learning approach that integrates +principles of evidential deep learning, prompt tuning, and knowledge +distillation for distributed skin lesion classification. FedEvPrompt leverages +two sets of prompts: b-prompts (for low-level basic visual knowledge) and +t-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision +Transformer (ViT) models trained in an evidential learning framework to +maximize class evidences. Crucially, knowledge sharing across federation +clients is achieved only through knowledge distillation on attention maps +generated by the local ViT models, ensuring enhanced privacy preservation +compared to traditional parameter or synthetic image sharing methodologies. +FedEvPrompt is optimized within a round-based learning paradigm, where each +round involves training local models followed by attention maps sharing with +all federation clients. Experimental validation conducted in a real distributed +setting, on the ISIC2019 dataset, demonstrates the superior performance of +FedEvPrompt against baseline federated learning algorithms and knowledge +distillation methods, without sharing model parameters. In conclusion, +FedEvPrompt offers a promising approach for federated learning, effectively +addressing challenges such as data heterogeneity, imbalance, privacy +preservation, and knowledge sharing. + +
+
+ comment: Published as a conference paper at ICPR 2024 +
+
+
+
+
+ + ☆ Adaptive Physics-Guided Neural Network + + +
+ This paper introduces an adaptive physics-guided neural network (APGNN) +framework for predicting quality attributes from image data by integrating +physical laws into deep learning models. The APGNN adaptively balances +data-driven and physics-informed predictions, enhancing model accuracy and +robustness across different environments. Our approach is evaluated on both +synthetic and real-world datasets, with comparisons to conventional data-driven +models such as ResNet. For the synthetic data, 2D domains were generated using +three distinct governing equations: the diffusion equation, the +advection-diffusion equation, and the Poisson equation. Non-linear +transformations were applied to these domains to emulate complex physical +processes in image form. + In real-world experiments, the APGNN consistently demonstrated superior +performance in the diverse thermal image dataset. On the cucumber dataset, +characterized by low material diversity and controlled conditions, APGNN and +PGNN showed similar performance, both outperforming the data-driven ResNet. +However, in the more complex thermal dataset, particularly for outdoor +materials with higher environmental variability, APGNN outperformed both PGNN +and ResNet by dynamically adjusting its reliance on physics-based versus +data-driven insights. This adaptability allowed APGNN to maintain robust +performance across structured, low-variability settings and more heterogeneous +scenarios. These findings underscore the potential of adaptive physics-guided +learning to integrate physical constraints effectively, even in challenging +real-world contexts with diverse environmental conditions. + +
+
+
+
+
+ + ☆ Federated Domain Generalization via Prompt Learning and Aggregation + + +
+ Federated domain generalization (FedDG) aims to improve the global model +generalization in unseen domains by addressing data heterogeneity under +privacy-preserving constraints. A common strategy in existing FedDG studies +involves sharing domain-specific knowledge among clients, such as spectrum +information, class prototypes, and data styles. However, this knowledge is +extracted directly from local client samples, and sharing such sensitive +information poses a potential risk of data leakage, which might not fully meet +the requirements of FedDG. In this paper, we introduce prompt learning to adapt +pre-trained vision-language models (VLMs) in the FedDG scenario, and leverage +locally learned prompts as a more secure bridge to facilitate knowledge +transfer among clients. Specifically, we propose a novel FedDG framework +through Prompt Learning and AggregatioN (PLAN), which comprises two training +stages to collaboratively generate local prompts and global prompts at each +federated round. First, each client performs both text and visual prompt +learning using their own data, with local prompts indirectly synchronized by +regarding the global prompts as a common reference. Second, all domain-specific +local prompts are exchanged among clients and selectively aggregated into the +global prompts using lightweight attention-based aggregators. The global +prompts are finally applied to adapt VLMs to unseen target domains. As our PLAN +framework requires training only a limited number of prompts and lightweight +aggregators, it offers notable advantages in computational and communication +efficiency for FedDG. Extensive experiments demonstrate the superior +generalization ability of PLAN across four benchmark datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Unsupervised Congestion Status Identification Using LMP Data + + +
+ Having a better understanding of how locational marginal prices (LMPs) change +helps in price forecasting and market strategy making. This paper investigates +the fundamental distribution of the congestion part of LMPs in high-dimensional +Euclidean space using an unsupervised approach. LMP models based on the +lossless and lossy DC optimal power flow (DC-OPF) are analyzed to show the +overlapping subspace property of the LMP data. The congestion part of LMPs is +spanned by certain row vectors of the power transfer distribution factor (PTDF) +matrix, and the subspace attributes of an LMP vector are found to uniquely +reflect the instantaneous congestion status of all the transmission lines. The +proposed method searches for the basis vectors that span the subspaces of +congestion LMP data in hierarchical ways. In the bottom-up search, the data +belonging to 1-dimensional subspaces are detected, and other data are projected +on the orthogonal subspaces. This procedure is repeated until all the basis +vectors are found or the basis gap appears. Top-down searching is used to +address the basis gap by hyperplane detection with outliers. Once all the basis +vectors are detected, the congestion status can be identified. Numerical +experiments based on the IEEE 30-bus system, IEEE 118-bus system, Illinois +200-bus system, and Southwest Power Pool are conducted to show the performance +of the proposed method. + +
+
+ comment: Paper accepted for IEEE Transactions on Smart Grid. Personal use of + this material is permitted. Permission from IEEE must be obtained for all + other uses +
+
+
+
+
+ + ☆ KuaiFormer: Transformer-Based Retrieval at Kuaishou + + +
+ In large-scale content recommendation systems, retrieval serves as the +initial stage in the pipeline, responsible for selecting thousands of candidate +items from billions of options to pass on to ranking modules. Traditionally, +the dominant retrieval method has been Embedding-Based Retrieval (EBR) using a +Deep Neural Network (DNN) dual-tower structure. However, applying transformer +in retrieval tasks has been the focus of recent research, though real-world +industrial deployment still presents significant challenges. In this paper, we +introduce KuaiFormer, a novel transformer-based retrieval framework deployed in +a large-scale content recommendation system. KuaiFormer fundamentally redefines +the retrieval process by shifting from conventional score estimation tasks +(such as click-through rate estimate) to a transformer-driven Next Action +Prediction paradigm. This shift enables more effective real-time interest +acquisition and multi-interest extraction, significantly enhancing retrieval +performance. KuaiFormer has been successfully integrated into Kuaishou App's +short-video recommendation system since May 2024, serving over 400 million +daily active users and resulting in a marked increase in average daily usage +time of Kuaishou users. We provide insights into both the technical and +business aspects of deploying transformer in large-scale recommendation +systems, addressing practical challenges encountered during industrial +implementation. Our findings offer valuable guidance for engineers and +researchers aiming to leverage transformer models to optimize large-scale +content recommendation systems. + +
+
+
+
+
+ + ☆ That Chip Has Sailed: A Critique of Unfounded Skepticism Around AI for + Chip Design + + +
+ In 2020, we introduced a deep reinforcement learning method capable of +generating superhuman chip layouts, which we then published in Nature and +open-sourced on GitHub. AlphaChip has inspired an explosion of work on AI for +chip design, and has been deployed in state-of-the-art chips across Alphabet +and extended by external chipmakers. Even so, a non-peer-reviewed invited paper +at ISPD 2023 questioned its performance claims, despite failing to run our +method as described in Nature. For example, it did not pre-train the RL method +(removing its ability to learn from prior experience), used substantially fewer +compute resources (20x fewer RL experience collectors and half as many GPUs), +did not train to convergence (standard practice in machine learning), and +evaluated on test cases that are not representative of modern chips. Recently, +Igor Markov published a meta-analysis of three papers: our peer-reviewed Nature +paper, the non-peer-reviewed ISPD paper, and Markov's own unpublished paper +(though he does not disclose that he co-authored it). Although AlphaChip has +already achieved widespread adoption and impact, we publish this response to +ensure that no one is wrongly discouraged from innovating in this impactful +area. + +
+
+
+
+
+ + ☆ Jal Anveshak: Prediction of fishing zones using fine-tuned LlaMa 2 + + +
+ In recent years, the global and Indian government efforts in monitoring and +collecting data related to the fisheries industry have witnessed significant +advancements. Despite this wealth of data, there exists an untapped potential +for leveraging artificial intelligence based technological systems to benefit +Indian fishermen in coastal areas. To fill this void in the Indian technology +ecosystem, the authors introduce Jal Anveshak. This is an application framework +written in Dart and Flutter that uses a Llama 2 based Large Language Model +fine-tuned on pre-processed and augmented government data related to fishing +yield and availability. Its main purpose is to help Indian fishermen safely get +the maximum yield of fish from coastal areas and to resolve their fishing +related queries in multilingual and multimodal ways. + +
+
+
+
+
+ + ☆ Physics-informed neural networks need a physicist to be accurate: the + case of mass and heat transport in Fischer-Tropsch catalyst particles + + +
+ Physics-Informed Neural Networks (PINNs) have emerged as an influential +technology, merging the swift and automated capabilities of machine learning +with the precision and dependability of simulations grounded in theoretical +physics. PINNs are often employed to solve algebraic or differential equations +to replace some or even all steps of multi-stage computational workflows, +leading to their significant speed-up. However, wide adoption of PINNs is still +hindered by reliability issues, particularly at extreme ends of the input +parameter ranges. In this study, we demonstrate this in the context of a system +of coupled non-linear differential reaction-diffusion and heat transfer +equations related to Fischer-Tropsch synthesis, which are solved by a +finite-difference method with a PINN used in evaluating their source terms. It +is shown that the testing strategies traditionally used to assess the accuracy +of neural networks as function approximators can overlook the peculiarities +which ultimately cause instabilities of the finite-difference solver. We +propose domain knowledge-based modifications to the PINN architecture +ensuring its correct asymptotic behavior. When combined with an improved +numerical scheme employed as an initial guess generator, the proposed +modifications are shown to recover the overall stability of the simulations, +while preserving the speed-up brought by PINN as the workflow component. We +discuss the possible applications of the proposed hybrid transport equation +solver in the context of chemical reactor simulations. + +
+
+
+
+
+ + ☆ Model Inversion Attacks: A Survey of Approaches and Countermeasures + + +
+ The success of deep neural networks has driven numerous research studies and +applications from Euclidean to non-Euclidean data. However, there are +increasing concerns about privacy leakage, as these networks rely on processing +private data. Recently, a new type of privacy attack, the model inversion +attacks (MIAs), aims to extract sensitive features of private data for training +by abusing access to a well-trained model. The effectiveness of MIAs has been +demonstrated in various domains, including images, texts, and graphs. These +attacks highlight the vulnerability of neural networks and raise awareness +about the risk of privacy leakage within the research community. Despite the +significance, there is a lack of systematic studies that provide a +comprehensive overview and deeper insights into MIAs across different domains. +This survey aims to summarize up-to-date MIA methods in both attacks and +defenses, highlighting their contributions and limitations, underlying modeling +principles, optimization challenges, and future directions. We hope this survey +bridges the gap in the literature and facilitates future research in this +critical area. Besides, we are maintaining a repository to keep track of +relevant research at +https://github.com/AndrewZhou924/Awesome-model-inversion-attack. + +
+
+ comment: 40 pages, 17 figures +
+
+
+
+
+ + ☆ Towards Utilising a Range of Neural Activations for Comprehending + Representational Associations + + +
+ Recent efforts to understand intermediate representations in deep neural +networks have commonly attempted to label individual neurons and combinations +of neurons that make up linear directions in the latent space by examining +extremal neuron activations and the highest direction projections. In this +paper, we show that this approach, although yielding a good approximation for +many purposes, fails to capture valuable information about the behaviour of a +representation. Neural network activations are generally dense, and so a more +complex, but realistic scenario is that linear directions encode information at +various levels of stimulation. We hypothesise that non-extremal level +activations contain complex information worth investigating, such as +statistical associations, and thus may be used to locate confounding human +interpretable concepts. We explore the value of studying a range of neuron +activations by taking the case of mid-level output neuron activations and +demonstrate on a synthetic dataset how they can inform us about aspects of +representations in the penultimate layer not evident through analysing maximal +activations alone. We use our findings to develop a method to curate data from +mid-range logit samples for retraining to mitigate spurious correlations, or +confounding concepts in the penultimate layer, on real benchmark datasets. The +success of our method exemplifies the utility of inspecting non-maximal +activations to extract complex relationships learned by models. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field + Analysis Using Deep Neural Networks through Feature Visualization + + +
+ Micro Crack detection using deep neural networks (DNNs) through an automated +pipeline using wave fields interacting with the damaged areas is highly sought +after. These high-dimensional spatio-temporal crack data are limited, and these +datasets have large dimensions in the temporal domain. The dataset presents a +substantial class imbalance, with crack pixels constituting an average of only +5% of the total pixels per sample. This extreme class imbalance poses a +challenge for deep learning models with the different micro-scale cracks, as +the network can be biased toward predicting the majority class, generally +leading to poor detection accuracy. This study builds upon the previous +benchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack +detection. The impact of various activation and loss functions was examined +through feature space visualization using the manifold discovery and analysis +(MDA) algorithm. The optimized architecture and training methodology achieved +an accuracy of 86.85%. + +
+
+
+
+
+ + ☆ Efficient Depth Estimation for Unstable Stereo Camera Systems on AR + Glasses + + +
+ Stereo depth estimation is a fundamental component in augmented reality (AR) +applications. Although AR applications require very low latency for their +real-time applications, traditional depth estimation models often rely on +time-consuming preprocessing steps such as rectification to achieve high +accuracy. Also, non-standard ML operator based algorithms such as cost volume +also require significant latency, which is aggravated on compute +resource-constrained mobile platforms. Therefore, we develop hardware-friendly +alternatives to the costly cost volume and preprocessing and design two new +models based on them, MultiHeadDepth and HomoDepth. Our approach for cost +volume is to replace it with a new group-pointwise convolution-based operator +and an approximation of cosine similarity based on layernorm and dot product. For +online stereo rectification (preprocessing), we introduce a homography matrix +prediction network with a rectification positional encoding (RPE), which +delivers both low latency and robustness to unrectified images, which +eliminates the need for preprocessing. Our MultiHeadDepth, which includes +optimized cost volume, provides 11.8-30.3% improvements in accuracy and +22.9-25.2% reduction in latency compared to a state-of-the-art depth estimation +model for AR glasses from industry. Our HomoDepth, which includes optimized +preprocessing (Homography + RPE) upon MultiHeadDepth, can process unrectified +images and reduce the end-to-end latency by 44.5%. We adopt a multi-task +learning framework to handle misaligned stereo inputs on HomoDepth, which +reduces the AbsRel error by 10.0-24.3%. The results demonstrate the efficacy of +our approaches in achieving both high model performance with low latency, which +makes a step forward toward practical depth estimation on future AR devices. + +
+
+
+
+
+ + ☆ DeepMedcast: A Deep Learning Method for Generating Intermediate Weather + Forecasts among Multiple NWP Models + + +
+ Numerical weather prediction (NWP) centers around the world operate a variety +of NWP models, and recent advances in AI-driven NWP models have increased the +availability of diverse NWP outputs. While this expansion holds the potential +to improve forecast accuracy, it also raises a critical challenge of +identifying the most reliable predictions for specific forecast scenarios. +Traditional approaches, such as ensemble or weighted averaging, combine +multiple NWP outputs but often generate unrealistic atmospheric fields, +complicating the production of reliable and consistent forecasts in operational +settings. In this study, we introduce DeepMedcast, a deep learning method that +generates an intermediate forecast, or "medcast", between two or more NWP outputs. +Unlike ensemble averaging, DeepMedcast can provide a consistent and explainable +medcast without distorting meteorological fields. This paper details the +methodology and case studies of DeepMedcast, discussing its advantages and +potential contributions to operational forecasting. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ DuSEGO: Dual Second-order Equivariant Graph Ordinary Differential + Equation + + +
+ Graph Neural Networks (GNNs) with equivariant properties have achieved +significant success in modeling complex dynamic systems and molecular +properties. However, their expressiveness ability is limited by: (1) Existing +methods often overlook the over-smoothing issue caused by traditional GNN +models, as well as the gradient explosion or vanishing problems in deep GNNs. +(2) Most models operate on first-order information, neglecting that the real +world often consists of second-order systems, which further limits the model's +representation capabilities. To address these issues, we propose the +Dual Second-order Equivariant Graph +Ordinary Differential Equation (DuSEGO) for equivariant +representation. Specifically, DuSEGO applies the dual second-order equivariant +graph ordinary differential equations (Graph ODEs) on graph embeddings and node +coordinates, simultaneously. Theoretically, we first prove that DuSEGO +maintains the equivariant property. Furthermore, we provide theoretical +insights showing that DuSEGO effectively alleviates the over-smoothing +problem in both feature representation and coordinate update. Additionally, we +demonstrate that the proposed DuSEGO mitigates the exploding and vanishing +gradients problem, facilitating the training of deep multi-layer GNNs. +Extensive experiments on benchmark datasets validate the superiority of the +proposed DuSEGO compared to baselines. + +
+
+
+
+
+ + ☆ Adaptive Non-Uniform Timestep Sampling for Diffusion Model Training + + +
+ As a highly expressive generative model, diffusion models have demonstrated +exceptional success across various domains, including image generation, natural +language processing, and combinatorial optimization. However, as data +distributions grow more complex, training these models to convergence becomes +increasingly computationally intensive. While diffusion models are typically +trained using uniform timestep sampling, our research shows that the variance +in stochastic gradients varies significantly across timesteps, with +high-variance timesteps becoming bottlenecks that hinder faster convergence. To +address this issue, we introduce a non-uniform timestep sampling method that +prioritizes these more critical timesteps. Our method tracks the impact of +gradient updates on the objective for each timestep, adaptively selecting those +most likely to minimize the objective effectively. Experimental results +demonstrate that this approach not only accelerates the training process, but +also leads to improved performance at convergence. Furthermore, our method +shows robust performance across various datasets, scheduling strategies, and +diffusion architectures, outperforming previously proposed timestep sampling +and weighting heuristics that lack this degree of robustness. + +
+
+
+
+
+ + ☆ Fully Dynamic Adversarially Robust Correlation Clustering in + Polylogarithmic Update Time + + +
+ We study the dynamic correlation clustering problem with $\textit{adaptive}$ +edge label flips. In correlation clustering, we are given a $n$-vertex complete +graph whose edges are labeled either $(+)$ or $(-)$, and the goal is to +minimize the total number of $(+)$ edges between clusters and the number of +$(-)$ edges within clusters. We consider the dynamic setting with adversarial +robustness, in which the $\textit{adaptive}$ adversary could flip the label of +an edge based on the current output of the algorithm. Our main result is a +randomized algorithm that always maintains an $O(1)$-approximation to the +optimal correlation clustering with $O(\log^{2}{n})$ amortized update time. +Prior to our work, no algorithm with $O(1)$-approximation and +$\text{polylog}{(n)}$ update time for the adversarially robust setting was +known. We further validate our theoretical results with experiments on +synthetic and real-world datasets with competitive empirical performances. Our +main technical ingredient is an algorithm that maintains $\textit{sparse-dense +decomposition}$ with $\text{polylog}{(n)}$ update time, which could be of +independent interest. + +
+
+
+
+
+ + ☆ Establishing and Evaluating Trustworthy AI: Overview and Research + Challenges + + +
+ Artificial intelligence (AI) technologies (re-)shape modern life, driving +innovation in a wide range of sectors. However, some AI systems have yielded +unexpected or undesirable outcomes or have been used in questionable manners. +As a result, there has been a surge in public and academic discussions about +aspects that AI systems must fulfill to be considered trustworthy. In this +paper, we synthesize existing conceptualizations of trustworthy AI along six +requirements: 1) human agency and oversight, 2) fairness and +non-discrimination, 3) transparency and explainability, 4) robustness and +accuracy, 5) privacy and security, and 6) accountability. For each one, we +provide a definition, describe how it can be established and evaluated, and +discuss requirement-specific research challenges. Finally, we conclude this +analysis by identifying overarching research challenges across the requirements +with respect to 1) interdisciplinary research, 2) conceptual clarity, 3) +context-dependency, 4) dynamics in evolving systems, and 5) investigations in +real-world contexts. Thus, this paper synthesizes and consolidates a +wide-ranging and active discussion currently taking place in various academic +sub-communities and public forums. It aims to serve as a reference for a broad +audience and as a basis for future research directions. + +
+
+ comment: Accepted in Frontiers in Big Data and AI, Research Topic: Towards + Fair AI for Trustworthy Artificial Intelligence +
+
+
+
+
+ + ☆ Dense ReLU Neural Networks for Temporal-spatial Model + + +
+ In this paper, we focus on fully connected deep neural networks utilizing the +Rectified Linear Unit (ReLU) activation function for nonparametric estimation. +We derive non-asymptotic bounds that lead to convergence rates, addressing both +temporal and spatial dependence in the observed measurements. By accounting for +dependencies across time and space, our models better reflect the complexities +of real-world data, enhancing both predictive performance and theoretical +robustness. We also tackle the curse of dimensionality by modeling the data on +a manifold, exploring the intrinsic dimensionality of high-dimensional data. We +broaden existing theoretical findings of temporal-spatial analysis by applying +them to neural networks in more general contexts and demonstrate that our proof +techniques are effective for models with short-range dependence. Our empirical +simulations across various synthetic response functions underscore the superior +performance of our method, outperforming established approaches in the existing +literature. These findings provide valuable insights into the strong +capabilities of dense neural networks for temporal-spatial modeling across a +broad range of function classes. + +
+
+
+
+
+ + ☆ Instruction-Guided Editing Controls for Images and Multimedia: A Survey + in LLM era + + +
+ The rapid advancement of large language models (LLMs) and multimodal learning +has transformed digital content creation and manipulation. Traditional visual +editing tools require significant expertise, limiting accessibility. Recent +strides in instruction-based editing have enabled intuitive interaction with +visual content, using natural language as a bridge between user intent and +complex editing operations. This survey provides an overview of these +techniques, focusing on how LLMs and multimodal models empower users to achieve +precise visual modifications without deep technical knowledge. By synthesizing +over 100 publications, we explore methods from generative adversarial networks +to diffusion models, examining multimodal integration for fine-grained content +control. We discuss practical applications across domains such as fashion, 3D +scene manipulation, and video synthesis, highlighting increased accessibility +and alignment with human intuition. Our survey compares existing literature, +emphasizing LLM-empowered editing, and identifies key challenges to stimulate +further research. We aim to democratize powerful visual editing across various +industries, from entertainment to education. Interested readers are encouraged +to access our repository at +https://github.com/tamlhp/awesome-instruction-editing. + +
+
+
+
+
+ + ☆ TEESlice: Protecting Sensitive Neural Network Models in Trusted + Execution Environments When Attackers have Pre-Trained Models + + +
+ Trusted Execution Environments (TEE) are used to safeguard on-device models. +However, directly employing TEEs to secure the entire DNN model is challenging +due to the limited computational speed. Utilizing GPU can accelerate DNN's +computation speed but commercial widely-available GPUs usually lack security +protection. To this end, scholars introduce TSDP, a method that protects +privacy-sensitive weights within TEEs and offloads insensitive weights to GPUs. +Nevertheless, current methods do not consider the presence of a knowledgeable +adversary who can access abundant publicly available pre-trained models and +datasets. This paper investigates the security of existing methods against such +a knowledgeable adversary and reveals their inability to fulfill their security +promises. Consequently, we introduce a novel partition before training +strategy, which effectively separates privacy-sensitive weights from other +components of the model. Our evaluation demonstrates that our approach can +offer full model protection with a computational cost reduced by a factor of +10. In addition to traditional CNN models, we also demonstrate the scalability +to large language models. Our approach can compress the private functionalities +of the large language model to lightweight slices and achieve the same level of +protection as the shielding-whole-model baseline. + +
+
+ comment: Accepted by TOSEM. Extended version of the S&P24 paper + (arXiv:2310.07152) +
+
+
+
+
+ + ☆ Zero-shot Voice Conversion with Diffusion Transformers + + +
+ Zero-shot voice conversion aims to transform a source speech utterance to +match the timbre of a reference speech from an unseen speaker. Traditional +approaches struggle with timbre leakage, insufficient timbre representation, +and mismatches between training and inference tasks. We propose Seed-VC, a +novel framework that addresses these issues by introducing an external timbre +shifter during training to perturb the source speech timbre, mitigating leakage +and aligning training with inference. Additionally, we employ a diffusion +transformer that leverages the entire reference speech context, capturing +fine-grained timbre features through in-context learning. Experiments +demonstrate that Seed-VC outperforms strong baselines like OpenVoice and +CosyVoice, achieving higher speaker similarity and lower word error rates in +zero-shot voice conversion tasks. We further extend our approach to zero-shot +singing voice conversion by incorporating fundamental frequency (F0) +conditioning, resulting in comparable performance to current state-of-the-art +methods. Our findings highlight the effectiveness of Seed-VC in overcoming core +challenges, paving the way for more accurate and versatile voice conversion +systems. + +
+
+
+
+
+ + ☆ Is Precise Recovery Necessary? A Task-Oriented Imputation Approach for + Time Series Forecasting on Variable Subset + + +
+ Variable Subset Forecasting (VSF) refers to a unique scenario in multivariate +time series forecasting, where available variables in the inference phase are +only a subset of the variables in the training phase. VSF presents significant +challenges as the entire time series may be missing, and neither inter- nor +intra-variable correlations persist. Such conditions impede the effectiveness +of traditional imputation methods, primarily focusing on filling in individual +missing data points. Inspired by the principle of feature engineering that not +all variables contribute positively to forecasting, we propose Task-Oriented +Imputation for VSF (TOI-VSF), a novel framework that shifts the focus from accurate +data recovery to directly supporting the downstream forecasting task. TOI-VSF +incorporates a self-supervised imputation module, agnostic to the forecasting +model, designed to fill in missing variables while preserving the vital +characteristics and temporal patterns of time series data. Additionally, we +implement a joint learning strategy for imputation and forecasting, ensuring +that the imputation process is directly aligned with and beneficial to the +forecasting objective. Extensive experiments across four datasets demonstrate +the superiority of TOI-VSF, outperforming baseline methods by $15\%$ on +average. + +
+
+
+
+
+ + ☆ Physics-informed Machine Learning for Battery Pack Thermal Management + + +
+ With the popularity of electric vehicles, the demand for lithium-ion +batteries is increasing. Temperature significantly influences the performance +and safety of batteries. Battery thermal management systems can effectively +control the temperature of batteries; therefore, the performance and safety can +be ensured. However, the development process of battery thermal management +systems is time-consuming and costly due to the extensive training dataset +needed by data-driven models requiring enormous computational costs for finite +element analysis. Therefore, a new approach to constructing surrogate models is +needed in the era of AI. Physics-informed machine learning enforces the +physical laws in surrogate models, making it the perfect candidate for +estimating battery pack temperature distribution. In this study, we first +developed a 21700 battery pack indirect liquid cooling system with cold plates +on the top and bottom with thermal paste surrounding the battery cells. Then, +the simplified finite element model was built based on experiment results. Due +to the high coolant flow rate, the cold plates can be considered as constant +temperature boundaries, while battery cells are the heat sources. The +physics-informed convolutional neural network served as a surrogate model to +estimate the temperature distribution of the battery pack. The loss function +was constructed considering the heat conduction equation based on the finite +difference method. The physics-informed loss function helped the convergence of +the training process with less data. As a result, the physics-informed +convolutional neural network showed more than 15 percent improvement in +accuracy compared to the data-driven method with the same training data. + +
+
+
+
+
+ + ☆ Self-Supervised Learning of Grasping Arbitrary Objects On-the-Move + + +
+ Mobile grasping enhances manipulation efficiency by utilizing robots' +mobility. This study aims to enable a commercial off-the-shelf robot for mobile +grasping, requiring precise timing and pose adjustments. Self-supervised +learning can develop a generalizable policy to adjust the robot's velocity and +determine grasp position and orientation based on the target object's shape and +pose. Due to mobile grasping's complexity, action primitivization and +step-by-step learning are crucial to avoid data sparsity in learning from trial +and error. This study simplifies mobile grasping into two grasp action +primitives and a moving action primitive, which can be operated with limited +degrees of freedom for the manipulator. This study introduces three fully +convolutional neural network (FCN) models to predict static grasp primitive, +dynamic grasp primitive, and residual moving velocity error from visual inputs. +A two-stage grasp learning approach facilitates seamless FCN model learning. +The ablation study demonstrated that the proposed method achieved the highest +grasping accuracy and pick-and-place efficiency. Furthermore, randomizing +object shapes and environments in the simulation effectively achieved +generalizable mobile grasping. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Statistical Analysis of Policy Space Compression Problem + + +
+ Policy search methods are crucial in reinforcement learning, offering a +framework to address continuous state-action and partially observable problems. +However, the complexity of exploring vast policy spaces can lead to significant +inefficiencies. Reducing the policy space through policy compression emerges as +a powerful, reward-free approach to accelerate the learning process. This +technique condenses the policy space into a smaller, representative set while +maintaining most of the original effectiveness. Our research focuses on +determining the necessary sample size to learn this compressed set accurately. +We employ R\'enyi divergence to measure the similarity between true and +estimated policy distributions, establishing error bounds for good +approximations. To simplify the analysis, we employ the $l_1$ norm, determining +sample size requirements for both model-based and model-free settings. Finally, +we correlate the error bounds from the $l_1$ norm with those from R\'enyi +divergence, distinguishing between policies near the vertices and those in the +middle of the policy space, to determine the lower and upper bounds for the +required sample sizes. + +
+
+
+
+
+ + ☆ Revealing the Evolution of Order in Materials Microstructures Using + Multi-Modal Computer Vision + + +
+ The development of high-performance materials for microelectronics, energy +storage, and extreme environments depends on our ability to describe and direct +property-defining microstructural order. Our present understanding is typically +derived from laborious manual analysis of imaging and spectroscopy data, which +is difficult to scale, challenging to reproduce, and lacks the ability to +reveal latent associations needed for mechanistic models. Here, we demonstrate +a multi-modal machine learning (ML) approach to describe order from electron +microscopy analysis of the complex oxide La$_{1-x}$Sr$_x$FeO$_3$. We construct +a hybrid pipeline based on fully and semi-supervised classification, allowing +us to evaluate both the characteristics of each data modality and the value +each modality adds to the ensemble. We observe distinct differences in the +performance of uni- and multi-modal models, from which we draw general lessons +in describing crystal order using computer vision. + +
+
+ comment: 30 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Deep learning robotics using self-supervised spatial differentiation + drive autonomous contact-based semiconductor characterization + + +
+ Integrating autonomous contact-based robotic characterization into +self-driving laboratories can enhance measurement quality, reliability, and +throughput. While deep learning models support robust autonomy, current methods +lack pixel-precision positioning and require extensive labeled data. To +overcome these challenges, we propose a self-supervised convolutional neural +network with a spatially differentiable loss function, incorporating shape +priors to refine predictions of optimal robot contact poses for semiconductor +characterization. This network improves valid pose generation by 20.0%, +relative to existing models. We demonstrate our network's performance by +driving a 4-degree-of-freedom robot to characterize photoconductivity at 3,025 +predicted poses across a gradient of perovskite compositions, achieving +throughputs over 125 measurements per hour. Spatially mapping photoconductivity +onto each drop-casted film reveals regions of inhomogeneity. With this +self-supervised deep learning-driven robotic system, we enable high-precision +and reliable automation of contact-based characterization techniques at high +throughputs, thereby allowing the measurement of previously inaccessible yet +important semiconductor properties for self-driving laboratories. + +
+
+
+
+
+ + ☆ Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward + Augmented Imitation + + +
+ Training a policy in a source domain for deployment in the target domain +under a dynamics shift can be challenging, often resulting in performance +degradation. Previous work tackles this challenge by training on the source +domain with modified rewards derived by matching distributions between the +source and the target optimal trajectories. However, pure modified rewards only +ensure the behavior of the learned policy in the source domain resembles +trajectories produced by the target optimal policies, which does not guarantee +optimal performance when the learned policy is actually deployed to the target +domain. In this work, we propose to utilize imitation learning to transfer the +policy learned from the reward modification to the target domain so that the +new policy can generate the same trajectories in the target domain. Our +approach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL), +utilizes the reward modification for domain adaptation and follows the general +framework of generative adversarial imitation learning from observation (GAIfO) +by applying a reward augmented estimator for the policy optimization step. +Theoretically, we present an error bound for our method under a mild assumption +regarding the dynamics shift to justify the motivation of our method. +Empirically, our method outperforms the pure modified reward method without +imitation learning and also outperforms other baselines in benchmark +off-dynamics environments. + +
+
+ comment: Published at Neurips 2024 +
+
+
+
+
+ + ☆ InvestESG: A multi-agent reinforcement learning benchmark for studying + climate investment as a social dilemma + + +
+ InvestESG is a novel multi-agent reinforcement learning (MARL) benchmark +designed to study the impact of Environmental, Social, and Governance (ESG) +disclosure mandates on corporate climate investments. Supported by both PyTorch +and GPU-accelerated JAX framework, the benchmark models an intertemporal social +dilemma where companies balance short-term profit losses from climate +mitigation efforts and long-term benefits from reducing climate risk, while +ESG-conscious investors attempt to influence corporate behavior through their +investment decisions. Companies allocate capital across mitigation, +greenwashing, and resilience, with varying strategies influencing climate +outcomes and investor preferences. Our experiments show that without +ESG-conscious investors with sufficient capital, corporate mitigation efforts +remain limited under the disclosure mandate. However, when a critical mass of +investors prioritizes ESG, corporate cooperation increases, which in turn +reduces climate risks and enhances long-term financial stability. Additionally, +providing more information about global climate risks encourages companies to +invest more in mitigation, even without investor involvement. Our findings +align with empirical research using real-world data, highlighting MARL's +potential to inform policy by providing insights into large-scale +socio-economic challenges through efficient testing of alternative policy and +market designs. + +
+
+
+
+
+ + ☆ Fair Secretaries with Unfair Predictions NeurIPS 2024 + + +
+ Algorithms with predictions is a recent framework for decision-making under +uncertainty that leverages the power of machine-learned predictions without +making any assumption about their quality. The goal in this framework is for +algorithms to achieve an improved performance when the predictions are accurate +while maintaining acceptable guarantees when the predictions are erroneous. A +serious concern with algorithms that use predictions is that these predictions +can be biased and, as a result, cause the algorithm to make decisions that are +deemed unfair. We show that this concern manifests itself in the classical +secretary problem in the learning-augmented setting -- the state-of-the-art +algorithm can have zero probability of accepting the best candidate, which we +deem unfair, despite promising to accept a candidate whose expected value is at +least $\max\{\Omega (1) , 1 - O(\epsilon)\}$ times the optimal value, where +$\epsilon$ is the prediction error. We show how to preserve this promise while +also guaranteeing to accept the best candidate with probability $\Omega(1)$. +Our algorithm and analysis are based on a new "pegging" idea that diverges from +existing works and simplifies/unifies some of their results. Finally, we extend +to the $k$-secretary problem and complement our theoretical analysis with +experiments. + +
+
+ comment: to appear at NeurIPS 2024 +
+
+
+
+
+ + ☆ KULCQ: An Unsupervised Keyword-based Utterance Level Clustering Quality + Metric + + +
+ Intent discovery is crucial for both building new conversational agents and +improving existing ones. While several approaches have been proposed for intent +discovery, most rely on clustering to group similar utterances together. +Traditional evaluation of these utterance clusters requires intent labels for +each utterance, limiting scalability. Although some clustering quality metrics +exist that do not require labeled data, they focus solely on cluster geometry +while ignoring the linguistic nuances present in conversational transcripts. In +this paper, we introduce Keyword-based Utterance Level Clustering Quality +(KULCQ), an unsupervised metric that leverages keyword analysis to evaluate +clustering quality. We demonstrate KULCQ's effectiveness by comparing it with +existing unsupervised clustering metrics and validate its performance through +comprehensive ablation studies. Our results show that KULCQ better captures +semantic relationships in conversational data while maintaining consistency +with geometric clustering principles. + +
+
+
+
+
+ + ☆ InterFormer: Towards Effective Heterogeneous Interaction Learning for + Click-Through Rate Prediction + + +
+ Click-through rate (CTR) prediction, which predicts the probability of a user +clicking an ad, is a fundamental task in recommender systems. The emergence of +heterogeneous information, such as user profile and behavior sequences, depicts +user interests from different aspects. A mutually beneficial integration of +heterogeneous information is the cornerstone towards the success of CTR +prediction. However, most of the existing methods suffer from two fundamental +limitations, including (1) insufficient inter-mode interaction due to the +unidirectional information flow between modes, and (2) aggressive information +aggregation caused by early summarization, resulting in excessive information +loss. To address the above limitations, we propose a novel module named +InterFormer to learn heterogeneous information interaction in an interleaving +style. To achieve better interaction learning, InterFormer enables +bidirectional information flow for mutually beneficial learning across +different modes. To avoid aggressive information aggregation, we retain +complete information in each data mode and use a separate bridging arch for +effective information selection and summarization. Our proposed InterFormer +achieves state-of-the-art performance on three public datasets and a +large-scale industrial dataset. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ SymbolFit: Automatic Parametric Modeling with Symbolic Regression + + +
+ We introduce SymbolFit, a framework that automates parametric modeling by +using symbolic regression to perform a machine-search for functions that fit +the data, while simultaneously providing uncertainty estimates in a single run. +Traditionally, constructing a parametric model to accurately describe binned +data has been a manual and iterative process, requiring an adequate functional +form to be determined before the fit can be performed. The main challenge +arises when the appropriate functional forms cannot be derived from first +principles, especially when there is no underlying true closed-form function +for the distribution. In this work, we address this problem by utilizing +symbolic regression, a machine learning technique that explores a vast space of +candidate functions without needing a predefined functional form, treating the +functional form itself as a trainable parameter. Our approach is demonstrated +in data analysis applications in high-energy physics experiments at the CERN +Large Hadron Collider (LHC). We demonstrate its effectiveness and efficiency +using five real proton-proton collision datasets from new physics searches at +the LHC, namely the background modeling in resonance searches for high-mass +dijet, trijet, paired-dijet, diphoton, and dimuon events. We also validate the +framework using several toy datasets with one and more variables. + +
+
+ comment: 53 pages, 35 figures. Under review +
+
+
+
+
+ + ☆ Enhancing Diffusion Posterior Sampling for Inverse Problems by + Integrating Crafted Measurements + + +
+ Diffusion models have emerged as a powerful foundation model for visual +generation. With an appropriate sampling process, it can effectively serve as a +generative prior to solve general inverse problems. Current posterior sampling +based methods take the measurement (i.e., degraded image sample) into the +posterior sampling to infer the distribution of the target data (i.e., clean +image sample). However, in this manner, we show that high-frequency information +can be prematurely introduced during the early stages, which could induce +larger posterior estimate errors during the restoration sampling. To address +this issue, we first reveal that forming the log posterior gradient with the +noisy measurement (i.e., samples from a diffusion forward process) instead of +the clean one can benefit the reverse process. Consequently, we propose a novel +diffusion posterior sampling method DPS-CM, which incorporates a Crafted +Measurement (i.e., samples generated by a reverse denoising process, compared +to random sampling with noise in standard methods) to form the posterior +estimate. This integration aims to mitigate the misalignment with the diffusion +prior caused by cumulative posterior estimate errors. Experimental results +demonstrate that our approach significantly improves the overall capacity to +solve general and noisy inverse problems, such as Gaussian deblurring, +super-resolution, inpainting, nonlinear deblurring, and tasks with Poisson +noise, relative to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Learning Diffusion Priors from Observations by Expectation Maximization + + +
+ Diffusion models recently proved to be remarkable priors for Bayesian inverse +problems. However, training these models typically requires access to large +amounts of clean data, which could prove difficult in some settings. In this +work, we present a novel method based on the expectation-maximization algorithm +for training diffusion models from incomplete and noisy observations only. +Unlike previous works, our method leads to proper diffusion models, which is +crucial for downstream tasks. As part of our method, we propose and motivate an +improved posterior sampling scheme for unconditional diffusion models. We +present empirical evidence supporting the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Demo: Multi-Modal Seizure Prediction System + + +
+ This demo presents SeizNet, an innovative system for predicting epileptic +seizures benefiting from a multi-modal sensor network and utilizing Deep +Learning (DL) techniques. Epilepsy affects approximately 65 million people +worldwide, many of whom experience drug-resistant seizures. SeizNet aims at +providing highly accurate alerts, allowing individuals to take preventive +measures without being disturbed by false alarms. SeizNet uses a combination of +data collected through either invasive (intracranial electroencephalogram +(iEEG)) or non-invasive (electroencephalogram (EEG) and electrocardiogram +(ECG)) sensors, and processed by advanced DL algorithms that are optimized for +real-time inference at the edge, ensuring privacy and minimizing data +transmission. SeizNet achieves > 97% accuracy in seizure prediction while +keeping the size and energy restrictions of an implantable device. + +
+
+ comment: 1 page, 1 figure, Proceedings of the IEEE 20th International + Conference on Body Sensor Networks (BSN), October 2024 +
+
+
+
+
+ + ♻ ☆ Swarm Characteristics Classification Using Neural Networks + + +
+ Understanding the characteristics of swarming autonomous agents is critical +for defense and security applications. This article presents a study on using +supervised neural network time series classification (NN TSC) to predict key +attributes and tactics of swarming autonomous agents for military contexts. +Specifically, NN TSC is applied to infer two binary attributes - communication +and proportional navigation - which combine to define four mutually exclusive +swarm tactics. We identify a gap in literature on using NNs for swarm +classification and demonstrate the effectiveness of NN TSC in rapidly deducing +intelligence about attacking swarms to inform counter-maneuvers. Through +simulated swarm-vs-swarm engagements, we evaluate NN TSC performance in terms +of observation window requirements, noise robustness, and scalability to swarm +size. Key findings show NNs can predict swarm behaviors with 97% accuracy using +short observation windows of 20 time steps, while also demonstrating graceful +degradation down to 80% accuracy under 50% noise, as well as excellent +scalability to swarm sizes from 10 to 100 agents. These capabilities are +promising for real-time decision-making support in defense scenarios by rapidly +inferring insights about swarm behavior. + +
+
+ comment: Added funding acknowledgment and author bios +
+
+
+
+
+ + ♻ ☆ Coniferest: a complete active anomaly detection framework + + +
+ We present coniferest, an open source generic purpose active anomaly +detection framework written in Python. The package design and implemented +algorithms are described. Currently, static outlier detection analysis is +supported via the Isolation forest algorithm. Moreover, Active Anomaly +Discovery (AAD) and Pineforest algorithms are available to tackle active +anomaly detection problems. The algorithms and package performance are +evaluated on a series of synthetic datasets. We also describe a few success +cases which resulted from applying the package to real astronomical data in +active anomaly detection tasks within the SNAD project. + +
+
+ comment: 13 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Recurrent Neural Goodness-of-Fit Test for Time Series + + +
+ Time series data are crucial across diverse domains such as finance and +healthcare, where accurate forecasting and decision-making rely on advanced +modeling techniques. While generative models have shown great promise in +capturing the intricate dynamics inherent in time series, evaluating their +performance remains a major challenge. Traditional evaluation metrics fall +short due to the temporal dependencies and potential high dimensionality of the +features. In this paper, we propose the REcurrent NeurAL (RENAL) +Goodness-of-Fit test, a novel and statistically rigorous framework for +evaluating generative time series models. By leveraging recurrent neural +networks, we transform the time series into conditionally independent data +pairs, enabling the application of a chi-square-based goodness-of-fit test to +the temporal dependencies within the data. This approach offers a robust, +theoretically grounded solution for assessing the quality of generative models, +particularly in settings with limited time sequences. We demonstrate the +efficacy of our method across both synthetic and real-world datasets, +outperforming existing methods in terms of reliability and accuracy. Our method +fills a critical gap in the evaluation of time series generative models, +offering a tool that is both practical and adaptable to high-stakes +applications. + +
+
+ comment: 27 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Risk Sources and Risk Management Measures in Support of Standards for + General-Purpose AI Systems + + +
+ There is an urgent need to identify both short and long-term risks from newly +emerging types of Artificial Intelligence (AI), as well as available risk +management measures. In response, and to support global efforts in regulating +AI and writing safety standards, we compile an extensive catalog of risk +sources and risk management measures for general-purpose AI (GPAI) systems, +complete with descriptions and supporting examples where relevant. This work +involves identifying technical, operational, and societal risks across model +development, training, and deployment stages, as well as surveying established +and experimental methods for managing these risks. To the best of our +knowledge, this paper is the first of its kind to provide extensive +documentation of both GPAI risk sources and risk management measures that are +descriptive, self-contained and neutral with respect to any existing regulatory +framework. This work intends to help AI providers, standards experts, +researchers, policymakers, and regulators in identifying and mitigating +systemic risks from GPAI systems. For this reason, the catalog is released +under a public domain license for ease of direct use by stakeholders in AI +governance and standards. + +
+
+ comment: 92 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Training Deep 3D Convolutional Neural Networks to Extract BSM Physics + Parameters Directly from HEP Data: a Proof-of-Concept Study Using Monte Carlo + Simulations + + +
+ We report on a novel application of computer vision techniques to extract +beyond the Standard Model parameters directly from high energy physics flavor +data. We propose a simple but novel data representation that transforms the +angular and kinematic distributions into "quasi-images", which are used to +train a convolutional neural network to perform regression tasks, similar to +fitting. As a proof-of-concept, we train a 34-layer Residual Neural Network to +regress on these images and determine information about the Wilson Coefficient +$C_{9}$ in Monte Carlo simulations of $B^0 \rightarrow K^{*0}\mu^{+}\mu^{-}$ +decays. The method described here can be generalized and may find applicability +across a variety of experiments. + +
+
+
+
+
+ + ♻ ☆ Open LLMs are Necessary for Current Private Adaptations and Outperform + their Closed Alternatives NeurIPS 2024 + + +
+ While open Large Language Models (LLMs) have made significant progress, they +still fall short of matching the performance of their closed, proprietary +counterparts, making the latter attractive even for the use on highly private +data. Recently, various new methods have been proposed to adapt closed LLMs to +private data without leaking private information to third parties and/or the +LLM provider. In this work, we analyze the privacy protection and performance +of the four most recent methods for private adaptation of closed LLMs. By +examining their threat models and thoroughly comparing their performance under +different privacy levels according to differential privacy (DP), various LLM +architectures, and multiple datasets for classification and generation tasks, +we find that: (1) all the methods leak query data, i.e., the (potentially +sensitive) user data that is queried at inference time, to the LLM provider, +(2) three out of four methods also leak large fractions of private training +data to the LLM provider while the method that protects private data requires a +local open LLM, (3) all the methods exhibit lower performance compared to three +private gradient-based adaptation methods for local open LLMs, and (4) the +private adaptation methods for closed LLMs incur higher monetary training and +query costs than running the alternative methods on local open LLMs. This +yields the conclusion that, to achieve truly privacy-preserving LLM adaptations +that yield high performance and more privacy at lower costs, taking into +account current methods and models, one should use open LLMs. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ CE-SSL: Computation-Efficient Semi-Supervised Learning for ECG-based + Cardiovascular Diseases Detection + + +
+ The label scarcity problem is the main challenge that hinders the wide +application of deep learning systems in automatic cardiovascular diseases +(CVDs) detection using electrocardiography (ECG). Tuning pre-trained models +alleviates this problem by transferring knowledge learned from large datasets +to downstream small datasets. However, bottlenecks in computational efficiency +and detection performance limit its clinical applications. It is difficult to +improve the detection performance without significantly sacrificing the +computational efficiency during model training. Here, we propose a +computation-efficient semi-supervised learning paradigm (CE-SSL) for robust and +computation-efficient CVDs detection using ECG. It enables a robust adaptation +of pre-trained models on downstream datasets with limited supervision and high +computational efficiency. First, a random-deactivation technique is developed +to achieve robust and fast low-rank adaptation of pre-trained weights. +Subsequently, we propose a one-shot rank allocation module to determine the +optimal ranks for the update matrices of the pre-trained weights. Finally, a +lightweight semi-supervised learning pipeline is introduced to enhance model +performance by leveraging labeled and unlabeled data with high computational +efficiency. Extensive experiments on four downstream datasets demonstrate that +CE-SSL not only outperforms the state-of-the-art methods in multi-label CVDs +detection but also consumes fewer GPU footprints, training time, and parameter +storage space. As such, this paradigm provides an effective solution for +achieving high computational efficiency and robust detection performance in the +clinical applications of pre-trained models under limited supervision. Code and +Supplementary Materials are available at https://github.com/KAZABANA/CE-SSL + +
+
+
+
+
+ + ♻ ☆ Label Cluster Chains for Multi-Label Classification + + +
+ Multi-label classification is a type of supervised machine learning that can +simultaneously assign multiple labels to an instance. To solve this task, some +methods divide the original problem into several sub-problems (local approach), +others learn all labels at once (global approach), and others combine several +classifiers (ensemble approach). Regardless of the approach used, exploring and +learning label correlations is important to improve the classifier predictions. +Ensemble of Classifier Chains (ECC) is a well-known multi-label method that +considers label correlations and can achieve good overall performance on +several multi-label datasets and evaluation measures. However, one of the +challenges when working with ECC is the high dimensionality of the label space, +which can impose limitations for fully-cascaded chains as the complexity +increases regarding feature space expansion. To improve classifier chains, we +propose a method to chain disjoint correlated label clusters obtained by +applying a partition method in the label space. During the training phase, the +ground truth labels of each cluster are used as new features for all of the +following clusters. During the test phase, the predicted labels of clusters are +used as new features for all the following clusters. Our proposal, called Label +Cluster Chains for Multi-Label Classification (LCC-ML), uses multi-label Random +Forests as base classifiers in each cluster, combining their predictions to +obtain a final multi-label classification. Our proposal obtained better results +compared to the original ECC. This shows that learning and chaining disjoint +correlated label clusters can better explore and learn label correlations. + +
+
+ comment: The article was submitted prematurely, and after it was published on + arXiv, we identified aspects that require attention, adjustments, and + improvements. We are working to review and significantly improve the content. + Therefore, we request its temporary withdrawal to avoid the dissemination of + information that may be incomplete or incorrectly interpreted +
+
+
+
+
+ + ♻ ☆ Improved Canonicalization for Model Agnostic Equivariance CVPR 2024 + + +
+ This work introduces a novel approach to achieving architecture-agnostic +equivariance in deep learning, particularly addressing the limitations of +traditional layerwise equivariant architectures and the inefficiencies of the +existing architecture-agnostic methods. Building equivariant models using +traditional methods requires designing equivariant versions of existing models +and training them from scratch, a process that is both impractical and +resource-intensive. Canonicalization has emerged as a promising alternative for +inducing equivariance without altering model architecture, but it suffers from +the need for highly expressive and expensive equivariant networks to learn +canonical orientations accurately. We propose a new optimization-based method +that employs any non-equivariant network for canonicalization. Our method uses +contrastive learning to efficiently learn a canonical orientation and offers +more flexibility for the choice of canonicalization network. We empirically +demonstrate that this approach outperforms existing methods in achieving +equivariance for large pretrained models and significantly speeds up the +canonicalization process, making it up to 2 times faster. + +
+
+ comment: Accepted to EquiVision workshop, CVPR 2024. 8 pages, 2 figures, 2 + tables +
+
+
+
+
+ + ♻ ☆ Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply + Better Samples NeurIPS 2024 + + +
+ Although diffusion models can generate remarkably high-quality samples, they +are intrinsically bottlenecked by their expensive iterative sampling procedure. +Consistency models (CMs) have recently emerged as a promising diffusion model +distillation method, reducing the cost of sampling by generating high-fidelity +samples in just a few iterations. Consistency model distillation aims to solve +the probability flow ordinary differential equation (ODE) defined by an +existing diffusion model. CMs are not directly trained to minimize error +against an ODE solver, rather they use a more computationally tractable +objective. As a way to study how effectively CMs solve the probability flow +ODE, and the effect that any induced error has on the quality of generated +samples, we introduce Direct CMs, which \textit{directly} minimize this error. +Intriguingly, we find that Direct CMs reduce the ODE solving error compared to +CMs but also result in significantly worse sample quality, calling into +question why exactly CMs work well in the first place. Full code is available +at: https://github.com/layer6ai-labs/direct-cms. + +
+
+ comment: NeurIPS 2024 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric + Thermal Images + + +
+ Designing egocentric 3D hand pose estimation systems that can perform +reliably in complex, real-world scenarios is crucial for downstream +applications. Previous approaches using RGB or NIR imagery struggle in +challenging conditions: RGB methods are susceptible to lighting variations and +obstructions like handwear, while NIR techniques can be disrupted by sunlight +or interference from other NIR-equipped devices. To address these limitations, +we present ThermoHands, the first benchmark focused on thermal image-based +egocentric 3D hand pose estimation, demonstrating the potential of thermal +imaging to achieve robust performance under these conditions. The benchmark +includes a multi-view and multi-spectral dataset collected from 28 subjects +performing hand-object and hand-virtual interactions under diverse scenarios, +accurately annotated with 3D hand poses through an automated process. We +introduce a new baseline method, TherFormer, utilizing dual transformer modules +for effective egocentric 3D hand pose estimation in thermal imagery. Our +experimental results highlight TherFormer's leading performance and affirm +thermal imaging's effectiveness in enabling robust 3D hand pose estimation in +adverse conditions. + +
+
+ comment: 15 pages, 9 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Unlocking Real-Time Fluorescence Lifetime Imaging: Multi-Pixel + Parallelism for FPGA-Accelerated Processing + + +
+ Fluorescence lifetime imaging (FLI) is a widely used technique in the +biomedical field for measuring the decay times of fluorescent molecules, +providing insights into metabolic states, protein interactions, and +ligand-receptor bindings. However, its broader application in fast biological +processes, such as dynamic activity monitoring, and clinical use, such as in +guided surgery, is limited by long data acquisition times and computationally +demanding data processing. While deep learning has reduced post-processing +times, time-resolved data acquisition remains a bottleneck for real-time +applications. To address this, we propose a method to achieve real-time FLI +using an FPGA-based hardware accelerator. Specifically, we implemented a +GRU-based sequence-to-sequence (Seq2Seq) model on an FPGA board compatible with +time-resolved cameras. The GRU model balances accurate processing with the +resource constraints of FPGAs, which have limited DSP units and BRAM. The +limited memory and computational resources on the FPGA require efficient +scheduling of operations and memory allocation to deploy deep learning models +for low-latency applications. We address these challenges by using STOMP, a +queue-based discrete-event simulator that automates and optimizes task +scheduling and memory management on hardware. By integrating a GRU-based +Seq2Seq model and its compressed version, called Seq2SeqLite, generated through +knowledge distillation, we were able to process multiple pixels in parallel, +reducing latency compared to sequential processing. We explore various levels +of parallelism to achieve an optimal balance between performance and resource +utilization. Our results indicate that the proposed techniques achieved a 17.7x +and 52.0x speedup over manual scheduling for the Seq2Seq model and the +Seq2SeqLite model, respectively. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Harnessing Machine Learning for Single-Shot Measurement of Free Electron + Laser Pulse Power NeurIPS 2024 + + +
+ Electron beam accelerators are essential in many scientific and technological +fields. Their operation relies heavily on the stability and precision of the +electron beam. Traditional diagnostic techniques encounter difficulties in +addressing the complex and dynamic nature of electron beams. Particularly in +the context of free-electron lasers (FELs), it is fundamentally impossible to +measure the lasing-on and lasing-off electron power profiles for a single +electron bunch. This is a crucial hurdle in the exact reconstruction of the +photon pulse profile. To overcome this hurdle, we developed a machine learning +model that predicts the temporal power profile of the electron bunch in the +lasing-off regime using machine parameters that can be obtained when lasing is +on. The model was statistically validated and showed superior predictions +compared to the state-of-the-art batch calibrations. The work we present here +is a critical element for a virtual pulse reconstruction diagnostic (VPRD) tool +designed to reconstruct the power profile of individual photon pulses without +requiring repeated measurements in the lasing-off regime. This promises to +significantly enhance the diagnostic capabilities in FELs at large. + +
+
+ comment: 10 pages, 4 figures, Machine Learning and the Physical Sciences + Workshop, NeurIPS 2024 https://neurips.cc/virtual/2024/100009 +
+
+
+
+
+ + ♻ ☆ CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for + Optimized Learning Fusion + + +
+ State-of-the-art pre-trained image models predominantly adopt a two-stage +approach: initial unsupervised pre-training on large-scale datasets followed by +task-specific fine-tuning using Cross-Entropy loss (CE). However, it has been +demonstrated that CE can compromise model generalization and stability. While +recent works employing contrastive learning address some of these limitations +by enhancing the quality of embeddings and producing better decision +boundaries, they often overlook the importance of hard negative mining and rely +on resource intensive and slow training using large sample batches. To counter +these issues, we introduce a novel approach named CLCE, which integrates +Label-Aware Contrastive Learning with CE. Our approach not only maintains the +strengths of both loss functions but also leverages hard negative mining in a +synergistic way to enhance performance. Experimental results demonstrate that +CLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks, +achieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in +transfer learning settings with the BEiT-3 model. Importantly, our proposed +CLCE approach effectively mitigates the dependency of contrastive learning on +large batch sizes such as 4096 samples per batch, a limitation that has +previously constrained the application of contrastive learning in +budget-limited hardware environments. + +
+
+
+
+
+ + ♻ ☆ Learning rheological parameters of non-Newtonian fluids from velocimetry + data + + +
+ We solve a Bayesian inverse Navier-Stokes (N-S) problem that assimilates +velocimetry data in order to jointly reconstruct the flow field and learn the +unknown N-S parameters. By incorporating a Carreau shear-thinning viscosity +model into the N-S problem, we devise an algorithm that learns the most likely +Carreau parameters of a shear-thinning fluid, and estimates their +uncertainties, from velocimetry data alone. We then conduct a flow-MRI +experiment to obtain velocimetry data of an axisymmetric laminar jet through an +idealised medical device (FDA nozzle) for a blood analogue fluid. We show that +the algorithm can successfully reconstruct the flow field by learning the most +likely Carreau parameters, and that the learned parameters are in very good +agreement with rheometry measurements. The algorithm accepts any algebraic +effective viscosity model, as long as the model is differentiable, and it can +be extended to more complicated non-Newtonian fluids (e.g. Oldroyd-B fluid) if +a viscoelastic model is incorporated into the N-S problem. + +
+
+
+
+
+ + ♻ ☆ GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for + High-Throughput Omics Data Analysis and Visualization + + +
+ The surge in high-throughput omics data has reshaped the landscape of +biological research, underlining the need for powerful, user-friendly data +analysis and interpretation tools. This paper presents GenoCraft, a web-based +comprehensive software solution designed to handle the entire pipeline of omics +data processing. GenoCraft offers a unified platform featuring advanced +bioinformatics tools, covering all aspects of omics data analysis. It +encompasses a range of functionalities, such as normalization, quality control, +differential analysis, network analysis, pathway analysis, and diverse +visualization techniques. This software makes state-of-the-art omics data +analysis more accessible to a wider range of users. With GenoCraft, researchers +and data scientists have access to an array of cutting-edge bioinformatics +tools under a user-friendly interface, making it a valuable resource for +managing and analyzing large-scale omics data. The API with an interactive web +interface is publicly available at https://genocraft.stanford.edu/. We also +release all the codes in https://github.com/futianfan/GenoCraft. + +
+
+
+
+
+ + ♻ ☆ A Survey on State-of-the-art Deep Learning Applications and Challenges + + +
+ Deep learning, a branch of artificial intelligence, is a data-driven method +that uses multiple layers of interconnected units (neurons) to learn intricate +patterns and representations directly from raw input data. Empowered by this +learning capability, it has become a powerful tool for solving complex problems +and is the core driver of many groundbreaking technologies and innovations. +Building a deep learning model is challenging due to the algorithm's complexity +and the dynamic nature of real-world problems. Several studies have reviewed +deep learning concepts and applications. However, the studies mostly focused on +the types of deep learning models and convolutional neural network +architectures, offering limited coverage of the state-of-the-art deep learning +models and their applications in solving complex problems across different +domains. Therefore, motivated by the limitations, this study aims to +comprehensively review the state-of-the-art deep learning models in computer +vision, natural language processing, time series analysis and pervasive +computing. We highlight the key features of the models and their effectiveness +in solving the problems within each domain. Furthermore, this study presents +the fundamentals of deep learning, various deep learning model types and +prominent convolutional neural network architectures. Finally, challenges and +future directions in deep learning research are discussed to offer a broader +perspective for future researchers. + +
+
+
+
+
+ + ♻ ☆ Supra-Laplacian Encoding for Transformer on Dynamic Graphs + + +
+ Fully connected Graph Transformers (GT) have rapidly become prominent in the +static graph community as an alternative to Message-Passing models, which +suffer from a lack of expressivity, oversquashing, and under-reaching. However, +in a dynamic context, by interconnecting all nodes at multiple snapshots with +self-attention, GT lose both structural and temporal information. In this +work, we introduce Supra-LAplacian encoding for spatio-temporal TransformErs +(SLATE), a new spatio-temporal encoding to leverage the GT architecture while +keeping spatio-temporal information. Specifically, we transform Discrete Time +Dynamic Graphs into multi-layer graphs and take advantage of the spectral +properties of their associated supra-Laplacian matrix. Our second contribution +explicitly models nodes' pairwise relationships with a cross-attention +mechanism, providing an accurate edge representation for dynamic link +prediction. SLATE outperforms numerous state-of-the-art methods based on +Message-Passing Graph Neural Networks combined with recurrent models (e.g., +LSTM), and Dynamic Graph Transformers, on 9 datasets. Code is available at: +github.com/ykrmm/SLATE. + +
+
+
+
+
+ + ♻ ☆ DEEP-IoT: Downlink-Enhanced Efficient-Power Internet of Things + + +
+ At the heart of the Internet of Things (IoT) -- a domain witnessing explosive +growth -- the imperative for energy efficiency and the extension of device +lifespans has never been more pressing. This paper presents DEEP-IoT, an +innovative communication paradigm poised to redefine how IoT devices +communicate. Through a pioneering feedback channel coding strategy, DEEP-IoT +challenges and transforms the traditional transmitter (IoT devices)-centric +communication model to one where the receiver (the access point) plays a pivotal +role, thereby cutting down energy use and boosting device longevity. We not +only conceptualize DEEP-IoT but also actualize it by integrating deep +learning-enhanced feedback channel codes within a narrow-band system. +Simulation results show a significant enhancement in the operational lifespan +of IoT cells -- surpassing traditional systems using Turbo and Polar codes by +up to 52.71%. This leap signifies a paradigm shift in IoT communications, +setting the stage for a future where IoT devices boast unprecedented efficiency +and durability. + +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Arithmetical Binary Decision Tree Traversals + + +
+ This paper introduces a series of methods for traversing binary decision +trees using arithmetic operations. We present a suite of binary tree traversal +algorithms that leverage novel representation matrices to flatten the full +binary tree structure and embed the aggregated internal node Boolean tests into +a single binary vector. Our approach, grounded in maximum inner product search, +offers new insights into decision tree. + +
+
+ comment: Correct some citation format and typos +
+
+
+
+
+ + ♻ ☆ Interpretable Concept-Based Memory Reasoning + + +
+ The lack of transparency in the decision-making processes of deep learning +systems presents a significant challenge in modern artificial intelligence +(AI), as it impairs users' ability to rely on and verify these systems. To +address this challenge, Concept Bottleneck Models (CBMs) have made significant +progress by incorporating human-interpretable concepts into deep learning +architectures. This approach allows predictions to be traced back to specific +concept patterns that users can understand and potentially intervene on. +However, existing CBMs' task predictors are not fully interpretable, preventing +a thorough analysis and any form of formal verification of their +decision-making process prior to deployment, thereby raising significant +reliability concerns. To bridge this gap, we introduce Concept-based Memory +Reasoner (CMR), a novel CBM designed to provide a human-understandable and +provably-verifiable task prediction process. Our approach is to model each task +prediction as a neural selection mechanism over a memory of learnable logic +rules, followed by a symbolic evaluation of the selected rule. The presence of +an explicit memory and the symbolic evaluation allow domain experts to inspect +and formally verify the validity of certain global properties of interest for +the task prediction process. Experimental results demonstrate that CMR achieves +better accuracy-interpretability trade-offs to state-of-the-art CBMs, discovers +logic rules consistent with ground truths, allows for rule interventions, and +allows pre-deployment verification. + +
+
+
+
+
+ + ♻ ☆ FGCE: Feasible Group Counterfactual Explanations for Auditing Fairness + + +
+ This paper introduces the first graph-based framework for generating group +counterfactual explanations to audit model fairness, a crucial aspect of +trustworthy machine learning. Counterfactual explanations are instrumental in +understanding and mitigating unfairness by revealing how inputs should change +to achieve a desired outcome. Our framework, named Feasible Group +Counterfactual Explanations (FGCEs), captures real-world feasibility +constraints and constructs subgroups with similar counterfactuals, setting it +apart from existing methods. It also addresses key trade-offs in counterfactual +generation, including the balance between the number of counterfactuals, their +associated costs, and the breadth of coverage achieved. To evaluate these +trade-offs and assess fairness, we propose measures tailored to group +counterfactual generation. Our experimental results on benchmark datasets +demonstrate the effectiveness of our approach in managing feasibility +constraints and trade-offs, as well as the potential of our proposed metrics in +identifying and quantifying fairness issues. + +
+
+
+
+
+ + ♻ ☆ Adversarial Robustness of VAEs across Intersectional Subgroups + + +
+ Despite advancements in Autoencoders (AEs) for tasks like dimensionality +reduction, representation learning and data generation, they remain vulnerable +to adversarial attacks. Variational Autoencoders (VAEs), with their +probabilistic approach to disentangling latent spaces, show stronger resistance +to such perturbations compared to deterministic AEs; however, their resilience +against adversarial inputs is still a concern. This study evaluates the +robustness of VAEs against non-targeted adversarial attacks by optimizing +minimal sample-specific perturbations to cause maximal damage across diverse +demographic subgroups (combinations of age and gender). We investigate two +questions: whether there are robustness disparities among subgroups, and what +factors contribute to these disparities, such as data scarcity and +representation entanglement. Our findings reveal that robustness disparities +exist but are not always correlated with the size of the subgroup. By using +downstream gender and age classifiers and examining latent embeddings, we +highlight the vulnerability of subgroups like older women, who are prone to +misclassification due to adversarial perturbations pushing their +representations toward those of other subgroups. + +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks Do Not Always Oversmooth + + +
+ Graph neural networks (GNNs) have emerged as powerful tools for processing +relational data in applications. However, GNNs suffer from the problem of +oversmoothing, the property that the features of all nodes exponentially +converge to the same vector over layers, prohibiting the design of deep GNNs. +In this work we study oversmoothing in graph convolutional networks (GCNs) by +using their Gaussian process (GP) equivalence in the limit of infinitely many +hidden features. By generalizing methods from conventional deep neural networks +(DNNs), we can describe the distribution of features at the output layer of +deep GCNs in terms of a GP: as expected, we find that typical parameter choices +from the literature lead to oversmoothing. The theory, however, allows us to +identify a new, non-oversmoothing phase: if the initial weights of the network +have sufficiently large variance, GCNs do not oversmooth, and node features +remain informative even at large depth. We demonstrate the validity of this +prediction in finite-size GCNs by training a linear classifier on their output. +Moreover, using the linearization of the GCN GP, we generalize the concept of +propagation depth of information from DNNs to GCNs. This propagation depth +diverges at the transition between the oversmoothing and non-oversmoothing +phase. We test the predictions of our approach and find good agreement with +finite-size GCNs. Initializing GCNs near the transition to the +non-oversmoothing phase, we obtain networks which are both deep and expressive. + +
+
+
+
+
+ + ♻ ☆ Dynamic Dimension Wrapping (DDW) Algorithm: A Novel Approach for + Efficient Cross-Dimensional Search in Dynamic Multidimensional Spaces + + +
+ To effectively search for the optimal motion template in dynamic +multidimensional space, this paper proposes a novel optimization algorithm, +Dynamic Dimension Wrapping (DDW). The algorithm combines Dynamic Time Warping +(DTW) and Euclidean distance, and designs a fitness function that adapts to +dynamic multidimensional space by establishing a time-data chain mapping across +dimensions. This paper also proposes a novel update mechanism, Optimal Dimension +Collection (ODC), which, combined with the search strategy of traditional optimization +algorithms, enables DDW to adjust both the dimension values and the number of +dimensions of the population individuals simultaneously. In this way, DDW +significantly reduces computational complexity and improves search accuracy. +Experimental results show that DDW performs excellently in dynamic +multidimensional space, outperforming 31 traditional optimization algorithms. +This algorithm provides a novel approach to solving dynamic multidimensional +optimization problems and demonstrates broad application potential in fields +such as motion data analysis. + +
+
+
+
+
+ + ♻ ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details on one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage fine +grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads up to 2x reduction of time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ♻ ☆ Multi-View Symbolic Regression GECCO-2024 + + +
+ Symbolic regression (SR) searches for analytical expressions representing the +relationship between a set of explanatory and response variables. Current SR +methods assume a single dataset extracted from a single experiment. +Nevertheless, frequently, the researcher is confronted with multiple sets of +results obtained from experiments conducted with different setups. Traditional +SR methods may fail to find the underlying expression since the parameters of +each experiment can be different. In this work we present Multi-View Symbolic +Regression (MvSR), which takes into account multiple datasets simultaneously, +mimicking experimental environments, and outputs a general parametric solution. +This approach fits the evaluated expression to each independent dataset and +returns a parametric family of functions f(x; theta) simultaneously capable of +accurately fitting all datasets. We demonstrate the effectiveness of MvSR using +data generated from known expressions, as well as real-world data from +astronomy, chemistry and economy, for which an a priori analytical expression +is not available. Results show that MvSR obtains the correct expression more +frequently and is robust to hyperparameters change. In real-world data, it is +able to grasp the group behavior, recovering known expressions from the +literature as well as promising alternatives, thus enabling the use of SR to a +large range of experimental scenarios. + +
+
+ comment: Published in GECCO-2024. 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks and Differential Equations: A hybrid approach for + data assimilation of fluid flows + + +
+ This study presents a novel hybrid approach that combines Graph Neural +Networks (GNNs) with Reynolds-Averaged Navier Stokes (RANS) equations to +enhance the accuracy of mean flow reconstruction across a range of fluid +dynamics applications. Traditional purely data-driven Neural Networks (NNs) +models often struggle to maintain physical consistency. Moreover, they +typically require large datasets to achieve reliable performances. The GNN +framework, which naturally handles unstructured data such as complex geometries +in Computational Fluid Dynamics (CFD), is here integrated with RANS equations +as a physical baseline model. The methodology leverages the adjoint method, +enabling the use of RANS-derived gradients as optimization terms in the GNN +training process. This ensures that the learned model adheres to the governing +physics, maintaining physical consistency while improving the prediction +accuracy. We test our approach on multiple CFD scenarios, including cases +involving generalization with respect to the Reynolds number, sparse +measurements, denoising and inpainting of missing portions of the mean flow. +The results demonstrate significant improvements in the accuracy of the +reconstructed mean flow compared to purely data-driven models, using limited +amounts of data in the training dataset. The key strengths of this study are +the integration of physical laws into the training process of the GNN, and the +ability to achieve high-accuracy predictions with a limited amount of data, +making this approach particularly valuable for applications in fluid dynamics +where data is often scarce. + +
+
+
+
+
+ + ♻ ☆ Towards Efficient and Optimal Covariance-Adaptive Algorithms for + Combinatorial Semi-Bandits + + +
+ We address the problem of stochastic combinatorial semi-bandits, where a +player selects among P actions from the power set of a set containing d base +items. Adaptivity to the problem's structure is essential in order to obtain +optimal regret upper bounds. As estimating the coefficients of a covariance +matrix can be manageable in practice, leveraging them should improve the +regret. We design "optimistic" covariance-adaptive algorithms relying on online +estimations of the covariance structure, called OLS-UCB-C and COS-V (only the +variances for the latter). They both yield improved gap-free regret. Although +COS-V can be slightly suboptimal, it improves on computational complexity by +taking inspiration from Thompson Sampling approaches. It is the first +sampling-based algorithm satisfying a T^1/2 gap-free regret (up to poly-logs). +We also show that in some cases, our approach efficiently leverages the +semi-bandit feedback and outperforms bandit feedback approaches, not only in +exponential regimes where P >> d but also when P <= d, which is not covered by +existing analyses. + +
+
+
+
+
+ + ♻ ☆ Calibration of ordinal regression networks + + +
+ Recent studies have shown that deep neural networks are not well-calibrated +and often produce over-confident predictions. The miscalibration issue +primarily stems from using cross-entropy in classifications, which aims to +align predicted softmax probabilities with one-hot labels. In ordinal +regression tasks, this problem is compounded by an additional challenge: the +expectation that softmax probabilities should exhibit unimodal distribution is +not met with cross-entropy. The ordinal regression literature has focused on +learning orders and overlooked calibration. To address both issues, we propose +a novel loss function that introduces order-aware calibration, ensuring that +prediction confidence adheres to ordinal relationships between classes. It +incorporates soft ordinal encoding and order-aware regularization to enforce +both calibration and unimodality. Extensive experiments across three popular +ordinal regression benchmarks demonstrate that our approach achieves +state-of-the-art calibration without compromising accuracy. + +
+
+
+
+
+ + ♻ ☆ Dockformer: A transformer-based molecular docking paradigm for + large-scale virtual screening + + +
+ Molecular docking enables virtual screening of compound libraries to identify +potential ligands that target proteins of interest, a crucial step in drug +development; however, as the size of the compound library increases, the +computational complexity of traditional docking models increases. Deep learning +algorithms can provide data-driven research and development models to increase +the speed of the docking process. Unfortunately, few models can achieve +superior screening performance compared to that of traditional models. +Therefore, a novel deep learning-based docking approach named Dockformer is +introduced in this study. Dockformer leverages multimodal information to +capture the geometric topology and structural knowledge of molecules and can +directly generate binding conformations with the corresponding confidence +measures in an end-to-end manner. The experimental results show that Dockformer +achieves success rates of 90.53\% and 82.71\% on the PDBbind core set and +PoseBusters benchmarks, respectively, and more than a 100-fold increase in the +inference process speed, outperforming almost all state-of-the-art docking +methods. In addition, the ability of Dockformer to identify the main protease +inhibitors of coronaviruses is demonstrated in a real-world virtual screening +scenario. Considering its high docking accuracy and screening efficiency, +Dockformer can be regarded as a powerful and robust tool in the field of drug +design. + +
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ A Dual Adaptive Assignment Approach for Robust Graph-Based Clustering + + +
+ Graph clustering is an essential aspect of network analysis that involves +grouping nodes into separate clusters. Recent developments in deep learning +have resulted in advanced deep graph clustering techniques, which have proven +effective in many applications. Nonetheless, these methods often encounter +difficulties when dealing with the complexities of real-world graphs, +particularly in the presence of noisy edges. Additionally, many denoising graph +clustering strategies tend to suffer from lower performance compared to their +non-denoised counterparts, training instability, and challenges in scaling to +large datasets. To tackle these issues, we introduce a new framework called the +Dual Adaptive Assignment Approach for Robust Graph-Based Clustering (RDSA). +RDSA consists of three key components: (i) a node embedding module that +effectively integrates the graph's topological features and node attributes; +(ii) a structure-based soft assignment module that improves graph modularity by +utilizing an affinity matrix for node assignments; and (iii) a node-based soft +assignment module that identifies community landmarks and refines node +assignments to enhance the model's robustness. We assess RDSA on various +real-world datasets, demonstrating its superior performance relative to +existing state-of-the-art methods. Our findings indicate that RDSA provides +robust clustering across different graph types, excelling in clustering +effectiveness and robustness, including adaptability to noise, stability, and +scalability. + +
+
+
+
+
+ + ♻ ☆ Approximate Probabilistic Inference for Time-Series Data A Robust Latent + Gaussian Model With Temporal Awareness + + +
+ The development of robust generative models for highly varied non-stationary +time series data is a complex yet important problem. Traditional models for +time series data prediction, such as Long Short-Term Memory (LSTM), are +inefficient and generalize poorly as they cannot capture complex temporal +relationships. In this paper, we present a probabilistic generative model that +can be trained to capture temporal information, and that is robust to data +errors. We call it Time Deep Latent Gaussian Model (tDLGM). Its novel +architecture is inspired by Deep Latent Gaussian Model (DLGM). Our model is +trained to minimize a loss function based on the negative log loss. One +contributing factor to Time Deep Latent Gaussian Model (tDLGM) robustness is +our regularizer, which accounts for data trends. Experiments conducted show +that tDLGM is able to reconstruct and generate complex time series data, and +that it is robust against noise and faulty data. + +
+
+ comment: New revision added a space between "for" and "Time-Series" in the + title +
+
+
+
+
+ + ♻ ☆ Flow Priors for Linear Inverse Problems via Iterative Corrupted + Trajectory Matching NeurIPS 2024 + + +
+ Generative models based on flow matching have attracted significant attention +for their simplicity and superior performance in high-resolution image +synthesis. By leveraging the instantaneous change-of-variables formula, one can +directly compute image likelihoods from a learned flow, making them enticing +candidates as priors for downstream tasks such as inverse problems. In +particular, a natural approach would be to incorporate such image probabilities +in a maximum-a-posteriori (MAP) estimation problem. A major obstacle, however, +lies in the slow computation of the log-likelihood, as it requires +backpropagating through an ODE solver, which can be prohibitively slow for +high-dimensional problems. In this work, we propose an iterative algorithm to +approximate the MAP estimator efficiently to solve a variety of linear inverse +problems. Our algorithm is mathematically justified by the observation that the +MAP objective can be approximated by a sum of $N$ ``local MAP'' objectives, +where $N$ is the number of function evaluations. By leveraging Tweedie's +formula, we show that we can perform gradient steps to sequentially optimize +these objectives. We validate our approach for various linear inverse problems, +such as super-resolution, deblurring, inpainting, and compressed sensing, and +demonstrate that we can outperform other methods based on flow matching. Code +is available at https://github.com/YasminZhang/ICTM. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Bridging The Gap between Low-rank and Orthogonal Adaptation via + Householder Reflection Adaptation + + +
+ While following different technical routes, both low-rank and orthogonal +adaptation techniques can efficiently adapt large-scale pre-training models in +specific tasks or domains based on a small piece of trainable parameters. In +this study, we bridge the gap between these two techniques, proposing a simple +but effective adaptation method based on Householder reflections. Given a +pre-trained model, our method fine-tunes its layers by multiplying each frozen +weight matrix with an orthogonal matrix constructed by a chain of learnable +Householder reflections (HRs). This HR-based orthogonal fine-tuning is +equivalent to an adaptive low-rank adaptation. Moreover, we show that the +orthogonality of the reflection planes corresponding to the HRs impacts the +model capacity and regularity. The analysis motivates us to regularize the +orthogonality of the HRs, leading to different implementations of the proposed +Householder reflection adaptation (HRA) method. Compared with state-of-the-art +methods, HRA achieves superior performance with fewer learnable parameters when +adapting large language models and conditional image generators. The code of +the experiments is available at \url{https://github.com/DaShenZi721/HRA}, and +the method has been merged into the +\href{https://github.com/huggingface/peft}{PEFT} package. + +
+
+
+
+
+ + ♻ ☆ A Simple but Strong Baseline for Sounding Video Generation: Effective + Adaptation of Audio and Video Diffusion Models for Joint Generation + + +
+ In this work, we build a simple but strong baseline for sounding video +generation. Given base diffusion models for audio and video, we integrate them +with additional modules into a single model and train it to make the model +jointly generate audio and video. To enhance alignment between audio-video +pairs, we introduce two novel mechanisms in our model. The first one is +timestep adjustment, which provides different timestep information to each base +model. It is designed to align how samples are generated along with timesteps +across modalities. The second one is a new design of the additional modules, +termed Cross-Modal Conditioning as Positional Encoding (CMC-PE). In CMC-PE, +cross-modal information is embedded as if it represents temporal position +information, and the embeddings are fed into the model like positional +encoding. Compared with the popular cross-attention mechanism, CMC-PE provides +a better inductive bias for temporal alignment in the generated data. +Experimental results validate the effectiveness of the two newly introduced +mechanisms and also demonstrate that our method outperforms existing methods. + +
+
+ comment: The source code is available: + https://github.com/SonyResearch/SVG_baseline +
+
+
+
+
+ + ♻ ☆ Provably Unlearnable Data Examples + + +
+ The exploitation of publicly accessible data has led to escalating concerns +regarding data privacy and intellectual property (IP) breaches in the age of +artificial intelligence. To safeguard both data privacy and IP-related domain +knowledge, efforts have been undertaken to render shared data unlearnable for +unauthorized models in the wild. Existing methods apply empirically optimized +perturbations to the data in the hope of disrupting the correlation between the +inputs and the corresponding labels such that the data samples are converted +into Unlearnable Examples (UEs). Nevertheless, the absence of mechanisms to +verify the robustness of UEs against uncertainty in unauthorized models and +their training procedures engenders several under-explored challenges. First, +it is hard to quantify the unlearnability of UEs against unauthorized +adversaries from different runs of training, leaving the soundness of the +defense in obscurity. Particularly, as a prevailing evaluation metric, +empirical test accuracy faces generalization errors and may not plausibly +represent the quality of UEs. This also leaves room for attackers, as there is +no rigid guarantee of the maximal test accuracy achievable by attackers. +Furthermore, we find that a simple recovery attack can restore the clean-task +performance of the classifiers trained on UEs by slightly perturbing the +learned weights. To mitigate the aforementioned problems, in this paper, we +propose a mechanism for certifying the so-called $(q, \eta)$-Learnability of an +unlearnable dataset via parametric smoothing. A lower certified $(q, +\eta)$-Learnability indicates a more robust and effective protection over the +dataset. Concretely, we 1) improve the tightness of certified $(q, +\eta)$-Learnability and 2) design Provably Unlearnable Examples (PUEs) which +have reduced $(q, \eta)$-Learnability. + +
+
+ comment: Accepted to Network and Distributed System Security (NDSS) Symposium + 2025, San Diego, CA, USA. Source code is available at + https://github.com/NeuralSec/certified-data-learnability +
+
+
+
+
+ + ♻ ☆ Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for + Certified Robustness + + +
+ The remarkable advances in deep learning have led to the emergence of many +off-the-shelf classifiers, e.g., large pre-trained models. However, since they +are typically trained on clean data, they remain vulnerable to adversarial +attacks. Despite this vulnerability, their superior performance and +transferability make off-the-shelf classifiers still valuable in practice, +demanding further work to provide adversarial robustness for them in a post-hoc +manner. A recently proposed method, denoised smoothing, leverages a denoiser +model in front of the classifier to obtain provable robustness without +additional training. However, the denoiser often creates hallucination, i.e., +images that have lost the semantics of their originally assigned class, leading +to a drop in robustness. Furthermore, its noise-and-denoise procedure +introduces a significant distribution shift from the original distribution, +causing the denoised smoothing framework to achieve sub-optimal robustness. In +this paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image +Selection (FT-CADIS), a novel fine-tuning scheme to enhance the certified +robustness of off-the-shelf classifiers. FT-CADIS is inspired by the +observation that the confidence of off-the-shelf classifiers can effectively +identify hallucinated images during denoised smoothing. Based on this, we +develop a confidence-aware training objective to handle such hallucinated +images and improve the stability of fine-tuning from denoised images. In this +way, the classifier can be fine-tuned using only images that are beneficial for +adversarial robustness. We also find that such a fine-tuning can be done by +updating a small fraction of parameters of the classifier. Extensive +experiments demonstrate that FT-CADIS has established the state-of-the-art +certified robustness among denoised smoothing methods across all +$\ell_2$-adversary radius in various benchmarks. + +
+
+ comment: 26 pages; TMLR 2024; Code is available at + https://github.com/suhyeok24/FT-CADIS +
+
+
+
+
+ + ♻ ☆ AdapShare: An RL-Based Dynamic Spectrum Sharing Solution for O-RAN + + +
+ The Open Radio Access Network (O-RAN) initiative, characterized by open +interfaces and AI/ML-capable RAN Intelligent Controller (RIC), facilitates +effective spectrum sharing among RANs. In this context, we introduce AdapShare, +an ORAN-compatible solution leveraging Reinforcement Learning (RL) for +intent-based spectrum management, with the primary goal of minimizing resource +surpluses or deficits in RANs. By employing RL agents, AdapShare intelligently +learns network demand patterns and uses them to allocate resources. We +demonstrate the efficacy of AdapShare in the spectrum sharing scenario between +LTE and NR networks, incorporating real-world LTE resource usage data and +synthetic NR usage data to demonstrate its practical use. We use the average +surplus or deficit and fairness index to measure the system's performance in +various scenarios. AdapShare outperforms a quasi-static resource allocation +scheme based on long-term network demand statistics, particularly when +available resources are scarce or exceed the aggregate demand from the +networks. Lastly, we present a high-level O-RAN compatible architecture using +RL agents, which demonstrates the seamless integration of AdapShare into +real-world deployment scenarios. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.09110 +
+
+
+
+
+ + ♻ ☆ meds_reader: A fast and efficient EHR processing library ML4H + + +
+ The growing demand for machine learning in healthcare requires processing +increasingly large electronic health record (EHR) datasets, but existing +pipelines are not computationally efficient or scalable. In this paper, we +introduce meds_reader, an optimized Python package for efficient EHR data +processing that is designed to take advantage of many intrinsic properties of +EHR data for improved speed. We then demonstrate the benefits of meds_reader by +reimplementing key components of two major EHR processing pipelines, achieving +10-100x improvements in memory, speed, and disk usage. The code for meds_reader +can be found at https://github.com/som-shahlab/meds_reader. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 8 pages +
+
+
+
+
+ + ♻ ☆ Adaptive Transfer Clustering: A Unified Framework + + +
+ We propose a general transfer learning framework for clustering given a main +dataset and an auxiliary one about the same subjects. The two datasets may +reflect similar but different latent grouping structures of the subjects. We +propose an adaptive transfer clustering (ATC) algorithm that automatically +leverages the commonality in the presence of unknown discrepancy, by optimizing +an estimated bias-variance decomposition. It applies to a broad class of +statistical models including Gaussian mixture models, stochastic block models, +and latent class models. A theoretical analysis proves the optimality of ATC +under the Gaussian mixture model and explicitly quantifies the benefit of +transfer. Extensive simulations and real data experiments confirm our method's +effectiveness in various scenarios. + +
+
+ comment: 55 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ BAdam: A Memory Efficient Full Parameter Optimization Method for Large + Language Models + + +
+ This work presents BAdam, an optimization method that leverages the block +coordinate descent (BCD) framework with Adam's update rule. BAdam offers a +memory efficient approach to the full parameter finetuning of large language +models. We conduct a theoretical convergence analysis for BAdam in the +deterministic case. Experimentally, we apply BAdam to finetune the Llama 3-8B +and Llama 3-70B models using a single RTX3090-24GB GPU and 4 A100-80GB GPUs, +respectively. The results confirm BAdam's efficiency in terms of memory usage, +running time, and optimization capability. Furthermore, the downstream +performance evaluation based on MT-bench and math benchmarks shows that BAdam +outperforms existing memory efficient baselines such as LoRA. It also +demonstrates that BAdam can achieve comparable or even superior performance +compared to Adam. Finally, the ablation study using SGD's update rule +illustrates the suitability of BCD for finetuning LLMs. Our code can be easily +integrated into any PyTorch-based codebase and is available at +https://github.com/Ledzy/BAdam. + +
+
+ comment: Accepted for Publication in Conference on Neural Information + Processing Systems, 2024 +
+
+
+
+
+ + ♻ ☆ Effective Generative AI: The Human-Algorithm Centaur + + +
+ Advanced analytics science methods have enabled combining the power of +artificial and human intelligence, creating centaurs that allow +superior decision-making. Centaurs are hybrid human-algorithm models that +combine both formal analytics and human intuition in a symbiotic manner within +their learning and reasoning process. We argue that the future of AI +development and use in many domains needs to focus more on centaurs as opposed +to other AI approaches. This paradigm shift towards centaur-based AI methods +raises some fundamental questions: How are centaurs different from other +human-in-the-loop methods? What are the most effective methods for creating +centaurs? When should centaurs be used, and when should the lead be given to +pure AI models? Doesn't the incorporation of human intuition -- which at times +can be misleading -- in centaurs' decision-making process degrade its +performance compared to pure AI methods? This work aims to address these +fundamental questions, focusing on recent advancements in generative AI, and +especially in Large Language Models (LLMs), as a main case study to illustrate +centaurs' critical essentiality to future AI endeavors. + +
+
+ comment: To Appear in SI: Future Shock, Harvard Data Science Review + (https://hdsr.mitpress.mit.edu/specialissue5) +
+
+
+
+
+ + ♻ ☆ Efficient Pauli channel estimation with logarithmic quantum memory + + +
+ Here we revisit one of the prototypical tasks for characterizing the +structure of noise in quantum devices: estimating every eigenvalue of an +$n$-qubit Pauli noise channel to error $\epsilon$. Prior work [14] proved no-go +theorems for this task in the practical regime where one has a limited amount +of quantum memory, e.g. any protocol with $\le 0.99n$ ancilla qubits of quantum +memory must make exponentially many measurements, provided it is +non-concatenating. Such protocols can only interact with the channel by +repeatedly preparing a state, passing it through the channel, and measuring +immediately afterward. + This left open a natural question: does the lower bound hold even for general +protocols, i.e. ones which chain together many queries to the channel, +interleaved with arbitrary data-processing channels, before measuring? +Surprisingly, in this work we show the opposite: there is a protocol that can +estimate the eigenvalues of a Pauli channel to error $\epsilon$ using only +$O(\log n/\epsilon^2)$ ancilla and $\tilde{O}(n^2/\epsilon^2)$ measurements. In +contrast, we show that any protocol with zero ancilla, even a concatenating +one, must make $\Omega(2^n/\epsilon^2)$ measurements, which is tight. + Our results imply, to our knowledge, the first quantum learning task where +logarithmically many qubits of quantum memory suffice for an exponential +statistical advantage. Our protocol can be naturally extended to a protocol +that learns the eigenvalues of Pauli terms within any subset $A$ of a Pauli +channel with $O(\log\log(|A|)/\epsilon^2)$ ancilla and +$\tilde{O}(n^2/\epsilon^2)$ measurements. + +
+
+ comment: 57 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ ORLM: A Customizable Framework in Training Large Models for Automated + Optimization Modeling + + +
+ Optimization modeling and solving play a critical role in the application of +Operations Research (OR) tools to address real-world problems, yet they pose +challenges and require extensive expertise from OR experts. With the advent of +large language models (LLMs), new opportunities have emerged to streamline and +automate these tasks. However, current research predominantly relies on +closed-source LLMs such as GPT-4, along with extensive prompt engineering +techniques. This reliance stems from the scarcity of high-quality training +datasets for optimization modeling, resulting in elevated costs, prolonged +processing times, and privacy concerns. To address these challenges, our work +is the first to propose a viable path for training open-source LLMs that are +capable of optimization modeling as well as developing and executing solver +codes, eventually leading to a superior ability for automating optimization +modeling and solving. Particularly, we introduce a semi-automated data +synthesis framework designed for optimization modeling issues, named +OR-Instruct. This framework merges the training data requirements of large +models with the unique characteristics of optimization modeling problems, and +allows for customizable enhancements tailored to specific scenarios or modeling +types. To evaluate the performance of our proposed framework, we present the +IndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in +solving practical OR problems. Utilizing data synthesized through OR-Instruct, +we train various open-source LLMs with a capacity of 7 billion parameters +(dubbed ORLMs). The resulting models demonstrate significantly enhanced +optimization modeling capabilities, achieving state-of-the-art performance +across the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are +available at https://github.com/Cardinal-Operations/ORLM. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Stabilizer bootstrapping: A recipe for efficient agnostic tomography and + magic estimation + + +
+ We study the task of agnostic tomography: given copies of an unknown +$n$-qubit state $\rho$ which has fidelity $\tau$ with some state in a given +class $C$, find a state which has fidelity $\ge \tau - \epsilon$ with $\rho$. +We give a new framework, stabilizer bootstrapping, for designing +computationally efficient protocols for this task, and use this to get new +agnostic tomography protocols for the following classes: + Stabilizer states: We give a protocol that runs in time +$\mathrm{poly}(n,1/\epsilon)\cdot (1/\tau)^{O(\log(1/\tau))}$, answering an +open question posed by Grewal, Iyer, Kretschmer, Liang [43] and Anshu and +Arunachalam [6]. Previous protocols ran in time $\mathrm{exp}(\Theta(n))$ or +required $\tau>\cos^2(\pi/8)$. + States with stabilizer dimension $n - t$: We give a protocol that runs in +time $n^3\cdot(2^t/\tau)^{O(\log(1/\epsilon))}$, extending recent work on +learning quantum states prepared by circuits with few non-Clifford gates, which +only applied in the realizable setting where $\tau = 1$ [33, 40, 49, 66]. + Discrete product states: If $C = K^{\otimes n}$ for some $\mu$-separated +discrete set $K$ of single-qubit states, we give a protocol that runs in time +$(n/\mu)^{O((1 + \log (1/\tau))/\mu)}/\epsilon^2$. This strictly generalizes a +prior guarantee which applied to stabilizer product states [42]. For stabilizer +product states, we give a further improved protocol that runs in time +$(n^2/\epsilon^2)\cdot (1/\tau)^{O(\log(1/\tau))}$. + As a corollary, we give the first protocol for estimating stabilizer +fidelity, a standard measure of magic for quantum states, to error $\epsilon$ +in $n^3 \mathrm{quasipoly}(1/\epsilon)$ time. + +
+
+ comment: 68 pages +
+
+
+
+
+ + ♻ ☆ A Multi-Granularity Supervised Contrastive Framework for Remaining + Useful Life Prediction of Aero-engines + + +
+ Accurate remaining useful life (RUL) predictions are critical to the safe +operation of aero-engines. Currently, the RUL prediction task is mainly a +regression paradigm with only mean square error as the loss function and lacks +research on feature space structure, the latter of which has shown excellent +performance in a large number of studies. This paper develops a +multi-granularity supervised contrastive (MGSC) framework from the plain intuition +that samples with the same RUL label should be aligned in the feature space, +and addresses the problems of an excessively large minibatch size and unbalanced samples in +the implementation. The RUL prediction with MGSC is implemented using the +proposed multi-phase training strategy. This paper also demonstrates a simple +and scalable basic network structure and validates the proposed MGSC strategy +on the C-MAPSS dataset using a convolutional long short-term memory network as a +baseline, which effectively improves the accuracy of RUL prediction. + +
+
+
+
+
+ + ♻ ☆ DeepOSets: Non-Autoregressive In-Context Learning of Supervised Learning + Operators + + +
+ We introduce DeepSets Operator Networks (DeepOSets), an efficient, +non-autoregressive neural network architecture for in-context operator +learning. In-context learning allows a trained machine learning model to learn +from a user prompt without further training. DeepOSets adds in-context learning +capabilities to Deep Operator Networks (DeepONets) by combining them with the +DeepSets architecture. As the first non-autoregressive model for in-context +operator learning, DeepOSets allows the user prompt to be processed in parallel, +leading to significant computational savings. Here, we present the application +of DeepOSets in the problem of learning supervised learning algorithms, which +are operators mapping a finite-dimensional space of labeled data into an +infinite-dimensional hypothesis space of prediction functions. In an empirical +comparison with a popular autoregressive (transformer-based) model for +in-context learning of linear regression in one and five dimensions, DeepOSets +reduced the number of model weights by several orders of magnitude and required +a fraction of training and inference time. Furthermore, DeepOSets proved to be +less sensitive to noise, significantly outperforming the transformer model in +noisy settings. + +
+
+ comment: Janossy pooling results were added; Figures 1 and 2 were updated; + minor edits were made throughout +
+
+
+
+
+ + ♻ ☆ Adversarial Environment Design via Regret-Guided Diffusion Models + + +
+ Training agents that are robust to environmental changes remains a +significant challenge in deep reinforcement learning (RL). Unsupervised +environment design (UED) has recently emerged to address this issue by +generating a set of training environments tailored to the agent's capabilities. +While prior works demonstrate that UED has the potential to learn a robust +policy, their performance is constrained by the capabilities of the environment +generation. To this end, we propose a novel UED algorithm, adversarial +environment design via regret-guided diffusion models (ADD). The proposed +method guides the diffusion-based environment generator with the regret of the +agent to produce environments that the agent finds challenging but conducive to +further improvement. By exploiting the representation power of diffusion +models, ADD can directly generate adversarial environments while maintaining +the diversity of training environments, enabling the agent to effectively learn +a robust policy. Our experimental results demonstrate that the proposed method +successfully generates an instructive curriculum of environments, outperforming +UED baselines in zero-shot generalization across novel, out-of-distribution +environments. Project page: https://rllab-snu.github.io/projects/ADD + +
+
+ comment: 38th Conference on Neural Information Processing Systems +
+
+
+
+
+ + ♻ ☆ Mitigating Gradient Overlap in Deep Residual Networks with Gradient + Normalization for Improved Non-Convex Optimization + + +
+ In deep learning, Residual Networks (ResNets) have proven effective in +addressing the vanishing gradient problem, allowing for the successful training +of very deep networks. However, skip connections in ResNets can lead to +gradient overlap, where gradients from both the learned transformation and the +skip connection combine, potentially resulting in overestimated gradients. This +overestimation can cause inefficiencies in optimization, as some updates may +overshoot optimal regions, affecting weight updates. To address this, we +examine Z-score Normalization (ZNorm) as a technique to manage gradient +overlap. ZNorm adjusts the gradient scale, standardizing gradients across +layers and reducing the negative impact of overlapping gradients. Our +experiments demonstrate that ZNorm improves the training process, especially in +non-convex optimization scenarios common in deep learning, where finding +optimal solutions is challenging. These findings suggest that ZNorm can affect +the gradient flow, enhancing performance in large-scale data processing where +accuracy is critical. + +
+
+
+
+
+ + ♻ ☆ EHRMamba: Towards Generalizable and Scalable Foundation Models for + Electronic Health Records + + +
+ Transformers have significantly advanced the modeling of Electronic Health +Records (EHR), yet their deployment in real-world healthcare is limited by +several key challenges. Firstly, the quadratic computational cost and +insufficient context length of these models hinder hospitals' ability to +process the extensive medical histories typical in EHR data. Additionally, +existing models employ separate finetuning for each clinical task, complicating +maintenance in healthcare environments. Moreover, these models focus +exclusively on either clinical prediction or EHR forecasting, lacking +proficiency in both tasks. To overcome these limitations, we introduce +EHRMamba, a robust foundation model built on the Mamba architecture. EHRMamba +can process sequences up to 300% longer than previous models due to its linear +computational cost. We also introduce a novel approach to Multitask Prompted +Finetuning (MPF) for EHR data, which enables EHRMamba to simultaneously learn +multiple clinical tasks in a single finetuning phase, significantly enhancing +deployment and cross-task generalization. Furthermore, our model leverages the +HL7 FHIR data standard to simplify integration into existing hospital systems. +Alongside EHRMamba, we open-source Odyssey, a toolkit designed to support the +development and deployment of EHR foundation models, with an emphasis on data +standardization and interpretability. Our evaluations on the MIMIC-IV dataset +demonstrate that EHRMamba advances state-of-the-art performance across 6 major +clinical tasks and excels in EHR forecasting, marking a significant leap +forward in the field. + +
+
+ comment: 17 Pages, 4 Figures +
+
+
+
+
+ + ♻ ☆ ConSmax: Hardware-Friendly Alternative Softmax with Learnable Parameters + + +
+ The self-attention mechanism sets transformer-based large language +models (LLMs) apart from convolutional and recurrent neural networks. Despite +the performance improvement, achieving real-time LLM inference on silicon +remains challenging due to the extensive use of Softmax in self-attention. In +addition to the non-linearity, the low arithmetic intensity significantly +limits processing parallelism, especially when working with longer contexts. To +address this challenge, we propose Constant Softmax (ConSmax), a +software-hardware co-design that serves as an efficient alternative to Softmax. +ConSmax utilizes differentiable normalization parameters to eliminate the need +for maximum searching and denominator summation in Softmax. This approach +enables extensive parallelization while still executing the essential functions +of Softmax. Moreover, a scalable ConSmax hardware design with a bitwidth-split +look-up table (LUT) can achieve lossless non-linear operations and support +mixed-precision computing. Experimental results show that ConSmax achieves a +minuscule power consumption of 0.2mW and an area of 0.0008mm^2 at 1250MHz +working frequency in 16nm FinFET technology. For open-source contribution, we +further implement our design with the OpenROAD toolchain under SkyWater's 130nm +CMOS technology. The corresponding power is 2.69mW and the area is 0.007mm^2. +ConSmax achieves 3.35x power savings and 2.75x area savings in 16nm technology, +and 3.15x power savings and 4.14x area savings with the open-source EDA +toolchain. In the meantime, it also maintains comparable accuracy on the GPT-2 +model and the WikiText103 dataset. The project is available at +https://github.com/ReaLLMASIC/ConSmax + +
+
+
+
+
+ + ♻ ☆ From Isolation to Collaboration: Federated Class-Heterogeneous Learning + for Chest X-Ray Classification + + +
+ Federated learning (FL) is a promising paradigm to collaboratively train a +global chest x-ray (CXR) classification model using distributed datasets while +preserving patient privacy. A significant, yet relatively underexplored, +challenge in FL is class-heterogeneity, where clients have different sets of +classes. We propose surgical aggregation, a FL method that uses selective +aggregation to collaboratively train a global model using distributed, +class-heterogeneous datasets. Unlike other methods, our method does not rely on +the assumption that clients share the same classes as other clients, know the +classes of other clients, or have access to a fully annotated dataset. We +evaluate surgical aggregation using class-heterogeneous CXR datasets across IID +and non-IID settings. Our results show that our method outperforms current +methods and has better generalizability. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 129 + +
+
+
+ + ☆ VeriGraph: Scene Graphs for Execution Verifiable Robot Planning + + +
+ Recent advancements in vision-language models (VLMs) offer potential for +robot task planning, but challenges remain due to VLMs' tendency to generate +incorrect action sequences. To address these limitations, we propose VeriGraph, +a novel framework that integrates VLMs for robotic planning while verifying +action feasibility. VeriGraph employs scene graphs as an intermediate +representation, capturing key objects and spatial relationships to improve plan +verification and refinement. The system generates a scene graph from input +images and uses it to iteratively check and correct action sequences generated +by an LLM-based task planner, ensuring constraints are respected and actions +are executable. Our approach significantly enhances task completion rates +across diverse manipulation scenarios, outperforming baseline methods by 58% +for language-based tasks and 30% for image-based tasks. + +
+
+
+
+
+ + ☆ Mitigating Hallucination in Multimodal Large Language Model via + Hallucination-targeted Direct Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) are known to hallucinate, which +limits their practical applications. Recent works have attempted to apply +Direct Preference Optimization (DPO) to enhance the performance of MLLMs, but +have shown inconsistent improvements in mitigating hallucinations. To address +this issue more effectively, we introduce Hallucination-targeted Direct +Preference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike +previous approaches, our method tackles hallucinations from their diverse forms +and causes. Specifically, we develop three types of preference pair data +targeting the following causes of MLLM hallucinations: (1) insufficient visual +capabilities, (2) long context generation, and (3) multimodal conflicts. +Experimental results demonstrate that our method achieves superior performance +across multiple hallucination evaluation datasets, surpassing most +state-of-the-art (SOTA) methods and highlighting the potential of our approach. +Ablation studies and in-depth analyses further confirm the effectiveness of our +method and suggest the potential for further improvements through scaling up. + +
+
+
+
+
+ + ☆ Mitigating Parameter Degeneracy using Joint Conditional Diffusion Model + for WECC Composite Load Model in Power Systems + + +
+ Data-driven modeling for dynamic systems has gained widespread attention in +recent years. Its inverse formulation, parameter estimation, aims to infer the +inherent model parameters from observations. However, parameter degeneracy, +where different combinations of parameters yield the same observable output, +poses a critical barrier to accurately and uniquely identifying model +parameters. In the context of the WECC composite load model (CLM) in power systems, +utility practitioners have observed that CLM parameters carefully selected for +one fault event may not perform satisfactorily in another fault. Here, we +develop a joint conditional diffusion model-based inverse problem solver +(JCDI) that incorporates a joint conditioning architecture with simultaneous +inputs of multi-event observations to improve parameter generalizability. +Simulation studies on the WECC CLM show that the proposed JCDI effectively +reduces uncertainties of degenerate parameters, thus the parameter estimation +error is decreased by 42.1% compared to a single-event learning scheme. This +enables the model to achieve high accuracy in predicting power trajectories +under different fault events, including electronic load tripping and motor +stalling, outperforming standard deep reinforcement learning and supervised +learning approaches. We anticipate this work will contribute to mitigating +parameter degeneracy in system dynamics, providing a general parameter +estimation framework across various scientific domains. + +
+
+
+
+
+ + ☆ Evaluating Creativity and Deception in Large Language Models: A + Simulation Framework for Multi-Agent Balderdash ACL 2024 + + +
+ Large Language Models (LLMs) have shown impressive capabilities in complex +tasks and interactive environments, yet their creativity remains underexplored. +This paper introduces a simulation framework utilizing the game Balderdash to +evaluate both the creativity and logical reasoning of LLMs. In Balderdash, +players generate fictitious definitions for obscure terms to deceive others +while identifying correct definitions. Our framework enables multiple LLM +agents to participate in this game, assessing their ability to produce +plausible definitions and strategize based on game rules and history. We +implemented a centralized game engine featuring various LLMs as participants +and a judge LLM to evaluate semantic equivalence. Through a series of +experiments, we analyzed the performance of different LLMs, examining metrics +such as True Definition Ratio, Deception Ratio, and Correct Guess Ratio. The +results provide insights into the creative and deceptive capabilities of LLMs, +highlighting their strengths and areas for improvement. Specifically, the study +reveals that infrequent vocabulary in LLMs' input leads to poor reasoning on +game rules and historical context +(https://github.com/ParsaHejabi/Simulation-Framework-for-Multi-Agent-Balderdash). + +
+
+ comment: Accepted at Wordplay: When Language Meets Games @ ACL 2024 +
+
+
+
+
+ + ☆ Towards Automatic Evaluation of Task-Oriented Dialogue Flows + + +
+ Task-oriented dialogue systems rely on predefined conversation schemes +(dialogue flows) often represented as directed acyclic graphs. These flows can +be manually designed or automatically generated from previously recorded +conversations. Due to variations in domain expertise or reliance on different +sets of prior conversations, these dialogue flows can manifest in significantly +different graph structures. Despite their importance, there is no standard +method for evaluating the quality of dialogue flows. We introduce FuDGE (Fuzzy +Dialogue-Graph Edit Distance), a novel metric that evaluates dialogue flows by +assessing their structural complexity and representational coverage of the +conversation data. FuDGE measures how well individual conversations align with +a flow and, consequently, how well a set of conversations is represented by the +flow overall. Through extensive experiments on manually configured flows and +flows generated by automated techniques, we demonstrate the effectiveness of +FuDGE and its evaluation framework. By standardizing and optimizing dialogue +flows, FuDGE enables conversational designers and automated techniques to +achieve higher levels of efficiency and automation. + +
+
+
+
+
+ + ☆ Repurposing Stable Diffusion Attention for Training-Free Unsupervised + Interactive Segmentation + + +
+ Recent progress in interactive point-prompt-based Image Segmentation makes it +possible to significantly reduce the manual effort to obtain high-quality semantic +labels. State-of-the-art unsupervised methods use self-supervised pre-trained +models to obtain pseudo-labels which are used in training a prompt-based +segmentation model. In this paper, we propose a novel unsupervised and +training-free approach based solely on the self-attention of Stable Diffusion. +We interpret the self-attention tensor as a Markov transition operator, which +enables us to iteratively construct a Markov chain. Pixel-wise counting of the +required number of iterations along the Markov-chain to reach a relative +probability threshold yields a Markov-iteration-map, which we simply call a +Markov-map. Compared to the raw attention maps, we show that our proposed +Markov-map has less noise, sharper semantic boundaries and more uniform values +within semantically similar regions. We integrate the Markov-map in a simple +yet effective truncated nearest neighbor framework to obtain interactive +point-prompt-based segmentation. Despite being training-free, we experimentally show +that our approach yields excellent results in terms of Number of Clicks (NoC), +even outperforming state-of-the-art training-based unsupervised methods in most +of the datasets. + +
+
+
+
+
+ + ☆ Features that Make a Difference: Leveraging Gradients for Improved + Dictionary Learning NAACL 2025 + + +
+ Sparse Autoencoders (SAEs) are a promising approach for extracting neural +network representations by learning a sparse and overcomplete decomposition of +the network's internal activations. However, SAEs are traditionally trained +considering only activation values and not the effect those activations have on +downstream computations. This limits the information available to learn +features, and biases the autoencoder towards neglecting features which are +represented with small activation values but strongly influence model outputs. +To address this, we introduce Gradient SAEs (g-SAEs), which modify the +$k$-sparse autoencoder architecture by augmenting the TopK activation function +to rely on the gradients of the input activation when selecting the $k$ +elements. For a given sparsity level, g-SAEs produce reconstructions that are +more faithful to original network performance when propagated through the +network. Additionally, we find evidence that g-SAEs learn latents that are on +average more effective at steering models in arbitrary contexts. By considering +the downstream effects of activations, our approach leverages the dual nature +of neural network features as both $\textit{representations}$, retrospectively, +and $\textit{actions}$, prospectively. While previous methods have approached +the problem of feature discovery primarily focused on the former aspect, g-SAEs +represent a step towards accounting for the latter as well. + +
+
+ comment: 9 pages, 8 figures. Submitted to NAACL 2025 +
+
+
+
+
+ + ☆ Deep Learning for Micro-Scale Crack Detection on Imbalanced Datasets + Using Key Point Localization + + +
+ Internal crack detection has been a subject of focus in structural health +monitoring. By focusing on crack detection in structural datasets, it is +demonstrated that deep learning (DL) methods can effectively analyze seismic +wave fields interacting with micro-scale cracks, which are beyond the +resolution of conventional visual inspection. This work explores a novel +application of a DL-based key point detection technique, where cracks are +localized by predicting the coordinates of four key points that define a +bounding region of the crack. The study not only opens new research directions +for non-visual applications but also effectively mitigates the impact of +imbalanced data, which poses a challenge for previous DL models, as they can be +biased toward predicting the majority class (non-crack regions). Popular DL +techniques, such as the Inception blocks, are used and investigated. The model +shows an overall reduction in loss when applied to micro-scale crack detection, +which is reflected in the lower average deviation between the location of actual +and predicted cracks, with an average Intersection over Union (IoU) being 0.511 +for all micro cracks (greater than 0.00 micrometers) and 0.631 for larger micro +cracks (greater than 4 micrometers). + +
+
+
+
+
+ + ☆ Low-Latency Task-Oriented Communications with Multi-Round, Multi-Task + Deep Learning + + +
+ In this paper, we address task-oriented (or goal-oriented) communications +where an encoder at the transmitter learns compressed latent representations of +data, which are then transmitted over a wireless channel. At the receiver, a +decoder performs a machine learning task, specifically for classifying the +received signals. The deep neural networks corresponding to the encoder-decoder +pair are jointly trained, taking both channel and data characteristics into +account. Our objective is to achieve high accuracy in completing the underlying +task while minimizing the number of channel uses determined by the encoder's +output size. To this end, we propose a multi-round, multi-task learning (MRMTL) +approach for the dynamic update of channel uses in multi-round transmissions. +The transmitter incrementally sends an increasing number of encoded samples +over the channel based on the feedback from the receiver, and the receiver +utilizes the signals from a previous round to enhance the task performance, +rather than only considering the latest transmission. This approach employs +multi-task learning to jointly optimize accuracy across varying number of +channel uses, treating each configuration as a distinct task. By evaluating the +confidence of the receiver in task decisions, MRMTL decides on whether to +allocate additional channel uses in multiple rounds. We characterize both the +accuracy and the delay (total number of channel uses) of MRMTL, demonstrating +that it achieves the accuracy close to that of conventional methods requiring +large numbers of channel uses, but with reduced delay by incorporating signals +from a prior round. We consider the CIFAR-10 dataset, convolutional neural +network architectures, and AWGN and Rayleigh channel models for performance +evaluation. We show that MRMTL significantly improves the efficiency of +task-oriented communications, balancing accuracy and latency effectively. + +
+
+
+
+
+ + ☆ A Survey of Event Causality Identification: Principles, Taxonomy, + Challenges, and Assessment + + +
+ Event Causality Identification (ECI) has become a crucial task in Natural +Language Processing (NLP), aimed at automatically extracting causalities from +textual data. In this survey, we systematically address the foundational +principles, technical frameworks, and challenges of ECI, offering a +comprehensive taxonomy to categorize and clarify current research +methodologies, as well as a quantitative assessment of existing models. We +first establish a conceptual framework for ECI, outlining key definitions, +problem formulations, and evaluation standards. Our taxonomy classifies ECI +methods according to the two primary tasks of sentence-level (SECI) and +document-level (DECI) event causality identification. For SECI, we examine +feature pattern-based matching, deep semantic encoding, causal knowledge +pre-training and prompt-based fine-tuning, and external knowledge enhancement +methods. For DECI, we highlight approaches focused on event graph reasoning and +prompt-based techniques to address the complexity of cross-sentence causal +inference. Additionally, we analyze the strengths, limitations, and open +challenges of each approach. We further conduct an extensive quantitative +evaluation of various ECI methods on two benchmark datasets. Finally, we +explore future research directions, highlighting promising pathways to overcome +current limitations and broaden ECI applications. + +
+
+
+
+
+ + ☆ Towards High-Fidelity 3D Portrait Generation with Rich Details by + Cross-View Prior-Aware Diffusion + + +
+ Recent diffusion-based Single-image 3D portrait generation methods typically +employ 2D diffusion models to provide multi-view knowledge, which is then +distilled into 3D representations. However, these methods usually struggle to +produce high-fidelity 3D models, frequently yielding excessively blurred +textures. We attribute this issue to the insufficient consideration of +cross-view consistency during the diffusion process, resulting in significant +disparities between different views and ultimately leading to blurred 3D +representations. In this paper, we address this issue by comprehensively +exploiting multi-view priors in both the conditioning and diffusion procedures +to produce consistent, detail-rich portraits. From the conditioning standpoint, +we propose a Hybrid Priors Diffusion model, which explicitly and implicitly +incorporates multi-view priors as conditions to enhance the status consistency +of the generated multi-view portraits. From the diffusion perspective, +considering the significant impact of the diffusion noise distribution on +detailed texture generation, we propose a Multi-View Noise Resampling Strategy +integrated within the optimization process leveraging cross-view priors to +enhance representation consistency. Extensive experiments demonstrate that our +method can produce 3D portraits with accurate geometry and rich details from a +single image. The project page is at +\url{https://haoran-wei.github.io/Portrait-Diffusion}. + +
+
+
+
+
+ + ☆ Mechanisms of Generative Image-to-Image Translation Networks + + +
+ Generative Adversarial Networks (GANs) are a class of neural networks that +have been widely used in the field of image-to-image translation. In this +paper, we propose a streamlined image-to-image translation network with a +simpler architecture compared to existing models. We investigate the +relationship between GANs and autoencoders and provide an explanation for the +efficacy of employing only the GAN component for tasks involving image +translation. We show that adversarial for GAN models yields results comparable +to those of existing methods without additional complex loss penalties. +Subsequently, we elucidate the rationale behind this phenomenon. We also +incorporate experimental results to demonstrate the validity of our findings. + +
+
+
+
+
+ + ☆ Continual Adversarial Reinforcement Learning (CARL) of False Data + Injection detection: forgetting and explainability + + +
+ False data injection attacks (FDIAs) on smart inverters are a growing concern +linked to increased renewable energy production. While data-based FDIA +detection methods are also actively developed, we show that they remain +vulnerable to impactful and stealthy adversarial examples that can be crafted +using Reinforcement Learning (RL). We propose to include such adversarial +examples in data-based detection training procedure via a continual adversarial +RL (CARL) approach. This way, one can pinpoint the deficiencies of data-based +detection, thereby offering explainability during their incremental +improvement. We show that a continual learning implementation is subject to +catastrophic forgetting, and additionally show that forgetting can be addressed +by employing a joint training strategy on all generated FDIA scenarios. + +
+
+
+
+
+ + ☆ Forming Auxiliary High-confident Instance-level Loss to Promote Learning + from Label Proportions + + +
+ Learning from label proportions (LLP), i.e., a challenging weakly-supervised +learning task, aims to train a classifier by using bags of instances and the +proportions of classes within bags, rather than annotated labels for each +instance. Beyond the traditional bag-level loss, the mainstream methodology of +LLP is to incorporate an auxiliary instance-level loss with pseudo-labels +formed by predictions. Unfortunately, we empirically observed that the +pseudo-labels are often inaccurate due to over-smoothing, especially for +the scenarios with large bag sizes, hurting the classifier induction. To +alleviate this problem, we suggest a novel LLP method, namely Learning from +Label Proportions with Auxiliary High-confident Instance-level Loss +(L^2P-AHIL). Specifically, we propose a dual entropy-based weight (DEW) method +to adaptively measure the confidences of pseudo-labels. It simultaneously +emphasizes accurate predictions at the bag level and avoids overly smoothed +predictions. We then form high-confident instance-level loss with DEW, and +jointly optimize it with the bag-level loss in a self-training manner. The +experimental results on benchmark datasets show that L^2P-AHIL can surpass the +existing baseline methods, and the performance gain can be more significant as +the bag size increases. + +
+
+
+
+
+ + ☆ Domain Adaptation-based Edge Computing for Cross-Conditions Fault + Diagnosis + + +
+ Fault diagnosis technology supports the healthy operation of mechanical +equipment. However, the varying conditions during the operation of +mechanical equipment lead to significant disparities in data distribution, +posing challenges to fault diagnosis. Furthermore, when deploying applications, +traditional methods often encounter issues such as latency and data security. +Therefore, conducting fault diagnosis and deploying application methods under +cross-operating conditions holds significant value. This paper proposes a +domain adaptation-based lightweight fault diagnosis framework for edge +computing scenarios. Incorporating the local maximum mean discrepancy into +knowledge transfer aligns the feature distributions of different domains in a +high-dimensional feature space, to discover a common feature space across +domains. The acquired fault diagnosis expertise from the cloud-model is +transferred to the lightweight edge-model using adaptation knowledge transfer +methods. While ensuring real-time diagnostic capabilities, accurate fault +diagnosis is achieved across working conditions. We conducted validation +experiments on the NVIDIA Jetson Xavier NX kit. In terms of diagnostic +performance, the proposed method significantly improved diagnostic accuracy, +with average increases of 34.44% and 17.33% compared to the comparison method, +respectively. Regarding lightweight effectiveness, the proposed method achieved an +average inference speed increase of 80.47%. Additionally, compared to the +cloud-model, the parameter count of the edge-model decreased by 96.37%, while +the Flops decreased by 83.08%. + +
+
+ comment: 28 pages, 11 figures +
+
+
+
+
+ + ☆ Safe Text-to-Image Generation: Simply Sanitize the Prompt Embedding + + +
+ In recent years, text-to-image (T2I) generation models have made significant +progress in generating high-quality images that align with text descriptions. +However, these models also face the risk of unsafe generation, potentially +producing harmful content that violates usage policies, such as explicit +material. Existing safe generation methods typically focus on suppressing +inappropriate content by erasing undesired concepts from visual +representations, while neglecting to sanitize the textual representation. +Although these methods help mitigate the risk of misuse to a certain extent, +their robustness remains insufficient when dealing with adversarial attacks. + Given that semantic consistency between input text and output image is a +fundamental requirement for T2I models, we identify that textual +representations (i.e., prompt embeddings) are likely the primary source of +unsafe generation. To this end, we propose a vision-agnostic safe generation +framework, Embedding Sanitizer (ES), which focuses on erasing inappropriate +concepts from prompt embeddings and uses the sanitized embeddings to guide the +model for safe generation. ES is applied to the output of the text encoder as a +plug-and-play module, enabling seamless integration with different T2I models +as well as other safeguards. In addition, ES's unique scoring mechanism assigns +a score to each token in the prompt to indicate its potential harmfulness, and +dynamically adjusts the sanitization intensity to balance defensive performance +and generation quality. Through extensive evaluation on five prompt benchmarks, +our approach achieves state-of-the-art robustness by sanitizing the source +(prompt embedding) of unsafe generation compared to nine baseline methods. It +significantly outperforms existing safeguards in terms of interpretability and +controllability while maintaining generation quality. + +
+
+
+
+
+ + The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer + Use + + +
+ The recently released model, Claude 3.5 Computer Use, stands out as the first +frontier AI model to offer computer use in public beta as a graphical user +interface (GUI) agent. As an early beta, its capability in the real-world +complex environment remains unknown. In this case study to explore Claude 3.5 +Computer Use, we curate and organize a collection of carefully designed tasks +spanning a variety of domains and software. Observations from these cases +demonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end +language to desktop actions. Along with this study, we provide an +out-of-the-box agent framework for deploying API-based GUI automation models +with easy implementation. Our case studies aim to showcase a groundwork of +capabilities and limitations of Claude 3.5 Computer Use with detailed analyses +and bring to the fore questions about planning, action, and critic, which must +be considered for future improvement. We hope this preliminary exploration will +inspire future research into the GUI agent community. All the test cases in the +paper can be tried through the project: +https://github.com/showlab/computer_use_ootb. + +
+
+ comment: 40 pages, 21 figures, preprint +
+
+
+
+
+ + ☆ A Realistic Collimated X-Ray Image Simulation Pipeline + + +
+ Collimator detection remains a challenging task in X-ray systems with +unreliable or non-available information about the detector's position relative +to the source. This paper presents a physically motivated image processing +pipeline for simulating the characteristics of collimator shadows in X-ray +images. By generating randomized labels for collimator shapes and locations, +incorporating scattered radiation simulation, and including Poisson noise, the +pipeline enables the expansion of limited datasets for training deep neural +networks. We validate the proposed pipeline by a qualitative and quantitative +comparison against real collimator shadows. Furthermore, it is demonstrated +that utilizing simulated data within our deep learning framework not only +serves as a suitable substitute for actual collimators but also enhances the +generalization performance when applied to real-world data. + +
+
+
+
+
+ + ☆ RETR: Multi-View Radar Detection Transformer for Indoor Perception NeurIPS 2024 + + +
+ Indoor radar perception has seen rising interest due to affordable costs +driven by emerging automotive imaging radar developments and the benefits of +reduced privacy concerns and reliability under hazardous conditions (e.g., fire +and smoke). However, existing radar perception pipelines fail to account for +distinctive characteristics of the multi-view radar setting. In this paper, we +propose Radar dEtection TRansformer (RETR), an extension of the popular DETR +architecture, tailored for multi-view radar perception. RETR inherits the +advantages of DETR, eliminating the need for hand-crafted components for object +detection and segmentation in the image plane. More importantly, RETR +incorporates carefully designed modifications such as 1) depth-prioritized +feature similarity via a tunable positional encoding (TPE); 2) a tri-plane loss +from both radar and camera coordinates; and 3) a learnable radar-to-camera +transformation via reparameterization, to account for the unique multi-view +radar setting. Evaluated on two indoor radar perception datasets, our approach +outperforms existing state-of-the-art methods by a margin of 15.38+ AP for +object detection and 11.77+ IoU for instance segmentation, respectively. + +
+
+ comment: 24 pages, Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ The ParClusterers Benchmark Suite (PCBS): A Fine-Grained Analysis of + Scalable Graph Clustering + + +
+ We introduce the ParClusterers Benchmark Suite (PCBS) -- a collection of +highly scalable parallel graph clustering algorithms and benchmarking tools +that streamline comparing different graph clustering algorithms and +implementations. + The benchmark includes clustering algorithms that target a wide range of +modern clustering use cases, including community detection, classification, and +dense subgraph mining. + The benchmark toolkit makes it easy to run and evaluate multiple instances of +different clustering algorithms, which can be useful for fine-tuning the +performance of clustering on a given task, and for comparing different +clustering algorithms based on different metrics of interest, including +clustering quality and running time. + Using PCBS, we evaluate a broad collection of real-world graph clustering +datasets. Somewhat surprisingly, we find that the best quality results are +obtained by algorithms that are not included in many popular graph clustering +toolkits. The PCBS provides a standardized way to evaluate and judge the +quality-performance tradeoffs of the active research area of scalable graph +clustering algorithms. We believe it will help enable fair, accurate, and +nuanced evaluation of graph clustering algorithms in the future. + +
+
+ comment: This is a preliminary version of a paper that will appear at VLDB'25 +
+
+
+
+
+ + ☆ Systolic Arrays and Structured Pruning Co-design for Efficient + Transformers in Edge Systems + + +
+ Efficient deployment of resource-intensive transformers on edge devices +necessitates cross-stack optimization. We thus study the interrelation between +structured pruning and systolic acceleration, matching the size of pruned +blocks with the systolic array dimensions. In this setting, computations of +pruned weight blocks can be skipped, reducing run-time and energy consumption, +but potentially impacting quality of service (QoS). To evaluate the trade-offs +between systolic array size and sparsity opportunities, we present a novel +co-design framework that integrates algorithmic optimization, system +simulation, and hardware design. Targeting speech recognition using +transformers as a case study, we analyze how configuration choices across the +stack affect performance metrics. Results demonstrate that structured pruning +on systems featuring systolic array acceleration can effectively increase +performance, while maintaining high QoS levels. Up to 26% system-wide speedups +due to structured pruning were measured, with only 1.4% word error rate +degradation on the standard Librispeech dataset. + +
+
+ comment: 7 pages, 10 figures +
+
+
+
+
+ + ☆ Lateral Movement Detection via Time-aware Subgraph Classification on + Authentication Logs + + +
+ Lateral movement is a crucial component of advanced persistent threat (APT) +attacks in networks. Attackers exploit security vulnerabilities in internal +networks or IoT devices, expanding their control after initial infiltration to +steal sensitive data or carry out other malicious activities, posing a serious +threat to system security. Existing research suggests that attackers generally +employ seemingly unrelated operations to mask their malicious intentions, +thereby evading existing lateral movement detection methods and hiding their +intrusion traces. In this regard, we analyze host authentication log data from +a graph perspective and propose a multi-scale lateral movement detection +framework called LMDetect. The main workflow of this framework proceeds as +follows: 1) Construct a heterogeneous multigraph from host authentication log +data to strengthen the correlations among internal system entities; 2) Design a +time-aware subgraph generator to extract subgraphs centered on authentication +events from the heterogeneous authentication multigraph; 3) Design a +multi-scale attention encoder that leverages both local and global attention to +capture hidden anomalous behavior patterns in the authentication subgraphs, +thereby achieving lateral movement detection. Extensive experiments on two +real-world authentication log datasets demonstrate the effectiveness and +superiority of our framework in detecting lateral movement behaviors. + +
+
+
+
+
+ + ☆ Scaling Law for Post-training after Model Pruning + + +
+ Large language models (LLMs) based on the Transformer architecture are widely +employed across various domains and tasks. However, their increasing size +imposes significant hardware demands, limiting practical deployment. To +mitigate this, model pruning techniques have been developed to create more +efficient models while maintaining high performance. Despite this, +post-training after pruning is crucial for performance recovery and can be +resource-intensive. This paper investigates the post-training requirements of +pruned LLMs and introduces a scaling law to determine the optimal amount of +post-training data. Post-training experiments with the Llama-3 and Qwen-2.5 +series models, pruned using depth pruning, width pruning, and 2:4 +semi-structured pruning, show that higher pruning ratios necessitate more +post-training data for performance recovery, whereas larger LLMs require less. +The proposed scaling law predicts a model's loss based on its parameter counts +before and after pruning, as well as the post-training token counts. +Furthermore, we find that the scaling law established from smaller LLMs can be +reliably extrapolated to larger LLMs. This work provides valuable insights into +the post-training of pruned LLMs and offers a practical scaling law for +optimizing post-training data usage. + +
+
+
+
+
+ + ☆ The Unreasonable Effectiveness of Guidance for Diffusion Models + + +
+ Guidance is an error-correcting technique used to improve the perceptual +quality of images generated by diffusion models. Typically, the correction is +achieved by linear extrapolation, using an auxiliary diffusion model that has +lower performance than the primary model. Using a 2D toy example, we show that +it is highly beneficial when the auxiliary model exhibits similar errors as the +primary one but stronger. We verify this finding in higher dimensions, where we +show that competitive generative performance to state-of-the-art guidance +methods can be achieved when the auxiliary model differs from the primary one +only by having stronger weight regularization. As an independent contribution, +we investigate whether upweighting long-range spatial dependencies improves +visual fidelity. The result is a novel guidance method, which we call sliding +window guidance (SWG), that guides the primary model with itself by +constraining its receptive field. Intriguingly, SWG aligns better with human +preferences than state-of-the-art guidance methods while requiring neither +training, architectural modifications, nor class conditioning. The code will be +released. + +
+
+ comment: Preprint. 19 pages, 14 figures in total, including references and + appendix +
+
+
+
+
+ + ☆ Artificial Intelligence in Pediatric Echocardiography: Exploring + Challenges, Opportunities, and Clinical Applications with Explainable AI and + Federated Learning + + +
+ Pediatric heart diseases present a broad spectrum of congenital and acquired +diseases. More complex congenital malformations require a differentiated and +multimodal decision-making process, usually including echocardiography as a +central imaging method. Artificial intelligence (AI) offers considerable +promise for clinicians by facilitating automated interpretation of pediatric +echocardiography data. However, adapting AI technologies for pediatric +echocardiography analysis has challenges such as limited public data +availability, data privacy, and AI model transparency. Recently, researchers +have focused on disruptive technologies, such as federated learning (FL) and +explainable AI (XAI), to improve automatic diagnostic and decision support +workflows. This study offers a comprehensive overview of the limitations and +opportunities of AI in pediatric echocardiography, emphasizing the synergistic +workflow and role of XAI and FL, identifying research gaps, and exploring +potential future developments. Additionally, three relevant clinical use cases +demonstrate the functionality of XAI and FL with a focus on (i) view +recognition, (ii) disease classification, (iii) segmentation of cardiac +structures, and (iv) quantitative assessment of cardiac function. + +
+
+ comment: This article is planned for submission to Frontiers Journal +
+
+
+
+
+ + ☆ Generative AI in Multimodal User Interfaces: Trends, Challenges, and + Cross-Platform Adaptability + + +
+ As the boundaries of human computer interaction expand, Generative AI emerges +as a key driver in reshaping user interfaces, introducing new possibilities for +personalized, multimodal and cross-platform interactions. This integration +reflects a growing demand for more adaptive and intuitive user interfaces that +can accommodate diverse input types such as text, voice and video, and deliver +seamless experiences across devices. This paper explores the integration of +generative AI in modern user interfaces, examining historical developments and +focusing on multimodal interaction, cross-platform adaptability and dynamic +personalization. A central theme is the interface dilemma, which addresses the +challenge of designing effective interactions for multimodal large language +models, assessing the trade-offs between graphical, voice-based and immersive +interfaces. The paper further evaluates lightweight frameworks tailored for +mobile platforms, spotlighting the role of mobile hardware in enabling scalable +multimodal AI. Technical and ethical challenges, including context retention, +privacy concerns and balancing cloud and on-device processing are thoroughly +examined. Finally, the paper outlines future directions such as emotionally +adaptive interfaces, predictive AI driven user interfaces and real-time +collaborative systems, underscoring generative AI's potential to redefine +adaptive user-centric interfaces across platforms. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ ColorEdit: Training-free Image-Guided Color editing with diffusion model + + +
+ Text-to-image (T2I) diffusion models, with their impressive generative +capabilities, have been adopted for image editing tasks, demonstrating +remarkable efficacy. However, due to attention leakage and collision between +the cross-attention map of the object and the new color attribute from the text +prompt, text-guided image editing methods may fail to change the color of an +object, resulting in a misalignment between the resulting image and the text +prompt. In this paper, we conduct an in-depth analysis on the process of +text-guided image synthesizing and what semantic information different +cross-attention blocks have learned. We observe that the visual representation +of an object is determined in the up-block of the diffusion model in the early +stage of the denoising process, and color adjustment can be achieved through +value matrices alignment in the cross-attention layer. Based on our findings, +we propose a straightforward, yet stable, and effective image-guided method to +modify the color of an object without requiring any additional fine-tuning or +training. Lastly, we present a benchmark dataset called COLORBENCH, the first +benchmark to evaluate the performance of color change methods. Extensive +experiments validate the effectiveness of our method in object-level color +editing and surpass the performance of popular text-guided image editing +approaches in both synthesized and real images. + +
+
+
+
+
+ + ☆ A Low-Resolution Image is Worth 1x1 Words: Enabling Fine Image + Super-Resolution with Transformers and TaylorShift + + +
+ Transformer-based Super-Resolution (SR) models have recently advanced image +reconstruction quality, yet challenges remain due to computational complexity +and an over-reliance on large patch sizes, which constrain fine-grained detail +enhancement. In this work, we propose TaylorIR to address these limitations by +utilizing a patch size of 1x1, enabling pixel-level processing in any +transformer-based SR model. To address the significant computational demands +under the traditional self-attention mechanism, we employ the TaylorShift +attention mechanism, a memory-efficient alternative based on Taylor series +expansion, achieving full token-to-token interactions with linear complexity. +Experimental results demonstrate that our approach achieves new +state-of-the-art SR performance while reducing memory consumption by up to 60% +compared to traditional self-attention-based transformers. + +
+
+
+
+
+ + ☆ MCL: Multi-view Enhanced Contrastive Learning for Chest X-ray Report + Generation + + +
+ Radiology reports are crucial for planning treatment strategies and enhancing +doctor-patient communication, yet manually writing these reports is burdensome +for radiologists. While automatic report generation offers a solution, existing +methods often rely on single-view radiographs, limiting diagnostic accuracy. To +address this problem, we propose MCL, a Multi-view enhanced Contrastive +Learning method for chest X-ray report generation. Specifically, we first +introduce multi-view enhanced contrastive learning for visual representation by +maximizing agreements between multi-view radiographs and their corresponding +report. Subsequently, to fully exploit patient-specific indications (e.g., +patient's symptoms) for report generation, we add a transitional ``bridge" for +missing indications to reduce embedding space discrepancies caused by their +presence or absence. Additionally, we construct Multi-view CXR and Two-view CXR +datasets from public sources to support research on multi-view report +generation. Our proposed MCL surpasses recent state-of-the-art methods across +multiple datasets, achieving a 5.0% F1 RadGraph improvement on MIMIC-CXR, a +7.3% BLEU-1 improvement on MIMIC-ABN, a 3.1% BLEU-4 improvement on Multi-view +CXR, and an 8.2% F1 CheXbert improvement on Two-view CXR. + +
+
+ comment: https://github.com/mk-runner/MCL +
+
+
+
+
+ + An Empirical Study on LLM-based Agents for Automated Bug Fixing + + +
+ Large language models (LLMs) and LLM-based Agents have been applied to fix +bugs automatically, demonstrating the capability in addressing software defects +by engaging in development environment interaction, iterative validation and +code modification. However, systematic analysis of these agent and non-agent +systems remains limited, particularly regarding performance variations among +top-performing ones. In this paper, we examine seven proprietary and +open-source systems on the SWE-bench Lite benchmark for automated bug fixing. +We first assess each system's overall performance, noting instances solvable by +all or none of these systems, and explore why some instances are uniquely solved +by specific system types. We also compare fault localization accuracy at file +and line levels and evaluate bug reproduction capabilities, identifying +instances solvable only through dynamic reproduction. Through analysis, we +concluded that further optimization is needed in both the LLM itself and the +design of Agentic flow to improve the effectiveness of the Agent in bug fixing. + +
+
+
+
+
+ + ☆ A logic for reasoning with inconsistent knowledge -- A reformulation + using nowadays terminology (2024) + + +
+ In many situations humans have to reason with inconsistent knowledge. These +inconsistencies may occur due to not fully reliable sources of information. In +order to reason with inconsistent knowledge, it is not possible to view a set +of premisses as absolute truths as is done in predicate logic. Viewing the set +of premisses as a set of assumptions, however, it is possible to deduce useful +conclusions from an inconsistent set of premisses. In this paper a logic for +reasoning with inconsistent knowledge is described. This logic is a +generalization of the work of N. Rescher [15]. In the logic a reliability +relation is used to choose between incompatible assumptions. These choices are +only made when a contradiction is derived. As long as no contradiction is +derived, the knowledge is assumed to be consistent. This makes it possible to +define an argumentation-based deduction process for the logic. For the logic a +semantics based on the ideas of Y. Shoham [22, 23], is defined. It turns out +that the semantics for the logic is a preferential semantics according to the +definition of S. Kraus, D. Lehmann and M. Magidor [12]. Therefore the logic is a +logic of system P and possesses all the properties of an ideal non-monotonic +logic. + +
+
+ comment: The original version was published in the Artificial Intelligence + journal. This original version uses 'justifications' in the proof system, + which we would call nowadays 'arguments'. The current version presents the + same results but now using the terminology of an assumption-based + argumentation system +
+
+
+
+
+ + ☆ FengWu-W2S: A deep learning model for seamless weather-to-subseasonal + forecast of global atmosphere + + +
+ Seamless forecasting that produces warning information at continuum +timescales based on only one system is a long-standing pursuit for +weather-climate service. While the rapid advancement of deep learning has +induced revolutionary changes in classical forecasting field, current efforts +are still focused on building separate AI models for weather and climate +forecasts. To explore the seamless forecasting ability based on one AI model, +we propose FengWu-Weather to Subseasonal (FengWu-W2S), which builds on the +FengWu global weather forecast model and incorporates an ocean-atmosphere-land +coupling structure along with a diverse perturbation strategy. FengWu-W2S can +generate 6-hourly atmosphere forecasts extending up to 42 days through an +autoregressive and seamless manner. Our hindcast results demonstrate that +FengWu-W2S reliably predicts atmospheric conditions out to 3-6 weeks ahead, +enhancing predictive capabilities for global surface air temperature, +precipitation, geopotential height and intraseasonal signals such as the +Madden-Julian Oscillation (MJO) and North Atlantic Oscillation (NAO). Moreover, +our ablation experiments on forecast error growth from daily to seasonal +timescales reveal potential pathways for developing AI-based integrated system +for seamless weather-climate forecasting in the future. + +
+
+ comment: 23 pages,8 figures +
+
+
+
+
+ + ☆ Agentic LLMs in the Supply Chain: Towards Autonomous Multi-Agent + Consensus-Seeking + + +
+ This paper explores how Large Language Models (LLMs) can automate +consensus-seeking in supply chain management (SCM), where frequent decisions on +problems such as inventory levels and delivery times require coordination among +companies. Traditional SCM relies on human consensus in decision-making to +avoid emergent problems like the bullwhip effect. Some routine consensus +processes, especially those that are time-intensive and costly, can be +automated. Existing solutions for automated coordination have faced challenges +due to high entry barriers locking out SMEs, limited capabilities, and limited +adaptability in complex scenarios. However, recent advances in Generative AI, +particularly LLMs, show promise in overcoming these barriers. LLMs, trained on +vast datasets, can negotiate, reason, and plan, facilitating near-human-level +consensus at scale with minimal entry barriers. In this work, we identify key +limitations in existing approaches and propose autonomous LLM agents to address +these gaps. We introduce a series of novel, supply chain-specific +consensus-seeking frameworks tailored for LLM agents and validate the +effectiveness of our approach through a case study in inventory management. To +accelerate progress within the SCM community, we open-source our code, +providing a foundation for further advancements in LLM-powered autonomous +supply chain solutions. + +
+
+
+
+
+ + ☆ Let people fail! Exploring the influence of explainable virtual and + robotic agents in learning-by-doing tasks + + +
+ Collaborative decision-making with artificial intelligence (AI) agents +presents opportunities and challenges. While human-AI performance often +surpasses that of individuals, the impact of such technology on human behavior +remains insufficiently understood, primarily when AI agents can provide +justifiable explanations for their suggestions. This study compares the effects +of classic vs. partner-aware explanations on human behavior and performance +during a learning-by-doing task. Three participant groups were involved: one +interacting with a computer, another with a humanoid robot, and a third one +without assistance. Results indicated that partner-aware explanations +influenced participants differently based on the type of artificial agents +involved. With the computer, participants enhanced their task completion times. +At the same time, those interacting with the humanoid robot were more inclined +to follow its suggestions, although they did not reduce their timing. +Interestingly, participants autonomously performing the learning-by-doing task +demonstrated superior knowledge acquisition than those assisted by explainable +AI (XAI). These findings raise profound questions and have significant +implications for automated tutoring and human-AI collaboration. + +
+
+
+
+
+ + ☆ The Surprising Ineffectiveness of Pre-Trained Visual Representations for + Model-Based Reinforcement Learning NeurIPS 2024 + + +
+ Visual Reinforcement Learning (RL) methods often require extensive amounts of +data. As opposed to model-free RL, model-based RL (MBRL) offers a potential +solution with efficient data utilization through planning. Additionally, RL +lacks generalization capabilities for real-world tasks. Prior work has shown +that incorporating pre-trained visual representations (PVRs) enhances sample +efficiency and generalization. While PVRs have been extensively studied in the +context of model-free RL, their potential in MBRL remains largely unexplored. +In this paper, we benchmark a set of PVRs on challenging control tasks in a +model-based RL setting. We investigate the data efficiency, generalization +capabilities, and the impact of different properties of PVRs on the performance +of model-based agents. Our results, perhaps surprisingly, reveal that for MBRL +current PVRs are not more sample efficient than learning representations from +scratch, and that they do not generalize better to out-of-distribution (OOD) +settings. To explain this, we analyze the quality of the trained dynamics +model. Furthermore, we show that data diversity and network architecture are +the most important contributors to OOD generalization performance. + +
+
+ comment: Published at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024). Project page: https://schneimo.com/pvr4mbrl/ +
+
+
+
+
+ + ☆ A Hard-Label Cryptanalytic Extraction of Non-Fully Connected Deep Neural + Networks using Side-Channel Attacks + + +
+ During the past decade, Deep Neural Networks (DNNs) proved their value on a +large variety of subjects. However despite their high value and public +accessibility, the protection of the intellectual property of DNNs is still an +issue and an emerging research field. Recent works have successfully extracted +fully-connected DNNs using cryptanalytic methods in hard-label settings, +proving that it was possible to copy a DNN with high fidelity, i.e., high +similitude in the output predictions. However, the current cryptanalytic +attacks cannot target complex, i.e., not fully connected, DNNs and are limited +to special cases of neurons present in deep networks. + In this work, we introduce a new end-to-end attack framework designed for +model extraction of embedded DNNs with high fidelity. We describe a new +black-box side-channel attack which splits the DNN in several linear parts for +which we can perform cryptanalytic extraction and retrieve the weights in +hard-label settings. With this method, we are able to adapt cryptanalytic +extraction, for the first time, to non-fully connected DNNs, while maintaining +a high fidelity. We validate our contributions by targeting several +architectures implemented on a microcontroller unit, including a Multi-Layer +Perceptron (MLP) of 1.7 million parameters and a shortened MobileNetv1. Our +framework successfully extracts all of these DNNs with high fidelity (88.4% for +the MobileNetv1 and 93.2% for the MLP). Furthermore, we use the stolen model to +generate adversarial examples and achieve close to white-box performance on the +victim's model (95.8% and 96.7% transfer rate). + +
+
+
+
+
+ + ☆ Semantics and Spatiality of Emergent Communication NeurIPS 2024 + + +
+ When artificial agents are jointly trained to perform collaborative tasks +using a communication channel, they develop opaque goal-oriented communication +protocols. Good task performance is often considered sufficient evidence that +meaningful communication is taking place, but existing empirical results show +that communication strategies induced by common objectives can be +counterintuitive whilst solving the task nearly perfectly. In this work, we +identify a goal-agnostic prerequisite to meaningful communication, which we +term semantic consistency, based on the idea that messages should have similar +meanings across instances. We provide a formal definition for this idea, and +use it to compare the two most common objectives in the field of emergent +communication: discrimination and reconstruction. We prove, under mild +assumptions, that semantically inconsistent communication protocols can be +optimal solutions to the discrimination task, but not to reconstruction. We +further show that the reconstruction objective encourages a stricter property, +spatial meaningfulness, which also accounts for the distance between messages. +Experiments with emergent communication games validate our theoretical results. +These findings demonstrate an inherent advantage of distance-based +communication goals, and contextualize previous empirical discoveries. + +
+
+ comment: 34 pages, to be published in NeurIPS 2024 +
+
+
+
+
+ + ☆ Increasing the Accessibility of Causal Domain Knowledge via Causal + Information Extraction Methods: A Case Study in the Semiconductor + Manufacturing Industry + + +
+ The extraction of causal information from textual data is crucial in the +industry for identifying and mitigating potential failures, enhancing process +efficiency, prompting quality improvements, and addressing various operational +challenges. This paper presents a study on the development of automated methods +for causal information extraction from actual industrial documents in the +semiconductor manufacturing industry. The study proposes two types of causal +information extraction methods, single-stage sequence tagging (SST) and +multi-stage sequence tagging (MST), and evaluates their performance using +existing documents from a semiconductor manufacturing company, including +presentation slides and FMEA (Failure Mode and Effects Analysis) documents. The +study also investigates the effect of representation learning on downstream +tasks. The presented case study showcases that the proposed MST methods for +extracting causal information from industrial documents are suitable for +practical applications, especially for semi-structured documents such as FMEAs, +with a 93% F1 score. Additionally, MST achieves a 73% F1 score on texts +extracted from presentation slides. Finally, the study highlights the +importance of choosing a language model that is more aligned with the domain +and in-domain fine-tuning. + +
+
+ comment: 17 pages, 2 figures +
+
+
+
+
+ + ☆ Imagine-2-Drive: High-Fidelity World Modeling in CARLA for Autonomous + Vehicles ICRA 2025 + + +
+ In autonomous driving with image based state space, accurate prediction of +future events and modeling diverse behavioral modes are essential for safety +and effective decision-making. World model-based Reinforcement Learning (WMRL) +approaches offer a promising solution by simulating future states from current +state and actions. However, utility of world models is often limited by typical +RL policies being limited to deterministic or single Gaussian distribution. By +failing to capture the full spectrum of possible actions, this reduces their +adaptability in complex, dynamic environments. In this work, we introduce +Imagine-2-Drive, a framework that consists of two components, VISTAPlan, a +high-fidelity world model for accurate future prediction and Diffusion Policy +Actor (DPA), a diffusion based policy to model multi-modal behaviors for +trajectory prediction. We use VISTAPlan to simulate and evaluate trajectories +from DPA and use Denoising Diffusion Policy Optimization (DDPO) to train DPA to +maximize the cumulative sum of rewards over the trajectories. We analyze the +benefits of each component and the framework as a whole in CARLA with standard +driving metrics. As a consequence of our twin novelties- VISTAPlan and DPA, we +significantly outperform the state of the art (SOTA) world models on standard +driving metrics by 15% and 20% on Route Completion and Success Rate +respectively. + +
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Evaluating the role of `Constitutions' for learning from AI feedback NeurIPS 2024 + + +
+ The growing capabilities of large language models (LLMs) have led to their +use as substitutes for human feedback for training and assessing other LLMs. +These methods often rely on `constitutions', written guidelines which a critic +model uses to provide feedback and improve generations. We investigate how the +choice of constitution affects feedback quality by using four different +constitutions to improve patient-centered communication in medical interviews. +In pairwise comparisons conducted by 215 human raters, we found that detailed +constitutions led to better results regarding emotive qualities. However, none +of the constitutions outperformed the baseline in learning more +practically-oriented skills related to information gathering and provision. Our +findings indicate that while detailed constitutions should be prioritised, +there are possible limitations to the effectiveness of AI feedback as a reward +signal in certain areas. + +
+
+ comment: 4 pages, 2 figures. In NeurIPS 2024 Workshop on Language Gamification +
+
+
+
+
+ + ☆ Mitigating Sycophancy in Decoder-Only Transformer Architectures: + Synthetic Data Intervention + + +
+ To address the sycophancy problem caused by reinforcement learning from human +feedback in large language models, this research applies synthetic data +intervention technology to the decoder-only transformer architecture. Based on +the research gaps in the existing literature, the researcher designed an +experimental process to reduce the tendency of models to cater by generating +diversified data, and used GPT4o as an experimental tool for verification. The +experiment used 100 true and false questions, and compared the performance of +the model trained with synthetic data intervention and the original untrained +model on multiple indicators. The results show that the SDI training model +supports the technology in terms of accuracy rate and sycophancy rate and has +significant effectiveness in reducing sycophancy phenomena. Notably, the data +set, experimental process, code and data results have been uploaded to Github, +the link is https://github.com/brucewang123456789/GeniusTrail.git. + +
+
+ comment: This research is also submitted to OpenReview. The main text is 9 + pages (excluding citations), 7 figures, and 1 table +
+
+
+
+
+ + ☆ Causal Time-Series Synchronization for Multi-Dimensional Forecasting + + +
+ The process industry's high expectations for Digital Twins require modeling +approaches that can generalize across tasks and diverse domains with +potentially different data dimensions and distributional shifts, i.e., +Foundational Models. Despite success in natural language processing and +computer vision, transfer learning with (self-) supervised signals for +pre-training general-purpose models is largely unexplored in the context of +Digital Twins in the process industry due to challenges posed by +multi-dimensional time-series data, lagged cause-effect dependencies, complex +causal structures, and varying number of (exogenous) variables. We propose a +novel channel-dependent pre-training strategy that leverages synchronized +cause-effect pairs to overcome these challenges by breaking down the +multi-dimensional time-series data into pairs of cause-effect variables. Our +approach focuses on: (i) identifying highly lagged causal relationships using +data-driven methods, (ii) synchronizing cause-effect pairs to generate training +samples for channel-dependent pre-training, and (iii) evaluating the +effectiveness of this approach in channel-dependent forecasting. Our +experimental results demonstrate significant improvements in forecasting +accuracy and generalization capability compared to traditional training +methods. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Legal Evaluations and Challenges of Large Language Models + + +
+ In this paper, we review legal testing methods based on Large Language Models +(LLMs), using the OPENAI o1 model as a case study to evaluate the performance +of large models in applying legal provisions. We compare current +state-of-the-art LLMs, including open-source, closed-source, and legal-specific +models trained specifically for the legal domain. Systematic tests are +conducted on English and Chinese legal cases, and the results are analyzed in +depth. Through systematic testing of legal cases from common law systems and +China, this paper explores the strengths and weaknesses of LLMs in +understanding and applying legal texts, reasoning through legal issues, and +predicting judgments. The experimental results highlight both the potential and +limitations of LLMs in legal applications, particularly in terms of challenges +related to the interpretation of legal language and the accuracy of legal +reasoning. Finally, the paper provides a comprehensive analysis of the +advantages and disadvantages of various types of models, offering valuable +insights and references for the future application of AI in the legal field. + +
+
+
+
+
+ + ☆ Memorization in Attention-only Transformers AISTATS 2025 + + +
+ Recent research has explored the memorization capacity of multi-head +attention, but these findings are constrained by unrealistic limitations on the +context size. We present a novel proof for language-based Transformers that +extends the current hypothesis to any context size. Our approach improves upon +the state-of-the-art by achieving more effective exact memorization with an +attention layer, while also introducing the concept of approximate memorization +of distributions. Through experimental validation, we demonstrate that our +proposed bounds more accurately reflect the true memorization capacity of +language models, and provide a precise comparison with prior work. + +
+
+ comment: 16 pages, 6 figures, submitted to AISTATS 2025 +
+
+
+
+
+ + ☆ Generative Agent Simulations of 1,000 People + + +
+ The promise of human behavioral simulation--general-purpose computational +agents that replicate human behavior across domains--could enable broad +applications in policymaking and social science. We present a novel agent +architecture that simulates the attitudes and behaviors of 1,052 real +individuals--applying large language models to qualitative interviews about +their lives, then measuring how well these agents replicate the attitudes and +behaviors of the individuals that they represent. The generative agents +replicate participants' responses on the General Social Survey 85% as +accurately as participants replicate their own answers two weeks later, and +perform comparably in predicting personality traits and outcomes in +experimental replications. Our architecture reduces accuracy biases across +racial and ideological groups compared to agents given demographic +descriptions. This work provides a foundation for new tools that can help +investigate individual and collective behavior. + +
+
+
+
+
+ + ☆ Identifying Key Drivers of Heatwaves: A Novel Spatio-Temporal Framework + for Extreme Event Detection + + +
+ Heatwaves (HWs) are extreme atmospheric events that produce significant +societal and environmental impacts. Predicting these extreme events remains +challenging, as their complex interactions with large-scale atmospheric and +climatic variables are difficult to capture with traditional statistical and +dynamical models. This work presents a general method for driver identification +in extreme climate events. A novel framework (STCO-FS) is proposed to identify +key immediate (short-term) HW drivers by combining clustering algorithms with +an ensemble evolutionary algorithm. The framework analyzes spatio-temporal +data, reduces dimensionality by grouping similar geographical nodes for each +variable, and develops driver selection in spatial and temporal domains, +identifying the best time lags between predictive variables and HW occurrences. +The proposed method has been applied to analyze HWs in the Adda river basin in +Italy. The approach effectively identifies significant variables influencing +HWs in this region. This research can potentially enhance our understanding of +HW drivers and predictability. + +
+
+ comment: 28 pages, 10 figures, 4 tables +
+
+
+
+
+ + ☆ Multi-Task Adversarial Variational Autoencoder for Estimating Biological + Brain Age with Multimodal Neuroimaging + + +
+ Despite advances in deep learning for estimating brain age from structural +MRI data, incorporating functional MRI data is challenging due to its complex +structure and the noisy nature of functional connectivity measurements. To +address this, we present the Multitask Adversarial Variational Autoencoder, a +custom deep learning framework designed to improve brain age predictions +through multimodal MRI data integration. This model separates latent variables +into generic and unique codes, isolating shared and modality-specific features. +By integrating multitask learning with sex classification as an additional +task, the model captures sex-specific aging patterns. Evaluated on the OpenBHB +dataset, a large multisite brain MRI collection, the model achieves a mean +absolute error of 2.77 years, outperforming traditional methods. This success +positions M-AVAE as a powerful tool for metaverse-based healthcare applications +in brain age estimation. + +
+
+
+
+
+ + ☆ AI and the Future of Work in Africa White Paper + + +
+ This white paper is the output of a multidisciplinary workshop in Nairobi +(Nov 2023), led by a cross-organisational team including Microsoft Research, +NEPAD, Lelapa AI, and University of Oxford. The workshop brought together +diverse thought-leaders from various sectors and backgrounds to discuss the +implications of Generative AI for the future of work in Africa. Discussions +centred around four key themes: Macroeconomic Impacts; Jobs, Skills and Labour +Markets; Workers' Perspectives; and Africa-Centric AI Platforms. The white paper +provides an overview of the current state and trends of generative AI and its +applications in different domains, as well as the challenges and risks +associated with its adoption and regulation. It represents a diverse set of +perspectives to create a set of insights and recommendations which aim to +encourage debate and collaborative action towards creating a dignified future +of work for everyone across Africa. + +
+
+
+
+
+ + ☆ PFML: Self-Supervised Learning of Time-Series Data Without + Representation Collapse + + +
+ Self-supervised learning (SSL) is a data-driven learning approach that +utilizes the innate structure of the data to guide the learning process. In +contrast to supervised learning, which depends on external labels, SSL utilizes +the inherent characteristics of the data to produce its own supervisory signal. +However, one frequent issue with SSL methods is representation collapse, where +the model outputs a constant input-invariant feature representation. This issue +hinders the potential application of SSL methods to new data modalities, as +trying to avoid representation collapse wastes researchers' time and effort. +This paper introduces a novel SSL algorithm for time-series data called +Prediction of Functionals from Masked Latents (PFML). Instead of predicting +masked input signals or their latent representations directly, PFML operates by +predicting statistical functionals of the input signal corresponding to masked +embeddings, given a sequence of unmasked embeddings. The algorithm is designed +to avoid representation collapse, rendering it straightforwardly applicable to +different time-series data domains, such as novel sensor modalities in clinical +data. We demonstrate the effectiveness of PFML through complex, real-life +classification tasks across three different data modalities: infant posture and +movement classification from multi-sensor inertial measurement unit data, +emotion recognition from speech data, and sleep stage classification from EEG +data. The results show that PFML is superior to a conceptually similar +pre-existing SSL method and competitive against the current state-of-the-art +SSL method, while also being conceptually simpler and without suffering from +representation collapse. + +
+
+
+
+
+ + ☆ Adapting the Biological SSVEP Response to Artificial Neural Networks + + +
+ Neuron importance assessment is crucial for understanding the inner workings +of artificial neural networks (ANNs) and improving their interpretability and +efficiency. This paper introduces a novel approach to neuron significance +assessment inspired by frequency tagging, a technique from neuroscience. By +applying sinusoidal contrast modulation to image inputs and analyzing resulting +neuron activations, this method enables fine-grained analysis of a network's +decision-making processes. Experiments conducted with a convolutional neural +network for image classification reveal notable harmonics and intermodulations +in neuron-specific responses under part-based frequency tagging. These findings +suggest that ANNs exhibit behavior akin to biological brains in tuning to +flickering frequencies, thereby opening avenues for neuron/filter importance +assessment through frequency tagging. The proposed method holds promise for +applications in network pruning, and model interpretability, contributing to +the advancement of explainable artificial intelligence and addressing the lack +of transparency in neural networks. Future research directions include +developing novel loss functions to encourage biologically plausible behavior in +ANNs. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Real-Time AI-Driven People Tracking and Counting Using Overhead Cameras + + +
+ Accurate people counting in smart buildings and intelligent transportation +systems is crucial for energy management, safety protocols, and resource +allocation. This is especially critical during emergencies, where precise +occupant counts are vital for safe evacuation. Existing methods struggle with +large crowds, often losing accuracy with even a few additional people. To +address this limitation, this study proposes a novel approach combining a new +object tracking algorithm, a novel counting algorithm, and a fine-tuned object +detection model. This method achieves 97% accuracy in real-time people counting +with a frame rate of 20-27 FPS on a low-power edge computer. + +
+
+ comment: This paper is accepted to IEEE Region 10 conference (TENCON) 2024 +
+
+
+
+
+ + ☆ Evidential Federated Learning for Skin Lesion Image Classification ICPR 2024 + + +
+ We introduce FedEvPrompt, a federated learning approach that integrates +principles of evidential deep learning, prompt tuning, and knowledge +distillation for distributed skin lesion classification. FedEvPrompt leverages +two sets of prompts: b-prompts (for low-level basic visual knowledge) and +t-prompts (for task-specific knowledge) prepended to frozen pre-trained Vision +Transformer (ViT) models trained in an evidential learning framework to +maximize class evidences. Crucially, knowledge sharing across federation +clients is achieved only through knowledge distillation on attention maps +generated by the local ViT models, ensuring enhanced privacy preservation +compared to traditional parameter or synthetic image sharing methodologies. +FedEvPrompt is optimized within a round-based learning paradigm, where each +round involves training local models followed by attention maps sharing with +all federation clients. Experimental validation conducted in a real distributed +setting, on the ISIC2019 dataset, demonstrates the superior performance of +FedEvPrompt against baseline federated learning algorithms and knowledge +distillation methods, without sharing model parameters. In conclusion, +FedEvPrompt offers a promising approach for federated learning, effectively +addressing challenges such as data heterogeneity, imbalance, privacy +preservation, and knowledge sharing. + +
+
+ comment: Published as a conference paper at ICPR 2024 +
+
+
+
+
+ + ☆ Federated Domain Generalization via Prompt Learning and Aggregation + + +
+ Federated domain generalization (FedDG) aims to improve the global model +generalization in unseen domains by addressing data heterogeneity under +privacy-preserving constraints. A common strategy in existing FedDG studies +involves sharing domain-specific knowledge among clients, such as spectrum +information, class prototypes, and data styles. However, this knowledge is +extracted directly from local client samples, and sharing such sensitive +information poses a potential risk of data leakage, which might not fully meet +the requirements of FedDG. In this paper, we introduce prompt learning to adapt +pre-trained vision-language models (VLMs) in the FedDG scenario, and leverage +locally learned prompts as a more secure bridge to facilitate knowledge +transfer among clients. Specifically, we propose a novel FedDG framework +through Prompt Learning and AggregatioN (PLAN), which comprises two training +stages to collaboratively generate local prompts and global prompts at each +federated round. First, each client performs both text and visual prompt +learning using their own data, with local prompts indirectly synchronized by +regarding the global prompts as a common reference. Second, all domain-specific +local prompts are exchanged among clients and selectively aggregated into the +global prompts using lightweight attention-based aggregators. The global +prompts are finally applied to adapt VLMs to unseen target domains. As our PLAN +framework requires training only a limited number of prompts and lightweight +aggregators, it offers notable advantages in computational and communication +efficiency for FedDG. Extensive experiments demonstrate the superior +generalization ability of PLAN across four benchmark datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ KuaiFormer: Transformer-Based Retrieval at Kuaishou + + +
+ In large-scale content recommendation systems, retrieval serves as the +initial stage in the pipeline, responsible for selecting thousands of candidate +items from billions of options to pass on to ranking modules. Traditionally, +the dominant retrieval method has been Embedding-Based Retrieval (EBR) using a +Deep Neural Network (DNN) dual-tower structure. However, applying transformer +in retrieval tasks has been the focus of recent research, though real-world +industrial deployment still presents significant challenges. In this paper, we +introduce KuaiFormer, a novel transformer-based retrieval framework deployed in +a large-scale content recommendation system. KuaiFormer fundamentally redefines +the retrieval process by shifting from conventional score estimation tasks +(such as click-through rate estimate) to a transformer-driven Next Action +Prediction paradigm. This shift enables more effective real-time interest +acquisition and multi-interest extraction, significantly enhancing retrieval +performance. KuaiFormer has been successfully integrated into Kuaishou App's +short-video recommendation system since May 2024, serving over 400 million +daily active users and resulting in a marked increase in average daily usage +time of Kuaishou users. We provide insights into both the technical and +business aspects of deploying transformer in large-scale recommendation +systems, addressing practical challenges encountered during industrial +implementation. Our findings offer valuable guidance for engineers and +researchers aiming to leverage transformer models to optimize large-scale +content recommendation systems. + +
+
+
+
+
+ + ☆ Towards unearthing neglected climate innovations from scientific + literature using Large Language Models NeurIPS 2024 + + +
+ Climate change poses an urgent global threat, needing the rapid +identification and deployment of innovative solutions. We hypothesise that many +of these solutions already exist within scientific literature but remain +underutilised. To address this gap, this study employs a curated dataset +sourced from OpenAlex, a comprehensive repository of scientific papers. +Utilising Large Language Models (LLMs), such as GPT4-o from OpenAI, we evaluate +title-abstract pairs from scientific papers on seven dimensions, covering +climate change mitigation potential, stage of technological development, and +readiness for deployment. The outputs of the language models are then compared +with human evaluations to assess their effectiveness in identifying promising +yet overlooked climate innovations. Our findings suggest that these LLM-based +models can effectively augment human expertise, uncovering climate solutions +that are potentially impactful but with far greater speed, throughput and +consistency. Here, we focused on UK-based solutions, but the workflow is +region-agnostic. This work contributes to the discovery of neglected +innovations in scientific literature and demonstrates the potential of AI in +enhancing climate action strategies. + +
+
+ comment: 10 pages. Accepted in the LatinX in AI workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ That Chip Has Sailed: A Critique of Unfounded Skepticism Around AI for + Chip Design + + +
+ In 2020, we introduced a deep reinforcement learning method capable of +generating superhuman chip layouts, which we then published in Nature and +open-sourced on GitHub. AlphaChip has inspired an explosion of work on AI for +chip design, and has been deployed in state-of-the-art chips across Alphabet +and extended by external chipmakers. Even so, a non-peer-reviewed invited paper +at ISPD 2023 questioned its performance claims, despite failing to run our +method as described in Nature. For example, it did not pre-train the RL method +(removing its ability to learn from prior experience), used substantially fewer +compute resources (20x fewer RL experience collectors and half as many GPUs), +did not train to convergence (standard practice in machine learning), and +evaluated on test cases that are not representative of modern chips. Recently, +Igor Markov published a meta-analysis of three papers: our peer-reviewed Nature +paper, the non-peer-reviewed ISPD paper, and Markov's own unpublished paper +(though he does not disclose that he co-authored it). Although AlphaChip has +already achieved widespread adoption and impact, we publish this response to +ensure that no one is wrongly discouraged from innovating in this impactful +area. + +
+
+
+
+
+ + ☆ Jal Anveshak: Prediction of fishing zones using fine-tuned LlaMa 2 + + +
+ In recent years, the global and Indian government efforts in monitoring and +collecting data related to the fisheries industry have witnessed significant +advancements. Despite this wealth of data, there exists an untapped potential +for leveraging artificial intelligence based technological systems to benefit +Indian fishermen in coastal areas. To fill this void in the Indian technology +ecosystem, the authors introduce Jal Anveshak. This is an application framework +written in Dart and Flutter that uses a Llama 2 based Large Language Model +fine-tuned on pre-processed and augmented government data related to fishing +yield and availability. Its main purpose is to help Indian fishermen safely get +the maximum yield of fish from coastal areas and to resolve their fishing +related queries in multilingual and multimodal ways. + +
+
+
+
+
+ + ☆ Physics-informed neural networks need a physicist to be accurate: the + case of mass and heat transport in Fischer-Tropsch catalyst particles + + +
+ Physics-Informed Neural Networks (PINNs) have emerged as an influential +technology, merging the swift and automated capabilities of machine learning +with the precision and dependability of simulations grounded in theoretical +physics. PINNs are often employed to solve algebraic or differential equations +to replace some or even all steps of multi-stage computational workflows, +leading to their significant speed-up. However, wide adoption of PINNs is still +hindered by reliability issues, particularly at extreme ends of the input +parameter ranges. In this study, we demonstrate this in the context of a system +of coupled non-linear differential reaction-diffusion and heat transfer +equations related to Fischer-Tropsch synthesis, which are solved by a +finite-difference method with a PINN used in evaluating their source terms. It +is shown that the testing strategies traditionally used to assess the accuracy +of neural networks as function approximators can overlook the peculiarities +which ultimately cause instabilities of the finite-difference solver. We +propose domain knowledge-based modifications to the PINN architecture +ensuring its correct asymptotic behavior. When combined with an improved +numerical scheme employed as an initial guess generator, the proposed +modifications are shown to recover the overall stability of the simulations, +while preserving the speed-up brought by PINN as the workflow component. We +discuss the possible applications of the proposed hybrid transport equation +solver in the context of chemical reactor simulations. + +
+
+
+
+
+ + ☆ Rethinking Normalization Strategies and Convolutional Kernels for + Multimodal Image Fusion + + +
+ Multimodal image fusion (MMIF) aims to integrate information from different +modalities to obtain a comprehensive image, aiding downstream tasks. However, +existing methods tend to prioritize natural image fusion and focus on +information complementarity and network training strategies. They ignore the +essential distinction between natural and medical image fusion and the +influence of underlying components. This paper dissects the significant +differences between the two tasks regarding fusion goals, statistical +properties, and data distribution. Based on this, we rethink the suitability of +the normalization strategy and convolutional kernels for end-to-end +MMIF. Specifically, this paper proposes a mixture of instance normalization and +group normalization to preserve sample independence and reinforce intrinsic +feature correlation. This strategy promotes the potential of enriching feature +maps, thus boosting fusion performance. To this end, we further introduce the +large kernel convolution, effectively expanding receptive fields and enhancing +the preservation of image detail. Moreover, the proposed multipath adaptive +fusion module recalibrates the decoder input with features of various scales +and receptive fields, ensuring the transmission of crucial information. +Extensive experiments demonstrate that our method exhibits state-of-the-art +performance in multiple fusion tasks and significantly improves downstream +applications. The code is available at https://github.com/HeDan-11/LKC-FUNet. + +
+
+
+
+
+ + ☆ VMID: A Multimodal Fusion LLM Framework for Detecting and Identifying + Misinformation of Short Videos + + +
+ Short video platforms have become important channels for news dissemination, +offering a highly engaging and immediate way for users to access current events +and share information. However, these platforms have also emerged as +significant conduits for the rapid spread of misinformation, as fake news and +rumors can leverage the visual appeal and wide reach of short videos to +circulate extensively among audiences. Existing fake news detection methods +mainly rely on single-modal information, such as text or images, or apply only +basic fusion techniques, limiting their ability to handle the complex, +multi-layered information inherent in short videos. To address these +limitations, this paper presents a novel fake news detection method based on +multimodal information, designed to identify misinformation through a +multi-level analysis of video content. This approach effectively utilizes +different modal representations to generate a unified textual description, +which is then fed into a large language model for comprehensive evaluation. The +proposed framework successfully integrates multimodal features within videos, +significantly enhancing the accuracy and reliability of fake news detection. +Experimental results demonstrate that the proposed approach outperforms +existing models in terms of accuracy, robustness, and utilization of multimodal +information, achieving an accuracy of 90.93%, which is significantly higher +than the best baseline model (SV-FEND) at 81.05%. Furthermore, case studies +provide additional evidence of the effectiveness of the approach in accurately +distinguishing between fake news, debunking content, and real incidents, +highlighting its reliability and robustness in real-world applications. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2211.10973 by other authors +
+
+
+
+
+ + ☆ MOT\_FCG++: Enhanced Representation of Motion and Appearance Features + + +
+ The goal of multi-object tracking (MOT) is to detect and track all objects in +a scene across frames, while maintaining a unique identity for each object. +Most existing methods rely on the spatial motion features and appearance +embedding features of the detected objects in consecutive frames. Effectively +and robustly representing the spatial and appearance features of long +trajectories has become a critical factor affecting the performance of MOT. We +propose a novel approach for appearance and spatial feature representation, +improving upon the clustering association method MOT\_FCG. For spatial motion +features, we propose Diagonal Modulated GIoU, which more accurately represents +the relationship between the position and shape of the objects. For appearance +features, we utilize a dynamic appearance representation that incorporates +confidence information, enabling the trajectory appearance features to be more +robust and global. Based on the baseline model MOT\_FCG, we achieved 76.1 HOTA, +80.4 MOTA and 81.3 IDF1 on the MOT17 validation set, and also achieved +competitive performance on the MOT20 and DanceTrack validation sets. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ MicroCrackAttentionNeXt: Advancing Microcrack Detection in Wave Field + Analysis Using Deep Neural Networks through Feature Visualization + + +
+ Micro Crack detection using deep neural networks (DNNs) through an automated +pipeline using wave fields interacting with the damaged areas is highly sought +after. These high-dimensional spatio-temporal crack data are limited, and these +datasets have large dimensions in the temporal domain. The dataset presents a +substantial class imbalance, with crack pixels constituting an average of only +5% of the total pixels per sample. This extreme class imbalance poses a +challenge for deep learning models with the different micro-scale cracks, as +the network can be biased toward predicting the majority class, generally +leading to poor detection accuracy. This study builds upon the previous +benchmark SpAsE-Net, an asymmetric encoder-decoder network for micro-crack +detection. The impact of various activation and loss functions was examined +through feature space visualization using the manifold discovery and analysis +(MDA) algorithm. The optimized architecture and training methodology achieved +an accuracy of 86.85%. + +
+
+
+
+
+ + ☆ DeepMedcast: A Deep Learning Method for Generating Intermediate Weather + Forecasts among Multiple NWP Models + + +
+ Numerical weather prediction (NWP) centers around the world operate a variety +of NWP models, and recent advances in AI-driven NWP models have increased the +availability of diverse NWP outputs. While this expansion holds the potential +to improve forecast accuracy, it also raises a critical challenge of +identifying the most reliable predictions for specific forecast scenarios. +Traditional approaches, such as ensemble or weighted averaging, combine +multiple NWP outputs but often generate unrealistic atmospheric fields, +complicating the production of reliable and consistent forecasts in operational +settings. In this study, we introduce DeepMedcast, a deep learning method that +generates intermediate forecast, or "medcast", between two or more NWP outputs. +Unlike ensemble averaging, DeepMedcast can provide consistent and explainable +medcast without distorting meteorological fields. This paper details the +methodology and case studies of DeepMedcast, discussing its advantages and +potential contributions to operational forecasting. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Graph-based Complexity for Causal Effect by Empirical Plug-in + + +
+ This paper focuses on the computational complexity of computing empirical +plug-in estimates for causal effect queries. Given a causal graph and +observational data, any identifiable causal query can be estimated from an +expression over the observed variables, called the estimand. The estimand can +then be evaluated by plugging in probabilities computed empirically from data. +In contrast to conventional wisdom, which assumes that high dimensional +probabilistic functions will lead to exponential evaluation time of the +estimand, we show that computation can be done efficiently, potentially in time +linear in the data size, depending on the estimand's hypergraph. + In particular, we show that both the treewidth and hypertree width of the +estimand's structure bound the evaluation complexity of the plug-in estimands, +analogous to their role in the complexity of probabilistic inference in +graphical models. Often, the hypertree width provides a more effective bound, +since the empirical distributions are sparse. + +
+
+
+
+
+ + ☆ Orca: Enhancing Role-Playing Abilities of Large Language Models by + Integrating Personality Traits + + +
+ Large language models have catalyzed the development of personalized dialogue +systems, and numerous role-playing conversational agents have emerged. However, +previous research predominantly focused on enhancing the model's capability to +follow instructions by designing character profiles, neglecting the +psychological factors that drive human conversations. In this paper, we propose +Orca, a framework for data processing and training LLMs of custom characters by +integrating personality traits. Orca comprises four stages: (1) Personality +traits inferring, leverage LLMs to infer user's BigFive personality trait +reports and scores. (2) Data Augment, simulate user's profile, background +story, and psychological activities. (3) Dataset construction, +personality-conditioned instruction prompting (PCIP) to stimulate LLMs. (4) +Modeling and Training, personality-conditioned instruction tuning (PTIT and +PSIT), using the generated data to enhance existing open-source LLMs. We +introduce OrcaBench, the first benchmark for evaluating the quality of content +generated by LLMs on social platforms across multiple scales. Our experiments +demonstrate that our proposed model achieves superior performance on this +benchmark, demonstrating its excellence and effectiveness in perceiving +personality traits that significantly improve role-playing abilities. Our code +is available at https://github.com/Aipura/Orca. + +
+
+
+
+
+ + ☆ EyeDiff: text-to-image diffusion model improves rare eye disease + diagnosis + + +
+ The rising prevalence of vision-threatening retinal diseases poses a +significant burden on the global healthcare systems. Deep learning (DL) offers +a promising solution for automatic disease screening but demands substantial +data. Collecting and labeling large volumes of ophthalmic images across various +modalities encounters several real-world challenges, especially for rare +diseases. Here, we introduce EyeDiff, a text-to-image model designed to +generate multimodal ophthalmic images from natural language prompts and +evaluate its applicability in diagnosing common and rare diseases. EyeDiff is +trained on eight large-scale datasets using the advanced latent diffusion +model, covering 14 ophthalmic image modalities and over 80 ocular diseases, and +is adapted to ten multi-country external datasets. The generated images +accurately capture essential lesional characteristics, achieving high alignment +with text prompts as evaluated by objective metrics and human experts. +Furthermore, integrating generated images significantly enhances the accuracy +of detecting minority classes and rare eye diseases, surpassing traditional +oversampling methods in addressing data imbalance. EyeDiff effectively tackles +the issue of data imbalance and insufficiency typically encountered in rare +diseases and addresses the challenges of collecting large-scale annotated +images, offering a transformative solution to enhance the development of +expert-level diseases diagnosis models in ophthalmic field. + +
+
+ comment: 28 pages, 2 figures +
+
+
+
+
+ + ☆ DuSEGO: Dual Second-order Equivariant Graph Ordinary Differential + Equation + + +
+ Graph Neural Networks (GNNs) with equivariant properties have achieved +significant success in modeling complex dynamic systems and molecular +properties. However, their expressiveness ability is limited by: (1) Existing +methods often overlook the over-smoothing issue caused by traditional GNN +models, as well as the gradient explosion or vanishing problems in deep GNNs. +(2) Most models operate on first-order information, neglecting that the real +world often consists of second-order systems, which further limits the model's +representation capabilities. To address these issues, we propose the +Dual Second-order Equivariant Graph +Ordinary Differential Equation (DuSEGO) for equivariant +representation. Specifically, DuSEGO applies the dual second-order equivariant +graph ordinary differential equations (Graph ODEs) on graph embeddings and node +coordinates, simultaneously. Theoretically, we first prove that DuSEGO +maintains the equivariant property. Furthermore, we provide theoretical +insights showing that DuSEGO effectively alleviates the over-smoothing +problem in both feature representation and coordinate update. Additionally, we +demonstrate that the proposed DuSEGO mitigates the exploding and vanishing +gradients problem, facilitating the training of deep multi-layer GNNs. +Extensive experiments on benchmark datasets validate the superiority of the +proposed DuSEGO compared to baselines. + +
+
+
+
+
+ + ☆ Building 6G Radio Foundation Models with Transformer Architectures + + +
+ Foundation deep learning (DL) models are general models, designed to learn +general, robust and adaptable representations of their target modality, +enabling finetuning across a range of downstream tasks. These models are +pretrained on large, unlabeled datasets using self-supervised learning (SSL). +Foundation models have demonstrated better generalization than traditional +supervised approaches, a critical requirement for wireless communications where +the dynamic environment demands model adaptability. In this work, we propose +and demonstrate the effectiveness of a Vision Transformer (ViT) as a radio +foundation model for spectrogram learning. We introduce a Masked Spectrogram +Modeling (MSM) approach to pretrain the ViT in a self-supervised fashion. We +evaluate the ViT-based foundation model on two downstream tasks: Channel State +Information (CSI)-based Human Activity sensing and Spectrogram Segmentation. +Experimental results demonstrate competitive performance to supervised training +while generalizing across diverse domains. Notably, the pretrained ViT model +outperforms a four-times larger model that is trained from scratch on the +spectrogram segmentation task, while requiring significantly less training +time, and achieves competitive performance on the CSI-based human activity +sensing task. This work demonstrates the effectiveness of ViT with MSM for +pretraining as a promising technique for scalable foundation model development +in future 6G networks. + +
+
+
+
+
+ + ☆ Unlocking Transfer Learning for Open-World Few-Shot Recognition + + +
+ Few-Shot Open-Set Recognition (FSOSR) targets a critical real-world +challenge, aiming to categorize inputs into known categories, termed closed-set +classes, while identifying open-set inputs that fall outside these classes. +Although transfer learning where a model is tuned to a given few-shot task has +become a prominent paradigm in closed-world, we observe that it fails to expand +to open-world. To unlock this challenge, we propose a two-stage method which +consists of open-set aware meta-learning with open-set free transfer learning. +In the open-set aware meta-learning stage, a model is trained to establish a +metric space that serves as a beneficial starting point for the subsequent +stage. During the open-set free transfer learning stage, the model is further +adapted to a specific target task through transfer learning. Additionally, we +introduce a strategy to simulate open-set examples by modifying the training +dataset or generating pseudo open-set examples. The proposed method achieves +state-of-the-art performance on two widely recognized benchmarks, miniImageNet +and tieredImageNet, with only a 1.5% increase in training effort. Our work +demonstrates the effectiveness of transfer learning in FSOSR. + +
+
+
+
+
+ + ☆ Large Language Models as User-Agents for Evaluating + Task-Oriented-Dialogue Systems + + +
+ Traditionally, offline datasets have been used to evaluate task-oriented +dialogue (TOD) models. These datasets lack context awareness, making them +suboptimal benchmarks for conversational systems. In contrast, user-agents, +which are context-aware, can simulate the variability and unpredictability of +human conversations, making them better alternatives as evaluators. Prior +research has utilized large language models (LLMs) to develop user-agents. Our +work builds upon this by using LLMs to create user-agents for the evaluation of +TOD systems. This involves prompting an LLM, using in-context examples as +guidance, and tracking the user-goal state. Our evaluation of diversity and +task completion metrics for the user-agents shows improved performance with the +use of better prompts. Additionally, we propose methodologies for the automatic +evaluation of TOD models within this dynamic framework. + +
+
+
+
+
+ + ☆ Steering AI-Driven Personalization of Scientific Text for General + Audiences + + +
+ Digital media platforms (e.g., social media, science blogs) offer +opportunities to communicate scientific content to general audiences at scale. +However, these audiences vary in their scientific expertise, literacy levels, +and personal backgrounds, making effective science communication challenging. +To address this challenge, we designed TranSlider, an AI-powered tool that +generates personalized translations of scientific text based on individual user +profiles (e.g., hobbies, location, and education). Our tool features an +interactive slider that allows users to steer the degree of personalization +from 0 (weakly relatable) to 100 (strongly relatable), leveraging LLMs to +generate the translations with given degrees. Through an exploratory study with +15 participants, we investigated both the utility of these AI-personalized +translations and how interactive reading features influenced users' +understanding and reading experiences. We found that participants who preferred +higher degrees of personalization appreciated the relatable and contextual +translations, while those who preferred lower degrees valued concise +translations with subtle contextualization. Furthermore, participants reported +the compounding effect of multiple translations on their understanding of +scientific content. Given these findings, we discuss several implications of +AI-personalized translation tools in facilitating communication in +collaborative contexts. + +
+
+ comment: 23 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Seeing Clearly by Layer Two: Enhancing Attention Heads to Alleviate + Hallucination in LVLMs + + +
+ The hallucination problem in multimodal large language models (MLLMs) remains +a common issue. Although image tokens occupy a majority of the input sequence +of MLLMs, there is limited research to explore the relationship between image +tokens and hallucinations. In this paper, we analyze the distribution of +attention scores for image tokens across each layer and head of the model, +revealing an intriguing and common phenomenon: most hallucinations are closely +linked to the pattern of attention sinks in the self-attention matrix of image +tokens, where shallow layers exhibit dense attention sinks and deeper layers +show sparse attention sinks. We further analyze the attention heads of +different layers and find that heads with high-density attention sink in the +image part play a positive role in alleviating hallucinations. In this paper, +we propose a training-free method named Enhancing +Attention Heads (EAH), an +approach designed to enhance the convergence of image tokens attention sinks in +the shallow layers. EAH identifies the attention head that shows the vision +sink in a shallow layer and extracts its attention matrix. This attention map +is then broadcast to other heads in the layer, thereby strengthening the layer +to pay more attention to the image itself. With extensive experiments, EAH +shows significant hallucination-mitigating performance on different MLLMs and +metrics, proving its effectiveness and generality. + +
+
+
+
+
+ + ☆ Instruction-Guided Editing Controls for Images and Multimedia: A Survey + in LLM era + + +
+ The rapid advancement of large language models (LLMs) and multimodal learning +has transformed digital content creation and manipulation. Traditional visual +editing tools require significant expertise, limiting accessibility. Recent +strides in instruction-based editing have enabled intuitive interaction with +visual content, using natural language as a bridge between user intent and +complex editing operations. This survey provides an overview of these +techniques, focusing on how LLMs and multimodal models empower users to achieve +precise visual modifications without deep technical knowledge. By synthesizing +over 100 publications, we explore methods from generative adversarial networks +to diffusion models, examining multimodal integration for fine-grained content +control. We discuss practical applications across domains such as fashion, 3D +scene manipulation, and video synthesis, highlighting increased accessibility +and alignment with human intuition. Our survey compares existing literature, +emphasizing LLM-empowered editing, and identifies key challenges to stimulate +further research. We aim to democratize powerful visual editing across various +industries, from entertainment to education. Interested readers are encouraged +to access our repository at +https://github.com/tamlhp/awesome-instruction-editing. + +
+
+
+
+
+ + ☆ GGAvatar: Reconstructing Garment-Separated 3D Gaussian Splatting Avatars + from Monocular Video + + +
+ Avatar modelling has broad applications in human animation and virtual +try-ons. Recent advancements in this field have focused on high-quality and +comprehensive human reconstruction but often overlook the separation of +clothing from the body. To bridge this gap, this paper introduces GGAvatar +(Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular +videos. Through advanced parameterized templates and unique phased training, +this model effectively achieves decoupled, editable, and realistic +reconstruction of clothed humans. Comparative evaluations with other costly +models confirm GGAvatar's superior quality and efficiency in modelling both +clothed humans and separable garments. The paper also showcases applications in +clothing editing, as illustrated in Figure 1, highlighting the model's benefits +and the advantages of effective disentanglement. The code is available at +https://github.com/J-X-Chen/GGAvatar/. + +
+
+ comment: MMAsia'24 Accepted +
+
+
+
+
+ + ☆ TEESlice: Protecting Sensitive Neural Network Models in Trusted + Execution Environments When Attackers have Pre-Trained Models + + +
+ Trusted Execution Environments (TEE) are used to safeguard on-device models. +However, directly employing TEEs to secure the entire DNN model is challenging +due to the limited computational speed. Utilizing GPU can accelerate DNN's +computation speed but commercial widely-available GPUs usually lack security +protection. To this end, scholars introduce TSDP, a method that protects +privacy-sensitive weights within TEEs and offloads insensitive weights to GPUs. +Nevertheless, current methods do not consider the presence of a knowledgeable +adversary who can access abundant publicly available pre-trained models and +datasets. This paper investigates the security of existing methods against such +a knowledgeable adversary and reveals their inability to fulfill their security +promises. Consequently, we introduce a novel partition before training +strategy, which effectively separates privacy-sensitive weights from other +components of the model. Our evaluation demonstrates that our approach can +offer full model protection with a computational cost reduced by a factor of +10. In addition to traditional CNN models, we also demonstrate the scalability +to large language models. Our approach can compress the private functionalities +of the large language model to lightweight slices and achieve the same level of +protection as the shielding-whole-model baseline. + +
+
+ comment: Accepted by TOSEM. Extended version of the S&P24 paper + (arXiv:2310.07152) +
+
+
+
+
+ + ☆ JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by + Evolutionary Optimization of Model Merging NeurIPS'24 + + +
+ With the rapid advancement of large language models (LLMs), foundational +models (FMs) have seen significant advancements. Healthcare is one of the most +crucial application areas for these FMs, given the significant time and effort +required for physicians to analyze large volumes of patient data. Recent +efforts have focused on adapting multimodal FMs to the medical domain through +techniques like instruction-tuning, leading to the development of medical +foundation models (MFMs). However, these approaches typically require large +amounts of training data to effectively adapt models to the medical field. +Moreover, most existing models are trained on English datasets, limiting their +practicality in non-English-speaking regions where healthcare professionals and +patients are not always fluent in English. The need for translation introduces +additional costs and inefficiencies. To address these challenges, we propose a +Japanese Radiology report generation model enhanced by +Evolutionary optimization of model merging (JRadiEvo). This is the +first attempt to extend a non-medical vision-language foundation model to the +medical domain through evolutionary optimization of model merging. We +successfully created a model that generates accurate Japanese reports from +X-ray images using only 50 translated samples from publicly available data. +This model, developed with highly efficient use of limited data, outperformed +leading models from recent research trained on much larger datasets. +Additionally, with only 8 billion parameters, this relatively compact +foundation model can be deployed locally within hospitals, making it a +practical solution for environments where APIs and other external services +cannot be used due to strict privacy and security requirements. + +
+
+ comment: Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical + Foundation Models: Explainability, Robustness, Security, and Beyond +
+
+
+
+
+ + ☆ Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at + Pixel Level + + +
+ In this paper, we introduce Motion-Grounded Video Reasoning, a new motion +understanding task that requires generating visual answers (video segmentation +masks) according to the input question, and hence needs implicit spatiotemporal +reasoning and grounding. This task extends existing spatiotemporal grounding +work focusing on explicit action/motion grounding, to a more general format by +enabling implicit reasoning via questions. To facilitate the development of the +new task, we collect a large-scale dataset called GROUNDMORE, which comprises +1,715 video clips, 249K object masks that are deliberately designed with 4 +question types (Causal, Sequential, Counterfactual, and Descriptive) for +benchmarking deep and comprehensive motion reasoning abilities. GROUNDMORE +uniquely requires models to generate visual answers, providing a more concrete +and visually interpretable response than plain texts. It evaluates models on +both spatiotemporal grounding and reasoning, helping to address complex +challenges in motion-related video reasoning, temporal perception, and +pixel-level understanding. Furthermore, we introduce a novel baseline model +named Motion-Grounded Video Reasoning Assistant (MORA). MORA incorporates the +multimodal reasoning ability from the Multimodal LLM, the pixel-level +perception capability from the grounding model (SAM), and the temporal +perception ability from a lightweight localization head. MORA achieves +respectable performance on GROUNDMORE outperforming the best existing visual +grounding baseline model by an average of 21.5% relatively. We hope this novel +and challenging task will pave the way for future advancements in robust and +general motion understanding via video reasoning segmentation. + +
+
+
+
+
+ + ☆ AMXFP4: Taming Activation Outliers with Asymmetric Microscaling + Floating-Point for 4-bit LLM Inference + + +
+ Scaling Large Language Models (LLMs) with extended context lengths has +increased the need for efficient low-bit quantization to manage their +substantial computational demands. However, reducing precision to 4 bits +frequently degrades performance due to activation outliers. To address this, we +propose Asymmetric Microscaling 4-bit Floating-Point (AMXFP4) for efficient LLM +inference. This novel data format leverages asymmetric shared scales to +mitigate outliers while naturally capturing the asymmetry introduced by +group-wise quantization. Unlike conventional 4-bit quantization methods that +rely on data rotation and costly calibration, AMXFP4 uses asymmetric shared +scales for direct 4-bit casting, achieving near-ideal quantization accuracy +across various LLM tasks, including multi-turn conversations, long-context +reasoning, and visual question answering. Our AMXFP4 format significantly +outperforms MXFP4 and other leading quantization techniques, enabling robust, +calibration-free 4-bit inference. + +
+
+
+
+
+ + ☆ Statistical Analysis of Policy Space Compression Problem + + +
+ Policy search methods are crucial in reinforcement learning, offering a +framework to address continuous state-action and partially observable problems. +However, the complexity of exploring vast policy spaces can lead to significant +inefficiencies. Reducing the policy space through policy compression emerges as +a powerful, reward-free approach to accelerate the learning process. This +technique condenses the policy space into a smaller, representative set while +maintaining most of the original effectiveness. Our research focuses on +determining the necessary sample size to learn this compressed set accurately. +We employ Rényi divergence to measure the similarity between true and +estimated policy distributions, establishing error bounds for good +approximations. To simplify the analysis, we employ the $l_1$ norm, determining +sample size requirements for both model-based and model-free settings. Finally, +we correlate the error bounds from the $l_1$ norm with those from Rényi +divergence, distinguishing between policies near the vertices and those in the +middle of the policy space, to determine the lower and upper bounds for the +required sample sizes. + +
+
+
+
+
+ + ☆ Off-Dynamics Reinforcement Learning via Domain Adaptation and Reward + Augmented Imitation + + +
+ Training a policy in a source domain for deployment in the target domain +under a dynamics shift can be challenging, often resulting in performance +degradation. Previous work tackles this challenge by training on the source +domain with modified rewards derived by matching distributions between the +source and the target optimal trajectories. However, pure modified rewards only +ensure the behavior of the learned policy in the source domain resembles +trajectories produced by the target optimal policies, which does not guarantee +optimal performance when the learned policy is actually deployed to the target +domain. In this work, we propose to utilize imitation learning to transfer the +policy learned from the reward modification to the target domain so that the +new policy can generate the same trajectories in the target domain. Our +approach, Domain Adaptation and Reward Augmented Imitation Learning (DARAIL), +utilizes the reward modification for domain adaptation and follows the general +framework of generative adversarial imitation learning from observation (GAIfO) +by applying a reward augmented estimator for the policy optimization step. +Theoretically, we present an error bound for our method under a mild assumption +regarding the dynamics shift to justify the motivation of our method. +Empirically, our method outperforms the pure modified reward method without +imitation learning and also outperforms other baselines in benchmark +off-dynamics environments. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ☆ A Hybrid Artificial Intelligence System for Automated EEG Background + Analysis and Report Generation + + +
+ Electroencephalography (EEG) plays a crucial role in the diagnosis of various +neurological disorders. However, small hospitals and clinics often lack +advanced EEG signal analysis systems and are prone to misinterpretation in +manual EEG reading. This study proposes an innovative hybrid artificial +intelligence (AI) system for automatic interpretation of EEG background +activity and report generation. The system combines deep learning models for +posterior dominant rhythm (PDR) prediction, unsupervised artifact removal, and +expert-designed algorithms for abnormality detection. For PDR prediction, 1530 +labeled EEGs were used, and the best ensemble model achieved a mean absolute +error (MAE) of 0.237, a root mean square error (RMSE) of 0.359, an accuracy of +91.8% within a 0.6Hz error, and an accuracy of 99% within a 1.2Hz error. The AI +system significantly outperformed neurologists in detecting generalized +background slowing (p = 0.02; F1: AI 0.93, neurologists 0.82) and demonstrated +improved focal abnormality detection, although not statistically significant (p += 0.79; F1: AI 0.71, neurologists 0.55). Validation on both an internal dataset +and the Temple University Abnormal EEG Corpus showed consistent performance +(F1: 0.884 and 0.835, respectively; p = 0.66), demonstrating generalizability. +The use of large language models (LLMs) for report generation demonstrated 100% +accuracy, verified by three other independent LLMs. This hybrid AI system +provides an easily scalable and accurate solution for EEG interpretation in +resource-limited settings, assisting neurologists in improving diagnostic +accuracy and reducing misdiagnosis rates. + +
+
+ comment: Example code available at https://github.com/tcs211/AI_EEEG_REPORT +
+
+
+
+
+ + ☆ InterFormer: Towards Effective Heterogeneous Interaction Learning for + Click-Through Rate Prediction + + +
+ Click-through rate (CTR) prediction, which predicts the probability of a user +clicking an ad, is a fundamental task in recommender systems. The emergence of +heterogeneous information, such as user profile and behavior sequences, depicts +user interests from different aspects. A mutually beneficial integration of +heterogeneous information is the cornerstone towards the success of CTR +prediction. However, most of the existing methods suffer from two fundamental +limitations, including (1) insufficient inter-mode interaction due to the +unidirectional information flow between modes, and (2) aggressive information +aggregation caused by early summarization, resulting in excessive information +loss. To address the above limitations, we propose a novel module named +InterFormer to learn heterogeneous information interaction in an interleaving +style. To achieve better interaction learning, InterFormer enables +bidirectional information flow for mutually beneficial learning across +different modes. To avoid aggressive information aggregation, we retain +complete information in each data mode and use a separate bridging arch for +effective information selection and summarization. Our proposed InterFormer +achieves state-of-the-art performance on three public datasets and a +large-scale industrial dataset. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Enhancing Diffusion Posterior Sampling for Inverse Problems by + Integrating Crafted Measurements + + +
+ Diffusion models have emerged as a powerful foundation model for visual +generation. With an appropriate sampling process, it can effectively serve as a +generative prior to solve general inverse problems. Current posterior sampling +based methods take the measurement (i.e., degraded image sample) into the +posterior sampling to infer the distribution of the target data (i.e., clean +image sample). However, in this manner, we show that high-frequency information +can be prematurely introduced during the early stages, which could induce +larger posterior estimate errors during the restoration sampling. To address +this issue, we first reveal that forming the log posterior gradient with the +noisy measurement (i.e., samples from a diffusion forward process) instead of +the clean one can benefit the reverse process. Consequently, we propose a novel +diffusion posterior sampling method DPS-CM, which incorporates a Crafted +Measurement (i.e., samples generated by a reverse denoising process, compared +to random sampling with noise in standard methods) to form the posterior +estimate. This integration aims to mitigate the misalignment with the diffusion +prior caused by cumulative posterior estimate errors. Experimental results +demonstrate that our approach significantly improves the overall capacity to +solve general and noisy inverse problems, such as Gaussian deblurring, +super-resolution, inpainting, nonlinear deblurring, and tasks with Poisson +noise, relative to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Temporal Patterns of Multiple Long-Term Conditions in Individuals with + Intellectual Disability Living in Wales: An Unsupervised Clustering Approach + to Disease Trajectories + + +
+ Identifying and understanding the co-occurrence of multiple long-term +conditions (MLTC) in individuals with intellectual disabilities (ID) is vital +for effective healthcare management. These individuals often face earlier onset +and higher prevalence of MLTCs, yet specific co-occurrence patterns remain +unexplored. This study applies an unsupervised approach to characterise MLTC +clusters based on shared disease trajectories using electronic health records +(EHRs) from 13069 individuals with ID in Wales (2000-2021). Disease +associations and temporal directionality were assessed, followed by spectral +clustering to group shared trajectories. The population consisted of 52.3% +males and 47.7% females, with an average of 4.5 conditions per patient. Males +under 45 formed a single cluster dominated by neurological conditions (32.4%), +while males above 45 had three clusters, the largest characterised by circulatory +conditions (51.8%). Females under 45 formed one cluster with digestive conditions (24.6%) +as most prevalent, while those aged 45 and older showed two clusters: one +dominated by circulatory (34.1%), and the other by digestive (25.9%) and +musculoskeletal (21.9%) system conditions. Mental illness, epilepsy, and reflux +were common across groups. These clusters offer insights into disease +progression in individuals with ID, informing targeted interventions and +personalised healthcare strategies. + +
+
+
+
+
+ + ♻ ☆ Large Language Model-Based Interpretable Machine Learning Control in + Building Energy Systems + + +
+ The potential of Machine Learning Control (MLC) in HVAC systems is hindered +by its opaque nature and inference mechanisms, which is challenging for users +and modelers to fully comprehend, ultimately leading to a lack of trust in +MLC-based decision-making. To address this challenge, this paper investigates +and explores Interpretable Machine Learning (IML), a branch of Machine Learning +(ML) that enhances transparency and understanding of models and their +inferences, to improve the credibility of MLC and its industrial application in +HVAC systems. Specifically, we developed an innovative framework that combines +the principles of Shapley values and the in-context learning feature of Large +Language Models (LLMs). While the Shapley values are instrumental in dissecting +the contributions of various features in ML models, LLM provides an in-depth +understanding of the non-data-driven or rule-based elements in MLC; combining +them, LLM further packages these insights into a coherent, human-understandable +narrative. The paper presents a case study to demonstrate the feasibility of +the developed IML framework for model predictive control-based precooling under +demand response events in a virtual testbed. The results indicate that the +developed framework generates and explains the control signals in accordance +with the rule-based rationale. + +
+
+
+
+
+ + ♻ ☆ Advancing Building Energy Modeling with Large Language Models: + Exploration and Case Studies + + +
+ The rapid progression in artificial intelligence has facilitated the +emergence of large language models like ChatGPT, offering potential +applications extending into specialized engineering modeling, especially +physics-based building energy modeling. This paper investigates the innovative +integration of large language models with building energy modeling software, +focusing specifically on the fusion of ChatGPT with EnergyPlus. A literature +review is first conducted to reveal a growing trend of incorporating large +language models in engineering modeling, albeit limited research on their +application in building energy modeling. We underscore the potential of large +language models in addressing building energy modeling challenges and outline +potential applications including simulation input generation, simulation output +analysis and visualization, conducting error analysis, co-simulation, +simulation knowledge extraction and training, and simulation optimization. +Three case studies reveal the transformative potential of large language models +in automating and optimizing building energy modeling tasks, underscoring the +pivotal role of artificial intelligence in advancing sustainable building +practices and energy efficiency. The case studies demonstrate that selecting +the right large language model techniques is essential to enhance performance +and reduce engineering efforts. The findings advocate a multidisciplinary +approach in future artificial intelligence research, with implications +extending beyond building energy modeling to other specialized engineering +modeling. + +
+
+
+
+
+ + ♻ ☆ KPC-cF: Aspect-Based Sentiment Analysis via Implicit-Feature Alignment + with Corpus Filtering ICML 2024 + + +
+ Investigations into Aspect-Based Sentiment Analysis (ABSA) for Korean +industrial reviews are notably lacking in the existing literature. Our research +proposes an intuitive and effective framework for ABSA in low-resource +languages such as Korean. It optimizes prediction labels by integrating +translated benchmark and unlabeled Korean data. Using a model fine-tuned on +translated data, we pseudo-labeled the actual Korean NLI set. Subsequently, we +applied LaBSE and \MSP{}-based filtering to this pseudo-NLI set as implicit +feature, enhancing Aspect Category Detection and Polarity determination through +additional training. Incorporating dual filtering, this model bridged dataset +gaps, achieving positive results in Korean ABSA with minimal resources. Through +additional data injection pipelines, our approach aims to utilize high-resource +data and construct effective models within communities, whether corporate or +individual, in low-resource language countries. Compared to English ABSA, our +framework showed an approximately 3% difference in F1 scores and accuracy. We +release the dataset and our code for Korean ABSA, at this link. + +
+
+ comment: Work in Progress, DMLR@ICML 2024 +
+
+
+
+
+ + ♻ ☆ Exploring GPU-to-GPU Communication: Insights into Supercomputer + Interconnects + + +
+ Multi-GPU nodes are increasingly common in the rapidly evolving landscape of +exascale supercomputers. On these systems, GPUs on the same node are connected +through dedicated networks, with bandwidths up to a few terabits per second. +However, gauging performance expectations and maximizing system efficiency is +challenging due to different technologies, design options, and software layers. +This paper comprehensively characterizes three supercomputers - Alps, Leonardo, +and LUMI - each with a unique architecture and design. We focus on performance +evaluation of intra-node and inter-node interconnects on up to 4096 GPUs, using +a mix of intra-node and inter-node benchmarks. By analyzing its limitations and +opportunities, we aim to offer practical guidance to researchers, system +architects, and software developers dealing with multi-GPU supercomputing. Our +results show that there is untapped bandwidth, and there are still many +opportunities for optimization, ranging from network to software optimization. + +
+
+
+
+
+ + ♻ ☆ Risk Sources and Risk Management Measures in Support of Standards for + General-Purpose AI Systems + + +
+ There is an urgent need to identify both short and long-term risks from newly +emerging types of Artificial Intelligence (AI), as well as available risk +management measures. In response, and to support global efforts in regulating +AI and writing safety standards, we compile an extensive catalog of risk +sources and risk management measures for general-purpose AI (GPAI) systems, +complete with descriptions and supporting examples where relevant. This work +involves identifying technical, operational, and societal risks across model +development, training, and deployment stages, as well as surveying established +and experimental methods for managing these risks. To the best of our +knowledge, this paper is the first of its kind to provide extensive +documentation of both GPAI risk sources and risk management measures that are +descriptive, self-contained and neutral with respect to any existing regulatory +framework. This work intends to help AI providers, standards experts, +researchers, policymakers, and regulators in identifying and mitigating +systemic risks from GPAI systems. For this reason, the catalog is released +under a public domain license for ease of direct use by stakeholders in AI +governance and standards. + +
+
+ comment: 92 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Mitigating the Linguistic Gap with Phonemic Representations for Robust + Cross-lingual Transfer EMNLP 2024 + + +
+ Approaches to improving multilingual language understanding often struggle +with significant performance gaps between high-resource and low-resource +languages. While there are efforts to align the languages in a single latent +space to mitigate such gaps, how different input-level representations +influence such gaps has not been investigated, particularly with phonemic +inputs. We hypothesize that the performance gaps are affected by representation +discrepancies between these languages, and revisit the use of phonemic +representations as a means to mitigate these discrepancies. To demonstrate the +effectiveness of phonemic representations, we present experiments on three +representative cross-lingual tasks on 12 languages in total. The results show +that phonemic representations exhibit higher similarities between languages +compared to orthographic representations, and it consistently outperforms +grapheme-based baseline model on languages that are relatively low-resourced. +We present quantitative evidence from three cross-lingual tasks that +demonstrate the effectiveness of phonemic representations, and it is further +justified by a theoretical analysis of the cross-lingual performance gap. + +
+
+ comment: Accepted to the 4th Multilingual Representation Learning (MRL) + Workshop (co-located with EMNLP 2024) +
+
+
+
+
+ + ♻ ☆ Provocation: Who benefits from "inclusion" in Generative AI? NeurIPS 2024 + + +
+ The demands for accurate and representative generative AI systems mean there +is an increased demand on participatory evaluation structures. While these +participatory structures are paramount to ensure non-dominant values, +knowledge and material culture are also reflected in AI models and the media +they generate, we argue that dominant structures of community participation in +AI development and evaluation are not explicit enough about the benefits and +harms that members of socially marginalized groups may experience as a result +of their participation. Without explicit interrogation of these benefits by AI +developers, as a community we may remain blind to the immensity of systemic +change that is needed as well. To support this provocation, we present a +speculative case study, developed from our own collective experiences as AI +researchers. We use this speculative context to itemize the barriers that need +to be overcome in order for the proposed benefits to marginalized communities +to be realized, and harms mitigated. + +
+
+ comment: 3 pages, 1 figure. Published as a Short Paper in the NeurIPS 2024 + Workshop on Evaluating Evaluations: Examining Best Practices for Measuring + Broader Impacts of Generative AI +
+
+
+
+
+ + ♻ ☆ CE-SSL: Computation-Efficient Semi-Supervised Learning for ECG-based + Cardiovascular Diseases Detection + + +
+ The label scarcity problem is the main challenge that hinders the wide +application of deep learning systems in automatic cardiovascular diseases +(CVDs) detection using electrocardiography (ECG). Tuning pre-trained models +alleviates this problem by transferring knowledge learned from large datasets +to downstream small datasets. However, bottlenecks in computational efficiency +and detection performance limit its clinical applications. It is difficult to +improve the detection performance without significantly sacrificing the +computational efficiency during model training. Here, we propose a +computation-efficient semi-supervised learning paradigm (CE-SSL) for robust and +computation-efficient CVDs detection using ECG. It enables a robust adaptation +of pre-trained models on downstream datasets with limited supervision and high +computational efficiency. First, a random-deactivation technique is developed +to achieve robust and fast low-rank adaptation of pre-trained weights. +Subsequently, we propose a one-shot rank allocation module to determine the +optimal ranks for the update matrices of the pre-trained weights. Finally, a +lightweight semi-supervised learning pipeline is introduced to enhance model +performance by leveraging labeled and unlabeled data with high computational +efficiency. Extensive experiments on four downstream datasets demonstrate that +CE-SSL not only outperforms the state-of-the-art methods in multi-label CVDs +detection but also consumes fewer GPU footprints, training time, and parameter +storage space. As such, this paradigm provides an effective solution for +achieving high computational efficiency and robust detection performance in the +clinical applications of pre-trained models under limited supervision. Code and +Supplementary Materials are available at https://github.com/KAZABANA/CE-SSL + +
+
+
+
+
+ + ♻ ☆ Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply + Better Samples NeurIPS 2024 + + +
+ Although diffusion models can generate remarkably high-quality samples, they +are intrinsically bottlenecked by their expensive iterative sampling procedure. +Consistency models (CMs) have recently emerged as a promising diffusion model +distillation method, reducing the cost of sampling by generating high-fidelity +samples in just a few iterations. Consistency model distillation aims to solve +the probability flow ordinary differential equation (ODE) defined by an +existing diffusion model. CMs are not directly trained to minimize error +against an ODE solver, rather they use a more computationally tractable +objective. As a way to study how effectively CMs solve the probability flow +ODE, and the effect that any induced error has on the quality of generated +samples, we introduce Direct CMs, which directly minimize this error. +Intriguingly, we find that Direct CMs reduce the ODE solving error compared to +CMs but also result in significantly worse sample quality, calling into +question why exactly CMs work well in the first place. Full code is available +at: https://github.com/layer6ai-labs/direct-cms. + +
+
+ comment: NeurIPS 2024 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric + Thermal Images + + +
+ Designing egocentric 3D hand pose estimation systems that can perform +reliably in complex, real-world scenarios is crucial for downstream +applications. Previous approaches using RGB or NIR imagery struggle in +challenging conditions: RGB methods are susceptible to lighting variations and +obstructions like handwear, while NIR techniques can be disrupted by sunlight +or interference from other NIR-equipped devices. To address these limitations, +we present ThermoHands, the first benchmark focused on thermal image-based +egocentric 3D hand pose estimation, demonstrating the potential of thermal +imaging to achieve robust performance under these conditions. The benchmark +includes a multi-view and multi-spectral dataset collected from 28 subjects +performing hand-object and hand-virtual interactions under diverse scenarios, +accurately annotated with 3D hand poses through an automated process. We +introduce a new baseline method, TherFormer, utilizing dual transformer modules +for effective egocentric 3D hand pose estimation in thermal imagery. Our +experimental results highlight TherFormer's leading performance and affirm +thermal imaging's effectiveness in enabling robust 3D hand pose estimation in +adverse conditions. + +
+
+ comment: 15 pages, 9 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Unlocking Real-Time Fluorescence Lifetime Imaging: Multi-Pixel + Parallelism for FPGA-Accelerated Processing + + +
+ Fluorescence lifetime imaging (FLI) is a widely used technique in the +biomedical field for measuring the decay times of fluorescent molecules, +providing insights into metabolic states, protein interactions, and +ligand-receptor bindings. However, its broader application in fast biological +processes, such as dynamic activity monitoring, and clinical use, such as in +guided surgery, is limited by long data acquisition times and computationally +demanding data processing. While deep learning has reduced post-processing +times, time-resolved data acquisition remains a bottleneck for real-time +applications. To address this, we propose a method to achieve real-time FLI +using an FPGA-based hardware accelerator. Specifically, we implemented a +GRU-based sequence-to-sequence (Seq2Seq) model on an FPGA board compatible with +time-resolved cameras. The GRU model balances accurate processing with the +resource constraints of FPGAs, which have limited DSP units and BRAM. The +limited memory and computational resources on the FPGA require efficient +scheduling of operations and memory allocation to deploy deep learning models +for low-latency applications. We address these challenges by using STOMP, a +queue-based discrete-event simulator that automates and optimizes task +scheduling and memory management on hardware. By integrating a GRU-based +Seq2Seq model and its compressed version, called Seq2SeqLite, generated through +knowledge distillation, we were able to process multiple pixels in parallel, +reducing latency compared to sequential processing. We explore various levels +of parallelism to achieve an optimal balance between performance and resource +utilization. Our results indicate that the proposed techniques achieved a 17.7x +and 52.0x speedup over manual scheduling for the Seq2Seq model and the +Seq2SeqLite model, respectively. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Disclosure of AI-Generated News Increases Engagement but Does Not Reduce + Aversion, Despite Positive Quality Ratings + + +
+ The advancement of artificial intelligence (AI) has led to its application in +many areas, including news media. The integration of AI in journalism presents +both opportunities and risks for democracy, making it crucial to understand +public reception of and engagement with AI-generated news, as it may directly +influence political knowledge and trust. This preregistered study investigates +(i) the perceived quality of AI-assisted and AI-generated versus +human-generated news articles, (ii) whether disclosure of AI's involvement in +generating these news articles influences engagement with them, and (iii) +whether such awareness affects the willingness to read AI-generated articles in +the future. We employed a between-subjects survey experiment with 599 +participants from the German-speaking part of Switzerland, who evaluated the +credibility, readability, and expertise of news articles. These articles were +either written by journalists (control group), rewritten by AI (AI-assisted +group), or entirely generated by AI (AI-generated group). Our results indicate +that all news articles, regardless of whether they were written by journalists +or AI, were perceived to be of equal quality. When participants in the +treatment groups were subsequently made aware of AI's involvement in generating +the articles, they expressed a higher willingness to engage with (i.e., +continue reading) the articles than participants in the control group. However, +they were not more willing to read AI-generated news in the future. These +results suggest that aversion to AI usage in news media is not primarily rooted +in a perceived lack of quality, and that by disclosing using AI, journalists +could attract more immediate engagement with their content, at least in the +short term. + +
+
+
+
+
+ + ♻ ☆ CLCE: An Approach to Refining Cross-Entropy and Contrastive Learning for + Optimized Learning Fusion + + +
+ State-of-the-art pre-trained image models predominantly adopt a two-stage +approach: initial unsupervised pre-training on large-scale datasets followed by +task-specific fine-tuning using Cross-Entropy loss (CE). However, it has been +demonstrated that CE can compromise model generalization and stability. While +recent works employing contrastive learning address some of these limitations +by enhancing the quality of embeddings and producing better decision +boundaries, they often overlook the importance of hard negative mining and rely +on resource intensive and slow training using large sample batches. To counter +these issues, we introduce a novel approach named CLCE, which integrates +Label-Aware Contrastive Learning with CE. Our approach not only maintains the +strengths of both loss functions but also leverages hard negative mining in a +synergistic way to enhance performance. Experimental results demonstrate that +CLCE significantly outperforms CE in Top-1 accuracy across twelve benchmarks, +achieving gains of up to 3.52% in few-shot learning scenarios and 3.41% in +transfer learning settings with the BEiT-3 model. Importantly, our proposed +CLCE approach effectively mitigates the dependency of contrastive learning on +large batch sizes such as 4096 samples per batch, a limitation that has +previously constrained the application of contrastive learning in +budget-limited hardware environments. + +
+
+
+
+
+ + ♻ ☆ Optimization-based Prompt Injection Attack to LLM-as-a-Judge + + +
+ LLM-as-a-Judge uses a large language model (LLM) to select the best response +from a set of candidates for a given question. LLM-as-a-Judge has many +applications such as LLM-powered search, reinforcement learning with AI +feedback (RLAIF), and tool selection. In this work, we propose JudgeDeceiver, +an optimization-based prompt injection attack to LLM-as-a-Judge. JudgeDeceiver +injects a carefully crafted sequence into an attacker-controlled candidate +response such that LLM-as-a-Judge selects the candidate response for an +attacker-chosen question no matter what other candidate responses are. +Specifically, we formulate finding such a sequence as an optimization problem and +propose a gradient-based method to approximately solve it. Our extensive +evaluation shows that JudgeDeceiver is highly effective, and is much more +effective than existing prompt injection attacks that manually craft the +injected sequences and jailbreak attacks when extended to our problem. We also +show the effectiveness of JudgeDeceiver in three case studies, i.e., +LLM-powered search, RLAIF, and tool selection. Moreover, we consider defenses +including known-answer detection, perplexity detection, and perplexity windowed +detection. Our results show these defenses are insufficient, highlighting the +urgent need for developing new defense strategies. Our implementation is +available at this repository: https://github.com/ShiJiawenwen/JudgeDeceiver. + +
+
+ comment: To appear in the Proceedings of The ACM Conference on Computer and + Communications Security (CCS), 2024 +
+
+
+
+
+ + ♻ ☆ DCD: Discriminative and Consistent Representation Distillation + + +
+ Knowledge Distillation (KD) aims to transfer knowledge from a large teacher +model to a smaller student model. While contrastive learning has shown promise +in self-supervised learning by creating discriminative representations, its +application in knowledge distillation remains limited and focuses primarily on +discrimination, neglecting the structural relationships captured by the teacher +model. To address this limitation, we propose Discriminative and Consistent +Distillation (DCD), which employs a contrastive loss along with a consistency +regularization to minimize the discrepancy between the distributions of teacher +and student representations. Our method introduces learnable temperature and +bias parameters that adapt during training to balance these complementary +objectives, replacing the fixed hyperparameters commonly used in contrastive +learning approaches. Through extensive experiments on CIFAR-100 and ImageNet +ILSVRC-2012, we demonstrate that DCD achieves state-of-the-art performance, +with the student model sometimes surpassing the teacher's accuracy. +Furthermore, we show that DCD's learned representations exhibit superior +cross-dataset generalization when transferred to Tiny ImageNet and STL-10. Code +is available at https://github.com/giakoumoglou/distillers. + +
+
+ comment: 11 pages, 3 figures, 6 tables. The paper's title has been changed, + again +
+
+
+
+
+ + ♻ ☆ Fault Injection and Safe-Error Attack for Extraction of Embedded Neural + Network Models ECAI + + +
+ Model extraction emerges as a critical security threat with attack vectors +exploiting both algorithmic and implementation-based approaches. The main goal +of an attacker is to steal as much information as possible about a protected +victim model, so that he can mimic it with a substitute model, even with a +limited access to similar training data. Recently, physical attacks such as +fault injection have shown worrying efficiency against the integrity and +confidentiality of embedded models. We focus on embedded deep neural network +models on 32-bit microcontrollers, a widespread family of hardware platforms in +IoT, and the use of a standard fault injection strategy - Safe Error Attack +(SEA) - to perform a model extraction attack with an adversary having a limited +access to training data. Since the attack strongly depends on the input +queries, we propose a black-box approach to craft a successful attack set. For +a classical convolutional neural network, we successfully recover at least 90% +of the most significant bits with about 1500 crafted inputs. This information +enables the efficient training of a substitute model, with only 8% of the training +dataset, that reaches high fidelity and a near-identical accuracy level to that of the +victim model. + +
+
+ comment: Accepted at SECAI Workshop, ESORICS 2023 (v2. Fix notations) +
+
+
+
+
+ + ♻ ☆ An Ontology-based Approach Towards Traceable Behavior Specifications in + Automated Driving + + +
+ Vehicles in public traffic that are equipped with Automated Driving Systems +are subject to a number of expectations: Among other aspects, their behavior +should be safe, conforming to the rules of the road and provide mobility to +their users. This poses challenges for the developers of such systems: +Developers are responsible for specifying this behavior, for example, in terms +of requirements at system design time. As we will discuss in the article, this +specification always involves the need for assumptions and trade-offs. As a +result, insufficiencies in such a behavior specification can occur that can +potentially lead to unsafe system behavior. In order to support the +identification of specification insufficiencies, requirements and respective +assumptions need to be made explicit. In this article, we propose the Semantic +Norm Behavior Analysis as an ontology-based approach to specify the behavior +for an Automated Driving System equipped vehicle. We use ontologies to formally +represent specified behavior for a targeted operational environment, and to +establish traceability between specified behavior and the addressed stakeholder +needs. Furthermore, we illustrate the application of the Semantic Norm Behavior +Analysis in a German legal context with two example scenarios and evaluate our +results. Our evaluation shows that the explicit documentation of assumptions in +the behavior specification supports both the identification of specification +insufficiencies and their treatment. Therefore, this article provides +requirements, terminology and an according methodology to facilitate +ontology-based behavior specifications in automated driving. + +
+
+ comment: 24 pages, 12 figures, submitted for publication +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Interpretable Concept-Based Memory Reasoning + + +
+ The lack of transparency in the decision-making processes of deep learning +systems presents a significant challenge in modern artificial intelligence +(AI), as it impairs users' ability to rely on and verify these systems. To +address this challenge, Concept Bottleneck Models (CBMs) have made significant +progress by incorporating human-interpretable concepts into deep learning +architectures. This approach allows predictions to be traced back to specific +concept patterns that users can understand and potentially intervene on. +However, existing CBMs' task predictors are not fully interpretable, preventing +a thorough analysis and any form of formal verification of their +decision-making process prior to deployment, thereby raising significant +reliability concerns. To bridge this gap, we introduce Concept-based Memory +Reasoner (CMR), a novel CBM designed to provide a human-understandable and +provably-verifiable task prediction process. Our approach is to model each task +prediction as a neural selection mechanism over a memory of learnable logic +rules, followed by a symbolic evaluation of the selected rule. The presence of +an explicit memory and the symbolic evaluation allow domain experts to inspect +and formally verify the validity of certain global properties of interest for +the task prediction process. Experimental results demonstrate that CMR achieves +better accuracy-interpretability trade-offs than state-of-the-art CBMs, discovers +logic rules consistent with ground truths, allows for rule interventions, and +allows pre-deployment verification. + +
+
+
+
+
+ + ♻ ☆ FGCE: Feasible Group Counterfactual Explanations for Auditing Fairness + + +
+ This paper introduces the first graph-based framework for generating group +counterfactual explanations to audit model fairness, a crucial aspect of +trustworthy machine learning. Counterfactual explanations are instrumental in +understanding and mitigating unfairness by revealing how inputs should change +to achieve a desired outcome. Our framework, named Feasible Group +Counterfactual Explanations (FGCEs), captures real-world feasibility +constraints and constructs subgroups with similar counterfactuals, setting it +apart from existing methods. It also addresses key trade-offs in counterfactual +generation, including the balance between the number of counterfactuals, their +associated costs, and the breadth of coverage achieved. To evaluate these +trade-offs and assess fairness, we propose measures tailored to group +counterfactual generation. Our experimental results on benchmark datasets +demonstrate the effectiveness of our approach in managing feasibility +constraints and trade-offs, as well as the potential of our proposed metrics in +identifying and quantifying fairness issues. + +
+
+
+
+
+ + ♻ ☆ Adversarial Robustness of VAEs across Intersectional Subgroups + + +
+ Despite advancements in Autoencoders (AEs) for tasks like dimensionality +reduction, representation learning and data generation, they remain vulnerable +to adversarial attacks. Variational Autoencoders (VAEs), with their +probabilistic approach to disentangling latent spaces, show stronger resistance +to such perturbations compared to deterministic AEs; however, their resilience +against adversarial inputs is still a concern. This study evaluates the +robustness of VAEs against non-targeted adversarial attacks by optimizing +minimal sample-specific perturbations to cause maximal damage across diverse +demographic subgroups (combinations of age and gender). We investigate two +questions: whether there are robustness disparities among subgroups, and what +factors contribute to these disparities, such as data scarcity and +representation entanglement. Our findings reveal that robustness disparities +exist but are not always correlated with the size of the subgroup. By using +downstream gender and age classifiers and examining latent embeddings, we +highlight the vulnerability of subgroups like older women, who are prone to +misclassification due to adversarial perturbations pushing their +representations toward those of other subgroups. + +
+
+
+
+
+ + ♻ ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details of one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage +fine-grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads to up to a 2x reduction of time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ♻ ☆ Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast + Computed Tomography Images for Enhanced Treatment and Prognosis MICCAI + + +
+ Stroke is the second leading cause of death worldwide, and is increasingly +prevalent in low- and middle-income countries (LMICs). Timely interventions can +significantly influence stroke survivability and the quality of life after +treatment. However, the standard and most widely available imaging method for +confirming strokes and their sub-types, the NCCT, is more challenging and +time-consuming to employ in cases of ischemic stroke. For this reason, we +developed an automated method for ischemic stroke lesion segmentation in NCCTs +using the nnU-Net framework, aimed at enhancing early treatment and improving +the prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and +Intersection over Union (IoU) scores of 0.501 on the sampled dataset. After +adjusting for outliers, these scores improved to 0.752 for the Dice score and +0.643 for the IoU. Proper delineation of the region of infarction can help +clinicians better assess the potential impact of the infarction, and guide +treatment procedures. + +
+
+ comment: 7 pages, 3 figures, MICCAI Meets Africa Workshop +
+
+
+
+
+ + ♻ ☆ Dockformer: A transformer-based molecular docking paradigm for + large-scale virtual screening + + +
+ Molecular docking enables virtual screening of compound libraries to identify +potential ligands that target proteins of interest, a crucial step in drug +development; however, as the size of the compound library increases, the +computational complexity of traditional docking models increases. Deep learning +algorithms can provide data-driven research and development models to increase +the speed of the docking process. Unfortunately, few models can achieve +superior screening performance compared to that of traditional models. +Therefore, a novel deep learning-based docking approach named Dockformer is +introduced in this study. Dockformer leverages multimodal information to +capture the geometric topology and structural knowledge of molecules and can +directly generate binding conformations with the corresponding confidence +measures in an end-to-end manner. The experimental results show that Dockformer +achieves success rates of 90.53% and 82.71% on the PDBbind core set and +PoseBusters benchmarks, respectively, and more than a 100-fold increase in the +inference process speed, outperforming almost all state-of-the-art docking +methods. In addition, the ability of Dockformer to identify the main protease +inhibitors of coronaviruses is demonstrated in a real-world virtual screening +scenario. Considering its high docking accuracy and screening efficiency, +Dockformer can be regarded as a powerful and robust tool in the field of drug +design. + +
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ DiffLoRA: Generating Personalized Low-Rank Adaptation Weights with + Diffusion + + +
+ Personalized text-to-image generation has gained significant attention for +its capability to generate high-fidelity portraits of specific identities +conditioned on user-defined prompts. Existing methods typically involve +test-time fine-tuning or incorporating an additional pre-trained branch. +However, these approaches struggle to simultaneously address efficiency, +identity fidelity, and the preservation of the model's original generative +capabilities. In this paper, we propose DiffLoRA, an efficient method that +leverages the diffusion model as a hypernetwork to predict personalized +Low-Rank Adaptation (LoRA) weights based on the reference images. By +incorporating these LoRA weights into the off-the-shelf text-to-image model, +DiffLoRA enables zero-shot personalization during inference, eliminating the +need for post-processing optimization. Moreover, we introduce a novel +identity-oriented LoRA weights construction pipeline to facilitate the training +process of DiffLoRA. The dataset generated through this pipeline enables +DiffLoRA to produce consistently high-quality LoRA weights. Notably, the +distinctive properties of the diffusion model enhance the generation of +superior weights by employing probabilistic modeling to capture intricate +structural patterns and thoroughly explore the weight space. Comprehensive +experimental results demonstrate that DiffLoRA outperforms existing +personalization approaches across multiple benchmarks, achieving both time +efficiency and maintaining identity fidelity throughout the personalization +process. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature + Disentanglement and Enhancement + + +
+ Singing voice conversion (SVC) aims to convert a singer's voice to another +singer's from a reference audio while keeping the original semantics. However, +existing SVC methods can hardly perform zero-shot due to incomplete feature +disentanglement or dependence on the speaker look-up table. We propose the +first open-source high-quality zero-shot SVC model SaMoye that can convert +singing to human and non-human timbre. SaMoye disentangles the singing voice's +features into content, timbre, and pitch features, where we combine multiple +ASR models and compress the content features to reduce timbre leaks. Besides, +we enhance the timbre features by unfreezing the speaker encoder and mixing the +speaker embedding with top-3 similar speakers. We also establish an +unparalleled large-scale dataset to guarantee zero-shot performance, which +comprises more than 1,815 hours of pure singing voice and 6,367 speakers. We +conduct objective and subjective experiments to find that SaMoye outperforms +other models in zero-shot SVC tasks even under extreme conditions like +converting singing to animals' timbre. The code and weights of SaMoye are +available on https://github.com/CarlWangChina/SaMoye-SVC. The weights, code, +dataset, and documents of SaMoye are publicly available on +https://github.com/CarlWangChina/SaMoye-SVC. + +
+
+ comment: This paper needs major changes for resubmit +
+
+
+
+
+ + ♻ ☆ VLEU: a Method for Automatic Evaluation for Generalizability of + Text-to-Image Models EMNLP2024 + + +
+ Progress in Text-to-Image (T2I) models has significantly improved the +generation of images from textual descriptions. However, existing evaluation +metrics do not adequately assess the models' ability to handle a diverse range +of textual prompts, which is crucial for their generalizability. To address +this, we introduce a new metric called Visual Language Evaluation Understudy +(VLEU). VLEU uses large language models to sample from the visual text domain, +the set of all possible input texts for T2I models, to generate a wide variety +of prompts. The images generated from these prompts are evaluated based on +their alignment with the input text using the CLIP model. VLEU quantifies a +model's generalizability by computing the Kullback-Leibler divergence between +the marginal distribution of the visual text and the conditional distribution +of the images generated by the model. This metric provides a quantitative way +to compare different T2I models and track improvements during model finetuning. +Our experiments demonstrate the effectiveness of VLEU in evaluating the +generalization capability of various T2I models, positioning it as an essential +metric for future research in text-to-image synthesis. + +
+
+ comment: Accepted by EMNLP 2024 (long paper, main conference) +
+
+
+
+
+ + ♻ ☆ Evaluating and Enhancing Large Language Models for Conversational + Reasoning on Knowledge Graphs + + +
+ The development of large language models (LLMs) has been catalyzed by +advancements in pre-training techniques. These models have demonstrated robust +reasoning capabilities through manually designed prompts. In this work, we +evaluate the conversational reasoning capabilities of the current +state-of-the-art LLM (GPT-4) on knowledge graphs (KGs). However, the +performance of LLMs is constrained due to a lack of KG environment awareness +and the difficulties in developing effective optimization mechanisms for +intermediary reasoning stages. We further introduce LLM-ARK, a LLM grounded KG +reasoning agent designed to deliver precise and adaptable predictions on KG +paths. LLM-ARK leverages Full Textual Environment (FTE) prompt to assimilate +state information within each reasoning step. We reframe the challenge of +multi-hop reasoning on the KG as a sequential decision-making task. Utilizing +the Proximal Policy Optimization (PPO) online policy gradient reinforcement +learning algorithm, our model is optimized to learn from rich reward signals. +Additionally, we conduct an evaluation of our model and GPT-4 on the OpenDialKG +dataset. The experimental results reveal that LLaMA-2-7B-ARK outperforms the +current state-of-the-art model by 5.28 percentage points, with a performance +rate of 36.39% on the target@1 evaluation metric. Meanwhile, GPT-4 scored +14.91%, further demonstrating the effectiveness of our method. Our code is +available on GitHub (https://github.com/Aipura/LLM-ARK) for further access. + +
+
+
+
+
+ + ♻ ☆ MANTIS: Interleaved Multi-Image Instruction Tuning + + +
+ Large multimodal models (LMMs) have shown great results in single-image +vision language tasks. However, their ability to solve multi-image visual +language tasks is yet to be improved. The existing LMMs like OpenFlamingo, +Emu2, and Idefics gain their multi-image ability through pre-training on +hundreds of millions of noisy interleaved image-text data from the web, which +is neither efficient nor effective. In this paper, we aim to build strong +multi-image LMMs via instruction tuning with academic-level resources. +Therefore, we meticulously construct Mantis-Instruct containing 721K +multi-image instruction data to train a family of Mantis models. The +instruction tuning empowers Mantis with different multi-image skills like +co-reference, comparison, reasoning, and temporal understanding. We evaluate +Mantis on 8 multi-image benchmarks and 6 single-image benchmarks. +Mantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and +beat the strongest multi-image baseline, Idefics2-8B by an average of 13 +absolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved +multi-image data, which is 200x larger than Mantis-Instruct. We observe that +Mantis performs equivalently well on the held-in and held-out benchmarks, which +shows its generalization ability. We further evaluate Mantis on single-image +benchmarks and demonstrate that Mantis also maintains a strong single-image +performance on par with CogVLM and Emu2. Our results show that multi-image +abilities are not necessarily gained through massive pre-training, instead, +they can be gained by low-cost instruction tuning. The training and evaluation +of Mantis has paved the road for future work to improve LMMs' multi-image +abilities. + +
+
+ comment: 13 pages, 3 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Confidence-aware Denoised Fine-tuning of Off-the-shelf Models for + Certified Robustness + + +
+ The remarkable advances in deep learning have led to the emergence of many +off-the-shelf classifiers, e.g., large pre-trained models. However, since they +are typically trained on clean data, they remain vulnerable to adversarial +attacks. Despite this vulnerability, their superior performance and +transferability make off-the-shelf classifiers still valuable in practice, +demanding further work to provide adversarial robustness for them in a post-hoc +manner. A recently proposed method, denoised smoothing, leverages a denoiser +model in front of the classifier to obtain provable robustness without +additional training. However, the denoiser often creates hallucination, i.e., +images that have lost the semantics of their originally assigned class, leading +to a drop in robustness. Furthermore, its noise-and-denoise procedure +introduces a significant distribution shift from the original distribution, +causing the denoised smoothing framework to achieve sub-optimal robustness. In +this paper, we introduce Fine-Tuning with Confidence-Aware Denoised Image +Selection (FT-CADIS), a novel fine-tuning scheme to enhance the certified +robustness of off-the-shelf classifiers. FT-CADIS is inspired by the +observation that the confidence of off-the-shelf classifiers can effectively +identify hallucinated images during denoised smoothing. Based on this, we +develop a confidence-aware training objective to handle such hallucinated +images and improve the stability of fine-tuning from denoised images. In this +way, the classifier can be fine-tuned using only images that are beneficial for +adversarial robustness. We also find that such a fine-tuning can be done by +updating a small fraction of parameters of the classifier. Extensive +experiments demonstrate that FT-CADIS has established the state-of-the-art +certified robustness among denoised smoothing methods across all +$\ell_2$-adversary radius in various benchmarks. + +
+
+ comment: 26 pages; TMLR 2024; Code is available at + https://github.com/suhyeok24/FT-CADIS +
+
+
+
+
+ + ♻ ☆ HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion + Field Alignment + + +
+ Optical flow estimation is a fundamental and long-standing visual task. In +this work, we present a novel method, dubbed HMAFlow, to improve optical flow +estimation in challenging scenes, particularly those involving small objects. +The proposed model mainly consists of two core components: a Hierarchical +Motion Field Alignment (HMA) module and a Correlation Self-Attention (CSA) +module. In addition, we rebuild 4D cost volumes by employing a Multi-Scale +Correlation Search (MCS) layer and replacing average pooling in common cost +volumes with a search strategy utilizing multiple search ranges. Experimental +results demonstrate that our model achieves the best generalization performance +compared to other state-of-the-art methods. Specifically, compared with RAFT, +our method achieves relative error reductions of 14.2% and 3.4% on the clean +pass and final pass of the Sintel online benchmark, respectively. On the KITTI +test benchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by relative +margins of 6.8% and 7.7%, respectively. To facilitate future research, our code +will be made available at https://github.com/BooTurbo/HMAFlow. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ SC3D: Label-Efficient Outdoor 3D Object Detection via Single Click + Annotation + + +
+ LiDAR-based outdoor 3D object detection has received widespread attention. +However, training 3D detectors from the LiDAR point cloud typically relies on +expensive bounding box annotations. This paper presents SC3D, an innovative +label-efficient method requiring only a single coarse click on the bird's eye +view of the 3D point cloud for each frame. A key challenge here is the absence +of complete geometric descriptions of the target objects from such simple click +annotations. To address this issue, our proposed SC3D adopts a progressive +pipeline. Initially, we design a mixed pseudo-label generation module that +expands limited click annotations into a mixture of bounding box and semantic +mask supervision. Next, we propose a mixed-supervised teacher model, enabling the +detector to learn mixed supervision information. Finally, we introduce a +mixed-supervised student network that leverages the teacher model's +generalization ability to learn unclicked instances. Experimental results on the +widely used nuScenes and KITTI datasets demonstrate that our SC3D with only +coarse clicks, which requires only 0.2% annotation cost, achieves +state-of-the-art performance compared to weakly-supervised 3D detection +methods. The code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Networking Systems for Video Anomaly Detection: A Tutorial and Survey + + +
+ The increasing utilization of surveillance cameras in smart cities, coupled +with the surge of online video applications, has heightened concerns regarding +public security and privacy protection, which propelled automated Video Anomaly +Detection (VAD) into a fundamental research task within the Artificial +Intelligence (AI) community. With the advancements in deep learning and edge +computing, VAD has made significant progress and advances synergized with +emerging applications in smart cities and video internet, which has moved +beyond the conventional research scope of algorithm engineering to deployable +Networking Systems for VAD (NSVAD), a practical hotspot for intersection +exploration in the AI, IoVT, and computing fields. In this article, we +delineate the foundational assumptions, learning frameworks, and applicable +scenarios of various deep learning-driven VAD routes, offering an exhaustive +tutorial for novices in NSVAD. This article elucidates core concepts by +reviewing recent advances and typical solutions and aggregating available +research resources accessible at https://github.com/fdjingliu/NSVAD. +Additionally, we showcase our latest NSVAD research in industrial IoT and smart +cities, along with an end-cloud collaborative architecture for deployable +NSVAD. Lastly, this article projects future development trends and discusses +how the integration of AI and computing technologies can address existing +research challenges and promote open opportunities, serving as an insightful +guide for prospective researchers and engineers. + +
+
+ comment: Revised to ACM Computing Surveys, under review, for more information + and supplementary material, please see https://github.com/fdjingliu/NSVAD +
+
+
+
+
+ + ♻ ☆ A Dynamic LLM-Powered Agent Network for Task-Oriented Agent + Collaboration ALT + + +
+ Recent studies show that collaborating multiple large language model (LLM) +powered agents is a promising way for task solving. However, current approaches +are constrained by using a fixed number of agents and static communication +structures. In this work, we propose automatically selecting a team of agents +from candidates to collaborate in a dynamic communication structure toward +different tasks and domains. Specifically, we build a framework named Dynamic +LLM-Powered Agent Network ($\textbf{DyLAN}$) for LLM-powered agent +collaboration, operating a two-stage paradigm: (1) Team Optimization and (2) +Task Solving. During the first stage, we utilize an $\textit{agent selection}$ +algorithm, based on an unsupervised metric called $\textit{Agent Importance +Score}$, enabling the selection of best agents according to their contributions +in a preliminary trial, oriented to the given task. Then, in the second stage, +the selected agents collaborate dynamically according to the query. +Empirically, we demonstrate that DyLAN outperforms strong baselines in code +generation, decision-making, general reasoning, and arithmetic reasoning tasks +with moderate computational cost. On specific subjects in MMLU, selecting a +team of agents in the team optimization stage improves accuracy by up to 25.0% +in DyLAN. + +
+
+ comment: Published in COLM2024. Code Repo: https://github.com/SALT-NLP/DyLAN +
+
+
+
+
+ + ♻ ☆ Effective Generative AI: The Human-Algorithm Centaur + + +
+ Advanced analytics science methods have enabled combining the power of +artificial and human intelligence, creating \textit{centaurs} that allow +superior decision-making. Centaurs are hybrid human-algorithm models that +combine both formal analytics and human intuition in a symbiotic manner within +their learning and reasoning process. We argue that the future of AI +development and use in many domains needs to focus more on centaurs as opposed +to other AI approaches. This paradigm shift towards centaur-based AI methods +raises some fundamental questions: How are centaurs different from other +human-in-the-loop methods? What are the most effective methods for creating +centaurs? When should centaurs be used, and when should the lead be given to +pure AI models? Doesn't the incorporation of human intuition -- which at times +can be misleading -- in centaurs' decision-making process degrade its +performance compared to pure AI methods? This work aims to address these +fundamental questions, focusing on recent advancements in generative AI, and +especially in Large Language Models (LLMs), as a main case study to illustrate +centaurs' critical essentiality to future AI endeavors. + +
+
+ comment: To Appear in SI: Future Shock, Harvard Data Science Review + (https://hdsr.mitpress.mit.edu/specialissue5) +
+
+
+
+
+ + ♻ ☆ ORLM: A Customizable Framework in Training Large Models for Automated + Optimization Modeling + + +
+ Optimization modeling and solving play a critical role in the application of +Operations Research (OR) tools to address real-world problems, yet they pose +challenges and require extensive expertise from OR experts. With the advent of +large language models (LLMs), new opportunities have emerged to streamline and +automate these tasks. However, current research predominantly relies on +closed-source LLMs such as GPT-4, along with extensive prompt engineering +techniques. This reliance stems from the scarcity of high-quality training +datasets for optimization modeling, resulting in elevated costs, prolonged +processing times, and privacy concerns. To address these challenges, our work +is the first to propose a viable path for training open-source LLMs that are +capable of optimization modeling as well as developing and executing solver +codes, eventually leading to a superior ability for automating optimization +modeling and solving. Particularly, we introduce a semi-automated data +synthesis framework designed for optimization modeling issues, named +OR-Instruct. This framework merges the training data requirements of large +models with the unique characteristics of optimization modeling problems, and +allows for customizable enhancements tailored to specific scenarios or modeling +types. To evaluate the performance of our proposed framework, we present the +IndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in +solving practical OR problems. Utilizing data synthesized through OR-Instruct, +we train various open-source LLMs with a capacity of 7 billion parameters +(dubbed ORLMs). The resulting model demonstrates significantly enhanced +optimization modeling capabilities, achieving state-of-the-art performance +across the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are +available at \url{https://github.com/Cardinal-Operations/ORLM}. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ A Multi-Granularity Supervised Contrastive Framework for Remaining + Useful Life Prediction of Aero-engines + + +
+ Accurate remaining useful life (RUL) predictions are critical to the safe +operation of aero-engines. Currently, the RUL prediction task is mainly a +regression paradigm with only mean square error as the loss function and lacks +research on feature space structure, the latter of which has shown excellent +performance in a large number of studies. This paper develops a +multi-granularity supervised contrastive (MGSC) framework from plain intuition +that samples with the same RUL label should be aligned in the feature space, +and address the problems of too large minibatch size and unbalanced samples in +the implementation. The RUL prediction with MGSC is implemented on using the +proposed multi-phase training strategy. This paper also demonstrates a simple +and scalable basic network structure and validates the proposed MGSC strategy +on the CMPASS dataset using a convolutional long short-term memory network as a +baseline, which effectively improves the accuracy of RUL prediction. + +
+
+
+
+
+ + ♻ ☆ CleanerCLIP: Fine-grained Counterfactual Semantic Augmentation for + Backdoor Defense in Contrastive Learning + + +
+ Pre-trained large models for multimodal contrastive learning, such as CLIP, +have been widely recognized in the industry as highly susceptible to +data-poisoned backdoor attacks. This poses significant risks to downstream +model training. In response to such potential threats, finetuning offers a +simpler and more efficient defense choice compared to retraining large models +with augmented data. In the supervised learning domain, fine-tuning defense +strategies can achieve excellent defense performance. However, in the +unsupervised and semi-supervised domain, we find that when CLIP faces some +complex attack techniques, the existing fine-tuning defense strategy, +CleanCLIP, has some limitations on defense performance. The synonym +substitution of its text-augmentation is insufficient to enhance the text +feature space. To compensate for this weakness, we improve it by proposing a +fine-grained \textbf{T}ext \textbf{A}lignment \textbf{C}leaner (TA-Cleaner) to +cut off feature connections of backdoor triggers. We randomly select a few +samples for positive and negative subtext generation at each epoch of +CleanCLIP, and align the subtexts to the images to strengthen the text +self-supervision. We evaluate the effectiveness of our TA-Cleaner against six +attack algorithms and conduct comprehensive zero-shot classification tests on +ImageNet1K. Our experimental results demonstrate that TA-Cleaner achieves +state-of-the-art defensiveness among finetuning-based defense techniques. Even +when faced with the novel attack technique BadCLIP, our TA-Cleaner outperforms +CleanCLIP by reducing the ASR of Top-1 and Top-10 by 52.02\% and 63.88\%, +respectively. + +
+
+
+
+
+ + ♻ ☆ Automated Clinical Data Extraction with Knowledge Conditioned LLMs COLING25 + + +
+ The extraction of lung lesion information from clinical and medical imaging +reports is crucial for research on and clinical care of lung-related diseases. +Large language models (LLMs) can be effective at interpreting unstructured text +in reports, but they often hallucinate due to a lack of domain-specific +knowledge, leading to reduced accuracy and posing challenges for use in +clinical settings. To address this, we propose a novel framework that aligns +generated internal knowledge with external knowledge through in-context +learning (ICL). Our framework employs a retriever to identify relevant units of +internal or external knowledge and a grader to evaluate the truthfulness and +helpfulness of the retrieved internal-knowledge rules, to align and update the +knowledge bases. Experiments with expert-curated test datasets demonstrate that +this ICL approach can increase the F1 score for key fields (lesion size, margin +and solidity) by an average of 12.9% over existing ICL methods. + +
+
+ comment: COLING25 Industry Track +
+
+
+
+
+ + ♻ ☆ Semantic Segmentation by Semantic Proportions + + +
+ Semantic segmentation is a critical task in computer vision aiming to +identify and classify individual pixels in an image, with numerous applications +in for example autonomous driving and medical image analysis. However, semantic +segmentation can be highly challenging particularly due to the need for large +amounts of annotated data. Annotating images is a time-consuming and costly +process, often requiring expert knowledge and significant effort; moreover, +saving the annotated images could dramatically increase the storage space. In +this paper, we propose a novel approach for semantic segmentation, requiring +the rough information of individual semantic class proportions, shortened as +semantic proportions, rather than the necessity of ground-truth segmentation +maps. This greatly simplifies the data annotation process and thus will +significantly reduce the annotation time, cost and storage space, opening up +new possibilities for semantic segmentation tasks where obtaining the full +ground-truth segmentation maps may not be feasible or practical. Our proposed +method of utilising semantic proportions can (i) further be utilised as a +booster in the presence of ground-truth segmentation maps to gain performance +without extra data and model complexity, and (ii) also be seen as a +parameter-free plug-and-play module, which can be attached to existing deep +neural networks designed for semantic segmentation. Extensive experimental +results demonstrate the good performance of our method compared to benchmark +methods that rely on ground-truth segmentation maps. Utilising semantic +proportions suggested in this work offers a promising direction for future +semantic segmentation research. + +
+
+
+
+
+ + ♻ ☆ Adversarial Environment Design via Regret-Guided Diffusion Models + + +
+ Training agents that are robust to environmental changes remains a +significant challenge in deep reinforcement learning (RL). Unsupervised +environment design (UED) has recently emerged to address this issue by +generating a set of training environments tailored to the agent's capabilities. +While prior works demonstrate that UED has the potential to learn a robust +policy, their performance is constrained by the capabilities of the environment +generation. To this end, we propose a novel UED algorithm, adversarial +environment design via regret-guided diffusion models (ADD). The proposed +method guides the diffusion-based environment generator with the regret of the +agent to produce environments that the agent finds challenging but conducive to +further improvement. By exploiting the representation power of diffusion +models, ADD can directly generate adversarial environments while maintaining +the diversity of training environments, enabling the agent to effectively learn +a robust policy. Our experimental results demonstrate that the proposed method +successfully generates an instructive curriculum of environments, outperforming +UED baselines in zero-shot generalization across novel, out-of-distribution +environments. Project page: https://rllab-snu.github.io/projects/ADD + +
+
+ comment: 38th Conference on Neural Information Processing Systems +
+
+
+
+
+ + ♻ ☆ Mitigating Gradient Overlap in Deep Residual Networks with Gradient + Normalization for Improved Non-Convex Optimization + + +
+ In deep learning, Residual Networks (ResNets) have proven effective in +addressing the vanishing gradient problem, allowing for the successful training +of very deep networks. However, skip connections in ResNets can lead to +gradient overlap, where gradients from both the learned transformation and the +skip connection combine, potentially resulting in overestimated gradients. This +overestimation can cause inefficiencies in optimization, as some updates may +overshoot optimal regions, affecting weight updates. To address this, we +examine Z-score Normalization (ZNorm) as a technique to manage gradient +overlap. ZNorm adjusts the gradient scale, standardizing gradients across +layers and reducing the negative impact of overlapping gradients. Our +experiments demonstrate that ZNorm improves training process, especially in +non-convex optimization scenarios common in deep learning, where finding +optimal solutions is challenging. These findings suggest that ZNorm can affect +the gradient flow, enhancing performance in large-scale data processing where +accuracy is critical. + +
+
+
+
+
+ + ♻ ☆ Demystifying Large Language Models for Medicine: A Primer + + +
+ Large language models (LLMs) represent a transformative class of AI tools +capable of revolutionizing various aspects of healthcare by generating +human-like responses across diverse contexts and adapting to novel tasks +following human instructions. Their potential application spans a broad range +of medical tasks, such as clinical documentation, matching patients to clinical +trials, and answering medical questions. In this primer paper, we propose an +actionable guideline to help healthcare professionals more efficiently utilize +LLMs in their work, along with a set of best practices. This approach consists +of several main phases, including formulating the task, choosing LLMs, prompt +engineering, fine-tuning, and deployment. We start with the discussion of +critical considerations in identifying healthcare tasks that align with the +core capabilities of LLMs and selecting models based on the selected task and +data, performance requirements, and model interface. We then review the +strategies, such as prompt engineering and fine-tuning, to adapt standard LLMs +to specialized medical tasks. Deployment considerations, including regulatory +compliance, ethical guidelines, and continuous monitoring for fairness and +bias, are also discussed. By providing a structured step-by-step methodology, +this tutorial aims to equip healthcare professionals with the tools necessary +to effectively integrate LLMs into clinical practice, ensuring that these +powerful technologies are applied in a safe, reliable, and impactful manner. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ ConSmax: Hardware-Friendly Alternative Softmax with Learnable Parameters + + +
+ The self-attention mechanism distinguishes transformer-based large language +models (LLMs) apart from convolutional and recurrent neural networks. Despite +the performance improvement, achieving real-time LLM inference on silicon +remains challenging due to the extensive use of Softmax in self-attention. In +addition to the non-linearity, the low arithmetic intensity significantly +limits processing parallelism, especially when working with longer contexts. To +address this challenge, we propose Constant Softmax (ConSmax), a +software-hardware co-design that serves as an efficient alternative to Softmax. +ConSmax utilizes differentiable normalization parameters to eliminate the need +for maximum searching and denominator summation in Softmax. This approach +enables extensive parallelization while still executing the essential functions +of Softmax. Moreover, a scalable ConSmax hardware design with a bitwidth-split +look-up table (LUT) can achieve lossless non-linear operations and support +mixed-precision computing. Experimental results show that ConSmax achieves a +minuscule power consumption of 0.2mW and an area of 0.0008mm^2 at 1250MHz +working frequency in 16nm FinFET technology. For open-source contribution, we +further implement our design with the OpenROAD toolchain under SkyWater's 130nm +CMOS technology. The corresponding power is 2.69mW and the area is 0.007mm^2. +ConSmax achieves 3.35x power savings and 2.75x area savings in 16nm technology, +and 3.15x power savings and 4.14x area savings with the open-source EDA +toolchain. In the meantime, it also maintains comparable accuracy on the GPT-2 +model and the WikiText103 dataset. The project is available at +https://github.com/ReaLLMASIC/ConSmax + +
+
+
+
+
+ + ♻ ☆ From Isolation to Collaboration: Federated Class-Heterogeneous Learning + for Chest X-Ray Classification + + +
+ Federated learning (FL) is a promising paradigm to collaboratively train a +global chest x-ray (CXR) classification model using distributed datasets while +preserving patient privacy. A significant, yet relatively underexplored, +challenge in FL is class-heterogeneity, where clients have different sets of +classes. We propose surgical aggregation, a FL method that uses selective +aggregation to collaboratively train a global model using distributed, +class-heterogeneous datasets. Unlike other methods, our method does not rely on +the assumption that clients share the same classes as other clients, know the +classes of other clients, or have access to a fully annotated dataset. We +evaluate surgical aggregation using class-heterogeneous CXR datasets across IID +and non-IID settings. Our results show that our method outperforms current +methods and has better generalizability. + +
+
+
+
+
+
+
+
+ + Computation and Language 66 + +
+
+
+ + ☆ Enhancing the Reasoning Ability of Multimodal Large Language Models via + Mixed Preference Optimization + + +
+ Existing open-source multimodal large language models (MLLMs) generally +follow a training process involving pre-training and supervised fine-tuning. +However, these models suffer from distribution shifts, which limit their +multimodal reasoning, particularly in the Chain-of-Thought (CoT) performance. +To address this, we introduce a preference optimization (PO) process to enhance +the multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data +side, we design an automated preference data construction pipeline to create +MMPR, a high-quality, large-scale multimodal reasoning preference dataset. and +(2) on the model side, we explore integrating PO with MLLMs, developing a +simple yet effective method, termed Mixed Preference Optimization (MPO), which +boosts multimodal CoT performance. Our approach demonstrates improved +performance across multiple benchmarks, particularly in multimodal reasoning +tasks. Notably, our model, InternVL2-8B-MPO, achieves an accuracy of 67.0 on +MathVista, outperforming InternVL2-8B by 8.7 points and achieving performance +comparable to the 10x larger InternVL2-76B. We hope this study could inspire +further advancements in MLLMs. Code, data, and model shall be publicly +released. + +
+
+
+
+
+ + ☆ Mitigating Hallucination in Multimodal Large Language Model via + Hallucination-targeted Direct Preference Optimization + + +
+ Multimodal Large Language Models (MLLMs) are known to hallucinate, which +limits their practical applications. Recent works have attempted to apply +Direct Preference Optimization (DPO) to enhance the performance of MLLMs, but +have shown inconsistent improvements in mitigating hallucinations. To address +this issue more effectively, we introduce Hallucination-targeted Direct +Preference Optimization (HDPO) to reduce hallucinations in MLLMs. Unlike +previous approaches, our method tackles hallucinations from their diverse forms +and causes. Specifically, we develop three types of preference pair data +targeting the following causes of MLLM hallucinations: (1) insufficient visual +capabilities, (2) long context generation, and (3) multimodal conflicts. +Experimental results demonstrate that our method achieves superior performance +across multiple hallucination evaluation datasets, surpassing most +state-of-the-art (SOTA) methods and highlighting the potential of our approach. +Ablation studies and in-depth analyses further confirm the effectiveness of our +method and suggest the potential for further improvements through scaling up. + +
+
+
+
+
+ + ☆ Towards Automatic Evaluation of Task-Oriented Dialogue Flows + + +
+ Task-oriented dialogue systems rely on predefined conversation schemes +(dialogue flows) often represented as directed acyclic graphs. These flows can +be manually designed or automatically generated from previously recorded +conversations. Due to variations in domain expertise or reliance on different +sets of prior conversations, these dialogue flows can manifest in significantly +different graph structures. Despite their importance, there is no standard +method for evaluating the quality of dialogue flows. We introduce FuDGE (Fuzzy +Dialogue-Graph Edit Distance), a novel metric that evaluates dialogue flows by +assessing their structural complexity and representational coverage of the +conversation data. FuDGE measures how well individual conversations align with +a flow and, consequently, how well a set of conversations is represented by the +flow overall. Through extensive experiments on manually configured flows and +flows generated by automated techniques, we demonstrate the effectiveness of +FuDGE and its evaluation framework. By standardizing and optimizing dialogue +flows, FuDGE enables conversational designers and automated techniques to +achieve higher levels of efficiency and automation. + +
+
+
+
+
+ + ☆ Llama Guard 3 Vision: Safeguarding Human-AI Image Understanding + Conversations + + +
+ We introduce Llama Guard 3 Vision, a multimodal LLM-based safeguard for +human-AI conversations that involves image understanding: it can be used to +safeguard content for both multimodal LLM inputs (prompt classification) and +outputs (response classification). Unlike the previous text-only Llama Guard +versions (Inan et al., 2023; Llama Team, 2024b,a), it is specifically designed +to support image reasoning use cases and is optimized to detect harmful +multimodal (text and image) prompts and text responses to these prompts. Llama +Guard 3 Vision is fine-tuned on Llama 3.2-Vision and demonstrates strong +performance on the internal benchmarks using the MLCommons taxonomy. We also +test its robustness against adversarial attacks. We believe that Llama Guard 3 +Vision serves as a good starting point to build more capable and robust content +moderation tools for human-AI conversation with multimodal capabilities. + +
+
+
+
+
+ + ☆ Features that Make a Difference: Leveraging Gradients for Improved + Dictionary Learning NAACL 2025 + + +
+ Sparse Autoencoders (SAEs) are a promising approach for extracting neural +network representations by learning a sparse and overcomplete decomposition of +the network's internal activations. However, SAEs are traditionally trained +considering only activation values and not the effect those activations have on +downstream computations. This limits the information available to learn +features, and biases the autoencoder towards neglecting features which are +represented with small activation values but strongly influence model outputs. +To address this, we introduce Gradient SAEs (g-SAEs), which modify the +$k$-sparse autoencoder architecture by augmenting the TopK activation function +to rely on the gradients of the input activation when selecting the $k$ +elements. For a given sparsity level, g-SAEs produce reconstructions that are +more faithful to original network performance when propagated through the +network. Additionally, we find evidence that g-SAEs learn latents that are on +average more effective at steering models in arbitrary contexts. By considering +the downstream effects of activations, our approach leverages the dual nature +of neural network features as both $\textit{representations}$, retrospectively, +and $\textit{actions}$, prospectively. While previous methods have approached +the problem of feature discovery primarily focused on the former aspect, g-SAEs +represent a step towards accounting for the latter as well. + +
+
+ comment: 9 pages, 8 figures. Submitted to NAACL 2025 +
+
+
+
+
+ + ☆ A Survey of Event Causality Identification: Principles, Taxonomy, + Challenges, and Assessment + + +
+ Event Causality Identification (ECI) has become a crucial task in Natural +Language Processing (NLP), aimed at automatically extracting causalities from +textual data. In this survey, we systematically address the foundational +principles, technical frameworks, and challenges of ECI, offering a +comprehensive taxonomy to categorize and clarify current research +methodologies, as well as a quantitative assessment of existing models. We +first establish a conceptual framework for ECI, outlining key definitions, +problem formulations, and evaluation standards. Our taxonomy classifies ECI +methods according to the two primary tasks of sentence-level (SECI) and +document-level (DECI) event causality identification. For SECI, we examine +feature pattern-based matching, deep semantic encoding, causal knowledge +pre-training and prompt-based fine-tuning, and external knowledge enhancement +methods. For DECI, we highlight approaches focused on event graph reasoning and +prompt-based techniques to address the complexity of cross-sentence causal +inference. Additionally, we analyze the strengths, limitations, and open +challenges of each approach. We further conduct an extensive quantitative +evaluation of various ECI methods on two benchmark datasets. Finally, we +explore future research directions, highlighting promising pathways to overcome +current limitations and broaden ECI applications. + +
+
+
+
+
+ + ☆ Safe Text-to-Image Generation: Simply Sanitize the Prompt Embedding + + +
+ In recent years, text-to-image (T2I) generation models have made significant +progress in generating high-quality images that align with text descriptions. +However, these models also face the risk of unsafe generation, potentially +producing harmful content that violates usage policies, such as explicit +material. Existing safe generation methods typically focus on suppressing +inappropriate content by erasing undesired concepts from visual +representations, while neglecting to sanitize the textual representation. +Although these methods help mitigate the risk of misuse to certain extent, +their robustness remains insufficient when dealing with adversarial attacks. + Given that semantic consistency between input text and output image is a +fundamental requirement for T2I models, we identify that textual +representations (i.e., prompt embeddings) are likely the primary source of +unsafe generation. To this end, we propose a vision-agnostic safe generation +framework, Embedding Sanitizer (ES), which focuses on erasing inappropriate +concepts from prompt embeddings and uses the sanitized embeddings to guide the +model for safe generation. ES is applied to the output of the text encoder as a +plug-and-play module, enabling seamless integration with different T2I models +as well as other safeguards. In addition, ES's unique scoring mechanism assigns +a score to each token in the prompt to indicate its potential harmfulness, and +dynamically adjusts the sanitization intensity to balance defensive performance +and generation quality. Through extensive evaluation on five prompt benchmarks, +our approach achieves state-of-the-art robustness by sanitizing the source +(prompt embedding) of unsafe generation compared to nine baseline methods. It +significantly outperforms existing safeguards in terms of interpretability and +controllability while maintaining generation quality. + +
+
+
+
+
+ + ☆ Emotion Detection in Reddit: Comparative Study of Machine Learning and + Deep Learning Techniques + + +
+ Emotion detection is pivotal in human communication, as it significantly +influences behavior, relationships, and decision-making processes. This study +concentrates on text-based emotion detection by leveraging the GoEmotions +dataset, which annotates Reddit comments with 27 distinct emotions. These +emotions are subsequently mapped to Ekman's six basic categories: joy, anger, +fear, sadness, disgust, and surprise. We employed a range of models for this +task, including six machine learning models, three ensemble models, and a Long +Short-Term Memory (LSTM) model to determine the optimal model for emotion +detection. Results indicate that the Stacking classifier outperforms other +models in accuracy and performance. We also benchmark our models against +EmoBERTa, a pre-trained emotion detection model, with our Stacking classifier +proving more effective. Finally, the Stacking classifier is deployed via a +Streamlit web application, underscoring its potential for real-world +applications in text-based emotion analysis. + +
+
+
+
+
+ + The Dawn of GUI Agent: A Preliminary Case Study with Claude 3.5 Computer + Use + + +
+ The recently released model, Claude 3.5 Computer Use, stands out as the first +frontier AI model to offer computer use in public beta as a graphical user +interface (GUI) agent. As an early beta, its capability in the real-world +complex environment remains unknown. In this case study to explore Claude 3.5 +Computer Use, we curate and organize a collection of carefully designed tasks +spanning a variety of domains and software. Observations from these cases +demonstrate Claude 3.5 Computer Use's unprecedented ability in end-to-end +language to desktop actions. Along with this study, we provide an +out-of-the-box agent framework for deploying API-based GUI automation models +with easy implementation. Our case studies aim to showcase a groundwork of +capabilities and limitations of Claude 3.5 Computer Use with detailed analyses +and bring to the fore questions about planning, action, and critic, which must +be considered for future improvement. We hope this preliminary exploration will +inspire future research into the GUI agent community. All the test cases in the +paper can be tried through the project: +https://github.com/showlab/computer_use_ootb. + +
+
+ comment: 40 pages, 21 figures, preprint +
+
+
+
+
+ + ☆ Unveiling Topological Structures in Text: A Comprehensive Survey of + Topological Data Analysis Applications in NLP + + +
+ The surge of data available on the internet has led to the adoption of +various computational methods to analyze and extract valuable insights from +this wealth of information. Among these, the field of Machine Learning (ML) has +thrived by leveraging data to extract meaningful insights. However, ML +techniques face notable challenges when dealing with real-world data, often due +to issues of imbalance, noise, insufficient labeling, and high dimensionality. +To address these limitations, some researchers advocate for the adoption of +Topological Data Analysis (TDA), a statistical approach that discerningly +captures the intrinsic shape of data despite noise. Despite its potential, TDA +has not gained as much traction within the Natural Language Processing (NLP) +domain compared to structurally distinct areas like computer vision. +Nevertheless, a dedicated community of researchers has been exploring the +application of TDA in NLP, yielding 85 papers we comprehensively survey in this +paper. Our findings categorize these efforts into theoretical and +nontheoretical approaches. Theoretical approaches aim to explain linguistic +phenomena from a topological viewpoint, while non-theoretical approaches merge +TDA with ML features, utilizing diverse numerical representation techniques. We +conclude by exploring the challenges and unresolved questions that persist in +this niche field. Resources and a list of papers on this topic can be found at: +https://github.com/AdaUchendu/AwesomeTDA4NLP. + +
+
+
+
+
+ + ☆ Scaling Law for Post-training after Model Pruning + + +
+ Large language models (LLMs) based on the Transformer architecture are widely +employed across various domains and tasks. However, their increasing size +imposes significant hardware demands, limiting practical deployment. To +mitigate this, model pruning techniques have been developed to create more +efficient models while maintaining high performance. Despite this, +post-training after pruning is crucial for performance recovery and can be +resource-intensive. This paper investigates the post-training requirements of +pruned LLMs and introduces a scaling law to determine the optimal amount of +post-training data. Post-training experiments with the Llama-3 and Qwen-2.5 +series models, pruned using depth pruning, width pruning, and 2:4 +semi-structured pruning, show that higher pruning ratios necessitate more +post-training data for performance recovery, whereas larger LLMs require less. +The proposed scaling law predicts a model's loss based on its parameter counts +before and after pruning, as well as the post-training token counts. +Furthermore, we find that the scaling law established from smaller LLMs can be +reliably extrapolated to larger LLMs. This work provides valuable insights into +the post-training of pruned LLMs and offers a practical scaling law for +optimizing post-training data usage. + +
+
+
+
+
+ + ☆ Scaling up the Evaluation of Collaborative Problem Solving: Promises and + Challenges of Coding Chat Data with ChatGPT + + +
+ Collaborative problem solving (CPS) is widely recognized as a critical 21st +century skill. Efficiently coding communication data is a big challenge in +scaling up research on assessing CPS. This paper reports the findings on using +ChatGPT to directly code CPS chat data by benchmarking performance across +multiple datasets and coding frameworks. We found that ChatGPT-based coding +outperformed human coding in tasks where the discussions were characterized by +colloquial languages but fell short in tasks where the discussions dealt with +specialized scientific terminology and contexts. The findings offer practical +guidelines for researchers to develop strategies for efficient and scalable +analysis of communication data from CPS tasks. + +
+
+ comment: 21 pages, 3 figures, 5 tables. Initially reported in edArXiv:xw6kz +
+
+
+
+
+ + ☆ Measuring Non-Adversarial Reproduction of Training Data in Large + Language Models + + +
+ Large language models memorize parts of their training data. Memorizing short +snippets and facts is required to answer questions about the world and to be +fluent in any language. But models have also been shown to reproduce long +verbatim sequences of memorized text when prompted by a motivated adversary. In +this work, we investigate an intermediate regime of memorization that we call +non-adversarial reproduction, where we quantify the overlap between model +responses and pretraining data when responding to natural and benign prompts. +For a variety of innocuous prompt categories (e.g., writing a letter or a +tutorial), we show that up to 15% of the text output by popular conversational +language models overlaps with snippets from the Internet. In worst cases, we +find generations where 100% of the content can be found exactly online. For the +same tasks, we find that human-written text has far less overlap with Internet +data. We further study whether prompting strategies can close this reproduction +gap between models and humans. While appropriate prompting can reduce +non-adversarial reproduction on average, we find that mitigating worst-case +reproduction of training data requires stronger defenses -- even for benign +interactions. + +
+
+
+
+
+ + ☆ Entropy and type-token ratio in gigaword corpora + + +
+ Lexical diversity measures the vocabulary variation in texts. While its +utility is evident for analyses in language change and applied linguistics, it +is not yet clear how to operationalize this concept in a unique way. We here +investigate entropy and type-token ratio, two widely employed metrics for +lexical diversity, in six massive linguistic datasets in English, Spanish, +and Turkish, consisting of books, news articles, and tweets. These gigaword +corpora correspond to languages with distinct morphological features and differ +in registers and genres, thus constituting a diverse testbed for a quantitative +approach to lexical diversity. Strikingly, we find a functional relation +between entropy and type-token ratio that holds across the corpora under +consideration. Further, in the limit of large vocabularies we find an +analytical expression that sheds light on the origin of this relation and its +connection with both Zipf and Heaps laws. Our results then contribute to the +theoretical understanding of text structure and offer practical implications +for fields like natural language processing. + +
+
+ comment: 12 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ Increasing the Accessibility of Causal Domain Knowledge via Causal + Information Extraction Methods: A Case Study in the Semiconductor + Manufacturing Industry + + +
+ The extraction of causal information from textual data is crucial in the +industry for identifying and mitigating potential failures, enhancing process +efficiency, prompting quality improvements, and addressing various operational +challenges. This paper presents a study on the development of automated methods +for causal information extraction from actual industrial documents in the +semiconductor manufacturing industry. The study proposes two types of causal +information extraction methods, single-stage sequence tagging (SST) and +multi-stage sequence tagging (MST), and evaluates their performance using +existing documents from a semiconductor manufacturing company, including +presentation slides and FMEA (Failure Mode and Effects Analysis) documents. The +study also investigates the effect of representation learning on downstream +tasks. The presented case study showcases that the proposed MST methods for +extracting causal information from industrial documents are suitable for +practical applications, especially for semi-structured documents such as FMEAs, +with a 93% F1 score. Additionally, MST achieves a 73% F1 score on texts +extracted from presentation slides. Finally, the study highlights the +importance of choosing a language model that is more aligned with the domain +and in-domain fine-tuning. + +
+
+ comment: 17 pages, 2 figures +
+
+
+
+
+ + ☆ Evaluating the role of `Constitutions' for learning from AI feedback NeurIPS 2024 + + +
+ The growing capabilities of large language models (LLMs) have led to their +use as substitutes for human feedback for training and assessing other LLMs. +These methods often rely on `constitutions', written guidelines which a critic +model uses to provide feedback and improve generations. We investigate how the +choice of constitution affects feedback quality by using four different +constitutions to improve patient-centered communication in medical interviews. +In pairwise comparisons conducted by 215 human raters, we found that detailed +constitutions led to better results regarding emotive qualities. However, none +of the constitutions outperformed the baseline in learning more +practically-oriented skills related to information gathering and provision. Our +findings indicate that while detailed constitutions should be prioritised, +there are possible limitations to the effectiveness of AI feedback as a reward +signal in certain areas. + +
+
+ comment: 4 pages, 2 figures. In NeurIPS 2024 Workshop on Language Gamification +
+
+
+
+
+ + ☆ Compound-QA: A Benchmark for Evaluating LLMs on Compound Questions + + +
+ Large language models (LLMs) demonstrate remarkable performance across +various tasks, prompting researchers to develop diverse evaluation benchmarks. +However, existing benchmarks typically measure the ability of LLMs to respond +to individual questions, neglecting the complex interactions in real-world +applications. In this paper, we introduce Compound Question Synthesis (CQ-Syn) +to create the Compound-QA benchmark, focusing on compound questions with +multiple sub-questions. This benchmark is derived from existing QA datasets, +annotated with proprietary LLMs and verified by humans for accuracy. It +encompasses five categories: Factual-Statement, Cause-and-Effect, +Hypothetical-Analysis, Comparison-and-Selection, and Evaluation-and-Suggestion. +It evaluates the LLM capability in terms of three dimensions including +understanding, reasoning, and knowledge. Our assessment of eight open-source +LLMs using Compound-QA reveals distinct patterns in their responses to compound +questions, which are significantly poorer than those to non-compound questions. +Additionally, we investigate various methods to enhance LLMs performance on +compound questions. The results indicate that these approaches significantly +improve the models' comprehension and reasoning abilities on compound +questions. + +
+
+
+
+
+ + ☆ An Effective Framework to Help Large Language Models Handle + Numeric-involved Long-context Tasks + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +handling long texts and have almost perfect performance in traditional +retrieval tasks. However, their performance significantly degrades when it +comes to numerical calculations in the long-context. Numeric-involved +long-context tasks typically cannot be addressed by current LLMs in normal +settings due to their inherent limitations in simultaneously handling complex +and massive information. Some CoT-like prompting methods can improve accuracy +but demand massive output tokens, which is costly and slow. To address this +issue, we propose a workflow, which decomposes a numeric-involved long-context +task into 4 low-level subtasks: judging, extracting, processing with code, +and conclusion. The first 2 subtasks are relatively simple, which allows us to +use smaller models for efficiently processing long context. When numerical +calculations are required, we use code generated by LLMs to avoid the +disadvantage of LLMs not being good at calculations. The results on 2 +numeric-involved long-context benchmarks demonstrate our workflow can not only +improve accuracy, but also significantly reduce the cost of API calls. + +
+
+
+
+
+ + ☆ Legal Evalutions and Challenges of Large Language Models + + +
+ In this paper, we review legal testing methods based on Large Language Models +(LLMs), using the OPENAI o1 model as a case study to evaluate the performance +of large models in applying legal provisions. We compare current +state-of-the-art LLMs, including open-source, closed-source, and legal-specific +models trained specifically for the legal domain. Systematic tests are +conducted on English and Chinese legal cases, and the results are analyzed in +depth. Through systematic testing of legal cases from common law systems and +China, this paper explores the strengths and weaknesses of LLMs in +understanding and applying legal texts, reasoning through legal issues, and +predicting judgments. The experimental results highlight both the potential and +limitations of LLMs in legal applications, particularly in terms of challenges +related to the interpretation of legal language and the accuracy of legal +reasoning. Finally, the paper provides a comprehensive analysis of the +advantages and disadvantages of various types of models, offering valuable +insights and references for the future application of AI in the legal field. + +
+
+
+
+
+ + ☆ Prompting and Fine-tuning Large Language Models for Automated Code + Review Comment Generation + + +
+ Generating accurate code review comments remains a significant challenge due +to the inherently diverse and non-unique nature of the task output. Large +language models pretrained on both programming and natural language data tend +to perform well in code-oriented tasks. However, large-scale pretraining is not +always feasible due to its environmental impact and project-specific +generalizability issues. In this work, first we fine-tune open-source Large +language models (LLM) in parameter-efficient, quantized low-rank (QLoRA) +fashion on consumer-grade hardware to improve review comment generation. Recent +studies demonstrate the efficacy of augmenting semantic metadata information +into prompts to boost performance in other code-related tasks. To explore this +in code review activities, we also prompt proprietary, closed-source LLMs +augmenting the input code patch with function call graphs and code summaries. +Both of our strategies improve the review comment generation performance, with +function call graph augmented few-shot prompting on the GPT-3.5 model +surpassing the pretrained baseline by around 90% BLEU-4 score on the +CodeReviewer dataset. Moreover, few-shot prompted Gemini-1.0 Pro, QLoRA +fine-tuned Code Llama and Llama 3.1 models achieve competitive results (ranging +from 25% to 83% performance improvement) on this task. An additional human +evaluation study further validates our experimental findings, reflecting +real-world developers' perceptions of LLM-generated code review comments based +on relevant qualitative metrics. + +
+
+
+
+
+ + ☆ Memorization in Attention-only Transformers AISTATS 2025 + + +
+ Recent research has explored the memorization capacity of multi-head +attention, but these findings are constrained by unrealistic limitations on the +context size. We present a novel proof for language-based Transformers that +extends the current hypothesis to any context size. Our approach improves upon +the state-of-the-art by achieving more effective exact memorization with an +attention layer, while also introducing the concept of approximate memorization +of distributions. Through experimental validation, we demonstrate that our +proposed bounds more accurately reflect the true memorization capacity of +language models, and provide a precise comparison with prior work. + +
+
+ comment: 16 pages, 6 figures, submitted to AISTATS 2025 +
+
+
+
+
+ + ☆ Xmodel-1.5: An 1B-scale Multilingual LLM + + +
+ We introduce Xmodel-1.5, a novel 1-billion-parameter multilingual large model +pretrained on approximately 2 trillion tokens. The model demonstrates strong +performance across several languages, with particularly notable results in +Thai, Arabic, and French, alongside its effectiveness in Chinese and English. +In addition, we contribute to the research community by releasing a Thai +evaluation dataset, which includes hundreds of questions annotated by students +from Chulalongkorn University's School of Integrated Innovation. While the +results are promising, we acknowledge that there is still room for improvement. +We hope this work advances ongoing efforts in multilingual AI research and +promotes better cross-linguistic understanding in various natural language +processing tasks. Our models and code are publicly available on GitHub at +https://github.com/XiaoduoAILab/XmodelLM. + +
+
+
+
+
+ + ☆ Understanding The Effect Of Temperature On Alignment With Human Opinions + + +
+ With the increasing capabilities of LLMs, recent studies focus on +understanding whose opinions are represented by them and how to effectively +extract aligned opinion distributions. We conducted an empirical analysis of +three straightforward methods for obtaining distributions and evaluated the +results across a variety of metrics. Our findings suggest that sampling and +log-probability approaches with simple parameter adjustments can return better +aligned outputs in subjective tasks compared to direct prompting. Yet, assuming +models reflect human opinions may be limiting, highlighting the need for +further research on how human subjectivity affects model uncertainty. + +
+
+
+
+
+ + ☆ Layer Importance and Hallucination Analysis in Large Language Models via + Enhanced Activation Variance-Sparsity + + +
+ Evaluating the importance of different layers in large language models (LLMs) +is crucial for optimizing model performance and interpretability. This paper +first explores layer importance using the Activation Variance-Sparsity Score +(AVSS), which combines normalized activation variance and sparsity to quantify +each layer's contribution to overall model performance. By ranking layers based +on AVSS and pruning the least impactful 25%, our experiments on tasks such as +question answering, language modeling, and sentiment classification show that +over 90% of the original performance is retained, highlighting potential +redundancies in LLM architectures. Building on AVSS, we propose an enhanced +version tailored to assess hallucination propensity across layers (EAVSS). This +improved approach introduces Hallucination-Specific Activation Variance (HSAV) +and Hallucination-Specific Sparsity (HSS) metrics, allowing precise +identification of hallucination-prone layers. By incorporating contrastive +learning on these layers, we effectively mitigate hallucination generation, +contributing to more robust and efficient LLMs (the maximum performance +improvement is 12%). Our results on the NQ, SciQ, TriviaQA, TruthfulQA, and +WikiQA datasets demonstrate the efficacy of this method, offering a +comprehensive framework for both layer importance evaluation and hallucination +mitigation in LLMs. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ CMATH: Cross-Modality Augmented Transformer with Hierarchical + Variational Distillation for Multimodal Emotion Recognition in Conversation + + +
+ Multimodal emotion recognition in conversation (MER) aims to accurately +identify emotions in conversational utterances by integrating multimodal +information. Previous methods usually treat multimodal information as equal +quality and employ symmetric architectures to conduct multimodal fusion. +However, in reality, the quality of different modalities usually varies +considerably, and utilizing a symmetric architecture is difficult to accurately +recognize conversational emotions when dealing with uneven modal information. +Furthermore, fusing multi-modality information in a single granularity may fail +to adequately integrate modal information, exacerbating the inaccuracy in +emotion recognition. In this paper, we propose a novel Cross-Modality Augmented +Transformer with Hierarchical Variational Distillation, called CMATH, which +consists of two major components, i.e., Multimodal Interaction Fusion and +Hierarchical Variational Distillation. The former is comprised of two +submodules, including Modality Reconstruction and Cross-Modality Augmented +Transformer (CMA-Transformer), where Modality Reconstruction focuses on +obtaining high-quality compressed representation of each modality, and +CMA-Transformer adopts an asymmetric fusion strategy which treats one modality +as the central modality and takes others as auxiliary modalities. The latter +first designs a variational fusion network to fuse the fine-grained +representations learned by CMA-Transformer into coarse-grained +representations. Then, it introduces a hierarchical distillation framework to +maintain the consistency between modality representations with different +granularities. Experiments on the IEMOCAP and MELD datasets demonstrate that +our proposed model outperforms previous state-of-the-art baselines. +Implementation code is available at https://github.com/cjw-MER/CMATH. + +
+
+
+
+
+ + ☆ Towards unearthing neglected climate innovations from scientific + literature using Large Language Models NeurIPS 2024 + + +
+ Climate change poses an urgent global threat, needing the rapid +identification and deployment of innovative solutions. We hypothesise that many +of these solutions already exist within scientific literature but remain +underutilised. To address this gap, this study employs a curated dataset +sourced from OpenAlex, a comprehensive repository of scientific papers. +Utilising Large Language Models (LLMs), such as GPT4-o from OpenAI, we evaluate +title-abstract pairs from scientific papers on seven dimensions, covering +climate change mitigation potential, stage of technological development, and +readiness for deployment. The outputs of the language models are then compared +with human evaluations to assess their effectiveness in identifying promising +yet overlooked climate innovations. Our findings suggest that these LLM-based +models can effectively augment human expertise, uncovering climate solutions +that are potentially impactful but with far greater speed, throughput and +consistency. Here, we focused on UK-based solutions, but the workflow is +region-agnostic. This work contributes to the discovery of neglected +innovations in scientific literature and demonstrates the potential of AI in +enhancing climate action strategies. + +
+
+ comment: 10 pages. Accepted in the LatinX in AI workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ Information Extraction from Clinical Notes: Are We Ready to Switch to + Large Language Models? + + +
+ Background: Information extraction (IE) is critical in clinical natural +language processing (NLP). While large language models (LLMs) excel on +generative tasks, their performance on extractive tasks remains debated. +Methods: We investigated Named Entity Recognition (NER) and Relation Extraction +(RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples, +MIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical +entities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3 +against BiomedBERT in terms of performance, generalizability, computational +resources, and throughput. Results: LLaMA models outperformed +BiomedBERT across datasets. With sufficient training data, LLaMA showed modest +improvements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited +training data. On unseen i2b2 data, LLaMA-3-70B outperformed BiomedBERT by 7% +(F1) on NER and 4% on RE. However, LLaMA models required more computing +resources and ran up to 28 times slower. We implemented "Kiwi," a clinical IE +package featuring both models, available at https://kiwi.clinicalnlp.org/. +Conclusion: This study is among the first to develop and evaluate a +comprehensive clinical IE system using open-source LLMs. Results indicate that +LLaMA models outperform BiomedBERT for clinical NER and RE but with higher +computational costs and lower throughputs. These findings highlight that +choosing between LLMs and traditional deep learning methods for clinical IE +applications should remain task-specific, taking into account both performance +metrics and practical considerations such as available computing resources and +the intended use case scenarios. + +
+
+
+
+
+ + ☆ Once More, With Feeling: Measuring Emotion of Acting Performances in + Contemporary American Film + + +
+ Narrative film is a composition of writing, cinematography, editing, and +performance. While much computational work has focused on the writing or visual +style in film, we conduct in this paper a computational exploration of acting +performance. Applying speech emotion recognition models and a variationist +sociolinguistic analytical framework to a corpus of popular, contemporary +American film, we find narrative structure, diachronic shifts, and genre- and +dialogue-based constraints located in spoken performances. + +
+
+ comment: Accepted CHR 2024 +
+
+
+
+
+ + ☆ Orca: Enhancing Role-Playing Abilities of Large Language Models by + Integrating Personality Traits + + +
+ Large language models have catalyzed the development of personalized dialogue +systems, and numerous role-playing conversational agents have emerged. However, +previous research predominantly focused on enhancing the model's capability to +follow instructions by designing character profiles, neglecting the +psychological factors that drive human conversations. In this paper, we propose +Orca, a framework for data processing and training LLMs of custom characters by +integrating personality traits. Orca comprises four stages: (1) Personality +traits inferring, leverage LLMs to infer user's BigFive personality trait +reports and scores. (2) Data Augment, simulate user's profile, background +story, and psychological activities. (3) Dataset construction, +personality-conditioned instruction prompting (PCIP) to stimulate LLMs. (4) +Modeling and Training, personality-conditioned instruction tuning (PTIT and +PSIT), using the generated data to enhance existing open-source LLMs. We +introduce OrcaBench, the first benchmark for evaluating the quality of content +generated by LLMs on social platforms across multiple scales. Our experiments +demonstrate that our proposed model achieves superior performance on this +benchmark, demonstrating its excellence and effectiveness in perceiving +personality traits that significantly improve role-playing abilities. Our Code +is available at https://github.com/Aipura/Orca. + +
+
+
+
+
+ + ☆ HistoLens: An LLM-Powered Framework for Multi-Layered Analysis of + Historical Texts -- A Case Application of Yantie Lun + + +
+ This paper proposes HistoLens, a multi-layered analysis framework for +historical texts based on Large Language Models (LLMs). Using the important +Western Han dynasty text "Yantie Lun" as a case study, we demonstrate the +framework's potential applications in historical research and education. +HistoLens integrates NLP technology (especially LLMs), including named entity +recognition, knowledge graph construction, and geographic information +visualization. The paper showcases how HistoLens explores Western Han culture +in "Yantie Lun" through multi-dimensional, visual, and quantitative methods, +focusing particularly on the influence of Confucian and Legalist thoughts on +political, economic, military, and ethnic. We also demonstrate how HistoLens +constructs a machine teaching scenario using LLMs for explainable analysis, +based on a dataset of Confucian and Legalist ideas extracted with LLM +assistance. This approach offers novel and diverse perspectives for studying +historical texts like "Yantie Lun" and provides new auxiliary tools for history +education. The framework aims to equip historians and learners with +LLM-assisted tools to facilitate in-depth, multi-layered analysis of historical +texts and foster innovation in historical education. + +
+
+
+
+
+ + ☆ Large Language Models as User-Agents for Evaluating + Task-Oriented-Dialogue Systems + + +
+ Traditionally, offline datasets have been used to evaluate task-oriented +dialogue (TOD) models. These datasets lack context awareness, making them +suboptimal benchmarks for conversational systems. In contrast, user-agents, +which are context-aware, can simulate the variability and unpredictability of +human conversations, making them better alternatives as evaluators. Prior +research has utilized large language models (LLMs) to develop user-agents. Our +work builds upon this by using LLMs to create user-agents for the evaluation of +TOD systems. This involves prompting an LLM, using in-context examples as +guidance, and tracking the user-goal state. Our evaluation of diversity and +task completion metrics for the user-agents shows improved performance with the +use of better prompts. Additionally, we propose methodologies for the automatic +evaluation of TOD models within this dynamic framework. + +
+
+
+
+
+ + ☆ LoRA-LiteE: A Computationally Efficient Framework for Chatbot + Preference-Tuning + + +
+ Effective preference tuning is pivotal in aligning chatbot responses with +human expectations, enhancing user satisfaction and engagement. Traditional +approaches, notably Reinforcement Learning from Human Feedback (RLHF) as +employed in advanced models like GPT-4, have demonstrated considerable success +in this domain. However, RLHF methods are often computationally intensive and +resource-demanding, limiting their scalability and accessibility for broader +applications. To address these challenges, this study introduces LoRA-Lite +Ensemble (LoRA-LiteE), an innovative framework that combines Supervised +Fine-tuning (SFT) with Low-Rank Adaptation (LoRA) and Ensemble Learning +techniques to effectively aggregate predictions of lightweight models, which +aim to achieve a balance between the performance and computational cost. +Utilizing the Chatbot Arena benchmark dataset, we conduct a comprehensive +comparative analysis among our LoRA-LiteE model, corresponding base models at +different scales, and GPT-4 trained with RLHF. Our empirical results +demonstrate that the proposed LoRA-LiteE model achieves comparable performance +to un-finetuned GPT-4 and outperforms the single larger-scale models under +limited resource constraints. These findings highlight that our LoRA-LiteE +provides a feasible and efficient methodology for human preference prediction +in chatbot systems, enhancing scalability and accessibility, and thereby +broadening the applicability of preference-tuned chatbots in +resource-constrained environments. + +
+
+
+
+
+ + ☆ SlimLM: An Efficient Small Language Model for On-Device Document + Assistance + + +
+ While small language models (SLMs) show promise for mobile deployment, their +real-world performance and applications on smartphones remain underexplored. +We present SlimLM, a series of SLMs optimized for document assistance tasks on +mobile devices. Through extensive experiments on a Samsung Galaxy S24, we +identify the optimal trade-offs between model size (ranging from 125M to 7B +parameters), context length, and inference time for efficient on-device +processing. SlimLM is pre-trained on SlimPajama-627B and fine-tuned on +DocAssist, our constructed dataset for summarization, question answering and +suggestion tasks. Our smallest model demonstrates efficient performance on S24, +while larger variants offer enhanced capabilities within mobile constraints. We +evaluate SlimLM against existing SLMs, showing comparable or superior +performance and offering a benchmark for future research in on-device language +models. We also provide an Android application, offering practical insights +into SLM deployment. Our findings provide valuable insights and illuminate the +capabilities of running advanced language models on high-end smartphones, +potentially reducing server costs and enhancing privacy through on-device +processing. + +
+
+
+
+
+ + ☆ Refined and Segmented Price Sentiment Indices from Survey Comments + + +
+ We aim to enhance a price sentiment index and to more precisely understand +price trends from the perspective of not only consumers but also businesses. We +extract comments related to prices from the Economy Watchers Survey conducted +by the Cabinet Office of Japan and classify price trends using a large language +model (LLM). We classify whether the survey sample reflects the perspective of +consumers or businesses, and whether the comments pertain to goods or services +by utilizing information on the fields of comments and the industries of +respondents included in the Economy Watchers Survey. From these classified +price-related comments, we construct price sentiment indices not only for a +general purpose but also for more specific objectives by combining perspectives +on consumers and prices, as well as goods and services. It becomes possible to +achieve a more accurate classification of price directions by employing a LLM +for classification. Furthermore, integrating the outputs of multiple LLMs +suggests the potential for the better performance of the classification. The +use of more accurately classified comments allows for the construction of an +index with a higher correlation to existing indices than previous studies. We +demonstrate that the correlation of the price index for consumers, which has a +larger sample size, is further enhanced by selecting comments for aggregation +based on the industry of the survey respondents. + +
+
+ comment: Accepted to IEEE BigData 2024. 9 pages, 11 tables, 1 figure +
+
+
+
+
+ + ☆ JRadiEvo: A Japanese Radiology Report Generation Model Enhanced by + Evolutionary Optimization of Model Merging NeurIPS'24 + + +
+ With the rapid advancement of large language models (LLMs), foundational +models (FMs) have seen significant advancements. Healthcare is one of the most +crucial application areas for these FMs, given the significant time and effort +required for physicians to analyze large volumes of patient data. Recent +efforts have focused on adapting multimodal FMs to the medical domain through +techniques like instruction-tuning, leading to the development of medical +foundation models (MFMs). However, these approaches typically require large +amounts of training data to effectively adapt models to the medical field. +Moreover, most existing models are trained on English datasets, limiting their +practicality in non-English-speaking regions where healthcare professionals and +patients are not always fluent in English. The need for translation introduces +additional costs and inefficiencies. To address these challenges, we propose a +\textbf{J}apanese \textbf{Radi}ology report generation model enhanced by +\textbf{Evo}lutionary optimization of model merging (JRadiEvo). This is the +first attempt to extend a non-medical vision-language foundation model to the +medical domain through evolutionary optimization of model merging. We +successfully created a model that generates accurate Japanese reports from +X-ray images using only 50 translated samples from publicly available data. +This model, developed with highly efficient use of limited data, outperformed +leading models from recent research trained on much larger datasets. +Additionally, with only 8 billion parameters, this relatively compact +foundation model can be deployed locally within hospitals, making it a +practical solution for environments where APIs and other external services +cannot be used due to strict privacy and security requirements. + +
+
+ comment: Accepted by NeurIPS'24 Workshop on AIM-FM: Advancements In Medical + Foundation Models: Explainability, Robustness, Security, and Beyond +
+
+
+
+
+ + ☆ Research on Domain-Specific Chinese Spelling Correction Method Based on + Plugin Extension Modules + + +
+ This paper proposes a Chinese spelling correction method based on plugin +extension modules, aimed at addressing the limitations of existing models in +handling domain-specific texts. Traditional Chinese spelling correction models +are typically trained on general-domain datasets, resulting in poor performance +when encountering specialized terminology in domain-specific texts. To address +this issue, we design an extension module that learns the features of +domain-specific terminology, thereby enhancing the model's correction +capabilities within specific domains. This extension module can provide domain +knowledge to the model without compromising its general spelling correction +performance, thus improving its accuracy in specialized fields. Experimental +results demonstrate that after integrating extension modules for medical, +legal, and official document domains, the model's correction performance is +significantly improved compared to the baseline model without any extension +modules. + +
+
+
+
+
+ + ☆ KULCQ: An Unsupervised Keyword-based Utterance Level Clustering Quality + Metric + + +
+ Intent discovery is crucial for both building new conversational agents and +improving existing ones. While several approaches have been proposed for intent +discovery, most rely on clustering to group similar utterances together. +Traditional evaluation of these utterance clusters requires intent labels for +each utterance, limiting scalability. Although some clustering quality metrics +exist that do not require labeled data, they focus solely on cluster geometry +while ignoring the linguistic nuances present in conversational transcripts. In +this paper, we introduce Keyword-based Utterance Level Clustering Quality +(KULCQ), an unsupervised metric that leverages keyword analysis to evaluate +clustering quality. We demonstrate KULCQ's effectiveness by comparing it with +existing unsupervised clustering metrics and validate its performance through +comprehensive ablation studies. Our results show that KULCQ better captures +semantic relationships in conversational data while maintaining consistency +with geometric clustering principles. + +
+
+
+
+
+ + ☆ Leveraging large language models for efficient representation learning + for entity resolution + + +
+ In this paper, the authors propose TriBERTa, a supervised entity resolution +system that utilizes a pre-trained large language model and a triplet loss +function to learn representations for entity matching. The system consists of +two steps: first, name entity records are fed into a Sentence Bidirectional +Encoder Representations from Transformers (SBERT) model to generate vector +representations, which are then fine-tuned using contrastive learning based on +a triplet loss function. Fine-tuned representations are used as input for +entity matching tasks, and the results show that the proposed approach +outperforms state-of-the-art representations, including SBERT without +fine-tuning and conventional Term Frequency-Inverse Document Frequency +(TF-IDF), by a margin of 3 - 19%. Additionally, the representations generated +by TriBERTa demonstrated increased robustness, maintaining consistently higher +performance across a range of datasets. The authors also discussed the +importance of entity resolution in today's data-driven landscape and the +challenges that arise when identifying and reconciling duplicate data across +different sources. They also described the ER process, which involves several +crucial steps, including blocking, entity matching, and clustering. + +
+
+ comment: 22 pages and 12 figures +
+
+
+
+
+ + ☆ A dataset of questions on decision-theoretic reasoning in Newcomb-like + problems + + +
+ We introduce a dataset of natural-language questions in the decision theory +of so-called Newcomb-like problems. Newcomb-like problems include, for +instance, decision problems in which an agent interacts with a similar other +agent, and thus has to reason about the fact that the other agent will likely +reason in similar ways. Evaluating LLM reasoning about Newcomb-like problems is +important because interactions between foundation-model-based agents will often +be Newcomb-like. Some ways of reasoning about Newcomb-like problems may allow +for greater cooperation between models. + Our dataset contains both capabilities questions (i.e., questions with a +unique, uncontroversially correct answer) and attitude questions (i.e., +questions about which decision theorists would disagree). We use our dataset +for an investigation of decision-theoretical capabilities and expressed +attitudes and their interplay in existing models (different models by OpenAI, +Anthropic, Meta, GDM, Reka, etc.), as well as models under simple prompt-based +interventions. We find, among other things, that attitudes vary significantly +between existing models; that high capabilities are associated with attitudes +more favorable toward so-called evidential decision theory; and that attitudes +are consistent across different types of questions. + +
+
+ comment: 48 pages, 15 figures; code and data at + https://github.com/casparoe/newcomblike_questions_dataset +
+
+
+
+
+ + ☆ On the Shortcut Learning in Multilingual Neural Machine Translation + + +
+ In this study, we revisit the commonly-cited off-target issue in multilingual +neural machine translation (MNMT). By carefully designing experiments on +different MNMT scenarios and models, we attribute the off-target issue to the +overfitting of the shortcuts of (non-centric, centric) language mappings. +Specifically, the learned shortcuts bias MNMT to mistakenly translate +non-centric languages into the centric language instead of the expected +non-centric language for zero-shot translation. Analyses on learning dynamics +show that the shortcut learning generally occurs in the later stage of model +training, and multilingual pretraining accelerates and aggravates the shortcut +learning. Based on these observations, we propose a simple and effective +training strategy to eliminate the shortcuts in MNMT models by leveraging the +forgetting nature of model training. The only difference from the standard +training is that we remove the training instances that may induce the shortcut +learning in the later stage of model training. Without introducing any +additional data and computational costs, our approach can consistently and +significantly improve the zero-shot translation performance by alleviating the +shortcut learning for different MNMT models and benchmarks. + +
+
+ comment: Accepted by Neurocomputing 2024 +
+
+
+
+
+ + ☆ Hysteresis Activation Function for Efficient Inference NeurIPS + + +
+ The widely used ReLU is favored for its hardware efficiency, as the +implementation at inference is a one-bit sign case, yet suffers from issues +such as the "dying ReLU" problem, where during training, neurons fail to +activate and constantly remain at zero, as highlighted by Lu et al. Traditional +approaches to mitigate this issue often introduce more complex and less +hardware-friendly activation functions. In this work, we propose a Hysteresis +Rectified Linear Unit (HeLU), an efficient activation function designed to +address the "dying ReLU" problem with minimal complexity. Unlike traditional +activation functions with fixed thresholds for training and inference, HeLU +employs a variable threshold that refines the backpropagation. This refined +mechanism allows simpler activation functions to achieve competitive +performance comparable to their more complex counterparts without introducing +unnecessary complexity or requiring inductive biases. Empirical evaluations +demonstrate that HeLU enhances model generalization across diverse datasets, +offering a promising solution for efficient and effective inference suitable +for a wide range of neural network architectures. + +
+
+ comment: Accepted to 4th NeurIPS Efficient Natural Language and Speech + Processing Workshop (ENLSP-IV 2024) +
+
+
+
+
+ + ☆ mlan: language-based instruction tuning improves zero-shot + generalization of multimodal large language models + + +
+ We present a novel instruction tuning recipe to improve the zero-shot task +generalization of multimodal large language models. In contrast to existing +instruction tuning mechanisms that heavily rely on visual instructions, our +approach focuses on language-based instruction tuning, offering a distinct and +more training efficient path for multimodal instruction tuning. We evaluate the +performance of the proposed approach on 9 unseen datasets across both language +and vision modalities. Our results show that our language-only instruction +tuning is able to significantly improve the performance of two pretrained +multimodal models based on Llama 2 and Vicuna on those unseen datasets. +Interestingly, the language instruction following ability also helps unlock the +models to follow vision instructions without explicit training. Compared to the +state of the art multimodal instruction tuning approaches that are mainly based +on visual instructions, our language-based method not only achieves superior +performance but also significantly enhances training efficiency. For instance, +the language-only instruction tuning produces competitive average performance +across the evaluated datasets (with even better performance on language +datasets) with significant training efficiency improvements (on average 4x), +thanks to the striking reduction in the need for vision data. With a small +number of visual instructions, this emerging language instruction following +ability transfers well to the unseen vision datasets, outperforming the state +of the art with greater training efficiency. + +
+
+
+
+
+ + ☆ Efficient Alignment of Large Language Models via Data Sampling + + +
+ LLM alignment ensures that large language models behave safely and +effectively by aligning their outputs with human values, goals, and intentions. +Aligning LLMs requires huge amounts of data, computation, and time. Moreover, +curating data with human feedback is expensive and takes time. Recent research +depicts the benefit of data engineering in the fine-tuning and pre-training +paradigms to bring down such costs. However, alignment differs from the +afore-mentioned paradigms and it is unclear if data efficient alignment is +feasible. In this work, we first aim to understand how the performance of LLM +alignment scales with data. We find out that LLM alignment performance follows +an exponential plateau pattern which tapers off post a rapid initial increase. +Based on this, we identify data subsampling as a viable method to reduce +resources required for alignment. Further, we propose an information +theory-based methodology for efficient alignment by identifying a small high +quality subset thereby reducing the computation and time required by alignment. +We evaluate the proposed methodology over multiple datasets and compare the +results. We find that the model aligned using our proposed methodology +outperforms other sampling methods and performs comparably to the model aligned +with the full dataset while using less than 10% data, leading to greater than +90% savings in costs, resources, and faster LLM alignment. + +
+
+
+
+
+ + ☆ SoftLMs: Efficient Adaptive Low-Rank Approximation of Language Models + using Soft-Thresholding Mechanism + + +
+ Extensive efforts have been made to boost the performance in the domain of +language models by introducing various attention-based transformers. However, +the inclusion of linear layers with large dimensions contributes to significant +computational and memory overheads. The escalating computational demands of +these models necessitate the development of various compression techniques to +ensure their deployment on devices, particularly in resource-constrained +environments. In this paper, we propose a novel compression methodology that +dynamically determines the rank of each layer using a soft thresholding +mechanism, which clips the singular values with a small magnitude in a +differentiable form. This approach automates the decision-making process to +identify the optimal degree of compression for each layer. We have successfully +applied the proposed technique to attention-based architectures, including BERT +for discriminative tasks and GPT2 and TinyLlama for generative tasks. +Additionally, we have validated our method on Mamba, a recently proposed +state-space model. Our experiments demonstrate that the proposed technique +achieves a speed-up of 1.33X to 1.72X in the encoder/decoder with a 50% +reduction in total parameters. + +
+
+
+
+
+ + ☆ Does Prompt Formatting Have Any Impact on LLM Performance? NAACL 2025 + + +
+ In the realm of Large Language Models (LLMs), prompt optimization is crucial +for model performance. Although previous research has explored aspects like +rephrasing prompt contexts, using various prompting techniques (like in-context +learning and chain-of-thought), and ordering few-shot examples, our +understanding of LLM sensitivity to prompt templates remains limited. +Therefore, this paper examines the impact of different prompt templates on LLM +performance. We formatted the same contexts into various human-readable +templates, including plain text, Markdown, JSON, and YAML, and evaluated their +impact across tasks like natural language reasoning, code generation, and +translation using OpenAI's GPT models. Experiments show that GPT-3.5-turbo's +performance varies by up to 40% in a code translation task depending on the +prompt template, while larger models like GPT-4 are more robust to these +variations. Our analysis highlights the need to reconsider the use of fixed +prompt templates, as different formats can significantly affect model +performance. + +
+
+ comment: Submitted to NAACL 2025 +
+
+
+
+
+ + ♻ ☆ KPC-cF: Aspect-Based Sentiment Analysis via Implicit-Feature Alignment + with Corpus Filtering ICML 2024 + + +
+ Investigations into Aspect-Based Sentiment Analysis (ABSA) for Korean +industrial reviews are notably lacking in the existing literature. Our research +proposes an intuitive and effective framework for ABSA in low-resource +languages such as Korean. It optimizes prediction labels by integrating +translated benchmark and unlabeled Korean data. Using a model fine-tuned on +translated data, we pseudo-labeled the actual Korean NLI set. Subsequently, we +applied LaBSE and MSP-based filtering to this pseudo-NLI set as an implicit +feature, enhancing Aspect Category Detection and Polarity determination through +additional training. Incorporating dual filtering, this model bridged dataset +gaps, achieving positive results in Korean ABSA with minimal resources. Through +additional data injection pipelines, our approach aims to utilize high-resource +data and construct effective models within communities, whether corporate or +individual, in low-resource language countries. Compared to English ABSA, our +framework showed an approximately 3% difference in F1 scores and accuracy. We +release the dataset and our code for Korean ABSA, at this link. + +
+
+ comment: Work in Progress, DMLR@ICML 2024 +
+
+
+
+
+ + ♻ ☆ Mitigating the Linguistic Gap with Phonemic Representations for Robust + Cross-lingual Transfer EMNLP 2024 + + +
+ Approaches to improving multilingual language understanding often struggle +with significant performance gaps between high-resource and low-resource +languages. While there are efforts to align the languages in a single latent +space to mitigate such gaps, how different input-level representations +influence such gaps has not been investigated, particularly with phonemic +inputs. We hypothesize that the performance gaps are affected by representation +discrepancies between these languages, and revisit the use of phonemic +representations as a means to mitigate these discrepancies. To demonstrate the +effectiveness of phonemic representations, we present experiments on three +representative cross-lingual tasks on 12 languages in total. The results show +that phonemic representations exhibit higher similarities between languages +compared to orthographic representations, and it consistently outperforms +grapheme-based baseline model on languages that are relatively low-resourced. +We present quantitative evidence from three cross-lingual tasks that +demonstrate the effectiveness of phonemic representations, and it is further +justified by a theoretical analysis of the cross-lingual performance gap. + +
+
+ comment: Accepted to the 4th Multilingual Representation Learning (MRL) + Workshop (co-located with EMNLP 2024) +
+
+
+
+
+ + ♻ ☆ The Silicon Ceiling: Auditing GPT's Race and Gender Biases in Hiring + + +
+ Large language models (LLMs) are increasingly being introduced in workplace +settings, with the goals of improving efficiency and fairness. However, +concerns have arisen regarding these models' potential to reflect or exacerbate +social biases and stereotypes. This study explores the potential impact of LLMs +on hiring practices. To do so, we conduct an AI audit of race and gender biases +in one commonly-used LLM, OpenAI's GPT-3.5, taking inspiration from the history +of traditional offline resume audits. We conduct two studies using names with +varied race and gender connotations: resume assessment (Study 1) and resume +generation (Study 2). In Study 1, we ask GPT to score resumes with 32 different +names (4 names for each combination of the 2 gender and 4 racial groups) and +two anonymous options across 10 occupations and 3 evaluation tasks (overall +rating, willingness to interview, and hireability). We find that the model +reflects some biases based on stereotypes. In Study 2, we prompt GPT to create +resumes (10 for each name) for fictitious job candidates. When generating +resumes, GPT reveals underlying biases; women's resumes had occupations with +less experience, while Asian and Hispanic resumes had immigrant markers, such +as non-native English and non-U.S. education and work experiences. Our findings +contribute to a growing body of literature on LLM biases, particularly in +workplace contexts. + +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details on one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage fine +grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads up to 2x reduction of time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ♻ ☆ REBORN: Reinforcement-Learned Boundary Segmentation with Iterative + Training for Unsupervised ASR NeurIPS 2024 + + +
+ Unsupervised automatic speech recognition (ASR) aims to learn the mapping +between the speech signal and its corresponding textual transcription without +the supervision of paired speech-text data. A word/phoneme in the speech signal +is represented by a segment of speech signal with variable length and unknown +boundary, and this segmental structure makes learning the mapping between +speech and text challenging, especially without paired data. In this paper, we +propose REBORN, Reinforcement-Learned Boundary Segmentation with Iterative +Training for Unsupervised ASR. REBORN alternates between (1) training a +segmentation model that predicts the boundaries of the segmental structures in +speech signals and (2) training the phoneme prediction model, whose input is +the speech feature segmented by the segmentation model, to predict a phoneme +transcription. Since supervised data for training the segmentation model is not +available, we use reinforcement learning to train the segmentation model to +favor segmentations that yield phoneme sequence predictions with a lower +perplexity. We conduct extensive experiments and find that under the same +setting, REBORN outperforms all prior unsupervised ASR models on LibriSpeech, +TIMIT, and five non-English languages in Multilingual LibriSpeech. We +comprehensively analyze why the boundaries learned by REBORN improve the +unsupervised ASR performance. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ VLEU: a Method for Automatic Evaluation for Generalizability of + Text-to-Image Models EMNLP2024 + + +
+ Progress in Text-to-Image (T2I) models has significantly improved the +generation of images from textual descriptions. However, existing evaluation +metrics do not adequately assess the models' ability to handle a diverse range +of textual prompts, which is crucial for their generalizability. To address +this, we introduce a new metric called Visual Language Evaluation Understudy +(VLEU). VLEU uses large language models to sample from the visual text domain, +the set of all possible input texts for T2I models, to generate a wide variety +of prompts. The images generated from these prompts are evaluated based on +their alignment with the input text using the CLIP model. VLEU quantifies a +model's generalizability by computing the Kullback-Leibler divergence between +the marginal distribution of the visual text and the conditional distribution +of the images generated by the model. This metric provides a quantitative way +to compare different T2I models and track improvements during model finetuning. +Our experiments demonstrate the effectiveness of VLEU in evaluating the +generalization capability of various T2I models, positioning it as an essential +metric for future research in text-to-image synthesis. + +
+
+ comment: accepted by EMNLP2024(long paper,main conference) +
+
+
+
+
+ + ♻ ☆ Hyper-multi-step: The Truth Behind Difficult Long-context Tasks + + +
+ Long-context language models (LCLMs), characterized by their extensive context +window, are becoming increasingly popular. Meanwhile, many long-context +benchmarks present challenging tasks that even the most advanced LCLMs struggle +to complete. However, the underlying sources of various challenging +long-context tasks have seldom been studied. To bridge this gap, we conduct +experiments to indicate their difficulty stems primarily from two basic issues: +"multi-matching retrieval," which requires the simultaneous retrieval of +multiple items, and "logic-based retrieval," which necessitates logical +judgment within retrieval criteria. These two problems, while seemingly +straightforward, actually exceed the capabilities of LCLMs because they are +proven to be hyper-multi-step (demanding numerous steps to solve) in nature. +This finding could explain why LLMs struggle with more advanced long-context +tasks, providing a more accurate perspective for rethinking solutions for them. + +
+
+ comment: Our code is publicly available at + https://github.com/yuyijiong/hard_retrieval_for_llm and the dataset is at + https://huggingface.co/datasets/yuyijiong/difficult_retrieval +
+
+
+
+
+ + ♻ ☆ Evaluating and Enhancing Large Language Models for Conversational + Reasoning on Knowledge Graphs + + +
+ The development of large language models (LLMs) has been catalyzed by +advancements in pre-training techniques. These models have demonstrated robust +reasoning capabilities through manually designed prompts. In this work, we +evaluate the conversational reasoning capabilities of the current +state-of-the-art LLM (GPT-4) on knowledge graphs (KGs). However, the +performance of LLMs is constrained due to a lack of KG environment awareness +and the difficulties in developing effective optimization mechanisms for +intermediary reasoning stages. We further introduce LLM-ARK, a LLM grounded KG +reasoning agent designed to deliver precise and adaptable predictions on KG +paths. LLM-ARK leverages Full Textual Environment (FTE) prompt to assimilate +state information within each reasoning step. We reframe the challenge of +multi-hop reasoning on the KG as a sequential decision-making task. Utilizing +the Proximal Policy Optimization (PPO) online policy gradient reinforcement +learning algorithm, our model is optimized to learn from rich reward signals. +Additionally, we conduct an evaluation of our model and GPT-4 on the OpenDialKG +dataset. The experimental results reveal that LLaMA-2-7B-ARK outperforms the +current state-of-the-art model by 5.28 percentage points, with a performance +rate of 36.39% on the target@1 evaluation metric. Meanwhile, GPT-4 scored +14.91%, further demonstrating the effectiveness of our method. Our code is +available on GitHub (https://github.com/Aipura/LLM-ARK) for further access. + +
+
+
+
+
+ + ♻ ☆ MANTIS: Interleaved Multi-Image Instruction Tuning + + +
+ Large multimodal models (LMMs) have shown great results in single-image +vision language tasks. However, their abilities to solve multi-image visual +language tasks are yet to be improved. The existing LMMs like OpenFlamingo, +Emu2, and Idefics gain their multi-image ability through pre-training on +hundreds of millions of noisy interleaved image-text data from the web, which +is neither efficient nor effective. In this paper, we aim to build strong +multi-image LMMs via instruction tuning with academic-level resources. +Therefore, we meticulously construct Mantis-Instruct containing 721K +multi-image instruction data to train a family of Mantis models. The +instruction tuning empowers Mantis with different multi-image skills like +co-reference, comparison, reasoning, and temporal understanding. We evaluate +Mantis on 8 multi-image benchmarks and 6 single-image benchmarks. +Mantis-Idefics2 can achieve SoTA results on all the multi-image benchmarks and +beat the strongest multi-image baseline, Idefics2-8B by an average of 13 +absolute points. Notably, Idefics2-8B was pre-trained on 140M interleaved +multi-image data, which is 200x larger than Mantis-Instruct. We observe that +Mantis performs equivalently well on the held-in and held-out benchmarks, which +shows its generalization ability. We further evaluate Mantis on single-image +benchmarks and demonstrate that Mantis also maintains a strong single-image +performance on par with CogVLM and Emu2. Our results show that multi-image +abilities are not necessarily gained through massive pre-training; instead, +they can be gained by low-cost instruction tuning. The training and evaluation +of Mantis have paved the road for future work to improve LMMs' multi-image +abilities. + +
+
+ comment: 13 pages, 3 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ A Dynamic LLM-Powered Agent Network for Task-Oriented Agent + Collaboration ALT + + +
+ Recent studies show that collaborating multiple large language model (LLM) +powered agents is a promising way for task solving. However, current approaches +are constrained by using a fixed number of agents and static communication +structures. In this work, we propose automatically selecting a team of agents +from candidates to collaborate in a dynamic communication structure toward +different tasks and domains. Specifically, we build a framework named Dynamic +LLM-Powered Agent Network (DyLAN) for LLM-powered agent +collaboration, operating a two-stage paradigm: (1) Team Optimization and (2) +Task Solving. During the first stage, we utilize an agent selection +algorithm, based on an unsupervised metric called Agent Importance +Score, enabling the selection of best agents according to their contributions +in a preliminary trial, oriented to the given task. Then, in the second stage, +the selected agents collaborate dynamically according to the query. +Empirically, we demonstrate that DyLAN outperforms strong baselines in code +generation, decision-making, general reasoning, and arithmetic reasoning tasks +with moderate computational cost. On specific subjects in MMLU, selecting a +team of agents in the team optimization stage improves accuracy by up to 25.0% +in DyLAN. + +
+
+ comment: Published in COLM2024. Code Repo: https://github.com/SALT-NLP/DyLAN +
+
+
+
+
+ + ♻ ☆ ORLM: A Customizable Framework in Training Large Models for Automated + Optimization Modeling + + +
+ Optimization modeling and solving play a critical role in the application of +Operations Research (OR) tools to address real-world problems, yet they pose +challenges and require extensive expertise from OR experts. With the advent of +large language models (LLMs), new opportunities have emerged to streamline and +automate these tasks. However, current research predominantly relies on +closed-source LLMs such as GPT-4, along with extensive prompt engineering +techniques. This reliance stems from the scarcity of high-quality training +datasets for optimization modeling, resulting in elevated costs, prolonged +processing times, and privacy concerns. To address these challenges, our work +is the first to propose a viable path for training open-source LLMs that are +capable of optimization modeling as well as developing and executing solver +codes, eventually leading to a superior ability for automating optimization +modeling and solving. Particularly, we introduce a semi-automated data +synthesis framework designed for optimization modeling issues, named +OR-Instruct. This framework merges the training data requirements of large +models with the unique characteristics of optimization modeling problems, and +allows for customizable enhancements tailored to specific scenarios or modeling +types. To evaluate the performance of our proposed framework, we present the +IndustryOR benchmark, the inaugural industrial standard for evaluating LLMs in +solving practical OR problems. Utilizing data synthesized through OR-Instruct, +we train various open-source LLMs with a capacity of 7 billion parameters +(dubbed ORLMs). The resulting model demonstrates significantly enhanced +optimization modeling capabilities, achieving state-of-the-art performance +across the NL4OPT, MAMO, and IndustryOR benchmarks. Our code and data are +available at https://github.com/Cardinal-Operations/ORLM. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ How Does Vision-Language Adaptation Impact the Safety of Vision Language + Models? + + +
+ Vision-Language adaptation (VL adaptation) transforms Large Language Models +(LLMs) into Large Vision-Language Models (LVLMs) for multimodal tasks, but this +process often compromises the inherent safety capabilities embedded in the +original LLMs. Despite potential harmfulness due to weakened safety measures, +in-depth analysis on the effects of VL adaptation on safety remains +under-explored. This study examines how VL adaptation influences safety and +evaluates the impact of safety fine-tuning methods. Our analysis reveals that +safety degradation occurs during VL adaptation, even when the training data is +safe. While safety tuning techniques like supervised fine-tuning with safety +datasets or reinforcement learning from human feedback mitigate some risks, +they still lead to safety degradation and a reduction in helpfulness due to +over-rejection issues. Further analysis of internal model weights suggests that +VL adaptation may impact certain safety-related layers, potentially lowering +overall safety levels. Additionally, our findings demonstrate that the +objectives of VL adaptation and safety tuning are divergent, which often +results in their simultaneous application being suboptimal. To address this, we +suggest the weight merging approach as an optimal solution effectively reducing +safety degradation while maintaining helpfulness. These insights help guide the +development of more reliable and secure LVLMs for real-world applications. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ DriveThru: a Document Extraction Platform and Benchmark Datasets for + Indonesian Local Language Archives + + +
+ Indonesia is one of the most diverse countries linguistically. However, +despite this linguistic diversity, Indonesian languages remain underrepresented +in Natural Language Processing (NLP) research and technologies. In the past two +years, several efforts have been conducted to construct NLP resources for +Indonesian languages. However, most of these efforts have been focused on +creating manual resources thus difficult to scale to more languages. Although +many Indonesian languages do not have a web presence, locally there are +resources that document these languages well in printed forms such as books, +magazines, and newspapers. Digitizing these existing resources will enable +scaling of Indonesian language resource construction to many more languages. In +this paper, we propose an alternative method of creating datasets by digitizing +documents, which have not previously been used to build digital language +resources in Indonesia. DriveThru is a platform for extracting document content +utilizing Optical Character Recognition (OCR) techniques in its system to +provide language resource building with less manual effort and cost. This paper +also studies the utility of current state-of-the-art LLM for post-OCR +correction to show the capability of increasing the character accuracy rate +(CAR) and word accuracy rate (WAR) compared to off-the-shelf OCR. + +
+
+ comment: 12 pages, 3 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Geometry of orofacial neuromuscular signals: speech articulation + decoding using surface electromyography + + +
+ Each year, millions of individuals lose the ability to speak intelligibly due +to causes such as neuromuscular disease, stroke, trauma, and head/neck cancer +surgery (e.g. laryngectomy) or treatment (e.g. radiotherapy toxicity to the +speech articulators). Effective communication is crucial for daily activities, +and losing the ability to speak leads to isolation, depression, anxiety, and a +host of detrimental sequelae. Noninvasive surface electromyography (sEMG) has +shown promise to restore speech output in these individuals. The goal is to +collect sEMG signals from multiple articulatory sites as people silently +produce speech and then decode the signals to enable fluent and natural +communication. Currently, many fundamental properties of orofacial +neuromuscular signals relating to speech articulation remain unanswered. They +include questions relating to 1) the data structure of the orofacial sEMG +signals, 2) the signal distribution shift of sEMG across individuals, 3) the ability +of sEMG signals to span the entire English language phonetic space during +silent speech articulations, and 4) the generalization capability of +non-invasive sEMG based silent speech interfaces. We address these questions +through a series of experiments involving healthy human subjects. We show that +sEMG signals evince graph data structure and that the signal distribution shift +is given by a change of basis. Furthermore, we show that silently voiced +articulations spanning the entire English language phonetic space can be +decoded using small neural networks which can be trained with little data and +that such architectures work well across individuals. To ensure transparency +and reproducibility, we open-source all the data and codes used in this study. + +
+
+
+
+
+ + ♻ ☆ Automated Clinical Data Extraction with Knowledge Conditioned LLMs COLING25 + + +
+ The extraction of lung lesion information from clinical and medical imaging +reports is crucial for research on and clinical care of lung-related diseases. +Large language models (LLMs) can be effective at interpreting unstructured text +in reports, but they often hallucinate due to a lack of domain-specific +knowledge, leading to reduced accuracy and posing challenges for use in +clinical settings. To address this, we propose a novel framework that aligns +generated internal knowledge with external knowledge through in-context +learning (ICL). Our framework employs a retriever to identify relevant units of +internal or external knowledge and a grader to evaluate the truthfulness and +helpfulness of the retrieved internal-knowledge rules, to align and update the +knowledge bases. Experiments with expert-curated test datasets demonstrate that +this ICL approach can increase the F1 score for key fields (lesion size, margin +and solidity) by an average of 12.9% over existing ICL methods. + +
+
+ comment: COLING25 Industry Track +
+
+
+
+
+ + ♻ ☆ Demystifying Large Language Models for Medicine: A Primer + + +
+ Large language models (LLMs) represent a transformative class of AI tools +capable of revolutionizing various aspects of healthcare by generating +human-like responses across diverse contexts and adapting to novel tasks +following human instructions. Their potential application spans a broad range +of medical tasks, such as clinical documentation, matching patients to clinical +trials, and answering medical questions. In this primer paper, we propose an +actionable guideline to help healthcare professionals more efficiently utilize +LLMs in their work, along with a set of best practices. This approach consists +of several main phases, including formulating the task, choosing LLMs, prompt +engineering, fine-tuning, and deployment. We start with the discussion of +critical considerations in identifying healthcare tasks that align with the +core capabilities of LLMs and selecting models based on the selected task and +data, performance requirements, and model interface. We then review the +strategies, such as prompt engineering and fine-tuning, to adapt standard LLMs +to specialized medical tasks. Deployment considerations, including regulatory +compliance, ethical guidelines, and continuous monitoring for fairness and +bias, are also discussed. By providing a structured step-by-step methodology, +this tutorial aims to equip healthcare professionals with the tools necessary +to effectively integrate LLMs into clinical practice, ensuring that these +powerful technologies are applied in a safe, reliable, and impactful manner. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ A Benchmark for Cross-Domain Argumentative Stance Classification on + Social Media AAAI + + +
+ Argumentative stance classification plays a key role in identifying authors' +viewpoints on specific topics. However, generating diverse pairs of +argumentative sentences across various domains is challenging. Existing +benchmarks often come from a single domain or focus on a limited set of topics. +Additionally, manual annotation for accurate labeling is time-consuming and +labor-intensive. To address these challenges, we propose leveraging platform +rules, readily available expert-curated content, and large language models to +bypass the need for human annotation. Our approach produces a multidomain +benchmark comprising 4,498 topical claims and 30,961 arguments from three +sources, spanning 21 domains. We benchmark the dataset in fully supervised, +zero-shot, and few-shot settings, shedding light on the strengths and +limitations of different methodologies. We release the dataset and code in this +study at hidden for anonymity. + +
+
+ comment: Accepted by AAAI ICWSM 2025 +
+
+
+
+
+ + ♻ ☆ Upsample or Upweight? Balanced Training on Heavily Imbalanced Datasets + + +
+ Data availability across domains often follows a long-tail distribution: a +few domains have abundant data, while most face data scarcity. This +imbalance poses challenges in training language models uniformly across all +domains. In our study, we focus on multilingual settings, where data sizes vary +significantly between high- and low-resource languages. Common strategies to +address this include upsampling low-resource languages (Temperature Sampling) +or upweighting their loss (Scalarization). Although often considered +equivalent, this assumption has not been proven, which motivates our study. +Through both theoretical and empirical analysis, we identify the conditions +under which these approaches are equivalent and when they diverge. +Specifically, we demonstrate that these two methods are equivalent under full +gradient descent, but this equivalence breaks down with stochastic gradient +descent. Empirically, we observe that Temperature Sampling converges more +quickly but is prone to overfitting. We argue that this faster convergence is +likely due to the lower variance in gradient estimations, as shown +theoretically. Based on these insights, we propose Cooldown, a strategy that +reduces sampling temperature during training, accelerating convergence without +overfitting to low-resource languages. Our method is competitive with existing +data re-weighting and offers computational efficiency. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Sim-CLIP: Unsupervised Siamese Adversarial Fine-Tuning for Robust and + Semantically-Rich Vision-Language Models + + +
+ Vision-language models (VLMs) have made significant strides in recent +times, especially in multimodal tasks, yet they remain susceptible to adversarial +attacks on their vision components. To address this, we propose Sim-CLIP, an +unsupervised adversarial fine-tuning method that enhances the robustness of the +widely-used CLIP vision encoder against such attacks while maintaining semantic +richness and specificity. By employing a Siamese architecture with cosine +similarity loss, Sim-CLIP learns semantically meaningful and attack-resilient +visual representations without requiring large batch sizes or momentum +encoders. Our results demonstrate that VLMs enhanced with Sim-CLIP's fine-tuned +CLIP encoder exhibit significantly enhanced robustness against adversarial +attacks, while preserving semantic meaning of the perturbed images. Notably, +Sim-CLIP does not require additional training or fine-tuning of the VLM itself; +replacing the original vision encoder with our fine-tuned Sim-CLIP suffices to +provide robustness. This work underscores the significance of reinforcing +foundational models like CLIP to safeguard the reliability of downstream VLM +applications, paving the way for more secure and effective multimodal systems. + +
+
+
+
+
+ + ♻ ☆ Hybrid Querying Over Relational Databases and Large Language Models + + +
+ Database queries traditionally operate under the closed-world assumption, +providing no answers to questions that require information beyond the data +stored in the database. Hybrid querying using SQL offers an alternative by +integrating relational databases with large language models (LLMs) to answer +beyond-database questions. In this paper, we present the first cross-domain +benchmark, SWAN, containing 120 beyond-database questions over four real-world +databases. To leverage state-of-the-art language models in addressing these +complex questions in SWAN, we present two solutions: one based on schema +expansion and the other based on user defined functions. We also discuss +optimization opportunities and potential future directions. Our evaluation +demonstrates that using GPT-4 Turbo with few-shot prompts, one can achieve up +to 40.0% in execution accuracy and 48.2% in data factuality. These results +highlight both the potential and challenges for hybrid querying. We believe +that our work will inspire further research in creating more efficient and +accurate data systems that seamlessly integrate relational databases and large +language models to address beyond-database questions. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 38 + +
+
+
+ + ☆ Motion Before Action: Diffusing Object Motion as Manipulation Condition + + +
+ Inferring object motion representations from observations enhances the +performance of robotic manipulation tasks. This paper introduces a new paradigm +for robot imitation learning that generates action sequences by reasoning about +object motion from visual observations. We propose MBA (Motion Before Action), +a novel module that employs two cascaded diffusion processes for object motion +generation and robot action generation under object motion guidance. MBA first +predicts the future pose sequence of the object based on observations, then +uses this sequence as a condition to guide robot action generation. Designed as +a plug-and-play component, MBA can be flexibly integrated into existing robotic +manipulation policies with diffusion action heads. Extensive experiments in +both simulated and real-world environments demonstrate that our approach +substantially improves the performance of existing policies across a wide range +of manipulation tasks. + +
+
+
+
+
+ + ☆ Modular Fault Diagnosis Framework for Complex Autonomous Driving Systems + + +
+ Fault diagnosis is crucial for complex autonomous mobile systems, especially +for modern-day autonomous driving (AD). Different actors, numerous use cases, +and complex heterogeneous components motivate a fault diagnosis of the system +and overall system integrity. AD systems are composed of many heterogeneous +components, each with different functionality and possibly using a different +algorithm (e.g., rule-based vs. AI components). In addition, these components +are subject to the vehicle's driving state and are highly dependent. This +paper, therefore, faces this problem by presenting the concept of a modular +fault diagnosis framework for AD systems. The concept suggests modular state +monitoring and diagnosis elements, together with a state- and dependency-aware +aggregation method. Our proposed classification scheme allows for the +categorization of the fault diagnosis modules. The concept is implemented on AD +shuttle buses and evaluated to demonstrate its capabilities. + +
+
+ comment: Accepted at 2024 IEEE 20th International Conference on Intelligent + Computer Communication and Processing (ICCP 2024) +
+
+
+
+
+ + ☆ One-Shot Manipulation Strategy Learning by Making Contact Analogies CoRL + + +
+ We present a novel approach, MAGIC (manipulation analogies for generalizable +intelligent contacts), for one-shot learning of manipulation strategies with +fast and extensive generalization to novel objects. By leveraging a reference +action trajectory, MAGIC effectively identifies similar contact points and +sequences of actions on novel objects to replicate a demonstrated strategy, +such as using different hooks to retrieve distant objects of different shapes +and sizes. Our method is based on a two-stage contact-point matching process +that combines global shape matching using pretrained neural features with local +curvature analysis to ensure precise and physically plausible contact points. +We experiment with three tasks including scooping, hanging, and hooking +objects. MAGIC demonstrates superior performance over existing methods, +achieving significant improvements in runtime speed and generalization to +different object categories. Website: https://magic-2024.github.io/ . + +
+
+ comment: CoRL LEAP Workshop, 2024 +
+
+
+
+
+ + ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm, showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ☆ Smart Automation in Luxury Leather Shoe Polishing: A Human Centric + Robotic Approach + + +
+ The polishing of luxury leather shoes is a delicate, labor intensive process +traditionally performed by skilled craftsmen. Footwear companies aim to +automate parts of this process to enhance quality, productivity, and operator +well-being, but the unique nature of luxury shoe production presents +challenges. This paper introduces a solution involving a collaborative robotic +cell to assist in shoe polishing. A collaborative robotic manipulator, equipped +with a specialized tool and governed by force control, executes the polishing +tasks. Key factors such as trajectory design, applied force, polishing speed, +and polish amount were analyzed. Polishing trajectories are designed using CAM +software and transferred to the robot control system. Human operators design +the process, supervise the robot, and perform final finishing, ensuring their +expertise is integral to achieving quality. Extensive testing on various shoe +models showed significant improvements in quality and reliability, leading to +successful implementation on an industrial production line. + +
+
+
+
+
+ + ☆ Vlimb: A Wire-Driven Wearable Robot for Bodily Extension, Balancing + Powerfulness and Reachability + + +
+ Numerous wearable robots have been developed to meet the demands of physical +assistance and entertainment. These wearable robots range from body-enhancing +types that assist human arms and legs to body-extending types that have extra +arms. This study focuses specifically on wearable robots of the latter +category, aimed at bodily extension. However, they have not yet achieved the +level of powerfulness and reachability equivalent to that of human limbs, +limiting their application to entertainment and manipulation tasks involving +lightweight objects. Therefore, in this study, we develop a body-extending +wearable robot, Vlimb, which has enough powerfulness to lift a human and can +perform manipulation. Leveraging the advantages of tendon-driven mechanisms, +Vlimb incorporates a wire routing mechanism capable of accommodating both +delicate manipulations and robust lifting tasks. Moreover, by introducing a +passive ring structure to overcome the limited reachability inherent in +tendon-driven mechanisms, Vlimb achieves both the powerfulness and reachability +comparable to that of humans. This paper outlines the design methodology of +Vlimb, conducts preliminary manipulation and lifting tasks, and verifies its +effectiveness. + +
+
+
+
+
+ + ☆ FlowNav: Learning Efficient Navigation Policies via Conditional Flow + Matching CoRL 2024 + + +
+ Effective robot navigation in dynamic environments is a challenging task that +depends on generating precise control actions at high frequencies. Recent +advancements have framed navigation as a goal-conditioned control problem. +Current state-of-the-art methods for goal-based navigation, such as diffusion +policies, either generate sub-goal images or robot control actions to guide +robots. However, despite their high accuracy, these methods incur substantial +computational costs, which limits their practicality for real-time +applications. Recently, Conditional Flow Matching (CFM) has emerged as a more +efficient and robust generalization of diffusion. In this work, we explore the +use of CFM to learn action policies that help the robot navigate its +environment. Our results demonstrate that CFM is able to generate highly +accurate robot actions. CFM not only matches the accuracy of diffusion policies +but also significantly improves runtime performance. This makes it particularly +advantageous for real-time robot navigation, where swift, reliable action +generation is vital for collision avoidance and smooth operation. By leveraging +CFM, we provide a pathway to more scalable, responsive robot navigation systems +capable of handling the demands of dynamic and unpredictable environments. + +
+
+ comment: Accepted at CoRL 2024 workshop on Learning Effective Abstractions for + Planning (LEAP) and workshop on Differentiable Optimization Everywhere: + Simulation, Estimation, Learning, and Control. 7 pages + 2 pages of + references, 7 figures +
+
+
+
+
+ + ☆ Strategic Sacrifice: Self-Organized Robot Swarm Localization for + Inspection Productivity + + +
+ Robot swarms offer significant potential for inspecting diverse +infrastructure, ranging from bridges to space stations. However, effective +inspection requires accurate robot localization, which demands substantial +computational resources and limits productivity. Inspired by biological +systems, we introduce a novel cooperative localization mechanism that minimizes +collective computation expenditure through self-organized sacrifice. Here, a +few agents bear the computational burden of localization; through local +interactions, they improve the inspection productivity of the swarm. Our +approach adaptively maximizes inspection productivity for unconstrained +trajectories in dynamic interaction and environmental settings. We demonstrate +the optimality and robustness using mean-field analytical models, multi-agent +simulations, and hardware experiments with metal climbing robots inspecting a +3D cylinder. + +
+
+ comment: 14 pages, 10 figures, 17th International Symposium on Distributed + Autonomous Robotic Systems (DARS'24) +
+
+
+
+
+ + ☆ DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous + Vehicle Testing + + +
+ Generating realistic and diverse road scenarios is essential for autonomous +vehicle testing and validation. Nevertheless, owing to the complexity and +variability of real-world road environments, creating authentic and varied +scenarios for intelligent driving testing is challenging. In this paper, we +propose DiffRoad, a novel diffusion model designed to produce controllable and +high-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities +of diffusion models to synthesize road layouts from white noise through an +inverse denoising process, preserving real-world spatial features. To enhance +the quality of generated scenarios, we design the Road-UNet architecture, +optimizing the balance between backbone and skip connections for high-realism +scenario generation. Furthermore, we introduce a road scenario evaluation +module that screens adequate and reasonable scenarios for intelligent driving +testing using two critical metrics: road continuity and road reasonableness. +Experimental results on multiple real-world datasets demonstrate DiffRoad's +ability to generate realistic and smooth road structures while maintaining the +original distribution. Additionally, the generated scenarios can be fully +automated into the OpenDRIVE format, facilitating generalized autonomous +vehicle simulation testing. DiffRoad provides a rich and diverse scenario +library for large-scale autonomous vehicle testing and offers valuable insights +for future infrastructure designs that are better suited for autonomous +vehicles. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ A ROS~2-based Navigation and Simulation Stack for the Robotino + + +
+ The Robotino, developed by Festo Didactic, serves as a versatile platform in +education and research for mobile robotics tasks. However, there currently is +no ROS2 integration for the Robotino available. In this paper, we describe our +work on a Webots simulation environment for a Robotino platform extended by +LIDAR sensors. A ROS2 integration and a pre-configured setup for localization +and navigation using existing ROS packages from the Nav2 suite are provided. We +validate our setup by comparing simulations with real-world experiments +conducted by three Robotinos in a logistics environment in our lab. +Additionally, we tested the setup using a ROS 2 hardware driver for the +Robotino developed by team GRIPS of the RoboCup Logistics League. The results +demonstrate the feasibility of using ROS2 and Nav2 for navigation tasks on the +Robotino platform showing great consistency between simulation and real-world +performance. + +
+
+ comment: Published at RoboCup 2024: Robot World Cup XXVII, Springer-Verlag, + 2024 +
+
+
+
+
+ + ☆ Robot Tasks with Fuzzy Time Requirements from Natural Language + Instructions + + +
+ Natural language allows robot programming to be accessible to everyone. +However, the inherent fuzziness in natural language poses challenges for +inflexible, traditional robot systems. We focus on instructions with fuzzy time +requirements (e.g., "start in a few minutes"). Building on previous robotics +research, we introduce fuzzy skills. These define an execution by the robot +with so-called satisfaction functions representing vague execution time +requirements. Such functions express a user's satisfaction over potential +starting times for skill execution. When the robot handles multiple fuzzy +skills, the satisfaction function provides a temporal tolerance window for +execution, thus, enabling optimal scheduling based on satisfaction. We +generalized such functions based on individual user expectations with a user +study. The participants rated their satisfaction with an instruction's +execution at various times. Our investigations reveal that trapezoidal +functions best approximate the users' satisfaction. Additionally, the results +suggest that users are more lenient if the execution is specified further into +the future. + +
+
+ comment: 9 pages, 8 figures, to be published in 2024 IEEE International + Conference on Robotic Computing (IRC) +
+
+
+
+
+ + ☆ D4W: Dependable Data-Driven Dynamics for Wheeled Robots + + +
+ Wheeled robots have gained significant attention due to their wide range of +applications in manufacturing, logistics, and service industries. However, due +to the difficulty of building a highly accurate dynamics model for wheeled +robots, developing and testing control algorithms for them remains challenging +and time-consuming, requiring extensive physical experimentation. To address +this problem, we propose D4W, i.e., Dependable Data-Driven Dynamics for Wheeled +Robots, a simulation framework incorporating data-driven methods to accelerate +the development and evaluation of algorithms for wheeled robots. The key +contribution of D4W is a solution that utilizes real-world sensor data to learn +accurate models of robot dynamics. The learned dynamics can capture complex +robot behaviors and interactions with the environment throughout simulations, +surpassing the limitations of analytical methods, which only work in simplified +scenarios. Experimental results show that D4W achieves the best simulation +accuracy compared to traditional approaches, allowing for rapid iteration of +wheel robot algorithms with less or no need for fine-tuning in reality. We +further verify the usability and practicality of the proposed framework through +integration with existing simulators and controllers. + +
+
+ comment: The Fifth International Conference on Distributed Artificial + Intelligence +
+
+
+
+
+ + ☆ Hearing the Robot's Mind: Sonification for Explicit Feedback in + Human-Robot Interaction + + +
+ Social robots are required not only to understand human intentions but also +to effectively communicate their intentions or own internal states to users. +This study explores the use of sonification to provide explicit auditory +feedback, enhancing mutual understanding in HRI. We introduce a novel +sonification approach that conveys the robot's internal state, linked to its +perception of nearby individuals and their interaction intentions. The approach +is evaluated through a two-fold user study: an online video-based survey with +$26$ participants and live experiments with $10$ participants. Results indicate +that while sonification improves the robot's expressivity and communication +effectiveness, the design of the auditory feedback needs refinement to enhance +user experience. Participants found the auditory cues useful but described the +sounds as uninteresting and unpleasant. These findings underscore the +importance of carefully designed auditory feedback in developing more effective +and engaging HRI systems. + +
+
+
+
+
+ + ☆ Learning Hand State Estimation for a Light Exoskeleton + + +
+ We propose a machine learning-based estimator of the hand state for +rehabilitation purposes, using light exoskeletons. These devices are easy to +use and useful for delivering domestic and frequent therapies. We build a +supervised approach using information from the muscular activity of the forearm +and the motion of the exoskeleton to reconstruct the hand's opening degree and +compliance level. Such information can be used to evaluate the therapy progress +and develop adaptive control behaviors. Our approach is validated with a real +light exoskeleton. The experiments demonstrate good predictive performance of +our approach when trained on data coming from a single user and tested on the +same user, even across different sessions. This generalization capability makes +our system promising for practical use in real rehabilitation. + +
+
+
+
+
+ + ☆ BlueME: Robust Underwater Robot-to-Robot Communication Using Compact + Magnetoelectric Antennas + + +
+ We present the design, development, and experimental validation of BlueME, a +compact magnetoelectric (ME) antenna array system for underwater robot-to-robot +communication. BlueME employs ME antennas operating at their natural mechanical +resonance frequency to efficiently transmit and receive very-low-frequency +(VLF) electromagnetic signals underwater. To evaluate its performance, we +deployed BlueME on an autonomous surface vehicle (ASV) and a remotely operated +vehicle (ROV) in open-water field trials. Our tests demonstrate that BlueME +maintains reliable signal transmission at distances beyond 200 meters while +consuming only 1 watt of power. Field trials show that the system operates +effectively in challenging underwater conditions such as turbidity, obstacles, +and multipath interference, which generally affect acoustics and optics. Our +analysis also examines the impact of complete submersion on system performance +and identifies key deployment considerations. This work represents the first +practical underwater deployment of ME antennas outside the laboratory and +implements the largest VLF ME array system to date. BlueME demonstrates +significant potential for marine robotics and automation in multi-robot +cooperative systems and remote sensor networks. + +
+
+
+
+
+ + ☆ Risk-aware MPPI for Stochastic Hybrid Systems + + +
+ Path Planning for stochastic hybrid systems presents a unique challenge of +predicting distributions of future states subject to a state-dependent dynamics +switching function. In this work, we propose a variant of Model Predictive Path +Integral Control (MPPI) to plan kinodynamic paths for such systems. Monte Carlo +may be inaccurate when few samples are chosen to predict future states under +state-dependent disturbances. We employ recently proposed Unscented +Transform-based methods to capture stochasticity in the states as well as the +state-dependent switching surfaces. This is in contrast to previous works that +perform switching based only on the mean of predicted states. We focus our +motion planning application on the navigation of a mobile robot in the presence +of dynamically moving agents whose responses are based on sensor-constrained +attention zones. We evaluate our framework on a simulated mobile robot and show +faster convergence to a goal without collisions when the robot exploits the +hybrid human dynamics versus when it does not. + +
+
+
+
+
+ + ☆ Rationality based Innate-Values-driven Reinforcement Learning + + +
+ Innate values describe agents' intrinsic motivations, which reflect their +inherent interests and preferences to pursue goals and drive them to develop +diverse skills satisfying their various needs. The essence of reinforcement +learning (RL) is learning from interaction based on reward-driven behaviors, +much like natural agents. It is an excellent model to describe the +innate-values-driven (IV) behaviors of AI agents. In particular, developing the +awareness of the AI agent through balancing internal and external utilities +based on its needs in different tasks is a crucial problem for individuals +learning to support AI agents integrating into human society with safety and harmony +in the long term. This paper proposes a hierarchical compound intrinsic value +reinforcement learning model -- innate-values-driven reinforcement learning, +termed IVRL -- to describe the complex behaviors of AI agents' interaction. We +formulated the IVRL model and proposed two IVRL models built on DQN and A2C. By +comparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the +Role-Playing Game (RPG) reinforcement learning test platform VIZDoom, we +demonstrated that rationally organizing various individual needs can +effectively achieve better performance. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.05572 +
+
+
+
+
+ + ☆ VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for + Effective Robot Manipulation NeurIPS 2024 + + +
+ Recent advancements utilizing large-scale video data for learning video +generation models demonstrate significant potential in understanding complex +physical dynamics. It suggests the feasibility of leveraging diverse robot +trajectory data to develop a unified, dynamics-aware model to enhance robot +manipulation. However, given the relatively small amount of available robot +data, directly fitting data without considering the relationship between visual +observations and actions could lead to suboptimal data utilization. To this +end, we propose VidMan (Video Diffusion for Robot Manipulation), a novel +framework that employs a two-stage training mechanism inspired by dual-process +theory from neuroscience to enhance stability and improve data utilization +efficiency. Specifically, in the first stage, VidMan is pre-trained on the Open +X-Embodiment dataset (OXE) for predicting future visual trajectories in a video +denoising diffusion manner, enabling the model to develop a long-horizon +awareness of the environment's dynamics. In the second stage, a flexible yet +effective layer-wise self-attention adapter is introduced to transform VidMan +into an efficient inverse dynamics model that predicts action modulated by the +implicit dynamics knowledge via parameter sharing. Our VidMan framework +outperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark, +achieving an 11.7% relative improvement, and demonstrates over 9% precision +gains on the OXE small-scale dataset. These results provide compelling evidence +that world models can significantly enhance the precision of robot action +prediction. Codes and models will be public. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for + Egocentric Hand Object Interaction Videos + + +
+ Egocentric Hand Object Interaction (HOI) videos provide valuable insights +into human interactions with the physical world, attracting growing interest +from the computer vision and robotics communities. A key task in fully +understanding the geometry and dynamics of HOI scenes is dense point cloud +sequence reconstruction. However, the inherent motion of both hands and the +camera makes this challenging. Current methods often rely on time-consuming +test-time optimization, making them impractical for reconstructing +internet-scale videos. To address this, we introduce UniHOI, a model that +unifies the estimation of all variables necessary for dense 4D reconstruction, +including camera intrinsics, camera poses, and video depth, for egocentric HOI +scenes in a fast feed-forward manner. We end-to-end optimize all these variables +to improve their consistency in 3D space. Furthermore, our model can be +trained solely on large-scale monocular video datasets, overcoming the +limitation of scarce labeled HOI data. We evaluate UniHOI in both in-domain +and zero-shot generalization settings, surpassing all baselines in point cloud +sequence reconstruction and long-term 3D scene flow recovery. UniHOI is the +first approach to offer fast, dense, and generalizable monocular egocentric HOI +scene reconstruction in the presence of motion. Code and trained models will be +released in the future. + +
+
+
+
+
+ + ☆ Information-Optimal Multi-Spacecraft Positioning for Interstellar Object + Exploration + + +
+ Interstellar objects (ISOs), astronomical objects not gravitationally bound +to the sun, could present valuable opportunities to advance our understanding +of the universe's formation and composition. In response to the unpredictable +nature of their discoveries that inherently come with large and rapidly +changing uncertainty in their state, this paper proposes a novel +multi-spacecraft framework for locally maximizing information to be gained +through ISO encounters with formal probabilistic guarantees. Given some +approximated control and estimation policies for fully autonomous spacecraft +operations, we first construct an ellipsoid around its terminal position, where +the ISO would be located with a finite probability. The large state uncertainty +of the ISO is formally handled here through the hierarchical property in +stochastically contracting nonlinear systems. We then propose a method to find +the terminal positions of the multiple spacecraft optimally distributed around +the ellipsoid, which locally maximizes the information we can get from all the +points of interest (POIs). This utilizes a probabilistic information cost +function that accounts for spacecraft positions, camera specifications, and ISO +position uncertainty, where the information is defined as visual data collected +by cameras. Numerical simulations demonstrate the efficacy of this approach +using synthetic ISO candidates generated from quasi-realistic empirical +populations. Our method allows each spacecraft to optimally select its terminal +state and determine the ideal number of POIs to investigate, potentially +enhancing the ability to study these rare and fleeting interstellar visitors +while minimizing resource utilization. + +
+
+ comment: IEEE Aerospace Conference, Preprint Version, Accepted: November 2024 +
+
+
+
+
+ + ☆ BlueME: Robust Underwater Robot-to-Robot Communication Using Compact + Magnetoelectric Antennas + + +
+ We present the design, development, and experimental validation of BlueME, a +compact magnetoelectric (ME) antenna array system for underwater robot-to-robot +communication. BlueME employs ME antennas operating at their natural mechanical +resonance frequency to efficiently transmit and receive very-low-frequency +(VLF) electromagnetic signals underwater. To evaluate its performance, we +deployed BlueME on an autonomous surface vehicle (ASV) and a remotely operated +vehicle (ROV) in open-water field trials. Our tests demonstrate that BlueME +maintains reliable signal transmission at distances beyond 200 meters while +consuming only 1 watt of power. Field trials show that the system operates +effectively in challenging underwater conditions such as turbidity, obstacles, +and multipath interference -- that generally affect acoustics and optics. Our +analysis also examines the impact of complete submersion on system performance +and identifies key deployment considerations. This work represents the first +practical underwater deployment of ME antennas outside the laboratory and +implements the largest VLF ME array system to date. BlueME demonstrates +significant potential for marine robotics and automation in multi-robot +cooperative systems and remote sensor networks. + +
+
+
+
+
+ + ☆ Robustness Assessment of Static Structures for Efficient Object Handling + + +
+ This work establishes a solution to the problem of assessing the robustness +of multi-object assemblies to external forces. Our physically-grounded approach +handles arbitrary static structures made from rigid objects of any shape and +mass distribution without relying on heuristics or approximations. The result +is a method that provides a foundation for autonomous robot decision-making +when interacting with objects in frictional contact. Our strategy decouples +slipping from toppling, enabling independent assessments of these two +phenomena, with a shared robustness representation being key to combining the +results into an accurate robustness assessment. Our algorithms can be used by +motion planners to produce efficient assembly transportation plans, and by +object placement planners to select poses that improve the strength of an +assembly. Compared to prior work, our approach is more generally applicable +than commonly used heuristics and more efficient than dynamics simulations. + +
+
+ comment: Submitted to IEEE Transactions on Robotics. Contains 16 pages, 13 + figures, and 3 tables +
+
+
+
+
+ + ♻ ☆ Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal + Pushing + + +
+ Recently, quadrupedal robots have achieved significant success in locomotion, but their +manipulation capabilities, particularly in handling large objects, remain +limited, restricting their usefulness in demanding real-world applications such +as search and rescue, construction, industrial automation, and room +organization. This paper tackles the task of obstacle-aware, long-horizon +pushing by multiple quadrupedal robots. We propose a hierarchical multi-agent +reinforcement learning framework with three levels of control. The high-level +controller integrates an RRT planner and a centralized adaptive policy to +generate subgoals, while the mid-level controller uses a decentralized +goal-conditioned policy to guide the robots toward these subgoals. A +pre-trained low-level locomotion policy executes the movement commands. We +evaluate our method against several baselines in simulation, demonstrating +significant improvements over baseline approaches, with 36.0% higher success +rates and 24.5% reduction in completion time compared with the best baseline. Our +framework successfully enables long-horizon, obstacle-aware manipulation tasks +like Push-Cuboid and Push-T on Go1 robots in the real world. + +
+
+
+
+
+ + ♻ ☆ From Imitation to Refinement -- Residual RL for Precise Assembly + + +
+ Advances in behavior cloning (BC), like action-chunking and diffusion, have +enabled impressive capabilities. Still, imitation alone remains insufficient +for learning reliable policies for tasks requiring precise aligning and +inserting of objects, like assembly. Our key insight is that chunked BC +policies effectively function as trajectory planners, enabling long-horizon +tasks. Conversely, as they execute action chunks open-loop, they lack the +fine-grained reactivity necessary for reliable execution. Further, we find that +the performance of BC policies saturates despite increasing data. Reinforcement +learning (RL) is a natural way to overcome BC's limitations, but it is not +straightforward to apply directly to action-chunked models like diffusion +policies. We present a simple yet effective method, ResiP (Residual for Precise +Manipulation), that sidesteps these challenges by augmenting a frozen, chunked +BC model with a fully closed-loop residual policy trained with RL. The residual +policy is trained via on-policy RL, addressing distribution shifts and +introducing reactive control without altering the BC trajectory planner. +Evaluation on high-precision manipulation tasks demonstrates strong performance +of ResiP over BC methods and direct RL fine-tuning. Videos, code, and data are +available at https://residual-assembly.github.io. + +
+
+ comment: Project website: https://residual-assembly.github.io +
+
+
+
+
+ + ♻ ☆ Region-aware Grasp Framework with Normalized Grasp Space for Efficient + 6-DoF Grasping CoRL2024 + + +
+ A series of region-based methods succeed in extracting regional features and +enhancing grasp detection quality. However, faced with a cluttered scene with +potential collision, the definition of the grasp-relevant region stays +inconsistent, and the relationship between grasps and regional spaces remains +incompletely investigated. In this paper, we propose Normalized Grasp Space +(NGS) from a novel region-aware viewpoint, unifying the grasp representation +within a normalized regional space and benefiting the generalizability of +methods. Leveraging the NGS, we find that CNNs are underestimated for 3D +feature extraction and 6-DoF grasp detection in clutter scenes and build a +highly efficient Region-aware Normalized Grasp Network (RNGNet). Experiments on +the public benchmark show that our method achieves significant >20% performance +gains while attaining a real-time inference speed of approximately 50 FPS. +Real-world cluttered scene clearance experiments underscore the effectiveness +of our method. Further, human-to-robot handover and dynamic object grasping +experiments demonstrate the potential of our proposed method for closed-loop +grasping in dynamic scenarios. + +
+
+ comment: Accepted by CoRL2024, final camera-ready version will be updated soon +
+
+
+
+
+ + ♻ ☆ Efficient End-to-End 6-Dof Grasp Detection Framework for Edge Devices + with Hierarchical Heatmaps and Feature Propagation + + +
+ 6-DoF grasp detection is critically important for the advancement of +intelligent embodied systems, as it provides feasible robot poses for object +grasping. Various methods have been proposed to detect 6-DoF grasps through the +extraction of 3D geometric features from RGBD or point cloud data. However, +most of these approaches encounter challenges during real robot deployment due +to their significant computational demands, which can be particularly +problematic for mobile robot platforms, especially those reliant on edge +computing devices. This paper presents an Efficient End-to-End Grasp Detection +Network (E3GNet) for 6-DoF grasp detection utilizing hierarchical heatmap +representations. E3GNet effectively identifies high-quality and diverse grasps +in cluttered real-world environments. Benefiting from our end-to-end +methodology and efficient network design, our approach surpasses previous +methods in model inference efficiency and achieves real-time 6-DoF grasp +detection on edge devices. Furthermore, real-world experiments validate the +effectiveness of our method, achieving a satisfactory 94% object grasping +success rate. + +
+
+
+
+
+ + ♻ ☆ Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing + Contact-Rich Plans? ICRA2025 + + +
+ Designing planners and controllers for contact-rich manipulation is extremely +challenging as contact violates the smoothness conditions that many +gradient-based controller synthesis tools assume. Contact smoothing +approximates a non-smooth system with a smooth one, allowing one to use these +synthesis tools more effectively. However, applying classical control synthesis +methods to smoothed contact dynamics remains relatively under-explored. This +paper analyzes the efficacy of linear controller synthesis using differentiable +simulators based on contact smoothing. We introduce natural baselines for +leveraging contact smoothing to compute (a) open-loop plans robust to uncertain +conditions and/or dynamics, and (b) feedback gains to stabilize around +open-loop plans. Using robotic bimanual whole-body manipulation as a testbed, +we perform extensive empirical experiments on over 300 trajectories and analyze +why LQR seems insufficient for stabilizing contact-rich plans. The video +summarizing this paper and hardware experiments is found here: +https://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9. + +
+
+ comment: Under review for ICRA2025 +
+
+
+
+
+ + ♻ ☆ ShanghaiTech Mapping Robot is All You Need: Robot System for Collecting + Universal Ground Vehicle Datasets + + +
+ This paper presents the ShanghaiTech Mapping Robot, a state-of-the-art +unmanned ground vehicle (UGV) designed for collecting comprehensive +multi-sensor datasets to support research in robotics, Simultaneous +Localization and Mapping (SLAM), computer vision, and autonomous driving. The +robot is equipped with a wide array of sensors including RGB cameras, RGB-D +cameras, event-based cameras, IR cameras, LiDARs, mmWave radars, IMUs, +ultrasonic range finders, and a GNSS RTK receiver. The sensor suite is +integrated onto a specially designed mechanical structure with a centralized +power system and a synchronization mechanism to ensure spatial and temporal +alignment of the sensor data. A 16-node on-board computing cluster handles +sensor control, data collection, and storage. We describe the hardware and +software architecture of the robot in detail and discuss the calibration +procedures for the various sensors and investigate the interference for LiDAR +and RGB-D sensors. The capabilities of the platform are demonstrated through an +extensive outdoor dataset collected in a diverse campus environment. +Experiments with two LiDAR-based and two RGB-based SLAM approaches showcase the +potential of the dataset to support development and benchmarking for robotics. +To facilitate research, we make the dataset publicly available along with the +associated robot sensor calibration data: +https://slam-hive.net/wiki/ShanghaiTech_Datasets + +
+
+ comment: 19 pages, 27 figures. Submitted to IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence + Modeling + + +
+ In the endeavor to make autonomous robots take actions, task planning is a +major challenge that requires translating high-level task descriptions into +long-horizon action sequences. Despite recent advances in language model +agents, they remain prone to planning errors and limited in their ability to +plan ahead. To address these limitations in robotic planning, we advocate a +self-refining scheme that iteratively refines a draft plan until an equilibrium +is reached. Remarkably, this process can be optimized end-to-end from an +analytical perspective without the need to curate additional verifiers or +reward models, allowing us to train self-refining planners in a simple +supervised learning fashion. Meanwhile, a nested equilibrium sequence modeling +procedure is devised for efficient closed-loop planning that incorporates +useful feedback from the environment (or an internal world model). Our method +is evaluated on the VirtualHome-Env benchmark, showing advanced performance +with better scaling for inference computation. Code is available at +https://github.com/Singularity0104/equilibrium-planner. + +
+
+
+
+
+ + ♻ ☆ Affordance-based Robot Manipulation with Flow Matching + + +
+ We present a framework for assistive robot manipulation, which focuses on two +fundamental challenges: first, efficiently adapting large-scale models to +downstream scene affordance understanding tasks, especially in daily living +scenarios where gathering multi-task data involving humans requires strenuous +effort; second, effectively learning robot trajectories by grounding the visual +affordance model. We tackle the first challenge by employing a +parameter-efficient prompt tuning method that prepends learnable text prompts +to the frozen vision model to predict manipulation affordances in multi-task +scenarios. Then we propose to learn robot trajectories guided by affordances in +a supervised Flow Matching method. Flow matching represents a robot visuomotor +policy as a conditional process of flowing random waypoints to desired robot +trajectories. Finally, we introduce a real-world dataset with 10 tasks across +Activities of Daily Living to test our framework. Our extensive evaluation +highlights that the proposed prompt tuning method for learning manipulation +affordance with language prompter achieves competitive performance and even +outperforms other finetuning protocols across data scales, while satisfying +parameter efficiency. Learning multi-task robot trajectories with flow matching +policy also leads to consistently better generalization performance and faster +inference than alternative behavior cloning methods, especially given +multimodal robot action distributions. Our framework seamlessly unifies +affordance model learning and trajectory generation with flow matching for +robot manipulation. + +
+
+
+
+
+ + ♻ ☆ 3D Branch Point Cloud Completion for Robotic Pruning in Apple Orchards IROS 2024 + + +
+ Robotic branch pruning is a significantly growing research area to cope with +the shortage of labor force in the context of agriculture. One fundamental +requirement in robotic pruning is the perception of detailed geometry and +topology of branches. However, the point clouds obtained in agricultural +settings often exhibit incompleteness due to several constraints, thereby +restricting the accuracy of downstream robotic pruning. In this work, we +addressed the issue of point cloud quality through a simulation-based deep +neural network, leveraging a Real-to-Simulation (Real2Sim) data generation +pipeline that not only eliminates the need for manual parameterization but also +guarantees the realism of simulated data. The simulation-based neural network +was applied to jointly perform point cloud completion and skeletonization on +real-world partial branches, without additional real-world training. The +Sim2Real qualitative completion and skeletonization results showed the model's +remarkable capability for geometry reconstruction and topology prediction. +Additionally, we quantitatively evaluated the Sim2Real performance by comparing +branch-level trait characterization errors using raw incomplete data and +complete data. The Mean Absolute Error (MAE) reduced by 75% and 8% for branch +diameter and branch angle estimation, respectively, using the best complete +data, which indicates the effectiveness of the Real2Sim data in a zero-shot +generalization setting. The characterization improvements contributed to the +precision and efficacy of robotic branch pruning. + +
+
+ comment: Accepted by IROS 2024 +
+
+
+
+
+ + ♻ ☆ Benchmarking SLAM Algorithms in the Cloud: The SLAM Hive Benchmarking + Suite + + +
+ Evaluating the performance of Simultaneous Localization and Mapping (SLAM) +algorithms is essential for scientists and users of robotic systems alike. But +there are a multitude of different permutations of possible options of hardware +setups and algorithm configurations, as well as different datasets and +algorithms, such that it was previously infeasible to thoroughly compare SLAM +systems against the full state of the art. To solve this, we present the SLAM +Hive Benchmarking Suite, which is able to analyze SLAM algorithms in thousands of +mapping runs, through its utilization of container technology and deployment in +the cloud. This paper presents the architecture and open source implementation +of SLAM Hive and compares it to existing efforts on SLAM evaluation. We perform +mapping runs with popular visual, RGBD and LiDAR based SLAM algorithms against +commonly used datasets and show how SLAM Hive can be used to conveniently +analyze the results against various aspects. Through this we envision that SLAM +Hive can become an essential tool for proper comparisons and evaluations of +SLAM algorithms and thus drive the scientific development in the research on +SLAM. The open source software as well as a demo to show the live analysis of +thousands of mapping runs can be found on our SLAM Hive website. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2303.11854 +
+
+
+
+
+ + ♻ ☆ Comparing the Consistency of User Studies Conducted in Simulations and + Laboratory Settings + + +
+ Human-robot collaboration enables highly adaptive co-working. The variety of +resulting workflows makes it difficult to measure metrics such as makespans or +idle times for multiple systems and tasks in a comparable manner. This issue +can be addressed with virtual commissioning, where arbitrary numbers of +non-deterministic human-robot workflows in assembly tasks can be simulated. To +this end, data-driven models of human decisions are needed. Gathering the +required large corpus of data with on-site user studies is quite +time-consuming. In comparison, simulation-based studies (e.g., by +crowdsourcing) would allow us to access a large pool of study participants with +less effort. To rely on respective study results, human action sequences +observed in a browser-based simulation environment must be shown to match those +gathered in a laboratory setting. To this end, this work aims to understand to +what extent cooperative assembly work in a simulated environment differs from +that in an on-site laboratory setting. We show how a simulation environment can +be aligned with a laboratory setting in which a robot and a human perform +pick-and-place tasks together. A user study (N=29) indicates that participants' +assembly decisions and perception of the situation are consistent across these +different environments. + +
+
+ comment: Accepted for presentation at 2024 IEEE International Conference on + Robotic Computing (IRC) +
+
+
+
+
+ + ♻ ☆ TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for + Robotic Manipulation + + +
+ Vision-Language-Action (VLA) models have shown remarkable potential in +visuomotor control and instruction comprehension through end-to-end learning +processes. However, current VLA models face significant challenges: they are +slow during inference and require extensive pre-training on large amounts of +robotic data, making real-world deployment difficult. In this paper, we +introduce a new family of compact vision-language-action models, called +TinyVLA, which offers two key advantages over existing VLA models: (1) faster +inference speeds, and (2) improved data efficiency, eliminating the need for +a pre-training stage. Our framework incorporates two essential components to +build TinyVLA: (1) initializing the policy backbone with robust, high-speed +multimodal models, and (2) integrating a diffusion policy decoder during +fine-tuning to enable precise robot actions. We conducted extensive evaluations +of TinyVLA in both simulation and on real robots, demonstrating that our +approach significantly outperforms the state-of-the-art VLA model, OpenVLA, in +terms of speed and data efficiency, while delivering comparable or superior +performance. Additionally, TinyVLA exhibits strong generalization capabilities +across various dimensions, including language instructions, novel objects, +unseen positions, changes in object appearance, background variations, and +environmental shifts, often matching or exceeding the performance of OpenVLA. +We believe that TinyVLA offers an interesting perspective on utilizing +pre-trained multimodal models for policy learning. Our project is at +https://tiny-vla.github.io. + +
+
+ comment: add more citations +
+
+
+
+
+ + ♻ ☆ Scaling Diffusion Policy in Transformer to 1 Billion Parameters for + Robotic Manipulation + + +
+ Diffusion Policy is a powerful technique for learning end-to-end +visuomotor robot control. It is expected that Diffusion Policy possesses +scalability, a key attribute for deep neural networks, typically suggesting +that increasing model size would lead to enhanced performance. However, our +observations indicate that Diffusion Policy in transformer architecture (DP-T) +struggles to scale effectively; even minor additions of layers can deteriorate +training outcomes. To address this issue, we introduce Scalable Diffusion +Transformer Policy for visuomotor learning. Our proposed method, namely +ScaleDP, introduces two modules that improve the training dynamic +of Diffusion Policy and allow the network to better handle multimodal action +distribution. First, we identify that DP-T suffers from large gradient issues, +making the optimization of Diffusion Policy unstable. To resolve this issue, we +factorize the feature embedding of observation into multiple affine layers, and +integrate it into the transformer blocks. Additionally, we utilize non-causal +attention which allows the policy network to "see" future actions +during prediction, helping to reduce compounding errors. We demonstrate that +our proposed method successfully scales the Diffusion Policy from 10 million to +1 billion parameters. This new model, named ScaleDP, can effectively scale +up the model size with improved performance and generalization. We benchmark +ScaleDP across 50 different tasks from MetaWorld and find that our largest +ScaleDP outperforms DP-T with an average improvement of 21.6%. Across 7 +real-world robot tasks, our ScaleDP demonstrates an average improvement of +36.25% over DP-T on four single-arm tasks and 75% on three bimanual tasks. We +believe our work paves the way for scaling up models for visuomotor learning. +The project page is available at scaling-diffusion-policy.github.io. + +
+
+
+
+
+ + ♻ ☆ A Unified Probabilistic Approach to Traffic Conflict Detection + + +
+ Traffic conflict detection is essential for proactive road safety by +identifying potential collisions before they occur. Existing methods rely on +surrogate safety measures tailored to specific interactions (e.g., +car-following, side-swiping, or path-crossing) and require varying thresholds +in different traffic conditions. This variation leads to inconsistencies and +limited adaptability of conflict detection in evolving traffic environments. +Consequently, a need persists for consistent detection of traffic conflicts +across interaction contexts. To address this need, this study proposes a +unified probabilistic approach. The proposed approach establishes a unified +framework of traffic conflict detection, where traffic conflicts are formulated +as context-dependent extreme events of road user interactions. The detection of +conflicts is then decomposed into a series of statistical learning tasks: +representing interaction contexts, inferring proximity distributions, and +assessing extreme collision risk. The unified formulation accommodates diverse +hypotheses of traffic conflicts and the learning tasks enable data-driven +analysis of factors such as motion states of road users, environment +conditions, and participant characteristics. Jointly, this approach supports +consistent and comprehensive evaluation of the collision risk emerging in road +user interactions. Our experiments using real-world trajectory data show that +the approach provides effective collision warnings, generalises across distinct +datasets and traffic environments, covers a broad range of conflict types, and +captures a long-tailed distribution of conflict intensity. The findings +highlight its potential to enhance the safety assessment of traffic +infrastructures and policies, improve collision warning systems for autonomous +driving, and deepen the understanding of road user behaviour in safety-critical +interactions. + +
+
+ comment: 21 pages, 10 figures, under revision +
+
+
+
+
+ + ♻ ☆ iKalibr: Unified Targetless Spatiotemporal Calibration for Resilient + Integrated Inertial Systems + + +
+ The integrated inertial system, typically integrating an IMU and an +exteroceptive sensor such as radar, LiDAR, and camera, has been widely accepted +and applied in modern robotic applications for ego-motion estimation, motion +control, or autonomous exploration. To improve system accuracy, robustness, and +further usability, both multiple and various sensors are generally resiliently +integrated, which benefits the system performance regarding failure tolerance, +perception capability, and environment compatibility. For such systems, +accurate and consistent spatiotemporal calibration is required to maintain a +unique spatiotemporal framework for multi-sensor fusion. Considering most +existing calibration methods (i) are generally oriented to specific integrated +inertial systems, (ii) often only focus on spatial determination, (iii) usually +require artificial targets, lacking convenience and usability, we propose +iKalibr: a unified targetless spatiotemporal calibration framework for +resilient integrated inertial systems, which overcomes the above issues, and +enables both accurate and consistent calibration. Altogether four commonly +employed sensors are supported in iKalibr currently, namely IMU, radar, LiDAR, +and camera. The proposed method starts with a rigorous and efficient dynamic +initialization, where all parameters in the estimator would be accurately +recovered. Subsequently, several continuous-time batch optimizations are +conducted to refine the initialized parameters toward better states. Sufficient +real-world experiments were conducted to verify the feasibility and evaluate +the calibration performance of iKalibr. The results demonstrate that iKalibr +can achieve accurate resilient spatiotemporal calibration. We open-source our +implementations at (https://github.com/Unsigned-Long/iKalibr) to benefit the +research community. + +
+
+
+
+
+ + ♻ ☆ ALLO: A Photorealistic Dataset and Data Generation Pipeline for Anomaly + Detection During Robotic Proximity Operations in Lunar Orbit ICRA'25 + + +
+ NASA's forthcoming Lunar Gateway space station, which will be uncrewed most +of the time, will need to operate with an unprecedented level of autonomy. +Enhancing autonomy on the Gateway presents several unique challenges, one of +which is to equip the Canadarm3, the Gateway's external robotic system, with +the capability to perform worksite monitoring. Monitoring will involve using +the arm's inspection cameras to detect any anomalies within the operating +environment, a task complicated by the widely-varying lighting conditions in +space. In this paper, we introduce the visual anomaly detection and +localization task for space applications and establish a benchmark with our +novel synthetic dataset called ALLO (for Anomaly Localization in Lunar Orbit). +We develop a complete data generation pipeline to create ALLO, which we use to +evaluate the performance of state-of-the-art visual anomaly detection +algorithms. Given the low tolerance for risk during space operations and the +lack of relevant data, we emphasize the need for novel, robust, and accurate +anomaly detection methods to handle the challenging visual conditions found in +lunar orbit and beyond. + +
+
+ comment: Submitted to International Conference on Robotics and Automation + (ICRA'25), Atlanta, USA, May 19-23, 2025 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 124 + +
+
+
+ + ☆ MagicQuill: An Intelligent Interactive Image Editing System + + +
+ Image editing involves a variety of complex tasks and requires efficient and +precise manipulation techniques. In this paper, we present MagicQuill, an +integrated image editing system that enables swift actualization of creative +ideas. Our system features a streamlined yet functionally robust interface, +allowing for the articulation of editing operations (e.g., inserting elements, +erasing objects, altering color) with minimal input. These interactions are +monitored by a multimodal large language model (MLLM) to anticipate editing +intentions in real time, bypassing the need for explicit prompt entry. Finally, +we apply a powerful diffusion prior, enhanced by a carefully learned two-branch +plug-in module, to process editing requests with precise control. Experimental +results demonstrate the effectiveness of MagicQuill in achieving high-quality +image edits. Please visit https://magic-quill.github.io to try out our system. + +
+
+ comment: Code and demo available at https://magic-quill.github.io +
+
+
+
+
+ + ☆ On the Surprising Effectiveness of Attention Transfer for Vision + Transformers NeurIPS 2024 + + +
+ Conventional wisdom suggests that pre-training Vision Transformers (ViT) +improves downstream performance by learning useful representations. Is this +actually true? We investigate this question and find that the features and +representations learned during pre-training are not essential. Surprisingly, +using only the attention patterns from pre-training (i.e., guiding how +information flows between tokens) is sufficient for models to learn high +quality features from scratch and achieve comparable downstream performance. We +show this by introducing a simple method called attention transfer, where only +the attention patterns from a pre-trained teacher ViT are transferred to a +student, either by copying or distilling the attention maps. Since attention +transfer lets the student learn its own features, ensembling it with a +fine-tuned teacher also further improves accuracy on ImageNet. We +systematically study various aspects of our findings on the sufficiency of +attention maps, including distribution shift settings where they underperform +fine-tuning. We hope our exploration provides a better understanding of what +pre-training accomplishes and leads to a useful alternative to the standard +practice of fine-tuning. + +
+
+ comment: NeurIPS 2024. Code: + https://github.com/alexlioralexli/attention-transfer +
+
+
+
+
+ + ☆ CropCraft: Inverse Procedural Modeling for 3D Reconstruction of Crop + Plants + + +
+ The ability to automatically build 3D digital twins of plants from images has +countless applications in agriculture, environmental science, robotics, and +other fields. However, current 3D reconstruction methods fail to recover +complete shapes of plants due to heavy occlusion and complex geometries. In +this work, we present a novel method for 3D reconstruction of agricultural +crops based on optimizing a parametric model of plant morphology via inverse +procedural modeling. Our method first estimates depth maps by fitting a neural +radiance field and then employs Bayesian optimization to estimate plant +morphological parameters that result in consistent depth renderings. The +resulting 3D model is complete and biologically plausible. We validate our +method on a dataset of real images of agricultural fields, and demonstrate that +the reconstructions can be used for a variety of monitoring and simulation +applications. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Advancing Fine-Grained Visual Understanding with Multi-Scale Alignment + in Multi-Modal Models + + +
+ Multi-modal large language models (MLLMs) have achieved remarkable success in +fine-grained visual understanding across a range of tasks. However, they often +encounter significant challenges due to inadequate alignment for fine-grained +knowledge, which restricts their ability to accurately capture local details +and attain a comprehensive global perception. While recent advancements have +focused on aligning object expressions with grounding information, they +typically lack explicit integration of object images, which contain affluent +information beyond mere texts or coordinates. To bridge this gap, we introduce +a novel fine-grained visual knowledge alignment method that effectively aligns +and integrates multi-scale knowledge of objects, including texts, coordinates, +and images. This innovative method is underpinned by our multi-scale +fine-grained enhancement data synthesis pipeline, which provides over 300K +essential training data to enhance alignment and improve overall performance. +Furthermore, we present TinyGroundingGPT, a series of compact models optimized +for high-level alignments. With a scale of approximately 3B parameters, +TinyGroundingGPT achieves outstanding results in grounding tasks while +delivering performance comparable to larger MLLMs in complex visual scenarios. + +
+
+
+
+
+ + ☆ One-Shot Manipulation Strategy Learning by Making Contact Analogies CoRL + + +
+ We present a novel approach, MAGIC (manipulation analogies for generalizable +intelligent contacts), for one-shot learning of manipulation strategies with +fast and extensive generalization to novel objects. By leveraging a reference +action trajectory, MAGIC effectively identifies similar contact points and +sequences of actions on novel objects to replicate a demonstrated strategy, +such as using different hooks to retrieve distant objects of different shapes +and sizes. Our method is based on a two-stage contact-point matching process +that combines global shape matching using pretrained neural features with local +curvature analysis to ensure precise and physically plausible contact points. +We experiment with three tasks including scooping, hanging, and hooking +objects. MAGIC demonstrates superior performance over existing methods, +achieving significant improvements in runtime speed and generalization to +different object categories. Website: https://magic-2024.github.io/ . + +
+
+ comment: CoRL LEAP Workshop, 2024 +
+
+
+
+
+ + ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm, showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ☆ Local-Global Attention: An Adaptive Mechanism for Multi-Scale Feature + Integration + + +
+ In recent years, attention mechanisms have significantly enhanced the +performance of object detection by focusing on key feature information. +However, prevalent methods still encounter difficulties in effectively +balancing local and global features. This imbalance hampers their ability to +capture both fine-grained details and broader contextual information -- two +critical elements for achieving accurate object detection. To address these +challenges, we propose a novel attention mechanism, termed Local-Global +Attention, which is designed to better integrate both local and global +contextual features. Specifically, our approach combines multi-scale +convolutions with positional encoding, enabling the model to focus on local +details while concurrently considering the broader global context. +Additionally, we introduce learnable parameters, which allow the model to +dynamically adjust the relative importance of local and global attention, +depending on the specific requirements of the task, thereby optimizing feature +representations across multiple scales. We have thoroughly evaluated the +Local-Global Attention mechanism on several widely used object detection and +classification datasets. Our experimental results demonstrate that this +approach significantly enhances the detection of objects at various scales, +with particularly strong performance on multi-class and small object detection +tasks. In comparison to existing attention mechanisms, Local-Global Attention +consistently outperforms them across several key metrics, all while maintaining +computational efficiency. + +
+
+
+
+
+ + ☆ Assessing the Performance of the DINOv2 Self-supervised Learning Vision + Transformer Model for the Segmentation of the Left Atrium from MRI Images + + +
+ Accurate left atrium (LA) segmentation from pre-operative scans is crucial +for diagnosing atrial fibrillation, treatment planning, and supporting surgical +interventions. While deep learning models are key in medical image +segmentation, they often require extensive manually annotated data. Foundation +models trained on larger datasets have reduced this dependency, enhancing +generalizability and robustness through transfer learning. We explore DINOv2, a +self-supervised learning vision transformer trained on natural images, for LA +segmentation using MRI. The LA's complex anatomy, thin +boundaries, and limited annotated data make accurate segmentation difficult +before & during the image-guided intervention. We demonstrate DINOv2's ability +to provide accurate & consistent segmentation, achieving a mean Dice score of +.871 & a Jaccard Index of .792 for end-to-end fine-tuning. Through few-shot +learning across various data sizes & patient counts, DINOv2 consistently +outperforms baseline models. These results suggest that DINOv2 effectively +adapts to MRI with limited data, highlighting its potential as a competitive +tool for segmentation & encouraging broader use in medical imaging. + +
+
+ comment: 6 pages, 3 figures, SPIE Medical Imaging, 2025 +
+
+
+
+
+ + ☆ LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models + + +
+ This work explores expanding the capabilities of large language models (LLMs) +pretrained on text to generate 3D meshes within a unified model. This offers +key advantages of (1) leveraging spatial knowledge already embedded in LLMs, +derived from textual sources like 3D tutorials, and (2) enabling conversational +3D generation and mesh understanding. A primary challenge is effectively +tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. +To address this, we introduce LLaMA-Mesh, a novel approach that represents the +vertex coordinates and face definitions of 3D meshes as plain text, allowing +direct integration with LLMs without expanding the vocabulary. We construct a +supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate +3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs +as required, and (3) understand and interpret 3D meshes. Our work is the first +to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge +for 3D mesh generation in a text-based format, effectively unifying the 3D and +text modalities. LLaMA-Mesh achieves mesh generation quality on par with models +trained from scratch while maintaining strong text generation performance. + +
+
+ comment: See the project website at + https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/ +
+
+
+
+
+ + ☆ SMILE-UHURA Challenge -- Small Vessel Segmentation at Mesoscopic Scale + from Ultra-High Resolution 7T Magnetic Resonance Angiograms + + +
+ The human brain receives nutrients and oxygen through an intricate network of +blood vessels. Pathology affecting small vessels, at the mesoscopic scale, +represents a critical vulnerability within the cerebral blood supply and can +lead to severe conditions, such as Cerebral Small Vessel Diseases. The advent +of 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution +images, making it possible to visualise such vessels in the brain. However, the +lack of publicly available annotated datasets has impeded the development of +robust, machine learning-driven segmentation algorithms. To address this, the +SMILE-UHURA challenge was organised. This challenge, held in conjunction with +the ISBI 2023, in Cartagena de Indias, Colombia, aimed to provide a platform +for researchers working on related topics. The SMILE-UHURA challenge addresses +the gap in publicly available annotated datasets by providing an annotated +dataset of Time-of-Flight angiography acquired with 7T MRI. This dataset was +created through a combination of automated pre-segmentation and extensive +manual refinement. In this manuscript, sixteen submitted methods and two +baseline methods are compared both quantitatively and qualitatively on two +different datasets: held-out test MRAs from the same dataset as the training +data (with labels kept secret) and a separate 7T ToF MRA dataset where both +input volumes and labels are kept secret. The results demonstrate that most of +the submitted deep learning methods, trained on the provided training dataset, +achieved reliable segmentation performance. Dice scores reached up to 0.838 +$\pm$ 0.066 and 0.716 $\pm$ 0.125 on the respective datasets, with an average +performance of up to 0.804 $\pm$ 0.15. + +
+
+
+
+
+ + ☆ Spider: Any-to-Many Multimodal LLM + + +
+ Multimodal LLMs (MLLMs) have emerged as an extension of Large Language Models +(LLMs), enabling the integration of various modalities. However, Any-to-Any +MLLMs are limited to generating pairwise modalities 'Text + X' within a single +response, such as Text + {Image or Audio or Video}. To address this limitation, +we introduce Spider, a novel efficient Any-to-Many Modalities Generation (AMMG) +framework, which can generate an arbitrary combination of modalities 'Text + +Xs', such as Text + {Image and Audio and Video}. To achieve efficient AMMG, our +Spider integrates three core components: a Base Model for basic X-to-X (i.e., +Any-to-Any) modality processing, a novel Efficient Decoders-Controller for +controlling multimodal Decoders to generate Xs (many-modal) contents, and an +Any-to-Many Instruction Template designed for producing Xs signal prompts. To +train Spider, we constructed a novel Text-formatted Many-Modal (TMM) dataset, +which facilitates the learning of the X-to-Xs (i.e., Any-to-Many) capability +necessary for AMMG. Ultimately, the well-trained Spider generates a pseudo +X-to-Xs dataset, the first-ever X-to-Xs many-modal dataset, enhancing the +potential for AMMG task in future research. Overall, this work not only pushes +the boundary of multimodal interaction but also provides rich data support for +advancing the field. + +
+
+
+
+
+ + ☆ Dynamic Reconstruction of Hand-Object Interaction with Distributed + Force-aware Contact Representation + + +
+ We present ViTaM-D, a novel visual-tactile framework for dynamic hand-object +interaction reconstruction, integrating distributed tactile sensing for more +accurate contact modeling. While existing methods focus primarily on visual +inputs, they struggle with capturing detailed contact interactions such as +object deformation. Our approach leverages distributed tactile sensors to +address this limitation by introducing DF-Field. This distributed force-aware +contact representation models both kinetic and potential energy in hand-object +interaction. ViTaM-D first reconstructs hand-object interactions using a +visual-only network, VDT-Net, and then refines contact details through a +force-aware optimization (FO) process, enhancing object deformation modeling. +To benchmark our approach, we introduce the HOT dataset, which features 600 +sequences of hand-object interactions, including deformable objects, built in a +high-precision simulation environment. Extensive experiments on both the DexYCB +and HOT datasets demonstrate significant improvements in accuracy over previous +state-of-the-art methods such as gSDF and HOTrack. Our results highlight the +superior performance of ViTaM-D in both rigid and deformable object +reconstruction, as well as the effectiveness of DF-Field in refining hand +poses. This work offers a comprehensive solution to dynamic hand-object +interaction reconstruction by seamlessly integrating visual and tactile data. +Codes, models, and datasets will be available. + +
+
+
+
+
+ + ☆ VPBSD:Vessel-Pattern-Based Semi-Supervised Distillation for Efficient 3D + Microscopic Cerebrovascular Segmentation + + +
+ 3D microscopic cerebrovascular images are characterized by their high +resolution, presenting significant annotation challenges, large data volumes, +and intricate variations in detail. Together, these factors make achieving +high-quality, efficient whole-brain segmentation particularly demanding. In +this paper, we propose a novel Vessel-Pattern-Based Semi-Supervised +Distillation pipeline (VpbSD) to address the challenges of 3D microscopic +cerebrovascular segmentation. This pipeline initially constructs a +vessel-pattern codebook that captures diverse vascular structures from +unlabeled data during the teacher model's pretraining phase. In the knowledge +distillation stage, the codebook facilitates the transfer of rich knowledge +from a heterogeneous teacher model to a student model, while the +semi-supervised approach further enhances the student model's exposure to +diverse learning samples. Experimental results on real-world data, including +comparisons with state-of-the-art methods and ablation studies, demonstrate +that our pipeline and its individual components effectively address the +challenges inherent in microscopic cerebrovascular segmentation. + +
+
+
+
+
+ + ☆ Adaptive Deviation Learning for Visual Anomaly Detection with Data + Contamination WACV 2025 + + +
+ Visual anomaly detection aims to detect images that notably differ from +the normal pattern, and it has found extensive application in identifying defective +parts within the manufacturing industry. These anomaly detection paradigms +predominantly focus on training detection models using only clean, unlabeled +normal samples, assuming an absence of contamination; a condition often unmet +in real-world scenarios. The performance of these methods significantly depends +on the quality of the data and usually decreases when exposed to noise. We +introduce a systematic adaptive method that employs deviation learning to +compute anomaly scores end-to-end while addressing data contamination by +assigning relative importance to the weights of individual instances. In this +approach, the anomaly scores for normal instances are designed to approximate +scalar scores obtained from the known prior distribution. Meanwhile, anomaly +scores for anomaly examples are adjusted to exhibit statistically significant +deviations from these reference scores. Our approach incorporates a constrained +optimization problem within the deviation learning framework to update instance +weights, resolving this problem for each mini-batch. Comprehensive experiments +on the MVTec and VisA benchmark datasets indicate that our proposed method +surpasses competing techniques and exhibits both stability and robustness in +the presence of data contamination. + +
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV 2025) +
+
+
+
+
+ + ☆ Image Processing for Motion Magnification + + +
+ Motion Magnification (MM) is a collection of relatively recent techniques +within the realm of Image Processing. The main motivation of introducing these +techniques is to support the human visual system to capture relevant +displacements of an object of interest; these motions can be in object color +and in object location. In fact, the goal is to opportunely process a video +sequence to obtain as output a new video in which motions are magnified and +visible to the viewer. We propose a numerical technique using the Phase-Based +Motion Magnification which analyses the video sequence in the Fourier Domain +and relies on the Fourier Shifting Property. We describe the mathematical +foundation of this method and the corresponding implementation in a numerical +algorithm. We present preliminary experiments, focusing on some basic tests made +up using synthetic images. + +
+
+
+
+
+ + ☆ OOD-SEG: Out-Of-Distribution detection for image SEGmentation with + sparse multi-class positive-only annotations + + +
+ Despite significant advancements, segmentation based on deep neural networks +in medical and surgical imaging faces several challenges, two of which we aim +to address in this work. First, acquiring complete pixel-level segmentation +labels for medical images is time-consuming and requires domain expertise. +Second, typical segmentation pipelines cannot detect out-of-distribution (OOD) +pixels, leaving them prone to spurious outputs during deployment. In this work, +we propose a novel segmentation approach exploiting OOD detection that learns +only from sparsely annotated pixels from multiple positive-only classes, with +no background class annotation. These multi-class positive annotations +naturally fall within the in-distribution (ID) set. Unlabelled pixels may +contain positive classes but also negative ones, including what is typically +referred to as "background" in standard segmentation formulations. Here, +we forgo the need for background annotation and consider these together with +any other unseen classes as part of the OOD set. Our framework can integrate, +at a pixel-level, any OOD detection approaches designed for classification +tasks. To address the lack of existing OOD datasets and established evaluation +metric for medical image segmentation, we propose a cross-validation strategy +that treats held-out labelled classes as OOD. Extensive experiments on both +multi-class hyperspectral and RGB surgical imaging datasets demonstrate the +robustness and generalisation capability of our proposed framework. + +
+
+
+
+
+ + ☆ MFTIQ: Multi-Flow Tracker with Independent Matching Quality Estimation WACV 2025 + + +
+ In this work, we present MFTIQ, a novel dense long-term tracking model that +advances the Multi-Flow Tracker (MFT) framework to address challenges in +point-level visual tracking in video sequences. MFTIQ builds upon the +flow-chaining concepts of MFT, integrating an Independent Quality (IQ) module +that separates correspondence quality estimation from optical flow +computations. This decoupling significantly enhances the accuracy and +flexibility of the tracking process, allowing MFTIQ to maintain reliable +trajectory predictions even in scenarios of prolonged occlusions and complex +dynamics. Designed to be "plug-and-play", MFTIQ can be employed with any +off-the-shelf optical flow method without the need for fine-tuning or +architectural modifications. Experimental validations on the TAP-Vid Davis +dataset show that MFTIQ with RoMa optical flow not only surpasses MFT but also +performs comparably to state-of-the-art trackers while having substantially +faster processing speed. Code and models available at +https://github.com/serycjon/MFTIQ . + +
+
+ comment: accepted to WACV 2025 +
+
+
+
+
+ + ☆ Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models + + +
+ Visual prompting (VP) is a new technique that adapts well-trained frozen +models for source domain tasks to target domain tasks. This study examines VP's +benefits for black-box model-level backdoor detection. The visual prompt in VP +maps class subspaces between source and target domains. We identify a +misalignment, termed class subspace inconsistency, between clean and poisoned +datasets. Based on this, we introduce \textsc{BProm}, a black-box model-level +detection method to identify backdoors in suspicious models, if any. +\textsc{BProm} leverages the low classification accuracy of prompted models +when backdoors are present. Extensive experiments confirm \textsc{BProm}'s +effectiveness. + +
+
+
+
+
+ + ☆ Marker-free Human Gait Analysis using a Smart Edge Sensor System + + +
+ The human gait is a complex interplay between the neuronal and the muscular +systems, reflecting an individual's neurological and physiological condition. +This makes gait analysis a valuable tool for biomechanics and medical experts. +Traditional observational gait analysis is cost-effective but lacks reliability +and accuracy, while instrumented gait analysis, particularly using marker-based +optical systems, provides accurate data but is expensive and time-consuming. In +this paper, we introduce a novel markerless approach for gait analysis using a +multi-camera setup with smart edge sensors to estimate 3D body poses without +fiducial markers. We propose a Siamese embedding network with triplet loss +calculation to identify individuals by their gait pattern. This network +effectively maps gait sequences to an embedding space that enables clustering +sequences from the same individual or activity closely together while +separating those of different ones. Our results demonstrate the potential of +the proposed system for efficient automated gait analysis in diverse real-world +environments, facilitating a wide range of applications. + +
+
+ comment: accepted for SII 2025 +
+
+
+
+
+ + ☆ GAN-Based Architecture for Low-dose Computed Tomography Imaging + Denoising + + +
+ Generative Adversarial Networks (GANs) have surfaced as a revolutionary +element within the domain of low-dose computed tomography (LDCT) imaging, +providing an advanced resolution to the enduring issue of reconciling radiation +exposure with image quality. This comprehensive review synthesizes the rapid +advancements in GAN-based LDCT denoising techniques, examining the evolution +from foundational architectures to state-of-the-art models incorporating +advanced features such as anatomical priors, perceptual loss functions, and +innovative regularization strategies. We critically analyze various GAN +architectures, including conditional GANs (cGANs), CycleGANs, and +Super-Resolution GANs (SRGANs), elucidating their unique strengths and +limitations in the context of LDCT denoising. The evaluation provides both +qualitative and quantitative results related to the improvements in performance +in benchmark and clinical datasets with metrics such as PSNR, SSIM, and LPIPS. +After highlighting the positive results, we discuss some of the challenges +preventing a wider clinical use, including the interpretability of the images +generated by GANs, synthetic artifacts, and the need for clinically relevant +metrics. The review concludes by highlighting the essential significance of +GAN-based methodologies in the progression of precision medicine via tailored +LDCT denoising models, underlining the transformative possibilities presented +by artificial intelligence within contemporary radiological practice. + +
+
+
+
+
+ + ☆ Golden Noise for Diffusion Models: A Learning Framework + + +
+ Text-to-image diffusion model is a popular paradigm that synthesizes +personalized images by providing a text prompt and a random Gaussian noise. +While people observe that some noises are ``golden noises'' that can achieve +better text-image alignment and higher human preference than others, we still +lack a machine learning framework to obtain those golden noises. To learn +golden noises for diffusion sampling, we mainly make three contributions in +this paper. First, we identify a new concept termed the \textit{noise prompt}, +which aims at turning a random Gaussian noise into a golden noise by adding a +small desirable perturbation derived from the text prompt. Following the +concept, we first formulate the \textit{noise prompt learning} framework that +systematically learns ``prompted'' golden noise associated with a text prompt +for diffusion models. Second, we design a noise prompt data collection pipeline +and collect a large-scale \textit{noise prompt dataset}~(NPD) that contains +100k pairs of random noises and golden noises with the associated text prompts. +With the prepared NPD as the training dataset, we trained a small \textit{noise +prompt network}~(NPNet) that can directly learn to transform a random noise +into a golden noise. The learned golden noise perturbation can be considered as +a kind of prompt for noise, as it is rich in semantic information and tailored +to the given text prompt. Third, our extensive experiments demonstrate the +impressive effectiveness and generalization of NPNet on improving the quality +of synthesized images across various diffusion models, including SDXL, +DreamShaper-xl-v2-turbo, and Hunyuan-DiT. Moreover, NPNet is a small and +efficient controller that acts as a plug-and-play module with very limited +additional inference and computational costs, as it just provides a golden +noise instead of a random noise without accessing the original pipeline. + +
+
+
+
+
+ + ☆ Image Matching Filtering and Refinement by Planes and Beyond + + +
+ This paper introduces a modular, non-deep learning method for filtering and +refining sparse correspondences in image matching. Assuming that motion flow +within the scene can be approximated by local homography transformations, +matches are aggregated into overlapping clusters corresponding to virtual +planes using an iterative RANSAC-based approach, with non-conforming +correspondences discarded. Moreover, the underlying planar structural design +provides an explicit map between local patches associated with the matches, +enabling optional refinement of keypoint positions through cross-correlation +template matching after patch reprojection. Finally, to enhance robustness and +fault-tolerance against violations of the piece-wise planar approximation +assumption, a further strategy is designed for minimizing relative patch +distortion in the plane reprojection by introducing an intermediate homography +that projects both patches into a common plane. The proposed method is +extensively evaluated on standard datasets and image matching pipelines, and +compared with state-of-the-art approaches. Unlike other current comparisons, +the proposed benchmark also takes into account the more general, real, and +practical cases where camera intrinsics are unavailable. Experimental results +demonstrate that our proposed non-deep learning, geometry-based approach +achieves performances that are either superior to or on par with recent +state-of-the-art deep learning methods. Finally, this study suggests that there +is still development potential in actual image matching solutions in the +considered research direction, which could be in the future incorporated in +novel deep image matching architectures. + +
+
+ comment: project page: https://github.com/fb82/MiHo +
+
+
+
+
+ + ☆ Renal Cell Carcinoma subtyping: learning from multi-resolution + localization + + +
+ Renal Cell Carcinoma is typically asymptomatic at the early stages for many +patients. This leads to a late diagnosis of the tumor, where the curability +likelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high, +with respect to its incidence rate. To increase the survival chance, a fast and +correct categorization of the tumor subtype is paramount. Nowadays, +computerized methods, based on artificial intelligence, represent an +interesting opportunity to improve the productivity and the objectivity of the +microscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their +exploitation is hampered by the paucity of annotated datasets, essential for a +proficient training of supervised machine learning technologies. This study +sets out to investigate a novel self-supervised training strategy for machine +learning diagnostic tools, based on the multi-resolution nature of the +histological samples. We aim at reducing the need for annotated datasets, without +significantly reducing the accuracy of the tool. We demonstrate the +classification capability of our tool on a whole slide imaging dataset for +Renal Cancer subtyping, and we compare our solution with several +state-of-the-art classification counterparts. + +
+
+
+
+
+ + ☆ SINETRA: a Versatile Framework for Evaluating Single Neuron Tracking in + Behaving Animals + + +
+ Accurately tracking neuronal activity in behaving animals presents +significant challenges due to complex motions and background noise. The lack of +annotated datasets limits the evaluation and improvement of such tracking +algorithms. To address this, we developed SINETRA, a versatile simulator that +generates synthetic tracking data for particles on a deformable background, +closely mimicking live animal recordings. This simulator produces annotated 2D +and 3D videos that reflect the intricate movements seen in behaving animals +like Hydra Vulgaris. We evaluated four state-of-the-art tracking algorithms +highlighting the current limitations of these methods in challenging scenarios +and paving the way for improved cell tracking techniques in dynamic biological +systems. + +
+
+ comment: 5 pages, 3 figures, submitted at 2025 IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ Long-Tailed Object Detection Pre-training: Dynamic Rebalancing + Contrastive Learning with Dual Reconstruction NeurIPS 2024 + + +
+ Pre-training plays a vital role in various vision tasks, such as object +recognition and detection. Commonly used pre-training methods, which typically +rely on randomized approaches like uniform or Gaussian distributions to +initialize model parameters, often fall short when confronted with long-tailed +distributions, especially in detection tasks. This is largely due to extreme +data imbalance and the issue of simplicity bias. In this paper, we introduce a +novel pre-training framework for object detection, called Dynamic Rebalancing +Contrastive Learning with Dual Reconstruction (2DRCL). Our method builds on a +Holistic-Local Contrastive Learning mechanism, which aligns pre-training with +object detection by capturing both global contextual semantics and detailed +local patterns. To tackle the imbalance inherent in long-tailed data, we design +a dynamic rebalancing strategy that adjusts the sampling of underrepresented +instances throughout the pre-training process, ensuring better representation +of tail classes. Moreover, Dual Reconstruction addresses simplicity bias by +enforcing a reconstruction task aligned with the self-consistency principle, +specifically benefiting underrepresented tail classes. Experiments on COCO and +LVIS v1.0 datasets demonstrate the effectiveness of our method, particularly in +improving the mAP/AP scores for tail classes. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Image Regeneration: Evaluating Text-to-Image Model via Generating + Identical Image with Multimodal Large Language Models + + +
+ Diffusion models have revitalized the image generation domain, playing +crucial roles in both academic research and artistic expression. With the +emergence of new diffusion models, assessing the performance of text-to-image +models has become increasingly important. Current metrics focus on directly +matching the input text with the generated image, but due to cross-modal +information asymmetry, this leads to unreliable or incomplete assessment +results. Motivated by this, we introduce the Image Regeneration task in this +study to assess text-to-image models by tasking the T2I model with generating +an image according to the reference image. We use GPT4V to bridge the gap +between the reference image and the text input for the T2I model, allowing T2I +models to understand image content. This evaluation process is simplified as +comparisons between the generated image and the reference image are +straightforward. Two regeneration datasets, covering content-diverse and +style-diverse evaluation, are introduced to evaluate the leading +diffusion models currently available. Additionally, we present ImageRepainter +framework to enhance the quality of generated images by improving content +comprehension via MLLM guided iterative generation and revision. Our +comprehensive experiments have showcased the effectiveness of this framework in +assessing the generative capabilities of models. By leveraging MLLM, we have +demonstrated that a robust T2I can produce images more closely resembling the +reference image. + +
+
+
+
+
+ + ☆ SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph + Attention for Vision Transformers + + +
+ Image classification is a computer vision task where a model analyzes an +image to categorize it into a specific label. Vision Transformers (ViT) improve +this task by leveraging self-attention to capture complex patterns and long +range relationships between image patches. However, a key challenge for ViTs is +efficiently incorporating multiscale feature representations, which is inherent +in CNNs through their hierarchical structure. In this paper, we introduce the +Scale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework +that addresses this challenge by integrating multi-scale features. Using +EfficientNet as a backbone, the model extracts multi-scale feature maps, which +are divided into patches to preserve semantic information. These patches are +organized into a graph based on spatial and feature similarities, with a Graph +Attention Network (GAT) refining the node embeddings. Finally, a Transformer +encoder captures long-range dependencies and complex interactions. The SAG-ViT +is evaluated on benchmark datasets, demonstrating its effectiveness in +enhancing image classification performance. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Script-centric behavior understanding for assisted autism spectrum + disorder diagnosis ICASSP 2025 + + +
+ Observing and analyzing children's social behaviors is crucial for the early +diagnosis of Autism Spectrum Disorders (ASD). This work focuses on +automatically detecting ASD using computer vision techniques and large language +models (LLMs). Existing methods typically rely on supervised learning. However, +the scarcity of ASD diagnostic datasets and the lack of interpretability in +diagnostic results significantly limits its clinical application. To address +these challenges, we introduce a novel unsupervised approach based on +script-centric behavior understanding. Our pipeline converts video content into +scripts that describe the behavior of characters, leveraging the +generalizability of large language models to detect ASD in a zero-shot or +few-shot manner. Specifically, we propose a scripts transcription module for +multimodal behavior data textualization and a domain prompts module to bridge +LLMs. Our method achieves an accuracy of 92.00\% in diagnosing ASD in children +with an average age of 24 months, surpassing the performance of supervised +learning methods by 3.58\% absolutely. Extensive experiments confirm the +effectiveness of our approach and suggest its potential for advancing ASD +research through LLMs. + +
+
+ comment: 5 pages, 4 figures, submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Building Height Estimation Using Shadow Length in Satellite Imagery + + +
+ Estimating building height from satellite imagery poses significant +challenges, especially when monocular images are employed, resulting in a loss +of essential 3D information during imaging. This loss of spatial depth further +complicates the height estimation process. We addressed this issue by using +shadow length as an additional cue to compensate for the loss of building +height estimation using single-view imagery. We proposed a novel method that +first localized a building and its shadow in the given satellite image. After +localization, the shadow length is estimated using a regression model. To +estimate the final height of each building, we utilize the principles of +photogrammetry, specifically considering the relationship between the solar +elevation angle, the vertical edge length of the building, and the length of +the building's shadow. For the localization of buildings in our model, we +utilized a modified YOLOv7 detector, and to regress the shadow length for each +building we utilized the ResNet18 as backbone architecture. Finally, we +estimated the associated building height using solar elevation with shadow +length through analytical formulation. We evaluated our method on 42 different +cities and the results showed that the proposed framework surpasses the +state-of-the-art methods with a suitable margin. + +
+
+ comment: 6 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast + Computed Tomography Images for Enhanced Treatment and Prognosis MICCAI + + +
+ Stroke is the second leading cause of death worldwide, and is increasingly +prevalent in low- and middle-income countries (LMICs). Timely interventions can +significantly influence stroke survivability and the quality of life after +treatment. However, the standard and most widely available imaging method for +confirming strokes and their sub-types, the NCCT, is more challenging and +time-consuming to employ in cases of ischemic stroke. For this reason, we +developed an automated method for ischemic stroke lesion segmentation in NCCTs +using the nnU-Net framework, aimed at enhancing early treatment and improving +the prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and +Intersection over Union (IoU) scores of 0.501 on the sampled dataset. After +adjusting for outliers, these scores improved to 0.752 for the Dice score and +0.643 for the IoU. Proper delineation of the region of infarction can help +clinicians better assess the potential impact of the infarction, and guide +treatment procedures. + +
+
+ comment: 7 pages, 3 figures, MICCAI Meets Africa Workshop +
+
+
+
+
+ + ☆ Instruction-Driven Fusion of Infrared-Visible Images: Tailoring for + Diverse Downstream Tasks + + +
+ The primary value of infrared and visible image fusion technology lies in +applying the fusion results to downstream tasks. However, existing methods face +challenges such as increased training complexity and significantly compromised +performance of individual tasks when addressing multiple downstream tasks +simultaneously. To tackle this, we propose Task-Oriented Adaptive Regulation +(T-OAR), an adaptive mechanism specifically designed for multi-task +environments. Additionally, we introduce the Task-related Dynamic Prompt +Injection (T-DPI) module, which generates task-specific dynamic prompts from +user-input text instructions and integrates them into target representations. +This guides the feature extraction module to produce representations that are +more closely aligned with the specific requirements of downstream tasks. By +incorporating the T-DPI module into the T-OAR framework, our approach generates +fusion images tailored to task-specific requirements without the need for +separate training or task-specific weights. This not only reduces computational +costs but also enhances adaptability and performance across multiple tasks. +Experimental results show that our method excels in object detection, semantic +segmentation, and salient object detection, demonstrating its strong +adaptability, flexibility, and task specificity. This provides an efficient +solution for image fusion in multi-task environments, highlighting the +technology's potential across diverse applications. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Are nuclear masks all you need for improved out-of-domain + generalisation? A closer look at cancer classification in histopathology NeurIPS 2024 + + +
+ Domain generalisation in computational histopathology is challenging because +the images are substantially affected by differences among hospitals due to +factors like fixation and staining of tissue and imaging equipment. We +hypothesise that focusing on nuclei can improve the out-of-domain (OOD) +generalisation in cancer detection. We propose a simple approach to improve OOD +generalisation for cancer detection by focusing on nuclear morphology and +organisation, as these are domain-invariant features critical in cancer +detection. Our approach integrates original images with nuclear segmentation +masks during training, encouraging the model to prioritise nuclei and their +spatial arrangement. Going beyond mere data augmentation, we introduce a +regularisation technique that aligns the representations of masks and original +images. We show, using multiple datasets, that our method improves OOD +generalisation and also leads to increased robustness to image corruptions and +adversarial attacks. The source code is available at +https://github.com/undercutspiky/SFL/ + +
+
+ comment: Poster at NeurIPS 2024 +
+
+
+
+
+ + ☆ DSCformer: A Dual-Branch Network Integrating Enhanced Dynamic Snake + Convolution and SegFormer for Crack Segmentation + + +
+ In construction quality monitoring, accurately detecting and segmenting +cracks in concrete structures is paramount for safety and maintenance. Current +convolutional neural networks (CNNs) have demonstrated strong performance in +crack segmentation tasks, yet they often struggle with complex backgrounds and +fail to capture fine-grained tubular structures fully. In contrast, +Transformers excel at capturing global context but lack precision in detailed +feature extraction. We introduce DSCformer, a novel hybrid model that +integrates an enhanced Dynamic Snake Convolution (DSConv) with a Transformer +architecture for crack segmentation to address these challenges. Our key +contributions include the enhanced DSConv through a pyramid kernel for adaptive +offset computation and a simultaneous bi-directional learnable offset +iteration, significantly improving the model's performance to capture intricate +crack patterns. Additionally, we propose a Weighted Convolutional Attention +Module (WCAM), which refines channel attention, allowing for more precise and +adaptive feature attention. We evaluate DSCformer on the Crack3238 and FIND +datasets, achieving IoUs of 59.22\% and 87.24\%, respectively. The experimental +results suggest that our DSCformer outperforms state-of-the-art methods across +different datasets. + +
+
+
+
+
+ + ☆ Time-to-Event Pretraining for 3D Medical Imaging + + +
+ With the rise of medical foundation models and the growing availability of +imaging data, scalable pretraining techniques offer a promising way to identify +imaging biomarkers predictive of future disease risk. While current +self-supervised methods for 3D medical imaging models capture local structural +features like organ morphology, they fail to link pixel biomarkers with +long-term health outcomes due to a missing context problem. Current approaches +lack the temporal context necessary to identify biomarkers correlated with +disease progression, as they rely on supervision derived only from images and +concurrent text descriptions. To address this, we introduce time-to-event +pretraining, a pretraining framework for 3D medical imaging models that +leverages large-scale temporal supervision from paired, longitudinal electronic +health records (EHRs). Using a dataset of 18,945 CT scans (4.2 million 2D +images) and time-to-event distributions across thousands of EHR-derived tasks, +our method improves outcome prediction, achieving an average AUROC increase of +23.7% and a 29.4% gain in Harrell's C-index across 8 benchmark tasks. +Importantly, these gains are achieved without sacrificing diagnostic +classification performance. This study lays the foundation for integrating +longitudinal EHR and 3D imaging data to advance clinical risk prediction. + +
+
+ comment: 34 pages, 19 figures +
+
+
+
+
+ + ☆ Adaptively Augmented Consistency Learning: A Semi-supervised + Segmentation Framework for Remote Sensing + + +
+ Remote sensing (RS) involves the acquisition of data about objects or areas +from a distance, primarily to monitor environmental changes, manage resources, +and support planning and disaster response. A significant challenge in RS +segmentation is the scarcity of high-quality labeled images due to the +diversity and complexity of RS images, which makes pixel-level annotation +difficult and hinders the development of effective supervised segmentation +algorithms. To solve this problem, we propose Adaptively Augmented Consistency +Learning (AACL), a semi-supervised segmentation framework designed to enhance +RS segmentation accuracy under conditions of limited labeled data. AACL +extracts additional information embedded in unlabeled images through the use of +Uniform Strength Augmentation (USAug) and Adaptive Cut-Mix (AdaCM). Evaluations +across various RS datasets demonstrate that AACL achieves competitive +performance in semi-supervised segmentation, showing up to a 20% improvement in +specific categories and 2% increase in overall performance compared to +state-of-the-art frameworks. + +
+
+
+
+
+ + ☆ Exploring Zero-Shot Anomaly Detection with CLIP in Medical Imaging: Are + We There Yet? + + +
+ Zero-shot anomaly detection (ZSAD) offers potential for identifying anomalies +in medical imaging without task-specific training. In this paper, we evaluate +CLIP-based models, originally developed for industrial tasks, on brain tumor +detection using the BraTS-MET dataset. Our analysis examines their ability to +detect medical-specific anomalies with no or minimal supervision, addressing +the challenges posed by limited data annotation. While these models show +promise in transferring general knowledge to medical tasks, their performance +falls short of the precision required for clinical use. Our findings highlight +the need for further adaptation before CLIP-based models can be reliably +applied to medical anomaly detection. + +
+
+ comment: accepted at 3rd AIxIA Workshop on Artificial Intelligence for + Healthcare and 5th Data4SmartHealth +
+
+
+
+
+ + ☆ DT-JRD: Deep Transformer based Just Recognizable Difference Prediction + Model for Video Coding for Machines + + +
+ Just Recognizable Difference (JRD) represents the minimum visual difference +that is detectable by machine vision, which can be exploited to promote machine +vision oriented visual signal processing. In this paper, we propose a Deep +Transformer based JRD (DT-JRD) prediction model for Video Coding for Machines +(VCM), where the accurately predicted JRD can be used to reduce the coding bit +rate while maintaining the accuracy of machine tasks. Firstly, we model the JRD +prediction as a multi-class classification and propose a DT-JRD prediction +model that integrates an improved embedding, a content and distortion feature +extraction, a multi-class classification and a novel learning strategy. +Secondly, inspired by the perception property that machine vision exhibits a +similar response to distortions near JRD, we propose an asymptotic JRD loss by +using Gaussian Distribution-based Soft Labels (GDSL), which significantly +extends the number of training labels and relaxes classification boundaries. +Finally, we propose a DT-JRD based VCM to reduce the coding bits while +maintaining the accuracy of object detection. Extensive experimental results +demonstrate that the mean absolute error of the predicted JRD by the DT-JRD is +5.574, outperforming the state-of-the-art JRD prediction model by 13.1%. Coding +experiments show that, compared with VVC, the DT-JRD based VCM achieves an +average of 29.58% bit rate reduction while maintaining the object detection +accuracy. + +
+
+ comment: Submitted to IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ LHRS-Bot-Nova: Improved Multimodal Large Language Model for Remote + Sensing Vision-Language Interpretation + + +
+ Automatically and rapidly understanding Earth's surface is fundamental to our +grasp of the living environment and informed decision-making. This underscores +the need for a unified system with comprehensive capabilities in analyzing +Earth's surface to address a wide range of human needs. The emergence of +multimodal large language models (MLLMs) has great potential in boosting the +efficiency and convenience of intelligent Earth observation. These models can +engage in human-like conversations, serve as unified platforms for +understanding images, follow diverse instructions, and provide insightful +feedback. In this study, we introduce LHRS-Bot-Nova, an MLLM specialized in +understanding remote sensing (RS) images, designed to expertly perform a wide +range of RS understanding tasks aligned with human instructions. LHRS-Bot-Nova +features an enhanced vision encoder and a novel bridge layer, enabling +efficient visual compression and better language-vision alignment. To further +enhance RS-oriented vision-language alignment, we propose a large-scale RS +image-caption dataset, generated through feature-guided image recaptioning. +Additionally, we introduce an instruction dataset specifically designed to +improve spatial recognition abilities. Extensive experiments demonstrate +superior performance of LHRS-Bot-Nova across various RS image understanding +tasks. We also evaluate different MLLM performances in complex RS perception +and instruction following using a complicated multi-choice question evaluation +benchmark, providing a reliable guide for future model selection and +improvement. Data, code, and models will be available at +https://github.com/NJU-LHRS/LHRS-Bot. + +
+
+
+
+
+ + ☆ LLV-FSR: Exploiting Large Language-Vision Prior for Face + Super-resolution + + +
+ Existing face super-resolution (FSR) methods have made significant +advancements, but they primarily super-resolve face with limited visual +information, original pixel-wise space in particular, commonly overlooking the +pluralistic clues, like the higher-order depth and semantics, as well as +non-visual inputs (text caption and description). Consequently, these methods +struggle to produce a unified and meaningful representation from the input +face. We suppose that introducing the language-vision pluralistic +representation into unexplored potential embedding space could enhance FSR by +encoding and exploiting the complementarity across language-vision prior. This +motivates us to propose a new framework called LLV-FSR, which marries the power +of large vision-language model and higher-order visual prior with the +challenging task of FSR. Specifically, besides directly absorbing knowledge +from original input, we introduce the pre-trained vision-language model to +generate pluralistic priors, involving the image caption, descriptions, face +semantic mask and depths. These priors are then employed to guide the more +critical feature representation, facilitating realistic and high-quality face +super-resolution. Experimental results demonstrate that our proposed framework +significantly improves both the reconstruction quality and perceptual quality, +surpassing the SOTA by 0.43dB in terms of PSNR on the MMCelebA-HQ dataset. + +
+
+
+
+
+ + ☆ Leveraging Auxiliary Classification for Rib Fracture Segmentation + + +
+ Thoracic trauma often results in rib fractures, which demand swift and +accurate diagnosis for effective treatment. However, detecting these fractures +on rib CT scans poses considerable challenges, involving the analysis of many +image slices in sequence. Despite notable advancements in algorithms for +automated fracture segmentation, the persisting challenges stem from the +diverse shapes and sizes of these fractures. To address these issues, this +study introduces a sophisticated deep-learning model with an auxiliary +classification task designed to enhance the accuracy of rib fracture +segmentation. The auxiliary classification task is crucial in distinguishing +between fractured ribs and negative regions, encompassing non-fractured ribs +and surrounding tissues, from the patches obtained from CT scans. By leveraging +this auxiliary task, the model aims to improve feature representation at the +bottleneck layer by highlighting the regions of interest. Experimental results +on the RibFrac dataset demonstrate significant improvement in segmentation +performance. + +
+
+ comment: Accepted at ICVGIP'24 +
+
+
+
+
+ + ☆ LES-Talker: Fine-Grained Emotion Editing for Talking Head Generation in + Linear Emotion Space + + +
+ While existing one-shot talking head generation models have achieved progress +in coarse-grained emotion editing, there is still a lack of fine-grained +emotion editing models with high interpretability. We argue that for an +approach to be considered fine-grained, it needs to provide clear definitions +and sufficiently detailed differentiation. We present LES-Talker, a novel +one-shot talking head generation model with high interpretability, to achieve +fine-grained emotion editing across emotion types, emotion levels, and facial +units. We propose a Linear Emotion Space (LES) definition based on Facial +Action Units to characterize emotion transformations as vector transformations. +We design the Cross-Dimension Attention Net (CDAN) to deeply mine the +correlation between LES representation and 3D model representation. Through +mining multiple relationships across different feature and structure +dimensions, we enable LES representation to guide the controllable deformation +of 3D model. In order to adapt the multimodal data with deviations to the LES +and enhance visual quality, we utilize specialized network design and training +strategies. Experiments show that our method provides high visual quality along +with multilevel and interpretable fine-grained emotion editing, outperforming +mainstream methods. + +
+
+
+
+
+ + ☆ How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative + Study of ChatGPT, AI Models and Human Perception + + +
+ Multimodal deepfakes involving audiovisual manipulations are a growing threat +because they are difficult to detect with the naked eye or using unimodal deep +learning-based forgery detection methods. Audiovisual forensic models, while +more capable than unimodal models, require large training datasets and are +computationally expensive for training and inference. Furthermore, these models +lack interpretability and often do not generalize well to unseen manipulations. +In this study, we examine the detection capabilities of a large language model +(LLM) (i.e., ChatGPT) to identify and account for any possible visual and +auditory artifacts and manipulations in audiovisual deepfake content. Extensive +experiments are conducted on videos from a benchmark multimodal deepfake +dataset to evaluate the detection performance of ChatGPT and compare it with +the detection capabilities of state-of-the-art multimodal forensic models and +humans. Experimental results demonstrate the importance of domain knowledge and +prompt engineering for video forgery detection tasks using LLMs. Unlike +approaches based on end-to-end learning, ChatGPT can account for spatial and +spatiotemporal artifacts and inconsistencies that may exist within or across +modalities. Additionally, we discuss the limitations of ChatGPT for multimedia +forensic tasks. + +
+
+
+
+
+ + ☆ BEARD: Benchmarking the Adversarial Robustness for Dataset Distillation + + +
+ Dataset Distillation (DD) is an emerging technique that compresses +large-scale datasets into significantly smaller synthesized datasets while +preserving high test performance and enabling the efficient training of large +models. However, current research primarily focuses on enhancing evaluation +accuracy under limited compression ratios, often overlooking critical security +concerns such as adversarial robustness. A key challenge in evaluating this +robustness lies in the complex interactions between distillation methods, model +architectures, and adversarial attack strategies, which complicate standardized +assessments. To address this, we introduce BEARD, an open and unified benchmark +designed to systematically assess the adversarial robustness of DD methods, +including DM, IDM, and BACON. BEARD encompasses a variety of adversarial +attacks (e.g., FGSM, PGD, C&W) on distilled datasets like CIFAR-10/100 and +TinyImageNet. Utilizing an adversarial game framework, it introduces three key +metrics: Robustness Ratio (RR), Attack Efficiency Ratio (AE), and Comprehensive +Robustness-Efficiency Index (CREI). Our analysis includes unified benchmarks, +various Images Per Class (IPC) settings, and the effects of adversarial +training. Results are available on the BEARD Leaderboard, along with a library +providing model and dataset pools to support reproducible research. Access the +code at BEARD. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ Rethinking Weight-Averaged Model-merging + + +
+ Weight-averaged model-merging has emerged as a powerful approach in deep +learning, capable of enhancing model performance without fine-tuning or +retraining. However, the underlying mechanisms that explain its effectiveness +remain largely unexplored. In this paper, we investigate this technique from +three novel perspectives to provide deeper insights into how and why +weight-averaged model-merging works: (1) we examine the intrinsic patterns +captured by the learning of the model weights, through the visualizations of +their patterns on several datasets, showing that these weights often encode +structured and interpretable patterns; (2) we investigate model ensemble +merging strategies based on averaging on weights versus averaging on features, +providing detailed analyses across diverse architectures and datasets; and (3) +we explore the impact on model-merging prediction stability in terms of +changing the parameter magnitude, revealing insights into the way weight +averaging works as regularization by showing the robustness across different +parameter scales. Our findings shed light on the "black box" of weight-averaged +model-merging, offering valuable insights and practical recommendations that +advance the model-merging process. + +
+
+
+
+
+ + ☆ Jailbreak Attacks and Defenses against Multimodal Generative Models: A + Survey + + +
+ The rapid evolution of multimodal foundation models has led to significant +advancements in cross-modal understanding and generation across diverse +modalities, including text, images, audio, and video. However, these models +remain susceptible to jailbreak attacks, which can bypass built-in safety +mechanisms and induce the production of potentially harmful content. +Consequently, understanding the methods of jailbreak attacks and existing +defense mechanisms is essential to ensure the safe deployment of multimodal +generative models in real-world scenarios, particularly in security-sensitive +applications. To provide comprehensive insight into this topic, this survey +reviews jailbreak and defense in multimodal generative models. First, given the +generalized lifecycle of multimodal jailbreak, we systematically explore +attacks and corresponding defense strategies across four levels: input, +encoder, generator, and output. Based on this analysis, we present a detailed +taxonomy of attack methods, defense mechanisms, and evaluation frameworks +specific to multimodal generative models. Additionally, we cover a wide range +of input-output configurations, including modalities such as Any-to-Text, +Any-to-Vision, and Any-to-Any within generative systems. Finally, we highlight +current research challenges and propose potential directions for future +research. The open-source repository corresponding to this work can be found at +https://github.com/liuxuannan/Awesome-Multimodal-Jailbreak. + +
+
+ comment: ongoing work +
+
+
+
+
+ + ☆ Cross Space and Time: A Spatio-Temporal Unitized Model for Traffic Flow + Forecasting + + +
+ Predicting spatio-temporal traffic flow presents significant challenges due +to complex interactions between spatial and temporal factors. Existing +approaches often address these dimensions in isolation, neglecting their +critical interdependencies. In this paper, we introduce the Spatio-Temporal +Unitized Model (STUM), a unified framework designed to capture both spatial and +temporal dependencies while addressing spatio-temporal heterogeneity through +techniques such as distribution alignment and feature fusion. It also ensures +both predictive accuracy and computational efficiency. Central to STUM is the +Adaptive Spatio-temporal Unitized Cell (ASTUC), which utilizes low-rank +matrices to seamlessly store, update, and interact with space, time, as well as +their correlations. Our framework is also modular, allowing it to integrate +with various spatio-temporal graph neural networks through components such as +backbone models, feature extractors, residual fusion blocks, and predictive +modules to collectively enhance forecasting outcomes. Experimental results +across multiple real-world datasets demonstrate that STUM consistently improves +prediction performance with minimal computational cost. These findings are +further supported by hyperparameter optimization, pre-training analysis, and +result visualization. We provide our source code for reproducibility at +https://anonymous.4open.science/r/STUM-E4F0. + +
+
+
+
+
+ + ☆ Embedding Space Allocation with Angle-Norm Joint Classifiers for + Few-Shot Class-Incremental Learning + + +
+ Few-shot class-incremental learning (FSCIL) aims to continually learn new +classes from only a few samples without forgetting previous ones, requiring +intelligent agents to adapt to dynamic environments. FSCIL combines the +characteristics and challenges of class-incremental learning and few-shot +learning: (i) Current classes occupy the entire feature space, which is +detrimental to learning new classes. (ii) The small number of samples in +incremental rounds is insufficient for fully training. In existing mainstream +virtual class methods, for addressing the challenge (i), they attempt to use +virtual classes as placeholders. However, new classes may not necessarily align +with the virtual classes. For the challenge (ii), they replace trainable fully +connected layers with Nearest Class Mean (NCM) classifiers based on cosine +similarity, but NCM classifiers do not account for sample imbalance issues. To +address these issues in previous methods, we propose the class-center guided +embedding Space Allocation with Angle-Norm joint classifiers (SAAN) learning +framework, which provides balanced space for all classes and leverages norm +differences caused by sample imbalance to enhance classification criteria. +Specifically, for challenge (i), SAAN divides the feature space into multiple +subspaces and allocates a dedicated subspace for each session by guiding +samples with the pre-set category centers. For challenge (ii), SAAN establishes +a norm distribution for each class and generates angle-norm joint logits. +Experiments demonstrate that SAAN can achieve state-of-the-art performance and +it can be directly embedded into other SOTA methods as a plug-in, further +enhancing their performance. + +
+
+
+
+
+ + ☆ Harnessing Vision Foundation Models for High-Performance, Training-Free + Open Vocabulary Segmentation + + +
+ While Contrastive Language-Image Pre-training (CLIP) has advanced +open-vocabulary predictions, its performance on semantic segmentation remains +suboptimal. This shortfall primarily stems from its spatial-invariant semantic +features and constrained resolution. While previous adaptations addressed +the spatially invariant semantics by modifying the self-attention in CLIP's image +encoder, the issue of limited resolution remains unexplored. Different from +previous segment-then-splice methods that segment sub-images via a sliding +window and splice the results, we introduce a splice-then-segment paradigm that +incorporates Segment-Anything Model (SAM) to tackle the resolution issue since +SAM excels at extracting fine-grained semantic correlations from +high-resolution images. Specifically, we introduce Trident, a training-free +framework that first splices features extracted by CLIP and DINO from +sub-images, then leverages SAM's encoder to create a correlation matrix for +global aggregation, enabling a broadened receptive field for effective +segmentation. Besides, we propose a refinement strategy for CLIP's coarse +segmentation outputs by transforming them into prompts for SAM, further +enhancing the segmentation performance. Trident achieves a significant +improvement in the mIoU across eight benchmarks compared with the current SOTA, +increasing from 44.4 to 48.6. Code is available at +https://github.com/YuHengsss/Trident. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ JoyVASA: Portrait and Animal Image Animation with Diffusion-Based + Audio-Driven Facial Dynamics and Head Motion Generation + + +
+ Audio-driven portrait animation has made significant advances with +diffusion-based models, improving video quality and lipsync accuracy. However, +the increasing complexity of these models has led to inefficiencies in training +and inference, as well as constraints on video length and inter-frame +continuity. In this paper, we propose JoyVASA, a diffusion-based method for +generating facial dynamics and head motion in audio-driven facial animation. +Specifically, in the first stage, we introduce a decoupled facial +representation framework that separates dynamic facial expressions from static +3D facial representations. This decoupling allows the system to generate longer +videos by combining any static 3D facial representation with dynamic motion +sequences. Then, in the second stage, a diffusion transformer is trained to +generate motion sequences directly from audio cues, independent of character +identity. Finally, a generator trained in the first stage uses the 3D facial +representation and the generated motion sequences as inputs to render +high-quality animations. With the decoupled facial representation and the +identity-independent motion generation process, JoyVASA extends beyond human +portraits to animate animal faces seamlessly. The model is trained on a hybrid +dataset of private Chinese and public English data, enabling multilingual +support. Experimental results validate the effectiveness of our approach. +Future work will focus on improving real-time performance and refining +expression control, further expanding the applications in portrait animation. +The code will be available at: https://jdhalgo.github.io/JoyVASA. + +
+
+
+
+
+ + ☆ LEAP:D - A Novel Prompt-based Approach for Domain-Generalized Aerial + Object Detection ICIP 2024 + + +
+ Drone-captured images present significant challenges in object detection due +to varying shooting conditions, which can alter object appearance and shape. +Factors such as drone altitude, angle, and weather cause these variations, +influencing the performance of object detection algorithms. To tackle these +challenges, we introduce an innovative vision-language approach using learnable +prompts. This shift from conventional manual prompts aims to reduce +domain-specific knowledge interference, ultimately improving object detection +capabilities. Furthermore, we streamline the training process with a one-step +approach, updating the learnable prompt concurrently with model training, +enhancing efficiency without compromising performance. Our study contributes to +domain-generalized object detection by leveraging learnable prompts and +optimizing training processes. This enhances model robustness and adaptability +across diverse environments, leading to more effective aerial object detection. + +
+
+ comment: ICIP 2024 Workshop accepted paper +
+
+
+
+
+ + ☆ Gazing at Rewards: Eye Movements as a Lens into Human and AI + Decision-Making in Hybrid Visual Foraging + + +
+ Imagine searching a collection of coins for quarters ($0.25$), dimes +($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where +observers look for multiple instances of multiple target types. In such tasks, +how do target values and their prevalence influence foraging and eye movement +behaviors (e.g., should you prioritize rare quarters or common nickels)? To +explore this, we conducted human psychophysics experiments, revealing that +humans are proficient reward foragers. Their eye fixations are drawn to regions +with higher average rewards, fixation durations are longer on more valuable +targets, and their cumulative rewards exceed chance, approaching the upper +bound of optimal foragers. To probe these decision-making processes of humans, +we developed a transformer-based Visual Forager (VF) model trained via +reinforcement learning. Our VF model takes a series of targets, their +corresponding values, and the search image as inputs, processes the images +using foveated vision, and produces a sequence of eye movements along with +decisions on whether to collect each fixated item. Our model outperforms all +baselines, achieves cumulative rewards comparable to those of humans, and +approximates human foraging behavior in eye movements and foraging biases +within time-limited environments. Furthermore, stress tests on +out-of-distribution tasks with novel targets, unseen values, and varying set +sizes demonstrate the VF model's effective generalization. Our work offers +valuable insights into the relationship between eye movements and +decision-making, with our model serving as a powerful tool for further +exploration of this connection. All data, code, and models will be made +publicly available. + +
+
+
+
+
+ + ☆ Advancing Diffusion Models: Alias-Free Resampling and Enhanced + Rotational Equivariance + + +
+ Recent advances in image generation, particularly via diffusion models, have +led to impressive improvements in image synthesis quality. Despite this, +diffusion models are still challenged by model-induced artifacts and limited +stability in image fidelity. In this work, we hypothesize that the primary +cause of this issue is the improper resampling operation that introduces +aliasing in the diffusion model and a careful alias-free resampling dictated by +image processing theory can improve the model's performance in image synthesis. +We propose the integration of alias-free resampling layers into the UNet +architecture of diffusion models without adding extra trainable parameters, +thereby maintaining computational efficiency. We then assess whether these +theory-driven modifications enhance image quality and rotational equivariance. +Our experimental results on benchmark datasets, including CIFAR-10, MNIST, and +MNIST-M, reveal consistent gains in image quality, particularly in terms of FID +and KID scores. Furthermore, we propose a modified diffusion process that +enables user-controlled rotation of generated images without requiring +additional training. Our findings highlight the potential of theory-driven +enhancements such as alias-free resampling in generative models to improve +image quality while maintaining model efficiency and pioneer future research +directions to incorporate them into video-generating diffusion models, enabling +deeper exploration of the applications of alias-free resampling in generative +modeling. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ DyGASR: Dynamic Generalized Exponential Splatting with Surface Alignment + for Accelerated 3D Mesh Reconstruction + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS), which lead to +high-quality novel view synthesis and accelerated rendering, have remarkably +improved the quality of radiance field reconstruction. However, the extraction +of mesh from a massive number of minute 3D Gaussian points remains a great +challenge due to the large volume of Gaussians and difficulty of representation +of sharp signals caused by their inherent low-pass characteristics. To address +this issue, we propose DyGASR, which utilizes generalized exponential function +instead of traditional 3D Gaussian to decrease the number of particles and +dynamically optimize the representation of the captured signal. In addition, it +is observed that reconstructing mesh with Generalized Exponential +Splatting (GES) without modifications frequently leads to failures since the +generalized exponential distribution centroids may not precisely align with the +scene surface. To overcome this, we adopt Sugar's approach and introduce +Generalized Surface Regularization (GSR), which reduces the smallest scaling +vector of each point cloud to zero and ensures normal alignment perpendicular +to the surface, facilitating subsequent Poisson surface mesh reconstruction. +Additionally, we propose a dynamic resolution adjustment strategy that utilizes +a cosine schedule to gradually increase image resolution from low to high +during the training stage, thus avoiding constant full resolution, which +significantly boosts the reconstruction speed. Our approach surpasses existing +3DGS-based mesh reconstruction methods, as evidenced by extensive evaluations +on various scene datasets, demonstrating a 25% increase in speed, and a 30% +reduction in memory usage. + +
+
+
+
+
+ + ☆ VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for + Effective Robot Manipulation NeurIPS 2024 + + +
+ Recent advancements utilizing large-scale video data for learning video +generation models demonstrate significant potential in understanding complex +physical dynamics. It suggests the feasibility of leveraging diverse robot +trajectory data to develop a unified, dynamics-aware model to enhance robot +manipulation. However, given the relatively small amount of available robot +data, directly fitting data without considering the relationship between visual +observations and actions could lead to suboptimal data utilization. To this +end, we propose VidMan (Video Diffusion for Robot Manipulation), a novel +framework that employs a two-stage training mechanism inspired by dual-process +theory from neuroscience to enhance stability and improve data utilization +efficiency. Specifically, in the first stage, VidMan is pre-trained on the Open +X-Embodiment dataset (OXE) for predicting future visual trajectories in a video +denoising diffusion manner, enabling the model to develop a long horizontal +awareness of the environment's dynamics. In the second stage, a flexible yet +effective layer-wise self-attention adapter is introduced to transform VidMan +into an efficient inverse dynamics model that predicts action modulated by the +implicit dynamics knowledge via parameter sharing. Our VidMan framework +outperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark, +achieving a 11.7% relative improvement, and demonstrates over 9% precision +gains on the OXE small-scale dataset. These results provide compelling evidence +that world models can significantly enhance the precision of robot action +prediction. Codes and models will be public. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Mono2Stereo: Monocular Knowledge Transfer for Enhanced Stereo Matching + + +
+ The generalization and performance of stereo matching networks are limited +due to the domain gap of the existing synthetic datasets and the sparseness of +GT labels in the real datasets. In contrast, monocular depth estimation has +achieved significant advancements, benefiting from large-scale depth datasets +and self-supervised strategies. To bridge the performance gap between monocular +depth estimation and stereo matching, we propose leveraging monocular knowledge +transfer to enhance stereo matching, namely Mono2Stereo. We introduce knowledge +transfer with a two-stage training process, comprising synthetic data +pre-training and real-world data fine-tuning. In the pre-training stage, we +design a data generation pipeline that synthesizes stereo training data from +monocular images. This pipeline utilizes monocular depth for warping and novel +view synthesis and employs our proposed Edge-Aware (EA) inpainting module to +fill in missing contents in the generated images. In the fine-tuning stage, we +introduce a Sparse-to-Dense Knowledge Distillation (S2DKD) strategy encouraging +the distributions of predictions to align with dense monocular depths. This +strategy mitigates issues with edge blurring in sparse real-world labels and +enhances overall consistency. Experimental results demonstrate that our +pre-trained model exhibits strong zero-shot generalization capabilities. +Furthermore, domain-specific fine-tuning using our pre-trained model and S2DKD +strategy significantly increments in-domain performance. The code will be made +available soon. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ UniHOI: Learning Fast, Dense and Generalizable 4D Reconstruction for + Egocentric Hand Object Interaction Videos + + +
+ Egocentric Hand Object Interaction (HOI) videos provide valuable insights +into human interactions with the physical world, attracting growing interest +from the computer vision and robotics communities. A key task in fully +understanding the geometry and dynamics of HOI scenes is dense pointclouds +sequence reconstruction. However, the inherent motion of both hands and the +camera makes this challenging. Current methods often rely on time-consuming +test-time optimization, making them impractical for reconstructing +internet-scale videos. To address this, we introduce UniHOI, a model that +unifies the estimation of all variables necessary for dense 4D reconstruction, +including camera intrinsic, camera poses, and video depth, for egocentric HOI +scene in a fast feed-forward manner. We end-to-end optimize all these variables +to improve their consistency in 3D space. Furthermore, our model could be +trained solely on large-scale monocular video dataset, overcoming the +limitation of scarce labeled HOI data. We evaluate UniHOI with both in-domain +and zero-shot generalization setting, surpassing all baselines in pointclouds +sequence reconstruction and long-term 3D scene flow recovery. UniHOI is the +first approach to offer fast, dense, and generalizable monocular egocentric HOI +scene reconstruction in the presence of motion. Code and trained model will be +released in the future. + +
+
+
+
+
+ + ☆ Adversarial Vessel-Unveiling Semi-Supervised Segmentation for + Retinopathy of Prematurity Diagnosis + + +
+ Accurate segmentation of retinal images plays a crucial role in aiding +ophthalmologists in diagnosing retinopathy of prematurity (ROP) and assessing +its severity. However, due to their underdeveloped, thinner vessels, manual +annotation in infant fundus images is very complex, and this presents +challenges for fully supervised learning. To address the scarcity of +annotations, we propose a semi supervised segmentation framework designed to +advance ROP studies without the need for extensive manual vessel annotation. +Unlike previous methods that rely solely on limited labeled data, our approach +leverages teacher student learning by integrating two powerful components: an +uncertainty weighted vessel unveiling module and domain adversarial learning. +The vessel unveiling module helps the model effectively reveal obscured and +hard to detect vessel structures, while adversarial training aligns feature +representations across different domains, ensuring robust and generalizable +vessel segmentations. We validate our approach on public datasets (CHASEDB, +STARE) and an in-house ROP dataset, demonstrating its superior performance +across multiple evaluation metrics. Additionally, we extend the model's utility +to a downstream task of ROP multi-stage classification, where vessel masks +extracted by our segmentation model improve diagnostic accuracy. The promising +results in classification underscore the model's potential for clinical +application, particularly in early-stage ROP diagnosis and intervention. +Overall, our work offers a scalable solution for leveraging unlabeled data in +pediatric ophthalmology, opening new avenues for biomarker discovery and +clinical research. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Fast probabilistic snake algorithm + + +
+ Few people use probability theory in order to achieve image segmentation +with snake models. In this article, we present an active contour +algorithm based on a probabilistic approach inspired by A. Blake's work and P. +Réfrégier's team research in France. Our algorithm, both very fast and +highly accurate as far as contour description is concerned, is easily adaptable +to any specific application. + +
+
+
+
+
+ + ☆ Computational metaoptics for imaging + + +
+ Metasurfaces -- ultrathin structures composed of subwavelength optical +elements -- have revolutionized light manipulation by enabling precise control +over electromagnetic waves' amplitude, phase, polarization, and spectral +properties. Concurrently, computational imaging leverages algorithms to +reconstruct images from optically processed signals, overcoming limitations of +traditional imaging systems. This review explores the synergistic integration +of metaoptics and computational imaging, "computational metaoptics," which +combines the physical wavefront shaping ability of metasurfaces with advanced +computational algorithms to enhance imaging performance beyond conventional +limits. We discuss how computational metaoptics addresses the inherent +limitations of single-layer metasurfaces in achieving multifunctionality +without compromising efficiency. By treating metasurfaces as physical +preconditioners and co-designing them with reconstruction algorithms through +end-to-end (inverse) design, it is possible to jointly optimize the optical +hardware and computational software. This holistic approach allows for the +automatic discovery of optimal metasurface designs and reconstruction methods +that significantly improve imaging capabilities. Advanced applications enabled +by computational metaoptics are highlighted, including phase imaging and +quantum state measurement, which benefit from the metasurfaces' ability to +manipulate complex light fields and the computational algorithms' capacity to +reconstruct high-dimensional information. We also examine performance +evaluation challenges, emphasizing the need for new metrics that account for +the combined optical and computational nature of these systems. Finally, we +identify new frontiers in computational metaoptics which point toward a future +where computational metaoptics may play a central role in advancing imaging +science and technology. + +
+
+
+
+
+ + ☆ SCAN: Bootstrapping Contrastive Pre-training for Data Efficiency + + +
+ While contrastive pre-training is widely employed, its data efficiency +problem has remained relatively under-explored thus far. Existing methods often +rely on static coreset selection algorithms to pre-identify important data for +training. However, this static nature renders them unable to dynamically track +the data usefulness throughout pre-training, leading to subpar pre-trained +models. To address this challenge, our paper introduces a novel dynamic +bootstrapping dataset pruning method. It involves pruning data preparation +followed by dataset mutation operations, both of which undergo iterative and +dynamic updates. We apply this method to two prevalent contrastive pre-training +frameworks: CLIP and MoCo, representing vision-language and +vision-centric domains, respectively. In particular, we individually pre-train +seven CLIP models on two large-scale image-text pair datasets, and two MoCo +models on the ImageNet dataset, resulting in a total of 16 pre-trained models. +With a data pruning rate of 30-35% across all 16 models, our method exhibits +only marginal performance degradation (less than 1% on average) +compared to corresponding models trained on the full dataset counterparts +across various downstream datasets, and also surpasses several baselines with a +large performance margin. Additionally, the byproduct from our method, i.e., +coresets derived from the original datasets after pre-training, also +demonstrates significant superiority in terms of downstream performance over +other static coreset selection approaches. + +
+
+
+
+
+ + ☆ VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges + in Video Cognition + + +
+ Recent advancements in Large Video-Language Models (LVLMs) have driven the +development of benchmarks designed to assess cognitive abilities in video-based +tasks. However, most existing benchmarks heavily rely on web-collected videos +paired with human annotations or model-generated questions, which limit control +over the video content and fall short in evaluating advanced cognitive +abilities involving symbolic elements and abstract concepts. To address these +limitations, we introduce VCBench, a controllable benchmark to assess LVLMs' +cognitive abilities, involving symbolic and abstract concepts at varying +difficulty levels. By generating video data with the Python-based engine, +VCBench allows for precise control over the video content, creating dynamic, +task-oriented videos that feature complex scenes and abstract concepts. Each +task pairs with tailored question templates that target specific cognitive +challenges, providing a rigorous evaluation test. Our evaluation reveals that +even state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple +video cognition tasks involving abstract concepts, with performance sharply +dropping by 19% as video complexity rises. These findings reveal the current +limitations of LVLMs in advanced cognitive tasks and highlight the critical +role of VCBench in driving research toward more robust LVLMs for complex video +cognition challenges. + +
+
+
+
+
+ + ☆ Heuristical Comparison of Vision Transformers Against Convolutional + Neural Networks for Semantic Segmentation on Remote Sensing Imagery + + +
+ Vision Transformers (ViT) have recently brought a new wave of research in the +field of computer vision. These models have done particularly well in the field +of image classification and segmentation. Research on semantic and instance +segmentation has accelerated with the inception of the new +architecture, with over 80% of the top 20 benchmarks for the iSAID dataset +being either based on the ViT architecture or the attention mechanism behind +its success. This paper focuses on the heuristic comparison of three key +factors of using (or not using) ViT for semantic segmentation of remote sensing +aerial images on the iSAID. The experimental results observed during the course +of the research were scrutinized against the following objectives: 1. +Use of weighted fused loss function for the maximum mean Intersection over +Union (mIoU) score, Dice score, and minimization or conservation of entropy or +class representation, 2. Comparison of transfer learning on Meta's MaskFormer, +a ViT-based semantic segmentation model, against generic UNet Convolutional +Neural Networks (CNNs) judged over mIoU, Dice scores, training efficiency, and +inference time, and 3. What do we lose for what we gain? i.e., the comparison +of the two models against current state-of-the-art segmentation models. We show the +use of the novel combined weighted loss function significantly boosts the CNN +model's performance capacities as compared to transfer learning the ViT. The +code for this implementation can be found on +https://github.com/ashimdahal/ViT-vs-CNN-ImageSegmentation. + +
+
+
+
+
+ + ☆ LEAP:D -- A Novel Prompt-based Approach for Domain-Generalized Aerial + Object Detection ICIP 2024 + + +
+ Drone-captured images present significant challenges in object detection due +to varying shooting conditions, which can alter object appearance and shape. +Factors such as drone altitude, angle, and weather cause these variations, +influencing the performance of object detection algorithms. To tackle these +challenges, we introduce an innovative vision-language approach using learnable +prompts. This shift from conventional manual prompts aims to reduce +domain-specific knowledge interference, ultimately improving object detection +capabilities. Furthermore, we streamline the training process with a one-step +approach, updating the learnable prompt concurrently with model training, +enhancing efficiency without compromising performance. Our study contributes to +domain-generalized object detection by leveraging learnable prompts and +optimizing training processes. This enhances model robustness and adaptability +across diverse environments, leading to more effective aerial object detection. + +
+
+ comment: ICIP 2024 Workshop accepted paper +
+
+
+
+
+ + ☆ OneNet: A Channel-Wise 1D Convolutional U-Net + + +
+ Many state-of-the-art computer vision architectures leverage U-Net for its +adaptability and efficient feature extraction. However, the multi-resolution +convolutional design often leads to significant computational demands, limiting +deployment on edge devices. We present a streamlined alternative: a 1D +convolutional encoder that retains accuracy while enhancing its suitability for +edge applications. Our novel encoder architecture achieves semantic +segmentation through channel-wise 1D convolutions combined with pixel-unshuffle +operations. By incorporating PixelShuffle, known for improving accuracy in +super-resolution tasks while reducing computational load, OneNet captures +spatial relationships without requiring 2D convolutions, reducing parameters by +up to 47%. Additionally, we explore a fully 1D encoder-decoder that achieves a +71% reduction in size, albeit with some accuracy loss. We benchmark our +approach against U-Net variants across diverse mask-generation tasks, +demonstrating that it preserves accuracy effectively. Although focused on image +segmentation, this architecture is adaptable to other convolutional +applications. Code for the project is available at +https://github.com/shbyun080/OneNet . + +
+
+
+
+
+ + ☆ Architect: Generating Vivid and Interactive 3D Scenes with Hierarchical + 2D Inpainting + + +
+ Creating large-scale interactive 3D environments is essential for the +development of Robotics and Embodied AI research. Current methods, including +manual design, procedural generation, diffusion-based scene generation, and +large language model (LLM) guided scene design, are hindered by limitations +such as excessive human effort, reliance on predefined rules or training +datasets, and limited 3D spatial reasoning ability. Since pre-trained 2D image +generative models better capture scene and object configuration than LLMs, we +address these challenges by introducing Architect, a generative framework that +creates complex and realistic 3D embodied environments leveraging +diffusion-based 2D image inpainting. In detail, we utilize foundation visual +perception models to obtain each generated object from the image and leverage +pre-trained depth estimation models to lift the generated 2D image to 3D space. +Our pipeline is further extended to a hierarchical and iterative inpainting +process to continuously generate placement of large furniture and small objects +to enrich the scene. This iterative structure brings the flexibility for our +method to generate or refine scenes from various starting points, such as text, +floor plans, or pre-arranged environments. + +
+
+
+
+
+ + ☆ A Self-Supervised Model for Multi-modal Stroke Risk Prediction + + +
+ Predicting stroke risk is a complex challenge that can be enhanced by +integrating diverse clinically available data modalities. This study introduces +a self-supervised multimodal framework that combines 3D brain imaging, clinical +data, and image-derived features to improve stroke risk prediction prior to +onset. By leveraging large unannotated clinical datasets, the framework +captures complementary and synergistic information across image and tabular +data modalities. Our approach is based on a contrastive learning framework that +couples contrastive language-image pretraining with an image-tabular matching +module, to better align multimodal data representations in a shared latent +space. The model is trained on the UK Biobank, which includes structural brain +MRI and clinical data. We benchmark its performance against state-of-the-art +unimodal and multimodal methods using tabular, image, and image-tabular +combinations under diverse frozen and trainable model settings. The proposed +model outperformed self-supervised tabular (image) methods by 2.6% (2.6%) in +ROC-AUC and by 3.3% (5.6%) in balanced accuracy. Additionally, it showed a 7.6% +increase in balanced accuracy compared to the best multimodal supervised model. +Through interpretable tools, our approach demonstrated better integration of +tabular and image data, providing richer and more aligned embeddings. +Gradient-weighted Class Activation Mapping heatmaps further revealed activated +brain regions commonly associated in the literature with brain aging, stroke +risk, and clinical outcomes. This robust self-supervised multimodal framework +surpasses state-of-the-art methods for stroke risk prediction and offers a +strong foundation for future studies integrating diverse data modalities to +advance clinical predictive modelling. + +
+
+ comment: Accepted as oral paper at AIM-FM workshop, Neurips 2024 +
+
+
+
+
+ + ☆ Automatic Classification of General Movements in Newborns ML4H + + +
+ General movements (GMs) are spontaneous, coordinated body movements in +infants that offer valuable insights into the developing nervous system. +Assessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors +for neurodevelopmental disorders. However, GMA requires specifically trained +clinicians, who are limited in number. To scale up newborn screening, there is +a need for an algorithm that can automatically classify GMs from infant video +recordings. This data poses challenges, including variability in recording +length, device type, and setting, with each video coarsely annotated for +overall movement quality. In this work, we introduce a tool for extracting +features from these recordings and explore various machine learning techniques +for automated GM classification. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages +
+
+
+
+
+ + ☆ Video Denoising in Fluorescence Guided Surgery + + +
+ Fluorescence guided surgery (FGS) is a promising surgical technique that +gives surgeons a unique view of tissue that is used to guide their practice by +delineating tissue types and diseased areas. As new fluorescent contrast agents +are developed that have low fluorescent photon yields, it becomes increasingly +important to develop computational models to allow FGS systems to maintain good +video quality in real time environments. To further complicate this task, FGS +has a difficult bias noise term from laser leakage light (LLL) that represents +unfiltered excitation light that can be on the order of the fluorescent signal. +Most conventional video denoising methods focus on zero mean noise, and +non-causal processing, both of which are violated in FGS. Luckily in FGS, often +a co-located reference video is also captured which we use to simulate the LLL +and assist in the denoising processes. In this work, we propose an accurate +noise simulation pipeline that includes LLL and propose three baseline deep +learning based algorithms for FGS video denoising. + +
+
+
+
+
+ + ☆ Deep Learning for Fetal Inflammatory Response Diagnosis in the Umbilical + Cord + + +
+ Inflammation of the umbilical cord can be seen as a result of ascending +intrauterine infection or other inflammatory stimuli. Acute fetal inflammatory +response (FIR) is characterized by infiltration of the umbilical cord by fetal +neutrophils, and can be associated with neonatal sepsis or fetal inflammatory +response syndrome. Recent advances in deep learning in digital pathology have +demonstrated favorable performance across a wide range of clinical tasks, such +as diagnosis and prognosis. In this study we classified FIR from whole slide +images (WSI). We digitized 4100 histological slides of umbilical cord stained +with hematoxylin and eosin (H&E) and extracted placental diagnoses from the +electronic health record. We built models using attention-based whole slide +learning models. We compared strategies between features extracted by a model +(ConvNeXtXLarge) pretrained on non-medical images (ImageNet), and one +pretrained using histopathology images (UNI). We trained multiple iterations of +each model and combined them into an ensemble. The predictions from the +ensemble of models trained using UNI achieved an overall balanced accuracy of +0.836 on the test dataset. In comparison, the ensembled predictions using +ConvNeXtXLarge had a lower balanced accuracy of 0.7209. Heatmaps generated from +the top-accuracy model appropriately highlighted arteritis in cases of FIR 2. In +FIR 1, the highest performing model assigned high attention to areas of +activated-appearing stroma in Wharton's Jelly. However, other high-performing +models assigned attention to umbilical vessels. We developed models for +diagnosis of FIR from placental histology images, helping reduce interobserver +variability among pathologists. Future work may examine the utility of these +models for identifying infants at risk of systemic inflammatory response or +early onset neonatal sepsis. + +
+
+
+
+
+ + ☆ NACNet: A Histology Context-aware Transformer Graph Convolution Network + for Predicting Treatment Response to Neoadjuvant Chemotherapy in Triple + Negative Breast Cancer + + +
+ Neoadjuvant chemotherapy (NAC) response prediction for triple negative breast +cancer (TNBC) patients is a challenging task clinically as it requires +understanding complex histology interactions within the tumor microenvironment +(TME). Digital whole slide images (WSIs) capture detailed tissue information, +but their giga-pixel size necessitates computational methods based on multiple +instance learning, which typically analyze small, isolated image tiles without +the spatial context of the TME. To address this limitation and incorporate TME +spatial histology interactions in predicting NAC response for TNBC patients, we +developed a histology context-aware transformer graph convolution network +(NACNet). Our deep learning method identifies the histopathological labels on +individual image tiles from WSIs, constructs a spatial TME graph, and +represents each node with features derived from tissue texture and social +network analysis. It predicts NAC response using a transformer graph +convolution network model enhanced with graph isomorphism network layers. We +evaluate our method with WSIs of a cohort of TNBC patients (N=105) and compare +its performance with multiple state-of-the-art machine learning and deep +learning models, including both graph and non-graph approaches. Our NACNet +achieves 90.0% accuracy, 96.0% sensitivity, 88.0% specificity, and an AUC of +0.82, through eight-fold cross-validation, outperforming baseline models. These +comprehensive experimental results suggest that NACNet holds strong potential +for stratifying TNBC patients by NAC response, thereby helping to prevent +overtreatment, improve patient quality of life, reduce treatment cost, and +enhance clinical outcomes, marking an important advancement toward personalized +breast cancer treatment. + +
+
+ comment: This paper is accepted by Computerized Medical Imaging and Graphics + (Nov 07 2024) +
+
+
+
+
+ + ☆ Partial Multi-View Clustering via Meta-Learning and Contrastive Feature + Alignment + + +
+ Partial multi-view clustering (PVC) presents a challenging practical +research problem for data analysis in real-world applications, especially when +some views of the data are partially missing. Existing clustering methods +struggle to handle incomplete views effectively, leading to suboptimal +clustering performance. In this paper, we propose a novel dual optimization +framework based on contrastive learning, which aims to maximize the consistency +of latent features in incomplete multi-view data and improve clustering +performance through deep learning models. By combining a fine-tuned Vision +Transformer and k-nearest neighbors (KNN), we fill in missing views and +dynamically adjust view weights using self-supervised learning and +meta-learning. Experimental results demonstrate that our framework outperforms +state-of-the-art clustering models on the BDGP and HW datasets, particularly in +handling complex and incomplete multi-view data. + +
+
+
+
+
+ + ☆ Analyzing the AI Nudification Application Ecosystem + + +
+ Given a source image of a clothed person (an image subject), AI-based +nudification applications can produce nude (undressed) images of that person. +Moreover, not only do such applications exist, but there is ample evidence of +the use of such applications in the real world and without the consent of an +image subject. Still, despite the growing awareness of the existence of such +applications and their potential to violate the rights of image subjects and +cause downstream harms, there has been no systematic study of the nudification +application ecosystem across multiple applications. We conduct such a study +here, focusing on 20 popular and easy-to-find nudification websites. We study +the positioning of these web applications (e.g., finding that most sites +explicitly target the nudification of women, not all people), the features that +they advertise (e.g., ranging from undressing-in-place to the rendering of +image subjects in sexual positions, as well as differing user-privacy options), +and their underlying monetization infrastructure (e.g., credit cards and +cryptocurrencies). We believe this work will empower future, data-informed +conversations -- within the scientific, technical, and policy communities -- on +how to better protect individuals' rights and minimize harm in the face of +modern (and future) AI-based nudification applications. Content warning: This +paper includes descriptions of web applications that can be used to create +synthetic non-consensual explicit AI-created imagery (SNEACI). This paper also +includes an artistic rendering of a user interface for such an application. + +
+
+ comment: 22 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Adversarial Attacks Using Differentiable Rendering: A Survey + + +
+ Differentiable rendering methods have emerged as a promising means for +generating photo-realistic and physically plausible adversarial attacks by +manipulating 3D objects and scenes that can deceive deep neural networks +(DNNs). Recently, differentiable rendering capabilities have evolved +significantly into a diverse landscape of libraries, such as Mitsuba, +PyTorch3D, and methods like Neural Radiance Fields and 3D Gaussian Splatting +for solving inverse rendering problems that share conceptually similar +properties commonly used to attack DNNs, such as back-propagation and +optimization. However, the adversarial machine learning research community has +not yet fully explored or understood such capabilities for generating attacks. +Some key reasons are that researchers often have different attack goals, such +as misclassification or misdetection, and use different tasks to accomplish +these goals by manipulating different representations in a scene, such as the +mesh or texture of an object. This survey adopts a task-oriented unifying +framework that systematically summarizes common tasks, such as manipulating +textures, altering illumination, and modifying 3D meshes to exploit +vulnerabilities in DNNs. Our framework enables easy comparison of existing +works, reveals research gaps and spotlights exciting future research directions +in this rapidly evolving field. Through focusing on how these tasks enable +attacks on various DNNs such as image classification, facial recognition, +object detection, optical flow and depth estimation, our survey helps +researchers and practitioners better understand the vulnerabilities of computer +vision systems against photorealistic adversarial attacks that could threaten +real-world applications. + +
+
+
+
+
+ + ♻ ☆ I2I-Mamba: Multi-modal medical image synthesis via selective state space + modeling + + +
+ In recent years, deep learning models comprising transformer components have +pushed the performance envelope in medical image synthesis tasks. Contrary to +convolutional neural networks (CNNs) that use static, local filters, +transformers use self-attention mechanisms to permit adaptive, non-local +filtering to sensitively capture long-range context. However, this sensitivity +comes at the expense of substantial model complexity, which can compromise +learning efficacy particularly on relatively modest-sized imaging datasets. +Here, we propose a novel adversarial model for multi-modal medical image +synthesis, I2I-Mamba, that leverages selective state space modeling (SSM) to +efficiently capture long-range context while maintaining local precision. To do +this, I2I-Mamba injects channel-mixed Mamba (cmMamba) blocks in the bottleneck +of a convolutional backbone. In cmMamba blocks, SSM layers are used to learn +context across the spatial dimension and channel-mixing layers are used to +learn context across the channel dimension of feature maps. Comprehensive +demonstrations are reported for imputing missing images in multi-contrast MRI +and MRI-CT protocols. Our results indicate that I2I-Mamba offers superior +performance against state-of-the-art CNN- and transformer-based methods in +synthesizing target-modality images. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Super-resolution multi-contrast unbiased eye atlases with deep + probabilistic refinement + + +
+ Purpose: Eye morphology varies significantly across the population, +especially for the orbit and optic nerve. These variations limit the +feasibility and robustness of generalizing population-wise features of eye +organs to an unbiased spatial reference. + Approach: To tackle these limitations, we propose a process for creating +high-resolution unbiased eye atlases. First, to restore spatial details from +scans with a low through-plane resolution compared to a high in-plane +resolution, we apply a deep learning-based super-resolution algorithm. Then, we +generate an initial unbiased reference with an iterative metric-based +registration using a small portion of subject scans. We register the remaining +scans to this template and refine the template using an unsupervised deep +probabilistic approach that generates a more expansive deformation field to +enhance the organ boundary alignment. We demonstrate this framework using +magnetic resonance images across four different tissue contrasts, generating +four atlases in separate spatial alignments. + Results: For each tissue contrast, we find a significant improvement using +the Wilcoxon signed-rank test in the average Dice score across four labeled +regions compared to a standard registration framework consisting of rigid, +affine, and deformable transformations. These results highlight the effective +alignment of eye organs and boundaries using our proposed process. + Conclusions: By combining super-resolution preprocessing and deep +probabilistic models, we address the challenge of generating an eye atlas to +serve as a standardized reference across a largely variable population. + +
+
+ comment: Published in SPIE Journal of Medical Imaging + (https://doi.org/10.1117/1.JMI.11.6.064004). 27 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Information-driven design of imaging systems + + +
+ Most modern imaging systems process the data they capture algorithmically +before -- or instead of -- human viewing. As a result, performance depends not on how +interpretable the measurements appear, but how effectively they encode details +for algorithmic processing. Information theory provides mathematical tools to +analyze this, but developing methods that can handle the complexity of +real-world measurements yet remain practical enough for widespread use has +proven challenging. We introduce a data-driven approach for estimating the +information content of imaging system measurements. Our framework requires only +experimental measurements and noise characterization, with no need for ground +truth data. We demonstrate that these information estimates reliably predict +system performance across diverse imaging modalities, including color +photography, radio astronomy, lensless imaging, and label-free microscopy. To +automate the process of designing imaging systems that maximize information +capture we introduce an optimization technique called Information-Driven +Encoder Analysis Learning (IDEAL). The tools we develop in this work unlock +information theory as a powerful, practical tool for analyzing and designing +imaging systems across a broad range of applications. + A video summarizing this work can be found at +https://waller-lab.github.io/EncodingInformationWebsite/ + +
+
+
+
+
+ + ♻ ☆ Stable Consistency Tuning: Understanding and Improving Consistency + Models + + +
+ Diffusion models achieve superior generation quality but suffer from slow +generation speed due to the iterative nature of denoising. In contrast, +consistency models, a new generative family, achieve competitive performance +with significantly faster sampling. These models are trained either through +consistency distillation, which leverages pretrained diffusion models, or +consistency training/tuning directly from raw data. In this work, we propose a +novel framework for understanding consistency models by modeling the denoising +process of the diffusion model as a Markov Decision Process (MDP) and framing +consistency model training as the value estimation through Temporal +Difference (TD) Learning. More importantly, this framework allows us to analyze +the limitations of current consistency training/tuning strategies. Built upon +Easy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT), +which incorporates variance-reduced learning using the score identity. SCT +leads to significant performance improvements on benchmarks such as CIFAR-10 +and ImageNet-64. On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID +1.55, a new SoTA for consistency models. + +
+
+ comment: Code is available at + https://github.com/G-U-N/Stable-Consistency-Tuning +
+
+
+
+
+ + ♻ ☆ V2A-Mark: Versatile Deep Visual-Audio Watermarking for Manipulation + Localization and Copyright Protection ACM MM 2024 + + +
+ AI-generated video has revolutionized short video production, filmmaking, and +personalized media, making video local editing an essential tool. However, this +progress also blurs the line between reality and fiction, posing challenges in +multimedia forensics. To solve this urgent issue, V2A-Mark is proposed to +address the limitations of current video tampering forensics, such as poor +generalizability, singular function, and single modality focus. Combining the +fragility of video-into-video steganography with deep robust watermarking, our +method can embed invisible visual-audio localization watermarks and copyright +watermarks into the original video frames and audio, enabling precise +manipulation localization and copyright protection. We also design a temporal +alignment and fusion module and degradation prompt learning to enhance the +localization accuracy and decoding robustness. Meanwhile, we introduce a +sample-level audio localization method and a cross-modal copyright extraction +mechanism to couple the information of audio and video frames. The +effectiveness of V2A-Mark has been verified on a visual-audio tampering +dataset, emphasizing its superiority in localization precision and copyright +accuracy, crucial for the sustainable development of video editing in the AIGC +video era. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Sampling Correction via Approximately 10 Parameters + + +
+ Diffusion Probabilistic Models (DPMs) have demonstrated exceptional +performance in generative tasks, but this comes at the expense of sampling +efficiency. To enhance sampling speed without sacrificing quality, various +distillation-based accelerated sampling algorithms have been recently proposed. +However, they typically require significant additional training costs and model +parameter storage, which limit their practical application. In this work, we +propose PCA-based Adaptive Search (PAS), which optimizes existing solvers for +DPMs with minimal learnable parameters and training costs. Specifically, we +first employ PCA to obtain a few orthogonal unit basis vectors to span the +high-dimensional sampling space, which enables us to learn just a set of +coordinates to correct the sampling direction; furthermore, based on the +observation that the cumulative truncation error exhibits an ``S''-shape, we +design an adaptive search strategy that further enhances the sampling +efficiency and reduces the number of stored parameters to approximately 10. +Extensive experiments demonstrate that PAS can significantly enhance existing +fast solvers in a plug-and-play manner with negligible costs. For instance, on +CIFAR10, PAS requires only 12 parameters and less than 1 minute of training on +a single NVIDIA A100 GPU to optimize the DDIM from 15.69 FID (NFE=10) to 4.37. + +
+
+
+
+
+ + ♻ ☆ HyCoT: A Transformer-Based Autoencoder for Hyperspectral Image + Compression RSS + + +
+ The development of learning-based hyperspectral image (HSI) compression +models has recently attracted significant interest. Existing models +predominantly utilize convolutional filters, which capture only local +dependencies. Furthermore, they often incur high training costs and exhibit +substantial computational complexity. To address these limitations, in this +paper we propose Hyperspectral Compression Transformer (HyCoT) that is a +transformer-based autoencoder for pixelwise HSI compression. Additionally, we +apply a simple yet effective training set reduction approach to accelerate the +training process. Experimental results on the HySpecNet-11k dataset demonstrate +that HyCoT surpasses the state of the art across various compression ratios by +over 1 dB of PSNR with significantly reduced computational requirements. Our +code and pre-trained weights are publicly available at +https://git.tu-berlin.de/rsim/hycot . + +
+
+ comment: Accepted at 14th IEEE GRSS Workshop on Hyperspectral Image and Signal + Processing: Evolution in Remote Sensing (WHISPERS), 2024 +
+
+
+
+
+ + ♻ ☆ Dinomaly: The Less Is More Philosophy in Multi-Class Unsupervised + Anomaly Detection + + +
+ Recent studies highlighted a practical setting of unsupervised anomaly +detection (UAD) that builds a unified model for multi-class images. Despite +various advancements addressing this challenging task, the detection +performance under the multi-class setting still lags far behind +state-of-the-art class-separated models. Our research aims to bridge this +substantial performance gap. In this paper, we introduce a minimalistic +reconstruction-based anomaly detection framework, namely Dinomaly, which +leverages pure Transformer architectures without relying on complex designs, +additional modules, or specialized tricks. Given this powerful framework +consisting of only Attentions and MLPs, we found four simple components that are +essential to multi-class anomaly detection: (1) Foundation Transformers that +extract universal and discriminative features, (2) Noisy Bottleneck where +pre-existing Dropouts do all the noise injection tricks, (3) Linear Attention +that naturally cannot focus, and (4) Loose Reconstruction that does not force +layer-to-layer and point-by-point reconstruction. Extensive experiments are +conducted across popular anomaly detection benchmarks including MVTec-AD, VisA, +and Real-IAD. Our proposed Dinomaly achieves impressive image-level AUROC of +99.6%, 98.7%, and 89.3% on the three datasets respectively, which is not only +superior to state-of-the-art multi-class UAD methods, but also achieves the +most advanced class-separated UAD records. + +
+
+
+
+
+ + ♻ ☆ Breaking the Low-Rank Dilemma of Linear Attention + + +
+ The Softmax attention mechanism in Transformer models is notoriously +computationally expensive, particularly due to its quadratic complexity, posing +significant challenges in vision applications. In contrast, linear attention +provides a far more efficient solution by reducing the complexity to linear +levels. However, compared to Softmax attention, linear attention often +experiences significant performance degradation. Our experiments indicate that +this performance drop is due to the low-rank nature of linear attention's +feature map, which hinders its ability to adequately model complex spatial +information. In this paper, to break the low-rank dilemma of linear attention, +we conduct rank analysis from two perspectives: the KV buffer and the output +features. Consequently, we introduce Rank-Augmented Linear Attention (RALA), +which rivals the performance of Softmax attention while maintaining linear +complexity and high efficiency. Based on RALA, we construct the Rank-Augmented +Vision Linear Transformer (RAVLT). Extensive experiments demonstrate that RAVLT +achieves excellent performance across various vision tasks. Specifically, +without using any additional labels, data, or supervision during training, +RAVLT achieves an 84.4% Top-1 accuracy on ImageNet-1k with only 26M parameters +and 4.6G FLOPs. This result significantly surpasses previous linear attention +mechanisms, fully illustrating the potential of RALA. Code will be available at +https://github.com/qhfan/RALA. + +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for Spatio-Spectral Compression of + Hyperspectral Images RSS + + +
+ The development of deep learning-based models for the compression of +hyperspectral images (HSIs) has recently attracted great attention in remote +sensing due to the sharp growth of hyperspectral data archives. Most of the +existing models achieve either spectral or spatial compression, and do not +jointly consider the spatio-spectral redundancies present in HSIs. To address +this problem, in this paper we focus our attention on the High Fidelity +Compression (HiFiC) model (which is proven to be highly effective for spatial +compression problems) and adapt it to perform spatio-spectral compression of +HSIs. In detail, we introduce two new models: i) HiFiC using Squeeze and +Excitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC with 3D +convolutions (denoted as HiFiC$_{3D}$) in the framework of compression of HSIs. +We analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$ in compressing +the spatio-spectral redundancies with channel attention and inter-dependency +analysis. Experimental results show the efficacy of the proposed models in +performing spatio-spectral compression, while reconstructing images at reduced +bitrates with higher reconstruction quality. The code of the proposed models is +publicly available at https://git.tu-berlin.de/rsim/HSI-SSC . + +
+
+ comment: Accepted at 14th IEEE GRSS Workshop on Hyperspectral Image and Signal + Processing: Evolution in Remote Sensing (WHISPERS), 2024 +
+
+
+
+
+ + ♻ ☆ CFCPalsy: Facial Image Synthesis with Cross-Fusion Cycle Diffusion Model + for Facial Paralysis Individuals + + +
+ Currently, the diagnosis of facial paralysis remains a challenging task, +often relying heavily on the subjective judgment and experience of clinicians, +which can introduce variability and uncertainty in the assessment process. One +promising application in real-life situations is the automatic estimation of +facial paralysis. However, the scarcity of facial paralysis datasets limits the +development of robust machine learning models for automated diagnosis and +therapeutic interventions. To this end, this study aims to synthesize a +high-quality facial paralysis dataset to address this gap, enabling more +accurate and efficient algorithm training. Specifically, a novel Cross-Fusion +Cycle Palsy Expression Generative Model (CFCPalsy) based on the diffusion model +is proposed to combine different features of facial information and enhance the +visual details of facial appearance and texture in facial regions, thus +creating synthetic facial images that accurately represent various degrees and +types of facial paralysis. We have qualitatively and quantitatively evaluated +the proposed method on the commonly used public clinical datasets of facial +paralysis to demonstrate its effectiveness. Experimental results indicate that +the proposed method surpasses state-of-the-art methods, generating more +realistic facial images and maintaining identity consistency. + +
+
+
+
+
+ + ♻ ☆ Spatial Re-parameterization for N:M Sparsity + + +
+ This paper presents a Spatial Re-parameterization (SpRe) method for the N:M +sparsity in CNNs. SpRe stems from an observation regarding the restricted +variety in spatial sparsity present in N:M sparsity compared with unstructured +sparsity. Particularly, N:M sparsity exhibits a fixed sparsity rate within the +spatial domains due to its distinctive pattern that mandates N non-zero +components among M successive weights in the input channel dimension of +convolution filters. On the contrary, we observe that unstructured sparsity +displays a substantial divergence in sparsity across the spatial domains, which +we experimentally verified to be very crucial for its robust performance +retention compared with N:M sparsity. Therefore, SpRe employs the +spatial-sparsity distribution of unstructured sparsity to assign an extra +branch in conjunction with the original N:M branch at training time, which +allows the N:M sparse network to sustain a similar distribution of spatial +sparsity with unstructured sparsity. During inference, the extra branch can be +further re-parameterized into the main N:M branch, without exerting any +distortion on the sparse pattern or additional computation costs. SpRe has +achieved a commendable feat by matching the performance of N:M sparsity methods +with state-of-the-art unstructured sparsity methods across various benchmarks. +Code and models are anonymously available at +https://github.com/zyxxmu/SpRe. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination WACV + + +
+ In the ever-evolving adversarial machine learning landscape, developing +effective defenses against patch attacks has become a critical challenge, +necessitating reliable solutions to safeguard real-world AI systems. Although +diffusion models have shown remarkable capacity in image synthesis and have +been recently utilized to counter $\ell_p$-norm bounded attacks, their +potential in mitigating localized patch attacks remains largely underexplored. +In this work, we propose DiffPAD, a novel framework that harnesses the power of +diffusion models for adversarial patch decontamination. DiffPAD first performs +super-resolution restoration on downsampled input images, then adopts +binarization, dynamic thresholding scheme and sliding window for effective +localization of adversarial patches. Such a design is inspired by the +theoretically derived correlation between patch size and diffusion restoration +error that is generalized across diverse patch attack scenarios. Finally, +DiffPAD applies inpainting techniques to the original input images with the +estimated patch region being masked. By integrating closed-form solutions for +super-resolution restoration and image inpainting into the conditional reverse +sampling process of a pre-trained diffusion model, DiffPAD obviates the need +for text guidance or fine-tuning. Through comprehensive experiments, we +demonstrate that DiffPAD not only achieves state-of-the-art adversarial +robustness against patch attacks but also excels in recovering naturalistic +images without patch remnants. The source code is available at +https://github.com/JasonFu1998/DiffPAD. + +
+
+ comment: Accepted to 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ♻ ☆ TE-NeXt: A LiDAR-Based 3D Sparse Convolutional Network for + Traversability Estimation + + +
+ This paper presents TE-NeXt, a novel and efficient architecture for +Traversability Estimation (TE) from sparse LiDAR point clouds based on a +residual convolution block. TE-NeXt block fuses notions of current trends such +as attention mechanisms and 3D sparse convolutions. TE-NeXt aims to demonstrate +high capacity for generalisation in a variety of urban and natural +environments, using well-known and accessible datasets such as SemanticKITTI, +Rellis-3D and SemanticUSL. Thus, the designed architecture outperforms +state-of-the-art methods in the problem of semantic segmentation, demonstrating +better results in unstructured environments and maintaining high reliability +and robustness in urban environments, which leads to better abstraction. +Implementation is available in an open repository to the scientific community +with the aim of ensuring the reproducibility of results. + +
+
+ comment: This work has been submitted to the Expert Systems With applications +
+
+
+
+
+ + ♻ ☆ Improving Arabic Multi-Label Emotion Classification using Stacked + Embeddings and Hybrid Loss Function + + +
+ In multi-label emotion classification, particularly for low-resource +languages like Arabic, the challenges of class imbalance and label correlation +hinder model performance, especially in accurately predicting minority +emotions. To address these issues, this study proposes a novel approach that +combines stacked embeddings, meta-learning, and a hybrid loss function to +enhance multi-label emotion classification for the Arabic language. The study +extracts contextual embeddings from three fine-tuned language +models-ArabicBERT, MarBERT, and AraBERT-which are then stacked to form enriched +embeddings. A meta-learner is trained on these stacked embeddings, and the +resulting concatenated representations are provided as input to a Bi-LSTM +model, followed by a fully connected neural network for multi-label +classification. To further improve performance, a hybrid loss function is +introduced, incorporating class weighting, label correlation matrix, and +contrastive learning, effectively addressing class imbalances and improving the +handling of label correlations. Extensive experiments validate the proposed +model's performance across key metrics such as Precision, Recall, F1-Score, +Jaccard Accuracy, and Hamming Loss. The class-wise performance analysis +demonstrates the hybrid loss function's ability to significantly reduce +disparities between majority and minority classes, resulting in a more balanced +emotion classification. An ablation study highlights the contribution of each +component, showing the superiority of the model compared to baseline approaches +and other loss functions. This study not only advances multi-label emotion +classification for Arabic but also presents a generalizable framework that can +be adapted to other languages and domains, providing a significant step forward +in addressing the challenges of low-resource emotion classification tasks. + +
+
+ comment: The paper is submitted in Scientific Reports and is currently under + review +
+
+
+
+
+ + ♻ ☆ IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of + brain MR images + + +
+ In MRI studies, the aggregation of imaging data from multiple acquisition +sites enhances sample size but may introduce site-related variabilities that +hinder consistency in subsequent analyses. Deep learning methods for image +translation have emerged as a solution for harmonizing MR images across sites. +In this study, we introduce IGUANe (Image Generation with Unified Adversarial +Networks), an original 3D model that leverages the strengths of domain +translation and straightforward application of style transfer methods for +multicenter brain MR image harmonization. IGUANe extends CycleGAN by +integrating an arbitrary number of domains for training through a many-to-one +architecture. The framework based on domain pairs enables the implementation of +sampling strategies that prevent confusion between site-related and biological +variabilities. During inference, the model can be applied to any image, even +from an unknown acquisition site, making it a universal generator for +harmonization. Trained on a dataset comprising T1-weighted images from 11 +different scanners, IGUANe was evaluated on data from unseen sites. The +assessments included the transformation of MR images with traveling subjects, +the preservation of pairwise distances between MR images within domains, the +evolution of volumetric patterns related to age and Alzheimer's disease (AD), +and the performance in age regression and patient classification tasks. +Comparisons with other harmonization and normalization methods suggest that +IGUANe better preserves individual information in MR images and is more +suitable for maintaining and reinforcing variabilities related to age and AD. +Future studies may further assess IGUANe in other multicenter contexts, either +using the same model or retraining it for applications to different image +modalities. IGUANe is available at +https://github.com/RocaVincent/iguane_harmonization.git. + +
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ MikuDance: Animating Character Art with Mixed Motion Dynamics + + +
+ We propose MikuDance, a diffusion-based pipeline incorporating mixed motion +dynamics to animate stylized character art. MikuDance consists of two key +techniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the +challenges of high-dynamic motion and reference-guidance misalignment in +character art animation. Specifically, a Scene Motion Tracking strategy is +presented to explicitly model the dynamic camera in pixel-wise space, enabling +unified character-scene motion modeling. Building on this, the Mixed-Control +Diffusion implicitly aligns the scale and body shape of diverse characters with +motion guidance, allowing flexible control of local character motion. +Subsequently, a Motion-Adaptive Normalization module is incorporated to +effectively inject global scene motion, paving the way for comprehensive +character art animation. Through extensive experiments, we demonstrate the +effectiveness and generalizability of MikuDance across various character art +and motion guidance, consistently producing high-quality animations with +remarkable motion dynamics. + +
+
+
+
+
+ + ♻ ☆ Machine learning-enabled velocity model building with uncertainty + quantification + + +
+ Accurately characterizing migration velocity models is crucial for a wide +range of geophysical applications, from hydrocarbon exploration to monitoring +of CO2 sequestration projects. Traditional velocity model building methods such +as Full-Waveform Inversion (FWI) are powerful but often struggle with the +inherent complexities of the inverse problem, including noise, limited +bandwidth, receiver aperture and computational constraints. To address these +challenges, we propose a scalable methodology that integrates generative +modeling, in the form of Diffusion networks, with physics-informed summary +statistics, making it suitable for complicated imaging problems including field +datasets. By defining these summary statistics in terms of subsurface-offset +image volumes for poor initial velocity models, our approach allows for +computationally efficient generation of Bayesian posterior samples for +migration velocity models that offer a useful assessment of uncertainty. To +validate our approach, we introduce a battery of tests that measure the quality +of the inferred velocity models, as well as the quality of the inferred +uncertainties. With modern synthetic datasets, we reconfirm gains from using +subsurface-image gathers as the conditioning observable. For complex velocity +model building involving salt, we propose a new iterative workflow that refines +amortized posterior approximations with salt flooding and demonstrate how the +uncertainty in the velocity model can be propagated to the final product +reverse time migrated images. Finally, we present a proof of concept on field +datasets to show that our method can scale to industry-sized problems. + +
+
+
+
+
+ + ♻ ☆ MotionDreamer: Exploring Semantic Video Diffusion features for Zero-Shot + 3D Mesh Animation + + +
+ Animation techniques bring digital 3D worlds and characters to life. However, +manual animation is tedious and automated techniques are often specialized to +narrow shape classes. In our work, we propose a technique for automatic +re-animation of various 3D shapes based on a motion prior extracted from a +video diffusion model. Unlike existing 4D generation methods, we focus solely +on the motion, and we leverage an explicit mesh-based representation compatible +with existing computer-graphics pipelines. Furthermore, our utilization of +diffusion features enhances accuracy of our motion fitting. We analyze efficacy +of these features for animation fitting and we experimentally validate our +approach for two different diffusion models and four animation models. Finally, +we demonstrate that our time-efficient zero-shot method achieves a superior +performance re-animating a diverse set of 3D shapes when compared to existing +techniques in a user study. The project website is located at +https://lukas.uzolas.com/MotionDreamer. + +
+
+
+
+
+ + ♻ ☆ ROCKET-1: Mastering Open-World Interaction with Visual-Temporal Context + Prompting + + +
+ Vision-language models (VLMs) have excelled in multimodal tasks, but adapting +them to embodied decision-making in open-world environments presents +challenges. One critical issue is bridging the gap between discrete entities in +low-level observations and the abstract concepts required for effective +planning. A common solution is building hierarchical agents, where VLMs serve +as high-level reasoners that break down tasks into executable sub-tasks, +typically specified using language. However, language suffers from the +inability to communicate detailed spatial information. We propose +visual-temporal context prompting, a novel communication protocol between VLMs +and policy models. This protocol leverages object segmentation from past +observations to guide policy-environment interactions. Using this approach, we +train ROCKET-1, a low-level policy that predicts actions based on concatenated +visual observations and segmentation masks, supported by real-time object +tracking from SAM-2. Our method unlocks the potential of VLMs, enabling them to +tackle complex tasks that demand spatial reasoning. Experiments in Minecraft +show that our approach enables agents to achieve previously unattainable tasks, +with a $\mathbf{76}\%$ absolute improvement in open-world interaction +performance. Codes and demos are now available on the project page: +https://craftjarvis.github.io/ROCKET-1. + +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding +performance in violence surveillance tasks. However, most of them were +black-box systems which faced challenges regarding explainability during +training and inference processes. An important question is how to incorporate +explicit knowledge into these implicit models, thereby designing expert-driven +and interpretable violence surveillance systems. This paper proposes a new +paradigm for weakly supervised violence monitoring (WSVM) called Rule base +Violence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure +with different designs for images and text. One of the branches is called the +implicit branch, which uses only visual features for coarse-grained binary +classification. In this branch, image feature extraction is divided into two +channels: one responsible for extracting scene frames and the other focusing on +extracting actions. The other branch is called the explicit branch, which +utilizes language-image alignment to perform fine-grained classification. For +the language channel design in the explicit branch, the proposed RuleVM uses +the state-of-the-art YOLOWorld model to detect objects in video frames, and +association rules are identified through data mining methods as descriptions of +the video. Leveraging the dual-branch architecture, RuleVM achieves +interpretable coarse-grained and fine-grained violence surveillance. Extensive +experiments were conducted on two commonly used benchmarks, and the results +show that RuleVM achieved the best performance in both coarse-grained and +fine-grained monitoring, significantly outperforming existing state-of-the-art +methods. Moreover, interpretability experiments uncovered some interesting +rules, such as the observation that as the number of people increases, the risk +level of violent behavior also rises. + +
+
+ comment: 12 pages,7 figures IEEE TSMCA (Under review) +
+
+
+
+
+ + ♻ ☆ V2X Cooperative Perception for Autonomous Driving: Recent Advances and + Challenges + + +
+ Achieving fully autonomous driving with heightened safety and efficiency +depends on vehicle-to-everything (V2X) cooperative perception (CP), which +allows vehicles to share perception data, thereby enhancing situational +awareness and overcoming the limitations of the sensing ability of individual +vehicles. V2X CP is crucial for extending perception range, improving accuracy, +and strengthening the decision-making and control capabilities of autonomous +vehicles in complex environments. This paper provides a comprehensive survey of +recent advances in V2X CP, introducing mathematical models of CP processes +across various collaboration strategies. We examine essential techniques for +reliable perception sharing, including agent selection, data alignment, and +fusion methods. Key issues are analyzed, such as agent and model heterogeneity, +perception uncertainty, and the impact of V2X communication constraints like +delays and data loss on CP effectiveness. To inspire further advancements in +V2X CP, we outline promising avenues, including privacy-preserving artificial +intelligence (AI), collaborative AI, and integrated sensing frameworks, as +pathways to enhance CP capabilities. + +
+
+
+
+
+ + ♻ ☆ TinyVLA: Towards Fast, Data-Efficient Vision-Language-Action Models for + Robotic Manipulation + + +
+ Vision-Language-Action (VLA) models have shown remarkable potential in +visuomotor control and instruction comprehension through end-to-end learning +processes. However, current VLA models face significant challenges: they are +slow during inference and require extensive pre-training on large amounts of +robotic data, making real-world deployment difficult. In this paper, we +introduce a new family of compact vision-language-action models, called +TinyVLA, which offers two key advantages over existing VLA models: (1) faster +inference speeds, and (2) improved data efficiency, eliminating the need for +pre-training stage. Our framework incorporates two essential components to +build TinyVLA: (1) initializing the policy backbone with robust, high-speed +multimodal models, and (2) integrating a diffusion policy decoder during +fine-tuning to enable precise robot actions. We conducted extensive evaluations +of TinyVLA in both simulation and on real robots, demonstrating that our +approach significantly outperforms the state-of-the-art VLA model, OpenVLA, in +terms of speed and data efficiency, while delivering comparable or superior +performance. Additionally, TinyVLA exhibits strong generalization capabilities +across various dimensions, including language instructions, novel objects, +unseen positions, changes in object appearance, background variations, and +environmental shifts, often matching or exceeding the performance of OpenVLA. +We believe that TinyVLA offers an interesting perspective on utilizing +pre-trained multimodal models for policy learning. Our project is at +https://tiny-vla.github.io. + +
+
+ comment: add more citations +
+
+
+
+
+ + ♻ ☆ Grounding is All You Need? Dual Temporal Grounding for Video Dialog + + +
+ In the realm of video dialog response generation, the understanding of video +content and the temporal nuances of conversation history are paramount. While a +segment of current research leans heavily on large-scale pretrained +visual-language models and often overlooks temporal dynamics, another delves +deep into spatial-temporal relationships within videos but demands intricate +object trajectory pre-extractions and sidelines dialog temporal dynamics. This +paper introduces the Dual Temporal Grounding-enhanced Video Dialog model +(DTGVD), strategically designed to merge the strengths of both dominant +approaches. It emphasizes dual temporal relationships by predicting dialog +turn-specific temporal regions, filtering video content accordingly, and +grounding responses in both video and dialog contexts. One standout feature of +DTGVD is its heightened attention to chronological interplay. By recognizing +and acting upon the dependencies between different dialog turns, it captures +more nuanced conversational dynamics. To further bolster the alignment between +video and dialog temporal dynamics, we've implemented a list-wise contrastive +learning strategy. Within this framework, accurately grounded turn-clip +pairings are designated as positive samples, while less precise pairings are +categorized as negative. This refined classification is then funneled into our +holistic end-to-end response generation mechanism. Evaluations using +AVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our +methodology. + +
+
+
+
+
+ + ♻ ☆ ViTGaze: Gaze Following with Interaction Features in Vision Transformers + + +
+ Gaze following aims to interpret human-scene interactions by predicting the +person's focal point of gaze. Prevailing approaches often adopt a two-stage +framework, whereby multi-modality information is extracted in the initial stage +for gaze target prediction. Consequently, the efficacy of these methods highly +depends on the precision of the preceding modality extraction. Others use a +single-modality approach with complex decoders, increasing network +computational load. Inspired by the remarkable success of pre-trained plain +vision transformers (ViTs), we introduce a novel single-modality gaze following +framework called ViTGaze. In contrast to previous methods, it creates a novel +gaze following framework based mainly on powerful encoders (relative decoder +parameters less than 1%). Our principal insight is that the inter-token +interactions within self-attention can be transferred to interactions between +humans and scenes. Leveraging this presumption, we formulate a framework +consisting of a 4D interaction encoder and a 2D spatial guidance module to +extract human-scene interaction information from self-attention maps. +Furthermore, our investigation reveals that ViT with self-supervised +pre-training has an enhanced ability to extract correlation information. Many +experiments have been conducted to demonstrate the performance of the proposed +method. Our method achieves state-of-the-art (SOTA) performance among all +single-modality methods (3.4% improvement in the area under curve (AUC) score, +5.1% improvement in the average precision (AP)) and very comparable performance +against multi-modality methods with 59% fewer parameters. + +
+
+ comment: 15 pages; Accepted by Visual Intelligence +
+
+
+
+
+ + ♻ ☆ One Homography is All You Need: IMM-based Joint Homography and Multiple + Object State Estimation + + +
+ A novel online MOT algorithm, IMM Joint Homography State Estimation +(IMM-JHSE), is proposed. IMM-JHSE uses an initial homography estimate as the +only additional 3D information, whereas other 3D MOT methods use regular 3D +measurements. By jointly modelling the homography matrix and its dynamics as +part of track state vectors, IMM-JHSE removes the explicit influence of camera +motion compensation techniques on predicted track position states, which was +prevalent in previous approaches. Expanding upon this, static and dynamic +camera motion models are combined using an IMM filter. A simple bounding box +motion model is used to predict bounding box positions to incorporate image +plane information. In addition to applying an IMM to camera motion, a +non-standard IMM approach is applied where bounding-box-based BIoU scores are +mixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to +perform association only, making IMM-JHSE robust to motion away from the ground +plane. Finally, IMM-JHSE makes use of dynamic process and measurement noise +estimation techniques. IMM-JHSE improves upon related techniques, including +UCMCTrack, OC-SORT, C-BIoU and ByteTrack on the DanceTrack and KITTI-car +datasets, increasing HOTA by 2.64 and 2.11, respectively, while offering +competitive performance on the MOT17, MOT20 and KITTI-pedestrian datasets. +Using publicly available detections, IMM-JHSE outperforms almost all other 2D +MOT methods and is outperformed only by 3D MOT methods -- some of which are +offline -- on the KITTI-car dataset. Compared to tracking-by-attention methods, +IMM-JHSE shows remarkably similar performance on the DanceTrack dataset and +outperforms them on the MOT17 dataset. The code is publicly available: +https://github.com/Paulkie99/imm-jhse. + +
+
+ comment: Preprint submitted to Information Fusion +
+
+
+
+
+ + ♻ ☆ MEGA: Masked Generative Autoencoder for Human Mesh Recovery + + +
+ Human Mesh Recovery (HMR) from a single RGB image is a highly ambiguous +problem, as an infinite set of 3D interpretations can explain the 2D +observation equally well. Nevertheless, most HMR methods overlook this issue +and make a single prediction without accounting for this ambiguity. A few +approaches generate a distribution of human meshes, enabling the sampling of +multiple predictions; however, none of them is competitive with the latest +single-output model when making a single prediction. This work proposes a new +approach based on masked generative modeling. By tokenizing the human pose and +shape, we formulate the HMR task as generating a sequence of discrete tokens +conditioned on an input image. We introduce MEGA, a MaskEd Generative +Autoencoder trained to recover human meshes from images and partial human mesh +token sequences. Given an image, our flexible generation scheme allows us to +predict a single human mesh in deterministic mode or to generate multiple human +meshes in stochastic mode. Experiments on in-the-wild benchmarks show that MEGA +achieves state-of-the-art performance in deterministic and stochastic modes, +outperforming single-output and multi-output approaches. + +
+
+
+
+
+ + ♻ ☆ A Smartphone-Based Method for Assessing Tomato Nutrient Status through + Trichome Density Measurement + + +
+ Early detection of fertilizer-induced stress in tomato plants is crucial for +optimizing crop yield through timely management interventions. While +conventional optical methods struggle to detect fertilizer stress in young +leaves, these leaves contain valuable diagnostic information through their +microscopic hair-like structures, particularly trichomes, which existing +approaches have overlooked. This study introduces a smartphone-based +noninvasive technique that leverages mobile computing and digital imaging +capabilities to quantify trichome density on young leaves with superior +detection latency. Our method uniquely combines augmented reality technology +with image processing algorithms to analyze trichomes transferred onto +specialized measurement paper. A robust automated pipeline processes these +images through region extraction, perspective transformation, and illumination +correction to precisely quantify trichome density. Validation experiments on +hydroponically grown tomatoes under varying fertilizer conditions demonstrated +the method's effectiveness. Leave-one-out cross-validation revealed strong +predictive performance with the area under the precision-recall curve (PR-AUC: +0.82) and area under the receiver operating characteristic curve (ROC-AUC: +0.64), while the predicted and observed trichome densities exhibited high +correlation ($r = 0.79$). This innovative approach transforms smartphones into +precise diagnostic tools for plant nutrition assessment, offering a practical, +cost-effective solution for precision agriculture. + +
+
+
+
+
+ + ♻ ☆ Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation + + +
+ In view of the fact that semi- and self-supervised learning share a +fundamental principle, effectively modeling knowledge from unlabeled data, +various semi-supervised semantic segmentation methods have integrated +representative self-supervised learning paradigms for further regularization. +However, the potential of the state-of-the-art generative self-supervised +paradigm, masked image modeling, has been scarcely studied. This paradigm +learns the knowledge through establishing connections between the masked and +visible parts of masked image, during the pixel reconstruction process. By +inheriting and extending this insight, we successfully leverage masked image +modeling to boost semi-supervised semantic segmentation. Specifically, we +introduce a novel class-wise masked image modeling that independently +reconstructs different image regions according to their respective classes. In +this way, the mask-induced connections are established within each class, +mitigating the semantic confusion that arises from plainly reconstructing +images in basic masked image modeling. To strengthen these intra-class +connections, we further develop a feature aggregation strategy that minimizes +the distances between features corresponding to the masked and visible parts +within the same class. Additionally, in semantic space, we explore the +application of masked image modeling to enhance regularization. Extensive +experiments conducted on well-known benchmarks demonstrate that our approach +achieves state-of-the-art performance. The code will be available at +https://github.com/haoxt/S4MIM. + +
+
+ comment: 13 pages. This work has been submitted to the IEEE for possible + publication +
+
+
+
+
+ + ♻ ☆ I&S-ViT: An Inclusive & Stable Method for Pushing the Limit of + Post-Training ViTs Quantization + + +
+ Albeit the scalable performance of vision transformers (ViTs), the dense +computational costs (training & inference) undermine their position in +industrial applications. Post-training quantization (PTQ), tuning ViTs with a +tiny dataset and running in a low-bit format, well addresses the cost issue but +unluckily bears more performance drops in lower-bit cases. In this paper, we +introduce I&S-ViT, a novel method that regulates the PTQ of ViTs in an +inclusive and stable fashion. I&S-ViT first identifies two issues in the PTQ of +ViTs: (1) Quantization inefficiency in the prevalent log2 quantizer for +post-Softmax activations; (2) Rugged and magnified loss landscape in +coarse-grained quantization granularity for post-LayerNorm activations. Then, +I&S-ViT addresses these issues by introducing: (1) A novel shift-uniform-log2 +quantizer (SULQ) that incorporates a shift mechanism followed by uniform +quantization to achieve both an inclusive domain representation and accurate +distribution approximation; (2) A three-stage smooth optimization strategy +(SOS) that amalgamates the strengths of channel-wise and layer-wise +quantization to enable stable learning. Comprehensive evaluations across +diverse vision tasks validate I&S-ViT's superiority over existing PTQ of ViTs +methods, particularly in low-bit scenarios. For instance, I&S-ViT elevates the +performance of 3-bit ViT-B by an impressive 50.68%. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey for Hyperspectral Image Classification: The + Evolution from Conventional to Transformers and Mamba Models + + +
+ Hyperspectral Image Classification (HSC) presents significant challenges +owing to the high dimensionality and intricate nature of Hyperspectral (HS) +data. While traditional Machine Learning (TML) approaches have demonstrated +effectiveness, they often encounter substantial obstacles in real-world +applications, including the variability of optimal feature sets, subjectivity +in human-driven design, inherent biases, and methodological limitations. +Specifically, TML suffers from the curse of dimensionality, difficulties in +feature selection and extraction, insufficient consideration of spatial +information, limited robustness against noise, scalability issues, and +inadequate adaptability to complex data distributions. In recent years, Deep +Learning (DL) techniques have emerged as robust solutions to address these +challenges. This survey offers a comprehensive overview of current trends and +future prospects in HSC, emphasizing advancements from DL models to the +increasing adoption of Transformer and Mamba Model architectures. We +systematically review key concepts, methodologies, and state-of-the-art +approaches in DL for HSC. Furthermore, we investigate the potential of +Transformer-based models and the Mamba Model in HSC, detailing their advantages +and challenges. Emerging trends in HSC are explored, including in-depth +discussions on Explainable AI and Interoperability concepts, alongside +Diffusion Models for image denoising, feature extraction, and image fusion. +Comprehensive experimental results were conducted on three HS datasets to +substantiate the efficacy of various conventional DL models and Transformers. +Additionally, we identify several open challenges and pertinent research +questions in the field of HSC. Finally, we outline future research directions +and potential applications aimed at enhancing the accuracy and efficiency of +HSC. + +
+
+
+
+
+ + ♻ ☆ GaussianCity: Generative Gaussian Splatting for Unbounded 3D City + Generation + + +
+ 3D city generation with NeRF-based methods shows promising generation results +but is computationally inefficient. Recently 3D Gaussian Splatting (3D-GS) has +emerged as a highly efficient alternative for object-level 3D generation. +However, adapting 3D-GS from finite-scale 3D objects and humans to +infinite-scale 3D cities is non-trivial. Unbounded 3D city generation entails +significant storage overhead (out-of-memory issues), arising from the need to +expand points to billions, often demanding hundreds of Gigabytes of VRAM for a +city scene spanning 10km^2. In this paper, we propose GaussianCity, a +generative Gaussian Splatting framework dedicated to efficiently synthesizing +unbounded 3D cities with a single feed-forward pass. Our key insights are +two-fold: 1) Compact 3D Scene Representation: We introduce BEV-Point as a +highly compact intermediate representation, ensuring that the growth in VRAM +usage for unbounded scenes remains constant, thus enabling unbounded city +generation. 2) Spatial-aware Gaussian Attribute Decoder: We present +spatial-aware BEV-Point decoder to produce 3D Gaussian attributes, which +leverages Point Serializer to integrate the structural and contextual +characteristics of BEV points. Extensive experiments demonstrate that +GaussianCity achieves state-of-the-art results in both drone-view and +street-view 3D city generation. Notably, compared to CityDreamer, GaussianCity +exhibits superior performance with a speedup of 60 times (10.72 FPS v.s. 0.18 +FPS). + +
+
+
+
+
+ + ♻ ☆ Category-Extensible Out-of-Distribution Detection via Hierarchical + Context Descriptions NeurIPS 2023 + + +
+ The key to OOD detection has two aspects: generalized feature representation +and precise category description. Recently, vision-language models such as CLIP +provide significant advances in both two issues, but constructing precise +category descriptions is still in its infancy due to the absence of unseen +categories. This work introduces two hierarchical contexts, namely perceptual +context and spurious context, to carefully describe the precise category +boundary through automatic prompt tuning. Specifically, perceptual contexts +perceive the inter-category difference (e.g., cats vs apples) for current +classification tasks, while spurious contexts further identify spurious +(similar but exactly not) OOD samples for every single category (e.g., cats vs +panthers, apples vs peaches). The two contexts hierarchically construct the +precise description for a certain category, which is, first roughly classifying +a sample to the predicted category and then delicately identifying whether it +is truly an ID sample or actually OOD. Moreover, the precise descriptions for +those categories within the vision-language framework present a novel +application: CATegory-EXtensible OOD detection (CATEX). One can efficiently +extend the set of recognizable categories by simply merging the hierarchical +contexts learned under different sub-task settings. And extensive experiments +are conducted to demonstrate CATEX's effectiveness, robustness, and +category-extensibility. For instance, CATEX consistently surpasses the rivals +by a large margin with several protocols on the challenging ImageNet-1K +dataset. In addition, we offer new insights on how to efficiently scale up the +prompt engineering in vision-language models to recognize thousands of object +categories, as well as how to incorporate large language models (like GPT-3) to +boost zero-shot applications. Code is publicly available at +https://github.com/alibaba/catex. + +
+
+ comment: Accepted by 37th Conference on Neural Information Processing Systems + (NeurIPS 2023). Code is available at https://github.com/alibaba/catex +
+
+
+
+
+ + ♻ ☆ Projecting Gaussian Ellipsoids While Avoiding Affine Projection + Approximation + + +
+ Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its +real-time rendering speed and state-of-the-art rendering quality. However, +during the rendering process, the use of the Jacobian of the affine +approximation of the projection transformation leads to inevitable errors, +resulting in blurriness, artifacts and a lack of scene consistency in the final +rendered images. To address this issue, we introduce an ellipsoid-based +projection method to calculate the projection of Gaussian ellipsoid onto the +image plane, which is the primitive of 3D Gaussian Splatting. As our proposed +ellipsoid-based projection method cannot handle Gaussian ellipsoids with camera +origins inside them or parts lying below $z=0$ plane in the camera space, we +designed a pre-filtering strategy. Experiments over multiple widely adopted +benchmark datasets show that our ellipsoid-based projection method can enhance +the rendering quality of 3D Gaussian Splatting and its extensions. + +
+
+
+
+
+ + ♻ ☆ STARFlow: Spatial Temporal Feature Re-embedding with Attentive Learning + for Real-world Scene Flow 3DV 2025 + + +
+ Scene flow prediction is a crucial underlying task in understanding dynamic +scenes as it offers fundamental motion information. However, contemporary scene +flow methods encounter three major challenges. Firstly, flow estimation solely +based on local receptive fields lacks long-dependency matching of point pairs. +To address this issue, we propose global attentive flow embedding to match +all-to-all point pairs in both feature space and Euclidean space, providing +global initialization before local refinement. Secondly, there are deformations +existing in non-rigid objects after warping, which leads to variations in the +spatiotemporal relation between the consecutive frames. For a more precise +estimation of residual flow, a spatial temporal feature re-embedding module is +devised to acquire the sequence features after deformation. Furthermore, +previous methods generalize poorly due to the significant domain gap +between the synthesized and LiDAR-scanned datasets. We leverage novel domain +adaptive losses to effectively bridge the gap of motion inference from +synthetic to real-world. Experiments demonstrate that our approach achieves +state-of-the-art performance across various datasets, with particularly +outstanding results on real-world LiDAR-scanned datasets. Our code is available +at https://github.com/O-VIGIA/StarFlow. + +
+
+ comment: This paper was renamed to:"SSRFlow: Semantic-aware Fusion with + Spatial Temporal Re-embedding for Real-world Scene Flow" [arXiv:2408.07825] + and was accepted in 3DV 2025 +
+
+
+
+
+ + ♻ ☆ USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in + 2020s + + +
+ Image/video coding has been a remarkable research area for both academia and +industry for many years. Testing datasets, especially high-quality image/video +datasets are desirable for the justified evaluation of coding-related research, +practical applications, and standardization activities. We put forward a test +dataset namely USTC-TD, which has been successfully adopted in the practical +end-to-end image/video coding challenge of the IEEE International Conference on +Visual Communications and Image Processing (VCIP) in 2022 and 2023. USTC-TD +contains 40 images at 4K spatial resolution and 10 video sequences at 1080p +spatial resolution, featuring various content due to the diverse environmental +factors (e.g. scene type, texture, motion, view) and the designed imaging +factors (e.g. illumination, lens, shadow). We quantitatively evaluate USTC-TD +on different image/video features (spatial, temporal, color, lightness), and +compare it with the previous image/video test datasets, which verifies the +wider coverage and more diversity of the proposed dataset. We also evaluate +both classic standardized and recent learned image/video coding schemes on +USTC-TD with PSNR and MS-SSIM, and provide an extensive benchmark for the +evaluated schemes. Based on the characteristics and specific design of the +proposed test dataset, we analyze the benchmark performance and shed light on +the future research and development of image/video coding. All the data are +released online: https://esakak.github.io/USTC-TD . + +
+
+ comment: 23 pages. Project Page: https://esakak.github.io/USTC-TD +
+
+
+
+
+ + ♻ ☆ Learning Diffeomorphism for Image Registration with Time-Continuous + Networks using Semigroup Regularization + + +
+ Diffeomorphic image registration (DIR) is a critical task in 3D medical image +analysis, aimed at finding topology preserving deformations between pairs of +images. Focusing on the solution of the flow map differential equation as the +diffeomorphic deformation, recent methods use discrete timesteps along with +various regularization terms to penalize the negative determinant of Jacobian +and impose smoothness of the solution vector field. In this paper, we propose a +novel learning-based approach for diffeomorphic 3D-image registration which +finds the diffeomorphisms in the time continuum with only a single +regularization term and no additional integration. As one of the fundamental +properties of flow maps, we exploit the semigroup property as the only form of +regularization, ensuring temporally continuous diffeomorphic flows between +pairs of images. Leveraging this property, our method alleviates the need for +additional regularization terms and scaling and squaring integration during +both training and evaluation. To achieve time-continuous diffeomorphisms, we +employ time-embedded UNets, an architecture commonly utilized in diffusion +models. The proposed method reveals that ensuring diffeomorphism in a +continuous time interval leads to better registration results. Experimental +results on four public datasets demonstrate the superiority of our model over +both learning-based and optimization-based methods. + +
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Deep Learning Technology for Face Forgery Detection: A Survey + + +
+ Currently, the rapid development of computer vision and deep learning has +enabled the creation or manipulation of high-fidelity facial images and videos +via deep generative approaches. This technology, also known as deepfake, has +achieved dramatic progress and become increasingly popular in social media. +However, the technology can generate threats to personal privacy and national +security by spreading misinformation. To diminish the risks of deepfake, it is +desirable to develop powerful forgery detection methods to distinguish fake +faces from real faces. This paper presents a comprehensive survey of recent +deep learning-based approaches for facial forgery detection. We attempt to +provide the reader with a deeper understanding of the current advances as well +as the major challenges for deepfake detection based on deep learning. We +present an overview of deepfake techniques and analyse the characteristics of +various deepfake datasets. We then provide a systematic review of different +categories of deepfake detection and state-of-the-art deepfake detection +methods. The drawbacks of existing detection methods are analyzed, and future +research directions are discussed to address the challenges in improving both +the performance and generalization of deepfake detection. + +
+
+ comment: The paper "Deep Learning Technology for Face Forgery Detection: A + Survey" is hereby formally withdrawn. The reason for this withdrawal is that + I did not adequately consult and obtain proper authorization from the + corresponding author during the submission process. I sincerely apologize for + any inconvenience this may have caused the journal, reviewers, and readers +
+
+
+
+
+ + ♻ ☆ Prion-ViT: Prions-Inspired Vision Transformers for Temperature + prediction with Specklegrams + + +
+ Fiber Specklegram Sensors (FSS) are vital for environmental monitoring due to +their high temperature sensitivity, but their complex data poses challenges +for predictive models. This study introduces Prion-ViT, a prion-inspired Vision +Transformer model, inspired by biological prion memory mechanisms, to improve +long-term dependency modeling and temperature prediction accuracy using FSS +data. Prion-ViT leverages a persistent memory state to retain and propagate key +features across layers, reducing mean absolute error (MAE) to 0.52°C and +outperforming models like ResNet, Inception Net V2, and standard vision +transformers. This work highlights Prion-ViT's potential for real-time +industrial temperature monitoring and broader optical sensing applications. + +
+
+
+
+
+ + ♻ ☆ Can KAN Work? Exploring the Potential of Kolmogorov-Arnold Networks in + Computer Vision + + +
+ Kolmogorov-Arnold Networks (KANs), as a theoretically efficient neural network +architecture, have garnered attention for their potential in capturing complex +patterns. However, their application in computer vision remains relatively +unexplored. This study first analyzes the potential of KAN in computer vision +tasks, evaluating the performance of KAN and its convolutional variants in +image classification and semantic segmentation. The focus is placed on +examining their characteristics across varying data scales and noise levels. +Results indicate that while KAN exhibits stronger fitting capabilities, it is +highly sensitive to noise, limiting its robustness. To address this challenge, +we propose a smoothness regularization method and introduce a Segment +Deactivation technique. Both approaches enhance KAN's stability and +generalization, demonstrating its potential in handling complex visual data +tasks. + +
+
+
+
+
+ + ♻ ☆ Pubic Symphysis-Fetal Head Segmentation Using Pure Transformer with + Bi-level Routing Attention + + +
+ In this paper, we propose a method, named BRAU-Net, to solve the pubic +symphysis-fetal head segmentation task. The method adopts a U-Net-like pure +Transformer architecture with bi-level routing attention and skip connections, +which effectively learns local-global semantic information. The proposed +BRAU-Net was evaluated on the transperineal ultrasound image dataset from the +pubic symphysis-fetal head segmentation and angle of progression (FH-PS-AOP) +challenge. The results demonstrate that the proposed BRAU-Net achieves +a comparable final score. The codes will be available at +https://github.com/Caipengzhou/BRAU-Net. + +
+
+
+
+
+ + ♻ ☆ Vision Mamba: Efficient Visual Representation Learning with + Bidirectional State Space Model ICML 2024 + + +
+ Recently the state space models (SSMs) with efficient hardware-aware designs, +i.e., the Mamba deep learning model, have shown great potential for long +sequence modeling. Meanwhile building efficient and generic vision backbones +purely upon SSMs is an appealing direction. However, representing visual data +is challenging for SSMs due to the position-sensitivity of visual data and the +requirement of global context for visual understanding. In this paper, we show +that the reliance on self-attention for visual representation learning is not +necessary and propose a new generic vision backbone with bidirectional Mamba +blocks (Vim), which marks the image sequences with position embeddings and +compresses the visual representation with bidirectional state space models. On +ImageNet classification, COCO object detection, and ADE20k semantic +segmentation tasks, Vim achieves higher performance compared to +well-established vision transformers like DeiT, while also demonstrating +significantly improved computation & memory efficiency. For example, Vim is +2.8$\times$ faster than DeiT and saves 86.8% GPU memory when performing batch +inference to extract features on images with a resolution of 1248$\times$1248. +The results demonstrate that Vim is capable of overcoming the computation & +memory constraints on performing Transformer-style understanding for +high-resolution images and it has great potential to be the next-generation +backbone for vision foundation models. Code is available at +https://github.com/hustvl/Vim. + +
+
+ comment: Vision Mamba (Vim) is accepted by ICML 2024. Code is available at + https://github.com/hustvl/Vim +
+
+
+
+
+ + ♻ ☆ LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation + + +
+ CLIP is one of the most important multimodal foundational models today. What +powers CLIP's capabilities? The rich supervision signals provided by natural +language, the carrier of human knowledge, shape a powerful cross-modal +representation space. However, with the rapid advancements in large language +models (LLMs) like GPT-4 and LLaMA, the boundaries of language comprehension and +generation are continually being pushed. This raises an intriguing question: +can the capabilities of LLMs be harnessed to further improve multimodal +representation learning? The potential benefits of incorporating LLMs into CLIP +are clear. LLMs' strong textual understanding can fundamentally improve CLIP's +ability to handle image captions, drastically enhancing its ability to process +long and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs +are trained on a vast corpus of text, possessing open-world knowledge. This +allows them to expand on caption information during training, increasing the +efficiency of the learning process. In this paper, we propose LLM2CLIP, a novel +approach that embraces the power of LLMs to unlock CLIP's potential. By +fine-tuning the LLM in the caption space with contrastive learning, we extract +its textual capabilities into the output embeddings, significantly improving +the output layer's textual discriminability. We then design an efficient +training process where the fine-tuned LLM acts as a powerful teacher for CLIP's +visual encoder. Thanks to the LLM's presence, we can now incorporate longer and +more complex captions without being restricted by vanilla CLIP's text encoder's +context window and ability limitations. Our experiments demonstrate that this +approach brings substantial improvements in cross-modal tasks. + +
+
+
+
+
+ + ♻ ☆ Deciphering the Definition of Adversarial Robustness for post-hoc OOD + Detectors + + +
+ Detecting out-of-distribution (OOD) inputs is critical for safely deploying +deep learning models in real-world scenarios. In recent years, many OOD +detectors have been developed, and even the benchmarking has been standardized, +i.e. OpenOOD. The number of post-hoc detectors is growing fast. They are +showing an option to protect a pre-trained classifier against natural +distribution shifts and claim to be ready for real-world scenarios. However, +its effectiveness in dealing with adversarial examples (AdEx) has been +neglected in most studies. In cases where an OOD detector includes AdEx in its +experiments, the lack of uniform parameters for AdEx makes it difficult to +accurately evaluate the performance of the OOD detector. This paper +investigates the adversarial robustness of 16 post-hoc detectors against +various evasion attacks. It also discusses a roadmap for adversarial defense in +OOD detectors that would help adversarial robustness. We believe that level 1 +(AdEx on a unified dataset) should be added to any OOD detector to see the +limitations. The last level in the roadmap (defense against adaptive attacks) +we added for integrity from an adversarial machine learning (AML) point of +view, which we do not believe is the ultimate goal for OOD detectors. + +
+
+
+
+
+ + ♻ ☆ A Cognitive Architecture for Machine Consciousness and Artificial + Superintelligence: Thought Is Structured by the Iterative Updating of Working + Memory + + +
+ This article provides an analytical framework for how to simulate human-like +thought processes within a computer. It describes how attention and memory +should be structured, updated, and utilized to search for associative additions +to the stream of thought. The focus is on replicating the dynamics of the +mammalian working memory system, which features two forms of persistent +activity: sustained firing (preserving information on the order of seconds) and +synaptic potentiation (preserving information from minutes to hours). The +article uses a series of figures to systematically demonstrate how the +iterative updating of these working memory stores provides functional +organization to behavior, cognition, and awareness. + In a machine learning implementation, these two memory stores should be +updated continuously and in an iterative fashion. This means each state should +preserve a proportion of the coactive representations from the state before it +(where each representation is an ensemble of neural network nodes). This makes +each state a revised iteration of the preceding state and causes successive +configurations to overlap and blend with respect to the information they +contain. Thus, the set of concepts in working memory will evolve gradually and +incrementally over time. Transitions between states happen as persistent +activity spreads activation energy throughout the hierarchical network, +searching long-term memory for the most appropriate representation to be added +to the global workspace. The result is a chain of associatively linked +intermediate states capable of advancing toward a solution or goal. Iterative +updating is conceptualized here as an information processing strategy, a model +of working memory, a theory of consciousness, and an algorithm for designing +and programming artificial intelligence (AI, AGI, and ASI). + +
+
+ comment: 88 pages and 53 figures +
+
+
+
+
+ + ♻ ☆ Dense Connector for MLLMs NeurIPS 2024 + + +
+ Do we fully leverage the potential of visual encoder in Multimodal Large +Language Models (MLLMs)? The recent outstanding performance of MLLMs in +multimodal understanding has garnered broad attention from both academia and +industry. In the current MLLM rat race, the focus seems to be predominantly on +the linguistic side. We witness the rise of larger and higher-quality +instruction datasets, as well as the involvement of larger-sized LLMs. Yet, +scant attention has been directed towards the visual signals utilized by MLLMs, +often assumed to be the final high-level features extracted by a frozen visual +encoder. In this paper, we introduce the Dense Connector - a simple, effective, +and plug-and-play vision-language connector that significantly enhances +existing MLLMs by leveraging multi-layer visual features, with minimal +additional computational overhead. Building on this, we also propose the +Efficient Dense Connector, which achieves performance comparable to LLaVA-v1.5 +with only 25% of the visual tokens. Furthermore, our model, trained solely on +images, showcases remarkable zero-shot capabilities in video understanding as +well. Experimental results across various vision encoders, image resolutions, +training dataset scales, varying sizes of LLMs (2.7B->70B), and diverse +architectures of MLLMs (e.g., LLaVA-v1.5, LLaVA-NeXT and Mini-Gemini) validate +the versatility and scalability of our approach, achieving state-of-the-art +performance across 19 image and video benchmarks. We hope that this work will +provide valuable experience and serve as a basic module for future MLLM +development. Code is available at https://github.com/HJYao00/DenseConnector . + +
+
+ comment: 27 pages, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs + Gaussian-Based Methods + + +
+ Exploring the capabilities of Neural Radiance Fields (NeRF) and +Gaussian-based methods in the context of 3D scene reconstruction, this study +contrasts these modern approaches with traditional Simultaneous Localization +and Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we +assess performance based on tracking accuracy, mapping fidelity, and view +synthesis. Findings reveal that NeRF excels in view synthesis, offering unique +capabilities in generating new perspectives from existing data, albeit at +slower processing speeds. Conversely, Gaussian-based methods provide rapid +processing and significant expressiveness but lack comprehensive scene +completion. Enhanced by global optimization and loop closure techniques, newer +methods like NICE-SLAM and SplaTAM not only surpass older frameworks such as +ORB-SLAM2 in terms of robustness but also demonstrate superior performance in +dynamic and complex environments. This comparative analysis bridges theoretical +research with practical implications, shedding light on future developments in +robust 3D scene reconstruction across various real-world applications. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ DiffusionPID: Interpreting Diffusion via Partial Information + Decomposition + + +
+ Text-to-image diffusion models have made significant progress in generating +naturalistic images from textual inputs, and demonstrate the capacity to learn +and represent complex visual-semantic relationships. While these diffusion +models have achieved remarkable success, the underlying mechanisms driving +their performance are not yet fully accounted for, with many unanswered +questions surrounding what they learn, how they represent visual-semantic +relationships, and why they sometimes fail to generalize. Our work presents +Diffusion Partial Information Decomposition (DiffusionPID), a novel technique +that applies information-theoretic principles to decompose the input text +prompt into its elementary components, enabling a detailed examination of how +individual tokens and their interactions shape the generated image. We +introduce a formal approach to analyze the uniqueness, redundancy, and synergy +terms by applying PID to the denoising model at both the image and pixel level. +This approach enables us to characterize how individual tokens and their +interactions affect the model output. We first present a fine-grained analysis +of characteristics utilized by the model to uniquely localize specific +concepts, we then apply our approach in bias analysis and show it can recover +gender and ethnicity biases. Finally, we use our method to visually +characterize word ambiguity and similarity from the model's perspective and +illustrate the efficacy of our method for prompt intervention. Our results show +that PID is a potent tool for evaluating and diagnosing text-to-image diffusion +models. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at +https://github.com/chikap421/mseg_vcuq + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ♻ ☆ Explainable Differential Privacy-Hyperdimensional Computing for + Balancing Privacy and Transparency in Additive Manufacturing Monitoring + + +
+ Machine Learning (ML) models combined with in-situ sensing offer a powerful +solution to address defect detection challenges in Additive Manufacturing (AM), +yet this integration raises critical data privacy concerns, such as data +leakage and sensor data compromise, potentially exposing sensitive information +about part design and material composition. Differential Privacy (DP), which +adds mathematically controlled noise to ML models, provides a way to balance +data utility with privacy by concealing identifiable traces from sensor data. +However, introducing noise into ML models, especially black-box Artificial +Intelligence (AI) models, complicates the prediction of how noise impacts model +accuracy. This study presents the Differential Privacy-Hyperdimensional +Computing (DP-HD) framework, which leverages Explainable AI (XAI) and the +vector symbolic paradigm to quantify noise effects on accuracy. By defining a +Signal-to-Noise Ratio (SNR) metric, DP-HD assesses the contribution of training +data relative to DP noise, allowing selection of an optimal balance between +accuracy and privacy. Experimental results using high-speed melt pool data for +anomaly detection in AM demonstrate that DP-HD achieves superior operational +efficiency, prediction accuracy, and privacy protection. For instance, with a +privacy budget set at 1, DP-HD achieves 94.43% accuracy, outperforming +state-of-the-art ML models. Furthermore, DP-HD maintains high accuracy under +substantial noise additions to enhance privacy, unlike current models that +experience significant accuracy declines under stringent privacy constraints. + +
+
+ comment: 28 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ MRSegmentator: Multi-Modality Segmentation of 40 Classes in MRI and CT + + +
+ Purpose: To develop and evaluate a deep learning model for multi-organ +segmentation of MRI scans. + Materials and Methods: The model was trained on 1,200 manually annotated 3D +axial MRI scans from the UK Biobank, 221 in-house MRI scans, and 1228 CT scans +from the TotalSegmentator dataset. A human-in-the-loop annotation workflow was +employed, leveraging cross-modality transfer learning from an existing CT +segmentation model to segment 40 anatomical structures. The annotation process +began with a model based on transfer learning between CT and MR, which was +iteratively refined based on manual corrections to predicted segmentations. The +model's performance was evaluated on MRI examinations obtained from the German +National Cohort (NAKO) study (n=900) from the AMOS22 dataset (n=60) and from +the TotalSegmentator-MRI test data (n=29). The Dice Similarity Coefficient +(DSC) and Hausdorff Distance (HD) were used to assess segmentation quality, +stratified by organ and scan type. The model and its weights will be +open-sourced. + Results: MRSegmentator demonstrated high accuracy for well-defined organs +(lungs: DSC 0.96, heart: DSC 0.94) and organs with anatomic variability (liver: +DSC 0.96, kidneys: DSC 0.95). Smaller structures showed lower accuracy +(portal/splenic veins: DSC 0.64, adrenal glands: DSC 0.69). On external +validation using NAKO data, mean DSC ranged from 0.85 $\pm$ 0.08 for T2-HASTE +to 0.91 $\pm$ 0.05 for in-phase sequences. The model generalized well to CT, +achieving mean DSC of 0.84 $\pm$ 0.11 on AMOS CT data. + Conclusion: MRSegmentator accurately segments 40 anatomical structures in MRI +across diverse datasets and imaging protocols, with additional generalizability +to CT images. This open-source model will provide a valuable tool for automated +multi-organ segmentation in medical imaging research. It can be downloaded from +https://github.com/hhaentze/MRSegmentator. + +
+
+ comment: 17 pages, 6 figures; updated data; completed co-author info +
+
+
+
+
+
+
+
+ + Systems and Control 31 + +
+
+
+ + ☆ How to implement the Bayes' formula in the age of ML? + + +
+ This chapter contains a self-contained introduction to the significance of +Bayes' formula in the context of nonlinear filtering problems. Both +discrete-time and continuous-time settings of the problem are considered in a +unified manner. In control theory, the focus on optimization-based solution +approaches is stressed together with a discussion of historical developments in +this area (from 1960s onwards). The heart of this chapter contains a +presentation of a novel optimal transportation formulation for the Bayes +formula (developed recently by the first author) and its relationship to some +of the prior joint work (feedback particle filter) from the authors. The +presentation highlights how optimal transportation theory is leveraged to +overcome some of the numerical challenges of implementing Bayes' law by +enabling the use of machine learning (ML) tools. + +
+
+
+
+
+ + ☆ Nash equilibrium seeking for a class of quadratic-bilinear Wasserstein + distributionally robust games + + +
+ We consider a class of Wasserstein distributionally robust Nash equilibrium +problems, where agents construct heterogeneous data-driven Wasserstein +ambiguity sets using private samples and radii, in line with their individual +risk-averse behaviour. By leveraging relevant properties of this class of +games, we show that equilibria of the original seemingly infinite-dimensional +problem can be obtained as a solution to a finite-dimensional Nash equilibrium +problem. We then reformulate the problem as a finite-dimensional variational +inequality and establish the connection between the corresponding solution +sets. Our reformulation has scalable behaviour with respect to the data size +and maintains a fixed number of constraints, independently of the number of +samples. To compute a solution, we leverage two algorithms, based on the golden +ratio algorithm. The efficiency of both algorithmic schemes is corroborated +through extensive simulation studies on an illustrative example and a +stochastic portfolio allocation game, where behavioural coupling among +investors is modeled. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Safety Filter for Robust Disturbance Rejection via Online Optimization + + +
+ Disturbance rejection in high-precision control applications can be +significantly improved upon via online convex optimization (OCO). This includes +classical techniques such as recursive least squares (RLS) and more recent, +regret-based formulations. However, these methods can cause instabilities in +the presence of model uncertainty. This paper introduces a safety filter for +systems with OCO in the form of adaptive finite impulse response (FIR) +filtering to ensure robust disturbance rejection. The safety filter enforces a +robust stability constraint on the FIR coefficients while minimally altering +the OCO command in the $\infty$-norm cost. Additionally, we show that the +induced $\ell_\infty$-norm allows for easy online implementation of the safety +filter by directly limiting the OCO command. The constraint can be tuned to +trade off robustness and performance. We provide a simple example to +demonstrate the safety filter. + +
+
+ comment: Submitted to the 2025 European Control Conference. This paper builds + on the work done in arXiv:2405.07037 +
+
+
+
+
+ + ☆ A small-gain criterion for 2-contraction of large scale interconnected + systems + + +
+ Although modular conditions to guarantee stability for large-scale systems +have been widely studied, few methods are available to tackle the case of +networks with multiple equilibria. This paper introduces small-gain like +sufficient conditions for 2-contraction of large-scale interconnected systems +on the basis of a family of upper-bounds to the $L_2$ gains that arise from the +gains computed on individual channels of the second additive variational +equation. Such a condition guarantees the 2-additive compound of the system's +Jacobian to be exponentially contractive, thus implying convergence towards +equilibria of the system's solutions. The gains are obtained by solving +suitable Linear Matrix Inequalities. Three interconnected Thomas' systems are +considered in order to illustrate the application of the theory and the degree +of conservatism. + +
+
+
+
+
+ + ☆ Architectural Exploration of Application-Specific Resonant SRAM + Compute-in-Memory (rCiM) + + +
+ While general-purpose computing follows Von Neumann's architecture, the data +movement between memory and processor elements dictates the processor's +performance. The evolving compute-in-memory (CiM) paradigm tackles this issue +by facilitating simultaneous processing and storage within static random-access +memory (SRAM) elements. Numerous design decisions taken at different levels of +hierarchy affect the figures of merit (FoMs) of SRAM, such as power, +performance, area, and yield. The absence of a rapid assessment mechanism for +the impact of changes at different hierarchy levels on global FoMs poses a +challenge to accurately evaluating innovative SRAM designs. This paper presents +an automation tool designed to optimize the energy and latency of SRAM designs +incorporating diverse implementation strategies for executing logic operations +within the SRAM. The tool structure allows easy comparison across different +array topologies and various design strategies to result in energy-efficient +implementations. Our study involves a comprehensive comparison of over 6900 +distinct design implementation strategies for EPFL combinational benchmark +circuits on the energy-recycling resonant compute-in-memory (rCiM) architecture +designed using TSMC 28 nm technology. When provided with a combinational +circuit, the tool aims to generate an energy-efficient implementation strategy +tailored to the specified input memory and latency constraints. The tool +reduces 80.9% of energy consumption on average across all benchmarks while +using the six-topology implementation compared to baseline implementation of +single-macro topology by considering the parallel processing capability of rCiM +cache size ranging from 4KB to 192KB. + +
+
+
+
+
+ + ☆ Experimental Demonstration of Remote Synchronization in Coupled + Nonlinear Oscillator + + +
+ This study investigates remote synchronization in scale-free networks of +coupled nonlinear oscillators inspired by synchronization observed in the +brain's cortical regions and power grid. We employ the Master Stability +Function (MSF) approach to analyze network stability across various oscillator +models. Synchronization results are obtained for a star network using +linearization techniques and extended to arbitrary networks with benchmark +oscillators, verifying consistent behavior. Stable synchronous solutions emerge +as the Floquet multiplier decreases and the MSF becomes negative. Additionally, +we demonstrate remote synchronization in a star network, where peripheral +oscillators communicate exclusively through a central hub, drawing parallels to +neuronal synchronization in the brain. Experimental validation is achieved +through an electronic circuit testbed, supported by nonlinear ODE modeling and +LTspice simulation. Future work will extend the investigation to arbitrary +network topologies, further elucidating synchronization dynamics in complex +systems. + +
+
+
+
+
+ + ☆ AMARETTO: Enabling Efficient Quantum Algorithm Emulation on Low-Tier + FPGAs + + +
+ Researchers and industries are increasingly drawn to quantum computing for +its computational potential. However, validating new quantum algorithms is +challenging due to the limitations of current quantum devices. Software +simulators are time and memory-consuming, making hardware emulators an +attractive alternative. This article introduces AMARETTO (quAntuM ARchitecture +EmulaTion TechnOlogy), designed for quantum computing emulation on low-tier +Field-Programmable Gate Arrays (FPGAs), supporting Clifford+T and rotational +gate sets. It simplifies and accelerates the verification of quantum algorithms +using a Reduced-Instruction-Set-Computer (RISC)-like structure and efficient +handling of sparse quantum gates. A dedicated compiler translates OpenQASM 2.0 +into RISC-like instructions. AMARETTO is validated against the Qiskit +simulators. Our results show successful emulation of sixteen qubits on an AMD +Kria KV260 SoM. This approach rivals other works in emulated qubit capacity on +a smaller, more affordable FPGA. + +
+
+ comment: paper accepted at the IEEE International Conference on Electronics + Circuits and Systems 2024 conference, 4 pages, 6 figures +
+
+
+
+
+ + ☆ Model-Based Event-Triggered Implementation of Hybrid Controllers Using + Finite-Time Convergent Observers + + +
+ In this paper, we explore the conditions for asymptotic stability of the +hybrid closed-loop system resulting from the interconnection of a nonlinear +plant, an intelligent sensor that generates finite-time convergent estimates of +the plant state, and a controller node that receives opportunistic samples from +the sensor node when certain model-based event-triggering conditions are met. +The proposed method is endowed with a degree of separation, in the sense that +the controller design is independent of the sensor design. This is achieved +under mild regularity conditions imposed on the hybrid closed-loop system and +the existence of persistently flowing solutions. We demonstrate the versatility +of the method by implementing it on: 1) a sampled-data controller for +regulation of linear plants; 2) a synergistic controller for attitude +stabilization of rigid bodies. The effectiveness of these novel controllers is +demonstrated through numerical simulations. + +
+
+
+
+
+ + ☆ A Comparative Analysis of Electricity Consumption Flexibility in + Different Industrial Plant Configurations + + +
+ The flexibility of industrial power consumption plays a key role in the +transition to renewable energy systems, contributing to grid stability, cost +reduction and decarbonization efforts. This paper presents a novel methodology +to quantify and optimize the flexibility of electricity consumption in +manufacturing plants. The proposed model is applied to actual cement and steel +plant configurations. Comparative simulations performed with the model reveal +significant differences in flexibility and cost-effectiveness, driven by +factors such as production capacity, downstream process demand, storage +capacity, and operational constraints. A comprehensive sensitivity analysis +further clarifies the impact of various parameters on production optimization +and flexibility savings. Specifically, as demand approaches production levels, +flexibility decreases. Although increasing storage capacity typically reduces +production costs, the benefits diminish above a certain threshold. The results +provide valuable information for industrial operators wishing to improve +operational efficiency, reduce costs and increase the flexibility of their +operations. + +
+
+
+
+
+ + ☆ Are the flows of complex-valued Laplacians and their pseudoinverses + related? + + +
+ Laplacian flows model the rate of change of each node's state as being +proportional to the difference between its value and that of its neighbors. +Typically, these flows capture diffusion or synchronization dynamics and are +well-studied. Expanding on these classical flows, we introduce a pseudoinverse +Laplacian flow system, substituting the Laplacian with its pseudoinverse within +complex-valued networks. Interestingly, for undirected graphs and unsigned +weight-balanced digraphs, the Laplacian and the pseudoinverse Laplacian flows +exhibit an interdependence in terms of consensus. To show this relation, we +first present the conditions for achieving consensus in the pseudoinverse +Laplacian flow system using the property of real eventual exponential +positivity. Thereafter, we show that the pseudoinverse Laplacian flow system +converges to consensus if and only if the Laplacian flow system achieves +consensus in the above-mentioned networks. However, these are only the +sufficient conditions for digraphs. Further, we illustrate the efficacy of the +proposed approach through examples, focusing primarily on power networks. + +
+
+
+
+
+ + ☆ Unsupervised Physics-Informed Neural Network-based Nonlinear Observer + design for autonomous systems using contraction analysis + + +
+ Contraction analysis offers, through elegant mathematical developments, a +unified way of designing observers for a general class of nonlinear systems, +where the observer correction term is obtained by solving an infinite +dimensional inequality that guarantees global exponential convergence. However, +solving the matrix partial differential inequality involved in contraction +analysis design is both analytically and numerically challenging and represents +a long-lasting challenge that prevented its wide use. Therefore, the present +paper proposes a novel approach that relies on an unsupervised Physics Informed +Neural Network (PINN) to design the observer's correction term by enforcing the +partial differential inequality in the loss function. The performance of the +proposed PINN-based nonlinear observer is assessed in numerical simulation as +well as its robustness to measurement noise and neural network approximation +error. + +
+
+
+
+
+ + ☆ Enhancing reinforcement learning for population setpoint tracking in + co-cultures + + +
+ Efficient multiple setpoint tracking can enable advanced biotechnological +applications, such as maintaining desired population levels in co-cultures for +optimal metabolic division of labor. In this study, we employ reinforcement +learning as a control method for population setpoint tracking in co-cultures, +focusing on policy-gradient techniques where the control policy is +parameterized by neural networks. However, achieving accurate tracking across +multiple setpoints is a significant challenge in reinforcement learning, as the +agent must effectively balance the contributions of various setpoints to +maximize the expected system performance. Traditional return functions, such as +those based on a quadratic cost, often yield suboptimal performance due to +their inability to efficiently guide the agent toward the simultaneous +satisfaction of all setpoints. To overcome this, we propose a novel return +function that rewards the simultaneous satisfaction of multiple setpoints and +diminishes overall reward gains otherwise, accounting for both stage and +terminal system performance. This return function includes parameters to +fine-tune the desired smoothness and steepness of the learning process. We +demonstrate our approach considering an $\textit{Escherichia coli}$ co-culture +in a chemostat with optogenetic control over amino acid synthesis pathways, +leveraging auxotrophies to modulate growth. + +
+
+
+
+
+ + ☆ Information-Optimal Multi-Spacecraft Positioning for Interstellar Object + Exploration + + +
+ Interstellar objects (ISOs), astronomical objects not gravitationally bound +to the sun, could present valuable opportunities to advance our understanding +of the universe's formation and composition. In response to the unpredictable +nature of their discoveries that inherently come with large and rapidly +changing uncertainty in their state, this paper proposes a novel +multi-spacecraft framework for locally maximizing information to be gained +through ISO encounters with formal probabilistic guarantees. Given some +approximated control and estimation policies for fully autonomous spacecraft +operations, we first construct an ellipsoid around its terminal position, where +the ISO would be located with a finite probability. The large state uncertainty +of the ISO is formally handled here through the hierarchical property in +stochastically contracting nonlinear systems. We then propose a method to find +the terminal positions of the multiple spacecraft optimally distributed around +the ellipsoid, which locally maximizes the information we can get from all the +points of interest (POIs). This utilizes a probabilistic information cost +function that accounts for spacecraft positions, camera specifications, and ISO +position uncertainty, where the information is defined as visual data collected +by cameras. Numerical simulations demonstrate the efficacy of this approach +using synthetic ISO candidates generated from quasi-realistic empirical +populations. Our method allows each spacecraft to optimally select its terminal +state and determine the ideal number of POIs to investigate, potentially +enhancing the ability to study these rare and fleeting interstellar visitors +while minimizing resource utilization. + +
+
+ comment: IEEE Aerospace Conference, Preprint Version, Accepted: November 2024 +
+
+
+
+
+ + ☆ Edge Caching Optimization with PPO and Transfer Learning for Dynamic + Environments + + +
+ This paper addresses the challenge of edge caching in dynamic environments, +where rising traffic loads strain backhaul links and core networks. We propose +a Proximal Policy Optimization (PPO)-based caching strategy that fully +incorporates key file attributes such as size, lifetime, importance, and +popularity, while also considering random file request arrivals, reflecting +more realistic edge caching scenarios. In dynamic environments, changes such as +shifts in content popularity and variations in request rates frequently occur, +making previously learned policies less effective as they were optimized for +earlier conditions. Without adaptation, caching efficiency and response times +can degrade. While learning a new policy from scratch in a new environment is +an option, it is highly inefficient and computationally expensive. Thus, +adapting an existing policy to these changes is critical. To address this, we +develop a mechanism that detects changes in content popularity and request +rates, ensuring timely adjustments to the caching strategy. We also propose a +transfer learning-based PPO algorithm that accelerates convergence in new +environments by leveraging prior knowledge. Simulation results demonstrate the +significant effectiveness of our approach, outperforming a recent Deep +Reinforcement Learning (DRL)-based method. + +
+
+
+
+
+ + ☆ ART-Rx: A Proportional-Integral-Derivative (PID) Controlled Adaptive + Real-Time Threshold Receiver for Molecular Communication + + +
+ Molecular communication (MC) in microfluidic channels faces significant +challenges in signal detection due to the stochastic nature of molecule +propagation and dynamic, noisy environments. Conventional detection methods +often struggle under varying channel conditions, leading to high bit error +rates (BER) and reduced communication efficiency. This paper introduces ART-Rx, +a novel Adaptive Real-Time Threshold Receiver for MC that addresses these +challenges. Implemented within a conceptual system-on-chip (SoC), ART-Rx +employs a Proportional-Integral-Derivative (PID) controller to dynamically +adjust the detection threshold based on observed errors in real time. +Comprehensive simulations using MATLAB and Smoldyn compare ART-Rx's performance +against a statistically optimal detection threshold across various scenarios, +including different levels of interference, concentration shift keying (CSK) +levels, flow velocities, transmitter-receiver distances, diffusion +coefficients, and binding rates. The results demonstrate that ART-Rx +significantly outperforms conventional methods, maintaining consistently low +BER and bit error probabilities (BEP) even in high-noise conditions and extreme +channel environments. The system exhibits exceptional robustness to +interference and shows the potential to enable higher data rates in CSK +modulation. Furthermore, because ART-Rx is effectively adaptable to varying +environmental conditions in microfluidic channels, it offers a computationally +efficient and straightforward approach to enhance signal detection in nanoscale +communication systems. This approach presents a promising control theory-based +solution to improve the reliability of data transmission in practical MC +systems, with potential applications in healthcare, brain-machine interfaces +(BMI), and the Internet of Bio-Nano Things (IoBNT). + +
+
+ comment: 14 pages, 7 figures, submitted to IEEE Transactions on Molecular, + Biological, and Multi-Scale Communications (TMBMC) +
+
+
+
+
+ + ☆ Exploring the Use of Autonomous Unmanned Vehicles for Supporting Power + Grid Operations + + +
+ This paper explores the use of autonomous unmanned vehicles for supporting +power grid operations. With built-in batteries and the capability to carry +additional battery energy storage, the rising number of autonomous vehicles can +represent a substantial amount of capacity that is currently underutilized in +the power grid. Unlike traditional electric vehicles which require drivers, the +operations of autonomous vehicles can be performed without human intervention. +To guide idle vehicles to support power grids autonomously, we propose a +tractable optimization-based method for effectively integrating these ``mobile +batteries'' into grid operations. During real-time operations, the vehicles are +strategically routed to target locations to help maintain system power balance +and reduce operating costs. Numerical studies have confirmed both the validity +and scalability of the proposed algorithm for efficiently integrating +autonomous vehicles into routine power system operations. + +
+
+
+
+
+ + ☆ ModelPredictiveControl.jl: advanced process control made easy in Julia + + +
+ Proprietary closed-source software is still the norm in advanced process +control. Transparency and reproducibility are key aspects of scientific +research. Free and open-source toolkits can contribute to the development, +sharing and advancement of new and efficient control approaches, and the +industrial sector will certainly benefit from them. This paper presents +ModelPredictiveControl.jl, an open-source software package for designing model +predictive controllers in the Julia programming language. It is designed to be +easy to use and modular, while providing advanced features like nonlinear +control and moving horizon estimation. It relies on powerful control system and +mathematical optimization frameworks to simplify the construction and testing +of state estimators and predictive controllers. It also integrates with the +standard plotting library to quickly visualize closed-loop data. The paper +presents the main functionalities and illustrates them with two case studies in +simulation. The first example is a continuously stirred tank reactor described +by linear dynamics. The second one implements nonlinear, economic, and +successive linearization model predictive controllers for an inverted pendulum. +The solving times are benchmarked against equivalent implementations in MATLAB +to show the efficiency of the package. + +
+
+ comment: 11 pages, 11 figures, 1 table +
+
+
+
+
+ + ☆ Integrating Fuzzy Set Theory with Pandora Temporal Fault Trees for + Dynamic Failure Analysis of Complex Systems + + +
+ Pandora temporal fault tree, as one notable extension of the fault tree, +introduces temporal gates and temporal laws. Pandora Temporal Fault Tree (TFT) +enhances the capability of fault trees and enables the modeling of system +failure behavior that depends on sequences. The calculation of system failure +probability in Pandora TFT relies on precise probabilistic information on +component failures. However, obtaining such precise failure data can often be +challenging. The data may be uncertain as historical records are used to derive +failure data for system components. To mitigate this uncertainty, in this +study, we propose a method that integrates fuzzy set theory with Pandora TFT. +This integration aims to enable dynamic analysis of complex systems, even in +cases where quantitative failure data of components is unreliable or imprecise. +The proposed work introduces the development of Fuzzy AND, Fuzzy OR, Fuzzy +PAND, and Fuzzy POR logic gates for Pandora TFT. We also introduce a fuzzy +importance measure for criticality analysis of basic events. All events in our +analysis are assumed to have exponentially distributed failures, with their +failure rates represented as triangular fuzzy numbers. We illustrate the +proposed method through a case study of the Aircraft Fuel Distribution System +(AFDS), highlighting its practical application and effectiveness in analyzing +complex systems. The results are compared with existing results from Petri net +and Bayesian network techniques to validate the findings. + +
+
+
+
+
+ + ♻ ☆ Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing + Contact-Rich Plans? ICRA2025 + + +
+ Designing planners and controllers for contact-rich manipulation is extremely +challenging as contact violates the smoothness conditions that many +gradient-based controller synthesis tools assume. Contact smoothing +approximates a non-smooth system with a smooth one, allowing one to use these +synthesis tools more effectively. However, applying classical control synthesis +methods to smoothed contact dynamics remains relatively under-explored. This +paper analyzes the efficacy of linear controller synthesis using differential +simulators based on contact smoothing. We introduce natural baselines for +leveraging contact smoothing to compute (a) open-loop plans robust to uncertain +conditions and/or dynamics, and (b) feedback gains to stabilize around +open-loop plans. Using robotic bimanual whole-body manipulation as a testbed, +we perform extensive empirical experiments on over 300 trajectories and analyze +why LQR seems insufficient for stabilizing contact-rich plans. The video +summarizing this paper and hardware experiments is found here: +https://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9. + +
+
+ comment: Under review for ICRA2025 +
+
+
+
+
+ + ♻ ☆ Multi-Agent Control Synthesis from Global Temporal Logic Tasks with + Synchronous Satisfaction Requirements + + +
+ This paper addresses the multi-agent control problem under global temporal +logic tasks, considering agents with heterogeneous capabilities. These global +tasks involve not only absolute and relative temporal and spatial constraints, +but also group behaviors, including task completion times, agent capabilities, +and task interdependencies such as the need for synchronous execution. The +global tasks are formally formulated into global signal temporal logic (STL) +formulae, and a synchronous robustness metric is designed to evaluate the +synchronization quality with real values. A mixed-integer linear programming +(MILP) encoding method is further proposed to realize task-satisfied motion +planning with high synchronicity and minimum control efforts. The encoding +method uses a logarithmic number of binary variables to fully capture +synchronous robustness, leading to only linear computational complexity. +Simulations are conducted to demonstrate the efficiency of the proposed control +strategy. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ An iterative scheme for finite horizon model reduction of + continuous-time linear time-varying systems + + +
+ In this paper, we obtain the functional derivatives of a finite horizon error +norm between a full-order and a reduced-order continuous-time linear +time-varying (LTV) system. Based on the functional derivatives, first-order +necessary conditions for optimality of the error norm are derived, and a +projection-based iterative scheme for model reduction is proposed. The +iterative scheme upon convergence produces reduced-order models satisfying the +optimality conditions. Finally, through a numerical example, we demonstrate the +better performance of the proposed model reduction scheme in comparison to the +finite horizon balanced truncation algorithm for continuous-time LTV systems. + +
+
+
+
+
+ + ♻ ☆ CaRL: Cascade Reinforcement Learning with State Space Splitting for + O-RAN based Traffic Steering + + +
+ The Open Radio Access Network (O-RAN) architecture empowers intelligent and +automated optimization of the RAN through applications deployed on the RAN +Intelligent Controller (RIC) platform, enabling capabilities beyond what is +achievable with traditional RAN solutions. Within this paradigm, Traffic +Steering (TS) emerges as a pivotal RIC application that focuses on optimizing +cell-level mobility settings in near-real-time, aiming to significantly improve +network spectral efficiency. In this paper, we design a novel TS algorithm +based on a Cascade Reinforcement Learning (CaRL) framework. We propose state +space factorization and policy decomposition to reduce the need for large +models and well-labeled datasets. For each sub-state space, an RL sub-policy +will be trained to learn an optimized mapping onto the action space. To apply +CaRL on new network regions, we propose a knowledge transfer approach to +initialize a new sub-policy based on knowledge learned by the trained policies. +To evaluate CaRL, we build a data-driven and scalable RIC digital twin (DT) +that is modeled using important real-world data, including network +configuration, user geo-distribution, and traffic demand, among others, from a +tier-1 mobile operator in the US. We evaluate CaRL on two DT scenarios +representing two network clusters in two different cities and compare its +performance with the business-as-usual (BAU) policy and other competing +optimization approaches using heuristic and Q-table algorithms. Benchmarking +results show that CaRL performs the best and improves the average +cluster-aggregated downlink throughput over the BAU policy by 24% and 18% in +these two scenarios, respectively. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Nonlinear moving horizon estimation for robust state and parameter + estimation - extended version + + +
+ We propose a moving horizon estimation scheme to estimate the states and the +unknown constant parameters of general nonlinear uncertain discrete-time +systems. The proposed framework and analysis explicitly do not involve the a +priori verification of a particular excitation condition for the parameters. +Instead, we use online information about the actual excitation of the +parameters at any time during operation and ensure that the regularization term +in the cost function is always automatically selected appropriately. This +ensures that the state and parameter estimation error is bounded for all times, +even if the parameters are never (or only rarely) excited during operation. +Robust exponential stability of the state and parameter estimation error +emerges under an additional uniform condition on the maximum duration of +insufficient excitation. The theoretical results are illustrated by a numerical +example. + +
+
+ comment: Replaced by revised version +
+
+
+
+
+ + ♻ ☆ Learning-based model augmentation with LFRs + + +
+ Nonlinear system identification (NL-SI) has proven to be effective in +obtaining accurate models for highly complex systems. Especially, recent +encoder-based methods for artificial neural networks state-space (ANN-SS) +models have achieved state-of-the-art performance on various benchmarks, while +offering consistency and computational efficiency. The inclusion of prior +knowledge of the system can be exploited to increase (i) estimation speed, (ii) +accuracy, and (iii) interpretability of the resulting models. This paper +proposes an encoder based model augmentation method incorporating prior +knowledge from first-principles (FP) models. We introduce a novel +linear-fractional-representation (LFR) model structure that allows for the +unified representation of various augmentation structures including the ones +that are commonly used in the literature, and an identification algorithm for +estimating the proposed structure together with appropriate initialization +methods. The performance and generalization capabilities of the proposed method +are demonstrated on a hardening mass-spring-damper simulation. + +
+
+ comment: Submitted for ECC 2025 +
+
+
+
+
+ + ♻ ☆ Enhancing Attack Resilience in Real-Time Systems through Variable + Control Task Sampling Rates + + +
+ Cyber-physical systems (CPSs) in modern real-time applications integrate +numerous control units linked through communication networks, each responsible +for executing a mix of real-time safety-critical and non-critical tasks. To +ensure predictable timing behaviour, most safety-critical tasks are scheduled +with fixed sampling periods, which supports rigorous safety and performance +analyses. However, this deterministic execution can be exploited by attackers +to launch inference-based attacks on safety-critical tasks. This paper +addresses the challenge of preventing such timing inference or schedule-based +attacks by dynamically adjusting the execution rates of safety-critical tasks +while maintaining their performance. We propose a novel schedule vulnerability +analysis methodology, enabling runtime switching between valid schedules for +various control task sampling rates. Leveraging this approach, we present the +Multi-Rate Attack-Aware Randomized Scheduling (MAARS) framework for preemptive +fixed-priority schedulers, designed to reduce the success rate of timing +inference attacks on real-time systems. To our knowledge, this is the first +method that combines attack-aware schedule randomization with preserved control +and scheduling integrity. The framework's efficacy in attack prevention is +evaluated on automotive benchmarks using a Hardware-in-the-Loop (HiL) setup. + +
+
+ comment: 12 pages including references, Total 10 figures (with 3 having + subfigures) +
+
+
+
+
+ + ♻ ☆ Optimizing Highway Ramp Merge Safety and Efficiency via Spatio-Temporal + Cooperative Control and Vehicle-Road Coordination + + +
+ Existing automatic driving systems find it difficult to accurately and promptly +obtain the status and driving intention of other vehicles and the safety risk +and urgency of autonomous vehicles in the absence of collision are evaluated. +As a result, while vehicles generally maintain safe distances, accidents still +frequently occur, particularly in merging areas. To ensure safety and improve road +efficiency, this paper presents a pre-programmed technique for managing +vehicles' spatiotemporal trajectories to proactively mitigate conflicts among +vehicles. Firstly, the study focuses on the calculation of safe distances under +varying spatiotemporal conditions, taking into account differences in vehicle +speed. Subsequently, an evaluation model for vehicle conflict risk is +developed, which incorporates critical parameters such as collision +acceleration and emergency acceleration. The methodology further identifies the +main line vehicles that are potentially in conflict with on-ramp vehicles and +determines the target gap for the latter. Based on this selected target gap, a +cooperative control method is formulated, enabling the pre-programming of +vehicle trajectories. Using highway ramp merging as a case study, the paper +introduces a mainline priority spatiotemporal cooperative control method and +validates its efficacy through rigorous simulations. The analysis indicates +that the average delay time can be reduced by 97.96%, and fuel consumption by +6.01%. The mainline priority strategy demonstrates increased speed, low latency +and low fuel consumption. + +
+
+
+
+
+ + ♻ ☆ Large Language Models for Power Scheduling: A User-Centric Approach + + +
+ While traditional optimization and scheduling schemes are designed to meet +fixed, predefined system requirements, future systems are moving toward +user-driven approaches and personalized services, aiming to achieve high +quality-of-experience (QoE) and flexibility. This challenge is particularly +pronounced in wireless and digitalized energy networks, where users' +requirements have largely not been taken into consideration due to the lack of +a common language between users and machines. The emergence of powerful large +language models (LLMs) marks a radical departure from traditional +system-centric methods into more advanced user-centric approaches by providing +a natural communication interface between users and devices. In this paper, for +the first time, we introduce a novel architecture for resource scheduling +problems by constructing three LLM agents to convert an arbitrary user's voice +request (VRQ) into a resource allocation vector. Specifically, we design an LLM +intent recognition agent to translate the request into an optimization problem +(OP), an LLM OP parameter identification agent, and an LLM OP solving agent. To +evaluate system performance, we construct a database of typical VRQs in the +context of electric vehicle (EV) charging. As a proof of concept, we primarily +use Llama 3 8B. Through testing with different prompt engineering scenarios, +the obtained results demonstrate the efficiency of the proposed architecture. +The conducted performance analysis allows key insights to be extracted. For +instance, having a larger set of candidate OPs to model the real-world problem +might degrade the final performance because of a higher recognition/OP +classification noise level. All results and codes are open source. + +
+
+
+
+
+ + ♻ ☆ Robustness to Model Approximation, Empirical Model Learning, and Sample + Complexity in Wasserstein Regular MDPs + + +
+ The paper studies the robustness properties of discrete-time stochastic +optimal control under Wasserstein model approximation for both discounted cost +and average cost criteria. Specifically, we study the performance loss when +applying an optimal policy designed for an approximate model to the true +dynamics compared with the optimal cost for the true model under the +sup-norm-induced metric, and relate it to the Wasserstein-1 distance between +the approximate and true transition kernels. A primary motivation of this +analysis is empirical model learning, as well as empirical noise distribution +learning, where Wasserstein convergence holds under mild conditions but +stronger convergence criteria, such as total variation, may not. We discuss +applications of the results to the disturbance estimation problem, where sample +complexity bounds are given, and also to a general empirical model learning +approach, obtained under either Markov or i.i.d. learning settings. Further +applications regarding the continuity of invariant probability measures with +respect to transition kernels are also discussed. + +
+
+ comment: 35 pages +
+
+
+
+
+ + ♻ ☆ Adaptive Power Flow Approximations with Second-Order Sensitivity + Insights + + +
+ The power flow equations are fundamental to power system planning, analysis, +and control. However, the inherent non-linearity and non-convexity of these +equations present formidable obstacles in problem-solving processes. To +mitigate these challenges, recent research has proposed adaptive power flow +linearizations that aim to achieve accuracy over wide operating ranges. The +accuracy of these approximations inherently depends on the curvature of the +power flow equations within these ranges, which necessitates considering +second-order sensitivities. In this paper, we leverage second-order +sensitivities to both analyze and improve power flow approximations. We +evaluate the curvature across broad operational ranges and subsequently utilize +this information to inform the computation of various sample-based power flow +approximation techniques. Additionally, we leverage second-order sensitivities +to guide the development of rational approximations that yield linear +constraints in optimization problems. This approach is extended to enhance +accuracy beyond the limitations of linear functions across varied operational +scenarios. + +
+
+
+
+
+ + ♻ ☆ Nonlinear moving horizon estimation for robust state and parameter + estimation -- extended version + + +
+ We propose a moving horizon estimation scheme to estimate the states and the +unknown constant parameters of general nonlinear uncertain discrete-time +systems. The proposed framework and analysis explicitly do not involve the a +priori verification of a particular excitation condition for the parameters. +Instead, we use online information about the actual excitation of the +parameters at any time during operation and ensure that the regularization term +in the cost function is always automatically selected appropriately. This +ensures that the state and parameter estimation error is bounded for all times, +even if the parameters are never (or only rarely) excited during operation. +Robust exponential stability of the state and parameter estimation error +emerges under an additional uniform condition on the maximum duration of +insufficient excitation. The theoretical results are illustrated by a numerical +example. + +
+
+ comment: Replaced by revised version +
+
+
+
+
+ + ♻ ☆ Decentralized Coordination of Distributed Energy Resources through Local + Energy Markets and Deep Reinforcement Learning + + +
+ As distributed energy resources (DERs) grow, the electricity grid faces +increased net load variability at the grid edge, impacting operability and +reliability. Transactive energy, facilitated through local energy markets, +offers a decentralized, indirect demand response solution, with model-free +control techniques, such as deep reinforcement learning (DRL), enabling +automated, decentralized participation. However, existing studies largely +overlook community-level net load variability, focusing instead on +socioeconomic metrics. + This study addresses this gap by using DRL agents to automate end-user +participation in a local energy market (ALEX), where agents act independently +to minimize individual energy bills. Results reveal a strong link between bill +reduction and decreased net load variability, assessed across metrics such as +ramping rate, load factor, and peak demand over various time horizons. Using a +no-control baseline, DRL agents are benchmarked against a near-optimal dynamic +programming approach. The dynamic programming benchmark achieves reductions of +22.05 percent, 83.92 percent, and 24.09 percent in daily import, export, and +peak demand, respectively, while the DRL agents show comparable or superior +results with reductions of 21.93 percent, 84.46 percent, and 27.02 percent. + This study demonstrates the effectiveness of DRL in decentralized grid +management, highlighting its scalability and near-optimal performance in +reducing net load variability within community-driven energy markets. + +
+
+ comment: preprint, submitted to Energy and AI +
+
+
+
+
+
+
+
+ + Machine Learning 134 + +
+
+
+ + ☆ On the Surprising Effectiveness of Attention Transfer for Vision + Transformers NeurIPS 2024 + + +
+ Conventional wisdom suggests that pre-training Vision Transformers (ViT) +improves downstream performance by learning useful representations. Is this +actually true? We investigate this question and find that the features and +representations learned during pre-training are not essential. Surprisingly, +using only the attention patterns from pre-training (i.e., guiding how +information flows between tokens) is sufficient for models to learn high +quality features from scratch and achieve comparable downstream performance. We +show this by introducing a simple method called attention transfer, where only +the attention patterns from a pre-trained teacher ViT are transferred to a +student, either by copying or distilling the attention maps. Since attention +transfer lets the student learn its own features, ensembling it with a +fine-tuned teacher also further improves accuracy on ImageNet. We +systematically study various aspects of our findings on the sufficiency of +attention maps, including distribution shift settings where they underperform +fine-tuning. We hope our exploration provides a better understanding of what +pre-training accomplishes and leads to a useful alternative to the standard +practice of fine-tuning. + +
+
+ comment: NeurIPS 2024. Code: + https://github.com/alexlioralexli/attention-transfer +
+
+
+
+
+ + ☆ Conditional regression for the Nonlinear Single-Variable Model + + +
+ Several statistical models for regression of a function $F$ on $\mathbb{R}^d$ +without the statistical and computational curse of dimensionality exist, for +example by imposing and exploiting geometric assumptions on the distribution of +the data (e.g. that its support is low-dimensional), or strong smoothness +assumptions on $F$, or a special structure for $F$. Among the latter, compositional +models, which assume $F=f\circ g$ with $g$ mapping to $\mathbb{R}^r$ with $r\ll d$, +have been studied, and include classical single- and multi-index models and +recent works on neural networks. While the case where $g$ is linear is rather +well-understood, much less is known when $g$ is nonlinear, and in particular +for which $g$'s the curse of dimensionality in estimating $F$, or both $f$ and +$g$, may be circumvented. In this paper, we consider a model +$F(X):=f(\Pi_\gamma X) $ where $\Pi_\gamma:\mathbb{R}^d\to[0,\rm{len}_\gamma]$ +is the closest-point projection onto the parameter of a regular curve $\gamma: +[0,\rm{len}_\gamma]\to\mathbb{R}^d$ and $f:[0,\rm{len}_\gamma]\to\mathbb{R}^1$. +The input data $X$ is not low-dimensional, far from $\gamma$, conditioned on +$\Pi_\gamma(X)$ being well-defined. The distribution of the data, $\gamma$ and +$f$ are unknown. This model is a natural nonlinear generalization of the +single-index model, which corresponds to $\gamma$ being a line. We propose a +nonparametric estimator, based on conditional regression, and show that under +suitable assumptions, the strongest of which being that $f$ is coarsely +monotone, it can achieve the $one$-$dimensional$ optimal min-max rate for +non-parametric regression, up to the level of noise in the observations, and be +constructed in time $\mathcal{O}(d^2n\log n)$. All the constants in the +learning bounds, in the minimal number of samples required for our bounds to +hold, and in the computational complexity are at most low-order polynomials in +$d$. + +
+
+ comment: 55 pages, 10 figures +
+
+
+
+
+ + ☆ Towards a Classification of Open-Source ML Models and Datasets for + Software Engineering + + +
+ Background: Open-Source Pre-Trained Models (PTMs) and datasets provide +extensive resources for various Machine Learning (ML) tasks, yet these +resources lack a classification tailored to Software Engineering (SE) needs. +Aims: We apply an SE-oriented classification to PTMs and datasets on a popular +open-source ML repository, Hugging Face (HF), and analyze the evolution of PTMs +over time. Method: We conducted a repository mining study. We started with a +systematically gathered database of PTMs and datasets from the HF API. Our +selection was refined by analyzing model and dataset cards and metadata, such +as tags, and confirming SE relevance using Gemini 1.5 Pro. All analyses are +replicable, with a publicly accessible replication package. Results: The most +common SE task among PTMs and datasets is code generation, with a primary focus +on software development and limited attention to software management. Popular +PTMs and datasets mainly target software development. Among ML tasks, text +generation is the most common in SE PTMs and datasets. There has been a marked +increase in PTMs for SE since 2023 Q2. Conclusions: This study underscores the +need for broader task coverage to enhance the integration of ML within SE +practices. + +
+
+ comment: 5 pages, 8 figures +
+
+
+
+
+ + ☆ NeuralDEM - Real-time Simulation of Industrial Particulate Flows + + +
+ Advancements in computing power have made it possible to numerically simulate +large-scale fluid-mechanical and/or particulate systems, many of which are +integral to core industrial processes. Among the different numerical methods +available, the discrete element method (DEM) provides one of the most accurate +representations of a wide range of physical systems involving granular and +discontinuous materials. Consequently, DEM has become a widely accepted +approach for tackling engineering problems connected to granular flows and +powder mechanics. Additionally, DEM can be integrated with grid-based +computational fluid dynamics (CFD) methods, enabling the simulation of chemical +processes taking place, e.g., in fluidized beds. However, DEM is +computationally intensive because of the intrinsic multiscale nature of +particulate systems, restricting simulation duration or number of particles. +Towards this end, NeuralDEM presents an end-to-end approach to replace slow +numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM +is capable of picturing long-term transport processes across different regimes +using macroscopic observables without any reference to microscopic model +parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an +underlying continuous field, while simultaneously modeling macroscopic behavior +directly as additional auxiliary fields. Second, NeuralDEM introduces +multi-branch neural operators scalable to real-time modeling of +industrially-sized scenarios - from slow and pseudo-steady to fast and +transient. Such scenarios have previously posed insurmountable challenges for +deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM +fluidized bed reactors of 160k CFD cells and 500k DEM particles for +trajectories of 28s. NeuralDEM will open many new doors to advanced engineering +and much faster process cycles. + +
+
+ comment: Project page: https://nx-ai.github.io/NeuralDEM/ +
+
+
+
+
+ + ☆ Med-Bot: An AI-Powered Assistant to Provide Accurate and Reliable + Medical Information + + +
+ This paper introduces Med-Bot, an AI-powered chatbot designed to provide +users with accurate and reliable medical information. Utilizing advanced +libraries and frameworks such as PyTorch, Chromadb, Langchain and Autogptq, +Med-Bot is built to handle the complexities of natural language understanding +in a healthcare context. The integration of Llama-assisted data processing and +AutoGPT-Q provides enhanced performance in processing and responding to queries +based on PDFs of medical literature, ensuring that users receive precise and +trustworthy information. This research details the methodologies employed in +developing Med-Bot and evaluates its effectiveness in disseminating healthcare +information. + +
+
+ comment: 3 figures, 5 pages Keywords-LLM, AI-powered healthcare, Medical + chatbot, Context-based interaction, Llama-assisted data processing, + AutoGPT-Q, PyTorch, TensorFlow, Reliable medical information, Machine + learning in healthcare, Conversational AI +
+
+
+
+
+ + ☆ How do Machine Learning Models Change? + + +
+ The proliferation of Machine Learning (ML) models and their open-source +implementations has transformed Artificial Intelligence research and +applications. Platforms like Hugging Face (HF) enable the development, sharing, +and deployment of these models, fostering an evolving ecosystem. While previous +studies have examined aspects of models hosted on platforms like HF, a +comprehensive longitudinal study of how these models change remains +underexplored. This study addresses this gap by utilizing both repository +mining and longitudinal analysis methods to examine over 200,000 commits and +1,200 releases from over 50,000 models on HF. We replicate and extend an ML +change taxonomy for classifying commits and utilize Bayesian networks to +uncover patterns in commit and release activities over time. Our findings +indicate that commit activities align with established data science +methodologies, such as CRISP-DM, emphasizing iterative refinement and +continuous improvement. Additionally, release patterns tend to consolidate +significant updates, particularly in documentation, distinguishing between +granular changes and milestone-based releases. Furthermore, projects with +higher popularity prioritize infrastructure enhancements early in their +lifecycle, and those with intensive collaboration practices exhibit improved +documentation standards. These and other insights enhance the understanding of +model changes on community platforms and provide valuable guidance for best +practices in model maintenance. + +
+
+
+
+
+ + ☆ Neural Operators Can Play Dynamic Stackelberg Games + + +
+ Dynamic Stackelberg games are a broad class of two-player games in which the +leader acts first, and the follower chooses a response strategy to the leader's +strategy. Unfortunately, only stylized Stackelberg games are explicitly +solvable since the follower's best-response operator (as a function of the +control of the leader) is typically analytically intractable. This paper +addresses this issue by showing that the \textit{follower's best-response +operator} can be approximately implemented by an \textit{attention-based neural +operator}, uniformly on compact subsets of adapted open-loop controls for the +leader. We further show that the value of the Stackelberg game where the +follower uses the approximate best-response operator approximates the value of +the original Stackelberg game. Our main result is obtained using our universal +approximation theorem for attention-based neural operators between spaces of +square-integrable adapted stochastic processes, as well as stability results +for a general class of Stackelberg games. + +
+
+
+
+
+ + ☆ On the Limits of Language Generation: Trade-Offs Between Hallucination + and Mode Collapse + + +
+ Specifying all desirable properties of a language model is challenging, but +certain requirements seem essential. Given samples from an unknown language, +the trained model should produce valid strings not seen in training and be +expressive enough to capture the language's full richness. Otherwise, +outputting invalid strings constitutes "hallucination," and failing to capture +the full range leads to "mode collapse." We ask if a language model can meet +both requirements. + We investigate this within a statistical language generation setting building +on Gold and Angluin. Here, the model receives random samples from a +distribution over an unknown language K, which belongs to a possibly infinite +collection of languages. The goal is to generate unseen strings from K. We say +the model generates from K with consistency and breadth if, as training size +increases, its output converges to all unseen strings in K. + Kleinberg and Mullainathan [KM24] asked if consistency and breadth in +language generation are possible. We answer this negatively: for a large class +of language models, including next-token prediction models, this is impossible +for most collections of candidate languages. This contrasts with [KM24]'s +result, showing consistent generation without breadth is possible for any +countable collection of languages. Our finding highlights that generation with +breadth fundamentally differs from generation without breadth. + As a byproduct, we establish near-tight bounds on the number of samples +needed for generation with or without breadth. + Finally, our results offer hope: consistent generation with breadth is +achievable for any countable collection of languages when negative examples +(strings outside K) are available alongside positive ones. This suggests that +post-training feedback, which encodes negative examples, can be crucial in +reducing hallucinations while limiting mode collapse. + +
+
+ comment: Abstract shortened to fit arXiv limit +
+
+
+
+
+ + ☆ MCCE: Missingness-aware Causal Concept Explainer + + +
+ Causal concept effect estimation is gaining increasing interest in the field +of interpretable machine learning. This general approach explains the behaviors +of machine learning models by estimating the causal effect of +human-understandable concepts, which represent high-level knowledge more +comprehensibly than raw inputs like tokens. However, existing causal concept +effect explanation methods assume complete observation of all concepts involved +within the dataset, which can fail in practice due to incomplete annotations or +missing concept data. We theoretically demonstrate that unobserved concepts can +bias the estimation of the causal effects of observed concepts. To address this +limitation, we introduce the Missingness-aware Causal Concept Explainer (MCCE), +a novel framework specifically designed to estimate causal concept effects when +not all concepts are observable. Our framework learns to account for residual +bias resulting from missing concepts and utilizes a linear predictor to model +the relationships between these concepts and the outputs of black-box machine +learning models. It can offer explanations on both local and global levels. We +conduct validations using a real-world dataset, demonstrating that MCCE +achieves promising performance compared to state-of-the-art explanation methods +in causal concept effect estimation. + +
+
+
+
+
+ + ☆ Counterfactual Uncertainty Quantification of Factual Estimand of + Efficacy from Before-and-After Treatment Repeated Measures Randomized + Controlled Trials + + +
+ The ideal estimand for comparing a new treatment $Rx$ with a control $C$ is +the $\textit{counterfactual}$ efficacy $Rx:C$, the expected differential +outcome between $Rx$ and $C$ if each patient were given $\textit{both}$. While +counterfactual $\textit{point estimation}$ from $\textit{factual}$ Randomized +Controlled Trials (RCTs) has been available, this article shows +$\textit{counterfactual}$ uncertainty quantification (CUQ), quantifying +uncertainty for factual point estimates but in a counterfactual setting, is +surprisingly achievable. We achieve CUQ whose variability is typically smaller +than factual UQ, by creating a new statistical modeling principle called ETZ +which is applicable to RCTs with $\textit{Before-and-After}$ treatment Repeated +Measures, common in many therapeutic areas. + We urge caution when estimate of the unobservable true condition of a patient +before treatment has measurement error, because that violation of standard +regression assumption can cause attenuation in estimating treatment effects. +Fortunately, we prove that, for traditional medicine in general, and for +targeted therapy with efficacy defined as averaged over the population, +counterfactual point estimation is unbiased. However, for targeted therapy, +both Real Human and Digital Twins approaches should respect this limitation, +lest predicted treatment effect in $\textit{subgroups}$ will have bias. + +
+
+ comment: 39 pages, 7 figures +
+
+
+
+
+ + ☆ Local deployment of large-scale music AI models on commodity hardware + + +
+ We present the MIDInfinite, a web application capable of generating symbolic +music using a large-scale generative AI model locally on commodity hardware. +Creating this demo involved porting the Anticipatory Music Transformer, a large +language model (LLM) pre-trained on the Lakh MIDI dataset, to the Machine +Learning Compilation (MLC) framework. Once the model is ported, MLC facilitates +inference on a variety of runtimes including C++, mobile, and the browser. We +envision that MLC has the potential to bridge the gap between the landscape of +increasingly capable music AI models and technology more familiar to music +software developers. As a proof of concept, we build a web application that +allows users to generate endless streams of multi-instrumental MIDI in the +browser, either from scratch or conditioned on a prompt. On commodity hardware +(an M3 Macbook Pro), our demo can generate 51 notes per second, which is faster +than real-time playback for 72.9% of generations, and increases to 86.3% with 2 +seconds of upfront buffering. + +
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ MICCAI-CDMRI 2023 QuantConn Challenge Findings on Achieving Robust + Quantitative Connectivity through Harmonized Preprocessing of Diffusion MRI + + +
+ White matter alterations are increasingly implicated in neurological diseases +and their progression. International-scale studies use diffusion-weighted +magnetic resonance imaging (DW-MRI) to qualitatively identify changes in white +matter microstructure and connectivity. Yet, quantitative analysis of DW-MRI +data is hindered by inconsistencies stemming from varying acquisition +protocols. There is a pressing need to harmonize the preprocessing of DW-MRI +datasets to ensure the derivation of robust quantitative diffusion metrics +across acquisitions. In the MICCAI-CDMRI 2023 QuantConn challenge, participants +were provided raw data from the same individuals collected on the same scanner +but with two different acquisitions and tasked with preprocessing the DW-MRI to +minimize acquisition differences while retaining biological variation. +Submissions are evaluated on the reproducibility and comparability of +cross-acquisition bundle-wise microstructure measures, bundle shape features, +and connectomics. The key innovations of the QuantConn challenge are that (1) +we assess bundles and tractography in the context of harmonization for the +first time, (2) we assess connectomics in the context of harmonization for the +first time, and (3) we have 10x additional subjects over prior harmonization +challenge, MUSHAC and 100x over SuperMUDI. We find that bundle surface area, +fractional anisotropy, connectome assortativity, betweenness centrality, edge +count, modularity, nodal strength, and participation coefficient measures are +most biased by acquisition and that machine learning voxel-wise correction, +RISH mapping, and NeSH methods effectively reduce these biases. In addition, +microstructure measures AD, MD, RD, bundle length, connectome density, +efficiency, and path length are least biased by these acquisition differences. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024/019 +
+
+
+
+
+ + ☆ The Moral Foundations Weibo Corpus + + +
+ Moral sentiments expressed in natural language significantly influence both +online and offline environments, shaping behavioral styles and interaction +patterns, including social media self-presentation, cyberbullying, adherence to +social norms, and ethical decision-making. To effectively measure moral +sentiments in natural language processing texts, it is crucial to utilize +large, annotated datasets that provide nuanced understanding for accurate +analysis and model training. However, existing corpora, while valuable, often +face linguistic limitations. To address this gap in the Chinese language +domain, we introduce the Moral Foundations Weibo Corpus. This corpus consists of +25,671 Chinese comments on Weibo, encompassing six diverse topic areas. Each +comment is manually annotated by at least three systematically trained +annotators based on ten moral categories derived from a grounded theory of +morality. To assess annotator reliability, we present the kappa test results, a +gold standard for measuring consistency. Additionally, we apply several of the +latest large language models to supplement the manual annotations, conducting +analytical experiments to compare their performance and report baseline results +for moral sentiment classification. + +
+
+
+
+
+ + ☆ Latency Optimization in LEO Satellite Communications with Hybrid Beam + Pattern and Interference Control + + +
+ The rapid advancement of low Earth orbit (LEO) satellite communication +systems has significantly enhanced global connectivity, offering high-capacity, +low-latency services crucial for next-generation applications. However, the +dense configuration of LEO constellations poses challenges in resource +allocation optimization and interference management, complicating coexistence +with other communication systems. To address these limitations, this paper +proposes a novel framework for optimizing the beam scheduling and resource +allocation in multi-beam LEO systems. To satisfy the uneven terrestrial traffic +demand, a hybrid beam pattern is employed to enhance the downlink quality of +service and minimize the transmission latency from LEO satellites to ground +user terminals. Additionally, a dynamic co-channel interference (CCI) control +mechanism is developed to mitigate inter-beam interference within the LEO +constellation and limit cross-system interference affecting protected users +from other networks. The problem of user-beam-frequency allocation with power +optimization is formulated as a mixed-integer dynamic programming model and +solved using a low-complexity neural network-based graph generation algorithm. +Simulation results show that the proposed approach outperforms the baseline +methods of full frequency reuse and single-channel transmission, and highlights +the potential for further performance improvement with multi-user +transmissions. + +
+
+
+
+
+ + ☆ LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models + + +
+ This work explores expanding the capabilities of large language models (LLMs) +pretrained on text to generate 3D meshes within a unified model. This offers +key advantages of (1) leveraging spatial knowledge already embedded in LLMs, +derived from textual sources like 3D tutorials, and (2) enabling conversational +3D generation and mesh understanding. A primary challenge is effectively +tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. +To address this, we introduce LLaMA-Mesh, a novel approach that represents the +vertex coordinates and face definitions of 3D meshes as plain text, allowing +direct integration with LLMs without expanding the vocabulary. We construct a +supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate +3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs +as required, and (3) understand and interpret 3D meshes. Our work is the first +to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge +for 3D mesh generation in a text-based format, effectively unifying the 3D and +text modalities. LLaMA-Mesh achieves mesh generation quality on par with models +trained from scratch while maintaining strong text generation performance. + +
+
+ comment: See the project website at + https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/ +
+
+
+
+
+ + ☆ Expert Study on Interpretable Machine Learning Models with Missing Data ML4H + + +
+ Inherently interpretable machine learning (IML) models provide valuable +insights for clinical decision-making but face challenges when features have +missing values. Classical solutions like imputation or excluding incomplete +records are often unsuitable in applications where values are missing at test +time. In this work, we conducted a survey with 71 clinicians from 29 trauma +centers across France, including 20 complete responses to study the interaction +between medical professionals and IML applied to data with missing values. This +provided valuable insights into how missing data is interpreted in clinical +machine learning. We used the prediction of hemorrhagic shock as a concrete +example to gauge the willingness and readiness of the participants to adopt IML +models from three classes of methods. Our findings show that, while clinicians +value interpretability and are familiar with common IML methods, classical +imputation techniques often misalign with their intuition, and that models that +natively handle missing values are preferred. These results emphasize the need +to integrate clinical intuition into future IML models for better +human-computer interaction. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 13 pages +
+
+
+
+
+ + ☆ Adaptive Deviation Learning for Visual Anomaly Detection with Data + Contamination WACV 2025 + + +
+ Visual anomaly detection aims to detect images that notably differ from +normal patterns, and it has found extensive application in identifying defective +parts within the manufacturing industry. These anomaly detection paradigms +predominantly focus on training detection models using only clean, unlabeled +normal samples, assuming an absence of contamination, a condition often unmet +in real-world scenarios. The performance of these methods significantly depends +on the quality of the data and usually decreases when exposed to noise. We +introduce a systematic adaptive method that employs deviation learning to +compute anomaly scores end-to-end while addressing data contamination by +assigning relative importance to the weights of individual instances. In this +approach, the anomaly scores for normal instances are designed to approximate +scalar scores obtained from the known prior distribution. Meanwhile, anomaly +scores for anomaly examples are adjusted to exhibit statistically significant +deviations from these reference scores. Our approach incorporates a constrained +optimization problem within the deviation learning framework to update instance +weights, resolving this problem for each mini-batch. Comprehensive experiments +on the MVTec and VisA benchmark datasets indicate that our proposed method +surpasses competing techniques and exhibits both stability and robustness in +the presence of data contamination. + +
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV 2025) +
+
+
+
+
+ + ☆ Equation-informed data-driven identification of flow budgets and + dynamics + + +
+ Computational Fluid Dynamics (CFD) is an indispensable method of fluid +modelling in engineering applications, reducing the need for physical +prototypes and testing for tasks such as design optimisation and performance +analysis. Depending on the complexity of the system under consideration, models +ranging from low to high fidelity can be used for prediction, allowing +significant speed-up. However, the choice of model requires information about +the actual dynamics of the flow regime. Correctly identifying the +regions/clusters of flow that share the same dynamics has been a challenging +research topic to date. In this study, we propose a novel hybrid approach to +flow clustering. It consists of characterising each sample point of the system +with equation-based features, i.e. features are budgets that represent the +contribution of each term from the original governing equation to the local +dynamics at each sample point. This was achieved by applying the Sparse +Identification of Nonlinear Dynamical systems (SINDy) method pointwise to time +evolution data. The method proceeds with equation-based clustering using the +Girvan-Newman algorithm. This allows the detection of communities that share +the same physical dynamics. The algorithm is implemented in both Eulerian and +Lagrangian frameworks. In the Lagrangian, i.e. dynamic approach, the clustering +is performed on the trajectory of each point, allowing the change of clusters +to be represented also in time. The performance of the algorithm is first +tested on a flow around a cylinder. The construction of the dynamic clusters in +this test case clearly shows the evolution of the wake from the steady state +solution through the transient to the oscillatory solution. Dynamic clustering +was then successfully tested on turbulent flow data. Two distinct and +well-defined clusters were identified and their temporal evolution was +reconstructed. + +
+
+
+
+
+ + ☆ Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models + + +
+ Visual prompting (VP) is a new technique that adapts well-trained frozen +models for source domain tasks to target domain tasks. This study examines VP's +benefits for black-box model-level backdoor detection. The visual prompt in VP +maps class subspaces between source and target domains. We identify a +misalignment, termed class subspace inconsistency, between clean and poisoned +datasets. Based on this, we introduce \textsc{BProm}, a black-box model-level +detection method to identify backdoors in suspicious models, if any. +\textsc{BProm} leverages the low classification accuracy of prompted models +when backdoors are present. Extensive experiments confirm \textsc{BProm}'s +effectiveness. + +
+
+
+
+
+ + ☆ A Practical Guide to Fine-tuning Language Models with Limited Data + + +
+ Employing pre-trained Large Language Models (LLMs) has become the de facto +standard in Natural Language Processing (NLP) despite their extensive data +requirements. Motivated by the recent surge in research focused on training +LLMs with limited data, particularly in low-resource domains and languages, +this paper surveys recent transfer learning approaches to optimize model +performance in downstream tasks where data is scarce. We first address initial +and continued pre-training strategies to better leverage prior knowledge in +unseen domains and languages. We then examine how to maximize the utility of +limited data during fine-tuning and few-shot learning. The final section takes +a task-specific perspective, reviewing models and methods suited for different +levels of data scarcity. Our goal is to provide practitioners with practical +guidelines for overcoming the challenges posed by constrained data while also +highlighting promising directions for future research. + +
+
+
+
+
+ + ☆ Randomized Truthful Auctions with Learning Agents + + +
+ We study a setting where agents use no-regret learning algorithms to +participate in repeated auctions. \citet{kolumbus2022auctions} showed, rather +surprisingly, that when bidders participate in second-price auctions using +no-regret bidding algorithms, no matter how large the number of interactions +$T$ is, the runner-up bidder may not converge to bidding truthfully. Our first +result shows that this holds for \emph{general deterministic} truthful +auctions. We also show that the ratio of the learning rates of the bidders can +\emph{qualitatively} affect the convergence of the bidders. Next, we consider +the problem of revenue maximization in this environment. In the setting with +fully rational bidders, \citet{myerson1981optimal} showed that revenue can be +maximized by using a second-price auction with reserves. We show that, in stark +contrast, in our setting with learning bidders, \emph{randomized} auctions can +have strictly better revenue guarantees than second-price auctions with +reserves, when $T$ is large enough. Finally, we study revenue maximization in +the non-asymptotic regime. We define a notion of {\em auctioneer regret} +comparing the revenue generated to the revenue of a second price auction with +truthful bids. When the auctioneer has to use the same auction throughout the +interaction, we show an (almost) tight regret bound of $\smash{\widetilde +\Theta(T^{3/4})}.$ If the auctioneer can change auctions during the +interaction, but in a way that is oblivious to the bids, we show an (almost) +tight bound of $\smash{\widetilde \Theta(\sqrt{T})}.$ + +
+
+
+
+
+ + ☆ GAN-Based Architecture for Low-dose Computed Tomography Imaging + Denoising + + +
+ Generative Adversarial Networks (GANs) have surfaced as a revolutionary +element within the domain of low-dose computed tomography (LDCT) imaging, +providing an advanced resolution to the enduring issue of reconciling radiation +exposure with image quality. This comprehensive review synthesizes the rapid +advancements in GAN-based LDCT denoising techniques, examining the evolution +from foundational architectures to state-of-the-art models incorporating +advanced features such as anatomical priors, perceptual loss functions, and +innovative regularization strategies. We critically analyze various GAN +architectures, including conditional GANs (cGANs), CycleGANs, and +Super-Resolution GANs (SRGANs), elucidating their unique strengths and +limitations in the context of LDCT denoising. The evaluation provides both +qualitative and quantitative results related to the improvements in performance +in benchmark and clinical datasets with metrics such as PSNR, SSIM, and LPIPS. +After highlighting the positive results, we discuss some of the challenges +preventing a wider clinical use, including the interpretability of the images +generated by GANs, synthetic artifacts, and the need for clinically relevant +metrics. The review concludes by highlighting the essential significance of +GAN-based methodologies in the progression of precision medicine via tailored +LDCT denoising models, underlining the transformative possibilities presented +by artificial intelligence within contemporary radiological practice. + +
+
+
+
+
+ + ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details of one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage +fine-grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads to up to a 2x reduction in time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ☆ Golden Noise for Diffusion Models: A Learning Framework + + +
+ Text-to-image diffusion model is a popular paradigm that synthesizes +personalized images by providing a text prompt and a random Gaussian noise. +While people observe that some noises are ``golden noises'' that can achieve +better text-image alignment and higher human preference than others, we still +lack a machine learning framework to obtain those golden noises. To learn +golden noises for diffusion sampling, we mainly make three contributions in +this paper. First, we identify a new concept termed the \textit{noise prompt}, +which aims at turning a random Gaussian noise into a golden noise by adding a +small desirable perturbation derived from the text prompt. Following the +concept, we first formulate the \textit{noise prompt learning} framework that +systematically learns ``prompted'' golden noise associated with a text prompt +for diffusion models. Second, we design a noise prompt data collection pipeline +and collect a large-scale \textit{noise prompt dataset}~(NPD) that contains +100k pairs of random noises and golden noises with the associated text prompts. +With the prepared NPD as the training dataset, we trained a small \textit{noise +prompt network}~(NPNet) that can directly learn to transform a random noise +into a golden noise. The learned golden noise perturbation can be considered as +a kind of prompt for noise, as it is rich in semantic information and tailored +to the given text prompt. Third, our extensive experiments demonstrate the +impressive effectiveness and generalization of NPNet on improving the quality +of synthesized images across various diffusion models, including SDXL, +DreamShaper-xl-v2-turbo, and Hunyuan-DiT. Moreover, NPNet is a small and +efficient controller that acts as a plug-and-play module with very limited +additional inference and computational costs, as it just provides a golden +noise instead of a random noise without accessing the original pipeline. + +
+
+
+
+
+ + ☆ Developement of Reinforcement Learning based Optimisation Method for + Side-Sill Design + + +
+ Optimisation for crashworthiness is a critical part of the vehicle +development process. Due to stringent regulations and increasing market +demands, multiple factors must be considered within a limited timeframe. +However, for optimal crashworthiness design, multi-objective optimisation is +necessary, and for complex parts, multiple design parameters must be evaluated. +This crashworthiness analysis requires computationally intensive finite element +simulations. This challenge leads to the need for +multi-parameter, multi-objective inverse optimisation. This article +investigates a machine learning-based method for this type of optimisation, +focusing on the design optimisation of a multi-cell side sill to improve +crashworthiness results. Furthermore, the optimiser is coupled with an FE +solver to achieve improved results. + +
+
+
+
+
+ + ☆ Sparse Bayesian Generative Modeling for Compressive Sensing + + +
+ This work addresses the fundamental linear inverse problem in compressive +sensing (CS) by introducing a new type of regularizing generative prior. Our +proposed method utilizes ideas from classical dictionary-based CS and, in +particular, sparse Bayesian learning (SBL), to integrate a strong +regularization towards sparse solutions. At the same time, by leveraging the +notion of conditional Gaussianity, it also incorporates the adaptability from +generative models to training data. However, unlike most state-of-the-art +generative models, it is able to learn from a few compressed and noisy data +samples and requires no optimization algorithm for solving the inverse problem. +Additionally, similar to Dirichlet prior networks, our model parameterizes a +conjugate prior enabling its application for uncertainty quantification. We +support our approach theoretically through the concept of variational inference +and validate it empirically using different types of compressible signals. + +
+
+
+
+
+ + ☆ What makes a good BIM design: quantitative linking between design + behavior and quality + + +
+ In the Architecture Engineering & Construction (AEC) industry, how design +behaviors impact design quality remains unclear. This study proposes a novel +approach, which, for the first time, identifies and quantitatively describes +the relationship between design behaviors and quality of design based on +Building Information Modeling (BIM). Real-time collection and log mining are +integrated to collect raw data of design behaviors. Feature engineering and +various machine learning models are then utilized for quantitative modeling and +interpretation. Results confirm an existing quantifiable relationship which can +be learned by various models. The best-performing model using Extremely Random +Trees achieved an R2 value of 0.88 on the test set. Behavioral features related +to designer's skill level and changes of design intentions are identified to +have significant impacts on design quality. These findings deepen our +understanding of the design process and help forming BIM designs with better +quality. + +
+
+
+
+
+ + ☆ Graph Neural Networks and Differential Equations: A hybrid approach for + data assimilation of fluid flows + + +
+ This study presents a novel hybrid approach that combines Graph Neural +Networks (GNNs) with Reynolds-Averaged Navier Stokes (RANS) equations to +enhance the accuracy of mean flow reconstruction across a range of fluid +dynamics applications. Traditional purely data-driven Neural Network (NN) +models often struggle to maintain physical consistency. Moreover, they +typically require large datasets to achieve reliable performance. The GNN +framework, which naturally handles unstructured data such as complex geometries +in Computational Fluid Dynamics (CFD), is here integrated with RANS equations +as a physical baseline model. The methodology leverages the adjoint method, +enabling the use of RANS-derived gradients as optimization terms in the GNN +training process. This ensures that the learned model adheres to the governing +physics, maintaining physical consistency while improving the prediction +accuracy. We test our approach on multiple CFD scenarios, including cases +involving generalization with respect to the Reynolds number, sparse +measurements, denoising and inpainting of missing portions of the mean flow. +The results demonstrate significant improvements in the accuracy of the +reconstructed mean flow compared to purely data-driven models, using limited +amounts of data in the training dataset. The key strengths of this study are +the integration of physical laws into the training process of the GNN, and the +ability to achieve high-accuracy predictions with a limited amount of data, +making this approach particularly valuable for applications in fluid dynamics +where data is often scarce. + +
+
+
+
+
+ + ☆ ResidualDroppath: Enhancing Feature Reuse over Residual Connections + + +
+ Residual connections are one of the most important components in neural +network architectures for mitigating the vanishing gradient problem and +facilitating the training of much deeper networks. One possible explanation for +how residual connections aid deeper network training is by promoting feature +reuse. However, we identify and analyze the limitations of feature reuse with +vanilla residual connections. To address these limitations, we propose +modifications in training methods. Specifically, we provide an additional +opportunity for the model to learn feature reuse with residual connections +through two types of iterations during training. The first type of iteration +involves using droppath, which enforces feature reuse by randomly dropping a +subset of layers. The second type of iteration focuses on training the dropped +parts of the model while freezing the undropped parts. As a result, the dropped +parts learn in a way that encourages feature reuse, as the model relies on the +undropped parts with feature reuse in mind. Overall, we demonstrated +performance improvements in models with residual connections for image +classification in certain cases. + +
+
+
+
+
+ + ☆ Renal Cell Carcinoma subtyping: learning from multi-resolution + localization + + +
+ Renal Cell Carcinoma is typically asymptomatic at the early stages for many +patients. This leads to a late diagnosis of the tumor, where the curability +likelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high, +with respect to its incidence rate. To increase the survival chance, a fast and +correct categorization of the tumor subtype is paramount. Nowadays, +computerized methods, based on artificial intelligence, represent an +interesting opportunity to improve the productivity and the objectivity of the +microscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their +exploitation is hampered by the paucity of annotated datasets, essential for a +proficient training of supervised machine learning technologies. This study +sets out to investigate a novel self-supervised training strategy for machine +learning diagnostic tools, based on the multi-resolution nature of the +histological samples. We aim to reduce the need for annotated datasets, without +significantly reducing the accuracy of the tool. We demonstrate the +classification capability of our tool on a whole slide imaging dataset for +Renal Cancer subtyping, and we compare our solution with several +state-of-the-art classification counterparts. + +
+
+
+
+
+ + ☆ Harnessing Machine Learning for Single-Shot Measurement of Free Electron + Laser Pulse Power NeurIPS 2024 + + +
+ Electron beam accelerators are essential in many scientific and technological +fields. Their operation relies heavily on the stability and precision of the +electron beam. Traditional diagnostic techniques encounter difficulties in +addressing the complex and dynamic nature of electron beams. Particularly in +the context of free-electron lasers (FELs), it is fundamentally impossible to +measure the lasing-on and lasing-off electron power profiles for a single +electron bunch. This is a crucial hurdle in the exact reconstruction of the +photon pulse profile. To overcome this hurdle, we developed a machine learning +model that predicts the temporal power profile of the electron bunch in the +lasing-off regime using machine parameters that can be obtained when lasing is +on. The model was statistically validated and showed superior predictions +compared to the state-of-the-art batch calibrations. The work we present here +is a critical element for a virtual pulse reconstruction diagnostic (VPRD) tool +designed to reconstruct the power profile of individual photon pulses without +requiring repeated measurements in the lasing-off regime. This promises to +significantly enhance the diagnostic capabilities in FELs at large. + +
+
+ comment: 10 pages, 4 figures, Machine Learning and the Physical Sciences + Workshop, NeurIPS 2024 https://neurips.cc/virtual/2024/100009 +
+
+
+
+
+ + ☆ Caravan MultiMet: Extending Caravan with Multiple Weather Nowcasts and + Forecasts + + +
+ The Caravan large-sample hydrology dataset (Kratzert et al., 2023) was +created to standardize and harmonize streamflow data from various regional +datasets, combined with globally available meteorological forcing and catchment +attributes. This community-driven project also allows researchers to +conveniently extend the dataset for additional basins, as done 6 times to date +(see https://github.com/kratzert/Caravan/discussions/10). We present a novel +extension to Caravan, focusing on enriching the meteorological forcing data. +Our extension adds three precipitation nowcast products (CPC, IMERG v07 Early, +and CHIRPS) and three weather forecast products (ECMWF IFS HRES, GraphCast, and +CHIRPS-GEFS) to the existing ERA5-Land reanalysis data. The inclusion of +diverse data sources, particularly weather forecasts, enables more robust +evaluation and benchmarking of hydrological models, especially for real-time +forecasting scenarios. To the best of our knowledge, this extension makes +Caravan the first large-sample hydrology dataset to incorporate weather +forecast data, significantly enhancing its capabilities and fostering +advancements in hydrological research, benchmarking, and real-time hydrologic +forecasting. The data is publicly available under a CC-BY-4.0 license on Zenodo +in two parts (https://zenodo.org/records/14161235, +https://zenodo.org/records/14161281) and on Google Cloud Platform (GCP) - see +more under the Data Availability chapter. + +
+
+
+
+
+ + ☆ Long-Tailed Object Detection Pre-training: Dynamic Rebalancing + Contrastive Learning with Dual Reconstruction NeurIPS 2024 + + +
+ Pre-training plays a vital role in various vision tasks, such as object +recognition and detection. Commonly used pre-training methods, which typically +rely on randomized approaches like uniform or Gaussian distributions to +initialize model parameters, often fall short when confronted with long-tailed +distributions, especially in detection tasks. This is largely due to extreme +data imbalance and the issue of simplicity bias. In this paper, we introduce a +novel pre-training framework for object detection, called Dynamic Rebalancing +Contrastive Learning with Dual Reconstruction (2DRCL). Our method builds on a +Holistic-Local Contrastive Learning mechanism, which aligns pre-training with +object detection by capturing both global contextual semantics and detailed +local patterns. To tackle the imbalance inherent in long-tailed data, we design +a dynamic rebalancing strategy that adjusts the sampling of underrepresented +instances throughout the pre-training process, ensuring better representation +of tail classes. Moreover, Dual Reconstruction addresses simplicity bias by +enforcing a reconstruction task aligned with the self-consistency principle, +specifically benefiting underrepresented tail classes. Experiments on COCO and +LVIS v1.0 datasets demonstrate the effectiveness of our method, particularly in +improving the mAP/AP scores for tail classes. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous + Vehicle Testing + + +
+ Generating realistic and diverse road scenarios is essential for autonomous +vehicle testing and validation. Nevertheless, owing to the complexity and +variability of real-world road environments, creating authentic and varied +scenarios for intelligent driving testing is challenging. In this paper, we +propose DiffRoad, a novel diffusion model designed to produce controllable and +high-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities +of diffusion models to synthesize road layouts from white noise through an +inverse denoising process, preserving real-world spatial features. To enhance +the quality of generated scenarios, we design the Road-UNet architecture, +optimizing the balance between backbone and skip connections for high-realism +scenario generation. Furthermore, we introduce a road scenario evaluation +module that screens adequate and reasonable scenarios for intelligent driving +testing using two critical metrics: road continuity and road reasonableness. +Experimental results on multiple real-world datasets demonstrate DiffRoad's +ability to generate realistic and smooth road structures while maintaining the +original distribution. Additionally, the generated scenarios can be fully +automated into the OpenDRIVE format, facilitating generalized autonomous +vehicle simulation testing. DiffRoad provides a rich and diverse scenario +library for large-scale autonomous vehicle testing and offers valuable insights +for future infrastructure designs that are better suited for autonomous +vehicles. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Learning efficient and provably convergent splitting methods + + +
+ Splitting methods are widely used for solving initial value problems (IVPs) +due to their ability to simplify complicated evolutions into more manageable +subproblems which can be solved efficiently and accurately. Traditionally, +these methods are derived using analytic and algebraic techniques from +numerical analysis, including truncated Taylor series and their Lie algebraic +analogue, the Baker--Campbell--Hausdorff formula. These tools enable the +development of high-order numerical methods that provide exceptional accuracy +for small timesteps. Moreover, these methods often (nearly) conserve important +physical invariants, such as mass, unitarity, and energy. However, in many +practical applications the computational resources are limited. Thus, it is +crucial to identify methods that achieve the best accuracy within a fixed +computational budget, which might require taking relatively large timesteps. In +this regime, high-order methods derived with traditional methods often exhibit +large errors since they are only designed to be asymptotically optimal. Machine +Learning techniques offer a potential solution since they can be trained to +efficiently solve a given IVP with less computational resources. However, they +are often purely data-driven, come with limited convergence guarantees in the +small-timestep regime and do not necessarily conserve physical invariants. In +this work, we propose a framework for finding machine learned splitting methods +that are computationally efficient for large timesteps and have provable +convergence and conservation guarantees in the small-timestep limit. We +demonstrate numerically that the learned methods, which by construction +converge quadratically in the timestep size, can be significantly more +efficient than established methods for the Schr\"{o}dinger equation if the +computational budget is limited. + +
+
+
+
+
+ + ☆ SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph + Attention for Vision Transformers + + +
+ Image classification is a computer vision task where a model analyzes an +image to categorize it into a specific label. Vision Transformers (ViT) improve +this task by leveraging self-attention to capture complex patterns and long +range relationships between image patches. However, a key challenge for ViTs is +efficiently incorporating multiscale feature representations, which is inherent +in CNNs through their hierarchical structure. In this paper, we introduce the +Scale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework +that addresses this challenge by integrating multi-scale features. Using +EfficientNet as a backbone, the model extracts multi-scale feature maps, which +are divided into patches to preserve semantic information. These patches are +organized into a graph based on spatial and feature similarities, with a Graph +Attention Network (GAT) refining the node embeddings. Finally, a Transformer +encoder captures long-range dependencies and complex interactions. The SAG-ViT +is evaluated on benchmark datasets, demonstrating its effectiveness in +enhancing image classification performance. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Inherently Interpretable and Uncertainty-Aware Models for Online + Learning in Cyber-Security Problems + + +
+ In this paper, we address the critical need for interpretable and +uncertainty-aware machine learning models in the context of online learning for +high-risk industries, particularly cyber-security. While deep learning and +other complex models have demonstrated impressive predictive capabilities, +their opacity and lack of uncertainty quantification present significant +questions about their trustworthiness. We propose a novel pipeline for online +supervised learning problems in cyber-security, that harnesses the inherent +interpretability and uncertainty awareness of Additive Gaussian Processes +(AGPs) models. Our approach aims to balance predictive performance with +transparency while improving the scalability of AGPs, which represents their +main drawback, potentially enabling security analysts to better validate threat +detection, troubleshoot and reduce false positives, and generally make +trustworthy, informed decisions. This work contributes to the growing field of +interpretable AI by proposing a class of models that can be significantly +beneficial for high-stake decision problems such as the ones typical of the +cyber-security domain. The source code is available. + +
+
+
+
+
+ + ☆ Less is More: Unseen Domain Fake News Detection via Causal Propagation + Substructures + + +
+ The spread of fake news on social media poses significant threats to +individuals and society. Text-based and graph-based models have been employed +for fake news detection by analysing news content and propagation networks, +showing promising results in specific scenarios. However, these data-driven +models heavily rely on pre-existing in-distribution data for training, limiting +their performance when confronted with fake news from emerging or previously +unseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news +is a challenging yet critical task. In this paper, we introduce the Causal +Subgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to +enhance zero-shot fake news detection by extracting causal substructures from +propagation graphs using in-distribution data and generalising this approach to +OOD data. The model employs a graph neural network based mask generation +process to identify dominant nodes and edges within the propagation graph, +using these substructures for fake news detection. Additionally, the +performance of CSDA is further improved through contrastive learning in +few-shot scenarios, where a limited amount of OOD data is available for +training. Extensive experiments on public social media datasets demonstrate +that CSDA effectively handles OOD fake news detection, achieving a 7 to 16 +percent accuracy improvement over other state-of-the-art models. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ A survey of probabilistic generative frameworks for molecular + simulations + + +
+ Generative artificial intelligence is now a widely used tool in molecular +science. Despite the popularity of probabilistic generative models, numerical +experiments benchmarking their performance on molecular data are lacking. In +this work, we introduce and explain several classes of generative models, +broadly sorted into two categories: flow-based models and diffusion models. We +select three representative models: Neural Spline Flows, Conditional Flow +Matching, and Denoising Diffusion Probabilistic Models, and examine their +accuracy, computational cost, and generation speed across datasets with tunable +dimensionality, complexity, and modal asymmetry. Our findings are varied, with +no one framework being the best for all purposes. In a nutshell, (i) Neural +Spline Flows do best at capturing mode asymmetry present in low-dimensional +data, (ii) Conditional Flow Matching outperforms other models for +high-dimensional data with low complexity, and (iii) Denoising Diffusion +Probabilistic Models appear the best for low-dimensional data with high +complexity. Our datasets include a Gaussian mixture model and the dihedral +torsion angle distribution of the Aib\textsubscript{9} peptide, generated via a +molecular dynamics simulation. We hope our taxonomy of probabilistic generative +frameworks and numerical results may guide model selection for a wide range of +molecular tasks. + +
+
+
+
+
+ + ☆ Are nuclear masks all you need for improved out-of-domain + generalisation? A closer look at cancer classification in histopathology NeurIPS 2024 + + +
+ Domain generalisation in computational histopathology is challenging because +the images are substantially affected by differences among hospitals due to +factors like fixation and staining of tissue and imaging equipment. We +hypothesise that focusing on nuclei can improve the out-of-domain (OOD) +generalisation in cancer detection. We propose a simple approach to improve OOD +generalisation for cancer detection by focusing on nuclear morphology and +organisation, as these are domain-invariant features critical in cancer +detection. Our approach integrates original images with nuclear segmentation +masks during training, encouraging the model to prioritise nuclei and their +spatial arrangement. Going beyond mere data augmentation, we introduce a +regularisation technique that aligns the representations of masks and original +images. We show, using multiple datasets, that our method improves OOD +generalisation and also leads to increased robustness to image corruptions and +adversarial attacks. The source code is available at +https://github.com/undercutspiky/SFL/ + +
+
+ comment: Poster at NeurIPS 2024 +
+
+
+
+
+ + ☆ Stability and Generalization for Distributed SGDA + + +
+ Minimax optimization is gaining increasing attention in modern machine +learning applications. Driven by large-scale models and massive volumes of data +collected from edge devices, as well as the concern to preserve client privacy, +communication-efficient distributed minimax optimization algorithms become +popular, such as Local Stochastic Gradient Descent Ascent (Local-SGDA), and +Local Decentralized SGDA (Local-DSGDA). While most existing research on +distributed minimax algorithms focuses on convergence rates, computation +complexity, and communication efficiency, the generalization performance +remains underdeveloped, whereas generalization ability is a pivotal indicator +for evaluating the holistic performance of a model when fed with unknown data. +In this paper, we propose the stability-based generalization analytical +framework for Distributed-SGDA, which unifies two popular distributed minimax +algorithms including Local-SGDA and Local-DSGDA, and conduct a comprehensive +analysis of stability error, generalization gap, and population risk across +different metrics under various settings, e.g., (S)C-(S)C, PL-SC, and NC-NC +cases. Our theoretical results reveal the trade-off between the generalization +gap and optimization error and suggest hyperparameters choice to obtain the +optimal population risk. Numerical experiments for Local-SGDA and Local-DSGDA +validate the theoretical results. + +
+
+
+
+
+ + ☆ Time-to-Event Pretraining for 3D Medical Imaging + + +
+ With the rise of medical foundation models and the growing availability of +imaging data, scalable pretraining techniques offer a promising way to identify +imaging biomarkers predictive of future disease risk. While current +self-supervised methods for 3D medical imaging models capture local structural +features like organ morphology, they fail to link pixel biomarkers with +long-term health outcomes due to a missing context problem. Current approaches +lack the temporal context necessary to identify biomarkers correlated with +disease progression, as they rely on supervision derived only from images and +concurrent text descriptions. To address this, we introduce time-to-event +pretraining, a pretraining framework for 3D medical imaging models that +leverages large-scale temporal supervision from paired, longitudinal electronic +health records (EHRs). Using a dataset of 18,945 CT scans (4.2 million 2D +images) and time-to-event distributions across thousands of EHR-derived tasks, +our method improves outcome prediction, achieving an average AUROC increase of +23.7% and a 29.4% gain in Harrell's C-index across 8 benchmark tasks. +Importantly, these gains are achieved without sacrificing diagnostic +classification performance. This study lays the foundation for integrating +longitudinal EHR and 3D imaging data to advance clinical risk prediction. + +
+
+ comment: 34 pages, 19 figures +
+
+
+
+
+ + ☆ Approximated Variational Bayesian Inverse Reinforcement Learning for + Large Language Model Alignment + + +
+ The alignment of large language models (LLMs) is crucial for generating +helpful and harmless content. Existing approaches leverage preference-based +human feedback data to learn the reward function and align the LLM with the +feedback data. However, these approaches focus on modeling the reward +difference between the chosen and rejected demonstrations, rather than directly +modeling the true reward from each demonstration. Moreover, these approaches +assume that the reward is only obtained at the end of the sentence, which +overlooks the modeling of intermediate rewards. These issues lead to +insufficient use of training signals in the feedback data, limiting the +representation and generalization ability of the reward and potentially +resulting in reward hacking. In this paper, we formulate LLM alignment as a +Bayesian Inverse Reinforcement Learning (BIRL) problem and propose a novel +training objective, Approximated Variational Alignment (AVA), to perform LLM +alignment through Approximated Variational Reward Imitation Learning (AVRIL). +The BIRL formulation facilitates intermediate reward modeling and direct reward +modeling on each single demonstration, which enhances the utilization of +training signals in the feedback data. Experiments show that AVA outperforms +existing LLM alignment approaches in reward modeling, RL fine-tuning, and +direct optimization. + +
+
+
+
+
+ + ☆ Improving hp-Variational Physics-Informed Neural Networks for + Steady-State Convection-Dominated Problems + + +
+ This paper proposes and studies two extensions of applying hp-variational +physics-informed neural networks, more precisely the FastVPINNs framework, to +convection-dominated convection-diffusion-reaction problems. First, a term in +the spirit of a SUPG stabilization is included in the loss functional and a +network architecture is proposed that predicts spatially varying stabilization +parameters. Having observed that the selection of the indicator function in +hard-constrained Dirichlet boundary conditions has a big impact on the accuracy +of the computed solutions, the second novelty is the proposal of a network +architecture that learns good parameters for a class of indicator functions. +Numerical studies show that both proposals lead to noticeably more accurate +results than approaches that can be found in the literature. + +
+
+ comment: 25 pages, 11 figures, 8 tables +
+
+
+
+
+ + ☆ Pie: Pooling CPU Memory for LLM Inference + + +
+ The rapid growth of LLMs has revolutionized natural language processing and +AI analysis, but their increasing size and memory demands present significant +challenges. A common solution is to spill over to CPU memory; however, +traditional GPU-CPU memory swapping often results in higher latency and lower +throughput. + This paper introduces Pie, an LLM inference framework that addresses these +challenges with performance-transparent swapping and adaptive expansion. By +leveraging predictable memory access patterns and the high bandwidth of modern +hardware like the NVIDIA GH200 Grace Hopper Superchip, Pie enables concurrent +data swapping without affecting foreground computation, expanding effective +memory without added latency. Adaptive expansion dynamically adjusts CPU memory +allocation based on real-time information, optimizing memory usage and +performance under varying conditions. + Pie maintains low computation latency, high throughput, and high elasticity. +Our experimental evaluation demonstrates that Pie achieves optimal swapping +policy during cache warmup and effectively balances increased memory capacity +with negligible impact on computation. With its extended capacity, Pie +outperforms vLLM by up to 1.9X in throughput and 2X in latency. Additionally, +Pie can reduce GPU memory usage by up to 1.67X while maintaining the same +performance. Compared to FlexGen, an offline profiling-based swapping solution, +Pie achieves magnitudes lower latency and 9.4X higher throughput. + +
+
+
+
+
+ + ☆ Approximate Probabilistic Inference forTime-Series Data A Robust Latent + Gaussian Model With Temporal Awareness + + +
+ The development of robust generative models for highly varied non-stationary +time series data is a complex yet important problem. Traditional models for +time series data prediction, such as Long Short-Term Memory (LSTM), are +inefficient and generalize poorly as they cannot capture complex temporal +relationships. In this paper, we present a probabilistic generative model that +can be trained to capture temporal information, and that is robust to data +errors. We call it Time Deep Latent Gaussian Model (tDLGM). Its novel +architecture is inspired by Deep Latent Gaussian Model (DLGM). Our model is +trained to minimize a loss function based on the negative log loss. One +contributing factor to Time Deep Latent Gaussian Model (tDLGM) robustness is +our regularizer, which accounts for data trends. Experiments conducted show +that tDLGM is able to reconstruct and generate complex time series data, and +that it is robust against noise and faulty data. + +
+
+
+
+
+ + ☆ Compression Method for Solar Polarization Spectra Collected from Hinode + SOT/SP Observations + + +
+ The complex structure and extensive details of solar spectral data, combined +with a recent surge in volume, present significant processing challenges. To +address this, we propose a deep learning-based compression technique using deep +autoencoder (DAE) and 1D-convolutional autoencoder (CAE) models developed with +Hinode SOT/SP data. We focused on compressing Stokes I and V polarization +spectra from the quiet Sun, as well as from active regions, providing a novel +insight into comprehensive spectral analysis by incorporating spectra from +extreme magnetic fields. The results indicate that the CAE model outperforms +the DAE model in reconstructing Stokes profiles, demonstrating greater +robustness and achieving reconstruction errors around the observational noise +level. The proposed method has proven effective in compressing Stokes I and V +spectra from both the quiet Sun and active regions, highlighting its potential +for impactful applications in solar spectral analysis, such as detection of +unusual spectral signals. + +
+
+
+
+
+ + ☆ Enhancing generalization in high energy physics using white-box + adversarial attacks + + +
+ Machine learning is becoming increasingly popular in the context of particle +physics. Supervised learning, which uses labeled Monte Carlo (MC) simulations, +remains one of the most widely used methods for discriminating signals beyond +the Standard Model. However, this paper suggests that supervised models may +depend excessively on artifacts and approximations from Monte Carlo +simulations, potentially limiting their ability to generalize well to real +data. This study aims to enhance the generalization properties of supervised +models by reducing the sharpness of local minima. It reviews the application of +four distinct white-box adversarial attacks in the context of classifying Higgs +boson decay signals. The attacks are divided into weight space attacks, and +feature space attacks. To study and quantify the sharpness of different local +minima this paper presents two analysis methods: gradient ascent and reduced +Hessian eigenvalue analysis. The results show that white-box adversarial +attacks significantly improve generalization performance, albeit with increased +computational complexity. + +
+
+ comment: 10 pages, 4 figures, 8 tables, 3 algorithms, to be published in + Physical Review D (PRD), presented at the ML4Jets 2024 conference +
+
+
+
+
+ + ☆ A Centralized-Distributed Transfer Model for Cross-Domain Recommendation + Based on Multi-Source Heterogeneous Transfer Learning + + +
+ Cross-domain recommendation (CDR) methods are proposed to tackle the sparsity +problem in click through rate (CTR) estimation. Existing CDR methods directly +transfer knowledge from the source domains to the target domain and ignore the +heterogeneities among domains, including feature dimensional heterogeneity and +latent space heterogeneity, which may lead to negative transfer. Besides, most +of the existing methods are based on single-source transfer, which cannot +simultaneously utilize knowledge from multiple source domains to further +improve the model performance in the target domain. In this paper, we propose a +centralized-distributed transfer model (CDTM) for CDR based on multi-source +heterogeneous transfer learning. To address the issue of feature dimension +heterogeneity, we build a dual embedding structure: domain specific embedding +(DSE) and global shared embedding (GSE) to model the feature representation in +the single domain and the commonalities in the global space, separately. To +solve the latent space heterogeneity, the transfer matrix and attention +mechanism are used to map and combine DSE and GSE adaptively. Extensive offline +and online experiments demonstrate the effectiveness of our model. + +
+
+ comment: Published in: 2022 IEEE International Conference on Data Mining + (ICDM) (The authors were affiliated with Hangzhou NetEase Cloud Music Technology + Co., Ltd.) +
+
+
+
+
+ + ☆ Towards efficient compression and communication for prototype-based + decentralized learning + + +
+ In prototype-based federated learning, the exchange of model parameters +between clients and the master server is replaced by transmission of prototypes +or quantized versions of the data samples to the aggregation server. A fully +decentralized deployment of prototype-based learning, without a central +aggregator of prototypes, is more robust upon network failures and reacts +faster to changes in the statistical distribution of the data, suggesting +potential advantages and quick adaptation in dynamic learning tasks, e.g., when +the data sources are IoT devices or when data is non-iid. In this paper, we +consider the problem of designing a communication-efficient decentralized +learning system based on prototypes. We address the challenge of prototype +redundancy by leveraging on a twofold data compression technique, i.e., sending +only update messages if the prototypes are information-theoretically useful (via +the Jensen-Shannon distance), and using clustering on the prototypes to +compress the update messages used in the gossip protocol. We also use parallel +instead of sequential gossiping, and present an analysis of its +age-of-information (AoI). Our experimental results show that, with these +improvements, the communications load can be substantially reduced without +decreasing the convergence rate of the learning algorithm. + +
+
+ comment: 15 pages, 2 tables, 7 figures, 6 algorithms +
+
+
+
+
+ + ☆ How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative + Study of ChatGPT, AI Models and Human Perception + + +
+ Multimodal deepfakes involving audiovisual manipulations are a growing threat +because they are difficult to detect with the naked eye or using unimodal deep +learning-based forgery detection methods. Audiovisual forensic models, while +more capable than unimodal models, require large training datasets and are +computationally expensive for training and inference. Furthermore, these models +lack interpretability and often do not generalize well to unseen manipulations. +In this study, we examine the detection capabilities of a large language model +(LLM) (i.e., ChatGPT) to identify and account for any possible visual and +auditory artifacts and manipulations in audiovisual deepfake content. Extensive +experiments are conducted on videos from a benchmark multimodal deepfake +dataset to evaluate the detection performance of ChatGPT and compare it with +the detection capabilities of state-of-the-art multimodal forensic models and +humans. Experimental results demonstrate the importance of domain knowledge and +prompt engineering for video forgery detection tasks using LLMs. Unlike +approaches based on end-to-end learning, ChatGPT can account for spatial and +spatiotemporal artifacts and inconsistencies that may exist within or across +modalities. Additionally, we discuss the limitations of ChatGPT for multimedia +forensic tasks. + +
+
+
+
+
+ + ☆ Rethinking Weight-Averaged Model-merging + + +
+ Weight-averaged model-merging has emerged as a powerful approach in deep +learning, capable of enhancing model performance without fine-tuning or +retraining. However, the underlying mechanisms that explain its effectiveness +remain largely unexplored. In this paper, we investigate this technique from +three novel perspectives to provide deeper insights into how and why +weight-averaged model-merging works: (1) we examine the intrinsic patterns +captured by the learning of the model weights, through the visualizations of +their patterns on several datasets, showing that these weights often encode +structured and interpretable patterns; (2) we investigate model ensemble +merging strategies based on averaging on weights versus averaging on features, +providing detailed analyses across diverse architectures and datasets; and (3) +we explore the impact on model-merging prediction stability in terms of +changing the parameter magnitude, revealing insights into the way of weight +averaging works as regularization by showing the robustness across different +parameter scales. Our findings shed light on the "black box" of weight-averaged +model-merging, offering valuable insights and practical recommendations that +advance the model-merging process. + +
+
+
+
+
+ + ☆ FluidML: Fast and Memory Efficient Inference Optimization + + +
+ Machine learning models deployed on edge devices have enabled numerous +exciting new applications, such as humanoid robots, AR glasses, and autonomous +vehicles. However, the computing resources available on these edge devices are +not catching up with the ever-growing number of parameters in these models. As +the models become bigger and more complicated, the novel yet sophisticated +structure challenges the inference runtime optimization. We present FluidML, a +generic runtime memory management and optimization framework that can flexibly +transform the model execution blueprint to achieve faster and more +memory-efficient inference. Evaluations across different platforms show that +FluidML can consistently reduce the end-to-end inference latency by up to +25.38% for popular language models and reduce peak memory usage by up to +41.47%, compared to state-of-the-art approaches. FluidML consists of ~30K lines of +code, built for general-purpose usage, and will be released as an open-source +inference runtime optimization framework to the community. + +
+
+
+
+
+ + ☆ Rethinking the "Heatmap + Monte Carlo Tree Search" Paradigm for Solving + Large Scale TSP + + +
+ The Travelling Salesman Problem (TSP) remains a fundamental challenge in +combinatorial optimization, inspiring diverse algorithmic strategies. This +paper revisits the "heatmap + Monte Carlo Tree Search (MCTS)" paradigm that has +recently gained traction for learning-based TSP solutions. Within this +framework, heatmaps encode the likelihood of edges forming part of the optimal +tour, and MCTS refines this probabilistic guidance to discover optimal +solutions. Contemporary approaches have predominantly emphasized the refinement +of heatmap generation through sophisticated learning models, inadvertently +sidelining the critical role of MCTS. Our extensive empirical analysis reveals +two pivotal insights: 1) The configuration of MCTS strategies profoundly +influences the solution quality, demanding meticulous tuning to leverage their +full potential; 2) Our findings demonstrate that a rudimentary and +parameter-free heatmap, derived from the intrinsic $k$-nearest nature of TSP, +can rival or even surpass the performance of complicated heatmaps, with strong +generalizability across various scales. Empirical evaluations across various +TSP scales underscore the efficacy of our approach, achieving competitive +results. These observations challenge the prevailing focus on heatmap +sophistication, advocating a reevaluation of the paradigm to harness both +components synergistically. Our code is available at: +https://github.com/LOGO-CUHKSZ/rethink_mcts_tsp. + +
+
+
+
+
+ + ☆ Classical Verification of Quantum Learning Advantages with Noises + + +
+ Classical verification of quantum learning allows classical clients to +reliably leverage quantum computing advantages by interacting with untrusted +quantum servers. Yet, current quantum devices available in practice suffer +from a variety of noises and whether existing classical verification protocols +carry over to noisy scenarios remains unclear. Here, we propose an efficient +classical error rectification algorithm to reconstruct the noise-free results +given by the quantum Fourier sampling circuit with practical constant-level +noises. In particular, we prove that the error rectification algorithm can +restore the heavy Fourier coefficients by using a small number of noisy samples +that scales logarithmically with the problem size. We apply this algorithm to +the agnostic parity learning task with uniform input marginal and prove that +this task can be accomplished in an efficient way on noisy quantum devices with +our algorithm. In addition, we prove that a classical client with access to the +random example oracle can verify the agnostic parity learning results from the +noisy quantum prover in an efficient way, under the condition that the Fourier +coefficients are sparse. Our results demonstrate the feasibility of classical +verification of quantum learning advantages with noises, which provide a +valuable guide for both theoretical studies and practical applications with +current noisy intermediate scale quantum devices. + +
+
+ comment: 13 pages, 1 figure +
+
+
+
+
+ + ☆ Ghost-Connect Net: A Generalization-Enhanced Guidance For Sparse Deep + Networks Under Distribution Shifts + + +
+ Sparse deep neural networks (DNNs) excel in real-world applications like +robotics and computer vision, by reducing computational demands that hinder +usability. However, recent studies aim to boost DNN efficiency by trimming +redundant neurons or filters based on task relevance, but neglect their +adaptability to distribution shifts. We aim to enhance these existing +techniques by introducing a companion network, Ghost Connect-Net (GC-Net), to +monitor the connections in the original network with distribution +generalization advantage. GC-Net's weights represent connectivity measurements +between consecutive layers of the original network. After pruning GC-Net, the +pruned locations are mapped back to the original network as pruned connections, +allowing for the combination of magnitude and connectivity-based pruning +methods. Experimental results using common DNN benchmarks, such as CIFAR-10, +Fashion MNIST, and Tiny ImageNet show promising results for hybridizing the +method, and using GC-Net guidance for later layers of a network and direct +pruning on earlier layers. We provide theoretical foundations for GC-Net's +approach to improving generalization under distribution shifts. + +
+
+ comment: 21 pages, 4 figures, 3 subfigures, 42 tables +
+
+
+
+
+ + ☆ Dynamic technology impact analysis: A multi-task learning approach to + patent citation prediction + + +
+ Machine learning (ML) models are valuable tools for analyzing the impact of +technology using patent citation information. However, existing ML-based +methods often struggle to account for the dynamic nature of the technology +impact over time and the interdependencies of these impacts across different +periods. This study proposes a multi-task learning (MTL) approach to enhance +the prediction of technology impact across various time frames by leveraging +knowledge sharing and simultaneously monitoring the evolution of technology +impact. First, we quantify the technology impacts and identify patterns through +citation analysis over distinct time periods. Next, we develop MTL models to +predict citation counts using multiple patent indicators over time. Finally, we +examine the changes in key input indicators and their patterns over different +periods using the SHapley Additive exPlanation method. We also offer guidelines +for validating and interpreting the results by employing statistical methods +and natural language processing techniques. A case study on battery +technologies demonstrates that our approach not only deepens the understanding +of technology impact, but also improves prediction accuracy, yielding valuable +insights for both academia and industry. + +
+
+
+
+
+ + ☆ DeBaTeR: Denoising Bipartite Temporal Graph for Recommendation + + +
+ Due to the difficulty of acquiring large-scale explicit user feedback, +implicit feedback (e.g., clicks or other interactions) is widely applied as an +alternative source of data, where user-item interactions can be modeled as a +bipartite graph. Due to the noisy and biased nature of implicit real-world +user-item interactions, identifying and rectifying noisy interactions are vital +to enhance model performance and robustness. Previous works on purifying +user-item interactions in collaborative filtering mainly focus on mining the +correlation between user/item embeddings and noisy interactions, neglecting the +benefit of temporal patterns in determining noisy interactions. Time +information, while enhancing the model utility, also bears its natural +advantage in helping to determine noisy edges, e.g., if someone usually watches +horror movies at night and talk shows in the morning, a record of watching a +horror movie in the morning is more likely to be a noisy interaction. Armed with +this observation, we introduce a simple yet effective mechanism for generating +time-aware user/item embeddings and propose two strategies for denoising +bipartite temporal graph in recommender systems (DeBaTeR): the first is through +reweighting the adjacency matrix (DeBaTeR-A), where a reliability score is +defined to reweight the edges through both soft assignment and hard assignment; +the second is through reweighting the loss function (DeBaTeR-L), where weights +are generated to reweight user-item samples in the losses. Extensive +experiments have been conducted to demonstrate the efficacy of our methods and +illustrate how time information indeed helps identify noisy edges. + +
+
+
+
+
+ + ☆ SAFES: Sequential Privacy and Fairness Enhancing Data Synthesis for + Responsible AI + + +
+ As data-driven and AI-based decision making gains widespread adoption in most +disciplines, it is crucial that both data privacy and decision fairness are +appropriately addressed. While differential privacy (DP) provides a robust +framework for guaranteeing privacy and several widely accepted methods have +been proposed for improving fairness, the vast majority of existing literature +treats the two concerns independently. For methods that do consider privacy and +fairness simultaneously, they often only apply to a specific machine learning +task, limiting their generalizability. In response, we introduce SAFES, a +Sequential PrivAcy and Fairness Enhancing data Synthesis procedure that +sequentially combines DP data synthesis with a fairness-aware data +transformation. SAFES allows full control over the privacy-fairness-utility +trade-off via tunable privacy and fairness parameters. We illustrate SAFES by +combining AIM, a graphical model-based DP data synthesizer, with a popular +fairness-aware data pre-processing transformation. Empirical evaluations on the +Adult and COMPAS datasets demonstrate that for reasonable privacy loss, +SAFES-generated synthetic data achieve significantly improved fairness metrics +with relatively low utility loss. + +
+
+
+
+
+ + ☆ Hybrid deep additive neural networks + + +
+ Traditional neural networks (multi-layer perceptrons) have become an +important tool in data science due to their success across a wide range of +tasks. However, their performance is sometimes unsatisfactory, and they often +require a large number of parameters, primarily due to their reliance on the +linear combination structure. Meanwhile, additive regression has been a popular +alternative to linear regression in statistics. In this work, we introduce +novel deep neural networks that incorporate the idea of additive regression. +Our neural networks share architectural similarities with Kolmogorov-Arnold +networks but are based on simpler yet flexible activation and basis functions. +Additionally, we introduce several hybrid neural networks that combine this +architecture with that of traditional neural networks. We derive their +universal approximation properties and demonstrate their effectiveness through +simulation studies and a real-data application. The numerical results indicate +that our neural networks generally achieve better performance than traditional +neural networks while using fewer parameters. + +
+
+ comment: 29 pages, 13 figures +
+
+
+
+
+ + ☆ Advancing Diffusion Models: Alias-Free Resampling and Enhanced + Rotational Equivariance + + +
+ Recent advances in image generation, particularly via diffusion models, have +led to impressive improvements in image synthesis quality. Despite this, +diffusion models are still challenged by model-induced artifacts and limited +stability in image fidelity. In this work, we hypothesize that the primary +cause of this issue is the improper resampling operation that introduces +aliasing in the diffusion model and a careful alias-free resampling dictated by +image processing theory can improve the model's performance in image synthesis. +We propose the integration of alias-free resampling layers into the UNet +architecture of diffusion models without adding extra trainable parameters, +thereby maintaining computational efficiency. We then assess whether these +theory-driven modifications enhance image quality and rotational equivariance. +Our experimental results on benchmark datasets, including CIFAR-10, MNIST, and +MNIST-M, reveal consistent gains in image quality, particularly in terms of FID +and KID scores. Furthermore, we propose a modified diffusion process that +enables user-controlled rotation of generated images without requiring +additional training. Our findings highlight the potential of theory-driven +enhancements such as alias-free resampling in generative models to improve +image quality while maintaining model efficiency and pioneer future research +directions to incorporate them into video-generating diffusion models, enabling +deeper exploration of the applications of alias-free resampling in generative +modeling. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Towards Scalable Handwriting Communication via EEG Decoding and Latent + Embedding Integration + + +
+ In recent years, brain-computer interfaces have made advances in decoding +various motor-related tasks, including gesture recognition and movement +classification, utilizing electroencephalogram (EEG) data. These developments +are fundamental in exploring how neural signals can be interpreted to recognize +specific physical actions. This study centers on a written alphabet +classification task, where we aim to decode EEG signals associated with +handwriting. To achieve this, we incorporate hand kinematics to guide the +extraction of the consistent embeddings from high-dimensional neural recordings +using auxiliary variables (CEBRA). These CEBRA embeddings, along with the EEG, +are processed by a parallel convolutional neural network model that extracts +features from both data sources simultaneously. The model classifies nine +different handwritten characters, including symbols such as exclamation marks +and commas, within the alphabet. We evaluate the model using a quantitative +five-fold cross-validation approach and explore the structure of the embedding +space through visualizations. Our approach achieves a classification accuracy +of 91 % for the nine-class task, demonstrating the feasibility of fine-grained +handwriting decoding from EEG. + +
+
+ comment: 4 pages, 2 figures, 1 table, Name of Conference: International + Conference on Brain-Computer Interface +
+
+
+
+
+ + ☆ Rationality based Innate-Values-driven Reinforcement Learning + + +
+ Innate values describe agents' intrinsic motivations, which reflect their +inherent interests and preferences to pursue goals and drive them to develop +diverse skills satisfying their various needs. The essence of reinforcement +learning (RL) is learning from interaction based on reward-driven behaviors, +much like natural agents. It is an excellent model to describe the +innate-values-driven (IV) behaviors of AI agents. Especially developing the +awareness of the AI agent through balancing internal and external utilities +based on its needs in different tasks is a crucial problem for individuals +learning to support AI agents integrating human society with safety and harmony +in the long term. This paper proposes a hierarchical compound intrinsic value +reinforcement learning model -- innate-values-driven reinforcement learning +termed IVRL to describe the complex behaviors of AI agents' interaction. We +formulated the IVRL model and proposed two IVRL models: DQN and A2C. By +comparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the +Role-Playing Game (RPG) reinforcement learning test platform VIZDoom, we +demonstrated that rationally organizing various individual needs can +effectively achieve better performance. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.05572 +
+
+
+
+
+ + ☆ GRAINRec: Graph and Attention Integrated Approach for Real-Time + Session-Based Item Recommendations + + +
+ Recent advancements in session-based recommendation models using deep +learning techniques have demonstrated significant performance improvements. +While they can enhance model sophistication and improve the relevance of +recommendations, they also make it challenging to implement a scalable +real-time solution. To address this challenge, we propose GRAINRec, a Graph +and Attention Integrated session-based recommendation model that generates +recommendations in real-time. Our scope of work is item recommendations in +online retail where a session is defined as an ordered sequence of digital +guest actions, such as page views or adds to cart. The proposed model generates +recommendations by considering the importance of all items in the session +together, letting us predict relevant recommendations dynamically as the +session evolves. We also propose a heuristic approach to implement real-time +inferencing that meets Target platform's service level agreement (SLA). The +proposed architecture lets us predict relevant recommendations dynamically as +the session evolves, rather than relying on pre-computed recommendations for +each item. Evaluation results of the proposed model show an average improvement +of 1.5% across all offline evaluation metrics. A/B tests done over a 2 week +duration showed an increase of 10% in click through rate and 9% increase in +attributable demand. Extensive ablation studies are also done to understand our +model performance for different parameters. + +
+
+ comment: Accepted to the 2024 IEEE International Conference on Big Data (IEEE + BigData 2024) +
+
+
+
+
+ + ☆ Laplace Transform Interpretation of Differential Privacy + + +
+ We introduce a set of useful expressions of Differential Privacy (DP) notions +in terms of the Laplace transform of the privacy loss distribution. Its bare +form expression appears in several related works on analyzing DP, either as an +integral or an expectation. We show that recognizing the expression as a +Laplace transform unlocks a new way to reason about DP properties by exploiting +the duality between time and frequency domains. Leveraging our interpretation, +we connect the $(q, \rho(q))$-R\'enyi DP curve and the $(\epsilon, +\delta(\epsilon))$-DP curve as being the Laplace and inverse-Laplace transforms +of one another. This connection shows that the R\'enyi divergence is +well-defined for complex orders $q = \gamma + i \omega$. Using our Laplace +transform-based analysis, we also prove an adaptive composition theorem for +$(\epsilon, \delta)$-DP guarantees that is exactly tight (i.e., matches even in +constants) for all values of $\epsilon$. Additionally, we resolve an issue +regarding symmetry of $f$-DP on subsampling that prevented equivalence across +all functional DP notions. + +
+
+
+
+
+ + ☆ Complexity-Aware Training of Deep Neural Networks for Optimal Structure + Discovery + + +
+ We propose a novel algorithm for combined unit/filter and layer pruning of +deep neural networks that functions during training and without requiring a +pre-trained network to apply. Our algorithm optimally trades-off learning +accuracy and pruning levels while balancing layer vs. unit/filter pruning and +computational vs. parameter complexity using only three user-defined +parameters, which are easy to interpret and tune. The optimal network structure +is found as the solution of a stochastic optimization problem over the network +weights and the parameters of variational Bernoulli distributions for 0/1 +Random Variables scaling the units and layers of the network. Pruning occurs +when a variational parameter converges to 0 rendering the corresponding +structure permanently inactive, thus saving computations during training and +prediction. A key contribution of our approach is to define a cost function +that combines the objectives of prediction accuracy and network pruning in a +computational/parameter complexity-aware manner and the automatic selection of +the many regularization parameters. We show that the solutions of the +optimization problem to which the algorithm converges are deterministic +networks. We analyze the ODE system that underlies our stochastic optimization +algorithm and establish domains of attraction around zero for the dynamics of +the network parameters. These results provide theoretical support for safely +pruning units/filters and/or layers during training and lead to practical +pruning conditions. We evaluate our method on the CIFAR-10/100 and ImageNet +datasets using ResNet architectures and demonstrate that our method improves +upon layer only or unit only pruning and favorably competes with combined +unit/filter and layer pruning algorithms requiring pre-trained networks with +respect to pruning ratios and test accuracy. + +
+
+ comment: 28 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Neural Graph Simulator for Complex Systems + + +
+ Numerical simulation is a predominant tool for studying the dynamics in +complex systems, but large-scale simulations are often intractable due to +computational limitations. Here, we introduce the Neural Graph Simulator (NGS) +for simulating time-invariant autonomous systems on graphs. Utilizing a graph +neural network, the NGS provides a unified framework to simulate diverse +dynamical systems with varying topologies and sizes without constraints on +evaluation times through its non-uniform time step and autoregressive approach. +The NGS offers significant advantages over numerical solvers by not requiring +prior knowledge of governing equations and effectively handling noisy or +missing data with a robust training scheme. It demonstrates superior +computational efficiency over conventional methods, improving performance by +over $10^5$ times in stiff problems. Furthermore, it is applied to real traffic +data, forecasting traffic flow with state-of-the-art accuracy. The versatility +of the NGS extends beyond the presented cases, offering numerous potential +avenues for enhancement. + +
+
+
+
+
+ + ☆ FxTS-Net: Fixed-Time Stable Learning Framework for Neural ODEs + + +
+ Neural Ordinary Differential Equations (Neural ODEs), as a novel category of +modeling big data methods, cleverly link traditional neural networks and +dynamical systems. However, it is challenging to ensure the dynamics system +reaches a correctly predicted state within a user-defined fixed time. To +address this problem, we propose a new method for training Neural ODEs using +fixed-time stability (FxTS) Lyapunov conditions. Our framework, called +FxTS-Net, is based on the novel FxTS loss (FxTS-Loss) designed on Lyapunov +functions, which aims to encourage convergence to accurate predictions in a +user-defined fixed time. We also provide an innovative approach for +constructing Lyapunov functions to meet various tasks and network architecture +requirements, achieved by leveraging supervised information during training. By +developing a more precise time upper bound estimation for bounded +non-vanishingly perturbed systems, we demonstrate that minimizing FxTS-Loss not +only guarantees FxTS behavior of the dynamics but also input perturbation +robustness. For optimising FxTS-Loss, we also propose a learning algorithm, in +which the simulated perturbation sampling method can capture sample points in +critical regions to approximate FxTS-Loss. Experimentally, we find that +FxTS-Net provides better prediction performance and better robustness under +input perturbation. + +
+
+
+
+
+ + ☆ Efficiently learning and sampling multimodal distributions with + data-based initialization + + +
+ We consider the problem of sampling a multimodal distribution with a Markov +chain given a small number of samples from the stationary measure. Although +mixing can be arbitrarily slow, we show that if the Markov chain has a $k$th +order spectral gap, initialization from a set of $\tilde O(k/\varepsilon^2)$ +samples from the stationary distribution will, with high probability over the +samples, efficiently generate a sample whose conditional law is +$\varepsilon$-close in TV distance to the stationary measure. In particular, +this applies to mixtures of $k$ distributions satisfying a Poincar\'e +inequality, with faster convergence when they satisfy a log-Sobolev inequality. +Our bounds are stable to perturbations to the Markov chain, and in particular +work for Langevin diffusion over $\mathbb R^d$ with score estimation error, as +well as Glauber dynamics combined with approximation error from +pseudolikelihood estimation. This justifies the success of data-based +initialization for score matching methods despite slow mixing for the data +distribution, and improves and generalizes the results of Koehler and Vuong +(2023) to have linear, rather than exponential, dependence on $k$ and apply to +arbitrary semigroups. As a consequence of our results, we show for the first +time that a natural class of low-complexity Ising measures can be efficiently +learned from samples. + +
+
+
+
+
+ + ☆ Reducing Reasoning Costs - The Path of Optimization for Chain of Thought + via Sparse Attention Mechanism NeurIPS 2024 + + +
+ In order to address the surge in large language model inference cost +caused by chain of thought, this research proposes to use a sparse attention +mechanism that only focuses on a few relevant tokens. The researcher +constructed a new attention mechanism and used GiantRabbit trained with custom +GPTs as an experimental tool. The experiment tested and compared the reasoning +time, correctness score and chain of thought length of this model and o1 +Preview in solving the linear algebra test questions of MIT OpenCourseWare. The +results show that GiantRabbit's reasoning time and chain of thought length are +significantly lower than o1 Preview, confirming the feasibility of the sparse +attention mechanism in reducing chain of thought reasoning. Detailed +architectural details and experimental process have been uploaded to GitHub, +the link is: https://github.com/brucewang123456789/GeniusTrail.git. + +
+
+ comment: The main text is 9 pages, totaling 13 pages; 5 figures, 3 tables; + preprints have been submitted to NeurIPS 2024 Workshop MusIML and OpenReview +
+
+
+
+
+ + ☆ NeuralDEM -- Real-time Simulation of Industrial Particulate Flows + + +
+ Advancements in computing power have made it possible to numerically simulate +large-scale fluid-mechanical and/or particulate systems, many of which are +integral to core industrial processes. Among the different numerical methods +available, the discrete element method (DEM) provides one of the most accurate +representations of a wide range of physical systems involving granular and +discontinuous materials. Consequently, DEM has become a widely accepted +approach for tackling engineering problems connected to granular flows and +powder mechanics. Additionally, DEM can be integrated with grid-based +computational fluid dynamics (CFD) methods, enabling the simulation of chemical +processes taking place, e.g., in fluidized beds. However, DEM is +computationally intensive because of the intrinsic multiscale nature of +particulate systems, restricting simulation duration or number of particles. +Towards this end, NeuralDEM presents an end-to-end approach to replace slow +numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM +is capable of picturing long-term transport processes across different regimes +using macroscopic observables without any reference to microscopic model +parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an +underlying continuous field, while simultaneously modeling macroscopic behavior +directly as additional auxiliary fields. Second, NeuralDEM introduces +multi-branch neural operators scalable to real-time modeling of +industrially-sized scenarios - from slow and pseudo-steady to fast and +transient. Such scenarios have previously posed insurmountable challenges for +deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM +fluidized bed reactors of 160k CFD cells and 500k DEM particles for +trajectories of 28s. NeuralDEM will open many new doors to advanced engineering +and much faster process cycles. + +
+
+ comment: Project page: https://nx-ai.github.io/NeuralDEM/ +
+
+
+
+
+ + ☆ Towards efficient compression and communication for prototype-based + decentralized learning + + +
+ In prototype-based federated learning, the exchange of model parameters +between clients and the master server is replaced by transmission of prototypes +or quantized versions of the data samples to the aggregation server. A fully +decentralized deployment of prototype-based learning, without a central +aggregator of prototypes, is more robust to network failures and reacts +faster to changes in the statistical distribution of the data, suggesting +potential advantages and quick adaptation in dynamic learning tasks, e.g., when +the data sources are IoT devices or when data is non-iid. In this paper, we +consider the problem of designing a communication-efficient decentralized +learning system based on prototypes. We address the challenge of prototype +redundancy by leveraging a twofold data compression technique, i.e., sending +update messages only if the prototypes are information-theoretically useful (via +the Jensen-Shannon distance), and using clustering on the prototypes to +compress the update messages used in the gossip protocol. We also use parallel +instead of sequential gossiping, and present an analysis of its +age-of-information (AoI). Our experimental results show that, with these +improvements, the communications load can be substantially reduced without +decreasing the convergence rate of the learning algorithm. + +
+
+ comment: 15 pages, 2 tables, 7 figures, 6 algorithms +
+
+
+
+
+ + ☆ GRAINRec: Graph and Attention Integrated Approach for Real-Time + Session-Based Item Recommendations + + +
+ Recent advancements in session-based recommendation models using deep +learning techniques have demonstrated significant performance improvements. +While they can enhance model sophistication and improve the relevance of +recommendations, they also make it challenging to implement a scalable +real-time solution. To address this challenge, we propose GRAINRec: a Graph +and Attention Integrated session-based recommendation model that generates +recommendations in real-time. Our scope of work is item recommendations in +online retail where a session is defined as an ordered sequence of digital +guest actions, such as page views or adds to cart. The proposed model generates +recommendations by considering the importance of all items in the session +together, letting us predict relevant recommendations dynamically as the +session evolves. We also propose a heuristic approach to implement real-time +inferencing that meets Target platform's service level agreement (SLA). The +proposed architecture lets us predict relevant recommendations dynamically as +the session evolves, rather than relying on pre-computed recommendations for +each item. Evaluation results of the proposed model show an average improvement +of 1.5% across all offline evaluation metrics. A/B tests done over a 2 week +duration showed an increase of 10% in click through rate and 9% increase in +attributable demand. Extensive ablation studies are also done to understand our +model performance for different parameters. + +
+
+ comment: Accepted to the 2024 IEEE International Conference on Big Data (IEEE + BigData 2024) +
+
+
+
+
+ + ☆ Reducing Reasoning Costs -- The Path of Optimization for Chain of + Thought via Sparse Attention Mechanism NeurIPS 2024 + + +
+ In order to address the surge in large language model inference cost +caused by chain of thought, this research proposes to use a sparse attention +mechanism that only focuses on a few relevant tokens. The researcher +constructed a new attention mechanism and used GiantRabbit trained with custom +GPTs as an experimental tool. The experiment tested and compared the reasoning +time, correctness score and chain of thought length of this model and o1 +Preview in solving the linear algebra test questions of MIT OpenCourseWare. The +results show that GiantRabbit's reasoning time and chain of thought length are +significantly lower than o1 Preview, confirming the feasibility of the sparse +attention mechanism in reducing chain of thought reasoning. Detailed +architectural details and experimental process have been uploaded to GitHub, +the link is: https://github.com/brucewang123456789/GeniusTrail.git. + +
+
+ comment: The main text is 9 pages, totaling 13 pages; 5 figures, 3 tables; + preprints have been submitted to NeurIPS 2024 Workshop MusIML and OpenReview +
+
+
+
+
+ + ☆ Self-Supervised Radio Pre-training: Toward Foundational Models for + Spectrogram Learning + + +
+ Foundational deep learning (DL) models, i.e., general models trained on large, +diverse, and unlabelled datasets, typically using self-supervised learning +techniques, have led to significant advancements, especially in natural language +processing. These pretrained models can be fine-tuned for related downstream +tasks, offering faster development and reduced training costs, while often +achieving improved performance. In this work, we introduce Masked Spectrogram +Modeling, a novel self-supervised learning approach for pretraining +foundational DL models on radio signals. Adopting a Convolutional LSTM +architecture for efficient spatio-temporal processing, we pretrain the model +with an unlabelled radio dataset collected from over-the-air measurements. +Subsequently, the pretrained model is fine-tuned for two downstream tasks: +spectrum forecasting and segmentation. Experimental results demonstrate that +our methodology achieves competitive performance in both forecasting accuracy +and segmentation, validating its effectiveness for developing foundational +radio models. + +
+
+
+
+
+ + ☆ Towards a Fairer Non-negative Matrix Factorization + + +
+ Topic modeling, or more broadly, dimensionality reduction, techniques provide +powerful tools for uncovering patterns in large datasets and are widely applied +across various domains. We investigate how Non-negative Matrix Factorization +(NMF) can introduce bias in the representation of data groups, such as those +defined by demographics or protected attributes. We present an approach, called +Fairer-NMF, that seeks to minimize the maximum reconstruction loss for +different groups relative to their size and intrinsic complexity. Further, we +present two algorithms for solving this problem. The first is an alternating +minimization (AM) scheme and the second is a multiplicative updates (MU) scheme +which demonstrates a reduced computational time compared to AM while still +achieving similar performance. Lastly, we present numerical experiments on +synthetic and real datasets to evaluate the overall performance and trade-offs +of Fairer-NMF. + +
+
+
+
+
+ + ☆ Deep Autoencoders for Unsupervised Anomaly Detection in Wildfire + Prediction + + +
+ Wildfires pose a significantly increasing hazard to global ecosystems due to +the climate crisis. Due to its complex nature, there is an urgent need for +innovative approaches to wildfire prediction, such as machine learning. This +research took a unique approach, differentiating from classical supervised +learning, and addressed the gap in unsupervised wildfire prediction using +autoencoders and clustering techniques for anomaly detection. Historical +weather and normalised difference vegetation index datasets of Australia for +2005 - 2021 were utilised. Two main unsupervised approaches were analysed. The +first used a deep autoencoder to obtain latent features, which were then fed +into clustering models, isolation forest, local outlier factor and one-class +SVM for anomaly detection. The second approach used a deep autoencoder to +reconstruct the input data and use reconstruction errors to identify anomalies. +Long Short-Term Memory (LSTM) autoencoders and fully connected (FC) +autoencoders were employed in this part, both in an unsupervised way learning +only from nominal data. The FC autoencoder outperformed its counterparts, +achieving an accuracy of 0.71, an F1-score of 0.74, and an MCC of 0.42. These +findings highlight the practicality of this method, as it effectively predicts +wildfires in the absence of ground truth, utilising an unsupervised learning +technique. + +
+
+ comment: 33 pages, 18 figures, 16 tables. To appear in Earth and Space Science +
+
+
+
+
+ + ☆ FedRewind: Rewinding Continual Model Exchange for Decentralized + Federated Learning + + +
+ In this paper, we present FedRewind, a novel approach to decentralized +federated learning that leverages model exchange among nodes to address the +issue of data distribution shift. Drawing inspiration from continual learning +(CL) principles and cognitive neuroscience theories for memory retention, +FedRewind implements a decentralized routing mechanism where nodes send/receive +models to/from other nodes in the federation to address spatial distribution +challenges inherent in federated learning (FL). During local training, +federation nodes periodically send their models back (i.e., rewind) to the +nodes they received them from for a limited number of iterations. This strategy +reduces the distribution shift between nodes' data, leading to enhanced +learning and generalization performance. We evaluate our method on multiple +benchmarks, demonstrating its superiority over standard decentralized federated +learning methods and those enforcing specific routing schemes within the +federation. Furthermore, the combination of federated and continual learning +concepts enables our method to tackle the more challenging federated continual +learning task, with data shifts over both space and time, surpassing existing +baselines. + +
+
+
+
+
+ + ☆ Real-time Adapting Routing (RAR): Improving Efficiency Through + Continuous Learning in Software Powered by Layered Foundation Models + + +
+ To balance the quality and inference cost of a Foundation Model (FM, such as +large language models (LLMs)) powered software, people often opt to train a +routing model that routes requests to FMs with different sizes and +capabilities. Existing routing models rely on learning the optimal routing +decision from carefully curated data, require complex computations to be +updated, and do not consider the potential evolution of weaker FMs. In this +paper, we propose Real-time Adaptive Routing (RAR), an approach to continuously +adapt FM routing decisions while using guided in-context learning to enhance +the capabilities of weaker FM. The goal is to reduce reliance on stronger, more +expensive FMs. We evaluate our approach on different subsets of the popular +MMLU benchmark. Over time, our approach routes 50.2% fewer requests to +computationally expensive models while maintaining around 90.5% of the general +response quality. In addition, the guides generated from stronger models have +shown intra-domain generalization and led to a better quality of responses +compared to an equivalent approach with a standalone weaker FM. + +
+
+
+
+
+ + ☆ The Good, The Efficient and the Inductive Biases: Exploring Efficiency + in Deep Learning Through the Use of Inductive Biases + + +
+ The emergence of Deep Learning has marked a profound shift in machine +learning, driven by numerous breakthroughs achieved in recent years. However, +as Deep Learning becomes increasingly present in everyday tools and +applications, there is a growing need to address unresolved challenges related +to its efficiency and sustainability. This dissertation delves into the role of +inductive biases -- particularly, continuous modeling and symmetry preservation +-- as strategies to enhance the efficiency of Deep Learning. It is structured +in two main parts. + The first part investigates continuous modeling as a tool to improve the +efficiency of Deep Learning algorithms. Continuous modeling involves the idea +of parameterizing neural operations in a continuous space. The research +presented here demonstrates substantial benefits for the (i) computational +efficiency -- in time and memory, (ii) the parameter efficiency, and (iii) +design efficiency -- the complexity of designing neural architectures for new +datasets and tasks. + The second part focuses on the role of symmetry preservation on Deep Learning +efficiency. Symmetry preservation involves designing neural operations that +align with the inherent symmetries of data. The research presented in this part +highlights significant gains both in data and parameter efficiency through the +use of symmetry preservation. However, it also acknowledges a resulting +trade-off of increased computational costs. + The dissertation concludes with a critical evaluation of these findings, +openly discussing their limitations and proposing strategies to address them, +informed by literature and the author's insights. It ends by identifying +promising future research avenues in the exploration of inductive biases for +efficiency, and their wider implications for Deep Learning. + +
+
+ comment: PhD Dissertation +
+
+
+
+
+ + ☆ Automatic Classification of General Movements in Newborns ML4H + + +
+ General movements (GMs) are spontaneous, coordinated body movements in +infants that offer valuable insights into the developing nervous system. +Assessed through the Prechtl GM Assessment (GMA), GMs are reliable predictors +for neurodevelopmental disorders. However, GMA requires specifically trained +clinicians, who are limited in number. To scale up newborn screening, there is +a need for an algorithm that can automatically classify GMs from infant video +recordings. This data poses challenges, including variability in recording +length, device type, and setting, with each video coarsely annotated for +overall movement quality. In this work, we introduce a tool for extracting +features from these recordings and explore various machine learning techniques +for automated GM classification. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages +
+
+
+
+
+ + ☆ WelQrate: Defining the Gold Standard in Small Molecule Drug Discovery + Benchmarking + + +
+ While deep learning has revolutionized computer-aided drug discovery, the AI +community has predominantly focused on model innovation and placed less +emphasis on establishing best benchmarking practices. We posit that without a +sound model evaluation framework, the AI community's efforts cannot reach their +full potential, thereby slowing the progress and transfer of innovation into +real-world drug discovery. Thus, in this paper, we seek to establish a new gold +standard for small molecule drug discovery benchmarking, WelQrate. +Specifically, our contributions are threefold: WelQrate Dataset Collection - we +introduce a meticulously curated collection of 9 datasets spanning 5 +therapeutic target classes. Our hierarchical curation pipelines, designed by +drug discovery experts, go beyond the primary high-throughput screen by +leveraging additional confirmatory and counter screens along with rigorous +domain-driven preprocessing, such as Pan-Assay Interference Compounds (PAINS) +filtering, to ensure the high-quality data in the datasets; WelQrate Evaluation +Framework - we propose a standardized model evaluation framework considering +high-quality datasets, featurization, 3D conformation generation, evaluation +metrics, and data splits, which provides a reliable benchmarking for drug +discovery experts conducting real-world virtual screening; Benchmarking - we +evaluate model performance through various research questions using the +WelQrate dataset collection, exploring the effects of different models, dataset +quality, featurization methods, and data splitting strategies on the results. +In summary, we recommend adopting our proposed WelQrate as the gold standard in +small molecule drug discovery benchmarking. The WelQrate dataset collection, +along with the curation codes, and experimental scripts are all publicly +available at WelQrate.org. + +
+
+ comment: * denotes equal contribution +
+
+
+
+
+ + ☆ Learning Parameter Sharing with Tensor Decompositions and Sparsity + + +
+ Large neural networks achieve remarkable performance, but their size hinders +deployment on resource-constrained devices. While various compression +techniques exist, parameter sharing remains relatively unexplored. This paper +introduces Fine-grained Parameter Sharing (FiPS), a novel algorithm that +leverages the relationship between parameter sharing, tensor decomposition, and +sparsity to efficiently compress large vision transformer models. FiPS employs +a shared base and sparse factors to represent shared neurons across multi-layer +perceptron (MLP) modules. Shared parameterization is initialized via Singular +Value Decomposition (SVD) and optimized by minimizing block-wise reconstruction +error. Experiments demonstrate that FiPS compresses DeiT-B and Swin-L MLPs to +25-40% of their original parameter count while maintaining accuracy within 1 +percentage point of the original models. + +
+
+
+
+
+ + ☆ Can Features for Phishing URL Detection Be Trusted Across Diverse + Datasets? A Case Study with Explainable AI + + +
+ Phishing has been a prevalent cyber threat that manipulates users into +revealing sensitive private information through deceptive tactics, designed to +masquerade as trustworthy entities. Over the years, proactive detection of +phishing URLs (or websites) has been established as a widely-accepted defense +approach. In literature, we often find supervised Machine Learning (ML) models +with highly competitive performance for detecting phishing websites based on +the extracted features from both phishing and benign (i.e., legitimate) +websites. However, it is still unclear if these features or indicators are +dependent on a particular dataset or they are generalized for overall phishing +detection. In this paper, we delve deeper into this issue by analyzing two +publicly available phishing URL datasets, where each dataset has its own set of +unique and overlapping features related to URL string and website contents. We +want to investigate if overlapping features are similar in nature across +datasets and how the model performs when trained on one dataset and tested +on the other. We conduct practical experiments and leverage explainable AI +(XAI) methods such as SHAP plots to provide insights into different features' +contributions in case of phishing detection to answer our primary question, +``Can features for phishing URL detection be trusted across diverse datasets?''. +Our case study experiment results show that features for phishing URL detection +can often be dataset-dependent and thus may not be trusted across different +datasets even though they share the same set of feature behaviors. + +
+
+ comment: 8 pages, 10 figures, The 11th International Conference on Networking, + Systems and Security, December 19-21, 2024 +
+
+
+
+
+ + ☆ Edge Caching Optimization with PPO and Transfer Learning for Dynamic + Environments + + +
+ This paper addresses the challenge of edge caching in dynamic environments, +where rising traffic loads strain backhaul links and core networks. We propose +a Proximal Policy Optimization (PPO)-based caching strategy that fully +incorporates key file attributes such as size, lifetime, importance, and +popularity, while also considering random file request arrivals, reflecting +more realistic edge caching scenarios. In dynamic environments, changes such as +shifts in content popularity and variations in request rates frequently occur, +making previously learned policies less effective as they were optimized for +earlier conditions. Without adaptation, caching efficiency and response times +can degrade. While learning a new policy from scratch in a new environment is +an option, it is highly inefficient and computationally expensive. Thus, +adapting an existing policy to these changes is critical. To address this, we +develop a mechanism that detects changes in content popularity and request +rates, ensuring timely adjustments to the caching strategy. We also propose a +transfer learning-based PPO algorithm that accelerates convergence in new +environments by leveraging prior knowledge. Simulation results demonstrate the +significant effectiveness of our approach, outperforming a recent Deep +Reinforcement Learning (DRL)-based method. + +
+
+
+
+
+ + ♻ ☆ Enhancing Maritime Trajectory Forecasting via H3 Index and Causal + Language Modelling (CLM) + + +
+ The prediction of ship trajectories is a growing field of study in artificial +intelligence. Traditional methods rely on the use of LSTM, GRU networks, and +even Transformer architectures for the prediction of spatio-temporal series. +This study proposes a viable alternative for predicting these trajectories +using only GNSS positions. It considers this spatio-temporal problem as a +natural language processing problem. The latitude/longitude coordinates of AIS +messages are transformed into cell identifiers using the H3 index. Thanks to +the pseudo-octal representation, it becomes easier for language models to learn +the spatial hierarchy of the H3 index. The method is compared with a classical +Kalman filter, widely used in the maritime domain, and introduces the Fréchet +distance as the main evaluation metric. We show that it is possible to predict +ship trajectories quite precisely up to 8 hours ahead with 30 minutes of +context, using solely GNSS positions, without relying on any additional +information such as speed, course, or external conditions - unlike many +traditional methods. We demonstrate that this alternative works well enough to +predict trajectories worldwide. + +
+
+ comment: 28 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ AutoDefense: Multi-Agent LLM Defense against Jailbreak Attacks + + +
+ Despite extensive pre-training in moral alignment to prevent generating +harmful information, large language models (LLMs) remain vulnerable to +jailbreak attacks. In this paper, we propose AutoDefense, a multi-agent defense +framework that filters harmful responses from LLMs. With the response-filtering +mechanism, our framework is robust against different jailbreak attack prompts, +and can be used to defend different victim models. AutoDefense assigns +different roles to LLM agents and employs them to complete the defense task +collaboratively. The division in tasks enhances the overall +instruction-following of LLMs and enables the integration of other defense +components as tools. With AutoDefense, small open-source LMs can serve as +agents and defend larger models against jailbreak attacks. Our experiments show +that AutoDefense can effectively defend against different jailbreak attacks, +while maintaining the performance at normal user request. For example, we +reduce the attack success rate on GPT-3.5 from 55.74% to 7.95% using +LLaMA-2-13b with a 3-agent system. Our code and data are publicly available at +https://github.com/XHMY/AutoDefense. + +
+
+
+
+
+ + ♻ ☆ Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal + Pushing + + +
+ Recently, quadrupedal locomotion has achieved significant success, but their +manipulation capabilities, particularly in handling large objects, remain +limited, restricting their usefulness in demanding real-world applications such +as search and rescue, construction, industrial automation, and room +organization. This paper tackles the task of obstacle-aware, long-horizon +pushing by multiple quadrupedal robots. We propose a hierarchical multi-agent +reinforcement learning framework with three levels of control. The high-level +controller integrates an RRT planner and a centralized adaptive policy to +generate subgoals, while the mid-level controller uses a decentralized +goal-conditioned policy to guide the robots toward these sub-goals. A +pre-trained low-level locomotion policy executes the movement commands. We +evaluate our method against several baselines in simulation, demonstrating +significant improvements over baseline approaches, with 36.0% higher success +rates and 24.5% reduction in completion time than the best baseline. Our +framework successfully enables long-horizon, obstacle-aware manipulation tasks +like Push-Cuboid and Push-T on Go1 robots in the real world. + +
+
+
+
+
+ + ♻ ☆ Stable Consistency Tuning: Understanding and Improving Consistency + Models + + +
+ Diffusion models achieve superior generation quality but suffer from slow +generation speed due to the iterative nature of denoising. In contrast, +consistency models, a new generative family, achieve competitive performance +with significantly faster sampling. These models are trained either through +consistency distillation, which leverages pretrained diffusion models, or +consistency training/tuning directly from raw data. In this work, we propose a +novel framework for understanding consistency models by modeling the denoising +process of the diffusion model as a Markov Decision Process (MDP) and framing +consistency model training as the value estimation through Temporal +Difference~(TD) Learning. More importantly, this framework allows us to analyze +the limitations of current consistency training/tuning strategies. Built upon +Easy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT), +which incorporates variance-reduced learning using the score identity. SCT +leads to significant performance improvements on benchmarks such as CIFAR-10 +and ImageNet-64. On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID +1.55, a new SoTA for consistency models. + +
+
+ comment: Code is available at + https://github.com/G-U-N/Stable-Consistency-Tuning +
+
+
+
+
+ + ♻ ☆ From Imitation to Refinement -- Residual RL for Precise Assembly + + +
+ Advances in behavior cloning (BC), like action-chunking and diffusion, have +enabled impressive capabilities. Still, imitation alone remains insufficient +for learning reliable policies for tasks requiring precise aligning and +inserting of objects, like assembly. Our key insight is that chunked BC +policies effectively function as trajectory planners, enabling long-horizon +tasks. Conversely, as they execute action chunks open-loop, they lack the +fine-grained reactivity necessary for reliable execution. Further, we find that +the performance of BC policies saturates despite increasing data. Reinforcement +learning (RL) is a natural way to overcome BC's limitations, but it is not +straightforward to apply directly to action-chunked models like diffusion +policies. We present a simple yet effective method, ResiP (Residual for Precise +Manipulation), that sidesteps these challenges by augmenting a frozen, chunked +BC model with a fully closed-loop residual policy trained with RL. The residual +policy is trained via on-policy RL, addressing distribution shifts and +introducing reactive control without altering the BC trajectory planner. +Evaluation on high-precision manipulation tasks demonstrates strong performance +of ResiP over BC methods and direct RL fine-tuning. Videos, code, and data are +available at https://residual-assembly.github.io. + +
+
+ comment: Project website: https://residual-assembly.github.io +
+
+
+
+
+ + ♻ ☆ Equivariant Symmetry Breaking Sets + + +
+ Equivariant neural networks (ENNs) have been shown to be extremely effective +in applications involving underlying symmetries. By construction ENNs cannot +produce lower symmetry outputs given a higher symmetry input. However, symmetry +breaking occurs in many physical systems and we may obtain a less symmetric +stable state from an initial highly symmetric one. Hence, it is imperative that +we understand how to systematically break symmetry in ENNs. In this work, we +propose a novel symmetry breaking framework that is fully equivariant and is +the first which fully addresses spontaneous symmetry breaking. We emphasize +that our approach is general and applicable to equivariance under any group. To +achieve this, we introduce the idea of symmetry breaking sets (SBS). Rather +than redesign existing networks, we design sets of symmetry breaking objects +which we feed into our network based on the symmetry of our inputs and outputs. +We show there is a natural way to define equivariance on these sets, which +gives an additional constraint. Minimizing the size of these sets equates to +data efficiency. We prove that minimizing these sets translates to a well +studied group theory problem, and tabulate solutions to this problem for the +point groups. Finally, we provide some examples of symmetry breaking to +demonstrate how our approach works in practice. The code for these examples is +available at \url{https://github.com/atomicarchitects/equivariant-SBS}. + +
+
+ comment: 50 pages, 19 figures Published in Transactions on Machine Learning + Research, October 2024 +
+
+
+
+
+ + ♻ ☆ Causal Discovery and Classification Using Lempel-Ziv Complexity + + +
+ Inferring causal relationships in the decision-making processes of machine +learning algorithms is a crucial step toward achieving explainable Artificial +Intelligence (AI). In this research, we introduce a novel causality measure and +a distance metric derived from Lempel-Ziv (LZ) complexity. We explore how the +proposed causality measure can be used in decision trees by enabling splits +based on features that most strongly \textit{cause} the outcome. We further +evaluate the effectiveness of the causality-based decision tree and the +distance-based decision tree in comparison to a traditional decision tree using +Gini impurity. While the proposed methods demonstrate comparable classification +performance overall, the causality-based decision tree significantly +outperforms both the distance-based decision tree and the Gini-based decision +tree on datasets generated from causal models. This result indicates that the +proposed approach can capture insights beyond those of classical decision +trees, especially in causally structured data. Based on the features used in +the LZ causal measure based decision tree, we introduce a causal strength for +each feature in the dataset so as to infer the predominant causal variables +for the occurrence of the outcome. + +
+
+ comment: 17 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Diffusion Sampling Correction via Approximately 10 Parameters + + +
+ Diffusion Probabilistic Models (DPMs) have demonstrated exceptional +performance in generative tasks, but this comes at the expense of sampling +efficiency. To enhance sampling speed without sacrificing quality, various +distillation-based accelerated sampling algorithms have been recently proposed. +However, they typically require significant additional training costs and model +parameter storage, which limit their practical application. In this work, we +propose PCA-based Adaptive Search (PAS), which optimizes existing solvers for +DPMs with minimal learnable parameters and training costs. Specifically, we +first employ PCA to obtain a few orthogonal unit basis vectors to span the +high-dimensional sampling space, which enables us to learn just a set of +coordinates to correct the sampling direction; furthermore, based on the +observation that the cumulative truncation error exhibits an ``S''-shape, we +design an adaptive search strategy that further enhances the sampling +efficiency and reduces the number of stored parameters to approximately 10. +Extensive experiments demonstrate that PAS can significantly enhance existing +fast solvers in a plug-and-play manner with negligible costs. For instance, on +CIFAR10, PAS requires only 12 parameters and less than 1 minute of training on +a single NVIDIA A100 GPU to optimize the DDIM from 15.69 FID (NFE=10) to 4.37. + +
+
+
+
+
+ + ♻ ☆ Generative Forests NeurIPS'24 + + +
+ We focus on generative AI for a type of data that still represents one of the +most prevalent forms of data: tabular data. Our paper introduces two key +contributions: a new powerful class of forest-based models fit for such tasks +and a simple training algorithm with strong convergence guarantees in a +boosting model that parallels that of the original weak / strong supervised +learning setting. This algorithm can be implemented by a few tweaks to the most +popular induction scheme for decision tree induction (i.e. supervised learning) +with two classes. Experiments on the quality of generated data display +substantial improvements compared to the state of the art. The losses our +algorithm minimizes and the structure of our models make them practical for +related tasks that require fast estimation of a density given a generative +model and an observation (even partially specified): such tasks include missing +data imputation and density estimation. Additional experiments on these tasks +reveal that our models can be notably good contenders to diverse state of the +art methods, relying on models as diverse as (or mixing elements of) trees, +neural nets, kernels or graphical models. + +
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ Closed-Loop Long-Horizon Robotic Planning via Equilibrium Sequence + Modeling + + +
+ In the endeavor to make autonomous robots take actions, task planning is a +major challenge that requires translating high-level task descriptions into +long-horizon action sequences. Despite recent advances in language model +agents, they remain prone to planning errors and limited in their ability to +plan ahead. To address these limitations in robotic planning, we advocate a +self-refining scheme that iteratively refines a draft plan until an equilibrium +is reached. Remarkably, this process can be optimized end-to-end from an +analytical perspective without the need to curate additional verifiers or +reward models, allowing us to train self-refining planners in a simple +supervised learning fashion. Meanwhile, a nested equilibrium sequence modeling +procedure is devised for efficient closed-loop planning that incorporates +useful feedback from the environment (or an internal world model). Our method +is evaluated on the VirtualHome-Env benchmark, showing advanced performance +with better scaling for inference computation. Code is available at +https://github.com/Singularity0104/equilibrium-planner. + +
+
+
+
+
+ + ♻ ☆ DiffPAD: Denoising Diffusion-based Adversarial Patch Decontamination WACV + + +
+ In the ever-evolving adversarial machine learning landscape, developing +effective defenses against patch attacks has become a critical challenge, +necessitating reliable solutions to safeguard real-world AI systems. Although +diffusion models have shown remarkable capacity in image synthesis and have +been recently utilized to counter $\ell_p$-norm bounded attacks, their +potential in mitigating localized patch attacks remains largely underexplored. +In this work, we propose DiffPAD, a novel framework that harnesses the power of +diffusion models for adversarial patch decontamination. DiffPAD first performs +super-resolution restoration on downsampled input images, then adopts +binarization, dynamic thresholding scheme and sliding window for effective +localization of adversarial patches. Such a design is inspired by the +theoretically derived correlation between patch size and diffusion restoration +error that is generalized across diverse patch attack scenarios. Finally, +DiffPAD applies inpainting techniques to the original input images with the +estimated patch region being masked. By integrating closed-form solutions for +super-resolution restoration and image inpainting into the conditional reverse +sampling process of a pre-trained diffusion model, DiffPAD obviates the need +for text guidance or fine-tuning. Through comprehensive experiments, we +demonstrate that DiffPAD not only achieves state-of-the-art adversarial +robustness against patch attacks but also excels in recovering naturalistic +images without patch remnants. The source code is available at +https://github.com/JasonFu1998/DiffPAD. + +
+
+ comment: Accepted to 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ♻ ☆ Terracorder: Sense Long and Prosper + + +
+ In-situ sensing devices need to be deployed in remote environments for long +periods of time; minimizing their power consumption is vital for maximising +both their operational lifetime and coverage. We introduce Terracorder -- a +versatile multi-sensor device -- and showcase its exceptionally low power +consumption using an on-device reinforcement learning scheduler. We prototype a +unique device setup for biodiversity monitoring and compare its battery life +using our scheduler against a number of fixed schedules; the scheduler captures +more than 80% of events at less than 50% of the number of activations of the +best-performing fixed schedule. We then explore how a collaborative scheduler +can maximise the useful operation of a network of devices, improving overall +network power consumption and robustness. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ A Similarity-Based Oversampling Method for Multi-label Imbalanced Text + Data + + +
+ In real-world applications, as data availability increases, obtaining labeled +data for machine learning (ML) projects remains challenging due to the high +costs and intensive efforts required for data annotation. Many ML projects, +particularly those focused on multi-label classification, also grapple with +data imbalance issues, where certain classes may lack sufficient data to train +effective classifiers. This study introduces and examines a novel oversampling +method for multi-label text classification, designed to address performance +challenges associated with data imbalance. The proposed method identifies +potential new samples from unlabeled data by leveraging similarity measures +between instances. By iteratively searching the unlabeled dataset, the method +locates instances similar to those in underrepresented classes and evaluates +their contribution to classifier performance enhancement. Instances that +demonstrate performance improvement are then added to the labeled dataset. +Experimental results indicate that the proposed approach effectively enhances +classifier performance post-oversampling. + +
+
+
+
+
+ + ♻ ☆ IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of + brain MR images + + +
+ In MRI studies, the aggregation of imaging data from multiple acquisition +sites enhances sample size but may introduce site-related variabilities that +hinder consistency in subsequent analyses. Deep learning methods for image +translation have emerged as a solution for harmonizing MR images across sites. +In this study, we introduce IGUANe (Image Generation with Unified Adversarial +Networks), an original 3D model that leverages the strengths of domain +translation and straightforward application of style transfer methods for +multicenter brain MR image harmonization. IGUANe extends CycleGAN by +integrating an arbitrary number of domains for training through a many-to-one +architecture. The framework based on domain pairs enables the implementation of +sampling strategies that prevent confusion between site-related and biological +variabilities. During inference, the model can be applied to any image, even +from an unknown acquisition site, making it a universal generator for +harmonization. Trained on a dataset comprising T1-weighted images from 11 +different scanners, IGUANe was evaluated on data from unseen sites. The +assessments included the transformation of MR images with traveling subjects, +the preservation of pairwise distances between MR images within domains, the +evolution of volumetric patterns related to age and Alzheimer's disease (AD), +and the performance in age regression and patient classification tasks. +Comparisons with other harmonization and normalization methods suggest that +IGUANe better preserves individual information in MR images and is more +suitable for maintaining and reinforcing variabilities related to age and AD. +Future studies may further assess IGUANe in other multicenter contexts, either +using the same model or retraining it for applications to different image +modalities. IGUANe is available at +https://github.com/RocaVincent/iguane_harmonization.git. + +
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Volume-Preserving Transformers for Learning Time Series Data with + Structure + + +
+ Two of the many trends in neural network research of the past few years have +been (i) the learning of dynamical systems, especially with recurrent neural +networks such as long short-term memory networks (LSTMs) and (ii) the +introduction of transformer neural networks for natural language processing +(NLP) tasks. + While some work has been performed on the intersection of these two trends, +those efforts were largely limited to using the vanilla transformer directly +without adjusting its architecture for the setting of a physical system. + In this work we develop a transformer-inspired neural network and use it to +learn a dynamical system. We (for the first time) change the activation +function of the attention layer to imbue the transformer with +structure-preserving properties to improve long-term stability. This is shown +to be of great advantage when applying the neural network to learning the +trajectory of a rigid body. + +
+
+ comment: Will be published as part of "Cemracs Proceedings 2023" (status: + accepted) +
+
+
+
+
+ + ♻ ☆ Machine learning-enabled velocity model building with uncertainty + quantification + + +
+ Accurately characterizing migration velocity models is crucial for a wide +range of geophysical applications, from hydrocarbon exploration to monitoring +of CO2 sequestration projects. Traditional velocity model building methods such +as Full-Waveform Inversion (FWI) are powerful but often struggle with the +inherent complexities of the inverse problem, including noise, limited +bandwidth, receiver aperture and computational constraints. To address these +challenges, we propose a scalable methodology that integrates generative +modeling, in the form of Diffusion networks, with physics-informed summary +statistics, making it suitable for complicated imaging problems including field +datasets. By defining these summary statistics in terms of subsurface-offset +image volumes for poor initial velocity models, our approach allows for +computationally efficient generation of Bayesian posterior samples for +migration velocity models that offer a useful assessment of uncertainty. To +validate our approach, we introduce a battery of tests that measure the quality +of the inferred velocity models, as well as the quality of the inferred +uncertainties. With modern synthetic datasets, we reconfirm gains from using +subsurface-image gathers as the conditioning observable. For complex velocity +model building involving salt, we propose a new iterative workflow that refines +amortized posterior approximations with salt flooding and demonstrate how the +uncertainty in the velocity model can be propagated to the final product +reverse time migrated images. Finally, we present a proof of concept on field +datasets to show that our method can scale to industry-sized problems. + +
+
+
+
+
+ + ♻ ☆ Doob's Lagrangian: A Sample-Efficient Variational Approach to Transition + Path Sampling NeurIPS 2024 + + +
+ Rare event sampling in dynamical systems is a fundamental problem arising in +the natural sciences, which poses significant computational challenges due to +an exponentially large space of trajectories. For settings where the dynamical +system of interest follows a Brownian motion with known drift, the question of +conditioning the process to reach a given endpoint or desired rare event is +definitively answered by Doob's h-transform. However, the naive estimation of +this transform is infeasible, as it requires simulating sufficiently many +forward trajectories to estimate rare event probabilities. In this work, we +propose a variational formulation of Doob's h-transform as an optimization +problem over trajectories between a given initial point and the desired ending +point. To solve this optimization, we propose a simulation-free training +objective with a model parameterization that imposes the desired boundary +conditions by design. Our approach significantly reduces the search space over +trajectories and avoids expensive trajectory simulation and inefficient +importance sampling estimators which are required in existing methods. We +demonstrate the ability of our method to find feasible transition paths on +real-world molecular simulation and protein folding tasks. + +
+
+ comment: Accepted as Spotlight at Conference on Neural Information Processing + Systems (NeurIPS 2024); Alanine dipeptide results updated after fixing + unphysical parameterization +
+
+
+
+
+ + ♻ ☆ Tract-RLFormer: A Tract-Specific RL policy based Decoder-only + Transformer Network ICPR + + +
+ Fiber tractography is a cornerstone of neuroimaging, enabling the detailed +mapping of the brain's white matter pathways through diffusion MRI. This is +crucial for understanding brain connectivity and function, making it a valuable +tool in neurological applications. Despite its importance, tractography faces +challenges due to its complexity and susceptibility to false positives, +misrepresenting vital pathways. To address these issues, recent strategies have +shifted towards deep learning, utilizing supervised learning, which depends on +precise ground truth, or reinforcement learning, which operates without it. In +this work, we propose Tract-RLFormer, a network utilizing both supervised and +reinforcement learning, in a two-stage policy refinement process that markedly +improves the accuracy and generalizability across various data-sets. By +employing a tract-specific approach, our network directly delineates the tracts +of interest, bypassing the traditional segmentation process. Through rigorous +validation on datasets such as TractoInferno, HCP, and ISMRM-2015, our +methodology demonstrates a leap forward in tractography, showcasing its ability +to accurately map the brain's white matter tracts. + +
+
+ comment: Accepted at 27th International Conference on Pattern Recognition + (ICPR), 2024 +
+
+
+
+
+ + ♻ ☆ SeMaScore : a new evaluation metric for automatic speech recognition + tasks + + +
+ In this study, we present SeMaScore, generated using a segment-wise mapping +and scoring algorithm that serves as an evaluation metric for automatic speech +recognition tasks. SeMaScore leverages both the error rate and a more robust +similarity score. We show that our algorithm's score generation improves upon +the state-of-the-art BERTScore. Our experimental results show that SeMaScore +corresponds well with expert human assessments, signal-to-noise ratio levels, +and other natural language metrics. We outperform BERTScore by 41x in metric +computation speed. Overall, we demonstrate that SeMaScore serves as a more +dependable evaluation metric, particularly in real-world situations involving +atypical speech patterns. + +
+
+ comment: Accepted at Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ InfiBench: Evaluating the Question-Answering Capabilities of Code Large + Language Models NeurIPS 2024 + + +
+ Large Language Models for code (code LLMs) have witnessed tremendous progress +in recent years. With the rapid development of code LLMs, many popular +evaluation benchmarks, such as HumanEval, DS-1000, and MBPP, have emerged to +measure the performance of code LLMs with a particular focus on code generation +tasks. However, they are insufficient to cover the full range of expected +capabilities of code LLMs, which span beyond code generation to answering +diverse coding-related questions. To fill this gap, we propose InfiBench, the +first large-scale freeform question-answering (QA) benchmark for code to our +knowledge, comprising 234 carefully selected high-quality Stack Overflow +questions that span across 15 programming languages. InfiBench uses four types +of model-free automatic metrics to evaluate response correctness where domain +experts carefully concretize the criterion for each question. We conduct a +systematic evaluation for over 100 latest code LLMs on InfiBench, leading to a +series of novel and insightful findings. Our detailed analyses showcase +potential directions for further advancement of code LLMs. InfiBench is fully +open source at https://infi-coder.github.io/infibench and continuously +expanding to foster more scientific and systematic practices for code LLM +evaluation. + +
+
+ comment: 31 pages. Appear at NeurIPS 2024 Datasets and Benchmarks track. + Project website: https://infi-coder.github.io/infibench +
+
+
+
+
+ + ♻ ☆ Joint Estimation of Conditional Mean and Covariance for Unbalanced + Panels + + +
+ We propose a nonparametric, kernel-based joint estimator for conditional mean +and covariance matrices in large unbalanced panels. Our estimator, with proven +consistency and finite-sample guarantees, is applied to a comprehensive panel +of monthly US stock excess returns from 1962 to 2021, conditioned on +macroeconomic and firm-specific covariates. The estimator captures time-varying +cross-sectional dependencies effectively, demonstrating robust statistical +performance. In asset pricing, it generates conditional mean-variance efficient +portfolios with out-of-sample Sharpe ratios that substantially exceed those of +equal-weighted benchmarks. + +
+
+
+
+
+ + ♻ ☆ Embedding Hardware Approximations in Discrete Genetic-based Training for + Printed MLPs + + +
+ Printed Electronics (PE) stands out as a promising technology for widespread +computing due to its distinct attributes, such as low costs and flexible +manufacturing. Unlike traditional silicon-based technologies, PE enables +stretchable, conformal, and non-toxic hardware. However, PE is constrained by +larger feature sizes, making it challenging to implement complex circuits such +as machine learning (ML) classifiers. Approximate computing has been proven to +reduce the hardware cost of ML circuits such as Multilayer Perceptrons (MLPs). +In this paper, we maximize the benefits of approximate computing by integrating +hardware approximation into the MLP training process. Due to the discrete +nature of hardware approximation, we propose and implement a genetic-based, +approximate, hardware-aware training approach specifically designed for printed +MLPs. For a 5% accuracy loss, our MLPs achieve over 5x area and power reduction +compared to the baseline while outperforming state-of-the-art approximate and +stochastic printed MLPs. + +
+
+ comment: Accepted for publication at the 27th Design, Automation and Test in + Europe Conference (DATE'24), Mar 25-27 2024, Valencia, Spain +
+
+
+
+
+ + ♻ ☆ Bespoke Approximation of Multiplication-Accumulation and Activation + Targeting Printed Multilayer Perceptrons + + +
+ Printed Electronics (PE) feature distinct and remarkable characteristics that +make them a prominent technology for achieving true ubiquitous computing. This +is particularly relevant in application domains that require conformal and +ultra-low cost solutions, which have experienced limited penetration of +computing until now. Unlike silicon-based technologies, PE offer unparalleled +features such as non-recurring engineering costs, ultra-low manufacturing cost, +and on-demand fabrication of conformal, flexible, non-toxic, and stretchable +hardware. However, PE face certain limitations due to their large feature +sizes, that impede the realization of complex circuits, such as machine +learning classifiers. In this work, we address these limitations by leveraging +the principles of Approximate Computing and Bespoke (fully-customized) design. +We propose an automated framework for designing ultra-low power Multilayer +Perceptron (MLP) classifiers which employs, for the first time, a holistic +approach to approximate all functions of the MLP's neurons: multiplication, +accumulation, and activation. Through comprehensive evaluation across various +MLPs of varying size, our framework demonstrates the ability to enable +battery-powered operation of even the most intricate MLP architecture examined, +significantly surpassing the current state of the art. + +
+
+ comment: Accepted for publication at the 42nd IEEE/ACM International + Conference on Computer Aided Design (ICCAD) 2023, San Francisco, USA
+
+
+
+
+ + ♻ ☆ ResBit: Residual Bit Vector for Categorical Values + + +
+ One-hot vectors, a common method for representing discrete/categorical data +in machine learning, are widely used because of their simplicity and +intuitiveness. However, one-hot vectors suffer from a linear increase in +dimensionality, posing computational and memory challenges, especially when +dealing with datasets containing numerous categories. In this paper, we focus +on tabular data generation, and reveal that multinomial diffusion faces the mode +collapse phenomenon when the cardinality is high. Moreover, due to the +limitations of one-hot vectors, the training phase takes longer in such a +situation. To address these issues, we propose Residual Bit Vectors (ResBit), a +technique for densely representing categorical data. ResBit is an extension of +analog bits and overcomes limitations of analog bits when applied to tabular +data generation. Our experiments demonstrate that ResBit not only accelerates +training but also maintains performance when compared with the situations +before applying ResBit. Furthermore, our results indicate that many existing +methods struggle with high-cardinality data, underscoring the need for +lower-dimensional representations, such as ResBit and latent vectors. + +
+
+ comment: 25 pages, 29 tables, and 10 figures +
+
+
+
+
+ + ♻ ☆ How to Boost Any Loss Function NeurIPS'24 + + +
+ Boosting is a highly successful ML-born optimization setting in which one is +required to computationally efficiently learn arbitrarily good models based on +the access to a weak learner oracle, providing classifiers performing at least +slightly differently from random guessing. A key difference with gradient-based +optimization is that boosting's original model does not require access to +first order information about a loss, yet the decades long history of boosting +has quickly evolved it into a first order optimization setting -- sometimes +even wrongfully defining it as such. Owing to recent progress extending +gradient-based optimization to use only a loss' zeroth ($0^{th}$) order +information to learn, this begs the question: what loss functions can be +efficiently optimized with boosting and what is the information really needed +for boosting to meet the original boosting blueprint's requirements? + We provide a constructive formal answer essentially showing that any loss +function can be optimized with boosting and thus boosting can achieve a feat +not yet known to be possible in the classical $0^{th}$ order setting, since +loss functions are not required to be convex, nor differentiable or +Lipschitz -- and in fact not required to be continuous either. Some tools we +use are rooted in quantum calculus, the mathematical field -- not to be +confounded with quantum computation -- that studies calculus without passing to +the limit, and thus without using first order information. + +
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ Toward Green and Human-Like Artificial Intelligence: A Complete Survey + on Contemporary Few-Shot Learning Approaches + + +
+ Despite deep learning's widespread success, its data-hungry and +computationally expensive nature makes it impractical for many data-constrained +real-world applications. Few-Shot Learning (FSL) aims to address these +limitations by enabling rapid adaptation to novel learning tasks, seeing +significant growth in recent years. This survey provides a comprehensive +overview of the field's latest advancements. Initially, FSL is formally +defined, and its relationship with different learning fields is presented. A +novel taxonomy is introduced, extending previously proposed ones, and +real-world applications in classic and novel fields are described. Finally, +recent trends shaping the field, outstanding challenges, and promising future +research directions are discussed. + +
+
+ comment: 35 pages, 9 figures. Submitted to ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Hierarchical mixtures of Unigram models for short text clustering: the + role of Beta-Liouville priors + + +
+ This paper presents a variant of the Multinomial mixture model tailored for +the unsupervised classification of short text data. Traditionally, the +Multinomial probability vector in this hierarchical model is assigned a +Dirichlet prior distribution. Here, however, we explore an alternative +prior--the Beta-Liouville distribution--which offers a more flexible +correlation structure than the Dirichlet. We examine the theoretical properties +of the Beta-Liouville distribution, focusing on its conjugacy with the +Multinomial likelihood. This property enables the derivation of update +equations for a CAVI (Coordinate Ascent Variational Inference) variational +algorithm, facilitating the approximate posterior estimation of model +parameters. Additionally, we propose a stochastic variant of the CAVI algorithm +that enhances scalability. The paper concludes with data examples that +demonstrate effective strategies for setting the Beta-Liouville +hyperparameters. + +
+
+ comment: 32 pages, 4 figures. Submitted +
+
+
+
+
+ + ♻ ☆ An improved tabular data generator with VAE-GMM integration + + +
+ The rising use of machine learning in various fields requires robust methods +to create synthetic tabular data. Data should preserve key characteristics +while addressing data scarcity challenges. Current approaches based on +Generative Adversarial Networks, such as the state-of-the-art CTGAN model, +struggle with the complex structures inherent in tabular data. These data often +contain both continuous and discrete features with non-Gaussian distributions. +Therefore, we propose a novel Variational Autoencoder (VAE)-based model that +addresses these limitations. Inspired by the TVAE model, our approach +incorporates a Bayesian Gaussian Mixture model (BGM) within the VAE +architecture. This avoids the limitations imposed by assuming a strictly +Gaussian latent space, allowing for a more accurate representation of the +underlying data distribution during data generation. Furthermore, our model +offers enhanced flexibility by allowing the use of various differentiable +distributions for individual features, making it possible to handle both +continuous and discrete data types. We thoroughly validate our model on three +real-world datasets with mixed data types, including two medically relevant +ones, based on their resemblance and utility. This evaluation demonstrates +significant outperformance against CTGAN and TVAE, establishing its potential +as a valuable tool for generating synthetic tabular data in various domains, +particularly in healthcare. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ SageAttention: Accurate 8-Bit Attention for Plug-and-play Inference + Acceleration + + +
+ The transformer architecture predominates across various models. As the heart +of the transformer, attention has a computational complexity of O(N^2), +compared to O(N) for linear transformations. When handling large sequence +lengths, attention becomes the primary time-consuming component. Although +quantization has proven to be an effective method for accelerating model +inference, existing quantization methods primarily focus on optimizing the +linear layer. In response, we first analyze the feasibility of quantization in +attention in detail. Following that, we propose SageAttention, a highly +efficient and accurate quantization method for attention. The OPS (operations +per second) of our approach outperforms FlashAttention2 and xformers by about +2.1 times and 2.7 times, respectively. SageAttention also achieves superior +accuracy performance over FlashAttention3. Comprehensive experiments confirm +that our approach incurs almost no end-to-end metrics loss across diverse +models, including those for large language processing, image generation, and +video generation. The codes are available at +https://github.com/thu-ml/SageAttention. + +
+
+
+
+
+ + ♻ ☆ Comparative Evaluation of Clustered Federated Learning Methods + + +
+ Over recent years, Federated Learning (FL) has proven to be one of the most +promising methods of distributed learning which preserves data privacy. As the +method evolved and was confronted with various real-world scenarios, new +challenges have emerged. One such challenge is the presence of highly +heterogeneous (often referred to as non-IID) data distributions among participants +of the FL protocol. A popular solution to this hurdle is Clustered Federated +Learning (CFL), which aims to partition clients into groups where the +distributions are homogeneous. In the literature, state-of-the-art CFL +algorithms are often tested using a few cases of data heterogeneities, without +systematically justifying the choices. Further, the taxonomy used for +differentiating the different heterogeneity scenarios is not always +straightforward. In this paper, we explore the performance of two +state-of-the-art CFL algorithms with respect to a proposed taxonomy of data +heterogeneities in federated learning (FL). We work with three image +classification datasets and analyze the resulting clusters against the +heterogeneity classes using extrinsic clustering metrics. Our objective is to +provide a clearer understanding of the relationship between CFL performances +and data heterogeneity scenarios. + +
+
+
+
+
+ + ♻ ☆ More Expressive Attention with Negative Weights + + +
+ We propose a novel attention mechanism, named Cog Attention, that enables +attention weights to be negative for enhanced expressiveness, which stems from +two key factors: (1) Cog Attention can shift the token deletion and copying +function from a static OV matrix to dynamic QK inner products, with the OV +matrix now focusing more on refinement or modification. The attention head can +simultaneously delete, copy, or retain tokens by assigning them negative, +positive, or minimal attention weights, respectively. As a result, a single +attention head becomes more flexible and expressive. (2) Cog Attention improves +the model's robustness against representational collapse, which can occur when +earlier tokens are over-squashed into later positions, leading to homogeneous +representations. Negative weights reduce effective information paths from +earlier to later tokens, helping to mitigate this issue. We develop +Transformer-like models which use Cog Attention as attention modules, including +decoder-only models for language modeling and U-ViT diffusion models for image +generation. Experiments show that models using Cog Attention exhibit superior +performance compared to those employing traditional softmax attention modules. +Our approach suggests a promising research direction for rethinking and +breaking the entrenched constraints of traditional softmax attention, such as +the requirement for non-negative weights. + +
+
+
+
+
+ + ♻ ☆ Dual-Segment Clustering Strategy for Hierarchical Federated Learning in + Heterogeneous Wireless Environments + + +
+ Non-independent and identically distributed (Non-IID) data adversely affects +federated learning (FL) while heterogeneity in communication quality can +undermine the reliability of model parameter transmission, potentially +degrading wireless FL convergence. This paper proposes a novel dual-segment +clustering (DSC) strategy that jointly addresses communication and data +heterogeneity in FL. This is achieved by defining a new signal-to-noise ratio +(SNR) matrix and information quantity matrix to capture the communication and +data heterogeneity, respectively. The celebrated affinity propagation algorithm +is leveraged to iteratively refine the clustering of clients based on the newly +defined matrices, effectively enhancing model aggregation in heterogeneous +environments. The convergence analysis and experimental results show that the +DSC strategy can improve the convergence rate of wireless FL and demonstrate +superior accuracy in heterogeneous environments compared to classical +clustering methods. + +
+
+
+
+
+ + ♻ ☆ xPerT: Extended Persistence Transformer + + +
+ A persistence diagram provides a compact summary of persistent homology, +which captures the topological features of a space at different scales. +However, due to its nature as a set, incorporating it as a feature into a +machine learning framework is challenging. Several methods have been proposed +to use persistence diagrams as input for machine learning models, but they +often require complex preprocessing steps and extensive hyperparameter tuning. +In this paper, we propose a novel transformer architecture called the +\textit{Extended Persistence Transformer (xPerT)}, which is more scalable +than Persformer, an existing transformer for persistence +diagrams. xPerT reduces GPU memory usage by over 90\% and improves accuracy on +multiple datasets. Additionally, xPerT does not require complex preprocessing +steps or extensive hyperparameter tuning, making it easy to use in practice. +Our code is available at https://github.com/sehunfromdaegu/xpert. + +
+
+
+
+
+ + ♻ ☆ Can Small Language Models Learn, Unlearn, and Retain Noise Patterns? + + +
+ Small Language Models (SLMs) are generally considered more compact versions +of large language models (LLMs). This study investigates the ability of SLMs +with parameters between 1 and 3 billion to learn, retain, and subsequently +eliminate different types of noise present in the data. Four pre-trained SLMs +were utilized for this: Olmo 1B, Qwen1.5 1.8B, Gemma 2B, and Phi2 2.7B. The +models were instruction-tuned on noise-free data and tested using in-context +examples to determine if they could learn noise through examples. Subsequently, +noise patterns were introduced in instruction tuning to evaluate the noise +learning, unlearning, and retention capabilities of the models. Olmo, the +smallest model, was highly sensitive to noise, quickly adapting to noisy +patterns. Phi2 resisted learning character-level and transliteration noise, +likely due to its carefully curated, structured, and high-quality pretraining +data. Gemma excelled with transliteration noise, likely benefiting from its +multilingual pretraining. The findings can be used to develop robust training +strategies for SLMs. + +
+
+
+
+
+ + ♻ ☆ The Roles of Generative Artificial Intelligence in Internet of Electric + Vehicles + + +
+ With the advancements of generative artificial intelligence (GenAI) models, +their capabilities are expanding significantly beyond content generation and +the models are increasingly being used across diverse applications. +Particularly, GenAI shows great potential in addressing challenges in the +electric vehicle (EV) ecosystem ranging from charging management to +cyber-attack prevention. In this paper, we specifically consider Internet of +electric vehicles (IoEV) and we categorize GenAI for IoEV into four different +layers namely, EV's battery layer, individual EV layer, smart grid layer, and +security layer. We introduce various GenAI techniques used in each layer of +IoEV applications. Subsequently, public datasets available for training the +GenAI models are summarized. Finally, we provide recommendations for future +directions. This survey not only categorizes the applications of GenAI in IoEV +across different layers but also serves as a valuable resource for researchers +and practitioners by highlighting the design and implementation challenges +within each layer. Furthermore, it provides a roadmap for future research +directions, enabling the development of more robust and efficient IoEV systems +through the integration of advanced GenAI techniques. + +
+
+ comment: 25 Pages +
+
+
+
+
+ + ♻ ☆ Distributionally Robust Safe Sample Elimination under Covariate Shift + + +
+ We consider a machine learning setup where one training dataset is used to +train multiple models across slightly different data distributions. This occurs +when customized models are needed for various deployment environments. To +reduce storage and training costs, we propose the DRSSS method, which combines +distributionally robust (DR) optimization and safe sample screening (SSS). The +key benefit of this method is that models trained on the reduced dataset will +perform the same as those trained on the full dataset for all possible +different environments. In this paper, we focus on covariate shift as a type of +data distribution change and demonstrate the effectiveness of our method +through experiments. + +
+
+
+
+
+ + ♻ ☆ No-Regret Learning of Nash Equilibrium for Black-Box Games via Gaussian + Processes UAI 2024 + + +
+ This paper investigates the challenge of learning in black-box games, where +the underlying utility function is unknown to any of the agents. While there is +an extensive body of literature on the theoretical analysis of algorithms for +computing the Nash equilibrium with complete information about the game, +studies on Nash equilibrium in black-box games are less common. In this paper, +we focus on learning the Nash equilibrium when the only available information +about an agent's payoff comes in the form of empirical queries. We provide a +no-regret learning algorithm that utilizes Gaussian processes to identify the +equilibrium in such games. Our approach not only ensures a theoretical +convergence rate but also demonstrates effectiveness across a varied +collection of games through experimental validation. + +
+
+ comment: 40th Conference on Uncertainty in Artificial Intelligence (UAI 2024) +
+
+
+
+
+ + ♻ ☆ Online Budgeted Matching with General Bids NeurIPS 2024 + + +
+ Online Budgeted Matching (OBM) is a classic problem with important +applications in online advertising, online service matching, revenue +management, and beyond. Traditional online algorithms typically assume a small +bid setting, where the maximum bid-to-budget ratio (\kappa) is infinitesimally +small. While recent algorithms have tried to address scenarios with non-small +or general bids, they often rely on the Fractional Last Matching (FLM) +assumption, which allows for accepting partial bids when the remaining budget +is insufficient. This assumption, however, does not hold for many applications +with indivisible bids. In this paper, we remove the FLM assumption and tackle +the open problem of OBM with general bids. We first establish an upper bound of +1-\kappa on the competitive ratio for any deterministic online algorithm. We +then propose a novel meta algorithm, called MetaAd, which reduces to different +algorithms with first known provable competitive ratios parameterized by the +maximum bid-to-budget ratio \kappa \in [0, 1]. As a by-product, we extend +MetaAd to the FLM setting and get provable competitive algorithms. Finally, we +apply our competitive analysis to the design of learning-augmented algorithms. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Vision Mamba: Efficient Visual Representation Learning with + Bidirectional State Space Model ICML 2024 + + +
+ Recently the state space models (SSMs) with efficient hardware-aware designs, +i.e., the Mamba deep learning model, have shown great potential for long +sequence modeling. Meanwhile building efficient and generic vision backbones +purely upon SSMs is an appealing direction. However, representing visual data +is challenging for SSMs due to the position-sensitivity of visual data and the +requirement of global context for visual understanding. In this paper, we show +that the reliance on self-attention for visual representation learning is not +necessary and propose a new generic vision backbone with bidirectional Mamba +blocks (Vim), which marks the image sequences with position embeddings and +compresses the visual representation with bidirectional state space models. On +ImageNet classification, COCO object detection, and ADE20k semantic +segmentation tasks, Vim achieves higher performance compared to +well-established vision transformers like DeiT, while also demonstrating +significantly improved computation & memory efficiency. For example, Vim is +2.8$\times$ faster than DeiT and saves 86.8% GPU memory when performing batch +inference to extract features on images with a resolution of 1248$\times$1248. +The results demonstrate that Vim is capable of overcoming the computation & +memory constraints on performing Transformer-style understanding for +high-resolution images and it has great potential to be the next-generation +backbone for vision foundation models. Code is available at +https://github.com/hustvl/Vim. + +
+
+ comment: Vision Mamba (Vim) is accepted by ICML 2024. Code is available at + https://github.com/hustvl/Vim +
+
+
+
+
+ + ♻ ☆ Autobidders with Budget and ROI Constraints: Efficiency, Regret, and + Pacing Dynamics COLT 2024 + + +
+ We study a game between autobidding algorithms that compete in an online +advertising platform. Each autobidder is tasked with maximizing its +advertiser's total value over multiple rounds of a repeated auction, subject to +budget and return-on-investment constraints. We propose a gradient-based +learning algorithm that is guaranteed to satisfy all constraints and achieves +vanishing individual regret. Our algorithm uses only bandit feedback and can be +used with the first- or second-price auction, as well as with any +"intermediate" auction format. Our main result is that when these autobidders +play against each other, the resulting expected liquid welfare over all rounds +is at least half of the expected optimal liquid welfare achieved by any +allocation. This holds whether or not the bidding dynamics converges to an +equilibrium. + +
+
+ comment: Appeared at COLT 2024. Numerical experiments added since Jun'24 + version +
+
+
+
+
+ + ♻ ☆ Recurrent Neural Goodness-of-Fit Test for Time Series + + +
+ Time series data are crucial across diverse domains such as finance and +healthcare, where accurate forecasting and decision-making rely on advanced +modeling techniques. While generative models have shown great promise in +capturing the intricate dynamics inherent in time series, evaluating their +performance remains a major challenge. Traditional evaluation metrics fall +short due to the temporal dependencies and potential high dimensionality of the +features. In this paper, we propose the REcurrent NeurAL (RENAL) +Goodness-of-Fit test, a novel and statistically rigorous framework for +evaluating generative time series models. By leveraging recurrent neural +networks, we transform the time series into conditionally independent data +pairs, enabling the application of a chi-square-based goodness-of-fit test to +the temporal dependencies within the data. This approach offers a robust, +theoretically grounded solution for assessing the quality of generative models, +particularly in settings with limited time sequences. We demonstrate the +efficacy of our method across both synthetic and real-world datasets, +outperforming existing methods in terms of reliability and accuracy. Our method +fills a critical gap in the evaluation of time series generative models, +offering a tool that is both practical and adaptable to high-stakes +applications. + +
+
+ comment: 27 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Review of Large Language Models and Autonomous Agents in Chemistry + + +
+ Large language models (LLMs) have emerged as powerful tools in chemistry, +significantly impacting molecule design, property prediction, and synthesis +optimization. This review highlights LLM capabilities in these domains and +their potential to accelerate scientific discovery through automation. We also +review LLM-based autonomous agents: LLMs with a broader set of tools to +interact with their surrounding environment. These agents perform diverse tasks +such as paper scraping, interfacing with automated laboratories, and synthesis +planning. As agents are an emerging topic, we extend the scope of our review of +agents beyond chemistry and discuss across any scientific domains. This review +covers the recent history, current capabilities, and design of LLMs and +autonomous agents, addressing specific challenges, opportunities, and future +directions in chemistry. Key challenges include data quality and integration, +model interpretability, and the need for standard benchmarks, while future +directions point towards more sophisticated multi-modal agents and enhanced +collaboration between agents and experimental methods. Due to the quick pace of +this field, a repository has been built to keep track of the latest studies: +https://github.com/ur-whitelab/LLMs-in-science. + +
+
+
+
+
+ + ♻ ☆ Boosted Neural Decoders: Achieving Extreme Reliability of LDPC Codes for + 6G Networks + + +
+ Ensuring extremely high reliability in channel coding is essential for 6G +networks. The next-generation of ultra-reliable and low-latency communications +(xURLLC) scenario within 6G networks requires frame error rate (FER) below +$10^{-9}$. However, low-density parity-check (LDPC) codes, the standard in 5G +new radio (NR), encounter a challenge known as the error floor phenomenon, +which hinders achieving such low rates. To tackle this problem, we introduce +an innovative solution: boosted neural min-sum (NMS) decoder. This decoder +operates identically to conventional NMS decoders, but is trained by novel +training methods including: i) boosting learning with uncorrected vectors, ii) +block-wise training schedule to address the vanishing gradient issue, iii) +dynamic weight sharing to minimize the number of trainable weights, iv) +transfer learning to reduce the required sample count, and v) data augmentation +to expedite the sampling process. Leveraging these training strategies, the +boosted NMS decoder achieves state-of-the-art performance in reducing the +error floor as well as superior waterfall performance. Remarkably, we fulfill +the 6G xURLLC requirement for 5G LDPC codes without a severe error floor. +Additionally, the boosted NMS decoder, once its weights are trained, can +perform decoding without additional modules, making it highly practical for +immediate application. The source code is available at +https://github.com/ghy1228/LDPC_Error_Floor. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs + Gaussian-Based Methods + + +
+ Exploring the capabilities of Neural Radiance Fields (NeRF) and +Gaussian-based methods in the context of 3D scene reconstruction, this study +contrasts these modern approaches with traditional Simultaneous Localization +and Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we +assess performance based on tracking accuracy, mapping fidelity, and view +synthesis. Findings reveal that NeRF excels in view synthesis, offering unique +capabilities in generating new perspectives from existing data, albeit at +slower processing speeds. Conversely, Gaussian-based methods provide rapid +processing and significant expressiveness but lack comprehensive scene +completion. Enhanced by global optimization and loop closure techniques, newer +methods like NICE-SLAM and SplaTAM not only surpass older frameworks such as +ORB-SLAM2 in terms of robustness but also demonstrate superior performance in +dynamic and complex environments. This comparative analysis bridges theoretical +research with practical implications, shedding light on future developments in +robust 3D scene reconstruction across various real-world applications. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ Interpolating neural network: A lightweight yet precise architecture for + data training, equation solving, and parameter calibration + + +
+ Artificial intelligence (AI) has revolutionized software development, +shifting from task-specific codes (Software 1.0) to neural network-based +approaches (Software 2.0). However, applying this transition in engineering +software presents challenges, including low surrogate model accuracy, the curse +of dimensionality in inverse design, and rising complexity in physical +simulations. We introduce an interpolating neural network (INN), grounded in +interpolation theory and tensor decomposition, to realize Engineering Software +2.0 by advancing data training, partial differential equation solving, and +parameter calibration. INN offers orders of magnitude fewer trainable/solvable +parameters for comparable model accuracy than traditional multi-layer +perceptron (MLP) or physics-informed neural networks (PINN). Demonstrated in +metal additive manufacturing, INN rapidly constructs an accurate surrogate +model of Laser Powder Bed Fusion (L-PBF) heat transfer simulation, achieving +sub-10-micrometer resolution for a 10 mm path in under 15 minutes on a single +GPU. This makes a transformative step forward across all domains essential to +engineering software. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Mitigating Partial Observability in Sequential Decision Processes via + the Lambda Discrepancy + + +
+ Reinforcement learning algorithms typically rely on the assumption that the +environment dynamics and value function can be expressed in terms of a +Markovian state representation. However, when state information is only +partially observable, how can an agent learn such a state representation, and +how can it detect when it has found one? We introduce a metric that can +accomplish both objectives, without requiring access to -- or knowledge of -- +an underlying, unobservable state space. Our metric, the $\lambda$-discrepancy, +is the difference between two distinct temporal difference (TD) value +estimates, each computed using TD($\lambda$) with a different value of +$\lambda$. Since TD($\lambda{=}0$) makes an implicit Markov assumption and +TD($\lambda{=}1$) does not, a discrepancy between these estimates is a +potential indicator of a non-Markovian state representation. Indeed, we prove +that the $\lambda$-discrepancy is exactly zero for all Markov decision +processes and almost always non-zero for a broad class of partially observable +environments. We also demonstrate empirically that, once detected, minimizing +the $\lambda$-discrepancy can help with learning a memory function to mitigate +the corresponding partial observability. We then train a reinforcement learning +agent that simultaneously constructs two recurrent value networks with +different $\lambda$ parameters and minimizes the difference between them as an +auxiliary loss. The approach scales to challenging partially observable +domains, where the resulting agent frequently performs significantly better +(and never performs worse) than a baseline recurrent agent with only a single +value network. + +
+
+ comment: GitHub URL: https://github.com/brownirl/lambda_discrepancy; Project + page: https://lambda-discrepancy.github.io/ +
+
+
+
+
+ + ♻ ☆ Impactful Bit-Flip Search on Full-precision Models + + +
+ Neural networks have shown remarkable performance in various tasks, yet they +remain susceptible to subtle changes in their input or model parameters. One +particularly impactful vulnerability arises through the Bit-Flip Attack (BFA), +where flipping a small number of critical bits in a model's parameters can +severely degrade its performance. A common technique for inducing bit flips in +DRAM is the Row-Hammer attack, which exploits frequent uncached memory accesses +to alter data. Identifying susceptible bits can be achieved through exhaustive +search or progressive layer-by-layer analysis, especially in quantized +networks. In this work, we introduce Impactful Bit-Flip Search (IBS), a novel +method for efficiently pinpointing and flipping critical bits in full-precision +networks. Additionally, we propose a Weight-Stealth technique that +strategically modifies the model's parameters in a way that maintains the float +values within the original distribution, thereby bypassing simple range checks +often used in tamper detection. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at +https://github.com/chikap421/mseg_vcuq + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+
+
+
+ + Artificial Intelligence 126 + +
+
+
+ + ☆ On the Surprising Effectiveness of Attention Transfer for Vision + Transformers NeurIPS 2024 + + +
+ Conventional wisdom suggests that pre-training Vision Transformers (ViT) +improves downstream performance by learning useful representations. Is this +actually true? We investigate this question and find that the features and +representations learned during pre-training are not essential. Surprisingly, +using only the attention patterns from pre-training (i.e., guiding how +information flows between tokens) is sufficient for models to learn high +quality features from scratch and achieve comparable downstream performance. We +show this by introducing a simple method called attention transfer, where only +the attention patterns from a pre-trained teacher ViT are transferred to a +student, either by copying or distilling the attention maps. Since attention +transfer lets the student learn its own features, ensembling it with a +fine-tuned teacher also further improves accuracy on ImageNet. We +systematically study various aspects of our findings on the sufficiency of +attention maps, including distribution shift settings where they underperform +fine-tuning. We hope our exploration provides a better understanding of what +pre-training accomplishes and leads to a useful alternative to the standard +practice of fine-tuning. + +
+
+ comment: NeurIPS 2024. Code: + https://github.com/alexlioralexli/attention-transfer +
+
+
+
+
+ + ☆ LLM Hallucination Reasoning with Zero-shot Knowledge Test + + +
+ LLM hallucination, where LLMs occasionally generate unfaithful text, poses +significant challenges for their practical applications. Most existing +detection methods rely on external knowledge, LLM fine-tuning, or +hallucination-labeled datasets, and they do not distinguish between different +types of hallucinations, which are crucial for improving detection performance. +We introduce a new task, Hallucination Reasoning, which classifies +LLM-generated text into one of three categories: aligned, misaligned, and +fabricated. Our novel zero-shot method assesses whether LLM has enough +knowledge about a given prompt and text. Our experiments conducted on new +datasets demonstrate the effectiveness of our method in hallucination reasoning +and underscore its importance for enhancing detection performance. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Towards a Classification of Open-Source ML Models and Datasets for + Software Engineering + + +
+ Background: Open-Source Pre-Trained Models (PTMs) and datasets provide +extensive resources for various Machine Learning (ML) tasks, yet these +resources lack a classification tailored to Software Engineering (SE) needs. +Aims: We apply an SE-oriented classification to PTMs and datasets on a popular +open-source ML repository, Hugging Face (HF), and analyze the evolution of PTMs +over time. Method: We conducted a repository mining study. We started with a +systematically gathered database of PTMs and datasets from the HF API. Our +selection was refined by analyzing model and dataset cards and metadata, such +as tags, and confirming SE relevance using Gemini 1.5 Pro. All analyses are +replicable, with a publicly accessible replication package. Results: The most +common SE task among PTMs and datasets is code generation, with a primary focus +on software development and limited attention to software management. Popular +PTMs and datasets mainly target software development. Among ML tasks, text +generation is the most common in SE PTMs and datasets. There has been a marked +increase in PTMs for SE since 2023 Q2. Conclusions: This study underscores the +need for broader task coverage to enhance the integration of ML within SE +practices. + +
+
+ comment: 5 pages, 8 figures +
+
+
+
+
+ + ☆ NeuralDEM - Real-time Simulation of Industrial Particulate Flows + + +
+ Advancements in computing power have made it possible to numerically simulate +large-scale fluid-mechanical and/or particulate systems, many of which are +integral to core industrial processes. Among the different numerical methods +available, the discrete element method (DEM) provides one of the most accurate +representations of a wide range of physical systems involving granular and +discontinuous materials. Consequently, DEM has become a widely accepted +approach for tackling engineering problems connected to granular flows and +powder mechanics. Additionally, DEM can be integrated with grid-based +computational fluid dynamics (CFD) methods, enabling the simulation of chemical +processes taking place, e.g., in fluidized beds. However, DEM is +computationally intensive because of the intrinsic multiscale nature of +particulate systems, restricting simulation duration or number of particles. +Towards this end, NeuralDEM presents an end-to-end approach to replace slow +numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM +is capable of picturing long-term transport processes across different regimes +using macroscopic observables without any reference to microscopic model +parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an +underlying continuous field, while simultaneously modeling macroscopic behavior +directly as additional auxiliary fields. Second, NeuralDEM introduces +multi-branch neural operators scalable to real-time modeling of +industrially-sized scenarios - from slow and pseudo-steady to fast and +transient. Such scenarios have previously posed insurmountable challenges for +deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM +fluidized bed reactors of 160k CFD cells and 500k DEM particles for +trajectories of 28s. NeuralDEM will open many new doors to advanced engineering +and much faster process cycles. + +
+
+ comment: Project page: https://nx-ai.github.io/NeuralDEM/ +
+
+
+
+
+ + ☆ Med-Bot: An AI-Powered Assistant to Provide Accurate and Reliable + Medical Information + + +
+ This paper introduces Med-Bot, an AI-powered chatbot designed to provide +users with accurate and reliable medical information. Utilizing advanced +libraries and frameworks such as PyTorch, Chromadb, Langchain and Autogptq, +Med-Bot is built to handle the complexities of natural language understanding +in a healthcare context. The integration of Llama-assisted data processing and +AutoGPT-Q provides enhanced performance in processing and responding to queries +based on PDFs of medical literature, ensuring that users receive precise and +trustworthy information. This research details the methodologies employed in +developing Med-Bot and evaluates its effectiveness in disseminating healthcare +information. + +
+
+ comment: 3 figures, 5 pages Keywords-LLM, AI-powered healthcare, Medical + chatbot, Context-based interaction, Llama-assisted data processing, + AutoGPT-Q, PyTorch, TensorFlow, Reliable medical information, Machine + learning in healthcare, Conversational AI +
+
+
+
+
+ + ☆ On the Limits of Language Generation: Trade-Offs Between Hallucination + and Mode Collapse + + +
+ Specifying all desirable properties of a language model is challenging, but +certain requirements seem essential. Given samples from an unknown language, +the trained model should produce valid strings not seen in training and be +expressive enough to capture the language's full richness. Otherwise, +outputting invalid strings constitutes "hallucination," and failing to capture +the full range leads to "mode collapse." We ask if a language model can meet +both requirements. + We investigate this within a statistical language generation setting building +on Gold and Angluin. Here, the model receives random samples from a +distribution over an unknown language K, which belongs to a possibly infinite +collection of languages. The goal is to generate unseen strings from K. We say +the model generates from K with consistency and breadth if, as training size +increases, its output converges to all unseen strings in K. + Kleinberg and Mullainathan [KM24] asked if consistency and breadth in +language generation are possible. We answer this negatively: for a large class +of language models, including next-token prediction models, this is impossible +for most collections of candidate languages. This contrasts with [KM24]'s +result, showing consistent generation without breadth is possible for any +countable collection of languages. Our finding highlights that generation with +breadth fundamentally differs from generation without breadth. + As a byproduct, we establish near-tight bounds on the number of samples +needed for generation with or without breadth. + Finally, our results offer hope: consistent generation with breadth is +achievable for any countable collection of languages when negative examples +(strings outside K) are available alongside positive ones. This suggests that +post-training feedback, which encodes negative examples, can be crucial in +reducing hallucinations while limiting mode collapse. + +
+
+ comment: Abstract shortened to fit arXiv limit +
+
+
+
+
+ + ☆ One-Shot Manipulation Strategy Learning by Making Contact Analogies CoRL + + +
+ We present a novel approach, MAGIC (manipulation analogies for generalizable +intelligent contacts), for one-shot learning of manipulation strategies with +fast and extensive generalization to novel objects. By leveraging a reference +action trajectory, MAGIC effectively identifies similar contact points and +sequences of actions on novel objects to replicate a demonstrated strategy, +such as using different hooks to retrieve distant objects of different shapes +and sizes. Our method is based on a two-stage contact-point matching process +that combines global shape matching using pretrained neural features with local +curvature analysis to ensure precise and physically plausible contact points. +We experiment with three tasks including scooping, hanging, and hooking +objects. MAGIC demonstrates superior performance over existing methods, +achieving significant improvements in runtime speed and generalization to +different object categories. Website: https://magic-2024.github.io/ . + +
+
+ comment: CoRL LEAP Workshop, 2024 +
+
+
+
+
+ + ☆ Vision-based Manipulation of Transparent Plastic Bags in Industrial + Setups + + +
+ This paper addresses the challenges of vision-based manipulation for +autonomous cutting and unpacking of transparent plastic bags in industrial +setups, aligning with the Industry 4.0 paradigm. Industry 4.0, driven by data, +connectivity, analytics, and robotics, promises enhanced accessibility and +sustainability throughout the value chain. The integration of autonomous +systems, including collaborative robots (cobots), into industrial processes is +pivotal for efficiency and safety. The proposed solution employs advanced +Machine Learning algorithms, particularly Convolutional Neural Networks (CNNs), +to identify transparent plastic bags under varying lighting and background +conditions. Tracking algorithms and depth sensing technologies are utilized for +3D spatial awareness during pick and placement. The system addresses challenges +in grasping and manipulation, considering optimal points, compliance control +with vacuum gripping technology, and real-time automation for safe interaction +in dynamic environments. The system's successful testing and validation in the +lab with the FRANKA robot arm showcases its potential for widespread +industrial applications, while demonstrating effectiveness in automating the +unpacking and cutting of transparent plastic bags for an 8-stack bulk-loader +based on specific requirements and rigorous testing. + +
+
+
+
+
+ + ☆ PTR: Precision-Driven Tool Recommendation for Large Language Models + + +
+ By augmenting Large Language Models (LLMs) with external tools, their +capacity to solve complex problems has been significantly enhanced. However, +despite ongoing advancements in the parsing capabilities of LLMs, incorporating +all available tools simultaneously in the prompt remains impractical due to the +vast number of external tools. Consequently, it is essential to provide LLMs +with a precise set of tools tailored to the specific task, considering both +quantity and quality. Current tool retrieval methods primarily focus on +refining the ranking list of tools and directly packaging a fixed number of +top-ranked tools as the tool set. However, these approaches often fail to equip +LLMs with the optimal set of tools prior to execution, since the optimal number +of tools for different tasks could be different, resulting in inefficiencies +such as redundant or unsuitable tools, which impede immediate access to the +most relevant tools. This paper addresses the challenge of recommending precise +toolsets for LLMs. We introduce the problem of tool recommendation, define its +scope, and propose a novel Precision-driven Tool Recommendation (PTR) approach. +PTR captures an initial, concise set of tools by leveraging historical tool +bundle usage and dynamically adjusts the tool set by performing tool matching, +culminating in a multi-view-based tool addition. Additionally, we present a new +dataset, RecTools, and a metric, TRACC, designed to evaluate the effectiveness +of tool recommendation for LLMs. We further validate our design choices through +comprehensive experiments, demonstrating promising accuracy across two open +benchmarks and our RecTools dataset. + +
+
+
+
+
+ + ☆ Local-Global Attention: An Adaptive Mechanism for Multi-Scale Feature + Integration + + +
+ In recent years, attention mechanisms have significantly enhanced the +performance of object detection by focusing on key feature information. +However, prevalent methods still encounter difficulties in effectively +balancing local and global features. This imbalance hampers their ability to +capture both fine-grained details and broader contextual information -- two +critical elements for achieving accurate object detection. To address these +challenges, we propose a novel attention mechanism, termed Local-Global +Attention, which is designed to better integrate both local and global +contextual features. Specifically, our approach combines multi-scale +convolutions with positional encoding, enabling the model to focus on local +details while concurrently considering the broader global context. +Additionally, we introduce learnable parameters, which allow the model to +dynamically adjust the relative importance of local and global attention, +depending on the specific requirements of the task, thereby optimizing feature +representations across multiple scales. We have thoroughly evaluated the +Local-Global Attention mechanism on several widely used object detection and +classification datasets. Our experimental results demonstrate that this +approach significantly enhances the detection of objects at various scales, +with particularly strong performance on multi-class and small object detection +tasks. In comparison to existing attention mechanisms, Local-Global Attention +consistently outperforms them across several key metrics, all while maintaining +computational efficiency. + +
+
+
+
+
+ + ☆ Accelerating Knowledge Graph and Ontology Engineering with Large + Language Models + + +
+ Large Language Models bear the promise of significant acceleration of key +Knowledge Graph and Ontology Engineering tasks, including ontology modeling, +extension, modification, population, alignment, as well as entity +disambiguation. We lay out LLM-based Knowledge Graph and Ontology Engineering +as a new and coming area of research, and argue that modular approaches to +ontologies will be of central importance. + +
+
+
+
+
+ + ☆ LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models + + +
+ This work explores expanding the capabilities of large language models (LLMs) +pretrained on text to generate 3D meshes within a unified model. This offers +key advantages of (1) leveraging spatial knowledge already embedded in LLMs, +derived from textual sources like 3D tutorials, and (2) enabling conversational +3D generation and mesh understanding. A primary challenge is effectively +tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. +To address this, we introduce LLaMA-Mesh, a novel approach that represents the +vertex coordinates and face definitions of 3D meshes as plain text, allowing +direct integration with LLMs without expanding the vocabulary. We construct a +supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate +3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs +as required, and (3) understand and interpret 3D meshes. Our work is the first +to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge +for 3D mesh generation in a text-based format, effectively unifying the 3D and +text modalities. LLaMA-Mesh achieves mesh generation quality on par with models +trained from scratch while maintaining strong text generation performance. + +
+
+ comment: See the project website at + https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/ +
+
+
+
+
+ + ☆ SMILE-UHURA Challenge -- Small Vessel Segmentation at Mesoscopic Scale + from Ultra-High Resolution 7T Magnetic Resonance Angiograms + + +
+ The human brain receives nutrients and oxygen through an intricate network of +blood vessels. Pathology affecting small vessels, at the mesoscopic scale, +represents a critical vulnerability within the cerebral blood supply and can +lead to severe conditions, such as Cerebral Small Vessel Diseases. The advent +of 7 Tesla MRI systems has enabled the acquisition of higher spatial resolution +images, making it possible to visualise such vessels in the brain. However, the +lack of publicly available annotated datasets has impeded the development of +robust, machine learning-driven segmentation algorithms. To address this, the +SMILE-UHURA challenge was organised. This challenge, held in conjunction with +the ISBI 2023, in Cartagena de Indias, Colombia, aimed to provide a platform +for researchers working on related topics. The SMILE-UHURA challenge addresses +the gap in publicly available annotated datasets by providing an annotated +dataset of Time-of-Flight angiography acquired with 7T MRI. This dataset was +created through a combination of automated pre-segmentation and extensive +manual refinement. In this manuscript, sixteen submitted methods and two +baseline methods are compared both quantitatively and qualitatively on two +different datasets: held-out test MRAs from the same dataset as the training +data (with labels kept secret) and a separate 7T ToF MRA dataset where both +input volumes and labels are kept secret. The results demonstrate that most of +the submitted deep learning methods, trained on the provided training dataset, +achieved reliable segmentation performance. Dice scores reached up to 0.838 +$\pm$ 0.066 and 0.716 $\pm$ 0.125 on the respective datasets, with an average +performance of up to 0.804 $\pm$ 0.15. + +
+
+
+
+
+ + ☆ Adopting RAG for LLM-Aided Future Vehicle Design + + +
+ In this paper, we explore the integration of Large Language Models (LLMs) +with Retrieval-Augmented Generation (RAG) to enhance automated design and +software development in the automotive industry. We present two case studies: a +standardization compliance chatbot and a design copilot, both utilizing RAG to +provide accurate, context-aware responses. We evaluate four LLMs -- GPT-4o, +LLAMA3, Mistral, and Mixtral -- comparing their answering accuracy and execution +time. Our results demonstrate that while GPT-4 offers superior performance, +LLAMA3 and Mistral also show promising capabilities for local deployment, +addressing data privacy concerns in automotive applications. This study +highlights the potential of RAG-augmented LLMs in improving design workflows +and compliance in automotive engineering. + +
+
+ comment: Conference paper accepted in IEEE FLLM 2024 +
+
+
+
+
+ + ☆ Software Performance Engineering for Foundation Model-Powered Software + (FMware) + + +
+ The rise of Foundation Models (FMs) like Large Language Models (LLMs) is +revolutionizing software development. Despite the impressive prototypes, +transforming FMware into production-ready products demands complex engineering +across various domains. A critical but overlooked aspect is performance +engineering, which aims at ensuring FMware meets performance goals such as +throughput and latency to avoid user dissatisfaction and financial loss. Often, +performance considerations are an afterthought, leading to costly optimization +efforts post-deployment. FMware's high computational resource demands highlight +the need for efficient hardware use. Continuous performance engineering is +essential to prevent degradation. This paper highlights the significance of +Software Performance Engineering (SPE) in FMware, identifying four key +challenges: cognitive architecture design, communication protocols, tuning and +optimization, and deployment. These challenges are based on literature surveys +and experiences from developing an in-house FMware system. We discuss problems, +current practices, and innovative paths for the software engineering community. + +
+
+
+
+
+ + ☆ Automating Reformulation of Essence Specifications via Graph Rewriting + + +
+ Formulating an effective constraint model of a parameterised problem class is +crucial to the efficiency with which instances of the class can subsequently be +solved. It is difficult to know beforehand which of a set of candidate models +will perform best in practice. This paper presents a system that employs graph +rewriting to reformulate an input model for improved performance automatically. +By situating our work in the Essence abstract constraint specification +language, we can use the structure in its high level variable types to trigger +rewrites directly. We implement our system via rewrite rules expressed in the +Graph Programs 2 language, applied to the abstract syntax tree of an input +specification. We show how to automatically translate the solution of the +reformulated problem into a solution of the original problem for verification +and presentation. We demonstrate the efficacy of our system with a detailed +case study. + +
+
+ comment: Presented at the PTHG 2024 workshop +
+
+
+
+
+ + ☆ Piecing It All Together: Verifying Multi-Hop Multimodal Claims + + +
+ Existing claim verification datasets often do not require systems to perform +complex reasoning or effectively interpret multimodal evidence. To address +this, we introduce a new task: multi-hop multimodal claim verification. This +task challenges models to reason over multiple pieces of evidence from diverse +sources, including text, images, and tables, and determine whether the combined +multimodal evidence supports or refutes a given claim. To study this task, we +construct MMCV, a large-scale dataset comprising 16k multi-hop claims paired +with multimodal evidence, generated and refined using large language models, +with additional input from human feedback. We show that MMCV is challenging +even for the latest state-of-the-art multimodal large language models, +especially as the number of reasoning hops increases. Additionally, we +establish a human performance benchmark on a subset of MMCV. We hope this +dataset and its evaluation task will encourage future research in multimodal +multi-hop claim verification. + +
+
+
+
+
+ + ☆ OpenGeMM: A High-Utilization GeMM Accelerator Generator with Lightweight + RISC-V Control and Tight Memory Coupling + + +
+ Deep neural networks (DNNs) face significant challenges when deployed on +resource-constrained extreme edge devices due to their computational and +data-intensive nature. While standalone accelerators tailored for specific +application scenarios suffer from inflexible control and limited +programmability, generic hardware acceleration platforms coupled with RISC-V +CPUs can enable high reusability and flexibility, yet typically at the expense +of system level efficiency and low utilization. To fill this gap, we propose +OpenGeMM, an open-source acceleration platform, jointly demonstrating high +efficiency and utilization, as well as ease of configurability and +programmability. OpenGeMM encompasses a parameterized Chisel-coded GeMM +accelerator, a lightweight RISC-V processor, and a tightly coupled multi-banked +scratchpad memory. The GeMM core utilization and system efficiency are boosted +through three mechanisms: configuration pre-loading, input pre-fetching with +output buffering, and programmable strided memory access. Experimental results +show that OpenGeMM can consistently achieve hardware utilization ranging from +81.89% to 99.34% across diverse CNN and Transformer workloads. Compared to the +SotA open-source Gemmini accelerator, OpenGeMM demonstrates a 3.58x to 16.40x +speedup on normalized throughput across a wide variety of GeMM workloads, while +achieving 4.68 TOPS/W system efficiency. + +
+
+
+
+
+ + ☆ Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models + + +
+ Visual prompting (VP) is a new technique that adapts well-trained frozen +models for source domain tasks to target domain tasks. This study examines VP's +benefits for black-box model-level backdoor detection. The visual prompt in VP +maps class subspaces between source and target domains. We identify a +misalignment, termed class subspace inconsistency, between clean and poisoned +datasets. Based on this, we introduce \textsc{BProm}, a black-box model-level +detection method to identify backdoors in suspicious models, if any. +\textsc{BProm} leverages the low classification accuracy of prompted models +when backdoors are present. Extensive experiments confirm \textsc{BProm}'s +effectiveness. + +
+
+
+
+
+ + ☆ Navigating the Risks: A Survey of Security, Privacy, and Ethics Threats + in LLM-Based Agents + + +
+ With the continuous development of large language models (LLMs), +transformer-based models have made groundbreaking advances in numerous natural +language processing (NLP) tasks, leading to the emergence of a series of agents +that use LLMs as their control hub. While LLMs have achieved success in various +tasks, they face numerous security and privacy threats, which become even more +severe in the agent scenarios. To enhance the reliability of LLM-based +applications, a range of research has emerged to assess and mitigate these +risks from different perspectives. + To help researchers gain a comprehensive understanding of various risks, this +survey collects and analyzes the different threats faced by these agents. To +address the challenges posed by previous taxonomies in handling cross-module +and cross-stage threats, we propose a novel taxonomy framework based on the +sources and impacts. Additionally, we identify six key features of LLM-based +agents, based on which we summarize the current research progress and analyze +their limitations. Subsequently, we select four representative agents as case +studies to analyze the risks they may face in practical use. Finally, based on +the aforementioned analyses, we propose future research directions from the +perspectives of data, methodology, and policy, respectively. + +
+
+
+
+
+ + ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details on one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage fine +grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads up to 2x reduction of time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ☆ Toward a Cohesive AI and Simulation Software Ecosystem for Scientific + Innovation + + +
+ In this paper, we discuss the need for an integrated software stack that +unites artificial intelligence (AI) and modeling and simulation (ModSim) tools +to advance scientific discovery. The authors advocate for a unified AI/ModSim +software ecosystem that ensures compatibility across a wide range of software +on diverse high-performance computing systems, promoting ease of deployment, +version management, and binary distribution. Key challenges highlighted include +balancing the distinct needs of AI and ModSim, especially in terms of software +build practices, dependency management, and compatibility. The document +underscores the importance of continuous integration, community-driven +stewardship, and collaboration with the Department of Energy (DOE) to develop a +portable and cohesive scientific software ecosystem. Recommendations focus on +supporting standardized environments through initiatives like the Extreme-scale +Scientific Software Stack (E4S) and Spack to foster interdisciplinary +innovation and facilitate new scientific advancements. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in + LLMs + + +
+ Large language models (LLMs) excel in high-resource languages but face +notable challenges in low-resource languages like Mongolian. This paper +addresses these challenges by categorizing capabilities into language abilities +(syntax and semantics) and cognitive abilities (knowledge and reasoning). To +systematically evaluate these areas, we developed MM-Eval, a specialized +dataset based on Modern Mongolian Language Textbook I and enriched with WebQSP +and MGSM datasets. + Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat, +Llama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models +performed better on syntactic tasks than semantic tasks, highlighting a gap in +deeper language understanding; and 2) knowledge tasks showed a moderate +decline, suggesting that models can transfer general knowledge from +high-resource to low-resource contexts. + The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge, +and 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in +low-resource languages like Mongolian. The dataset is available at +https://github.com/joenahm/MM-Eval. + +
+
+
+
+
+ + ☆ ResidualDroppath: Enhancing Feature Reuse over Residual Connections + + +
+ Residual connections are one of the most important components in neural +network architectures for mitigating the vanishing gradient problem and +facilitating the training of much deeper networks. One possible explanation for +how residual connections aid deeper network training is by promoting feature +reuse. However, we identify and analyze the limitations of feature reuse with +vanilla residual connections. To address these limitations, we propose +modifications in training methods. Specifically, we provide an additional +opportunity for the model to learn feature reuse with residual connections +through two types of iterations during training. The first type of iteration +involves using droppath, which enforces feature reuse by randomly dropping a +subset of layers. The second type of iteration focuses on training the dropped +parts of the model while freezing the undropped parts. As a result, the dropped +parts learn in a way that encourages feature reuse, as the model relies on the +undropped parts with feature reuse in mind. Overall, we demonstrated +performance improvements in models with residual connections for image +classification in certain cases. + +
+
+
+
+
+ + ☆ Renal Cell Carcinoma subtyping: learning from multi-resolution + localization + + +
+ Renal Cell Carcinoma is typically asymptomatic at the early stages for many +patients. This leads to a late diagnosis of the tumor, where the curability +likelihood is lower, and makes the mortality rate of Renal Cell Carcinoma high, +with respect to its incidence rate. To increase the survival chance, a fast and +correct categorization of the tumor subtype is paramount. Nowadays, +computerized methods, based on artificial intelligence, represent an +interesting opportunity to improve the productivity and the objectivity of the +microscopy-based Renal Cell Carcinoma diagnosis. Nonetheless, much of their +exploitation is hampered by the paucity of annotated datasets, essential for a +proficient training of supervised machine learning technologies. This study +sets out to investigate a novel self supervised training strategy for machine +learning diagnostic tools, based on the multi-resolution nature of the +histological samples. We aim at reducing the need for annotated datasets, without +significantly reducing the accuracy of the tool. We demonstrate the +classification capability of our tool on a whole slide imaging dataset for +Renal Cancer subtyping, and we compare our solution with several +state-of-the-art classification counterparts. + +
+
+
+
+
+ + ☆ An Explainable Attention Model for Cervical Precancer Risk + Classification using Colposcopic Images + + +
+ Cervical cancer remains a major worldwide health issue, with early +identification and risk assessment playing critical roles in effective +preventive interventions. This paper presents the Cervix-AID-Net model for +cervical precancer risk classification. The study designs and evaluates the +proposed Cervix-AID-Net model based on patients' colposcopy images. The model +comprises a Convolutional Block Attention Module (CBAM) and convolutional +layers that extract interpretable and representative features of colposcopic +images to distinguish high-risk and low-risk cervical precancer. In addition, +the proposed Cervix-AID-Net model integrates four explainable techniques, +namely gradient class activation maps, Local Interpretable Model-agnostic +Explanations, CartoonX, and pixel rate distortion explanation based on output +feature maps and input features. The evaluation using holdout and ten-fold +cross-validation techniques yielded a classification accuracy of 99.33\% and +99.81\%. The analysis revealed that CartoonX provides meticulous explanations +for the decision of the Cervix-AID-Net model due to its ability to provide the +relevant piece-wise smooth part of the image. The effect of Gaussian noise and +blur on the input shows that the performance remains unchanged up to Gaussian +noise of 3\% and blur of 10\%, while the performance reduces thereafter. A +comparison study of the proposed model's performance compared to other deep +learning approaches highlights the Cervix-AID-Net model's potential as a +supplemental tool for increasing the effectiveness of cervical precancer risk +assessment. The proposed method, which incorporates the CBAM and explainable +artificial integration, has the potential to influence cervical cancer +prevention and early detection, improving patient outcomes and lowering the +worldwide burden of this preventable disease. + +
+
+ comment: 19 pages, 9 figures, and 7 tables +
+
+
+
+
+ + ☆ DiffRoad: Realistic and Diverse Road Scenario Generation for Autonomous + Vehicle Testing + + +
+ Generating realistic and diverse road scenarios is essential for autonomous +vehicle testing and validation. Nevertheless, owing to the complexity and +variability of real-world road environments, creating authentic and varied +scenarios for intelligent driving testing is challenging. In this paper, we +propose DiffRoad, a novel diffusion model designed to produce controllable and +high-fidelity 3D road scenarios. DiffRoad leverages the generative capabilities +of diffusion models to synthesize road layouts from white noise through an +inverse denoising process, preserving real-world spatial features. To enhance +the quality of generated scenarios, we design the Road-UNet architecture, +optimizing the balance between backbone and skip connections for high-realism +scenario generation. Furthermore, we introduce a road scenario evaluation +module that screens adequate and reasonable scenarios for intelligent driving +testing using two critical metrics: road continuity and road reasonableness. +Experimental results on multiple real-world datasets demonstrate DiffRoad's +ability to generate realistic and smooth road structures while maintaining the +original distribution. Additionally, the generated scenarios can be fully +automated into the OpenDRIVE format, facilitating generalized autonomous +vehicle simulation testing. DiffRoad provides a rich and diverse scenario +library for large-scale autonomous vehicle testing and offers valuable insights +for future infrastructure designs that are better suited for autonomous +vehicles. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ AI-driven inverse design of materials: Past, present and future + + +
+ The discovery of advanced materials is the cornerstone of human technological +development and progress. The structures of materials and their corresponding +properties are essentially the result of a complex interplay of multiple +degrees of freedom such as lattice, charge, spin, symmetry, and topology. This +poses significant challenges for the inverse design methods of materials. +Humans have long explored new materials through a large number of experiments +and proposed corresponding theoretical systems to predict new material +properties and structures. With the improvement of computational power, +researchers have gradually developed various electronic structure calculation +methods, particularly those based on density functional theory, as well +as high-throughput computational methods. Recently, the rapid development of +artificial intelligence technology in the field of computer science has enabled +the effective characterization of the implicit association between material +properties and structures, thus opening up an efficient paradigm for the +inverse design of functional materials. Significant progress has been made in +the inverse design of materials based on generative and discriminative models, +attracting widespread attention from researchers. Considering this rapid +technological progress, in this survey, we look back on the latest advancements +in AI-driven inverse design of materials by introducing the background, key +findings, and mainstream technological development routes. In addition, we +summarize the remaining issues for future directions. This survey provides the +latest overview of AI-driven inverse design of materials, which can serve as a +useful resource for researchers. + +
+
+ comment: 43 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ An Adaptive Open-Source Dataset Generation Framework for Machine + Learning Tasks in Logic Synthesis + + +
+ This paper introduces an adaptive logic synthesis dataset generation +framework designed to enhance machine learning applications within the logic +synthesis process. Unlike previous dataset generation flows that were tailored +for specific tasks or lacked integrated machine learning capabilities, the +proposed framework supports a comprehensive range of machine learning tasks by +encapsulating the three fundamental steps of logic synthesis: Boolean +representation, logic optimization, and technology mapping. It preserves the +original information in the intermediate files that can be stored in both +Verilog and GraphML formats. Verilog files enable semi-customizability, +allowing researchers to add steps and incrementally refine the generated +dataset. The framework also includes an adaptive circuit engine to facilitate +the loading of GraphML files for final dataset packaging and sub-dataset +extraction. The generated OpenLS-D dataset comprises 46 combinational designs +from established benchmarks, totaling over 966,000 Boolean circuits, with each +design containing 21,000 circuits generated from 1000 synthesis recipes, +including 7000 Boolean networks, 7000 ASIC netlists, and 7000 FPGA netlists. +Furthermore, OpenLS-D supports integrating newly desired data features, making +it more versatile for new challenges. The utility of OpenLS-D is demonstrated +through four distinct downstream tasks: circuit classification, circuit +ranking, quality of results (QoR) prediction, and probability prediction. Each +task highlights different internal steps of logic synthesis, with the datasets +extracted and relabeled from the OpenLS-D dataset using the circuit engine. The +experimental results confirm the dataset's diversity and extensive +applicability. The source code and datasets are available at +https://github.com/Logic-Factory/ACE/blob/master/OpenLS-D/readme.md. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ SAG-ViT: A Scale-Aware, High-Fidelity Patching Approach with Graph + Attention for Vision Transformers + + +
+ Image classification is a computer vision task where a model analyzes an +image to categorize it into a specific label. Vision Transformers (ViT) improve +this task by leveraging self-attention to capture complex patterns and long +range relationships between image patches. However, a key challenge for ViTs is +efficiently incorporating multiscale feature representations, which is inherent +in CNNs through their hierarchical structure. In this paper, we introduce the +Scale-Aware Graph Attention Vision Transformer (SAG-ViT), a novel framework +that addresses this challenge by integrating multi-scale features. Using +EfficientNet as a backbone, the model extracts multi-scale feature maps, which +are divided into patches to preserve semantic information. These patches are +organized into a graph based on spatial and feature similarities, with a Graph +Attention Network (GAT) refining the node embeddings. Finally, a Transformer +encoder captures long-range dependencies and complex interactions. The SAG-ViT +is evaluated on benchmark datasets, demonstrating its effectiveness in +enhancing image classification performance. + +
+
+ comment: 10 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Script-centric behavior understanding for assisted autism spectrum + disorder diagnosis ICASSP 2025 + + +
+ Observing and analyzing children's social behaviors is crucial for the early +diagnosis of Autism Spectrum Disorders (ASD). This work focuses on +automatically detecting ASD using computer vision techniques and large language +models (LLMs). Existing methods typically rely on supervised learning. However, +the scarcity of ASD diagnostic datasets and the lack of interpretability in +diagnostic results significantly limits its clinical application. To address +these challenges, we introduce a novel unsupervised approach based on +script-centric behavior understanding. Our pipeline converts video content into +scripts that describe the behavior of characters, leveraging the +generalizability of large language models to detect ASD in a zero-shot or +few-shot manner. Specifically, we propose a scripts transcription module for +multimodal behavior data textualization and a domain prompts module to bridge +LLMs. Our method achieves an accuracy of 92.00\% in diagnosing ASD in children +with an average age of 24 months, surpassing the performance of supervised +learning methods by 3.58\% absolutely. Extensive experiments confirm the +effectiveness of our approach and suggest its potential for advancing ASD +research through LLMs. + +
+
+ comment: 5 pages, 4 figures, submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Quantum Machine Learning: An Interplay Between Quantum Computing and + Machine Learning + + +
+ Quantum machine learning (QML) is a rapidly growing field that combines +quantum computing principles with traditional machine learning. It seeks to +revolutionize machine learning by harnessing the unique capabilities of quantum +mechanics and employs machine learning techniques to advance quantum computing +research. This paper introduces quantum computing for the machine learning +paradigm, where variational quantum circuits (VQC) are used to develop QML +architectures on noisy intermediate-scale quantum (NISQ) devices. We discuss +machine learning for the quantum computing paradigm, showcasing our recent +theoretical and empirical findings. In particular, we delve into future +directions for studying QML, exploring the potential industrial impacts of QML +research. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Automated Segmentation of Ischemic Stroke Lesions in Non-Contrast + Computed Tomography Images for Enhanced Treatment and Prognosis MICCAI + + +
+ Stroke is the second leading cause of death worldwide, and is increasingly +prevalent in low- and middle-income countries (LMICs). Timely interventions can +significantly influence stroke survivability and the quality of life after +treatment. However, the standard and most widely available imaging method for +confirming strokes and their sub-types, the NCCT, is more challenging and +time-consuming to employ in cases of ischemic stroke. For this reason, we +developed an automated method for ischemic stroke lesion segmentation in NCCTs +using the nnU-Net framework, aimed at enhancing early treatment and improving +the prognosis of ischemic stroke patients. We achieved Dice scores of 0.596 and +Intersection over Union (IoU) scores of 0.501 on the sampled dataset. After +adjusting for outliers, these scores improved to 0.752 for the Dice score and +0.643 for the IoU. Proper delineation of the region of infarction can help +clinicians better assess the potential impact of the infarction, and guide +treatment procedures. + +
+
+ comment: 7 pages, 3 figures, MICCAI Meets Africa Workshop +
+
+
+
+
+ + ☆ Imagined Speech and Visual Imagery as Intuitive Paradigms for + Brain-Computer Interfaces + + +
+ Recent advancements in brain-computer interface (BCI) technology have +emphasized the promise of imagined speech and visual imagery as effective +paradigms for intuitive communication. This study investigates the +classification performance and brain connectivity patterns associated with +these paradigms, focusing on decoding accuracy across selected word classes. +Sixteen participants engaged in tasks involving thirteen imagined speech and +visual imagery classes, revealing above-chance classification accuracy for both +paradigms. Variability in classification accuracy across individual classes +highlights the influence of sensory and motor associations in imagined speech +and vivid visual associations in visual imagery. Connectivity analysis further +demonstrated increased functional connectivity in language-related and sensory +regions for imagined speech, whereas visual imagery activated spatial and +visual processing networks. These findings suggest the potential of imagined +speech and visual imagery as an intuitive and scalable paradigm for BCI +communication when selecting optimal word classes. Further exploration of the +decoding outcomes for these two paradigms could provide insights for practical +BCI communication. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Less is More: Unseen Domain Fake News Detection via Causal Propagation + Substructures + + +
+ The spread of fake news on social media poses significant threats to +individuals and society. Text-based and graph-based models have been employed +for fake news detection by analysing news content and propagation networks, +showing promising results in specific scenarios. However, these data-driven +models heavily rely on pre-existing in-distribution data for training, limiting +their performance when confronted with fake news from emerging or previously +unseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news +is a challenging yet critical task. In this paper, we introduce the Causal +Subgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to +enhance zero-shot fake news detection by extracting causal substructures from +propagation graphs using in-distribution data and generalising this approach to +OOD data. The model employs a graph neural network based mask generation +process to identify dominant nodes and edges within the propagation graph, +using these substructures for fake news detection. Additionally, the +performance of CSDA is further improved through contrastive learning in +few-shot scenarios, where a limited amount of OOD data is available for +training. Extensive experiments on public social media datasets demonstrate +that CSDA effectively handles OOD fake news detection, achieving a 7 to 16 +percent accuracy improvement over other state-of-the-art models. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ LTLf+ and PPLTL+: Extending LTLf and PPLTL to Infinite Traces + + +
+ We introduce LTLf+ and PPLTL+, two logics to express properties of infinite +traces, that are based on the linear-time temporal logics LTLf and PPLTL on +finite traces. LTLf+/PPLTL+ use levels of Manna and Pnueli's LTL +safety-progress hierarchy, and thus have the same expressive power as LTL. +However, they also retain a crucial characteristic of the reactive synthesis +problem for the base logics: the game arena for strategy extraction can be +derived from deterministic finite automata (DFA). Consequently, these logics +circumvent the notorious difficulties associated with determinizing infinite +trace automata, typical of LTL reactive synthesis. We present DFA-based +synthesis techniques for LTLf+/PPLTL+, and show that synthesis is +2EXPTIME-complete for LTLf+ (matching LTLf) and EXPTIME-complete for PPLTL+ +(matching PPLTL). Notably, while PPLTL+ retains the full expressive power of +LTL, reactive synthesis is EXPTIME-complete instead of 2EXPTIME-complete. The +techniques are also adapted to optimally solve satisfiability, validity, and +model-checking, to get EXPSPACE-complete for LTLf+ (extending a recent result +for the guarantee level using LTLf), and PSPACE-complete for PPLTL+. + +
+
+
+
+
+ + ☆ Your Fixed Watermark is Fragile: Towards Semantic-Aware Watermark for + EaaS Copyright Protection + + +
+ Embedding-as-a-Service (EaaS) has emerged as a successful business pattern +but faces significant challenges related to various forms of copyright +infringement, including API misuse and different attacks. Various studies have +proposed backdoor-based watermarking schemes to protect the copyright of EaaS +services. In this paper, we reveal that previous watermarking schemes possess +semantic-independent characteristics and propose the Semantic Perturbation +Attack (SPA). Our theoretical and experimental analyses demonstrate that this +semantic-independent nature makes current watermarking schemes vulnerable to +adaptive attacks that exploit semantic perturbations test to bypass watermark +verification. To address this vulnerability, we propose the Semantic Aware +Watermarking (SAW) scheme, a robust defense mechanism designed to resist SPA, +by injecting a watermark that adapts to the text semantics. Extensive +experimental results across multiple datasets demonstrate that the True +Positive Rate (TPR) for detecting watermarked samples under SPA can reach up to +more than 95%, rendering previous watermarks ineffective. Meanwhile, our +watermarking scheme can resist such attack while ensuring the watermark +verification capability. Our code is available at +https://github.com/Zk4-ps/EaaS-Embedding-Watermark. + +
+
+
+
+
+ + ☆ Multi-scale Generative Modeling for Fast Sampling + + +
+ While working within the spatial domain can pose problems associated with +ill-conditioned scores caused by power-law decay, recent advances in +diffusion-based generative models have shown that transitioning to the wavelet +domain offers a promising alternative. However, within the wavelet domain, we +encounter unique challenges, especially the sparse representation of +high-frequency coefficients, which deviates significantly from the Gaussian +assumptions in the diffusion process. To this end, we propose a multi-scale +generative modeling in the wavelet domain that employs distinct strategies for +handling low and high-frequency bands. In the wavelet domain, we apply +score-based generative modeling with well-conditioned scores for low-frequency +bands, while utilizing a multi-scale generative adversarial learning for +high-frequency bands. As supported by the theoretical analysis and experimental +results, our model significantly improves performance and reduces the number of +trainable parameters, sampling steps, and time. + +
+
+
+
+
+ + ☆ EEG-Based Speech Decoding: A Novel Approach Using Multi-Kernel Ensemble + Diffusion Models + + +
+ In this study, we propose an ensemble learning framework for +electroencephalogram-based overt speech classification, leveraging denoising +diffusion probabilistic models with varying convolutional kernel sizes. The +ensemble comprises three models with kernel sizes of 51, 101, and 201, +effectively capturing multi-scale temporal features inherent in signals. This +approach improves the robustness and accuracy of speech decoding by +accommodating the rich temporal complexity of neural signals. The ensemble +models work in conjunction with conditional autoencoders that refine the +reconstructed signals and maximize the useful information for downstream +classification tasks. The results indicate that the proposed ensemble-based +approach significantly outperforms individual models and existing +state-of-the-art techniques. These findings demonstrate the potential of +ensemble methods in advancing brain signal decoding, offering new possibilities +for non-verbal communication applications, particularly in brain-computer +interface systems aimed at aiding individuals with speech impairments. + +
+
+
+
+
+ + ☆ Learning Hand State Estimation for a Light Exoskeleton + + +
+ We propose a machine learning-based estimator of the hand state for +rehabilitation purposes, using light exoskeletons. These devices are easy to +use and useful for delivering domestic and frequent therapies. We build a +supervised approach using information from the muscular activity of the forearm +and the motion of the exoskeleton to reconstruct the hand's opening degree and +compliance level. Such information can be used to evaluate the therapy progress +and develop adaptive control behaviors. Our approach is validated with a real +light exoskeleton. The experiments demonstrate good predictive performance of +our approach when trained on data coming from a single user and tested on the +same user, even across different sessions. This generalization capability makes +our system promising for practical use in real rehabilitation. + +
+
+
+
+
+ + ☆ StreamAdapter: Efficient Test Time Adaptation from Contextual Streams + + +
+ In-context learning (ICL) allows large language models (LLMs) to adapt to new +tasks directly from the given demonstrations without requiring gradient +updates. While recent advances have expanded context windows to accommodate +more demonstrations, this approach increases inference costs without +necessarily improving performance. To mitigate these issues, we propose +StreamAdapter, a novel approach that directly updates model parameters from +context at test time, eliminating the need for explicit in-context +demonstrations. StreamAdapter employs context mapping and weight absorption +mechanisms to dynamically transform ICL demonstrations into parameter updates +with minimal additional parameters. By reducing reliance on numerous in-context +examples, StreamAdapter significantly reduces inference costs and allows for +efficient inference with constant time complexity, regardless of demonstration +count. Extensive experiments across diverse tasks and model architectures +demonstrate that StreamAdapter achieves comparable or superior adaptation +capability to ICL while requiring significantly fewer demonstrations. The +superior task adaptation and context encoding capabilities of StreamAdapter on +both language understanding and generation tasks provide a new perspective for +adapting LLMs at test time using context, allowing for more efficient +adaptation across scenarios and more cost-effective inference. + +
+
+ comment: 22 Pages, 9 Figures +
+
+
+
+
+ + ☆ Cross-Modal Consistency in Multimodal Large Language Models + + +
+ Recent developments in multimodal methodologies have marked the beginning of +an exciting era for models adept at processing diverse data types, encompassing +text, audio, and visual content. Models like GPT-4V, which merge computer +vision with advanced language processing, exhibit extraordinary proficiency in +handling intricate tasks that require a simultaneous understanding of both +textual and visual information. Prior research efforts have meticulously +evaluated the efficacy of these Vision Large Language Models (VLLMs) in various +domains, including object detection, image captioning, and other related +fields. However, existing analyses have often suffered from limitations, +primarily centering on the isolated evaluation of each modality's performance +while neglecting to explore their intricate cross-modal interactions. +Specifically, the question of whether these models achieve the same level of +accuracy when confronted with identical task instances across different +modalities remains unanswered. In this study, we take the initiative to delve +into the interaction and comparison among these modalities of interest by +introducing a novel concept termed cross-modal consistency. Furthermore, we +propose a quantitative evaluation framework founded on this concept. Our +experimental findings, drawn from a curated collection of parallel +vision-language datasets developed by us, unveil a pronounced inconsistency +between the vision and language modalities within GPT-4V, despite its portrayal +as a unified multimodal model. Our research yields insights into the +appropriate utilization of such models and hints at potential avenues for +enhancing their design. + +
+
+
+
+
+ + ☆ Harnessing multiple LLMs for Information Retrieval: A case study on Deep + Learning methodologies in Biodiversity publications + + +
+ Deep Learning (DL) techniques are increasingly applied in scientific studies +across various domains to address complex research questions. However, the +methodological details of these DL models are often hidden in the unstructured +text. As a result, critical information about how these models are designed, +trained, and evaluated is challenging to access and comprehend. To address this +issue, in this work, we use five different open-source Large Language Models +(LLMs): Llama-3 70B, Llama-3.1 70B, Mixtral-8x22B-Instruct-v0.1, Mixtral 8x7B, +and Gemma 2 9B in combination with Retrieval-Augmented Generation (RAG) +approach to extract and process DL methodological details from scientific +publications automatically. We built a voting classifier from the outputs of +five LLMs to accurately report DL methodological information. We tested our +approach using biodiversity publications, building upon our previous research. +To validate our pipeline, we employed two datasets of DL-related biodiversity +publications: a curated set of 100 publications from our prior work and a set +of 364 publications from the Ecological Informatics journal. Our results +demonstrate that the multi-LLM, RAG-assisted pipeline enhances the retrieval of +DL methodological information, achieving an accuracy of 69.5% (417 out of 600 +comparisons) based solely on textual content from publications. This +performance was assessed against human annotators who had access to code, +figures, tables, and other supplementary information. Although demonstrated in +biodiversity, our methodology is not limited to this field; it can be applied +across other scientific domains where detailed methodological reporting is +essential for advancing knowledge and ensuring reproducibility. This study +presents a scalable and reliable approach for automating information +extraction, facilitating better reproducibility and knowledge transfer across +studies. + +
+
+
+
+
+ + ☆ How Good is ChatGPT at Audiovisual Deepfake Detection: A Comparative + Study of ChatGPT, AI Models and Human Perception + + +
+ Multimodal deepfakes involving audiovisual manipulations are a growing threat +because they are difficult to detect with the naked eye or using unimodal deep +learning-based forgery detection methods. Audiovisual forensic models, while +more capable than unimodal models, require large training datasets and are +computationally expensive for training and inference. Furthermore, these models +lack interpretability and often do not generalize well to unseen manipulations. +In this study, we examine the detection capabilities of a large language model +(LLM) (i.e., ChatGPT) to identify and account for any possible visual and +auditory artifacts and manipulations in audiovisual deepfake content. Extensive +experiments are conducted on videos from a benchmark multimodal deepfake +dataset to evaluate the detection performance of ChatGPT and compare it with +the detection capabilities of state-of-the-art multimodal forensic models and +humans. Experimental results demonstrate the importance of domain knowledge and +prompt engineering for video forgery detection tasks using LLMs. Unlike +approaches based on end-to-end learning, ChatGPT can account for spatial and +spatiotemporal artifacts and inconsistencies that may exist within or across +modalities. Additionally, we discuss the limitations of ChatGPT for multimedia +forensic tasks. + +
+
+
+
+
+ + ☆ Automating Autograding: Large Language Models as Test Suite Generators + for Introductory Programming + + +
+ Automatically graded programming assignments provide instant feedback to +students and significantly reduce manual grading time for instructors. However, +creating comprehensive suites of test cases for programming problems within +automatic graders can be time-consuming and complex. The effort needed to +define test suites may deter some instructors from creating additional problems +or lead to inadequate test coverage, potentially resulting in misleading +feedback on student solutions. Such limitations may reduce student access to +the well-documented benefits of timely feedback when learning programming. + In this work, we evaluate the effectiveness of using Large Language Models +(LLMs), as part of a larger workflow, to automatically generate test suites for +CS1-level programming problems. Each problem's statement and reference solution +are provided to GPT-4 to produce a test suite that can be used by an +autograder. We evaluate our proposed approach using a sample of 26 problems, +and more than 25,000 attempted solutions to those problems, submitted by +students in an introductory programming course. We compare the performance of +the LLM-generated test suites against the instructor-created test suites for +each problem. Our findings reveal that LLM-generated test suites can correctly +identify most valid solutions, and for most problems are at least as +comprehensive as the instructor test suites. Additionally, the LLM-generated +test suites exposed ambiguities in some problem statements, underscoring their +potential to improve both autograding and instructional design. + +
+
+ comment: Submitted to Journal of Computer Assisted Learning +
+
+
+
+
+ + ☆ Cross Space and Time: A Spatio-Temporal Unitized Model for Traffic Flow + Forecasting + + +
+ Predicting spatio-temporal traffic flow presents significant challenges due +to complex interactions between spatial and temporal factors. Existing +approaches often address these dimensions in isolation, neglecting their +critical interdependencies. In this paper, we introduce the Spatio-Temporal +Unitized Model (STUM), a unified framework designed to capture both spatial and +temporal dependencies while addressing spatio-temporal heterogeneity through +techniques such as distribution alignment and feature fusion. It also ensures +both predictive accuracy and computational efficiency. Central to STUM is the +Adaptive Spatio-temporal Unitized Cell (ASTUC), which utilizes low-rank +matrices to seamlessly store, update, and interact with space, time, as well as +their correlations. Our framework is also modular, allowing it to integrate +with various spatio-temporal graph neural networks through components such as +backbone models, feature extractors, residual fusion blocks, and predictive +modules to collectively enhance forecasting outcomes. Experimental results +across multiple real-world datasets demonstrate that STUM consistently improves +prediction performance with minimal computational cost. These findings are +further supported by hyperparameter optimization, pre-training analysis, and +result visualization. We provide our source code for reproducibility at +https://anonymous.4open.science/r/STUM-E4F0. + +
+
+
+
+
+ + ☆ Enhancing Financial Domain Adaptation of Language Models via Model + Augmentation + + +
+ The domain adaptation of language models, including large language models +(LLMs), has become increasingly important as the use of such models continues +to expand. This study demonstrates the effectiveness of Composition to Augment +Language Models (CALM) in adapting to the financial domain. CALM is a model to +extend the capabilities of existing models by introducing cross-attention +between two LLMs with different functions. In our experiments, we developed a +CALM to enhance the financial performance of an LLM with strong response +capabilities by leveraging a financial-specialized LLM. Notably, the CALM was +trained using a financial dataset different from the one used to train the +financial-specialized LLM, confirming CALM's ability to adapt to various +datasets. The models were evaluated through quantitative Japanese financial +benchmarks and qualitative response comparisons, demonstrating that CALM +enables superior responses with higher scores than the original models and +baselines. Additionally, comparative experiments on connection points revealed +that connecting the middle layers of the models is most effective in +facilitating adaptation to the financial domain. These findings confirm that +CALM is a practical approach for adapting LLMs to the financial domain. + +
+
+
+
+
+ + ☆ Towards Unified Neural Decoding of Perceived, Spoken and Imagined Speech + from EEG Signals + + +
+ Brain signals accompany various information relevant to human actions and +mental imagery, making them crucial to interpreting and understanding human +intentions. Brain-computer interface technology leverages this brain activity +to generate external commands for controlling the environment, offering +critical advantages to individuals with paralysis or locked-in syndrome. Within +the brain-computer interface domain, brain-to-speech research has gained +attention, focusing on the direct synthesis of audible speech from brain +signals. Most current studies decode speech from brain activity using invasive +techniques and emphasize spoken speech data. However, humans express various +speech states, and distinguishing these states through non-invasive approaches +remains a significant yet challenging task. This research investigated the +effectiveness of deep learning models for non-invasive-based neural signal +decoding, with an emphasis on distinguishing between different speech +paradigms, including perceived, overt, whispered, and imagined speech, across +multiple frequency bands. The model utilizing the spatial conventional neural +network module demonstrated superior performance compared to other models, +especially in the gamma band. Additionally, imagined speech in the theta +frequency band, where deep learning also showed strong effects, exhibited +statistically significant differences compared to the other speech paradigms. + +
+
+
+
+
+ + ☆ Programming with AI: Evaluating ChatGPT, Gemini, AlphaCode, and GitHub + Copilot for Programmers + + +
+ Our everyday lives now heavily rely on artificial intelligence (AI) powered +large language models (LLMs). Like regular users, programmers are also +benefiting from the newest large language models. In response to the critical +role that AI models play in modern software development, this study presents a +thorough evaluation of leading programming assistants, including ChatGPT, +Gemini(Bard AI), AlphaCode, and GitHub Copilot. The evaluation is based on +tasks like natural language processing and code generation accuracy in +different programming languages like Java, Python and C++. Based on the +results, it has emphasized their strengths and weaknesses and the importance of +further modifications to increase the reliability and accuracy of the latest +popular models. Although these AI assistants illustrate a high level of +progress in language understanding and code generation, along with ethical +considerations and responsible usage, they provoke a necessity for discussion. +With time, developing more refined AI technology is essential for achieving +advanced solutions in various fields, especially with the knowledge of the +feature intricacies of these models and their implications. This study offers a +comparison of different LLMs and provides essential feedback on the rapidly +changing area of AI models. It also emphasizes the need for ethical +developmental practices to actualize AI models' full potential. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Transferable Adversarial Attacks against ASR + + +
+ Given the extensive research and real-world applications of automatic speech +recognition (ASR), ensuring the robustness of ASR models against minor input +perturbations becomes a crucial consideration for maintaining their +effectiveness in real-time scenarios. Previous explorations into ASR model +robustness have predominantly revolved around evaluating accuracy on white-box +settings with full access to ASR models. Nevertheless, full ASR model details +are often not available in real-world applications. Therefore, evaluating the +robustness of black-box ASR models is essential for a comprehensive +understanding of ASR model resilience. In this regard, we thoroughly study the +vulnerability of practical black-box attacks in cutting-edge ASR models and +propose to employ two advanced time-domain-based transferable attacks alongside +our differentiable feature extractor. We also propose a speech-aware gradient +optimization approach (SAGO) for ASR, which forces mistranscription with +minimal impact on human imperceptibility through voice activity detection rule +and a speech-aware gradient-oriented optimizer. Our comprehensive experimental +results reveal performance enhancements compared to baseline approaches across +five models on two databases. + +
+
+ comment: IEEE SPL +
+
+
+
+
+ + ☆ Comprehensive and Practical Evaluation of Retrieval-Augmented Generation + Systems for Medical Question Answering + + +
+ Retrieval-augmented generation (RAG) has emerged as a promising approach to +enhance the performance of large language models (LLMs) in knowledge-intensive +tasks such as those from the medical domain. However, the sensitive nature of the +medical domain necessitates a completely accurate and trustworthy system. While +existing RAG benchmarks primarily focus on the standard retrieve-answer +setting, they overlook many practical scenarios that measure crucial aspects of +a reliable medical system. This paper addresses this gap by providing a +comprehensive evaluation framework for medical question-answering (QA) systems +in a RAG setting for these situations, including sufficiency, integration, and +robustness. We introduce Medical Retrieval-Augmented Generation Benchmark +(MedRGB) that provides various supplementary elements to four medical QA +datasets for testing LLMs' ability to handle these specific scenarios. +Utilizing MedRGB, we conduct extensive evaluations of both state-of-the-art +commercial LLMs and open-source models across multiple retrieval conditions. +Our experimental results reveal current models' limited ability to handle +noise and misinformation in the retrieved documents. We further analyze the +LLMs' reasoning processes to provide valuable insights and future directions +for developing RAG systems in this critical medical domain. + +
+
+
+
+
+ + ☆ Dynamic Neural Communication: Convergence of Computer Vision and + Brain-Computer Interface + + +
+ Interpreting human neural signals to decode static speech intentions such as +text or images and dynamic speech intentions such as audio or video is showing +great potential as an innovative communication tool. Human communication +accompanies various features, such as articulatory movements, facial +expressions, and internal speech, all of which are reflected in neural signals. +However, most studies only generate short or fragmented outputs, while +providing informative communication by leveraging various features from neural +signals remains challenging. In this study, we introduce a dynamic neural +communication method that leverages current computer vision and brain-computer +interface technologies. Our approach captures the user's intentions from neural +signals and decodes visemes in short time steps to produce dynamic visual +outputs. The results demonstrate the potential to rapidly capture and +reconstruct lip movements during natural speech attempts from human neural +signals, enabling dynamic neural communication through the convergence of +computer vision and brain--computer interface. + +
+
+ comment: 4 pages, 2 figures, 1 table, Name of Conference: International + Conference on Brain-Computer Interface +
+
+
+
+
+ + ☆ RibCageImp: A Deep Learning Framework for 3D Ribcage Implant Generation + + +
+ The recovery of damaged or resected ribcage structures requires precise, +custom-designed implants to restore the integrity and functionality of the +thoracic cavity. Traditional implant design methods rely mainly on manual +processes, making them time-consuming and susceptible to variability. In this +work, we explore the feasibility of automated ribcage implant generation using +deep learning. We present a framework based on 3D U-Net architecture that +processes CT scans to generate patient-specific implant designs. To the best of +our knowledge, this is the first investigation into automated thoracic implant +generation using deep learning approaches. Our preliminary results, while +moderate, highlight both the potential and the significant challenges in this +complex domain. These findings establish a foundation for future research in +automated ribcage reconstruction and identify key technical challenges that +need to be addressed for practical implementation. + +
+
+
+
+
+ + ☆ Improvement and Implementation of a Speech Emotion Recognition Model + Based on Dual-Layer LSTM + + +
+ This paper builds upon an existing speech emotion recognition model by adding +an additional LSTM layer to improve the accuracy and processing efficiency of +emotion recognition from audio data. By capturing the long-term dependencies +within audio sequences through a dual-layer LSTM network, the model can +recognize and classify complex emotional patterns more accurately. Experiments +conducted on the RAVDESS dataset validated this approach, showing that the +modified dual layer LSTM model improves accuracy by 2% compared to the +single-layer LSTM while significantly reducing recognition latency, thereby +enhancing real-time performance. These results indicate that the dual-layer +LSTM architecture is highly suitable for handling emotional features with +long-term dependencies, providing a viable optimization for speech emotion +recognition systems. This research provides a reference for practical +applications in fields like intelligent customer service, sentiment analysis +and human-computer interaction. + +
+
+
+
+
+ + ☆ Dynamic technology impact analysis: A multi-task learning approach to + patent citation prediction + + +
+ Machine learning (ML) models are valuable tools for analyzing the impact of +technology using patent citation information. However, existing ML-based +methods often struggle to account for the dynamic nature of the technology +impact over time and the interdependencies of these impacts across different +periods. This study proposes a multi-task learning (MTL) approach to enhance +the prediction of technology impact across various time frames by leveraging +knowledge sharing and simultaneously monitoring the evolution of technology +impact. First, we quantify the technology impacts and identify patterns through +citation analysis over distinct time periods. Next, we develop MTL models to +predict citation counts using multiple patent indicators over time. Finally, we +examine the changes in key input indicators and their patterns over different +periods using the SHapley Additive exPlanation method. We also offer guidelines +for validating and interpreting the results by employing statistical methods +and natural language processing techniques. A case study on battery +technologies demonstrates that our approach not only deepens the understanding +of technology impact, but also improves prediction accuracy, yielding valuable +insights for both academia and industry. + +
+
+
+
+
+ + ☆ DeBaTeR: Denoising Bipartite Temporal Graph for Recommendation + + +
+ Due to the difficulty of acquiring large-scale explicit user feedback, +implicit feedback (e.g., clicks or other interactions) is widely applied as an +alternative source of data, where user-item interactions can be modeled as a +bipartite graph. Due to the noisy and biased nature of implicit real-world +user-item interactions, identifying and rectifying noisy interactions are vital +to enhance model performance and robustness. Previous works on purifying +user-item interactions in collaborative filtering mainly focus on mining the +correlation between user/item embeddings and noisy interactions, neglecting the +benefit of temporal patterns in determining noisy interactions. Time +information, while enhancing the model utility, also bears its natural +advantage in helping to determine noisy edges, e.g., if someone usually watches +horror movies at night and talk shows in the morning, a record of watching a +horror movie in the morning is more likely to be a noisy interaction. Armed with +this observation, we introduce a simple yet effective mechanism for generating +time-aware user/item embeddings and propose two strategies for denoising +bipartite temporal graph in recommender systems (DeBaTeR): the first is through +reweighting the adjacency matrix (DeBaTeR-A), where a reliability score is +defined to reweight the edges through both soft assignment and hard assignment; +the second is through reweighting the loss function (DeBaTeR-L), where weights +are generated to reweight user-item samples in the losses. Extensive +experiments have been conducted to demonstrate the efficacy of our methods and +illustrate how time information indeed helps identify noisy edges. + +
+
+
+
+
+ + ☆ LEAP:D - A Novel Prompt-based Approach for Domain-Generalized Aerial + Object Detection ICIP 2024 + + +
+ Drone-captured images present significant challenges in object detection due +to varying shooting conditions, which can alter object appearance and shape. +Factors such as drone altitude, angle, and weather cause these variations, +influencing the performance of object detection algorithms. To tackle these +challenges, we introduce an innovative vision-language approach using learnable +prompts. This shift from conventional manual prompts aims to reduce +domain-specific knowledge interference, ultimately improving object detection +capabilities. Furthermore, we streamline the training process with a one-step +approach, updating the learnable prompt concurrently with model training, +enhancing efficiency without compromising performance. Our study contributes to +domain-generalized object detection by leveraging learnable prompts and +optimizing training processes. This enhances model robustness and adaptability +across diverse environments, leading to more effective aerial object detection. + +
+
+ comment: ICIP 2024 Workshop accepted paper +
+
+
+
+
+ + ☆ Gazing at Rewards: Eye Movements as a Lens into Human and AI + Decision-Making in Hybrid Visual Foraging + + +
+ Imagine searching a collection of coins for quarters ($0.25$), dimes +($0.10$), nickels ($0.05$), and pennies ($0.01$)-a hybrid foraging task where +observers look for multiple instances of multiple target types. In such tasks, +how do target values and their prevalence influence foraging and eye movement +behaviors (e.g., should you prioritize rare quarters or common nickels)? To +explore this, we conducted human psychophysics experiments, revealing that +humans are proficient reward foragers. Their eye fixations are drawn to regions +with higher average rewards, fixation durations are longer on more valuable +targets, and their cumulative rewards exceed chance, approaching the upper +bound of optimal foragers. To probe these decision-making processes of humans, +we developed a transformer-based Visual Forager (VF) model trained via +reinforcement learning. Our VF model takes a series of targets, their +corresponding values, and the search image as inputs, processes the images +using foveated vision, and produces a sequence of eye movements along with +decisions on whether to collect each fixated item. Our model outperforms all +baselines, achieves cumulative rewards comparable to those of humans, and +approximates human foraging behavior in eye movements and foraging biases +within time-limited environments. Furthermore, stress tests on +out-of-distribution tasks with novel targets, unseen values, and varying set +sizes demonstrate the VF model's effective generalization. Our work offers +valuable insights into the relationship between eye movements and +decision-making, with our model serving as a powerful tool for further +exploration of this connection. All data, code, and models will be made +publicly available. + +
+
+
+
+
+ + ☆ Advancing Diffusion Models: Alias-Free Resampling and Enhanced + Rotational Equivariance + + +
+ Recent advances in image generation, particularly via diffusion models, have +led to impressive improvements in image synthesis quality. Despite this, +diffusion models are still challenged by model-induced artifacts and limited +stability in image fidelity. In this work, we hypothesize that the primary +cause of this issue is the improper resampling operation that introduces +aliasing in the diffusion model and a careful alias-free resampling dictated by +image processing theory can improve the model's performance in image synthesis. +We propose the integration of alias-free resampling layers into the UNet +architecture of diffusion models without adding extra trainable parameters, +thereby maintaining computational efficiency. We then assess whether these +theory-driven modifications enhance image quality and rotational equivariance. +Our experimental results on benchmark datasets, including CIFAR-10, MNIST, and +MNIST-M, reveal consistent gains in image quality, particularly in terms of FID +and KID scores. Furthermore, we propose a modified diffusion process that +enables user-controlled rotation of generated images without requiring +additional training. Our findings highlight the potential of theory-driven +enhancements such as alias-free resampling in generative models to improve +image quality while maintaining model efficiency and pioneer future research +directions to incorporate them into video-generating diffusion models, enabling +deeper exploration of the applications of alias-free resampling in generative +modeling. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Towards Scalable Handwriting Communication via EEG Decoding and Latent + Embedding Integration + + +
+ In recent years, brain-computer interfaces have made advances in decoding +various motor-related tasks, including gesture recognition and movement +classification, utilizing electroencephalogram (EEG) data. These developments +are fundamental in exploring how neural signals can be interpreted to recognize +specific physical actions. This study centers on a written alphabet +classification task, where we aim to decode EEG signals associated with +handwriting. To achieve this, we incorporate hand kinematics to guide the +extraction of the consistent embeddings from high-dimensional neural recordings +using auxiliary variables (CEBRA). These CEBRA embeddings, along with the EEG, +are processed by a parallel convolutional neural network model that extracts +features from both data sources simultaneously. The model classifies nine +different handwritten characters, including symbols such as exclamation marks +and commas, within the alphabet. We evaluate the model using a quantitative +five-fold cross-validation approach and explore the structure of the embedding +space through visualizations. Our approach achieves a classification accuracy +of 91 % for the nine-class task, demonstrating the feasibility of fine-grained +handwriting decoding from EEG. + +
+
+ comment: 4 pages, 2 figures, 1 table, Name of Conference: International + Conference on Brain-Computer Interface +
+
+
+
+
+ + ☆ Artificial Theory of Mind and Self-Guided Social Organisation + + +
+ One of the challenges artificial intelligence (AI) faces is how a collection +of agents coordinate their behaviour to achieve goals that are not reachable by +any single agent. In a recent article by Ozmen et al this was framed as one of +six grand challenges: That AI needs to respect human cognitive processes at the +human-AI interaction frontier. We suggest that this extends to the AI-AI +frontier and that it should also reflect human psychology, as it is the only +successful framework we have from which to build out. In this extended abstract +we first make the case for collective intelligence in a general setting, +drawing on recent work from single neuron complexity in neural networks and ant +network adaptability in ant colonies. From there we introduce how species +relate to one another in an ecological network via niche selection, niche +choice, and niche conformity with the aim of forming an analogy with human +social network development as new agents join together and coordinate. From +there we show how our social structures are influenced by our neuro-physiology, +our psychology, and our language. This emphasises how individual people within +a social network influence the structure and performance of that network in +complex tasks, and that cognitive faculties such as Theory of Mind play a +central role. We finish by discussing the current state of the art in AI and +where there is potential for further development of a socially embodied +collective artificial intelligence that is capable of guiding its own social +structures. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Theory of Mind Enhances Collective Intelligence + + +
+ Collective Intelligence plays a central role in a large variety of fields, +from economics and evolutionary theory to neural networks and eusocial insects, +and it is also core to much of the work on emergence and self-organisation in +complex systems theory. However, in human collective intelligence there is +still much more to be understood in the relationship between specific +psychological processes at the individual level and the emergence of +self-organised structures at the social level. Previously psychological factors +have played a relatively minor role in the study of collective intelligence as +the principles are often quite general and applicable to humans just as readily +as insects or other agents without sophisticated psychologies. In this article +we emphasise, with examples from other complex adaptive systems, the broad +applicability of collective intelligence principles while the mechanisms and +time-scales differ significantly between examples. We contend that flexible +collective intelligence in human social settings is improved by our use of a +specific cognitive tool: our Theory of Mind. We identify several key +characteristics of psychologically mediated collective intelligence and show +that the development of a Theory of Mind is a crucial factor distinguishing +social collective intelligence from general collective intelligence. We then +place these capabilities in the context of the next steps in artificial +intelligence embedded in a future that includes an effective human-AI hybrid +social ecology. + +
+
+ comment: 20 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ Rationality based Innate-Values-driven Reinforcement Learning + + +
+ Innate values describe agents' intrinsic motivations, which reflect their +inherent interests and preferences to pursue goals and drive them to develop +diverse skills satisfying their various needs. The essence of reinforcement +learning (RL) is learning from interaction based on reward-driven behaviors, +much like natural agents. It is an excellent model to describe the +innate-values-driven (IV) behaviors of AI agents. Especially developing the +awareness of the AI agent through balancing internal and external utilities +based on its needs in different tasks is a crucial problem for individuals +learning to support AI agents integrating human society with safety and harmony +in the long term. This paper proposes a hierarchical compound intrinsic value +reinforcement learning model -- innate-values-driven reinforcement learning +termed IVRL to describe the complex behaviors of AI agents' interaction. We +formulated the IVRL model and proposed two IVRL models: DQN and A2C. By +comparing them with benchmark algorithms such as DQN, DDQN, A2C, and PPO in the +Role-Playing Game (RPG) reinforcement learning test platform VIZDoom, we +demonstrated that rationally organizing various individual needs can +effectively achieve better performance. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.05572 +
+
+
+
+
+ + ☆ The \emph{Optimist}: Towards Fully Automated Graph Theory Research + + +
+ This paper introduces the \emph{Optimist}, an autonomous system developed to +advance automated conjecture generation in graph theory. Leveraging +mixed-integer programming (MIP) and heuristic methods, the \emph{Optimist} +generates conjectures that both rediscover established theorems and propose +novel inequalities. Through a combination of memory-based computation and +agent-like adaptability, the \emph{Optimist} iteratively refines its +conjectures by integrating new data, enabling a feedback process with minimal +human (\emph{or machine}) intervention. Initial experiments reveal the +\emph{Optimist}'s potential to uncover foundational results in graph theory, as +well as to produce conjectures of interest for future exploration. This work +also outlines the \emph{Optimist}'s evolving integration with a counterpart +agent, the \emph{Pessimist} (a human \emph{or machine} agent), to establish a +dueling system that will drive fully automated graph theory research. + +
+
+
+
+
+ + ☆ ABCI 3.0: Evolution of the leading AI infrastructure in Japan + + +
+ ABCI 3.0 is the latest version of the ABCI, a large-scale open AI +infrastructure that AIST has been operating since August 2018 and will be fully +operational in January 2025. ABCI 3.0 consists of computing servers equipped +with 6128 of the NVIDIA H200 GPUs and an all-flash storage system. Its peak +performance is 6.22 exaflops in half precision and 3.0 exaflops in single +precision, which is 7 to 13 times faster than the previous system, ABCI 2.0. It +also more than doubles both storage capacity and theoretical read/write +performance. ABCI 3.0 is expected to accelerate research and development, +evaluation, and workforce development of cutting-edge AI technologies, with a +particular focus on generative AI. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ DROJ: A Prompt-Driven Attack against Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated exceptional capabilities +across various natural language processing tasks. Due to their training on +internet-sourced datasets, LLMs can sometimes generate objectionable content, +necessitating extensive alignment with human feedback to avoid such outputs. +Despite massive alignment efforts, LLMs remain susceptible to adversarial +jailbreak attacks, which usually are manipulated prompts designed to circumvent +safety mechanisms and elicit harmful responses. Here, we introduce a novel +approach, Directed Representation Optimization Jailbreak (DROJ), which +optimizes jailbreak prompts at the embedding level to shift the hidden +representations of harmful queries towards directions that are more likely to +elicit affirmative responses from the model. Our evaluations on LLaMA-2-7b-chat +model show that DROJ achieves a 100\% keyword-based Attack Success Rate (ASR), +effectively preventing direct refusals. However, the model occasionally +produces repetitive and non-informative responses. To mitigate this, we +introduce a helpfulness system prompt that enhances the utility of the model's +responses. Our code is available at +https://github.com/Leon-Leyang/LLM-Safeguard. + +
+
+
+
+
+ + ☆ VCBench: A Controllable Benchmark for Symbolic and Abstract Challenges + in Video Cognition + + +
+ Recent advancements in Large Video-Language Models (LVLMs) have driven the +development of benchmarks designed to assess cognitive abilities in video-based +tasks. However, most existing benchmarks heavily rely on web-collected videos +paired with human annotations or model-generated questions, which limit control +over the video content and fall short in evaluating advanced cognitive +abilities involving symbolic elements and abstract concepts. To address these +limitations, we introduce VCBench, a controllable benchmark to assess LVLMs' +cognitive abilities, involving symbolic and abstract concepts at varying +difficulty levels. By generating video data with the Python-based engine, +VCBench allows for precise control over the video content, creating dynamic, +task-oriented videos that feature complex scenes and abstract concepts. Each +task pairs with tailored question templates that target specific cognitive +challenges, providing a rigorous evaluation test. Our evaluation reveals that +even state-of-the-art (SOTA) models, such as Qwen2-VL-72B, struggle with simple +video cognition tasks involving abstract concepts, with performance sharply +dropping by 19% as video complexity rises. These findings reveal the current +limitations of LVLMs in advanced cognitive tasks and highlight the critical +role of VCBench in driving research toward more robust LVLMs for complex video +cognition challenges. + +
+
+
+
+
+ + ☆ Provocation: Who benefits from "inclusion" in Generative AI? NeurIPS 2024 + + +
+ The demands for accurate and representative generative AI systems mean there +is an increased demand on participatory evaluation structures. While these +participatory structures are paramount to ensure non-dominant values, +knowledge and material culture are also reflected in AI models and the media +they generate, we argue that dominant structures of community participation in +AI development and evaluation are not explicit enough about the benefits and +harms that members of socially marginalized groups may experience as a result +of their participation. Without explicit interrogation of these benefits by AI +developers, as a community we may remain blind to the immensity of systemic +change that is needed as well. To support this provocation, we present a +speculative case study, developed from our own collective experiences as AI +researchers. We use this speculative context to itemize the barriers that need +to be overcome in order for the proposed benefits to marginalized communities +to be realized, and harms mitigated. + +
+
+ comment: 3 pages, 1 figure. Published as a Short Paper in the NeurIPS 2024 + Workshop on Evaluating Evaluations: Examining Best Practices for Measuring + Broader Impacts of Generative AI +
+
+
+
+
+ + ☆ Heuristical Comparison of Vision Transformers Against Convolutional + Neural Networks for Semantic Segmentation on Remote Sensing Imagery + + +
+ Vision Transformers (ViT) have recently brought a new wave of research in the +field of computer vision. These models have done particularly well in the field +of image classification and segmentation. Research on semantic and instance +segmentation has emerged to accelerate with the inception of the new +architecture, with over 80\% of the top 20 benchmarks for the iSAID dataset +being either based on the ViT architecture or the attention mechanism behind +its success. This paper focuses on the heuristic comparison of three key +factors of using (or not using) ViT for semantic segmentation of remote sensing +aerial images on the iSAID. The experimental results observed during the course +of the research were under the scrutinization of the following objectives: 1. +Use of weighted fused loss function for the maximum mean Intersection over +Union (mIoU) score, Dice score, and minimization or conservation of entropy or +class representation, 2. Comparison of transfer learning on Meta's MaskFormer, +a ViT-based semantic segmentation model, against generic UNet Convolutional +Neural Networks (CNNs) judged over mIoU, Dice scores, training efficiency, and +inference time, and 3. What do we lose for what we gain? i.e., the comparison +of the two models against current state-of-art segmentation models. We show the +use of the novel combined weighted loss function significantly boosts the CNN +model's performance capacities as compared to transfer learning the ViT. The +code for this implementation can be found on +\url{https://github.com/ashimdahal/ViT-vs-CNN-ImageSegmentation}. + +
+
+
+
+
+ + ☆ NeuralDEM -- Real-time Simulation of Industrial Particulate Flows + + +
+ Advancements in computing power have made it possible to numerically simulate +large-scale fluid-mechanical and/or particulate systems, many of which are +integral to core industrial processes. Among the different numerical methods +available, the discrete element method (DEM) provides one of the most accurate +representations of a wide range of physical systems involving granular and +discontinuous materials. Consequently, DEM has become a widely accepted +approach for tackling engineering problems connected to granular flows and +powder mechanics. Additionally, DEM can be integrated with grid-based +computational fluid dynamics (CFD) methods, enabling the simulation of chemical +processes taking place, e.g., in fluidized beds. However, DEM is +computationally intensive because of the intrinsic multiscale nature of +particulate systems, restricting simulation duration or number of particles. +Towards this end, NeuralDEM presents an end-to-end approach to replace slow +numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM +is capable of picturing long-term transport processes across different regimes +using macroscopic observables without any reference to microscopic model +parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an +underlying continuous field, while simultaneously modeling macroscopic behavior +directly as additional auxiliary fields. Second, NeuralDEM introduces +multi-branch neural operators scalable to real-time modeling of +industrially-sized scenarios - from slow and pseudo-steady to fast and +transient. Such scenarios have previously posed insurmountable challenges for +deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM +fluidized bed reactors of 160k CFD cells and 500k DEM particles for +trajectories of 28s. NeuralDEM will open many new doors to advanced engineering +and much faster process cycles. + +
+
+ comment: Project page: https://nx-ai.github.io/NeuralDEM/ +
+
+
+
+
+ + ☆ Adopting RAG for LLM-Aided Future Vehicle Design + + +
+ In this paper, we explore the integration of Large Language Models (LLMs) +with Retrieval-Augmented Generation (RAG) to enhance automated design and +software development in the automotive industry. We present two case studies: a +standardization compliance chatbot and a design copilot, both utilizing RAG to +provide accurate, context-aware responses. We evaluate four LLMs -- GPT-4o, +LLAMA3, Mistral, and Mixtral -- comparing their answering accuracy and +execution time. Our results demonstrate that while GPT-4 offers superior +performance, LLAMA3 and Mistral also show promising capabilities for local +deployment, addressing data privacy concerns in automotive applications. This +study highlights the potential of RAG-augmented LLMs in improving design +workflows and compliance in automotive engineering. + +
+
+ comment: Conference paper accepted in IEEE FLLM 2024 +
+
+
+
+
+ + ☆ LEAP:D -- A Novel Prompt-based Approach for Domain-Generalized Aerial + Object Detection ICIP 2024 + + +
+ Drone-captured images present significant challenges in object detection due +to varying shooting conditions, which can alter object appearance and shape. +Factors such as drone altitude, angle, and weather cause these variations, +influencing the performance of object detection algorithms. To tackle these +challenges, we introduce an innovative vision-language approach using learnable +prompts. This shift from conventional manual prompts aims to reduce +domain-specific knowledge interference, ultimately improving object detection +capabilities. Furthermore, we streamline the training process with a one-step +approach, updating the learnable prompt concurrently with model training, +enhancing efficiency without compromising performance. Our study contributes to +domain-generalized object detection by leveraging learnable prompts and +optimizing training processes. This enhances model robustness and adaptability +across diverse environments, leading to more effective aerial object detection. + +
+
+ comment: ICIP 2024 Workshop accepted paper +
+
+
+
+
+ + ☆ Self-Supervised Radio Pre-training: Toward Foundational Models for + Spectrogram Learning + + +
+ Foundational deep learning (DL) models are general models, trained on large, +diverse, and unlabelled datasets, typically using self-supervised learning +techniques, and have led to significant advancements, especially in natural language +processing. These pretrained models can be fine-tuned for related downstream +tasks, offering faster development and reduced training costs, while often +achieving improved performance. In this work, we introduce Masked Spectrogram +Modeling, a novel self-supervised learning approach for pretraining +foundational DL models on radio signals. Adopting a Convolutional LSTM +architecture for efficient spatio-temporal processing, we pretrain the model +with an unlabelled radio dataset collected from over-the-air measurements. +Subsequently, the pretrained model is fine-tuned for two downstream tasks: +spectrum forecasting and segmentation. Experimental results demonstrate that +our methodology achieves competitive performance in both forecasting accuracy +and segmentation, validating its effectiveness for developing foundational +radio models. + +
+
+
+
+
+ + ☆ Deep Autoencoders for Unsupervised Anomaly Detection in Wildfire + Prediction + + +
+ Wildfires pose a significantly increasing hazard to global ecosystems due to +the climate crisis. Due to its complex nature, there is an urgent need for +innovative approaches to wildfire prediction, such as machine learning. This +research took a unique approach, differentiating from classical supervised +learning, and addressed the gap in unsupervised wildfire prediction using +autoencoders and clustering techniques for anomaly detection. Historical +weather and normalised difference vegetation index datasets of Australia for +2005 - 2021 were utilised. Two main unsupervised approaches were analysed. The +first used a deep autoencoder to obtain latent features, which were then fed +into clustering models, isolation forest, local outlier factor and one-class +SVM for anomaly detection. The second approach used a deep autoencoder to +reconstruct the input data and use reconstruction errors to identify anomalies. +Long Short-Term Memory (LSTM) autoencoders and fully connected (FC) +autoencoders were employed in this part, both in an unsupervised way learning +only from nominal data. The FC autoencoder outperformed its counterparts, +achieving an accuracy of 0.71, an F1-score of 0.74, and an MCC of 0.42. These +findings highlight the practicality of this method, as it effectively predicts +wildfires in the absence of ground truth, utilising an unsupervised learning +technique. + +
+
+ comment: 33 pages, 18 figures, 16 tables. To appear in Earth and Space Science +
+
+
+
+
+ + ☆ Real-time Adapting Routing (RAR): Improving Efficiency Through + Continuous Learning in Software Powered by Layered Foundation Models + + +
+ To balance the quality and inference cost of a Foundation Model (FM, such as +large language models (LLMs)) powered software, people often opt to train a +routing model that routes requests to FMs with different sizes and +capabilities. Existing routing models rely on learning the optimal routing +decision from carefully curated data, require complex computations to be +updated, and do not consider the potential evolution of weaker FMs. In this +paper, we propose Real-time Adaptive Routing (RAR), an approach to continuously +adapt FM routing decisions while using guided in-context learning to enhance +the capabilities of weaker FM. The goal is to reduce reliance on stronger, more +expensive FMs. We evaluate our approach on different subsets of the popular +MMLU benchmark. Over time, our approach routes 50.2% fewer requests to +computationally expensive models while maintaining around 90.5% of the general +response quality. In addition, the guides generated from stronger models have +shown intra-domain generalization and led to a better quality of responses +compared to an equivalent approach with a standalone weaker FM. + +
+
+
+
+
+ + ☆ A Benchmark for Long-Form Medical Question Answering NeurIPS 2024 + + +
+ There is a lack of benchmarks for evaluating large language models (LLMs) in +long-form medical question answering (QA). Most existing medical QA evaluation +benchmarks focus on automatic metrics and multiple-choice questions. While +valuable, these benchmarks fail to fully capture or assess the complexities of +real-world clinical applications where LLMs are being deployed. Furthermore, +existing studies on evaluating long-form answer generation in medical QA are +primarily closed-source, lacking access to human medical expert annotations, +which makes it difficult to reproduce results and enhance existing baselines. +In this work, we introduce a new publicly available benchmark featuring +real-world consumer medical questions with long-form answer evaluations +annotated by medical doctors. We performed pairwise comparisons of responses +from various open and closed-source medical and general-purpose LLMs based on +criteria such as correctness, helpfulness, harmfulness, and bias. Additionally, +we performed a comprehensive LLM-as-a-judge analysis to study the alignment +between human judgments and LLMs. Our preliminary results highlight the strong +potential of open LLMs in medical QA compared to leading closed models. Code & +Data: https://github.com/lavita-ai/medical-eval-sphere + +
+
+ comment: AIM-FM: Advancements in Medical Foundation Models Workshop, 38th + Conference on Neural Information Processing Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ A Self-Supervised Model for Multi-modal Stroke Risk Prediction + + +
+ Predicting stroke risk is a complex challenge that can be enhanced by +integrating diverse clinically available data modalities. This study introduces +a self-supervised multimodal framework that combines 3D brain imaging, clinical +data, and image-derived features to improve stroke risk prediction prior to +onset. By leveraging large unannotated clinical datasets, the framework +captures complementary and synergistic information across image and tabular +data modalities. Our approach is based on a contrastive learning framework that +couples contrastive language-image pretraining with an image-tabular matching +module, to better align multimodal data representations in a shared latent +space. The model is trained on the UK Biobank, which includes structural brain +MRI and clinical data. We benchmark its performance against state-of-the-art +unimodal and multimodal methods using tabular, image, and image-tabular +combinations under diverse frozen and trainable model settings. The proposed +model outperformed self-supervised tabular (image) methods by 2.6% (2.6%) in +ROC-AUC and by 3.3% (5.6%) in balanced accuracy. Additionally, it showed a 7.6% +increase in balanced accuracy compared to the best multimodal supervised model. +Through interpretable tools, our approach demonstrated better integration of +tabular and image data, providing richer and more aligned embeddings. +Gradient-weighted Class Activation Mapping heatmaps further revealed activated +brain regions commonly associated in the literature with brain aging, stroke +risk, and clinical outcomes. This robust self-supervised multimodal framework +surpasses state-of-the-art methods for stroke risk prediction and offers a +strong foundation for future studies integrating diverse data modalities to +advance clinical predictive modelling. + +
+
+ comment: Accepted as oral paper at AIM-FM workshop, NeurIPS 2024 +
+
+
+
+
+ + ☆ WelQrate: Defining the Gold Standard in Small Molecule Drug Discovery + Benchmarking + + +
+ While deep learning has revolutionized computer-aided drug discovery, the AI +community has predominantly focused on model innovation and placed less +emphasis on establishing best benchmarking practices. We posit that without a +sound model evaluation framework, the AI community's efforts cannot reach their +full potential, thereby slowing the progress and transfer of innovation into +real-world drug discovery. Thus, in this paper, we seek to establish a new gold +standard for small molecule drug discovery benchmarking, WelQrate. +Specifically, our contributions are threefold: WelQrate Dataset Collection - we +introduce a meticulously curated collection of 9 datasets spanning 5 +therapeutic target classes. Our hierarchical curation pipelines, designed by +drug discovery experts, go beyond the primary high-throughput screen by +leveraging additional confirmatory and counter screens along with rigorous +domain-driven preprocessing, such as Pan-Assay Interference Compounds (PAINS) +filtering, to ensure the high-quality data in the datasets; WelQrate Evaluation +Framework - we propose a standardized model evaluation framework considering +high-quality datasets, featurization, 3D conformation generation, evaluation +metrics, and data splits, which provides a reliable benchmarking for drug +discovery experts conducting real-world virtual screening; Benchmarking - we +evaluate model performance through various research questions using the +WelQrate dataset collection, exploring the effects of different models, dataset +quality, featurization methods, and data splitting strategies on the results. +In summary, we recommend adopting our proposed WelQrate as the gold standard in +small molecule drug discovery benchmarking. The WelQrate dataset collection, +along with the curation codes, and experimental scripts are all publicly +available at WelQrate.org. + +
+
+ comment: * denotes equal contribution +
+
+
+
+
+ + ☆ Evaluating Loss Landscapes from a Topology Perspective + + +
+ Characterizing the loss of a neural network with respect to model parameters, +i.e., the loss landscape, can provide valuable insights into properties of that +model. Various methods for visualizing loss landscapes have been proposed, but +less emphasis has been placed on quantifying and extracting actionable and +reproducible insights from these complex representations. Inspired by powerful +tools from topological data analysis (TDA) for summarizing the structure of +high-dimensional data, here we characterize the underlying shape (or topology) +of loss landscapes, quantifying the topology to reveal new insights about +neural networks. To relate our findings to the machine learning (ML) +literature, we compute simple performance metrics (e.g., accuracy, error), and +we characterize the local structure of loss landscapes using Hessian-based +metrics (e.g., largest eigenvalue, trace, eigenvalue spectral density). +Following this approach, we study established models from image pattern +recognition (e.g., ResNets) and scientific ML (e.g., physics-informed neural +networks), and we show how quantifying the shape of loss landscapes can provide +new insights into model performance and learning dynamics. + +
+
+
+
+
+ + ☆ Deep Learning for Fetal Inflammatory Response Diagnosis in the Umbilical + Cord + + +
+ Inflammation of the umbilical cord can be seen as a result of ascending +intrauterine infection or other inflammatory stimuli. Acute fetal inflammatory +response (FIR) is characterized by infiltration of the umbilical cord by fetal +neutrophils, and can be associated with neonatal sepsis or fetal inflammatory +response syndrome. Recent advances in deep learning in digital pathology have +demonstrated favorable performance across a wide range of clinical tasks, such +as diagnosis and prognosis. In this study we classified FIR from whole slide +images (WSI). We digitized 4100 histological slides of umbilical cord stained +with hematoxylin and eosin(H&E) and extracted placental diagnoses from the +electronic health record. We build models using attention-based whole slide +learning models. We compared strategies between features extracted by a model +(ConvNeXtXLarge) pretrained on non-medical images (ImageNet), and one +pretrained using histopathology images (UNI). We trained multiple iterations of +each model and combined them into an ensemble. The predictions from the +ensemble of models trained using UNI achieved an overall balanced accuracy of +0.836 on the test dataset. In comparison, the ensembled predictions using +ConvNeXtXLarge had a lower balanced accuracy of 0.7209. Heatmaps generated from +top accuracy model appropriately highlighted arteritis in cases of FIR 2. In +FIR 1, the highest performing model assigned high attention to areas of +activated-appearing stroma in Wharton's Jelly. However, other high-performing +models assigned attention to umbilical vessels. We developed models for +diagnosis of FIR from placental histology images, helping reduce interobserver +variability among pathologists. Future work may examine the utility of these +models for identifying infants at risk of systemic inflammatory response or +early onset neonatal sepsis. + +
+
+
+
+
+ + ♻ ☆ Enhancing Maritime Trajectory Forecasting via H3 Index and Causal + Language Modelling (CLM) + + +
+ The prediction of ship trajectories is a growing field of study in artificial +intelligence. Traditional methods rely on the use of LSTM, GRU networks, and +even Transformer architectures for the prediction of spatio-temporal series. +This study proposes a viable alternative for predicting these trajectories +using only GNSS positions. It considers this spatio-temporal problem as a +natural language processing problem. The latitude/longitude coordinates of AIS +messages are transformed into cell identifiers using the H3 index. Thanks to +the pseudo-octal representation, it becomes easier for language models to learn +the spatial hierarchy of the H3 index. The method is compared with a classical +Kalman filter, widely used in the maritime domain, and introduces the Fr\'echet +distance as the main evaluation metric. We show that it is possible to predict +ship trajectories quite precisely up to 8 hours ahead with 30 minutes of +context, using solely GNSS positions, without relying on any additional +information such as speed, course, or external conditions - unlike many +traditional methods. We demonstrate that this alternative works well enough to +predict trajectories worldwide. + +
+
+ comment: 28 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Quantitative Assessment of Intersectional Empathetic Bias and + Understanding + + +
+ A growing amount of literature critiques the current operationalizations of +empathy based on loose definitions of the construct. Such definitions +negatively affect dataset quality, model robustness, and evaluation +reliability. We propose an empathy evaluation framework that operationalizes +empathy close to its psychological origins. The framework measures the variance +in responses of LLMs to prompts using existing metrics for empathy and +emotional valence. The variance is introduced through the controlled generation +of the prompts by varying social biases affecting context understanding, thus +impacting empathetic understanding. The control over generation ensures high +theoretical validity of the constructs in the prompt dataset. Also, it makes +high-quality translation, especially into languages that currently have +little-to-no way of evaluating empathy or bias, such as the Slavonic family, +more manageable. Using chosen LLMs and various prompt types, we demonstrate the +empathy evaluation with the framework, including multiple-choice answers and +free generation. The variance in our initial evaluation sample is small and we +were unable to measure convincing differences between the empathetic +understanding in contexts given by different social groups. However, the +results are promising because the models showed significant alterations in their +reasoning chains needed to capture the relatively subtle changes in the +prompts. This provides the basis for future research into the construction of +the evaluation sample and statistical methods for measuring the results. + +
+
+
+
+
+ + ♻ ☆ Lifted Inference beyond First-Order Logic + + +
+ Weighted First Order Model Counting (WFOMC) is fundamental to probabilistic +inference in statistical relational learning models. As WFOMC is known to be +intractable in general ($\#$P-complete), logical fragments that admit +polynomial time WFOMC are of significant interest. Such fragments are called +domain liftable. Recent works have shown that the two-variable fragment of +first order logic extended with counting quantifiers ($\mathrm{C^2}$) is +domain-liftable. However, many properties of real-world data, like acyclicity +in citation networks and connectivity in social networks, cannot be modeled in +$\mathrm{C^2}$, or first order logic in general. In this work, we expand the +domain liftability of $\mathrm{C^2}$ with multiple such properties. We show +that any $\mathrm{C^2}$ sentence remains domain liftable when one of its +relations is restricted to represent a directed acyclic graph, a connected +graph, a tree (resp. a directed tree) or a forest (resp. a directed forest). +All our results rely on a novel and general methodology of "counting by +splitting". Besides their application to probabilistic inference, our results +provide a general framework for counting combinatorial structures. We expand a +vast array of previous results in discrete mathematics literature on directed +acyclic graphs, phylogenetic networks, etc. + +
+
+ comment: Under Review at the Artificial Intelligence Journal. Added two new + lemmas for counting by splitting in the Main approach section. Added + experiments with Markov Logic. arXiv admin note: text overlap with + arXiv:2302.09830 +
+
+
+
+
+ + ♻ ☆ Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal + Pushing + + +
+ Recently, quadrupedal locomotion has achieved significant success, but their +manipulation capabilities, particularly in handling large objects, remain +limited, restricting their usefulness in demanding real-world applications such +as search and rescue, construction, industrial automation, and room +organization. This paper tackles the task of obstacle-aware, long-horizon +pushing by multiple quadrupedal robots. We propose a hierarchical multi-agent +reinforcement learning framework with three levels of control. The high-level +controller integrates an RRT planner and a centralized adaptive policy to +generate subgoals, while the mid-level controller uses a decentralized +goal-conditioned policy to guide the robots toward these sub-goals. A +pre-trained low-level locomotion policy executes the movement commands. We +evaluate our method against several baselines in simulation, demonstrating +significant improvements over baseline approaches, with 36.0% higher success +rates and 24.5% reduction in completion time than the best baseline. Our +framework successfully enables long-horizon, obstacle-aware manipulation tasks +like Push-Cuboid and Push-T on Go1 robots in the real world. + +
+
+
+
+
+ + ♻ ☆ Equivariant Symmetry Breaking Sets + + +
+ Equivariant neural networks (ENNs) have been shown to be extremely effective +in applications involving underlying symmetries. By construction ENNs cannot +produce lower symmetry outputs given a higher symmetry input. However, symmetry +breaking occurs in many physical systems and we may obtain a less symmetric +stable state from an initial highly symmetric one. Hence, it is imperative that +we understand how to systematically break symmetry in ENNs. In this work, we +propose a novel symmetry breaking framework that is fully equivariant and is +the first which fully addresses spontaneous symmetry breaking. We emphasize +that our approach is general and applicable to equivariance under any group. To +achieve this, we introduce the idea of symmetry breaking sets (SBS). Rather +than redesign existing networks, we design sets of symmetry breaking objects +which we feed into our network based on the symmetry of our inputs and outputs. +We show there is a natural way to define equivariance on these sets, which +gives an additional constraint. Minimizing the size of these sets equates to +data efficiency. We prove that minimizing these sets translates to a well +studied group theory problem, and tabulate solutions to this problem for the +point groups. Finally, we provide some examples of symmetry breaking to +demonstrate how our approach works in practice. The code for these examples is +available at \url{https://github.com/atomicarchitects/equivariant-SBS}. + +
+
+ comment: 50 pages, 19 figures Published in Transactions on Machine Learning + Research, October 2024 +
+
+
+
+
+ + ♻ ☆ FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning + in AI + + +
+ We introduce FrontierMath, a benchmark of hundreds of original, exceptionally +challenging mathematics problems crafted and vetted by expert mathematicians. +The questions cover most major branches of modern mathematics -- from +computationally intensive problems in number theory and real analysis to +abstract questions in algebraic geometry and category theory. Solving a typical +problem requires multiple hours of effort from a researcher in the relevant +branch of mathematics, and for the upper end questions, multiple days. +FrontierMath uses new, unpublished problems and automated verification to +reliably evaluate models while minimizing risk of data contamination. Current +state-of-the-art AI models solve under 2% of problems, revealing a vast gap +between AI capabilities and the prowess of the mathematical community. As AI +systems advance toward expert-level mathematical abilities, FrontierMath offers +a rigorous testbed that quantifies their progress. + +
+
+
+
+
+ + ♻ ☆ Is Linear Feedback on Smoothed Dynamics Sufficient for Stabilizing + Contact-Rich Plans? ICRA2025 + + +
+ Designing planners and controllers for contact-rich manipulation is extremely +challenging as contact violates the smoothness conditions that many +gradient-based controller synthesis tools assume. Contact smoothing +approximates a non-smooth system with a smooth one, allowing one to use these +synthesis tools more effectively. However, applying classical control synthesis +methods to smoothed contact dynamics remains relatively under-explored. This +paper analyzes the efficacy of linear controller synthesis using differential +simulators based on contact smoothing. We introduce natural baselines for +leveraging contact smoothing to compute (a) open-loop plans robust to uncertain +conditions and/or dynamics, and (b) feedback gains to stabilize around +open-loop plans. Using robotic bimanual whole-body manipulation as a testbed, +we perform extensive empirical experiments on over 300 trajectories and analyze +why LQR seems insufficient for stabilizing contact-rich plans. The video +summarizing this paper and hardware experiments is found here: +https://youtu.be/HLaKi6qbwQg?si=_zCAmBBD6rGSitm9. + +
+
+ comment: Under review for ICRA2025 +
+
+
+
+
+ + ♻ ☆ Knowledge Bases in Support of Large Language Models for Processing Web + News + + +
+ Large Language Models (LLMs) have received considerable interest in wide +applications lately. During pre-training via massive datasets, such a model +implicitly memorizes the factual knowledge of trained datasets in its hidden +parameters. However, knowledge held implicitly in parameters often makes its +use by downstream applications ineffective due to the lack of common-sense +reasoning. In this article, we introduce a general framework that permits to +build knowledge bases with an aid of LLMs, tailored for processing Web news. +The framework applies a rule-based News Information Extractor (NewsIE) to news +items for extracting their relational tuples, referred to as knowledge bases, +which are then graph-convoluted with the implicit knowledge facts of news items +obtained by LLMs, for their classification. It involves two lightweight +components: 1) NewsIE: for extracting the structural information of every news +item, in the form of relational tuples; 2) BERTGraph: for graph convoluting the +implicit knowledge facts with relational tuples extracted by NewsIE. We have +evaluated our framework under different news-related datasets for news category +classification, with promising experimental results. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Affordance-based Robot Manipulation with Flow Matching + + +
+ We present a framework for assistive robot manipulation, which focuses on two +fundamental challenges: first, efficiently adapting large-scale models to +downstream scene affordance understanding tasks, especially in daily living +scenarios where gathering multi-task data involving humans requires strenuous +effort; second, effectively learning robot trajectories by grounding the visual +affordance model. We tackle the first challenge by employing a +parameter-efficient prompt tuning method that prepends learnable text prompts +to the frozen vision model to predict manipulation affordances in multi-task +scenarios. Then we propose to learn robot trajectories guided by affordances in +a supervised Flow Matching method. Flow matching represents a robot visuomotor +policy as a conditional process of flowing random waypoints to desired robot +trajectories. Finally, we introduce a real-world dataset with 10 tasks across +Activities of Daily Living to test our framework. Our extensive evaluation +highlights that the proposed prompt tuning method for learning manipulation +affordance with language prompter achieves competitive performance and even +outperforms other finetuning protocols across data scales, while satisfying +parameter efficiency. Learning multi-task robot trajectories with flow matching +policy also leads to consistently better generalization performance and faster +inference than alternative behavior cloning methods, especially given +multimodal robot action distributions. Our framework seamlessly unifies +affordance model learning and trajectory generation with flow matching for +robot manipulation. + +
+
+
+
+
+ + ♻ ☆ Can LLMs Recognize Toxicity? A Structured Investigation Framework and + Toxicity Metric + + +
+ In the pursuit of developing Large Language Models (LLMs) that adhere to +societal standards, it is imperative to detect the toxicity in the generated +text. The majority of existing toxicity metrics rely on encoder models trained +on specific toxicity datasets, which are susceptible to out-of-distribution +(OOD) problems and depend on the dataset's definition of toxicity. In this +paper, we introduce a robust metric grounded on LLMs to flexibly measure +toxicity according to the given definition. We first analyze the toxicity +factors, followed by an examination of the intrinsic toxic attributes of LLMs +to ascertain their suitability as evaluators. Finally, we evaluate the +performance of our metric with detailed analysis. Our empirical results +demonstrate outstanding performance in measuring toxicity within verified +factors, improving on conventional metrics by 12 points in the F1 score. Our +findings also indicate that upstream toxicity significantly influences +downstream metrics, suggesting that LLMs are unsuitable for toxicity +evaluations within unverified factors. + +
+
+ comment: 8 page long +
+
+
+
+
+ + ♻ ☆ A Similarity-Based Oversampling Method for Multi-label Imbalanced Text + Data + + +
+ In real-world applications, as data availability increases, obtaining labeled +data for machine learning (ML) projects remains challenging due to the high +costs and intensive efforts required for data annotation. Many ML projects, +particularly those focused on multi-label classification, also grapple with +data imbalance issues, where certain classes may lack sufficient data to train +effective classifiers. This study introduces and examines a novel oversampling +method for multi-label text classification, designed to address performance +challenges associated with data imbalance. The proposed method identifies +potential new samples from unlabeled data by leveraging similarity measures +between instances. By iteratively searching the unlabeled dataset, the method +locates instances similar to those in underrepresented classes and evaluates +their contribution to classifier performance enhancement. Instances that +demonstrate performance improvement are then added to the labeled dataset. +Experimental results indicate that the proposed approach effectively enhances +classifier performance post-oversampling. + +
+
+
+
+
+ + ♻ ☆ IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of + brain MR images + + +
+ In MRI studies, the aggregation of imaging data from multiple acquisition +sites enhances sample size but may introduce site-related variabilities that +hinder consistency in subsequent analyses. Deep learning methods for image +translation have emerged as a solution for harmonizing MR images across sites. +In this study, we introduce IGUANe (Image Generation with Unified Adversarial +Networks), an original 3D model that leverages the strengths of domain +translation and straightforward application of style transfer methods for +multicenter brain MR image harmonization. IGUANe extends CycleGAN by +integrating an arbitrary number of domains for training through a many-to-one +architecture. The framework based on domain pairs enables the implementation of +sampling strategies that prevent confusion between site-related and biological +variabilities. During inference, the model can be applied to any image, even +from an unknown acquisition site, making it a universal generator for +harmonization. Trained on a dataset comprising T1-weighted images from 11 +different scanners, IGUANe was evaluated on data from unseen sites. The +assessments included the transformation of MR images with traveling subjects, +the preservation of pairwise distances between MR images within domains, the +evolution of volumetric patterns related to age and Alzheimer's disease (AD), +and the performance in age regression and patient classification tasks. +Comparisons with other harmonization and normalization methods suggest that +IGUANe better preserves individual information in MR images and is more +suitable for maintaining and reinforcing variabilities related to age and AD. +Future studies may further assess IGUANe in other multicenter contexts, either +using the same model or retraining it for applications to different image +modalities. IGUANe is available at +https://github.com/RocaVincent/iguane_harmonization.git. + +
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Optimizing Automatic Summarization of Long Clinical Records Using + Dynamic Context Extension:Testing and Evaluation of the NBCE Method + + +
+ Summarizing patient clinical notes is vital for reducing documentation +burdens. Current manual summarization makes medical staff struggle. We propose +an automatic method using LLMs, but long inputs cause LLMs to lose context, +reducing output quality, especially in small models. We used a 7B model, +open-calm-7b, enhanced with Naive Bayes-based Context Extension (NBCE) and a redesigned +decoding mechanism to reference one sentence at a time, keeping inputs within +context windows, 2048 tokens. Our improved model achieved near parity with +Google's over 175B Gemini on ROUGE-L metrics with 200 samples, indicating +strong performance using fewer resources, enhancing automated EMR summarization +feasibility. + +
+
+
+
+
+ + ♻ ☆ Doob's Lagrangian: A Sample-Efficient Variational Approach to Transition + Path Sampling NeurIPS 2024 + + +
+ Rare event sampling in dynamical systems is a fundamental problem arising in +the natural sciences, which poses significant computational challenges due to +an exponentially large space of trajectories. For settings where the dynamical +system of interest follows a Brownian motion with known drift, the question of +conditioning the process to reach a given endpoint or desired rare event is +definitively answered by Doob's h-transform. However, the naive estimation of +this transform is infeasible, as it requires simulating sufficiently many +forward trajectories to estimate rare event probabilities. In this work, we +propose a variational formulation of Doob's h-transform as an optimization +problem over trajectories between a given initial point and the desired ending +point. To solve this optimization, we propose a simulation-free training +objective with a model parameterization that imposes the desired boundary +conditions by design. Our approach significantly reduces the search space over +trajectories and avoids expensive trajectory simulation and inefficient +importance sampling estimators which are required in existing methods. We +demonstrate the ability of our method to find feasible transition paths on +real-world molecular simulation and protein folding tasks. + +
+
+ comment: Accepted as Spotlight at Conference on Neural Information Processing + Systems (NeurIPS 2024); Alanine dipeptide results updated after fixing + unphysical parameterization +
+
+
+
+
+ + ♻ ☆ ROCKET-1: Mastering Open-World Interaction with Visual-Temporal Context + Prompting + + +
+ Vision-language models (VLMs) have excelled in multimodal tasks, but adapting +them to embodied decision-making in open-world environments presents +challenges. One critical issue is bridging the gap between discrete entities in +low-level observations and the abstract concepts required for effective +planning. A common solution is building hierarchical agents, where VLMs serve +as high-level reasoners that break down tasks into executable sub-tasks, +typically specified using language. However, language suffers from the +inability to communicate detailed spatial information. We propose +visual-temporal context prompting, a novel communication protocol between VLMs +and policy models. This protocol leverages object segmentation from past +observations to guide policy-environment interactions. Using this approach, we +train ROCKET-1, a low-level policy that predicts actions based on concatenated +visual observations and segmentation masks, supported by real-time object +tracking from SAM-2. Our method unlocks the potential of VLMs, enabling them to +tackle complex tasks that demand spatial reasoning. Experiments in Minecraft +show that our approach enables agents to achieve previously unattainable tasks, +with a $\mathbf{76}\%$ absolute improvement in open-world interaction +performance. Codes and demos are now available on the project page: +https://craftjarvis.github.io/ROCKET-1. + +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding +performance in violence surveillance tasks. However, most of them were +black-box systems which faced challenges regarding explainability during +training and inference processes. An important question is how to incorporate +explicit knowledge into these implicit models, thereby designing expert-driven +and interpretable violence surveillance systems. This paper proposes a new +paradigm for weakly supervised violence monitoring (WSVM) called Rule base +Violence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure +with different designs for images and text. One of the branches is called the +implicit branch, which uses only visual features for coarse-grained binary +classification. In this branch, image feature extraction is divided into two +channels: one responsible for extracting scene frames and the other focusing on +extracting actions. The other branch is called the explicit branch, which +utilizes language-image alignment to perform fine-grained classification. For +the language channel design in the explicit branch, the proposed RuleVM uses +the state-of-the-art YOLOWorld model to detect objects in video frames, and +association rules are identified through data mining methods as descriptions of +the video. Leveraging the dual-branch architecture, RuleVM achieves +interpretable coarse-grained and fine-grained violence surveillance. Extensive +experiments were conducted on two commonly used benchmarks, and the results +show that RuleVM achieved the best performance in both coarse-grained and +fine-grained monitoring, significantly outperforming existing state-of-the-art +methods. Moreover, interpretability experiments uncovered some interesting +rules, such as the observation that as the number of people increases, the risk +level of violent behavior also rises. + +
+
+ comment: 12 pages,7 figures IEEE TSMCA (Under review) +
+
+
+
+
+ + ♻ ☆ Advancements in Visual Language Models for Remote Sensing: Datasets, + Capabilities, and Enhancement Techniques + + +
+ Recently, the remarkable success of ChatGPT has sparked a renewed wave of +interest in artificial intelligence (AI), and the advancements in visual +language models (VLMs) have pushed this enthusiasm to new heights. Differing +from previous AI approaches that generally formulated different tasks as +discriminative models, VLMs frame tasks as generative models and align language +with visual information, enabling the handling of more challenging problems. +The remote sensing (RS) field, a highly practical domain, has also embraced +this new trend and introduced several VLM-based RS methods that have +demonstrated promising performance and enormous potential. In this paper, we +first review the fundamental theories related to VLM, then summarize the +datasets constructed for VLMs in remote sensing and the various tasks they +addressed. Finally, we categorize the improvement methods into three main parts +according to the core components of VLMs and provide a detailed introduction +and comparison of these methods. A project associated with this review has been +created at https://github.com/taolijie11111/VLMs-in-RS-review. + +
+
+
+
+
+ + ♻ ☆ Grounding is All You Need? Dual Temporal Grounding for Video Dialog + + +
+ In the realm of video dialog response generation, the understanding of video +content and the temporal nuances of conversation history are paramount. While a +segment of current research leans heavily on large-scale pretrained +visual-language models and often overlooks temporal dynamics, another delves +deep into spatial-temporal relationships within videos but demands intricate +object trajectory pre-extractions and sidelines dialog temporal dynamics. This +paper introduces the Dual Temporal Grounding-enhanced Video Dialog model +(DTGVD), strategically designed to merge the strengths of both dominant +approaches. It emphasizes dual temporal relationships by predicting dialog +turn-specific temporal regions, filtering video content accordingly, and +grounding responses in both video and dialog contexts. One standout feature of +DTGVD is its heightened attention to chronological interplay. By recognizing +and acting upon the dependencies between different dialog turns, it captures +more nuanced conversational dynamics. To further bolster the alignment between +video and dialog temporal dynamics, we've implemented a list-wise contrastive +learning strategy. Within this framework, accurately grounded turn-clip +pairings are designated as positive samples, while less precise pairings are +categorized as negative. This refined classification is then funneled into our +holistic end-to-end response generation mechanism. Evaluations using +AVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our +methodology. + +
+
+
+
+
+ + ♻ ☆ ClavaDDPM: Multi-relational Data Synthesis with Cluster-guided Diffusion + Models + + +
+ Recent research in tabular data synthesis has focused on single tables, +whereas real-world applications often involve complex data with tens or +hundreds of interconnected tables. Previous approaches to synthesizing +multi-relational (multi-table) data fall short in two key aspects: scalability +for larger datasets and capturing long-range dependencies, such as correlations +between attributes spread across different tables. Inspired by the success of +diffusion models in tabular data modeling, we introduce + $\textbf{C}luster$ $\textbf{La}tent$ $\textbf{Va}riable$ $guided$ +$\textbf{D}enoising$ $\textbf{D}iffusion$ $\textbf{P}robabilistic$ +$\textbf{M}odels$ (ClavaDDPM). This novel approach leverages clustering labels +as intermediaries to model relationships between tables, specifically focusing +on foreign key constraints. ClavaDDPM leverages the robust generation +capabilities of diffusion models while incorporating efficient algorithms to +propagate the learned latent variables across tables. This enables ClavaDDPM to +capture long-range dependencies effectively. + Extensive evaluations on multi-table datasets of varying sizes show that +ClavaDDPM significantly outperforms existing methods for these long-range +dependencies while remaining competitive on utility metrics for single-table +data. + +
+
+
+
+
+ + ♻ ☆ IRCAN: Mitigating Knowledge Conflicts in LLM Generation via Identifying + and Reweighting Context-Aware Neurons NeurIPS 2024 + + +
+ It is widely acknowledged that large language models (LLMs) encode a vast +reservoir of knowledge after being trained on mass data. Recent studies +disclose knowledge conflicts in LLM generation, wherein outdated or incorrect +parametric knowledge (i.e., encoded knowledge) contradicts new knowledge +provided in the context. To mitigate such knowledge conflicts, we propose a +novel framework, IRCAN (Identifying and Reweighting Context-Aware Neurons) to +capitalize on neurons that are crucial in processing contextual cues. +Specifically, IRCAN first identifies neurons that significantly contribute to +context processing, utilizing a context-aware attribution score derived from +integrated gradients. Subsequently, the identified context-aware neurons are +strengthened via reweighting. In doing so, we steer LLMs to generate +context-sensitive outputs with respect to the new knowledge provided in the +context. Extensive experiments conducted across a variety of models and tasks +demonstrate that IRCAN not only achieves remarkable improvements in handling +knowledge conflicts but also offers a scalable, plug-and-play solution that can +be integrated seamlessly with existing models. Our codes are released at +https://github.com/danshi777/IRCAN. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ An interpretable generative multimodal neuroimaging-genomics framework + for decoding Alzheimer's disease + + +
+ Alzheimer's disease (AD) is the most prevalent form of dementia with a +progressive decline in cognitive abilities. The AD continuum encompasses a +prodromal stage known as MCI, where patients may either progress to AD (MCIc) +or remain stable (MCInc). Understanding AD mechanisms requires complementary +analyses relying on different data sources, leading to the development of +multimodal DL models. We leveraged structural and functional MRI to investigate +the disease-induced GM and functional network connectivity changes. Moreover, +considering AD's strong genetic component, we introduced SNPs as a third +channel. Missing one or more modalities is a typical concern of multimodal +methods. We hence propose a novel DL-based classification framework where a +generative module employing Cycle GAN was adopted for imputing missing data in +the latent space. Additionally, we adopted an XAI method, Integrated Gradients, +to extract features' relevance, enhancing our understanding of the learned +representations. Two tasks were addressed: AD detection and MCI conversion +prediction. Experimental results showed that our framework reached the SOA in +the classification of CN/AD with an average test accuracy of $0.926\pm0.02$. +For the MCInc/MCIc task, we achieved an average prediction accuracy of +$0.711\pm0.01$ using the pre-trained model for CN and AD. The interpretability +analysis revealed that significant GM modulations led the classification +performance in cortical and subcortical brain areas well known for their +association with AD. Impairments in sensory-motor and visual functional network +connectivity along AD, as well as mutations in SNPs defining biological +processes linked to endocytosis, amyloid-beta, and cholesterol, were identified +as contributors to the results. Overall, our integrative DL model shows promise +for AD detection and MCI prediction, while shedding light on important +biological insights. + +
+
+ comment: 28 pages, 8 figures, submitted to a journal +
+
+
+
+
+ + ♻ ☆ Uncovering communities of pipelines in the task-fMRI analytical space + + +
+ Analytical workflows in functional magnetic resonance imaging are highly +flexible with limited best practices as to how to choose a pipeline. While it +has been shown that the use of different pipelines might lead to different +results, there is still a lack of understanding of the factors that drive these +differences and of the stability of these differences across contexts. We use +community detection algorithms to explore the pipeline space and assess the +stability of pipeline relationships across different contexts. We show that +there are subsets of pipelines that give similar results, especially those +sharing specific parameters (e.g. number of motion regressors, software +packages, etc.). Those pipeline-to-pipeline patterns are stable across groups +of participants but not across different tasks. By visualizing the differences +between communities, we show that the pipeline space is mainly driven by the +size of the activation area in the brain and the scale of statistic values in +statistic maps. + +
+
+ comment: Accepted at the 2024 IEEE International Conference on Image + Processing +
+
+
+
+
+ + ♻ ☆ A taxonomy of explanations to support Explainability-by-Design + + +
+ As automated decision-making solutions are increasingly applied to all +aspects of everyday life, capabilities to generate meaningful explanations for +a variety of stakeholders (i.e., decision-makers, recipients of decisions, +auditors, regulators...) become crucial. In this paper, we present a taxonomy +of explanations that was developed as part of a holistic +'Explainability-by-Design' approach for the purposes of the project PLEAD. The +taxonomy was built with a view to produce explanations for a wide range of +requirements stemming from a variety of regulatory frameworks or policies set +at the organizational level either to translate high-level compliance +requirements or to meet business needs. The taxonomy comprises nine dimensions. +It is used as a stand-alone classifier of explanations conceived as detective +controls, in order to aid supportive automated compliance strategies. A +machine-readable format of the taxonomy is provided in the form of a light +ontology and the benefits of starting the Explainability-by-Design journey with +such a taxonomy are demonstrated through a series of examples. + +
+
+
+
+
+ + ♻ ☆ SM3-Text-to-Query: Synthetic Multi-Model Medical Text-to-Query Benchmark NeurIPS 2024 + + +
+ Electronic health records (EHRs) are stored in various database systems with +different database models on heterogeneous storage architectures, such as +relational databases, document stores, or graph databases. These different +database models have a big impact on query complexity and performance. While +this has been a known fact in database research, its implications for the +growing number of Text-to-Query systems have surprisingly not been investigated +so far. In this paper, we present SM3-Text-to-Query, the first multi-model +medical Text-to-Query benchmark based on synthetic patient data from Synthea, +following the SNOMED-CT taxonomy -- a widely used knowledge graph ontology +covering medical terminology. SM3-Text-to-Query provides data representations +for relational databases (PostgreSQL), document stores (MongoDB), and graph +databases (Neo4j and GraphDB (RDF)), allowing the evaluation across four +popular query languages, namely SQL, MQL, Cypher, and SPARQL. We systematically +and manually develop 408 template questions, which we augment to construct a +benchmark of 10K diverse natural language question/query pairs for these four +query languages (40K pairs overall). On our dataset, we evaluate several common +in-context-learning (ICL) approaches for a set of representative closed and +open-source LLMs. Our evaluation sheds light on the trade-offs between database +models and query languages for different ICL strategies and LLMs. Last, +SM3-Text-to-Query is easily extendable to additional query languages or real, +standard-based patient databases. + +
+
+ comment: NeurIPS 2024 Track Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Toward Green and Human-Like Artificial Intelligence: A Complete Survey + on Contemporary Few-Shot Learning Approaches + + +
+ Despite deep learning's widespread success, its data-hungry and +computationally expensive nature makes it impractical for many data-constrained +real-world applications. Few-Shot Learning (FSL) aims to address these +limitations by enabling rapid adaptation to novel learning tasks, seeing +significant growth in recent years. This survey provides a comprehensive +overview of the field's latest advancements. Initially, FSL is formally +defined, and its relationship with different learning fields is presented. A +novel taxonomy is introduced, extending previously proposed ones, and +real-world applications in classic and novel fields are described. Finally, +recent trends shaping the field, outstanding challenges, and promising future +research directions are discussed. + +
+
+ comment: 35 pages, 9 figures. Submitted to ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ An improved tabular data generator with VAE-GMM integration + + +
+ The rising use of machine learning in various fields requires robust methods +to create synthetic tabular data. Data should preserve key characteristics +while addressing data scarcity challenges. Current approaches based on +Generative Adversarial Networks, such as the state-of-the-art CTGAN model, +struggle with the complex structures inherent in tabular data. These data often +contain both continuous and discrete features with non-Gaussian distributions. +Therefore, we propose a novel Variational Autoencoder (VAE)-based model that +addresses these limitations. Inspired by the TVAE model, our approach +incorporates a Bayesian Gaussian Mixture model (BGM) within the VAE +architecture. This avoids the limitations imposed by assuming a strictly +Gaussian latent space, allowing for a more accurate representation of the +underlying data distribution during data generation. Furthermore, our model +offers enhanced flexibility by allowing the use of various differentiable +distributions for individual features, making it possible to handle both +continuous and discrete data types. We thoroughly validate our model on three +real-world datasets with mixed data types, including two medically relevant +ones, based on their resemblance and utility. This evaluation demonstrates +significant outperformance against CTGAN and TVAE, establishing its potential +as a valuable tool for generating synthetic tabular data in various domains, +particularly in healthcare. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ More Expressive Attention with Negative Weights + + +
+ We propose a novel attention mechanism, named Cog Attention, that enables +attention weights to be negative for enhanced expressiveness, which stems from +two key factors: (1) Cog Attention can shift the token deletion and copying +function from a static OV matrix to dynamic QK inner products, with the OV +matrix now focusing more on refinement or modification. The attention head can +simultaneously delete, copy, or retain tokens by assigning them negative, +positive, or minimal attention weights, respectively. As a result, a single +attention head becomes more flexible and expressive. (2) Cog Attention improves +the model's robustness against representational collapse, which can occur when +earlier tokens are over-squashed into later positions, leading to homogeneous +representations. Negative weights reduce effective information paths from +earlier to later tokens, helping to mitigate this issue. We develop +Transformer-like models which use Cog Attention as attention modules, including +decoder-only models for language modeling and U-ViT diffusion models for image +generation. Experiments show that models using Cog Attention exhibit superior +performance compared to those employing traditional softmax attention modules. +Our approach suggests a promising research direction for rethinking and +breaking the entrenched constraints of traditional softmax attention, such as +the requirement for non-negative weights. + +
+
+
+
+
+ + ♻ ☆ Dual-Segment Clustering Strategy for Hierarchical Federated Learning in + Heterogeneous Wireless Environments + + +
+ Non-independent and identically distributed (Non-IID) data adversely affects +federated learning (FL) while heterogeneity in communication quality can +undermine the reliability of model parameter transmission, potentially +degrading wireless FL convergence. This paper proposes a novel dual-segment +clustering (DSC) strategy that jointly addresses communication and data +heterogeneity in FL. This is achieved by defining a new signal-to-noise ratio +(SNR) matrix and information quantity matrix to capture the communication and +data heterogeneity, respectively. The celebrated affinity propagation algorithm +is leveraged to iteratively refine the clustering of clients based on the newly +defined matrices, effectively enhancing model aggregation in heterogeneous +environments. The convergence analysis and experimental results show that the +DSC strategy can improve the convergence rate of wireless FL and demonstrate +superior accuracy in heterogeneous environments compared to classical +clustering methods. + +
+
+
+
+
+ + ♻ ☆ STARFlow: Spatial Temporal Feature Re-embedding with Attentive Learning + for Real-world Scene Flow 3DV 2025 + + +
+ Scene flow prediction is a crucial underlying task in understanding dynamic +scenes as it offers fundamental motion information. However, contemporary scene +flow methods encounter three major challenges. Firstly, flow estimation solely +based on local receptive fields lacks long-dependency matching of point pairs. +To address this issue, we propose global attentive flow embedding to match +all-to-all point pairs in both feature space and Euclidean space, providing +global initialization before local refinement. Secondly, there are deformations +existing in non-rigid objects after warping, which leads to variations in the +spatiotemporal relation between the consecutive frames. For a more precise +estimation of residual flow, a spatial temporal feature re-embedding module is +devised to acquire the sequence features after deformation. Furthermore, +previous methods perform poor generalization due to the significant domain gap +between the synthesized and LiDAR-scanned datasets. We leverage novel domain +adaptive losses to effectively bridge the gap of motion inference from +synthetic to real-world. Experiments demonstrate that our approach achieves +state-of-the-art performance across various datasets, with particularly +outstanding results on real-world LiDAR-scanned datasets. Our code is available +at https://github.com/O-VIGIA/StarFlow. + +
+
+ comment: This paper was renamed to:"SSRFlow: Semantic-aware Fusion with + Spatial Temporal Re-embedding for Real-world Scene Flow" [arXiv:2408.07825] + and was accepted in 3DV 2025 +
+
+
+
+
+ + ♻ ☆ The Roles of Generative Artificial Intelligence in Internet of Electric + Vehicles + + +
+ With the advancements of generative artificial intelligence (GenAI) models, +their capabilities are expanding significantly beyond content generation and +the models are increasingly being used across diverse applications. +Particularly, GenAI shows great potential in addressing challenges in the +electric vehicle (EV) ecosystem ranging from charging management to +cyber-attack prevention. In this paper, we specifically consider Internet of +electric vehicles (IoEV) and we categorize GenAI for IoEV into four different +layers namely, EV's battery layer, individual EV layer, smart grid layer, and +security layer. We introduce various GenAI techniques used in each layer of +IoEV applications. Subsequently, public datasets available for training the +GenAI models are summarized. Finally, we provide recommendations for future +directions. This survey not only categorizes the applications of GenAI in IoEV +across different layers but also serves as a valuable resource for researchers +and practitioners by highlighting the design and implementation challenges +within each layer. Furthermore, it provides a roadmap for future research +directions, enabling the development of more robust and efficient IoEV systems +through the integration of advanced GenAI techniques. + +
+
+ comment: 25 Pages +
+
+
+
+
+ + ♻ ☆ Towards Objective and Unbiased Decision Assessments with LLM-Enhanced + Hierarchical Attention Networks + + +
+ How objective and unbiased are we while making decisions? This work +investigates cognitive bias identification in high-stakes decision-making +processes by human experts, questioning its effectiveness in real-world settings, +such as candidate assessments for university admission. We begin with a +statistical analysis assessing correlations among different decision points +in the current process, which discovers discrepancies that imply +cognitive bias and inconsistency in decisions. This motivates our exploration +of a bias-aware AI-augmented workflow that surpasses human judgment. We propose +BGM-HAN, an enhanced Hierarchical Attention Network with Byte-Pair Encoding, +Gated Residual Connections and Multi-Head Attention. Using it as a backbone +model, we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, +which simulates real-world decision-making. In our experiments, both the +proposed model and the agentic workflow significantly improve on both human +judgment and alternative models, validated with real-world data. + +
+
+ comment: Source code is available at: https://github.com/junhua/bgm-han +
+
+
+
+
+ + ♻ ☆ LProtector: An LLM-driven Vulnerability Detection System + + +
+ This paper presents LProtector, an automated vulnerability detection system +for C/C++ codebases driven by the large language model (LLM) GPT-4o and +Retrieval-Augmented Generation (RAG). As software complexity grows, traditional +methods face challenges in detecting vulnerabilities effectively. LProtector +leverages GPT-4o's powerful code comprehension and generation capabilities to +perform binary classification and identify vulnerabilities within target +codebases. We conducted experiments on the Big-Vul dataset, showing that +LProtector outperforms two state-of-the-art baselines in terms of F1 score, +demonstrating the potential of integrating LLMs with vulnerability detection. + +
+
+ comment: 5 pages, 4 figures. This is a preprint version of the article. The + final version will be published in the proceedings of the IEEE conference +
+
+
+
+
+ + ♻ ☆ Not All Heads Matter: A Head-Level KV Cache Compression Method with + Integrated Retrieval and Reasoning + + +
+ Key-Value (KV) caching is a common technique to enhance the computational +efficiency of Large Language Models (LLMs), but its memory overhead grows +rapidly with input length. Prior work has shown that not all tokens are equally +important for text generation, proposing layer-level KV cache compression to +selectively retain key information. Recognizing the distinct roles of attention +heads in generation, we propose HeadKV, a head-level KV cache compression +method, and HeadKV-R2, which leverages a novel contextual reasoning ability +estimation for compression. Our approach operates at the level of individual +heads, estimating their importance for contextual QA tasks that require both +retrieval and reasoning capabilities. Extensive experiments across diverse +benchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct, +Mistral-7B-Instruct), and long-context abilities tests demonstrate that our +head-level KV cache compression significantly outperforms strong baselines, +particularly in low-resource settings (KV size = 64 & 128). Notably, our method +retains just 1.5% of the KV cache while achieving 97% of the performance of the +full KV cache on the contextual question answering benchmark. Code is +available at https://github.com/FYYFU/HeadKV + +
+
+ comment: 18pages +
+
+
+
+
+ + ♻ ☆ A Review of Large Language Models and Autonomous Agents in Chemistry + + +
+ Large language models (LLMs) have emerged as powerful tools in chemistry, +significantly impacting molecule design, property prediction, and synthesis +optimization. This review highlights LLM capabilities in these domains and +their potential to accelerate scientific discovery through automation. We also +review LLM-based autonomous agents: LLMs with a broader set of tools to +interact with their surrounding environment. These agents perform diverse tasks +such as paper scraping, interfacing with automated laboratories, and synthesis +planning. As agents are an emerging topic, we extend the scope of our review of +agents beyond chemistry and discuss across any scientific domains. This review +covers the recent history, current capabilities, and design of LLMs and +autonomous agents, addressing specific challenges, opportunities, and future +directions in chemistry. Key challenges include data quality and integration, +model interpretability, and the need for standard benchmarks, while future +directions point towards more sophisticated multi-modal agents and enhanced +collaboration between agents and experimental methods. Due to the quick pace of +this field, a repository has been built to keep track of the latest studies: +https://github.com/ur-whitelab/LLMs-in-science. + +
+
+
+
+
+ + ♻ ☆ Dense Connector for MLLMs NeurIPS 2024 + + +
+ Do we fully leverage the potential of visual encoder in Multimodal Large +Language Models (MLLMs)? The recent outstanding performance of MLLMs in +multimodal understanding has garnered broad attention from both academia and +industry. In the current MLLM rat race, the focus seems to be predominantly on +the linguistic side. We witness the rise of larger and higher-quality +instruction datasets, as well as the involvement of larger-sized LLMs. Yet, +scant attention has been directed towards the visual signals utilized by MLLMs, +often assumed to be the final high-level features extracted by a frozen visual +encoder. In this paper, we introduce the Dense Connector - a simple, effective, +and plug-and-play vision-language connector that significantly enhances +existing MLLMs by leveraging multi-layer visual features, with minimal +additional computational overhead. Building on this, we also propose the +Efficient Dense Connector, which achieves performance comparable to LLaVA-v1.5 +with only 25% of the visual tokens. Furthermore, our model, trained solely on +images, showcases remarkable zero-shot capabilities in video understanding as +well. Experimental results across various vision encoders, image resolutions, +training dataset scales, varying sizes of LLMs (2.7B->70B), and diverse +architectures of MLLMs (e.g., LLaVA-v1.5, LLaVA-NeXT and Mini-Gemini) validate +the versatility and scalability of our approach, achieving state-of-the-art +performance across 19 image and video benchmarks. We hope that this work will +provide valuable experience and serve as a basic module for future MLLM +development. Code is available at https://github.com/HJYao00/DenseConnector . + +
+
+ comment: 27 pages, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Modern Approaches in 3D Scene Reconstruction: NeRF vs + Gaussian-Based Methods + + +
+ Exploring the capabilities of Neural Radiance Fields (NeRF) and +Gaussian-based methods in the context of 3D scene reconstruction, this study +contrasts these modern approaches with traditional Simultaneous Localization +and Mapping (SLAM) systems. Utilizing datasets such as Replica and ScanNet, we +assess performance based on tracking accuracy, mapping fidelity, and view +synthesis. Findings reveal that NeRF excels in view synthesis, offering unique +capabilities in generating new perspectives from existing data, albeit at +slower processing speeds. Conversely, Gaussian-based methods provide rapid +processing and significant expressiveness but lack comprehensive scene +completion. Enhanced by global optimization and loop closure techniques, newer +methods like NICE-SLAM and SplaTAM not only surpass older frameworks such as +ORB-SLAM2 in terms of robustness but also demonstrate superior performance in +dynamic and complex environments. This comparative analysis bridges theoretical +research with practical implications, shedding light on future developments in +robust 3D scene reconstruction across various real-world applications. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ Integrating Symbolic Reasoning into Neural Generative Models for Design + Generation + + +
+ Design generation requires tight integration of neural and symbolic +reasoning, as good design must meet explicit user needs and honor implicit +rules for aesthetics, utility, and convenience. Current automated design tools +driven by neural networks produce appealing designs but cannot satisfy user +specifications and utility requirements. Symbolic reasoning tools, such as +constraint programming, cannot perceive low-level visual information in images +or capture subtle aspects such as aesthetics. We introduce the Spatial +Reasoning Integrated Generator (SPRING) for design generation. SPRING embeds a +neural and symbolic integrated spatial reasoning module inside the deep +generative network. The spatial reasoning module samples the set of locations +of objects to be generated from a backtrack-free distribution. This +distribution modifies the implicit preference distribution, which is learned by +a recursive neural network to capture utility and aesthetics. Sampling from the +backtrack-free distribution is accomplished by a symbolic reasoning approach, +SampleSearch, which zeros out the probability of sampling spatial locations +violating explicit user specifications. Embedding symbolic reasoning into +neural generation guarantees that the output of SPRING satisfies user +requirements. Furthermore, SPRING offers interpretability, allowing users to +visualize and diagnose the generation process through the bounding boxes. +SPRING is also adept at managing novel user specifications not encountered +during its training, thanks to its proficiency in zero-shot constraint +transfer. Quantitative evaluations and a human study reveal that SPRING +outperforms baseline generative models, excelling in delivering high design +quality and better meeting user specifications. + +
+
+
+
+
+ + ♻ ☆ Interpolating neural network: A lightweight yet precise architecture for + data training, equation solving, and parameter calibration + + +
+ Artificial intelligence (AI) has revolutionized software development, +shifting from task-specific codes (Software 1.0) to neural network-based +approaches (Software 2.0). However, applying this transition in engineering +software presents challenges, including low surrogate model accuracy, the curse +of dimensionality in inverse design, and rising complexity in physical +simulations. We introduce an interpolating neural network (INN), grounded in +interpolation theory and tensor decomposition, to realize Engineering Software +2.0 by advancing data training, partial differential equation solving, and +parameter calibration. INN offers orders of magnitude fewer trainable/solvable +parameters for comparable model accuracy than traditional multi-layer +perceptron (MLP) or physics-informed neural networks (PINN). Demonstrated in +metal additive manufacturing, INN rapidly constructs an accurate surrogate +model of Laser Powder Bed Fusion (L-PBF) heat transfer simulation, achieving +sub-10-micrometer resolution for a 10 mm path in under 15 minutes on a single +GPU. This makes a transformative step forward across all domains essential to +engineering software. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ X-SHIELD: Regularization for eXplainable Artificial Intelligence + + +
+ As artificial intelligence systems become integral across domains, the demand +for explainability grows, giving rise to so-called eXplainable artificial intelligence (XAI). +Existing efforts primarily focus on generating and evaluating explanations for +black-box models while a critical gap in directly enhancing models remains +through these evaluations. It is important to consider the potential of this +explanation process to improve model quality with feedback on training as +well. XAI may be used to improve model performance while boosting its +explainability. Under this view, this paper introduces Transformation - +Selective Hidden Input Evaluation for Learning Dynamics (T-SHIELD), a +regularization family designed to improve model quality by hiding features of +input, forcing the model to generalize without those features. Within this +family, we propose the XAI - SHIELD(X-SHIELD), a regularization for explainable +artificial intelligence, which uses explanations to select specific features to +hide. In contrast to conventional approaches, X-SHIELD regularization +seamlessly integrates into the objective function enhancing model +explainability while also improving performance. Experimental validation on +benchmark datasets underscores X-SHIELD's effectiveness in improving +performance and overall explainability. The improvement is validated through +experiments comparing models with and without the X-SHIELD regularization, with +further analysis exploring the rationale behind its design choices. This +establishes X-SHIELD regularization as a promising pathway for developing +reliable artificial intelligence regularization. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Machine with Short-Term, Episodic, and Semantic Memory Systems + + +
+ Inspired by the cognitive science theory of the explicit human memory +systems, we have modeled an agent with short-term, episodic, and semantic +memory systems, each of which is modeled with a knowledge graph. To evaluate +this system and analyze the behavior of this agent, we designed and released +our own reinforcement learning agent environment, "the Room", where an agent +has to learn how to encode, store, and retrieve memories to maximize its return +by answering questions. We show that our deep Q-learning based agent +successfully learns whether a short-term memory should be forgotten, or rather +be stored in the episodic or semantic memory systems. Our experiments indicate +that an agent with human-like memory systems can outperform an agent without +this memory structure in the environment. + +
+
+
+
+
+ + ♻ ☆ Security and Privacy Challenges of Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have demonstrated extraordinary capabilities and +contributed to multiple fields, such as generating and summarizing text, +language translation, and question-answering. Nowadays, LLM is becoming a very +popular tool in computerized language processing tasks, with the capability to +analyze complicated linguistic patterns and provide relevant and appropriate +responses depending on the context. While offering significant advantages, +these models are also vulnerable to security and privacy attacks, such as +jailbreaking attacks, data poisoning attacks, and Personally Identifiable +Information (PII) leakage attacks. This survey provides a thorough review of +the security and privacy challenges of LLMs for both training data and users, +along with the application-based risks in various domains, such as +transportation, education, and healthcare. We assess the extent of LLM +vulnerabilities, investigate emerging security and privacy attacks for LLMs, +and review the potential defense mechanisms. Additionally, the survey outlines +existing research gaps in this domain and highlights future research +directions. + +
+
+
+
+
+ + ♻ ☆ Mitigating Partial Observability in Sequential Decision Processes via + the Lambda Discrepancy + + +
+ Reinforcement learning algorithms typically rely on the assumption that the +environment dynamics and value function can be expressed in terms of a +Markovian state representation. However, when state information is only +partially observable, how can an agent learn such a state representation, and +how can it detect when it has found one? We introduce a metric that can +accomplish both objectives, without requiring access to -- or knowledge of -- +an underlying, unobservable state space. Our metric, the $\lambda$-discrepancy, +is the difference between two distinct temporal difference (TD) value +estimates, each computed using TD($\lambda$) with a different value of +$\lambda$. Since TD($\lambda{=}0$) makes an implicit Markov assumption and +TD($\lambda{=}1$) does not, a discrepancy between these estimates is a +potential indicator of a non-Markovian state representation. Indeed, we prove +that the $\lambda$-discrepancy is exactly zero for all Markov decision +processes and almost always non-zero for a broad class of partially observable +environments. We also demonstrate empirically that, once detected, minimizing +the $\lambda$-discrepancy can help with learning a memory function to mitigate +the corresponding partial observability. We then train a reinforcement learning +agent that simultaneously constructs two recurrent value networks with +different $\lambda$ parameters and minimizes the difference between them as an +auxiliary loss. The approach scales to challenging partially observable +domains, where the resulting agent frequently performs significantly better +(and never performs worse) than a baseline recurrent agent with only a single +value network. + +
+
+ comment: GitHub URL: https://github.com/brownirl/lambda_discrepancy; Project + page: https://lambda-discrepancy.github.io/ +
+
+
+
+
+ + ♻ ☆ GPT-4V Cannot Generate Radiology Reports Yet ML4H + + +
+ GPT-4V's purported strong multimodal abilities raise interests in using it to +automate radiology report writing, but there lacks thorough evaluations. In +this work, we perform a systematic evaluation of GPT-4V in generating radiology +reports on two chest X-ray report datasets: MIMIC-CXR and IU X-Ray. We attempt +to directly generate reports using GPT-4V through different prompting +strategies and find that it fails terribly in both lexical metrics and clinical +efficacy metrics. To understand the low performance, we decompose the task into +two steps: 1) the medical image reasoning step of predicting medical condition +labels from images; and 2) the report synthesis step of generating reports from +(groundtruth) conditions. We show that GPT-4V's performance in image reasoning +is consistently low across different prompts. In fact, the distributions of +model-predicted labels remain constant regardless of which groundtruth +conditions are present on the image, suggesting that the model is not +interpreting chest X-rays meaningfully. Even when given groundtruth conditions +in report synthesis, its generated reports are less correct and less +natural-sounding than a finetuned LLaMA-2. Altogether, our findings cast doubt +on the viability of using GPT-4V in a radiology workflow. + +
+
+ comment: 24 pages, 3 figures, code: + https://github.com/ChicagoHAI/cxr-eval-gpt-4v Findings paper presented at + Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024, + Vancouver, Canada, 26 pages +
+
+
+
+
+ + ♻ ☆ ShaRP: A Novel Feature Importance Framework for Ranking + + +
+ Algorithmic decisions in critical domains such as hiring, college admissions, +and lending are often based on rankings. Because of the impact these decisions +have on individuals, organizations, and population groups, there is a need to +understand them: to help individuals improve their position in a ranking, +design better ranking procedures, and check whether a procedure is legally +compliant. In this paper, we present ShaRP - Shapley for Rankings and +Preferences - a framework that explains the contributions of features to +different aspects of a ranked outcome and is based on Shapley values. Using +ShaRP, we show that even when the scoring function used by an algorithmic +ranker is known and linear, the feature weights do not correspond to their +Shapley value contribution. The contributions instead depend on the feature +distributions and the subtle local interactions between the scoring features. + ShaRP builds on the Quantitative Input Influence framework to compute the +contributions of features for multiple - ranking specific - Quantities of +Interest, including score, rank, pair-wise preference, and top-k. We show the +results of an extensive experimental validation of ShaRP using real and +synthetic datasets. We demonstrate that feature importance can be computed +efficiently, and that ShaRP compares favorably to several prior local feature +importance methods, in terms of both generality and quality of explanations. +Among our results, we highlight a case study on the CS Rankings dataset. +Contrary to expectation, we find that a strong track record in Systems research +is much more important than AI research for placing a CS department among the +top-10%. + ShaRP is available as an open-source library at +https://github.com/DataResponsibly/ShaRP and is already used in teaching. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Decentralized Coordination of Distributed Energy Resources through Local + Energy Markets and Deep Reinforcement Learning + + +
+ As distributed energy resources (DERs) grow, the electricity grid faces +increased net load variability at the grid edge, impacting operability and +reliability. Transactive energy, facilitated through local energy markets, +offers a decentralized, indirect demand response solution, with model-free +control techniques, such as deep reinforcement learning (DRL), enabling +automated, decentralized participation. However, existing studies largely +overlook community-level net load variability, focusing instead on +socioeconomic metrics. + This study addresses this gap by using DRL agents to automate end-user +participation in a local energy market (ALEX), where agents act independently +to minimize individual energy bills. Results reveal a strong link between bill +reduction and decreased net load variability, assessed across metrics such as +ramping rate, load factor, and peak demand over various time horizons. Using a +no-control baseline, DRL agents are benchmarked against a near-optimal dynamic +programming approach. The dynamic programming benchmark achieves reductions of +22.05 percent, 83.92 percent, and 24.09 percent in daily import, export, and +peak demand, respectively, while the DRL agents show comparable or superior +results with reductions of 21.93 percent, 84.46 percent, and 27.02 percent. + This study demonstrates the effectiveness of DRL in decentralized grid +management, highlighting its scalability and near-optimal performance in +reducing net load variability within community-driven energy markets. + +
+
+ comment: preprint, submitted to Energy and AI +
+
+
+
+
+
+
+
+ + Computation and Language 67 + +
+
+
+ + ☆ A Bayesian Optimization Approach to Machine Translation Reranking + + +
+ Reranking a list of candidates from a machine translation system with an +external scoring model and returning the highest-scoring candidate remains a +simple and effective method for improving the overall output quality. +Translation scoring models continue to grow in size, with the best models being +comparable to generation models. Thus, reranking can add substantial +computational cost to the translation pipeline. In this work, we pose reranking +as a Bayesian optimization (BayesOpt) problem. By strategically selecting +candidates to score based on a balance of exploration and exploitation, we show +that it is possible to find top-scoring candidates when scoring only a fraction +of the candidate list. For instance, our method achieves the same CometKiwi +score using only 70 scoring evaluations compared to a baseline system using 180. +We present a multi-fidelity setting for BayesOpt, where the candidates are +first scored with a cheaper but noisier proxy scoring model, which further +improves the cost-performance tradeoff when using smaller but well-trained +distilled proxy scorers. + +
+
+ comment: v1: Preprint version +
+
+
+
+
+ + ☆ LLM Hallucination Reasoning with Zero-shot Knowledge Test + + +
+ LLM hallucination, where LLMs occasionally generate unfaithful text, poses +significant challenges for their practical applications. Most existing +detection methods rely on external knowledge, LLM fine-tuning, or +hallucination-labeled datasets, and they do not distinguish between different +types of hallucinations, which are crucial for improving detection performance. +We introduce a new task, Hallucination Reasoning, which classifies +LLM-generated text into one of three categories: aligned, misaligned, and +fabricated. Our novel zero-shot method assesses whether LLM has enough +knowledge about a given prompt and text. Our experiments conducted on new +datasets demonstrate the effectiveness of our method in hallucination reasoning +and underscore its importance for enhancing detection performance. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Squeezed Attention: Accelerating Long Context Length LLM Inference + + +
+ Emerging Large Language Model (LLM) applications require long input prompts +to perform complex downstream tasks like document analysis and code generation. +For these long context length applications, the length of the input prompt +poses a significant challenge in terms of inference efficiency since the +inference costs increase linearly with sequence length. However, for many of +these applications, much of the context in the prompt is fixed across different +user inputs, thereby providing the opportunity to perform offline optimizations +to process user inputs quickly, as they are received. In this work, we propose +Squeezed Attention as a mechanism to accelerate LLM applications where a large +portion of the input prompt is fixed. We first leverage K-means clustering +offline to group the keys for the fixed context based on semantic similarity +and represent each cluster with a single centroid value. During inference, we +compare query tokens from the user input with the centroids to predict which of +the keys from the fixed context are semantically relevant and need to be loaded +during inference. We then compute exact attention using only these important +keys from the fixed context, thereby reducing bandwidth and computational +costs. We also extend our method to use a hierarchical centroid lookup to +identify important keys, which can reduce the complexity of attention from +linear to logarithmic with respect to the context length. We implement +optimized Triton kernels for centroid comparison and sparse FlashAttention with +important keys, achieving more than 4x speedups during both the prefill and +generation phases for long-context inference. Furthermore, we have extensively +evaluated our method on various long-context benchmarks including LongBench, +where it achieves a 3x reduction in KV cache budget without accuracy loss and +up to an 8x reduction with <0.5 point accuracy gap for various models. + +
+
+
+
+
+ + ☆ Adaptive Decoding via Latent Preference Optimization + + +
+ During language model decoding, it is known that using higher temperature +sampling gives more creative responses, while lower temperatures are more +factually accurate. However, such models are commonly applied to general +instruction following, which involves both creative and fact seeking tasks, +using a single fixed temperature across all examples and tokens. In this work, +we introduce Adaptive Decoding, a layer added to the model to select the +sampling temperature dynamically at inference time, at either the token or +example level, in order to optimize performance. To learn its parameters we +introduce Latent Preference Optimization (LPO) a general approach to train +discrete latent variables such as choices of temperature. Our method +outperforms all fixed decoding temperatures across a range of tasks that +require different temperatures, including UltraFeedback, Creative Story +Writing, and GSM8K. + +
+
+
+
+
+ + ☆ On the Limits of Language Generation: Trade-Offs Between Hallucination + and Mode Collapse + + +
+ Specifying all desirable properties of a language model is challenging, but +certain requirements seem essential. Given samples from an unknown language, +the trained model should produce valid strings not seen in training and be +expressive enough to capture the language's full richness. Otherwise, +outputting invalid strings constitutes "hallucination," and failing to capture +the full range leads to "mode collapse." We ask if a language model can meet +both requirements. + We investigate this within a statistical language generation setting building +on Gold and Angluin. Here, the model receives random samples from a +distribution over an unknown language K, which belongs to a possibly infinite +collection of languages. The goal is to generate unseen strings from K. We say +the model generates from K with consistency and breadth if, as training size +increases, its output converges to all unseen strings in K. + Kleinberg and Mullainathan [KM24] asked if consistency and breadth in +language generation are possible. We answer this negatively: for a large class +of language models, including next-token prediction models, this is impossible +for most collections of candidate languages. This contrasts with [KM24]'s +result, showing consistent generation without breadth is possible for any +countable collection of languages. Our finding highlights that generation with +breadth fundamentally differs from generation without breadth. + As a byproduct, we establish near-tight bounds on the number of samples +needed for generation with or without breadth. + Finally, our results offer hope: consistent generation with breadth is +achievable for any countable collection of languages when negative examples +(strings outside K) are available alongside positive ones. This suggests that +post-training feedback, which encodes negative examples, can be crucial in +reducing hallucinations while limiting mode collapse. + +
+
+ comment: Abstract shortened to fit arXiv limit +
+
+
+
+
+ + ☆ PTR: Precision-Driven Tool Recommendation for Large Language Models + + +
+ By augmenting Large Language Models (LLMs) with external tools, their +capacity to solve complex problems has been significantly enhanced. However, +despite ongoing advancements in the parsing capabilities of LLMs, incorporating +all available tools simultaneously in the prompt remains impractical due to the +vast number of external tools. Consequently, it is essential to provide LLMs +with a precise set of tools tailored to the specific task, considering both +quantity and quality. Current tool retrieval methods primarily focus on +refining the ranking list of tools and directly packaging a fixed number of +top-ranked tools as the tool set. However, these approaches often fail to equip +LLMs with the optimal set of tools prior to execution, since the optimal number +of tools for different tasks could be different, resulting in inefficiencies +such as redundant or unsuitable tools, which impede immediate access to the +most relevant tools. This paper addresses the challenge of recommending precise +toolsets for LLMs. We introduce the problem of tool recommendation, define its +scope, and propose a novel Precision-driven Tool Recommendation (PTR) approach. +PTR captures an initial, concise set of tools by leveraging historical tool +bundle usage and dynamically adjusts the tool set by performing tool matching, +culminating in a multi-view-based tool addition. Additionally, we present a new +dataset, RecTools, and a metric, TRACC, designed to evaluate the effectiveness +of tool recommendation for LLMs. We further validate our design choices through +comprehensive experiments, demonstrating promising accuracy across two open +benchmarks and our RecTools dataset. + +
+
+
+
+
+ + ☆ The Moral Foundations Weibo Corpus + + +
+ Moral sentiments expressed in natural language significantly influence both +online and offline environments, shaping behavioral styles and interaction +patterns, including social media self-presentation, cyberbullying, adherence to +social norms, and ethical decision-making. To effectively measure moral +sentiments in natural language processing texts, it is crucial to utilize +large, annotated datasets that provide nuanced understanding for accurate +analysis and model training. However, existing corpora, while valuable, often +face linguistic limitations. To address this gap in the Chinese language +domain, we introduce the Moral Foundations Weibo Corpus. This corpus consists of +25,671 Chinese comments on Weibo, encompassing six diverse topic areas. Each +comment is manually annotated by at least three systematically trained +annotators based on ten moral categories derived from a grounded theory of +morality. To assess annotator reliability, we present the kappa test results, a +gold standard for measuring consistency. Additionally, we apply several of the +latest large language models to supplement the manual annotations, conducting +analytical experiments to compare their performance and report baseline results +for moral sentiment classification. + +
+
+
+
+
+ + ☆ Initial Nugget Evaluation Results for the TREC 2024 RAG Track with the + AutoNuggetizer Framework + + +
+ This report provides an initial look at partial results from the TREC 2024 +Retrieval-Augmented Generation (RAG) Track. We have identified RAG evaluation +as a barrier to continued progress in information access (and more broadly, +natural language processing and artificial intelligence), and it is our hope +that we can contribute to tackling the many challenges in this space. The +central hypothesis we explore in this work is that the nugget evaluation +methodology, originally developed for the TREC Question Answering Track in +2003, provides a solid foundation for evaluating RAG systems. As such, our +efforts have focused on "refactoring" this methodology, specifically applying +large language models to both automatically create nuggets and to automatically +assign nuggets to system answers. We call this the AutoNuggetizer framework. +Within the TREC setup, we are able to calibrate our fully automatic process +against a manual process whereby nuggets are created by human assessors +semi-manually and then assigned manually to system answers. Based on initial +results across 21 topics from 45 runs, we observe a strong correlation between +scores derived from a fully automatic nugget evaluation and a (mostly) manual +nugget evaluation by human assessors. This suggests that our fully automatic +evaluation process can be used to guide future iterations of RAG systems. + +
+
+
+
+
+ + ☆ LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models + + +
+ This work explores expanding the capabilities of large language models (LLMs) +pretrained on text to generate 3D meshes within a unified model. This offers +key advantages of (1) leveraging spatial knowledge already embedded in LLMs, +derived from textual sources like 3D tutorials, and (2) enabling conversational +3D generation and mesh understanding. A primary challenge is effectively +tokenizing 3D mesh data into discrete tokens that LLMs can process seamlessly. +To address this, we introduce LLaMA-Mesh, a novel approach that represents the +vertex coordinates and face definitions of 3D meshes as plain text, allowing +direct integration with LLMs without expanding the vocabulary. We construct a +supervised fine-tuning (SFT) dataset enabling pretrained LLMs to (1) generate +3D meshes from text prompts, (2) produce interleaved text and 3D mesh outputs +as required, and (3) understand and interpret 3D meshes. Our work is the first +to demonstrate that LLMs can be fine-tuned to acquire complex spatial knowledge +for 3D mesh generation in a text-based format, effectively unifying the 3D and +text modalities. LLaMA-Mesh achieves mesh generation quality on par with models +trained from scratch while maintaining strong text generation performance. + +
+
+ comment: See the project website at + https://research.nvidia.com/labs/toronto-ai/LLaMA-Mesh/ +
+
+
+
+
+ + ☆ BabyLM Challenge: Exploring the Effect of Variation Sets on Language + Model Training Efficiency + + +
+ While current large language models have achieved a remarkable success, their +data efficiency remains a challenge to overcome. Recently it has been suggested +that child-directed speech (CDS) can improve training data efficiency of modern +language models based on Transformer neural networks. However, it is not yet +understood which specific properties of CDS are effective for training these +models. In the context of the BabyLM Challenge, we focus on Variation Sets +(VSs), sets of consecutive utterances expressing a similar intent with slightly +different words and structures, which are ubiquitous in CDS. To assess the +impact of VSs on training data efficiency, we augment CDS data with different +proportions of artificial VSs and use these datasets to train an +auto-regressive model, GPT-2. We find that the best proportion of VSs depends +on the evaluation benchmark: BLiMP and GLUE scores benefit from the presence of +VSs, but EWOK scores do not. Additionally, the results vary depending on +multiple factors such as the number of epochs and the order of utterance +presentation. Taken together, these findings suggest that VSs can have a +beneficial influence on language models, while leaving room for further +investigation. + +
+
+ comment: This paper was accepted to the BabyLM Challenge 2024 at CoNLL 2024 +
+
+
+
+
+ + ☆ Piecing It All Together: Verifying Multi-Hop Multimodal Claims + + +
+ Existing claim verification datasets often do not require systems to perform +complex reasoning or effectively interpret multimodal evidence. To address +this, we introduce a new task: multi-hop multimodal claim verification. This +task challenges models to reason over multiple pieces of evidence from diverse +sources, including text, images, and tables, and determine whether the combined +multimodal evidence supports or refutes a given claim. To study this task, we +construct MMCV, a large-scale dataset comprising 16k multi-hop claims paired +with multimodal evidence, generated and refined using large language models, +with additional input from human feedback. We show that MMCV is challenging +even for the latest state-of-the-art multimodal large language models, +especially as the number of reasoning hops increases. Additionally, we +establish a human performance benchmark on a subset of MMCV. We hope this +dataset and its evaluation task will encourage future research in multimodal +multi-hop claim verification. + +
+
+
+
+
+ + ☆ A Practical Guide to Fine-tuning Language Models with Limited Data + + +
+ Employing pre-trained Large Language Models (LLMs) has become the de facto +standard in Natural Language Processing (NLP) despite their extensive data +requirements. Motivated by the recent surge in research focused on training +LLMs with limited data, particularly in low-resource domains and languages, +this paper surveys recent transfer learning approaches to optimize model +performance in downstream tasks where data is scarce. We first address initial +and continued pre-training strategies to better leverage prior knowledge in +unseen domains and languages. We then examine how to maximize the utility of +limited data during fine-tuning and few-shot learning. The final section takes +a task-specific perspective, reviewing models and methods suited for different +levels of data scarcity. Our goal is to provide practitioners with practical +guidelines for overcoming the challenges posed by constrained data while also +highlighting promising directions for future research. + +
+
+
+
+
+ + ☆ Communication Compression for Tensor Parallel LLM Inference + + +
+ Large Language Models (LLMs) have pushed the frontier of artificial +intelligence but are comprised of hundreds of billions of parameters and +operations. For faster inference latency, LLMs are deployed on multiple +hardware accelerators through various Model Parallelism strategies. Our paper +looks into the details on one such strategy - Tensor Parallel - and proposes to +reduce latency by compressing inter-accelerator communication. We leverage fine +grained quantization techniques to compress selected activations by 3.5 - 4.5x. +Our proposed method leads up to 2x reduction of time-to-first-token (TTFT) with +negligible model performance degradation. + +
+
+
+
+
+ + ☆ The Use of Readability Metrics in Legal Text: A Systematic Literature + Review + + +
+ Understanding the text in legal documents can be challenging due to their +complex structure and the inclusion of domain-specific jargon. Laws and +regulations are often crafted in such a manner that engagement with them +requires formal training, potentially leading to vastly different +interpretations of the same texts. Linguistic complexity is an important +contributor to the difficulties experienced by readers. Simplifying texts could +enhance comprehension across a broader audience, not just among trained +professionals. Various metrics have been developed to measure document +readability. Therefore, we adopted a systematic review approach to examine the +linguistic and readability metrics currently employed for legal and regulatory +texts. A total of 3566 initial papers were screened, with 34 relevant studies +found and further assessed. Our primary objective was to identify which current +metrics were applied for evaluating readability within the legal field. Sixteen +different metrics were identified, with the Flesch-Kincaid Grade Level being +the most frequently used method. The majority of studies (73.5%) were found in +the domain of "informed consent forms". From the analysis, it is clear that not +all legal domains are well represented in terms of readability metrics and that +there is a further need to develop more consensus on which metrics should be +applied for legal documents. + +
+
+
+
+
+ + ☆ MM-Eval: A Hierarchical Benchmark for Modern Mongolian Evaluation in + LLMs + + +
+ Large language models (LLMs) excel in high-resource languages but face +notable challenges in low-resource languages like Mongolian. This paper +addresses these challenges by categorizing capabilities into language abilities +(syntax and semantics) and cognitive abilities (knowledge and reasoning). To +systematically evaluate these areas, we developed MM-Eval, a specialized +dataset based on Modern Mongolian Language Textbook I and enriched with WebQSP +and MGSM datasets. + Preliminary experiments on models including Qwen2-7B-Instruct, GLM4-9b-chat, +Llama3.1-8B-Instruct, GPT-4, and DeepseekV2.5 revealed that: 1) all models +performed better on syntactic tasks than semantic tasks, highlighting a gap in +deeper language understanding; and 2) knowledge tasks showed a moderate +decline, suggesting that models can transfer general knowledge from +high-resource to low-resource contexts. + The release of MM-Eval, comprising 569 syntax, 677 semantics, 344 knowledge, +and 250 reasoning tasks, offers valuable insights for advancing NLP and LLMs in +low-resource languages like Mongolian. The dataset is available at +https://github.com/joenahm/MM-Eval. + +
+
+
+
+
+ + ☆ Robot Tasks with Fuzzy Time Requirements from Natural Language + Instructions + + +
+ Natural language allows robot programming to be accessible to everyone. +However, the inherent fuzziness in natural language poses challenges for +inflexible, traditional robot systems. We focus on instructions with fuzzy time +requirements (e.g., "start in a few minutes"). Building on previous robotics +research, we introduce fuzzy skills. These define an execution by the robot +with so-called satisfaction functions representing vague execution time +requirements. Such functions express a user's satisfaction over potential +starting times for skill execution. When the robot handles multiple fuzzy +skills, the satisfaction function provides a temporal tolerance window for +execution, thus, enabling optimal scheduling based on satisfaction. We +generalized such functions based on individual user expectations with a user +study. The participants rated their satisfaction with an instruction's +execution at various times. Our investigations reveal that trapezoidal +functions best approximate the users' satisfaction. Additionally, the results +suggest that users are more lenient if the execution is specified further into +the future. + +
+
+ comment: 9 pages, 8 figures, to be published in 2024 IEEE International + Conference on Robotic Computing (IRC) +
+
+
+
+
+ + ☆ Everyone deserves their voice to be heard: Analyzing Predictive Gender + Bias in ASR Models Applied to Dutch Speech Data ECML + + +
+ Recent research has shown that state-of-the-art (SotA) Automatic Speech +Recognition (ASR) systems, such as Whisper, often exhibit predictive biases +that disproportionately affect various demographic groups. This study focuses +on identifying the performance disparities of Whisper models on Dutch speech +data from the Common Voice dataset and the Dutch National Public Broadcasting +organisation. We analyzed the word error rate, character error rate and a +BERT-based semantic similarity across gender groups. We used the moral +framework of Weerts et al. (2022) to assess quality of service harms and +fairness, and to provide a nuanced discussion on the implications of these +biases, particularly for automatic subtitling. Our findings reveal substantial +disparities in word error rate (WER) among gender groups across all model +sizes, with bias identified through statistical testing. + +
+
+ comment: Accepted at ECML PKDD 2024, 4th Workshop on Bias and Fairness in AI + (BIAS) +
+
+
+
+
+ + ☆ Less is More: Unseen Domain Fake News Detection via Causal Propagation + Substructures + + +
+ The spread of fake news on social media poses significant threats to +individuals and society. Text-based and graph-based models have been employed +for fake news detection by analysing news content and propagation networks, +showing promising results in specific scenarios. However, these data-driven +models heavily rely on pre-existing in-distribution data for training, limiting +their performance when confronted with fake news from emerging or previously +unseen domains, known as out-of-distribution (OOD) data. Tackling OOD fake news +is a challenging yet critical task. In this paper, we introduce the Causal +Subgraph-oriented Domain Adaptive Fake News Detection (CSDA) model, designed to +enhance zero-shot fake news detection by extracting causal substructures from +propagation graphs using in-distribution data and generalising this approach to +OOD data. The model employs a graph neural network based mask generation +process to identify dominant nodes and edges within the propagation graph, +using these substructures for fake news detection. Additionally, the +performance of CSDA is further improved through contrastive learning in +few-shot scenarios, where a limited amount of OOD data is available for +training. Extensive experiments on public social media datasets demonstrate +that CSDA effectively handles OOD fake news detection, achieving a 7 to 16 +percent accuracy improvement over other state-of-the-art models. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Re-Parameterization of Lightweight Transformer for On-Device Speech + Emotion Recognition + + +
+ With the increasing implementation of machine learning models on edge or +Internet-of-Things (IoT) devices, deploying advanced models on +resource-constrained IoT devices remains challenging. Transformer models, a +currently dominant neural architecture, have achieved great success in broad +domains but their complexity hinders their deployment on IoT devices with limited +computation capability and storage size. Although many model compression +approaches have been explored, they often suffer from notorious performance +degradation. To address this issue, we introduce a new method, namely +Transformer Re-parameterization, to boost the performance of lightweight +Transformer models. It consists of two processes: the High-Rank Factorization +(HRF) process in the training stage and the deHigh-Rank Factorization (deHRF) +process in the inference stage. In the former process, we insert an additional +linear layer before the Feed-Forward Network (FFN) of the lightweight +Transformer. It is supposed that the inserted HRF layers can enhance the model +learning capability. In the latter process, the auxiliary HRF layer will be +merged together with the following FFN layer into one linear layer and thus +recover the original structure of the lightweight model. To examine the +effectiveness of the proposed method, we evaluate it on three widely used +Transformer variants, i.e., ConvTransformer, Conformer, and SpeechFormer +networks, in the application of speech emotion recognition on the IEMOCAP, M3ED +and DAIC-WOZ datasets. Experimental results show that our proposed method +consistently improves the performance of lightweight Transformers, even making +them comparable to large models. The proposed re-parameterization approach +enables advanced Transformer models to be deployed on resource-constrained IoT +devices. + +
+
+
+
+
+ + ☆ DriveThru: a Document Extraction Platform and Benchmark Datasets for + Indonesian Local Language Archives + + +
+ Indonesia is one of the most diverse countries linguistically. However, +despite this linguistic diversity, Indonesian languages remain underrepresented +in Natural Language Processing (NLP) research and technologies. In the past two +years, several efforts have been conducted to construct NLP resources for +Indonesian languages. However, most of these efforts have been focused on +creating manual resources thus difficult to scale to more languages. Although +many Indonesian languages do not have a web presence, locally there are +resources that document these languages well in printed forms such as books, +magazines, and newspapers. Digitizing these existing resources will enable +scaling of Indonesian language resource construction to many more languages. In +this paper, we propose an alternative method of creating datasets by digitizing +documents, which have not previously been used to build digital language +resources in Indonesia. DriveThru is a platform for extracting document content +utilizing Optical Character Recognition (OCR) techniques in its system to +provide language resource building with less manual effort and cost. This paper +also studies the utility of current state-of-the-art LLM for post-OCR +correction to show the capability of increasing the character accuracy rate +(CAR) and word accuracy rate (WAR) compared to off-the-shelf OCR. + +
+
+ comment: 12 pages, 3 figures, 6 tables +
+
+
+
+
+ + ☆ DTELS: Towards Dynamic Granularity of Timeline Summarization + + +
+ The rapid proliferation of online news has posed significant challenges in +tracking the continuous development of news topics. Traditional timeline +summarization constructs a chronological summary of the events but often lacks +the flexibility to meet the diverse granularity needs. To overcome this +limitation, we introduce a new paradigm, Dynamic-granularity TimELine +Summarization (DTELS), which aims to construct adaptive timelines based on +user instructions or requirements. This paper establishes a comprehensive +benchmark for DTELS that includes: (1) an evaluation framework grounded in +journalistic standards to assess the timeline quality across four dimensions: +Informativeness, Granular Consistency, Factuality, and Coherence; (2) a +large-scale, multi-source dataset with multiple granularity timeline +annotations based on a consensus process to facilitate authority; (3) extensive +experiments and analysis with two proposed solutions based on Large Language +Models (LLMs) and existing state-of-the-art TLS methods. The experimental +results demonstrate the effectiveness of LLM-based solutions. However, even the +most advanced LLMs struggle to consistently generate timelines that are both +informative and granularly consistent, highlighting the challenges of the DTELS +task. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ StreamAdapter: Efficient Test Time Adaptation from Contextual Streams + + +
+ In-context learning (ICL) allows large language models (LLMs) to adapt to new +tasks directly from the given demonstrations without requiring gradient +updates. While recent advances have expanded context windows to accommodate +more demonstrations, this approach increases inference costs without +necessarily improving performance. To mitigate these issues, we propose +StreamAdapter, a novel approach that directly updates model parameters from +context at test time, eliminating the need for explicit in-context +demonstrations. StreamAdapter employs context mapping and weight absorption +mechanisms to dynamically transform ICL demonstrations into parameter updates +with minimal additional parameters. By reducing reliance on numerous in-context +examples, StreamAdapter significantly reduces inference costs and allows for +efficient inference with constant time complexity, regardless of demonstration +count. Extensive experiments across diverse tasks and model architectures +demonstrate that StreamAdapter achieves comparable or superior adaptation +capability to ICL while requiring significantly fewer demonstrations. The +superior task adaptation and context encoding capabilities of StreamAdapter on +both language understanding and generation tasks provide a new perspective for +adapting LLMs at test time using context, allowing for more efficient +adaptation across scenarios and more cost-effective inference. + +
+
+ comment: 22 Pages, 9 Figures +
+
+
+
+
+ + ☆ Cross-Modal Consistency in Multimodal Large Language Models + + +
+ Recent developments in multimodal methodologies have marked the beginning of +an exciting era for models adept at processing diverse data types, encompassing +text, audio, and visual content. Models like GPT-4V, which merge computer +vision with advanced language processing, exhibit extraordinary proficiency in +handling intricate tasks that require a simultaneous understanding of both +textual and visual information. Prior research efforts have meticulously +evaluated the efficacy of these Vision Large Language Models (VLLMs) in various +domains, including object detection, image captioning, and other related +fields. However, existing analyses have often suffered from limitations, +primarily centering on the isolated evaluation of each modality's performance +while neglecting to explore their intricate cross-modal interactions. +Specifically, the question of whether these models achieve the same level of +accuracy when confronted with identical task instances across different +modalities remains unanswered. In this study, we take the initiative to delve +into the interaction and comparison among these modalities of interest by +introducing a novel concept termed cross-modal consistency. Furthermore, we +propose a quantitative evaluation framework founded on this concept. Our +experimental findings, drawn from a curated collection of parallel +vision-language datasets developed by us, unveil a pronounced inconsistency +between the vision and language modalities within GPT-4V, despite its portrayal +as a unified multimodal model. Our research yields insights into the +appropriate utilization of such models and hints at potential avenues for +enhancing their design. + +
+
+
+
+
+ + ☆ Jailbreak Attacks and Defenses against Multimodal Generative Models: A + Survey + + +
+ The rapid evolution of multimodal foundation models has led to significant +advancements in cross-modal understanding and generation across diverse +modalities, including text, images, audio, and video. However, these models +remain susceptible to jailbreak attacks, which can bypass built-in safety +mechanisms and induce the production of potentially harmful content. +Consequently, understanding the methods of jailbreak attacks and existing +defense mechanisms is essential to ensure the safe deployment of multimodal +generative models in real-world scenarios, particularly in security-sensitive +applications. To provide comprehensive insight into this topic, this survey +reviews jailbreak and defense in multimodal generative models. First, given the +generalized lifecycle of multimodal jailbreak, we systematically explore +attacks and corresponding defense strategies across four levels: input, +encoder, generator, and output. Based on this analysis, we present a detailed +taxonomy of attack methods, defense mechanisms, and evaluation frameworks +specific to multimodal generative models. Additionally, we cover a wide range +of input-output configurations, including modalities such as Any-to-Text, +Any-to-Vision, and Any-to-Any within generative systems. Finally, we highlight +current research challenges and propose potential directions for future +research. The open-source repository corresponding to this work can be found at +https://github.com/liuxuannan/Awesome-Multimodal-Jailbreak. + +
+
+ comment: ongoing work +
+
+
+
+
+ + ☆ DAHL: Domain-specific Automated Hallucination Evaluation of Long-Form + Text through a Benchmark Dataset in Biomedicine EMNLP2024 + + +
+ We introduce DAHL, a benchmark dataset and automated evaluation system +designed to assess hallucination in long-form text generation, specifically +within the biomedical domain. Our benchmark dataset, meticulously curated from +biomedical research papers, consists of 8,573 questions across 29 categories. +DAHL evaluates fact-conflicting hallucinations in Large Language Models (LLMs) +by deconstructing responses into atomic units, each representing a single piece +of information. The accuracy of these responses is averaged to produce the DAHL +Score, offering a more in-depth evaluation of hallucinations compared to +previous methods that rely on multiple-choice tasks. We conduct experiments +with 8 different models, finding that larger models tend to hallucinate less; +however, beyond a model size of 7 to 8 billion parameters, further scaling does +not significantly improve factual accuracy. The DAHL Score holds potential as +an efficient alternative to human-annotated preference labels, being able to be +expanded to other specialized domains. We release the dataset and code in +public. + +
+
+ comment: EMNLP2024/FEVER +
+
+
+
+
+ + ☆ Enhancing Financial Domain Adaptation of Language Models via Model + Augmentation + + +
+ The domain adaptation of language models, including large language models +(LLMs), has become increasingly important as the use of such models continues +to expand. This study demonstrates the effectiveness of Composition to Augment +Language Models (CALM) in adapting to the financial domain. CALM is a model to +extend the capabilities of existing models by introducing cross-attention +between two LLMs with different functions. In our experiments, we developed a +CALM to enhance the financial performance of an LLM with strong response +capabilities by leveraging a financial-specialized LLM. Notably, the CALM was +trained using a financial dataset different from the one used to train the +financial-specialized LLM, confirming CALM's ability to adapt to various +datasets. The models were evaluated through quantitative Japanese financial +benchmarks and qualitative response comparisons, demonstrating that CALM +enables superior responses with higher scores than the original models and +baselines. Additionally, comparative experiments on connection points revealed +that connecting the middle layers of the models is most effective in +facilitating adaptation to the financial domain. These findings confirm that +CALM is a practical approach for adapting LLMs to the financial domain. + +
+
+
+
+
+ + ☆ HateGPT: Unleashing GPT-3.5 Turbo to Combat Hate Speech on X + + +
+ The widespread use of social media platforms like Twitter and Facebook has +enabled people of all ages to share their thoughts and experiences, leading to +an immense accumulation of user-generated content. However, alongside the +benefits, these platforms also face the challenge of managing hate speech and +offensive content, which can undermine rational discourse and threaten +democratic values. As a result, there is a growing need for automated methods +to detect and mitigate such content, especially given the complexity of +conversations that may require contextual analysis across multiple languages, +including code-mixed languages like Hinglish, German-English, and Bangla. We +participated in the English task where we have to classify English tweets into +two categories namely Hate and Offensive and Non Hate-Offensive. In this work, +we experiment with state-of-the-art large language models like GPT-3.5 Turbo +via prompting to classify tweets into Hate and Offensive or Non Hate-Offensive. +In this study, we evaluate the performance of a classification model using +Macro-F1 scores across three distinct runs. The Macro-F1 score, which balances +precision and recall across all classes, is used as the primary metric for +model evaluation. The scores obtained are 0.756 for run 1, 0.751 for run 2, and +0.754 for run 3, indicating a high level of performance with minimal variance +among the runs. The results suggest that the model consistently performs well +in terms of precision and recall, with run 1 showing the highest performance. +These findings highlight the robustness and reliability of the model across +different runs. + +
+
+ comment: Accepted at FIRE 2024 (Track: Hate Speech and Offensive Content + Identification in English and Indo-Aryan Languages (HASOC)). arXiv admin + note: text overlap with arXiv:2411.05039, arXiv:2411.06946 +
+
+
+
+
+ + ☆ Comprehensive and Practical Evaluation of Retrieval-Augmented Generation + Systems for Medical Question Answering + + +
+ Retrieval-augmented generation (RAG) has emerged as a promising approach to +enhance the performance of large language models (LLMs) in knowledge-intensive +tasks such as those from the medical domain. However, the sensitive nature of the +medical domain necessitates a completely accurate and trustworthy system. While +existing RAG benchmarks primarily focus on the standard retrieve-answer +setting, they overlook many practical scenarios that measure crucial aspects of +a reliable medical system. This paper addresses this gap by providing a +comprehensive evaluation framework for medical question-answering (QA) systems +in a RAG setting for these situations, including sufficiency, integration, and +robustness. We introduce Medical Retrieval-Augmented Generation Benchmark +(MedRGB) that provides various supplementary elements to four medical QA +datasets for testing LLMs' ability to handle these specific scenarios. +Utilizing MedRGB, we conduct extensive evaluations of both state-of-the-art +commercial LLMs and open-source models across multiple retrieval conditions. +Our experimental results reveal current models' limited ability to handle +noise and misinformation in the retrieved documents. We further analyze the +LLMs' reasoning processes to provide valuable insights and future directions +for developing RAG systems in this critical medical domain. + +
+
+
+
+
+ + ☆ Unstructured Text Enhanced Open-domain Dialogue System: A Systematic + Survey + + +
+ Incorporating external knowledge into dialogue generation has been proven to +benefit the performance of an open-domain Dialogue System (DS), such as +generating informative or stylized responses, controlling conversation topics. +In this article, we study the open-domain DS that uses unstructured text as +external knowledge sources (\textbf{U}nstructured \textbf{T}ext +\textbf{E}nhanced \textbf{D}ialogue \textbf{S}ystem, \textbf{UTEDS}). The +existence of unstructured text entails distinctions between UTEDS and +traditional data-driven DS and we aim to analyze these differences. We first +give the definition of the UTEDS related concepts, then summarize the recently +released datasets and models. We categorize UTEDS into Retrieval and Generative +models and introduce them from the perspective of model components. The +retrieval models consist of Fusion, Matching, and Ranking modules, while the +generative models comprise Dialogue and Knowledge Encoding, Knowledge +Selection, and Response Generation modules. We further summarize the evaluation +methods utilized in UTEDS and analyze the current models' performance. At last, +we discuss the future development trends of UTEDS, hoping to inspire new +research in this field. + +
+
+ comment: 45 pages, 3 Figures, 11 Tables +
+
+
+
+
+ + ☆ DROJ: A Prompt-Driven Attack against Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated exceptional capabilities +across various natural language processing tasks. Due to their training on +internet-sourced datasets, LLMs can sometimes generate objectionable content, +necessitating extensive alignment with human feedback to avoid such outputs. +Despite massive alignment efforts, LLMs remain susceptible to adversarial +jailbreak attacks, which usually are manipulated prompts designed to circumvent +safety mechanisms and elicit harmful responses. Here, we introduce a novel +approach, Directed Representation Optimization Jailbreak (DROJ), which +optimizes jailbreak prompts at the embedding level to shift the hidden +representations of harmful queries towards directions that are more likely to +elicit affirmative responses from the model. Our evaluations on the LLaMA-2-7b-chat +model show that DROJ achieves a 100\% keyword-based Attack Success Rate (ASR), +effectively preventing direct refusals. However, the model occasionally +produces repetitive and non-informative responses. To mitigate this, we +introduce a helpfulness system prompt that enhances the utility of the model's +responses. Our code is available at +https://github.com/Leon-Leyang/LLM-Safeguard. + +
+
+
+
+
+ + ☆ P-MMEval: A Parallel Multilingual Multitask Benchmark for Consistent + Evaluation of LLMs + + +
+ Recent advancements in large language models (LLMs) showcase varied +multilingual capabilities across tasks like translation, code generation, and +reasoning. Previous assessments often limited their scope to fundamental +natural language processing (NLP) or isolated capability-specific tasks. To +alleviate this drawback, we aim to present a comprehensive multilingual +multitask benchmark. First, we present a pipeline for selecting available and +reasonable benchmarks from massive ones, addressing the oversight in previous +work regarding the utility of these benchmarks, i.e., their ability to +differentiate between models being evaluated. Leveraging this pipeline, we +introduce P-MMEval, a large-scale benchmark covering effective fundamental and +capability-specialized datasets. Furthermore, P-MMEval delivers consistent +language coverage across various datasets and provides parallel samples. +Finally, we conduct extensive experiments on representative multilingual model +series to compare performances across models, analyze dataset effectiveness, +examine prompt impacts on model performances, and explore the relationship +between multilingual performances and factors such as tasks, model sizes, and +languages. These insights offer valuable guidance for future research. The +dataset is available at https://huggingface.co/datasets/Qwen/P-MMEval. + +
+
+
+
+
+ + ☆ Personalized Help for Optimizing Low-Skilled Users' Strategy + + +
+ AIs can beat humans in game environments; however, how helpful those agents +are to humans remains understudied. We augment CICERO, a natural language agent +that demonstrates superhuman performance in Diplomacy, to generate both move +and message advice based on player intentions. A dozen Diplomacy games with +novice and experienced players, with varying advice settings, show that some of +the generated advice is beneficial. It helps novices compete with experienced +players and in some instances even surpass them. The mere presence of advice +can be advantageous, even if players do not follow it. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ A Benchmark for Long-Form Medical Question Answering NeurIPS 2024 + + +
+ There is a lack of benchmarks for evaluating large language models (LLMs) in +long-form medical question answering (QA). Most existing medical QA evaluation +benchmarks focus on automatic metrics and multiple-choice questions. While +valuable, these benchmarks fail to fully capture or assess the complexities of +real-world clinical applications where LLMs are being deployed. Furthermore, +existing studies on evaluating long-form answer generation in medical QA are +primarily closed-source, lacking access to human medical expert annotations, +which makes it difficult to reproduce results and enhance existing baselines. +In this work, we introduce a new publicly available benchmark featuring +real-world consumer medical questions with long-form answer evaluations +annotated by medical doctors. We performed pairwise comparisons of responses +from various open and closed-source medical and general-purpose LLMs based on +criteria such as correctness, helpfulness, harmfulness, and bias. Additionally, +we performed a comprehensive LLM-as-a-judge analysis to study the alignment +between human judgments and LLMs. Our preliminary results highlight the strong +potential of open LLMs in medical QA compared to leading closed models. Code & +Data: https://github.com/lavita-ai/medical-eval-sphere + +
+
+ comment: AIM-FM: Advancements in Medical Foundation Models Workshop, 38th + Conference on Neural Information Processing Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Evaluating Gender Bias in Large Language Models + + +
+ Gender bias in artificial intelligence has become an important issue, +particularly in the context of language models used in communication-oriented +applications. This study examines the extent to which Large Language Models +(LLMs) exhibit gender bias in pronoun selection in occupational contexts. The +analysis evaluates the models GPT-4, GPT-4o, PaLM 2 Text Bison and Gemini 1.0 +Pro using a self-generated dataset. The jobs considered include a range of +occupations, from those with a significant male presence to those with a +notable female concentration, as well as jobs with a relatively equal gender +distribution. Three different sentence processing methods were used to assess +potential gender bias: masked tokens, unmasked sentences, and sentence +completion. In addition, the LLMs suggested names of individuals in specific +occupations, which were then examined for gender distribution. The results show +a positive correlation between the models' pronoun choices and the gender +distribution present in U.S. labor force data. Female pronouns were more often +associated with female-dominated occupations, while male pronouns were more +often associated with male-dominated occupations. Sentence completion showed +the strongest correlation with actual gender distribution, while name +generation resulted in a more balanced 'politically correct' gender +distribution, albeit with notable variations in predominantly male or female +occupations. Overall, the prompting method had a greater impact on gender +distribution than the model selection itself, highlighting the complexity of +addressing gender bias in LLMs. The findings highlight the importance of +prompting in gender mapping. + +
+
+ comment: 13 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Evaluating the Predictive Capacity of ChatGPT for Academic Peer Review + Outcomes Across Multiple Platforms + + +
+ While previous studies have demonstrated that Large Language Models (LLMs) +can predict peer review outcomes to some extent, this paper builds on that by +introducing two new contexts and employing a more robust method - averaging +multiple ChatGPT scores. The findings show that averaging 30 ChatGPT predictions, +based on reviewer guidelines and using only the submitted titles and abstracts, +failed to predict peer review outcomes for F1000Research (Spearman's rho=0.00). +However, it produced mostly weak positive correlations with the quality +dimensions of SciPost Physics (rho=0.25 for validity, rho=0.25 for originality, +rho=0.20 for significance, and rho = 0.08 for clarity) and a moderate positive +correlation for papers from the International Conference on Learning +Representations (ICLR) (rho=0.38). Including the full text of articles +significantly increased the correlation for ICLR (rho=0.46) and slightly +improved it for F1000Research (rho=0.09), while it had variable effects on the +four quality dimension correlations for SciPost LaTeX files. The use of +chain-of-thought system prompts slightly increased the correlation for +F1000Research (rho=0.10), marginally reduced it for ICLR (rho=0.37), and +further decreased it for SciPost Physics (rho=0.16 for validity, rho=0.18 for +originality, rho=0.18 for significance, and rho=0.05 for clarity). Overall, the +results suggest that in some contexts, ChatGPT can produce weak pre-publication +quality assessments. However, the effectiveness of these assessments and the +optimal strategies for employing them vary considerably across different +platforms, journals, and conferences. Additionally, the most suitable inputs +for ChatGPT appear to differ depending on the platform. + +
+
+
+
+
+ + ♻ ☆ Quantitative Assessment of Intersectional Empathetic Bias and + Understanding + + +
+ A growing amount of literature critiques the current operationalizations of +empathy based on loose definitions of the construct. Such definitions +negatively affect dataset quality, model robustness, and evaluation +reliability. We propose an empathy evaluation framework that operationalizes +empathy close to its psychological origins. The framework measures the variance +in responses of LLMs to prompts using existing metrics for empathy and +emotional valence. The variance is introduced through the controlled generation +of the prompts by varying social biases affecting context understanding, thus +impacting empathetic understanding. The control over generation ensures high +theoretical validity of the constructs in the prompt dataset. Also, it makes +high-quality translation, especially into languages that currently have +little-to-no way of evaluating empathy or bias, such as the Slavonic family, +more manageable. Using chosen LLMs and various prompt types, we demonstrate the +empathy evaluation with the framework, including multiple-choice answers and +free generation. The variance in our initial evaluation sample is small and we +were unable to measure convincing differences between the empathetic +understanding in contexts given by different social groups. However, the +results are promising because the models showed significant alterations in their +reasoning chains needed to capture the relatively subtle changes in the +prompts. This provides the basis for future research into the construction of +the evaluation sample and statistical methods for measuring the results. + +
+
+
+
+
+ + ♻ ☆ Verifiable by Design: Aligning Language Models to Quote from + Pre-Training Data + + +
+ To trust the fluent generations of large language models (LLMs), humans must +be able to verify their correctness against trusted, external sources. Recent +efforts, such as providing citations via retrieved documents or post-hoc +provenance, enhance verifiability but provide no guarantees on their +correctness. To address these limitations, we tackle the verifiability goal +with a different philosophy: trivializing the verification process by +developing models that quote verbatim statements from trusted sources in their +pre-training data. We propose Quote-Tuning, which demonstrates the feasibility +of aligning models to quote. The core of Quote-Tuning is a fast membership +inference function that efficiently verifies text against trusted corpora. We +leverage this tool to design a reward function to quantify quotes in model +responses, and curate datasets for preference learning. Experiments show that +Quote-Tuning significantly increases verbatim quotes from high-quality +documents by up to 130% relative to base models while maintaining response +quality. Quote-Tuning is applicable in different tasks, generalizes to +out-of-domain data and diverse model families, and provides additional benefits +to truthfulness. Our method not only serves as a hassle-free method to increase +quoting but also opens up avenues for improving LLM trustworthiness through +better verifiability. + +
+
+
+
+
+ + ♻ ☆ AutoDefense: Multi-Agent LLM Defense against Jailbreak Attacks + + +
+ Despite extensive pre-training in moral alignment to prevent generating +harmful information, large language models (LLMs) remain vulnerable to +jailbreak attacks. In this paper, we propose AutoDefense, a multi-agent defense +framework that filters harmful responses from LLMs. With the response-filtering +mechanism, our framework is robust against different jailbreak attack prompts, +and can be used to defend different victim models. AutoDefense assigns +different roles to LLM agents and employs them to complete the defense task +collaboratively. The division in tasks enhances the overall +instruction-following of LLMs and enables the integration of other defense +components as tools. With AutoDefense, small open-source LMs can serve as +agents and defend larger models against jailbreak attacks. Our experiments show +that AutoDefense can effectively defend against different jailbreak attacks, +while maintaining the performance on normal user requests. For example, we +reduce the attack success rate on GPT-3.5 from 55.74% to 7.95% using +LLaMA-2-13b with a 3-agent system. Our code and data are publicly available at +https://github.com/XHMY/AutoDefense. + +
+
+
+
+
+ + ♻ ☆ VRSD: Rethinking Similarity and Diversity for Retrieval in Large + Language Models + + +
+ Vector retrieval algorithms are essential for semantic queries within the +rapidly evolving landscape of Large Language Models (LLMs). The ability to +retrieve vectors that satisfy both similarity and diversity criteria +substantially enhances the performance of LLMs. Although Maximal Marginal +Relevance (MMR) is widely employed in retrieval scenarios requiring relevance +and diversity, variations in the parameter $\lambda$ lead to fluctuations that +complicate the optimization trajectory in vector spaces. This obscures the +direction of improvement and highlights the lack of a robust theoretical +analysis regarding similarity and diversity constraints in retrieval processes. +To address these challenges, this paper introduces a novel approach that +characterizes both constraints through the relationship between the sum vector +and the query vector. The proximity of these vectors ensures the similarity +constraint, while requiring individual vectors within the sum vector to diverge +in their alignment with the query vector satisfies the diversity constraint. We +first formulate a new combinatorial optimization problem, selecting k vectors +from a candidate set such that their sum vector maximally aligns with the query +vector, and demonstrate that this problem is NP-complete. This result +underscores the inherent difficulty of simultaneously achieving similarity and +diversity in vector retrieval, thereby providing a theoretical foundation for +future research. Subsequently, we present the heuristic algorithm Vectors +Retrieval with Similarity and Diversity, VRSD, which features a clear +optimization objective and eliminates the need for preset parameters. VRSD also +achieves a modest reduction in time complexity compared to MMR. Empirical +validation confirms that VRSD significantly outperforms MMR across various +datasets. + +
+
+
+
+
+ + ♻ ☆ Value Residual Learning For Alleviating Attention Concentration In + Transformers + + +
+ Transformers can capture long-range dependencies using self-attention, +allowing tokens to attend to all others directly. However, stacking multiple +attention layers leads to attention concentration. One natural way to address +this issue is to use cross-layer attention, allowing information from earlier +layers to be directly accessible to later layers. However, this approach is +computationally expensive. To address this problem, we propose Transformer with +residual value (ResFormer) which approximates cross-layer attention through +adding a residual connection from the values of the first layer to all +subsequent layers. Based on this method, one variant is the Transformer with +single layer value (SVFormer), where all layers share the same value embedding +from the first layer, reducing the $KV$ cache by nearly 50\%. Comprehensive +empirical evidence demonstrates that ResFormer mitigates attention +concentration problem in deeper layers and enhances representation across most +layers, outperforming the vanilla Transformer, DenseFormer, and NeuTRENO in +training error as well as downstream tasks. Further visualization results +suggest that ResFormer alleviates attention sinks through avoiding value-state +drains. SVFormer trains significantly faster than the vanilla Transformer and +performs better than other methods like GQA and CLA, with performance +influenced by sequence length and cumulative learning rate. + +
+
+
+
+
+ + ♻ ☆ Knowledge Bases in Support of Large Language Models for Processing Web + News + + +
+ Large Language Models (LLMs) have received considerable interest in wide +applications lately. During pre-training via massive datasets, such a model +implicitly memorizes the factual knowledge of trained datasets in its hidden +parameters. However, knowledge held implicitly in parameters often makes its +use by downstream applications ineffective due to the lack of common-sense +reasoning. In this article, we introduce a general framework that permits to +build knowledge bases with an aid of LLMs, tailored for processing Web news. +The framework applies a rule-based News Information Extractor (NewsIE) to news +items for extracting their relational tuples, referred to as knowledge bases, +which are then graph-convoluted with the implicit knowledge facts of news items +obtained by LLMs, for their classification. It involves two lightweight +components: 1) NewsIE: for extracting the structural information of every news +item, in the form of relational tuples; 2) BERTGraph: for graph convoluting the +implicit knowledge facts with relational tuples extracted by NewsIE. We have +evaluated our framework under different news-related datasets for news category +classification, with promising experimental results. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Improving Arabic Multi-Label Emotion Classification using Stacked + Embeddings and Hybrid Loss Function + + +
+ In multi-label emotion classification, particularly for low-resource +languages like Arabic, the challenges of class imbalance and label correlation +hinder model performance, especially in accurately predicting minority +emotions. To address these issues, this study proposes a novel approach that +combines stacked embeddings, meta-learning, and a hybrid loss function to +enhance multi-label emotion classification for the Arabic language. The study +extracts contextual embeddings from three fine-tuned language +models-ArabicBERT, MarBERT, and AraBERT-which are then stacked to form enriched +embeddings. A meta-learner is trained on these stacked embeddings, and the +resulting concatenated representations are provided as input to a Bi-LSTM +model, followed by a fully connected neural network for multi-label +classification. To further improve performance, a hybrid loss function is +introduced, incorporating class weighting, label correlation matrix, and +contrastive learning, effectively addressing class imbalances and improving the +handling of label correlations. Extensive experiments validate the proposed +model's performance across key metrics such as Precision, Recall, F1-Score, +Jaccard Accuracy, and Hamming Loss. The class-wise performance analysis +demonstrates the hybrid loss function's ability to significantly reduce +disparities between majority and minority classes, resulting in a more balanced +emotion classification. An ablation study highlights the contribution of each +component, showing the superiority of the model compared to baseline approaches +and other loss functions. This study not only advances multi-label emotion +classification for Arabic but also presents a generalizable framework that can +be adapted to other languages and domains, providing a significant step forward +in addressing the challenges of low-resource emotion classification tasks. + +
+
+ comment: The paper is submitted in Scientific Reports and is currently under + review +
+
+
+
+
+ + ♻ ☆ Can LLMs Recognize Toxicity? A Structured Investigation Framework and + Toxicity Metric + + +
+ In the pursuit of developing Large Language Models (LLMs) that adhere to +societal standards, it is imperative to detect the toxicity in the generated +text. The majority of existing toxicity metrics rely on encoder models trained +on specific toxicity datasets, which are susceptible to out-of-distribution +(OOD) problems and depend on the dataset's definition of toxicity. In this +paper, we introduce a robust metric grounded on LLMs to flexibly measure +toxicity according to the given definition. We first analyze the toxicity +factors, followed by an examination of the intrinsic toxic attributes of LLMs +to ascertain their suitability as evaluators. Finally, we evaluate the +performance of our metric with detailed analysis. Our empirical results +demonstrate outstanding performance in measuring toxicity within verified +factors, improving on conventional metrics by 12 points in the F1 score. Our +findings also indicate that upstream toxicity significantly influences +downstream metrics, suggesting that LLMs are unsuitable for toxicity +evaluations within unverified factors. + +
+
+ comment: 8 pages long +
+
+
+
+
+ + ♻ ☆ SLIMER-IT: Zero-Shot NER on Italian Language + + +
+ Traditional approaches to Named Entity Recognition (NER) frame the task into +a BIO sequence labeling problem. Although these systems often excel in the +downstream task at hand, they require extensive annotated data and struggle to +generalize to out-of-distribution input domains and unseen entity types. On the +contrary, Large Language Models (LLMs) have demonstrated strong zero-shot +capabilities. While several works address Zero-Shot NER in English, little has +been done in other languages. In this paper, we define an evaluation framework +for Zero-Shot NER, applying it to the Italian language. Furthermore, we +introduce SLIMER-IT, the Italian version of SLIMER, an instruction-tuning +approach for zero-shot NER leveraging prompts enriched with definition and +guidelines. Comparisons with other state-of-the-art models, demonstrate the +superiority of SLIMER-IT on never-seen-before entity tags. + +
+
+
+
+
+ + ♻ ☆ IRCAN: Mitigating Knowledge Conflicts in LLM Generation via Identifying + and Reweighting Context-Aware Neurons NeurIPS 2024 + + +
+ It is widely acknowledged that large language models (LLMs) encode a vast +reservoir of knowledge after being trained on mass data. Recent studies +disclose knowledge conflicts in LLM generation, wherein outdated or incorrect +parametric knowledge (i.e., encoded knowledge) contradicts new knowledge +provided in the context. To mitigate such knowledge conflicts, we propose a +novel framework, IRCAN (Identifying and Reweighting Context-Aware Neurons) to +capitalize on neurons that are crucial in processing contextual cues. +Specifically, IRCAN first identifies neurons that significantly contribute to +context processing, utilizing a context-aware attribution score derived from +integrated gradients. Subsequently, the identified context-aware neurons are +strengthened via reweighting. In doing so, we steer LLMs to generate +context-sensitive outputs with respect to the new knowledge provided in the +context. Extensive experiments conducted across a variety of models and tasks +demonstrate that IRCAN not only achieves remarkable improvements in handling +knowledge conflicts but also offers a scalable, plug-and-play solution that can +be integrated seamlessly with existing models. Our codes are released at +https://github.com/danshi777/IRCAN. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From Cognitive Psychology + + +
+ The cognitive mechanism by which Large Language Models (LLMs) solve +mathematical problems remains a widely debated and unresolved issue. Currently, +there is little interpretable experimental evidence that connects LLMs' +problem-solving with human cognitive psychology. To determine if LLMs possess +human-like mathematical reasoning, we modified the problems used in the human +Cognitive Reflection Test (CRT). Our results show that, even with the use of +Chains of Thought (CoT) prompts, mainstream LLMs, including the latest o1 model +(noted for its reasoning capabilities), have a high error rate when solving +these modified CRT problems. Specifically, the average accuracy rate dropped by +up to 50% compared to the original questions. Further analysis of LLMs' +incorrect answers suggests that they primarily rely on pattern matching from +their training data, which aligns more with human intuition (System 1 thinking) +rather than with human-like reasoning (System 2 thinking). This finding +challenges the belief that LLMs have genuine mathematical reasoning abilities +comparable to humans. As a result, this work may adjust overly optimistic views +on LLMs' progress towards artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ More Expressive Attention with Negative Weights + + +
+ We propose a novel attention mechanism, named Cog Attention, that enables +attention weights to be negative for enhanced expressiveness, which stems from +two key factors: (1) Cog Attention can shift the token deletion and copying +function from a static OV matrix to dynamic QK inner products, with the OV +matrix now focusing more on refinement or modification. The attention head can +simultaneously delete, copy, or retain tokens by assigning them negative, +positive, or minimal attention weights, respectively. As a result, a single +attention head becomes more flexible and expressive. (2) Cog Attention improves +the model's robustness against representational collapse, which can occur when +earlier tokens are over-squashed into later positions, leading to homogeneous +representations. Negative weights reduce effective information paths from +earlier to later tokens, helping to mitigate this issue. We develop +Transformer-like models which use Cog Attention as attention modules, including +decoder-only models for language modeling and U-ViT diffusion models for image +generation. Experiments show that models using Cog Attention exhibit superior +performance compared to those employing traditional softmax attention modules. +Our approach suggests a promising research direction for rethinking and +breaking the entrenched constraints of traditional softmax attention, such as +the requirement for non-negative weights. + +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Multimodal LLM with Knowledge-Intensive + Multimodal ASR EMNLP 2024 + + +
+ Recent advancements in multimodal large language models (MLLMs) have made +significant progress in integrating information across various modalities, yet +real-world applications in educational and scientific domains remain +challenging. This paper introduces the Multimodal Scientific ASR (MS-ASR) task, +which focuses on transcribing scientific conference videos by leveraging visual +information from slides to enhance the accuracy of technical terminologies. +Realized that traditional metrics like WER fall short in assessing performance +accurately, prompting the proposal of severity-aware WER (SWER) that considers +the content type and severity of ASR errors. We propose the Scientific Vision +Augmented ASR (SciVASR) framework as a baseline method, enabling MLLMs to +improve transcript quality through post-editing. Evaluations of +state-of-the-art MLLMs, including GPT-4o, show a 45% improvement over +speech-only baselines, highlighting the importance of multimodal information +integration. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Can Small Language Models Learn, Unlearn, and Retain Noise Patterns? + + +
+ Small Language Models (SLMs) are generally considered more compact versions +of large language models (LLMs). This study investigates the ability of SLMs +with parameters between 1 and 3 billion to learn, retain, and subsequently +eliminate different types of noise present in the data. Four pre-trained SLMs +were utilized for this: Olmo 1B, Qwen1.5 1.8B, Gemma 2B, and Phi2 2.7B. The +models were instruction-tuned on noise-free data and tested using in-context +examples to determine if they could learn noise through examples. Subsequently, +noise patterns were introduced in instruction tuning to evaluate the noise +learning, unlearning, and retention capabilities of the models. Olmo, the +smallest model, was highly sensitive to noise, quickly adapting to noisy +patterns. Phi2 resisted learning character-level and transliteration noise, +likely due to its carefully curated, structured, and high-quality pretraining +data. Gemma excelled with transliteration noise, likely benefiting from its +multilingual pretraining. The findings can be used to develop robust training +strategies for SLMs. + +
+
+
+
+
+ + ♻ ☆ Language Models Encode the Value of Numbers Linearly + + +
+ Large language models (LLMs) have exhibited impressive competence in various +tasks, but their internal mechanisms on mathematical problems are still +under-explored. In this paper, we study a fundamental question: how language +models encode the value of numbers, a basic element in math. To study the +question, we construct a synthetic dataset comprising addition problems and +utilize linear probes to read out input numbers from the hidden states. +Experimental results support the existence of encoded number values in LLMs on +different layers, and these values can be extracted via linear probes. Further +experiments show that LLMs store their calculation results in a similar manner, +and we can intervene the output via simple vector additions, proving the causal +connection between encoded numbers and language model outputs. Our research +provides evidence that LLMs encode the value of numbers linearly, offering +insights for better exploring, designing, and utilizing numeric information in +LLMs. + +
+
+ comment: The code and data are available at + https://github.com/solitaryzero/NumProbe +
+
+
+
+
+ + ♻ ☆ On Context Utilization in Summarization with Large Language Models ACL 2024 + + +
+ Large language models (LLMs) excel in abstractive summarization tasks, +delivering fluent and pertinent summaries. Recent advancements have extended +their capabilities to handle long-input contexts, exceeding 100k tokens. +However, in question answering, language models exhibit uneven utilization of +their input context. They tend to favor the initial and final segments, +resulting in a U-shaped performance pattern concerning where the answer is +located within the input. This bias raises concerns, particularly in +summarization where crucial content may be dispersed throughout the source +document(s). Besides, in summarization, mapping facts from the source to the +summary is not trivial as salient content is usually re-phrased. In this paper, +we conduct the first comprehensive study on context utilization and position +bias in summarization. Our analysis encompasses 6 LLMs, 10 datasets, and 5 +evaluation metrics. We introduce a new evaluation benchmark called MiddleSum on +which we benchmark two alternative inference methods to alleviate position +bias: hierarchical summarization and incremental summarization. Our code and +data can be found here: https://github.com/ntunlp/MiddleSum. + +
+
+ comment: ACL 2024. 9 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Large Language Models for Power Scheduling: A User-Centric Approach + + +
+ While traditional optimization and scheduling schemes are designed to meet +fixed, predefined system requirements, future systems are moving toward +user-driven approaches and personalized services, aiming to achieve high +quality-of-experience (QoE) and flexibility. This challenge is particularly +pronounced in wireless and digitalized energy networks, where users' +requirements have largely not been taken into consideration due to the lack of +a common language between users and machines. The emergence of powerful large +language models (LLMs) marks a radical departure from traditional +system-centric methods into more advanced user-centric approaches by providing +a natural communication interface between users and devices. In this paper, for +the first time, we introduce a novel architecture for resource scheduling +problems by constructing three LLM agents to convert an arbitrary user's voice +request (VRQ) into a resource allocation vector. Specifically, we design an LLM +intent recognition agent to translate the request into an optimization problem +(OP), an LLM OP parameter identification agent, and an LLM OP solving agent. To +evaluate system performance, we construct a database of typical VRQs in the +context of electric vehicle (EV) charging. As a proof of concept, we primarily +use Llama 3 8B. Through testing with different prompt engineering scenarios, +the obtained results demonstrate the efficiency of the proposed architecture. +The conducted performance analysis allows key insights to be extracted. For +instance, having a larger set of candidate OPs to model the real-world problem +might degrade the final performance because of a higher recognition/OP +classification noise level. All results and codes are open source. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Summarization Re-ranking ACL + + +
+ With the rise of task-specific pre-training objectives, abstractive +summarization models like PEGASUS offer appealing zero-shot performance on +downstream summarization tasks. However, the performance of such unsupervised +models still lags significantly behind their supervised counterparts. Similarly +to the supervised setup, we notice a very high variance in quality among +summary candidates from these models while only one candidate is kept as the +summary output. In this paper, we propose to re-rank summary candidates in an +unsupervised manner, aiming to close the performance gap between unsupervised +and supervised models. Our approach improves the unsupervised PEGASUS by up to +7.27% and ChatGPT by up to 6.86% relative mean ROUGE across four widely-adopted +summarization benchmarks ; and achieves relative gains of 7.51% (up to 23.73% +from XSum to WikiHow) averaged over 30 zero-shot transfer setups (finetuning on +a dataset, evaluating on another). + +
+
+ comment: 9 pages, 1 figure, 10 tables, 23 appendix pages, ACL Findings 2023 +
+
+
+
+
+ + ♻ ☆ Towards Objective and Unbiased Decision Assessments with LLM-Enhanced + Hierarchical Attention Networks + + +
+ How objective and unbiased are we while making decisions? This work +investigates cognitive bias identification in high-stake decision making +process by human experts, questioning its effectiveness in real-world settings, +such as candidates assessments for university admission. We begin with a +statistical analysis assessing correlations among different decision points +in the current process, which discovers discrepancies that imply +cognitive bias and inconsistency in decisions. This motivates our exploration +of bias-aware AI-augmented workflow that surpasses human judgment. We propose +BGM-HAN, an enhanced Hierarchical Attention Network with Byte-Pair Encoding, +Gated Residual Connections and Multi-Head Attention. Using it as a backbone +model, we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, +which simulates real-world decision-making. In our experiments, both the +proposed model and the agentic workflow significantly improve on both human +judgment and alternative models, validated with real-world data. + +
+
+ comment: Source code is available at: https://github.com/junhua/bgm-han +
+
+
+
+
+ + ♻ ☆ Characterization of Political Polarized Users Attacked by Language + Toxicity on Twitter + + +
+ Understanding the dynamics of language toxicity on social media is important +for us to investigate the propagation of misinformation and the development of +echo chambers for political scenarios such as U.S. presidential elections. +Recent research has used large-scale data to investigate the dynamics across +social media platforms. However, research on the toxicity dynamics is not +enough. This study aims to provide a first exploration of the potential +language toxicity flow among Left, Right and Center users. Specifically, we aim +to examine whether Left users were easier to be attacked by language toxicity. +In this study, more than 500M Twitter posts were examined. It was discovered +that Left users received much more toxic replies than Right and Center users. + +
+
+
+
+
+ + ♻ ☆ Problematic Tokens: Tokenizer Bias in Large Language Models + + +
+ Recent advancements in large language models (LLMs), such as GPT-4 and GPT-4o, +have shown exceptional performance, especially in languages with abundant +resources like English, thanks to extensive datasets that ensure robust +training. Conversely, these models exhibit limitations when processing +under-resourced languages such as Chinese and Korean, where issues including +hallucinatory responses remain prevalent. This paper traces the roots of these +disparities to the tokenization process inherent to these models. Specifically, +it explores how the tokenizer's vocabulary, often used to speed up the +tokenization process and reduce tokens but constructed independently of the +actual model training data, inadequately represents non-English languages. This +misrepresentation results in the propagation of under-trained or untrained +tokens, which perpetuate biases and pose serious concerns related to data +security and ethical standards. We aim to dissect the tokenization mechanics of +GPT-4o, illustrating how its simplified token-handling methods amplify these +risks and offer strategic solutions to mitigate associated security and ethical +issues. Through this study, we emphasize the critical need to rethink +tokenization frameworks to foster more equitable and secure AI technologies. +The code and data are available at https://github.com/yeyimilk/LLMGPT4o + +
+
+ comment: 11th IEEE Special session on Privacy and Security of Big Data (PSBD + 2024) +
+
+
+
+
+ + ♻ ☆ From Instance Training to Instruction Learning: Task Adapters Generation + from Instructions NeurIPS 2024 + + +
+ Large language models (LLMs) have acquired the ability to solve general tasks +by utilizing instruction finetuning (IFT). However, IFT still relies heavily on +instance training of extensive task data, which greatly limits the adaptability +of LLMs to real-world scenarios where labeled task instances are scarce and +broader task generalization becomes paramount. Contrary to LLMs, humans acquire +skills and complete tasks not merely through repeated practice but also by +understanding and following instructional guidelines. This paper is dedicated +to simulating human learning to address the shortcomings of instance training, +focusing on instruction learning to enhance cross-task generalization. Within +this context, we introduce Task Adapters Generation from Instructions (TAGI), +which automatically constructs the task-specific model in a parameter +generation manner based on the given task instructions without retraining for +unseen tasks. Specifically, we utilize knowledge distillation to enhance the +consistency between TAGI developed through Learning with Instruction and +task-specific models developed through Training with Instance, by aligning the +labels, output logits, and adapter parameters between them. TAGI is endowed +with cross-task generalization capabilities through a two-stage training +process that includes hypernetwork pretraining and finetuning. We evaluate TAGI +on the Super-Natural Instructions and P3 datasets. The experimental results +demonstrate that TAGI can match or even outperform traditional meta-trained +models and other hypernetwork models, while significantly reducing +computational requirements. + +
+
+ comment: accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Rewarding with Prompt Optimization Enables Tuning-free + Self-Alignment of Language Models EMNLP 2024 + + +
+ Aligning Large Language Models (LLMs) traditionally relies on costly training +and human preference annotations. Self-alignment seeks to reduce these expenses +by enabling models to align themselves. To further lower costs and achieve +alignment without any expensive tuning or annotations, we introduce a new +tuning-free approach for self-alignment, Dynamic Rewarding with Prompt +Optimization (DRPO). Our approach leverages a search-based optimization +framework that allows LLMs to iteratively self-improve and craft the optimal +alignment instructions, all without additional training or human intervention. +The core of DRPO is a dynamic rewarding mechanism, which identifies and +rectifies model-specific alignment weaknesses, allowing LLMs to adapt +efficiently to diverse alignment challenges. Empirical evaluations on eight +recent LLMs, both open- and closed-sourced, demonstrate that DRPO significantly +enhances alignment performance, with base models outperforming their +SFT/RLHF-tuned counterparts. Moreover, the prompts automatically optimized by +DRPO surpass those curated by human experts, further validating the +effectiveness of our approach. Our findings highlight the great potential of +current LLMs to achieve adaptive self-alignment through inference-time +optimization, complementing tuning-based alignment methods. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Not All Heads Matter: A Head-Level KV Cache Compression Method with + Integrated Retrieval and Reasoning + + +
+ Key-Value (KV) caching is a common technique to enhance the computational +efficiency of Large Language Models (LLMs), but its memory overhead grows +rapidly with input length. Prior work has shown that not all tokens are equally +important for text generation, proposing layer-level KV cache compression to +selectively retain key information. Recognizing the distinct roles of attention +heads in generation, we propose HeadKV, a head-level KV cache compression +method, and HeadKV-R2, which leverages a novel contextual reasoning ability +estimation for compression. Our approach operates at the level of individual +heads, estimating their importance for contextual QA tasks that require both +retrieval and reasoning capabilities. Extensive experiments across diverse +benchmarks (LongBench, LooGLE), model architectures (e.g., Llama-3-8B-Instruct, +Mistral-7B-Instruct), and long-context abilities tests demonstrate that our +head-level KV cache compression significantly outperforms strong baselines, +particularly in low-resource settings (KV size = 64 & 128). Notably, our method +retains just 1.5% of the KV cache while achieving 97% of the performance of the +full KV cache on the contextual question answering benchmark. Code is +available at https://github.com/FYYFU/HeadKV + +
+
+ comment: 18pages +
+
+
+
+
+ + ♻ ☆ LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation + + +
+ CLIP is one of the most important multimodal foundational models today. What +powers CLIP's capabilities? The rich supervision signals provided by natural +language, the carrier of human knowledge, shape a powerful cross-modal +representation space. However, with the rapid advancements in large language +models (LLMs) like GPT-4 and LLaMA, the boundaries of language comprehension and +generation are continually being pushed. This raises an intriguing question: +can the capabilities of LLMs be harnessed to further improve multimodal +representation learning? The potential benefits of incorporating LLMs into CLIP +are clear. LLMs' strong textual understanding can fundamentally improve CLIP's +ability to handle image captions, drastically enhancing its ability to process +long and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs +are trained on a vast corpus of text, possessing open-world knowledge. This +allows them to expand on caption information during training, increasing the +efficiency of the learning process. In this paper, we propose LLM2CLIP, a novel +approach that embraces the power of LLMs to unlock CLIP's potential. By +fine-tuning the LLM in the caption space with contrastive learning, we extract +its textual capabilities into the output embeddings, significantly improving +the output layer's textual discriminability. We then design an efficient +training process where the fine-tuned LLM acts as a powerful teacher for CLIP's +visual encoder. Thanks to the LLM's presence, we can now incorporate longer and +more complex captions without being restricted by vanilla CLIP's text encoder's +context window and ability limitations. Our experiments demonstrate that this +approach brings substantial improvements in cross-modal tasks. + +
+
+
+
+
+ + ♻ ☆ Continuous Rating as Reliable Human Evaluation of Simultaneous Speech + Translation + + +
+ Simultaneous speech translation (SST) can be evaluated on simulated online +events where human evaluators watch subtitled videos and continuously express +their satisfaction by pressing buttons (so-called Continuous Rating). +Continuous Rating is easy to collect, but little is known about its +reliability, or relation to comprehension of foreign language document by SST +users. In this paper, we contrast Continuous Rating with factual questionnaires +on judges with different levels of source language knowledge. Our results show +that Continuous Rating is an easy and reliable SST quality assessment if the +judges have at least limited knowledge of the source language. Our study +indicates users' preferences on subtitle layout and presentation style and, +most importantly, provides significant evidence that users with advanced +source language knowledge prefer low latency over fewer re-translations. + +
+
+ comment: Published at WMT 2022: https://aclanthology.org/2022.wmt-1.9/ +
+
+
+
+
+ + ♻ ☆ A Cognitive Architecture for Machine Consciousness and Artificial + Superintelligence: Thought Is Structured by the Iterative Updating of Working + Memory + + +
+ This article provides an analytical framework for how to simulate human-like +thought processes within a computer. It describes how attention and memory +should be structured, updated, and utilized to search for associative additions +to the stream of thought. The focus is on replicating the dynamics of the +mammalian working memory system, which features two forms of persistent +activity: sustained firing (preserving information on the order of seconds) and +synaptic potentiation (preserving information from minutes to hours). The +article uses a series of figures to systematically demonstrate how the +iterative updating of these working memory stores provides functional +organization to behavior, cognition, and awareness. + In a machine learning implementation, these two memory stores should be +updated continuously and in an iterative fashion. This means each state should +preserve a proportion of the coactive representations from the state before it +(where each representation is an ensemble of neural network nodes). This makes +each state a revised iteration of the preceding state and causes successive +configurations to overlap and blend with respect to the information they +contain. Thus, the set of concepts in working memory will evolve gradually and +incrementally over time. Transitions between states happen as persistent +activity spreads activation energy throughout the hierarchical network, +searching long-term memory for the most appropriate representation to be added +to the global workspace. The result is a chain of associatively linked +intermediate states capable of advancing toward a solution or goal. Iterative +updating is conceptualized here as an information processing strategy, a model +of working memory, a theory of consciousness, and an algorithm for designing +and programming artificial intelligence (AI, AGI, and ASI). + +
+
+ comment: 88 pages and 53 figures +
+
+
+
+
+ + ♻ ☆ A Review of Large Language Models and Autonomous Agents in Chemistry + + +
+ Large language models (LLMs) have emerged as powerful tools in chemistry, +significantly impacting molecule design, property prediction, and synthesis +optimization. This review highlights LLM capabilities in these domains and +their potential to accelerate scientific discovery through automation. We also +review LLM-based autonomous agents: LLMs with a broader set of tools to +interact with their surrounding environment. These agents perform diverse tasks +such as paper scraping, interfacing with automated laboratories, and synthesis +planning. As agents are an emerging topic, we extend the scope of our review of +agents beyond chemistry and discuss across any scientific domains. This review +covers the recent history, current capabilities, and design of LLMs and +autonomous agents, addressing specific challenges, opportunities, and future +directions in chemistry. Key challenges include data quality and integration, +model interpretability, and the need for standard benchmarks, while future +directions point towards more sophisticated multi-modal agents and enhanced +collaboration between agents and experimental methods. Due to the quick pace of +this field, a repository has been built to keep track of the latest studies: +https://github.com/ur-whitelab/LLMs-in-science. + +
+
+
+
+
+ + ♻ ☆ CLIMB: A Benchmark of Clinical Bias in Large Language Models + + +
+ Large language models (LLMs) are increasingly applied to clinical +decision-making. However, their potential to exhibit bias poses significant +risks to clinical equity. Currently, there is a lack of benchmarks that +systematically evaluate such clinical bias in LLMs. While in downstream tasks, +some biases of LLMs can be avoided such as by instructing the model to answer +"I'm not sure...", the internal bias hidden within the model still lacks deep +studies. We introduce CLIMB (shorthand for A Benchmark of Clinical Bias in +Large Language Models), a pioneering comprehensive benchmark to evaluate both +intrinsic (within LLMs) and extrinsic (on downstream tasks) bias in LLMs for +clinical decision tasks. Notably, for intrinsic bias, we introduce a novel +metric, AssocMAD, to assess the disparities of LLMs across multiple demographic +groups. Additionally, we leverage counterfactual intervention to evaluate +extrinsic bias in a task of clinical diagnosis prediction. Our experiments +across popular and medically adapted LLMs, particularly from the Mistral and +LLaMA families, unveil prevalent behaviors with both intrinsic and extrinsic +bias. This work underscores the critical need to mitigate clinical bias and +sets a new standard for future evaluations of LLMs' clinical bias. + +
+
+
+
+
+ + ♻ ☆ Security and Privacy Challenges of Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have demonstrated extraordinary capabilities and +contributed to multiple fields, such as generating and summarizing text, +language translation, and question-answering. Nowadays, LLM is becoming a very +popular tool in computerized language processing tasks, with the capability to +analyze complicated linguistic patterns and provide relevant and appropriate +responses depending on the context. While offering significant advantages, +these models are also vulnerable to security and privacy attacks, such as +jailbreaking attacks, data poisoning attacks, and Personally Identifiable +Information (PII) leakage attacks. This survey provides a thorough review of +the security and privacy challenges of LLMs for both training data and users, +along with the application-based risks in various domains, such as +transportation, education, and healthcare. We assess the extent of LLM +vulnerabilities, investigate emerging security and privacy attacks for LLMs, +and review the potential defense mechanisms. Additionally, the survey outlines +existing research gaps in this domain and highlights future research +directions. + +
+
+
+
+
+ + ♻ ☆ GPT-4V Cannot Generate Radiology Reports Yet ML4H + + +
+ GPT-4V's purported strong multimodal abilities raise interest in using it to +automate radiology report writing, but thorough evaluations are lacking. In +this work, we perform a systematic evaluation of GPT-4V in generating radiology +reports on two chest X-ray report datasets: MIMIC-CXR and IU X-Ray. We attempt +to directly generate reports using GPT-4V through different prompting +strategies and find that it fails terribly in both lexical metrics and clinical +efficacy metrics. To understand the low performance, we decompose the task into +two steps: 1) the medical image reasoning step of predicting medical condition +labels from images; and 2) the report synthesis step of generating reports from +(groundtruth) conditions. We show that GPT-4V's performance in image reasoning +is consistently low across different prompts. In fact, the distributions of +model-predicted labels remain constant regardless of which groundtruth +conditions are present on the image, suggesting that the model is not +interpreting chest X-rays meaningfully. Even when given groundtruth conditions +in report synthesis, its generated reports are less correct and less +natural-sounding than a finetuned LLaMA-2. Altogether, our findings cast doubt +on the viability of using GPT-4V in a radiology workflow. + +
+
+ comment: 24 pages, 3 figures, code: + https://github.com/ChicagoHAI/cxr-eval-gpt-4v Findings paper presented at + Machine Learning for Health (ML4H) symposium 2024, December 15-16, 2024, + Vancouver, Canada, 26 pages +
+
+
+
+
+ + ♻ ☆ Methods of Automatic Matrix Language Determination for Code-Switched + Speech EMNLP 2024 + + +
+ Code-switching (CS) is the process of speakers interchanging between two or +more languages which in the modern world becomes increasingly common. In order +to better describe CS speech the Matrix Language Frame (MLF) theory introduces +the concept of a Matrix Language, which is the language that provides the +grammatical structure for a CS utterance. In this work the MLF theory was used +to develop systems for Matrix Language Identity (MLID) determination. The MLID +of English/Mandarin and English/Spanish CS text and speech was compared to +acoustic language identity (LID), which is a typical way to identify a language +in monolingual utterances. MLID predictors from audio show higher correlation +with the textual principles than LID in all cases while also outperforming LID +in an MLID recognition task based on F1 macro (60%) and correlation score +(0.38). This novel approach has identified that non-English languages (Mandarin +and Spanish) are preferred over the English language as the ML contrary to the +monolingual choice of LID. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 50 + +
+
+
+ + ☆ Experience-based Subproblem Planning for Multi-Robot Motion Planning + + +
+ Multi-robot systems enhance efficiency and productivity across various +applications, from manufacturing to surveillance. While single-robot motion +planning has improved by using databases of prior solutions, extending this +approach to multi-robot motion planning (MRMP) presents challenges due to the +increased complexity and diversity of tasks and configurations. Recent discrete +methods have attempted to address this by focusing on relevant +lower-dimensional subproblems, but they are inadequate for complex scenarios +like those involving manipulator robots. To overcome this, we propose a novel +approach that leverages experience-based planning by constructing and utilizing +databases of solutions for smaller sub-problems. By focusing on interactions +between fewer robots, our method reduces the need for exhaustive database +growth, allowing for efficient handling of more complex MRMP scenarios. We +validate our approach with experiments involving both mobile and manipulator +robots, demonstrating significant improvements over existing methods in +scalability and planning efficiency. Our contributions include a rapidly +constructed database for low-dimensional MRMP problems, a framework for +applying these solutions to larger problems, and experimental validation with +up to 32 mobile and 16 manipulator robots. + +
+
+
+
+
+ + ☆ Goal-oriented Semantic Communication for Robot Arm Reconstruction in + Digital Twin: Feature and Temporal Selections + + +
+ As one of the most promising technologies in industry, the Digital Twin (DT) +facilitates real-time monitoring and predictive analysis for real-world systems +by precisely reconstructing virtual replicas of physical entities. However, +this reconstruction faces unprecedented challenges due to the ever-increasing +communication overhead, especially for digital robot arm reconstruction. To +this end, we propose a novel goal-oriented semantic communication (GSC) +framework to extract the GSC information for the robot arm reconstruction task +in the DT, with the aim of minimising the communication load under the strict +and relaxed reconstruction error constraints. Unlike the traditional +reconstruction framework that periodically transmits a reconstruction message +for real-time DT reconstruction, our framework implements a feature selection +(FS) algorithm to extract the semantic information from the reconstruction +message, and a deep reinforcement learning-based temporal selection algorithm +to selectively transmit the semantic information over time. We validate our +proposed GSC framework through both Pybullet simulations and lab experiments +based on the Franka Research 3 robot arm. For a range of distinct robotic +tasks, simulation results show that our framework can reduce the communication +load by at least 59.5% under strict reconstruction error constraints and 80% +under relaxed reconstruction error constraints, compared with the traditional +communication framework. Also, experimental results confirm the effectiveness +of our framework, where the communication load is reduced by 53% in the strict +constraint case and 74% in the relaxed constraint case. The demo is available at: +https://youtu.be/2OdeHKxcgnk. + +
+
+ comment: Submitted to IEEE for potential publication +
+
+
+
+
+ + ☆ Offline Adaptation of Quadruped Locomotion using Diffusion Models + + +
+ We present a diffusion-based approach to quadrupedal locomotion that +simultaneously addresses the limitations of learning and interpolating between +multiple skills (modes) and of offline adapting to new locomotion behaviours +after training. This is the first framework to apply classifier-free guided +diffusion to quadruped locomotion and demonstrate its efficacy by extracting +goal-conditioned behaviour from an originally unlabelled dataset. We show that +these capabilities are compatible with a multi-skill policy and can be applied +with little modification and minimal compute overhead, i.e., running entirely +on the robot's onboard CPU. We verify the validity of our approach with hardware +experiments on the ANYmal quadruped platform. + +
+
+
+
+
+ + ☆ LUDO: Low-Latency Understanding of Highly Deformable Objects using Point + Cloud Occupancy Functions + + +
+ Accurately determining the shape and location of internal structures within +deformable objects is crucial for medical tasks that require precise targeting, +such as robotic biopsies. We introduce LUDO, a method for accurate low-latency +understanding of deformable objects. LUDO reconstructs objects in their +deformed state, including their internal structures, from a single-view point +cloud observation in under 30 ms using occupancy networks. We demonstrate +LUDO's abilities for autonomous targeting of internal regions of interest +(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty +estimates and explainability for its predictions, both of which are important +in safety-critical applications such as surgical interventions. We evaluate +LUDO in real-world robotic experiments, achieving a success rate of 98.9% for +puncturing various ROIs inside highly deformable objects. LUDO demonstrates the +potential to interact with deformable objects without the need for deformable +registration methods. + +
+
+
+
+
+ + ☆ Voxeland: Probabilistic Instance-Aware Semantic Mapping with + Evidence-based Uncertainty Quantification + + +
+ Robots in human-centered environments require accurate scene understanding to +perform high-level tasks effectively. This understanding can be achieved +through instance-aware semantic mapping, which involves reconstructing elements +at the level of individual instances. Neural networks, the de facto solution +for scene understanding, still face limitations such as overconfident incorrect +predictions with out-of-distribution objects or generating inaccurate +masks. Placing excessive reliance on these predictions makes the reconstruction +susceptible to errors, reducing the robustness of the resulting maps and +hampering robot operation. In this work, we propose Voxeland, a probabilistic +framework for incrementally building instance-aware semantic maps. Inspired by +the Theory of Evidence, Voxeland treats neural network predictions as +subjective opinions regarding map instances at both geometric and semantic +levels. These opinions are aggregated over time to form evidences, which are +formalized through a probabilistic model. This enables us to quantify +uncertainty in the reconstruction process, facilitating the identification of +map areas requiring improvement (e.g. reobservation or reclassification). As +one strategy to exploit this, we incorporate a Large Vision-Language Model +(LVLM) to perform semantic level disambiguation for instances with high +uncertainty. Results from the standard benchmarking on the publicly available +SceneNN dataset demonstrate that Voxeland outperforms state-of-the-art methods, +highlighting the benefits of incorporating and leveraging both instance- and +semantic-level uncertainties to enhance reconstruction robustness. This is +further validated through qualitative experiments conducted on the real-world +ScanNet dataset. + +
+
+
+
+
+ + ☆ Energy Optimal Traversal Between Hover Waypoints for Lift+Cruise + Electric Powered Aircraft + + +
+ Advanced Air Mobility aircraft require energy efficient flight plans to be +economically viable. This paper defines minimum energy direct trajectories +between waypoints for Lift+Cruise electric Vertical Take-Off and Landing +(eVTOL) aircraft. Energy consumption is optimized over accelerated and cruise +flight profiles with consideration of mode transitions. Because eVTOL +operations start and end in hover for vertical take-off and landing, hover +waypoints are utilized. Energy consumption is modeled as a function of airspeed +for each flight mode, providing the basis to prove energy optimality for +multi-mode traversal. Wind magnitude and direction dictate feasibility of +straight-line traversal because Lift+Cruise aircraft point into the relative +wind direction while hovering but also have a maximum heading rate constraint. +Energy and power use for an experimentally validated QuadPlane small eVTOL +aircraft are characterized with respect to airspeed and acceleration in all +flight modes. Optimal QuadPlane traversals are presented. Constraints on +acceleration and wind are derived for straight-line QuadPlane traversal. +Results show an optimal QuadPlane $500m$ traversal between hover waypoints +saves $71\%$ energy compared to pure vertical flight traversal for a +representative case study with a direct $4m/s$ crosswind. Energy optimal eVTOL +direct trajectory definition with transitions to and from hover is novel to +this work. Future work should model three-dimensional flight and wind as well +as optimize maneuver primitives when required. + +
+
+ comment: 34 pages, 17 figures and 5 tables +
+
+
+
+
+ + ☆ Robot See, Robot Do: Imitation Reward for Noisy Financial Environments + + +
+ The sequential nature of decision-making in financial asset trading aligns +naturally with the reinforcement learning (RL) framework, making RL a common +approach in this domain. However, the low signal-to-noise ratio in financial +markets results in noisy estimates of environment components, including the +reward function, which hinders effective policy learning by RL agents. Given +the critical importance of reward function design in RL problems, this paper +introduces a novel and more robust reward function by leveraging imitation +learning, where a trend labeling algorithm acts as an expert. We integrate +imitation (expert's) feedback with reinforcement (agent's) feedback in a +model-free RL algorithm, effectively embedding the imitation learning problem +within the RL paradigm to handle the stochasticity of reward signals. Empirical +results demonstrate that this novel approach improves financial performance +metrics compared to traditional benchmarks and RL agents trained solely using +reinforcement feedback. + +
+
+
+
+
+ + ☆ On the Application of Model Predictive Control to a Weighted Coverage + Path Planning Problem + + +
+ This paper considers the application of Model Predictive Control (MPC) to a +weighted coverage path planning (WCPP) problem. The problem appears in a wide +range of practical applications, such as search and rescue (SAR) missions. The +basic setup is that one (or multiple) agents can move around a given search +space and collect rewards from a given spatial distribution. Unlike an +artificial potential field, each reward can only be collected once. In contrast +to a Traveling Salesman Problem (TSP), the agent moves in a continuous space. +Moreover, it is not obliged to cover all locations and/or may return to +previously visited locations. The WCPP problem is tackled by a new Model +Predictive Control (MPC) formulation with so-called Coverage Constraints (CCs). +It is shown that the solution becomes more effective if the solver is +initialized with a TSP-based heuristic. With and without this initialization, +the proposed MPC approach clearly outperforms a naive MPC formulation, as +demonstrated in a small simulation study. + +
+
+
+
+
+ + ☆ Precision-Focused Reinforcement Learning Model for Robotic Object + Pushing + + +
+ Non-prehensile manipulation, such as pushing objects to a desired target +position, is an important skill for robots to assist humans in everyday +situations. However, the task is challenging due to the large variety of +objects with different and sometimes unknown physical properties, such as +shape, size, mass, and friction. This can lead to the object overshooting its +target position, requiring fast corrective movements of the robot around the +object, especially in cases where objects need to be precisely pushed. In this +paper, we improve the state-of-the-art by introducing a new memory-based +vision-proprioception RL model to push objects more precisely to target +positions using fewer corrective movements. + +
+
+
+
+
+ + ☆ Lo-MARVE: A Low Cost Autonomous Underwater Vehicle for Marine + Exploration + + +
+ This paper presents Low-cost Marine Autonomous Robotic Vehicle Explorer +(Lo-MARVE), a novel autonomous underwater vehicle (AUV) designed to provide a +low cost solution for underwater exploration and environmental monitoring in +shallow water environments. Lo-MARVE offers a cost-effective alternative to +existing AUVs, featuring a modular design, low-cost sensors, and wireless +communication capabilities. The total cost of Lo-MARVE is approximately EUR +500. Lo-MARVE is developed using the Raspberry Pi 4B microprocessor, with +control software written in Python. The proposed AUV was validated through +field testing outside of a laboratory setting, in the freshwater environment of +the River Corrib in Galway, Ireland. This demonstrates its ability to navigate +autonomously, collect data, and communicate effectively outside of a controlled +laboratory setting. The successful deployment of Lo-MARVE in a real-world +environment validates its proof of concept. + +
+
+ comment: This paper was presented at the 12th International Conference on + Control, Mechatronics and Automation (ICCMA 2024), held in London, UK, from + November 11-13, 2024 +
+
+
+
+
+ + ☆ NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied + Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN), as a widely discussed research +direction in embodied intelligence, aims to enable embodied agents to navigate +in complicated visual environments through natural language commands. Most +existing VLN methods focus on indoor ground robot scenarios. However, when +applied to UAV VLN in outdoor urban scenes, they face two significant +challenges. First, urban scenes contain numerous objects, which makes it +challenging to match fine-grained landmarks in images with complex textual +descriptions of these landmarks. Second, overall environmental information +encompasses multiple modal dimensions, and the diversity of representations +significantly increases the complexity of the encoding process. To address +these challenges, we propose NavAgent, the first urban UAV embodied navigation +model driven by a large Vision-Language Model. NavAgent undertakes navigation +tasks by synthesizing multi-scale environmental information, including +topological maps (global), panoramas (medium), and fine-grained landmarks +(local). Specifically, we utilize GLIP to build a visual recognizer for +landmarks capable of identifying and linguisticizing fine-grained landmarks. +Subsequently, we develop a dynamically growing scene topology map that integrates +environmental information and employ Graph Convolutional Networks to encode +global environmental data. In addition, to train the visual recognizer for +landmarks, we develop NavAgent-Landmark2K, the first fine-grained landmark +dataset for real urban street scenes. In experiments conducted on the Touchdown +and Map2seq datasets, NavAgent outperforms strong baseline models. The code and +dataset will be released to the community to facilitate the exploration and +development of outdoor VLN. + +
+
+
+
+
+ + ☆ Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space + Exploration by Reinforcement Learning Agent ICRA 2025 + + +
+ Grasping by a robot in unstructured environments is deemed a critical +challenge because of the requirement for effective adaptation to a wide +variation in object geometries, material properties, and other environmental +factors. In this paper, we propose a novel framework for robotic grasping based +on the idea of compressing high-dimensional target and gripper features in a +common latent space using a set of autoencoders. Our approach simplifies +grasping by using three autoencoders dedicated to the target, the gripper, and +a third one that fuses their latent representations. This allows the RL agent +to achieve higher learning rates at the initial stages of exploration of a new +environment, as well as at non-zero shot grasp attempts. The agent explores the +latent space of the third autoencoder for better quality grasp without explicit +reconstruction of objects. By implementing the PoWER algorithm into the RL +training process, updates on the agent's policy will be made through the +perturbation in the reward-weighted latent space. The successful exploration +efficiently constrains both position and pose integrity for feasible executions +of grasps. We evaluate our system on a diverse set of objects, demonstrating +the high success rate in grasping with minimum computational overhead. We found +that our approach enhances the adaptation of the RL agent by more than 35% in +simulation experiments. + +
+
+ comment: Submitted for review at IEEE ICRA 2025 +
+
+
+
+
+ + ☆ ACROSS: A Deformation-Based Cross-Modal Representation for Robotic + Tactile Perception ICRA2025 + + +
+ Tactile perception is essential for human interaction with the environment +and is becoming increasingly crucial in robotics. Tactile sensors like the +BioTac mimic human fingertips and provide detailed interaction data. Despite +its utility in applications like slip detection and object identification, this +sensor is now deprecated, making many existing valuable datasets obsolete. +However, recreating similar datasets with newer sensor technologies is both +tedious and time-consuming. Therefore, it is crucial to adapt these existing +datasets for use with new setups and modalities. In response, we introduce +ACROSS, a novel framework for translating data between tactile sensors by +exploiting sensor deformation information. We demonstrate the approach by +translating BioTac signals into the DIGIT sensor. Our framework consists of +first converting the input signals into 3D deformation meshes. We then +transition from the 3D deformation mesh of one sensor to the mesh of another, +and finally convert the generated 3D deformation mesh into the corresponding +output space. We demonstrate our approach to the most challenging problem of +going from a low-dimensional tactile representation to a high-dimensional one. +In particular, we transfer the tactile signals of a BioTac sensor to DIGIT +tactile images. Our approach enables the continued use of valuable datasets and +the exchange of data between groups with different setups. + +
+
+ comment: Paper Submitted to ICRA2025. arXiv admin note: text overlap with + arXiv:2410.14310 +
+
+
+
+
+ + ☆ Learning Robust Grasping Strategy Through Tactile Sensing and Adaption + Skill + + +
+ Robust grasping represents an essential task in robotics, necessitating +tactile feedback and reactive grasping adjustments for robust grasping of +objects. Previous research has extensively combined tactile sensing with +grasping, primarily relying on rule-based approaches, frequently neglecting +post-grasping difficulties such as external disruptions or inherent +uncertainties of the object's physics and geometry. To address these +limitations, this paper introduces a human-demonstration-based adaptive +grasping policy based on tactile sensing, which aims to achieve robust gripping while +resisting disturbances to maintain grasp stability. Our trained model +generalizes to daily objects with seven different sizes, shapes, and textures. +Experimental results demonstrate that our method performs well in dynamic and +force interaction tasks and exhibits excellent generalization ability. + +
+
+
+
+
+ + ☆ A Cost-effective, Stand-alone, and Real-time TinyML-Based Gait Diagnosis + Unit Aimed at Lower-limb Robotic Prostheses and Exoskeletons + + +
+ Robotic prostheses and exoskeletons can do wonders compared to their +non-robotic counterparts. However, in a cost-soaring world where 1 in every 10 +patients has access to normal medical prostheses, access to advanced ones is, +unfortunately, extremely limited especially due to their high cost, a +significant portion of which is contributed to by the diagnosis and controlling +units. However, affordability is often not a major concern for developing such +devices as with cost reduction, performance is also found to be reduced due to +the cost vs. performance trade-off. Considering the gravity of such +circumstances, the goal of this research was to propose an affordable wearable +real-time gait diagnosis unit (GDU) aimed at robotic prostheses and +exoskeletons. As a proof of concept, it has also developed the GDU prototype +which leveraged TinyML to run two parallel quantized int8 models on an ESP32 +NodeMCU development board (7.30 USD) to effectively classify five gait +scenarios (idle, walk, run, hopping, and skip) and generate an anomaly score +based on acceleration data received from two attached IMUs. The developed +wearable gait diagnosis stand-alone unit could be fitted to any prosthesis or +exoskeleton and could effectively classify the gait scenarios with an overall +accuracy of 92% and provide anomaly scores within 95-96 ms with only 3 seconds +of gait data in real-time. + +
+
+
+
+
+ + ☆ Learning Dynamic Cognitive Map with Autonomous Navigation + + +
+ Inspired by animal navigation strategies, we introduce a novel computational +model to navigate and map a space rooted in biologically inspired principles. +Animals exhibit extraordinary navigation prowess, harnessing memory, +imagination, and strategic decision-making to traverse complex and aliased +environments adeptly. Our model aims to replicate these capabilities by +incorporating a dynamically expanding cognitive map over predicted poses within +an Active Inference framework, enhancing our agent's generative model +plasticity to novelty and environmental changes. Through structure learning and +active inference navigation, our model demonstrates efficient exploration and +exploitation, dynamically expanding its model capacity in response to +anticipated novel un-visited locations and updating the map given new evidence +contradicting previous beliefs. Comparative analyses in mini-grid environments +with the Clone-Structured Cognitive Graph model (CSCG), which shares similar +objectives, highlight our model's ability to rapidly learn environmental +structures within a single episode, with minimal navigation overlap. Our model +achieves this without prior knowledge of observation and world dimensions, +underscoring its robustness and efficacy in navigating intricate environments. + +
+
+ comment: under submission at Frontiers Computer Neuroscience +
+
+
+
+
+ + ☆ 3D Multi-Object Tracking with Semi-Supervised GRU-Kalman Filter + + +
+ 3D Multi-Object Tracking (MOT), a fundamental component of environmental +perception, is essential for intelligent systems like autonomous driving and +robotic sensing. Although Tracking-by-Detection frameworks have demonstrated +excellent performance in recent years, their application in real-world +scenarios faces significant challenges. Object movement in complex environments +is often highly nonlinear, while existing methods typically rely on linear +approximations of motion. Furthermore, system noise is frequently modeled as a +Gaussian distribution, which fails to capture the true complexity of the noise +dynamics. These oversimplified modeling assumptions can lead to significant +reductions in tracking precision. To address this, we propose a GRU-based MOT +method, which introduces a learnable Kalman filter into the motion module. This +approach is able to learn object motion characteristics through data-driven +learning, thereby avoiding the need for manual model design and model error. At +the same time, to avoid abnormal supervision caused by the wrong association +between annotations and trajectories, we design a semi-supervised learning +strategy to accelerate the convergence speed and improve the robustness of the +model. Evaluation experiments on the nuScenes and Argoverse2 datasets +demonstrate that our system exhibits superior performance and significant +potential compared to traditional TBD methods. + +
+
+
+
+
+ + ☆ BAMAX: Backtrack Assisted Multi-Agent Exploration using Reinforcement + Learning + + +
+ Autonomous robots collaboratively exploring an unknown environment is still +an open problem. The problem has its roots in coordination among non-stationary +agents, each with only a partial view of information. The problem is compounded +when the multiple robots must completely explore the environment. In this +paper, we introduce Backtrack Assisted Multi-Agent Exploration using +Reinforcement Learning (BAMAX), a method for collaborative exploration in +multi-agent systems which attempts to explore an entire virtual environment. As +in the name, BAMAX leverages backtrack assistance to enhance the performance of +agents in exploration tasks. To evaluate BAMAX against traditional approaches, +we present the results of experiments conducted across multiple hexagonal +shaped grid sizes, ranging from 10x10 to 60x60. The results demonstrate that +BAMAX outperforms other methods in terms of faster coverage and less +backtracking across these environments. + +
+
+
+
+
+ + ☆ MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion + Prompt for Ultrasound Needle Tracking + + +
+ Ultrasound (US)-guided needle insertion is widely employed in percutaneous +interventions. However, providing feedback on the needle tip position via US +image presents challenges due to noise, artifacts, and the thin imaging plane +of US, which degrades needle features and leads to intermittent tip visibility. +In this paper, a Mamba-based US needle tracker MambaXCTrack utilizing +structured state space models cross-correlation (SSMX-Corr) and implicit motion +prompt is proposed, which is the first application of Mamba in US needle +tracking. The SSMX-Corr enhances cross-correlation by long-range modeling and +global searching of distant semantic features between template and search maps, +benefiting the tracking under noise and artifacts by implicitly learning +potential distant semantic cues. By combining with cross-map interleaved scan +(CIS), local pixel-wise interaction with positional inductive bias can also be +introduced to SSMX-Corr. The implicit low-level motion descriptor is proposed +as a non-visual prompt to enhance tracking robustness, addressing the +intermittent tip visibility problem. Extensive experiments on a dataset with +motorized needle insertion in both phantom and tissue samples demonstrate that +the proposed tracker outperforms other state-of-the-art trackers while ablation +studies further highlight the effectiveness of each proposed tracking module. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Integrative Wrapping System for a Dual-Arm Humanoid Robot + + +
+ Flexible object manipulation of paper and cloth is a major research challenge +in robot manipulation. Although there have been efforts to develop hardware +that enables specific actions and to realize a single action of paper folding +using sim-to-real and learning, there have been few proposals for humanoid +robots and systems that enable continuous, multi-step actions of flexible +materials. Wrapping an object with paper and tape is more complex and diverse +than traditional manipulation research due to the increased number of objects +that need to be handled, as well as the three-dimensionality of the operation. +In this research, necessary information is organized and coded based on the +characteristics of each object handled in wrapping. We also generalize the +hardware configuration, manipulation method, and recognition system that enable +humanoid wrapping operations. The system will include manipulation with +admittance control focusing on paper tension and state evaluation using point +clouds to handle three-dimensional flexible objects. Finally, wrapping objects +with different shapes is experimented with to show the generality and +effectiveness of the proposed system. + +
+
+ comment: Accepted Humanoids2024 +
+
+
+
+
+ + ☆ DG-SLAM: Robust Dynamic Gaussian Splatting SLAM with Hybrid Pose + Optimization + + +
+ Achieving robust and precise pose estimation in dynamic scenes is a +significant research challenge in Visual Simultaneous Localization and Mapping +(SLAM). Recent advancements integrating Gaussian Splatting into SLAM systems +have proven effective in creating high-quality renderings using explicit 3D +Gaussian models, significantly improving environmental reconstruction fidelity. +However, these approaches depend on a static environment assumption and face +challenges in dynamic environments due to inconsistent observations of geometry +and photometry. To address this problem, we propose DG-SLAM, the first robust +dynamic visual SLAM system grounded in 3D Gaussians, which provides precise +camera pose estimation alongside high-fidelity reconstructions. Specifically, +we propose effective strategies, including motion mask generation, adaptive +Gaussian point management, and a hybrid camera tracking algorithm to improve +the accuracy and robustness of pose estimation. Extensive experiments +demonstrate that DG-SLAM delivers state-of-the-art performance in camera pose +estimation, map reconstruction, and novel-view synthesis in dynamic scenes, +outperforming existing methods while preserving real-time rendering +ability. + +
+
+
+
+
+ + ☆ Efficient Trajectory Generation in 3D Environments with Multi-Level Map + Construction + + +
+ We propose a robust and efficient framework to generate global trajectories +for ground robots in complex 3D environments. The proposed method takes a point +cloud as input and efficiently constructs a multi-level map using triangular +patches as the basic elements. A kinematic path search is adopted on the +patches, where motion primitives on different patches combine to form the +global min-time cost initial trajectory. We use a same-level expansion method +to locate the nearest obstacle for each trajectory waypoint and construct an +objective function with curvature, smoothness and obstacle terms for +optimization. We evaluate the method on several complex 3D point cloud maps. +Compared to existing methods, our method demonstrates higher robustness to +point cloud noise, enabling the generation of high-quality trajectories while +maintaining high computational efficiency. Our code will be publicly available +at https://github.com/ck-tian/MLMC-planner. + +
+
+
+
+
+ + ☆ When to Localize? A POMDP Approach + + +
+ Robots often localize to lower navigational errors and facilitate downstream, +high-level tasks. However, a robot may want to selectively localize when +localization is costly (such as with resource-constrained robots) or +inefficient (for example, submersibles that need to surface), especially when +navigating in environments with variable numbers of hazards such as obstacles +and shipping lanes. In this study, we propose a method that helps a robot +determine "when to localize" to 1) minimize such actions and 2) not exceed +the probability of failure (such as surfacing within high-traffic shipping +lanes). We formulate our method as a Constrained Partially Observable Markov +Decision Process and use the Cost-Constrained POMCP solver to plan the robot's +actions. The solver simulates failure probabilities to decide if a robot moves +to its goal or localizes to prevent failure. We performed numerical experiments +with multiple baselines. + +
+
+ comment: Accepted to the 2024 IEEE International Symposium on Safety, + Security, and Rescue Robotics (SSRR). 6 pages, 6 figures +
+
+
+
+
+ + ☆ MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields + Representation + + +
+ Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and +3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in +Simultaneous Localization and Mapping (SLAM) for photo-realistic rendering, +particularly when using high-quality video sequences as input. However, +existing methods struggle with motion-blurred frames, which are common in +real-world scenarios like low-light or long-exposure conditions. This often +results in a significant reduction in both camera localization accuracy and map +reconstruction quality. To address this challenge, we propose a dense visual +SLAM pipeline (i.e. MBA-SLAM) to handle severe motion-blurred inputs. Our +approach integrates an efficient motion blur-aware tracker with either neural +radiance fields or Gaussian Splatting based mapper. By accurately modeling the +physical image formation process of motion-blurred images, our method +simultaneously learns 3D scene representation and estimates the cameras' local +trajectory during exposure time, enabling proactive compensation for motion +blur caused by camera movement. In our experiments, we demonstrate that +MBA-SLAM surpasses previous state-of-the-art methods in both camera +localization and map reconstruction, showcasing superior performance across a +range of datasets, including synthetic and real datasets featuring sharp images +as well as those affected by motion blur, highlighting the versatility and +robustness of our approach. Code is available at +https://github.com/WU-CVGL/MBA-SLAM. + +
+
+
+
+
+ + ☆ Control of Biohybrid Actuators using NeuroEvolution + + +
+ In medical-related tasks, soft robots can perform better than conventional +robots because of their compliant building materials and the movements they are +able to perform. However, designing soft robot controllers is not an easy task, +due to the non-linear properties of their materials. Since human expertise to +design such controllers is not yet sufficiently effective, a formal design +process is needed. The present research proposes neuroevolution-based +algorithms as the core mechanism to automatically generate controllers for +biohybrid actuators that can be used on future medical devices, such as a +catheter that will deliver drugs. The controllers generated by methodologies +based on Neuroevolution of Augmenting Topologies (NEAT) and Hypercube-based +NEAT (HyperNEAT) are compared against the ones generated by a standard genetic +algorithm (SGA). Specifically, the metrics considered are the maximum +displacement in upward bending movement and the robustness to control different +biohybrid actuator morphologies without redesigning the control strategy. +Results indicate that the neuroevolution-based algorithms produce better suited +controllers than the SGA. In particular, NEAT designed the best controllers, +achieving up to 25% higher displacement when compared with SGA-produced +specialised controllers trained over a single morphology and 23% when compared +with general purpose controllers trained over a set of morphologies. + +
+
+
+
+
+ + ☆ Open-World Task and Motion Planning via Vision-Language Model Inferred + Constraints + + +
+ Foundation models trained on internet-scale data, such as Vision-Language +Models (VLMs), excel at performing tasks involving common sense, such as visual +question answering. Despite their impressive capabilities, these models cannot +currently be directly applied to challenging robot manipulation problems that +require complex and precise continuous reasoning. Task and Motion Planning +(TAMP) systems can control high-dimensional continuous systems over long +horizons through combining traditional primitive robot operations. However, +these systems require a detailed model of how the robot can impact its +environment, preventing them from directly interpreting and addressing novel +human objectives, for example, an arbitrary natural language goal. We propose +deploying VLMs within TAMP systems by having them generate discrete and +continuous language-parameterized constraints that enable TAMP to reason about +open-world concepts. Specifically, we propose algorithms for VLM partial +planning that constrain a TAMP system's discrete temporal search and VLM +continuous constraints interpretation to augment the traditional manipulation +constraints that TAMP systems seek to satisfy. We demonstrate our approach on +two robot embodiments, including a real world robot, across several +manipulation tasks, where the desired objectives are conveyed solely through +language. + +
+
+
+
+
+ + ☆ Multimodal Object Detection using Depth and Image Data for Manufacturing + Parts + + +
+ Manufacturing requires reliable object detection methods for precise picking +and handling of diverse types of manufacturing parts and components. +Traditional object detection methods utilize either only 2D images from cameras +or 3D data from lidars or similar 3D sensors. However, each of these sensors +has weaknesses and limitations. Cameras do not have depth perception and 3D +sensors typically do not carry color information. These weaknesses can +undermine the reliability and robustness of industrial manufacturing systems. +To address these challenges, this work proposes a multi-sensor system combining +a red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are +calibrated for precise alignment of the multimodal data captured from the two +hardware devices. A novel multimodal object detection method is developed to +process both RGB and depth data. This object detector is based on the Faster +R-CNN baseline that was originally designed to process only camera images. The +results show that the multimodal model significantly outperforms the depth-only +and RGB-only baselines on established object detection metrics. More +specifically, the multimodal model improves mAP by 13% and raises Mean +Precision by 11.8% in comparison to the RGB-only baseline. Compared to the +depth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%. +Hence, this method facilitates more reliable and robust object detection in +service to smart manufacturing applications. + +
+
+
+
+
+ + ☆ ClevrSkills: Compositional Language and Visual Reasoning in Robotics NeurIPS 2024 + + +
+ Robotics tasks are highly compositional by nature. For example, to perform a +high-level task like cleaning the table a robot must employ low-level +capabilities of moving the effectors to the objects on the table, pick them up +and then move them off the table one-by-one, while re-evaluating the +consequently dynamic scenario in the process. Given that large vision language +models (VLMs) have shown progress on many tasks that require high level, +human-like reasoning, we ask the question: if the models are taught the +requisite low-level capabilities, can they compose them in novel ways to +achieve interesting high-level tasks like cleaning the table without having to +be explicitly taught so? To this end, we present ClevrSkills - a benchmark +suite for compositional reasoning in robotics. ClevrSkills is an environment +suite developed on top of the ManiSkill2 simulator and an accompanying dataset. +The dataset contains trajectories generated on a range of robotics tasks with +language and visual annotations as well as multi-modal prompts as task +specification. The suite includes a curriculum of tasks with three levels of +compositional understanding, starting with simple tasks requiring basic motor +skills. We benchmark multiple different VLM baselines on ClevrSkills and show +that even after being pre-trained on large numbers of tasks, these models fail +on compositional reasoning in robotics tasks. + +
+
+ comment: To appear at NeurIPS 2024 (D&B track) +
+
+
+
+
+ + ☆ DART-LLM: Dependency-Aware Multi-Robot Task Decomposition and Execution + using Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated significant reasoning +capabilities in robotic systems. However, their deployment in multi-robot +systems remains fragmented and struggles to handle complex task dependencies +and parallel execution. This study introduces the DART-LLM (Dependency-Aware +Multi-Robot Task Decomposition and Execution using Large Language Models) +system, designed to address these challenges. DART-LLM utilizes LLMs to parse +natural language instructions, decomposing them into multiple subtasks with +dependencies to establish complex task sequences, thereby enhancing efficient +coordination and parallel execution in multi-robot systems. The system includes +the QA LLM module, Breakdown Function modules, Actuation module, and a +Vision-Language Model (VLM)-based object detection module, enabling task +decomposition and execution from natural language instructions to robotic +actions. Experimental results demonstrate that DART-LLM excels in handling +long-horizon tasks and collaborative tasks with complex dependencies. Even when +using smaller models like Llama 3.1 8B, the system achieves good performance, +highlighting DART-LLM's robustness in terms of model size. Please refer to the +project website \url{https://wyd0817.github.io/project-dart-llm/} for videos +and code. + +
+
+ comment: Submitted to the 2025 IEEE International Conference on Robotics & + Automation on September 15, 2024 +
+
+
+
+
+ + ☆ Predictive Visuo-Tactile Interactive Perception Framework for Object + Properties Inference + + +
+ Interactive exploration of the unknown physical properties of objects such as +stiffness, mass, center of mass, friction coefficient, and shape is crucial for +autonomous robotic systems operating continuously in unstructured environments. +Precise identification of these properties is essential to manipulate objects +in a stable and controlled way, and is also required to anticipate the outcomes +of (prehensile or non-prehensile) manipulation actions such as pushing, +pulling, lifting, etc. Our study focuses on autonomously inferring the physical +properties of a diverse set of various homogeneous, heterogeneous, and +articulated objects utilizing a robotic system equipped with vision and tactile +sensors. We propose a novel predictive perception framework for identifying +object properties of the diverse objects by leveraging versatile exploratory +actions: non-prehensile pushing and prehensile pulling. As part of the +framework, we propose a novel active shape perception to seamlessly initiate +exploration. Our innovative dual differentiable filtering with Graph Neural +Networks learns the object-robot interaction and performs consistent inference +of indirectly observable time-invariant object properties. In addition, we +formulate an $N$-step information gain approach to actively select the most +informative actions for efficient learning and inference. Extensive real-robot +experiments with planar objects show that our predictive perception framework +results in better performance than the state-of-the-art baseline and +demonstrate our framework in three major applications for i) object tracking, +ii) goal-driven task, and iii) change in environment detection. + +
+
+
+
+
+ + ☆ Learning-Based Control Barrier Function with Provably Safe Guarantees: + Reducing Conservatism with Heading-Aware Safety Margin + + +
+ We propose a learning-based Control Barrier Function (CBF) to reduce +conservatism in collision avoidance of car-like robots. Traditional CBFs often +use Euclidean distance between robots' centers as safety margin, neglecting +headings and simplifying geometries to circles. While this ensures smooth, +differentiable safety functions required by CBFs, it can be overly conservative +in tight environments. To address this limitation, we design a heading-aware +safety margin that accounts for the robots' orientations, enabling a less +conservative and more accurate estimation of safe regions. Since the function +computing this safety margin is non-differentiable, we approximate it with a +neural network to ensure differentiability and facilitate integration with +CBFs. We describe how we achieve bounded learning error and incorporate the +upper bound into the CBF to provide formal safety guarantees through forward +invariance. We show that our CBF is a high-order CBF with relative degree two +for a system with two robots whose dynamics are modeled by the nonlinear +kinematic bicycle model. Experimental results in overtaking and bypassing +scenarios reveal a 33.5 % reduction in conservatism compared to traditional +methods, while maintaining safety. Code: https://github.com/bassamlab/sigmarl + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ UAV survey coverage path planning of complex regions containing + exclusion zones + + +
+ This article addresses the challenge of UAV survey coverage path planning for +areas that are complex concave polygons, containing exclusion zones or +obstacles. While standard drone path planners typically generate coverage paths +for simple convex polygons, this study proposes a method to manage more +intricate regions, including boundary splits, merges, and interior holes. To +achieve this, polygonal decomposition techniques are used to partition the +target area into convex sub-regions. The sub-polygons are then merged using a +depth-first search algorithm, followed by the generation of continuous +Boustrophedon paths based on connected components. Polygonal offset by the +straight skeleton method was used to ensure a constant safe distance from the +exclusion zones. This approach allows UAV path planning in environments with +complex geometric constraints. + +
+
+
+
+
+ + ♻ ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ♻ ☆ Harnessing Smartphone Sensors for Enhanced Road Safety: A Comprehensive + Dataset and Review + + +
+ Severe collisions can result from aggressive driving and poor road +conditions, emphasizing the need for effective monitoring to ensure safety. +Smartphones, with their array of built-in sensors, offer a practical and +affordable solution for road-sensing. However, the lack of reliable, +standardized datasets has hindered progress in assessing road conditions and +driving patterns. This study addresses this gap by introducing a comprehensive +dataset derived from smartphone sensors, which surpasses existing datasets by +incorporating a diverse range of sensors including accelerometer, gyroscope, +magnetometer, GPS, gravity, orientation, and uncalibrated sensors. These +sensors capture extensive parameters such as acceleration force, gravitation, +rotation rate, magnetic field strength, and vehicle speed, providing a detailed +understanding of road conditions and driving behaviors. The dataset is designed +to enhance road safety, infrastructure maintenance, traffic management, and +urban planning. By making this dataset available to the community, the study +aims to foster collaboration, inspire further research, and facilitate the +development of innovative solutions in intelligent transportation systems. + +
+
+ comment: 29 pages, 14 Figures, journal paper, submitted into Scientific Data + Journal +
+
+
+
+
+ + ♻ ☆ HumanVLA: Towards Vision-Language Directed Object Rearrangement by + Physical Humanoid NeurIPS 2024 + + +
+ Physical Human-Scene Interaction (HSI) plays a crucial role in numerous +applications. + However, existing HSI techniques are limited to specific object dynamics and +privileged information, which prevents the development of more comprehensive +applications. + To address this limitation, we introduce HumanVLA for general object +rearrangement directed by practical vision and language. + A teacher-student framework is utilized to develop HumanVLA. + A state-based teacher policy is trained first using goal-conditioned +reinforcement learning and adversarial motion prior. + Then, it is distilled into a vision-language-action model via behavior +cloning. + We propose several key insights to facilitate the large-scale learning +process. + To support general object rearrangement by physical humanoid, we introduce a +novel Human-in-the-Room dataset encompassing various rearrangement tasks. + Through extensive experiments and analysis, we demonstrate the effectiveness +of the proposed approach. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Learning Dynamic Tasks on a Large-scale Soft Robot in a Handful of + Trials IROS + + +
+ Soft robots offer more flexibility, compliance, and adaptability than +traditional rigid robots. They are also typically lighter and cheaper to +manufacture. However, their use in real-world applications is limited due to +modeling challenges and difficulties in integrating effective proprioceptive +sensors. Large-scale soft robots ($\approx$ two meters in length) have greater +modeling complexity due to increased inertia and related effects of gravity. +Common efforts to ease these modeling difficulties such as assuming simple +kinematic and dynamics models also limit the general capabilities of soft +robots and are not applicable in tasks requiring fast, dynamic motion like +throwing and hammering. To overcome these challenges, we propose a +data-efficient Bayesian optimization-based approach for learning control +policies for dynamic tasks on a large-scale soft robot. Our approach optimizes +the task objective function directly from commanded pressures, without +requiring approximate kinematics or dynamics as an intermediate step. We +demonstrate the effectiveness of our approach through both simulated and +real-world experiments. + +
+
+ comment: 9 pages, 5 figures, Proceedings of the International Conference on + Intelligent Robots and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ Morphological Symmetries in Robotics + + +
+ We present a comprehensive framework for studying and leveraging +morphological symmetries in robotic systems. These are intrinsic properties of +the robot's morphology, frequently observed in animal biology and robotics, +which stem from the replication of kinematic structures and the symmetrical +distribution of mass. We illustrate how these symmetries extend to the robot's +state space and both proprioceptive and exteroceptive sensor measurements, +resulting in the equivariance of the robot's equations of motion and optimal +control policies. Thus, we recognize morphological symmetries as a relevant and +previously unexplored physics-informed geometric prior, with significant +implications for both data-driven and analytical methods used in modeling, +control, estimation and design in robotics. For data-driven methods, we +demonstrate that morphological symmetries can enhance the sample efficiency and +generalization of machine learning models through data augmentation, or by +applying equivariant/invariant constraints on the model's architecture. In the +context of analytical methods, we employ abstract harmonic analysis to +decompose the robot's dynamics into a superposition of lower-dimensional, +independent dynamics. We substantiate our claims with both synthetic and +real-world experiments conducted on bipedal and quadrupedal robots. Lastly, we +introduce the repository MorphoSymm to facilitate the practical use of the +theory and applications outlined in this work. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Multiple noncooperative targets encirclement by relative distance-based + positioning and neural antisynchronization control + + +
+ From prehistoric encirclement for hunting to GPS orbiting the earth for +positioning, target encirclement has numerous real world applications. However, +encircling multiple non-cooperative targets in GPS-denied environments remains +challenging. In this work, multiple targets encirclement by using a minimum of +two tasking agents, is considered where the relative distance measurements +between the agents and the targets can be obtained by using onboard sensors. +Based on the measurements, the center of all the targets is estimated directly +by a fuzzy wavelet neural network (FWNN) and the least squares fit method. +Then, a new distributed anti-synchronization controller (DASC) is designed so +that the two tasking agents are able to encircle all targets while staying +opposite to each other. In particular, the radius of the desired encirclement +trajectory can be dynamically determined to avoid potential collisions between +the two agents and all targets. Based on the Lyapunov stability analysis +method, the convergence proofs of the neural network prediction error, the +target-center position estimation error, and the controller error are addressed +respectively. Finally, both numerical simulations and UAV flight experiments +are conducted to demonstrate the validity of the encirclement algorithms. The +flight tests recorded video and other simulation results can be found in +https://youtu.be/B8uTorBNrl4. + +
+
+
+
+
+ + ♻ ☆ Stem-OB: Generalizable Visual Imitation Learning with Stem-Like + Convergent Observation through Diffusion Inversion + + +
+ Visual imitation learning methods demonstrate strong performance, yet they +lack generalization when faced with visual input perturbations, including +variations in lighting and textures, impeding their real-world application. We +propose Stem-OB that utilizes pretrained image diffusion models to suppress +low-level visual differences while maintaining high-level scene structures. +This image inversion process is akin to transforming the observation into a +shared representation, from which other observations stem, with extraneous +details removed. Stem-OB contrasts with data-augmentation approaches as it is +robust to various unspecified appearance changes without the need for +additional training. Our method is a simple yet highly effective plug-and-play +solution. Empirical results confirm the effectiveness of our approach in +simulated tasks and show an exceptionally significant improvement in real-world +applications, with an average increase of 22.2% in success rates compared to +the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info. + +
+
+ comment: Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/ +
+
+
+
+
+ + ♻ ☆ Single-grasp deformable object discrimination: the effect of gripper + morphology, sensing modalities, and action parameters + + +
+ In haptic object discrimination, the effect of gripper embodiment, action +parameters, and sensory channels has not been systematically studied. We used +two anthropomorphic hands and two 2-finger grippers to grasp two sets of +deformable objects. On the object classification task, we found: (i) among +classifiers, SVM on sensory features and LSTM on raw time series performed best +across all grippers; (ii) faster compression speeds degraded performance; (iii) +generalization to different grasping configurations was limited; transfer to +different compression speeds worked well for the Barrett Hand only. +Visualization of the feature spaces using PCA showed that gripper morphology +and action parameters were the main source of variance, making generalization +across embodiment or grip configurations very difficult. On the highly +challenging dataset consisting of polyurethane foams alone, only the Barrett +Hand achieved excellent performance. Tactile sensors can thus provide a key +advantage even if recognition is based on stiffness rather than shape. The data +set with 24,000 measurements is publicly available. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ DEIO: Deep Event Inertial Odometry + + +
+ Event cameras are bio-inspired, motion-activated sensors that demonstrate +impressive potential in handling challenging situations, such as motion blur +and high-dynamic range. Despite their promise, existing event-based +simultaneous localization and mapping (SLAM) approaches exhibit limited +performance in real-world applications. On the other hand, state-of-the-art +SLAM approaches incorporate deep neural networks for better robustness and +applicability. However, there is a lack of research in fusing learning-based +event SLAM methods with IMU, which could be indispensable to push the +event-based SLAM to large-scale, low-texture or complex scenarios. In this +paper, we propose DEIO, the first monocular deep event-inertial odometry +framework that combines learning-based method with traditional nonlinear +graph-based optimization. Specifically, we tightly integrate a trainable +event-based differentiable bundle adjustment (e-DBA) with the IMU +pre-integration in a factor graph which employs keyframe-based sliding window +optimization. Numerical Experiments in nine public challenge datasets show that +our method can achieve superior performance compared with the image-based and +event-based benchmarks. The source code is available at: +https://github.com/arclab-hku/DEIO. + +
+
+
+
+
+ + ♻ ☆ Rendering Stable Features Improves Sampling-Based Localisation with + Neural Radiance Fields + + +
+ Neural radiance fields (NeRFs) are a powerful tool for implicit scene +representations, allowing for differentiable rendering and the ability to make +predictions about unseen viewpoints. There has been growing interest in object +and scene-based localisation using NeRFs, with a number of recent works relying +on sampling-based or Monte-Carlo localisation schemes. Unfortunately, these can +be extremely computationally expensive, requiring multiple network forward +passes to infer camera or object pose. To alleviate this, a variety of sampling +strategies have been applied, many relying on keypoint recognition techniques +from classical computer vision. This work conducts a systematic empirical +comparison of these approaches and shows that in contrast to conventional +feature matching approaches for geometry-based localisation, sampling-based +localisation using NeRFs benefits significantly from stable features. Results +show that rendering stable features provides significantly better estimation +with a tenfold reduction in the number of forward passes required. + +
+
+ comment: Accepted at the 2024 Australasian Conference on Robotics and + Automation (ACRA 2024) +
+
+
+
+
+ + ♻ ☆ Learning Memory Mechanisms for Decision Making through Demonstrations + + +
+ In Partially Observable Markov Decision Processes, integrating an agent's +history into memory poses a significant challenge for decision-making. +Traditional imitation learning, relying on observation-action pairs for expert +demonstrations, fails to capture the expert's memory mechanisms used in +decision-making. To capture memory processes as demonstrations, we introduce +the concept of memory dependency pairs $(p, q)$ indicating that events at time +$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner +to leverage memory dependency pairs in Transformers and find significant +improvements across several tasks compared to standard Transformers when +evaluated on Memory Gym and the Long-term Memory Benchmark. Code is available +at https://github.com/WilliamYue37/AttentionTuner. + +
+
+
+
+
+ + ♻ ☆ Proleptic Temporal Ensemble for Improving the Speed of Robot Tasks + Generated by Imitation Learning + + +
+ Imitation learning, which enables robots to learn behaviors from +demonstrations by humans, has emerged as a promising solution for generating +robot motions in such environments. The imitation learning-based robot motion +generation method, however, has the drawback of depending on the demonstrator's +task execution speed. This paper presents a novel temporal ensemble approach +applied to imitation learning algorithms, allowing for execution of future +actions. The proposed method leverages existing demonstration data and +pre-trained policies, offering the advantages of requiring no additional +computation and being easy to implement. The algorithm's performance was +validated through real-world experiments involving robotic block color sorting, +demonstrating up to 3x increase in task execution speed while maintaining a +high success rate compared to the action chunking with transformer method. This +study highlights the potential for significantly improving the performance of +imitation learning-based policies, which were previously limited by the +demonstrator's speed. It is expected to contribute substantially to future +advancements in autonomous object manipulation technologies aimed at enhancing +productivity. + +
+
+ comment: This paper was initially submitted to The Journal of Korea Robotics + Society on Oct. 22, 2024, and a revised version was submitted on Nov. 13, + 2024. It is currently under review +
+
+
+
+
+ + ♻ ☆ A Comparative Study on State-Action Spaces for Learning Viewpoint + Selection and Manipulation with Diffusion Policy ICRA 2025 + + +
+ Robotic manipulation tasks often rely on static cameras for perception, which +can limit flexibility, particularly in scenarios like robotic surgery and +cluttered environments where mounting static cameras is impractical. Ideally, +robots could jointly learn a policy for dynamic viewpoint and manipulation. +However, it remains unclear which state-action space is most suitable for this +complex learning process. To enable manipulation with dynamic viewpoints and to +better understand impacts from different state-action spaces on this policy +learning process, we conduct a comparative study on the state-action spaces for +policy learning and their impacts on the performance of visuomotor policies +that integrate viewpoint selection with manipulation. Specifically, we examine +the configuration space of the robotic system, the end-effector space with a +dual-arm Inverse Kinematics (IK) solver, and the reduced end-effector space +with a look-at IK solver to optimize rotation for viewpoint selection. We also +assess variants with different rotation representations. Our results +demonstrate that state-action spaces utilizing Euler angles with the look-at IK +achieve superior task success rates compared to other spaces. Further analysis +suggests that these performance differences are driven by inherent variations +in the high-frequency components across different state-action spaces and +rotation representations. + +
+
+ comment: Submitted to ICRA 2025. Website: + https://apollo-lab-yale.github.io/spaces_comparative_study/ +
+
+
+
+
+ + ♻ ☆ Embedding Pose Graph, Enabling 3D Foundation Model Capabilities with a + Compact Representation + + +
+ This paper presents the Embedding Pose Graph (EPG), an innovative method that +combines the strengths of foundation models with a simple 3D representation +suitable for robotics applications. Addressing the need for efficient spatial +understanding in robotics, EPG provides a compact yet powerful approach by +attaching foundation model features to the nodes of a pose graph. Unlike +traditional methods that rely on bulky data formats like voxel grids or point +clouds, EPG is lightweight and scalable. It facilitates a range of robotic +tasks, including open-vocabulary querying, disambiguation, image-based +querying, language-directed navigation, and re-localization in 3D environments. +We showcase the effectiveness of EPG in handling these tasks, demonstrating its +capacity to improve how robots interact with and navigate through complex +spaces. Through both qualitative and quantitative assessments, we illustrate +EPG's strong performance and its ability to outperform existing methods in +re-localization. Our work introduces a crucial step forward in enabling robots +to efficiently understand and operate within large-scale 3D spaces. + +
+
+
+
+
+ + ♻ ☆ Neural-Rendezvous: Provably Robust Guidance and Control to Encounter + Interstellar Objects + + +
+ Interstellar objects (ISOs) are likely representatives of primitive materials +invaluable in understanding exoplanetary star systems. Due to their poorly +constrained orbits with generally high inclinations and relative velocities, +however, exploring ISOs with conventional human-in-the-loop approaches is +significantly challenging. This paper presents Neural-Rendezvous -- a deep +learning-based guidance and control framework for encountering fast-moving +objects, including ISOs, robustly, accurately, and autonomously in real time. +It uses pointwise minimum norm tracking control on top of a guidance policy +modeled by a spectrally-normalized deep neural network, where its +hyperparameters are tuned with a loss function directly penalizing the MPC +state trajectory tracking error. We show that Neural-Rendezvous provides a high +probability exponential bound on the expected spacecraft delivery error, the +proof of which leverages stochastic incremental stability analysis. In +particular, it is used to construct a non-negative function with a +supermartingale property, explicitly accounting for the ISO state uncertainty +and the local nature of nonlinear state estimation guarantees. In numerical +simulations, Neural-Rendezvous is demonstrated to satisfy the expected error +bound for 100 ISO candidates. This performance is also empirically validated +using our spacecraft simulator and in high-conflict and distributed UAV swarm +reconfiguration with up to 20 UAVs. + +
+
+ comment: Preprint Version, Accepted: October, 2024 (One-minute YouTube + summary: https://youtu.be/q3e0LYS2IYQ, DOI: + https://doi.org/10.2514/1.G007671) +
+
+
+
+
+ + ♻ Faster Algorithms for Growing Collision-Free Convex Polytopes in Robot + Configuration Space + + +
+ We propose two novel algorithms for constructing convex collision-free +polytopes in robot configuration space. Finding these polytopes enables the +application of stronger motion-planning frameworks such as trajectory +optimization with Graphs of Convex Sets [1] and is currently a major roadblock +in the adoption of these approaches. In this paper, we build upon IRIS-NP +(Iterative Regional Inflation by Semidefinite & Nonlinear Programming) [2] to +significantly improve tunability, runtimes, and scaling to complex +environments. IRIS-NP uses nonlinear programming paired with uniform random +initialization to find configurations on the boundary of the free configuration +space. Our key insight is that finding near-by configuration-space obstacles +using sampling is inexpensive and greatly accelerates region generation. We +propose two algorithms using such samples to either employ nonlinear +programming more efficiently (IRIS-NP2) or circumvent it altogether using a +massively-parallel zero-order optimization strategy (IRIS-ZO). We also propose +a termination condition that controls the probability of exceeding a +user-specified permissible fraction-in-collision, eliminating a significant +source of tuning difficulty in IRIS-NP. We compare performance across eight +robot environments, showing that IRIS-ZO achieves an order-of-magnitude speed +advantage over IRIS-NP. IRIS-NP2, also significantly faster than IRIS-NP, builds +larger polytopes using fewer hyperplanes, enabling faster downstream +computation. Website: https://sites.google.com/view/fastiris + +
+
+ comment: 16 pages, 6 figures, accepted for publication in the proceedings of + the International Symposium for Robotics Research 2024 +
+
+
+
+
+ + ♻ ☆ Electrokinetic Propulsion for Electronically Integrated Microscopic + Robots + + +
+ Semiconductor microelectronics are emerging as a powerful tool for building +smart, autonomous robots too small to see with the naked eye. Yet a number of +existing microrobot platforms, despite significant advantages in speed, +robustness, power consumption, or ease of fabrication, have no clear path +towards electronics integration, limiting their intelligence and sophistication +when compared to electronic cousins. Here, we show how to upgrade a +self-propelled particle into an electronically integrated microrobot, +reaping the best of both in a single design. Inspired by electrokinetic +micromotors, these robots generate electric fields in a surrounding fluid, and +by extension propulsive electrokinetic flows. The underlying physics is +captured by a model in which robot speed is proportional to applied current, +making design and control straightforward. As proof, we build basic robots that +use on-board circuits and a closed-loop optical control scheme to navigate +waypoints and move in coordinated swarms at speeds of up to one body length per +second. Broadly, the unification of micromotor propulsion with on-robot +electronics clears the way for robust, fast, easy to manufacture, +electronically programmable microrobots that operate reliably over months to +years. + +
+
+
+
+
+ + ♻ ☆ Interstellar Object Accessibility and Mission Design + + +
+ Interstellar objects (ISOs) represent a compelling and under-explored +category of celestial bodies, providing physical laboratories to understand the +formation of our solar system and probe the composition and properties of +material formed in exoplanetary systems. In this work, we investigate existing +approaches to designing successful flyby missions to ISOs, including a deep +learning-driven guidance and control algorithm for ISOs traveling at velocities +over 60 km/s. We have generated spacecraft trajectories to a series of +synthetic representative ISOs, simulating a ground campaign to observe the +target and resolve its state, thereby determining the cruise and close approach +delta-Vs required for the encounter. We discuss the accessibility of and +mission design to ISOs with varying characteristics, with special focuses on 1) +state covariance estimation throughout the cruise, 2) handoffs from traditional +navigation approaches to novel autonomous navigation for fast flyby regimes, +and 3) overall recommendations about preparing for the future in situ +exploration of these targets. The lessons learned also apply to the fast flyby +of other small bodies, e.g., long-period comets and potentially hazardous +asteroids, which also require tactical responses with similar characteristics. + +
+
+ comment: IEEE Aerospace Conference, Preprint Version, Accepted: November 2022 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ 4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization NeurIPS 2024 + + +
+ Novel view synthesis of dynamic scenes is becoming important in various +applications, including augmented and virtual reality. We propose a novel 4D +Gaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded +monocular videos. To overcome the overfitting problem of existing work for +these real-world videos, we introduce an uncertainty-aware regularization that +identifies uncertain regions with few observations and selectively imposes +additional priors based on diffusion models and depth smoothness on such +regions. This approach improves both the performance of novel view synthesis +and the quality of training image reconstruction. We also identify the +initialization problem of 4DGS in fast-moving dynamic regions, where the +Structure from Motion (SfM) algorithm fails to provide reliable 3D landmarks. +To initialize Gaussian primitives in such regions, we present a dynamic region +densification method using the estimated depth maps and scene flow. Our +experiments show that the proposed method improves the performance of 4DGS +reconstruction from a video captured by a handheld monocular camera and also +exhibits promising results in few-shot static scene reconstruction. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ A Short Note on Evaluating RepNet for Temporal Repetition Counting in + Videos + + +
+ We discuss some consistent issues on how RepNet has been evaluated in various +papers. As a way to mitigate these issues, we report RepNet performance results +on different datasets, and release evaluation code and the RepNet checkpoint to +obtain these results. Code URL: +https://github.com/google-research/google-research/blob/master/repnet/ + +
+
+
+
+
+ + ☆ Multimodal Instruction Tuning with Hybrid State Space Models + + +
+ Handling lengthy context is crucial for enhancing the recognition and +understanding capabilities of multimodal large language models (MLLMs) in +applications such as processing high-resolution images or high frame rate +videos. The rise in image resolution and frame rate substantially increases +computational demands due to the increased number of input tokens. This +challenge is further exacerbated by the quadratic complexity with respect to +sequence length of the self-attention mechanism. Most prior works either +pre-train models with long contexts, overlooking the efficiency problem, or +attempt to reduce the context length via downsampling (e.g., identify the key +image patches or frames) to decrease the context length, which may result in +information loss. To circumvent this issue while keeping the remarkable +effectiveness of MLLMs, we propose a novel approach using a hybrid +transformer-MAMBA model to efficiently handle long contexts in multimodal +applications. Our multimodal model can effectively process long context input +exceeding 100k tokens, outperforming existing models across various benchmarks. +Remarkably, our model enhances inference efficiency for high-resolution images +and high-frame-rate videos by about 4 times compared to current models, with +efficiency gains increasing as image resolution or video frames rise. +Furthermore, our model is the first to be trained on low-resolution images or +low-frame-rate videos while being capable of inference on high-resolution +images and high-frame-rate videos, offering flexibility for inference in +diverse scenarios. + +
+
+
+
+
+ + ☆ LUDO: Low-Latency Understanding of Highly Deformable Objects using Point + Cloud Occupancy Functions + + +
+ Accurately determining the shape and location of internal structures within +deformable objects is crucial for medical tasks that require precise targeting, +such as robotic biopsies. We introduce LUDO, a method for accurate low-latency +understanding of deformable objects. LUDO reconstructs objects in their +deformed state, including their internal structures, from a single-view point +cloud observation in under 30 ms using occupancy networks. We demonstrate +LUDO's abilities for autonomous targeting of internal regions of interest +(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty +estimates and explainability for its predictions, both of which are important +in safety-critical applications such as surgical interventions. We evaluate +LUDO in real-world robotic experiments, achieving a success rate of 98.9% for +puncturing various ROIs inside highly deformable objects. LUDO demonstrates the +potential to interact with deformable objects without the need for deformable +registration methods. + +
+
+
+
+
+ + ☆ Sharingan: Extract User Action Sequence from Desktop Recordings + + +
+ Video recordings of user activities, particularly desktop recordings, offer a +rich source of data for understanding user behaviors and automating processes. +However, despite advancements in Vision-Language Models (VLMs) and their +increasing use in video analysis, extracting user actions from desktop +recordings remains an underexplored area. This paper addresses this gap by +proposing two novel VLM-based methods for user action extraction: the Direct +Frame-Based Approach (DF), which inputs sampled frames directly into VLMs, and +the Differential Frame-Based Approach (DiffF), which incorporates explicit +frame differences detected via computer vision techniques. We evaluate these +methods using a basic self-curated dataset and an advanced benchmark adapted +from prior work. Our results show that the DF approach achieves an accuracy of +70% to 80% in identifying user actions, with the extracted action sequences +being re-playable through Robotic Process Automation. We find that while VLMs +show potential, incorporating explicit UI changes can degrade performance, +making the DF approach more reliable. This work represents the first +application of VLMs for extracting user action sequences from desktop +recordings, contributing new methods, benchmarks, and insights for future +research. + +
+
+
+
+
+ + ☆ Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation + + +
+ In view of the fact that semi- and self-supervised learning share a +fundamental principle, effectively modeling knowledge from unlabeled data, +various semi-supervised semantic segmentation methods have integrated +representative self-supervised learning paradigms for further regularization. +However, the potential of the state-of-the-art generative self-supervised +paradigm, masked image modeling, has been scarcely studied. This paradigm +learns the knowledge through establishing connections between the masked and +visible parts of masked image, during the pixel reconstruction process. By +inheriting and extending this insight, we successfully leverage masked image +modeling to boost semi-supervised semantic segmentation. Specifically, we +introduce a novel class-wise masked image modeling that independently +reconstructs different image regions according to their respective classes. In +this way, the mask-induced connections are established within each class, +mitigating the semantic confusion that arises from plainly reconstructing +images in basic masked image modeling. To strengthen these intra-class +connections, we further develop a feature aggregation strategy that minimizes +the distances between features corresponding to the masked and visible parts +within the same class. Additionally, in semantic space, we explore the +application of masked image modeling to enhance regularization. Extensive +experiments conducted on well-known benchmarks demonstrate that our approach +achieves state-of-the-art performance. The code will be available at +https://github.com/haoxt/S4MIM. + +
+
+ comment: 13 pages. This work has been submitted to the IEEE for possible + publication +
+
+
+
+
+ + ☆ Weakly-Supervised Anomaly Detection in Surveillance Videos Based on + Two-Stream I3D Convolution Network + + +
+ The widespread implementation of urban surveillance systems has necessitated +more sophisticated techniques for anomaly detection to ensure enhanced public +safety. This paper presents a significant advancement in the field of anomaly +detection through the application of Two-Stream Inflated 3D (I3D) Convolutional +Networks. These networks substantially outperform traditional 3D Convolutional +Networks (C3D) by more effectively extracting spatial and temporal features +from surveillance videos, thus improving the precision of anomaly detection. +Our research advances the field by implementing a weakly supervised learning +framework based on Multiple Instance Learning (MIL), which uniquely +conceptualizes surveillance videos as collections of 'bags' that contain +instances (video clips). Each instance is innovatively processed through a +ranking mechanism that prioritizes clips based on their potential to display +anomalies. This novel strategy not only enhances the accuracy and precision of +anomaly detection but also significantly diminishes the dependency on extensive +manual annotations. Moreover, through meticulous optimization of model +settings, including the choice of optimizer, our approach not only establishes +new benchmarks in the performance of anomaly detection systems but also offers +a scalable and efficient solution for real-world surveillance applications. +This paper contributes significantly to the field of computer vision by +delivering a more adaptable, efficient, and context-aware anomaly detection +system, which is poised to redefine practices in urban surveillance. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Which Viewpoint Shows it Best? Language for Weakly Supervising View + Selection in Multi-view Videos + + +
+ Given a multi-view video, which viewpoint is most informative for a human +observer? Existing methods rely on heuristics or expensive ``best-view" +supervision to answer this question, limiting their applicability. We propose a +weakly supervised approach that leverages language accompanying an +instructional multi-view video as a means to recover its most informative +viewpoint(s). Our key hypothesis is that the more accurately an individual view +can predict a view-agnostic text summary, the more informative it is. To put +this into action, we propose a framework that uses the relative accuracy of +view-dependent caption predictions as a proxy for best view pseudo-labels. +Then, those pseudo-labels are used to train a view selector, together with an +auxiliary camera pose predictor that enhances view-sensitivity. During +inference, our model takes as input only a multi-view video -- no language or +camera poses -- and returns the best viewpoint to watch at each timestep. On +two challenging datasets comprised of diverse multi-camera setups and how-to +activities, our model consistently outperforms state-of-the-art baselines, both +with quantitative metrics and human evaluation. + +
+
+
+
+
+ + ☆ Retrieval Augmented Recipe Generation WACV + + +
+ Given the potential applications of generating recipes from food images, this +area has garnered significant attention from researchers in recent years. +Existing works for recipe generation primarily utilize a two-stage training +method, first generating ingredients and then obtaining instructions from both +the image and ingredients. Large Multi-modal Models (LMMs), which have achieved +notable success across a variety of vision and language tasks, shed light to +generating both ingredients and instructions directly from images. +Nevertheless, LMMs still face the common issue of hallucinations during recipe +generation, leading to suboptimal performance. To tackle this, we propose a +retrieval augmented large multimodal model for recipe generation. We first +introduce Stochastic Diversified Retrieval Augmentation (SDRA) to retrieve +recipes semantically related to the image from an existing datastore as a +supplement, integrating them into the prompt to add diverse and rich context to +the input image. Additionally, Self-Consistency Ensemble Voting mechanism is +proposed to determine the most confident prediction recipes as the final +output. It calculates the consistency among generated recipe candidates, which +use different retrieval recipes as context for generation. Extensive +experiments validate the effectiveness of our proposed method, which +demonstrates state-of-the-art (SOTA) performance in recipe generation tasks on +the Recipe1M dataset. + +
+
+ comment: ACCEPT on IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ High-resolution optical and acoustic remote sensing datasets of the Puck + Lagoon, Southern Baltic + + +
+ The very shallow marine basin of Puck Lagoon in the southern Baltic Sea, on +the Northern coast of Poland, hosts valuable benthic habitats and cultural +heritage sites. These include, among others, protected Zostera marina meadows, +one of the Baltic's major medieval harbours, a ship graveyard, and likely other +submerged features that are yet to be discovered. Prior to this project, no +comprehensive high-resolution remote sensing data were available for this area. +This article describes the first Digital Elevation Models (DEMs) derived from a +combination of airborne bathymetric LiDAR, multibeam echosounder, airborne +photogrammetry and satellite imagery. These datasets also include multibeam +echosounder backscatter and LiDAR intensity, allowing determination of the +character and properties of the seafloor. Combined, these datasets are a vital +resource for assessing and understanding seafloor morphology, benthic habitats, +cultural heritage, and submerged landscapes. Given the significance of Puck +Lagoon's hydrographical, ecological, geological, and archaeological environs, +the high-resolution bathymetry, acquired by our project, can provide the +foundation for sustainable management and informed decision-making for this +area of interest. + +
+
+
+
+
+ + ☆ TRACE: Transformer-based Risk Assessment for Clinical Evaluation + + +
+ We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation), +a novel method for clinical risk assessment based on clinical data, leveraging +the self-attention mechanism for enhanced feature interaction and result +interpretation. Our approach is able to handle different data modalities, +including continuous, categorical and multiple-choice (checkbox) attributes. +The proposed architecture features a shared representation of the clinical data +obtained by integrating specialized embeddings of each data modality, enabling +the detection of high-risk individuals using Transformer encoder layers. To +assess the effectiveness of the proposed method, a strong baseline based on +non-negative multi-layer perceptrons (MLPs) is introduced. The proposed method +outperforms various baselines widely used in the domain of clinical risk +assessment, while effectively handling missing values. In terms of +explainability, our Transformer-based method offers easily interpretable +results via attention weights, further enhancing the clinicians' +decision-making process. + +
+
+
+
+
+ + ☆ A Survey on Vision Autoregressive Model + + +
+ Autoregressive models have demonstrated great performance in natural language +processing (NLP) with impressive scalability, adaptability and +generalizability. Inspired by their notable success in NLP field, +autoregressive models have been intensively investigated recently for computer +vision, which perform next-token predictions by representing visual data as +visual tokens and enables autoregressive modelling for a wide range of vision +tasks, ranging from visual generation and visual understanding to the very +recent multimodal generation that unifies visual generation and understanding +with a single autoregressive model. This paper provides a systematic review of +vision autoregressive models, including the development of a taxonomy of +existing methods and highlighting their major contributions, strengths, and +limitations, covering various vision tasks such as image generation, video +generation, image editing, motion generation, medical image analysis, 3D +generation, robotic manipulation, unified multimodal generation, etc. Besides, +we investigate and analyze the latest advancements in autoregressive models, +including thorough benchmarking and discussion of existing methods across +various evaluation datasets. Finally, we outline key challenges and promising +directions for future research, offering a roadmap to guide further +advancements in vision autoregressive models. + +
+
+
+
+
+ + ☆ OSMLoc: Single Image-Based Visual Localization in OpenStreetMap with + Geometric and Semantic Guidances + + +
+ OpenStreetMap (OSM), an online and versatile source of volunteered geographic +information (VGI), is widely used for human self-localization by matching +nearby visual observations with vectorized map data. However, due to the +divergence in modalities and views, image-to-OSM (I2O) matching and +localization remain challenging for robots, preventing the full utilization of +VGI data in the unmanned ground vehicles and logistic industry. Inspired by the +fact that the human brain relies on geometric and semantic understanding of +sensory information for spatial localization tasks, we propose the OSMLoc in +this paper. OSMLoc is a brain-inspired single-image visual localization method +with semantic and geometric guidance to improve accuracy, robustness, and +generalization ability. First, we equip the OSMLoc with the visual foundational +model to extract powerful image features. Second, a geometry-guided depth +distribution adapter is proposed to bridge the monocular depth estimation and +camera-to-BEV transform. Thirdly, the semantic embeddings from the OSM data are +utilized as auxiliary guidance for image-to-OSM feature matching. To validate +the proposed OSMLoc, we collect a worldwide cross-area and cross-condition (CC) +benchmark for extensive evaluation. Experiments on the MGL dataset, CC +validation benchmark, and KITTI dataset have demonstrated the superiority of +our method. Code, pre-trained models, CC validation benchmark, and additional +results are available on: https://github.com/WHU-USI3DV/OSMLoc + +
+
+ comment: 15 pages, technical report +
+
+
+
+
+ + ☆ Toward Human Understanding with Controllable Synthesis + + +
+ Training methods to perform robust 3D human pose and shape (HPS) estimation +requires diverse training images with accurate ground truth. While BEDLAM +demonstrates the potential of traditional procedural graphics to generate such +data, the training images are clearly synthetic. In contrast, generative image +models produce highly realistic images but without ground truth. Putting these +methods together seems straightforward: use a generative model with the body +ground truth as controlling signal. However, we find that, the more realistic +the generated images, the more they deviate from the ground truth, making them +inappropriate for training and evaluation. Enhancements of realistic details, +such as clothing and facial expressions, can lead to subtle yet significant +deviations from the ground truth, potentially misleading training models. We +empirically verify that this misalignment causes the accuracy of HPS networks +to decline when trained with generated images. To address this, we design a +controllable synthesis method that effectively balances image realism with +precise ground truth. We use this to create the Generative BEDLAM (Gen-B) +dataset, which improves the realism of the existing synthetic BEDLAM dataset +while preserving ground truth accuracy. We perform extensive experiments, with +various noise-conditioning strategies, to evaluate the tradeoff between visual +realism and HPS accuracy. We show, for the first time, that generative image +models can be controlled by traditional graphics methods to produce training +data that increases the accuracy of HPS methods. + +
+
+
+
+
+ + ☆ MikuDance: Animating Character Art with Mixed Motion Dynamics + + +
+ We propose MikuDance, a diffusion-based pipeline incorporating mixed motion +dynamics to animate stylized character art. MikuDance consists of two key +techniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the +challenges of high-dynamic motion and reference-guidance misalignment in +character art animation. Specifically, a Scene Motion Tracking strategy is +presented to explicitly model the dynamic camera in pixel-wise space, enabling +unified character-scene motion modeling. Building on this, the Mixed-Control +Diffusion implicitly aligns the scale and body shape of diverse characters with +motion guidance, allowing flexible control of local character motion. +Subsequently, a Motion-Adaptive Normalization module is incorporated to +effectively inject global scene motion, paving the way for comprehensive +character art animation. Through extensive experiments, we demonstrate the +effectiveness and generalizability of MikuDance across various character art +and motion guidance, consistently producing high-quality animations with +remarkable motion dynamics. + +
+
+
+
+
+ + ☆ Towards More Accurate Fake Detection on Images Generated from Advanced + Generative and Neural Rendering Models + + +
+ The remarkable progress in neural-network-driven visual data generation, +especially with neural rendering techniques like Neural Radiance Fields and 3D +Gaussian splatting, offers a powerful alternative to GANs and diffusion models. +These methods can produce high-fidelity images and lifelike avatars, +highlighting the need for robust detection methods. In response, an +unsupervised training technique is proposed that enables the model to extract +comprehensive features from the Fourier spectrum magnitude, thereby overcoming +the challenges of reconstructing the spectrum due to its centrosymmetric +properties. By leveraging the spectral domain and dynamically combining it with +spatial domain information, we create a robust multimodal detector that +demonstrates superior generalization capabilities in identifying challenging +synthetic images generated by the latest image synthesis techniques. To address +the absence of a 3D neural rendering-based fake image database, we develop a +comprehensive database that includes images generated by diverse neural +rendering techniques, providing a robust foundation for evaluating and +advancing detection methods. + +
+
+ comment: 13 pages, 8 Figures +
+
+
+
+
+ + ☆ Zero-shot capability of SAM-family models for bone segmentation in CT + scans + + +
+ The Segment Anything Model (SAM) and similar models build a family of +promptable foundation models (FMs) for image and video segmentation. The object +of interest is identified using prompts, such as bounding boxes or points. With +these FMs becoming part of medical image segmentation, extensive evaluation +studies are required to assess their strengths and weaknesses in clinical +setting. Since the performance is highly dependent on the chosen prompting +strategy, it is important to investigate different prompting techniques to +define optimal guidelines that ensure effective use in medical image +segmentation. Currently, no dedicated evaluation studies exist specifically for +bone segmentation in CT scans, leaving a gap in understanding the performance +for this task. Thus, we use non-iterative, ``optimal'' prompting strategies +composed of bounding box, points and combinations to test the zero-shot +capability of SAM-family models for bone CT segmentation on three different +skeletal regions. Our results show that the best settings depend on the model +type and size, dataset characteristics and objective to optimize. Overall, SAM +and SAM2 prompted with a bounding box in combination with the center point for +all the components of an object yield the best results across all tested +settings. As the results depend on multiple factors, we provide a guideline for +informed decision-making in 2D prompting with non-interactive, ''optimal'' +prompts. + +
+
+
+
+
+ + ☆ LG-Gaze: Learning Geometry-aware Continuous Prompts for Language-Guided + Gaze Estimation ECCV 2024 + + +
+ The ability of gaze estimation models to generalize is often significantly +hindered by various factors unrelated to gaze, especially when the training +dataset is limited. Current strategies aim to address this challenge through +different domain generalization techniques, yet they have had limited success +due to the risk of overfitting when solely relying on value labels for +regression. Recent progress in pre-trained vision-language models has motivated +us to capitalize on the abundant semantic information available. We propose a +novel approach in this paper, reframing the gaze estimation task as a +vision-language alignment issue. Our proposed framework, named Language-Guided +Gaze Estimation (LG-Gaze), learns continuous and geometry-sensitive features +for gaze estimation benefit from the rich prior knowledges of vision-language +models. Specifically, LG-Gaze aligns gaze features with continuous linguistic +features through our proposed multimodal contrastive regression loss, which +customizes adaptive weights for different negative samples. Furthermore, to +better adapt to the labels for gaze estimation task, we propose a +geometry-aware interpolation method to obtain more precise gaze embeddings. +Through extensive experiments, we validate the efficacy of our framework in +four different cross-domain evaluation tasks. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Generalized Pose Space Embeddings for Training In-the-Wild using + Anaylis-by-Synthesis + + +
+ Modern pose estimation models are trained on large, manually-labelled +datasets which are costly and may not cover the full extent of human poses and +appearances in the real world. With advances in neural rendering, +analysis-by-synthesis and the ability to not only predict, but also render the +pose, is becoming an appealing framework, which could alleviate the need for +large scale manual labelling efforts. While recent work has shown the +feasibility of this approach, the predictions admit many flips due to a +simplistic intermediate skeleton representation, resulting in low precision and +inhibiting the acquisition of any downstream knowledge such as +three-dimensional positioning. We solve this problem with a more expressive +intermediate skeleton representation capable of capturing the semantics of the +pose (left and right), which significantly reduces flips. To successfully train +this new representation, we extend the analysis-by-synthesis framework with a +training protocol based on synthetic data. We show that our representation +results in fewer flips and more accurate predictions. Our approach outperforms +previous models trained with analysis-by-synthesis on standard benchmarks. + +
+
+
+
+
+ + ☆ Slender Object Scene Segmentation in Remote Sensing Image Based on + Learnable Morphological Skeleton with Segment Anything Model + + +
+ Morphological methods play a crucial role in remote sensing image processing, +due to their ability to capture and preserve small structural details. However, +most of the existing deep learning models for semantic segmentation are based +on the encoder-decoder architecture including U-net and Segment Anything Model +(SAM), where the downsampling process tends to discard fine details. In this +paper, we propose a new approach that integrates learnable morphological +skeleton prior into deep neural networks using the variational method. To +address the difficulty in backpropagation in neural networks caused by the +non-differentiability presented in classical morphological operations, we +provide a smooth representation of the morphological skeleton and design a +variational segmentation model integrating morphological skeleton prior by +employing operator splitting and dual methods. Then, we integrate this model +into the network architecture of SAM, which is achieved by adding a token to +mask decoder and modifying the final sigmoid layer, ensuring the final +segmentation results preserve the skeleton structure as much as possible. +Experimental results on remote sensing datasets, including buildings and roads, +demonstrate that our method outperforms the original SAM on slender object +segmentation and exhibits better generalization capability. + +
+
+
+
+
+ + ☆ NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied + Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN), as a widely discussed research +direction in embodied intelligence, aims to enable embodied agents to navigate +in complicated visual environments through natural language commands. Most +existing VLN methods focus on indoor ground robot scenarios. However, when +applied to UAV VLN in outdoor urban scenes, it faces two significant +challenges. First, urban scenes contain numerous objects, which makes it +challenging to match fine-grained landmarks in images with complex textual +descriptions of these landmarks. Second, overall environmental information +encompasses multiple modal dimensions, and the diversity of representations +significantly increases the complexity of the encoding process. To address +these challenges, we propose NavAgent, the first urban UAV embodied navigation +model driven by a large Vision-Language Model. NavAgent undertakes navigation +tasks by synthesizing multi-scale environmental information, including +topological maps (global), panoramas (medium), and fine-grained landmarks +(local). Specifically, we utilize GLIP to build a visual recognizer for +landmark capable of identifying and linguisticizing fine-grained landmarks. +Subsequently, we develop dynamically growing scene topology map that integrate +environmental information and employ Graph Convolutional Networks to encode +global environmental data. In addition, to train the visual recognizer for +landmark, we develop NavAgent-Landmark2K, the first fine-grained landmark +dataset for real urban street scenes. In experiments conducted on the Touchdown +and Map2seq datasets, NavAgent outperforms strong baseline models. The code and +dataset will be released to the community to facilitate the exploration and +development of outdoor VLN. + +
+
+
+
+
+ + ☆ UIFormer: A Unified Transformer-based Framework for Incremental Few-Shot + Object Detection and Instance Segmentation + + +
+ This paper introduces a novel framework for unified incremental few-shot +object detection (iFSOD) and instance segmentation (iFSIS) using the +Transformer architecture. Our goal is to create an optimal solution for +situations where only a few examples of novel object classes are available, +with no access to training data for base or old classes, while maintaining high +performance across both base and novel classes. To achieve this, we extend +Mask-DINO into a two-stage incremental learning framework. Stage 1 focuses on +optimizing the model using the base dataset, while Stage 2 involves fine-tuning +the model on novel classes. Besides, we incorporate a classifier selection +strategy that assigns appropriate classifiers to the encoder and decoder +according to their distinct functions. Empirical evidence indicates that this +approach effectively mitigates the over-fitting on novel classes learning. +Furthermore, we implement knowledge distillation to prevent catastrophic +forgetting of base classes. Comprehensive evaluations on the COCO and LVIS +datasets for both iFSIS and iFSOD tasks demonstrate that our method +significantly outperforms state-of-the-art approaches. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Saliency Map-based Image Retrieval using Invariant Krawtchouk Moments + + +
+ With the widespread adoption of digital devices equipped with cameras and the +rapid development of Internet technology, numerous content-based image +retrieval systems and novel image feature extraction techniques have emerged in +recent years. This paper introduces a saliency map-based image retrieval +approach using invariant Krawtchouk moments (SM-IKM) to enhance retrieval speed +and accuracy. The proposed method applies a global contrast-based salient +region detection algorithm to create a saliency map that effectively isolates +the foreground from the background. It then combines multiple orders of +invariant Krawtchouk moments (IKM) with local binary patterns (LBPs) and color +histograms to comprehensively represent the foreground and background. +Additionally, it incorporates LBPs derived from the saliency map to improve +discriminative power, facilitating more precise image differentiation. A +bag-of-visual-words (BoVW) model is employed to generate a codebook for +classification and discrimination. By using compact IKMs in the BoVW framework +and integrating a range of region-based features, including color histograms, +LBPs, and saliency map-enhanced LBPs, our proposed SM-IKM achieves efficient +and accurate image retrieval. Extensive experiments on publicly available +datasets, such as Caltech 101 and Wang, demonstrate that SM-IKM outperforms +recent state-of-the-art retrieval methods. The source code for SM-IKM is +available at github.com/arnejad/SMIKM. + +
+
+
+
+
+ + ☆ APDDv2: Aesthetics of Paintings and Drawings Dataset with Artist Labeled + Scores and Comments + + +
+ Datasets play a pivotal role in training visual models, facilitating the +development of abstract understandings of visual features through diverse image +samples and multidimensional attributes. However, in the realm of aesthetic +evaluation of artistic images, datasets remain relatively scarce. Existing +painting datasets are often characterized by limited scoring dimensions and +insufficient annotations, thereby constraining the advancement and application +of automatic aesthetic evaluation methods in the domain of painting. To bridge +this gap, we introduce the Aesthetics Paintings and Drawings Dataset (APDD), +the first comprehensive collection of paintings encompassing 24 distinct +artistic categories and 10 aesthetic attributes. Building upon the initial +release of APDDv1, our ongoing research has identified opportunities for +enhancement in data scale and annotation precision. Consequently, APDDv2 boasts +an expanded image corpus and improved annotation quality, featuring detailed +language comments to better cater to the needs of both researchers and +practitioners seeking high-quality painting datasets. Furthermore, we present +an updated version of the Art Assessment Network for Specific Painting Styles, +denoted as ArtCLIP. Experimental validation demonstrates the superior +performance of this revised model in the realm of aesthetic evaluation, +surpassing its predecessor in accuracy and efficacy. The dataset and model are +available at https://github.com/BestiVictory/APDDv2.git. + +
+
+
+
+
+ + ☆ MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal + Lymphatic Vessel Segmentation ML4H 2024 + + +
+ Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste +products from the human brain. An impairment in their functionality has been +associated with aging as well as brain disorders like multiple sclerosis and +Alzheimer's disease. However, MLVs have only recently been described for the +first time in magnetic resonance imaging (MRI), and their ramified structure +renders manual segmentation particularly difficult. Further, as there is no +consistent notion of their appearance, human-annotated MLV structures contain a +high inter-rater variability that most automatic segmentation methods cannot +take into account. In this work, we propose a new rater-aware training scheme +for the popular nnU-Net model, and we explore rater-based ensembling strategies +for accurate and consistent segmentation of MLVs. This enables us to boost +nnU-Net's performance while obtaining explicit predictions in different +annotation styles and a rater-based uncertainty estimation. Our final model, +MLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to +the human reference standard. The model further matches the human inter-rater +reliability and replicates age-related associations with MLV volume. + +
+
+ comment: ML4H 2024 +
+
+
+
+
+ + ☆ Classification and Morphological Analysis of DLBCL Subtypes in + H\&E-Stained Slides + + +
+ We address the challenge of automated classification of diffuse large B-cell +lymphoma (DLBCL) into its two primary subtypes: activated B-cell-like (ABC) and +germinal center B-cell-like (GCB). Accurate classification between these +subtypes is essential for determining the appropriate therapeutic strategy, +given their distinct molecular profiles and treatment responses. Our proposed +deep learning model demonstrates robust performance, achieving an average area +under the curve (AUC) of (87.4 $\pm$ 5.7)\% during cross-validation. It shows a +high positive predictive value (PPV), highlighting its potential for clinical +application, such as triaging for molecular testing. To gain biological +insights, we performed an analysis of morphological features of ABC and GCB +subtypes. We segmented cell nuclei using a pre-trained deep neural network and +compared the statistics of geometric and color features for ABC and GCB. We +found that the distributions of these features were not very different for the +two subtypes, which suggests that the visual differences between them are more +subtle. These results underscore the potential of our method to assist in more +precise subtype classification and can contribute to improved treatment +management and outcomes for patients of DLBCL. + +
+
+
+
+
+ + ☆ Efficient Whole Slide Image Classification through Fisher Vector + Representation + + +
+ The advancement of digital pathology, particularly through computational +analysis of whole slide images (WSI), is poised to significantly enhance +diagnostic precision and efficiency. However, the large size and complexity of +WSIs make it difficult to analyze and classify them using computers. This study +introduces a novel method for WSI classification by automating the +identification and examination of the most informative patches, thus +eliminating the need to process the entire slide. Our method involves +two-stages: firstly, it extracts only a few patches from the WSIs based on +their pathological significance; and secondly, it employs Fisher vectors (FVs) +for representing features extracted from these patches, which is known for its +robustness in capturing fine-grained details. This approach not only +accentuates key pathological features within the WSI representation but also +significantly reduces computational overhead, thus making the process more +efficient and scalable. We have rigorously evaluated the proposed method across +multiple datasets to benchmark its performance against comprehensive WSI +analysis and contemporary weakly-supervised learning methodologies. The +empirical results indicate that our focused analysis of select patches, +combined with Fisher vector representation, not only aligns with, but at times +surpasses, the classification accuracy of standard practices. Moreover, this +strategy notably diminishes computational load and resource expenditure, +thereby establishing an efficient and precise framework for WSI analysis in the +realm of digital pathology. + +
+
+
+
+
+ + ☆ BillBoard Splatting (BBSplat): Learnable Textured Primitives for Novel + View Synthesis + + +
+ We present billboard Splatting (BBSplat) - a novel approach for 3D scene +representation based on textured geometric primitives. BBSplat represents the +scene as a set of optimizable textured planar primitives with learnable RGB +textures and alpha-maps to control their shape. BBSplat primitives can be used +in any Gaussian Splatting pipeline as drop-in replacements for Gaussians. Our +method's qualitative and quantitative improvements over 3D and 2D Gaussians are +most noticeable when fewer primitives are used, when BBSplat achieves over 1200 +FPS. Our novel regularization term encourages textures to have a sparser +structure, unlocking an efficient compression that leads to a reduction in +storage space of the model. Our experiments show the efficiency of BBSplat on +standard datasets of real indoor and outdoor scenes such as Tanks&Temples, DTU, +and Mip-NeRF-360. We demonstrate improvements on PSNR, SSIM, and LPIPS metrics +compared to the state-of-the-art, especially for the case when fewer primitives +are used, which, on the other hand, leads to up to 2 times inference speed +improvement for the same rendering quality. + +
+
+
+
+
+ + ☆ Impact of Iris Pigmentation on Performance Bias in Visible Iris + Verification Systems: A Comparative Study + + +
+ Iris recognition technology plays a critical role in biometric identification +systems, but their performance can be affected by variations in iris +pigmentation. In this work, we investigate the impact of iris pigmentation on +the efficacy of biometric recognition systems, focusing on a comparative +analysis of blue and dark irises. Data sets were collected using multiple +devices, including P1, P2, and P3 smartphones [4], to assess the robustness of +the systems in different capture environments [19]. Both traditional machine +learning techniques and deep learning models were used, namely Open-Iris, +ViT-b, and ResNet50, to evaluate performance metrics such as Equal Error Rate +(EER) and True Match Rate (TMR). Our results indicate that iris recognition +systems generally exhibit higher accuracy for blue irises compared to dark +irises. Furthermore, we examined the generalization capabilities of these +systems across different iris colors and devices, finding that while training +on diverse datasets enhances recognition performance, the degree of improvement +is contingent on the specific model and device used. Our analysis also +identifies inherent biases in recognition performance related to iris color and +cross-device variability. These findings underscore the need for more inclusive +dataset collection and model refinement to reduce bias and promote equitable +biometric recognition across varying iris pigmentation and device +configurations. + +
+
+ comment: 14 pages, 5 figures, 5 Tables +
+
+
+
+
+ + ☆ UNSCT-HRNet: Modeling Anatomical Uncertainty for Landmark Detection in + Total Hip Arthroplasty + + +
+ Total hip arthroplasty (THA) relies on accurate landmark detection from +radiographic images, but unstructured data caused by irregular patient postures +or occluded anatomical markers pose significant challenges for existing +methods. To address this, we propose UNSCT-HRNet (Unstructured CT - +High-Resolution Net), a deep learning-based framework that integrates a Spatial +Relationship Fusion (SRF) module and an Uncertainty Estimation (UE) module. The +SRF module, utilizing coordinate convolution and polarized attention, enhances +the model's ability to capture complex spatial relationships. Meanwhile, the UE +module, which is based on entropy, ensures predictions are anatomically relevant. +For unstructured data, the proposed method can predict landmarks without +relying on the fixed number of points, which shows higher accuracy and better +robustness compared with the existing methods. Our UNSCT-HRNet demonstrates +over a 60% improvement across multiple metrics in unstructured data. The +experimental results also reveal that our approach maintains good performance +on the structured dataset. Overall, the proposed UNSCT-HRNet has the potential +to be used as a new reliable, automated solution for THA surgical planning and +postoperative monitoring. + +
+
+
+
+
+ + ☆ Methodology for a Statistical Analysis of Influencing Factors on 3D + Object Detection Performance + + +
+ In autonomous driving, object detection is an essential task to perceive the +environment by localizing and classifying objects. Most object detection +algorithms rely on deep learning for their superior performance. However, their +black box nature makes it challenging to ensure safety. In this paper, we +propose a first-of-its-kind methodology for statistical analysis of the +influence of various factors related to the objects to detect or the +environment on the detection performance of both LiDAR- and camera-based 3D +object detectors. We perform a univariate analysis between each of the factors +and the detection error in order to compare the strength of influence. To +better identify potential sources of detection errors, we also analyze the +performance in dependency of the influencing factors and examine the +interdependencies between the different influencing factors. Recognizing the +factors that influence detection performance helps identify robustness issues +in the trained object detector and supports the safety approval of object +detection systems. + +
+
+
+
+
+ + ☆ A survey on Graph Deep Representation Learning for Facial Expression + Recognition + + +
+ This comprehensive review delves deeply into the various methodologies +applied to facial expression recognition (FER) through the lens of graph +representation learning (GRL). Initially, we introduce the task of FER and the +concepts of graph representation and GRL. Afterward, we discuss some of the +most prevalent and valuable databases for this task. We explore promising +approaches for graph representation in FER, including graph diffusion, +spatio-temporal graphs, and multi-stream architectures. Finally, we identify +future research opportunities and provide concluding remarks. + +
+
+
+
+
+ + ☆ HyperFace: Generating Synthetic Face Recognition Datasets by Exploring + Face Embedding Hypersphere NeurIPS 2024 + + +
+ Face recognition datasets are often collected by crawling Internet and +without individuals' consents, raising ethical and privacy concerns. Generating +synthetic datasets for training face recognition models has emerged as a +promising alternative. However, the generation of synthetic datasets remains +challenging as it entails adequate inter-class and intra-class variations. +While advances in generative models have made it easier to increase intra-class +variations in face datasets (such as pose, illumination, etc.), generating +sufficient inter-class variation is still a difficult task. In this paper, we +formulate the dataset generation as a packing problem on the embedding space +(represented on a hypersphere) of a face recognition model and propose a new +synthetic dataset generation approach, called HyperFace. We formalize our +packing problem as an optimization problem and solve it with a gradient +descent-based approach. Then, we use a conditional face generator model to +synthesize face images from the optimized embeddings. We use our generated +datasets to train face recognition models and evaluate the trained models on +several benchmarking real datasets. Our experimental results show that models +trained with HyperFace achieve state-of-the-art performance in training face +recognition using synthetic datasets. + +
+
+ comment: Accepted in NeurIPS 2024 Safe Generative AI Workshop +
+
+
+
+
+ + ☆ Can MLLMs Guide Weakly-Supervised Temporal Action Localization Tasks? + + +
+ Recent breakthroughs in Multimodal Large Language Models (MLLMs) have gained +significant recognition within the deep learning community, where the fusion of +the Video Foundation Models (VFMs) and Large Language Models(LLMs) has proven +instrumental in constructing robust video understanding systems, effectively +surmounting constraints associated with predefined visual tasks. These +sophisticated MLLMs exhibit remarkable proficiency in comprehending videos, +swiftly attaining unprecedented performance levels across diverse benchmarks. +However, their operation demands substantial memory and computational +resources, underscoring the continued importance of traditional models in video +comprehension tasks. In this paper, we introduce a novel learning paradigm +termed MLLM4WTAL. This paradigm harnesses the potential of MLLM to offer +temporal action key semantics and complete semantic priors for conventional +Weakly-supervised Temporal Action Localization (WTAL) methods. MLLM4WTAL +facilitates the enhancement of WTAL by leveraging MLLM guidance. It achieves +this by integrating two distinct modules: Key Semantic Matching (KSM) and +Complete Semantic Reconstruction (CSR). These modules work in tandem to +effectively address prevalent issues like incomplete and over-complete outcomes +common in WTAL methods. Rigorous experiments are conducted to validate the +efficacy of our proposed approach in augmenting the performance of various +heterogeneous WTAL models. + +
+
+
+
+
+ + ☆ Trap-MID: Trapdoor-based Defense against Model Inversion Attacks NeurIPS + + +
+ Model Inversion (MI) attacks pose a significant threat to the privacy of Deep +Neural Networks by recovering training data distribution from well-trained +models. While existing defenses often rely on regularization techniques to +reduce information leakage, they remain vulnerable to recent attacks. In this +paper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to +mislead MI attacks. A trapdoor is integrated into the model to predict a +specific label when the input is injected with the corresponding trigger. +Consequently, this trapdoor information serves as the "shortcut" for MI +attacks, leading them to extract trapdoor triggers rather than private data. We +provide theoretical insights into the impacts of trapdoor's effectiveness and +naturalness on deceiving MI attacks. In addition, empirical experiments +demonstrate the state-of-the-art defense performance of Trap-MID against +various MI attacks without the requirements for extra data or large +computational overhead. Our source code is publicly available at +https://github.com/ntuaislab/Trap-MID. + +
+
+ comment: Accepted by Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Biomass phenotyping of oilseed rape through UAV multi-view oblique + imaging with 3DGS and SAM model + + +
+ Biomass estimation of oilseed rape is crucial for optimizing crop +productivity and breeding strategies. While UAV-based imaging has advanced +high-throughput phenotyping, current methods often rely on orthophoto images, +which struggle with overlapping leaves and incomplete structural information in +complex field environments. This study integrates 3D Gaussian Splatting (3DGS) +with the Segment Anything Model (SAM) for precise 3D reconstruction and biomass +estimation of oilseed rape. UAV multi-view oblique images from 36 angles were +used to perform 3D reconstruction, with the SAM module enhancing point cloud +segmentation. The segmented point clouds were then converted into point cloud +volumes, which were fitted to ground-measured biomass using linear regression. +The results showed that 3DGS (7k and 30k iterations) provided high accuracy, +with peak signal-to-noise ratios (PSNR) of 27.43 and 29.53 and training times +of 7 and 49 minutes, respectively. This performance exceeded that of structure +from motion (SfM) and mipmap Neural Radiance Fields (Mip-NeRF), demonstrating +superior efficiency. The SAM module achieved high segmentation accuracy, with a +mean intersection over union (mIoU) of 0.961 and an F1-score of 0.980. +Additionally, a comparison of biomass extraction models found the point cloud +volume model to be the most accurate, with a determination coefficient (R2) of +0.976, root mean square error (RMSE) of 2.92 g/plant, and mean absolute +percentage error (MAPE) of 6.81%, outperforming both the plot crop volume and +individual crop volume models. This study highlights the potential of combining +3DGS with multi-view UAV imaging for improved biomass phenotyping. + +
+
+
+
+
+ + ☆ AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference + Understanding + + +
+ Embodied reference understanding is crucial for intelligent agents to predict +referents based on human intention through gesture signals and language +descriptions. This paper introduces the Attention-Dynamic DINO, a novel +framework designed to mitigate misinterpretations of pointing gestures across +various interaction contexts. Our approach integrates visual and textual +features to simultaneously predict the target object's bounding box and the +attention source in pointing gestures. Leveraging the distance-aware nature of +nonverbal communication in visual perspective taking, we extend the virtual +touch line mechanism and propose an attention-dynamic touch line to represent +referring gesture based on interactive distances. The combination of this +distance-aware approach and independent prediction of the attention source, +enhances the alignment between objects and the gesture represented line. +Extensive experiments on the YouRefIt dataset demonstrate the efficacy of our +gesture information understanding method in significantly improving task +performance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and, +notably, surpasses human performance at the 0.75 IoU threshold, marking a first +in this domain. Comparative experiments with distance-unaware understanding +methods from previous research further validate the superiority of the +Attention-Dynamic Touch Line across diverse contexts. + +
+
+
+
+
+ + ☆ Machine Unlearning on Pre-trained Models by Residual Feature Alignment + Using LoRA + + +
+ Machine unlearning is a newly emerged technology that removes a subset of the +training data from a trained model without affecting the model performance on +the remaining data. This topic is becoming increasingly important in protecting +user privacy and eliminating harmful or outdated data. The key challenge lies +in effectively and efficiently unlearning specific information without +compromising the model's utility on the retained data. For the pre-trained +models, fine-tuning is an important way to achieve the unlearning target. +Previous work typically fine-tuned the entire model's parameters, which incurs +significant computation costs. In addition, the fine-tuning process may cause +shifts in the intermediate layer features, affecting the model's overall +utility. In this work, we propose a novel and efficient machine unlearning +method on pre-trained models. We term the method as Residual Feature Alignment +Unlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose +the model's intermediate features into pre-trained features and residual +features. By adjusting the residual features, we align the unlearned model with +the pre-trained model at the intermediate feature level to achieve both +unlearning and remaining targets. The method aims to learn the zero residuals +on the retained set and shifted residuals on the unlearning set. Extensive +experiments on numerous datasets validate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ A Heterogeneous Graph Neural Network Fusing Functional and Structural + Connectivity for MCI Diagnosis + + +
+ Brain connectivity alterations associated with brain disorders have been +widely reported in resting-state functional imaging (rs-fMRI) and diffusion +tensor imaging (DTI). While many dual-modal fusion methods based on graph +neural networks (GNNs) have been proposed, they generally follow homogenous +fusion ways ignoring rich heterogeneity of dual-modal information. To address +this issue, we propose a novel method that integrates functional and structural +connectivity based on heterogeneous graph neural networks (HGNNs) to better +leverage the rich heterogeneity in dual-modal images. We firstly use blood +oxygen level dependency and white matter structure information provided by +rs-fMRI and DTI to establish homo-meta-path, capturing node relationships +within the same modality. At the same time, we propose to establish +hetero-meta-path based on structure-function coupling and brain community +searching to capture relations among cross-modal nodes. Secondly, we further +introduce a heterogeneous graph pooling strategy that automatically balances +homo- and hetero-meta-path, effectively leveraging heterogeneous information +and preventing feature confusion after pooling. Thirdly, based on the +flexibility of heterogeneous graphs, we propose a heterogeneous graph data +augmentation approach that can conveniently address the sample imbalance issue +commonly seen in clinical diagnosis. We evaluate our method on ADNI-3 dataset +for mild cognitive impairment (MCI) diagnosis. Experimental results indicate +the proposed method is effective and superior to other algorithms, with a mean +classification accuracy of 93.3%. + +
+
+
+
+
+ + ☆ The VLLM Safety Paradox: Dual Ease in Jailbreak Attack and Defense + + +
+ The vulnerability of Vision Large Language Models (VLLMs) to jailbreak +attacks appears as no surprise. However, recent defense mechanisms against +these attacks have reached near-saturation performance on benchmarks, often +with minimal effort. This simultaneous high performance in both attack and +defense presents a perplexing paradox. Resolving it is critical for advancing +the development of trustworthy models. To address this research gap, we first +investigate why VLLMs are prone to these attacks. We then make a key +observation: existing defense mechanisms suffer from an \textbf{over-prudence} +problem, resulting in unexpected abstention even in the presence of benign +inputs. Additionally, we find that the two representative evaluation methods +for jailbreak often exhibit chance agreement. This limitation makes it +potentially misleading when evaluating attack strategies or defense mechanisms. +Beyond these empirical observations, another contribution of this work is +to repurpose the guardrails of LLMs on the shelf, as an effective alternative +detector prior to VLLM response. We believe these findings offer useful +insights to rethink the foundational development of VLLM safety with respect to +benchmark datasets, evaluation methods, and defense strategies. + +
+
+
+
+
+ + ☆ V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with + Denoising Diffusion + + +
+ Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D +object detection using LiDAR and camera data. However, these methods suffer +from performance degradation in adverse weather conditions. The weather-robust +4D radar provides Doppler and additional geometric information, raising the +possibility of addressing this challenge. To this end, we present V2X-R, the +first simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R +contains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point +clouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes. +Subsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for +3D object detection and implement it with various fusion strategies. To achieve +weather-robust detection, we additionally propose a Multi-modal Denoising +Diffusion (MDD) module in our fusion pipeline. MDD utilizes weather-robust 4D +radar feature as a condition to prompt the diffusion model to denoise noisy +LiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline +demonstrates superior performance in the V2X-R dataset. Over and above this, +our MDD module further improved the performance of basic fusion model by up to +5.73%/6.70% in foggy/snowy conditions with barely disrupting normal +performance. The dataset and code will be publicly available at: +https://github.com/ylwhxht/V2X-R. + +
+
+
+
+
+ + ☆ MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion + Prompt for Ultrasound Needle Tracking + + +
+ Ultrasound (US)-guided needle insertion is widely employed in percutaneous +interventions. However, providing feedback on the needle tip position via US +image presents challenges due to noise, artifacts, and the thin imaging plane +of US, which degrades needle features and leads to intermittent tip visibility. +In this paper, a Mamba-based US needle tracker MambaXCTrack utilizing +structured state space models cross-correlation (SSMX-Corr) and implicit motion +prompt is proposed, which is the first application of Mamba in US needle +tracking. The SSMX-Corr enhances cross-correlation by long-range modeling and +global searching of distant semantic features between template and search maps, +benefiting the tracking under noise and artifacts by implicitly learning +potential distant semantic cues. By combining with cross-map interleaved scan +(CIS), local pixel-wise interaction with positional inductive bias can also be +introduced to SSMX-Corr. The implicit low-level motion descriptor is proposed +as a non-visual prompt to enhance tracking robustness, addressing the +intermittent tip visibility problem. Extensive experiments on a dataset with +motorized needle insertion in both phantom and tissue samples demonstrate that +the proposed tracker outperforms other state-of-the-art trackers while ablation +studies further highlight the effectiveness of each proposed tracking module. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ EgoVid-5M: A Large-Scale Video-Action Dataset for Egocentric Video + Generation + + +
+ Video generation has emerged as a promising tool for world simulation, +leveraging visual data to replicate real-world environments. Within this +context, egocentric video generation, which centers on the human perspective, +holds significant potential for enhancing applications in virtual reality, +augmented reality, and gaming. However, the generation of egocentric videos +presents substantial challenges due to the dynamic nature of egocentric +viewpoints, the intricate diversity of actions, and the complex variety of +scenes encountered. Existing datasets are inadequate for addressing these +challenges effectively. To bridge this gap, we present EgoVid-5M, the first +high-quality dataset specifically curated for egocentric video generation. +EgoVid-5M encompasses 5 million egocentric video clips and is enriched with +detailed action annotations, including fine-grained kinematic control and +high-level textual descriptions. To ensure the integrity and usability of the +dataset, we implement a sophisticated data cleaning pipeline designed to +maintain frame consistency, action coherence, and motion smoothness under +egocentric conditions. Furthermore, we introduce EgoDreamer, which is capable +of generating egocentric videos driven simultaneously by action descriptions +and kinematic control signals. The EgoVid-5M dataset, associated action +annotations, and all data cleansing metadata will be released for the +advancement of research in egocentric video generation. + +
+
+ comment: Project Page: https://egovid.github.io/ +
+
+
+
+
+ + ☆ Multiscale Graph Construction Using Non-local Cluster Features + + +
+ This paper presents a multiscale graph construction method using both graph +and signal features. Multiscale graph is a hierarchical representation of the +graph, where a node at each level indicates a cluster in a finer resolution. To +obtain the hierarchical clusters, existing methods often use graph clustering; +however, they may ignore signal variations. As a result, these methods could +fail to detect the clusters having similar features on nodes. In this paper, we +consider graph and node-wise features simultaneously for multiscale clustering +of a graph. With given clusters of the graph, the clusters are merged +hierarchically in three steps: 1) Feature vectors in the clusters are +extracted. 2) Similarities among cluster features are calculated using optimal +transport. 3) A variable $k$-nearest neighbor graph (V$k$NNG) is constructed +and graph spectral clustering is applied to the V$k$NNG to obtain clusters at a +coarser scale. Additionally, the multiscale graph in this paper has +\textit{non-local} characteristics: Nodes with similar features are merged even +if they are spatially separated. In experiments on multiscale image and point +cloud segmentation, we demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ A Chinese Multi-label Affective Computing Dataset Based on Social Media + Network Users + + +
+ Emotion and personality are central elements in understanding human +psychological states. Emotions reflect an individual's subjective experiences, +while personality reveals relatively stable behavioral and cognitive patterns. +Existing affective computing datasets often annotate emotion and personality +traits separately, lacking fine-grained labeling of micro-emotions and emotion +intensity in both single-label and multi-label classifications. Chinese emotion +datasets are extremely scarce, and datasets capturing Chinese user personality +traits are even more limited. To address these gaps, this study collected data +from the major social media platform Weibo, screening 11,338 valid users from +over 50,000 individuals with diverse MBTI personality labels and acquiring +566,900 posts along with the users' MBTI personality tags. Using the EQN method, +we compiled a multi-label Chinese affective computing dataset that integrates +the same user's personality traits with six emotions and micro-emotions, each +annotated with intensity levels. Validation results across multiple NLP +classification models demonstrate the dataset's strong utility. This dataset is +designed to advance machine recognition of complex human emotions and provide +data support for research in psychology, education, marketing, finance, and +politics. + +
+
+
+
+
+ + ☆ DyConfidMatch: Dynamic Thresholding and Re-sampling for 3D + Semi-supervised Learning + + +
+ Semi-supervised learning (SSL) leverages limited labeled and abundant +unlabeled data but often faces challenges with data imbalance, especially in 3D +contexts. This study investigates class-level confidence as an indicator of +learning status in 3D SSL, proposing a novel method that utilizes dynamic +thresholding to better use unlabeled data, particularly from underrepresented +classes. A re-sampling strategy is also introduced to mitigate bias towards +well-represented classes, ensuring equitable class representation. Through +extensive experiments in 3D SSL, our method surpasses state-of-the-art +counterparts in classification and detection tasks, highlighting its +effectiveness in tackling data imbalance. This approach presents a significant +advancement in SSL for 3D datasets, providing a robust solution for data +imbalance issues. + +
+
+ comment: Accepted by Pattern Recognition Journal +
+
+
+
+
+ + ☆ DEEGITS: Deep Learning based Framework for Measuring Heterogenous + Traffic State in Challenging Traffic Scenarios + + +
+ This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State +Measurement), a comprehensive framework that leverages state-of-the-art +convolutional neural network (CNN) techniques to accurately and rapidly detect +vehicles and pedestrians, as well as to measure traffic states in challenging +scenarios (i.e., congestion, occlusion). In this study, we enhance the training +dataset through data fusion, enabling simultaneous detection of vehicles and +pedestrians. Image preprocessing and augmentation are subsequently performed to +improve the quality and quantity of the dataset. Transfer learning is applied +on the YOLOv8 pretrained model to increase the model's capability to identify a +diverse array of vehicles. Optimal hyperparameters are obtained using the Grid +Search algorithm, with the Stochastic Gradient Descent (SGD) optimizer +outperforming other optimizers under these settings. Extensive experimentation +and evaluation demonstrate substantial accuracy within the detection framework, +with the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5 +on the test set, surpassing previous benchmarks on similar datasets. The +DeepSORT multi-object tracking algorithm is incorporated to track detected +vehicles and pedestrians in this study. Finally, the framework is tested to +measure heterogeneous traffic states in mixed traffic conditions. Two locations +with differing traffic compositions and congestion levels are selected: one +motorized-dominant location with moderate density and one +non-motorized-dominant location with higher density. Errors are statistically +insignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91 +to 0.97 for heterogeneous traffic flow and speed measurements, respectively. + +
+
+ comment: Submitted for presentation at the 103rd Annual Meeting of + Transportation Research Board and publication in Transportation Research + Record: Journal of Transportation Research Board +
+
+
+
+
+ + ☆ Enhancing Multimodal Query Representation via Visual Dialogues for + End-to-End Knowledge Retrieval + + +
+ Existing multimodal retrieval systems often rely on disjointed models for +image comprehension, such as object detectors and caption generators, leading +to cumbersome implementations and training processes. To overcome this +limitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a +text retriever with the ability to understand multimodal queries via dynamic +modality interaction. Ret-XKnow leverages a partial convolution mechanism to +focus on visual information relevant to the given textual query, thereby +enhancing multimodal query representations. To effectively learn multimodal +interaction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset +automatically constructed from visual dialogue datasets. Our dataset +construction process ensures that the dialogues are transformed into suitable +information retrieval tasks using a text retriever. We demonstrate that our +approach not only significantly improves retrieval performance in zero-shot +settings but also achieves substantial improvements in fine-tuning scenarios. +Our code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow. + +
+
+
+
+
+ + ☆ SASE: A Searching Architecture for Squeeze and Excitation Operations + + +
+ In the past few years, channel-wise and spatial-wise attention blocks have +been widely adopted as supplementary modules in deep neural networks, enhancing +network representational abilities while introducing low complexity. Most +attention modules follow a squeeze-and-excitation paradigm. However, designing +such attention modules requires a substantial amount of experiments and +computational resources. Neural Architecture Search (NAS), meanwhile, is able +to automate the design of neural networks and spares the numerous experiments +required for an optimal architecture. This motivates us to design a search +architecture that can automatically find near-optimal attention modules through +NAS. We propose SASE, a Searching Architecture for Squeeze and Excitation +operations, to form a plug-and-play attention block by searching within a certain +search space. The search space is separated into 4 different sets, each +corresponding to the squeeze or excitation operation along the channel or spatial +dimension. Additionally, the search sets include not only existing attention +blocks but also other operations that have not been utilized in attention +mechanisms before. To the best of our knowledge, SASE is the first attempt to +subdivide the attention search space and search for architectures beyond +currently known attention modules. The searched attention module is tested with +extensive experiments across a range of visual tasks. Experimental results +indicate that visual backbone networks (ResNet-50/101) using the SASE attention +module achieved the best performance compared to those using the current +state-of-the-art attention modules. Codes are included in the supplementary +material, and they will be made public later. + +
+
+
+
+
+ + ☆ Motion Control for Enhanced Complex Action Video Generation + + +
+ Existing text-to-video (T2V) models often struggle with generating videos +with sufficiently pronounced or complex actions. A key limitation lies in the +text prompt's inability to precisely convey intricate motion details. To +address this, we propose a novel framework, MVideo, designed to produce +long-duration videos with precise, fluid actions. MVideo overcomes the +limitations of text prompts by incorporating mask sequences as an additional +motion condition input, providing a clearer, more accurate representation of +intended actions. Leveraging foundational vision models such as GroundingDINO +and SAM2, MVideo automatically generates mask sequences, enhancing both +efficiency and robustness. Our results demonstrate that, after training, MVideo +effectively aligns text prompts with motion conditions to produce videos that +simultaneously meet both criteria. This dual control mechanism allows for more +dynamic video generation by enabling alterations to either the text prompt or +motion condition independently, or both in tandem. Furthermore, MVideo supports +motion condition editing and composition, facilitating the generation of videos +with more complex actions. MVideo thus advances T2V motion generation, setting +a strong benchmark for improved action depiction in current video diffusion +models. Our project page is available at https://mvideo-v1.github.io/. + +
+
+ comment: Project page: https://mvideo-v1.github.io/ +
+
+
+
+
+ + ☆ Robust Divergence Learning for Missing-Modality Segmentation + + +
+ Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary +information for analyzing brain tumor subregions. While methods using four +common MRI modalities for automatic segmentation have shown success, they often +face challenges with missing modalities due to image quality issues, +inconsistent protocols, allergic reactions, or cost factors. Thus, developing a +segmentation paradigm that handles missing modalities is clinically valuable. A +novel single-modality parallel processing network framework based on Hölder +divergence and mutual information is introduced. Each modality is independently +input into a shared network backbone for parallel processing, preserving unique +information. Additionally, a dynamic sharing framework is introduced that +adjusts network parameters based on modality availability. Hölder +divergence and mutual information-based loss functions are used for evaluating +discrepancies between predictions and labels. Extensive testing on the BraTS +2018 and BraTS 2020 datasets demonstrates that our method outperforms existing +techniques in handling missing modalities and validates each component's +effectiveness. + +
+
+
+
+
+ + ☆ Choix d'un espace de représentation image adapté à la détection + de réseaux routiers + + +
+ In recent years, algorithms that decompose an image into its +structure and texture components have emerged. In this paper, we present an +application of this type of decomposition to the problem of road network detection +in aerial or satellite imagery. The algorithmic procedure involves the image +decomposition (using a unique property), an alignment detection step based on +the Gestalt theory, and a refinement step using statistical active contours. + +
+
+ comment: in French language +
+
+
+
+
+ + ☆ Noisy image decomposition: a new structure, texture and noise model + based on local adaptivity + + +
+ These last few years, image decomposition algorithms have been proposed to +split an image into two parts: the structures and the textures. These +algorithms are not adapted to the case of noisy images because the textures are +corrupted by noise. In this paper, we propose a new model which decomposes an +image into three parts (structures, textures and noise) based on a local +regularization scheme. We compare our results with the recent work of Aujol and +Chambolle. We finish by giving another model which combines the advantages of +the two previous ones. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2411.05265 +
+
+
+
+
+ + ☆ Restoration algorithms and system performance evaluation for active + imagers + + +
+ This paper deals with two fields related to active imaging systems. First, we +begin to explore image processing algorithms to correct artefacts such as +speckle, scintillation and image dancing caused by atmospheric turbulence. +Next, we examine how to evaluate the performance of this kind of system. To do +this, we propose a modified version of the German TRM3 metric which +yields MTF-like measures. We use the database acquired during NATO-TG40 +field trials to run our tests. + +
+
+
+
+
+ + ☆ MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields + Representation + + +
+ Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and +3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in +Simultaneous Localization and Mapping (SLAM) for photo-realistic rendering, +particularly when using high-quality video sequences as input. However, +existing methods struggle with motion-blurred frames, which are common in +real-world scenarios like low-light or long-exposure conditions. This often +results in a significant reduction in both camera localization accuracy and map +reconstruction quality. To address this challenge, we propose a dense visual +SLAM pipeline (i.e. MBA-SLAM) to handle severe motion-blurred inputs. Our +approach integrates an efficient motion blur-aware tracker with either neural +radiance fields or Gaussian Splatting based mapper. By accurately modeling the +physical image formation process of motion-blurred images, our method +simultaneously learns 3D scene representation and estimates the cameras' local +trajectory during exposure time, enabling proactive compensation for motion +blur caused by camera movement. In our experiments, we demonstrate that +MBA-SLAM surpasses previous state-of-the-art methods in both camera +localization and map reconstruction, showcasing superior performance across a +range of datasets, including synthetic and real datasets featuring sharp images +as well as those affected by motion blur, highlighting the versatility and +robustness of our approach. Code is available at +https://github.com/WU-CVGL/MBA-SLAM. + +
+
+
+
+
+ + ☆ LBONet: Supervised Spectral Descriptors for Shape Analysis + + +
+ The Laplace-Beltrami operator has established itself in the field of +non-rigid shape analysis due to its many useful properties such as being +invariant under isometric transformation, having a countable eigensystem +forming an orthonormal basis, and fully characterizing geodesic distances of +the manifold. However, this invariancy only applies under isometric +deformations, which leads to a performance breakdown in many real-world +applications. In recent years emphasis has been placed upon extracting optimal +features using deep learning methods, however spectral signatures play a +crucial role and still add value. In this paper we take a step back, revisiting +the LBO and proposing a supervised way to learn several operators on a +manifold. Depending on the task, by applying these functions, we can train the +LBO eigenbasis to be more task-specific. The optimization of the LBO leads to +enormous improvements to established descriptors such as the heat kernel +signature in various tasks such as retrieval, classification, segmentation, and +correspondence, proving the adaption of the LBO eigenbasis to both global and +highly local learning settings. + +
+
+ comment: 14 pages, 13 figures +
+
+
+
+
+ + ☆ Drone Detection using Deep Neural Networks Trained on Pure Synthetic + Data + + +
+ Drone detection has benefited from improvements in deep neural networks, but +like many other applications, suffers from the limited availability of accurate data +for training. Synthetic data provides a potential for low-cost data generation +and has been shown to improve data availability and quality. However, models +trained on synthetic datasets need to prove their ability to perform on +real-world data, known as the problem of sim-to-real transferability. Here, we +present a drone detection Faster-RCNN model trained on a purely synthetic +dataset that transfers to real-world data. We found that it achieves an AP_50 +of 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones - +compared with 97.8% for an equivalent model trained on real-world data. Our +results show that using synthetic data for drone detection has the potential to +reduce data collection costs and improve labelling quality. These findings +could be a starting point for more elaborate synthetic drone datasets. For +example, realistic recreations of specific scenarios could de-risk the dataset +generation of safety-critical applications such as the detection of drones at +airports. Further, synthetic data may enable reliable drone detection systems, +which could benefit other areas, such as unmanned traffic management systems. +The code is available at +https://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the +datasets at +https://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ A multidimensional measurement of photorealistic avatar quality of + experience + + +
+ Photorealistic avatars are human avatars that look, move, and talk like real +people. The performance of photorealistic avatars has significantly improved +recently based on objective metrics such as PSNR, SSIM, LPIPS, FID, and FVD. +However, recent photorealistic avatar publications do not provide subjective +tests of the avatars to measure human usability factors. We provide an open +source test framework to subjectively measure photorealistic avatar performance +in ten dimensions: realism, trust, comfortableness using, comfortableness +interacting with, appropriateness for work, creepiness, formality, affinity, +resemblance to the person, and emotion accuracy. We show that the correlation +of nine of these subjective metrics with PSNR, SSIM, LPIPS, FID, and FVD is +weak, and moderate for emotion accuracy. The crowdsourced subjective test +framework is highly reproducible and accurate when compared to a panel of +experts. We analyze a wide range of avatars from photorealistic to cartoon-like +and show that some photorealistic avatars are approaching real video +performance based on these dimensions. We also find that for avatars above a +certain level of realism, eight of these measured dimensions are strongly +correlated. In particular, for photorealistic avatars there is a linear +relationship between avatar affinity and realism; in other words, there is no +uncanny valley effect for photorealistic avatars in the telecommunication +scenario. We provide several extensions of this test framework for future work +and discuss design implications for telecommunication systems. The test +framework is available at https://github.com/microsoft/P.910. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2204.06784 +
+
+
+
+
+ + ☆ Multimodal Object Detection using Depth and Image Data for Manufacturing + Parts + + +
+ Manufacturing requires reliable object detection methods for precise picking +and handling of diverse types of manufacturing parts and components. +Traditional object detection methods utilize either only 2D images from cameras +or 3D data from lidars or similar 3D sensors. However, each of these sensors +has weaknesses and limitations. Cameras do not have depth perception and 3D +sensors typically do not carry color information. These weaknesses can +undermine the reliability and robustness of industrial manufacturing systems. +To address these challenges, this work proposes a multi-sensor system combining +a red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are +calibrated for precise alignment of the multimodal data captured from the two +hardware devices. A novel multimodal object detection method is developed to +process both RGB and depth data. This object detector is based on the Faster +R-CNN baseline that was originally designed to process only camera images. The +results show that the multimodal model significantly outperforms the depth-only +and RGB-only baselines on established object detection metrics. More +specifically, the multimodal model improves mAP by 13% and raises Mean +Precision by 11.8% in comparison to the RGB-only baseline. Compared to the +depth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%. +Hence, this method facilitates more reliable and robust object detection in +service to smart manufacturing applications. + +
+
+
+
+
+ + ☆ A Transformer-Based Visual Piano Transcription Algorithm + + +
+ Automatic music transcription (AMT) for musical performances is a long +standing problem in the field of Music Information Retrieval (MIR). Visual +piano transcription (VPT) is a multimodal subproblem of AMT which focuses on +extracting a symbolic representation of a piano performance from visual +information only (e.g., from a top-down video of the piano keyboard). Inspired +by the success of Transformers for audio-based AMT, as well as their recent +successes in other computer vision tasks, in this paper we present a +Transformer based architecture for VPT. The proposed VPT system combines a +piano bounding box detection model with an onset and pitch detection model, +allowing our system to perform well in more naturalistic conditions like +imperfect image crops around the piano and slightly tilted images. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ CoMiX: Cross-Modal Fusion with Deformable Convolutions for HSI-X + Semantic Segmentation + + +
+ Improving hyperspectral image (HSI) semantic segmentation by exploiting +complementary information from a supplementary data type (referred to as +X-modality) is promising but challenging due to differences in imaging sensors, +image content, and resolution. Current techniques struggle to enhance +modality-specific and modality-shared information, as well as to capture +dynamic interaction and fusion between different modalities. In response, this +study proposes CoMiX, an asymmetric encoder-decoder architecture with +deformable convolutions (DCNs) for HSI-X semantic segmentation. CoMiX is +designed to extract, calibrate, and fuse information from HSI and X data. Its +pipeline includes an encoder with two parallel and interacting backbones and a +lightweight all-multilayer perceptron (ALL-MLP) decoder. The encoder consists +of four stages, each incorporating 2D DCN blocks for the X model to accommodate +geometric variations and 3D DCN blocks for HSIs to adaptively aggregate +spatial-spectral features. Additionally, each stage includes a Cross-Modality +Feature enhancement and eXchange (CMFeX) module and a feature fusion module +(FFM). CMFeX is designed to exploit spatial-spectral correlations from +different modalities to recalibrate and enhance modality-specific and +modality-shared features while adaptively exchanging complementary information +between them. Outputs from CMFeX are fed into the FFM for fusion and passed to +the next stage for further information learning. Finally, the outputs from each +FFM are integrated by the ALL-MLP decoder for final prediction. Extensive +experiments demonstrate that our CoMiX achieves superior performance and +generalizes well to various multimodal recognition tasks. The CoMiX code will +be released. + +
+
+
+
+
+ + ☆ Bridging the Visual Gap: Fine-Tuning Multimodal Models with + Knowledge-Adapted Captions + + +
+ Recent research increasingly focuses on training vision-language models +(VLMs) with long, detailed image captions. However, small-scale VLMs often +struggle to balance the richness of these captions with the risk of +hallucinating content during fine-tuning. In this paper, we explore how well +VLMs adapt to such captions. To quantify caption quality, we propose Decomposed +NLI (DNLI), an evaluation framework that breaks down generated captions into +individual propositions, assessing each in isolation. This fine-grained +analysis reveals a critical balance between capturing descriptive details and +preventing hallucinations. Our findings show that simply reducing caption +complexity or employing standard data curation techniques does not effectively +resolve this issue. To tackle this challenge, we introduce Knowledge Adapted +(KnowAda) fine-tuning, a data-centric approach that automatically adapts +training data with the model's existing knowledge and visual understanding. +KnowAda minimizes hallucinations while preserving high descriptiveness. We +validate this approach across several small-scale VLMs (up to 7B parameters) +and dense caption datasets, demonstrating that KnowAda effectively balances +hallucination reduction and descriptiveness. Our results show that KnowAda +outperforms various baselines in both automatic metrics and human evaluations. +We will release our code and models. + +
+
+
+
+
+ + ☆ Scale Contrastive Learning with Selective Attentions for Blind Image + Quality Assessment + + +
+ Blind image quality assessment (BIQA) serves as a fundamental task in +computer vision, yet it often fails to consistently align with human subjective +perception. Recent advances show that multi-scale evaluation strategies are +promising due to their ability to replicate the hierarchical structure of human +vision. However, the effectiveness of these strategies is limited by a lack of +understanding of how different image scales influence perceived quality. This +paper addresses two primary challenges: the significant redundancy of +information across different scales, and the confusion caused by combining +features from these scales, which may vary widely in quality. To this end, a +new multi-scale BIQA framework is proposed, namely Contrast-Constrained +Scale-Focused IQA Framework (CSFIQA). CSFIQA features a selective focus +attention mechanism to minimize information redundancy and highlight critical +quality-related information. Additionally, CSFIQA includes a scale-level +contrastive learning module equipped with a noise sample matching mechanism to +identify quality discrepancies across the same image content at different +scales. By exploring the intrinsic relationship between image scales and the +perceived quality, the proposed CSFIQA achieves leading performance on eight +benchmark datasets, e.g., achieving SRCC values of 0.967 (versus 0.947 in CSIQ) +and 0.905 (versus 0.876 in LIVEC). + +
+
+
+
+
+ + ☆ Computed tomography using meta-optics + + +
+ Computer vision tasks require processing large amounts of data to perform +image classification, segmentation, and feature extraction. Optical +preprocessors can potentially reduce the number of floating point operations +required by computer vision tasks, enabling low-power and low-latency +operation. However, existing optical preprocessors are mostly learned and hence +strongly depend on the training data, and thus lack universal applicability. In +this paper, we present a metaoptic imager, which implements the Radon transform +obviating the need for training the optics. High quality image reconstruction +with a large compression ratio of 0.6% is presented through the use of the +Simultaneous Algebraic Reconstruction Technique. Image classification with 90% +accuracy is presented on an experimentally measured Radon dataset through a +neural network trained on digitally transformed images. + +
+
+
+
+
+ + ☆ IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis + + +
+ We present a new annotated microscopic cellular image dataset to improve the +effectiveness of machine learning methods for cellular image analysis. Cell +counting is an important step in cell analysis. Typically, domain experts +manually count cells in a microscopic image. Automated cell counting can +potentially eliminate this tedious, time-consuming process. However, a good, +labeled dataset is required for training an accurate machine learning model. +Our dataset includes microscopic images of cells, and for each image, the cell +count and the location of individual cells. The data were collected as part of +an ongoing study investigating the potential of electrical stimulation to +modulate stem cell differentiation and possible applications for neural repair. +Compared to existing publicly available datasets, our dataset has more images +of cells stained with more variety of antibodies (protein components of immune +responses against invaders) typically used for cell analysis. The experimental +results on this dataset indicate that none of the five existing models under +this study are able to achieve sufficiently accurate count to replace the +manual methods. The dataset is available at +https://figshare.com/articles/dataset/Dataset/21970604. + +
+
+
+
+
+ + ☆ Fluoroformer: Scaling multiple instance learning to multiplexed images + via attention-based channel fusion ML4H + + +
+ Though multiple instance learning (MIL) has been a foundational strategy in +computational pathology for processing whole slide images (WSIs), current +approaches are designed for traditional hematoxylin and eosin (H&E) slides +rather than emerging multiplexed technologies. Here, we present an MIL +strategy, the Fluoroformer module, that is specifically tailored to multiplexed +WSIs by leveraging scaled dot-product attention (SDPA) to interpretably fuse +information across disparate channels. On a cohort of 434 non-small cell lung +cancer (NSCLC) samples, we show that the Fluoroformer both obtains strong +prognostic performance and recapitulates immuno-oncological hallmarks of NSCLC. +Our technique thereby provides a path for adapting state-of-the-art AI +techniques to emerging spatial biology assays. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 14 pages +
+
+
+
+
+ + ♻ ☆ Scaling Properties of Diffusion Models for Perceptual Tasks + + +
+ In this paper, we argue that iterative computation with diffusion models +offers a powerful paradigm for not only generation but also visual perception +tasks. We unify tasks such as depth estimation, optical flow, and amodal +segmentation under the framework of image-to-image translation, and show how +diffusion models benefit from scaling training and test-time compute for these +perceptual tasks. Through a careful analysis of these scaling properties, we +formulate compute-optimal training and inference recipes to scale diffusion +models for visual perception tasks. Our models achieve competitive performance +to state-of-the-art methods using significantly less data and compute. To +access our code and models, see https://scaling-diffusion-perception.github.io . + +
+
+
+
+
+ + ♻ ☆ Forensic Iris Image-Based Post-Mortem Interval Estimation + + +
+ Post-mortem iris recognition is an emerging application of iris-based human +identification in a forensic setup. One factor that may be useful in +conditioning iris recognition methods is the tissue decomposition level, which +is correlated with the post-mortem interval (PMI), i.e., the number of hours that +have elapsed since death. PMI, however, is not always available, and its +precise estimation remains one of the core challenges in forensic examination. +This paper presents the first method known to us for PMI estimation directly +from iris images captured after death. To assess the feasibility of the +iris-based PMI estimation, we designed models predicting the PMI from (a) +near-infrared (NIR), (b) visible (RGB), and (c) multispectral (RGB+NIR) +forensic iris images. Models were evaluated following a 10-fold +cross-validation, in (S1) sample-disjoint, (S2) subject-disjoint, and (S3) +cross-dataset scenarios. We explore two data balancing techniques for S3: +resampling-based balancing (S3-real), and synthetic data-supplemented balancing +(S3-synthetic). We found that using the multispectral data offers a +spectacularly low mean absolute error (MAE) of $\approx 3.5$ hours in the +scenario (S1), a somewhat worse MAE of $\approx 17.5$ hours in the scenario (S2), and +MAE $\approx 45.77$ hours in the scenario (S3). Additionally, supplementing the +training set with synthetically-generated forensic iris images (S3-synthetic) +significantly enhances the models' ability to generalize to new NIR, RGB and +multispectral data collected in a different lab. This suggests that if the +environmental conditions are favorable (e.g., bodies are kept in low +temperatures), forensic iris images provide features that are indicative of the +PMI and can be automatically estimated. + +
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +distorted foreground objects when applied to images with foreground elements such +as human figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
+
+ comment: Accepted by 2024 5th International Conference on Computer Vision, + Image and Deep Learning +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ GaussianObject: High-Quality 3D Object Reconstruction from Four Views + with Gaussian Splatting SIGGRAPH + + +
+ Reconstructing and rendering 3D objects from highly sparse views is of +critical importance for promoting applications of 3D vision techniques and +improving user experience. However, images from sparse views only contain very +limited 3D information, leading to two significant challenges: 1) Difficulty in +building multi-view consistency as images for matching are too few; 2) +Partially omitted or highly compressed object information as view coverage is +insufficient. To tackle these challenges, we propose GaussianObject, a +framework to represent and render the 3D object with Gaussian splatting that +achieves high rendering quality with only 4 input images. We first introduce +techniques of visual hull and floater elimination, which explicitly inject +structure priors into the initial optimization process to help build multi-view +consistency, yielding a coarse 3D Gaussian representation. Then we construct a +Gaussian repair model based on diffusion models to supplement the omitted +object information, where Gaussians are further refined. We design a +self-generating strategy to obtain image pairs for training the repair model. +We further design a COLMAP-free variant, where pre-given accurate camera poses +are not required, which achieves competitive quality and facilitates wider +applications. GaussianObject is evaluated on several challenging datasets, +including MipNeRF360, OmniObject3D, OpenIllumination, and our-collected unposed +images, achieving superior performance from only four views and significantly +outperforming previous SOTA methods. Our demo is available at +https://gaussianobject.github.io/, and the code has been released at +https://github.com/GaussianObject/GaussianObject. + +
+
+ comment: ACM Transactions on Graphics (SIGGRAPH Asia 2024). Project page: + https://gaussianobject.github.io/ Code: + https://github.com/chensjtu/GaussianObject +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from over-reliance on unimodal biases (e.g., language bias +and vision bias), leading to incorrect answers or hallucinations in complex +multimodal tasks. To investigate this issue, we propose a causal framework to +interpret the biases in Visual Question Answering (VQA) problems. Within this +framework, we conduct an in-depth causal analysis to assess the causal effect +of these biases on MLLM predictions. Based on the analysis, we introduce 1) a +novel MORE dataset with 12,000 challenging VQA instances requiring multi-hop +reasoning and overcoming unimodal biases. 2) a causality-enhanced agent +framework CAVE that guides models to comprehensively integrate information from +different modalities and mitigate biases. Our experiments show that MLLMs +perform poorly on MORE, indicating strong unimodal biases and limited semantic +understanding. However, when integrated with our CAVE, promising improvements +in reasoning and bias mitigation can be seen. These findings provide important +insights for the development of more robust MLLMs and contribute to the broader +goal of advancing multimodal AI systems capable of deeper understanding and +reasoning. Our project page is at https://github.com/OpenCausaLab/MORE. + +
+
+
+
+
+ + ♻ ☆ Textured-GS: Gaussian Splatting with Spatially Defined Color and Opacity + + +
+ In this paper, we introduce Textured-GS, an innovative method for rendering +Gaussian splatting that incorporates spatially defined color and opacity +variations using Spherical Harmonics (SH). This approach enables each Gaussian +to exhibit a richer representation by accommodating varying colors and +opacities across its surface, significantly enhancing rendering quality +compared to traditional methods. To demonstrate the merits of our approach, we +have adapted the Mini-Splatting architecture to integrate textured Gaussians +without increasing the number of Gaussians. Our experiments across multiple +real-world datasets show that Textured-GS consistently outperforms both the +baseline Mini-Splatting and standard 3DGS in terms of visual fidelity. The +results highlight the potential of Textured-GS to advance Gaussian-based +rendering technologies, promising more efficient and high-quality scene +reconstructions. Our implementation is available at +https://github.com/ZhentaoHuang/Textured-GS. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity + Synthesis of MR Images with Structure Preservation + + +
+ Synthesizing medical images while preserving their structural information is +crucial in medical research. In such scenarios, the preservation of anatomical +content becomes especially important. Although recent advances have been made +by incorporating instance-level information to guide translation, these methods +overlook the spatial coherence of structural-level representation and the +anatomical invariance of content during translation. To address these issues, +we introduce hierarchical granularity discrimination, which exploits various +levels of semantic information present in medical images. Our strategy utilizes +three levels of discrimination granularity: pixel-level discrimination using a +Brain Memory Bank, structure-level discrimination on each brain structure with +a re-weighting strategy to focus on hard samples, and global-level +discrimination to ensure anatomical consistency during translation. The image +translation performance of our strategy has been evaluated on three independent +datasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed +state-of-the-art algorithms. Particularly, our model excels not only in +synthesizing normal structures but also in handling abnormal (pathological) +structures, such as brain tumors, despite the variations in contrast observed +across different imaging modalities due to their pathological characteristics. +The diagnostic value of synthesized MR images containing brain tumors has been +evaluated by radiologists. This indicates that our model may offer an +alternative solution in scenarios where specific MR modalities of patients are +unavailable. Extensive experiments further demonstrate the versatility of our +method, providing unique insights into medical image translation. + +
+
+
+
+
+ + ♻ ☆ BoQ: A Place is Worth a Bag of Learnable Queries CVPR 2024 + + +
+ In visual place recognition, accurately identifying and matching images of +locations under varying environmental conditions and viewpoints remains a +significant challenge. In this paper, we introduce a new technique, called +Bag-of-Queries (BoQ), which learns a set of global queries designed to capture +universal place-specific attributes. Unlike existing methods that employ +self-attention and generate the queries directly from the input features, BoQ +employs distinct learnable global queries, which probe the input features via +cross-attention, ensuring consistent information aggregation. In addition, our +technique provides an interpretable attention mechanism and integrates with +both CNN and Vision Transformer backbones. The performance of BoQ is +demonstrated through extensive experiments on 14 large-scale benchmarks. It +consistently outperforms current state-of-the-art techniques including NetVLAD, +MixVPR and EigenPlaces. Moreover, as a global retrieval technique (one-stage), +BoQ surpasses two-stage retrieval methods, such as Patch-NetVLAD, TransVPR and +R2Former, all while being orders of magnitude faster and more efficient. The +code and model weights are publicly available at +https://github.com/amaralibey/Bag-of-Queries. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Economists + + +
+ Deep learning provides powerful methods to impute structured information from +large-scale, unstructured text and image datasets. For example, economists +might wish to detect the presence of economic activity in satellite images, or +to measure the topics or entities mentioned in social media, the congressional +record, or firm filings. This review introduces deep neural networks, covering +methods such as classifiers, regression models, generative AI, and embedding +models. Applications include classification, document digitization, record +linkage, and methods for data exploration in massive scale text and image +corpora. When suitable methods are used, deep learning models can be cheap to +tune and can scale affordably to problems involving millions or billions of +data points. The review is accompanied by a companion website, EconDL, with +user-friendly demo notebooks, software resources, and a knowledge base that +provides technical details and additional applications. + +
+
+
+
+
+ + ♻ ☆ Into the Fog: Evaluating Robustness of Multiple Object Tracking + + +
+ State-of-the-art Multiple Object Tracking (MOT) approaches have shown +remarkable performance when trained and evaluated on current benchmarks. +However, these benchmarks primarily consist of clear weather scenarios, +overlooking adverse atmospheric conditions such as fog, haze, smoke and dust. +As a result, the robustness of trackers against these challenging conditions +remains underexplored. To address this gap, we introduce a physics-based +volumetric fog simulation method for arbitrary MOT datasets, utilizing +frame-by-frame monocular depth estimation and a fog formation optical model. We +enhance our simulation by rendering both homogeneous and heterogeneous fog and +propose to use the dark channel prior method to estimate atmospheric light, +showing promising results even in night and indoor scenes. We present the +leading benchmark MOTChallenge (third release) augmented with fog (smoke for +indoor scenes) of various intensities and conduct a comprehensive evaluation of +MOT methods, revealing their limitations under fog and fog-like challenges. + +
+
+
+
+
+ + ♻ ☆ Optimal Transport on the Lie Group of Roto-translations + + +
+ The roto-translation group SE2 has been of active interest in image analysis +due to methods that lift the image data to multi-orientation representations +defined on this Lie group. This has led to impactful applications of +crossing-preserving flows for image de-noising, geodesic tracking, and +roto-translation equivariant deep learning. In this paper, we develop a +computational framework for optimal transportation over Lie groups, with a +special focus on SE2. We make several theoretical contributions (generalizable +to matrix Lie groups) such as the non-optimality of group actions as transport +maps, invariance and equivariance of optimal transport, and the quality of the +entropic-regularized optimal transport plan using geodesic distance +approximations. We develop a Sinkhorn-like algorithm that can be efficiently +implemented using fast and accurate distance approximations of the Lie group +and GPU-friendly group convolutions. We report valuable advancements in the +experiments on 1) image barycentric interpolation, 2) interpolation of planar +orientation fields, and 3) Wasserstein gradient flows on SE2. We observe that +our framework of lifting images to SE2 and optimal transport with +left-invariant anisotropic metrics leads to equivariant transport along +dominant contours and salient line structures in the image. This yields sharper +and more meaningful interpolations compared to their counterparts on R^2. + +
+
+
+
+
+ + ♻ ☆ Extracting polygonal footprints in off-nadir images with Segment + Anything Model + + +
+ Building Footprint Extraction (BFE) from off-nadir aerial images often +involves roof segmentation and offset prediction to adjust roof boundaries to +the building footprint. However, this multi-stage approach typically produces +low-quality results, limiting its applicability in real-world data production. +To address this issue, we present OBMv2, an end-to-end and promptable model for +polygonal footprint prediction. Unlike its predecessor OBM, OBMv2 introduces a +novel Self Offset Attention (SOFA) mechanism that improves performance across +diverse building types, from bungalows to skyscrapers, enabling end-to-end +footprint prediction without post-processing. Additionally, we propose a +Multi-level Information System (MISS) to effectively leverage roof masks, +building masks, and offsets for accurate footprint prediction. We evaluate +OBMv2 on the BONAI and OmniCity-view3 datasets and demonstrate its +generalization on the Huizhou test set. The code will be available at +https://github.com/likaiucas/OBMv2. + +
+
+
+
+
+ + ♻ ☆ V-LoL: A Diagnostic Dataset for Visual Logical Learning + + +
+ Despite the successes of recent developments in visual AI, different +shortcomings still exist; from missing exact logical reasoning, to abstract +generalization abilities, to understanding complex and noisy scenes. +Unfortunately, existing benchmarks were not designed to capture more than a +few of these aspects. Whereas deep learning datasets focus on visually complex +data but simple visual reasoning tasks, inductive logic datasets involve +complex logical learning tasks, however, lack the visual component. To address +this, we propose the diagnostic visual logical learning dataset, V-LoL, that +seamlessly combines visual and logical challenges. Notably, we introduce the +first instantiation of V-LoL, V-LoL-Train - a visual rendition of a classic +benchmark in symbolic AI, the Michalski train problem. By incorporating +intricate visual scenes and flexible logical reasoning tasks within a versatile +framework, V-LoL-Train provides a platform for investigating a wide range of +visual logical learning challenges. We evaluate a variety of AI systems +including traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our +evaluations demonstrate that even SOTA AI faces difficulties in dealing with +visual logical learning challenges, highlighting unique advantages and +limitations of each methodology. Overall, V-LoL opens up new avenues for +understanding and enhancing current abilities in visual logical learning for AI +systems. + +
+
+
+
+
+ + ♻ ☆ ASTM: Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50% and reduce vehicle pass delays by 70%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ VoxelKeypointFusion: Generalizable Multi-View Multi-Person Pose + Estimation + + +
+ In the rapidly evolving field of computer vision, the task of accurately +estimating the poses of multiple individuals from various viewpoints presents a +formidable challenge, especially if the estimations should be reliable as well. +This work presents an extensive evaluation of the generalization capabilities +of multi-view multi-person pose estimators to unseen datasets and presents a +new algorithm with strong performance in this task. It also studies the +improvements by additionally using depth information. Since the new approach +can not only generalize well to unseen datasets, but also to different +keypoints, the first multi-view multi-person whole-body estimator is presented. +To support further research on those topics, all of the work is publicly +accessible. + +
+
+
+
+
+ + ♻ ☆ SLYKLatent: A Learning Framework for Gaze Estimation Using Deep Facial + Feature Learning + + +
+ In this research, we present SLYKLatent, a novel approach for enhancing gaze +estimation by addressing appearance instability challenges in datasets due to +aleatoric uncertainties, covariant shifts, and test domain generalization. +SLYKLatent utilizes Self-Supervised Learning for initial training with facial +expression datasets, followed by refinement with a patch-based tri-branch +network and an inverse explained variance-weighted training loss function. Our +evaluation on benchmark datasets achieves a 10.9% improvement on Gaze360, +supersedes top MPIIFaceGaze results with 3.8%, and leads on a subset of +ETH-XGaze by 11.6%, surpassing existing methods by significant margins. +Adaptability tests on RAF-DB and Affectnet show 86.4% and 60.9% accuracies, +respectively. Ablation studies confirm the effectiveness of SLYKLatent's novel +components. + +
+
+
+
+
+ + ♻ ☆ Snakes and Ladders: Two Steps Up for VideoMamba + + +
+ Video understanding requires the extraction of rich spatio-temporal +representations, which transformer models achieve through self-attention. +Unfortunately, self-attention poses a computational burden. In NLP, Mamba has +surfaced as an efficient alternative for transformers. However, Mamba's +successes do not trivially extend to vision tasks, including those in video +analysis. In this paper, we theoretically analyze the differences between +self-attention and Mamba. We identify two limitations in Mamba's token +processing: historical decay and element contradiction. We propose +VideoMambaPro (VMP) that solves the identified limitations by adding masked +backward computation and elemental residual connections to a VideoMamba +backbone. Differently sized VideoMambaPro models surpass VideoMamba by 1.6-2.8% +and 1.1-1.9% top-1 on Kinetics-400 and Something-Something V2, respectively. +Even without extensive pre-training, our models present an increasingly +attractive and efficient alternative to current transformer models. Moreover, +our two solutions are orthogonal to recent advances in Vision Mamba models, and +are likely to provide further improvements in future models. + +
+
+ comment: New updated experiment results +
+
+
+
+
+ + ♻ ☆ Automatic dataset shift identification to support root cause analysis of + AI performance drift + + +
+ Shifts in data distribution can substantially harm the performance of +clinical AI models. Hence, various methods have been developed to detect the +presence of such shifts at deployment time. However, root causes of dataset +shifts are varied, and the choice of shift mitigation strategies is highly +dependent on the precise type of shift encountered at test time. As such, +detecting test-time dataset shift is not sufficient: precisely identifying +which type of shift has occurred is critical. In this work, we propose the +first unsupervised dataset shift identification framework, effectively +distinguishing between prevalence shift (caused by a change in the label +distribution), covariate shift (caused by a change in input characteristics) +and mixed shifts (simultaneous prevalence and covariate shifts). We discuss the +importance of self-supervised encoders for detecting subtle covariate shifts +and propose a novel shift detector leveraging both self-supervised encoders and +task model outputs for improved shift detection. We report promising results +for the proposed shift identification framework across three different imaging +modalities (chest radiography, digital mammography, and retinal fundus images) +on five types of real-world dataset shifts, using four large publicly available +datasets. + +
+
+ comment: Code available at + https://github.com/biomedia-mira/shift_identification +
+
+
+
+
+ + ♻ ☆ CLASS-M: Adaptive stain separation-based contrastive learning with + pseudo-labeling for histopathological image classification + + +
+ Histopathological image classification is an important task in medical image +analysis. Recent approaches generally rely on weakly supervised learning due to +the ease of acquiring case-level labels from pathology reports. However, +patch-level classification is preferable in applications where only a limited +number of cases are available or when local prediction accuracy is critical. On +the other hand, acquiring extensive datasets with localized labels for training +is not feasible. In this paper, we propose a semi-supervised patch-level +histopathological image classification model, named CLASS-M, that does not +require extensively labeled datasets. CLASS-M is formed by two main parts: a +contrastive learning module that uses separated Hematoxylin and Eosin images +generated through an adaptive stain separation process, and a module with +pseudo-labels using MixUp. We compare our model with other state-of-the-art +models on two clear cell renal cell carcinoma datasets. We demonstrate that our +CLASS-M model has the best performance on both datasets. Our code is available +at github.com/BzhangURU/Paper_CLASS-M/tree/main + +
+
+
+
+
+ + ♻ ☆ A Review of Electromagnetic Elimination Methods for low-field portable + MRI scanner + + +
+ This paper analyzes conventional and deep learning methods for eliminating +electromagnetic interference (EMI) in MRI systems. We compare traditional +analytical and adaptive techniques with advanced deep learning approaches. Key +strengths and limitations of each method are highlighted. Recent advancements +in active EMI elimination, such as external EMI receiver coils, are discussed +alongside deep learning methods, which show superior EMI suppression by +leveraging neural networks trained on MRI data. While deep learning improves +EMI elimination and diagnostic capabilities, it introduces security and safety +concerns, particularly in commercial applications. A balanced approach, +integrating conventional reliability with deep learning's advanced +capabilities, is proposed for more effective EMI suppression in MRI systems. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ Exploring Test-Time Adaptation for Object Detection in Continually + Changing Environments + + +
+ Real-world application models are commonly deployed in dynamic environments, +where the target domain distribution undergoes temporal changes. Continual +Test-Time Adaptation (CTTA) has recently emerged as a promising technique to +gradually adapt a source-trained model to continually changing target domains. +Despite recent advancements in addressing CTTA, two critical issues remain: 1) +Fixed thresholds for pseudo-labeling in existing methodologies lead to +low-quality pseudo-labels, as model confidence varies across categories and +domains; 2) Stochastic parameter restoration methods for mitigating +catastrophic forgetting fail to preserve critical information effectively, due +to their intrinsic randomness. To tackle these challenges for detection models +in CTTA scenarios, we present AMROD, featuring three core components. Firstly, +the object-level contrastive learning module extracts object-level features for +contrastive learning to refine the feature representation in the target domain. +Secondly, the adaptive monitoring module dynamically skips unnecessary +adaptation and updates the category-specific threshold based on predicted +confidence scores to enable efficiency and improve the quality of +pseudo-labels. Lastly, the adaptive randomized restoration mechanism +selectively resets inactive parameters with higher possibilities, ensuring the +retention of essential knowledge. We demonstrate the effectiveness of AMROD on +four CTTA object detection tasks, where AMROD outperforms existing methods, +especially achieving a 3.2 mAP improvement and a 20% increase in efficiency on +the Cityscapes-to-Cityscapes-C CTTA task. The code will be released. + +
+
+
+
+
+ + ♻ ☆ LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning + + +
+ Deep long-tailed recognition has been widely studied to address the issue of +imbalanced data distributions in real-world scenarios. However, there has been +insufficient focus on the design of neural architectures, despite empirical +evidence suggesting that architecture can significantly impact performance. In +this paper, we attempt to mitigate long-tailed issues through architectural +improvements. To simplify the design process, we utilize Differential +Architecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS +methods struggle to perform well in long-tailed scenarios. To tackle this +challenge, we introduce Long-Tailed Differential Architecture Search +(LT-DARTS). Specifically, we conduct extensive experiments to explore +architectural components that demonstrate better performance on long-tailed +data and propose a new search space based on our observations. This ensures +that the architecture obtained through our search process incorporates superior +components. Additionally, we propose replacing the learnable linear classifier +with an Equiangular Tight Frame (ETF) classifier to further enhance our method. +This classifier effectively alleviates the biased search process and prevents +performance collapse. Extensive experimental evaluations demonstrate that our +approach consistently improves upon existing methods from an orthogonal +perspective and achieves state-of-the-art results with simple enhancements. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
+
+
+
+ + ♻ ☆ Continual Learning in the Frequency Domain NeurIPS 2024 + + +
+ Continual learning (CL) is designed to learn new tasks while preserving +existing knowledge. Replaying samples from earlier tasks has proven to be an +effective method to mitigate the forgetting of previously acquired knowledge. +However, the current research on the training efficiency of rehearsal-based +methods is insufficient, which limits the practical application of CL systems +in resource-limited scenarios. The human visual system (HVS) exhibits varying +sensitivities to different frequency components, enabling the efficient +elimination of visually redundant information. Inspired by HVS, we propose a +novel framework called Continual Learning in the Frequency Domain (CLFD). To +our knowledge, this is the first study to utilize frequency domain features to +enhance the performance and efficiency of CL training on edge devices. For the +input features of the feature extractor, CLFD employs wavelet transform to map +the original input image into the frequency domain, thereby effectively +reducing the size of input feature maps. Regarding the output features of the +feature extractor, CLFD selectively utilizes output features for distinct +classes for classification, thereby balancing the reusability and interference +of output features based on the frequency domain similarity of the classes +across various tasks. Optimizing only the input and output features of the +feature extractor allows for seamless integration of CLFD with various +rehearsal-based methods. Extensive experiments conducted in both cloud and edge +environments demonstrate that CLFD consistently improves the performance of +state-of-the-art (SOTA) methods in both precision and training efficiency. +Specifically, CLFD can increase the accuracy of the SOTA CL method by up to +6.83% and reduce the training time by 2.6$\times$. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Show Me What and Where has Changed? Question Answering and Grounding for + Remote Sensing Change Detection + + +
+ Remote sensing change detection aims to perceive changes occurring on the +Earth's surface from remote sensing data in different periods, and feed these +changes back to humans. However, most existing methods only focus on detecting +change regions, lacking the capability to interact with users to identify +changes that the users expect. In this paper, we introduce a new task named +Change Detection Question Answering and Grounding (CDQAG), which extends the +traditional change detection task by providing interpretable textual answers +and intuitive visual evidence. To this end, we construct the first CDQAG +benchmark dataset, termed QAG-360K, comprising over 360K triplets of questions, +textual answers, and corresponding high-quality visual masks. It encompasses 10 +essential land-cover categories and 8 comprehensive question types, which +provides a valuable and diverse dataset for remote sensing applications. +Furthermore, we present VisTA, a simple yet effective baseline method that +unifies the tasks of question answering and grounding by delivering both visual +and textual answers. Our method achieves state-of-the-art results on both the +classic change detection-based visual question answering (CDVQA) and the +proposed CDQAG datasets. Extensive qualitative and quantitative experimental +results provide useful insights for developing better CDQAG models, and we hope +that our work can inspire further research in this important yet underexplored +research field. The proposed benchmark dataset and method are available at +https://github.com/like413/VisTA. + +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding +performance in violence surveillance tasks. However, most of them were +black-box systems which faced challenges regarding explainability during +training and inference processes. An important question is how to incorporate +explicit knowledge into these implicit models, thereby designing expert-driven +and interpretable violence surveillance systems. This paper proposes a new +paradigm for weakly supervised violence monitoring (WSVM) called Rule base +Violence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure +with different designs for images and text. One of the branches is called the +implicit branch, which uses only visual features for coarse-grained binary +classification. In this branch, image feature extraction is divided into two +channels: one responsible for extracting scene frames and the other focusing on +extracting actions. The other branch is called the explicit branch, which +utilizes language-image alignment to perform fine-grained classification. For +the language channel design in the explicit branch, the proposed RuleCLIP uses +the state-of-the-art YOLO-World model to detect objects in video frames, and +association rules are identified through data mining methods as descriptions of +the video. Leveraging the dual-branch architecture, RuleVM achieves +interpretable coarse-grained and fine-grained violence surveillance. Extensive +experiments were conducted on two commonly used benchmarks, and the results +show that RuleCLIP achieved the best performance in both coarse-grained and +fine-grained monitoring, significantly outperforming existing state-of-the-art +methods. Moreover, interpretability experiments uncovered some interesting +rules, such as the observation that as the number of people increases, the risk +level of violent behavior also rises. + +
+
+ comment: 12 pages,7 figures IEEE TSMCA (Under review) +
+
+
+
+
+ + ♻ ☆ *: Improving the 3D detector by introducing Voxel2Pillar feature + encoding and extracting multi-scale features + + +
+ The multi-line LiDAR is widely used in autonomous vehicles, so point +cloud-based 3D detectors are essential for autonomous driving. Extracting rich +multi-scale features is crucial for point cloud-based 3D detectors in +autonomous driving due to significant differences in the size of different +types of objects. However, because of the real-time requirements, large-size +convolution kernels are rarely used to extract large-scale features in the +backbone. Current 3D detectors commonly use feature pyramid networks to obtain +large-scale features; however, some objects containing fewer point clouds are +further lost during down-sampling, resulting in degraded performance. Since +pillar-based schemes require much less computation than voxel-based schemes, +they are more suitable for constructing real-time 3D detectors. Hence, we +propose the *, a pillar-based scheme. We redesigned the feature encoding, the +backbone, and the neck of the 3D detector. We propose the Voxel2Pillar feature +encoding, which uses a sparse convolution constructor to construct pillars with +richer point cloud features, especially height features. The Voxel2Pillar adds +more learnable parameters to the feature encoding, enabling the initial pillars +to have higher performance ability. We extract multi-scale and large-scale +features in the proposed fully sparse backbone, which does not utilize +large-size convolutional kernels; the backbone consists of the proposed +multi-scale feature extraction module. The neck consists of the proposed sparse +ConvNeXt, whose simple structure significantly improves the performance. We +validate the effectiveness of the proposed * on the Waymo Open Dataset, and the +object detection accuracy for vehicles, pedestrians, and cyclists is improved. +We also verify the effectiveness of each proposed module in detail through +ablation studies. + +
+
+ comment: Due to experimental data errors, it needs to be withdrawn +
+
+
+
+
+ + ♻ ☆ Stem-OB: Generalizable Visual Imitation Learning with Stem-Like + Convergent Observation through Diffusion Inversion + + +
+ Visual imitation learning methods demonstrate strong performance, yet they +lack generalization when faced with visual input perturbations, including +variations in lighting and textures, impeding their real-world application. We +propose Stem-OB that utilizes pretrained image diffusion models to suppress +low-level visual differences while maintaining high-level scene structures. +This image inversion process is akin to transforming the observation into a +shared representation, from which other observations stem, with extraneous +details removed. Stem-OB contrasts with data-augmentation approaches as it is +robust to various unspecified appearance changes without the need for +additional training. Our method is a simple yet highly effective plug-and-play +solution. Empirical results confirm the effectiveness of our approach in +simulated tasks and show an exceptionally significant improvement in real-world +applications, with an average increase of 22.2% in success rates compared to +the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info. + +
+
+ comment: Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/ +
+
+
+
+
+ + ♻ ☆ LAuReL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce Learned Augmented Residual Layer (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Projecting Gaussian Ellipsoids While Avoiding Affine Projection + Approximation + + +
+ Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its +real-time rendering speed and state-of-the-art rendering quality. However, +during the rendering process, the use of the Jacobian of the affine +approximation of the projection transformation leads to inevitable errors, +resulting in blurriness, artifacts and a lack of scene consistency in the final +rendered images. To address this issue, we introduce an ellipsoid-based +projection method to calculate the projection of Gaussian ellipsoid on the +image plane, which is the primitive of 3D Gaussian Splatting. As our proposed +ellipsoid-based projection method cannot handle Gaussian ellipsoids with camera +origins inside them or parts lying below $z=0$ plane in the camera space, we +designed a pre-filtering strategy. Experiments over multiple widely adopted +benchmark datasets show that using our ellipsoid-based projection method can +enhance the rendering quality of 3D Gaussian Splatting and its extensions. + +
+
+
+
+
+ + ♻ ☆ ViTOC: Vision Transformer and Object-aware Captioner + + +
+ This paper presents ViTOC (Vision Transformer and Object-aware Captioner), a +novel vision-language model for image captioning that addresses the challenges +of accuracy and diversity in generated descriptions. Unlike conventional +approaches, ViTOC employs a dual-path architecture based on Vision Transformer +and object detector, effectively fusing global visual features and local object +information through learnable vectors. The model introduces an innovative +object-aware prompting strategy that significantly enhances its capability in +handling long-tail data. Experiments on the standard COCO dataset demonstrate +that ViTOC outperforms baseline models across all evaluation metrics. +Additionally, we propose a reference-free evaluation method based on CLIP to +further validate the model's effectiveness. By utilizing pretrained visual +model parameters, ViTOC achieves efficient end-to-end training. + +
+
+
+
+
+ + ♻ ☆ Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image + Translation + + +
+ The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology +are Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides +offer high quality histopathological images but require a labor-intensive +acquisition process. In contrast, FF slides can be prepared quickly, but the +image quality is relatively poor. Our task is to translate FF images into FFPE +style, thereby improving the image quality for diagnostic purposes. In this +paper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological +image translation using a pre-trained diffusion model. Specifically, we utilize +a one-step diffusion model as the generator, which we fine-tune using LoRA +adapters within an adversarial learning framework. To enable the model to +effectively capture both global structural patterns and local details, we +introduce a multi-scale feature fusion module that leverages two VAE encoders +to extract features at different image resolutions, performing feature fusion +before inputting them into the UNet. Additionally, a pre-trained +vision-language model for histopathology serves as the backbone for the +discriminator, enhancing model performance. Our FF-to-FFPE translation +experiments on the TCGA-NSCLC dataset demonstrate that the proposed approach +outperforms existing methods. The code and models are released at +https://github.com/QilaiZhang/Diffusion-FFPE. + +
+
+ comment: Accepted at IEEE BIBM 2024 +
+
+
+
+
+ + ♻ ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its +potential to leverage abundant unlabeled data to enhance model robustness. +Pseudo labeling is a widely used strategy in semi-supervised learning. However, +existing methods often suffer from noise contamination, which can undermine +model performance. To tackle this challenge, we introduce a novel +Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework. +Built upon the mean teacher network, we employ a Mix Augmentation module to +enhance the unlabeled data. By evaluating the synergy before and after +augmentation, we strategically partition the pseudo labels into distinct +regions. Additionally, we introduce a Region Loss Evaluation module to assess +the loss across each delineated area. Extensive experiments conducted on the LA +dataset have demonstrated superior performance over state-of-the-art +techniques, underscoring the efficiency and practicality of our framework. + +
+
+
+
+
+ + ♻ ☆ Six-Point Method for Multi-Camera Systems with Reduced Solution Space ECCV + + +
+ Relative pose estimation using point correspondences (PC) is a widely used +technique. A minimal configuration of six PCs is required for two views of +generalized cameras. In this paper, we present several minimal solvers that use +six PCs to compute the 6DOF relative pose of multi-camera systems, including a +minimal solver for the generalized camera and two minimal solvers for the +practical configuration of two-camera rigs. The equation construction is based +on the decoupling of rotation and translation. Rotation is represented by +Cayley or quaternion parametrization, and translation can be eliminated by +using the hidden variable technique. Ray bundle constraints are found and +proven when a subset of PCs relate the same cameras across two views. This is +the key to reducing the number of solutions and generating numerically stable +solvers. Moreover, all configurations of six-point problems for multi-camera +systems are enumerated. Extensive experiments demonstrate the superior accuracy +and efficiency of our solvers compared to state-of-the-art six-point methods. +The code is available at https://github.com/jizhaox/relpose-6pt + +
+
+ comment: Accepted to the European Conference on Computer Vision (ECCV), 2024, + for an oral presentation +
+
+
+
+
+ + ♻ ☆ DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring + + +
+ Coronary artery disease (CAD) is one of the most common causes of mortality in +the world. Coronary artery calcium (CAC) scoring using computed tomography (CT) +is key for risk assessment to prevent coronary disease. Previous studies on +risk assessment and calcification detection in CT scans primarily use +approaches based on UNET architecture, frequently implemented on pre-built +models. However, these models are limited by the availability of annotated CT +scans containing CAC and suffer from imbalanced datasets, decreasing the +performance of CAC segmentation and scoring. In this study, we extend this +approach by incorporating the self-supervised learning (SSL) technique of DINO +(self-distillation with no labels) to eliminate limitations of scarce annotated +data in CT scans. The DINO model's ability to train without requiring CAC area +annotations enhances its robustness in generating distinct features. The DINO +model is trained to focus specifically on calcified areas by using labels, +aiming to generate features that effectively capture and highlight key +characteristics. The label-guided DINO (DINO-LG) enhances classification by +distinguishing CT slices that contain calcification from those that do not, +performing 57% better than the standard DINO model in this task. CAC scoring +and segmentation tasks are performed by a basic U-NET architecture, fed +specifically with CT slices containing calcified areas as identified by the +DINO-LG model. This targeted identification performed by DINO-LG model improves +CAC segmentation performance by approximately 10% and significantly increases +CAC scoring accuracy. + +
+
+ comment: Developed by Center for Applied Artificial Intelligence (CAAI), + University of Kentucky +
+
+
+
+
+ + ♻ ☆ Personalize to generalize: Towards a universal medical multi-modality + generalization through personalization + + +
+ The differences among medical imaging modalities, driven by distinct +underlying principles, pose significant challenges for generalization in +multi-modal medical tasks. Beyond modality gaps, individual variations, such as +differences in organ size and metabolic rate, further impede a model's ability +to generalize effectively across both modalities and diverse populations. +Despite the importance of personalization, existing approaches to multi-modal +generalization often neglect individual differences, focusing solely on common +anatomical features. This limitation may result in weakened generalization in +various medical tasks. In this paper, we unveil that personalization is +critical for multi-modal generalization. Specifically, we propose an approach +to achieve personalized generalization through approximating the underlying +personalized invariant representation ${X}_h$ across various modalities by +leveraging individual-level constraints and a learnable biological prior. We +validate the feasibility and benefits of learning a personalized ${X}_h$, +showing that this representation is highly generalizable and transferable +across various multi-modal medical tasks. Extensive experimental results +consistently show that the additionally incorporated personalization +significantly improves performance and generalization across diverse scenarios, +confirming its effectiveness. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at: +https://github.com/chikap421/mseg_vcuq + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ♻ ☆ RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in + Generated Images + + +
+ In recent years, diffusion models have revolutionized visual generation, +outperforming traditional frameworks like Generative Adversarial Networks +(GANs). However, generating images of humans with realistic semantic parts, +such as hands and faces, remains a significant challenge due to their intricate +structural complexity. To address this issue, we propose a novel +post-processing solution named RealisHuman. The RealisHuman framework operates +in two stages. First, it generates realistic human parts, such as hands or +faces, using the original malformed parts as references, ensuring consistent +details with the original image. Second, it seamlessly integrates the rectified +human parts back into their corresponding positions by repainting the +surrounding areas to ensure smooth and realistic blending. The RealisHuman +framework significantly enhances the realism of human generation, as +demonstrated by notable improvements in both qualitative and quantitative +metrics. Code is available at https://github.com/Wangbenzhi/RealisHuman. + +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting ECCV 2024 + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with diffusion prior, they remain struggling to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ♻ ☆ ConMe: Rethinking Evaluation of Compositional Reasoning for Modern VLMs NeurIPS 2024 + + +
+ Compositional Reasoning (CR) entails grasping the significance of attributes, +relations, and word order. Recent Vision-Language Models (VLMs), comprising a +visual encoder and a Large Language Model (LLM) decoder, have demonstrated +remarkable proficiency in such reasoning tasks. This prompts a crucial +question: have VLMs effectively tackled the CR challenge? We conjecture that +existing CR benchmarks may not adequately push the boundaries of modern VLMs +due to the reliance on an LLM-only negative text generation pipeline. +Consequently, the negatives produced either appear as outliers from the natural +language distribution learned by VLMs' LLM decoders or as improbable within the +corresponding image context. To address these limitations, we introduce ConMe +-- a compositional reasoning benchmark and a novel data generation pipeline +leveraging VLMs to produce `hard CR Q&A'. Through a new concept of VLMs +conversing with each other to collaboratively expose their weaknesses, our +pipeline autonomously generates, evaluates, and selects challenging +compositional reasoning questions, establishing a robust CR benchmark, also +subsequently validated manually. Our benchmark provokes a noteworthy, up to +33%, decrease in CR performance compared to preceding benchmarks, reinstating +the CR challenge even for state-of-the-art VLMs. + +
+
+ comment: NeurIPS 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Beyond the Doors of Perception: Vision Transformers Represent Relations + Between Objects + + +
+ Though vision transformers (ViTs) have achieved state-of-the-art performance +in a variety of settings, they exhibit surprising failures when performing +tasks involving visual relations. This begs the question: how do ViTs attempt +to perform tasks that require computing visual relations between objects? Prior +efforts to interpret ViTs tend to focus on characterizing relevant low-level +visual features. In contrast, we adopt methods from mechanistic +interpretability to study the higher-level visual algorithms that ViTs use to +perform abstract visual reasoning. We present a case study of a fundamental, +yet surprisingly difficult, relational reasoning task: judging whether two +visual entities are the same or different. We find that pretrained ViTs +fine-tuned on this task often exhibit two qualitatively different stages of +processing despite having no obvious inductive biases to do so: 1) a perceptual +stage wherein local object features are extracted and stored in a disentangled +representation, and 2) a relational stage wherein object representations are +compared. In the second stage, we find evidence that ViTs can learn to +represent somewhat abstract visual relations, a capability that has long been +considered out of reach for artificial neural networks. Finally, we demonstrate +that failures at either stage can prevent a model from learning a generalizable +solution to our fairly simple tasks. By understanding ViTs in terms of discrete +processing stages, one can more precisely diagnose and rectify shortcomings of +existing and future models. + +
+
+
+
+
+ + ♻ ☆ A metric embedding kernel for live cell microscopy signaling patterns + + +
+ Live cell microscopy captures 5-D $(x,y,z,channel,time)$ movies that display +patterns of cellular motion and signaling dynamics. We present here a metric +kernel function for spatiotemporal patterns of cell signaling dynamics in 5-D +live cell microscopy movies unique in requiring no a priori knowledge of +expected pattern dynamics, and no training data. The approach uses Kolmogorov +complexity theory to compute a metric distance between movies and to measure +the meaningful information among subsets of movies. Cell signaling kymographs +store at each spatiotemporal cell centroid the cell signaling state, or a +functional output such as velocity. Patterns of similarity are identified via +the metric normalized compression distance (NCD). The NCD is a reproducing +kernel for a Hilbert space that represents the input cell signaling kymographs +as points in a low dimensional embedding that optimally captures the pattern +similarity identified by the NCD throughout the space. The only parameter is +the expected cell radii ($\mu m$). A new formulation of the cluster structure +function optimally estimates the meaningful information captured by the +embedding. Also presented is the cell signaling structure function (SSF), a +Kolmogorov structure function that optimally measures cell signaling state as +nuclear intensity w.r.t. surrounding cytoplasm, a significant improvement +compared to the current state-of-the-art cytonuclear ratio. Results are +presented quantifying the impact of ERK and AKT signaling between different +oncogenic mutations, and by the relation between ERK signaling and cellular +velocity patterns for movies of 2-D monolayers of human breast epithelial +(MCF10A) cells, 3-D MCF10A spheroids under optogenetic manipulation of ERK, and +human induced pluripotent stem cells. + +
+
+
+
+
+ + ♻ ☆ Confidence Trigger Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+
+
+
+ + Systems and Control 24 + +
+
+
+ + ☆ Goal-oriented Semantic Communication for Robot Arm Reconstruction in + Digital Twin: Feature and Temporal Selections + + +
+ As one of the most promising technologies in industry, the Digital Twin (DT) +facilitates real-time monitoring and predictive analysis for real-world systems +by precisely reconstructing virtual replicas of physical entities. However, +this reconstruction faces unprecedented challenges due to the ever-increasing +communication overhead, especially for digital robot arm reconstruction. To +this end, we propose a novel goal-oriented semantic communication (GSC) +framework to extract the GSC information for the robot arm reconstruction task +in the DT, with the aim of minimising the communication load under the strict +and relaxed reconstruction error constraints. Unlike the traditional +reconstruction framework that periodically transmits a reconstruction message +for real-time DT reconstruction, our framework implements a feature selection +(FS) algorithm to extract the semantic information from the reconstruction +message, and a deep reinforcement learning-based temporal selection algorithm +to selectively transmit the semantic information over time. We validate our +proposed GSC framework through both Pybullet simulations and lab experiments +based on the Franka Research 3 robot arm. For a range of distinct robotic +tasks, simulation results show that our framework can reduce the communication +load by at least 59.5% under strict reconstruction error constraints and 80% +under relaxed reconstruction error constraints, compared with the traditional +communication framework. Also, experimental results confirm the effectiveness +of our framework, where the communication load is reduced by 53% in the strict +constraint case and 74% in the relaxed constraint case. The demo is available at: +https://youtu.be/2OdeHKxcgnk. + +
+
+ comment: Submitted to IEEE for potential publication +
+
+
+
+
+ + ☆ AI-Enhanced Inverter Fault and Anomaly Detection System for Distributed + Energy Resources in Microgrids + + +
+ The integration of Distributed Energy Resources (DERs) into power +distribution systems has made microgrids foundational to grid modernization. +These DERs, connected through power electronic inverters, create a +power-electronics-dominated grid architecture, introducing unique challenges for +fault detection. While external line faults are widely studied, inverter faults +remain a critical yet underexplored issue. This paper proposes various data +mining techniques for the effective detection and localization of inverter +faults -- essential for preventing catastrophic grid failures. Furthermore, the +difficulty of differentiating between system anomalies and internal inverter +faults within Power Electronics-Driven Grids (PEDGs) is addressed. To enhance +grid resilience, this work applies advanced artificial intelligence methods to +distinguish anomalies from true internal faults, identifying the specific +malfunctioning switch. The proposed FaultNet-ML methodology is validated on a +9-bus system dominated by inverters, illustrating its robustness in a PEDG +environment. + +
+
+ comment: 5 pages, 2 figures, submitted to 2025 IEEE Power and Energy Society + General Meeting (PESGM 2025), Austin, TX +
+
+
+
+
+ + ☆ Clutter-Aware Target Detection for ISAC in a Millimeter-Wave Cell-Free + Massive MIMO System + + +
+ In this paper, we investigate the performance of an integrated sensing and +communication (ISAC) system within a cell-free massive multiple-input +multiple-output (MIMO) system. Each access point (AP) operates in the +millimeter-wave (mmWave) frequency band. The APs jointly serve the user +equipments (UEs) in the downlink while simultaneously detecting a target +through dedicated sensing beams, which are directed toward a reconfigurable +intelligent surface (RIS). Although the AP-RIS, RIS-target, and AP-target +channels have both line-of-sight (LoS) and non-line-of-sight (NLoS) parts, it +is assumed only knowledge of the LoS paths is available. A key contribution of +this study is the consideration of clutter, which degrades the target detection +if not handled. We propose an algorithm to alternately optimize the transmit +power allocation and the RIS phase-shift matrix, maximizing the target +signal-to-clutter-plus-noise ratio (SCNR) while ensuring a minimum +signal-to-interference-plus-noise ratio (SINR) for the UEs. Numerical results +demonstrate that exploiting clutter subspace significantly enhances detection +probability, particularly at high clutter-to-noise ratios, and reveal that an +increased number of transmit side clusters impairs detection performance. +Finally, we highlight the performance gains achieved using a dedicated sensing +stream. + +
+
+ comment: submitted to IEEE ICC25 +
+
+
+
+
+ + ☆ Logic-based Knowledge Awareness for Autonomous Agents in Continuous + Spaces + + +
+ This paper presents a step towards a formal controller design method for +autonomous agents based on knowledge awareness to improve decision-making. Our +approach is to first create an organized repository of information (a knowledge +base) for autonomous agents which can be accessed and then translated into +temporal specifications. Secondly, to develop a controller with formal +guarantees that meets a combination of mission-specific objective and the +specification from the knowledge base, we utilize an abstraction-based +controller design (ABCD) approach, capable of managing both nonlinear dynamics +and temporal requirements. Unlike the conventional offline ABCD approach, our +method dynamically updates the controller whenever the knowledge base prompts +changes in the specifications. A three-dimensional nonlinear car model +navigating an urban road scenario with traffic signs and obstacles is +considered for validation. Results show the effectiveness of the method in +guiding the autonomous agents to the target while complying with the knowledge +base and the mission-specific objective. + +
+
+
+
+
+ + ☆ Recommender systems and reinforcement learning for building control and + occupant interaction: A text-mining driven review of scientific literature + + +
+ The indoor environment greatly affects health and well-being; enhancing +health and reducing energy use in these settings is a key research focus. With +advancing Information and Communication Technology (ICT), recommendation +systems and reinforcement learning have emerged as promising methods to induce +behavioral changes that improve indoor environments and building energy +efficiency. This study employs text-mining and Natural Language Processing +(NLP) to examine these approaches in building control and occupant interaction. +Analyzing approximately 27,000 articles from the ScienceDirect database, we +found extensive use of recommendation systems and reinforcement learning for +space optimization, location recommendations, and personalized control +suggestions. Despite broad applications, their use in optimizing indoor +environments and energy efficiency is limited. Traditional recommendation +algorithms are commonly used, but optimizing indoor conditions and energy +efficiency often requires advanced machine learning techniques like +reinforcement and deep learning. This review highlights the potential for +expanding recommender systems and reinforcement learning applications in +buildings and indoor environments. Areas for innovation include predictive +maintenance, building-related product recommendations, and optimizing +environments for specific needs like sleep and productivity enhancements based +on user feedback. + +
+
+
+
+
+ + ☆ Identification of Power Systems with Droop-Controlled Units Using Neural + Ordinary Differential Equations + + +
+ In future power systems, the detailed structure and dynamics may not always +be fully known. This is due to an increasing number of distributed energy +resources, such as photovoltaic generators, battery storage systems, heat pumps +and electric vehicles, as well as a shift towards active distribution grids. +Obtaining physically-based models for simulation and control synthesis can +therefore become challenging. Differential equations, where the right-hand side +is represented by a neural network, i.e., neural ordinary differential +equations (NODEs), have a great potential to serve as a data-driven black-box +model to overcome this challenge. This paper explores their use in identifying +the dynamics of droop-controlled grid-forming units based on inputs and state +measurements. In numerical studies, various NODE structures used with different +numerical solvers are trained and evaluated. Moreover, they are compared to the +sparse identification of nonlinear dynamics (SINDy) method. The results +demonstrate that even though SINDy yields more accurate models, NODEs achieve +good prediction performance without prior knowledge about the system's +nonlinearities which SINDy requires to work best. + +
+
+
+
+
+ + ☆ On the Application of Model Predictive Control to a Weighted Coverage + Path Planning Problem + + +
+ This paper considers the application of Model Predictive Control (MPC) to a +weighted coverage path planning (WCPP) problem. The problem appears in a wide +range of practical applications, such as search and rescue (SAR) missions. The +basic setup is that one (or multiple) agents can move around a given search +space and collect rewards from a given spatial distribution. Unlike an +artificial potential field, each reward can only be collected once. In contrast +to a Traveling Salesman Problem (TSP), the agent moves in a continuous space. +Moreover, it is not obliged to cover all locations and/or may return to +previously visited locations. The WCPP problem is tackled by a new Model +Predictive Control (MPC) formulation with so-called Coverage Constraints (CCs). +It is shown that the solution becomes more effective if the solver is +initialized with a TSP-based heuristic. With and without this initialization, +the proposed MPC approach clearly outperforms a naive MPC formulation, as +demonstrated in a small simulation study. + +
+
+
+
+
+ + ☆ Robust Optimal Power Flow Against Adversarial Attacks: A Tri-Level + Optimization Approach + + +
+ In power systems, unpredictable events like extreme weather, equipment +failures, and cyberattacks present significant challenges to ensuring safety +and reliability. Ensuring resilience in the face of these uncertainties is +crucial for reliable and efficient operations. This paper presents a tri-level +optimization approach for robust power system operations that effectively +addresses worst-case attacks. The first stage focuses on optimizing economic +dispatch under normal operating conditions, aiming to minimize generation costs +while maintaining the supply-demand balance. The second stage introduces an +adversarial attack model, identifying worst-case scenarios that maximize the +system's vulnerability by targeting distributed generation (DG). In the third +stage, mitigation strategies are developed using fast-response energy storage +systems (ESS) to minimize disruptions caused by these attacks. By integrating +economic dispatch, vulnerability assessment, and mitigation into a unified +framework, this approach provides a robust solution for enhancing power system +resilience and safety against evolving adversarial threats. The approach is +validated using the IEEE-33 node distribution system to demonstrate its +effectiveness in achieving both cost efficiency and system resilience. + +
+
+ comment: This work has been submitted for possible publication +
+
+
+
+
+ + ☆ Future state prediction based on observer for missile system + + +
+ Guided missile accuracy and precision are negatively impacted by seeker delay, +more specifically by the delay introduced by a mechanical seeker gimbal and the +computational time taken to process the raw data. To meet the demands and +expectations of modern missile systems, the impact of this hardware limitation +must be reduced. This paper presents a new observer design that predicts the +future state of a seeker signal, augmenting the guidance system to mitigate the +effects of this delay. The design is based on a novel two-step differentiator, +which produces the estimated future time derivatives of the signal. The input +signal can be nonlinear and provides for simple integration into existing +systems. A bespoke numerical guided missile simulation is used to demonstrate +the performance of the observer within a missile guidance system. Both +non-manoeuvring and randomly manoeuvring target engagement scenarios are +considered. + +
+
+
+
+
+ + ☆ Robust performance for switched systems with constrained switching and + its application to weakly hard real-time control systems + + +
+ Many cyber-physical systems can naturally be formulated as switched systems +with constrained switching. This includes systems where one of the signals in +the feedback loop may be lost. Possible sources for losses are shared or +unreliable communication media in networked control systems, or signals which +are discarded, e.g., when using a shared computation device such as a processor +in real-time control applications. The use of switched systems with constrained +switching is not limited to cyber-physical systems but includes many other +relevant applications such as power systems and modeling virus mutations. In +this chapter, we introduce a framework for analyzing and designing controllers +which guarantee robust quadratic performance for switched systems with +constrained switching. The possible switching sequences are described by the +language of a labeled graph where the labels are linked to the different +subsystems. The subsystems are allowed to have different input and output +dimensions, and their state-space representations can be affected by a broad +class of uncertainties in a rational way. The proposed framework exploits ideas +from dissipativity-based linear control theory to derive analysis and synthesis +inequalities given by linear matrix inequalities. We demonstrate how the +proposed framework can be applied to the design of controllers for uncertain +weakly hard real-time control systems - a system class naturally appearing in +networked and real-time control. + +
+
+
+
+
+ + ☆ Inference-Aware State Reconstruction for Industrial Metaverse under + Synchronous/Asynchronous Short-Packet Transmission + + +
+ We consider a real-time state reconstruction system for industrial metaverse. +The time-varying physical process states in real space are captured by multiple +sensors via wireless links, and then reconstructed in virtual space. In this +paper, we use the spatial-temporal correlation of the sensor data of interest +to infer the real-time data of the target sensor to reduce the mean squared +error (MSE) of reconstruction for industrial metaverse under short-packet +transmission (SPT). Both synchronous and asynchronous transmission modes for +multiple sensors are considered. It is proved that the average MSE of +reconstruction and average block error probability (BLEP) have a positive +correlation under inference with synchronous transmission scheme, and they have +a negative correlation in some conditions under inference with asynchronous +transmission scheme. Also, it is proved that the average MSE of reconstruction +with inference can be significantly lower than that without inference, even +under weak mean squared spatial correlation (MSSC). In addition, closed-form +MSSC thresholds are derived for the superiority regions of the inference with +synchronous transmission and inference with asynchronous transmission schemes, +respectively. Adaptations of blocklength and time shift of asynchronous +transmission are conducted to minimize the average MSE of reconstruction. +Simulation results show that the two schemes significantly outperform the no +inference case, with an average MSE reduction of more than 50%. + +
+
+
+
+
+ + ☆ Neural Network Certification Informed Power System Transient Stability + Preventive Control with Renewable Energy + + +
+ Existing machine learning-based surrogate modeling methods for transient +stability constrained-optimal power flow (TSC-OPF) lack certifications in the +presence of unseen disturbances or uncertainties. This may lead to divergence +of TSC-OPF or insecure control strategies. This paper proposes a neural network +certification-informed power system transient stability preventive control +method considering the impacts of various uncertainty resources, such as errors +from measurements, fluctuations in renewable energy sources (RESs) and loads, +etc. A deep belief network (DBN) is trained to estimate the transient +stability, replacing the time-consuming time-domain simulation-based +calculations. Then, DBN is embedded into the iterations of the primal-dual +interior-point method to solve TSC-OPF. To guarantee the robustness of the +solutions, the neural network verifier $\alpha, \beta$-CROWN to deal with +uncertainties from RESs and loads is proposed. The yielded certification +results allow us to further adjust the transient stability safety margin under +the iterated TSC-OPF solution process, balancing system security and economics. +Numerical results on a modified western South Carolina 500-bus system +demonstrate that the proposed method can efficiently and quickly obtain the +safety-verified preventive control strategy through RES curtailment and +generator dispatch with only a slight increase in cost. + +
+
+
+
+
+ + ☆ Learning-Based Control Barrier Function with Provably Safe Guarantees: + Reducing Conservatism with Heading-Aware Safety Margin + + +
+ We propose a learning-based Control Barrier Function (CBF) to reduce +conservatism in collision avoidance of car-like robots. Traditional CBFs often +use Euclidean distance between robots' centers as safety margin, neglecting +headings and simplifying geometries to circles. While this ensures smooth, +differentiable safety functions required by CBFs, it can be overly conservative +in tight environments. To address this limitation, we design a heading-aware +safety margin that accounts for the robots' orientations, enabling a less +conservative and more accurate estimation of safe regions. Since the function +computing this safety margin is non-differentiable, we approximate it with a +neural network to ensure differentiability and facilitate integration with +CBFs. We describe how we achieve bounded learning error and incorporate the +upper bound into the CBF to provide formal safety guarantees through forward +invariance. We show that our CBF is a high-order CBF with relative degree two +for a system with two robots whose dynamics are modeled by the nonlinear +kinematic bicycle model. Experimental results in overtaking and bypassing +scenarios reveal a 33.5 % reduction in conservatism compared to traditional +methods, while maintaining safety. Code: https://github.com/bassamlab/sigmarl + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Reliability, Resilience and Human Factors Engineering for Trustworthy AI + Systems + + +
+ As AI systems become integral to critical operations across industries and +services, ensuring their reliability and safety is essential. We offer a +framework that integrates established reliability and resilience engineering +principles into AI systems. By applying traditional metrics such as failure +rate and Mean Time Between Failures (MTBF) along with resilience engineering +and human reliability analysis, we propose an integrated framework to manage AI +system performance, and prevent or efficiently recover from failures. Our work +adapts classical engineering methods to AI systems and outlines a research +agenda for future technical studies. We apply our framework to a real-world AI +system, using system status data from platforms such as OpenAI, to demonstrate +its practical applicability. This framework aligns with emerging global +standards and regulatory frameworks, providing a methodology to enhance the +trustworthiness of AI systems. Our aim is to guide policy, regulation, and the +development of reliable, safe, and adaptable AI technologies capable of +consistent performance in real-world environments. + +
+
+
+
+
+ + ☆ Intelligent Adaptive Metasurface in Complex Wireless Environments + + +
+ The programmable metasurface is regarded as one of the most promising +transformative technologies for next-generation wireless system applications. +Due to the lack of effective perception ability of the external electromagnetic +environment, there are numerous challenges in the intelligent regulation of +wireless channels, and it still relies on external sensors to reshape +electromagnetic environment as desired. To address that problem, we propose an +adaptive metasurface (AMS) which integrates the capabilities of acquiring +wireless environment information and manipulating reflected electromagnetic +(EM) waves in a programmable manner. The proposed design endows the +metasurfaces with excellent capabilities to sense the complex electromagnetic +field distributions around them and then dynamically manipulate the waves and +signals in real time under the guidance of the sensed information, eliminating +the need for prior knowledge or external inputs about the wireless environment. +For verification, a prototype of the proposed AMS is constructed, and its dual +capabilities of sensing and manipulation are experimentally validated. +Additionally, different integrated sensing and communication (ISAC) scenarios +with and without the aid of the AMS are established. The effectiveness of the +AMS in enhancing communication quality is well demonstrated in complex +electromagnetic environments, highlighting its beneficial application potential +in future wireless systems. + +
+
+
+
+
+ + ♻ ☆ Controlling Large Electric Vehicle Charging Stations via User Behavior + Modeling and Stochastic Programming + + +
+ This paper introduces an Electric Vehicle Charging Station (EVCS) model that +incorporates real-world constraints, such as slot power limitations, contract +threshold overruns penalties, or early disconnections of electric vehicles +(EVs). We propose a formulation of the problem of EVCS control under +uncertainty, and implement two Multi-Stage Stochastic Programming approaches +that leverage user-provided information, namely, Model Predictive Control and +Two-Stage Stochastic Programming. The model addresses uncertainties in charging +session start and end times, as well as in energy demand. A user's behavior +model based on a sojourn-time-dependent stochastic process enhances cost +reduction while maintaining customer satisfaction. The benefits of the two +proposed methods are showcased against two baselines over a 22-day simulation +using a real-world dataset. The two-stage approach demonstrates robustness +against early disconnections by considering a wider range of uncertainty +scenarios for optimization. The algorithm prioritizing user satisfaction over +electricity cost achieves a 20% and 36% improvement in two user satisfaction +metrics compared to an industry-standard baseline. Additionally, the algorithm +striking the best balance between cost and user satisfaction exhibits a mere 3% +relative cost increase compared to the theoretically optimal baseline - for +which the nonanticipativity constraint is relaxed - while attaining 94% and 84% +of the user satisfaction performance in the two used satisfaction metrics. + +
+
+
+
+
+ + ♻ ☆ Morphological Symmetries in Robotics + + +
+ We present a comprehensive framework for studying and leveraging +morphological symmetries in robotic systems. These are intrinsic properties of +the robot's morphology, frequently observed in animal biology and robotics, +which stem from the replication of kinematic structures and the symmetrical +distribution of mass. We illustrate how these symmetries extend to the robot's +state space and both proprioceptive and exteroceptive sensor measurements, +resulting in the equivariance of the robot's equations of motion and optimal +control policies. Thus, we recognize morphological symmetries as a relevant and +previously unexplored physics-informed geometric prior, with significant +implications for both data-driven and analytical methods used in modeling, +control, estimation and design in robotics. For data-driven methods, we +demonstrate that morphological symmetries can enhance the sample efficiency and +generalization of machine learning models through data augmentation, or by +applying equivariant/invariant constraints on the model's architecture. In the +context of analytical methods, we employ abstract harmonic analysis to +decompose the robot's dynamics into a superposition of lower-dimensional, +independent dynamics. We substantiate our claims with both synthetic and +real-world experiments conducted on bipedal and quadrupedal robots. Lastly, we +introduce the repository MorphoSymm to facilitate the practical use of the +theory and applications outlined in this work. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Link-Based Flow Model with Turn-Level Queue Transmission and + Time-Varying Free-Flow Speed for Urban Road Networks + + +
+ Macroscopic link-based flow models are efficient for simulating flow +propagation in urban road networks. Existing link-based flow models described +traffic states of a link with two state variables of link inflow and outflow +and assumed homogeneous traffic states within a whole link. Consequently, the +turn-level queue length change within the link cannot be captured, resulting +in underrepresented queue spillback. Moreover, a constant link free-flow speed +was assumed to formulate models, restricting their applicability in modeling +phenomena involving time-varying free-flow speed. This study proposed a new +link-based flow model by introducing an additional state variable of link queue +inflow and adapting the link outflow to be free-flow speed-dependent. In our +model, the vehicle propagation within each link is described by the link +inflow, queue inflow, and outflow, which depends on the link free-flow speed +changes. A node model is further defined to capture the presence of signal +control and potential queue spillback, which estimates the constrained flow +propagation between adjacent road segments. Simulation experiments were +conducted on a single intersection and a network with consecutive intersections +to verify the proposed model performance. Results demonstrate the predictive +power of the proposed model in predicting traffic operations of intersections +with multiple turning movements and time-varying free-flow speed. Our model +outperforms the baseline link-based flow model and preserves the computational +tractability property of link-based flow models. + +
+
+
+
+
+ + ♻ ☆ Multi-Objective Complementary Control + + +
+ This paper proposes a novel multi-objective control framework for linear +time-invariant systems in which performance and robustness can be achieved in a +complementary way instead of a trade-off. In particular, a state-space solution +is first established for a new stabilizing control structure consisting of two +independently designed controllers coordinated with a Youla-type operator ${\bm +Q}$. It is then shown by performance analysis that these two independently +designed controllers operate in a naturally complementary way for a tracking +control system, due to the coordination function of ${\bm Q}$ driven by the +residual signal of a Luenberger observer. Moreover, it is pointed out that +${\bm Q}$ could be further optimized with an additional gain factor to achieve +improved performance, through a data-driven methodology for a measured cost +function. + +
+
+
+
+
+ + ♻ ☆ On-demand Cold Start Frequency Reduction with Off-Policy Reinforcement + Learning in Serverless Computing + + +
+ Function-as-a-Service (FaaS) is a cloud computing paradigm offering an +event-driven execution model to applications. It features serverless attributes +by eliminating resource management responsibilities from developers, and offers +transparent and on-demand scalability of applications. To provide seamless +on-demand scalability, new function instances are prepared to serve the +incoming workload in the absence or unavailability of function instances. +However, FaaS platforms are known to suffer from cold starts, where this +function provisioning process introduces a non-negligible delay in function +response and reduces the end-user experience. Therefore, the presented work +focuses on reducing the frequent, on-demand cold starts on the platform by +using Reinforcement Learning (RL). The proposed approach uses model-free +Q-learning that considers function metrics such as CPU utilization, existing +function instances, and response failure rate, to proactively initialize +functions, in advance, based on the expected demand. The proposed solution is +implemented on Kubeless and evaluated using an open-source function invocation +trace applied to a matrix multiplication function. The evaluation results +demonstrate a favourable performance of the RL-based agent when compared to +Kubeless' default policy and a function keep-alive policy by improving +throughput by up to 8.81% and reducing computation load and resource wastage by +up to 55% and 37%, respectively, which is a direct outcome of reduced cold +starts. + +
+
+ comment: 13 figures, 24 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ Neural-Rendezvous: Provably Robust Guidance and Control to Encounter + Interstellar Objects + + +
+ Interstellar objects (ISOs) are likely representatives of primitive materials +invaluable in understanding exoplanetary star systems. Due to their poorly +constrained orbits with generally high inclinations and relative velocities, +however, exploring ISOs with conventional human-in-the-loop approaches is +significantly challenging. This paper presents Neural-Rendezvous -- a deep +learning-based guidance and control framework for encountering fast-moving +objects, including ISOs, robustly, accurately, and autonomously in real time. +It uses pointwise minimum norm tracking control on top of a guidance policy +modeled by a spectrally-normalized deep neural network, where its +hyperparameters are tuned with a loss function directly penalizing the MPC +state trajectory tracking error. We show that Neural-Rendezvous provides a high +probability exponential bound on the expected spacecraft delivery error, the +proof of which leverages stochastic incremental stability analysis. In +particular, it is used to construct a non-negative function with a +supermartingale property, explicitly accounting for the ISO state uncertainty +and the local nature of nonlinear state estimation guarantees. In numerical +simulations, Neural-Rendezvous is demonstrated to satisfy the expected error +bound for 100 ISO candidates. This performance is also empirically validated +using our spacecraft simulator and in high-conflict and distributed UAV swarm +reconfiguration with up to 20 UAVs. + +
+
+ comment: Preprint Version, Accepted: October, 2024 (One-minute YouTube + summary: https://youtu.be/q3e0LYS2IYQ, DOI: + https://doi.org/10.2514/1.G007671) +
+
+
+
+
+ + ♻ ☆ Structure-Preserving Model Order Reduction for Nonlinear DAE Models of + Power Networks + + +
+ This paper deals with the joint reduction of the number of dynamic and +algebraic states of a nonlinear differential-algebraic equation (NDAE) model of +a power network. The dynamic states depict the internal states of generators, +loads, renewables, whereas the algebraic ones define network states such as +voltages and phase angles. In the current literature of power system model +order reduction (MOR), the algebraic constraints are usually neglected and the +power network is commonly modeled via a set of ordinary differential equations +(ODEs) instead of NDAEs. Thus, reduction is usually carried out for the dynamic +states only and the algebraic variables are kept intact. This leaves a +significant part of the system's size and complexity unreduced. This paper +addresses this aforementioned limitation by jointly reducing both dynamic and +algebraic variables. As compared to the literature the proposed MOR techniques +are endowed with the following features: (i) no system linearization is +required, (ii) require no transformation to an equivalent or approximate ODE +representation, (iii) guarantee that the reduced-order model is +NDAE-structured and thus preserves the differential-algebraic structure of +original power system model, and (iv) can seamlessly reduce both dynamic and +algebraic variables while maintaining high accuracy. Case studies performed on +a 2000-bus power system reveal that the proposed MOR techniques are able to +reduce system order while maintaining accuracy. + +
+
+
+
+
+ + ♻ ☆ How many autonomous vehicles are required to stabilize traffic flow? + + +
+ The collective behavior of human-driven vehicles (HVs) produces the +well-known stop-and-go waves potentially leading to higher fuel consumption and +emissions. This paper investigates the stabilization of traffic flow via a +minimum number of autonomous vehicles (AVs) subject to constraints on the +control parameters aiming to reduce the number of vehicles on the road while +achieving lower fuel consumption and emissions. The unconstrained scenario has +been well-studied in recent studies. The main motivation to investigate the +constrained scenario is that, in realistic engineering applications, lower and +upper bounds exist on the control parameters. For the constrained scenario, we +optimally find the minimum number of required AVs (via computing the optimal +lower bound on the AV penetration rate) to stabilize traffic flow for a given +number of HVs. As an immediate consequence, we conclude that for a given number +of AVs, the number of HVs in the stabilized traffic flow may not be arbitrarily +large in the constrained scenario unlike the unconstrained scenario studied in +the literature. We systematically propose a procedure to compute the optimal +lower bound on the AV penetration rate using nonlinear optimization techniques. +Finally, we validate the theoretical results via numerical simulations. +Numerical simulations suggest that enlarging the constraint intervals makes a +smaller optimal lower bound on the AV penetration rate attainable. However, it +leads to a slower transient response due to a dominant pole closer to the +origin. + +
+
+
+
+
+ + ♻ ☆ Interstellar Object Accessibility and Mission Design + + +
+ Interstellar objects (ISOs) represent a compelling and under-explored +category of celestial bodies, providing physical laboratories to understand the +formation of our solar system and probe the composition and properties of +material formed in exoplanetary systems. In this work, we investigate existing +approaches to designing successful flyby missions to ISOs, including a deep +learning-driven guidance and control algorithm for ISOs traveling at velocities +over 60 km/s. We have generated spacecraft trajectories to a series of +synthetic representative ISOs, simulating a ground campaign to observe the +target and resolve its state, thereby determining the cruise and close approach +delta-Vs required for the encounter. We discuss the accessibility of and +mission design to ISOs with varying characteristics, with special focuses on 1) +state covariance estimation throughout the cruise, 2) handoffs from traditional +navigation approaches to novel autonomous navigation for fast flyby regimes, +and 3) overall recommendations about preparing for the future in situ +exploration of these targets. The lessons learned also apply to the fast flyby +of other small bodies, e.g., long-period comets and potentially hazardous +asteroids, which also require tactical responses with similar characteristics. + +
+
+ comment: IEEE Aerospace Conference, Preprint Version, Accepted: November 2022 +
+
+
+
+
+
+
+
+ + Machine Learning 176 + +
+
+
+ + ☆ A Short Note on Evaluating RepNet for Temporal Repetition Counting in + Videos + + +
+ We discuss some consistent issues on how RepNet has been evaluated in various +papers. As a way to mitigate these issues, we report RepNet performance results +on different datasets, and release evaluation code and the RepNet checkpoint to +obtain these results. Code URL: +https://github.com/google-research/google-research/blob/master/repnet/ + +
+
+
+
+
+ + ☆ The Limited Impact of Medical Adaptation of Large Language and + Vision-Language Models EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare ten +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting and supervised fine-tuning regimes for medical question-answering +(QA). For instance, across all tasks and model pairs we consider in the 3-shot +setting, medical LLMs only outperform their base models in 22.7% of cases, +reach a (statistical) tie in 36.8% of cases, and are significantly worse than +their base models in the remaining 40.5% of cases. Our conclusions are based on +(i) comparing each medical model head-to-head, directly against the +corresponding base model; (ii) optimizing the prompts for each model separately +in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty +in comparisons. While these basic practices are not consistently adopted in the +literature, our ablations show that they substantially impact conclusions. +Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs +can show performance improvements, but the benefits do not carry over to tasks +based on clinical notes. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes + additional results on clinical note QA tasks and supervised fine-tuning + evaluations +
+
+
+
+
+ + ☆ Unsupervised Parameter-free Outlier Detection using HDBSCAN* Outlier + Profiles + + +
+ In machine learning and data mining, outliers are data points that +significantly differ from the dataset and often introduce irrelevant +information that can induce bias in its statistics and models. Therefore, +unsupervised methods are crucial to detect outliers if there is limited or no +information about them. Global-Local Outlier Scores based on Hierarchies +(GLOSH) is an unsupervised outlier detection method within HDBSCAN*, a +state-of-the-art hierarchical clustering method. GLOSH estimates outlier scores +for each data point by comparing its density to the highest density of the +region they reside in the HDBSCAN* hierarchy. GLOSH may be sensitive to +HDBSCAN*'s minpts parameter that influences density estimation. With limited +knowledge about the data, choosing an appropriate minpts value beforehand is +challenging as one or some minpts values may better represent the underlying +cluster structure than others. Additionally, in the process of searching for +``potential outliers'', one has to define the number of outliers n a dataset +has, which may be impractical and is often unknown. In this paper, we propose +an unsupervised strategy to find the ``best'' minpts value, leveraging the +range of GLOSH scores across minpts values to identify the value for which +GLOSH scores can best identify outliers from the rest of the dataset. Moreover, +we propose an unsupervised strategy to estimate a threshold for classifying +points into inliers and (potential) outliers without the need to pre-define any +value. Our experiments show that our strategies can automatically find the +minpts value and threshold that yield the best or near best outlier detection +results using GLOSH. + +
+
+ comment: Accepted at IEEE International Conference on Big Data, IEEE BigData + 2024 +
+
+
+
+
+ + ☆ LLMStinger: Jailbreaking LLMs using RL fine-tuned LLMs AAAI 2025 + + +
+ We introduce LLMStinger, a novel approach that leverages Large Language +Models (LLMs) to automatically generate adversarial suffixes for jailbreak +attacks. Unlike traditional methods, which require complex prompt engineering +or white-box access, LLMStinger uses a reinforcement learning (RL) loop to +fine-tune an attacker LLM, generating new suffixes based on existing attacks +for harmful questions from the HarmBench benchmark. Our method significantly +outperforms existing red-teaming approaches (we compared against 15 of the +latest methods), achieving a +57.2% improvement in Attack Success Rate (ASR) on +LLaMA2-7B-chat and a +50.3% ASR increase on Claude 2, both models known for +their extensive safety measures. Additionally, we achieved a 94.97% ASR on +GPT-3.5 and 99.4% on Gemma-2B-it, demonstrating the robustness and adaptability +of LLMStinger across open and closed-source models. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Interaction Testing in Variation Analysis + + +
+ Relationships of cause and effect are of prime importance for explaining +scientific phenomena. Often, rather than just understanding the effects of +causes, researchers also wish to understand how a cause $X$ affects an outcome +$Y$ mechanistically -- i.e., what are the causal pathways that are activated +between $X$ and $Y$. For analyzing such questions, a range of methods has been +developed over decades under the rubric of causal mediation analysis. +Traditional mediation analysis focuses on decomposing the average treatment +effect (ATE) into direct and indirect effects, and therefore focuses on the ATE +as the central quantity. This corresponds to providing explanations for +associations in the interventional regime, such as when the treatment $X$ is +randomized. Commonly, however, it is of interest to explain associations in the +observational regime, and not just in the interventional regime. In this paper, +we introduce variation analysis, an extension of mediation analysis that +focuses on the total variation (TV) measure between $X$ and $Y$, written as +$\mathrm{E}[Y \mid X=x_1] - \mathrm{E}[Y \mid X=x_0]$. The TV measure +encompasses both causal and confounded effects, as opposed to the ATE which +only encompasses causal (direct and mediated) variations. In this way, the TV +measure is suitable for providing explanations in the natural regime and +answering questions such as ``why is $X$ associated with $Y$?''. Our focus is +on decomposing the TV measure, in a way that explicitly includes direct, +indirect, and confounded variations. Furthermore, we also decompose the TV +measure to include interaction terms between these different pathways. +Subsequently, interaction testing is introduced, involving hypothesis tests to +determine if interaction terms are significantly different from zero. If +interactions are not significant, more parsimonious decompositions of the TV +measure can be used. + +
+
+
+
+
+ + ☆ Oblique Bayesian additive regression trees + + +
+ Current implementations of Bayesian Additive Regression Trees (BART) are +based on axis-aligned decision rules that recursively partition the feature +space using a single feature at a time. Several authors have demonstrated that +oblique trees, whose decision rules are based on linear combinations of +features, can sometimes yield better predictions than axis-aligned trees and +exhibit excellent theoretical properties. We develop an oblique version of BART +that leverages a data-adaptive decision rule prior that recursively partitions +the feature space along random hyperplanes. Using several synthetic and +real-world benchmark datasets, we systematically compared our oblique BART +implementation to axis-aligned BART and other tree ensemble methods, finding +that oblique BART was competitive with -- and sometimes much better than -- +those methods. + +
+
+
+
+
+ + ☆ Offline Adaptation of Quadruped Locomotion using Diffusion Models + + +
+ We present a diffusion-based approach to quadrupedal locomotion that +simultaneously addresses the limitations of learning and interpolating between +multiple skills (modes) and of offline adapting to new locomotion behaviours +after training. This is the first framework to apply classifier-free guided +diffusion to quadruped locomotion and demonstrate its efficacy by extracting +goal-conditioned behaviour from an originally unlabelled dataset. We show that +these capabilities are compatible with a multi-skill policy and can be applied +with little modification and minimal compute overhead, i.e., running entirely +on the robot's onboard CPU. We verify the validity of our approach with hardware +experiments on the ANYmal quadruped platform. + +
+
+
+
+
+ + ☆ Model agnostic local variable importance for locally dependent + relationships + + +
+ Global variable importance measures are commonly used to interpret machine +learning model results. Local variable importance techniques assess how +variables contribute to individual observations rather than the entire dataset. +Current methods typically fail to accurately reflect locally dependent +relationships between variables and instead focus on marginal importance +values. Additionally, they are not natively adapted for multi-class +classification problems. We propose a new model-agnostic method for calculating +local variable importance, CLIQUE, that captures locally dependent +relationships, contains improvements over permutation-based methods, and can be +directly applied to multi-class classification problems. Simulated and +real-world examples show that CLIQUE emphasizes locally dependent information +and properly reduces bias in regions where variables do not affect the +response. + +
+
+
+
+
+ + ☆ Process-aware Human Activity Recognition + + +
+ Humans naturally follow distinct patterns when conducting their daily +activities, which are driven by established practices and processes, such as +production workflows, social norms and daily routines. Human activity +recognition (HAR) algorithms usually use neural networks or machine learning +techniques to analyse inherent relationships within the data. However, these +approaches often overlook the contextual information in which the data are +generated, potentially limiting their effectiveness. We propose a novel +approach that incorporates process information from context to enhance the HAR +performance. Specifically, we align probabilistic events generated by machine +learning models with process models derived from contextual information. This +alignment adaptively weighs these two sources of information to optimise HAR +accuracy. Our experiments demonstrate that our approach achieves better +accuracy and Macro F1-score compared to baseline models. + +
+
+
+
+
+ + ☆ FinRobot: AI Agent for Equity Research and Valuation with Large Language + Models + + +
+ As financial markets grow increasingly complex, there is a rising need for +automated tools that can effectively assist human analysts in equity research, +particularly within sell-side research. While Generative AI (GenAI) has +attracted significant attention in this field, existing AI solutions often fall +short due to their narrow focus on technical factors and limited capacity for +discretionary judgment. These limitations hinder their ability to adapt to new +data in real-time and accurately assess risks, which diminishes their practical +value for investors. + This paper presents FinRobot, the first AI agent framework specifically +designed for equity research. FinRobot employs a multi-agent Chain of Thought +(CoT) system, integrating both quantitative and qualitative analyses to emulate +the comprehensive reasoning of a human analyst. The system is structured around +three specialized agents: the Data-CoT Agent, which aggregates diverse data +sources for robust financial integration; the Concept-CoT Agent, which mimics +an analyst's reasoning to generate actionable insights; and the Thesis-CoT +Agent, which synthesizes these insights into a coherent investment thesis and +report. FinRobot provides thorough company analysis supported by precise +numerical data, industry-appropriate valuation metrics, and realistic risk +assessments. Its dynamically updatable data pipeline ensures that research +remains timely and relevant, adapting seamlessly to new financial information. +Unlike existing automated research tools, such as CapitalCube and Wright +Reports, FinRobot delivers insights comparable to those produced by major +brokerage firms and fundamental research vendors. We open-source FinRobot at +\url{https://github.com/AI4Finance-Foundation/FinRobot}. + +
+
+ comment: The 1st Workshop on LLMs and Generative AI for Finance, ICAIF 2024 +
+
+
+
+
+ + ☆ Deep Learning Accelerated Quantum Transport Simulations in + Nanoelectronics: From Break Junctions to Field-Effect Transistors + + +
+ Quantum transport calculations are essential for understanding and designing +nanoelectronic devices, yet the trade-off between accuracy and computational +efficiency has long limited their practical applications. We present a general +framework that combines the deep learning tight-binding Hamiltonian (DeePTB) +approach with the non-equilibrium Green's Function (NEGF) method, enabling +efficient quantum transport calculations while maintaining first-principles +accuracy. We demonstrate the capabilities of the DeePTB-NEGF framework through +two representative applications: comprehensive simulation of break junction +systems, where conductance histograms show good agreement with experimental +measurements in both metallic contact and single-molecule junction cases; and +simulation of carbon nanotube field effect transistors through self-consistent +NEGF-Poisson calculations, capturing essential physics including the +electrostatic potential and transfer characteristic curves under finite bias +conditions. This framework bridges the gap between first-principles accuracy +and computational efficiency, providing a powerful tool for high-throughput +quantum transport simulations across different scales in nanoelectronics. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Learning Gaussian Multi-Index Models with Gradient Flow: Time Complexity + and Directional Convergence AISTATS 2025 + + +
+ This work focuses on the gradient flow dynamics of a neural network model +that uses correlation loss to approximate a multi-index function on +high-dimensional standard Gaussian data. Specifically, the multi-index function +we consider is a sum of neurons $f^*(x) \!=\! \sum_{j=1}^k \! \sigma^*(v_j^T +x)$ where $v_1, \dots, v_k$ are unit vectors, and $\sigma^*$ lacks the first +and second Hermite polynomials in its Hermite expansion. It is known that, for +the single-index case ($k\!=\!1$), overcoming the search phase requires +polynomial time complexity. We first generalize this result to multi-index +functions characterized by vectors in arbitrary directions. After the search +phase, it is not clear whether the network neurons converge to the index +vectors, or get stuck at a sub-optimal solution. When the index vectors are +orthogonal, we give a complete characterization of the fixed points and prove +that neurons converge to the nearest index vectors. Therefore, using $n \! +\asymp \! k \log k$ neurons ensures finding the full set of index vectors with +gradient flow with high probability over random initialization. When $ v_i^T +v_j \!=\! \beta \! \geq \! 0$ for all $i \neq j$, we prove the existence of a +sharp threshold $\beta_c \!=\! c/(c+k)$ at which the fixed point that computes +the average of the index vectors transitions from a saddle point to a minimum. +Numerical simulations show that using a correlation loss and a mild +overparameterization suffices to learn all of the index vectors when they are +nearly orthogonal, however, the correlation loss fails when the dot product +between the index vectors exceeds a certain threshold. + +
+
+ comment: 21 pages, 6 figures, under review by AISTATS 2025 +
+
+
+
+
+ + ☆ Locally Private Sampling with Public Data + + +
+ Local differential privacy (LDP) is increasingly employed in +privacy-preserving machine learning to protect user data before sharing it with +an untrusted aggregator. Most LDP methods assume that users possess only a +single data record, which is a significant limitation since users often gather +extensive datasets (e.g., images, text, time-series data) and frequently have +access to public datasets. To address this limitation, we propose a locally +private sampling framework that leverages both the private and public datasets +of each user. Specifically, we assume each user has two distributions: $p$ and +$q$ that represent their private dataset and the public dataset, respectively. +The objective is to design a mechanism that generates a private sample +approximating $p$ while simultaneously preserving $q$. We frame this objective +as a minimax optimization problem using $f$-divergence as the utility measure. +We fully characterize the minimax optimal mechanisms for general +$f$-divergences provided that $p$ and $q$ are discrete distributions. +Remarkably, we demonstrate that this optimal mechanism is universal across all +$f$-divergences. Experiments validate the effectiveness of our minimax optimal +sampler compared to the state-of-the-art locally private sampler. + +
+
+
+
+
+ + ☆ Can sparse autoencoders be used to decompose and interpret steering + vectors? + + +
+ Steering vectors are a promising approach to control the behaviour of large +language models. However, their underlying mechanisms remain poorly understood. +While sparse autoencoders (SAEs) may offer a potential method to interpret +steering vectors, recent findings show that SAE-reconstructed vectors often +lack the steering properties of the original vectors. This paper investigates +why directly applying SAEs to steering vectors yields misleading +decompositions, identifying two reasons: (1) steering vectors fall outside the +input distribution for which SAEs are designed, and (2) steering vectors can +have meaningful negative projections in feature directions, which SAEs are not +designed to accommodate. These limitations hinder the direct use of SAEs for +interpreting steering vectors. + +
+
+
+
+
+ + ☆ Optimal Oblivious Subspace Embeddings with Near-optimal Sparsity + + +
+ An oblivious subspace embedding is a random $m\times n$ matrix $\Pi$ such +that, for any $d$-dimensional subspace, with high probability $\Pi$ preserves +the norms of all vectors in that subspace within a $1\pm\epsilon$ factor. In +this work, we give an oblivious subspace embedding with the optimal dimension +$m=\Theta(d/\epsilon^2)$ that has a near-optimal sparsity of $\tilde +O(1/\epsilon)$ non-zero entries per column of $\Pi$. This is the first result +to nearly match the conjecture of Nelson and Nguyen [FOCS 2013] in terms of the +best sparsity attainable by an optimal oblivious subspace embedding, improving +on a prior bound of $\tilde O(1/\epsilon^6)$ non-zeros per column [Chenakkod et +al., STOC 2024]. We further extend our approach to the non-oblivious setting, +proposing a new family of Leverage Score Sparsified embeddings with Independent +Columns, which yield faster runtimes for matrix approximation and regression +tasks. + In our analysis, we develop a new method which uses a decoupling argument +together with the cumulant method for bounding the edge universality error of +isotropic random matrices. To achieve near-optimal sparsity, we combine this +general-purpose approach with new trace inequalities that leverage the +specific structure of our subspace embedding construction. + +
+
+
+
+
+ + ☆ Mapping Methane -- The Impact of Dairy Farm Practices on Emissions + Through Satellite Data and Machine Learning + + +
+ This study investigates the correlation between dairy farm characteristics +and methane concentrations as derived from satellite observations in Eastern +Canada. Utilizing data from 11 dairy farms collected between January 2020 and +December 2022, we integrated Sentinel-5P satellite methane data with critical +farm-level attributes, including herd genetics, feeding practices, and +management strategies. Initial analyses revealed significant correlations with +methane concentrations, leading to the application of Variance Inflation Factor +(VIF) and Principal Component Analysis (PCA) to address multicollinearity and +enhance model stability. Subsequently, machine learning models - specifically +Random Forest and Neural Networks - were employed to evaluate feature +importance and predict methane emissions. Our findings indicate a strong +negative correlation between the Estimated Breeding Value (EBV) for protein +percentage and methane concentrations, suggesting that genetic selection for +higher milk protein content could be an effective strategy for emissions +reduction. The integration of atmospheric transport models with satellite data +further refined our emission estimates, significantly enhancing accuracy and +spatial resolution. This research underscores the potential of advanced +satellite monitoring, machine learning techniques, and atmospheric modeling in +improving methane emission assessments within the dairy sector. It emphasizes +the critical role of farm-specific characteristics in developing effective +mitigation strategies. Future investigations should focus on expanding the +dataset and incorporating inversion modeling for more precise emission +quantification. Balancing ecological impacts with economic viability will be +essential for fostering sustainable dairy farming practices. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Flow reconstruction in time-varying geometries using graph neural + networks + + +
+ The paper presents a Graph Attention Convolutional Network (GACN) for flow +reconstruction from very sparse data in time-varying geometries. The model +incorporates a feature propagation algorithm as a preprocessing step to handle +extremely sparse inputs, leveraging information from neighboring nodes to +initialize missing features. In addition, a binary indicator is introduced as a +validity mask to distinguish between the original and propagated data points, +enabling more effective learning from sparse inputs. Trained on a unique data +set of Direct Numerical Simulations (DNS) of a motored engine at a technically +relevant operating condition, the GACN shows robust performance across +different resolutions and domain sizes and can effectively handle unstructured +data and variable input sizes. The model is tested on previously unseen DNS +data as well as on an experimental data set from Particle Image Velocimetry +(PIV) measurements that were not considered during training. A comparative +analysis shows that the GACN consistently outperforms both a conventional +Convolutional Neural Network (CNN) and cubic interpolation methods on the DNS +and PIV test sets by achieving lower reconstruction errors and better capturing +fine-scale turbulent structures. In particular, the GACN effectively +reconstructs flow fields from domains up to 14 times larger than those observed +during training, with the performance advantage increasing for larger domains. + +
+
+
+
+
+ + ☆ Energy Dissipation Preserving Physics Informed Neural Network for + Allen-Cahn Equations + + +
+ This paper investigates a numerical solution of Allen-Cahn equation with +constant and degenerate mobility, with polynomial and logarithmic energy +functionals, with deterministic and random initial functions, and with +advective term in one, two, and three spatial dimensions, based on the +physics-informed neural network (PINN). To improve the learning capacity of the +PINN, we incorporate the energy dissipation property of the Allen-Cahn equation +as a penalty term into the loss function of the network. To facilitate the +learning process of random initials, we employ a continuous analogue of the +initial random condition by utilizing the Fourier series expansion. Adaptive +methods from traditional numerical analysis are also integrated to enhance the +effectiveness of the proposed PINN. Numerical results indicate a consistent +decrease in the discrete energy, while also revealing phenomena such as phase +separation and metastability. + +
+
+
+
+
+ + ☆ ScaleNet: Scale Invariance Learning in Directed Graphs + + +
+ Graph Neural Networks (GNNs) have advanced relational data analysis but lack +invariance learning techniques common in image classification. In node +classification with GNNs, it is actually the ego-graph of the center node that +is classified. This research extends the scale invariance concept to node +classification by drawing an analogy to image processing: just as scale +invariance is used in image classification to capture multi-scale features, +we propose the concept of ``scaled ego-graphs''. Scaled ego-graphs generalize +traditional ego-graphs by replacing undirected single-edges with +``scaled-edges'', which are ordered sequences of multiple directed edges. We +empirically assess the performance of the proposed scale invariance in graphs +on seven benchmark datasets, across both homophilic and heterophilic +structures. Our scale-invariance-based graph learning outperforms inception +models derived from random walks by being simpler, faster, and more accurate. +The scale invariance explains inception models' success on homophilic graphs +and limitations on heterophilic graphs. To ensure applicability of inception +models to heterophilic graphs as well, we further present ScaleNet, an +architecture that leverages multi-scaled features. ScaleNet achieves +state-of-the-art results on five out of seven datasets (four homophilic and one +heterophilic) and matches top performance on the remaining two, demonstrating +its excellent applicability. This represents a significant advance in graph +learning, offering a unified framework that enhances node classification across +various graph types. Our code is available at +https://github.com/Qin87/ScaleNet/tree/July25. + +
+
+ comment: Scale invariance in node classification is demonstrated and applied + in graph transformation to develop ScaleNet, which achieves state-of-the-art + performance on both homophilic and heterophilic directed graphs +
+
+
+
+
+ + ☆ Weakly-Supervised Anomaly Detection in Surveillance Videos Based on + Two-Stream I3D Convolution Network + + +
+ The widespread implementation of urban surveillance systems has necessitated +more sophisticated techniques for anomaly detection to ensure enhanced public +safety. This paper presents a significant advancement in the field of anomaly +detection through the application of Two-Stream Inflated 3D (I3D) Convolutional +Networks. These networks substantially outperform traditional 3D Convolutional +Networks (C3D) by more effectively extracting spatial and temporal features +from surveillance videos, thus improving the precision of anomaly detection. +Our research advances the field by implementing a weakly supervised learning +framework based on Multiple Instance Learning (MIL), which uniquely +conceptualizes surveillance videos as collections of 'bags' that contain +instances (video clips). Each instance is innovatively processed through a +ranking mechanism that prioritizes clips based on their potential to display +anomalies. This novel strategy not only enhances the accuracy and precision of +anomaly detection but also significantly diminishes the dependency on extensive +manual annotations. Moreover, through meticulous optimization of model +settings, including the choice of optimizer, our approach not only establishes +new benchmarks in the performance of anomaly detection systems but also offers +a scalable and efficient solution for real-world surveillance applications. +This paper contributes significantly to the field of computer vision by +delivering a more adaptable, efficient, and context-aware anomaly detection +system, which is poised to redefine practices in urban surveillance. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Optimal Transport-Based Displacement Interpolation with Data + Augmentation for Reduced Order Modeling of Nonlinear Dynamical Systems + + +
+ We present a novel reduced-order Model (ROM) that leverages optimal transport +(OT) theory and displacement interpolation to enhance the representation of +nonlinear dynamics in complex systems. While traditional ROM techniques face +challenges in this scenario, especially when data (i.e., observational +snapshots) is limited, our method addresses these issues by introducing a data +augmentation strategy based on OT principles. The proposed framework generates +interpolated solutions tracing geodesic paths in the space of probability +distributions, enriching the training dataset for the ROM. A key feature of our +approach is its ability to provide a continuous representation of the +solution's dynamics by exploiting a virtual-to-real time mapping. This enables +the reconstruction of solutions at finer temporal scales than those provided by +the original data. To further improve prediction accuracy, we employ Gaussian +Process Regression to learn the residual and correct the representation between +the interpolated snapshots and the physical solution. We demonstrate the +effectiveness of our methodology with atmospheric mesoscale benchmarks +characterized by highly nonlinear, advection-dominated dynamics. Our results +show improved accuracy and efficiency in predicting complex system behaviors, +indicating the potential of this approach for a wide range of applications in +computational physics and engineering. + +
+
+
+
+
+ + ☆ Bayesian Comparisons Between Representations + + +
+ Which neural networks are similar is a fundamental question for both machine +learning and neuroscience. Our novel method compares representations based on +Bayesian statistics about linear readouts from the representations. Concretely, +we suggest using the total variation distance or Jensen-Shannon distance +between prior predictive distributions to compare representations. The prior +predictive distribution is a full description of the inductive bias and +generalization of a model in Bayesian statistics, making it a great basis for +comparisons. As Jensen-Shannon distance and total variation distance are +metrics, our dissimilarity measures are pseudo-metrics for representations. For +a linear readout, our metrics just depend on the linear kernel matrix of the +representations. Thus, our metrics connect linear read-out based comparisons +to kernel based metrics like centered kernel alignment and representational +similarity analysis. We apply our new metrics to deep neural networks trained +on ImageNet-1k. Our new metrics can be computed efficiently including a +stochastic gradient without dimensionality reductions of the representations. +It broadly agrees with existing metrics, but is more stringent. It varies less +across different random image samples, and it measures how well two +representations could be distinguished based on a linear read out. Thus, our +metric nicely extends our toolkit for comparing representations. + +
+
+
+
+
+ + ☆ Recommender systems and reinforcement learning for building control and + occupant interaction: A text-mining driven review of scientific literature + + +
+ The indoor environment greatly affects health and well-being; enhancing +health and reducing energy use in these settings is a key research focus. With +advancing Information and Communication Technology (ICT), recommendation +systems and reinforcement learning have emerged as promising methods to induce +behavioral changes that improve indoor environments and building energy +efficiency. This study employs text-mining and Natural Language Processing +(NLP) to examine these approaches in building control and occupant interaction. +Analyzing approximately 27,000 articles from the ScienceDirect database, we +found extensive use of recommendation systems and reinforcement learning for +space optimization, location recommendations, and personalized control +suggestions. Despite broad applications, their use in optimizing indoor +environments and energy efficiency is limited. Traditional recommendation +algorithms are commonly used, but optimizing indoor conditions and energy +efficiency often requires advanced machine learning techniques like +reinforcement and deep learning. This review highlights the potential for +expanding recommender systems and reinforcement learning applications in +buildings and indoor environments. Areas for innovation include predictive +maintenance, building-related product recommendations, and optimizing +environments for specific needs like sleep and productivity enhancements based +on user feedback. + +
+
+
+
+
+ + ☆ Searching Latent Program Spaces + + +
+ Program synthesis methods aim to automatically generate programs restricted +to a language that can explain a given specification of input-output pairs. +While purely symbolic approaches suffer from a combinatorial search space, +recent methods leverage neural networks to learn distributions over program +structures to narrow this search space significantly, enabling more efficient +search. However, for challenging problems, it remains difficult to train models +to perform program synthesis in one shot, making test-time search essential. +Most neural methods lack structured search mechanisms during inference, relying +instead on stochastic sampling or gradient updates, which can be inefficient. +In this work, we propose the Latent Program Network (LPN), a general algorithm +for program induction that learns a distribution over latent programs in a +continuous space, enabling efficient search and test-time adaptation. We +explore how to train these networks to optimize for test-time computation and +demonstrate the use of gradient-based search both during training and at test +time. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates +performance by generalizing programs to new inputs rather than explaining the +underlying specification. We show that LPN can generalize beyond its training +distribution and adapt to unseen tasks by utilizing test-time computation, +outperforming algorithms without test-time adaptation mechanisms. + +
+
+ comment: Code available at https://github.com/clement-bonnet/lpn +
+
+
+
+
+ + ☆ MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics + Classification + + +
+ The distinct characteristics of multiomics data, including complex +interactions within and across biological layers and disease heterogeneity +(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop +novel designs to address unique challenges in multiomics prediction. In this +paper, we propose the multi-view knowledge transfer learning (MVKTrans) +framework, which transfers intra- and inter-omics knowledge in an adaptive +manner by reviewing data heterogeneity and suppressing bias transfer, thereby +enhancing classification performance. Specifically, we design a graph +contrastive module that is trained on unlabeled data to effectively learn and +transfer the underlying intra-omics patterns to the supervised task. This +unsupervised pretraining promotes learning general and unbiased representations +for each modality, regardless of the downstream tasks. In light of the varying +discriminative capacities of modalities across different diseases and/or +samples, we introduce an adaptive and bi-directional cross-omics distillation +module. This module automatically identifies richer modalities and facilitates +dynamic knowledge transfer from more informative to less informative omics, +thereby enabling a more robust and generalized integration. Extensive +experiments on four real biomedical datasets demonstrate the superior +performance and robustness of MVKTrans compared to the state-of-the-art. Code +and data are available at https://github.com/Yaolab-fantastic/MVKTrans. + +
+
+
+
+
+ + ☆ TRACE: Transformer-based Risk Assessment for Clinical Evaluation + + +
+ We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation), +a novel method for clinical risk assessment based on clinical data, leveraging +the self-attention mechanism for enhanced feature interaction and result +interpretation. Our approach is able to handle different data modalities, +including continuous, categorical and multiple-choice (checkbox) attributes. +The proposed architecture features a shared representation of the clinical data +obtained by integrating specialized embeddings of each data modality, enabling +the detection of high-risk individuals using Transformer encoder layers. To +assess the effectiveness of the proposed method, a strong baseline based on +non-negative multi-layer perceptrons (MLPs) is introduced. The proposed method +outperforms various baselines widely used in the domain of clinical risk +assessment, while effectively handling missing values. In terms of +explainability, our Transformer-based method offers easily interpretable +results via attention weights, further enhancing the clinicians' +decision-making process. + +
+
+
+
+
+ + ☆ Rethinking negative sampling in content-based news recommendation + + +
+ News recommender systems are hindered by the brief lifespan of articles, as +they undergo rapid relevance decay. Recent studies have demonstrated the +potential of content-based neural techniques in tackling this problem. However, +these models often involve complex neural architectures and often lack +consideration for negative examples. In this study, we posit that the careful +sampling of negative examples has a big impact on the model's outcome. We +devise a negative sampling technique that not only improves the accuracy of the +model but also facilitates the decentralization of the recommendation system. +The experimental results obtained using the MIND dataset demonstrate that the +accuracy of the method under consideration can compete with that of +State-of-the-Art models. The utilization of the sampling technique is essential +in reducing model complexity and accelerating the training process, while +maintaining a high level of accuracy. Finally, we discuss how decentralized +models can help improve privacy and scalability. + +
+
+
+
+
+ + ☆ FedSub: Introducing class-aware Subnetworks Fusion to Enhance + Personalized Federated Learning in Ubiquitous Systems + + +
+ Personalized Federated Learning is essential in AI-driven ubiquitous systems, +supporting the distributed development of models able to adapt to diverse and +evolving user behaviors while safeguarding privacy. Despite addressing +heterogeneous user data distributions in collaborative model training, existing +methods often face limitations balancing personalization and generalization, +oversimplifying user similarities, or relying heavily on global models. In this +paper, we propose FedSub, a novel federated approach designed to enhance +personalization through the use of class-aware prototypes and model +subnetworks. Prototypes serve as compact representations of user data, +clustered on the server to identify similarities based on specific label +patterns. Concurrently, subnetworks -- model components necessary to process +each class -- are extracted locally and fused by the server according to these +clusters, producing highly tailored model updates for each user. This +fine-grained, class-specific aggregation of clients' models allows FedSub to +capture the unique characteristics of individual user data patterns. The +effectiveness of FedSub is validated in three real-world scenarios +characterized by high data heterogeneity, derived from human activity +recognition and mobile health applications. Experimental evaluations +demonstrate FedSub's performance improvements with respect to the +state-of-the-art and significant advancements in personalization for ubiquitous +systems based on personal mobile and wearable devices. + +
+
+ comment: Submitted to Proceedings of the ACM on Interactive, Mobile, Wearable + and Ubiquitous Technologies (IMWUT) +
+
+
+
+
+ + ☆ Measuring similarity between embedding spaces using induced neighborhood + graphs + + +
+ Deep Learning techniques have excelled at generating embedding spaces that +capture semantic similarities between items. Often these representations are +paired, enabling experiments with analogies (pairs within the same domain) and +cross-modality (pairs across domains). These experiments are based on specific +assumptions about the geometry of embedding spaces, which allow finding paired +items by extrapolating the positional relationships between embedding pairs in +the training dataset, allowing for tasks such as finding new analogies, and +multimodal zero-shot classification. In this work, we propose a metric to +evaluate the similarity between paired item representations. Our proposal is +built from the structural similarity between the nearest-neighbors induced +graphs of each representation, and can be configured to compare spaces based on +different distance metrics and on different neighborhood sizes. We demonstrate +that our proposal can be used to identify similar structures at different +scales, which is hard to achieve with kernel methods such as Centered Kernel +Alignment (CKA). We further illustrate our method with two case studies: an +analogy task using GloVe embeddings, and zero-shot classification in the +CIFAR-100 dataset using CLIP embeddings. Our results show that accuracy in both +analogy and zero-shot classification tasks correlates with the embedding +similarity. These findings can help explain performance differences in these +tasks, and may lead to improved design of paired-embedding models in the +future. + +
+
+
+
+
+ + ☆ UniMat: Unifying Materials Embeddings through Multi-modal Learning + + +
+ Materials science datasets are inherently heterogeneous and are available in +different modalities such as characterization spectra, atomic structures, +microscopic images, and text-based synthesis conditions. The advancements in +multi-modal learning, particularly in vision and language models, have opened +new avenues for integrating data in different forms. In this work, we evaluate +common techniques in multi-modal learning (alignment and fusion) in unifying +some of the most important modalities in materials science: atomic structure, +X-ray diffraction patterns (XRD), and composition. We show that structure graph +modality can be enhanced by aligning with XRD patterns. Additionally, we show +that aligning and fusing more experimentally accessible data formats, such as +XRD patterns and compositions, can create more robust joint embeddings than +individual modalities across various tasks. This lays the groundwork for future +studies aiming to exploit the full potential of multi-modal data in materials +science, facilitating more informed decision-making in materials design and +discovery. + +
+
+
+
+
+ + ☆ Accelerating Quasi-Static Time Series Simulations with Foundation Models + + +
+ Quasi-static time series (QSTS) simulations have great potential for +evaluating the grid's ability to accommodate the large-scale integration of +distributed energy resources. However, as grids expand and operate closer to +their limits, iterative power flow solvers, central to QSTS simulations, become +computationally prohibitive and face increasing convergence issues. Neural +power flow solvers provide a promising alternative, speeding up power flow +computations by 3 to 4 orders of magnitude, though they are costly to train. In +this paper, we envision how recently introduced grid foundation models could +improve the economic viability of neural power flow solvers. Conceptually, +these models amortize training costs by serving as a foundation for a range of +grid operation and planning tasks beyond power flow solving, with only minimal +fine-tuning required. We call for collaboration between the AI and power grid +communities to develop and open-source these models, enabling all operators, +even those with limited resources, to benefit from AI without building +solutions from scratch. + +
+
+ comment: Equal contributors: A.P. and F.M.; Lead contact: A.P +
+
+
+
+
+ + ☆ Estimating unknown parameters in differential equations with a + reinforcement learning based PSO method + + +
+ Differential equations offer a foundational yet powerful framework for +modeling interactions within complex dynamic systems and are widely applied +across numerous scientific fields. One common challenge in this area is +estimating the unknown parameters of these dynamic relationships. However, +traditional numerical optimization methods rely on the selection of initial +parameter values, making them prone to local optima. Meanwhile, deep learning +and Bayesian methods require training models on specific differential +equations, resulting in poor versatility. This paper reformulates the parameter +estimation problem of differential equations as an optimization problem by +introducing the concept of particles from the particle swarm optimization +algorithm. Building on reinforcement learning-based particle swarm optimization +(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown +parameters of differential equations. We compared its performance on three +typical ordinary differential equations with the state-of-the-art methods, +including the RLLPSO algorithm, traditional numerical methods, deep learning +approaches, and Bayesian methods. The experimental results demonstrate that our +DERLPSO consistently outperforms other methods in terms of performance, +achieving an average Mean Square Error of 1.13e-05, which reduces the error by +approximately 4 orders of magnitude compared to other methods. Apart from +ordinary differential equations, our DERLPSO also shows great promise for +estimating unknown parameters of partial differential equations. The DERLPSO +method proposed in this paper has high accuracy, is independent of initial +parameter values, and possesses strong versatility and stability. This work +provides new insights into unknown parameter estimation for differential +equations. + +
+
+
+
+
+ + ☆ Towards Secure Intelligent O-RAN Architecture: Vulnerabilities, Threats + and Promising Technical Solutions using LLMs + + +
+ The evolution of wireless communication systems will be fundamentally +impacted by an open radio access network (O-RAN), a new concept defining an +intelligent architecture with enhanced flexibility, openness, and the ability +to slice services more efficiently. For all its promises, and like any +technological advancement, O-RAN is not without risks that need to be carefully +assessed and properly addressed to accelerate its wide adoption in future +mobile networks. In this paper, we present an in-depth security analysis of the +O-RAN architecture, discussing the potential threats that may arise in the +different O-RAN architecture layers and their impact on the Confidentiality, +Integrity, and Availability (CIA) triad. We also promote the potential of zero +trust, Moving Target Defense (MTD), blockchain, and large language models (LLM) +technologies in fortifying O-RAN's security posture. Furthermore, we +numerically demonstrate the effectiveness of MTD in empowering robust deep +reinforcement learning methods for dynamic network slice admission control in +the O-RAN architecture. Moreover, we examine the effect of explainable AI (XAI) +based on LLMs in securing the system. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Gaussian Mixture Models Based Augmentation Enhances GNN Generalization + + +
+ Graph Neural Networks (GNNs) have shown great promise in tasks like node and +graph classification, but they often struggle to generalize, particularly to +unseen or out-of-distribution (OOD) data. These challenges are exacerbated when +training data is limited in size or diversity. To address these issues, we +introduce a theoretical framework using Rademacher complexity to compute a +regret bound on the generalization error and then characterize the effect of +data augmentation. This framework informs the design of GMM-GDA, an efficient +graph data augmentation (GDA) algorithm leveraging the capability of Gaussian +Mixture Models (GMMs) to approximate any distribution. Our approach not only +outperforms existing augmentation techniques in terms of generalization but +also offers improved time complexity, making it highly suitable for real-world +applications. + +
+
+
+
+
+ + ☆ Robot See, Robot Do: Imitation Reward for Noisy Financial Environments + + +
+ The sequential nature of decision-making in financial asset trading aligns +naturally with the reinforcement learning (RL) framework, making RL a common +approach in this domain. However, the low signal-to-noise ratio in financial +markets results in noisy estimates of environment components, including the +reward function, which hinders effective policy learning by RL agents. Given +the critical importance of reward function design in RL problems, this paper +introduces a novel and more robust reward function by leveraging imitation +learning, where a trend labeling algorithm acts as an expert. We integrate +imitation (expert's) feedback with reinforcement (agent's) feedback in a +model-free RL algorithm, effectively embedding the imitation learning problem +within the RL paradigm to handle the stochasticity of reward signals. Empirical +results demonstrate that this novel approach improves financial performance +metrics compared to traditional benchmarks and RL agents trained solely using +reinforcement feedback. + +
+
+
+
+
+ + ☆ Deep Generative Demand Learning for Newsvendor and Pricing + + +
+ We consider data-driven inventory and pricing decisions in the feature-based +newsvendor problem, where demand is influenced by both price and contextual +features and is modeled without any structural assumptions. The unknown demand +distribution results in a challenging conditional stochastic optimization +problem, further complicated by decision-dependent uncertainty and the +integration of features. Inspired by recent advances in deep generative +learning, we propose a novel approach leveraging conditional deep generative +models (cDGMs) to address these challenges. cDGMs learn the demand distribution +and generate probabilistic demand forecasts conditioned on price and features. +This generative approach enables accurate profit estimation and supports the +design of algorithms for two key objectives: (1) optimizing inventory for +arbitrary prices, and (2) jointly determining optimal pricing and inventory +levels. We provide theoretical guarantees for our approach, including the +consistency of profit estimation and convergence of our decisions to the +optimal solution. Extensive simulations-ranging from simple to complex +scenarios, including one involving textual features-and a real-world case study +demonstrate the effectiveness of our approach. Our method opens a new paradigm +in management science and operations research, is adaptable to extensions of +the newsvendor and pricing problems, and holds potential for solving other +conditional stochastic optimization problems. + +
+
+ comment: 30 pages, 6 figures +
+
+
+
+
+ + ☆ Dynamic Subset Tuning: Expanding the Operational Range of + Parameter-Efficient Training for Large Language Models NeurIPS 2024 + + +
+ We propose a novel parameter-efficient training (PET) method for large +language models that adapts models to downstream tasks by optimizing a small +subset of the existing model parameters. Unlike prior methods, this subset is +not fixed in location but rather which parameters are modified evolves over the +course of training. This dynamic parameter selection can yield good performance +with many fewer parameters than extant methods. Our method enables a seamless +scaling of the subset size across an arbitrary proportion of the total model +size, while popular PET approaches like prompt tuning and LoRA cover only a +small part of this spectrum. We match or outperform prompt tuning and LoRA in +most cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given +parameter budget across different model families and sizes. + +
+
+ comment: NeurIPS 2024 Workshop on Adaptive Foundation Models +
+
+
+
+
+ + ☆ XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL + + +
+ To tackle the challenges of large language model performance in natural +language to SQL tasks, we introduce XiYan-SQL, an innovative framework that +employs a multi-generator ensemble strategy to improve candidate generation. We +introduce M-Schema, a semi-structured schema representation method designed to +enhance the understanding of database structures. To enhance the quality and +diversity of generated candidate SQL queries, XiYan-SQL integrates the +significant potential of in-context learning (ICL) with the precise control of +supervised fine-tuning. On one hand, we propose a series of training strategies +to fine-tune models to generate high-quality candidates with diverse +preferences. On the other hand, we implement the ICL approach with an example +selection method based on named entity recognition to prevent overemphasis on +entities. The refiner optimizes each candidate by correcting logical or +syntactical errors. To address the challenge of identifying the best candidate, +we fine-tune a selection model to distinguish nuances of candidate SQL queries. +The experimental results on multiple dialect datasets demonstrate the +robustness of XiYan-SQL in addressing challenges across different scenarios. +Overall, our proposed XiYan-SQL achieves the state-of-the-art execution +accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on +NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. +The proposed framework not only enhances the quality and diversity of SQL +queries but also outperforms previous methods. + +
+
+
+
+
+ + ☆ Hopfield-Fenchel-Young Networks: A Unified Framework for Associative + Memory Retrieval + + +
+ Associative memory models, such as Hopfield networks and their modern +variants, have garnered renewed interest due to advancements in memory capacity +and connections with self-attention in transformers. In this work, we introduce +a unified framework-Hopfield-Fenchel-Young networks-which generalizes these +models to a broader family of energy functions. Our energies are formulated as +the difference between two Fenchel-Young losses: one, parameterized by a +generalized entropy, defines the Hopfield scoring mechanism, while the other +applies a post-transformation to the Hopfield output. By utilizing Tsallis and +norm entropies, we derive end-to-end differentiable update rules that enable +sparse transformations, uncovering new connections between loss margins, +sparsity, and exact retrieval of single memory patterns. We further extend this +framework to structured Hopfield networks using the SparseMAP transformation, +allowing the retrieval of pattern associations rather than a single pattern. +Our framework unifies and extends traditional and modern Hopfield networks and +provides an energy minimization perspective for widely used +post-transformations like $\ell_2$-normalization and layer normalization-all +through suitable choices of Fenchel-Young losses and by using convex analysis +as a building block. Finally, we validate our Hopfield-Fenchel-Young networks +on diverse memory recall tasks, including free and sequential recall. +Experiments on simulated data, image retrieval, multiple instance learning, and +text rationalization demonstrate the effectiveness of our approach. + +
+
+ comment: 49 pages, 14 figures. arXiv admin note: text overlap with + arXiv:2402.13725 +
+
+
+
+
+ + ☆ DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning + Methods NeurIPS 2024 + + +
+ Assessing the quality of aleatoric uncertainty estimates from uncertainty +quantification (UQ) deep learning methods is important in scientific contexts, +where uncertainty is physically meaningful and important to characterize and +interpret exactly. We systematically compare aleatoric uncertainty measured by +two UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER). +Our method focuses on both zero-dimensional (0D) and two-dimensional (2D) data, +to explore how the UQ methods function for different data dimensionalities. We +investigate uncertainty injected on the input and output variables and include +a method to propagate uncertainty in the case of input uncertainty so that we +can compare the predicted aleatoric uncertainty to the known values. We +experiment with three levels of noise. The aleatoric uncertainty predicted +across all models and experiments scales with the injected noise level. +However, the predicted uncertainty is miscalibrated to $\rm{std}(\sigma_{\rm +al})$ with the true uncertainty for half of the DE experiments and almost all +of the DER experiments. The predicted uncertainty is the least accurate for +both UQ methods for the 2D input uncertainty experiment and the high-noise +level. While these results do not apply to more complex data, they highlight +that further research on post-facto calibration for these methods would be +beneficial, particularly for high-noise and high-dimensional settings. + +
+
+ comment: Accepted to the Machine Learning for Physical Sciences workshop at + NeurIPS 2024; 11 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors + + +
+ The application of machine learning (ML) algorithms in the intelligent +diagnosis of three-phase engines has the potential to significantly enhance +diagnostic performance and accuracy. Traditional methods largely rely on +signature analysis, which, despite being a standard practice, can benefit from +the integration of advanced ML techniques. In our study, we innovate by +combining state-of-the-art algorithms with a novel unsupervised anomaly +generation methodology that takes into account the physics model of the engine. +This hybrid approach leverages the strengths of both supervised ML and +unsupervised signature analysis, achieving superior diagnostic accuracy and +reliability along with a wide industrial application. Our experimental results +demonstrate that this method significantly outperforms existing ML and non-ML +state-of-the-art approaches while retaining the practical advantages of an +unsupervised methodology. The findings highlight the potential of our approach +to significantly contribute to the field of engine diagnostics, offering a +robust and efficient solution for real-world applications. + +
+
+
+
+
+ + ☆ Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space + Exploration by Reinforcement Learning Agent ICRA 2025 + + +
+ Grasping by a robot in unstructured environments is deemed a critical +challenge because of the requirement for effective adaptation to a wide +variation in object geometries, material properties, and other environmental +factors. In this paper, we propose a novel framework for robotic grasping based +on the idea of compressing high-dimensional target and gripper features in a +common latent space using a set of autoencoders. Our approach simplifies +grasping by using three autoencoders dedicated to the target, the gripper, and +a third one that fuses their latent representations. This allows the RL agent +to achieve higher learning rates at the initial stages of exploration of a new +environment, as well as at non-zero shot grasp attempts. The agent explores the +latent space of the third autoencoder for better quality grasp without explicit +reconstruction of objects. By implementing the PoWER algorithm into the RL +training process, updates on the agent's policy will be made through the +perturbation in the reward-weighted latent space. The successful exploration +efficiently constrains both position and pose integrity for feasible executions +of grasps. We evaluate our system on a diverse set of objects, demonstrating +the high success rate in grasping with minimum computational overhead. We found +that our approach enhances the adaptation of the RL agent by more than 35 \% in +simulation experiments. + +
+
+ comment: Submitted for review at IEEE ICRA 2025 +
+
+
+
+
+ + ☆ Learning Locally Adaptive Metrics that Enhance Structural Representation + with $\texttt{LAMINAR}$ NeurIPS 2024 + + +
+ We present $\texttt{LAMINAR}$, a novel unsupervised machine learning pipeline +designed to enhance the representation of structure within data via producing a +more-informative distance metric. Analysis methods in the physical sciences +often rely on standard metrics to define geometric relationships in data, which +may fail to capture the underlying structure of complex data sets. +$\texttt{LAMINAR}$ addresses this by using a continuous-normalising-flow and +inverse-transform-sampling to define a Riemannian manifold in the data space +without the need for the user to specify a metric over the data a-priori. The +result is a locally-adaptive-metric that produces structurally-informative +density-based distances. We demonstrate the utility of $\texttt{LAMINAR}$ by +comparing its output to the Euclidean metric for structured data sets. + +
+
+ comment: Accepted to the NeurIPS 2024 Machine Learning and the Physical + Sciences workshop. 6 pages, 6 figures +
+
+
+
+
+ + ☆ Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with + Variational Quantum Circuits + + +
+ Quantum Machine Learning (QML) offers tremendous potential but is currently +limited by the availability of qubits. We introduce an innovative approach that +utilizes pre-trained neural networks to enhance Variational Quantum Circuits +(VQC). This technique effectively separates approximation error from qubit +count and removes the need for restrictive conditions, making QML more viable +for real-world applications. Our method significantly improves parameter +optimization for VQC while delivering notable gains in representation and +generalization capabilities, as evidenced by rigorous theoretical analysis and +extensive empirical testing on quantum dot classification tasks. Moreover, our +results extend to applications such as human genome analysis, demonstrating the +broad applicability of our approach. By addressing the constraints of current +quantum hardware, our work paves the way for a new era of advanced QML +applications, unlocking the full potential of quantum computing in fields such +as machine learning, materials science, medicine, mimetics, and various +interdisciplinary areas. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Graph Neural Networks in Supply Chain Analytics and Optimization: + Concepts, Perspectives, Dataset and Benchmarks + + +
+ Graph Neural Networks (GNNs) have recently gained traction in transportation, +bioinformatics, language and image processing, but research on their +application to supply chain management remains limited. Supply chains are +inherently graph-like, making them ideal for GNN methodologies, which can +optimize and solve complex problems. The barriers include a lack of proper +conceptual foundations, familiarity with graph applications in SCM, and +real-world benchmark datasets for GNN-based supply chain research. To address +this, we discuss and connect supply chains with graph structures for effective +GNN application, providing detailed formulations, examples, mathematical +definitions, and task guidelines. Additionally, we present a multi-perspective +real-world benchmark dataset from a leading FMCG company in Bangladesh, +focusing on supply chain planning. We discuss various supply chain tasks using +GNNs and benchmark several state-of-the-art models on homogeneous and +heterogeneous graphs across six supply chain analytics tasks. Our analysis +shows that GNN-based models consistently outperform statistical Machine +Learning and other Deep Learning models by around 10-30% in regression, 10-30% +in classification and detection tasks, and 15-40% in anomaly detection tasks on +designated metrics. With this work, we lay the groundwork for solving supply +chain problems using GNNs, supported by conceptual discussions, methodological +insights, and a comprehensive dataset. + +
+
+ comment: 27 Pages. Extended journal version of SupplyGraph (arXiv:2401.15299). + In Review +
+
+
+
+
+ + ☆ MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal + Lymphatic Vessel Segmentation ML4H 2024 + + +
+ Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste +products from the human brain. An impairment in their functionality has been +associated with aging as well as brain disorders like multiple sclerosis and +Alzheimer's disease. However, MLVs have only recently been described for the +first time in magnetic resonance imaging (MRI), and their ramified structure +renders manual segmentation particularly difficult. Further, as there is no +consistent notion of their appearance, human-annotated MLV structures contain a +high inter-rater variability that most automatic segmentation methods cannot +take into account. In this work, we propose a new rater-aware training scheme +for the popular nnU-Net model, and we explore rater-based ensembling strategies +for accurate and consistent segmentation of MLVs. This enables us to boost +nnU-Net's performance while obtaining explicit predictions in different +annotation styles and a rater-based uncertainty estimation. Our final model, +MLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to +the human reference standard. The model further matches the human inter-rater +reliability and replicates age-related associations with MLV volume. + +
+
+ comment: ML4H 2024 +
+
+
+
+
+ + ☆ Efficient Whole Slide Image Classification through Fisher Vector + Representation + + +
+ The advancement of digital pathology, particularly through computational +analysis of whole slide images (WSI), is poised to significantly enhance +diagnostic precision and efficiency. However, the large size and complexity of +WSIs make it difficult to analyze and classify them using computers. This study +introduces a novel method for WSI classification by automating the +identification and examination of the most informative patches, thus +eliminating the need to process the entire slide. Our method involves +two stages: firstly, it extracts only a few patches from the WSIs based on +their pathological significance; and secondly, it employs Fisher vectors (FVs) +for representing features extracted from these patches, which are known for their +robustness in capturing fine-grained details. This approach not only +accentuates key pathological features within the WSI representation but also +significantly reduces computational overhead, thus making the process more +efficient and scalable. We have rigorously evaluated the proposed method across +multiple datasets to benchmark its performance against comprehensive WSI +analysis and contemporary weakly-supervised learning methodologies. The +empirical results indicate that our focused analysis of select patches, +combined with Fisher vector representation, not only aligns with, but at times +surpasses, the classification accuracy of standard practices. Moreover, this +strategy notably diminishes computational load and resource expenditure, +thereby establishing an efficient and precise framework for WSI analysis in the +realm of digital pathology. + +
+
+
+
+
+ + ☆ SAD-TIME: a Spatiotemporal-fused network for depression detection with + Automated multi-scale Depth-wise and TIME-interval-related common feature + extractor + + +
+ Background and Objective: Depression is a severe mental disorder, and +accurate diagnosis is pivotal to the cure and rehabilitation of people with +depression. However, the current questionnaire-based diagnostic methods could +bring subjective biases and may be denied by subjects. In search of a more +objective means of diagnosis, researchers have begun to experiment with deep +learning-based methods for identifying depressive disorders in recent years. +Methods: In this study, a novel Spatiotemporal-fused network with Automated +multi-scale Depth-wise and TIME-interval-related common feature extractor +(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common +features extractor (CFE), a spatial sector (SpS), a modified temporal sector +(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale +depth-wise 1D-convolutional neural network and a time-interval embedding +generator, where the unique information of each channel is preserved. The SpS +fuses the functional connectivity with the distance-based connectivity +containing spatial position of EEG electrodes. A multi-head-attention graph +convolutional network is also applied in the SpS to fuse the features from +different EEG channels. The TeS is based on long short-term memory and graph +transformer networks, where the temporal information of different time-windows +is fused. Moreover, the DAL is used after the SpS to obtain the +domain-invariant feature. Results: Experimental results under tenfold +cross-validation show that the proposed SAD-TIME method achieves 92.00% and +94.00% depression classification accuracies on two datasets, respectively, in +cross-subject mode. Conclusion: SAD-TIME is a robust depression detection +model, where the automatedly-generated features, the SpS and the TeS assist the +classification performance with the fusion of the innate spatiotemporal +information in the EEG signals. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ An Information Theoretic Approach to Operationalize Right to Data + Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ☆ Methodology for a Statistical Analysis of Influencing Factors on 3D + Object Detection Performance + + +
+ In autonomous driving, object detection is an essential task to perceive the +environment by localizing and classifying objects. Most object detection +algorithms rely on deep learning for their superior performance. However, their +black box nature makes it challenging to ensure safety. In this paper, we +propose a first-of-its-kind methodology for statistical analysis of the +influence of various factors related to the objects to detect or the +environment on the detection performance of both LiDAR- and camera-based 3D +object detectors. We perform a univariate analysis between each of the factors +and the detection error in order to compare the strength of influence. To +better identify potential sources of detection errors, we also analyze the +performance in dependency of the influencing factors and examine the +interdependencies between the different influencing factors. Recognizing the +factors that influence detection performance helps identify robustness issues +in the trained object detector and supports the safety approval of object +detection systems. + +
+
+
+
+
+ + ☆ Learning Model Agnostic Explanations via Constraint Programming + + +
+ Interpretable Machine Learning faces a recurring challenge of explaining the +predictions made by opaque classifiers such as ensemble models, kernel methods, +or neural networks in terms that are understandable to humans. When the model +is viewed as a black box, the objective is to identify a small set of features +that jointly determine the black box response with minimal error. However, +finding such model-agnostic explanations is computationally demanding, as the +problem is intractable even for binary classifiers. In this paper, the task is +framed as a Constraint Optimization Problem, where the constraint solver seeks +an explanation of minimum error and bounded size for an input data instance and +a set of samples generated by the black box. From a theoretical perspective, +this constraint programming approach offers PAC-style guarantees for the output +explanation. We evaluate the approach empirically on various datasets and show +that it statistically outperforms the state-of-the-art heuristic Anchors +method. + +
+
+
+
+
+ + ☆ Trap-MID: Trapdoor-based Defense against Model Inversion Attacks NeurIPS + + +
+ Model Inversion (MI) attacks pose a significant threat to the privacy of Deep +Neural Networks by recovering training data distribution from well-trained +models. While existing defenses often rely on regularization techniques to +reduce information leakage, they remain vulnerable to recent attacks. In this +paper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to +mislead MI attacks. A trapdoor is integrated into the model to predict a +specific label when the input is injected with the corresponding trigger. +Consequently, this trapdoor information serves as the "shortcut" for MI +attacks, leading them to extract trapdoor triggers rather than private data. We +provide theoretical insights into the impacts of trapdoor's effectiveness and +naturalness on deceiving MI attacks. In addition, empirical experiments +demonstrate the state-of-the-art defense performance of Trap-MID against +various MI attacks without the requirements for extra data or large +computational overhead. Our source code is publicly available at +https://github.com/ntuaislab/Trap-MID. + +
+
+ comment: Accepted by Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Machine Unlearning on Pre-trained Models by Residual Feature Alignment + Using LoRA + + +
+ Machine unlearning is a newly emerged technology that removes a subset of the +training data from a trained model without affecting the model performance on +the remaining data. This topic is becoming increasingly important in protecting +user privacy and eliminating harmful or outdated data. The key challenge lies +in effectively and efficiently unlearning specific information without +compromising the model's utility on the retained data. For the pre-trained +models, fine-tuning is an important way to achieve the unlearning target. +Previous work typically fine-tuned the entire model's parameters, which incurs +significant computation costs. In addition, the fine-tuning process may cause +shifts in the intermediate layer features, affecting the model's overall +utility. In this work, we propose a novel and efficient machine unlearning +method on pre-trained models. We term the method as Residual Feature Alignment +Unlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose +the model's intermediate features into pre-trained features and residual +features. By adjusting the residual features, we align the unlearned model with +the pre-trained model at the intermediate feature level to achieve both +unlearning and remaining targets. The method aims to learn the zero residuals +on the retained set and shifted residuals on the unlearning set. Extensive +experiments on numerous datasets validate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ One STEP at a time: Language Agents are Stepwise Planners + + +
+ Language agents have shown promising adaptability in dynamic environments to +perform complex tasks. However, despite the versatile knowledge embedded in +large language models, these agents still fall short when it comes to tasks +that require planning. We introduce STEP, a novel framework designed to +efficiently learn from previous experiences to enhance the planning +capabilities of language agents in future steps. Concretely, STEP functions +through four interconnected components. First, the Planner takes on the task, +breaks it down into subtasks and provides relevant insights. Then the Executor +generates action candidates, while the Evaluator ensures the actions align with +learned rules from previous experiences. Lastly, Memory stores experiences to +inform future decisions. In the ScienceWorld benchmark, our results show that +STEP consistently outperforms state-of-the-art models, achieving an overall +score of 67.4 and successfully completing 12 out of 18 tasks. These findings +highlight STEP's potential as a framework for enhancing planning capabilities +in language agents, paving the way for more sophisticated task-solving in +dynamic environments. + +
+
+
+
+
+ + ☆ Properties of fairness measures in the context of varying class + imbalance and protected group ratios + + +
+ Society is increasingly relying on predictive models in fields like criminal +justice, credit risk management, or hiring. To prevent such automated systems +from discriminating against people belonging to certain groups, fairness +measures have become a crucial component in socially relevant applications of +machine learning. However, existing fairness measures have been designed to +assess the bias between predictions for protected groups without considering +the imbalance in the classes of the target variable. Current research on the +potential effect of class imbalance on fairness focuses on practical +applications rather than dataset-independent measure properties. In this paper, +we study the general properties of fairness measures for changing class and +protected group proportions. For this purpose, we analyze the probability mass +functions of six of the most popular group fairness measures. We also measure +how the probability of achieving perfect fairness changes for varying class +imbalance ratios. Moreover, we relate the dataset-independent properties of +fairness measures described in this paper to classifier fairness in real-life +tasks. Our results show that measures such as Equal Opportunity and Positive +Predictive Parity are more sensitive to changes in class imbalance than +Accuracy Equality. These findings can help guide researchers and practitioners +in choosing the most appropriate fairness measures for their classification +problems. + +
+
+
+
+
+ + ☆ Material Property Prediction with Element Attribute Knowledge Graphs and + Multimodal Representation Learning + + +
+ Machine learning has become a crucial tool for predicting the properties of +crystalline materials. However, existing methods primarily represent material +information by constructing multi-edge graphs of crystal structures, often +overlooking the chemical and physical properties of elements (such as atomic +radius, electronegativity, melting point, and ionization energy), which have a +significant impact on material performance. To address this limitation, we +first constructed an element property knowledge graph and utilized an embedding +model to encode the element attributes within the knowledge graph. Furthermore, +we propose a multimodal fusion framework, ESNet, which integrates element +property features with crystal structure features to generate joint multimodal +representations. This provides a more comprehensive perspective for predicting +the performance of crystalline materials, enabling the model to consider both +microstructural composition and chemical characteristics of the materials. We +conducted experiments on the Materials Project benchmark dataset, which showed +leading performance in the bandgap prediction task and achieved results on a +par with existing benchmarks in the formation energy prediction task. + +
+
+
+
+
+ + ☆ Quantifying Qualitative Insights: Leveraging LLMs to Market Predict + + +
+ Recent advancements in Large Language Models (LLMs) have the potential to +transform financial analytics by integrating numerical and textual data. +However, challenges such as insufficient context when fusing multimodal +information and the difficulty in measuring the utility of qualitative outputs, +which LLMs generate as text, have limited their effectiveness in tasks such as +financial forecasting. This study addresses these challenges by leveraging +daily reports from securities firms to create high-quality contextual +information. The reports are segmented into text-based key factors and combined +with numerical data, such as price information, to form context sets. By +dynamically updating few-shot examples based on the query time, the sets +incorporate the latest information, forming a highly relevant set closely +aligned with the query point. Additionally, a crafted prompt is designed to +assign scores to the key factors, converting qualitative insights into +quantitative results. The derived scores undergo a scaling process, +transforming them into real-world values that are used for prediction. Our +experiments demonstrate that LLMs outperform time-series models in market +forecasting, though challenges such as imperfect reproducibility and limited +explainability remain. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ CLaSP: Learning Concepts for Time-Series Signals from Natural Language + Supervision + + +
+ This paper proposes a foundation model called "CLaSP" that can search time +series signals using natural language that describes the characteristics of the +signals as queries. Previous efforts to represent time series signal data in +natural language have had challenges in designing a conventional class of time +series signal characteristics, formulating their quantification, and creating a +dictionary of synonyms. To overcome these limitations, the proposed method +introduces a neural network based on contrastive learning. This network is +first trained using the datasets TRUCE and SUSHI, which consist of time series +signals and their corresponding natural language descriptions. Previous studies +have proposed vocabularies that data analysts use to describe signal +characteristics, and SUSHI was designed to cover these terms. We believe that a +neural network trained on these datasets will enable data analysts to search +using natural language vocabulary. Furthermore, our method does not require a +dictionary of predefined synonyms, and it leverages common sense knowledge +embedded in a large-scale language model (LLM). Experimental results +demonstrate that CLaSP enables natural language search of time series signal +data and can accurately learn the points at which signal data changes. + +
+
+
+
+
+ + ☆ Interpretable Syntactic Representations Enable Hierarchical Word Vectors + + +
+ The distributed representations currently used are dense and uninterpretable, +leading to interpretations that themselves are relative, overcomplete, and hard +to interpret. We propose a method that transforms these word vectors into +reduced syntactic representations. The resulting representations are compact +and interpretable allowing better visualization and comparison of the word +vectors and we successively demonstrate that the drawn interpretations are in +line with human judgment. The syntactic representations are then used to create +hierarchical word vectors using an incremental learning approach similar to the +hierarchical aspect of human learning. As these representations are drawn from +pre-trained vectors, the generation process and learning approach are +computationally efficient. Most importantly, we find out that syntactic +representations provide a plausible interpretation of the vectors and +subsequent hierarchical vectors outperform the original vectors in benchmark +tests. + +
+
+
+
+
+ + ☆ Physics Informed Distillation for Diffusion Models + + +
+ Diffusion models have recently emerged as a potent tool in generative +modeling. However, their inherent iterative nature often results in sluggish +image generation due to the requirement for multiple model evaluations. Recent +progress has unveiled the intrinsic link between diffusion models and +Probability Flow Ordinary Differential Equations (ODEs), thus enabling us to +conceptualize diffusion models as ODE systems. Simultaneously, Physics Informed +Neural Networks (PINNs) have substantiated their effectiveness in solving +intricate differential equations through implicit modeling of their solutions. +Building upon these foundational insights, we introduce Physics Informed +Distillation (PID), which employs a student model to represent the solution of +the ODE system corresponding to the teacher diffusion model, akin to the +principles employed in PINNs. Through experiments on CIFAR 10 and ImageNet +64x64, we observe that PID achieves performance comparable to recent +distillation methods. Notably, it demonstrates predictable trends concerning +method-specific hyperparameters and eliminates the need for synthetic dataset +generation during the distillation process, both of which contribute to its +easy-to-use nature as a distillation approach for Diffusion Models. Our code +and pre-trained checkpoint are publicly available at: +https://github.com/pantheon5100/pid_diffusion.git. + +
+
+
+
+
+ + ☆ Federated Graph Learning with Graphless Clients + + +
+ Federated Graph Learning (FGL) is tasked with training machine learning +models, such as Graph Neural Networks (GNNs), for multiple clients, each with +its own graph data. Existing methods usually assume that each client has both +node features and graph structure of its graph data. In real-world scenarios, +however, there exist federated systems where only a part of the clients have +such data while other clients (i.e. graphless clients) may only have node +features. This naturally leads to a novel problem in FGL: how to jointly train +a model over distributed graph data with graphless clients? In this paper, we +propose a novel framework FedGLS to tackle the problem in FGL with graphless +clients. In FedGLS, we devise a local graph learner on each graphless client +which learns the local graph structure with the structure knowledge transferred +from other clients. To enable structure knowledge transfer, we design a GNN +model and a feature encoder on each client. During local training, the feature +encoder retains the local graph structure knowledge together with the GNN model +via knowledge distillation, and the structure knowledge is transferred among +clients in global update. Our extensive experiments demonstrate the superiority +of the proposed FedGLS over five baselines. + +
+
+ comment: Accepted by Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ☆ Surprisingly Popular Voting for Concentric Rank-Order Models + + +
+ An important problem on social information sites is the recovery of ground +truth from individual reports when the experts are in the minority. The wisdom +of the crowd, i.e. the collective opinion of a group of individuals fails in +such a scenario. However, the surprisingly popular (SP) +algorithm~\cite{prelec2017solution} can recover the ground truth even when the +experts are in the minority, by asking the individuals to report additional +prediction reports--their beliefs about the reports of others. Several recent +works have extended the surprisingly popular algorithm to an equivalent voting +rule (SP-voting) to recover the ground truth ranking over a set of $m$ +alternatives. However, we are yet to fully understand when SP-voting can +recover the ground truth ranking, and if so, how many samples (votes and +predictions) it needs. We answer this question by proposing two rank-order +models and analyzing the sample complexity of SP-voting under these models. In +particular, we propose concentric mixtures of Mallows and Plackett-Luce models +with $G (\ge 2)$ groups. Our models generalize previously proposed concentric +mixtures of Mallows models with $2$ groups, and we highlight the importance of +$G > 2$ groups by identifying three distinct groups (expert, intermediate, and +non-expert) from existing datasets. Next, we provide conditions on the +parameters of the underlying models so that SP-voting can recover ground-truth +rankings with high probability, and also derive sample complexities under the +same. We complement the theoretical results by evaluating SP-voting on +simulated and real datasets. + +
+
+
+
+
+ + ☆ Coverage Analysis for Digital Cousin Selection -- Improving + Multi-Environment Q-Learning + + +
+ Q-learning is widely employed for optimizing various large-dimensional +networks with unknown system dynamics. Recent advancements include +multi-environment mixed Q-learning (MEMQ) algorithms, which utilize multiple +independent Q-learning algorithms across multiple, structurally related but +distinct environments and outperform several state-of-the-art Q-learning +algorithms in terms of accuracy, complexity, and robustness. We herein conduct +a comprehensive probabilistic coverage analysis to ensure optimal data coverage +conditions for MEMQ algorithms. First, we derive upper and lower bounds on the +expectation and variance of different coverage coefficients (CC) for MEMQ +algorithms. Leveraging these bounds, we develop a simple way of comparing the +utilities of multiple environments in MEMQ algorithms. This approach appears to +be near optimal versus our previously proposed partial ordering approach. We +also present a novel CC-based MEMQ algorithm to improve the accuracy and +complexity of existing MEMQ algorithms. Numerical experiments are conducted +using random network graphs with four different graph properties. Our algorithm +can reduce the average policy error (APE) by 65% compared to partial ordering +and is 95% faster than the exhaustive search. It also achieves 60% less APE +than several state-of-the-art reinforcement learning and prior MEMQ algorithms. +Additionally, we numerically verify the theoretical results and show their +scalability with the action-space size. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Communication Efficient Decentralization for Smoothed Online Convex + Optimization + + +
+ We study the multi-agent Smoothed Online Convex Optimization (SOCO) problem, +where $N$ agents interact through a communication graph. In each round, each +agent $i$ receives a strongly convex hitting cost function $f^i_t$ in an online +fashion and selects an action $x^i_t \in \mathbb{R}^d$. The objective is to +minimize the global cumulative cost, which includes the sum of individual +hitting costs $f^i_t(x^i_t)$, a temporal "switching cost" for changing +decisions, and a spatial "dissimilarity cost" that penalizes deviations in +decisions among neighboring agents. We propose the first decentralized +algorithm for multi-agent SOCO and prove its asymptotic optimality. Our +approach allows each agent to operate using only local information from its +immediate neighbors in the graph. For finite-time performance, we establish +that the optimality gap in competitive ratio decreases with the time horizon +$T$ and can be conveniently tuned based on the per-round computation available +to each agent. Moreover, our results hold even when the communication graph +changes arbitrarily and adaptively over time. Finally, we establish that the +computational complexity per round depends only logarithmically on the number +of agents and almost linearly on their degree within the graph, ensuring +scalability for large-system implementations. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ Bangla Grammatical Error Detection Leveraging Transformer-based Token + Classification + + +
+ Bangla is the seventh most spoken language by a total number of speakers in +the world, and yet the development of an automated grammar checker in this +language is an understudied problem. Bangla grammatical error detection is a +task of detecting sub-strings of a Bangla text that contain grammatical, +punctuation, or spelling errors, which is crucial for developing an automated +Bangla typing assistant. Our approach involves breaking down the task as a +token classification problem and utilizing state-of-the-art transformer-based +models. Finally, we combine the output of these models and apply rule-based +post-processing to generate a more reliable and comprehensive result. Our +system is evaluated on a dataset consisting of over 25,000 texts from various +sources. Our best model achieves a Levenshtein distance score of 1.04. Finally, +we provide a detailed analysis of different components of our system. + +
+
+
+
+
+ + ☆ Learning-Augmented Algorithms for Online Concave Packing and Convex + Covering Problems + + +
+ Learning-augmented algorithms have been extensively studied across the +computer science community in recent years, driven by advances in machine +learning predictors, which can provide additional information to augment +classical algorithms. Such predictions are especially powerful in the context +of online problems, where decisions have to be made without knowledge of the +future, and which traditionally exhibit impossibility results bounding the +performance of any online algorithm. The study of learning-augmented algorithms +thus aims to use external advice prudently, to overcome classical impossibility +results when the advice is accurate, and still perform comparably to the +state-of-the-art online algorithms even when the advice is inaccurate. + In this paper, we present learning-augmented algorithmic frameworks for two +fundamental optimization settings, extending and generalizing prior works. For +online packing with concave objectives, we present a simple but overarching +strategy that switches between the advice and the state-of-the-art online +algorithm. For online covering with convex objectives, we greatly extend +primal-dual methods for online convex covering programs by Azar et al. (FOCS +2016) and the previous learning-augmented framework for online covering linear +programs from the literature, to many new applications. We show that our +algorithms break impossibility results when the advice is accurate, while +maintaining comparable performance with state-of-the-art classical online +algorithms even when the advice is erroneous. + +
+
+ comment: 38 pages. In submission +
+
+
+
+
+ + ☆ Neural Conjugate Flows: Physics-informed architectures with flow + structure + + +
+ We introduce Neural Conjugate Flows (NCF), a class of neural network +architectures equipped with exact flow structure. By leveraging topological +conjugation, we prove that these networks are not only naturally isomorphic to +a continuous group, but are also universal approximators for flows of ordinary +differential equations (ODEs). Furthermore, topological properties of these +flows can be enforced by the architecture in an interpretable manner. We +demonstrate in numerical experiments how this topological group structure leads +to concrete computational gains over other physics informed neural networks in +estimating and extrapolating latent dynamics of ODEs, while training up to five +times faster than other flow-based architectures. + +
+
+
+
+
+ + ☆ Are LLMs Prescient? A Continuous Evaluation using Daily News as the + Oracle + + +
+ Many existing evaluation benchmarks for Large Language Models (LLMs) quickly +become outdated due to the emergence of new models and training data. These +benchmarks also fall short in assessing how LLM performance changes over time, +as they consist of static questions without a temporal dimension. To address +these limitations, we propose using future event prediction as a continuous +evaluation method to assess LLMs' temporal generalization and forecasting +abilities. Our benchmark, Daily Oracle, automatically generates question-answer +(QA) pairs from daily news, challenging LLMs to predict "future" event +outcomes. Our findings reveal that as pre-training data becomes outdated, LLM +performance degrades over time. While Retrieval Augmented Generation (RAG) has +the potential to enhance prediction accuracy, the performance degradation +pattern persists, highlighting the need for continuous model updates. + +
+
+
+
+
+ + ☆ Conditional Variable Flow Matching: Transforming Conditional Densities + with Amortized Conditional Optimal Transport + + +
+ Forecasting stochastic nonlinear dynamical systems under the influence of +conditioning variables is a fundamental challenge repeatedly encountered across +the biological and physical sciences. While flow-based models can impressively +predict the temporal evolution of probability distributions representing +possible outcomes of a specific process, existing frameworks cannot +satisfactorily account for the impact of conditioning variables on these +dynamics. Amongst several limitations, existing methods require training data +with paired conditions and are developed for discrete conditioning variables. +We propose Conditional Variable Flow Matching (CVFM), a framework for learning +flows transforming conditional distributions with amortization across +continuous conditioning variables - permitting predictions across the +conditional density manifold. This is accomplished through several novel +advances, in particular, simultaneous sample conditioned flows over the main +and conditioning variables, alongside a conditional Wasserstein distance and +kernel facilitating conditional optimal transport. Collectively, these advances +allow for learning system dynamics provided measurement data whose states and +conditioning variables are not in correspondence. We demonstrate CVFM on a +suite of increasingly challenging problems, including discrete and continuous +conditional mapping benchmarks, image-to-image domain transfer, and modeling +the temporal evolution of materials internal structure during manufacturing +processes. We observe that CVFM results in improved performance and convergence +characteristics over alternative conditional variants. + +
+
+
+
+
+ + ☆ SDDBench: A Benchmark for Synthesizable Drug Design + + +
+ A significant challenge in wet lab experiments with current drug design +generative models is the trade-off between pharmacological properties and +synthesizability. Molecules predicted to have highly desirable properties are +often difficult to synthesize, while those that are easily synthesizable tend +to exhibit less favorable properties. As a result, evaluating the +synthesizability of molecules in general drug design scenarios remains a +significant challenge in the field of drug discovery. The commonly used +synthetic accessibility (SA) score aims to evaluate the ease of synthesizing +generated molecules, but it falls short of guaranteeing that synthetic routes +can actually be found. Inspired by recent advances in top-down synthetic route +generation, we propose a new, data-driven metric to evaluate molecule +synthesizability. Our approach directly assesses the feasibility of synthetic +routes for a given molecule through our proposed round-trip score. This novel +metric leverages the synergistic duality between retrosynthetic planners and +reaction predictors, both of which are trained on extensive reaction datasets. +To demonstrate the efficacy of our method, we conduct a comprehensive +evaluation of round-trip scores alongside search success rate across a range of +representative molecule generative models. Code is available at +https://github.com/SongtaoLiu0823/SDDBench. + +
+
+
+
+
+ + ☆ TowerDebias: A Novel Debiasing Method based on the Tower Property + + +
+ Decision-making processes have increasingly come to rely on sophisticated +machine learning tools, raising concerns about the fairness of their +predictions with respect to any sensitive groups. The widespread use of +commercial black-box machine learning models necessitates careful consideration +of their legal and ethical implications on consumers. In situations where users +have access to these "black-box" models, a key question emerges: how can we +mitigate or eliminate the influence of sensitive attributes, such as race or +gender? We propose towerDebias (tDB), a novel approach designed to reduce the +influence of sensitive variables in predictions made by black-box models. Using +the Tower Property from probability theory, tDB aims to improve prediction +fairness during the post-processing stage in a manner amenable to the +Fairness-Utility Tradeoff. This method is highly flexible, requiring no prior +knowledge of the original model's internal structure, and can be extended to a +range of different applications. We provide a formal improvement theorem for +tDB and demonstrate its effectiveness in both regression and classification +tasks, underscoring its impact on the fairness-utility tradeoff. + +
+
+ comment: To be submitted to a journal soon +
+
+
+
+
+ + ☆ RESOLVE: Relational Reasoning with Symbolic and Object-Level Features + Using Vector Symbolic Processing + + +
+ Modern transformer-based encoder-decoder architectures struggle with +reasoning tasks due to their inability to effectively extract relational +information between input objects (data/tokens). Recent work introduced the +Abstractor module, embedded between transformer layers, to address this gap. +However, the Abstractor layer, while excelling at capturing relational +information (pure relational reasoning), faces challenges in tasks that require +both object and relational-level reasoning (partial relational reasoning). To +address this, we propose RESOLVE, a neuro-vector symbolic architecture that +combines object-level features with relational representations in +high-dimensional spaces, using fast and efficient operations such as bundling +(summation) and binding (Hadamard product) allowing both object-level features +and relational representations to coexist within the same structure without +interfering with one another. RESOLVE is driven by a novel attention mechanism +that operates in a bipolar high dimensional space, allowing fast attention +score computation compared to the state-of-the-art. By leveraging this design, +the model achieves both low compute latency and memory efficiency. RESOLVE also +offers better generalizability while achieving higher accuracy in purely +relational reasoning tasks such as sorting as well as partial relational +reasoning tasks such as math problem-solving compared to state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Hashing for Protein Structure Similarity Search + + +
+ Protein structure similarity search (PSSS), which tries to search proteins +with similar structures, plays a crucial role across diverse domains from drug +design to protein function prediction and molecular evolution. Traditional +alignment-based PSSS methods, which directly calculate alignment on the protein +structures, are highly time-consuming with high memory cost. Recently, +alignment-free methods, which represent protein structures as fixed-length +real-valued vectors, are proposed for PSSS. Although these methods have lower +time and memory cost than alignment-based methods, their time and memory cost +is still too high for large-scale PSSS, and their accuracy is unsatisfactory. +In this paper, we propose a novel method, called +$\underline{\text{p}}$r$\underline{\text{o}}$tein +$\underline{\text{s}}$tructure $\underline{\text{h}}$ashing (POSH), for PSSS. +POSH learns a binary vector representation for each protein structure, which +can dramatically reduce the time and memory cost for PSSS compared with +real-valued vector representation based methods. Furthermore, in POSH we also +propose expressive hand-crafted features and a structure encoder to well model +both node and edge interactions in proteins. Experimental results on real +datasets show that POSH can outperform other methods to achieve +state-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more +than six times and speed improvement of more than four times, compared with +other methods. + +
+
+
+
+
+ + ☆ Least Squares Training of Quadratic Convolutional Neural Networks with + Applications to System Theory + + +
+ This paper provides a least squares formulation for the training of a 2-layer +convolutional neural network using quadratic activation functions, a 2-norm +loss function, and no regularization term. Using this method, an analytic +expression for the globally optimal weights is obtained alongside a quadratic +input-output equation for the network. These properties make the network a +viable tool in system theory by enabling further analysis, such as the +sensitivity of the output to perturbations in the input, which is crucial for +safety-critical systems such as aircraft or autonomous vehicles. The least +squares method is compared to previously proposed strategies for training +quadratic networks and to a back-propagation-trained ReLU network. The proposed +method is applied to a system identification problem and a GPS position +estimation problem. The least squares network is shown to have a significantly +reduced training time with minimal compromises on prediction accuracy alongside +the advantages of having an analytic input-output equation. Although these +results only apply to 2-layer networks, this paper motivates the exploration of +deeper quadratic networks in the context of system theory. + +
+
+
+
+
+ + ☆ GPTree: Towards Explainable Decision-Making via LLM-powered Decision + Trees + + +
+ Traditional decision tree algorithms are explainable but struggle with +non-linear, high-dimensional data, limiting their applicability in complex +decision-making. Neural networks excel at capturing complex patterns but +sacrifice explainability in the process. In this work, we present GPTree, a +novel framework combining explainability of decision trees with the advanced +reasoning capabilities of LLMs. GPTree eliminates the need for feature +engineering and prompt chaining, requiring only a task-specific prompt and +leveraging a tree-based structure to dynamically split samples. We also +introduce an expert-in-the-loop feedback mechanism to further enhance +performance by enabling human intervention to refine and rebuild decision +paths, emphasizing the harmony between human expertise and machine +intelligence. Our decision tree achieved a 7.8% precision rate for identifying +"unicorn" startups at the inception stage of a startup, surpassing gpt-4o with +few-shot learning as well as the best human decision-makers (3.1% to 5.6%). + +
+
+
+
+
+ + ☆ Drone Detection using Deep Neural Networks Trained on Pure Synthetic + Data + + +
+ Drone detection has benefited from improvements in deep neural networks, but +like many other applications, suffers from the limited availability of accurate data +for training. Synthetic data provides a potential for low-cost data generation +and has been shown to improve data availability and quality. However, models +trained on synthetic datasets need to prove their ability to perform on +real-world data, known as the problem of sim-to-real transferability. Here, we +present a drone detection Faster-RCNN model trained on a purely synthetic +dataset that transfers to real-world data. We found that it achieves an AP_50 +of 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones - +compared with 97.8% for an equivalent model trained on real-world data. Our +results show that using synthetic data for drone detection has the potential to +reduce data collection costs and improve labelling quality. These findings +could be a starting point for more elaborate synthetic drone datasets. For +example, realistic recreations of specific scenarios could de-risk the dataset +generation of safety-critical applications such as the detection of drones at +airports. Further, synthetic data may enable reliable drone detection systems, +which could benefit other areas, such as unmanned traffic management systems. +The code is available at +https://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the +datasets +https://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Code-mixed LLM: Improve Large Language Models' Capability to Handle + Code-Mixing through Reinforcement Learning from AI Feedback + + +
+ Code-mixing (CM) or code-switching (CSW) refers to the juxtaposition of +linguistic units from two or more languages during the conversation or +sometimes even a single utterance. Code-mixing introduces unique challenges in +daily life, such as syntactic mismatches and semantic blending, that are rarely +encountered in monolingual settings. Large language models (LLMs) have +revolutionized the field of natural language processing (NLP) by offering +unprecedented capabilities in understanding human languages. However, the +effectiveness of current state-of-the-art multilingual LLMs has not yet been +fully explored in the CM scenario. To fill this gap, we first benchmark the +performance of multilingual LLMs on various code-mixing NLP tasks. Then we +propose to improve the multilingual LLMs' ability to understand code-mixing +through reinforcement learning from human feedback (RLHF) and code-mixed +machine translation tasks. Given the high-cost and time-consuming preference +labeling procedure, we improve this by utilizing LLMs as annotators to perform +the reinforcement learning from AI feedback (RLAIF). The experiments show the +effectiveness of the proposed method. + +
+
+ comment: initial version: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Continuous GNN-based Anomaly Detection on Edge using Efficient Adaptive + Knowledge Graph Learning + + +
+ The increasing demand for robust security solutions across various industries +has made Video Anomaly Detection (VAD) a critical task in applications such as +intelligent surveillance, evidence investigation, and violence detection. +Traditional approaches to VAD often rely on finetuning large pre-trained +models, which can be computationally expensive and impractical for real-time or +resource-constrained environments. To address this, MissionGNN introduced a +more efficient method by training a graph neural network (GNN) using a fixed +knowledge graph (KG) derived from large language models (LLMs) like GPT-4. +While this approach demonstrated significant efficiency in computational power +and memory, it faces limitations in dynamic environments where frequent updates +to the KG are necessary due to evolving behavior trends and shifting data +patterns. These updates typically require cloud-based computation, posing +challenges for edge computing applications. In this paper, we propose a novel +framework that facilitates continuous KG adaptation directly on edge devices, +overcoming the limitations of cloud dependency. Our method dynamically modifies +the KG through a three-phase process: pruning, alternating, and creating nodes, +enabling real-time adaptation to changing data trends. This continuous learning +approach enhances the robustness of anomaly detection models, making them more +suitable for deployment in dynamic and resource-constrained environments. + +
+
+ comment: Accepted to DATE 2025 +
+
+
+
+
+ + ☆ Language-Model Prior Overcomes Cold-Start Items + + +
+ The growth of recommender systems (RecSys) is driven by digitization and the +need for personalized content in areas such as e-commerce and video streaming. +The content in these systems often changes rapidly and therefore they +constantly face the ongoing cold-start problem, where new items lack +interaction data and are hard to value. Existing solutions for the cold-start +problem, such as content-based recommenders and hybrid methods, leverage item +metadata to determine item similarities. The main challenge with these methods +is their reliance on structured and informative metadata to capture detailed +item similarities, which may not always be available. This paper introduces a +novel approach for cold-start item recommendation that utilizes the language +model (LM) to estimate item similarities, which are further integrated as a +Bayesian prior with classic recommender systems. This approach is generic and +able to boost the performance of various recommenders. Specifically, our +experiments integrate it with both sequential and collaborative filtering-based +recommenders and evaluate it on two real-world datasets, demonstrating the +enhanced performance of the proposed approach. + +
+
+ comment: This paper is dedicated to cold-start item recommendation using + language-model priors +
+
+
+
+
+ + ☆ Minimax Optimal Two-Sample Testing under Local Differential Privacy + + +
+ We explore the trade-off between privacy and statistical utility in private +two-sample testing under local differential privacy (LDP) for both multinomial +and continuous data. We begin by addressing the multinomial case, where we +introduce private permutation tests using practical privacy mechanisms such as +Laplace, discrete Laplace, and Google's RAPPOR. We then extend our multinomial +approach to continuous data via binning and study its uniform separation rates +under LDP over Hölder and Besov smoothness classes. The proposed tests for +both discrete and continuous cases rigorously control the type I error for any +finite sample size, strictly adhere to LDP constraints, and achieve minimax +separation rates under LDP. The attained minimax rates reveal inherent +privacy-utility trade-offs that are unavoidable in private testing. To address +scenarios with unknown smoothness parameters in density testing, we propose an +adaptive test based on a Bonferroni-type approach that ensures robust +performance without prior knowledge of the smoothness parameters. We validate +our theoretical findings with extensive numerical experiments and demonstrate +the practical relevance and effectiveness of our proposed methods. + +
+
+ comment: 59 pages, 5 figures +
+
+
+
+
+ + ☆ Optimisation Strategies for Ensuring Fairness in Machine Learning: With + and Without Demographics + + +
+ Ensuring fairness has emerged as one of the primary concerns in AI and its +related algorithms. Over time, the field of machine learning fairness has +evolved to address these issues. This paper provides an extensive overview of +this field and introduces two formal frameworks to tackle open questions in +machine learning fairness. + In one framework, operator-valued optimisation and min-max objectives are +employed to address unfairness in time-series problems. This approach showcases +state-of-the-art performance on the notorious COMPAS benchmark dataset, +demonstrating its effectiveness in real-world scenarios. + In the second framework, the challenge of lacking sensitive attributes, such +as gender and race, in commonly used datasets is addressed. This issue is +particularly pressing because existing algorithms in this field predominantly +rely on the availability or estimations of such attributes to assess and +mitigate unfairness. Here, a framework for a group-blind bias-repair is +introduced, aiming to mitigate bias without relying on sensitive attributes. +The efficacy of this approach is showcased through analyses conducted on the +Adult Census Income dataset. + Additionally, detailed algorithmic analyses for both frameworks are provided, +accompanied by convergence guarantees, ensuring the robustness and reliability +of the proposed methodologies. + +
+
+ comment: PhD thesis. arXiv admin note: text overlap with arXiv:2310.11407 +
+
+
+
+
+ + ☆ SAFELOC: Overcoming Data Poisoning Attacks in Heterogeneous Federated + Machine Learning for Indoor Localization + + +
+ Machine learning (ML) based indoor localization solutions are critical for +many emerging applications, yet their efficacy is often compromised by +hardware/software variations across mobile devices (i.e., device heterogeneity) +and the threat of ML data poisoning attacks. Conventional methods aimed at +countering these challenges show limited resilience to the uncertainties +created by these phenomena. In response, in this paper, we introduce SAFELOC, a +novel framework that not only minimizes localization errors under these +challenging conditions but also ensures model compactness for efficient mobile +device deployment. Our framework targets a distributed and co-operative +learning environment that uses federated learning (FL) to preserve user data +privacy and assumes heterogeneous mobile devices carried by users (just like in +most real-world scenarios). Within this heterogeneous FL context, SAFELOC +introduces a novel fused neural network architecture that performs data +poisoning detection and localization, with a low model footprint. Additionally, +a dynamic saliency map-based aggregation strategy is designed to adapt based on +the severity of the detected data poisoning scenario. Experimental evaluations +demonstrate that SAFELOC achieves improvements of up to 5.9x in mean +localization error, 7.8x in worst-case localization error, and a 2.1x reduction +in model inference latency compared to state-of-the-art indoor localization +frameworks, across diverse building floorplans, mobile devices, and ML data +poisoning attack scenarios. + +
+
+
+
+
+ + ☆ ClevrSkills: Compositional Language and Visual Reasoning in Robotics NeurIPS 2024 + + +
+ Robotics tasks are highly compositional by nature. For example, to perform a +high-level task like cleaning the table a robot must employ low-level +capabilities of moving the effectors to the objects on the table, pick them up +and then move them off the table one-by-one, while re-evaluating the +consequently dynamic scenario in the process. Given that large vision language +models (VLMs) have shown progress on many tasks that require high level, +human-like reasoning, we ask the question: if the models are taught the +requisite low-level capabilities, can they compose them in novel ways to +achieve interesting high-level tasks like cleaning the table without having to +be explicitly taught so? To this end, we present ClevrSkills - a benchmark +suite for compositional reasoning in robotics. ClevrSkills is an environment +suite developed on top of the ManiSkill2 simulator and an accompanying dataset. +The dataset contains trajectories generated on a range of robotics tasks with +language and visual annotations as well as multi-modal prompts as task +specification. The suite includes a curriculum of tasks with three levels of +compositional understanding, starting with simple tasks requiring basic motor +skills. We benchmark multiple different VLM baselines on ClevrSkills and show +that even after being pre-trained on large numbers of tasks, these models fail +on compositional reasoning in robotics tasks. + +
+
+ comment: To appear at NeurIPS 2024 (D&B track) +
+
+
+
+
+ + ☆ Anomaly Detection in Large-Scale Cloud Systems: An Industry Case and + Dataset + + +
+ As Large-Scale Cloud Systems (LCS) become increasingly complex, effective +anomaly detection is critical for ensuring system reliability and performance. +However, there is a shortage of large-scale, real-world datasets available for +benchmarking anomaly detection methods. + To address this gap, we introduce a new high-dimensional dataset from IBM +Cloud, collected over 4.5 months from the IBM Cloud Console. This dataset +comprises 39,365 rows and 117,448 columns of telemetry data. Additionally, we +demonstrate the application of machine learning models for anomaly detection +and discuss the key challenges faced in this process. + This study and the accompanying dataset provide a resource for researchers +and practitioners in cloud system monitoring. It facilitates more efficient +testing of anomaly detection methods in real-world data, helping to advance the +development of robust solutions to maintain the health and performance of +large-scale cloud infrastructures. + +
+
+
+
+
+ + ☆ Transformer-based Time-Series Biomarker Discovery for COPD Diagnosis NeurIPS 2024 + + +
+ Chronic Obstructive Pulmonary Disorder (COPD) is an irreversible and +progressive disease which is highly heritable. Clinically, COPD is defined +using the summary measures derived from a spirometry test but these are not +always adequate. Here we show that using the high-dimensional raw spirogram can +provide a richer signal compared to just using the summary measures. We design +a transformer-based deep learning technique to process the raw spirogram values +along with demographic information and predict clinically-relevant endpoints +related to COPD. Our method is able to perform better than prior works while +being more computationally efficient. Using the weights learned by the model, +we make the framework more interpretable by identifying parts of the spirogram +that are important for the model predictions. Pairing up with a board-certified +pulmonologist, we also provide clinical insights into the different aspects of +the spirogram and show that the explanations obtained from the model align with +underlying medical knowledge. + +
+
+ comment: Accepted as a workshop paper to NeurIPS 2024 +
+
+
+
+
+ + ☆ Bridging the Visual Gap: Fine-Tuning Multimodal Models with + Knowledge-Adapted Captions + + +
+ Recent research increasingly focuses on training vision-language models +(VLMs) with long, detailed image captions. However, small-scale VLMs often +struggle to balance the richness of these captions with the risk of +hallucinating content during fine-tuning. In this paper, we explore how well +VLMs adapt to such captions. To quantify caption quality, we propose Decomposed +NLI (DNLI), an evaluation framework that breaks down generated captions into +individual propositions, assessing each in isolation. This fine-grained +analysis reveals a critical balance between capturing descriptive details and +preventing hallucinations. Our findings show that simply reducing caption +complexity or employing standard data curation techniques does not effectively +resolve this issue. To tackle this challenge, we introduce Knowledge Adapted +(KnowAda) fine-tuning, a data-centric approach that automatically adapts +training data with the model's existing knowledge and visual understanding. +KnowAda minimizes hallucinations while preserving high descriptiveness. We +validate this approach across several small-scale VLMs (up to 7B parameters) +and dense caption datasets, demonstrating that KnowAda effectively balances +hallucination reduction and descriptiveness. Our results show that KnowAda +outperforms various baselines in both automatic metrics and human evaluations. +We will release our code and models. + +
+
+
+
+
+ + ☆ Cut Your Losses in Large-Vocabulary Language Models + + +
+ As language models grow ever larger, so do their vocabularies. This has +shifted the memory footprint of LLMs during training disproportionately to one +single layer: the cross-entropy in the loss computation. Cross-entropy builds +up a logit matrix with entries for each pair of input tokens and vocabulary +items and, for small models, consumes an order of magnitude more memory than +the rest of the LLM combined. We propose Cut Cross-Entropy (CCE), a method that +computes the cross-entropy loss without materializing the logits for all tokens +into global memory. Rather, CCE only computes the logit for the correct token +and evaluates the log-sum-exp over all logits on the fly. We implement a custom +kernel that performs the matrix multiplications and the log-sum-exp reduction +over the vocabulary in flash memory, making global memory consumption for the +cross-entropy computation negligible. This has a dramatic effect. Taking the +Gemma 2 (2B) model as an example, CCE reduces the memory footprint of the loss +computation from 24 GB to 1 MB, and the total training-time memory consumption +of the classifier head from 28 GB to 1 GB. To improve the throughput of CCE, we +leverage the inherent sparsity of softmax and propose to skip elements of the +gradient computation that have a negligible (i.e., below numerical precision) +contribution to the gradient. Experiments demonstrate that the dramatic +reduction in memory consumption is accomplished without sacrificing training +speed or convergence. + +
+
+ comment: Code is available at https://github.com/apple/ml-cross-entropy +
+
+
+
+
+ + ☆ Refusal in LLMs is an Affine Function + + +
+ We propose affine concept editing (ACE) as an approach for steering language +models' behavior by intervening directly in activations. We begin with an +affine decomposition of model activation vectors and show that prior methods +for steering model behavior correspond to subsets of terms of this +decomposition. We then provide a derivation of ACE and test it on refusal using +Llama 3 8B and Hermes Eagle RWKV v5. ACE ultimately combines affine subspace +projection and activation addition to reliably control the model's refusal +responses across prompt types. We evaluate the results using LLM-based scoring +on a collection of harmful and harmless prompts. Our experiments demonstrate +that ACE consistently achieves more precise control over model behavior and +generalizes to models where directional ablation via affine subspace projection +alone produces incoherent outputs. Code for reproducing our results is +available at https://github.com/EleutherAI/steering-llama3 . + +
+
+
+
+
+ + ☆ Microfoundation Inference for Strategic Prediction + + +
+ Often in prediction tasks, the predictive model itself can influence the +distribution of the target variable, a phenomenon termed performative +prediction. Generally, this influence stems from strategic actions taken by +stakeholders with a vested interest in predictive models. A key challenge that +hinders the widespread adaptation of performative prediction in machine +learning is that practitioners are generally unaware of the social impacts of +their predictions. To address this gap, we propose a methodology for learning +the distribution map that encapsulates the long-term impacts of predictive +models on the population. Specifically, we model agents' responses as a +cost-adjusted utility maximization problem and propose estimates for said cost. +Our approach leverages optimal transport to align pre-model exposure (ex ante) +and post-model exposure (ex post) distributions. We provide a rate of +convergence for this proposed estimate and assess its quality through empirical +demonstrations on a credit-scoring dataset. + +
+
+
+
+
+ + ☆ Parameter Inference via Differentiable Diffusion Bridge Importance + Sampling + + +
+ We introduce a methodology for performing parameter inference in +high-dimensional, non-linear diffusion processes. We illustrate its +applicability for obtaining insights into the evolution of and relationships +between species, including ancestral state reconstruction. Estimation is +performed by utilising score matching to approximate diffusion bridges, which +are subsequently used in an importance sampler to estimate log-likelihoods. The +entire setup is differentiable, allowing gradient ascent on approximated +log-likelihoods. This allows both parameter inference and diffusion mean +estimation. This novel, numerically stable, score matching-based parameter +inference framework is presented and demonstrated on biological two- and +three-dimensional morphometry data. + +
+
+
+
+
+ + ☆ Non-Euclidean High-Order Smooth Convex Optimization + + +
+ We develop algorithms for the optimization of convex objectives that have +Hölder continuous $q$-th derivatives with respect to a $p$-norm by using a +$q$-th order oracle, for $p, q \geq 1$. We can also optimize other structured +functions. We do this by developing a non-Euclidean inexact accelerated +proximal point method that makes use of an inexact uniformly convex +regularizer. We also provide nearly matching lower bounds for any deterministic +algorithm that interacts with the function via a local oracle. + +
+
+
+
+
+ + ☆ Lynx: Enabling Efficient MoE Inference through Dynamic Batch-Aware + Expert Selection + + +
+ Mixture-of-Experts (MoE) architectures have recently gained popularity in +enabling efficient scaling of large language models. However, we uncover a +fundamental tension: while MoEs are designed for selective expert activation, +production serving requires request batching, which forces the activation of +all experts and negates MoE's efficiency benefits during the decode phase. We +present Lynx, a system that enables efficient MoE inference through dynamic, +batch-aware expert selection. Our key insight is that expert importance varies +significantly across tokens and inference phases, creating opportunities for +runtime optimization. Lynx leverages this insight through a lightweight +framework that dynamically reduces active experts while preserving model +accuracy. Our evaluations show that Lynx achieves up to 1.55x reduction in +inference latency while maintaining negligible accuracy loss from baseline +model across complex code generation and mathematical reasoning tasks. + +
+
+
+
+
+ + ☆ Sparse Upcycling: Inference Inefficient Finetuning NeurIPS + + +
+ Small, highly trained, open-source large language models are widely used due +to their inference efficiency, but further improving their quality remains a +challenge. Sparse upcycling is a promising approach that transforms a +pretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing +the model's parameter count and quality. In this work, we compare the +effectiveness of sparse upcycling against continued pretraining (CPT) across +different model sizes, compute budgets, and pretraining durations. Our +experiments show that sparse upcycling can achieve better quality, with +improvements of over 20% relative to CPT in certain scenarios. However, this +comes with a significant inference cost, leading to 40% slowdowns in +high-demand inference settings for larger models. Our findings highlight the +trade-off between model quality and inference efficiency, offering insights for +practitioners seeking to balance model quality and deployment constraints. + +
+
+ comment: 12 pages, 4 figures, To appear in the 4th NeurIPS Workshop on + Efficient Natural Language and Speech Processing (ENLSP), 2024 +
+
+
+
+
+ + ☆ Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply + Better Samples NeurIPS 2024 + + +
+ Although diffusion models can generate remarkably high-quality samples, they +are intrinsically bottlenecked by their expensive iterative sampling procedure. +Consistency models (CMs) have recently emerged as a promising diffusion model +distillation method, reducing the cost of sampling by generating high-fidelity +samples in just a few iterations. Consistency model distillation aims to solve +the probability flow ordinary differential equation (ODE) defined by an +existing diffusion model. CMs are not directly trained to minimize error +against an ODE solver, rather they use a more computationally tractable +objective. As a way to study how effectively CMs solve the probability flow +ODE, and the effect that any induced error has on the quality of generated +samples, we introduce Direct CMs, which \textit{directly} minimize this error. +Intriguingly, we find that Direct CMs reduce the ODE solving error compared to +CMs but also result in significantly worse sample quality, calling into +question why exactly CMs work well in the first place. Full code is available +at: https://github.com/layer6ai-labs/direct-cms. + +
+
+ comment: NeurIPS 2024 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +twisted foreground objects when applied to images with foreground elements such +as human figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
+
+ comment: Accepted by 2024 5th International Conference on Computer Vision, + Image and Deep Learning +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ Physics-Informed Geometry-Aware Neural Operator + + +
+ Engineering design problems often involve solving parametric Partial +Differential Equations (PDEs) under variable PDE parameters and domain +geometry. Recently, neural operators have shown promise in learning PDE +operators and quickly predicting the PDE solutions. However, training these +neural operators typically requires large datasets, the acquisition of which +can be prohibitively expensive. To overcome this, physics-informed training +offers an alternative way of building neural operators, eliminating the high +computational costs associated with Finite Element generation of training data. +Nevertheless, current physics-informed neural operators struggle with +limitations, either in handling varying domain geometries or varying PDE +parameters. In this research, we introduce a novel method, the Physics-Informed +Geometry-Aware Neural Operator (PI-GANO), designed to simultaneously generalize +across both PDE parameters and domain geometries. We adopt a geometry encoder +to capture the domain geometry features, and design a novel pipeline to +integrate this component within the existing DCON architecture. Numerical +results demonstrate the accuracy and efficiency of the proposed method. All the +codes and data related to this work are available on GitHub: +https://github.com/WeihengZ/Physics-informed-Neural-Foundation-Operator. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.13646 +
+
+
+
+
+ + ♻ ☆ Insights and Current Gaps in Open-Source LLM Vulnerability Scanners: A + Comparative Analysis + + +
+ This report presents a comparative analysis of open-source vulnerability +scanners for conversational large language models (LLMs). As LLMs become +integral to various applications, they also present potential attack surfaces, +exposed to security risks such as information leakage and jailbreak attacks. +Our study evaluates prominent scanners - Garak, Giskard, PyRIT, and +CyberSecEval - that adapt red-teaming practices to expose these +vulnerabilities. We detail the distinctive features and practical use of these +scanners, outline unifying principles of their design and perform quantitative +evaluations to compare them. These evaluations uncover significant reliability +issues in detecting successful attacks, highlighting a fundamental gap for +future development. Additionally, we contribute a preliminary labelled dataset, +which serves as an initial step to bridge this gap. Based on the above, we +provide strategic recommendations to help organizations choose the most +suitable scanner for their red-teaming needs, accounting for customizability, +test suite comprehensiveness, and industry-specific use cases. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ♻ ☆ Physics-informed Discretization-independent Deep Compositional Operator + Network + + +
+ Solving parametric Partial Differential Equations (PDEs) for a broad range of +parameters is a critical challenge in scientific computing. To this end, neural +operators, which predict the PDE solution with variable PDE +parameter inputs, have been successfully used. However, the training of neural +operators typically demands large training datasets, the acquisition of which +can be prohibitively expensive. To address this challenge, physics-informed +training can offer a cost-effective strategy. However, current physics-informed +neural operators face limitations, either in handling irregular domain shapes +or in generalizing to various discrete representations of PDE parameters. In +this research, we introduce a novel physics-informed model architecture which +can generalize to various discrete representations of PDE parameters and +irregular domain shapes. Particularly, inspired by deep operator neural +networks, our model involves a discretization-independent learning of parameter +embedding repeatedly, and this parameter embedding is integrated with the +response embeddings through multiple compositional layers, for more +expressivity. Numerical results demonstrate the accuracy and efficiency of the +proposed method. All the codes and data related to this work are available on +GitHub: https://github.com/WeihengZ/PI-DCON. + +
+
+
+
+
+ + ♻ ☆ A Universal Deep Learning Framework for Materials X-ray Absorption + Spectra + + +
+ X-ray absorption spectroscopy (XAS) is a powerful characterization technique +for probing the local chemical environment of absorbing atoms. However, +analyzing XAS data presents significant challenges, often requiring extensive, +computationally intensive simulations, as well as significant domain expertise. +These limitations hinder the development of fast, robust XAS analysis pipelines +that are essential in high-throughput studies and for autonomous +experimentation. We address these challenges with OmniXAS, a framework that +contains a suite of transfer learning approaches for XAS prediction, each +contributing to improved accuracy and efficiency, as demonstrated on K-edge +spectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS +framework is built upon three distinct strategies. First, we use M3GNet to +derive latent representations of the local chemical environment of absorption +sites as input for XAS prediction, achieving up to order-of-magnitude +improvements over conventional featurization techniques. Second, we employ a +hierarchical transfer learning strategy, training a universal multi-task model +across elements before fine-tuning for element-specific predictions. Models +based on this cascaded approach after element-wise fine-tuning outperform +element-specific models by up to 69%. Third, we implement cross-fidelity +transfer learning, adapting a universal model to predict spectra generated by +simulation of a different fidelity with a higher computational cost. This +approach improves prediction accuracy by up to 11% over models trained on the +target fidelity alone. Our approach boosts the throughput of XAS modeling by +orders of magnitude versus first-principles simulations and is extendable to +XAS prediction for a broader range of elements. This transfer learning +framework is generalizable to enhance deep-learning models that target other +properties in materials research. + +
+
+ comment: Main manuscript: 22 pages, 11 figures. Supplemental material (12 + pages, 6 figures) available as a separate file in arXiv ancillary files + (additional downloadable files) +
+
+
+
+
+ + ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models NeurIPS 2024 + + +
+ In the face of uncertainty, the ability to *seek information* is of +fundamental importance. In many practical applications, such as medical +diagnosis and troubleshooting, the information needed to solve the task is not +initially given and has to be actively sought by asking follow-up questions +(for example, a doctor asking a patient for more details about their symptoms). +In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to +augment large language models with the ability to actively seek information by +asking effective questions. UoT combines 1) an *uncertainty-aware simulation +approach* which enables the model to simulate possible future scenarios and how +likely they are to occur, 2) *uncertainty-based rewards* motivated by +information gain which incentivizes the model to seek information, and 3) a +*reward propagation scheme* to select the optimal question to ask in a way that +maximizes the expected reward. In experiments on medical diagnosis, +troubleshooting, and the `20 Questions` game, UoT achieves an average +performance improvement of 38.1% in the rate of successful task completion +across multiple LLMs compared with direct prompting and also improves +efficiency (i.e., the number of questions needed to complete the task). Our +code has been released [here](https://github.com/zhiyuanhubj/UoT) + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Active Inference Meeting Energy-Efficient Control of Parallel and + Identical Machines + + +
+ We investigate the application of active inference in developing +energy-efficient control agents for manufacturing systems. Active inference, +rooted in neuroscience, provides a unified probabilistic framework integrating +perception, learning, and action, with inherent uncertainty quantification +elements. Our study explores deep active inference, an emerging field that +combines deep learning with the active inference decision-making framework. +Leveraging a deep active inference agent, we focus on controlling parallel and +identical machine workstations to enhance energy efficiency. We address +challenges posed by the problem's stochastic nature and delayed policy response +by introducing tailored enhancements to existing agent architectures. +Specifically, we introduce multi-step transition and hybrid horizon methods to +mitigate the need for complex planning. Our experimental results demonstrate +the effectiveness of these enhancements and highlight the potential of the +active inference-based approach. + +
+
+ comment: Accepted at the 10th International Conference on Machine Learning, + Optimization, and Data Science +
+
+
+
+
+ + ♻ ☆ On Training Survival Models with Scoring Rules + + +
+ Scoring rules are an established way of comparing predictive performances +across model classes. In the context of survival analysis, they require +adaptation in order to accommodate censoring. This work investigates using +scoring rules for model training rather than evaluation. Doing so, we establish +a general framework for training survival models that is model agnostic and can +learn event time distributions parametrically or non-parametrically. In +addition, our framework is not restricted to any specific scoring rule. While +we focus on neural network-based implementations, we also provide +proof-of-concept implementations using gradient boosting, generalized additive +models, and trees. Empirical comparisons on synthetic and real-world data +indicate that scoring rules can be successfully incorporated into model +training and yield competitive predictive performance with established +time-to-event models. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CGRclust: Chaos Game Representation for Twin Contrastive Clustering of + Unlabelled DNA Sequences + + +
+ This study proposes CGRclust, a novel combination of unsupervised twin +contrastive clustering of Chaos Game Representations (CGR) of DNA sequences, +with convolutional neural networks (CNNs). To the best of our knowledge, +CGRclust is the first method to use unsupervised learning for image +classification (herein applied to two-dimensional CGR images) for clustering +datasets of DNA sequences. CGRclust overcomes the limitations of traditional +sequence classification methods by leveraging unsupervised twin contrastive +learning to detect distinctive sequence patterns, without requiring DNA +sequence alignment or biological/taxonomic labels. CGRclust accurately +clustered twenty-five diverse datasets, with sequence lengths ranging from 664 +bp to 100 kbp, including mitochondrial genomes of fish, fungi, and protists, as +well as viral whole genome assemblies and synthetic DNA sequences. Compared +with three recent clustering methods for DNA sequences (DeLUCS, iDeLUCS, and +MeShClust v3.0), CGRclust is the only method that surpasses 81.70% accuracy +across all four taxonomic levels tested for mitochondrial DNA genomes of fish. +Moreover, CGRclust also consistently demonstrates superior performance across +all the viral genomic datasets. The high clustering accuracy of CGRclust on +these twenty-five datasets, which vary significantly in terms of sequence +length, number of genomes, number of clusters, and level of taxonomy, +demonstrates its robustness, scalability, and versatility. + +
+
+ comment: 28 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Optimal vintage factor analysis with deflation varimax + + +
+ Vintage factor analysis is one important type of factor analysis that aims to +first find a low-dimensional representation of the original data, and then to +seek a rotation such that the rotated low-dimensional representation is +scientifically meaningful. The most widely used vintage factor analysis is the +Principal Component Analysis (PCA) followed by the varimax rotation. Despite +its popularity, little theoretical guarantee can be provided to date mainly +because varimax rotation requires solving a non-convex optimization over the +set of orthogonal matrices. + In this paper, we propose a deflation varimax procedure that solves each row +of an orthogonal matrix sequentially. In addition to its net computational gain +and flexibility, we are able to fully establish theoretical guarantees for the +proposed procedure in a broader context. Adopting this new deflation varimax as +the second step after PCA, we further analyze this two-step procedure under a +general class of factor models. Our results show that it estimates the factor +loading matrix at the minimax optimal rate when the signal-to-noise-ratio (SNR) +is moderate or large. In the low SNR regime, we offer possible improvement over +using PCA and the deflation varimax when the additive noise under the factor +model is structured. The modified procedure is shown to be minimax optimal in +all SNR regimes. Our theory is valid for finite samples and allows the number of +the latent factors to grow with the sample size as well as the ambient +dimension to grow with, or even exceed, the sample size. Extensive simulation +and real data analysis further corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ On the Effects of Data Scale on UI Control Agents NeurIPS 2024 + + +
+ Autonomous agents that control computer interfaces to accomplish human tasks +are emerging. Leveraging LLMs to power such agents has been of special +interest, but unless fine-tuned on human-collected task demonstrations, +performance is still relatively low. In this work we study whether fine-tuning +alone is a viable approach for building real-world computer control agents. In +particular, we investigate how performance measured on both high and +low-level tasks in domain and out of domain scales as more training data is +collected. To this end we collect and release a new dataset, AndroidControl, +consisting of 15,283 demonstrations of everyday tasks with Android apps. +Compared to existing datasets, each AndroidControl task instance includes both +high and low-level human-generated instructions, allowing us to explore the +level of task complexity an agent can handle. Moreover, AndroidControl is the +most diverse computer control dataset to date, including 14,548 unique tasks +over 833 Android apps, thus allowing us to conduct in-depth analysis of the +model performance in and out of the domain of the training data. Using the +dataset, we find that when tested in domain fine-tuned models outperform zero +and few-shot baselines and scale in such a way that robust performance might +feasibly be obtained simply by collecting more data. Out of domain, performance +scales significantly more slowly and suggests that in particular for high-level +tasks, fine-tuning on more data alone may be insufficient for achieving robust +out-of-domain performance. + +
+
+ comment: NeurIPS 2024 (Datasets and Benchmarks) +
+
+
+
+
+ + ♻ ☆ AudioProtoPNet: An interpretable deep learning model for bird sound + classification + + +
+ Deep learning models have significantly advanced acoustic bird monitoring by +being able to recognize numerous bird species based on their vocalizations. +However, traditional deep learning models are black boxes that provide no +insight into their underlying computations, limiting their usefulness to +ornithologists and machine learning engineers. Explainable models could +facilitate debugging, knowledge discovery, trust, and interdisciplinary +collaboration. This study introduces AudioProtoPNet, an adaptation of the +Prototypical Part Network (ProtoPNet) for multi-label bird sound +classification. It is an inherently interpretable model that uses a ConvNeXt +backbone to extract embeddings, with the classification layer replaced by a +prototype learning classifier trained on these embeddings. The classifier +learns prototypical patterns of each bird species' vocalizations from +spectrograms of training instances. During inference, audio recordings are +classified by comparing them to the learned prototypes in the embedding space, +providing explanations for the model's decisions and insights into the most +informative embeddings of each bird species. The model was trained on the +BirdSet training dataset, which consists of 9,734 bird species and over 6,800 +hours of recordings. Its performance was evaluated on the seven test datasets +of BirdSet, covering different geographical regions. AudioProtoPNet +outperformed the state-of-the-art model Perch, achieving an average AUROC of +0.90 and a cmAP of 0.42, with relative improvements of 7.1% and 16.7% over +Perch, respectively. These results demonstrate that even for the challenging +task of multi-label bird sound classification, it is possible to develop +powerful yet inherently interpretable deep learning models that provide +valuable insights for ornithologists and machine learning engineers. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Implicit Bias of Mirror Flow on Separable Data + + +
+ We examine the continuous-time counterpart of mirror descent, namely mirror +flow, on classification problems which are linearly separable. Such problems +are minimised `at infinity' and have many possible solutions; we study which +solution is preferred by the algorithm depending on the mirror potential. For +exponential tailed losses and under mild assumptions on the potential, we show +that the iterates converge in direction towards a $\phi_\infty$-maximum margin +classifier. The function $\phi_\infty$ is the \textit{horizon function} of the +mirror potential and characterises its shape `at infinity'. When the potential +is separable, a simple formula allows this function to be computed. We analyse +several examples of potentials and provide numerical experiments highlighting +our results. + +
+
+ comment: Neurips camera ready. Minor changes from the previous versions. + Mainly added full iterate trajectories (Figure 4) +
+
+
+
+
+ + ♻ ☆ Rethinking Distribution Shifts: Empirical Analysis and Inductive + Modeling for Tabular Data NeurIPS 2023 + + +
+ Different distribution shifts require different interventions, and algorithms +must be grounded in the specific shifts they address. However, methodological +development for robust algorithms typically relies on structural assumptions +that lack empirical validation. Advocating for an empirically grounded +data-driven approach to research, we build an empirical testbed comprising +natural shifts across 5 tabular datasets and 60,000 method configurations +encompassing imbalanced learning and distributionally robust optimization (DRO) +methods. We find $Y|X$-shifts are most prevalent on our testbed, in stark +contrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The +performance of robust algorithms varies significantly over shift types, and is +no better than that of vanilla methods. To understand why, we conduct an +in-depth empirical analysis of DRO methods and find that although often +neglected by researchers, implementation details -- such as the choice of +underlying model class (e.g., XGBoost) and hyperparameter selection -- have a +bigger impact on performance than the ambiguity set or its radius. To further +bridge that gap between methodological research and practice, we design case +studies that illustrate how such a data-driven, inductive understanding of +distribution shifts can enhance both data-centric and algorithmic +interventions. + +
+
+ comment: Conference version appeared in NeurIPS 2023, previously titled "On + the Need for a Language Describing Distribution Shifts: Illustrations on + Tabular Datasets" +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian Generative Machine Learning for Bayesiamplification + + +
+ Recently, combinations of generative and Bayesian machine learning have been +introduced in particle physics for both fast detector simulation and inference +tasks. These neural networks aim to quantify the uncertainty on the generated +distribution originating from limited training statistics. The interpretation +of a distribution-wide uncertainty however remains ill-defined. We show a clear +scheme for quantifying the calibration of Bayesian generative machine learning +models. For a Continuous Normalizing Flow applied to a low-dimensional toy +example, we evaluate the calibration of Bayesian uncertainties from either a +mean-field Gaussian weight posterior, or Monte Carlo sampling network weights, +to gauge their behaviour on unsteady distribution edges. Well calibrated +uncertainties can then be used to roughly estimate the number of uncorrelated +truth samples that are equivalent to the generated sample and clearly indicate +data amplification for smooth features of the distribution. + +
+
+ comment: 15 pages, 6 figures, updated references, fixed typo +
+
+
+
+
+ + ♻ ☆ Neural Persistence Dynamics + + +
+ We consider the problem of learning the dynamics in the topology of +time-evolving point clouds, the prevalent spatiotemporal model for systems +exhibiting collective behavior, such as swarms of insects and birds or +particles in physics. In such systems, patterns emerge from (local) +interactions among self-propelled entities. While several well-understood +governing equations for motion and interaction exist, they are notoriously +difficult to fit to data, as most prior work requires knowledge about +individual motion trajectories, i.e., a requirement that is challenging to +satisfy with an increasing number of entities. To evade such confounding +factors, we investigate collective behavior from a $\textit{topological +perspective}$, but instead of summarizing entire observation sequences (as done +previously), we propose learning a latent dynamical model from topological +features $\textit{per time point}$. The latter is then used to formulate a +downstream regression task to predict the parametrization of some a priori +specified governing equation. We implement this idea based on a latent ODE +learned from vectorized (static) persistence diagrams and show that a +combination of recent stability results for persistent homology justifies this +modeling choice. Various (ablation) experiments not only demonstrate the +relevance of each model component but provide compelling empirical evidence +that our proposed model - $\textit{Neural Persistence Dynamics}$ - +substantially outperforms the state-of-the-art across a diverse set of +parameter regression tasks. + +
+
+
+
+
+ + ♻ ☆ Circuit design in biology and machine learning. I. Random networks and + dimensional reduction + + +
+ A biological circuit is a neural or biochemical cascade, taking inputs and +producing outputs. How have biological circuits learned to solve environmental +challenges over the history of life? The answer certainly follows Dobzhansky's +famous quote that ``nothing in biology makes sense except in the light of +evolution.'' But that quote leaves out the mechanistic basis by which natural +selection's trial-and-error learning happens, which is exactly what we have to +understand. How does the learning process that designs biological circuits +actually work? How much insight can we gain about the form and function of +biological circuits by studying the processes that have made those circuits? +Because life's circuits must often solve the same problems as those faced by +machine learning, such as environmental tracking, homeostatic control, +dimensional reduction, or classification, we can begin by considering how +machine learning designs computational circuits to solve problems. We can then +ask: How much insight do those computational circuits provide about the design +of biological circuits? How much does biology differ from computers in the +particular circuit designs that it uses to solve problems? This article steps +through two classic machine learning models to set the foundation for analyzing +broad questions about the design of biological circuits. One insight is the +surprising power of randomly connected networks. Another is the central role of +internal models of the environment embedded within biological circuits, +illustrated by a model of dimensional reduction and trend prediction. Overall, +many challenges in biology have machine learning analogs, suggesting hypotheses +about how biology's circuits are designed. + +
+
+ comment: Added background info in two text boxes and new figure, edited + throughout +
+
+
+
+
+ + ♻ ☆ No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design + Choices + + +
+ Advances in generative models have made it possible for AI-generated text, +code, and images to mirror human-generated content in many applications. +Watermarking, a technique that aims to embed information in the output of a +model to verify its source, is useful for mitigating the misuse of such +AI-generated content. However, we show that common design choices in LLM +watermarking schemes make the resulting systems surprisingly susceptible to +attack -- leading to fundamental trade-offs in robustness, utility, and +usability. To navigate these trade-offs, we rigorously study a set of simple +yet effective attacks on common watermarking systems, and propose guidelines +and defenses for LLM watermarking in practice. + +
+
+
+
+
+ + ♻ ☆ Controlling Large Electric Vehicle Charging Stations via User Behavior + Modeling and Stochastic Programming + + +
+ This paper introduces an Electric Vehicle Charging Station (EVCS) model that +incorporates real-world constraints, such as slot power limitations, contract +threshold overruns penalties, or early disconnections of electric vehicles +(EVs). We propose a formulation of the problem of EVCS control under +uncertainty, and implement two Multi-Stage Stochastic Programming approaches +that leverage user-provided information, namely, Model Predictive Control and +Two-Stage Stochastic Programming. The model addresses uncertainties in charging +session start and end times, as well as in energy demand. A user's behavior +model based on a sojourn-time-dependent stochastic process enhances cost +reduction while maintaining customer satisfaction. The benefits of the two +proposed methods are showcased against two baselines over a 22-day simulation +using a real-world dataset. The two-stage approach demonstrates robustness +against early disconnections by considering a wider range of uncertainty +scenarios for optimization. The algorithm prioritizing user satisfaction over +electricity cost achieves a 20% and 36% improvement in two user satisfaction +metrics compared to an industry-standard baseline. Additionally, the algorithm +striking the best balance between cost and user satisfaction exhibits a mere 3% +relative cost increase compared to the theoretically optimal baseline - for +which the nonanticipativity constraint is relaxed - while attaining 94% and 84% +of the user satisfaction performance in the two used satisfaction metrics. + +
+
+
+
+
+ + ♻ ☆ Exponential separations between classical and quantum learners + + +
+ Despite significant effort, the quantum machine learning community has only +demonstrated quantum learning advantages for artificial cryptography-inspired +datasets when dealing with classical data. In this paper we address the +challenge of finding learning problems where quantum learning algorithms can +achieve a provable exponential speedup over classical learning algorithms. We +reflect on computational learning theory concepts related to this question and +discuss how subtle differences in definitions can result in significantly +different requirements and tasks for the learner to meet and solve. We examine +existing learning problems with provable quantum speedups and find that they +largely rely on the classical hardness of evaluating the function that +generates the data, rather than identifying it. To address this, we present two +new learning separations where the classical difficulty primarily lies in +identifying the function generating the data. Furthermore, we explore +computational hardness assumptions that can be leveraged to prove quantum +speedups in scenarios where data is quantum-generated, which implies likely +quantum advantages in a plethora of more natural settings (e.g., in condensed +matter and high energy physics). We also discuss the limitations of the +classical shadow paradigm in the context of learning separations, and how +physically-motivated settings such as characterizing phases of matter and +Hamiltonian learning fit in the computational learning framework. + +
+
+ comment: this article supersedes arXiv:2208.06339 +
+
+
+
+
+ + ♻ ☆ On the Robustness of Neural Collapse and the Neural Collapse of + Robustness + + +
+ Neural Collapse refers to the curious phenomenon at the end of training of a +neural network, where feature vectors and classification weights converge to a +very simple geometrical arrangement (a simplex). While it has been observed +empirically in various cases and has been theoretically motivated, its +connection with crucial properties of neural networks, like their +generalization and robustness, remains unclear. In this work, we study the +stability properties of these simplices. We find that the simplex structure +disappears under small adversarial attacks, and that perturbed examples "leap" +between simplex vertices. We further analyze the geometry of networks that are +optimized to be robust against adversarial perturbations of the input, and find +that Neural Collapse is a pervasive phenomenon in these cases as well, with +clean and perturbed representations forming aligned simplices, and giving rise +to a robust simple nearest-neighbor classifier. By studying the propagation of +the amount of collapse inside the network, we identify novel properties of both +robust and non-robust machine learning models, and show that earlier, unlike +later, layers maintain reliable simplices on perturbed data. Our code is +available at https://github.com/JingtongSu/robust_neural_collapse . + +
+
+ comment: Transactions on Machine Learning Research, 2024 +
+
+
+
+
+ + ♻ ☆ Predictive Inference in Multi-environment Scenarios + + +
+ We address the challenge of constructing valid confidence intervals and sets +in problems of prediction across multiple environments. We investigate two +types of coverage suitable for these problems, extending the jackknife and +split-conformal methods to show how to obtain distribution-free coverage in +such non-traditional, potentially hierarchical data-generating scenarios. We +demonstrate a novel resizing method to adapt to problem difficulty, which +applies both to existing approaches for predictive inference and the methods we +develop; this reduces prediction set sizes using limited information from the +test environment, a key to the methods' practical performance, which we +evaluate through neurochemical sensing and species classification datasets. Our +contributions also include extensions for settings with non-real-valued +responses, a theory of consistency for predictive inference in these general +problems, and insights on the limits of conditional coverage. + +
+
+
+
+
+ + ♻ ☆ Investigating the Effectiveness of Explainability Methods in Parkinson's + Detection from Speech + + +
+ Speech impairments in Parkinson's disease (PD) provide significant early +indicators for diagnosis. While models for speech-based PD detection have shown +strong performance, their interpretability remains underexplored. This study +systematically evaluates several explainability methods to identify PD-specific +speech features, aiming to support the development of accurate, interpretable +models for clinical decision-making in PD diagnosis and monitoring. Our +methodology involves (i) obtaining attributions and saliency maps using +mainstream interpretability techniques, (ii) quantitatively evaluating the +faithfulness of these maps and their combinations obtained via union and +intersection through a range of established metrics, and (iii) assessing the +information conveyed by the saliency maps for PD detection from an auxiliary +classifier. Our results reveal that, while explanations are aligned with the +classifier, they often fail to provide valuable information for domain experts. + +
+
+ comment: The first two authors contributed equally to this research: author + order is alphabetical +
+
+
+
+
+ + ♻ ☆ GeSubNet: Gene Interaction Inference for Disease Subtype Network + Generation ICLR 2025 + + +
+ Retrieving gene functional networks from knowledge databases presents a +challenge due to the mismatch between disease networks and subtype-specific +variations. Current solutions, including statistical and deep learning methods, +often fail to effectively integrate gene interaction knowledge from databases +or explicitly learn subtype-specific interactions. To address this mismatch, we +propose GeSubNet, which learns a unified representation capable of predicting +gene interactions while distinguishing between different disease subtypes. +Graphs generated by such representations can be considered subtype-specific +networks. GeSubNet is a multi-step representation learning framework with three +modules: First, a deep generative model learns distinct disease subtypes from +patient gene expression profiles. Second, a graph neural network captures +representations of prior gene networks from knowledge databases, ensuring +accurate physical gene interactions. Finally, we integrate these two +representations using an inference loss that leverages graph generation +capabilities, conditioned on the patient separation loss, to refine +subtype-specific information in the learned representation. GeSubNet +consistently outperforms traditional methods, with average improvements of +30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged +over four cancer datasets. Particularly, we conduct a biological simulation +experiment to assess how the behavior of selected genes from over 11,000 +candidates affects subtypes or patient distributions. The results show that the +generated network has the potential to identify subtype-specific genes with an +83% likelihood of impacting patient distribution shifts. The GeSubNet resource +is available: https://anonymous.4open.science/r/GeSubNet/ + +
+
+ comment: Under review as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Gradient Normalization Provably Benefits Nonconvex SGD under + Heavy-Tailed Noise + + +
+ This paper investigates the roles of gradient normalization and clipping in +ensuring the convergence of Stochastic Gradient Descent (SGD) under +heavy-tailed noise. While existing approaches consider gradient clipping +indispensable for SGD convergence, we theoretically demonstrate that gradient +normalization alone without clipping is sufficient to ensure convergence. +Furthermore, we establish that combining gradient normalization with clipping +offers significantly improved convergence rates compared to using either +technique in isolation, particularly as gradient noise diminishes. With these +results, our work provides the first theoretical evidence demonstrating the +benefits of gradient normalization in SGD under heavy-tailed noise. Finally, we +introduce an accelerated SGD variant that incorporates both gradient +normalization and clipping, further enhancing convergence rates under +heavy-tailed noise. + +
+
+
+
+
+ + ♻ ☆ V-LoL: A Diagnostic Dataset for Visual Logical Learning + + +
+ Despite the successes of recent developments in visual AI, different +shortcomings still exist; from missing exact logical reasoning, to abstract +generalization abilities, to understanding complex and noisy scenes. +Unfortunately, existing benchmarks were not designed to capture more than a +few of these aspects. Whereas deep learning datasets focus on visually complex +data but simple visual reasoning tasks, inductive logic datasets involve +complex logical learning tasks, however, lack the visual component. To address +this, we propose the diagnostic visual logical learning dataset, V-LoL, that +seamlessly combines visual and logical challenges. Notably, we introduce the +first instantiation of V-LoL, V-LoL-Train - a visual rendition of a classic +benchmark in symbolic AI, the Michalski train problem. By incorporating +intricate visual scenes and flexible logical reasoning tasks within a versatile +framework, V-LoL-Train provides a platform for investigating a wide range of +visual logical learning challenges. We evaluate a variety of AI systems +including traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our +evaluations demonstrate that even SOTA AI faces difficulties in dealing with +visual logical learning challenges, highlighting unique advantages and +limitations of each methodology. Overall, V-LoL opens up new avenues for +understanding and enhancing current abilities in visual logical learning for AI +systems. + +
+
+
+
+
+ + ♻ ☆ Are Large Language Models Table-based Fact-Checkers? + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation +between statements and structured tables. Existing TFV methods based on +small-scaled models suffer from insufficient labeled data and weak zero-shot +ability. Recently, the appearance of Large Language Models (LLMs) has attracted +a lot of attention in research fields. They have shown powerful zero-shot and +in-context learning abilities on several NLP tasks, but their potential on TFV +is still unknown. In this work, we implement a preliminary study about whether +LLMs are table-based fact-checkers. In detail, we design diverse prompts to +explore how the in-context learning can help LLMs in TFV, i.e., zero-shot and +few-shot TFV capability. Besides, we carefully design and construct TFV +instructions to study the performance gain brought by the instruction tuning of +LLMs. Experimental results demonstrate that LLMs can achieve acceptable results +on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning +can stimulate the TFV capability significantly. We also make some valuable +findings about the format of zero-shot prompts and the number of in-context +examples. Finally, we analyze some possible directions to promote the accuracy +of TFV via LLMs, which is beneficial to further research of table reasoning. + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ ASTM: Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50% and reduce vehicle pass delays by 70%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation EMNLP 2024 + + +
+ It is often desirable to distill the capabilities of large language models +(LLMs) into smaller student models due to compute and memory constraints. One +way to do this for classification tasks is via dataset synthesis, which can be +accomplished by generating examples of each label from the LLM. Prior +approaches to synthesis use few-shot prompting, which relies on the LLM's +parametric knowledge to generate usable examples. However, this leads to issues +of repetition, bias towards popular entities, and stylistic differences from +human text. In this work, we propose Synthesize by Retrieval and Refinement +(SynthesizRR), which uses retrieval augmentation to introduce variety into the +dataset synthesis process: as retrieved passages vary, the LLM is seeded with +different content to generate its examples. We empirically study the synthesis +of six datasets, covering topic classification, sentiment analysis, tone +detection, and humor, requiring complex synthesis strategies. We find that +SynthesizRR greatly improves lexical and semantic diversity, similarity to +human-written text, and distillation performance, when compared to 32-shot +prompting and four prior approaches. We release our code to perform all steps +at https://github.com/amazon-science/synthesizrr + +
+
+ comment: Published as a main conference paper at EMNLP 2024. Code available at + https://github.com/amazon-science/synthesizrr +
+
+
+
+
+ + ♻ ☆ Harmonic Path Integral Diffusion + + +
+ In this manuscript, we present a novel approach for sampling from a +continuous multivariate probability distribution, which may either be +explicitly known (up to a normalization factor) or represented via empirical +samples. Our method constructs a time-dependent bridge from a delta function +centered at the origin of the state space at $t=0$, optimally transforming it +into the target distribution at $t=1$. We formulate this as a Stochastic +Optimal Control problem of the Path Integral Control type, with a cost function +comprising (in its basic form) a quadratic control term, a quadratic state +term, and a terminal constraint. This framework, which we refer to as Harmonic +Path Integral Diffusion (H-PID), leverages an analytical solution through a +mapping to an auxiliary quantum harmonic oscillator in imaginary time. + The H-PID framework results in a set of efficient sampling algorithms, +without the incorporation of Neural Networks. The algorithms are validated on +two standard use cases: a mixture of Gaussians over a grid and images from +CIFAR-10. The transparency of the method allows us to analyze the algorithms in +detail, particularly revealing that the current weighted state is an order +parameter for the dynamic phase transition, signaling earlier, at $t<1$, that +the sample generation process is almost complete. We contrast these algorithms +with other sampling methods, particularly simulated annealing and path integral +sampling, highlighting their advantages in terms of analytical control, +accuracy, and computational efficiency on benchmark problems. + Additionally, we extend the methodology to more general cases where the +underlying stochastic differential equation includes an external deterministic, +possibly non-conservative force, and where the cost function incorporates a +gauge potential term. + +
+
+
+
+
+ + ♻ ☆ Exact Fractional Inference via Re-Parametrization & Interpolation + between Tree-Re-Weighted- and Belief Propagation- Algorithms + + +
+ Computing the partition function, $Z$, of an Ising model over a graph of $N$ +\enquote{spins} is most likely exponential in $N$. Efficient variational +methods, such as Belief Propagation (BP) and Tree Re-Weighted (TRW) algorithms, +compute $Z$ approximately by minimizing the respective (BP- or TRW-) free +energy. We generalize the variational scheme by building a $\lambda$-fractional +interpolation, $Z^{(\lambda)}$, where $\lambda=0$ and $\lambda=1$ correspond to +TRW- and BP-approximations, respectively. This fractional scheme -- coined +Fractional Belief Propagation (FBP) -- guarantees that in the attractive +(ferromagnetic) case $Z^{(TRW)} \geq Z^{(\lambda)} \geq Z^{(BP)}$, and there +exists a unique (\enquote{exact}) $\lambda_*$ such that $Z=Z^{(\lambda_*)}$. +Generalizing the re-parametrization approach of +\citep{wainwright_tree-based_2002} and the loop series approach of +\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\forall +\lambda:\ Z=Z^{(\lambda)}{\tilde Z}^{(\lambda)}$, where the multiplicative +correction, ${\tilde Z}^{(\lambda)}$, is an expectation over a node-independent +probability distribution built from node-wise fractional marginals. Our +theoretical analysis is complemented by extensive experiments with models from +Ising ensembles over planar and random graphs of medium and large sizes. Our +empirical study yields a number of interesting observations, such as the +ability to estimate ${\tilde Z}^{(\lambda)}$ with $O(N^{2::4})$ fractional +samples and suppression of variation in $\lambda_*$ estimates with an increase +in $N$ for instances from a particular random Ising ensemble, where $[2::4]$ +indicates a range from $2$ to $4$. We also discuss the applicability of this +approach to the problem of image de-noising. + +
+
+
+
+
+ + ♻ ☆ A General Recipe for the Analysis of Randomized Multi-Armed Bandit + Algorithms + + +
+ In this paper we propose a general methodology to derive regret bounds for +randomized multi-armed bandit algorithms. It consists in checking a set of +sufficient conditions on the sampling probability of each arm and on the family +of distributions to prove a logarithmic regret. As a direct application we +revisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and +Thompson Sampling (TS), under various models for the distributions including +single parameter exponential families, Gaussian distributions, bounded +distributions, or distributions satisfying some conditions on their moments. In +particular, we prove that MED is asymptotically optimal for all these models, +but also provide a simple regret analysis of some TS algorithms for which the +optimality is already known. We then further illustrate the interest of our +approach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to +some families of unbounded reward distributions with a bounded h-moment. This +model can for instance capture some non-parametric families of distributions +whose variance is upper bounded by a known constant. + +
+
+
+
+
+ + ♻ ☆ Active learning of digenic functions with boolean matrix logic + programming + + +
+ We apply logic-based machine learning techniques to facilitate cellular +engineering and drive biological discovery, based on comprehensive databases of +metabolic processes called genome-scale metabolic network models (GEMs). +Predicted host behaviours are not always correctly described by GEMs. Learning +the intricate genetic interactions within GEMs presents computational and +empirical challenges. To address these, we describe a novel approach called +Boolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to +evaluate large logic programs. We introduce a new system, $BMLP_{active}$, +which efficiently explores the genomic hypothesis space by guiding informative +experimentation through active learning. In contrast to sub-symbolic methods, +$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial +host in an interpretable and logical representation using datalog logic +programs. Notably, $BMLP_{active}$ can successfully learn the interaction +between a gene pair with fewer training examples than random experimentation, +overcoming the increase in experimental design space. $BMLP_{active}$ enables +rapid optimisation of metabolic models and offers a realistic approach to a +self-driving lab for microbial engineering. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.06724 +
+
+
+
+
+ + ♻ ☆ The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty + Quantification + + +
+ Tsetlin Machines (TMs) have emerged as a compelling alternative to +conventional deep learning methods, offering notable advantages such as smaller +memory footprint, faster inference, fault-tolerant properties, and +interpretability. Although various adaptations of TMs have expanded their +applicability across diverse domains, a fundamental gap remains in +understanding how TMs quantify uncertainty in their predictions. In response, +this paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed +at providing a robust, reliable, and interpretable approach for uncertainty +quantification. Unlike the original TM, the PTM learns the probability of +staying on each state of each Tsetlin Automaton (TA) across all clauses. These +probabilities are updated using the feedback tables that are part of the TM +framework: Type I and Type II feedback. During inference, TAs decide their +actions by sampling states based on learned probability distributions, akin to +Bayesian neural networks when generating weight values. In our experimental +analysis, we first illustrate the spread of the probabilities across TA states +for the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models +using both simulated and real-world datasets. The experiments on the simulated +dataset reveal the PTM's effectiveness in uncertainty quantification, +particularly in delineating decision boundaries and identifying regions of high +uncertainty. Moreover, when applied to multiclass classification tasks using +the Iris dataset, the PTM demonstrates competitive performance in terms of +predictive entropy and expected calibration error, showcasing its potential as +a reliable tool for uncertainty estimation. Our findings underscore the +importance of selecting appropriate models for accurate uncertainty +quantification in predictive tasks, with the PTM offering a particularly +interpretable and effective solution. + +
+
+ comment: 12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024, + London +
+
+
+
+
+ + ♻ ☆ Exact, Tractable Gauss-Newton Optimization in Deep Reversible + Architectures Reveal Poor Generalization NeurIPS 2024 + + +
+ Second-order optimization has been shown to accelerate the training of deep +neural networks in many applications, often yielding faster progress per +iteration on the training loss compared to first-order optimizers. However, the +generalization properties of second-order methods are still being debated. +Theoretical investigations have proved difficult to carry out outside the +tractable settings of heavily simplified model classes -- thus, the relevance +of existing theories to practical deep learning applications remains unclear. +Similarly, empirical studies in large-scale models and real datasets are +significantly confounded by the necessity to approximate second-order updates +in practice. It is often unclear whether the observed generalization behaviour +arises specifically from the second-order nature of the parameter updates, or +instead reflects the specific structured (e.g.\ Kronecker) approximations used +or any damping-based interpolation towards first-order updates. Here, we show +for the first time that exact Gauss-Newton (GN) updates take on a tractable +form in a class of deep reversible architectures that are sufficiently +expressive to be meaningfully applied to common benchmark datasets. We exploit +this novel setting to study the training and generalization properties of the +GN optimizer. We find that exact GN generalizes poorly. In the mini-batch +training setting, this manifests as rapidly saturating progress even on the +\emph{training} loss, with parameter updates found to overfit each +mini-batch without producing the features that would support generalization +to other mini-batches. We show that our experiments run in the ``lazy'' regime, +in which the neural tangent kernel (NTK) changes very little during the course +of training. This behaviour is associated with having no significant changes in +neural representations, explaining the lack of generalization. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Review of Electromagnetic Elimination Methods for low-field portable + MRI scanner + + +
+ This paper analyzes conventional and deep learning methods for eliminating +electromagnetic interference (EMI) in MRI systems. We compare traditional +analytical and adaptive techniques with advanced deep learning approaches. Key +strengths and limitations of each method are highlighted. Recent advancements +in active EMI elimination, such as external EMI receiver coils, are discussed +alongside deep learning methods, which show superior EMI suppression by +leveraging neural networks trained on MRI data. While deep learning improves +EMI elimination and diagnostic capabilities, it introduces security and safety +concerns, particularly in commercial applications. A balanced approach, +integrating conventional reliability with deep learning's advanced +capabilities, is proposed for more effective EMI suppression in MRI systems. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ TreeC: a method to generate interpretable energy management systems + using a metaheuristic algorithm + + +
+ Energy management systems (EMS) have traditionally been implemented using +rule-based control (RBC) and model predictive control (MPC) methods. However, +recent research has explored the use of reinforcement learning (RL) as a +promising alternative. This paper introduces TreeC, a machine learning method +that utilizes the covariance matrix adaptation evolution strategy metaheuristic +algorithm to generate an interpretable EMS modeled as a decision tree. Unlike +RBC and MPC approaches, TreeC learns the decision strategy of the EMS based on +historical data, adapting the control model to the controlled energy grid. The +decision strategy is represented as a decision tree, providing interpretability +compared to RL methods that often rely on black-box models like neural +networks. TreeC is evaluated against MPC with perfect forecast and RL EMSs in +two case studies taken from literature: an electric grid case and a household +heating case. In the electric grid case, TreeC achieves an average energy loss +and constraint violation score of 19.2, which is close to MPC and RL EMSs that +achieve scores of 14.4 and 16.2 respectively. All three methods control the +electric grid well especially when compared to the random EMS, which obtains an +average score of 12 875. In the household heating case, TreeC performs +similarly to MPC on the adjusted and averaged electricity cost and total +discomfort (0.033 EUR/m$^2$ and 0.42 Kh for TreeC compared to 0.037 EUR/m$^2$ +and 2.91 Kh for MPC), while outperforming RL (0.266 EUR/m$^2$ and 24.41 Kh). + +
+
+ comment: Accepted version Knowledge based system +
+
+
+
+
+ + ♻ ☆ The effect of dataset size and the process of big data mining for + investigating solar-thermal desalination by using machine learning + + +
+ Machine learning's application in solar-thermal desalination is limited by +data shortage and inconsistent analysis. This study develops an optimized +dataset collection and analysis process for the representative solar still. By +ultra-hydrophilic treatment on the condensation cover, the dataset collection +process reduces the collection time by 83.3%. Over 1,000 datasets are +collected, which is nearly one order of magnitude larger than up-to-date works. +Then, a new interdisciplinary process flow is proposed. Some meaningful results +are obtained that were not addressed by previous studies. It is found that +Random Forest might be a better choice for datasets larger than 1,000 due to +both high accuracy and fast speed. Besides, the dataset range affects the +quantified importance (weighted value) of factors significantly, with up to a +115% increment. Moreover, the results show that machine learning has a high +accuracy on the extrapolation prediction of productivity, where the minimum +mean relative prediction error is just around 4%. The results of this work not +only show the necessity of the dataset characteristics' effect but also provide +a standard process for studying solar-thermal desalination by machine learning, +which would pave the way for interdisciplinary study. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
+
+
+
+ + ♻ ☆ AudioMarkBench: Benchmarking Robustness of Audio Watermarking NeurIPS + + +
+ The increasing realism of synthetic speech, driven by advancements in +text-to-speech models, raises ethical concerns regarding impersonation and +disinformation. Audio watermarking offers a promising solution via embedding +human-imperceptible watermarks into AI-generated audios. However, the +robustness of audio watermarking against common/adversarial perturbations +remains understudied. We present AudioMarkBench, the first systematic benchmark +for evaluating the robustness of audio watermarking against watermark removal +and watermark forgery. AudioMarkBench includes a new dataset created from +Common-Voice across languages, biological sexes, and ages, 3 state-of-the-art +watermarking methods, and 15 types of perturbations. We benchmark the +robustness of these methods against the perturbations in no-box, black-box, and +white-box settings. Our findings highlight the vulnerabilities of current +watermarking techniques and emphasize the need for more robust and fair audio +watermarking solutions. Our dataset and code are publicly available at +https://github.com/moyangkuo/AudioMarkBench. + +
+
+ comment: To appear in NeurIPS Datasets and Benchmarks, 2024 +
+
+
+
+
+ + ♻ ☆ pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2 + + +
+ Recent advancements in protein structure prediction, particularly AlphaFold2, +have revolutionized structural biology by achieving near-experimental accuracy +($\text{average RMSD} < 1.5\text{\AA}$). However, the computational demands of +these models (approximately 30 minutes per protein on an RTX 4090) +significantly limit their application in high-throughput protein screening. +While large language models like ESM (Evolutionary Scale Modeling) have shown +promise in extracting structural information directly from protein sequences, +rapid assessment of protein structure quality for large-scale analyses remains +a major challenge. + We introduce pLDDT-Predictor, a high-speed protein screening tool that +achieves a $250,000\times$ speedup compared to AlphaFold2 by leveraging +pre-trained ESM2 protein embeddings and a Transformer architecture. Our model +predicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores +with a Pearson correlation of 0.7891 and processes proteins in just 0.007 +seconds on average. Using a comprehensive dataset of 1.5 million diverse +protein sequences (ranging from 50 to 2048 amino acids), we demonstrate that +pLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70) +with 91.2\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's +predictions. + The source code and pre-trained models are freely available at +\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research +community to perform rapid, large-scale protein structure quality assessments. + +
+
+ comment: 6 pages main topic, 8 pages including citation, 4 figures +
+
+
+
+
+ + ♻ ☆ DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural + Networks + + +
+ Graph has become increasingly integral to the advancement of recommendation +systems, particularly with the fast development of graph neural network (GNN). +By exploring the virtue of rich node features and link information, GNN is +designed to provide personalized and accurate suggestions. Meanwhile, the +privacy leakage of GNN in such contexts has also captured special attention. +Prior work has revealed that a malicious user can utilize auxiliary knowledge +to extract sensitive link data of the target graph, integral to recommendation +systems, via the decision made by the target GNN model. This poses a +significant risk to the integrity and confidentiality of data used in +recommendation systems. Though important, previous works on GNN's privacy +leakage are still challenged in three aspects, i.e., limited stealing attack +scenarios, sub-optimal attack performance, and adaptation against defense. To +address these issues, we propose a diffusion model based link stealing attack, +named DM4Steal. It differs from previous work in three critical aspects. (i) +Generality: aiming at six attack scenarios with limited auxiliary knowledge, we +propose a novel training strategy for diffusion models so that DM4Steal is +transferable to diverse attack scenarios. (ii) Effectiveness: benefiting from +the retention of semantic structure in the diffusion model during the training +process, DM4Steal is capable of learning the precise topology of the target graph +through the GNN decision process. (iii) Adaptation: when GNN is defensive +(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling +the score model multiple times to keep performance degradation to a minimum, +thus DM4Steal implements successful adaptive attack on defensive GNN. + +
+
+ comment: We found that there were critical problems in our paper, and we + needed to redo the experiment, which was incomplete +
+
+
+
+
+ + ♻ ☆ LAuReL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce \emph{Learned Augmented Residual Layer} (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Data movement limits to frontier model training + + +
+ We present a theoretical model of distributed training, and use it to analyze +how far dense and sparse training runs can be scaled. Under our baseline +assumptions, given a three month training duration, data movement bottlenecks +begin to significantly lower hardware utilization for training runs exceeding +about $10^{28}$ FLOP, two orders of magnitude above the largest training run to +date, suggesting the arrival of fundamental barriers to scaling in three years +given recent rates of growth. A training run exceeding about $10^{31}$ FLOP is +infeasible even at low utilization. However, more aggressive batch size scaling +and/or shorter and fatter model shapes, if achievable, have the potential to +permit much larger training runs. + +
+
+
+
+
+ + ♻ ☆ DAGER: Exact Gradient Inversion for Large Language Models + + +
+ Federated learning works by aggregating locally computed gradients from +multiple clients, thus enabling collaborative training without sharing private +client data. However, prior work has shown that the data can actually be +recovered by the server using so-called gradient inversion attacks. While these +attacks perform well when applied on images, they are limited in the text +domain and only permit approximate reconstruction of small batches and short +input sequences. In this work, we propose DAGER, the first algorithm to recover +whole batches of input text exactly. DAGER leverages the low-rank structure of +self-attention layer gradients and the discrete nature of token embeddings to +efficiently check if a given token sequence is part of the client data. We use +this check to exactly recover full batches in the honest-but-curious setting +without any prior on the data for both encoder- and decoder-based architectures +using exhaustive heuristic search and a greedy approach, respectively. We +provide an efficient GPU implementation of DAGER and show experimentally that +it recovers full batches of size up to 128 on large language models (LLMs), +beating prior attacks in speed (20x at same batch size), scalability (10x +larger batches), and reconstruction quality (ROUGE-1/2 > 0.99). + +
+
+
+
+
+ + ♻ ☆ Neural Network Verification with Branch-and-Bound for General + Nonlinearities + + +
+ Branch-and-bound (BaB) is among the most effective techniques for neural +network (NN) verification. However, existing works on BaB for NN verification +have mostly focused on NNs with piecewise linear activations, especially ReLU +networks. In this paper, we develop a general framework, named GenBaB, to +conduct BaB on general nonlinearities to verify NNs with general architectures, +based on linear bound propagation for NN verification. To decide which neuron +to branch, we design a new branching heuristic which leverages linear bounds as +shortcuts to efficiently estimate the potential improvement after branching. To +decide nontrivial branching points for general nonlinear functions, we propose +to pre-optimize branching points, which can be efficiently leveraged during +verification with a lookup table. We demonstrate the effectiveness of our +GenBaB on verifying a wide range of NNs, including NNs with activation +functions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving +multi-dimensional nonlinear operations such as multiplications in LSTMs and +Vision Transformers. Our framework also allows the verification of general +nonlinear computation graphs and enables verification applications beyond +simple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of +the latest $\alpha,\!\beta$-CROWN, the winner of the 4th and the 5th +International Verification of Neural Networks Competition (VNN-COMP 2023 and +2024). + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ ADI: Adversarial Dominating Inputs in Vertical Federated Learning + Systems + + +
+ Vertical federated learning (VFL) system has recently become prominent as a +concept to process data distributed across many individual sources without the +need to centralize it. Multiple participants collaboratively train models based +on their local data in a privacy-aware manner. To date, VFL has become a de +facto solution to securely learn a model among organizations, allowing +knowledge to be shared without compromising privacy of any individuals. Despite +the prosperous development of VFL systems, we find that certain inputs of a +participant, named adversarial dominating inputs (ADIs), can dominate the joint +inference towards the direction of the adversary's will and force other +(victim) participants to make negligible contributions, losing rewards that are +usually offered regarding the importance of their contributions in federated +learning scenarios. We conduct a systematic study on ADIs by first proving +their existence in typical VFL systems. We then propose gradient-based methods +to synthesize ADIs of various formats and exploit common VFL systems. We +further launch greybox fuzz testing, guided by the saliency score of ``victim'' +participants, to perturb adversary-controlled inputs and systematically explore +the VFL attack surface in a privacy-preserving manner. We conduct an in-depth +study on the influence of critical parameters and settings in synthesizing +ADIs. Our study reveals new VFL attack opportunities, promoting the +identification of unknown threats before breaches and building more secure VFL +systems. + +
+
+
+
+
+ + ♻ ☆ Mitigating Gradient Overlap in Deep Residual Networks with Gradient + Normalization for Improved Non-Convex Optimization + + +
+ In deep learning, Residual Networks (ResNets) have proven effective in +addressing the vanishing gradient problem, allowing for the successful training +of very deep networks. However, skip connections in ResNets can lead to +gradient overlap, where gradients from both the learned transformation and the +skip connection combine, potentially resulting in overestimated gradients. This +overestimation can cause inefficiencies in optimization, as some updates may +overshoot optimal regions, affecting weight updates. To address this, we +examine Z-score Normalization (ZNorm) as a technique to manage gradient +overlap. ZNorm adjusts the gradient scale, standardizing gradients across +layers and reducing the negative impact of overlapping gradients. Our +experiments demonstrate that ZNorm improves training process, especially in +non-convex optimization scenarios common in deep learning, where finding +optimal solutions is challenging. These findings suggest that ZNorm can affect +the gradient flow, enhancing performance in large-scale data processing where +accuracy is critical. + +
+
+
+
+
+ + ♻ ☆ Doubly Mild Generalization for Offline Reinforcement Learning NeurIPS 2024 + + +
+ Offline Reinforcement Learning (RL) suffers from the extrapolation error and +value overestimation. From a generalization perspective, this issue can be +attributed to the over-generalization of value functions or policies towards +out-of-distribution (OOD) actions. Significant efforts have been devoted to +mitigating such generalization, and recent in-sample learning approaches have +further succeeded in entirely eschewing it. Nevertheless, we show that mild +generalization beyond the dataset can be trusted and leveraged to improve +performance under certain conditions. To appropriately exploit generalization +in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild +action generalization and (ii) mild generalization propagation. The former +refers to selecting actions in a close neighborhood of the dataset to maximize +the Q values. Even so, the potential erroneous generalization can still be +propagated, accumulated, and exacerbated by bootstrapping. In light of this, +the latter concept is introduced to mitigate the generalization propagation +without impeding the propagation of RL learning signals. Theoretically, DMG +guarantees better performance than the in-sample optimal policy in the oracle +generalization scenario. Even under worst-case generalization, DMG can still +control value overestimation at a certain level and lower bound the +performance. Empirically, DMG achieves state-of-the-art performance across +Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting +from its flexibility in both generalization aspects, DMG enjoys a seamless +transition from offline to online learning and attains strong online +fine-tuning performance. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking the Power of Timestamps for Robust Time Series Forecasting: A + Global-Local Fusion Perspective NeurIPS 2024 + + +
+ Time series forecasting has played a pivotal role across various industries, +including finance, transportation, energy, healthcare, and climate. Due to the +abundant seasonal information they contain, timestamps possess the potential to +offer robust global guidance for forecasting techniques. However, existing +works primarily focus on local observations, with timestamps being treated +merely as an optional supplement that remains underutilized. When data gathered +from the real world is polluted, the absence of global information will damage +the robust prediction capability of these algorithms. To address these +problems, we propose a novel framework named GLAFF. Within this framework, the +timestamps are modeled individually to capture the global dependencies. Working +as a plugin, GLAFF adaptively adjusts the combined weights for global and local +information, enabling seamless collaboration with any time series forecasting +backbone. Extensive experiments conducted on nine real-world datasets +demonstrate that GLAFF significantly enhances the average performance of widely +used mainstream forecasting models by 12.5%, surpassing the previous +state-of-the-art method by 5.5%. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its +potential to leverage abundant unlabeled data to enhance model robustness. +Pseudo labeling is a widely used strategy in semi supervised learning. However, +existing methods often suffer from noise contamination, which can undermine +model performance. To tackle this challenge, we introduce a novel +Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework. +Built upon the mean teacher network, we employ a Mix Augmentation module to +enhance the unlabeled data. By evaluating the synergy before and after +augmentation, we strategically partition the pseudo labels into distinct +regions. Additionally, we introduce a Region Loss Evaluation module to assess +the loss across each delineated area. Extensive experiments conducted on the LA +dataset have demonstrated superior performance over state-of-the-art +techniques, underscoring the efficiency and practicality of our framework. + +
+
+
+
+
+ + ♻ ☆ Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and + Tabnet with SMOTEENN + + +
+ Bank credit risk is a significant challenge in modern financial transactions, +and the ability to identify qualified credit card holders among a large number +of applicants is crucial for the profitability of a bank's credit card +business. In the past, screening applicants' conditions often +required a significant amount of manual labor, which was time-consuming and +labor-intensive. Although the accuracy and reliability of previously used ML +models have been continuously improving, the pursuit of more reliable and +powerful AI intelligent models is undoubtedly the unremitting pursuit by major +banks in the financial industry. In this study, we used a dataset of over +40,000 records provided by a commercial bank as the research object. We +compared various dimensionality reduction techniques such as PCA and T-SNE for +preprocessing high-dimensional datasets and performed in-depth adaptation and +tuning of distributed models such as LightGBM and XGBoost, as well as deep +models like Tabnet. After a series of research and processing, we obtained +excellent research results by combining SMOTEENN with these techniques. The +experiments demonstrated that LightGBM combined with PCA and SMOTEENN +techniques can assist banks in accurately predicting potential high-quality +customers, showing relatively outstanding performance compared to other models. + +
+
+ comment: 8 pages on IEEE ICPICS +
+
+
+
+
+ + ♻ ☆ General Geospatial Inference with a Population Dynamics Foundation Model + + +
+ Supporting the health and well-being of dynamic populations around the world +requires governmental agencies, organizations and researchers to understand and +reason over complex relationships between human behavior and local contexts in +order to identify high-risk groups and strategically allocate limited +resources. Traditional approaches to these classes of problems often entail +developing manually curated, task-specific features and models to represent +human behavior and the natural and built environment, which can be challenging +to adapt to new, or even, related tasks. To address this, we introduce a +Population Dynamics Foundation Model (PDFM) that aims to capture the +relationships between diverse data modalities and is applicable to a broad +range of geospatial tasks. We first construct a geo-indexed dataset for postal +codes and counties across the United States, capturing rich aggregated +information on human behavior from maps, busyness, and aggregated search +trends, and environmental factors such as weather and air quality. We then +model this data and the complex relationships between locations using a graph +neural network, producing embeddings that can be adapted to a wide range of +downstream tasks using relatively simple models. We evaluate the effectiveness +of our approach by benchmarking it on 27 downstream tasks spanning three +distinct domains: health indicators, socioeconomic factors, and environmental +measurements. The approach achieves state-of-the-art performance on all 27 +geospatial interpolation tasks, and on 25 out of the 27 extrapolation and +super-resolution tasks. We combined the PDFM with a state-of-the-art +forecasting foundation model, TimesFM, to predict unemployment and poverty, +achieving performance that surpasses fully supervised forecasting. The full set +of embeddings and sample code are publicly available for researchers. + +
+
+ comment: 28 pages, 16 figures, preprint; v2: updated github url +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Analyzing Meta-algorithms in Online Convex + Optimization + + +
+ In this paper, we analyze the problem of online convex optimization in +different settings, including different feedback types +(full-information/semi-bandit/bandit/etc) in either stochastic or +non-stochastic setting and different notions of regret (static adversarial +regret/dynamic regret/adaptive regret). This is done through a framework which +allows us to systematically propose and analyze meta-algorithms for the various +settings described above. We show that any algorithm for online linear +optimization with fully adaptive adversaries is an algorithm for online convex +optimization. We also show that any such algorithm that requires +full-information feedback may be transformed to an algorithm with semi-bandit +feedback with comparable regret bound. We further show that algorithms that are +designed for fully adaptive adversaries using deterministic semi-bandit +feedback can obtain similar bounds using only stochastic semi-bandit feedback +when facing oblivious adversaries. We use this to describe general +meta-algorithms to convert first order algorithms to zeroth order algorithms +with comparable regret bounds. Our framework allows us to analyze online +optimization in various settings, recovers several results in the literature +with a simplified proof technique, and provides new results. + +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Verifiable Neural Network Inference Service + + +
+ Machine learning has revolutionized data analysis and pattern recognition, +but its resource-intensive training has limited accessibility. Machine Learning +as a Service (MLaaS) simplifies this by enabling users to delegate their data +samples to an MLaaS provider and obtain the inference result using a +pre-trained model. Despite its convenience, leveraging MLaaS poses significant +privacy and reliability concerns to the client. Specifically, sensitive +information from the client inquiry data can be leaked to an adversarial MLaaS +provider. Meanwhile, the lack of a verifiability guarantee can potentially +result in biased inference results or even unfair payment issues. While +existing trustworthy machine learning techniques, such as those relying on +verifiable computation or secure computation, offer solutions to privacy and +reliability concerns, they fall short of simultaneously protecting the privacy +of client data and providing provable inference verifiability. + In this paper, we propose vPIN, a privacy-preserving and verifiable CNN +inference scheme that preserves privacy for client data samples while ensuring +verifiability for the inference. vPIN makes use of partial homomorphic +encryption and commit-and-prove succinct non-interactive argument of knowledge +techniques to achieve desirable security properties. In vPIN, we develop +various optimization techniques to minimize the proving circuit for homomorphic +inference evaluation thereby, improving the efficiency and performance of our +technique. We fully implemented and evaluated our vPIN scheme on standard +datasets (e.g., MNIST, CIFAR-10). Our experimental results show that vPIN +achieves high efficiency in terms of proving time, verification time, and proof +size, while providing client data privacy guarantees and provable +verifiability. + +
+
+ comment: Accepted at the Annual Computer Security Applications Conference + (ACSAC) 2024. Source code: github.com/vt-asaplab/vPIN +
+
+
+
+
+ + ♻ ☆ Learning Memory Mechanisms for Decision Making through Demonstrations + + +
+ In Partially Observable Markov Decision Processes, integrating an agent's +history into memory poses a significant challenge for decision-making. +Traditional imitation learning, relying on observation-action pairs for expert +demonstrations, fails to capture the expert's memory mechanisms used in +decision-making. To capture memory processes as demonstrations, we introduce +the concept of memory dependency pairs $(p, q)$ indicating that events at time +$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner +to leverage memory dependency pairs in Transformers and find significant +improvements across several tasks compared to standard Transformers when +evaluated on Memory Gym and the Long-term Memory Benchmark. Code is available +at https://github.com/WilliamYue37/AttentionTuner. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at: +\url{https://github.com/chikap421/mseg_vcuq} + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ♻ ☆ SPDIM: Source-Free Unsupervised Conditional and Label Shift Adaptation + in EEG + + +
+ The non-stationary nature of electroencephalography (EEG) introduces +distribution shifts across domains (e.g., days and subjects), posing a +significant challenge to EEG-based neurotechnology generalization. Without +labeled calibration data for target domains, the problem is a source-free +unsupervised domain adaptation (SFUDA) problem. For scenarios with constant +label distribution, Riemannian geometry-aware statistical alignment frameworks +on the symmetric positive definite (SPD) manifold are considered +state-of-the-art. However, many practical scenarios, including EEG-based sleep +staging, exhibit label shifts. Here, we propose a geometric deep learning +framework for SFUDA problems under specific distribution shifts, including +label shifts. We introduce a novel, realistic generative model and show that +prior Riemannian statistical alignment methods on the SPD manifold can +compensate for specific marginal and conditional distribution shifts but hurt +generalization under label shifts. As a remedy, we propose a +parameter-efficient manifold optimization strategy termed SPDIM. SPDIM uses the +information maximization principle to learn a single SPD-manifold-constrained +parameter per target domain. In simulations, we demonstrate that SPDIM can +compensate for the shifts under our generative model. Moreover, using public +EEG-based brain-computer interface and sleep staging datasets, we show that +SPDIM outperforms prior approaches. + +
+
+
+
+
+ + ♻ ☆ Sample Complexity of Opinion Formation on Networks with Linear + Regression Models + + +
+ Consider public health officials aiming to spread awareness about a new +vaccine in a community interconnected by a social network. How can they +distribute information with minimal resources, so as to avoid polarization and +ensure community-wide convergence of opinion? To tackle such challenges, we +initiate the study of sample complexity of opinion convergence in networks. Our +framework is built on the recognized opinion formation game, where we regard +the opinion of each agent as a data-derived model, unlike previous works that +treat opinions as data-independent scalars. The opinion model for every agent +is initially learned from its local samples and evolves game-theoretically as +all agents communicate with neighbors and revise their models towards an +equilibrium. Our focus is on the sample complexity needed to ensure that the +opinions converge to an equilibrium such that the final model of every agent +has low generalization error. + Our paper has two main technical results. First, we present a novel +polynomial time optimization framework to quantify the total sample complexity +for arbitrary networks, when the underlying learning problem is (generalized) +linear regression. Second, we leverage this optimization to study the network +gain which measures the improvement of sample complexity when learning over a +network compared to that in isolation. Towards this end, we derive network gain +bounds for various network classes including cliques, star graphs, and random +regular graphs. Additionally, our framework provides a method to study sample +distribution within the network, suggesting that it is sufficient to allocate +samples inversely to the degree. Empirical results on both synthetic and +real-world networks strongly support our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ BIOSCAN-5M: A Multimodal Dataset for Insect Biodiversity + + +
+ As part of an ongoing worldwide effort to comprehend and monitor insect +biodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine +learning community and establishes several benchmark tasks. BIOSCAN-5M is a +comprehensive dataset containing multi-modal information for over 5 million +insect specimens, and it significantly expands existing image-based biological +datasets by including taxonomic labels, raw nucleotide barcode sequences, +assigned barcode index numbers, geographical, and size information. We propose +three benchmark experiments to demonstrate the impact of the multi-modal data +types on the classification and clustering accuracy. First, we pretrain a +masked language model on the DNA barcode sequences of the BIOSCAN-5M dataset, +and demonstrate the impact of using this large reference library on species- +and genus-level classification performance. Second, we propose a zero-shot +transfer learning task applied to images and DNA barcodes to cluster feature +embeddings obtained from self-supervised learning, to investigate whether +meaningful clusters can be derived from these representation embeddings. Third, +we benchmark multi-modality by performing contrastive learning on DNA barcodes, +image data, and taxonomic information. This yields a general shared embedding +space enabling taxonomic classification using multiple types of information and +modalities. The code repository of the BIOSCAN-5M Insect dataset is available +at https://github.com/bioscan-ml/BIOSCAN-5M. + +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Probabilistic Emulation of a Global Climate Model with Spherical + DYffusion NeurIPS 2024 + + +
+ Data-driven deep learning models are transforming global weather forecasting. +It is an open question if this success can extend to climate modeling, where +the complexity of the data and long inference rollouts pose significant +challenges. Here, we present the first conditional generative model that +produces accurate and physically consistent global climate ensemble simulations +by emulating a coarse version of the United States' primary operational global +forecast model, FV3GFS. Our model integrates the dynamics-informed diffusion +framework (DYffusion) with the Spherical Fourier Neural Operator (SFNO) +architecture, enabling stable 100-year simulations at 6-hourly timesteps while +maintaining low computational overhead compared to single-step deterministic +baselines. The model achieves near gold-standard performance for climate model +emulation, outperforming existing approaches and demonstrating promising +ensemble skill. This work represents a significant advance towards efficient, +data-driven climate simulations that can enhance our understanding of the +climate system and inform adaptation strategies. + +
+
+ comment: NeurIPS 2024; Code is available at + https://github.com/Rose-STL-Lab/spherical-dyffusion +
+
+
+
+
+ + ♻ ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting ECCV 2024 + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with a diffusion prior, they still struggle to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ♻ ☆ Super Consistency of Neural Network Landscapes and Learning Rate + Transfer + + +
+ Recently, there has been growing evidence that if the width and depth of a +neural network are scaled toward the so-called rich feature learning limit +($\mu$P and its depth extension), then some hyperparameters -- such as the +learning rate -- exhibit transfer from small to very large models. From an +optimization perspective, this phenomenon is puzzling, as it implies that the +loss landscape is consistently similar across very different model sizes. In +this work, we study the landscape through the lens of the loss Hessian, with a +focus on its largest eigenvalue (i.e. the sharpness), and find that certain +spectral properties under $\mu$P are largely independent of the size of the +network, and remain consistent as training progresses. We name this property +Super Consistency of the landscape. On the other hand, we show that in the +Neural Tangent Kernel (NTK) and other scaling regimes, the sharpness exhibits +very different dynamics at different scales. But what causes these differences +in the sharpness dynamics? Through a connection between the Hessian's and the +NTK's spectrum, we argue that the cause lies in the presence (for $\mu$P) or +progressive absence (for the NTK scaling) of feature learning. We corroborate +our claims with a substantial suite of experiments, covering a wide range of +datasets and architectures: from ResNets and Vision Transformers trained on +benchmark vision datasets to Transformers-based language models trained on +WikiText. + +
+
+ comment: The paper has been accepted at NeurIPS 2024. This is a revised + version of the paper previously titled "Why do Learning Rates Transfer? + Reconciling Optimization and Scaling Limits for Deep Learning" +
+
+
+
+
+ + ♻ ☆ "No Matter What You Do": Purifying GNN Models via Backdoor Unlearning + + +
+ Recent studies have exposed that GNNs are vulnerable to several adversarial +attacks, among which backdoor attack is one of the toughest. Similar to Deep +Neural Networks (DNNs), backdoor attacks in GNNs lie in the fact that the +attacker modifies a portion of graph data by embedding triggers and forces +the model to learn the trigger feature during the model training process. +Despite the massive prior backdoor defense works on DNNs, defending against +backdoor attacks in GNNs is largely unexplored, severely hindering the +widespread application of GNNs in real-world tasks. To bridge this gap, we +present GCleaner, the first backdoor mitigation method on GNNs. GCleaner can +mitigate the presence of the backdoor logic within backdoored GNNs by reversing +the backdoor learning procedure, aiming to restore the model performance to a +level similar to that of a model directly trained on the original clean dataset. To +achieve this objective, we ask: How to recover universal and hard backdoor +triggers in GNNs? How to unlearn the backdoor trigger feature while maintaining +the model performance? We conduct the graph trigger recovery via the +explanation method to identify optimal trigger locations, facilitating the +search of universal and hard backdoor triggers in the feature space of the +backdoored model through maximal similarity. Subsequently, we introduce the +backdoor unlearning mechanism, which combines knowledge distillation and +gradient-based explainable knowledge for fine-grained backdoor erasure. +Extensive experimental evaluations on four benchmark datasets demonstrate that +GCleaner can reduce the backdoor attack success rate to 10% with only 1% of +clean data, and has almost negligible degradation in model performance, which +far outperforms the state-of-the-art (SOTA) defense methods. + +
+
+ comment: 18 pages, 12 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Neural-Rendezvous: Provably Robust Guidance and Control to Encounter + Interstellar Objects + + +
+ Interstellar objects (ISOs) are likely representatives of primitive materials +invaluable in understanding exoplanetary star systems. Due to their poorly +constrained orbits with generally high inclinations and relative velocities, +however, exploring ISOs with conventional human-in-the-loop approaches is +significantly challenging. This paper presents Neural-Rendezvous -- a deep +learning-based guidance and control framework for encountering fast-moving +objects, including ISOs, robustly, accurately, and autonomously in real time. +It uses pointwise minimum norm tracking control on top of a guidance policy +modeled by a spectrally-normalized deep neural network, where its +hyperparameters are tuned with a loss function directly penalizing the MPC +state trajectory tracking error. We show that Neural-Rendezvous provides a high +probability exponential bound on the expected spacecraft delivery error, the +proof of which leverages stochastic incremental stability analysis. In +particular, it is used to construct a non-negative function with a +supermartingale property, explicitly accounting for the ISO state uncertainty +and the local nature of nonlinear state estimation guarantees. In numerical +simulations, Neural-Rendezvous is demonstrated to satisfy the expected error +bound for 100 ISO candidates. This performance is also empirically validated +using our spacecraft simulator and in high-conflict and distributed UAV swarm +reconfiguration with up to 20 UAVs. + +
+
+ comment: Preprint Version, Accepted: October, 2024 (One-minute YouTube + summary: https://youtu.be/q3e0LYS2IYQ, DOI: + https://doi.org/10.2514/1.G007671) +
+
+
+
+
+ + ♻ ☆ Feature Selection Based on Wasserstein Distance + + +
+ This paper presents a novel feature selection method leveraging the +Wasserstein distance to improve feature selection in machine learning. Unlike +traditional methods based on correlation or Kullback-Leibler (KL) divergence, +our approach uses the Wasserstein distance to assess feature similarity, +inherently capturing class relationships and making it robust to noisy labels. +We introduce a Markov blanket-based feature selection algorithm and demonstrate +its effectiveness. Our analysis shows that the Wasserstein distance-based +feature selection method effectively reduces the impact of noisy labels without +relying on specific noise models. We provide a lower bound on its +effectiveness, which remains meaningful even in the presence of noise. +Experimental results across multiple datasets demonstrate that our approach +consistently outperforms traditional methods, particularly in noisy settings. + +
+
+
+
+
+ + ♻ ☆ Statistical Advantages of Perturbing Cosine Router in Mixture of Experts + + +
+ The cosine router in Mixture of Experts (MoE) has recently emerged as an +attractive alternative to the conventional linear router. Indeed, the cosine +router demonstrates favorable performance in image and language tasks and +exhibits better ability to mitigate the representation collapse issue, which +often leads to parameter redundancy and limited representation potentials. +Despite its empirical success, a comprehensive analysis of the cosine router in +MoE has been lacking. Considering the least square estimation of the cosine +routing MoE, we demonstrate that due to the intrinsic interaction of the model +parameters in the cosine router via some partial differential equations, +regardless of the structures of the experts, the estimation rates of experts +and model parameters can be as slow as $\mathcal{O}(1/\log^{\tau}(n))$ where +$\tau > 0$ is some constant and $n$ is the sample size. Surprisingly, these +pessimistic non-polynomial convergence rates can be circumvented by the widely +used technique in practice to stabilize the cosine router -- simply adding +noises to the $L^2$ norms in the cosine router, which we refer to as +\textit{perturbed cosine router}. Under the strongly identifiable settings of +the expert functions, we prove that the estimation rates for both the experts +and model parameters under the perturbed cosine routing MoE are significantly +improved to polynomial rates. Finally, we conduct extensive simulation studies +in both synthetic and real data settings to empirically validate our +theoretical results. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Occam Gradient Descent + + +
+ Deep learning neural network models must be large enough to adapt to their +problem domain, while small enough to avoid overfitting training data during +gradient descent. To balance these competing demands, overprovisioned deep +learning models such as transformers are trained for a single epoch on large +data sets, and are hence inefficient with both computing resources and training +data. In response to these inefficiencies, we exploit learning theory to derive +Occam Gradient Descent, an algorithm that interleaves adaptive reduction of +model size to minimize generalization error, with gradient descent on model +weights to minimize fitting error. In contrast, traditional gradient descent +greedily minimizes fitting error without regard to generalization error. Our +algorithm simultaneously descends the space of weights and topological size of +any neural network without modification. With respect to loss, compute and +model size, our experiments show (a) on image classification benchmarks, linear +and convolutional neural networks trained with Occam Gradient Descent +outperform traditional gradient descent with or without post-train pruning; (b) +on a range of tabular data classification tasks, neural networks trained with +Occam Gradient Descent outperform traditional gradient descent, as well as +Random Forests; (c) on natural language transformers, Occam Gradient Descent +outperforms traditional gradient descent. + +
+
+
+
+
+ + ♻ ☆ Mixed Effects Deep Learning for the interpretable analysis of single + cell RNA sequencing data by quantifying and visualizing batch effects + + +
+ Single-cell RNA sequencing (scRNA-seq) data are often confounded by technical +or biological batch effects. Existing deep learning models mitigate these +effects but often discard batch-specific information, potentially losing +valuable biological insights. We propose a Mixed Effects Deep Learning (MEDL) +autoencoder framework that separately models batch-invariant (fixed effects) +and batch-specific (random effects) components. By decoupling batch-invariant +biological states from batch variations, our framework integrates both into +predictive models. Our approach also generates 2D visualizations of how the +same cell appears across batches, enhancing interpretability. Retaining both +fixed and random effect latent spaces improves classification accuracy. + We applied our framework to three datasets spanning the cardiovascular system +(Healthy Heart), Autism Spectrum Disorder (ASD), and Acute Myeloid Leukemia +(AML). With 147 batches in the Healthy Heart dataset, far exceeding typical +numbers, we tested our framework's ability to handle many batches. In the ASD +dataset, our approach captured donor heterogeneity between autistic and healthy +individuals. In the AML dataset, it distinguished donor heterogeneity despite +missing cell types and diseased donors exhibiting both healthy and malignant +cells. These results highlight our framework's ability to characterize fixed +and random effects, enhance batch effect visualization, and improve prediction +accuracy across diverse datasets. + +
+
+ comment: Main manuscript: 29 pages, including 10 figures and 8 tables. + Supplemental material: 17 pages +
+
+
+
+
+ + ♻ ☆ Minibatch Optimal Transport and Perplexity Bound Estimation in Discrete + Flow Matching + + +
+ Outperforming autoregressive models on categorical data distributions, such +as textual data, remains challenging for continuous diffusion and flow models. +Discrete flow matching, a recent framework for modeling categorical data, has +shown competitive performance with autoregressive models. Despite its +similarities with continuous flow matching, the rectification strategy applied +in the continuous version does not directly extend to the discrete one due to +the inherent stochasticity of discrete paths. This limitation necessitates +exploring alternative methods to minimize state transitions during generation. +To address this, we propose a dynamic-optimal-transport-like minimization +objective for discrete flows with convex interpolants and derive its equivalent +Kantorovich formulation. The latter defines transport cost solely in terms of +inter-state similarity and is optimized using a minibatch strategy. Another +limitation we address in the discrete flow framework is model evaluation. +Unlike continuous flows, wherein the instantaneous change of variables enables +density estimation, discrete models lack a similar mechanism due to the +inherent non-determinism and discontinuity of their paths. To alleviate this +issue, we propose an upper bound on the perplexity of discrete flow models, +enabling performance evaluation and comparison with other methods. + +
+
+
+
+
+ + ♻ ☆ A metric embedding kernel for live cell microscopy signaling patterns + + +
+ Live cell microscopy captures 5-D $(x,y,z,channel,time)$ movies that display +patterns of cellular motion and signaling dynamics. We present here a metric +kernel function for spatiotemporal patterns of cell signaling dynamics in 5-D +live cell microscopy movies unique in requiring no a priori knowledge of +expected pattern dynamics, and no training data. The approach uses Kolmogorov +complexity theory to compute a metric distance between movies and to measure +the meaningful information among subsets of movies. Cell signaling kymographs +store at each spatiotemporal cell centroid the cell signaling state, or a +functional output such as velocity. Patterns of similarity are identified via +the metric normalized compression distance (NCD). The NCD is a reproducing +kernel for a Hilbert space that represents the input cell signaling kymographs +as points in a low dimensional embedding that optimally captures the pattern +similarity identified by the NCD throughout the space. The only parameter is +the expected cell radii ($\mu m$). A new formulation of the cluster structure +function optimally estimates the meaningful information captured by the +embedding. Also presented is the cell signaling structure function (SSF), a +Kolmogorov structure function that optimally measures cell signaling state as +nuclear intensity w.r.t. surrounding cytoplasm, a significant improvement +compared to the current state-of-the-art cytonuclear ratio. Results are +presented quantifying the impact of ERK and AKT signaling between different +oncogenic mutations, and by the relation between ERK signaling and cellular +velocity patterns for movies of 2-D monolayers of human breast epithelial +(MCF10A) cells, 3-D MCF10A spheroids under optogenetic manipulation of ERK, and +human induced pluripotent stem cells. + +
+
+
+
+
+ + ♻ ☆ Mixed Newton Method for Optimization in Complex Spaces + + +
+ In this paper, we modify and apply the recently introduced Mixed Newton +Method, which is originally designed for minimizing real-valued functions of +complex variables, to the minimization of real-valued functions of real +variables by extending the functions to complex space. We show that arbitrary +regularizations preserve the favorable local convergence properties of the +method, and construct a special type of regularization used to prevent +convergence to complex minima. We compare several variants of the method +applied to training neural networks with real and complex parameters. + +
+
+ comment: 16 pages, 7 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Machine Learning Global Simulation of Nonlocal Gravity Wave Propagation + + +
+ Global climate models typically operate at a grid resolution of hundreds of +kilometers and fail to resolve atmospheric mesoscale processes, e.g., clouds, +precipitation, and gravity waves (GWs). Model representation of these processes +and their sources is essential to the global circulation and planetary energy +budget, but subgrid scale contributions from these processes are often only +approximately represented in models using parameterizations. These +parameterizations are subject to approximations and idealizations, which limit +their capability and accuracy. The most drastic of these approximations is the +"single-column approximation" which completely neglects the horizontal +evolution of these processes, resulting in key biases in current climate +models. With a focus on atmospheric GWs, we present the first-ever global +simulation of atmospheric GW fluxes using machine learning (ML) models trained +on the WINDSET dataset to emulate global GW propagation in the atmosphere, as an +alternative to traditional single-column parameterizations. Using an Attention +U-Net-based architecture trained on globally resolved GW momentum fluxes, we +illustrate the importance and effectiveness of global nonlocality, when +simulating GWs using data-driven schemes. + +
+
+ comment: International Conference on Machine Learning 2024 +
+
+
+
+
+ + ♻ ☆ Defending Large Language Models Against Attacks With Residual Stream + Activation Analysis + + +
+ The widespread adoption of Large Language Models (LLMs), exemplified by +OpenAI's ChatGPT, brings to the forefront the imperative to defend against +adversarial threats on these models. These attacks, which manipulate an LLM's +output by introducing malicious inputs, undermine the model's integrity and the +trust users place in its outputs. In response to this challenge, our paper +presents an innovative defensive strategy, given white box access to an LLM, +that harnesses residual activation analysis between transformer layers of the +LLM. We apply a novel methodology for analyzing distinctive activation patterns +in the residual streams for attack prompt classification. We curate multiple +datasets to demonstrate how this method of classification has high accuracy +across multiple types of attack scenarios, including our newly-created attack +dataset. Furthermore, we enhance the model's resilience by integrating safety +fine-tuning techniques for LLMs in order to measure its effect on our +capability to detect attacks. The results underscore the effectiveness of our +approach in enhancing the detection and mitigation of adversarial inputs, +advancing the security framework within which LLMs operate. + +
+
+
+
+
+ + ♻ ☆ MDCure: A Scalable Pipeline for Multi-Document Instruction-Following + + +
+ Multi-document (MD) processing is crucial for LLMs to handle real-world tasks +such as summarization and question-answering across large sets of documents. +While LLMs have improved at processing long inputs, MD contexts still present +challenges, such as managing inter-document dependencies, redundancy, and +incoherent structures. We introduce MDCure, a scalable and effective +fine-tuning pipeline to enhance the MD capabilities of LLMs without the +computational cost of pre-training or reliance on human annotated data. MDCure +is based on generation of high-quality synthetic MD instruction data from sets +of related articles via targeted prompts. We further introduce MDCureRM, a +multi-objective reward model which filters generated data based on their +training utility for MD settings. With MDCure, we fine-tune a variety of LLMs, +from the FlanT5, Qwen2, and LLAMA3.1 model families, up to 70B parameters in +size. Extensive evaluations on a wide range of MD and long-context benchmarks +spanning various tasks show MDCure consistently improves performance over +pre-trained baselines and over corresponding base models by up to 75.5%. Our +code, datasets, and models are available at https://github.com/yale-nlp/MDCure. + +
+
+
+
+
+ + ♻ ☆ Explainable AI through a Democratic Lens: DhondtXAI for Proportional + Feature Importance Using the D'Hondt Method + + +
+ In democratic societies, electoral systems play a crucial role in translating +public preferences into political representation. Among these, the D'Hondt +method is widely used to ensure proportional representation, balancing fair +representation with governmental stability. Recently, there has been a growing +interest in applying similar principles of proportional representation to +enhance interpretability in machine learning, specifically in Explainable AI +(XAI). This study investigates the integration of D'Hondt-based voting +principles in the DhondtXAI method, which leverages resource allocation +concepts to interpret feature importance within AI models. Through a comparison +of SHAP (Shapley Additive Explanations) and DhondtXAI, we evaluate their +effectiveness in feature attribution within CatBoost and XGBoost models for +breast cancer and diabetes prediction, respectively. The DhondtXAI approach +allows for alliance formation and thresholding to enhance interpretability, +representing feature importance as seats in a parliamentary view. Statistical +correlation analyses between SHAP values and DhondtXAI allocations support the +consistency of interpretations, demonstrating DhondtXAI's potential as a +complementary tool for understanding feature importance in AI models. The +results highlight that integrating electoral principles, such as proportional +representation and alliances, into AI explainability can improve user +understanding, especially in high-stakes fields like healthcare. + +
+
+
+
+
+ + ♻ ☆ Confidence Trigger Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Interstellar Object Accessibility and Mission Design + + +
+ Interstellar objects (ISOs) represent a compelling and under-explored +category of celestial bodies, providing physical laboratories to understand the +formation of our solar system and probe the composition and properties of +material formed in exoplanetary systems. In this work, we investigate existing +approaches to designing successful flyby missions to ISOs, including a deep +learning-driven guidance and control algorithm for ISOs traveling at velocities +over 60 km/s. We have generated spacecraft trajectories to a series of +synthetic representative ISOs, simulating a ground campaign to observe the +target and resolve its state, thereby determining the cruise and close approach +delta-Vs required for the encounter. We discuss the accessibility of and +mission design to ISOs with varying characteristics, with special focuses on 1) +state covariance estimation throughout the cruise, 2) handoffs from traditional +navigation approaches to novel autonomous navigation for fast flyby regimes, +and 3) overall recommendations about preparing for the future in situ +exploration of these targets. The lessons learned also apply to the fast flyby +of other small bodies, e.g., long-period comets and potentially hazardous +asteroids, which also require tactical responses with similar characteristics. + +
+
+ comment: IEEE Aerospace Conference, Preprint Version, Accepted: November 2022 +
+
+
+
+
+
+
+
+ + Artificial Intelligence 162 + +
+
+
+ + ☆ 4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization NeurIPS 2024 + + +
+ Novel view synthesis of dynamic scenes is becoming important in various +applications, including augmented and virtual reality. We propose a novel 4D +Gaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded +monocular videos. To overcome the overfitting problem of existing work for +these real-world videos, we introduce an uncertainty-aware regularization that +identifies uncertain regions with few observations and selectively imposes +additional priors based on diffusion models and depth smoothness on such +regions. This approach improves both the performance of novel view synthesis +and the quality of training image reconstruction. We also identify the +initialization problem of 4DGS in fast-moving dynamic regions, where the +Structure from Motion (SfM) algorithm fails to provide reliable 3D landmarks. +To initialize Gaussian primitives in such regions, we present a dynamic region +densification method using the estimated depth maps and scene flow. Our +experiments show that the proposed method improves the performance of 4DGS +reconstruction from a video captured by a handheld monocular camera and also +exhibits promising results in few-shot static scene reconstruction. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ A Short Note on Evaluating RepNet for Temporal Repetition Counting in + Videos + + +
+ We discuss some consistent issues on how RepNet has been evaluated in various +papers. As a way to mitigate these issues, we report RepNet performance results +on different datasets, and release evaluation code and the RepNet checkpoint to +obtain these results. Code URL: +https://github.com/google-research/google-research/blob/master/repnet/ + +
+
+
+
+
+ + ☆ Causal Explanations for Image Classifiers + + +
+ Existing algorithms for explaining the output of image classifiers use +different definitions of explanations and a variety of techniques to extract +them. However, none of the existing tools use a principled approach based on +formal definitions of causes and explanations for the explanation extraction. +In this paper we present a novel black-box approach to computing explanations +grounded in the theory of actual causality. We prove relevant theoretical +results and present an algorithm for computing approximate explanations based +on these definitions. We prove termination of our algorithm and discuss its +complexity and the amount of approximation compared to the precise definition. +We implemented the framework in a tool rex and we present experimental results +and a comparison with state-of-the-art tools. We demonstrate that rex is the +most efficient tool and produces the smallest explanations, in addition to +outperforming other black-box tools on standard quality measures. + +
+
+
+
+
+ + ☆ The Limited Impact of Medical Adaptation of Large Language and + Vision-Language Models EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare ten +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting and supervised fine-tuning regimes for medical question-answering +(QA). For instance, across all tasks and model pairs we consider in the 3-shot +setting, medical LLMs only outperform their base models in 22.7% of cases, +reach a (statistical) tie in 36.8% of cases, and are significantly worse than +their base models in the remaining 40.5% of cases. Our conclusions are based on +(i) comparing each medical model head-to-head, directly against the +corresponding base model; (ii) optimizing the prompts for each model separately +in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty +in comparisons. While these basic practices are not consistently adopted in the +literature, our ablations show that they substantially impact conclusions. +Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs +can show performance improvements, but the benefits do not carry over to tasks +based on clinical notes. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes + additional results on clinical note QA tasks and supervised fine-tuning + evaluations +
+
+
+
+
+ + ☆ Interaction Testing in Variation Analysis + + +
+ Relationships of cause and effect are of prime importance for explaining +scientific phenomena. Often, rather than just understanding the effects of +causes, researchers also wish to understand how a cause $X$ affects an outcome +$Y$ mechanistically -- i.e., what are the causal pathways that are activated +between $X$ and $Y$. For analyzing such questions, a range of methods has been +developed over decades under the rubric of causal mediation analysis. +Traditional mediation analysis focuses on decomposing the average treatment +effect (ATE) into direct and indirect effects, and therefore focuses on the ATE +as the central quantity. This corresponds to providing explanations for +associations in the interventional regime, such as when the treatment $X$ is +randomized. Commonly, however, it is of interest to explain associations in the +observational regime, and not just in the interventional regime. In this paper, +we introduce \textit{variation analysis}, an extension of mediation analysis that +focuses on the total variation (TV) measure between $X$ and $Y$, written as +$\mathrm{E}[Y \mid X=x_1] - \mathrm{E}[Y \mid X=x_0]$. The TV measure +encompasses both causal and confounded effects, as opposed to the ATE which +only encompasses causal (direct and mediated) variations. In this way, the TV +measure is suitable for providing explanations in the natural regime and +answering questions such as ``why is $X$ associated with $Y$?''. Our focus is +on decomposing the TV measure, in a way that explicitly includes direct, +indirect, and confounded variations. Furthermore, we also decompose the TV +measure to include interaction terms between these different pathways. +Subsequently, interaction testing is introduced, involving hypothesis tests to +determine if interaction terms are significantly different from zero. If +interactions are not significant, more parsimonious decompositions of the TV +measure can be used. + +
+
+
+
+
+ + ☆ Data-driven Surface Solar Irradiance Estimation using Neural Operators + at Global Scale + + +
+ Accurate surface solar irradiance (SSI) forecasting is essential for +optimizing renewable energy systems, particularly in the context of long-term +energy planning on a global scale. This paper presents a pioneering approach to +solar radiation forecasting that leverages recent advancements in numerical +weather prediction (NWP) and data-driven machine learning weather models. These +advances facilitate long, stable rollouts and enable large ensemble forecasts, +enhancing the reliability of predictions. Our flexible model utilizes variables +forecast by these NWP and AI weather models to estimate 6-hourly SSI at global +scale. Developed using NVIDIA Modulus, our model represents the first adaptive +global framework capable of providing long-term SSI forecasts. Furthermore, it +can be fine-tuned using satellite data, which significantly enhances its +performance in the fine-tuned regions, while maintaining accuracy elsewhere. +The improved accuracy of these forecasts has substantial implications for the +integration of solar energy into power grids, enabling more efficient energy +management and contributing to the global transition to renewable energy +sources. + +
+
+
+
+
+ + ☆ AstroM$^3$: A self-supervised multimodal model for astronomy + + +
+ While machine-learned models are now routinely employed to facilitate +astronomical inquiry, model inputs tend to be limited to a primary data source +(namely images or time series) and, in the more advanced approaches, some +metadata. Yet with the growing use of wide-field, multiplexed observational +resources, individual sources of interest often have a broad range of +observational modes available. Here we construct an astronomical multimodal +dataset and propose AstroM$^3$, a self-supervised pre-training approach that +enables a model to learn from multiple modalities simultaneously. Specifically, +we extend the CLIP (Contrastive Language-Image Pretraining) model to a trimodal +setting, allowing the integration of time-series photometry data, spectra, and +astrophysical metadata. In a fine-tuning supervised setting, our results +demonstrate that CLIP pre-training improves classification performance for +time-series photometry, where accuracy increases from 84.6% to 91.5%. +Furthermore, CLIP boosts classification accuracy by up to 12.6% when the +availability of labeled data is limited, showing the effectiveness of +leveraging larger corpora of unlabeled data. In addition to fine-tuned +classification, we can use the trained model in other downstream tasks that are +not explicitly contemplated during the construction of the self-supervised +model. In particular we show the efficacy of using the learned embeddings for +misclassification identification, similarity search, and anomaly detection. +One surprising highlight is the "rediscovery" of Mira subtypes and two +Rotational variable subclasses using manifold learning and dimension reduction +algorithms. To our knowledge this is the first construction of an $n>2$ mode +model in astronomy. Extensions to $n>3$ modes are naturally anticipated with +this approach. + +
+
+
+
+
+ + ☆ Offline Adaptation of Quadruped Locomotion using Diffusion Models + + +
+ We present a diffusion-based approach to quadrupedal locomotion that +simultaneously addresses the limitations of learning and interpolating between +multiple skills (modes) and of offline adapting to new locomotion behaviours +after training. This is the first framework to apply classifier-free guided +diffusion to quadruped locomotion and demonstrate its efficacy by extracting +goal-conditioned behaviour from an originally unlabelled dataset. We show that +these capabilities are compatible with a multi-skill policy and can be applied +with little modification and minimal compute overhead, i.e., running entirely +on the robot's onboard CPU. We verify the validity of our approach with hardware +experiments on the ANYmal quadruped platform. + +
+
+
+
+
+ + ☆ Process-aware Human Activity Recognition + + +
+ Humans naturally follow distinct patterns when conducting their daily +activities, which are driven by established practices and processes, such as +production workflows, social norms and daily routines. Human activity +recognition (HAR) algorithms usually use neural networks or machine learning +techniques to analyse inherent relationships within the data. However, these +approaches often overlook the contextual information in which the data are +generated, potentially limiting their effectiveness. We propose a novel +approach that incorporates process information from context to enhance the HAR +performance. Specifically, we align probabilistic events generated by machine +learning models with process models derived from contextual information. This +alignment adaptively weighs these two sources of information to optimise HAR +accuracy. Our experiments demonstrate that our approach achieves better +accuracy and Macro F1-score compared to baseline models. + +
+
+
+
+
+ + ☆ Rethinking CyberSecEval: An LLM-Aided Approach to Evaluation Critique NeurIPS 2024 + + +
+ A key development in the cybersecurity evaluations space is the work carried +out by Meta, through their CyberSecEval approach. While this work is +undoubtedly a useful contribution to a nascent field, there are notable +features that limit its utility. Key drawbacks focus on the insecure code +detection part of Meta's methodology. We explore these limitations, and use our +exploration as a test case for LLM-assisted benchmark analysis. + +
+
+ comment: NeurIPS 2024, 2 pages +
+
+
+
+
+ + ☆ Evaluating World Models with LLM for Decision Making + + +
+ World model emerges as a key module in decision making, where MuZero and +Dreamer achieve remarkable successes in complex tasks. Recent work leverages +Large Language Models (LLMs) as general world simulators to simulate the +dynamics of the world due to their generalizability. LLMs also serve as the +world model for deliberative reasoning in Reasoning via Planning (RAP) and Tree +of Thought (ToT). However, the world models are either evaluated as a general +world simulator, or as a functional module of the agent, i.e., predicting the +transitions to assist the planning. In this work, we propose a comprehensive +evaluation of the world models with LLMs from the decision making perspective. +Specifically, we leverage the 31 diverse environments from (Wang et al., +2023;2024) and curate the rule-based policy of each environment for the diverse +evaluation. Then, we design three main tasks, i.e., policy verification, action +proposal, and policy planning, where the world models can be used for decision +making solely. Finally, we conduct the comprehensive evaluation of the advanced +LLMs, i.e., GPT-4o and GPT-4o-mini, on the environments for the three main +tasks under various settings. The key observations include: i) GPT-4o +significantly outperforms GPT-4o-mini on the three main tasks, especially for +the tasks which require the domain knowledge, ii) the performance of the world +model with LLM will be decreased for long-term decision-making tasks, and iii) +the combination of different functionalities of the world model will bring +additional instabilities of the performance. + +
+
+
+
+
+ + ☆ Can sparse autoencoders be used to decompose and interpret steering + vectors? + + +
+ Steering vectors are a promising approach to control the behaviour of large +language models. However, their underlying mechanisms remain poorly understood. +While sparse autoencoders (SAEs) may offer a potential method to interpret +steering vectors, recent findings show that SAE-reconstructed vectors often +lack the steering properties of the original vectors. This paper investigates +why directly applying SAEs to steering vectors yields misleading +decompositions, identifying two reasons: (1) steering vectors fall outside the +input distribution for which SAEs are designed, and (2) steering vectors can +have meaningful negative projections in feature directions, which SAEs are not +designed to accommodate. These limitations hinder the direct use of SAEs for +interpreting steering vectors. + +
+
+
+
+
+ + ☆ Zero-shot Cross-lingual Transfer Learning with Multiple Source and + Target Languages for Information Extraction: Language Selection and + Adversarial Training + + +
+ The majority of previous research addressing multi-lingual IE is limited +to the zero-shot cross-lingual single-transfer (one-to-one) setting, with +high-resource languages predominantly as source training data. As a result, +these works provide little understanding and benefit for the realistic goal of +developing a multi-lingual IE system that can generalize to as many languages +as possible. Our study aims to fill this gap by providing a detailed analysis +on Cross-Lingual Multi-Transferability (many-to-many transfer learning), for +the recent IE corpora that cover a diverse set of languages. Specifically, we +first determine the correlation between single-transfer performance and a wide +range of linguistic-based distances. From the obtained insights, a combined +language distance metric can be developed that is not only highly correlated +but also robust across different tasks and model scales. Next, we investigate +the more general zero-shot multi-lingual transfer settings where multiple +languages are involved in the training and evaluation processes. Language +clustering based on the newly defined distance can provide directions for +achieving the optimal cost-performance trade-off in data (languages) selection +problem. Finally, a relational-transfer setting is proposed to further +incorporate multi-lingual unlabeled data based on adversarial training using +the relation induced from the above linguistic distance. + +
+
+
+
+
+ + ☆ Sharingan: Extract User Action Sequence from Desktop Recordings + + +
+ Video recordings of user activities, particularly desktop recordings, offer a +rich source of data for understanding user behaviors and automating processes. +However, despite advancements in Vision-Language Models (VLMs) and their +increasing use in video analysis, extracting user actions from desktop +recordings remains an underexplored area. This paper addresses this gap by +proposing two novel VLM-based methods for user action extraction: the Direct +Frame-Based Approach (DF), which inputs sampled frames directly into VLMs, and +the Differential Frame-Based Approach (DiffF), which incorporates explicit +frame differences detected via computer vision techniques. We evaluate these +methods using a basic self-curated dataset and an advanced benchmark adapted +from prior work. Our results show that the DF approach achieves an accuracy of +70% to 80% in identifying user actions, with the extracted action sequences +being re-playable through Robotic Process Automation. We find that while VLMs +show potential, incorporating explicit UI changes can degrade performance, +making the DF approach more reliable. This work represents the first +application of VLMs for extracting user action sequences from desktop +recordings, contributing new methods, benchmarks, and insights for future +research. + +
+
+
+
+
+ + ☆ SANDWICH: Towards an Offline, Differentiable, Fully-Trainable Wireless + Neural Ray-Tracing Surrogate ICASSP 2025 + + +
+ Wireless ray-tracing (RT) is emerging as a key tool for three-dimensional +(3D) wireless channel modeling, driven by advances in graphical rendering. +Current approaches struggle to accurately model beyond 5G (B5G) network +signaling, which often operates at higher frequencies and is more susceptible +to environmental conditions and changes. Existing online learning solutions +require real-time environmental supervision during training, which is both +costly and incompatible with GPU-based processing. In response, we propose a +novel approach that redefines ray trajectory generation as a sequential +decision-making problem, leveraging generative models to jointly learn the +optical, physical, and signal properties within each designated environment. +Our work introduces the Scene-Aware Neural Decision Wireless Channel Raytracing +Hierarchy (SANDWICH), an innovative offline, fully differentiable approach that +can be trained entirely on GPUs. SANDWICH offers superior performance compared +to existing online learning methods, outperforms the baseline by 4e^-2 radian +in RT accuracy, and only fades 0.5 dB away from toplined channel gain +estimation. + +
+
+ comment: Submitted in ICASSP 2025 +
+
+
+
+
+ + ☆ Flow reconstruction in time-varying geometries using graph neural + networks + + +
+ The paper presents a Graph Attention Convolutional Network (GACN) for flow +reconstruction from very sparse data in time-varying geometries. The model +incorporates a feature propagation algorithm as a preprocessing step to handle +extremely sparse inputs, leveraging information from neighboring nodes to +initialize missing features. In addition, a binary indicator is introduced as a +validity mask to distinguish between the original and propagated data points, +enabling more effective learning from sparse inputs. Trained on a unique data +set of Direct Numerical Simulations (DNS) of a motored engine at a technically +relevant operating condition, the GACN shows robust performance across +different resolutions and domain sizes and can effectively handle unstructured +data and variable input sizes. The model is tested on previously unseen DNS +data as well as on an experimental data set from Particle Image Velocimetry +(PIV) measurements that were not considered during training. A comparative +analysis shows that the GACN consistently outperforms both a conventional +Convolutional Neural Network (CNN) and cubic interpolation methods on the DNS +and PIV test sets by achieving lower reconstruction errors and better capturing +fine-scale turbulent structures. In particular, the GACN effectively +reconstructs flow fields from domains up to 14 times larger than those observed +during training, with the performance advantage increasing for larger domains. + +
+
+
+
+
+ + ☆ Separating Tongue from Thought: Activation Patching Reveals + Language-Agnostic Concept Representations in Transformers ICML 2024 + + +
+ A central question in multilingual language modeling is whether large +language models (LLMs) develop a universal concept representation, disentangled +from specific languages. In this paper, we address this question by analyzing +latent representations (latents) during a word translation task in +transformer-based LLMs. We strategically extract latents from a source +translation prompt and insert them into the forward pass on a target +translation prompt. By doing so, we find that the output language is encoded in +the latent at an earlier layer than the concept to be translated. Building on +this insight, we conduct two key experiments. First, we demonstrate that we can +change the concept without changing the language and vice versa through +activation patching alone. Second, we show that patching with the mean over +latents across different languages does not impair and instead improves the +models' performance in translating the concept. Our results provide evidence +for the existence of language-agnostic concept representations within the +investigated models. + +
+
+ comment: 12 pages, 10 figures, previously published under the title "How Do + Llamas Process Multilingual Text? A Latent Exploration through Activation + Patching" at the ICML 2024 mechanistic interpretability workshop + https://openreview.net/forum?id=0ku2hIm4BS +
+
+
+
+
+ + ☆ Polymetis:Large Language Modeling for Multiple Material Domains + + +
+ As the application of large language models in various fields continues to +expand, materials science also ushers in opportunities for AI-driven +innovation. The traditional way of relying on manual search for materials +science-related information is now using artificial intelligence technology as +an auxiliary tool to improve the efficiency of materials science research. To +accelerate researchers' knowledge acquisition and intelligent decision-making +support in materials science research, this paper proposes a large language +model Polymetis model for a variety of materials fields, aiming to provide +highly professional knowledge answers in the field of materials, covering +energy materials, functional materials, alloy materials, physical chemistry, +biology, and other material directions. The model uses a dataset of about 2 +million material knowledge instructions, and in the process of building the +dataset, we developed the Intelligent Extraction Large Model (IELM), which is +specially used to extract and form structured knowledge from scientific texts, +avoiding a large number of costs that need to be manually annotated, and +improving efficiency. We inject this data into the GLM4-9B model for learning +to enhance its inference capabilities in a variety of material domains. In +addition, we have introduced enhanced prompt strategies to ensure that the +answers to the model are more organized and comprehensive, providing efficient +and comprehensive intelligent support for the diverse needs of materials +science exploration, and promoting the development of material science. + +
+
+
+
+
+ + ☆ Searching Latent Program Spaces + + +
+ Program synthesis methods aim to automatically generate programs restricted +to a language that can explain a given specification of input-output pairs. +While purely symbolic approaches suffer from a combinatorial search space, +recent methods leverage neural networks to learn distributions over program +structures to narrow this search space significantly, enabling more efficient +search. However, for challenging problems, it remains difficult to train models +to perform program synthesis in one shot, making test-time search essential. +Most neural methods lack structured search mechanisms during inference, relying +instead on stochastic sampling or gradient updates, which can be inefficient. +In this work, we propose the Latent Program Network (LPN), a general algorithm +for program induction that learns a distribution over latent programs in a +continuous space, enabling efficient search and test-time adaptation. We +explore how to train these networks to optimize for test-time computation and +demonstrate the use of gradient-based search both during training and at test +time. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates +performance by generalizing programs to new inputs rather than explaining the +underlying specification. We show that LPN can generalize beyond its training +distribution and adapt to unseen tasks by utilizing test-time computation, +outperforming algorithms without test-time adaptation mechanisms. + +
+
+ comment: Code available at https://github.com/clement-bonnet/lpn +
+
+
+
+
+ + ☆ MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics + Classification + + +
+ The distinct characteristics of multiomics data, including complex +interactions within and across biological layers and disease heterogeneity +(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop +novel designs to address unique challenges in multiomics prediction. In this +paper, we propose the multi-view knowledge transfer learning (MVKTrans) +framework, which transfers intra- and inter-omics knowledge in an adaptive +manner by reviewing data heterogeneity and suppressing bias transfer, thereby +enhancing classification performance. Specifically, we design a graph +contrastive module that is trained on unlabeled data to effectively learn and +transfer the underlying intra-omics patterns to the supervised task. This +unsupervised pretraining promotes learning general and unbiased representations +for each modality, regardless of the downstream tasks. In light of the varying +discriminative capacities of modalities across different diseases and/or +samples, we introduce an adaptive and bi-directional cross-omics distillation +module. This module automatically identifies richer modalities and facilitates +dynamic knowledge transfer from more informative to less informative omics, +thereby enabling a more robust and generalized integration. Extensive +experiments on four real biomedical datasets demonstrate the superior +performance and robustness of MVKTrans compared to the state-of-the-art. Code +and data are available at https://github.com/Yaolab-fantastic/MVKTrans. + +
+
+
+
+
+ + ☆ TRACE: Transformer-based Risk Assessment for Clinical Evaluation + + +
+ We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation), +a novel method for clinical risk assessment based on clinical data, leveraging +the self-attention mechanism for enhanced feature interaction and result +interpretation. Our approach is able to handle different data modalities, +including continuous, categorical and multiple-choice (checkbox) attributes. +The proposed architecture features a shared representation of the clinical data +obtained by integrating specialized embeddings of each data modality, enabling +the detection of high-risk individuals using Transformer encoder layers. To +assess the effectiveness of the proposed method, a strong baseline based on +non-negative multi-layer perceptrons (MLPs) is introduced. The proposed method +outperforms various baselines widely used in the domain of clinical risk +assessment, while effectively handling missing values. In terms of +explainability, our Transformer-based method offers easily interpretable +results via attention weights, further enhancing the clinicians' +decision-making process. + +
+
+
+
+
+ + ☆ Rethinking negative sampling in content-based news recommendation + + +
+ News recommender systems are hindered by the brief lifespan of articles, as +they undergo rapid relevance decay. Recent studies have demonstrated the +potential of content-based neural techniques in tackling this problem. However, +these models often involve complex neural architectures and often lack +consideration for negative examples. In this study, we posit that the careful +sampling of negative examples has a big impact on the model's outcome. We +devise a negative sampling technique that not only improves the accuracy of the +model but also facilitates the decentralization of the recommendation system. +The experimental results obtained using the MIND dataset demonstrate that the +accuracy of the method under consideration can compete with that of +State-of-the-Art models. The utilization of the sampling technique is essential +in reducing model complexity and accelerating the training process, while +maintaining a high level of accuracy. Finally, we discuss how decentralized +models can help improve privacy and scalability. + +
+
+
+
+
+ + ☆ Scholarly Wikidata: Population and Exploration of Conference Data in + Wikidata using LLMs + + +
+ Several initiatives have been undertaken to conceptually model the domain of +scholarly data using ontologies and to create respective Knowledge Graphs. Yet, +the full potential seems unleashed, as automated means for automatic population +of said ontologies are lacking, and respective initiatives from the Semantic +Web community are not necessarily connected: we propose to make scholarly data +more sustainably accessible by leveraging Wikidata's infrastructure and +automating its population in a sustainable manner through LLMs by tapping into +unstructured sources like conference Web sites and proceedings texts as well as +already existing structured conference datasets. While an initial analysis +shows that Semantic Web conferences are only minimally represented in Wikidata, +we argue that our methodology can help to populate, evolve and maintain +scholarly data as a community within Wikidata. Our main contributions include +(a) an analysis of ontologies for representing scholarly data to identify gaps +and relevant entities/properties in Wikidata, (b) semi-automated extraction -- +requiring (minimal) manual validation -- of conference metadata (e.g., +acceptance rates, organizer roles, programme committee members, best paper +awards, keynotes, and sponsors) from websites and proceedings texts using LLMs. +Finally, we discuss (c) extensions to visualization tools in the Wikidata +context for data exploration of the generated scholarly data. Our study focuses +on data from 105 Semantic Web-related conferences and extends/adds more than +6000 entities in Wikidata. It is important to note that the method can be more +generally applicable beyond Semantic Web-related conferences for enhancing +Wikidata's utility as a comprehensive scholarly resource. + Source Repository: https://github.com/scholarly-wikidata/ + DOI: https://doi.org/10.5281/zenodo.10989709 + License: Creative Commons CC0 (Data), MIT (Code) + +
+
+ comment: 17 pages, accepted at EKAW-24 +
+
+
+
+
+ + ☆ Analogical Reasoning Within a Conceptual Hyperspace IJCAI 2024 + + +
+ We propose an approach to analogical inference that marries the +neuro-symbolic computational power of complex-sampled hyperdimensional +computing (HDC) with Conceptual Spaces Theory (CST), a promising theory of +semantic meaning. CST sketches, at an abstract level, approaches to analogical +inference that go beyond the standard predicate-based structure mapping +theories. But it does not describe how such an approach can be operationalized. +We propose a concrete HDC-based architecture that computes several types of +analogy classified by CST. We present preliminary proof-of-concept experimental +results within a toy domain and describe how it can perform category-based and +property-based analogical reasoning. + +
+
+ comment: Analogy-angle workshop full paper at IJCAI 2024 +
+
+
+
+
+ + ☆ A Survey on Vision Autoregressive Model + + +
+ Autoregressive models have demonstrated great performance in natural language +processing (NLP) with impressive scalability, adaptability and +generalizability. Inspired by their notable success in NLP field, +autoregressive models have been intensively investigated recently for computer +vision, which perform next-token predictions by representing visual data as +visual tokens and enable autoregressive modelling for a wide range of vision +tasks, ranging from visual generation and visual understanding to the very +recent multimodal generation that unifies visual generation and understanding +with a single autoregressive model. This paper provides a systematic review of +vision autoregressive models, including the development of a taxonomy of +existing methods and highlighting their major contributions, strengths, and +limitations, covering various vision tasks such as image generation, video +generation, image editing, motion generation, medical image analysis, 3D +generation, robotic manipulation, unified multimodal generation, etc. Besides, +we investigate and analyze the latest advancements in autoregressive models, +including thorough benchmarking and discussion of existing methods across +various evaluation datasets. Finally, we outline key challenges and promising +directions for future research, offering a roadmap to guide further +advancements in vision autoregressive models. + +
+
+
+
+
+ + ☆ Estimating unknown parameters in differential equations with a + reinforcement learning based PSO method + + +
+ Differential equations offer a foundational yet powerful framework for +modeling interactions within complex dynamic systems and are widely applied +across numerous scientific fields. One common challenge in this area is +estimating the unknown parameters of these dynamic relationships. However, +traditional numerical optimization methods rely on the selection of initial +parameter values, making them prone to local optima. Meanwhile, deep learning +and Bayesian methods require training models on specific differential +equations, resulting in poor versatility. This paper reformulates the parameter +estimation problem of differential equations as an optimization problem by +introducing the concept of particles from the particle swarm optimization +algorithm. Building on reinforcement learning-based particle swarm optimization +(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown +parameters of differential equations. We compared its performance on three +typical ordinary differential equations with the state-of-the-art methods, +including the RLLPSO algorithm, traditional numerical methods, deep learning +approaches, and Bayesian methods. The experimental results demonstrate that our +DERLPSO consistently outperforms other methods in terms of performance, +achieving an average Mean Square Error of 1.13e-05, which reduces the error by +approximately 4 orders of magnitude compared to other methods. Apart from +ordinary differential equations, our DERLPSO also shows great promise for +estimating unknown parameters of partial differential equations. The DERLPSO +method proposed in this paper has high accuracy, is independent of initial +parameter values, and possesses strong versatility and stability. This work +provides new insights into unknown parameter estimation for differential +equations. + +
+
+
+
+
+ + ☆ A System Level Performance Evaluation for Superconducting Digital + Systems + + +
+ Superconducting Digital (SCD) technology offers significant potential for +enhancing the performance of next generation large scale compute workloads. By +leveraging advanced lithography and a 300 mm platform, SCD devices can reduce +energy consumption and boost computational power. This paper presents a +cross-layer modeling approach to evaluate the system-level performance benefits +of SCD architectures for Large Language Model (LLM) training and inference. Our +findings, based on experimental data and Pulse Conserving Logic (PCL) design +principles, demonstrate substantial performance gain in both training and +inference. We are, thus, able to convincingly show that the SCD technology can +address memory and interconnect limitations of present day solutions for +next-generation compute systems. + +
+
+ comment: 8 figures +
+
+
+
+
+ + ☆ Towards More Accurate Fake Detection on Images Generated from Advanced + Generative and Neural Rendering Models + + +
+ The remarkable progress in neural-network-driven visual data generation, +especially with neural rendering techniques like Neural Radiance Fields and 3D +Gaussian splatting, offers a powerful alternative to GANs and diffusion models. +These methods can produce high-fidelity images and lifelike avatars, +highlighting the need for robust detection methods. In response, an +unsupervised training technique is proposed that enables the model to extract +comprehensive features from the Fourier spectrum magnitude, thereby overcoming +the challenges of reconstructing the spectrum due to its centrosymmetric +properties. By leveraging the spectral domain and dynamically combining it with +spatial domain information, we create a robust multimodal detector that +demonstrates superior generalization capabilities in identifying challenging +synthetic images generated by the latest image synthesis techniques. To address +the absence of a 3D neural rendering-based fake image database, we develop a +comprehensive database that includes images generated by diverse neural +rendering techniques, providing a robust foundation for evaluating and +advancing detection methods. + +
+
+ comment: 13 pages, 8 Figures +
+
+
+
+
+ + ☆ DipMe: Haptic Recognition of Granular Media for Tangible Interactive + Applications + + +
+ While tangible user interface has shown its power in naturally interacting +with rigid or soft objects, users cannot conveniently use different types of +granular materials as the interaction media. We introduce DipMe as a smart +device to recognize the types of granular media in real time, which can be used +to connect the granular materials in the physical world with various virtual +content. Other than vision-based solutions, we propose a dip operation of our +device and exploit the haptic signals to recognize different types of granular +materials. With modern machine learning tools, we find the haptic signals from +different granular media are distinguishable by DipMe. With the online granular +object recognition, we build several tangible interactive applications, +demonstrating the effects of DipMe in perceiving granular materials and its +potential in developing a tangible user interface with granular objects as the +new media. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ Precision-Focused Reinforcement Learning Model for Robotic Object + Pushing + + +
+ Non-prehensile manipulation, such as pushing objects to a desired target +position, is an important skill for robots to assist humans in everyday +situations. However, the task is challenging due to the large variety of +objects with different and sometimes unknown physical properties, such as +shape, size, mass, and friction. This can lead to the object overshooting its +target position, requiring fast corrective movements of the robot around the +object, especially in cases where objects need to be precisely pushed. In this +paper, we improve the state-of-the-art by introducing a new memory-based +vision-proprioception RL model to push objects more precisely to target +positions using fewer corrective movements. + +
+
+
+
+
+ + ☆ Lo-MARVE: A Low Cost Autonomous Underwater Vehicle for Marine + Exploration + + +
+ This paper presents Low-cost Marine Autonomous Robotic Vehicle Explorer +(Lo-MARVE), a novel autonomous underwater vehicle (AUV) designed to provide a +low cost solution for underwater exploration and environmental monitoring in +shallow water environments. Lo-MARVE offers a cost-effective alternative to +existing AUVs, featuring a modular design, low-cost sensors, and wireless +communication capabilities. The total cost of Lo-MARVE is approximately EUR +500. Lo-MARVE is developed using the Raspberry Pi 4B microprocessor, with +control software written in Python. The proposed AUV was validated through +field testing outside of a laboratory setting, in the freshwater environment of +the River Corrib in Galway, Ireland. This demonstrates its ability to navigate +autonomously, collect data, and communicate effectively outside of a controlled +laboratory setting. The successful deployment of Lo-MARVE in a real-world +environment validates its proof of concept. + +
+
+ comment: This paper was presented at the 12th International Conference on + Control, Mechatronics and Automation (ICCMA 2024), held in London, UK, from + November 11-13, 2024 +
+
+
+
+
+ + ☆ XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL + + +
+ To tackle the challenges of large language model performance in natural +language to SQL tasks, we introduce XiYan-SQL, an innovative framework that +employs a multi-generator ensemble strategy to improve candidate generation. We +introduce M-Schema, a semi-structured schema representation method designed to +enhance the understanding of database structures. To enhance the quality and +diversity of generated candidate SQL queries, XiYan-SQL integrates the +significant potential of in-context learning (ICL) with the precise control of +supervised fine-tuning. On one hand, we propose a series of training strategies +to fine-tune models to generate high-quality candidates with diverse +preferences. On the other hand, we implement the ICL approach with an example +selection method based on named entity recognition to prevent overemphasis on +entities. The refiner optimizes each candidate by correcting logical or +syntactical errors. To address the challenge of identifying the best candidate, +we fine-tune a selection model to distinguish nuances of candidate SQL queries. +The experimental results on multiple dialect datasets demonstrate the +robustness of XiYan-SQL in addressing challenges across different scenarios. +Overall, our proposed XiYan-SQL achieves the state-of-the-art execution +accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on +NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. +The proposed framework not only enhances the quality and diversity of SQL +queries but also outperforms previous methods. + +
+
+
+
+
+ + ☆ DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning + Methods NeurIPS 2024 + + +
+ Assessing the quality of aleatoric uncertainty estimates from uncertainty +quantification (UQ) deep learning methods is important in scientific contexts, +where uncertainty is physically meaningful and important to characterize and +interpret exactly. We systematically compare aleatoric uncertainty measured by +two UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER). +Our method focuses on both zero-dimensional (0D) and two-dimensional (2D) data, +to explore how the UQ methods function for different data dimensionalities. We +investigate uncertainty injected on the input and output variables and include +a method to propagate uncertainty in the case of input uncertainty so that we +can compare the predicted aleatoric uncertainty to the known values. We +experiment with three levels of noise. The aleatoric uncertainty predicted +across all models and experiments scales with the injected noise level. +However, the predicted uncertainty is miscalibrated to $\rm{std}(\sigma_{\rm +al})$ with the true uncertainty for half of the DE experiments and almost all +of the DER experiments. The predicted uncertainty is the least accurate for +both UQ methods for the 2D input uncertainty experiment and the high-noise +level. While these results do not apply to more complex data, they highlight +that further research on post-facto calibration for these methods would be +beneficial, particularly for high-noise and high-dimensional settings. + +
+
+ comment: Accepted to the Machine Learning for Physical Sciences workshop at + NeurIPS 2024; 11 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Optimizing Automatic Summarization of Long Clinical Records Using + Dynamic Context Extension:Testing and Evaluation of the NBCE Method + + +
+ Summarizing patient clinical notes is vital for reducing documentation +burdens. Current manual summarization is a struggle for medical staff. We propose +an automatic method using LLMs, but long inputs cause LLMs to lose context, +reducing output quality, especially in small models. We used a 7B model, +open-calm-7b, enhanced with Naive Bayes-based Context Extension (NBCE) and a redesigned +decoding mechanism to reference one sentence at a time, keeping inputs within +the 2048-token context window. Our improved model achieved near parity with +Google's over 175B Gemini on ROUGE-L metrics with 200 samples, indicating +strong performance using fewer resources, enhancing automated EMR summarization +feasibility. + +
+
+
+
+
+ + ☆ An Empirical Examination of the Evaluative AI Framework + + +
+ This study empirically examines the "Evaluative AI" framework, which aims to +enhance the decision-making process for AI users by transitioning from a +recommendation-based approach to a hypothesis-driven one. Rather than offering +direct recommendations, this framework presents users pro and con evidence for +hypotheses to support more informed decisions. However, findings from the +current behavioral experiment reveal no significant improvement in +decision-making performance and limited user engagement with the evidence +provided, resulting in cognitive processes similar to those observed in +traditional AI systems. Despite these results, the framework still holds +promise for further exploration in future research. + +
+
+
+
+
+ + ☆ Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors + + +
+ The application of machine learning (ML) algorithms in the intelligent +diagnosis of three-phase engines has the potential to significantly enhance +diagnostic performance and accuracy. Traditional methods largely rely on +signature analysis, which, despite being a standard practice, can benefit from +the integration of advanced ML techniques. In our study, we innovate by +combining state of the art algorithms with a novel unsupervised anomaly +generation methodology that takes into account physics model of the engine. +This hybrid approach leverages the strengths of both supervised ML and +unsupervised signature analysis, achieving superior diagnostic accuracy and +reliability along with a wide industrial application. Our experimental results +demonstrate that this method significantly outperforms existing ML and non-ML +state-of-the-art approaches while retaining the practical advantages of an +unsupervised methodology. The findings highlight the potential of our approach +to significantly contribute to the field of engine diagnostics, offering a +robust and efficient solution for real-world applications. + +
+
+
+
+
+ + ☆ Leveraging LLMs for Predictive Insights in Food Policy and Behavioral + Interventions + + +
+ Food consumption and production contribute significantly to global greenhouse +gas emissions, making them crucial entry points for mitigating climate change +and maintaining a liveable planet. Over the past two decades, food policy +initiatives have explored interventions to reshape production and consumption +patterns, focusing on reducing food waste and curbing ruminant meat +consumption. While the evidence of "what works" improves, evaluating which +policies are appropriate and effective in specific contexts remains difficult +due to external validity challenges. This paper demonstrates that a fine-tuned +large language model (LLM) can accurately predict the direction of outcomes in +approximately 80\% of empirical studies measuring dietary-based impacts (e.g. +food choices, sales, waste) resulting from behavioral interventions and +policies. Approximately 75 prompts were required to achieve optimal results, +with performance showing signs of catastrophic loss beyond this point. Our +findings indicate that greater input detail enhances predictive accuracy, +although the model still faces challenges with unseen studies, underscoring the +importance of a representative training sample. As LLMs continue to improve and +diversify, they hold promise for advancing data-driven, evidence-based +policymaking. + +
+
+
+
+
+ + ☆ Neural Corrective Machine Unranking + + +
+ Machine unlearning in neural information retrieval (IR) systems requires +removing specific data whilst maintaining model performance. Applying existing +machine unlearning methods to IR may compromise retrieval effectiveness or +inadvertently expose unlearning actions due to the removal of particular items +from the retrieved results presented to users. We formalise corrective +unranking, which extends machine unlearning in (neural) IR context by +integrating substitute documents to preserve ranking integrity, and propose a +novel teacher-student framework, Corrective unRanking Distillation (CuRD), for +this task. CuRD (1) facilitates forgetting by adjusting the (trained) neural IR +model such that its output relevance scores of to-be-forgotten samples mimic +those of low-ranking, non-retrievable samples; (2) enables correction by +fine-tuning the relevance scores for the substitute samples to match those of +corresponding to-be-forgotten samples closely; (3) seeks to preserve +performance on samples that are not targeted for forgetting. We evaluate CuRD +on four neural IR models (BERTcat, BERTdot, ColBERT, PARADE) using MS MARCO and +TREC CAR datasets. Experiments with forget set sizes from 1 % and 20 % of the +training dataset demonstrate that CuRD outperforms seven state-of-the-art +baselines in terms of forgetting and correction while maintaining model +retention and generalisation capabilities. + +
+
+ comment: submitted to Information Sciences +
+
+
+
+
+ + ☆ LogLLM: Log-based Anomaly Detection Using Large Language Models + + +
+ Software systems often record important runtime information in logs to help +with troubleshooting. Log-based anomaly detection has become a key research +area that aims to identify system issues through log data, ultimately enhancing +the reliability of software systems. Traditional deep learning methods often +struggle to capture the semantic information embedded in log data, which is +typically organized in natural language. In this paper, we propose LogLLM, a +log-based anomaly detection framework that leverages large language models +(LLMs). LogLLM employs BERT for extracting semantic vectors from log messages, +while utilizing Llama, a transformer decoder-based model, for classifying log +sequences. Additionally, we introduce a projector to align the vector +representation spaces of BERT and Llama, ensuring a cohesive understanding of +log semantics. Unlike conventional methods that require log parsers to extract +templates, LogLLM preprocesses log messages with regular expressions, +streamlining the entire process. Our framework is trained through a novel +three-stage procedure designed to enhance performance and adaptability. +Experimental results across four public datasets demonstrate that LogLLM +outperforms state-of-the-art methods. Even when handling unstable logs, it +effectively captures the semantic meaning of log messages and detects anomalies +accurately. + +
+
+
+
+
+ + ☆ Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with + Variational Quantum Circuits + + +
+ Quantum Machine Learning (QML) offers tremendous potential but is currently +limited by the availability of qubits. We introduce an innovative approach that +utilizes pre-trained neural networks to enhance Variational Quantum Circuits +(VQC). This technique effectively separates approximation error from qubit +count and removes the need for restrictive conditions, making QML more viable +for real-world applications. Our method significantly improves parameter +optimization for VQC while delivering notable gains in representation and +generalization capabilities, as evidenced by rigorous theoretical analysis and +extensive empirical testing on quantum dot classification tasks. Moreover, our +results extend to applications such as human genome analysis, demonstrating the +broad applicability of our approach. By addressing the constraints of current +quantum hardware, our work paves the way for a new era of advanced QML +applications, unlocking the full potential of quantum computing in fields such +as machine learning, materials science, medicine, mimetics, and various +interdisciplinary areas. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Deeper Insights into Learning Performance of Stochastic Configuration + Networks + + +
+ Stochastic Configuration Networks (SCNs) are a class of randomized neural +networks that integrate randomized algorithms within an incremental learning +framework. A defining feature of SCNs is the supervisory mechanism, which +adaptively adjusts the distribution to generate effective random basis +functions, thereby enabling error-free learning. In this paper, we present a +comprehensive analysis of the impact of the supervisory mechanism on the +learning performance of SCNs. Our findings reveal that the current SCN +framework evaluates the effectiveness of each random basis function in reducing +residual errors using a lower bound on its error reduction potential, which +constrains SCNs' overall learning efficiency. Specifically, SCNs may fail to +consistently select the most effective random candidate as the new basis +function during each training iteration. To overcome this problem, we propose a +novel method for evaluating the hidden layer's output matrix, supported by a +new supervisory mechanism that accurately assesses the error reduction +potential of random basis functions without requiring the computation of the +Moore-Penrose inverse of the output matrix. This approach enhances the +selection of basis functions, reducing computational complexity and improving +the overall scalability and learning capabilities of SCNs. We introduce a +Recursive Moore-Penrose Inverse-SCN (RMPI-SCN) training scheme based on the new +supervisory mechanism and demonstrate its effectiveness through simulations +over some benchmark datasets. Experiments show that RMPI-SCN outperforms the +conventional SCN in terms of learning capability, underscoring its potential to +advance the SCN framework for large-scale data modeling applications. + +
+
+
+
+
+ + ☆ MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal + Lymphatic Vessel Segmentation ML4H 2024 + + +
+ Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste +products from the human brain. An impairment in their functionality has been +associated with aging as well as brain disorders like multiple sclerosis and +Alzheimer's disease. However, MLVs have only recently been described for the +first time in magnetic resonance imaging (MRI), and their ramified structure +renders manual segmentation particularly difficult. Further, as there is no +consistent notion of their appearance, human-annotated MLV structures contain a +high inter-rater variability that most automatic segmentation methods cannot +take into account. In this work, we propose a new rater-aware training scheme +for the popular nnU-Net model, and we explore rater-based ensembling strategies +for accurate and consistent segmentation of MLVs. This enables us to boost +nnU-Net's performance while obtaining explicit predictions in different +annotation styles and a rater-based uncertainty estimation. Our final model, +MLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to +the human reference standard. The model further matches the human inter-rater +reliability and replicates age-related associations with MLV volume. + +
+
+ comment: ML4H 2024 +
+
+
+
+
+ + ☆ ACROSS: A Deformation-Based Cross-Modal Representation for Robotic + Tactile Perception ICRA2025 + + +
+ Tactile perception is essential for human interaction with the environment +and is becoming increasingly crucial in robotics. Tactile sensors like the +BioTac mimic human fingertips and provide detailed interaction data. Despite +its utility in applications like slip detection and object identification, this +sensor is now deprecated, making many existing valuable datasets obsolete. +However, recreating similar datasets with newer sensor technologies is both +tedious and time-consuming. Therefore, it is crucial to adapt these existing +datasets for use with new setups and modalities. In response, we introduce +ACROSS, a novel framework for translating data between tactile sensors by +exploiting sensor deformation information. We demonstrate the approach by +translating BioTac signals into the DIGIT sensor. Our framework consists of +first converting the input signals into 3D deformation meshes. We then +transition from the 3D deformation mesh of one sensor to the mesh of another, +and finally convert the generated 3D deformation mesh into the corresponding +output space. We demonstrate our approach to the most challenging problem of +going from a low-dimensional tactile representation to a high-dimensional one. +In particular, we transfer the tactile signals of a BioTac sensor to DIGIT +tactile images. Our approach enables the continued use of valuable datasets and +the exchange of data between groups with different setups. + +
+
+ comment: Paper Submitted to ICRA2025. arXiv admin note: text overlap with + arXiv:2410.14310 +
+
+
+
+
+ + ☆ Gendered Words and Grant Rates: A Textual Analysis of Disparate Outcomes + in the Patent System + + +
+ This study examines gender disparities in patent law by analyzing the textual +content of patent applications. While prior research has primarily focused on +the study of metadata (i.e., filing year or technological class), we employ +machine learning and natural language processing techniques to derive latent +information from patent texts. In particular, these methods are used to predict +inventor gender based on textual characteristics. We find that gender can be +identified with notable accuracy - even without knowing the inventor's name. +This ability to discern gender through text suggests that anonymized patent +examination - often proposed as a solution to mitigate disparities in patent +grant rate - may not fully address gender-specific outcomes in securing a +patent. Our analysis additionally identifies gendered differences in textual +choices within patent documents and the fields in which inventors choose to +work. These findings highlight the complex interaction between textual choices, +gender, and success in securing a patent. As discussed herein, this raises +critical questions about the efficacy of current proposals aimed at achieving +gender parity and efficiency in the patent system. + +
+
+
+
+
+ + ☆ SAD-TIME: a Spatiotemporal-fused network for depression detection with + Automated multi-scale Depth-wise and TIME-interval-related common feature + extractor + + +
+ Background and Objective: Depression is a severe mental disorder, and +accurate diagnosis is pivotal to the cure and rehabilitation of people with +depression. However, the current questionnaire-based diagnostic methods could +bring subjective biases and may be denied by subjects. In search of a more +objective means of diagnosis, researchers have begun to experiment with deep +learning-based methods for identifying depressive disorders in recent years. +Methods: In this study, a novel Spatiotemporal-fused network with Automated +multi-scale Depth-wise and TIME-interval-related common feature extractor +(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common +features extractor (CFE), a spatial sector (SpS), a modified temporal sector +(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale +depth-wise 1D-convolutional neural network and a time-interval embedding +generator, where the unique information of each channel is preserved. The SpS +fuses the functional connectivity with the distance-based connectivity +containing spatial position of EEG electrodes. A multi-head-attention graph +convolutional network is also applied in the SpS to fuse the features from +different EEG channels. The TeS is based on long short-term memory and graph +transformer networks, where the temporal information of different time-windows +is fused. Moreover, the DAL is used after the SpS to obtain the +domain-invariant feature. Results: Experimental results under tenfold +cross-validation show that the proposed SAD-TIME method achieves 92.00% and +94.00% depression classification accuracies on two datasets, respectively, in +cross-subject mode. Conclusion: SAD-TIME is a robust depression detection +model, where the automatedly-generated features, the SpS and the TeS assist the +classification performance with the fusion of the innate spatiotemporal +information in the EEG signals. + +
+
+ comment: 21pages, 7 figures +
+
+
+
+
+ + ☆ Explainers' Mental Representations of Explainees' Needs in Everyday + Explanations + + +
+ In explanations, explainers have mental representations of explainees' +developing knowledge and shifting interests regarding the explanandum. These +mental representations are dynamic in nature and develop over time, thereby +enabling explainers to react to explainees' needs by adapting and customizing +the explanation. XAI should be able to react to explainees' needs in a similar +manner. Therefore, a component that incorporates aspects of explainers' mental +representations of explainees is required. In this study, we took first steps +by investigating explainers' mental representations in everyday explanations of +technological artifacts. According to the dual nature theory, technological +artifacts require explanations with two distinct perspectives, namely +observable and measurable features addressing "Architecture" or interpretable +aspects addressing "Relevance". We conducted extended semi structured pre-, +post- and video recall-interviews with explainers (N=9) in the context of an +explanation. The transcribed interviews were analyzed utilizing qualitative +content analysis. The explainers' answers regarding the explainees' knowledge +and interests with regard to the technological artifact emphasized the +vagueness of early assumptions of explainers toward strong beliefs in the +course of explanations. The assumed knowledge of explainees in the beginning is +centered around Architecture and develops toward knowledge with regard to both +Architecture and Relevance. In contrast, explainers assumed higher interests in +Relevance in the beginning to interests regarding both Architecture and +Relevance in the further course of explanations. Further, explainers often +finished the explanation despite their perception that explainees still had +gaps in knowledge. These findings are transferred into practical implications +relevant for user models for adaptive explainable systems. + +
+
+
+
+
+ + ☆ An Information Theoretic Approach to Operationalize Right to Data + Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ☆ Towards Objective and Unbiased Decision Assessments with LLM-Enhanced + Hierarchical Attention Networks + + +
+ How objective and unbiased are we while making decisions? This work +investigates cognitive bias identification in the high-stakes decision-making +process of human experts, questioning its effectiveness in real-world settings, +such as candidate assessments for university admission. We begin with a +statistical analysis assessing correlations among different decision points +in the current process, which discovers discrepancies that imply +cognitive bias and inconsistency in decisions. This motivates our exploration +of bias-aware AI-augmented workflows that surpass human judgment. We propose +BGM-HAN, a hierarchical attention network enhanced by byte-pair encoding, +multi-head attention and gated residual connections. Using it as the backbone model, +we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which +simulates real-world decision-making. In our experiments, both the proposed +model and the agentic workflow significantly improve on both human judgment +and alternative models, validated with real-world data. + +
+
+
+
+
+ + ☆ Learning Model Agnostic Explanations via Constraint Programming + + +
+ Interpretable Machine Learning faces a recurring challenge of explaining the +predictions made by opaque classifiers such as ensemble models, kernel methods, +or neural networks in terms that are understandable to humans. When the model +is viewed as a black box, the objective is to identify a small set of features +that jointly determine the black box response with minimal error. However, +finding such model-agnostic explanations is computationally demanding, as the +problem is intractable even for binary classifiers. In this paper, the task is +framed as a Constraint Optimization Problem, where the constraint solver seeks +an explanation of minimum error and bounded size for an input data instance and +a set of samples generated by the black box. From a theoretical perspective, +this constraint programming approach offers PAC-style guarantees for the output +explanation. We evaluate the approach empirically on various datasets and show +that it statistically outperforms the state-of-the-art heuristic Anchors +method. + +
+
+
+
+
+ + ☆ Building Trustworthy AI: Transparent AI Systems via Large Language + Models, Ontologies, and Logical Reasoning (TranspNet) + + +
+ Growing concerns over the lack of transparency in AI, particularly in +high-stakes fields like healthcare and finance, drive the need for explainable +and trustworthy systems. While Large Language Models (LLMs) perform +exceptionally well in generating accurate outputs, their "black box" nature +poses significant challenges to transparency and trust. To address this, the +paper proposes the TranspNet pipeline, which integrates symbolic AI with LLMs. +By leveraging domain expert knowledge, retrieval-augmented generation (RAG), +and formal reasoning frameworks like Answer Set Programming (ASP), TranspNet +enhances LLM outputs with structured reasoning and verification. This approach +ensures that AI systems deliver not only accurate but also explainable and +trustworthy results, meeting regulatory demands for transparency and +accountability. TranspNet provides a comprehensive solution for developing AI +systems that are reliable and interpretable, making it suitable for real-world +applications where trust is critical. + +
+
+
+
+
+ + ☆ Crystal Structure Generation Based On Material Properties + + +
+ The discovery of new materials is very important to the field of materials +science. When researchers explore new materials, they often have expected +performance requirements for their crystal structure. In recent years, +data-driven methods have made great progress in the direction of crystal +structure generation, but there is still a lack of methods that can effectively +map material properties to crystal structure. In this paper, we propose a +Crystal DiT model to generate the crystal structure from the expected material +properties by embedding the material properties and combining the symmetry +information predicted by the large language model. Experimental verification +shows that our proposed method has good performance. + +
+
+
+
+
+ + ☆ Symbolic-AI-Fusion Deep Learning (SAIF-DL): Encoding Knowledge into + Training with Answer Set Programming Loss Penalties by a Novel Loss Function + Approach + + +
+ This paper presents a hybrid methodology that enhances the training process +of deep learning (DL) models by embedding domain expert knowledge using +ontologies and answer set programming (ASP). By integrating these symbolic AI +methods, we encode domain-specific constraints, rules, and logical reasoning +directly into the model's learning process, thereby improving both performance +and trustworthiness. The proposed approach is flexible and applicable to both +regression and classification tasks, demonstrating generalizability across +various fields such as healthcare, autonomous systems, engineering, and battery +manufacturing applications. Unlike other state-of-the-art methods, the strength +of our approach lies in its scalability across different domains. The design +allows for the automation of the loss function by simply updating the ASP +rules, making the system highly scalable and user-friendly. This facilitates +seamless adaptation to new domains without significant redesign, offering a +practical solution for integrating expert knowledge into DL models in +industrial settings such as battery manufacturing. + +
+
+
+
+
+ + ☆ Trap-MID: Trapdoor-based Defense against Model Inversion Attacks NeurIPS + + +
+ Model Inversion (MI) attacks pose a significant threat to the privacy of Deep +Neural Networks by recovering training data distribution from well-trained +models. While existing defenses often rely on regularization techniques to +reduce information leakage, they remain vulnerable to recent attacks. In this +paper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to +mislead MI attacks. A trapdoor is integrated into the model to predict a +specific label when the input is injected with the corresponding trigger. +Consequently, this trapdoor information serves as the "shortcut" for MI +attacks, leading them to extract trapdoor triggers rather than private data. We +provide theoretical insights into the impacts of trapdoor's effectiveness and +naturalness on deceiving MI attacks. In addition, empirical experiments +demonstrate the state-of-the-art defense performance of Trap-MID against +various MI attacks without the requirements for extra data or large +computational overhead. Our source code is publicly available at +https://github.com/ntuaislab/Trap-MID. + +
+
+ comment: Accepted by Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Learning Dynamic Cognitive Map with Autonomous Navigation + + +
+ Inspired by animal navigation strategies, we introduce a novel computational +model to navigate and map a space rooted in biologically inspired principles. +Animals exhibit extraordinary navigation prowess, harnessing memory, +imagination, and strategic decision-making to traverse complex and aliased +environments adeptly. Our model aims to replicate these capabilities by +incorporating a dynamically expanding cognitive map over predicted poses within +an Active Inference framework, enhancing our agent's generative model +plasticity to novelty and environmental changes. Through structure learning and +active inference navigation, our model demonstrates efficient exploration and +exploitation, dynamically expanding its model capacity in response to +anticipated novel un-visited locations and updating the map given new evidence +contradicting previous beliefs. Comparative analyses in mini-grid environments +with the Clone-Structured Cognitive Graph model (CSCG), which shares similar +objectives, highlight our model's ability to rapidly learn environmental +structures within a single episode, with minimal navigation overlap. Our model +achieves this without prior knowledge of observation and world dimensions, +underscoring its robustness and efficacy in navigating intricate environments. + +
+
+ comment: under submission at Frontiers in Computational Neuroscience +
+
+
+
+
+ + ☆ Towards Optimizing a Retrieval Augmented Generation using Large Language + Model on Academic Data + + +
+ Given the growing trend of many organizations integrating Retrieval Augmented +Generation (RAG) into their operations, we assess RAG on domain-specific data +and test state-of-the-art models across various optimization techniques. We +incorporate four optimizations: Multi-Query, Child-Parent-Retriever, Ensemble +Retriever, and In-Context-Learning, to enhance the functionality and +performance in the academic domain. We focus on data retrieval, specifically +targeting various study programs at a large technical university. We +additionally introduce a novel evaluation approach, the RAG Confusion Matrix, +designed to assess the effectiveness of various configurations within the RAG +framework. By exploring the integration of both open-source (e.g., Llama2, +Mistral) and closed-source (GPT-3.5 and GPT-4) Large Language Models, we offer +valuable insights into the application and optimization of RAG frameworks in +domain-specific contexts. Our experiments show a significant performance +increase when including multi-query in the retrieval phase. + +
+
+
+
+
+ + ☆ 3D Multi-Object Tracking with Semi-Supervised GRU-Kalman Filter + + +
+ 3D Multi-Object Tracking (MOT), a fundamental component of environmental +perception, is essential for intelligent systems like autonomous driving and +robotic sensing. Although Tracking-by-Detection (TBD) frameworks have demonstrated +excellent performance in recent years, their application in real-world +scenarios faces significant challenges. Object movement in complex environments +is often highly nonlinear, while existing methods typically rely on linear +approximations of motion. Furthermore, system noise is frequently modeled as a +Gaussian distribution, which fails to capture the true complexity of the noise +dynamics. These oversimplified modeling assumptions can lead to significant +reductions in tracking precision. To address this, we propose a GRU-based MOT +method, which introduces a learnable Kalman filter into the motion module. This +approach is able to learn object motion characteristics through data-driven +learning, thereby avoiding the need for manual model design and model error. At +the same time, to avoid abnormal supervision caused by the wrong association +between annotations and trajectories, we design a semi-supervised learning +strategy to accelerate the convergence speed and improve the robustness of the +model. Evaluation experiments on the nuScenes and Argoverse2 datasets +demonstrate that our system exhibits superior performance and significant +potential compared to traditional TBD methods. + +
+
+
+
+
+ + ☆ One STEP at a time: Language Agents are Stepwise Planners + + +
+ Language agents have shown promising adaptability in dynamic environments to +perform complex tasks. However, despite the versatile knowledge embedded in +large language models, these agents still fall short when it comes to tasks +that require planning. We introduce STEP, a novel framework designed to +efficiently learn from previous experiences to enhance the planning +capabilities of language agents in future steps. Concretely, STEP functions +through four interconnected components. First, the Planner takes on the task, +breaks it down into subtasks and provides relevant insights. Then the Executor +generates action candidates, while the Evaluator ensures the actions align with +learned rules from previous experiences. Lastly, Memory stores experiences to +inform future decisions. In the ScienceWorld benchmark, our results show that +STEP consistently outperforms state-of-the-art models, achieving an overall +score of 67.4 and successfully completing 12 out of 18 tasks. These findings +highlight STEP's potential as a framework for enhancing planning capabilities +in language agents, paving the way for more sophisticated task-solving in +dynamic environments. + +
+
+
+
+
+ + ☆ A Heterogeneous Graph Neural Network Fusing Functional and Structural + Connectivity for MCI Diagnosis + + +
+ Brain connectivity alterations associated with brain disorders have been +widely reported in resting-state functional imaging (rs-fMRI) and diffusion +tensor imaging (DTI). While many dual-modal fusion methods based on graph +neural networks (GNNs) have been proposed, they generally follow homogeneous +fusion approaches that ignore the rich heterogeneity of dual-modal information. To address +this issue, we propose a novel method that integrates functional and structural +connectivity based on heterogeneous graph neural networks (HGNNs) to better +leverage the rich heterogeneity in dual-modal images. First, we use blood +oxygen level dependency and white matter structure information provided by +rs-fMRI and DTI to establish homo-meta-path, capturing node relationships +within the same modality. At the same time, we propose to establish +hetero-meta-path based on structure-function coupling and brain community +searching to capture relations among cross-modal nodes. Secondly, we further +introduce a heterogeneous graph pooling strategy that automatically balances +homo- and hetero-meta-path, effectively leveraging heterogeneous information +and preventing feature confusion after pooling. Thirdly, based on the +flexibility of heterogeneous graphs, we propose a heterogeneous graph data +augmentation approach that can conveniently address the sample imbalance issue +commonly seen in clinical diagnosis. We evaluate our method on the ADNI-3 dataset +for mild cognitive impairment (MCI) diagnosis. Experimental results indicate +the proposed method is effective and superior to other algorithms, with a mean +classification accuracy of 93.3%. + +
+
+
+
+
+ + ☆ Enhanced Classroom Dialogue Sequences Analysis with a Hybrid AI Agent: + Merging Expert Rule-Base with Large Language Models + + +
+ Classroom dialogue plays a crucial role in fostering student engagement and +deeper learning. However, analysing dialogue sequences has traditionally relied +on either theoretical frameworks or empirical descriptions of practice, with +limited integration between the two. This study addresses this gap by +developing a comprehensive rule base of dialogue sequences and an Artificial +Intelligence (AI) agent that combines expert-informed rule-based systems with a +large language model (LLM). The agent applies expert knowledge while adapting +to the complexities of natural language, enabling accurate and flexible +categorisation of classroom dialogue sequences. By synthesising findings from +over 30 studies, we established a comprehensive framework for dialogue +analysis. The agent was validated against human expert coding, achieving high +levels of precision and reliability. The results demonstrate that the agent +provides theory-grounded and adaptive functions, tremendously enhancing the +efficiency and scalability of classroom dialogue analysis, offering significant +potential in improving classroom teaching practices and supporting teacher +professional development. + +
+
+
+
+
+ + ☆ Material Property Prediction with Element Attribute Knowledge Graphs and + Multimodal Representation Learning + + +
+ Machine learning has become a crucial tool for predicting the properties of +crystalline materials. However, existing methods primarily represent material +information by constructing multi-edge graphs of crystal structures, often +overlooking the chemical and physical properties of elements (such as atomic +radius, electronegativity, melting point, and ionization energy), which have a +significant impact on material performance. To address this limitation, we +first constructed an element property knowledge graph and utilized an embedding +model to encode the element attributes within the knowledge graph. Furthermore, +we propose a multimodal fusion framework, ESNet, which integrates element +property features with crystal structure features to generate joint multimodal +representations. This provides a more comprehensive perspective for predicting +the performance of crystalline materials, enabling the model to consider both +microstructural composition and chemical characteristics of the materials. We +conducted experiments on the Materials Project benchmark dataset, which showed +leading performance in the bandgap prediction task and achieved results on a +par with existing benchmarks in the formation energy prediction task. + +
+
+
+
+
+ + ☆ DiVR: incorporating context from diverse VR scenes for human trajectory + prediction + + +
+ Virtual environments provide a rich and controlled setting for collecting +detailed data on human behavior, offering unique opportunities for predicting +human trajectories in dynamic scenes. However, most existing approaches have +overlooked the potential of these environments, focusing instead on static +contexts without considering user-specific factors. Employing the CREATTIVE3D +dataset, our work models trajectories recorded in virtual reality (VR) scenes +for diverse situations including road-crossing tasks with user interactions and +simulated visual impairments. We propose Diverse Context VR Human Motion +Prediction (DiVR), a cross-modal transformer based on the Perceiver +architecture that integrates both static and dynamic scene context using a +heterogeneous graph convolution network. We conduct extensive experiments +comparing DiVR against existing architectures including MLP, LSTM, and +transformers with gaze and point cloud context. Additionally, we also stress +test our model's generalizability across different users, tasks, and scenes. +Results show that DiVR achieves higher accuracy and adaptability compared to +other models and to static graphs. This work highlights the advantages of using +VR datasets for context-aware human trajectory modeling, with potential +applications in enhancing user experiences in the metaverse. Our source code is +publicly available at https://gitlab.inria.fr/ffrancog/creattive3d-divr-model. + +
+
+
+
+
+ + ☆ BAMAX: Backtrack Assisted Multi-Agent Exploration using Reinforcement + Learning + + +
+ Autonomous robots collaboratively exploring an unknown environment is still +an open problem. The problem has its roots in coordination among non-stationary +agents, each with only a partial view of information. The problem is compounded +when the multiple robots must completely explore the environment. In this +paper, we introduce Backtrack Assisted Multi-Agent Exploration using +Reinforcement Learning (BAMAX), a method for collaborative exploration in +multi-agent systems which attempts to explore an entire virtual environment. As +in the name, BAMAX leverages backtrack assistance to enhance the performance of +agents in exploration tasks. To evaluate BAMAX against traditional approaches, +we present the results of experiments conducted across multiple hexagonal-shaped +grid sizes, ranging from 10x10 to 60x60. The results demonstrate that +BAMAX outperforms other methods in terms of faster coverage and less +backtracking across these environments. + +
+
+
+
+
+ + ☆ RLInspect: An Interactive Visual Approach to Assess Reinforcement + Learning Algorithm + + +
+ Reinforcement Learning (RL) is a rapidly growing area of machine learning +that finds its application in a broad range of domains, from finance and +healthcare to robotics and gaming. Compared to other machine learning +techniques, RL agents learn from their own experiences using trial and error, +and improve their performance over time. However, assessing RL models can be +challenging, which makes it difficult to interpret their behaviour. While +reward is a widely used metric to evaluate RL models, it may not always provide +an accurate measure of training performance. In some cases, the reward may seem +increasing while the model's performance is actually decreasing, leading to +misleading conclusions about the effectiveness of the training. To overcome +this limitation, we have developed RLInspect - an interactive visual analytic +tool, that takes into account different components of the RL model - state, +action, agent architecture and reward, and provides a more comprehensive view +of the RL training. By using RLInspect, users can gain insights into the +model's behaviour, identify issues during training, and potentially correct +them effectively, leading to a more robust and reliable RL system. + +
+
+
+
+
+ + ☆ Physics Informed Distillation for Diffusion Models + + +
+ Diffusion models have recently emerged as a potent tool in generative +modeling. However, their inherent iterative nature often results in sluggish +image generation due to the requirement for multiple model evaluations. Recent +progress has unveiled the intrinsic link between diffusion models and +Probability Flow Ordinary Differential Equations (ODEs), thus enabling us to +conceptualize diffusion models as ODE systems. Simultaneously, Physics Informed +Neural Networks (PINNs) have substantiated their effectiveness in solving +intricate differential equations through implicit modeling of their solutions. +Building upon these foundational insights, we introduce Physics Informed +Distillation (PID), which employs a student model to represent the solution of +the ODE system corresponding to the teacher diffusion model, akin to the +principles employed in PINNs. Through experiments on CIFAR 10 and ImageNet +64x64, we observe that PID achieves performance comparable to recent +distillation methods. Notably, it demonstrates predictable trends concerning +method-specific hyperparameters and eliminates the need for synthetic dataset +generation during the distillation process. Both of which contribute to its +easy-to-use nature as a distillation approach for Diffusion Models. Our code +and pre-trained checkpoint are publicly available at: +https://github.com/pantheon5100/pid_diffusion.git. + +
+
+
+
+
+ + ☆ Developing an Effective Training Dataset to Enhance the Performance of + AI-based Speaker Separation Systems + + +
+ This paper addresses the challenge of speaker separation, which remains an +active research topic despite the promising results achieved in recent years. +These results, however, often degrade in real recording conditions due to the +presence of noise, echo, and other interferences. This is because neural models +are typically trained on synthetic datasets consisting of mixed audio signals +and their corresponding ground truths, which are generated using computer +software and do not fully represent the complexities of real-world recording +scenarios. The lack of realistic training sets for speaker separation remains a +major hurdle, as obtaining individual sounds from mixed audio signals is a +nontrivial task. To address this issue, we propose a novel method for +constructing a realistic training set that includes mixture signals and +corresponding ground truths for each speaker. We evaluate this dataset on a +deep learning model and compare it to a synthetic dataset. We got a 1.65 dB +improvement in Scale Invariant Signal to Distortion Ratio (SI-SDR) for speaker +separation accuracy in realistic mixing. Our findings highlight the potential +of realistic training sets for enhancing the performance of speaker separation +models in real-world scenarios. + +
+
+ comment: in Arabic language +
+
+
+
+
+ + ☆ A Fuzzy Reinforcement LSTM-based Long-term Prediction Model for Fault + Conditions in Nuclear Power Plants + + +
+ Early fault detection and timely maintenance scheduling can significantly +mitigate operational risks in nuclear power plants (NPPs) and enhance the reliability of operator +decision-making. Therefore, it is necessary to develop an efficient Prognostics +and Health Management (PHM) multi-step prediction model for predicting +system health status and prompt execution of maintenance operations. In this +study, we propose a novel predictive model that integrates reinforcement +learning with Long Short-Term Memory (LSTM) neural networks and the Expert +Fuzzy Evaluation Method. The model is validated using parameter data for 20 +different breach sizes in the Main Steam Line Break (MSLB) accident condition +of the CPR1000 pressurized water reactor simulation model and it demonstrates a +remarkable capability in accurately forecasting NPP parameter changes up to 128 +steps ahead (with a time interval of 10 seconds per step, i.e., 1280 seconds), +thereby satisfying the temporal advance requirement for fault prognostics in +NPPs. Furthermore, this method provides an effective reference solution for PHM +applications such as anomaly detection and remaining useful life prediction. + +
+
+
+
+
+ + ☆ Surprisingly Popular Voting for Concentric Rank-Order Models + + +
+ An important problem on social information sites is the recovery of ground +truth from individual reports when the experts are in the minority. The wisdom +of the crowd, i.e., the collective opinion of a group of individuals, fails in +such a scenario. However, the surprisingly popular (SP) +algorithm (Prelec et al., 2017) can recover the ground truth even when the +experts are in the minority, by asking the individuals to report additional +prediction reports--their beliefs about the reports of others. Several recent +works have extended the surprisingly popular algorithm to an equivalent voting +rule (SP-voting) to recover the ground truth ranking over a set of $m$ +alternatives. However, we are yet to fully understand when SP-voting can +recover the ground truth ranking, and if so, how many samples (votes and +predictions) it needs. We answer this question by proposing two rank-order +models and analyzing the sample complexity of SP-voting under these models. In +particular, we propose concentric mixtures of Mallows and Plackett-Luce models +with $G (\ge 2)$ groups. Our models generalize previously proposed concentric +mixtures of Mallows models with $2$ groups, and we highlight the importance of +$G > 2$ groups by identifying three distinct groups (expert, intermediate, and +non-expert) from existing datasets. Next, we provide conditions on the +parameters of the underlying models so that SP-voting can recover ground-truth +rankings with high probability, and also derive sample complexities under the +same. We complement the theoretical results by evaluating SP-voting on +simulated and real datasets. + +
+
+
+
+
+ + ☆ A Chinese Multi-label Affective Computing Dataset Based on Social Media + Network Users + + +
+ Emotion and personality are central elements in understanding human +psychological states. Emotions reflect an individual's subjective experiences, +while personality reveals relatively stable behavioral and cognitive patterns. +Existing affective computing datasets often annotate emotion and personality +traits separately, lacking fine-grained labeling of micro-emotions and emotion +intensity in both single-label and multi-label classifications. Chinese emotion +datasets are extremely scarce, and datasets capturing Chinese user personality +traits are even more limited. To address these gaps, this study collected data +from the major social media platform Weibo, screening 11,338 valid users from +over 50,000 individuals with diverse MBTI personality labels and acquiring +566,900 posts along with the users' MBTI personality tags. Using the EQN method, +we compiled a multi-label Chinese affective computing dataset that integrates +the same user's personality traits with six emotions and micro-emotions, each +annotated with intensity levels. Validation results across multiple NLP +classification models demonstrate the dataset's strong utility. This dataset is +designed to advance machine recognition of complex human emotions and provide +data support for research in psychology, education, marketing, finance, and +politics. + +
+
+
+
+
+ + ☆ Generative AI for Data Augmentation in Wireless Networks: Analysis, + Applications, and Case Study + + +
+ Data augmentation is a powerful technique to mitigate data scarcity. However, +owing to fundamental differences in wireless data structures, traditional data +augmentation techniques may not be suitable for wireless data. Fortunately, +Generative Artificial Intelligence (GenAI) can be an effective alternative to +wireless data augmentation due to its excellent data generation capability. +This article systematically explores the potential and effectiveness of +GenAI-driven data augmentation in wireless networks. We first briefly review +data augmentation techniques, discuss their limitations in wireless networks, +and introduce generative data augmentation, including reviewing GenAI models +and their applications in data augmentation. We then explore the application +prospects of GenAI-driven data augmentation in wireless networks from the +physical, network, and application layers, which provides a GenAI-driven data +augmentation architecture for each application. Subsequently, we propose a +general generative diffusion model-based data augmentation framework for Wi-Fi +gesture recognition, which uses transformer-based diffusion models to generate +high-quality channel state information data. Furthermore, we develop residual +neural network models for Wi-Fi gesture recognition to evaluate the role of +augmented data and conduct a case study based on a real dataset. Simulation +results demonstrate the effectiveness of the proposed framework. Finally, we +discuss research directions for generative data augmentation. + +
+
+
+
+
+ + ☆ DEEGITS: Deep Learning based Framework for Measuring Heterogenous + Traffic State in Challenging Traffic Scenarios + + +
+ This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State +Measurement), a comprehensive framework that leverages state-of-the-art +convolutional neural network (CNN) techniques to accurately and rapidly detect +vehicles and pedestrians, as well as to measure traffic states in challenging +scenarios (i.e., congestion, occlusion). In this study, we enhance the training +dataset through data fusion, enabling simultaneous detection of vehicles and +pedestrians. Image preprocessing and augmentation are subsequently performed to +improve the quality and quantity of the dataset. Transfer learning is applied +on the YOLOv8 pretrained model to increase the model's capability to identify a +diverse array of vehicles. Optimal hyperparameters are obtained using the Grid +Search algorithm, with the Stochastic Gradient Descent (SGD) optimizer +outperforming other optimizers under these settings. Extensive experimentation +and evaluation demonstrate substantial accuracy within the detection framework, +with the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5 +on the test set, surpassing previous benchmarks on similar datasets. The +DeepSORT multi-object tracking algorithm is incorporated to track detected +vehicles and pedestrians in this study. Finally, the framework is tested to +measure heterogeneous traffic states in mixed traffic conditions. Two locations +with differing traffic compositions and congestion levels are selected: one +motorized-dominant location with moderate density and one +non-motorized-dominant location with higher density. Errors are statistically +insignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91 +to 0.97 for heterogeneous traffic flow and speed measurements, respectively. + +
+
+ comment: Submitted for presentation at the 103rd Annual Meeting of the + Transportation Research Board and publication in Transportation Research + Record: Journal of the Transportation Research Board +
+
+
+
+
+ + ☆ Enhancing Multimodal Query Representation via Visual Dialogues for + End-to-End Knowledge Retrieval + + +
+ Existing multimodal retrieval systems often rely on disjointed models for +image comprehension, such as object detectors and caption generators, leading +to cumbersome implementations and training processes. To overcome this +limitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a +text retriever with the ability to understand multimodal queries via dynamic +modality interaction. Ret-XKnow leverages a partial convolution mechanism to +focus on visual information relevant to the given textual query, thereby +enhancing multimodal query representations. To effectively learn multimodal +interaction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset +automatically constructed from visual dialogue datasets. Our dataset +construction process ensures that the dialogues are transformed into suitable +information retrieval tasks using a text retriever. We demonstrate that our +approach not only significantly improves retrieval performance in zero-shot +settings but also achieves substantial improvements in fine-tuning scenarios. +Our code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow. + +
+
+
+
+
+ + ☆ Are LLMs Prescient? A Continuous Evaluation using Daily News as the + Oracle + + +
+ Many existing evaluation benchmarks for Large Language Models (LLMs) quickly +become outdated due to the emergence of new models and training data. These +benchmarks also fall short in assessing how LLM performance changes over time, +as they consist of static questions without a temporal dimension. To address +these limitations, we propose using future event prediction as a continuous +evaluation method to assess LLMs' temporal generalization and forecasting +abilities. Our benchmark, Daily Oracle, automatically generates question-answer +(QA) pairs from daily news, challenging LLMs to predict "future" event +outcomes. Our findings reveal that as pre-training data becomes outdated, LLM +performance degrades over time. While Retrieval Augmented Generation (RAG) has +the potential to enhance prediction accuracy, the performance degradation +pattern persists, highlighting the need for continuous model updates. + +
+
+
+
+
+ + ☆ Responsible AI in Construction Safety: Systematic Evaluation of Large + Language Models and Prompt Engineering + + +
+ Construction remains one of the most hazardous sectors. Recent advancements +in AI, particularly Large Language Models (LLMs), offer promising opportunities +for enhancing workplace safety. However, responsible integration of LLMs +requires systematic evaluation, as deploying them without understanding their +capabilities and limitations risks generating inaccurate information, fostering +misplaced confidence, and compromising worker safety. This study evaluates the +performance of two widely used LLMs, GPT-3.5 and GPT-4o, across three +standardized exams administered by the Board of Certified Safety Professionals +(BCSP). Using 385 questions spanning seven safety knowledge areas, the study +analyzes the models' accuracy, consistency, and reliability. Results show that +both models consistently exceed the BCSP benchmark, with GPT-4o achieving an +accuracy rate of 84.6% and GPT-3.5 reaching 73.8%. Both models demonstrate +strengths in safety management systems and hazard identification and control, +but exhibit weaknesses in science, mathematics, emergency response, and fire +prevention. An error analysis identifies four primary limitations affecting LLM +performance: lack of knowledge, reasoning flaws, memory issues, and calculation +errors. Our study also highlights the impact of prompt engineering strategies, +with variations in accuracy reaching 13.5% for GPT-3.5 and 7.9% for GPT-4o. +However, no single prompt configuration proves universally effective. This +research advances knowledge in three ways: by identifying areas where LLMs can +support safety practices and where human oversight remains essential, by +offering practical insights into improving LLM implementation through prompt +engineering, and by providing evidence-based direction for future research and +development. These contributions support the responsible integration of AI in +construction safety management toward achieving zero injuries. + +
+
+ comment: 29 pages, 5 figures +
+
+
+
+
+ + ☆ PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for + Long-Term Expressive Symbolic Music Generation + + +
+ Music generation has progressed significantly, especially in the domain of +audio generation. However, generating symbolic music that is both +long-structured and expressive remains a significant challenge. In this paper, +we propose PerceiverS (Segmentation and Scale), a novel architecture designed +to address this issue by leveraging both Effective Segmentation and Multi-Scale +attention mechanisms. Our approach enhances symbolic music generation by +simultaneously learning long-term structural dependencies and short-term +expressive details. By combining cross-attention and self-attention in a +Multi-Scale setting, PerceiverS captures long-range musical structure while +preserving performance nuances. The proposed model, evaluated on datasets like +Maestro, demonstrates improvements in generating coherent and diverse music +with both structural consistency and expressive variation. The project demos +and the generated music samples can be accessed through the link: +https://perceivers.github.io. + +
+
+
+
+
+ + ☆ R3HF: Reward Redistribution for Enhancing Reinforcement Learning from + Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) provides a paradigm for +aligning large language models (LLMs) with human preferences. This involves the +initial training of a reward model based on pairwise human feedback. The reward +model is subsequently utilized in reinforcement learning to assess the scores +of each generated sentence as a whole, further guiding the optimization of +LLMs. However, current approaches have a significant shortcoming: \emph{They +allocate a single, sparse, and delayed reward to an entire sequence of output}. +This may overlook some significant individual contributions of each token +towards the desired outcome. To overcome this limitation, our paper proposes a +novel reward redistribution method called R3HF, which facilitates a more +fine-grained, token-level reward allocation. Specifically, our method treats +the reward prediction task of the reward model as a regression problem. As a +result, the redistributed rewards are computed by evaluating the specific +contribution of each token to the reward model's output. This detailed approach +improves the model's understanding of language nuances, leading to more precise +enhancements in its performance. Our method is crafted to integrate seamlessly +with most current techniques while incurring minimal computational costs. +Through comprehensive experiments across diverse datasets and tasks, we have +verified the effectiveness and superiority of our approach. + +
+
+
+
+
+ + ☆ DNN Task Assignment in UAV Networks: A Generative AI Enhanced + Multi-Agent Reinforcement Learning Approach + + +
+ Unmanned Aerial Vehicles (UAVs) possess high mobility and flexible deployment +capabilities, prompting the development of UAVs for various application +scenarios within the Internet of Things (IoT). The unique capabilities of UAVs +give rise to increasingly critical and complex tasks in uncertain and +potentially harsh environments. The substantial amount of data generated from +these applications necessitates processing and analysis through deep neural +networks (DNNs). However, UAVs encounter challenges due to their limited +computing resources when managing DNN models. This paper presents a joint +approach that combines multiple-agent reinforcement learning (MARL) and +generative diffusion models (GDM) for assigning DNN tasks to a UAV swarm, aimed +at reducing latency from task capture to result output. To address these +challenges, we first consider the task size of the target area to be inspected +and the shortest flying path as optimization constraints, employing a greedy +algorithm to resolve the subproblem with a focus on minimizing the UAV's flying +path and the overall system cost. In the second stage, we introduce a novel DNN +task assignment algorithm, termed GDM-MADDPG, which utilizes the reverse +denoising process of GDM to replace the actor network in multi-agent deep +deterministic policy gradient (MADDPG). This approach generates specific DNN +task assignment actions based on agents' observations in a dynamic environment. +Simulation results indicate that our algorithm performs favorably compared to +benchmarks in terms of path planning, Age of Information (AoI), energy +consumption, and task load balancing. + +
+
+
+
+
+ + ☆ TowerDebias: A Novel Debiasing Method based on the Tower Property + + +
+ Decision-making processes have increasingly come to rely on sophisticated +machine learning tools, raising concerns about the fairness of their +predictions with respect to any sensitive groups. The widespread use of +commercial black-box machine learning models necessitates careful consideration +of their legal and ethical implications on consumers. In situations where users +have access to these "black-box" models, a key question emerges: how can we +mitigate or eliminate the influence of sensitive attributes, such as race or +gender? We propose towerDebias (tDB), a novel approach designed to reduce the +influence of sensitive variables in predictions made by black-box models. Using +the Tower Property from probability theory, tDB aims to improve prediction +fairness during the post-processing stage in a manner amenable to the +Fairness-Utility Tradeoff. This method is highly flexible, requiring no prior +knowledge of the original model's internal structure, and can be extended to a +range of different applications. We provide a formal improvement theorem for +tDB and demonstrate its effectiveness in both regression and classification +tasks, underscoring its impact on the fairness-utility tradeoff. + +
+
+ comment: To be submitted to a journal soon +
+
+
+
+
+ + ☆ RESOLVE: Relational Reasoning with Symbolic and Object-Level Features + Using Vector Symbolic Processing + + +
+ Modern transformer-based encoder-decoder architectures struggle with +reasoning tasks due to their inability to effectively extract relational +information between input objects (data/tokens). Recent work introduced the +Abstractor module, embedded between transformer layers, to address this gap. +However, the Abstractor layer while excelling at capturing relational +information (pure relational reasoning), faces challenges in tasks that require +both object and relational-level reasoning (partial relational reasoning). To +address this, we propose RESOLVE, a neuro-vector symbolic architecture that +combines object-level features with relational representations in +high-dimensional spaces, using fast and efficient operations such as bundling +(summation) and binding (Hadamard product) allowing both object-level features +and relational representations to coexist within the same structure without +interfering with one another. RESOLVE is driven by a novel attention mechanism +that operates in a bipolar high dimensional space, allowing fast attention +score computation compared to the state-of-the-art. By leveraging this design, +the model achieves both low compute latency and memory efficiency. RESOLVE also +offers better generalizability while achieving higher accuracy in purely +relational reasoning tasks such as sorting as well as partial relational +reasoning tasks such as math problem-solving compared to state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Hashing for Protein Structure Similarity Search + + +
+ Protein structure similarity search (PSSS), which tries to search proteins +with similar structures, plays a crucial role across diverse domains from drug +design to protein function prediction and molecular evolution. Traditional +alignment-based PSSS methods, which directly calculate alignment on the protein +structures, are highly time-consuming with high memory cost. Recently, +alignment-free methods, which represent protein structures as fixed-length +real-valued vectors, are proposed for PSSS. Although these methods have lower +time and memory cost than alignment-based methods, their time and memory cost +is still too high for large-scale PSSS, and their accuracy is unsatisfactory. +In this paper, we propose a novel method, called +$\underline{\text{p}}$r$\underline{\text{o}}$tein +$\underline{\text{s}}$tructure $\underline{\text{h}}$ashing (POSH), for PSSS. +POSH learns a binary vector representation for each protein structure, which +can dramatically reduce the time and memory cost for PSSS compared with +real-valued vector representation based methods. Furthermore, in POSH we also +propose expressive hand-crafted features and a structure encoder to well model +both node and edge interactions in proteins. Experimental results on real +datasets show that POSH can outperform other methods to achieve +state-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more +than six times and speed improvement of more than four times, compared with +other methods. + +
+
+
+
+
+ + ☆ Knowledge Bases in Support of Large Language Models for Processing Web + News + + +
+ Large Language Models (LLMs) have received considerable interest in wide +applications lately. During pre-training via massive datasets, such a model +implicitly memorizes the factual knowledge of trained datasets in its hidden +parameters. However, knowledge held implicitly in parameters often makes its +use by downstream applications ineffective due to the lack of common-sense +reasoning. In this article, we introduce a general framework that permits +building knowledge bases with the aid of LLMs, tailored for processing Web news. +The framework applies a rule-based News Information Extractor (NewsIE) to news +items for extracting their relational tuples, referred to as knowledge bases, +which are then graph-convoluted with the implicit knowledge facts of news items +obtained by LLMs, for their classification. It involves two lightweight +components: 1) NewsIE: for extracting the structural information of every news +item, in the form of relational tuples; 2) BERTGraph: for graph convoluting the +implicit knowledge facts with relational tuples extracted by NewsIE. We have +evaluated our framework under different news-related datasets for news category +classification, with promising experimental results. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ GPTree: Towards Explainable Decision-Making via LLM-powered Decision + Trees + + +
+ Traditional decision tree algorithms are explainable but struggle with +non-linear, high-dimensional data, limiting their applicability in complex +decision-making. Neural networks excel at capturing complex patterns but +sacrifice explainability in the process. In this work, we present GPTree, a +novel framework combining explainability of decision trees with the advanced +reasoning capabilities of LLMs. GPTree eliminates the need for feature +engineering and prompt chaining, requiring only a task-specific prompt and +leveraging a tree-based structure to dynamically split samples. We also +introduce an expert-in-the-loop feedback mechanism to further enhance +performance by enabling human intervention to refine and rebuild decision +paths, emphasizing the harmony between human expertise and machine +intelligence. Our decision tree achieved a 7.8% precision rate for identifying +"unicorn" startups at the inception stage of a startup, surpassing gpt-4o with +few-shot learning as well as the best human decision-makers (3.1% to 5.6%). + +
+
+
+
+
+ + ☆ VALTEST: Automated Validation of Language Model Generated Test Cases + + +
+ Large Language Models (LLMs) have demonstrated significant potential in +automating software testing, specifically in generating unit test cases. +However, the validation of LLM-generated test cases remains a challenge, +particularly when the ground truth is unavailable. This paper introduces +VALTEST, a novel framework designed to automatically validate test cases +generated by LLMs by leveraging token probabilities. We evaluate VALTEST using +nine test suites generated from three datasets (HumanEval, MBPP, and LeetCode) +across three LLMs (GPT-4o, GPT-3.5-turbo, and LLama3.1 8b). By extracting +statistical features from token probabilities, we train a machine learning +model to predict test case validity. VALTEST increases the validity rate of +test cases by 6.2% to 24%, depending on the dataset and LLM. Our results +suggest that token probabilities are reliable indicators for distinguishing +between valid and invalid test cases, which provides a robust solution for +improving the correctness of LLM-generated test cases in software testing. In +addition, we found that replacing the identified invalid test cases by VALTEST, +using a Chain-of-Thought prompting results in a more effective test suite while +keeping the high validity rates. + +
+
+
+
+
+ + ☆ Set-Based Retrograde Analysis: Precomputing the Solution to 24-card + Bridge Double Dummy Deals + + +
+ Retrograde analysis is used in game-playing programs to solve states at the +end of a game, working backwards toward the start of the game. The algorithm +iterates through and computes the perfect-play value for as many states as +resources allow. We introduce setrograde analysis which achieves the same +results by operating on sets of states that have the same game value. The +algorithm is demonstrated by computing exact solutions for Bridge double dummy +card-play. For deals with 24 cards remaining to be played ($10^{27}$ states, +which can be reduced to $10^{15}$ states using preexisting techniques), we +strongly solve all deals. The setrograde algorithm performs a factor of $10^3$ +fewer search operations than a standard retrograde algorithm, producing a +database with a factor of $10^4$ fewer entries. For applicable domains, this +allows retrograde searching to reach unprecedented search depths. + +
+
+
+
+
+ + ☆ Drone Detection using Deep Neural Networks Trained on Pure Synthetic + Data + + +
+ Drone detection has benefited from improvements in deep neural networks, but +like many other applications, suffers from the limited availability of accurate data +for training. Synthetic data provides a potential for low-cost data generation +and has been shown to improve data availability and quality. However, models +trained on synthetic datasets need to prove their ability to perform on +real-world data, known as the problem of sim-to-real transferability. Here, we +present a drone detection Faster-RCNN model trained on a purely synthetic +dataset that transfers to real-world data. We found that it achieves an AP_50 +of 97.0% when evaluated on the MAV-Vid - a real dataset of flying drones - +compared with 97.8% for an equivalent model trained on real-world data. Our +results show that using synthetic data for drone detection has the potential to +reduce data collection costs and improve labelling quality. These findings +could be a starting point for more elaborate synthetic drone datasets. For +example, realistic recreations of specific scenarios could de-risk the dataset +generation of safety-critical applications such as the detection of drones at +airports. Further, synthetic data may enable reliable drone detection systems, +which could benefit other areas, such as unmanned traffic management systems. +The code is available at +https://github.com/mazqtpopx/cranfield-synthetic-drone-detection alongside the +datasets at +https://huggingface.co/datasets/mazqtpopx/cranfield-synthetic-drone-detection. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Code-mixed LLM: Improve Large Language Models' Capability to Handle + Code-Mixing through Reinforcement Learning from AI Feedback + + +
+ Code-mixing (CM) or code-switching (CSW) refers to the juxtaposition of +linguistic units from two or more languages during a conversation or +sometimes even within a single utterance. Code-mixing introduces unique challenges in +daily life, such as syntactic mismatches and semantic blending, that are rarely +encountered in monolingual settings. Large language models (LLMs) have +revolutionized the field of natural language processing (NLP) by offering +unprecedented capabilities in understanding human languages. However, the +effectiveness of current state-of-the-art multilingual LLMs has not yet been +fully explored in the CM scenario. To fill this gap, we first benchmark the +performance of multilingual LLMs on various code-mixing NLP tasks. Then we +propose to improve the multilingual LLMs' ability to understand code-mixing +through reinforcement learning from human feedback (RLHF) and code-mixed +machine translation tasks. Given the high-cost and time-consuming preference +labeling procedure, we improve this by utilizing LLMs as annotators to perform +the reinforcement learning from AI feedback (RLAIF). The experiments show the +effectiveness of the proposed method. + +
+
+ comment: initial version: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Liner Shipping Network Design with Reinforcement Learning + + +
+ This paper proposes a novel reinforcement learning framework to address the +Liner Shipping Network Design Problem (LSNDP), a challenging combinatorial +optimization problem focused on designing cost-efficient maritime shipping +routes. Traditional methods for solving the LSNDP typically involve decomposing +the problem into sub-problems, such as network design and multi-commodity flow, +which are then tackled using approximate heuristics or large neighborhood +search (LNS) techniques. In contrast, our approach employs a model-free +reinforcement learning algorithm on the network design, integrated with a +heuristic-based multi-commodity flow solver, to produce competitive results on +the publicly available LINERLIB benchmark. Additionally, our method also +demonstrates generalization capabilities by producing competitive solutions on +the benchmark instances after training on perturbed instances. + +
+
+
+
+
+ + ☆ Language-Model Prior Overcomes Cold-Start Items + + +
+ The growth of recommender systems (RecSys) is driven by digitization and the +need for personalized content in areas such as e-commerce and video streaming. +The content in these systems often changes rapidly and therefore they +constantly face the ongoing cold-start problem, where new items lack +interaction data and are hard to value. Existing solutions for the cold-start +problem, such as content-based recommenders and hybrid methods, leverage item +metadata to determine item similarities. The main challenge with these methods +is their reliance on structured and informative metadata to capture detailed +item similarities, which may not always be available. This paper introduces a +novel approach for cold-start item recommendation that utilizes the language +model (LM) to estimate item similarities, which are further integrated as a +Bayesian prior with classic recommender systems. This approach is generic and +able to boost the performance of various recommenders. Specifically, our +experiments integrate it with both sequential and collaborative filtering-based +recommender and evaluate it on two real-world datasets, demonstrating the +enhanced performance of the proposed approach. + +
+
+ comment: This paper is dedicated to cold-start item recommendation using + language-model priors +
+
+
+
+
+ + ☆ Multimodal Object Detection using Depth and Image Data for Manufacturing + Parts + + +
+ Manufacturing requires reliable object detection methods for precise picking +and handling of diverse types of manufacturing parts and components. +Traditional object detection methods utilize either only 2D images from cameras +or 3D data from lidars or similar 3D sensors. However, each of these sensors +has weaknesses and limitations. Cameras do not have depth perception and 3D +sensors typically do not carry color information. These weaknesses can +undermine the reliability and robustness of industrial manufacturing systems. +To address these challenges, this work proposes a multi-sensor system combining +a red-green-blue (RGB) camera and a 3D point cloud sensor. The two sensors are +calibrated for precise alignment of the multimodal data captured from the two +hardware devices. A novel multimodal object detection method is developed to +process both RGB and depth data. This object detector is based on the Faster +R-CNN baseline that was originally designed to process only camera images. The +results show that the multimodal model significantly outperforms the depth-only +and RGB-only baselines on established object detection metrics. More +specifically, the multimodal model improves mAP by 13% and raises Mean +Precision by 11.8% in comparison to the RGB-only baseline. Compared to the +depth-only baseline, it improves mAP by 78% and raises Mean Precision by 57%. +Hence, this method facilitates more reliable and robust object detection in +service to smart manufacturing applications. + +
+
+
+
+
+ + ☆ SAFELOC: Overcoming Data Poisoning Attacks in Heterogeneous Federated + Machine Learning for Indoor Localization + + +
+ Machine learning (ML) based indoor localization solutions are critical for +many emerging applications, yet their efficacy is often compromised by +hardware/software variations across mobile devices (i.e., device heterogeneity) +and the threat of ML data poisoning attacks. Conventional methods aimed at +countering these challenges show limited resilience to the uncertainties +created by these phenomena. In response, in this paper, we introduce SAFELOC, a +novel framework that not only minimizes localization errors under these +challenging conditions but also ensures model compactness for efficient mobile +device deployment. Our framework targets a distributed and co-operative +learning environment that uses federated learning (FL) to preserve user data +privacy and assumes heterogeneous mobile devices carried by users (just like in +most real-world scenarios). Within this heterogeneous FL context, SAFELOC +introduces a novel fused neural network architecture that performs data +poisoning detection and localization, with a low model footprint. Additionally, +a dynamic saliency map-based aggregation strategy is designed to adapt based on +the severity of the detected data poisoning scenario. Experimental evaluations +demonstrate that SAFELOC achieves improvements of up to 5.9x in mean +localization error, 7.8x in worst-case localization error, and a 2.1x reduction +in model inference latency compared to state-of-the-art indoor localization +frameworks, across diverse building floorplans, mobile devices, and ML data +poisoning attack scenarios. + +
+
+
+
+
+ + ☆ The Systems Engineering Approach in Times of Large Language Models + + +
+ Using Large Language Models (LLMs) to address critical societal problems +requires adopting this novel technology into socio-technical systems. However, +the complexity of such systems and the nature of LLMs challenge such a vision. +It is unlikely that the solution to such challenges will come from the +Artificial Intelligence (AI) community itself. Instead, the Systems Engineering +approach is better equipped to facilitate the adoption of LLMs by prioritising +the problems and their context before any other aspects. This paper introduces +the challenges LLMs generate and surveys systems research efforts for +engineering AI-based systems. We reveal how the systems engineering principles +have supported addressing similar issues to the ones LLMs pose and discuss our +findings to provide future directions for adopting LLMs. + +
+
+ comment: This paper has been accepted for the upcoming 58th Hawaii + International Conference on System Sciences (HICSS-58) +
+
+
+
+
+ + ☆ Virtual teaching assistant for undergraduate students using natural + language processing & deep learning + + +
+ Online education's popularity has been continuously increasing over the past +few years. Many universities were forced to switch to online education as a +result of COVID-19. In many cases, even after more than two years of online +instruction, colleges were unable to resume their traditional classroom +programs. A growing number of institutions are considering blended learning +with some parts in-person and the rest of the learning taking place online. +Nevertheless, many online education systems are inefficient, and this results +in a poor rate of student retention. In this paper, we are offering a primary +dataset, the initial implementation of a virtual teaching assistant named +VTA-bot, and its system architecture. Our primary implementation of the +suggested system consists of a chatbot that can be queried about the content +and topics of the fundamental Python programming language course. Students in +their first year of university will benefit from this strategy, which aims +to increase student participation and involvement in online education. + +
+
+
+
+
+ + ☆ IDCIA: Immunocytochemistry Dataset for Cellular Image Analysis + + +
+ We present a new annotated microscopic cellular image dataset to improve the +effectiveness of machine learning methods for cellular image analysis. Cell +counting is an important step in cell analysis. Typically, domain experts +manually count cells in a microscopic image. Automated cell counting can +potentially eliminate this tedious, time-consuming process. However, a good, +labeled dataset is required for training an accurate machine learning model. +Our dataset includes microscopic images of cells, and for each image, the cell +count and the location of individual cells. The data were collected as part of +an ongoing study investigating the potential of electrical stimulation to +modulate stem cell differentiation and possible applications for neural repair. +Compared to existing publicly available datasets, our dataset has more images +of cells stained with a greater variety of antibodies (protein components of immune +responses against invaders) typically used for cell analysis. The experimental +results on this dataset indicate that none of the five existing models under +this study are able to achieve a sufficiently accurate count to replace the +manual methods. The dataset is available at +https://figshare.com/articles/dataset/Dataset/21970604. + +
+
+
+
+
+ + ☆ Reliability, Resilience and Human Factors Engineering for Trustworthy AI + Systems + + +
+ As AI systems become integral to critical operations across industries and +services, ensuring their reliability and safety is essential. We offer a +framework that integrates established reliability and resilience engineering +principles into AI systems. By applying traditional metrics such as failure +rate and Mean Time Between Failures (MTBF) along with resilience engineering +and human reliability analysis, we propose an integrated framework to manage AI +system performance, and prevent or efficiently recover from failures. Our work +adapts classical engineering methods to AI systems and outlines a research +agenda for future technical studies. We apply our framework to a real-world AI +system, using system status data from platforms such as OpenAI, to demonstrate +its practical applicability. This framework aligns with emerging global +standards and regulatory frameworks, providing a methodology to enhance the +trustworthiness of AI systems. Our aim is to guide policy, regulation, and the +development of reliable, safe, and adaptable AI technologies capable of +consistent performance in real-world environments. + +
+
+
+
+
+ + ☆ CoCoP: Enhancing Text Classification with LLM through Code Completion + Prompt + + +
+ Text classification is a fundamental task in natural language processing +(NLP), and large language models (LLMs) have demonstrated their capability to +perform this task across various domains. However, the performance of LLMs +heavily depends on the quality of their input prompts. Recent studies have also +shown that LLMs exhibit remarkable results in code-related tasks. To leverage +the capabilities of LLMs in text classification, we propose the Code Completion +Prompt (CoCoP) method, which transforms the text classification problem into a +code completion task. CoCoP significantly improves text classification +performance across diverse datasets by utilizing LLMs' code-completion +capability. For instance, CoCoP enhances the accuracy on the SST2 dataset by +more than 20%. Moreover, when CoCoP is integrated with LLMs specifically designed +for code-related tasks (code models), such as CodeLLaMA, this method +demonstrates better or comparable performance to few-shot learning techniques +while using only one-tenth of the model size. The source code of our proposed +method will be available to the public upon the acceptance of the paper. + +
+
+
+
+
+ + ☆ Fluoroformer: Scaling multiple instance learning to multiplexed images + via attention-based channel fusion ML4H + + +
+ Though multiple instance learning (MIL) has been a foundational strategy in +computational pathology for processing whole slide images (WSIs), current +approaches are designed for traditional hematoxylin and eosin (H&E) slides +rather than emerging multiplexed technologies. Here, we present an MIL +strategy, the Fluoroformer module, that is specifically tailored to multiplexed +WSIs by leveraging scaled dot-product attention (SDPA) to interpretably fuse +information across disparate channels. On a cohort of 434 non-small cell lung +cancer (NSCLC) samples, we show that the Fluoroformer both obtains strong +prognostic performance and recapitulates immuno-oncological hallmarks of NSCLC. +Our technique thereby provides a path for adapting state-of-the-art AI +techniques to emerging spatial biology assays. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 14 pages +
+
+
+
+
+ + ☆ Inconsistencies In Consistency Models: Better ODE Solving Does Not Imply + Better Samples NeurIPS 2024 + + +
+ Although diffusion models can generate remarkably high-quality samples, they +are intrinsically bottlenecked by their expensive iterative sampling procedure. +Consistency models (CMs) have recently emerged as a promising diffusion model +distillation method, reducing the cost of sampling by generating high-fidelity +samples in just a few iterations. Consistency model distillation aims to solve +the probability flow ordinary differential equation (ODE) defined by an +existing diffusion model. CMs are not directly trained to minimize error +against an ODE solver, rather they use a more computationally tractable +objective. As a way to study how effectively CMs solve the probability flow +ODE, and the effect that any induced error has on the quality of generated +samples, we introduce Direct CMs, which \textit{directly} minimize this error. +Intriguingly, we find that Direct CMs reduce the ODE solving error compared to +CMs but also result in significantly worse sample quality, calling into +question why exactly CMs work well in the first place. Full code is available +at: https://github.com/layer6ai-labs/direct-cms. + +
+
+ comment: NeurIPS 2024 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ Scaling Properties of Diffusion Models for Perceptual Tasks + + +
+ In this paper, we argue that iterative computation with diffusion models +offers a powerful paradigm for not only generation but also visual perception +tasks. We unify tasks such as depth estimation, optical flow, and amodal +segmentation under the framework of image-to-image translation, and show how +diffusion models benefit from scaling training and test-time compute for these +perceptual tasks. Through a careful analysis of these scaling properties, we +formulate compute-optimal training and inference recipes to scale diffusion +models for visual perception tasks. Our models achieve competitive performance +to state-of-the-art methods using significantly less data and compute. To +access our code and models, see https://scaling-diffusion-perception.github.io . + +
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +twisted foreground objects when applied to images with foreground elements such +as person figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
+
+ comment: Accepted by 2024 5th International Conference on Computer Vision, + Image and Deep Learning +
+
+
+
+
+ + ♻ ☆ OML: Open, Monetizable, and Loyal AI + + +
+ Artificial Intelligence (AI) has steadily improved across a wide range of +tasks. However, the development and deployment of AI are almost entirely +controlled by a few powerful organizations that are racing to create Artificial +General Intelligence (AGI). The centralized entities make decisions with little +public oversight, shaping the future of humanity, often with unforeseen +consequences. In this paper, we propose OML, which stands for Open, +Monetizable, and Loyal AI, an approach designed to democratize AI development. +OML is realized through an interdisciplinary framework spanning AI, blockchain, +and cryptography. We present several ideas for constructing OML using +technologies such as Trusted Execution Environments (TEE), traditional +cryptographic primitives like fully homomorphic encryption and functional +encryption, obfuscation, and AI-native solutions rooted in the sample +complexity and intrinsic hardness of AI tasks. A key innovation of our work is +introducing a new scientific field: AI-native cryptography. Unlike conventional +cryptography, which focuses on discrete data and binary security guarantees, +AI-native cryptography exploits the continuous nature of AI data +representations and their low-dimensional manifolds, focusing on improving +approximate performance. One core idea is to transform AI attack methods, such +as data poisoning, into security tools. This novel approach serves as a +foundation for OML 1.0 which uses model fingerprinting to protect the integrity +and ownership of AI models. The spirit of OML is to establish a decentralized, +open, and transparent platform for AI development, enabling the community to +contribute, monetize, and take ownership of AI models. By decentralizing +control and ensuring transparency through blockchain technology, OML prevents +the concentration of power and provides accountability in AI development that +has not been possible before. + +
+
+ comment: 60 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Harnessing Smartphone Sensors for Enhanced Road Safety: A Comprehensive + Dataset and Review + + +
+ Severe collisions can result from aggressive driving and poor road +conditions, emphasizing the need for effective monitoring to ensure safety. +Smartphones, with their array of built-in sensors, offer a practical and +affordable solution for road-sensing. However, the lack of reliable, +standardized datasets has hindered progress in assessing road conditions and +driving patterns. This study addresses this gap by introducing a comprehensive +dataset derived from smartphone sensors, which surpasses existing datasets by +incorporating a diverse range of sensors including accelerometer, gyroscope, +magnetometer, GPS, gravity, orientation, and uncalibrated sensors. These +sensors capture extensive parameters such as acceleration force, gravitation, +rotation rate, magnetic field strength, and vehicle speed, providing a detailed +understanding of road conditions and driving behaviors. The dataset is designed +to enhance road safety, infrastructure maintenance, traffic management, and +urban planning. By making this dataset available to the community, the study +aims to foster collaboration, inspire further research, and facilitate the +development of innovative solutions in intelligent transportation systems. + +
+
+ comment: 29 pages, 14 Figures, journal paper, submitted into Scientific Data + Journal +
+
+
+
+
+ + ♻ ☆ AI Consciousness is Inevitable: A Theoretical Computer Science + Perspective + + +
+ We look at consciousness through the lens of Theoretical Computer Science, a +branch of mathematics that studies computation under resource limitations. From +this perspective, we develop a formal machine model for consciousness. The +model is inspired by Alan Turing's simple yet powerful model of computation and +Bernard Baars' theater model of consciousness. Though extremely simple, the +model aligns at a high level with many of the major scientific theories of +human and animal consciousness, supporting our claim that machine consciousness +is inevitable. + +
+
+
+
+
+ + ♻ ☆ A Universal Deep Learning Framework for Materials X-ray Absorption + Spectra + + +
+ X-ray absorption spectroscopy (XAS) is a powerful characterization technique +for probing the local chemical environment of absorbing atoms. However, +analyzing XAS data presents significant challenges, often requiring extensive, +computationally intensive simulations, as well as significant domain expertise. +These limitations hinder the development of fast, robust XAS analysis pipelines +that are essential in high-throughput studies and for autonomous +experimentation. We address these challenges with OmniXAS, a framework that +contains a suite of transfer learning approaches for XAS prediction, each +contributing to improved accuracy and efficiency, as demonstrated on a K-edge +spectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS +framework is built upon three distinct strategies. First, we use M3GNet to +derive latent representations of the local chemical environment of absorption +sites as input for XAS prediction, achieving up to order-of-magnitude +improvements over conventional featurization techniques. Second, we employ a +hierarchical transfer learning strategy, training a universal multi-task model +across elements before fine-tuning for element-specific predictions. Models +based on this cascaded approach after element-wise fine-tuning outperform +element-specific models by up to 69%. Third, we implement cross-fidelity +transfer learning, adapting a universal model to predict spectra generated by +simulation of a different fidelity with a higher computational cost. This +approach improves prediction accuracy by up to 11% over models trained on the +target fidelity alone. Our approach boosts the throughput of XAS modeling by +orders of magnitude versus first-principles simulations and is extendable to +XAS prediction for a broader range of elements. This transfer learning +framework is generalizable to enhance deep-learning models that target other +properties in materials research. + +
+
+ comment: Main manuscript: 22 pages, 11 figures. Supplemental material (12 + pages, 6 figures) available as a separate file in arXiv ancillary files + (additional downloadable files) +
+
+
+
+
+ + ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models NeurIPS 2024 + + +
+ In the face of uncertainty, the ability to *seek information* is of +fundamental importance. In many practical applications, such as medical +diagnosis and troubleshooting, the information needed to solve the task is not +initially given and has to be actively sought by asking follow-up questions +(for example, a doctor asking a patient for more details about their symptoms). +In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to +augment large language models with the ability to actively seek information by +asking effective questions. UoT combines 1) an *uncertainty-aware simulation +approach* which enables the model to simulate possible future scenarios and how +likely they are to occur, 2) *uncertainty-based rewards* motivated by +information gain which incentivizes the model to seek information, and 3) a +*reward propagation scheme* to select the optimal question to ask in a way that +maximizes the expected reward. In experiments on medical diagnosis, +troubleshooting, and the `20 Questions` game, UoT achieves an average +performance improvement of 38.1% in the rate of successful task completion +across multiple LLMs compared with direct prompting and also improves +efficiency (i.e., the number of questions needed to complete the task). Our +code has been released [here](https://github.com/zhiyuanhubj/UoT) + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Active Inference Meeting Energy-Efficient Control of Parallel and + Identical Machines + + +
+ We investigate the application of active inference in developing +energy-efficient control agents for manufacturing systems. Active inference, +rooted in neuroscience, provides a unified probabilistic framework integrating +perception, learning, and action, with inherent uncertainty quantification +elements. Our study explores deep active inference, an emerging field that +combines deep learning with the active inference decision-making framework. +Leveraging a deep active inference agent, we focus on controlling parallel and +identical machine workstations to enhance energy efficiency. We address +challenges posed by the problem's stochastic nature and delayed policy response +by introducing tailored enhancements to existing agent architectures. +Specifically, we introduce multi-step transition and hybrid horizon methods to +mitigate the need for complex planning. Our experimental results demonstrate +the effectiveness of these enhancements and highlight the potential of the +active inference-based approach. + +
+
+ comment: Accepted at the 10th International Conference on Machine Learning, + Optimization, and Data Science +
+
+
+
+
+ + ♻ ☆ China and the U.S. produce more impactful AI research when collaborating + together + + +
+ Artificial Intelligence (AI) has become a disruptive technology, promising to +grant a significant economic and strategic advantage to nations that harness +its power. China, with its recent push towards AI adoption, is challenging the +U.S.'s position as the global leader in this field. Given AI's massive +potential, as well as the fierce geopolitical tensions between China and the +U.S., several recent policies have been put in place to discourage AI +scientists from migrating to, or collaborating with, the other nation. +Nevertheless, the extent of talent migration and cross-border collaboration is +not fully understood. Here, we analyze a dataset of over 350,000 AI scientists +and 5,000,000 AI papers. We find that since 2000, China and the U.S. have led +the field in terms of impact, novelty, productivity, and workforce. Most AI +scientists who move to China come from the U.S., and most who move to the U.S. +come from China, highlighting a notable bidirectional talent migration. +Moreover, the vast majority of those moving in either direction have Asian +ancestry. Upon moving, those scientists continue to collaborate frequently with +those in the origin country. Although the number of collaborations between the +two countries has increased since the dawn of the millennium, such +collaborations continue to be relatively rare. A matching experiment reveals +that the two countries have always been more impactful when collaborating than +when each works without the other. These findings suggest that instead of +suppressing cross-border migration and collaboration between the two nations, +the science could benefit from promoting such activities. + +
+
+ comment: 38 pages, 15 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ On Training Survival Models with Scoring Rules + + +
+ Scoring rules are an established way of comparing predictive performances +across model classes. In the context of survival analysis, they require +adaptation in order to accommodate censoring. This work investigates using +scoring rules for model training rather than evaluation. Doing so, we establish +a general framework for training survival models that is model agnostic and can +learn event time distributions parametrically or non-parametrically. In +addition, our framework is not restricted to any specific scoring rule. While +we focus on neural network-based implementations, we also provide +proof-of-concept implementations using gradient boosting, generalized additive +models, and trees. Empirical comparisons on synthetic and real-world data +indicate that scoring rules can be successfully incorporated into model +training and yield competitive predictive performance with established +time-to-event models. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ On the Effects of Data Scale on UI Control Agents NeurIPS 2024 + + +
+ Autonomous agents that control computer interfaces to accomplish human tasks +are emerging. Leveraging LLMs to power such agents has been of special +interest, but unless fine-tuned on human-collected task demonstrations, +performance is still relatively low. In this work we study whether fine-tuning +alone is a viable approach for building real-world computer control agents. In +particular, we investigate how performance measured on both high and +low-level tasks in domain and out of domain scales as more training data is +collected. To this end we collect and release a new dataset, AndroidControl, +consisting of 15,283 demonstrations of everyday tasks with Android apps. +Compared to existing datasets, each AndroidControl task instance includes both +high and low-level human-generated instructions, allowing us to explore the +level of task complexity an agent can handle. Moreover, AndroidControl is the +most diverse computer control dataset to date, including 14,548 unique tasks +over 833 Android apps, thus allowing us to conduct in-depth analysis of the +model performance in and out of the domain of the training data. Using the +dataset, we find that when tested in domain fine-tuned models outperform zero +and few-shot baselines and scale in such a way that robust performance might +feasibly be obtained simply by collecting more data. Out of domain, performance +scales significantly more slowly and suggests that in particular for high-level +tasks, fine-tuning on more data alone may be insufficient for achieving robust +out-of-domain performance. + +
+
+ comment: NeurIPS 2024 (Datasets and Benchmarks) +
+
+
+
+
+ + ♻ ☆ ProactivePIM: Accelerating Weight-Sharing Embedding Layer with PIM for + Scalable Recommendation System + + +
+ The personalized recommendation system's continuous size growth poses new +challenges for model inference. Although weight-sharing algorithms have been +proposed to reduce embedding table capacity, they increase memory access. +Recent advancements in processing-in-memory (PIM) successfully enhance the +recommendation system's throughput by exploiting memory parallelism, but our +analysis shows that those algorithms introduce CPU-PIM communication overhead +into prior PIM systems, compromising the PIM throughput. We propose +ProactivePIM, a specialized memory architecture integrated with PIM technology +tailored to accelerate the weight-sharing algorithms. ProactivePIM integrates +an SRAM cache within the PIM with an efficient prefetching scheme to leverage a +unique locality of the algorithm and eliminate CPU-PIM communication. + +
+
+ comment: 7 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Distribution Shifts: Empirical Analysis and Inductive + Modeling for Tabular Data NeurIPS 2023 + + +
+ Different distribution shifts require different interventions, and algorithms +must be grounded in the specific shifts they address. However, methodological +development for robust algorithms typically relies on structural assumptions +that lack empirical validation. Advocating for an empirically grounded +data-driven approach to research, we build an empirical testbed comprising +natural shifts across 5 tabular datasets and 60,000 method configurations +encompassing imbalanced learning and distributionally robust optimization (DRO) +methods. We find $Y|X$-shifts are most prevalent on our testbed, in stark +contrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The +performance of robust algorithms varies significantly over shift types, and is +no better than that of vanilla methods. To understand why, we conduct an +in-depth empirical analysis of DRO methods and find that although often +neglected by researchers, implementation details -- such as the choice of +underlying model class (e.g., XGBoost) and hyperparameter selection -- have a +bigger impact on performance than the ambiguity set or its radius. To further +bridge that gap between methodological research and practice, we design case +studies that illustrate how such a data-driven, inductive understanding of +distribution shifts can enhance both data-centric and algorithmic +interventions. + +
+
+ comment: Conference version appeared in NeurIPS 2023, previously titled "On + the Need for a Language Describing Distribution Shifts: Illustrations on + Tabular Datasets" +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian Generative Machine Learning for Bayesiamplification + + +
+ Recently, combinations of generative and Bayesian machine learning have been +introduced in particle physics for both fast detector simulation and inference +tasks. These neural networks aim to quantify the uncertainty on the generated +distribution originating from limited training statistics. The interpretation +of a distribution-wide uncertainty however remains ill-defined. We show a clear +scheme for quantifying the calibration of Bayesian generative machine learning +models. For a Continuous Normalizing Flow applied to a low-dimensional toy +example, we evaluate the calibration of Bayesian uncertainties from either a +mean-field Gaussian weight posterior, or Monte Carlo sampling network weights, +to gauge their behaviour on unsteady distribution edges. Well calibrated +uncertainties can then be used to roughly estimate the number of uncorrelated +truth samples that are equivalent to the generated sample and clearly indicate +data amplification for smooth features of the distribution. + +
+
+ comment: 15 pages, 6 figures, updated references, fixed typo +
+
+
+
+
+ + ♻ ☆ AutoSAT: Automatically Optimize SAT Solvers via Large Language Models + + +
+ Conflict-Driven Clause Learning (CDCL) is the mainstream framework for +solving the Satisfiability problem (SAT), and CDCL solvers typically rely on +various heuristics, which have a significant impact on their performance. +Modern CDCL solvers, such as MiniSat and Kissat, commonly incorporate several +heuristics and select one to use according to simple rules, requiring +significant time and expert effort to fine-tune in practice. The pervasion of +Large Language Models (LLMs) provides a potential solution to address this +issue. However, generating a CDCL solver from scratch is not effective due to +the complexity and context volume of SAT solvers. Instead, we propose AutoSAT, +a framework that automatically optimizes heuristics in a pre-defined modular +search space based on existing CDCL solvers. Unlike existing automated +algorithm design approaches focusing on hyperparameter tuning and operator +selection, AutoSAT can generate new efficient heuristics. In this first attempt +at optimizing SAT solvers using LLMs, several strategies including the greedy +hill climber and (1+1) Evolutionary Algorithm are employed to guide LLMs to +search for better heuristics. Experimental results demonstrate that LLMs can +generally enhance the performance of CDCL solvers. A realization of AutoSAT +outperforms MiniSat on 9 out of 12 datasets and even surpasses the +state-of-the-art hybrid solver Kissat on 4 datasets. + +
+
+
+
+
+ + ♻ ☆ Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting + Volunteer Content Moderators + + +
+ Extensive efforts in automated approaches for content moderation have been +focused on developing models to identify toxic, offensive, and hateful content +with the aim of lightening the load for moderators. Yet, it remains uncertain +whether improvements on those tasks have truly addressed moderators' needs in +accomplishing their work. In this paper, we surface gaps between past research +efforts that have aimed to provide automation for aspects of content moderation +and the needs of volunteer content moderators, regarding identifying violations +of various moderation rules. To do so, we conduct a model review on Hugging +Face to reveal the availability of models to cover various moderation rules and +guidelines from three exemplar forums. We further put state-of-the-art LLMs to +the test, evaluating how well these models perform in flagging violations of +platform rules from one particular forum. Finally, we conduct a user survey +study with volunteer moderators to gain insight into their perspectives on +useful moderation models. Overall, we observe a non-trivial gap, as missing +developed models and LLMs exhibit moderate to low performance on a significant +portion of the rules. Moderators' reports provide guides for future work on +developing moderation assistant models. + +
+
+
+
+
+ + ♻ ☆ Extending choice assessments to choice functions: An algorithm for + computing the natural extension + + +
+ We study how to infer new choices from prior choices using the framework of +choice functions, a unifying mathematical framework for decision-making based +on sets of preference orders. In particular, we define the natural (most +conservative) extension of a given choice assessment to a coherent choice +function -- whenever possible -- and use this natural extension to make new +choices. We provide a practical algorithm for computing this natural extension +and various ways to improve scalability. Finally, we test these algorithms for +different types of choice assessments. + +
+
+ comment: 40 pages, 8 figures, pre-print for International Journal of + Approximate Reasoning +
+
+
+
+
+ + ♻ ☆ Controlling Large Electric Vehicle Charging Stations via User Behavior + Modeling and Stochastic Programming + + +
+ This paper introduces an Electric Vehicle Charging Station (EVCS) model that +incorporates real-world constraints, such as slot power limitations, contract +threshold overruns penalties, or early disconnections of electric vehicles +(EVs). We propose a formulation of the problem of EVCS control under +uncertainty, and implement two Multi-Stage Stochastic Programming approaches +that leverage user-provided information, namely, Model Predictive Control and +Two-Stage Stochastic Programming. The model addresses uncertainties in charging +session start and end times, as well as in energy demand. A user's behavior +model based on a sojourn-time-dependent stochastic process enhances cost +reduction while maintaining customer satisfaction. The benefits of the two +proposed methods are showcased against two baselines over a 22-day simulation +using a real-world dataset. The two-stage approach demonstrates robustness +against early disconnections by considering a wider range of uncertainty +scenarios for optimization. The algorithm prioritizing user satisfaction over +electricity cost achieves a 20% and 36% improvement in two user satisfaction +metrics compared to an industry-standard baseline. Additionally, the algorithm +striking the best balance between cost and user satisfaction exhibits a mere 3% +relative cost increase compared to the theoretically optimal baseline - for +which the nonanticipativity constraint is relaxed - while attaining 94% and 84% +of the user satisfaction performance in the two used satisfaction metrics. + +
+
+
+
+
+ + ♻ ☆ Morphological Symmetries in Robotics + + +
+ We present a comprehensive framework for studying and leveraging +morphological symmetries in robotic systems. These are intrinsic properties of +the robot's morphology, frequently observed in animal biology and robotics, +which stem from the replication of kinematic structures and the symmetrical +distribution of mass. We illustrate how these symmetries extend to the robot's +state space and both proprioceptive and exteroceptive sensor measurements, +resulting in the equivariance of the robot's equations of motion and optimal +control policies. Thus, we recognize morphological symmetries as a relevant and +previously unexplored physics-informed geometric prior, with significant +implications for both data-driven and analytical methods used in modeling, +control, estimation and design in robotics. For data-driven methods, we +demonstrate that morphological symmetries can enhance the sample efficiency and +generalization of machine learning models through data augmentation, or by +applying equivariant/invariant constraints on the model's architecture. In the +context of analytical methods, we employ abstract harmonic analysis to +decompose the robot's dynamics into a superposition of lower-dimensional, +independent dynamics. We substantiate our claims with both synthetic and +real-world experiments conducted on bipedal and quadrupedal robots. Lastly, we +introduce the repository MorphoSymm to facilitate the practical use of the +theory and applications outlined in this work. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Into the Fog: Evaluating Robustness of Multiple Object Tracking + + +
+ State-of-the-art Multiple Object Tracking (MOT) approaches have shown +remarkable performance when trained and evaluated on current benchmarks. +However, these benchmarks primarily consist of clear weather scenarios, +overlooking adverse atmospheric conditions such as fog, haze, smoke and dust. +As a result, the robustness of trackers against these challenging conditions +remains underexplored. To address this gap, we introduce a physics-based +volumetric fog simulation method for arbitrary MOT datasets, utilizing +frame-by-frame monocular depth estimation and a fog formation optical model. We +enhance our simulation by rendering both homogeneous and heterogeneous fog and +propose to use the dark channel prior method to estimate atmospheric light, +showing promising results even in night and indoor scenes. We present the +leading benchmark MOTChallenge (third release) augmented with fog (smoke for +indoor scenes) of various intensities and conduct a comprehensive evaluation of +MOT methods, revealing their limitations under fog and fog-like challenges. + +
+
+
+
+
+ + ♻ ☆ Online Dynamic Pricing for Electric Vehicle Charging Stations with + Reservations + + +
+ The transition to electric vehicles (EVs), coupled with the rise of renewable +energy sources, will significantly impact the electric grid. Unlike +conventional fuel sources, electricity for EVs is constrained by grid capacity, +price fluctuations, and long EV charging times, requiring new pricing solutions +to manage demand and supply. This paper proposes a model for online dynamic +pricing of reserved EV charging services, including reservation, parking, and +charging as a bundled service priced as a whole. Our approach focuses on the +individual charging station operator, employing a stochastic demand model and +online dynamic pricing based on expected demand. The proposed model uses a +Markov Decision Process (MDP) formulation to optimize sequential pricing +decisions for charging session requests. A key contribution is the novel +definition and quantification of discretization error introduced by the +discretization of the Poisson process for use in the MDP. The model's viability +is demonstrated with a heuristic solution method based on Monte-Carlo tree +search, offering a viable path for real-world application. + +
+
+ comment: 45 pages, 11 figures, prepared for submission to IEEE Transactions on + Intelligent Transportation Systems (T-ITS) +
+
+
+
+
+ + ♻ ☆ Investigating the Effectiveness of Explainability Methods in Parkinson's + Detection from Speech + + +
+ Speech impairments in Parkinson's disease (PD) provide significant early +indicators for diagnosis. While models for speech-based PD detection have shown +strong performance, their interpretability remains underexplored. This study +systematically evaluates several explainability methods to identify PD-specific +speech features, aiming to support the development of accurate, interpretable +models for clinical decision-making in PD diagnosis and monitoring. Our +methodology involves (i) obtaining attributions and saliency maps using +mainstream interpretability techniques, (ii) quantitatively evaluating the +faithfulness of these maps and their combinations obtained via union and +intersection through a range of established metrics, and (iii) assessing the +information conveyed by the saliency maps for PD detection from an auxiliary +classifier. Our results reveal that, while explanations are aligned with the +classifier, they often fail to provide valuable information for domain experts. + +
+
+ comment: The first two authors contributed equally to this research: author + order is alphabetical +
+
+
+
+
+ + ♻ ☆ GeSubNet: Gene Interaction Inference for Disease Subtype Network + Generation ICLR 2025 + + +
+ Retrieving gene functional networks from knowledge databases presents a +challenge due to the mismatch between disease networks and subtype-specific +variations. Current solutions, including statistical and deep learning methods, +often fail to effectively integrate gene interaction knowledge from databases +or explicitly learn subtype-specific interactions. To address this mismatch, we +propose GeSubNet, which learns a unified representation capable of predicting +gene interactions while distinguishing between different disease subtypes. +Graphs generated by such representations can be considered subtype-specific +networks. GeSubNet is a multi-step representation learning framework with three +modules: First, a deep generative model learns distinct disease subtypes from +patient gene expression profiles. Second, a graph neural network captures +representations of prior gene networks from knowledge databases, ensuring +accurate physical gene interactions. Finally, we integrate these two +representations using an inference loss that leverages graph generation +capabilities, conditioned on the patient separation loss, to refine +subtype-specific information in the learned representation. GeSubNet +consistently outperforms traditional methods, with average improvements of +30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged +over four cancer datasets. Particularly, we conduct a biological simulation +experiment to assess how the behavior of selected genes from over 11,000 +candidates affects subtypes or patient distributions. The results show that the +generated network has the potential to identify subtype-specific genes with an +83% likelihood of impacting patient distribution shifts. The GeSubNet resource +is available: https://anonymous.4open.science/r/GeSubNet/ + +
+
+ comment: Under review as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ V-LoL: A Diagnostic Dataset for Visual Logical Learning + + +
+ Despite the successes of recent developments in visual AI, different +shortcomings still exist; from missing exact logical reasoning, to abstract +generalization abilities, to understanding complex and noisy scenes. +Unfortunately, existing benchmarks were not designed to capture more than a +few of these aspects. Whereas deep learning datasets focus on visually complex +data but simple visual reasoning tasks, inductive logic datasets involve +complex logical learning tasks, however, lack the visual component. To address +this, we propose the diagnostic visual logical learning dataset, V-LoL, that +seamlessly combines visual and logical challenges. Notably, we introduce the +first instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic +benchmark in symbolic AI, the Michalski train problem. By incorporating +intricate visual scenes and flexible logical reasoning tasks within a versatile +framework, V-LoL-Train provides a platform for investigating a wide range of +visual logical learning challenges. We evaluate a variety of AI systems +including traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our +evaluations demonstrate that even SOTA AI faces difficulties in dealing with +visual logical learning challenges, highlighting unique advantages and +limitations of each methodology. Overall, V-LoL opens up new avenues for +understanding and enhancing current abilities in visual logical learning for AI +systems. + +
+
+
+
+
+ + ♻ ☆ Are Large Language Models Table-based Fact-Checkers? + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation +between statements and structured tables. Existing TFV methods based on +small-scaled models suffer from insufficient labeled data and weak zero-shot +ability. Recently, the appearance of Large Language Models (LLMs) has attracted +a lot of attention in research fields. They have shown powerful zero-shot and +in-context learning abilities on several NLP tasks, but their potential on TFV +is still unknown. In this work, we implement a preliminary study about whether +LLMs are table-based fact-checkers. In detail, we design diverse prompts to +explore how the in-context learning can help LLMs in TFV, i.e., zero-shot and +few-shot TFV capability. Besides, we carefully design and construct TFV +instructions to study the performance gain brought by the instruction tuning of +LLMs. Experimental results demonstrate that LLMs can achieve acceptable results +on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning +can stimulate the TFV capability significantly. We also make some valuable +findings about the format of zero-shot prompts and the number of in-context +examples. Finally, we analyze some possible directions to promote the accuracy +of TFV via LLMs, which is beneficial to further research of table reasoning. + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Target-driven Attack for Large Language Models + + +
+ Current large language models (LLM) provide a strong foundation for +large-scale user-oriented natural language tasks. Many users can easily inject +adversarial text or instructions through the user interface, thus causing LLM +model security challenges like the language model not giving the correct +answer. Although there is currently a large amount of research on black-box +attacks, most of these black-box attacks use random and heuristic strategies. +It is unclear how these strategies relate to the success rate of attacks and +thus effectively improve model robustness. To solve this problem, we propose +our target-driven black-box attack method to maximize the KL divergence between +the conditional probabilities of the clean text and the attack text to redefine +the attack's goal. We transform the distance maximization problem into two +convex optimization problems based on the attack goal to solve the attack text +and estimate the covariance. Furthermore, the projected gradient descent +algorithm solves the vector corresponding to the attack text. Our target-driven +black-box attack approach includes two attack strategies: token manipulation +and misinformation attack. Experimental results on multiple Large Language +Models and datasets demonstrate the effectiveness of our attack method. + +
+
+ comment: 12 pages, 7 figures. This work is an extension of the + arXiv:2404.07234 work. We propose new methods. 27th European Conference on + Artificial Intelligence 2024 +
+
+
+
+
+ + ♻ ☆ SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation EMNLP 2024 + + +
+ It is often desirable to distill the capabilities of large language models +(LLMs) into smaller student models due to compute and memory constraints. One +way to do this for classification tasks is via dataset synthesis, which can be +accomplished by generating examples of each label from the LLM. Prior +approaches to synthesis use few-shot prompting, which relies on the LLM's +parametric knowledge to generate usable examples. However, this leads to issues +of repetition, bias towards popular entities, and stylistic differences from +human text. In this work, we propose Synthesize by Retrieval and Refinement +(SynthesizRR), which uses retrieval augmentation to introduce variety into the +dataset synthesis process: as retrieved passages vary, the LLM is seeded with +different content to generate its examples. We empirically study the synthesis +of six datasets, covering topic classification, sentiment analysis, tone +detection, and humor, requiring complex synthesis strategies. We find that +SynthesizRR greatly improves lexical and semantic diversity, similarity to +human-written text, and distillation performance, when compared to 32-shot +prompting and four prior approaches. We release our code to perform all steps +at https://github.com/amazon-science/synthesizrr + +
+
+ comment: Published as a main conference paper at EMNLP 2024. Code available at + https://github.com/amazon-science/synthesizrr +
+
+
+
+
+ + ♻ ☆ Effective ML Model Versioning in Edge Networks + + +
+ Machine learning (ML) models, data and software need to be regularly updated +whenever essential version updates are released and feasible for integration. +This is a basic but most challenging requirement to satisfy in the edge, due to +the various system constraints and the major impact that an update can have on +robustness and stability. In this paper, we formulate for the first time the ML +model versioning optimization problem, and propose effective solutions, +including the update automation with reinforcement learning (RL) based +algorithm. We study the edge network environment due to the known constraints +in performance, response time, security, and reliability, which make updates +especially challenging. The performance study shows that model version updates +can be fully and effectively automated with reinforcement learning method. We +show that for every range of server load values, the proper versioning can be +found that improves security, reliability and/or ML model accuracy, while +assuring a comparably lower response time. + +
+
+ comment: This paper is uploaded here for research community, thus it is for + non-commercial purposes +
+
+
+
+
+ + ♻ ☆ Vikhr: Constructing a State-of-the-art Bilingual Open-Source + Instruction-Following Large Language Model for Russian EMNLP-2024 + + +
+ There has been a surge in developing various Large Language Models (LLMs). +However, text generation for languages other than English often faces +significant challenges, including poor generation quality and reduced +computational performance due to the disproportionate representation of tokens +in the model's vocabulary. In this work, we address these issues by developing +a pipeline for adapting English-oriented pre-trained models to other languages +and constructing efficient bilingual LLMs. Using this pipeline, we construct +Vikhr, a state-of-the-art bilingual open-source instruction-following LLM +designed specifically for the Russian language. "Vikhr" refers to the name of +the Mistral LLM series and means a "strong gust of wind." Unlike previous +Russian-language models that typically rely on LoRA adapters on top of +English-oriented models, sacrificing performance for lower training costs, +Vikhr features an adapted tokenizer vocabulary and undergoes continued +pre-training and instruction tuning of all weights. This not only enhances the +model's performance but also significantly improves its computational and +contextual efficiency. The remarkable performance of Vikhr across various +Russian-language benchmarks can also be attributed to our efforts in expanding +instruction datasets and corpora for continued pre-training. Vikhr not only +sets a new state of the art among open-source LLMs for Russian but even +outperforms some proprietary closed-source models on certain benchmarks. The +model weights, instruction sets, and code are publicly available. + +
+
+ comment: Accepted at WMRL @ EMNLP-2024 +
+
+
+
+
+ + ♻ ☆ Automatic dataset shift identification to support root cause analysis of + AI performance drift + + +
+ Shifts in data distribution can substantially harm the performance of +clinical AI models. Hence, various methods have been developed to detect the +presence of such shifts at deployment time. However, root causes of dataset +shifts are varied, and the choice of shift mitigation strategies is highly +dependent on the precise type of shift encountered at test time. As such, +detecting test-time dataset shift is not sufficient: precisely identifying +which type of shift has occurred is critical. In this work, we propose the +first unsupervised dataset shift identification framework, effectively +distinguishing between prevalence shift (caused by a change in the label +distribution), covariate shift (caused by a change in input characteristics) +and mixed shifts (simultaneous prevalence and covariate shifts). We discuss the +importance of self-supervised encoders for detecting subtle covariate shifts +and propose a novel shift detector leveraging both self-supervised encoders and +task model outputs for improved shift detection. We report promising results +for the proposed shift identification framework across three different imaging +modalities (chest radiography, digital mammography, and retinal fundus images) +on five types of real-world dataset shifts, using four large publicly available +datasets. + +
+
+ comment: Code available at + https://github.com/biomedia-mira/shift_identification +
+
+
+
+
+ + ♻ ☆ Active learning of digenic functions with boolean matrix logic + programming + + +
+ We apply logic-based machine learning techniques to facilitate cellular +engineering and drive biological discovery, based on comprehensive databases of +metabolic processes called genome-scale metabolic network models (GEMs). +Predicted host behaviours are not always correctly described by GEMs. Learning +the intricate genetic interactions within GEMs presents computational and +empirical challenges. To address these, we describe a novel approach called +Boolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to +evaluate large logic programs. We introduce a new system, $BMLP_{active}$, +which efficiently explores the genomic hypothesis space by guiding informative +experimentation through active learning. In contrast to sub-symbolic methods, +$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial +host in an interpretable and logical representation using datalog logic +programs. Notably, $BMLP_{active}$ can successfully learn the interaction +between a gene pair with fewer training examples than random experimentation, +overcoming the increase in experimental design space. $BMLP_{active}$ enables +rapid optimisation of metabolic models and offers a realistic approach to a +self-driving lab for microbial engineering. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.06724 +
+
+
+
+
+ + ♻ ☆ From Text to Treatment Effects: A Meta-Learning Approach to Handling + Text-Based Confounding NeurIPS 2024 + + +
+ One of the central goals of causal machine learning is the accurate +estimation of heterogeneous treatment effects from observational data. In +recent years, meta-learning has emerged as a flexible, model-agnostic paradigm +for estimating conditional average treatment effects (CATE) using any +supervised model. This paper examines the performance of meta-learners when the +confounding variables are expressed in text. Through synthetic data +experiments, we show that learners using pre-trained text representations of +confounders, in addition to tabular background variables, achieve improved CATE +estimates compared to those relying solely on the tabular variables, +particularly when sufficient data is available. However, due to the entangled +nature of the text embeddings, these models do not fully match the performance +of meta-learners with perfect confounder knowledge. These findings highlight +both the potential and the limitations of pre-trained text representations for +causal inference and open up interesting avenues for future research. + +
+
+ comment: Presented at the NeurIPS 2024 Workshop on Causal Representation + Learning +
+
+
+
+
+ + ♻ ☆ The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty + Quantification + + +
+ Tsetlin Machines (TMs) have emerged as a compelling alternative to +conventional deep learning methods, offering notable advantages such as smaller +memory footprint, faster inference, fault-tolerant properties, and +interpretability. Although various adaptations of TMs have expanded their +applicability across diverse domains, a fundamental gap remains in +understanding how TMs quantify uncertainty in their predictions. In response, +this paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed +at providing a robust, reliable, and interpretable approach for uncertainty +quantification. Unlike the original TM, the PTM learns the probability of +staying on each state of each Tsetlin Automaton (TA) across all clauses. These +probabilities are updated using the feedback tables that are part of the TM +framework: Type I and Type II feedback. During inference, TAs decide their +actions by sampling states based on learned probability distributions, akin to +Bayesian neural networks when generating weight values. In our experimental +analysis, we first illustrate the spread of the probabilities across TA states +for the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models +using both simulated and real-world datasets. The experiments on the simulated +dataset reveal the PTM's effectiveness in uncertainty quantification, +particularly in delineating decision boundaries and identifying regions of high +uncertainty. Moreover, when applied to multiclass classification tasks using +the Iris dataset, the PTM demonstrates competitive performance in terms of +predictive entropy and expected calibration error, showcasing its potential as +a reliable tool for uncertainty estimation. Our findings underscore the +importance of selecting appropriate models for accurate uncertainty +quantification in predictive tasks, with the PTM offering a particularly +interpretable and effective solution. + +
+
+ comment: 12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024, + London +
+
+
+
+
+ + ♻ ☆ Exact, Tractable Gauss-Newton Optimization in Deep Reversible + Architectures Reveal Poor Generalization NeurIPS 2024 + + +
+ Second-order optimization has been shown to accelerate the training of deep +neural networks in many applications, often yielding faster progress per +iteration on the training loss compared to first-order optimizers. However, the +generalization properties of second-order methods are still being debated. +Theoretical investigations have proved difficult to carry out outside the +tractable settings of heavily simplified model classes -- thus, the relevance +of existing theories to practical deep learning applications remains unclear. +Similarly, empirical studies in large-scale models and real datasets are +significantly confounded by the necessity to approximate second-order updates +in practice. It is often unclear whether the observed generalization behaviour +arises specifically from the second-order nature of the parameter updates, or +instead reflects the specific structured (e.g.\ Kronecker) approximations used +or any damping-based interpolation towards first-order updates. Here, we show +for the first time that exact Gauss-Newton (GN) updates take on a tractable +form in a class of deep reversible architectures that are sufficiently +expressive to be meaningfully applied to common benchmark datasets. We exploit +this novel setting to study the training and generalization properties of the +GN optimizer. We find that exact GN generalizes poorly. In the mini-batch +training setting, this manifests as rapidly saturating progress even on the +\emph{training} loss, with parameter updates found to overfit each +mini-batch without producing the features that would support generalization +to other mini-batches. We show that our experiments run in the ``lazy'' regime, +in which the neural tangent kernel (NTK) changes very little during the course +of training. This behaviour is associated with having no significant changes in +neural representations, explaining the lack of generalization. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Review of Electromagnetic Elimination Methods for low-field portable + MRI scanner + + +
+ This paper analyzes conventional and deep learning methods for eliminating +electromagnetic interference (EMI) in MRI systems. We compare traditional +analytical and adaptive techniques with advanced deep learning approaches. Key +strengths and limitations of each method are highlighted. Recent advancements +in active EMI elimination, such as external EMI receiver coils, are discussed +alongside deep learning methods, which show superior EMI suppression by +leveraging neural networks trained on MRI data. While deep learning improves +EMI elimination and diagnostic capabilities, it introduces security and safety +concerns, particularly in commercial applications. A balanced approach, +integrating conventional reliability with deep learning's advanced +capabilities, is proposed for more effective EMI suppression in MRI systems. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ An Axiomatic Study of the Evaluation of Enthymeme Decoding in Weighted + Structured Argumentation + + +
+ An argument can be seen as a pair consisting of a set of premises and a claim +supported by them. Arguments used by humans are often enthymemes, i.e., some +premises are implicit. To better understand, evaluate, and compare enthymemes, +it is essential to decode them, i.e., to find the missing premises. Many +enthymeme decodings are possible. We need to distinguish between reasonable +decodings and unreasonable ones. However, there is currently no research in the +literature on "How to evaluate decodings?". To pave the way and achieve this +goal, we introduce seven criteria related to decoding, based on different +research areas. Then, we introduce the notion of criterion measure, the +objective of which is to evaluate a decoding with regard to a certain +criterion. Since such measures need to be validated, we introduce several +desirable properties for them, called axioms. Another main contribution of the +paper is the construction of certain criterion measures that are validated by +our axioms. Such measures can be used to identify the best enthymeme +decodings. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning + + +
+ Deep long-tailed recognition has been widely studied to address the issue of +imbalanced data distributions in real-world scenarios. However, there has been +insufficient focus on the design of neural architectures, despite empirical +evidence suggesting that architecture can significantly impact performance. In +this paper, we attempt to mitigate long-tailed issues through architectural +improvements. To simplify the design process, we utilize Differential +Architecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS +methods struggle to perform well in long-tailed scenarios. To tackle this +challenge, we introduce Long-Tailed Differential Architecture Search +(LT-DARTS). Specifically, we conduct extensive experiments to explore +architectural components that demonstrate better performance on long-tailed +data and propose a new search space based on our observations. This ensures +that the architecture obtained through our search process incorporates superior +components. Additionally, we propose replacing the learnable linear classifier +with an Equiangular Tight Frame (ETF) classifier to further enhance our method. +This classifier effectively alleviates the biased search process and prevents +performance collapse. Extensive experimental evaluations demonstrate that our +approach consistently improves upon existing methods from an orthogonal +perspective and achieves state-of-the-art results with simple enhancements. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding +performance in violence surveillance tasks. However, most of them were +black-box systems which faced challenges regarding explainability during +training and inference processes. An important question is how to incorporate +explicit knowledge into these implicit models, thereby designing expert-driven +and interpretable violence surveillance systems. This paper proposes a new +paradigm for weakly supervised violence monitoring (WSVM) called Rule base +Violence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure +with different designs for images and text. One of the branches is called the +implicit branch, which uses only visual features for coarse-grained binary +classification. In this branch, image feature extraction is divided into two +channels: one responsible for extracting scene frames and the other focusing on +extracting actions. The other branch is called the explicit branch, which +utilizes language-image alignment to perform fine-grained classification. For +the language channel design in the explicit branch, the proposed RuleCLIP uses +the state-of-the-art YOLO-World model to detect objects in video frames, and +association rules are identified through data mining methods as descriptions of +the video. Leveraging the dual-branch architecture, RuleVM achieves +interpretable coarse-grained and fine-grained violence surveillance. Extensive +experiments were conducted on two commonly used benchmarks, and the results +show that RuleCLIP achieved the best performance in both coarse-grained and +fine-grained monitoring, significantly outperforming existing state-of-the-art +methods. Moreover, interpretability experiments uncovered some interesting +rules, such as the observation that as the number of people increases, the risk +level of violent behavior also rises. + +
+
+ comment: 12 pages,7 figures IEEE TSMCA (Under review) +
+
+
+
+
+ + ♻ ☆ pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2 + + +
+ Recent advancements in protein structure prediction, particularly AlphaFold2, +have revolutionized structural biology by achieving near-experimental accuracy +($\text{average RMSD} < 1.5\text{\AA}$). However, the computational demands of +these models (approximately 30 minutes per protein on an RTX 4090) +significantly limit their application in high-throughput protein screening. +While large language models like ESM (Evolutionary Scale Modeling) have shown +promise in extracting structural information directly from protein sequences, +rapid assessment of protein structure quality for large-scale analyses remains +a major challenge. + We introduce pLDDT-Predictor, a high-speed protein screening tool that +achieves a $250,000\times$ speedup compared to AlphaFold2 by leveraging +pre-trained ESM2 protein embeddings and a Transformer architecture. Our model +predicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores +with a Pearson correlation of 0.7891 and processes proteins in just 0.007 +seconds on average. Using a comprehensive dataset of 1.5 million diverse +protein sequences (ranging from 50 to 2048 amino acids), we demonstrate that +pLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70) +with 91.2\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's +predictions. + The source code and pre-trained models are freely available at +\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research +community to perform rapid, large-scale protein structure quality assessments. + +
+
+ comment: 6 pages main topic, 8 pages including citations, 4 figures +
+
+
+
+
+ + ♻ ☆ LAuReL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce \emph{Learned Augmented Residual Layer} (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Data movement limits to frontier model training + + +
+ We present a theoretical model of distributed training, and use it to analyze +how far dense and sparse training runs can be scaled. Under our baseline +assumptions, given a three month training duration, data movement bottlenecks +begin to significantly lower hardware utilization for training runs exceeding +about $10^{28}$ FLOP, two orders of magnitude above the largest training run to +date, suggesting the arrival of fundamental barriers to scaling in three years +given recent rates of growth. A training run exceeding about $10^{31}$ FLOP is +infeasible even at low utilization. However, more aggressive batch size scaling +and/or shorter and fatter model shapes, if achievable, have the potential to +permit much larger training runs. + +
+
+
+
+
+ + ♻ ☆ Neural Network Verification with Branch-and-Bound for General + Nonlinearities + + +
+ Branch-and-bound (BaB) is among the most effective techniques for neural +network (NN) verification. However, existing works on BaB for NN verification +have mostly focused on NNs with piecewise linear activations, especially ReLU +networks. In this paper, we develop a general framework, named GenBaB, to +conduct BaB on general nonlinearities to verify NNs with general architectures, +based on linear bound propagation for NN verification. To decide which neuron +to branch, we design a new branching heuristic which leverages linear bounds as +shortcuts to efficiently estimate the potential improvement after branching. To +decide nontrivial branching points for general nonlinear functions, we propose +to pre-optimize branching points, which can be efficiently leveraged during +verification with a lookup table. We demonstrate the effectiveness of our +GenBaB on verifying a wide range of NNs, including NNs with activation +functions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving +multi-dimensional nonlinear operations such as multiplications in LSTMs and +Vision Transformers. Our framework also allows the verification of general +nonlinear computation graphs and enables verification applications beyond +simple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of +the latest $\alpha,\!\beta$-CROWN, the winner of the 4th and the 5th +International Verification of Neural Networks Competition (VNN-COMP 2023 and +2024). + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Spin glass model of in-context learning + + +
+ Large language models show a surprising in-context learning ability -- being +able to use a prompt to form a prediction for a query, yet without additional +training, in stark contrast to old-fashioned supervised learning. Providing a +mechanistic interpretation and linking the empirical phenomenon to physics are +thus challenging and remain unsolved. We study a simple yet expressive +transformer with linear attention and map this structure to a spin glass model +with real-valued spins, where the couplings and fields explain the intrinsic +disorder in data. The spin glass model explains how the weight parameters +interact with each other during pre-training, and further clarifies why an +unseen function can be predicted by providing only a prompt yet without further +training. Our theory reveals that for single-instance learning, increasing the +task diversity leads to the emergence of in-context learning, by allowing the +Boltzmann distribution to converge to a unique correct solution of weight +parameters. Therefore the pre-trained transformer displays a prediction power +in a novel prompt setting. The proposed analytically tractable model thus +offers a promising avenue for thinking about how to interpret many intriguing +but puzzling properties of large language models. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Towards Reliable Evaluation of Neural Program Repair with Natural + Robustness Testing + + +
+ In this paper, we propose shifting the focus of robustness evaluation for +Neural Program Repair (NPR) techniques toward naturally-occurring data +transformations. To accomplish this, we first examine the naturalness of +semantic-preserving transformations through a two-stage human study. This study +includes (1) interviews with senior software developers to establish concrete +criteria for evaluating the naturalness of these transformations, and (2) a +survey involving 10 developers to assess the naturalness of 1,178 +transformations, i.e., pairs of original and transformed programs, applied to +225 real-world bugs. Our findings show that only 60% of these transformations +are deemed natural, while 20% are considered unnatural, with strong agreement +among annotators. Moreover, the unnaturalness of these transformations +significantly impacts both their applicability to benchmarks and the +conclusions drawn from robustness testing. Next, we conduct natural robustness +testing on NPR techniques to assess their true effectiveness against real-world +data variations. Our experimental results reveal a substantial number of +prediction changes in NPR techniques, leading to significant reductions in both +plausible and correct patch rates when comparing performance on the original +and transformed datasets. Additionally, we observe notable differences in +performance improvements between NPR techniques, suggesting potential biases on +NPR evaluation introduced by limited datasets. Finally, we propose an LLM-based +metric to automate the assessment of transformation naturalness, ensuring the +scalability of natural robustness testing. + +
+
+
+
+
+ + ♻ ☆ Mitigating Gradient Overlap in Deep Residual Networks with Gradient + Normalization for Improved Non-Convex Optimization + + +
+ In deep learning, Residual Networks (ResNets) have proven effective in +addressing the vanishing gradient problem, allowing for the successful training +of very deep networks. However, skip connections in ResNets can lead to +gradient overlap, where gradients from both the learned transformation and the +skip connection combine, potentially resulting in overestimated gradients. This +overestimation can cause inefficiencies in optimization, as some updates may +overshoot optimal regions, affecting weight updates. To address this, we +examine Z-score Normalization (ZNorm) as a technique to manage gradient +overlap. ZNorm adjusts the gradient scale, standardizing gradients across +layers and reducing the negative impact of overlapping gradients. Our +experiments demonstrate that ZNorm improves training process, especially in +non-convex optimization scenarios common in deep learning, where finding +optimal solutions is challenging. These findings suggest that ZNorm can affect +the gradient flow, enhancing performance in large-scale data processing where +accuracy is critical. + +
+
+
+
+
+ + ♻ ☆ Doubly Mild Generalization for Offline Reinforcement Learning NeurIPS 2024 + + +
+ Offline Reinforcement Learning (RL) suffers from the extrapolation error and +value overestimation. From a generalization perspective, this issue can be +attributed to the over-generalization of value functions or policies towards +out-of-distribution (OOD) actions. Significant efforts have been devoted to +mitigating such generalization, and recent in-sample learning approaches have +further succeeded in entirely eschewing it. Nevertheless, we show that mild +generalization beyond the dataset can be trusted and leveraged to improve +performance under certain conditions. To appropriately exploit generalization +in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild +action generalization and (ii) mild generalization propagation. The former +refers to selecting actions in a close neighborhood of the dataset to maximize +the Q values. Even so, the potential erroneous generalization can still be +propagated, accumulated, and exacerbated by bootstrapping. In light of this, +the latter concept is introduced to mitigate the generalization propagation +without impeding the propagation of RL learning signals. Theoretically, DMG +guarantees better performance than the in-sample optimal policy in the oracle +generalization scenario. Even under worst-case generalization, DMG can still +control value overestimation at a certain level and lower bound the +performance. Empirically, DMG achieves state-of-the-art performance across +Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting +from its flexibility in both generalization aspects, DMG enjoys a seamless +transition from offline to online learning and attains strong online +fine-tuning performance. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Arrhythmia Classification Using Graph Neural Networks Based on + Correlation Matrix + + +
+ With the advancements in graph neural network, there has been increasing +interest in applying this network to ECG signal analysis. In this study, we +generated an adjacency matrix using correlation matrix of extracted features +and applied a graph neural network to classify arrhythmias. The proposed model +was compared with existing approaches from the literature. The results +demonstrated that precision and recall for all arrhythmia classes exceeded 50%, +suggesting that this method can be considered an approach for arrhythmia +classification. + +
+
+ comment: Accepted for BIBM 2024 AIBH Workshop +
+
+
+
+
+ + ♻ ☆ Evaluating AI-Generated Essays with GRE Analytical Writing Assessment + + +
+ The recent revolutionary advance in generative AI enables the generation of +realistic and coherent texts by large language models (LLMs). Despite many +existing evaluation metrics on the quality of the generated texts, there is +still a lack of rigorous assessment of how well LLMs perform in complex and +demanding writing assessments. This study examines essays generated by ten +leading LLMs for the analytical writing assessment of the Graduate Record Exam +(GRE). We assessed these essays using both human raters and the e-rater +automated scoring engine as used in the GRE scoring pipeline. Notably, the +top-performing Gemini and GPT-4o received an average score of 4.78 and 4.67, +respectively, falling between "generally thoughtful, well-developed analysis of +the issue and conveys meaning clearly" and "presents a competent analysis of +the issue and conveys meaning with acceptable clarity" according to the GRE +scoring guideline. We also evaluated the detection accuracy of these essays, +with detectors trained on essays generated by the same and different LLMs. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and + Tabnet with SMOTEENN + + +
+ Bank credit risk is a significant challenge in modern financial transactions, +and the ability to identify qualified credit card holders among a large number +of applicants is crucial for the profitability of a bank's credit card +business. In the past, screening applicants' conditions often +required a significant amount of manual labor, which was time-consuming and +labor-intensive. Although the accuracy and reliability of previously used ML +models have been continuously improving, the pursuit of more reliable and +powerful AI intelligent models is undoubtedly the unremitting pursuit by major +banks in the financial industry. In this study, we used a dataset of over +40,000 records provided by a commercial bank as the research object. We +compared various dimensionality reduction techniques such as PCA and T-SNE for +preprocessing high-dimensional datasets and performed in-depth adaptation and +tuning of distributed models such as LightGBM and XGBoost, as well as deep +models like Tabnet. After a series of research and processing, we obtained +excellent research results by combining SMOTEENN with these techniques. The +experiments demonstrated that LightGBM combined with PCA and SMOTEENN +techniques can assist banks in accurately predicting potential high-quality +customers, showing relatively outstanding performance compared to other models. + +
+
+ comment: 8 pages on IEEE ICPICS +
+
+
+
+
+ + ♻ ☆ Fair Summarization: Bridging Quality and Diversity in Extractive + Summaries NeurIPS 2024 + + +
+ Fairness in multi-document summarization of user-generated content remains a +critical challenge in natural language processing (NLP). Existing summarization +methods often fail to ensure equitable representation across different social +groups, leading to biased outputs. In this paper, we introduce two novel +methods for fair extractive summarization: FairExtract, a clustering-based +approach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints. +We evaluate these methods using Divsumm summarization dataset of White-aligned, +Hispanic, and African-American dialect tweets and compare them against relevant +baselines. The results obtained using a comprehensive set of summarization +quality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well +as a fairness metric F, demonstrate that FairExtract and FairGPT achieve +superior fairness while maintaining competitive summarization quality. +Additionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that +integrate quality and fairness into a single evaluation framework, offering a +more nuanced understanding of the trade-offs between these objectives. This +work highlights the importance of fairness in summarization and sets a +benchmark for future research in fairness-aware NLP models. + +
+
+ comment: Accepted at Algorithmic Fairness through the Lens of Metrics and + Evaluation Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring + + +
+ Coronary artery disease (CAD) is one of the most common causes of mortality in +the world. Coronary artery calcium (CAC) scoring using computed tomography (CT) +is key for risk assessment to prevent coronary disease. Previous studies on +risk assessment and calcification detection in CT scans primarily use +approaches based on UNET architecture, frequently implemented on pre-built +models. However, these models are limited by the availability of annotated CT +scans containing CAC and suffer from imbalanced datasets, decreasing the +performance of CAC segmentation and scoring. In this study, we extend this +approach by incorporating the self-supervised learning (SSL) technique of DINO +(self-distillation with no labels) to eliminate limitations of scarce annotated +data in CT scans. The DINO model's ability to train without requiring CAC area +annotations enhances its robustness in generating distinct features. The DINO +model is trained to focus specifically on calcified areas by using labels, +aiming to generate features that effectively capture and highlight key +characteristics. The label-guided DINO (DINO-LG) enhances classification by +distinguishing CT slices that contain calcification from those that do not, +performing 57% better than the standard DINO model in this task. CAC scoring +and segmentation tasks are performed by a basic U-NET architecture, fed +specifically with CT slices containing calcified areas as identified by the +DINO-LG model. This targeted identification performed by the DINO-LG model improves +CAC segmentation performance by approximately 10% and significantly increases +CAC scoring accuracy. + +
+
+ comment: Developed by Center for Applied Artificial Intelligence (CAAI), + University of Kentucky +
+
+
+
+
+ + ♻ ☆ Personalize to generalize: Towards a universal medical multi-modality + generalization through personalization + + +
+ The differences among medical imaging modalities, driven by distinct +underlying principles, pose significant challenges for generalization in +multi-modal medical tasks. Beyond modality gaps, individual variations, such as +differences in organ size and metabolic rate, further impede a model's ability +to generalize effectively across both modalities and diverse populations. +Despite the importance of personalization, existing approaches to multi-modal +generalization often neglect individual differences, focusing solely on common +anatomical features. This limitation may result in weakened generalization in +various medical tasks. In this paper, we unveil that personalization is +critical for multi-modal generalization. Specifically, we propose an approach +to achieve personalized generalization through approximating the underlying +personalized invariant representation ${X}_h$ across various modalities by +leveraging individual-level constraints and a learnable biological prior. We +validate the feasibility and benefits of learning a personalized ${X}_h$, +showing that this representation is highly generalizable and transferable +across various multi-modal medical tasks. Extensive experimental results +consistently show that the additionally incorporated personalization +significantly improves performance and generalization across diverse scenarios, +confirming its effectiveness. + +
+
+
+
+
+ + ♻ ☆ On-demand Cold Start Frequency Reduction with Off-Policy Reinforcement + Learning in Serverless Computing + + +
+ Function-as-a-Service (FaaS) is a cloud computing paradigm offering an +event-driven execution model to applications. It features serverless attributes +by eliminating resource management responsibilities from developers, and offers +transparent and on-demand scalability of applications. To provide seamless +on-demand scalability, new function instances are prepared to serve the +incoming workload in the absence or unavailability of function instances. +However, FaaS platforms are known to suffer from cold starts, where this +function provisioning process introduces a non-negligible delay in function +response and reduces the end-user experience. Therefore, the presented work +focuses on reducing the frequent, on-demand cold starts on the platform by +using Reinforcement Learning (RL). The proposed approach uses model-free +Q-learning that considers function metrics such as CPU utilization, existing +function instances, and response failure rate, to proactively initialize +functions, in advance, based on the expected demand. The proposed solution is +implemented on Kubeless and evaluated using an open-source function invocation +trace applied to a matrix multiplication function. The evaluation results +demonstrate a favourable performance of the RL-based agent when compared to +Kubeless' default policy and a function keep-alive policy by improving +throughput by up to 8.81% and reducing computation load and resource wastage by +up to 55% and 37%, respectively, which is a direct outcome of reduced cold +starts. + +
+
+ comment: 13 figures, 24 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ ShaRP: A Novel Feature Importance Framework for Ranking + + +
+ Algorithmic decisions in critical domains such as hiring, college admissions, +and lending are often based on rankings. Because of the impact these decisions +have on individuals, organizations, and population groups, there is a need to +understand them: to help individuals improve their position in a ranking, +design better ranking procedures, and check whether a procedure is legally +compliant. In this paper, we present ShaRP -- Shapley for Rankings and +Preferences -- a framework that explains the contributions of features to +different aspects of a ranked outcome and is based on Shapley values. Using +ShaRP, we show that even when the scoring function used by an algorithmic +ranker is known and linear, the feature weights do not correspond to their +Shapley value contribution. The contributions instead depend on the feature +distributions and the subtle local interactions between the scoring features. + ShaRP builds on the Quantitative Input Influence framework to compute the +contributions of features for multiple -- ranking specific -- Quantities of +Interest, including score, rank, pair-wise preference, and top-k. We show the +results of an extensive experimental validation of ShaRP using real and +synthetic datasets. We demonstrate that feature importance can be computed +efficiently, and that ShaRP compares favorably to several prior local feature +importance methods, in terms of both generality and quality of explanations. +Among our results, we highlight a case study on the CS Rankings dataset. +Contrary to expectation, we find that a strong track record in Systems research +is much more important than AI research for placing a CS department among the +top-10%. ShaRP is available at +https://github.com/DataResponsibly/ShaRP. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Probabilistic Emulation of a Global Climate Model with Spherical + DYffusion NeurIPS 2024 + + +
+ Data-driven deep learning models are transforming global weather forecasting. +It is an open question if this success can extend to climate modeling, where +the complexity of the data and long inference rollouts pose significant +challenges. Here, we present the first conditional generative model that +produces accurate and physically consistent global climate ensemble simulations +by emulating a coarse version of the United States' primary operational global +forecast model, FV3GFS. Our model integrates the dynamics-informed diffusion +framework (DYffusion) with the Spherical Fourier Neural Operator (SFNO) +architecture, enabling stable 100-year simulations at 6-hourly timesteps while +maintaining low computational overhead compared to single-step deterministic +baselines. The model achieves near gold-standard performance for climate model +emulation, outperforming existing approaches and demonstrating promising +ensemble skill. This work represents a significant advance towards efficient, +data-driven climate simulations that can enhance our understanding of the +climate system and inform adaptation strategies. + +
+
+ comment: NeurIPS 2024; Code is available at + https://github.com/Rose-STL-Lab/spherical-dyffusion +
+
+
+
+
+ + ♻ ☆ FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning + in AI + + +
+ We introduce FrontierMath, a benchmark of hundreds of original, exceptionally +challenging mathematics problems crafted and vetted by expert mathematicians. +The questions cover most major branches of modern mathematics -- from +computationally intensive problems in number theory and real analysis to +abstract questions in algebraic geometry and category theory. Solving a typical +problem requires multiple hours of effort from a researcher in the relevant +branch of mathematics, and for the upper end questions, multiple days. +FrontierMath uses new, unpublished problems and automated verification to +reliably evaluate models while minimizing risk of data contamination. Current +state-of-the-art AI models solve under 2% of problems, revealing a vast gap +between AI capabilities and the prowess of the mathematical community. As AI +systems advance toward expert-level mathematical abilities, FrontierMath offers +a rigorous testbed that quantifies their progress. + +
+
+
+
+
+ + ♻ ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting ECCV 2024 + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with diffusion prior, they remain struggling to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ♻ ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Neural-Rendezvous: Provably Robust Guidance and Control to Encounter + Interstellar Objects + + +
+ Interstellar objects (ISOs) are likely representatives of primitive materials +invaluable in understanding exoplanetary star systems. Due to their poorly +constrained orbits with generally high inclinations and relative velocities, +however, exploring ISOs with conventional human-in-the-loop approaches is +significantly challenging. This paper presents Neural-Rendezvous -- a deep +learning-based guidance and control framework for encountering fast-moving +objects, including ISOs, robustly, accurately, and autonomously in real time. +It uses pointwise minimum norm tracking control on top of a guidance policy +modeled by a spectrally-normalized deep neural network, where its +hyperparameters are tuned with a loss function directly penalizing the MPC +state trajectory tracking error. We show that Neural-Rendezvous provides a high +probability exponential bound on the expected spacecraft delivery error, the +proof of which leverages stochastic incremental stability analysis. In +particular, it is used to construct a non-negative function with a +supermartingale property, explicitly accounting for the ISO state uncertainty +and the local nature of nonlinear state estimation guarantees. In numerical +simulations, Neural-Rendezvous is demonstrated to satisfy the expected error +bound for 100 ISO candidates. This performance is also empirically validated +using our spacecraft simulator and in high-conflict and distributed UAV swarm +reconfiguration with up to 20 UAVs. + +
+
+ comment: Preprint Version, Accepted: October, 2024 (One-minute YouTube + summary: https://youtu.be/q3e0LYS2IYQ, DOI: + https://doi.org/10.2514/1.G007671) +
+
+
+
+
+ + ♻ ☆ Trustful LLMs: Customizing and Grounding Text Generation with Knowledge + Bases and Dual Decoders + + +
+ Although people are impressed by the content generation skills of large +language models, the use of LLMs, such as ChatGPT, is limited by the domain +grounding of the content. The correctness and groundedness of the generated +content need to be based on a verified context, such as results from +Retrieval-Augmented Generation (RAG). One important issue when adapting LLMs to +a customized domain is that the generated responses are often incomplete, or +the additions are not verified and may even be hallucinated. Prior studies on +hallucination detection have focused on evaluation metrics, which are not +easily adaptable to dynamic domains and can be vulnerable to attacks like +jail-breaking. In this work, we propose 1) a post-processing algorithm that +leverages knowledge triplets in RAG context to correct hallucinations and 2) a +dual-decoder model that fuses RAG context to guide the generation process. + +
+
+
+
+
+ + ♻ ☆ Students' Perceptions and Use of Generative AI Tools for Programming + Across Different Computing Courses + + +
+ Investigation of students' perceptions and opinions on the use of generative +artificial intelligence (GenAI) in education is a topic gaining much interest. +Studies addressing this are typically conducted with large heterogeneous +groups, at one moment in time. However, how students perceive and use GenAI +tools can potentially depend on many factors, including their background +knowledge, familiarity with the tools, and the learning goals and policies of +the courses they are taking. + In this study we explore how students following computing courses use GenAI +for programming-related tasks across different programs and courses: Bachelor +and Master, in courses in which learning programming is the learning goal, +courses that require programming as a means to achieve another goal, and in +courses in which programming is optional, but can be useful. We are also +interested in changes over time, since GenAI capabilities are changing at a +fast pace, and users are adopting GenAI increasingly. + We conducted three consecutive surveys (fall `23, winter `23, and spring `24) +among students of all computing programs of a large European research +university. We asked questions on the use in education, ethics, and job +prospects, and we included specific questions on the (dis)allowed use of GenAI +tools in the courses they were taking at the time. + We received 264 responses, which we quantitatively and qualitatively +analyzed, to find out how students have employed GenAI tools across 59 +different computing courses, and whether the opinion of an average student +about these tools evolves over time. Our study contributes to the emerging +discussion of how to differentiate GenAI use across different courses, and how +to align its use with the learning goals of a computing course. + +
+
+ comment: Accepted to Koli Calling 24. Numbers in Table 1, row 1 updated +
+
+
+
+
+ + ♻ ☆ Beyond the Doors of Perception: Vision Transformers Represent Relations + Between Objects + + +
+ Though vision transformers (ViTs) have achieved state-of-the-art performance +in a variety of settings, they exhibit surprising failures when performing +tasks involving visual relations. This begs the question: how do ViTs attempt +to perform tasks that require computing visual relations between objects? Prior +efforts to interpret ViTs tend to focus on characterizing relevant low-level +visual features. In contrast, we adopt methods from mechanistic +interpretability to study the higher-level visual algorithms that ViTs use to +perform abstract visual reasoning. We present a case study of a fundamental, +yet surprisingly difficult, relational reasoning task: judging whether two +visual entities are the same or different. We find that pretrained ViTs +fine-tuned on this task often exhibit two qualitatively different stages of +processing despite having no obvious inductive biases to do so: 1) a perceptual +stage wherein local object features are extracted and stored in a disentangled +representation, and 2) a relational stage wherein object representations are +compared. In the second stage, we find evidence that ViTs can learn to +represent somewhat abstract visual relations, a capability that has long been +considered out of reach for artificial neural networks. Finally, we demonstrate +that failures at either stage can prevent a model from learning a generalizable +solution to our fairly simple tasks. By understanding ViTs in terms of discrete +processing stages, one can more precisely diagnose and rectify shortcomings of +existing and future models. + +
+
+
+
+
+ + ♻ ☆ Explainable AI through a Democratic Lens: DhondtXAI for Proportional + Feature Importance Using the D'Hondt Method + + +
+ In democratic societies, electoral systems play a crucial role in translating +public preferences into political representation. Among these, the D'Hondt +method is widely used to ensure proportional representation, balancing fair +representation with governmental stability. Recently, there has been a growing +interest in applying similar principles of proportional representation to +enhance interpretability in machine learning, specifically in Explainable AI +(XAI). This study investigates the integration of D'Hondt-based voting +principles in the DhondtXAI method, which leverages resource allocation +concepts to interpret feature importance within AI models. Through a comparison +of SHAP (Shapley Additive Explanations) and DhondtXAI, we evaluate their +effectiveness in feature attribution within CatBoost and XGBoost models for +breast cancer and diabetes prediction, respectively. The DhondtXAI approach +allows for alliance formation and thresholding to enhance interpretability, +representing feature importance as seats in a parliamentary view. Statistical +correlation analyses between SHAP values and DhondtXAI allocations support the +consistency of interpretations, demonstrating DhondtXAI's potential as a +complementary tool for understanding feature importance in AI models. The +results highlight that integrating electoral principles, such as proportional +representation and alliances, into AI explainability can improve user +understanding, especially in high-stakes fields like healthcare. + +
+
+
+
+
+ + ♻ ☆ Confidence Trigger Detection: Accelerating Real-time + Tracking-by-detection Systems + + +
+ Real-time object tracking necessitates a delicate balance between speed and +accuracy, a challenge exacerbated by the computational demands of deep learning +methods. In this paper, we propose Confidence-Triggered Detection (CTD), an +innovative approach that strategically bypasses object detection for frames +closely resembling intermediate states, leveraging tracker confidence scores. +CTD not only enhances tracking speed but also preserves accuracy, surpassing +existing tracking algorithms. Through extensive evaluation across various +tracker confidence thresholds, we identify an optimal trade-off between +tracking speed and accuracy, providing crucial insights for parameter +fine-tuning and enhancing CTD's practicality in real-world scenarios. Our +experiments across diverse detection models underscore the robustness and +versatility of the CTD framework, demonstrating its potential to enable +real-time tracking in resource-constrained environments. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+
+
+
+ + Computation and Language 62 + +
+
+
+ + ☆ The Limited Impact of Medical Adaptation of Large Language and + Vision-Language Models EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare ten +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting and supervised fine-tuning regimes for medical question-answering +(QA). For instance, across all tasks and model pairs we consider in the 3-shot +setting, medical LLMs only outperform their base models in 22.7% of cases, +reach a (statistical) tie in 36.8% of cases, and are significantly worse than +their base models in the remaining 40.5% of cases. Our conclusions are based on +(i) comparing each medical model head-to-head, directly against the +corresponding base model; (ii) optimizing the prompts for each model separately +in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty +in comparisons. While these basic practices are not consistently adopted in the +literature, our ablations show that they substantially impact conclusions. +Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs +can show performance improvements, but the benefits do not carry over to tasks +based on clinical notes. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes + additional results on clinical note QA tasks and supervised fine-tuning + evaluations +
+
+
+
+
+ + ☆ CamemBERT 2.0: A Smarter French Language Model Aged to Perfection + + +
+ French language models, such as CamemBERT, have been widely adopted across +industries for natural language processing (NLP) tasks, with models like +CamemBERT seeing over 4 million downloads per month. However, these models face +challenges due to temporal concept drift, where outdated training data leads to +a decline in performance, especially when encountering new topics and +terminology. This issue emphasizes the need for updated models that reflect +current linguistic trends. In this paper, we introduce two new versions of the +CamemBERT base model -- CamemBERTav2 and CamemBERTv2 -- designed to address these +challenges. CamemBERTav2 is based on the DeBERTaV3 architecture and makes use +of the Replaced Token Detection (RTD) objective for better contextual +understanding, while CamemBERTv2 is built on RoBERTa, which uses the Masked +Language Modeling (MLM) objective. Both models are trained on a significantly +larger and more recent dataset with longer context length and an updated +tokenizer that enhances tokenization performance for French. We evaluate the +performance of these models on both general-domain NLP tasks and +domain-specific applications, such as medical field tasks, demonstrating their +versatility and effectiveness across a range of use cases. Our results show +that these updated models vastly outperform their predecessors, making them +valuable tools for modern NLP systems. All our new models, as well as +intermediate checkpoints, are made openly available on Huggingface. + +
+
+
+
+
+ + ☆ Can sparse autoencoders be used to decompose and interpret steering + vectors? + + +
+ Steering vectors are a promising approach to control the behaviour of large +language models. However, their underlying mechanisms remain poorly understood. +While sparse autoencoders (SAEs) may offer a potential method to interpret +steering vectors, recent findings show that SAE-reconstructed vectors often +lack the steering properties of the original vectors. This paper investigates +why directly applying SAEs to steering vectors yields misleading +decompositions, identifying two reasons: (1) steering vectors fall outside the +input distribution for which SAEs are designed, and (2) steering vectors can +have meaningful negative projections in feature directions, which SAEs are not +designed to accommodate. These limitations hinder the direct use of SAEs for +interpreting steering vectors. + +
+
+
+
+
+ + ☆ Zero-shot Cross-lingual Transfer Learning with Multiple Source and + Target Languages for Information Extraction: Language Selection and + Adversarial Training + + +
+ The majority of previous research addressing multi-lingual IE is limited +to the zero-shot cross-lingual single-transfer (one-to-one) setting, with +high-resource languages predominantly as source training data. As a result, +these works provide little understanding and benefit for the realistic goal of +developing a multi-lingual IE system that can generalize to as many languages +as possible. Our study aims to fill this gap by providing a detailed analysis +on Cross-Lingual Multi-Transferability (many-to-many transfer learning), for +the recent IE corpora that cover a diverse set of languages. Specifically, we +first determine the correlation between single-transfer performance and a wide +range of linguistic-based distances. From the obtained insights, a combined +language distance metric can be developed that is not only highly correlated +but also robust across different tasks and model scales. Next, we investigate +the more general zero-shot multi-lingual transfer settings where multiple +languages are involved in the training and evaluation processes. Language +clustering based on the newly defined distance can provide directions for +achieving the optimal cost-performance trade-off in the data (languages) selection +problem. Finally, a relational-transfer setting is proposed to further +incorporate multi-lingual unlabeled data based on adversarial training using +the relation induced from the above linguistic distance. + +
+
+
+
+
+ + ☆ Multi-Perspective Stance Detection + + +
+ Subjective NLP tasks usually rely on human annotations provided by multiple +annotators, whose judgments may vary due to their diverse backgrounds and life +experiences. Traditional methods often aggregate multiple annotations into a +single ground truth, disregarding the diversity in perspectives that arises +from annotator disagreement. In this preliminary study, we examine the effect +of including multiple annotations on model accuracy in classification. Our +methodology investigates the performance of perspective-aware classification +models in stance detection task and further inspects if annotator disagreement +affects the model confidence. The results show that multi-perspective approach +yields better classification performance outperforming the baseline which uses +the single label. This entails that designing more inclusive perspective-aware +AI models is not only an essential first step in implementing responsible and +ethical AI, but it can also achieve superior results than using the traditional +approaches. + +
+
+
+
+
+ + ☆ Separating Tongue from Thought: Activation Patching Reveals + Language-Agnostic Concept Representations in Transformers ICML 2024 + + +
+ A central question in multilingual language modeling is whether large +language models (LLMs) develop a universal concept representation, disentangled +from specific languages. In this paper, we address this question by analyzing +latent representations (latents) during a word translation task in +transformer-based LLMs. We strategically extract latents from a source +translation prompt and insert them into the forward pass on a target +translation prompt. By doing so, we find that the output language is encoded in +the latent at an earlier layer than the concept to be translated. Building on +this insight, we conduct two key experiments. First, we demonstrate that we can +change the concept without changing the language and vice versa through +activation patching alone. Second, we show that patching with the mean over +latents across different languages does not impair and instead improves the +models' performance in translating the concept. Our results provide evidence +for the existence of language-agnostic concept representations within the +investigated models. + +
+
+ comment: 12 pages, 10 figures, previously published under the title "How Do + Llamas Process Multilingual Text? A Latent Exploration through Activation + Patching" at the ICML 2024 mechanistic interpretability workshop + https://openreview.net/forum?id=0ku2hIm4BS +
+
+
+
+
+ + ☆ A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks + with Large Language Models + + +
+ With the rise of Speech Large Language Models (Speech LLMs), there has been +growing interest in discrete speech tokens for their ability to integrate with +text-based tokens seamlessly. Compared to most studies that focus on continuous +speech features, although discrete-token based LLMs have shown promising +results on certain tasks, the performance gap between these two paradigms is +rarely explored. In this paper, we present a fair and thorough comparison +between discrete and continuous features across a variety of semantic-related +tasks using a light-weight LLM (Qwen1.5-0.5B). Our findings reveal that +continuous features generally outperform discrete tokens, particularly in tasks +requiring fine-grained semantic understanding. Moreover, this study goes beyond +surface-level comparison by identifying key factors behind the +under-performance of discrete tokens, such as limited token granularity and +inefficient information retention. To enhance the performance of discrete +tokens, we explore potential aspects based on our analysis. We hope our results +can offer new insights into the opportunities for advancing discrete speech +tokens in Speech LLMs. + +
+
+ comment: 5 tables, 4 figures +
+
+
+
+
+ + ☆ Dynamic Rewarding with Prompt Optimization Enables Tuning-free + Self-Alignment of Language Models EMNLP 2024 + + +
+ Aligning Large Language Models (LLMs) traditionally relies on costly training +and human preference annotations. Self-alignment seeks to reduce these expenses +by enabling models to align themselves. To further lower costs and achieve +alignment without any expensive tuning or annotations, we introduce a new +tuning-free approach for self-alignment, Dynamic Rewarding with Prompt +Optimization (DRPO). Our approach leverages a search-based optimization +framework that allows LLMs to iteratively self-improve and craft the optimal +alignment instructions, all without additional training or human intervention. +The core of DRPO is a dynamic rewarding mechanism, which identifies and +rectifies model-specific alignment weaknesses, allowing LLMs to adapt +efficiently to diverse alignment challenges. Empirical evaluations on eight +recent LLMs, both open- and closed-sourced, demonstrate that DRPO +significantly enhances alignment performance, with base models outperforming +their SFT/RLHF-tuned counterparts. Moreover, the prompts automatically +optimized by DRPO surpass those curated by human experts, further validating +the effectiveness of our approach. Our findings highlight the great potential +of current LLMs to achieve adaptive self-alignment through inference-time +optimization, complementing tuning-based alignment methods. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ☆ Analyst Reports and Stock Performance: Evidence from the Chinese Market + + +
+ This article applies natural language processing (NLP) to extract and +quantify textual information to predict stock performance. Using an extensive +dataset of Chinese analyst reports and employing a customized BERT deep +learning model for Chinese text, this study categorizes the sentiment of the +reports as positive, neutral, or negative. The findings underscore the +predictive capacity of this sentiment indicator for stock volatility, excess +returns, and trading volume. Specifically, analyst reports with strong positive +sentiment will increase excess return and intraday volatility, and vice versa, +reports with strong negative sentiment also increase volatility and trading +volume, but decrease future excess return. The magnitude of this effect is +greater for positive sentiment reports than for negative sentiment reports. +This article contributes to the empirical literature on sentiment analysis and +the response of the stock market to news in the Chinese stock market. + +
+
+
+
+
+ + ☆ Are Triggers Needed for Document-Level Event Extraction? + + +
+ Most existing work on event extraction has focused on sentence-level texts +and presumes the identification of a trigger-span -- a word or phrase in the +input that evokes the occurrence of an event of interest. Event arguments are +then extracted with respect to the trigger. Indeed, triggers are treated as +integral to, and trigger detection as an essential component of, event +extraction. In this paper, we provide the first investigation of the role of +triggers for the more difficult and much less studied task of document-level +event extraction. We analyze their usefulness in multiple end-to-end and +pipelined neural event extraction models for three document-level event +extraction datasets, measuring performance using triggers of varying quality +(human-annotated, LLM-generated, keyword-based, and random). Our research shows +that trigger effectiveness varies based on the extraction task's +characteristics and data quality, with basic, automatically-generated triggers +serving as a viable alternative to human-annotated ones. Furthermore, providing +detailed event descriptions to the extraction model helps maintain robust +performance even when trigger quality degrades. Perhaps surprisingly, we also +find that the mere existence of trigger input, even random ones, is important +for prompt-based LLM approaches to the task. + +
+
+
+
+
+ + ☆ Theoretical Analysis of Byte-Pair Encoding + + +
+ Byte-Pair Encoding (BPE) is a widely used method for subword tokenization, +with origins in grammar-based text compression. It is employed in a variety of +language processing tasks such as machine translation or large language model +(LLM) pretraining, to create a token dictionary of a prescribed size. Most +evaluations of BPE to date are empirical, and the reasons for its good +practical performance are not well understood. + In this paper we focus on the optimization problem underlying BPE: finding a +pair encoding that achieves optimal compression utility. We show that this +problem is APX-complete, indicating that it is unlikely to admit a +polynomial-time approximation scheme. This answers, in a stronger form, a +question recently raised by Zouhar et al. + On the positive side, we show that BPE approximates the compression utility +of the optimal pair encoding to a worst-case factor between $0.333$ and +$0.625$. Our results aim to explain the ongoing success of BPE and are, to our +knowledge, the first rigorous guarantees on its compression utility that hold +for all inputs. + +
+
+
+
+
+ + ☆ Dynamic Subset Tuning: Expanding the Operational Range of + Parameter-Efficient Training for Large Language Models NeurIPS 2024 + + +
+ We propose a novel parameter-efficient training (PET) method for large +language models that adapts models to downstream tasks by optimizing a small +subset of the existing model parameters. Unlike prior methods, this subset is +not fixed in location but rather which parameters are modified evolves over the +course of training. This dynamic parameter selection can yield good performance +with many fewer parameters than extant methods. Our method enables a seamless +scaling of the subset size across an arbitrary proportion of the total model +size, while popular PET approaches like prompt tuning and LoRA cover only a +small part of this spectrum. We match or outperform prompt tuning and LoRA in +most cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given +parameter budget across different model families and sizes. + +
+
+ comment: NeurIPS 2024 Workshop on Adaptive Foundation Models +
+
+
+
+
+ + ☆ XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL + + +
+ To tackle the challenges of large language model performance in natural +language to SQL tasks, we introduce XiYan-SQL, an innovative framework that +employs a multi-generator ensemble strategy to improve candidate generation. We +introduce M-Schema, a semi-structured schema representation method designed to +enhance the understanding of database structures. To enhance the quality and +diversity of generated candidate SQL queries, XiYan-SQL integrates the +significant potential of in-context learning (ICL) with the precise control of +supervised fine-tuning. On one hand, we propose a series of training strategies +to fine-tune models to generate high-quality candidates with diverse +preferences. On the other hand, we implement the ICL approach with an example +selection method based on named entity recognition to prevent overemphasis on +entities. The refiner optimizes each candidate by correcting logical or +syntactical errors. To address the challenge of identifying the best candidate, +we fine-tune a selection model to distinguish nuances of candidate SQL queries. +The experimental results on multiple dialect datasets demonstrate the +robustness of XiYan-SQL in addressing challenges across different scenarios. +Overall, our proposed XiYan-SQL achieves the state-of-the-art execution +accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on +NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. +The proposed framework not only enhances the quality and diversity of SQL +queries but also outperforms previous methods. + +
+
+
+
+
+ + ☆ CorrSynth -- A Correlated Sampling Method for Diverse Dataset Generation + from LLMs EMNLP 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable performance in +diverse tasks using zero-shot and few-shot prompting. Even though their +capabilities of data synthesis have been studied well in recent years, the +generated data suffers from a lack of diversity, less adherence to the prompt, +and potential biases that creep into the data from the generator model. In this +work, we tackle the challenge of generating datasets with high diversity, upon +which a student model is trained for downstream tasks. Taking the route of +decoding-time guidance-based approaches, we propose CorrSynth, which generates +data that is more diverse and faithful to the input prompt using a correlated +sampling strategy. Further, our method overcomes the complexity drawbacks of +some other guidance-based techniques like classifier-based guidance. With +extensive experiments, we show the effectiveness of our approach and +substantiate our claims. In particular, we perform intrinsic evaluation to show +the improvements in diversity. Our experiments show that CorrSynth improves +both student metrics and intrinsic metrics upon competitive baselines across +four datasets, showing the innate advantage of our method. + +
+
+ comment: Published as a main conference paper at EMNLP 2024; First two authors + contributed equally +
+
+
+
+
+ + ☆ Neural Topic Modeling with Large Language Models in the Loop + + +
+ Topic modeling is a fundamental task in natural language processing, allowing +the discovery of latent thematic structures in text corpora. While Large +Language Models (LLMs) have demonstrated promising capabilities in topic +discovery, their direct application to topic modeling suffers from issues such +as incomplete topic coverage, misalignment of topics, and inefficiency. To +address these limitations, we propose LLM-ITL, a novel LLM-in-the-loop +framework that integrates LLMs with many existing Neural Topic Models (NTMs). +In LLM-ITL, global topics and document representations are learned through the +NTM, while an LLM refines the topics via a confidence-weighted Optimal +Transport (OT)-based alignment objective. This process enhances the +interpretability and coherence of the learned topics, while maintaining the +efficiency of NTMs. Extensive experiments demonstrate that LLM-ITL can help +NTMs significantly improve their topic interpretability while maintaining the +quality of document representation. + +
+
+
+
+
+ + ☆ Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale + Table Understanding + + +
+ The ubiquity and value of tables as semi-structured data across various +domains necessitate advanced methods for understanding their complexity and +vast amounts of information. Despite the impressive capabilities of large +language models (LLMs) in advancing the natural language understanding +frontier, their application to large-scale tabular data presents significant +challenges, specifically regarding table size and complex intricate +relationships. Existing works have shown promise with small-scale tables but +often flounder when tasked with the complex reasoning required by larger, +interconnected tables found in real-world scenarios. To address this gap, we +introduce "Tree-of-Table", a novel approach designed to enhance LLMs' reasoning +capabilities over large and complex tables. Our method employs Table +Condensation and Decomposition to distill and reorganize relevant data into a +manageable format, followed by the construction of a hierarchical Table-Tree +that facilitates tree-structured reasoning. Through a meticulous Table-Tree +Execution process, we systematically unravel the tree-structured reasoning +chain to derive the solutions. Experiments across diverse datasets, including +WikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new +benchmark with superior performance, showcasing remarkable efficiency and +generalization capabilities in large-scale table reasoning. + +
+
+
+
+
+ + ☆ An Information Theoretic Approach to Operationalize Right to Data + Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ☆ Towards Objective and Unbiased Decision Assessments with LLM-Enhanced + Hierarchical Attention Networks + + +
+ How objective and unbiased are we while making decisions? This work +investigates cognitive bias identification in high-stake decision making +process by human experts, questioning its effectiveness in real-world settings, +such as candidate assessments for university admission. We begin with a +statistical analysis assessing correlations among different decision points +in the current process, which discovers discrepancies that imply +cognitive bias and inconsistency in decisions. This motivates our exploration +of a bias-aware AI-augmented workflow that surpasses human judgment. We propose +BGM-HAN, a hierarchical attention network enhanced by byte-pair encoding, +multi-head attention and gated residual connection. Using it as the backbone model, +we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which +simulates real-world decision-making. In our experiments, both the proposed +model and the agentic workflow significantly improve on both human judgment +and alternative models, validated with real-world data. + +
+
+
+
+
+ + ☆ Towards Evaluating Large Language Models for Graph Query Generation + + +
+ Large Language Models (LLMs) are revolutionizing the landscape of Generative +Artificial Intelligence (GenAI), with innovative LLM-backed solutions emerging +rapidly. However, when applied to database technologies, specifically query +generation for graph databases and Knowledge Graphs (KGs), LLMs still face +significant challenges. While research on LLM-driven query generation for +Structured Query Language (SQL) exists, similar systems for graph databases +remain underdeveloped. This paper presents a comparative study addressing the +challenge of generating Cypher queries -- a powerful language for interacting with +graph databases -- using open-access LLMs. We rigorously evaluate several LLM +agents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a +locally deployed Llama 3.1 8B) using a designed few-shot learning prompt and +Retrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT) +reasoning. Our empirical analysis of query generation accuracy reveals that +Claude Sonnet 3.5 outperforms its counterparts in this specific domain. +Further, we highlight promising future research directions to address the +identified limitations and advance LLM-driven query generation for graph +databases. + +
+
+ comment: Paper accepted and will be presented at CSCI2024 in December 2024, + Later will be published at Springer LNCS +
+
+
+
+
+ + ☆ One STEP at a time: Language Agents are Stepwise Planners + + +
+ Language agents have shown promising adaptability in dynamic environments to +perform complex tasks. However, despite the versatile knowledge embedded in +large language models, these agents still fall short when it comes to tasks +that require planning. We introduce STEP, a novel framework designed to +efficiently learn from previous experiences to enhance the planning +capabilities of language agents in future steps. Concretely, STEP functions +through four interconnected components. First, the Planner takes on the task, +breaks it down into subtasks and provides relevant insights. Then the Executor +generates action candidates, while the Evaluator ensures the actions align with +learned rules from previous experiences. Lastly, Memory stores experiences to +inform future decisions. In the ScienceWorld benchmark, our results show that +STEP consistently outperforms state-of-the-art models, achieving an overall +score of 67.4 and successfully completing 12 out of 18 tasks. These findings +highlight STEP's potential as a framework for enhancing planning capabilities +in language agents, paving the way for more sophisticated task-solving in +dynamic environments. + +
+
+
+
+
+ + ☆ CLaSP: Learning Concepts for Time-Series Signals from Natural Language + Supervision + + +
+ This paper proposes a foundation model called "CLaSP" that can search time +series signals using natural language that describes the characteristics of the +signals as queries. Previous efforts to represent time series signal data in +natural language have had challenges in designing a conventional class of time +series signal characteristics, formulating their quantification, and creating a +dictionary of synonyms. To overcome these limitations, the proposed method +introduces a neural network based on contrastive learning. This network is +first trained using the datasets TRUCE and SUSHI, which consist of time series +signals and their corresponding natural language descriptions. Previous studies +have proposed vocabularies that data analysts use to describe signal +characteristics, and SUSHI was designed to cover these terms. We believe that a +neural network trained on these datasets will enable data analysts to search +using natural language vocabulary. Furthermore, our method does not require a +dictionary of predefined synonyms, and it leverages common sense knowledge +embedded in a large-scale language model (LLM). Experimental results +demonstrate that CLaSP enables natural language search of time series signal +data and can accurately learn the points at which signal data changes. + +
+
+
+
+
+ + ☆ Interpretable Syntactic Representations Enable Hierarchical Word Vectors + + +
+ The distributed representations currently used are dense and uninterpretable, +leading to interpretations that themselves are relative, overcomplete, and hard +to interpret. We propose a method that transforms these word vectors into +reduced syntactic representations. The resulting representations are compact +and interpretable allowing better visualization and comparison of the word +vectors and we successively demonstrate that the drawn interpretations are in +line with human judgment. The syntactic representations are then used to create +hierarchical word vectors using an incremental learning approach similar to the +hierarchical aspect of human learning. As these representations are drawn from +pre-trained vectors, the generation process and learning approach are +computationally efficient. Most importantly, we find out that syntactic +representations provide a plausible interpretation of the vectors and +subsequent hierarchical vectors outperform the original vectors in benchmark +tests. + +
+
+
+
+
+ + ☆ Refining Translations with LLMs: A Constraint-Aware Iterative Prompting + Approach + + +
+ Large language models (LLMs) have demonstrated remarkable proficiency in +machine translation (MT), even without specific training on the languages in +question. However, translating rare words in low-resource or domain-specific +contexts remains challenging for LLMs. To address this issue, we propose a +multi-step prompt chain that enhances translation faithfulness by prioritizing +key terms crucial for semantic accuracy. Our method first identifies these +keywords and retrieves their translations from a bilingual dictionary, +integrating them into the LLM's context using Retrieval-Augmented Generation +(RAG). We further mitigate potential output hallucinations caused by long +prompts through an iterative self-checking mechanism, where the LLM refines its +translations based on lexical and semantic constraints. Experiments using Llama +and Qwen as base models on the FLORES-200 and WMT datasets demonstrate +significant improvements over baselines, highlighting the effectiveness of our +approach in enhancing translation faithfulness and robustness, particularly in +low-resource scenarios. + +
+
+
+
+
+ + ☆ A Chinese Multi-label Affective Computing Dataset Based on Social Media + Network Users + + +
+ Emotion and personality are central elements in understanding human +psychological states. Emotions reflect an individual's subjective experiences, +while personality reveals relatively stable behavioral and cognitive patterns. +Existing affective computing datasets often annotate emotion and personality +traits separately, lacking fine-grained labeling of micro-emotions and emotion +intensity in both single-label and multi-label classifications. Chinese emotion +datasets are extremely scarce, and datasets capturing Chinese user personality +traits are even more limited. To address these gaps, this study collected data +from the major social media platform Weibo, screening 11,338 valid users from +over 50,000 individuals with diverse MBTI personality labels and acquiring +566,900 posts along with the users' MBTI personality tags. Using the EQN method, +we compiled a multi-label Chinese affective computing dataset that integrates +the same user's personality traits with six emotions and micro-emotions, each +annotated with intensity levels. Validation results across multiple NLP +classification models demonstrate the dataset's strong utility. This dataset is +designed to advance machine recognition of complex human emotions and provide +data support for research in psychology, education, marketing, finance, and +politics. + +
+
+
+
+
+ + ☆ Bangla Grammatical Error Detection Leveraging Transformer-based Token + Classification + + +
+ Bangla is the seventh most spoken language by a total number of speakers in +the world, and yet the development of an automated grammar checker in this +language is an understudied problem. Bangla grammatical error detection is a +task of detecting sub-strings of a Bangla text that contain grammatical, +punctuation, or spelling errors, which is crucial for developing an automated +Bangla typing assistant. Our approach involves breaking down the task as a +token classification problem and utilizing state-of-the-art transformer-based +models. Finally, we combine the output of these models and apply rule-based +post-processing to generate a more reliable and comprehensive result. Our +system is evaluated on a dataset consisting of over 25,000 texts from various +sources. Our best model achieves a Levenshtein distance score of 1.04. Finally, +we provide a detailed analysis of different components of our system. + +
+
+
+
+
+ + ☆ Are LLMs Prescient? A Continuous Evaluation using Daily News as the + Oracle + + +
+ Many existing evaluation benchmarks for Large Language Models (LLMs) quickly +become outdated due to the emergence of new models and training data. These +benchmarks also fall short in assessing how LLM performance changes over time, +as they consist of static questions without a temporal dimension. To address +these limitations, we propose using future event prediction as a continuous +evaluation method to assess LLMs' temporal generalization and forecasting +abilities. Our benchmark, Daily Oracle, automatically generates question-answer +(QA) pairs from daily news, challenging LLMs to predict "future" event +outcomes. Our findings reveal that as pre-training data becomes outdated, LLM +performance degrades over time. While Retrieval Augmented Generation (RAG) has +the potential to enhance prediction accuracy, the performance degradation +pattern persists, highlighting the need for continuous model updates. + +
+
+
+
+
+ + ☆ R3HF: Reward Redistribution for Enhancing Reinforcement Learning from + Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) provides a paradigm for +aligning large language models (LLMs) with human preferences. This involves the +initial training of a reward model based on pairwise human feedback. The reward +model is subsequently utilized in reinforcement learning to assess the scores +of each generated sentence as a whole, further guiding the optimization of +LLMs. However, current approaches have a significant shortcoming: They +allocate a single, sparse, and delayed reward to an entire sequence of output. +This may overlook some significant individual contributions of each token +towards the desired outcome. To overcome this limitation, our paper proposes a +novel reward redistribution method called R3HF, which facilitates a more +fine-grained, token-level reward allocation. Specifically, our method treats +the reward prediction task of the reward model as a regression problem. As a +result, the redistributed rewards are computed by evaluating the specific +contribution of each token to the reward model's output. This detailed approach +improves the model's understanding of language nuances, leading to more precise +enhancements in its performance. Our method is crafted to integrate seamlessly +with most current techniques while incurring minimal computational costs. +Through comprehensive experiments across diverse datasets and tasks, we have +verified the effectiveness and superiority of our approach. + +
+
+
+
+
+ + ☆ Knowledge Bases in Support of Large Language Models for Processing Web + News + + +
+ Large Language Models (LLMs) have received considerable interest in wide +applications lately. During pre-training via massive datasets, such a model +implicitly memorizes the factual knowledge of trained datasets in its hidden +parameters. However, knowledge held implicitly in parameters often makes its +use by downstream applications ineffective due to the lack of common-sense +reasoning. In this article, we introduce a general framework that permits +building knowledge bases with the aid of LLMs, tailored for processing Web news. +The framework applies a rule-based News Information Extractor (NewsIE) to news +items for extracting their relational tuples, referred to as knowledge bases, +which are then graph-convoluted with the implicit knowledge facts of news items +obtained by LLMs, for their classification. It involves two lightweight +components: 1) NewsIE: for extracting the structural information of every news +item, in the form of relational tuples; 2) BERTGraph: for graph convoluting the +implicit knowledge facts with relational tuples extracted by NewsIE. We have +evaluated our framework under different news-related datasets for news category +classification, with promising experimental results. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ A Large-Scale Study of Relevance Assessments with Large Language Models: + An Initial Look + + +
+ The application of large language models to provide relevance assessments +presents exciting opportunities to advance information retrieval, natural +language processing, and beyond, but to date many unknowns remain. This paper +reports on the results of a large-scale evaluation (the TREC 2024 RAG Track) +where four different relevance assessment approaches were deployed in situ: the +"standard" fully manual process that NIST has implemented for decades and three +different alternatives that take advantage of LLMs to different extents using +the open-source UMBRELA tool. This setup allows us to correlate system rankings +induced by the different approaches to characterize tradeoffs between cost and +quality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system +rankings induced by automatically generated relevance assessments from UMBRELA +correlate highly with those induced by fully manual assessments across a +diverse set of 77 runs from 19 teams. Our results suggest that automatically +generated UMBRELA judgments can replace fully manual judgments to accurately +capture run-level effectiveness. Surprisingly, we find that LLM assistance does +not appear to increase correlation with fully manual assessments, suggesting +that costs associated with human-in-the-loop processes do not bring obvious +tangible benefits. Overall, human assessors appear to be stricter than UMBRELA +in applying relevance criteria. Our work validates the use of LLMs in academic +TREC-style evaluations and provides the foundation for future studies. + +
+
+
+
+
+ + ☆ Code-mixed LLM: Improve Large Language Models' Capability to Handle + Code-Mixing through Reinforcement Learning from AI Feedback + + +
+ Code-mixing (CM) or code-switching (CSW) refers to the juxtaposition of +linguistic units from two or more languages during a conversation or +sometimes even within a single utterance. Code-mixing introduces unique challenges in +daily life, such as syntactic mismatches and semantic blending, that are rarely +encountered in monolingual settings. Large language models (LLMs) have +revolutionized the field of natural language processing (NLP) by offering +unprecedented capabilities in understanding human languages. However, the +effectiveness of current state-of-the-art multilingual LLMs has not yet been +fully explored in the CM scenario. To fill this gap, we first benchmark the +performance of multilingual LLMs on various code-mixing NLP tasks. Then we +propose to improve the multilingual LLMs' ability to understand code-mixing +through reinforcement learning from human feedback (RLHF) and code-mixed +machine translation tasks. Given the high-cost and time-consuming preference +labeling procedure, we improve this by utilizing LLMs as annotators to perform +the reinforcement learning from AI feedback (RLAIF). The experiments show the +effectiveness of the proposed method. + +
+
+ comment: initial version: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Bridging the Visual Gap: Fine-Tuning Multimodal Models with + Knowledge-Adapted Captions + + +
+ Recent research increasingly focuses on training vision-language models +(VLMs) with long, detailed image captions. However, small-scale VLMs often +struggle to balance the richness of these captions with the risk of +hallucinating content during fine-tuning. In this paper, we explore how well +VLMs adapt to such captions. To quantify caption quality, we propose Decomposed +NLI (DNLI), an evaluation framework that breaks down generated captions into +individual propositions, assessing each in isolation. This fine-grained +analysis reveals a critical balance between capturing descriptive details and +preventing hallucinations. Our findings show that simply reducing caption +complexity or employing standard data curation techniques does not effectively +resolve this issue. To tackle this challenge, we introduce Knowledge Adapted +(KnowAda) fine-tuning, a data-centric approach that automatically adapts +training data with the model's existing knowledge and visual understanding. +KnowAda minimizes hallucinations while preserving high descriptiveness. We +validate this approach across several small-scale VLMs (up to 7B parameters) +and dense caption datasets, demonstrating that KnowAda effectively balances +hallucination reduction and descriptiveness. Our results show that KnowAda +outperforms various baselines in both automatic metrics and human evaluations. +We will release our code and models. + +
+
+
+
+
+ + ☆ Cut Your Losses in Large-Vocabulary Language Models + + +
+ As language models grow ever larger, so do their vocabularies. This has +shifted the memory footprint of LLMs during training disproportionately to one +single layer: the cross-entropy in the loss computation. Cross-entropy builds +up a logit matrix with entries for each pair of input tokens and vocabulary +items and, for small models, consumes an order of magnitude more memory than +the rest of the LLM combined. We propose Cut Cross-Entropy (CCE), a method that +computes the cross-entropy loss without materializing the logits for all tokens +into global memory. Rather, CCE only computes the logit for the correct token +and evaluates the log-sum-exp over all logits on the fly. We implement a custom +kernel that performs the matrix multiplications and the log-sum-exp reduction +over the vocabulary in flash memory, making global memory consumption for the +cross-entropy computation negligible. This has a dramatic effect. Taking the +Gemma 2 (2B) model as an example, CCE reduces the memory footprint of the loss +computation from 24 GB to 1 MB, and the total training-time memory consumption +of the classifier head from 28 GB to 1 GB. To improve the throughput of CCE, we +leverage the inherent sparsity of softmax and propose to skip elements of the +gradient computation that have a negligible (i.e., below numerical precision) +contribution to the gradient. Experiments demonstrate that the dramatic +reduction in memory consumption is accomplished without sacrificing training +speed or convergence. + +
+
+ comment: Code is available at https://github.com/apple/ml-cross-entropy +
+
+
+
+
+ + ☆ Refusal in LLMs is an Affine Function + + +
+ We propose affine concept editing (ACE) as an approach for steering language +models' behavior by intervening directly in activations. We begin with an +affine decomposition of model activation vectors and show that prior methods +for steering model behavior correspond to subsets of terms of this +decomposition. We then provide a derivation of ACE and test it on refusal using +Llama 3 8B and Hermes Eagle RWKV v5. ACE ultimately combines affine subspace +projection and activation addition to reliably control the model's refusal +responses across prompt types. We evaluate the results using LLM-based scoring +on a collection of harmful and harmless prompts. Our experiments demonstrate +that ACE consistently achieves more precise control over model behavior and +generalizes to models where directional ablation via affine subspace projection +alone produces incoherent outputs. Code for reproducing our results is +available at https://github.com/EleutherAI/steering-llama3 . + +
+
+
+
+
+ + ☆ CoCoP: Enhancing Text Classification with LLM through Code Completion + Prompt + + +
+ Text classification is a fundamental task in natural language processing +(NLP), and large language models (LLMs) have demonstrated their capability to +perform this task across various domains. However, the performance of LLMs +heavily depends on the quality of their input prompts. Recent studies have also +shown that LLMs exhibit remarkable results in code-related tasks. To leverage +the capabilities of LLMs in text classification, we propose the Code Completion +Prompt (CoCoP) method, which transforms the text classification problem into a +code completion task. CoCoP significantly improves text classification +performance across diverse datasets by utilizing LLMs' code-completion +capability. For instance, CoCoP enhances the accuracy of the SST2 dataset by +more than 20%. Moreover, when CoCoP is integrated with LLMs specifically designed +for code-related tasks (code models), such as CodeLLaMA, this method +demonstrates better or comparable performance to few-shot learning techniques +while using only one-tenth of the model size. The source code of our proposed +method will be available to the public upon the acceptance of the paper. + +
+
+
+
+
+ + ☆ Robustness and Confounders in the Demographic Alignment of LLMs with + Human Perceptions of Offensiveness ACL'25 + + +
+ Large language models (LLMs) are known to exhibit demographic biases, yet few +studies systematically evaluate these biases across multiple datasets or +account for confounding factors. In this work, we examine LLM alignment with +human annotations in five offensive language datasets, comprising approximately +220K annotations. Our findings reveal that while demographic traits, +particularly race, influence alignment, these effects are inconsistent across +datasets and often entangled with other factors. Confounders -- such as +document difficulty, annotator sensitivity, and within-group agreement -- +account for more variation in alignment patterns than demographic traits alone. +Specifically, alignment increases with higher annotator sensitivity and group +agreement, while greater document difficulty corresponds to reduced alignment. +Our results underscore the importance of multi-dataset analyses and +confounder-aware methodologies in developing robust measures of demographic +bias in LLMs. + +
+
+ comment: 18 pages, 8 figures, ACL'25 +
+
+
+
+
+ + ☆ Sparse Upcycling: Inference Inefficient Finetuning NeurIPS + + +
+ Small, highly trained, open-source large language models are widely used due +to their inference efficiency, but further improving their quality remains a +challenge. Sparse upcycling is a promising approach that transforms a +pretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing +the model's parameter count and quality. In this work, we compare the +effectiveness of sparse upcycling against continued pretraining (CPT) across +different model sizes, compute budgets, and pretraining durations. Our +experiments show that sparse upcycling can achieve better quality, with +improvements of over 20% relative to CPT in certain scenarios. However, this +comes with a significant inference cost, leading to 40% slowdowns in +high-demand inference settings for larger models. Our findings highlight the +trade-off between model quality and inference efficiency, offering insights for +practitioners seeking to balance model quality and deployment constraints. + +
+
+ comment: 12 pages, 4 figures, To appear in the 4th NeurIPS Workshop on + Efficient Natural Language and Speech Processing (ENLSP), 2024 +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ MILU: A Multi-task Indic Language Understanding Benchmark + + +
+ Evaluating Large Language Models (LLMs) in low-resource and linguistically +diverse languages remains a significant challenge in NLP, particularly for +languages using non-Latin scripts like those spoken in India. Existing +benchmarks predominantly focus on English, leaving substantial gaps in +assessing LLM capabilities in these languages. We introduce MILU, a Multi-task +Indic Language Understanding Benchmark, a comprehensive evaluation benchmark +designed to address this gap. MILU spans 8 domains and 42 subjects across 11 +Indic languages, reflecting both general and culturally specific knowledge. +With an India-centric design, MILU incorporates material from regional and +state-level examinations, covering topics such as local history, arts, +festivals, and laws, alongside standard subjects like science and mathematics. +We evaluate over 45 LLMs, and find that current LLMs struggle with MILU, with +GPT-4o achieving the highest average accuracy at 72 percent. Open multilingual +models outperform language-specific fine-tuned models, which perform only +slightly better than random baselines. Models also perform better in +high-resource languages as compared to low-resource ones. Domain-wise analysis +indicates that models perform poorly in culturally relevant areas like Arts and +Humanities, Law and Governance compared to general fields like STEM. To the +best of our knowledge, MILU is the first-of-its-kind benchmark focused on Indic +languages, serving as a crucial step towards comprehensive cultural evaluation. +All code, benchmarks, and artifacts are publicly available to foster open +research. + +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from over-reliance on unimodal biases (e.g., language bias +and vision bias), leading to incorrect answers or hallucinations in complex +multimodal tasks. To investigate this issue, we propose a causal framework to +interpret the biases in Visual Question Answering (VQA) problems. Within this +framework, we conduct an in-depth causal analysis to assess the causal effect +of these biases on MLLM predictions. Based on the analysis, we introduce 1) a +novel MORE dataset with 12,000 challenging VQA instances requiring multi-hop +reasoning and overcoming unimodal biases, and 2) a causality-enhanced agent +framework CAVE that guides models to comprehensively integrate information from +different modalities and mitigate biases. Our experiments show that MLLMs +perform poorly on MORE, indicating strong unimodal biases and limited semantic +understanding. However, when integrated with our CAVE, promising improvements +in reasoning and bias mitigation can be seen. These findings provide important +insights for the development of more robust MLLMs and contribute to the broader +goal of advancing multimodal AI systems capable of deeper understanding and +reasoning. Our project page is at https://github.com/OpenCausaLab/MORE. + +
+
+
+
+
+ + ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models NeurIPS 2024 + + +
+ In the face of uncertainty, the ability to *seek information* is of +fundamental importance. In many practical applications, such as medical +diagnosis and troubleshooting, the information needed to solve the task is not +initially given and has to be actively sought by asking follow-up questions +(for example, a doctor asking a patient for more details about their symptoms). +In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to +augment large language models with the ability to actively seek information by +asking effective questions. UoT combines 1) an *uncertainty-aware simulation +approach* which enables the model to simulate possible future scenarios and how +likely they are to occur, 2) *uncertainty-based rewards* motivated by +information gain which incentivizes the model to seek information, and 3) a +*reward propagation scheme* to select the optimal question to ask in a way that +maximizes the expected reward. In experiments on medical diagnosis, +troubleshooting, and the `20 Questions` game, UoT achieves an average +performance improvement of 38.1% in the rate of successful task completion +across multiple LLMs compared with direct prompting and also improves +efficiency (i.e., the number of questions needed to complete the task). Our +code has been released [here](https://github.com/zhiyuanhubj/UoT) + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language + Models + + +
+ New LLM evaluation benchmarks are important to align with the rapid +development of Large Language Models (LLMs). In this work, we present Chinese +SimpleQA, the first comprehensive Chinese benchmark to evaluate the factuality +ability of language models to answer short questions, and Chinese SimpleQA +mainly has five properties (i.e., Chinese, Diverse, High-quality, Static, +Easy-to-evaluate). Specifically, first, we focus on the Chinese language over 6 +major topics with 99 diverse subtopics. Second, we conduct a comprehensive +quality control process to achieve high-quality questions and answers, where +the reference answers are static and cannot be changed over time. Third, +following SimpleQA, the questions and answers are very short, and the grading +process is easy-to-evaluate based on OpenAI API. Based on Chinese SimpleQA, we +perform a comprehensive evaluation on the factuality abilities of existing +LLMs. Finally, we hope that Chinese SimpleQA could guide the developers to +better understand the Chinese factuality abilities of their models and +facilitate the growth of foundation models. + +
+
+
+
+
+ + ♻ ☆ Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting + Volunteer Content Moderators + + +
+ Extensive efforts in automated approaches for content moderation have been +focused on developing models to identify toxic, offensive, and hateful content +with the aim of lightening the load for moderators. Yet, it remains uncertain +whether improvements on those tasks have truly addressed moderators' needs in +accomplishing their work. In this paper, we surface gaps between past research +efforts that have aimed to provide automation for aspects of content moderation +and the needs of volunteer content moderators, regarding identifying violations +of various moderation rules. To do so, we conduct a model review on Hugging +Face to reveal the availability of models to cover various moderation rules and +guidelines from three exemplar forums. We further put state-of-the-art LLMs to +the test, evaluating how well these models perform in flagging violations of +platform rules from one particular forum. Finally, we conduct a user survey +study with volunteer moderators to gain insight into their perspectives on +useful moderation models. Overall, we observe a non-trivial gap, as missing +developed models and LLMs exhibit moderate to low performance on a significant +portion of the rules. Moderators' reports provide guides for future work on +developing moderation assistant models. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Economists + + +
+ Deep learning provides powerful methods to impute structured information from +large-scale, unstructured text and image datasets. For example, economists +might wish to detect the presence of economic activity in satellite images, or +to measure the topics or entities mentioned in social media, the congressional +record, or firm filings. This review introduces deep neural networks, covering +methods such as classifiers, regression models, generative AI, and embedding +models. Applications include classification, document digitization, record +linkage, and methods for data exploration in massive scale text and image +corpora. When suitable methods are used, deep learning models can be cheap to +tune and can scale affordably to problems involving millions or billions of +data points. The review is accompanied by a companion website, EconDL, with +user-friendly demo notebooks, software resources, and a knowledge base that +provides technical details and additional applications. + +
+
+
+
+
+ + ♻ ☆ No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design + Choices + + +
+ Advances in generative models have made it possible for AI-generated text, +code, and images to mirror human-generated content in many applications. +Watermarking, a technique that aims to embed information in the output of a +model to verify its source, is useful for mitigating the misuse of such +AI-generated content. However, we show that common design choices in LLM +watermarking schemes make the resulting systems surprisingly susceptible to +attack -- leading to fundamental trade-offs in robustness, utility, and +usability. To navigate these trade-offs, we rigorously study a set of simple +yet effective attacks on common watermarking systems, and propose guidelines +and defenses for LLM watermarking in practice. + +
+
+
+
+
+ + ♻ ☆ General LLMs as Instructors for Domain-Specific LLMs: A Sequential + Fusion Method to Integrate Extraction and Editing + + +
+ The substantial interest in updating Large Language Models (LLMs) without +retraining from scratch is accompanied by several challenges. This is +particularly true when updating LLMs with datasets that necessitate +domain-expert reasoning across extensive texts, despite limited samples. We +termed the scenario as the Few-Shot Domain-Expert Reasoning for Updating LLMs +(FDoR-UL). Traditional methods such as Low-Rank Adaptation (LoRA) and Retrieval +Augmented Generation (RAG) are inadequate for addressing this critical issue, +particularly evident in our exploration of a specific medical dataset that +epitomizes the distinct needs of FDoR-UL. To tackle this challenge, we +introduce a Sequential Fusion method to integrate knowledge from complex +contexts into LLMs. This method employs a two-stage framework: initially +leveraging general LLMs to perform relation extraction for knowledge +acquisition from complex texts, followed by updating domain-specific LLMs +through Knowledge Editing (KE). Employing our method, domain-specific LLMs +achieved a 71.7% accuracy (an average gain of 39.1%) in question-answering +tasks. Furthermore, we expanded our evaluation to a novel economics-management +dataset we developed, where our method achieved a 75.0% accuracy (an average +gain of 45.0%). These findings underscore the effectiveness and flexibility of +our approach in FDoR-UL across various domains. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Is Moral Self-correction An Innate Capability of Large Language Models? + A Mechanistic Analysis to Self-correction + + +
+ Despite intensive attention to the self-correction capability of Large +Language Models (LLMs), the underlying mechanism of this capability is still +under-explored. In this paper, we aim to answer two fundamental questions for +moral self-correction: (1) how different components in self-correction, such as +Chain-of-Thought (CoT) reasoning, external feedback, and instructional prompts, +interact to enable moral self-correction; and (2) is the self-correction one of +LLMs' innate capabilities? To answer the first question, we examine how +different self-correction components interact to intervene in the embedded +morality within hidden states, therefore contributing to different performance. +For the second question, we (i) evaluate the robustness of moral +self-correction by introducing natural language interventions of weak evidence +into prompts; (ii) propose a validation framework, self-distinguish, that +requires effective self-correction to enable LLMs to distinguish between +desirable and undesirable outputs. Our experimental results indicate that there +is no universally optimal self-correction method for the tasks considered, +although external feedback and CoT can contribute to additional performance +gains. However, our mechanistic analysis reveals negative interactions among +instructional prompts, CoT, and external feedback, suggesting a conflict +between internal knowledge and external feedback. The self-distinguish +experiments demonstrate that while LLMs can self-correct their responses, they +are unable to reliably distinguish between desired and undesired outputs. With +our empirical evidence, we can conclude that moral self-correction is not an +innate capability of LLMs acquired during pretraining. + +
+
+
+
+
+ + ♻ ☆ Enhancing Post-Hoc Attributions in Long Document Comprehension via + Coarse Grained Answer Decomposition + + +
+ Accurately attributing answer text to its source document is crucial for +developing a reliable question-answering system. However, attribution for long +documents remains largely unexplored. Post-hoc attribution systems are designed +to map answer text back to the source document, yet the granularity of this +mapping has not been addressed. Furthermore, a critical question arises: What +exactly should be attributed? This involves identifying the specific +information units within an answer that require grounding. In this paper, we +propose and investigate a novel approach to the factual decomposition of +generated answers for attribution, employing template-based in-context +learning. To accomplish this, we utilize the question and integrate negative +sampling during few-shot in-context learning for decomposition. This approach +enhances the semantic understanding of both abstractive and extractive answers. +We examine the impact of answer decomposition by providing a thorough +examination of various attribution approaches, ranging from retrieval-based +techniques to LLM-based attributors. + +
+
+
+
+
+ + ♻ ☆ Are Large Language Models Table-based Fact-Checkers? + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation +between statements and structured tables. Existing TFV methods based on +small-scaled models suffer from insufficient labeled data and weak zero-shot +ability. Recently, the appearance of Large Language Models (LLMs) has attracted +considerable attention in research fields. They have shown powerful zero-shot and +in-context learning abilities on several NLP tasks, but their potential on TFV +is still unknown. In this work, we conduct a preliminary study about whether +LLMs are table-based fact-checkers. In detail, we design diverse prompts to +explore how the in-context learning can help LLMs in TFV, i.e., zero-shot and +few-shot TFV capability. Besides, we carefully design and construct TFV +instructions to study the performance gain brought by the instruction tuning of +LLMs. Experimental results demonstrate that LLMs can achieve acceptable results +on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning +can stimulate the TFV capability significantly. We also make some valuable +findings about the format of zero-shot prompts and the number of in-context +examples. Finally, we analyze some possible directions to promote the accuracy +of TFV via LLMs, which is beneficial to further research of table reasoning. + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ Target-driven Attack for Large Language Models + + +
+ Current large language models (LLM) provide a strong foundation for +large-scale user-oriented natural language tasks. Many users can easily inject +adversarial text or instructions through the user interface, thus causing LLM +model security challenges like the language model not giving the correct +answer. Although there is currently a large amount of research on black-box +attacks, most of these black-box attacks use random and heuristic strategies. +It is unclear how these strategies relate to the success rate of attacks and +thus effectively improve model robustness. To solve this problem, we propose +our target-driven black-box attack method to maximize the KL divergence between +the conditional probabilities of the clean text and the attack text to redefine +the attack's goal. We transform the distance maximization problem into two +convex optimization problems based on the attack goal to solve the attack text +and estimate the covariance. Furthermore, the projected gradient descent +algorithm solves the vector corresponding to the attack text. Our target-driven +black-box attack approach includes two attack strategies: token manipulation +and misinformation attack. Experimental results on multiple Large Language +Models and datasets demonstrate the effectiveness of our attack method. + +
+
+ comment: 12 pages, 7 figures. This work is an extension of the + arXiv:2404.07234 work. We propose new methods. 27th European Conference on + Artificial Intelligence 2024 +
+
+
+
+
+ + ♻ ☆ SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation EMNLP 2024 + + +
+ It is often desirable to distill the capabilities of large language models +(LLMs) into smaller student models due to compute and memory constraints. One +way to do this for classification tasks is via dataset synthesis, which can be +accomplished by generating examples of each label from the LLM. Prior +approaches to synthesis use few-shot prompting, which relies on the LLM's +parametric knowledge to generate usable examples. However, this leads to issues +of repetition, bias towards popular entities, and stylistic differences from +human text. In this work, we propose Synthesize by Retrieval and Refinement +(SynthesizRR), which uses retrieval augmentation to introduce variety into the +dataset synthesis process: as retrieved passages vary, the LLM is seeded with +different content to generate its examples. We empirically study the synthesis +of six datasets, covering topic classification, sentiment analysis, tone +detection, and humor, requiring complex synthesis strategies. We find that +SynthesizRR greatly improves lexical and semantic diversity, similarity to +human-written text, and distillation performance, when compared to 32-shot +prompting and four prior approaches. We release our code to perform all steps +at https://github.com/amazon-science/synthesizrr + +
+
+ comment: Published as a main conference paper at EMNLP 2024. Code available at + https://github.com/amazon-science/synthesizrr +
+
+
+
+
+ + ♻ ☆ Vikhr: Constructing a State-of-the-art Bilingual Open-Source + Instruction-Following Large Language Model for Russian EMNLP-2024 + + +
+ There has been a surge in developing various Large Language Models (LLMs). +However, text generation for languages other than English often faces +significant challenges, including poor generation quality and reduced +computational performance due to the disproportionate representation of tokens +in the model's vocabulary. In this work, we address these issues by developing +a pipeline for adapting English-oriented pre-trained models to other languages +and constructing efficient bilingual LLMs. Using this pipeline, we construct +Vikhr, a state-of-the-art bilingual open-source instruction-following LLM +designed specifically for the Russian language. "Vikhr" refers to the name of +the Mistral LLM series and means a "strong gust of wind." Unlike previous +Russian-language models that typically rely on LoRA adapters on top of +English-oriented models, sacrificing performance for lower training costs, +Vikhr features an adapted tokenizer vocabulary and undergoes continued +pre-training and instruction tuning of all weights. This not only enhances the +model's performance but also significantly improves its computational and +contextual efficiency. The remarkable performance of Vikhr across various +Russian-language benchmarks can also be attributed to our efforts in expanding +instruction datasets and corpora for continued pre-training. Vikhr not only +sets a new state of the art among open-source LLMs for Russian but even +outperforms some proprietary closed-source models on certain benchmarks. The +model weights, instruction sets, and code are publicly available. + +
+
+ comment: Accepted at WMRL @ EMNLP-2024 +
+
+
+
+
+ + ♻ ☆ Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models + with Simple and Effective Scaffold Token Removal + + +
+ Byte Pair Encoding (BPE) serves as a foundation method for text tokenization +in the Natural Language Processing (NLP) field. Despite its wide adoption, the +original BPE algorithm harbors an inherent flaw: it inadvertently introduces a +frequency imbalance for tokens in the text corpus. Since BPE iteratively merges +the most frequent token pair in the text corpus to generate a new token and +keeps all generated tokens in the vocabulary, it unavoidably holds tokens that +primarily act as components of a longer token and appear infrequently on their +own. We term such tokens as Scaffold Tokens. Due to their infrequent +occurrences in the text corpus, Scaffold Tokens pose a learning imbalance +issue. To address that issue, we propose Scaffold-BPE, which incorporates a +dynamic scaffold token removal mechanism by parameter-free, computation-light, +and easy-to-implement modifications to the original BPE method. This novel +approach ensures the exclusion of low-frequency Scaffold Tokens from the token +representations for given texts, thereby mitigating the issue of frequency +imbalance and facilitating model training. On extensive experiments across +language modeling and even machine translation, Scaffold-BPE consistently +outperforms the original BPE, well demonstrating its effectiveness. + +
+
+
+
+
+ + ♻ ☆ Spin glass model of in-context learning + + +
+ Large language models show a surprising in-context learning ability -- being +able to use a prompt to form a prediction for a query, yet without additional +training, in stark contrast to old-fashioned supervised learning. Providing a +mechanistic interpretation and linking the empirical phenomenon to physics are +thus challenging and remain unsolved. We study a simple yet expressive +transformer with linear attention and map this structure to a spin glass model +with real-valued spins, where the couplings and fields explain the intrinsic +disorder in data. The spin glass model explains how the weight parameters +interact with each other during pre-training, and further clarifies why an +unseen function can be predicted by providing only a prompt yet without further +training. Our theory reveals that for single-instance learning, increasing the +task diversity leads to the emergence of in-context learning, by allowing the +Boltzmann distribution to converge to a unique correct solution of weight +parameters. Therefore the pre-trained transformer displays a prediction power +in a novel prompt setting. The proposed analytically tractable model thus +offers a promising avenue for thinking about how to interpret many intriguing +but puzzling properties of large language models. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Query Optimization for Parametric Knowledge Refinement in + Retrieval-Augmented Large Language Models + + +
+ We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel +approach designed to bridge the pre-retrieval information gap in +Retrieval-Augmented Generation (RAG) systems through query optimization +tailored to meet the specific knowledge requirements of Large Language Models +(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR +framework begins by extracting parametric knowledge from LLMs, followed by +using a specialized query optimizer for refining these queries. This process +ensures the retrieval of only the most pertinent information essential for +generating accurate responses. Moreover, to enhance flexibility and reduce +computational costs, we propose a trainable scheme for our pipeline that +utilizes a smaller, tunable model as the query optimizer, which is refined +through knowledge distillation from a larger teacher model. Our evaluations on +various question-answering (QA) datasets and with different retrieval systems +show that ERRR consistently outperforms existing baselines, proving to be a +versatile and cost-effective module for improving the utility and accuracy of +RAG systems. + +
+
+
+
+
+ + ♻ ☆ Evaluating AI-Generated Essays with GRE Analytical Writing Assessment + + +
+ The recent revolutionary advance in generative AI enables the generation of +realistic and coherent texts by large language models (LLMs). Despite many +existing evaluation metrics on the quality of the generated texts, there is +still a lack of rigorous assessment of how well LLMs perform in complex and +demanding writing assessments. This study examines essays generated by ten +leading LLMs for the analytical writing assessment of the Graduate Record Exam +(GRE). We assessed these essays using both human raters and the e-rater +automated scoring engine as used in the GRE scoring pipeline. Notably, the +top-performing Gemini and GPT-4o received an average score of 4.78 and 4.67, +respectively, falling between "generally thoughtful, well-developed analysis of +the issue and conveys meaning clearly" and "presents a competent analysis of +the issue and conveys meaning with acceptable clarity" according to the GRE +scoring guideline. We also evaluated the detection accuracy of these essays, +with detectors trained on essays generated by the same and different LLMs. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multi-IF: Benchmarking LLMs on Multi-Turn and Multilingual Instructions + Following + + +
+ Large Language Models (LLMs) have demonstrated impressive capabilities in +various tasks, including instruction following, which is crucial for aligning +model outputs with user expectations. However, evaluating LLMs' ability to +follow instructions remains challenging due to the complexity and subjectivity +of human language. Current benchmarks primarily focus on single-turn, +monolingual instructions, which do not adequately reflect the complexities of +real-world applications that require handling multi-turn and multilingual +interactions. To address this gap, we introduce Multi-IF, a new benchmark +designed to assess LLMs' proficiency in following multi-turn and multilingual +instructions. Multi-IF, which utilizes a hybrid framework combining LLM and +human annotators, expands upon the IFEval by incorporating multi-turn sequences +and translating the English prompts into another 7 languages, resulting in a +dataset of 4,501 multilingual conversations, where each has three turns. Our +evaluation of 14 state-of-the-art LLMs on Multi-IF reveals that it presents a +significantly more challenging task than existing benchmarks. All the models +tested showed a higher rate of failure in executing instructions correctly with +each additional turn. For example, o1-preview drops from 0.877 at the first +turn to 0.707 at the third turn in terms of average accuracy over all +languages. Moreover, languages with non-Latin scripts (Hindi, Russian, and +Chinese) generally exhibit higher error rates, suggesting potential limitations +in the models' multilingual capabilities. We release Multi-IF prompts and the +evaluation code base to encourage further research in this critical area. + +
+
+
+
+
+ + ♻ ☆ Experiences from Creating a Benchmark for Sentiment Classification for + Varieties of English + + +
+ Existing benchmarks often fail to account for linguistic diversity, like +language variants of English. In this paper, we share our experiences from our +ongoing project of building a sentiment classification benchmark for three +variants of English: Australian (en-AU), Indian (en-IN), and British (en-UK) +English. Using Google Places reviews, we explore the effects of various +sampling techniques based on label semantics, review length, and sentiment +proportion and report performances on three fine-tuned BERT-based models. Our +initial evaluation reveals significant performance variations influenced by +sample characteristics, label semantics, and language variety, highlighting the +need for nuanced benchmark design. We offer actionable insights for researchers +to create robust benchmarks, emphasising the importance of diverse sampling, +careful label definition, and comprehensive evaluation across linguistic +varieties. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Fair Summarization: Bridging Quality and Diversity in Extractive + Summaries NeurIPS 2024 + + +
+ Fairness in multi-document summarization of user-generated content remains a +critical challenge in natural language processing (NLP). Existing summarization +methods often fail to ensure equitable representation across different social +groups, leading to biased outputs. In this paper, we introduce two novel +methods for fair extractive summarization: FairExtract, a clustering-based +approach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints. +We evaluate these methods using Divsumm summarization dataset of White-aligned, +Hispanic, and African-American dialect tweets and compare them against relevant +baselines. The results obtained using a comprehensive set of summarization +quality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well +as a fairness metric F, demonstrate that FairExtract and FairGPT achieve +superior fairness while maintaining competitive summarization quality. +Additionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that +integrate quality and fairness into a single evaluation framework, offering a +more nuanced understanding of the trade-offs between these objectives. This +work highlights the importance of fairness in summarization and sets a +benchmark for future research in fairness-aware NLP models. + +
+
+ comment: Accepted at Algorithmic Fairness through the Lens of Metrics and + Evaluation Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Trustful LLMs: Customizing and Grounding Text Generation with Knowledge + Bases and Dual Decoders + + +
+ Although people are impressed by the content generation skills of large +language models, the use of LLMs, such as ChatGPT, is limited by the domain +grounding of the content. The correctness and groundedness of the generated +content need to be based on a verified context, such as results from +Retrieval-Augmented Generation (RAG). One important issue when adapting LLMs to +a customized domain is that the generated responses are often incomplete, or +the additions are not verified and may even be hallucinated. Prior studies on +hallucination detection have focused on evaluation metrics, which are not +easily adaptable to dynamic domains and can be vulnerable to attacks like +jail-breaking. In this work, we propose 1) a post-processing algorithm that +leverages knowledge triplets in RAG context to correct hallucinations and 2) a +dual-decoder model that fuses RAG context to guide the generation process. + +
+
+
+
+
+ + ♻ ☆ MDCure: A Scalable Pipeline for Multi-Document Instruction-Following + + +
+ Multi-document (MD) processing is crucial for LLMs to handle real-world tasks +such as summarization and question-answering across large sets of documents. +While LLMs have improved at processing long inputs, MD contexts still present +challenges, such as managing inter-document dependencies, redundancy, and +incoherent structures. We introduce MDCure, a scalable and effective +fine-tuning pipeline to enhance the MD capabilities of LLMs without the +computational cost of pre-training or reliance on human annotated data. MDCure +is based on generation of high-quality synthetic MD instruction data from sets +of related articles via targeted prompts. We further introduce MDCureRM, a +multi-objective reward model which filters generated data based on their +training utility for MD settings. With MDCure, we fine-tune a variety of LLMs, +from the FlanT5, Qwen2, and LLAMA3.1 model families, up to 70B parameters in +size. Extensive evaluations on a wide range of MD and long-context benchmarks +spanning various tasks show MDCure consistently improves performance over +pre-trained baselines and over corresponding base models by up to 75.5%. Our +code, datasets, and models are available at https://github.com/yale-nlp/MDCure. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Robotics 39 + +
+
+
+ + ☆ LLMPhy: Complex Physical Reasoning Using Large Language Models and World + Models + + +
+ Physical reasoning is an important skill needed for robotic agents when +operating in the real world. However, solving such reasoning problems often +involves hypothesizing and reflecting over complex multi-body interactions +under the effect of a multitude of physical forces and thus learning all such +interactions poses a significant hurdle for state-of-the-art machine learning +frameworks, including large language models (LLMs). To study this problem, we +propose a new physical reasoning task and a dataset, dubbed TraySim. Our task +involves predicting the dynamics of several objects on a tray that is given an +external impact -- the domino effect of the ensued object interactions and +their dynamics thus offering a challenging yet controlled setup, with the goal +of reasoning being to infer the stability of the objects after the impact. To +solve this complex physical reasoning task, we present LLMPhy, a zero-shot +black-box optimization framework that leverages the physics knowledge and +program synthesis abilities of LLMs, and synergizes these abilities with the +world models built into modern physics engines. Specifically, LLMPhy uses an +LLM to generate code to iteratively estimate the physical hyperparameters of +the system (friction, damping, layout, etc.) via an implicit +analysis-by-synthesis approach using a (non-differentiable) simulator in the +loop and uses the inferred parameters to imagine the dynamics of the scene +towards solving the reasoning task. To show the effectiveness of LLMPhy, we +present experiments on our TraySim dataset to predict the steady-state poses of +the objects. Our results show that the combination of the LLM and the physics +engine leads to state-of-the-art zero-shot physical reasoning performance, +while demonstrating superior convergence against standard black-box +optimization methods and better estimation of the physical parameters. + +
+
+
+
+
+ + ☆ Learning Memory Mechanisms for Decision Making through Demonstrations + + +
+ In Partially Observable Markov Decision Processes, integrating an agent's +history into memory poses a significant challenge for decision-making. +Traditional imitation learning, relying on observation-action pairs for expert +demonstrations, fails to capture the expert's memory mechanisms used in +decision-making. To capture memory processes as demonstrations, we introduce +the concept of \textbf{memory dependency pairs} $(p, q)$ indicating that events +at time $p$ are recalled for decision-making at time $q$. We introduce +\textbf{AttentionTuner} to leverage memory dependency pairs in Transformers and +find significant improvements across several tasks compared to standard +Transformers when evaluated on Memory Gym and the Long-term Memory Benchmark. +Code is available at https://github.com/WilliamYue37/AttentionTuner . + +
+
+
+
+
+ + Prediction of Acoustic Communication Performance for AUVs using Gaussian + Process Classification + + +
+ Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic +communication to coordinate their actions effectively. However, the reliability +of underwater acoustic communication decreases as the communication range +between vehicles increases. Consequently, teams of cooperating AUVs typically +make conservative assumptions about the maximum range at which they can +communicate reliably. To address this limitation, we propose a novel approach +that involves learning a map representing the probability of successful +communication based on the locations of the transmitting and receiving +vehicles. This probabilistic communication map accounts for factors such as the +range between vehicles, environmental noise, and multi-path effects at a given +location. In pursuit of this goal, we investigate the application of Gaussian +process binary classification to generate the desired communication map. We +specialize existing results to this specific binary classification problem and +explore methods to incorporate uncertainty in vehicle location into the mapping +process. Furthermore, we compare the prediction performance of the probability +communication map generated using binary classification with that of a +signal-to-noise ratio (SNR) communication map generated using Gaussian process +regression. Our approach is experimentally validated using communication and +navigation data collected during trials with a pair of Virginia Tech 690 AUVs. + +
+
+
+
+
+ + ☆ Minimally Invasive Flexible Needle Manipulation Based on Finite Element + Simulation and Cross Entropy Method + + +
+ We present a novel approach for minimally invasive flexible needle +manipulations by pairing a real-time finite element simulator with the +cross-entropy method. Additionally, we demonstrate how a kinematic-driven +bang-bang controller can complement the control framework for better tracking +performance. We show how electromagnetic (EM) tracking can be readily +incorporated into the framework to provide controller feedback. Tissue phantom +experiment with EM tracking shows the average targeting error is $0.16 \pm +0.29mm$. + +
+
+ comment: Submitted to IEEE International Conference on Robotics and Automation + 2025 +
+
+
+
+
+ + ☆ Iterative Learning Control with Mismatch Compensation for Residual + Vibration Suppression in Delta Robots + + +
+ Unwanted vibrations stemming from the energy-optimized design of Delta robots +pose a challenge in their operation, especially with respect to precise +reference tracking. To improve tracking accuracy, this paper proposes an +adaptive mismatch-compensated iterative learning controller based on input +shaping techniques. We establish a dynamic model considering the +electromechanical rigid-flexible coupling of the Delta robot, which integrates +the permanent magnet synchronous motor. Using this model, we design an +optimization-based input shaper, considering the natural frequency of the +robot, which varies with the configuration. We propose an iterative learning +controller for the delta robot to improve tracking accuracy. Our iterative +learning controller incorporates model mismatch where the mismatch is approximated +by a fuzzy logic structure. The convergence property of the proposed controller +is proved using a Barrier Composite Energy Function, providing a guarantee that +the tracking errors along the iteration axis converge to zero. Moreover, +adaptive parameter update laws are designed to ensure convergence. Finally, we +perform a series of high-fidelity simulations of the Delta robot using Simscape +to demonstrate the effectiveness of the proposed control strategy. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric + VLN + + +
+ Landmark-based navigation (e.g. go to the wooden desk) and relative +positional navigation (e.g. move 5 meters forward) are distinct navigation +challenges solved very differently in existing robotics navigation methodology. +We present a new dataset, OC-VLN, in order to distinctly evaluate grounding +object-centric natural language navigation instructions in a method for +performing landmark-based navigation. We also propose Natural Language grounded +SLAM (NL-SLAM), a method to ground natural language instruction to robot +observations and poses. We actively perform NL-SLAM in order to follow +object-centric natural language navigation instructions. Our methods leverage +pre-trained vision and language foundation models and require no task-specific +training. We construct two strong baselines from state-of-the-art methods on +related tasks, Object Goal Navigation and Vision Language Navigation, and we +show that our approach, NL-SLAM, outperforms these baselines across all our +metrics of success on OC-VLN. Finally, we successfully demonstrate the +effectiveness of NL-SLAM for performing navigation instruction following in the +real world on a Boston Dynamics Spot robot. + +
+
+
+
+
+ + ☆ Robust Adaptive Safe Robotic Grasping with Tactile Sensing + + +
+ Robotic grasping requires safe force interaction to prevent a grasped object +from being damaged or slipping out of the hand. In this vein, this paper +proposes an integrated framework for grasping with formal safety guarantees +based on Control Barrier Functions. We first design contact force and force +closure constraints, which are enforced by a safety filter to accomplish safe +grasping with finger force control. For sensory feedback, we develop a +technique to estimate contact point, force, and torque from tactile sensors at +each finger. We verify the framework with various safety filters in a numerical +simulation under a two-finger grasping scenario. We then experimentally +validate the framework by grasping multiple objects, including fragile lab +glassware, in a real robotic setup, showing that safe grasping can be +successfully achieved in the real world. We evaluate the performance of each +safety filter in the context of safety violation and conservatism, and find +that disturbance observer-based control barrier functions provide superior +performance for safety guarantees with minimum conservatism. The demonstration +video is available at https://youtu.be/Cuj47mkXRdg. + +
+
+
+
+
+ + ☆ Singularity-Avoidance Control of Robotic Systems with Model Mismatch and + Actuator Constraints + + +
+ Singularities, manifesting as special configuration states, deteriorate robot +performance and may even lead to a loss of control over the system. This paper +addresses the kinematic singularity concerns in robotic systems with model +mismatch and actuator constraints through control barrier functions (CBFs). We +propose a learning-based control strategy to prevent robots from entering +singularity regions. More precisely, we leverage Gaussian process (GP) +regression to learn the unknown model mismatch, where the prediction error is +restricted by a deterministic bound. Moreover, we offer the criteria for +parameter selection to ensure the feasibility of CBFs subject to actuator +constraints. The proposed approach is validated by high-fidelity simulations on +a 2 degrees-of-freedom (DoFs) planar robot. + +
+
+ comment: This work has been submitted to ECC 2025 for possible publication +
+
+
+
+
+ + ☆ Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and + Re-Identification using Point Clouds + + +
+ Robotic fruit monitoring is a key step toward automated agricultural +production systems. Robots can significantly enhance plant and temporal fruit +monitoring by providing precise, high-throughput assessments that overcome the +limitations of traditional manual methods. Fruit monitoring is a challenging +task due to the significant variation in size, shape, orientation, and +occlusion of fruits. Also, fruits may be harvested or newly grown between +recording sessions. Most methods are 2D image-based and they lack the 3D +structure, depth, and spatial information, which represent key aspects of fruit +monitoring. 3D colored point clouds, instead, can offer this information but +they introduce challenges such as their sparsity and irregularity. In this +paper, we present a novel approach for temporal fruit monitoring that addresses +point clouds collected in a greenhouse over time. Our method segments fruits +using a learning-based instance segmentation approach directly on the point +cloud. Each segmented fruit is processed by a 3D sparse convolutional neural +network to extract descriptors, which are used in an attention-based matching +network to associate fruits with their instances from previous data +collections. Experimental results on a real dataset of strawberries demonstrate +that our approach outperforms other methods for fruits re-identification over +time, allowing for precise temporal fruit monitoring in real and complex +scenarios. + +
+
+ comment: Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit + Q-Learning + + +
+ Offline Reinforcement Learning (RL) has emerged as a powerful alternative to +imitation learning for behavior modeling in various domains, particularly in +complex navigation tasks. An existing challenge with Offline RL is the +signal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to +errors in value estimates. Towards this, multiple works have demonstrated the +advantage of hierarchical offline RL methods, which decouples high-level path +planning from low-level path following. In this work, we present a novel +hierarchical transformer-based approach leveraging a learned quantizer of the +space. This quantization enables the training of a simpler zone-conditioned +low-level policy and simplifies planning, which is reduced to discrete +autoregressive prediction. Among other benefits, zone-level reasoning in +planning enables explicit trajectory stitching rather than implicit stitching +based on noisy value function estimates. By combining this transformer-based +planner with recent advancements in offline RL, our proposed approach achieves +state-of-the-art results in complex long-distance navigation environments. + +
+
+ comment: Under review. Code will be released upon acceptance +
+
+
+
+
+ + ☆ EMPERROR: A Flexible Generative Perception Error Model for Probing + Self-Driving Planners + + +
+ To handle the complexities of real-world traffic, learning planners for +self-driving from data is a promising direction. While recent approaches have +shown great progress, they typically assume a setting in which the ground-truth +world state is available as input. However, when deployed, planning needs to be +robust to the long-tail of errors incurred by a noisy perception system, which +is often neglected in evaluation. To address this, previous work has proposed +drawing adversarial samples from a perception error model (PEM) mimicking the +noise characteristics of a target object detector. However, these methods use +simple PEMs that fail to accurately capture all failure modes of detection. In +this paper, we present EMPERROR, a novel transformer-based generative PEM, +apply it to stress-test an imitation learning (IL)-based planner and show that +it imitates modern detectors more faithfully than previous work. Furthermore, +it is able to produce realistic noisy inputs that increase the planner's +collision rate by up to 85%, demonstrating its utility as a valuable tool for a +more complete evaluation of self-driving planners. + +
+
+ comment: Project page: https://lasnik.github.io/emperror/ +
+
+
+
+
+ + ☆ OWLed: Outlier-weighed Layerwise Pruning for Efficient Autonomous + Driving Framework + + +
+ The integration of Large Language Models (LLMs) into autonomous driving +systems offers promising enhancements in environmental understanding and +decision-making. However, the substantial computational demands of deploying +LLMs locally on vehicles render this approach unfeasible for real-world +automotive applications. To address this challenge, we introduce OWLed, the +Outlier-Weighed Layerwise Pruning for Efficient Autonomous Driving Framework +that leverages outlier-weighted layerwise sparsity for model compression. Our +method assigns non-uniform sparsity ratios to different layers based on the +distribution of outlier features, significantly reducing the model size without +the need for fine-tuning. To ensure the compressed model adapts well to +autonomous driving tasks, we incorporate driving environment data into both the +calibration and pruning processes. Our empirical studies reveal that the +encoder component is more sensitive to pruning than the LLM, highlighting its +critical role in the system. Experimental results demonstrate that OWLed +outperforms existing methods in perception, action prediction, and language +understanding while substantially lowering computational requirements. These +findings underscore the potential of combining advanced pruning techniques with +LLMs to develop efficient and robust autonomous driving systems capable of +handling complex scenarios. Code will be made publicly available. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ RINO: Accurate, Robust Radar-Inertial Odometry with Non-Iterative + Estimation + + +
+ Precise localization and mapping are critical for achieving autonomous +navigation in self-driving vehicles. However, ego-motion estimation still faces +significant challenges, particularly when GNSS failures occur or under extreme +weather conditions (e.g., fog, rain, and snow). In recent years, scanning radar +has emerged as an effective solution due to its strong penetration +capabilities. Nevertheless, scanning radar data inherently contains high levels +of noise, necessitating hundreds to thousands of iterations of optimization to +estimate a reliable transformation from the noisy data. Such iterative solving +is time-consuming, unstable, and prone to failure. To address these challenges, +we propose an accurate and robust Radar-Inertial Odometry system, RINO, which +employs a non-iterative solving approach. Our method decouples rotation and +translation estimation and applies an adaptive voting scheme for 2D rotation +estimation, enhancing efficiency while ensuring consistent solving time. +Additionally, the approach implements a loosely coupled system between the +scanning radar and an inertial measurement unit (IMU), leveraging Error-State +Kalman Filtering (ESKF). Notably, we successfully estimated the uncertainty of +the pose estimation from the scanning radar, incorporating this into the +filter's Maximum A Posteriori estimation, a consideration that has been +previously overlooked. Validation on publicly available datasets demonstrates +that RINO outperforms state-of-the-art methods and baselines in both accuracy +and robustness. Our code is available at https://github.com/yangsc4063/rino. + +
+
+
+
+
+ + ☆ Human Arm Pose Estimation with a Shoulder-worn Force-Myography Device + for Human-Robot Interaction + + +
+ Accurate human pose estimation is essential for effective Human-Robot +Interaction (HRI). By observing a user's arm movements, robots can respond +appropriately, whether it's providing assistance or avoiding collisions. While +visual perception offers potential for human pose estimation, it can be +hindered by factors like poor lighting or occlusions. Additionally, wearable +inertial sensors, though useful, require frequent calibration as they do not +provide absolute position information. Force-myography (FMG) is an alternative +approach where muscle perturbations are externally measured. It has been used +to observe finger movements, but its application to full arm state estimation +is unexplored. In this letter, we investigate the use of a wearable FMG device +that can observe the state of the human arm for real-time applications of HRI. +We propose a Transformer-based model to map FMG measurements from the shoulder +of the user to the physical pose of the arm. The model is also shown to be +transferable to other users with limited decline in accuracy. Through +real-world experiments with a robotic arm, we demonstrate collision avoidance +without relying on visual perception. + +
+
+
+
+
+ + ☆ A Simple Multi-agent Joint Prediction Method for Autonomous Driving + + +
+ Predicting future motions of road participants is an important task for +driving autonomously. Most existing models excel at predicting the marginal +trajectory of a single agent, but predicting joint trajectories for multiple +agents that are consistent within a scene remains a challenge. Previous +research has often focused on marginal predictions, but the importance of joint +predictions has become increasingly apparent. Joint prediction aims to generate +trajectories that are consistent across the entire scene. Our research builds +upon the SIMPL baseline to explore methods for generating scene-consistent +trajectories. We tested our algorithm on the Argoverse 2 dataset, and +experimental results demonstrate that our approach can generate +scene-consistent trajectories. Compared to the SIMPL baseline, our method +significantly reduces the collision rate of joint trajectories within the +scene. + +
+
+
+
+
+ + ☆ Multiple Non-cooperative Targets Encirclement by Relative Distance based + Positioning and Neural Anti-Synchronization Control + + +
+ From prehistoric encirclement for hunting to GPS orbiting the earth for +positioning, target encirclement has numerous real world applications. However, +encircling multiple non-cooperative targets in GPS-denied environments remains +challenging. In this work, multiple targets encirclement by using a minimum of +two tasking agents, is considered where the relative distance measurements +between the agents and the targets can be obtained by using onboard sensors. +Based on the measurements, the center of all the targets is estimated directly +by a fuzzy wavelet neural network (FWNN) and the least squares fit method. +Then, a new distributed anti-synchronization controller (DASC) is designed so +that the two tasking agents are able to encircle all targets while staying +opposite to each other. In particular, the radius of the desired encirclement +trajectory can be dynamically determined to avoid potential collisions between +the two agents and all targets. Based on the Lyapunov stability analysis +method, the convergence proofs of the neural network prediction error, the +target-center position estimation error, and the controller error are addressed +respectively. Finally, both numerical simulations and UAV flight experiments +are conducted to demonstrate the validity of the encirclement algorithms. The +flight tests recorded video and other simulation results can be found in +https://youtu.be/B8uTorBNrl4. + +
+
+
+
+
+ + ☆ A High-frequency Pneumatic Oscillator for Soft Robotics + + +
+ Soft robots, while highly adaptable to diverse environments through various +actuation methods, still face significant performance boundaries due to the +inherent properties of materials. These limitations manifest in the challenge +of guaranteeing rapid response and large-scale movements simultaneously, +ultimately restricting the robots' absolute speed and overall efficiency. In +this paper, we introduce a high-frequency pneumatic oscillator (HIPO) to +overcome these challenges. Through a collision-induced phase resetting +mechanism, our HIPO leverages event-based nonlinearity to trigger +self-oscillation of the pneumatic actuator, which positively utilizes intrinsic +characteristics of materials. This enables the system to spontaneously generate +periodic control signals and directly produce motion responses, eliminating the +need for incorporating external actuation components. By efficiently and +rapidly converting internal energy of airflow into the kinetic energy of +robots, HIPO achieves a frequency of up to 20 Hz. Furthermore, we demonstrate +the versatility and high-performance capabilities of HIPO through bio-inspired +robots: an insect-like fast-crawler (with speeds up to 50.27 cm/s), a +high-frequency butterfly-like wing-flapper, and a maneuverable duck-like +swimmer. By eliminating external components and seamlessly fusing signal +generation, energy conversion, and motion output, HIPO unleashes rapid and +efficient motion, unlocking potential for high-performance soft robotics. + +
+
+
+
+
+ + ☆ Robotic Control Optimization Through Kernel Selection in Safe Bayesian + Optimization + + +
+ Control system optimization has long been a fundamental challenge in +robotics. While recent advancements have led to the development of control +algorithms that leverage learning-based approaches, such as SafeOpt, to +optimize single feedback controllers, scaling these methods to high-dimensional +complex systems with multiple controllers remains an open problem. In this +paper, we propose a novel learning-based control optimization method, which +enhances the additive Gaussian process-based Safe Bayesian Optimization +algorithm to efficiently tackle high-dimensional problems through kernel +selection. We use PID controller optimization in drones as a representative +example and test the method on Safe Control Gym, a benchmark designed for +evaluating safe control techniques. We show that the proposed method provides a +more efficient and optimal solution for high-dimensional control optimization +problems, demonstrating significant improvements over existing techniques. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Robotics and + Biomimetics (ROBIO) +
+
+
+
+
+ + ☆ SP-VIO: Robust and Efficient Filter-Based Visual Inertial Odometry with + State Transformation Model and Pose-Only Visual Description + + +
+ Due to the advantages of high computational efficiency and small memory +requirements, filter-based visual inertial odometry (VIO) has a good +application prospect in miniaturized and payload-constrained embedded systems. +However, the filter-based method has the problem of insufficient accuracy. To +this end, we propose the State transformation and Pose-only VIO (SP-VIO) by +rebuilding the state and measurement models, and considering further visual +deprived conditions. In detail, we first proposed a system model based on the +double state transformation extended Kalman filter (DST-EKF), which has been +proven to have better observability and consistency than the models based on +extended Kalman filter (EKF) and state transformation extended Kalman filter +(ST-EKF). Secondly, to reduce the influence of linearization error caused by +inaccurate 3D reconstruction, we adopt the Pose-only (PO) theory to decouple +the measurement model from 3D features. Moreover, to deal with visual deprived +conditions, we propose a double state transformation Rauch-Tung-Striebel +(DST-RTS) backtracking method to optimize motion trajectories during visual +interruption. + Experiments on public (EuRoC, Tum-VI, KITTI) and personal datasets show that +SP-VIO has better accuracy and efficiency than state-of-the-art (SOTA) VIO +algorithms, and has better robustness under visual deprived conditions. + +
+
+
+
+
+ + ☆ Learning Autonomous Docking Operation of Fully Actuated Autonomous + Surface Vessel from Expert data + + +
+ This paper presents an approach for autonomous docking of a fully actuated +autonomous surface vessel using expert demonstration data. We frame the docking +problem as an imitation learning task and employ inverse reinforcement learning +(IRL) to learn a reward function from expert trajectories. A two-stage neural +network architecture is implemented to incorporate both environmental context +from sensors and vehicle kinematics into the reward function. The learned +reward is then used with a motion planner to generate docking trajectories. +Experiments in simulation demonstrate the effectiveness of this approach in +producing human-like docking behaviors across different environmental +configurations. + +
+
+ comment: 5 pages, 8 figures, IEEE Oceans Halifax 2024 Conference, Presented in + September 2024 in IEEE Oceans Conference in Halifax, Canada as a Student + Poster +
+
+
+
+
+ + ☆ Effective Virtual Reality Teleoperation of an Upper-body Humanoid with + Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision + Avoidance IROS 2022 + + +
+ We present an approach for retargeting off-the-shelf Virtual Reality (VR) +trackers to effectively teleoperate an upper-body humanoid while ensuring +self-collision-free motions. Key to the effectiveness was the proper assignment +of trackers to joint sets via modified task Jacobians and relaxed barrier +functions for self-collision avoidance. The approach was validated on +Apptronik's Astro hardware by demonstrating manipulation capabilities on a +table-top environment with pick-and-place box packing and a two-handed box pick +up and handover task. + +
+
+ comment: XR & Robotics Workshop, IROS 2022 +
+
+
+
+
+ + ☆ Enhanced Monocular Visual Odometry with AR Poses and Integrated INS-GPS + for Robust Localization in Urban Environments + + +
+ This paper introduces a cost-effective localization system combining +monocular visual odometry, augmented reality (AR) poses, and integrated +INS-GPS data. We address monocular VO scale factor issues using AR poses and +enhance accuracy with INS and GPS data, filtered through an Extended Kalman +Filter. Our approach, tested using manually annotated trajectories from Google +Street View, achieves an RMSE of 1.529 meters over a 1 km track. Future work +will focus on real-time mobile implementation and further integration of +visual-inertial odometry for robust localization. This method offers lane-level +accuracy with minimal hardware, making advanced navigation more accessible. + +
+
+ comment: The copyright of this paper would be given to IEEE after "acceptance + of paper by IEEE" +
+
+
+
+
+ + ☆ Point Cloud Context Analysis for Rehabilitation Grasping Assistance + + +
+ Controlling hand exoskeletons for assisting impaired patients in grasping +tasks is challenging because it is difficult to infer user intent. We +hypothesize that the majority of daily grasping tasks fall into a small set of +categories or modes which can be inferred through real-time analysis of +environmental geometry from 3D point clouds. This paper presents a low-cost, +real-time system for semantic image labeling of household scenes with the +objective to inform and assist activities of daily living. The system consists +of a miniature depth camera, an inertial measurement unit and a microprocessor. +It is able to achieve 85% or higher accuracy at classification of predefined +modes while processing complex 3D scenes at over 30 frames per second. Within +each mode it can detect and localize graspable objects. Grasping points can be +correctly estimated on average within 1 cm for simple object geometries. The +system has potential applications in robotic-assisted rehabilitation as well as +manual task assistance. + +
+
+
+
+
+ + ☆ Emergent functional dynamics of link-bots + + +
+ Synthetic active collectives, composed of many nonliving individuals capable +of cooperative changes in group shape and dynamics, hold promise for practical +applications and for the elucidation of guiding principles of natural +collectives. However, the design of collective robotic systems that operate +effectively without intelligence or complex control at either the individual or +group level is challenging. We investigate how simple steric interaction +constraints between active individuals produce a versatile active system with +promising functionality. Here we introduce the link-bot: a V-shape-based, +single-stranded chain composed of active bots whose dynamics are defined by its +geometric link constraints, allowing it to possess scale- and processing-free +programmable collective behaviors. A variety of emergent properties arise from +this dynamic system, including locomotion, navigation, transportation, and +competitive or cooperative interactions. Through the control of a few link +parameters, link-bots show rich usefulness by performing a variety of divergent +tasks, including traversing or obstructing narrow spaces, passing by or +enclosing objects, and propelling loads in both forward and backward +directions. The reconfigurable nature of the link-bot suggests that our +approach may significantly contribute to the development of programmable soft +robotic systems with minimal information and materials at any scale. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ☆ Visual Tracking with Intermittent Visibility: Switched Control Design + and Implementation + + +
+ This paper addresses the problem of visual target tracking in scenarios where +a pursuer may experience intermittent loss of visibility of the target. The +design of a Switched Visual Tracker (SVT) is presented which aims to meet the +competing requirements of maintaining both proximity and visibility. SVT +alternates between a visual tracking mode for following the target, and a +recovery mode for regaining visual contact when the target falls out of sight. +We establish the stability of SVT by extending the average dwell time theorem +from switched systems theory, which may be of independent interest. Our +implementation of SVT on an Agilicious drone [1] illustrates its effectiveness +on tracking various target trajectories: it reduces the average tracking error +by up to 45% and significantly improves visibility duration compared to a +baseline algorithm. The results show that our approach effectively handles +intermittent vision loss, offering enhanced robustness and adaptability for +real-world autonomous missions. Additionally, we demonstrate how the stability +analysis provides valuable guidance for selecting parameters, such as tracking +speed and recovery distance, to optimize the SVT's performance. + +
+
+
+
+
+ + ☆ Simultaneous Locomotion Mode Classification and Continuous Gait Phase + Estimation for Transtibial Prostheses + + +
+ Recognizing and identifying human locomotion is a critical step to ensuring +fluent control of wearable robots, such as transtibial prostheses. In +particular, classifying the intended locomotion mode and estimating the gait +phase are key. In this work, a novel, interpretable, and computationally +efficient algorithm is presented for simultaneously predicting locomotion mode +and gait phase. Using able-bodied (AB) and transtibial prosthesis (PR) data, +seven locomotion modes are tested including slow, medium, and fast level +walking (0.6, 0.8, and 1.0 m/s), ramp ascent/descent (5 degrees), and stair +ascent/descent (20 cm height). Overall classification accuracy was 99.1$\%$ and +99.3$\%$ for the AB and PR conditions, respectively. The average gait phase +error across all data was less than 4$\%$. Exploiting the structure of the +data, computational efficiency reached 2.91 $\mu$s per time step. The time +complexity of this algorithm scales as $O(N\cdot M)$ with the number of +locomotion modes $M$ and samples per gait cycle $N$. This efficiency and high +accuracy could accommodate a much larger set of locomotion modes ($\sim$ 700 on +Open-Source Leg Prosthesis) to handle the wide range of activities pursued by +individuals during daily living. + +
+
+
+
+
+ + ♻ ☆ Goal-oriented Semantic Communications for Robotic Waypoint Transmission: + The Value and Age of Information Approach + + +
+ The ultra-reliable and low-latency communication (URLLC) service of the +fifth-generation (5G) mobile communication network struggles to support safe +robot operation. Nowadays, the sixth-generation (6G) mobile communication +network is proposed to provide hyper-reliable and low-latency communication to +enable safer control for robots. However, current 5G/ 6G research mainly +focused on improving communication performance, while the robotics community +mostly assumed communication to be ideal. To jointly consider communication and +robotic control with a focus on the specific robotic task, we propose +goal-oriented semantic communication in robotic control (GSRC) to exploit the +context of data and its importance in achieving the task at both transmitter +and receiver. At the transmitter, we propose a deep reinforcement learning +algorithm to generate optimal control and command (C&C) data and a proactive +repetition scheme (DeepPro) to increase the successful transmission +probability. At the receiver, we design the value of information (VoI) and age +of information (AoI) based queue ordering mechanism (VA-QOM) to rank the queue +based on the semantic information extracted from AoI and VoI. The simulation +results validate that our proposed GSRC framework achieves a 91.5% improvement +in the mean square error compared to the traditional unmanned aerial vehicle +control framework. + +
+
+ comment: The paper has been accepted in IEEE TWC +
+
+
+
+
+ + ♻ ☆ Dynamic planning in hierarchical active inference + + +
+ By dynamic planning, we refer to the ability of the human brain to infer and +impose motor trajectories related to cognitive decisions. A recent paradigm, +active inference, brings fundamental insights into the adaptation of biological +organisms, constantly striving to minimize prediction errors to restrict +themselves to life-compatible states. Over the past years, many studies have +shown how human and animal behaviors could be explained in terms of active +inference - either as discrete decision-making or continuous motor control - +inspiring innovative solutions in robotics and artificial intelligence. Still, +the literature lacks a comprehensive outlook on effectively planning realistic +actions in changing environments. Setting ourselves the goal of modeling +complex tasks such as tool use, we delve into the topic of dynamic planning in +active inference, keeping in mind two crucial aspects of biological behavior: +the capacity to understand and exploit affordances for object manipulation, and +to learn the hierarchical interactions between the self and the environment, +including other agents. We start from a simple unit and gradually describe more +advanced structures, comparing recently proposed design choices and providing +basic examples. This study distances itself from traditional views centered on +neural networks and reinforcement learning, and points toward a yet unexplored +direction in active inference: hybrid representations in hierarchical models. + +
+
+
+
+
+ + ♻ Bootstrapping Reinforcement Learning with Imitation for Vision-Based + Agile Flight CoRL + + +
+ Learning visuomotor policies for agile quadrotor flight presents significant +difficulties, primarily from inefficient policy exploration caused by +high-dimensional visual inputs and the need for precise and low-latency +control. To address these challenges, we propose a novel approach that combines +the performance of Reinforcement Learning (RL) and the sample efficiency of +Imitation Learning (IL) in the task of vision-based autonomous drone racing. +While RL provides a framework for learning high-performance controllers through +trial and error, it faces challenges with sample efficiency and computational +demands due to the high dimensionality of visual inputs. Conversely, IL +efficiently learns from visual expert demonstrations, but it remains limited by +the expert's performance and state distribution. To overcome these limitations, +our policy learning framework integrates the strengths of both approaches. Our +framework contains three phases: training a teacher policy using RL with +privileged state information, distilling it into a student policy via IL, and +adaptive fine-tuning via RL. Testing in both simulated and real-world scenarios +shows our approach can not only learn in scenarios where RL from scratch fails +but also outperforms existing IL methods in both robustness and performance, +successfully navigating a quadrotor through a race course using only visual +information. Videos of the experiments are available at +https://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html. + +
+
+ comment: 8th Annual Conference on Robot Learning (CoRL) +
+
+
+
+
+ + ♻ ☆ Towards Generalist Robot Learning from Internet Video: A Survey + + +
+ Scaling deep learning to massive, diverse internet data has yielded +remarkably general capabilities in visual and natural language understanding +and generation. However, data has remained scarce and challenging to collect in +robotics, seeing robot learning struggle to obtain similarly general +capabilities. Promising Learning from Videos (LfV) methods aim to address the +robotics data bottleneck by augmenting traditional robot data with large-scale +internet video data. This video data offers broad foundational information +regarding physical behaviour and the underlying physics of the world, and thus +can be highly informative for a generalist robot. + In this survey, we present a thorough overview of the emerging field of LfV. +We outline fundamental concepts, including the benefits and challenges of LfV. +We provide a comprehensive review of current methods for extracting knowledge +from large-scale internet video, addressing key challenges in LfV, and boosting +downstream robot and reinforcement learning via the use of video data. The +survey concludes with a critical discussion of challenges and opportunities in +LfV. Here, we advocate for scalable foundation model approaches that can +leverage the full range of available internet video to improve the learning of +robot policies and dynamics models. We hope this survey can inform and catalyse +further LfV research, driving progress towards the development of +general-purpose robots. + +
+
+
+
+
+ + ♻ ☆ LiCS: Navigation using Learned-imitation on Cluttered Space + + +
+ In this letter, we propose a robust and fast navigation system in a narrow +indoor environment for UGV (Unmanned Ground Vehicle) using 2D LiDAR and +odometry. We used behavior cloning with Transformer neural network to learn the +optimization-based baseline algorithm. We inject Gaussian noise during expert +demonstration to increase the robustness of learned policy. We evaluate the +performance of LiCS using both simulation and hardware experiments. It +outperforms all other baselines in terms of navigation performance and can +maintain its robust performance even on highly cluttered environments. During +the hardware experiments, LiCS can maintain safe navigation at maximum speed of +$1.5\ m/s$. + +
+
+ comment: 6 pages, 4 figures. This work has been submitted to the IEEE for + possible publication +
+
+
+
+
+ + ♻ ☆ Admittance Visuomotor Policy Learning for General-Purpose Contact-Rich + Manipulations + + +
+ Contact force in contact-rich environments is an essential modality for +robots to perform general-purpose manipulation tasks, as it provides +information to compensate for the deficiencies of visual and proprioceptive +data in collision perception, high-precision grasping, and efficient +manipulation. In this paper, we propose an admittance visuomotor policy +framework for continuous, general-purpose, contact-rich manipulations. During +demonstrations, we designed a low-cost, user-friendly teleoperation system with +contact interaction, aiming to gather compliant robot demonstrations and +accelerate the data collection process. During training and inference, we +propose a diffusion-based model to plan action trajectories and desired contact +forces from multimodal observation that includes contact force, vision and +proprioception. We utilize an admittance controller for compliance action +execution. A comparative evaluation with two state-of-the-art methods was +conducted on five challenging tasks, each focusing on different action +primitives, to demonstrate our framework's generalization capabilities. Results +show our framework achieves the highest success rate and exhibits smoother and +more efficient contact compared to other methods; the contact force required to +complete each task was reduced on average by 48.8%, and the success rate was +increased on average by 15.3%. Videos are available at +https://ryanjiao.github.io/AdmitDiffPolicy/. + +
+
+ comment: 8 pages, 7 figures. This is the second version of the paper, and it + is subject to further revisions. The current submission does not necessarily + reflect the final quality or content of the paper +
+
+
+
+
+ + ♻ ☆ Cross-Domain Transfer Learning using Attention Latent Features for + Multi-Agent Trajectory Prediction + + +
+ With the advancements of sensor hardware, traffic infrastructure and deep +learning architectures, trajectory prediction of vehicles has established a +solid foundation in intelligent transportation systems. However, existing +solutions are often tailored to specific traffic networks at particular time +periods. Consequently, deep learning models trained on one network may struggle +to generalize effectively to unseen networks. To address this, we proposed a +novel spatial-temporal trajectory prediction framework that performs +cross-domain adaption on the attention representation of a Transformer-based +model. A graph convolutional network is also integrated to construct dynamic +graph feature embeddings that accurately model the complex spatial-temporal +interactions between the multi-agent vehicles across multiple traffic domains. +The proposed framework is validated on two case studies involving the +cross-city and cross-period settings. Experimental results show that our +proposed framework achieves superior trajectory prediction and domain +adaptation performances over the state-of-the-art models. + +
+
+ comment: Accepted at the IEEE International Conference on Systems, Man, and + Cybernetics 2024 +
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery CVPR2024 + + +
+ Marine surveys by robotic underwater and surface vehicles result in +substantial quantities of coral reef imagery, however labeling these images is +expensive and time-consuming for domain experts. Point label propagation is a +technique that uses existing images labeled with sparse points to create +augmented ground truth data, which can be used to train a semantic segmentation +model. In this work, we show that recent advances in large foundation models +facilitate the creation of augmented ground truth masks using only features +extracted by the denoised version of the DINOv2 foundation model and K-Nearest +Neighbors (KNN), without any pre-training. For images with extremely sparse +labels, we present a labeling method based on human-in-the-loop principles, +which greatly enhances annotation efficiency: in the case that there are 5 +point labels per image, our human-in-the-loop method outperforms the prior +state-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9% +and 18.3% if there are 10 point labels. When human-in-the-loop labeling is not +available, using the denoised DINOv2 features with a KNN still improves on the +prior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid +points). On the semantic segmentation task, we outperform the prior +state-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5 +point labels are used for point label propagation. Additionally, we perform a +comprehensive study into the impacts of the point label placement style and the +number of points on the point label propagation quality, and make several +recommendations for improving the efficiency of labeling images with points. + +
+
+ comment: Journal article preprint of extended paper, 30 pages, 11 figures. + Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on + Learning with Limited Labelled Data for Image and Video Understanding + (L3D-IVU) +
+
+
+
+
+ + ♻ ☆ Traversability-Aware Legged Navigation by Learning from Real-World + Visual Data + + +
+ The enhanced mobility brought by legged locomotion empowers quadrupedal +robots to navigate through complex and unstructured environments. However, +optimizing agile locomotion while accounting for the varying energy costs of +traversing different terrains remains an open challenge. Most previous work +focuses on planning trajectories with traversability cost estimation based on +human-labeled environmental features. However, this human-centric approach is +insufficient because it does not account for the varying capabilities of the +robot locomotion controllers over challenging terrains. To address this, we +develop a novel traversability estimator in a robot-centric manner, based on +the value function of the robot's locomotion controller. This estimator is +integrated into a new learning-based RGBD navigation framework. The framework +employs multiple training stages to develop a planner that guides the robot in +avoiding obstacles and hard-to-traverse terrains while reaching its goals. The +training of the navigation planner is directly performed in the real world +using a sample efficient reinforcement learning method that utilizes both +online data and offline datasets. Through extensive benchmarking, we +demonstrate that the proposed framework achieves the best performance in +accurate traversability cost estimation and efficient learning from multi-modal +data (including the robot's color and depth vision, as well as proprioceptive +feedback) for real-world training. Using the proposed method, a quadrupedal +robot learns to perform traversability-aware navigation through trial and error +in various real-world environments with challenging terrains that are difficult +to classify using depth vision alone. Moreover, the robot demonstrates the +ability to generalize the learned navigation skills to unseen scenarios. Video +can be found at https://youtu.be/RSqnIWZ1qks. + +
+
+
+
+
+ + ♻ ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for + Robotic Manipulation + + +
+ Representing robotic manipulation tasks as constraints that associate the +robot and the environment is a promising way to encode desired robot behaviors. +However, it remains unclear how to formulate the constraints such that they are +1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable +by off-the-shelf solvers to produce robot actions in real-time. In this work, +we introduce Relational Keypoint Constraints (ReKep), a visually-grounded +representation for constraints in robotic manipulation. Specifically, ReKep is +expressed as Python functions mapping a set of 3D keypoints in the environment +to a numerical cost. We demonstrate that by representing a manipulation task as +a sequence of Relational Keypoint Constraints, we can employ a hierarchical +optimization procedure to solve for robot actions (represented by a sequence of +end-effector poses in SE(3)) with a perception-action loop at a real-time +frequency. Furthermore, in order to circumvent the need for manual +specification of ReKep for each new task, we devise an automated procedure that +leverages large vision models and vision-language models to produce ReKep from +free-form language instructions and RGB-D observations. We present system +implementations on a wheeled single-arm platform and a stationary dual-arm +platform that can perform a large variety of manipulation tasks, featuring +multi-stage, in-the-wild, bimanual, and reactive behaviors, all without +task-specific data or environment models. Website at +https://rekep-robot.github.io/. + +
+
+
+
+
+ + ♻ ☆ WildScenes: A Benchmark for 2D and 3D Semantic Segmentation in + Large-scale Natural Environments + + +
+ Recent progress in semantic scene understanding has primarily been enabled by +the availability of semantically annotated bi-modal (camera and LiDAR) datasets +in urban environments. However, such annotated datasets are also needed for +natural, unstructured environments to enable semantic perception for +applications, including conservation, search and rescue, environment +monitoring, and agricultural automation. Therefore, we introduce $WildScenes$, +a bi-modal benchmark dataset consisting of multiple large-scale, sequential +traversals in natural environments, including semantic annotations in +high-resolution 2D images and dense 3D LiDAR point clouds, and accurate 6-DoF +pose information. The data is (1) trajectory-centric with accurate localization +and globally aligned point clouds, (2) calibrated and synchronized to support +bi-modal training and inference, and (3) containing different natural +environments over 6 months to support research on domain adaptation. Our 3D +semantic labels are obtained via an efficient, automated process that transfers +the human-annotated 2D labels from multiple views into 3D point cloud +sequences, thus circumventing the need for expensive and time-consuming human +annotation in 3D. We introduce benchmarks on 2D and 3D semantic segmentation +and evaluate a variety of recent deep-learning techniques to demonstrate the +challenges in semantic segmentation in natural environments. We propose +train-val-test splits for standard benchmarks as well as domain adaptation +benchmarks and utilize an automated split generation technique to ensure the +balance of class label distributions. The $WildScenes$ benchmark webpage is +https://csiro-robotics.github.io/WildScenes, and the data is publicly available +at https://data.csiro.au/collection/csiro:61541 . + +
+
+ comment: Accepted in The International Journal of Robotics Research (IJRR)
+
+
+
+
+ + ♻ ☆ Towards Efficient Motion Planning for UAVs: Lazy A* Search with Motion + Primitives + + +
+ Search-based motion planning algorithms have been widely utilized for +unmanned aerial vehicles (UAVs). However, deploying these algorithms on real +UAVs faces challenges due to limited onboard computational resources. The +algorithms struggle to find solutions in high-dimensional search spaces and +require considerable time to ensure that the trajectories are dynamically +feasible. This paper incorporates the lazy search concept into search-based +planning algorithms to address the critical issue of real-time planning for +collision-free and dynamically feasible trajectories on UAVs. We demonstrate +that the lazy search motion planning algorithm can efficiently find optimal +trajectories and significantly improve computational efficiency. + +
+
+
+
+
+ + ♻ ☆ CoBL-Diffusion: Diffusion-Based Conditional Robot Planning in Dynamic + Environments Using Control Barrier and Lyapunov Functions + + +
+ Equipping autonomous robots with the ability to navigate safely and +efficiently around humans is a crucial step toward achieving trusted robot +autonomy. However, generating robot plans while ensuring safety in dynamic +multi-agent environments remains a key challenge. Building upon recent work on +leveraging deep generative models for robot planning in static environments, +this paper proposes CoBL-Diffusion, a novel diffusion-based safe robot planner +for dynamic environments. CoBL-Diffusion uses Control Barrier and Lyapunov +functions to guide the denoising process of a diffusion model, iteratively +refining the robot control sequence to satisfy the safety and stability +constraints. We demonstrate the effectiveness of the proposed model using two +settings: a synthetic single-agent environment and a real-world pedestrian +dataset. Our results show that CoBL-Diffusion generates smooth trajectories +that enable the robot to reach goal locations while maintaining a low collision +rate with dynamic obstacles. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 148 + +
+
+
+ + ☆ Material Transforms from Disentangled NeRF Representations + + +
+ In this paper, we first propose a novel method for transferring material +transformations across different scenes. Building on disentangled Neural +Radiance Field (NeRF) representations, our approach learns to map Bidirectional +Reflectance Distribution Functions (BRDF) from pairs of scenes observed in +varying conditions, such as dry and wet. The learned transformations can then +be applied to unseen scenes with similar materials, therefore effectively +rendering the transformation learned with an arbitrary level of intensity. +Extensive experiments on synthetic scenes and real-world objects validate the +effectiveness of our approach, showing that it can learn various +transformations such as wetness, painting, coating, etc. Our results highlight +not only the versatility of our method but also its potential for practical +applications in computer graphics. We publish our method implementation, along +with our synthetic/real datasets on +https://github.com/astra-vision/BRDFTransform + +
+
+
+
+
+ + ☆ Scaling Properties of Diffusion Models for Perceptual Tasks + + +
+ In this paper, we argue that iterative computation with diffusion models +offers a powerful paradigm for not only generation but also visual perception +tasks. We unify tasks such as depth estimation, optical flow, and segmentation +under image-to-image translation, and show how diffusion models benefit from +scaling training and test-time compute for these perception tasks. Through a +careful analysis of these scaling behaviors, we present various techniques to +efficiently train diffusion models for visual perception tasks. Our models +achieve improved or comparable performance to state-of-the-art methods using +significantly less data and compute. To use our code and models, see +https://scaling-diffusion-perception.github.io . + +
+
+
+
+
+ + ☆ GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D + Generation + + +
+ While 3D content generation has advanced significantly, existing methods +still face challenges with input formats, latent space design, and output +representations. This paper introduces a novel 3D generation framework that +addresses these challenges, offering scalable, high-quality 3D generation with +an interactive Point Cloud-structured Latent space. Our framework employs a +Variational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal) +renderings as input, using a unique latent space design that preserves 3D shape +information, and incorporates a cascaded latent diffusion model for improved +shape-texture disentanglement. The proposed method, GaussianAnything, supports +multi-modal conditional 3D generation, allowing for point cloud, caption, and +single/multi-view image inputs. Notably, the newly proposed latent space +naturally enables geometry-texture disentanglement, thus allowing 3D-aware +editing. Experimental results demonstrate the effectiveness of our approach on +multiple datasets, outperforming existing methods in both text- and +image-conditioned 3D generation. + +
+
+ comment: project page: https://nirvanalan.github.io/projects/GA/ +
+
+
+
+
+ + ☆ LLMPhy: Complex Physical Reasoning Using Large Language Models and World + Models + + +
+ Physical reasoning is an important skill needed for robotic agents when +operating in the real world. However, solving such reasoning problems often +involves hypothesizing and reflecting over complex multi-body interactions +under the effect of a multitude of physical forces and thus learning all such +interactions poses a significant hurdle for state-of-the-art machine learning +frameworks, including large language models (LLMs). To study this problem, we +propose a new physical reasoning task and a dataset, dubbed TraySim. Our task +involves predicting the dynamics of several objects on a tray that is given an +external impact -- the domino effect of the ensued object interactions and +their dynamics thus offering a challenging yet controlled setup, with the goal +of reasoning being to infer the stability of the objects after the impact. To +solve this complex physical reasoning task, we present LLMPhy, a zero-shot +black-box optimization framework that leverages the physics knowledge and +program synthesis abilities of LLMs, and synergizes these abilities with the +world models built into modern physics engines. Specifically, LLMPhy uses an +LLM to generate code to iteratively estimate the physical hyperparameters of +the system (friction, damping, layout, etc.) via an implicit +analysis-by-synthesis approach using a (non-differentiable) simulator in the +loop and uses the inferred parameters to imagine the dynamics of the scene +towards solving the reasoning task. To show the effectiveness of LLMPhy, we +present experiments on our TraySim dataset to predict the steady-state poses of +the objects. Our results show that the combination of the LLM and the physics +engine leads to state-of-the-art zero-shot physical reasoning performance, +while demonstrating superior convergence against standard black-box +optimization methods and better estimation of the physical parameters. + +
+
+
+
+
+ + ☆ Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model + with Compact Wavelet Encodings + + +
+ Large-scale 3D generative models require substantial computational resources +yet often fall short in capturing fine details and complex geometries at high +resolutions. We attribute this limitation to the inefficiency of current +representations, which lack the compactness required to model the generative +models effectively. To address this, we introduce a novel approach called +Wavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based, +compact latent encodings. Specifically, we compress a $256^3$ signed distance +field into a $12^3 \times 4$ latent grid, achieving an impressive 2427x +compression ratio with minimal loss of detail. This high level of compression +allows our method to efficiently train large-scale generative networks without +increasing the inference time. Our models, both conditional and unconditional, +contain approximately one billion parameters and successfully generate +high-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid +inference, producing shapes within two to four seconds depending on the +condition, despite the model's scale. We demonstrate state-of-the-art +performance across multiple datasets, with significant improvements in +generation quality, diversity, and computational efficiency. We open-source our +code and, to the best of our knowledge, release the largest pretrained 3D +generative models across different modalities. + +
+
+
+
+
+ + ☆ Artistic Neural Style Transfer Algorithms with Activation Smoothing + + +
+ The works of Gatys et al. demonstrated the capability of Convolutional Neural +Networks (CNNs) in creating artistic style images. This process of transferring +content images in different styles is called Neural Style Transfer (NST). In +this paper, we re-implement image-based NST, fast NST, and arbitrary NST. We +also explore utilizing ResNet with activation smoothing in NST. Extensive +experimental results demonstrate that smoothing transformation can greatly +improve the quality of stylization results. + +
+
+ comment: 8 pages, 7 figures
+
+
+
+
+ + ☆ DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring + + +
+ Coronary artery disease (CAD) is one of the most common causes of mortality in +the world. Coronary artery calcium (CAC) scoring using computed tomography (CT) +is key for risk assessment to prevent coronary disease. Previous studies on +risk assessment and calcification detection in CT scans primarily use +approaches based on UNET architecture, frequently implemented on pre-built +models. However, these models are limited by the availability of annotated CT +scans containing CAC and suffer from imbalanced datasets, decreasing +performance of CAC segmentation and scoring. In this study, we extend this +approach by incorporating the self-supervised learning (SSL) technique of DINO +(self-distillation with no labels) to eliminate limitations of scarce annotated +data in CT scans. The DINO model's ability to train without requiring CAC area +annotations enhances its robustness in generating distinct features. The DINO +model is trained to focus specifically on calcified areas by using labels, +aiming to generate features that effectively capture and highlight key +characteristics. The label-guided DINO (DINO-LG) enhances classification by +distinguishing CT slices that contain calcification from those that do not, +performing 57% better than the standard DINO model in this task. CAC scoring +and segmentation tasks are performed by a basic U-NET architecture, fed +specifically with CT slices containing calcified areas as identified by the +DINO-LG model. This targeted identification performed by the DINO-LG model improves +CAC segmentation performance by approximately 10% and significantly increases +CAC scoring accuracy. + +
+
+ comment: Developed by Center for Applied Artificial Intelligence (CAAI), + University of Kentucky +
+
+
+
+
+ + ☆ JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified + Multimodal Understanding and Generation + + +
+ We present JanusFlow, a powerful framework that unifies image understanding +and generation in a single model. JanusFlow introduces a minimalist +architecture that integrates autoregressive language models with rectified +flow, a state-of-the-art method in generative modeling. Our key finding +demonstrates that rectified flow can be straightforwardly trained within the +large language model framework, eliminating the need for complex architectural +modifications. To further improve the performance of our unified model, we +adopt two key strategies: (i) decoupling the understanding and generation +encoders, and (ii) aligning their representations during unified training. +Extensive experiments show that JanusFlow achieves comparable or superior +performance to specialized models in their respective domains, while +significantly outperforming existing unified approaches across standard +benchmarks. This work represents a step toward more efficient and versatile +vision-language models. + +
+
+
+
+
+ + ☆ Commissioning An All-Sky Infrared Camera Array for Detection Of Airborne + Objects + + +
+ To date there is little publicly available scientific data on Unidentified +Aerial Phenomena (UAP) whose properties and kinematics purportedly reside +outside the performance envelope of known phenomena. To address this +deficiency, the Galileo Project is designing, building, and commissioning a +multi-modal ground-based observatory to continuously monitor the sky and +conduct a rigorous long-term aerial census of all aerial phenomena, including +natural and human-made. One of the key instruments is an all-sky infrared +camera array using eight uncooled long-wave infrared FLIR Boson 640 cameras. +Their calibration includes a novel extrinsic calibration method using airplane +positions from Automatic Dependent Surveillance-Broadcast (ADS-B) data. We +establish a first baseline for the system performance over five months of field +operation, using a real-world dataset derived from ADS-B data, synthetic 3-D +trajectories, and a hand-labelled real-world dataset. We report acceptance +rates (e.g. viewable airplanes that are recorded) and detection efficiencies +(e.g. recorded airplanes which are successfully detected) for a variety of +weather conditions, range and aircraft size. We reconstruct $\sim$500,000 +trajectories of aerial objects from this commissioning period. A toy outlier +search focused on large sinuosity of the 2-D reconstructed trajectories flags +about 16% of trajectories as outliers. After manual review, 144 trajectories +remain ambiguous: they are likely mundane objects but cannot be elucidated at +this stage of development without distance and kinematics estimation or other +sensor modalities. Our observed count of ambiguous outliers combined with +systematic uncertainties yields an upper limit of 18,271 outliers count for the +five-month interval at a 95% confidence level. This likelihood-based method to +evaluate significance is applicable to all of our future outlier searches. + +
+
+
+
+
+ + ☆ SimBase: A Simple Baseline for Temporal Video Grounding + + +
+ This paper presents SimBase, a simple yet effective baseline for temporal +video grounding. While recent advances in temporal grounding have led to +impressive performance, they have also driven network architectures toward +greater complexity, with a range of methods to (1) capture temporal +relationships and (2) achieve effective multimodal fusion. In contrast, this +paper explores the question: How effective can a simplified approach be? To +investigate, we design SimBase, a network that leverages lightweight, +one-dimensional temporal convolutional layers instead of complex temporal +structures. For cross-modal interaction, SimBase only employs an element-wise +product instead of intricate multimodal fusion. Remarkably, SimBase achieves +state-of-the-art results on two large-scale datasets. As a simple yet powerful +baseline, we hope SimBase will spark new ideas and streamline future +evaluations in temporal video grounding. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with + Generative Adversarial Networks + + +
+ Computed tomography (CT) provides highly detailed three-dimensional (3D) +medical images but is costly, time-consuming, and often inaccessible in +intraoperative settings (Organization et al. 2011). Recent advancements have +explored reconstructing 3D chest volumes from sparse 2D X-rays, such as +single-view or orthogonal double-view images. However, current models tend to +process 2D images in a planar manner, prioritizing visual realism over +structural accuracy. In this work, we introduce DuoLift Generative Adversarial +Networks (DuoLift-GAN), a novel architecture with dual branches that +independently elevate 2D images and their features into 3D representations. +These 3D outputs are merged into a unified 3D feature map and decoded into a +complete 3D chest volume, enabling richer 3D information capture. We also +present a masked loss function that directs reconstruction towards critical +anatomical regions, improving structural accuracy and visual quality. This +paper demonstrates that DuoLift-GAN significantly enhances reconstruction +accuracy while achieving superior visual realism compared to existing methods. + +
+
+
+
+
+ + ☆ Automatic dataset shift identification to support root cause analysis of + AI performance drift + + +
+ Shifts in data distribution can substantially harm the performance of +clinical AI models. Hence, various methods have been developed to detect the +presence of such shifts at deployment time. However, root causes of dataset +shifts are varied, and the choice of shift mitigation strategies is highly +dependent on the precise type of shift encountered at test time. As such, +detecting test-time dataset shift is not sufficient: precisely identifying +which type of shift has occurred is critical. In this work, we propose the +first unsupervised dataset shift identification framework, effectively +distinguishing between prevalence shift (caused by a change in the label +distribution), covariate shift (caused by a change in input characteristics) +and mixed shifts (simultaneous prevalence and covariate shifts). We discuss the +importance of self-supervised encoders for detecting subtle covariate shifts +and propose a novel shift detector leveraging both self-supervised encoders and +task model outputs for improved shift detection. We report promising results +for the proposed shift identification framework across three different imaging +modalities (chest radiography, digital mammography, and retinal fundus images) +on five types of real-world dataset shifts, using four large publicly available +datasets. + +
+
+ comment: Code available at + https://github.com/biomedia-mira/shift_identification +
+
+
+
+
+ + ☆ Learning Disentangled Representations for Perceptual Point Cloud Quality + Assessment via Mutual Information Minimization + + +
+ No-Reference Point Cloud Quality Assessment (NR-PCQA) aims to objectively +assess the human perceptual quality of point clouds without relying on +pristine-quality point clouds for reference. It is becoming increasingly +significant with the rapid advancement of immersive media applications such as +virtual reality (VR) and augmented reality (AR). However, current NR-PCQA +models attempt to indiscriminately learn point cloud content and distortion +representations within a single network, overlooking their distinct +contributions to quality information. To address this issue, we propose DisPA, +a novel disentangled representation learning framework for NR-PCQA. The +framework trains a dual-branch disentanglement network to minimize mutual +information (MI) between representations of point cloud content and distortion. +Specifically, to fully disentangle representations, the two branches adopt +different philosophies: the content-aware encoder is pretrained by a masked +auto-encoding strategy, which can allow the encoder to capture semantic +information from rendered images of distorted point clouds; the +distortion-aware encoder takes a mini-patch map as input, which forces the +encoder to focus on low-level distortion patterns. Furthermore, we utilize an +MI estimator to estimate the tight upper bound of the actual MI and further +minimize it to achieve explicit representation disentanglement. Extensive +experimental results demonstrate that DisPA outperforms state-of-the-art +methods on multiple PCQA datasets. + +
+
+
+
+
+ + ☆ Isometric Transformations for Image Augmentation in Mueller Matrix + Polarimetry + + +
+ Mueller matrix polarimetry captures essential information about polarized +light interactions with a sample, presenting unique challenges for data +augmentation in deep learning due to its distinct structure. While +augmentations are an effective and affordable way to enhance dataset diversity +and reduce overfitting, standard transformations like rotations and flips do +not preserve the polarization properties in Mueller matrix images. To this end, +we introduce a versatile simulation framework that applies physically +consistent rotations and flips to Mueller matrices, tailored to maintain +polarization fidelity. Our experimental results across multiple datasets reveal +that conventional augmentations can lead to misleading results when applied to +polarimetric data, underscoring the necessity of our physics-based approach. In +our experiments, we first compare our polarization-specific augmentations +against real-world captures to validate their physical consistency. We then +apply these augmentations in a semantic segmentation task, achieving +substantial improvements in model generalization and performance. This study +underscores the necessity of physics-informed data augmentation for +polarimetric imaging in deep learning (DL), paving the way for broader adoption +and more robust applications across diverse research in the field. In +particular, our framework unlocks the potential of DL models for polarimetric +datasets with limited sample sizes. Our code implementation is available at +github.com/hahnec/polar_augment. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ TLDR: Traffic Light Detection using Fourier Domain Adaptation in Hostile + WeatheR + + +
+ The scarcity of comprehensive datasets in the traffic light detection and +recognition domain and the poor performance of state-of-the-art models under +hostile weather conditions present significant challenges. To address these +issues, this paper proposes a novel approach by merging two widely used +datasets, LISA and S2TLD. The merged dataset is further processed to tackle +class imbalance, a common problem in this domain. This merged dataset becomes +our source domain. Synthetic rain and fog are added to the dataset to create +our target domain. We employ Fourier Domain Adaptation (FDA) to create a final +dataset with a minimized domain gap between the two datasets, helping the model +trained on this final dataset adapt to rainy and foggy weather conditions. +Additionally, we explore Semi-Supervised Learning (SSL) techniques to leverage +the available data more effectively. Experimental results demonstrate that +models trained on FDA-augmented images outperform those trained without FDA +across confidence-dependent and independent metrics, like mAP50, mAP50-95, +Precision, and Recall. The best-performing model, YOLOv8, achieved a Precision +increase of 5.1860%, Recall increase of 14.8009%, mAP50 increase of 9.5074%, +and mAP50-95 increase of 19.5035%. On average, percentage increases of 7.6892% +in Precision, 19.9069% in Recall, 15.8506% in mAP50, and 23.8099% in mAP50-95 +were observed across all models, highlighting the effectiveness of FDA in +mitigating the impact of adverse weather conditions on model performance. These +improvements pave the way for real-world applications where reliable +performance in challenging environmental conditions is critical. + +
+
+ comment: Under Review at IEEE Transactions of Artificial Intelligence. 10 + Pages, 7 Figures +
+
+
+
+
+ + ☆ Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse + Tensor-based Transformer + + +
+ The evolution of 3D visualization techniques has fundamentally transformed +how we interact with digital content. At the forefront of this change is point +cloud technology, offering an immersive experience that surpasses traditional +2D representations. However, the massive data size of point clouds presents +significant challenges in data compression. Current methods for lossy point +cloud attribute compression (PCAC) generally focus on reconstructing the +original point clouds with minimal error. However, for point cloud +visualization scenarios, the reconstructed point clouds with distortion still +need to undergo a complex rendering process, which affects the final +user-perceived quality. In this paper, we propose an end-to-end deep learning +framework that seamlessly integrates PCAC with differentiable rendering, +denoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of +rendered multiview images for viewing. In a differentiable manner, the impact +of the rendering process on the reconstructed point clouds is taken into +account. Moreover, we characterize point clouds as sparse tensors and propose a +sparse tensor-based transformer, called SP-Trans. By aligning with the local +density of the point cloud and utilizing an enhanced local attention mechanism, +SP-Trans captures the intricate relationships within the point cloud, further +improving feature analysis and synthesis within the framework. Extensive +experiments demonstrate that the proposed RO-PCAC achieves state-of-the-art +compression performance, compared to existing reconstruction-oriented methods, +including traditional, learning-based, and hybrid methods. + +
+
+
+
+
+ + ☆ Joint multi-dimensional dynamic attention and transformer for general + image restoration + + +
+ Outdoor images often suffer from severe degradation due to rain, haze, and +noise, impairing image quality and challenging high-level tasks. Current image +restoration methods struggle to handle complex degradation while maintaining +efficiency. This paper introduces a novel image restoration architecture that +combines multi-dimensional dynamic attention and self-attention within a U-Net +framework. To leverage the global modeling capabilities of transformers and the +local modeling capabilities of convolutions, we integrate sole CNNs in the +encoder-decoder and sole transformers in the latent layer. Additionally, we +design convolutional kernels with selected multi-dimensional dynamic attention +to capture diverse degraded inputs efficiently. A transformer block with +transposed self-attention further enhances global feature extraction while +maintaining efficiency. Extensive experiments demonstrate that our method +achieves a better balance between performance and computational complexity +across five image restoration tasks: deraining, deblurring, denoising, +dehazing, and enhancement, as well as superior performance for high-level +vision tasks. The source code will be available at +https://github.com/House-yuyu/MDDA-former. + +
+
+
+
+
+ + ☆ INTRABENCH: Interactive Radiological Benchmark + + +
+ Current interactive segmentation approaches, inspired by the success of +META's Segment Anything model, have achieved notable advancements, however, +they come with substantial limitations that hinder their practical application +in real clinical scenarios. These include unrealistic human interaction +requirements, such as slice-by-slice operations for 2D models on 3D data, a +lack of iterative refinement, and insufficient evaluation experiments. These +shortcomings prevent accurate assessment of model performance and lead to +inconsistent outcomes across studies. IntRaBench overcomes these challenges by +offering a comprehensive and reproducible framework for evaluating interactive +segmentation methods in realistic, clinically relevant scenarios. It includes +diverse datasets, target structures, and segmentation models, and provides a +flexible codebase that allows seamless integration of new models and prompting +strategies. Additionally, we introduce advanced techniques to minimize +clinician interaction, ensuring fair comparisons between 2D and 3D models. By +open-sourcing IntRaBench, we invite the research community to integrate their +models and prompting techniques, ensuring continuous and transparent evaluation +of interactive segmentation models in 3D medical imaging. + +
+
+ comment: Undergoing Peer-Review +
+
+
+
+
+ + ☆ Diverse capability and scaling of diffusion and auto-regressive models + when learning abstract rules NeurIPS2024 + + +
+ Humans excel at discovering regular structures from limited samples and +applying inferred rules to novel settings. We investigate whether modern +generative models can similarly learn underlying rules from finite samples and +perform reasoning through conditional sampling. Inspired by Raven's Progressive +Matrices task, we designed GenRAVEN dataset, where each sample consists of +three rows, and one of 40 relational rules governing the object position, +number, or attributes applies to all rows. We trained generative models to +learn the data distribution, where samples are encoded as integer arrays to +focus on rule learning. We compared two generative model families: diffusion +(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their +ability to generate structurally consistent samples and perform panel +completion via unconditional and conditional sampling. We found diffusion +models excel at unconditional generation, producing more novel and consistent +samples from scratch and memorizing less, but performing less well in panel +completion, even with advanced conditional sampling methods. Conversely, +autoregressive models excel at completing missing panels in a rule-consistent +manner but generate less consistent samples unconditionally. We observe diverse +data scaling behaviors: for both model families, rule learning emerges at a +certain dataset size - around 1000s examples per rule. With more training data, +diffusion models improve both their unconditional and conditional generation +capabilities. However, for autoregressive models, while panel completion +improves with more training data, unconditional generation consistency +declines. Our findings highlight complementary capabilities and limitations of +diffusion and autoregressive models in rule learning and reasoning tasks, +suggesting avenues for further research into their mechanisms and potential for +human-like reasoning. + +
+
+ comment: 12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2 + Reasoning At Scale as long paper +
+
+
+
+
+ + ☆ CDXFormer: Boosting Remote Sensing Change Detection with Extended Long + Short-Term Memory + + +
+ In complex scenes and varied conditions, effectively integrating +spatial-temporal context is crucial for accurately identifying changes. +However, current RS-CD methods lack a balanced consideration of performance and +efficiency. CNNs lack global context, Transformers have quadratic computational +complexity, and Mambas are restricted by CUDA acceleration. In this paper, we +propose CDXFormer, with a core component that is a powerful XLSTM-based feature +enhancement layer, integrating the advantages of linear computational +complexity, global context perception, and strong interpret-ability. +Specifically, we introduce a scale-specific Feature Enhancer layer, +incorporating a Cross-Temporal Global Perceptron customized for +semantic-accurate deep features, and a Cross-Temporal Spatial Refiner +customized for detail-rich shallow features. Additionally, we propose a +Cross-Scale Interactive Fusion module to progressively interact global change +representations with spatial responses. Extensive experimental results +demonstrate that CDXFormer achieves state-of-the-art performance across three +benchmark datasets, offering a compelling balance between efficiency and +accuracy. Code is available at https://github.com/xwmaxwma/rschange. + +
+
+
+
+
+ + ☆ NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric + VLN + + +
+ Landmark-based navigation (e.g. go to the wooden desk) and relative +positional navigation (e.g. move 5 meters forward) are distinct navigation +challenges solved very differently in existing robotics navigation methodology. +We present a new dataset, OC-VLN, in order to distinctly evaluate grounding +object-centric natural language navigation instructions in a method for +performing landmark-based navigation. We also propose Natural Language grounded +SLAM (NL-SLAM), a method to ground natural language instruction to robot +observations and poses. We actively perform NL-SLAM in order to follow +object-centric natural language navigation instructions. Our methods leverage +pre-trained vision and language foundation models and require no task-specific +training. We construct two strong baselines from state-of-the-art methods on +related tasks, Object Goal Navigation and Vision Language Navigation, and we +show that our approach, NL-SLAM, outperforms these baselines across all our +metrics of success on OC-VLN. Finally, we successfully demonstrate the +effectiveness of NL-SLAM for performing navigation instruction following in the +real world on a Boston Dynamics Spot robot. + +
+
+
+
+
+ + ☆ Towards Vision Mixture of Experts for Wildlife Monitoring on the Edge + + +
+ The explosion of IoT sensors in industrial, consumer and remote sensing use +cases has come with unprecedented demand for computing infrastructure to +transmit and to analyze petabytes of data. Concurrently, the world is slowly +shifting its focus towards more sustainable computing. For these reasons, there +has been a recent effort to reduce the footprint of related computing +infrastructure, especially by deep learning algorithms, for advanced insight +generation. The `TinyML' community is actively proposing methods to save +communication bandwidth and excessive cloud storage costs while reducing +algorithm inference latency and promoting data privacy. Such proposed +approaches should ideally process multiple types of data, including time +series, audio, satellite images, and video, near the network edge as multiple +data streams have been shown to improve the discriminative ability of learning +algorithms, especially for generating fine grained results. Incidentally, there +has been recent work on data driven conditional computation of subnetworks that +has shown real progress in using a single model to share parameters among very +different types of inputs such as images and text, reducing the computation +requirement of multi-tower multimodal networks. Inspired by such line of work, +we explore similar per patch conditional computation for the first time for +mobile vision transformers (vision only case), that will eventually be used for +single-tower multimodal edge models. We evaluate the model on Cornell Sap +Sucker Woods 60, a fine grained bird species discrimination dataset. Our +initial experiments use $4X$ fewer parameters compared to MobileViTV2-1.0 with +a $1$% accuracy drop on the iNaturalist '21 birds test data provided as part of +the SSW60 dataset. + +
+
+
+
+
+ + ☆ Large-scale Remote Sensing Image Target Recognition and Automatic + Annotation + + +
+ This paper presents a method for object recognition and automatic labeling in +large-area remote sensing images called LRSAA. The method integrates YOLOv11 +and MobileNetV3-SSD object detection algorithms through ensemble learning to +enhance model performance. Furthermore, it employs Poisson disk sampling +segmentation techniques and the EIOU metric to optimize the training and +inference processes of segmented images, followed by the integration of +results. This approach not only reduces the demand for computational resources +but also achieves a good balance between accuracy and speed. The source code +for this project has been made publicly available on +https://github.com/anaerovane/LRSAA. + +
+
+
+
+
+ + ☆ Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and + Re-Identification using Point Clouds + + +
+ Robotic fruit monitoring is a key step toward automated agricultural +production systems. Robots can significantly enhance plant and temporal fruit +monitoring by providing precise, high-throughput assessments that overcome the +limitations of traditional manual methods. Fruit monitoring is a challenging +task due to the significant variation in size, shape, orientation, and +occlusion of fruits. Also, fruits may be harvested or newly grown between +recording sessions. Most methods are 2D image-based and they lack the 3D +structure, depth, and spatial information, which represent key aspects of fruit +monitoring. 3D colored point clouds, instead, can offer this information but +they introduce challenges such as their sparsity and irregularity. In this +paper, we present a novel approach for temporal fruit monitoring that addresses +point clouds collected in a greenhouse over time. Our method segments fruits +using a learning-based instance segmentation approach directly on the point +cloud. Each segmented fruit is processed by a 3D sparse convolutional neural +network to extract descriptors, which are used in an attention-based matching +network to associate fruits with their instances from previous data +collections. Experimental results on a real dataset of strawberries demonstrate +that our approach outperforms other methods for fruits re-identification over +time, allowing for precise temporal fruit monitoring in real and complex +scenarios. + +
+
+ comment: Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Interaction Asymmetry: A General Principle for Learning Composable + Abstractions + + +
+ Learning disentangled representations of concepts and re-composing them in +unseen ways is crucial for generalizing to out-of-domain situations. However, +the underlying properties of concepts that enable such disentanglement and +compositional generalization remain poorly understood. In this work, we propose +the principle of interaction asymmetry which states: "Parts of the same concept +have more complex interactions than parts of different concepts". We formalize +this via block diagonality conditions on the $(n+1)$th order derivatives of the +generator mapping concepts to observed data, where different orders of +"complexity" correspond to different $n$. Using this formalism, we prove that +interaction asymmetry enables both disentanglement and compositional +generalization. Our results unify recent theoretical results for learning +concepts of objects, which we show are recovered as special cases with +$n\!=\!0$ or $1$. We provide results for up to $n\!=\!2$, thus extending these +prior works to more flexible generator functions, and conjecture that the same +proof strategies generalize to larger $n$. Practically, our theory suggests +that, to disentangle concepts, an autoencoder should penalize its latent +capacity and the interactions between concepts during decoding. We propose an +implementation of these criteria using a flexible Transformer-based VAE, with a +novel regularizer on the attention weights of the decoder. On synthetic image +datasets consisting of objects, we provide evidence that this model can achieve +comparable object disentanglement to existing models that use more explicit +object-centric priors. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ☆ Novel View Synthesis with Pixel-Space Diffusion Models + + +
+ Synthesizing a novel view from a single input image is a challenging task. +Traditionally, this task was approached by estimating scene depth, warping, and +inpainting, with machine learning models enabling parts of the pipeline. More +recently, generative models are being increasingly employed in novel view +synthesis (NVS), often encompassing the entire end-to-end system. In this work, +we adapt a modern diffusion model architecture for end-to-end NVS in the pixel +space, substantially outperforming previous state-of-the-art (SOTA) techniques. +We explore different ways to encode geometric information into the network. Our +experiments show that while these methods may enhance performance, their impact +is minor compared to utilizing improved generative models. Moreover, we +introduce a novel NVS training scheme that utilizes single-view datasets, +capitalizing on their relative abundance compared to their multi-view +counterparts. This leads to improved generalization capabilities to scenes with +out-of-domain content. + +
+
+
+
+
+ + ☆ AdaSemiCD: An Adaptive Semi-Supervised Change Detection Method Based on + Pseudo-Label Evaluation + + +
+ Change Detection (CD) is an essential field in remote sensing, with a primary +focus on identifying areas of change in bi-temporal image pairs captured at +varying intervals of the same region by a satellite. The data annotation +process for the CD task is both time-consuming and labor-intensive. To make +better use of the scarce labeled data and abundant unlabeled data, we present +an adaptive dynamic semi-supervised learning method, AdaSemiCD, to improve the +use of pseudo-labels and optimize the training process. Initially, due to the +extreme class imbalance inherent in CD, the model is more inclined to focus on +the background class, and it is easy to confuse the boundary of the target +object. Considering these two points, we develop a measurable evaluation metric +for pseudo-labels that enhances the representation of information entropy by +class rebalancing and amplification of confusing areas to give a larger weight +to prospects change objects. Subsequently, to enhance the reliability of +sample-wise pseudo-labels, we introduce the AdaFusion module, which is capable +of dynamically identifying the most uncertain region and substituting it with +more trustworthy content. Lastly, to ensure better training stability, we +introduce the AdaEMA module, which updates the teacher model using only batches +of trusted samples. Experimental results from LEVIR-CD, WHU-CD, and CDD +datasets validate the efficacy and universality of our proposed adaptive +training framework. + +
+
+
+
+
+ + ☆ SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State + Space Model + + +
+ Speech enhancement plays an essential role in various applications, and the +integration of visual information has been demonstrated to bring substantial +advantages. However, the majority of current research concentrates on the +examination of facial and lip movements, which can be compromised or entirely +inaccessible in scenarios where occlusions occur or when the camera view is +distant. Whereas contextual visual cues from the surrounding environment have +been overlooked: for example, when we see a dog bark, our brain has the innate +ability to discern and filter out the barking noise. To this end, in this +paper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is +the first proposal to use rich contextual information from synchronized video +as auxiliary cues to indicate the type of noise, which eventually improves the +speech enhancement performance. Specifically, we propose the VC-S$^2$E method, +which incorporates the Conformer and Mamba modules for their complementary +strengths. Extensive experiments are conducted on public MUSIC, AVSpeech and +AudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E +over other competitive methods. We will make the source code publicly +available. Project demo page: https://AVSEPage.github.io/ + +
+
+
+
+
+ + ☆ LapGSR: Laplacian Reconstructive Network for Guided Thermal + Super-Resolution + + +
+ In the last few years, the fusion of multi-modal data has been widely studied +for various applications such as robotics, gesture recognition, and autonomous +navigation. Indeed, high-quality visual sensors are expensive, and +consumer-grade sensors produce low-resolution images. Researchers have +developed methods to combine RGB color images with non-visual data, such as +thermal, to overcome this limitation to improve resolution. Fusing multiple +modalities to produce visually appealing, high-resolution images often requires +dense models with millions of parameters and a heavy computational load, which +is commonly attributed to the intricate architecture of the model. + We propose LapGSR, a multimodal, lightweight, generative model incorporating +Laplacian image pyramids for guided thermal super-resolution. This approach +uses a Laplacian Pyramid on RGB color images to extract vital edge information, +which is then used to bypass heavy feature map computation in the higher layers +of the model in tandem with a combined pixel and adversarial loss. LapGSR +preserves the spatial and structural details of the image while also being +efficient and compact. This results in a model with significantly fewer +parameters than other SOTA models while demonstrating excellent results on two +cross-domain datasets viz. ULB17-VT and VGTSR datasets. + +
+
+
+
+
+ + ☆ Constraint Learning for Parametric Point Cloud + + +
+ Parametric point clouds, which are sampled from CAD shapes, have become increasingly +prevalent in industrial manufacturing. However, most existing point cloud +learning methods focus on the geometric features, such as local and global +features or developing efficient convolution operations, overlooking the +important attribute of constraints inherent in CAD shapes, which limits these +methods' ability to fully comprehend CAD shapes. To address this issue, we +analyzed the effect of constraints, and proposed its deep learning-friendly +representation, after that, the Constraint Feature Learning Network (CstNet) is +developed to extract and leverage constraints. Our CstNet includes two stages. +The Stage 1 extracts constraints from B-Rep data or point cloud. The Stage 2 +leverages coordinates and constraints to enhance the comprehension of CAD shapes. +Additionally, we built up the Parametric 20,000 Multi-modal Dataset for the +scarcity of labeled B-Rep datasets. Experiments demonstrate that our CstNet +achieved state-of-the-art performance on both public and proposed CAD shapes +datasets. To the best of our knowledge, CstNet is the first constraint-based +learning method tailored for CAD shapes analysis. + +
+
+
+
+
+ + ☆ Efficient 3D Perception on Multi-Sweep Point Cloud with Gumbel Spatial + Pruning + + +
+ This paper studies point cloud perception within outdoor environments. +Existing methods face limitations in recognizing objects located at a distance +or occluded, due to the sparse nature of outdoor point clouds. In this work, we +observe a significant mitigation of this problem by accumulating multiple +temporally consecutive LiDAR sweeps, resulting in a remarkable improvement in +perception accuracy. However, the computation cost also increases, hindering +previous approaches from utilizing a large number of LiDAR sweeps. To tackle +this challenge, we find that a considerable portion of points in the +accumulated point cloud is redundant, and discarding these points has minimal +impact on perception accuracy. We introduce a simple yet effective Gumbel +Spatial Pruning (GSP) layer that dynamically prunes points based on a learned +end-to-end sampling. The GSP layer is decoupled from other network components +and thus can be seamlessly integrated into existing point cloud network +architectures. Without incurring additional computational overhead, we increase +the number of LiDAR sweeps from 10, a common practice, to as many as 40. +Consequently, there is a significant enhancement in perception performance. For +instance, in nuScenes 3D object detection and BEV map segmentation tasks, our +pruning strategy improves the vanilla TransL baseline and other baseline +methods. + +
+
+
+
+
+ + ☆ 3D Focusing-and-Matching Network for Multi-Instance Point Cloud + Registration NeurIPS 2024 + + +
+ Multi-instance point cloud registration aims to estimate the pose of all +instances of a model point cloud in the whole scene. Existing methods all adopt +the strategy of first obtaining the global correspondence and then clustering +to obtain the pose of each instance. However, due to the cluttered and occluded +objects in the scene, it is difficult to obtain an accurate correspondence +between the model point cloud and all instances in the scene. To this end, we +propose a simple yet powerful 3D focusing-and-matching network for +multi-instance point cloud registration by learning the multiple pair-wise +point cloud registration. Specifically, we first present a 3D multi-object +focusing module to locate the center of each object and generate object +proposals. By using self-attention and cross-attention to associate the model +point cloud with structurally similar objects, we can locate potential matching +instances by regressing object centers. Then, we propose a 3D dual masking +instance matching module to estimate the pose between the model point cloud and +each object proposal. It uses instance masks and overlap masks to +accurately predict the pair-wise correspondence. Extensive experiments on two +public benchmarks, Scan2CAD and ROBI, show that our method achieves a new +state-of-the-art performance on the multi-instance point cloud registration +task. Code is available at https://github.com/zlynpu/3DFMNet. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ No-Reference Point Cloud Quality Assessment via Graph Convolutional + Network + + +
+ Three-dimensional (3D) point cloud, as an emerging visual media format, is +increasingly favored by consumers as it can provide more realistic visual +information than two-dimensional (2D) data. Similar to 2D plane images and +videos, point clouds inevitably suffer from quality degradation and information +loss through multimedia communication systems. Therefore, automatic point cloud +quality assessment (PCQA) is of critical importance. In this work, we propose a +novel no-reference PCQA method by using a graph convolutional network (GCN) to +characterize the mutual dependencies of multi-view 2D projected image contents. +The proposed GCN-based PCQA (GC-PCQA) method contains three modules, i.e., +multi-view projection, graph construction, and GCN-based quality prediction. +First, multi-view projection is performed on the test point cloud to obtain a +set of horizontally and vertically projected images. Then, a +perception-consistent graph is constructed based on the spatial relations among +different projected images. Finally, reasoning on the constructed graph is +performed by GCN to characterize the mutual dependencies and interactions +between different projected images, and aggregate feature information of +multi-view projected images for final quality prediction. Experimental results +on two publicly available benchmark databases show that our proposed GC-PCQA +can achieve superior performance than state-of-the-art quality assessment +metrics. The code will be available at: https://github.com/chenwuwq/GC-PCQA. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ ALOcc: Adaptive Lifting-based 3D Semantic Occupancy and Cost + Volume-based Flow Prediction + + +
+ Vision-based semantic occupancy and flow prediction plays a crucial role in +providing spatiotemporal cues for real-world tasks, such as autonomous driving. +Existing methods prioritize higher accuracy to cater to the demands of these +tasks. In this work, we strive to improve performance by introducing a series +of targeted improvements for 3D semantic occupancy prediction and flow +estimation. First, we introduce an occlusion-aware adaptive lifting mechanism +with a depth denoising technique to improve the robustness of 2D-to-3D feature +transformation and reduce the reliance on depth priors. Second, we strengthen +the semantic consistency between 3D features and their original 2D modalities +by utilizing shared semantic prototypes to jointly constrain both 2D and 3D +features. This is complemented by confidence- and category-based sampling +strategies to tackle long-tail challenges in 3D space. To alleviate the feature +encoding burden in the joint prediction of semantics and flow, we propose a BEV +cost volume-based prediction method that links flow and semantic features +through a cost volume and employs a classification-regression supervision +scheme to address the varying flow scales in dynamic scenes. Our purely +convolutional architecture framework, named ALOcc, achieves an optimal tradeoff +between speed and accuracy achieving state-of-the-art results on multiple +benchmarks. On Occ3D and training without the camera visible mask, our ALOcc +achieves an absolute gain of 2.5\% in terms of RayIoU while operating at a +comparable speed compared to the state-of-the-art, using the same input size +(256$\times$704) and ResNet-50 backbone. Our method also achieves 2nd place in +the CVPR24 Occupancy and Flow Prediction Competition. + +
+
+
+
+
+ + ☆ EMPERROR: A Flexible Generative Perception Error Model for Probing + Self-Driving Planners + + +
+ To handle the complexities of real-world traffic, learning planners for +self-driving from data is a promising direction. While recent approaches have +shown great progress, they typically assume a setting in which the ground-truth +world state is available as input. However, when deployed, planning needs to be +robust to the long-tail of errors incurred by a noisy perception system, which +is often neglected in evaluation. To address this, previous work has proposed +drawing adversarial samples from a perception error model (PEM) mimicking the +noise characteristics of a target object detector. However, these methods use +simple PEMs that fail to accurately capture all failure modes of detection. In +this paper, we present EMPERROR, a novel transformer-based generative PEM, +apply it to stress-test an imitation learning (IL)-based planner and show that +it imitates modern detectors more faithfully than previous work. Furthermore, +it is able to produce realistic noisy inputs that increase the planner's +collision rate by up to 85%, demonstrating its utility as a valuable tool for a +more complete evaluation of self-driving planners. + +
+
+ comment: Project page: https://lasnik.github.io/emperror/ +
+
+
+
+
+ + ☆ Emotion Classification of Children Expressions + + +
+ This paper proposes a process for a classification model for the facial +expressions. The proposed process would aid in specific categorisation of +children's emotions from 2 emotions namely 'Happy' and 'Sad'. Since the +existing emotion recognition systems algorithms primarily train on adult faces, +the model developed is achieved by using advanced concepts of models with +Squeeze-and-Excitation blocks, Convolutional Block Attention modules, and robust +data augmentation. Stable Diffusion image synthesis was used for expanding and +diversifying the data set generating realistic and various training samples. +The model designed using Batch Normalisation, Dropout, and SE Attention +mechanisms for the classification of children's emotions achieved an accuracy +rate of 89\% due to these methods improving the precision of emotion +recognition in children. The relative importance of this issue is raised in +this study with an emphasis on the call for a more specific model in emotion +detection systems for the young generation with specific direction on how the +young people can be assisted to manage emotions while online. + +
+
+
+
+
+ + ☆ Enhancing Ultra High Resolution Remote Sensing Imagery Analysis with + ImageRAG + + +
+ Ultra High Resolution (UHR) remote sensing imagery (RSI) (e.g. 100,000 +$\times$ 100,000 pixels or more) poses a significant challenge for current +Remote Sensing Multimodal Large Language Models (RSMLLMs). If we choose to resize +the UHR image to standard input image size, the extensive spatial and +contextual information that UHR images contain will be neglected. Otherwise, +the original size of these images often exceeds the token limits of standard +RSMLLMs, making it difficult to process the entire image and capture long-range +dependencies to answer the query based on the abundant visual context. In this +paper, we introduce ImageRAG for RS, a training-free framework to address the +complexities of analyzing UHR remote sensing imagery. By transforming UHR +remote sensing image analysis task to image's long context selection task, we +design an innovative image contextual retrieval mechanism based on the +Retrieval-Augmented Generation (RAG) technique, denoted as ImageRAG. ImageRAG's +core innovation lies in its ability to selectively retrieve and focus on the +most relevant portions of the UHR image as visual contexts that pertain to a +given query. Fast path and slow path are proposed in this framework to handle +this task efficiently and effectively. ImageRAG allows RSMLLMs to manage +extensive context and spatial information from UHR RSI, ensuring the analysis +is both accurate and efficient. + +
+
+
+
+
+ + ☆ Fast Disentangled Slim Tensor Learning for Multi-view Clustering + + +
+ Tensor-based multi-view clustering has recently received significant +attention due to its exceptional ability to explore cross-view high-order +correlations. However, most existing methods still encounter some limitations. +(1) Most of them explore the correlations among different affinity matrices, +making them unscalable to large-scale data. (2) Although some methods address +it by introducing bipartite graphs, they may result in sub-optimal solutions +caused by an unstable anchor selection process. (3) They generally ignore the +negative impact of latent semantic-unrelated information in each view. To +tackle these issues, we propose a new approach termed fast Disentangled Slim +Tensor Learning (DSTL) for multi-view clustering. Instead of focusing on the +multi-view graph structures, DSTL directly explores the high-order correlations +among multi-view latent semantic representations based on matrix factorization. +To alleviate the negative influence of feature redundancy, inspired by robust +PCA, DSTL disentangles the latent low-dimensional representation into a +semantic-unrelated part and a semantic-related part for each view. +Subsequently, two slim tensors are constructed with tensor-based +regularization. To further enhance the quality of feature disentanglement, the +semantic-related representations are aligned across views through a consensus +alignment indicator. Our proposed model is computationally efficient and can be +solved effectively. Extensive experiments demonstrate the superiority and +efficiency of DSTL over state-of-the-art approaches. The code of DSTL is +available at https://github.com/dengxu-nju/DSTL. + +
+
+ comment: 13 pages,6 figures, will be published to IEEE TMM +
+
+
+
+
+ + ☆ AI enhanced diagnosis of Peyronies disease a novel approach using + Computer Vision + + +
+ This study presents an innovative AI-driven tool for diagnosing Peyronie's +Disease (PD), a condition that affects between 0.3% and 13.1% of men worldwide. +Our method uses key point detection on both images and videos to measure penile +curvature angles, utilizing advanced computer vision techniques. This tool has +demonstrated high accuracy in identifying anatomical landmarks, validated +against conventional goniometer measurements. Traditional PD diagnosis often +involves subjective and invasive methods, which can lead to patient discomfort +and inaccuracies. Our approach offers a precise, reliable, and non-invasive +diagnostic tool to address these drawbacks. The model distinguishes between PD +and normal anatomical changes with a sensitivity of 96.7% and a specificity of +100%. This advancement represents a significant improvement in urological +diagnostics, greatly enhancing the efficacy and convenience of PD assessment +for healthcare providers and patients. + +
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Evaluating the Generation of Spatial Relations in Text and Image + Generative Models + + +
+ Understanding spatial relations is a crucial cognitive ability for both +humans and AI. While current research has predominantly focused on the +benchmarking of text-to-image (T2I) models, we propose a more comprehensive +evaluation that includes \textit{both} T2I and Large Language Models (LLMs). As +spatial relations are naturally understood in a visuo-spatial manner, we +develop an approach to convert LLM outputs into an image, thereby allowing us +to evaluate both T2I models and LLMs \textit{visually}. We examined the spatial +relation understanding of 8 prominent generative models (3 T2I models and 5 +LLMs) on a set of 10 common prepositions, as well as assess the feasibility of +automatic evaluation methods. Surprisingly, we found that T2I models only +achieve subpar performance despite their impressive general image-generation +abilities. Even more surprisingly, our results show that LLMs are significantly +more accurate than T2I models in generating spatial relations, despite being +primarily trained on textual data. We examined reasons for model failures and +highlight gaps that can be filled to enable more spatially faithful +generations. + +
+
+
+
+
+ + ☆ HMIL: Hierarchical Multi-Instance Learning for Fine-Grained Whole Slide + Image Classification + + +
+ Fine-grained classification of whole slide images (WSIs) is essential in +precision oncology, enabling precise cancer diagnosis and personalized +treatment strategies. The core of this task involves distinguishing subtle +morphological variations within the same broad category of gigapixel-resolution +images, which presents a significant challenge. While the multi-instance +learning (MIL) paradigm alleviates the computational burden of WSIs, existing +MIL methods often overlook hierarchical label correlations, treating +fine-grained classification as a flat multi-class classification task. To +overcome these limitations, we introduce a novel hierarchical multi-instance +learning (HMIL) framework. By facilitating on the hierarchical alignment of +inherent relationships between different hierarchy of labels at instance and +bag level, our approach provides a more structured and informative learning +process. Specifically, HMIL incorporates a class-wise attention mechanism that +aligns hierarchical information at both the instance and bag levels. +Furthermore, we introduce supervised contrastive learning to enhance the +discriminative capability for fine-grained classification and a +curriculum-based dynamic weighting module to adaptively balance the +hierarchical feature during training. Extensive experiments on our large-scale +cytology cervical cancer (CCC) dataset and two public histology datasets, BRACS +and PANDA, demonstrate the state-of-the-art class-wise and overall performance +of our HMIL framework. Our source code is available at +https://github.com/ChengJin-git/HMIL. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Understanding Audiovisual Deepfake Detection: Techniques, Challenges, + Human Factors and Perceptual Insights + + +
+ Deep Learning has been successfully applied in diverse fields, and its impact +on deepfake detection is no exception. Deepfakes are fake yet realistic +synthetic content that can be used deceitfully for political impersonation, +phishing, slandering, or spreading misinformation. Despite extensive research +on unimodal deepfake detection, identifying complex deepfakes through joint +analysis of audio and visual streams remains relatively unexplored. To fill +this gap, this survey first provides an overview of audiovisual deepfake +generation techniques, applications, and their consequences, and then provides +a comprehensive review of state-of-the-art methods that combine audio and +visual modalities to enhance detection accuracy, summarizing and critically +analyzing their strengths and limitations. Furthermore, we discuss existing +open source datasets for a deeper understanding, which can contribute to the +research community and provide necessary information to beginners who want to +analyze deep learning-based audiovisual methods for video forensics. By +bridging the gap between unimodal and multimodal approaches, this paper aims to +improve the effectiveness of deepfake detection strategies and guide future +research in cybersecurity and media integrity. + +
+
+
+
+
+ + ☆ Maritime Search and Rescue Missions with Aerial Images: A Survey + + +
+ The speed of response by search and rescue teams at sea is of vital +importance, as survival may depend on it. Recent technological advancements +have led to the development of more efficient systems for locating individuals +involved in a maritime incident, such as the use of Unmanned Aerial Vehicles +(UAVs) equipped with cameras and other integrated sensors. Over the past +decade, several researchers have contributed to the development of automatic +systems capable of detecting people using aerial images, particularly by +leveraging the advantages of deep learning. In this article, we provide a +comprehensive review of the existing literature on this topic. We analyze the +methods proposed to date, including both traditional techniques and more +advanced approaches based on machine learning and neural networks. +Additionally, we take into account the use of synthetic data to cover a wider +range of scenarios without the need to deploy a team to collect data, which is +one of the major obstacles for these systems. Overall, this paper situates the +reader in the field of detecting people at sea using aerial images by quickly +identifying the most suitable methodology for each scenario, as well as +providing an in-depth discussion and direction for future trends. + +
+
+
+
+
+ + ☆ xCG: Explainable Cell Graphs for Survival Prediction in Non-Small Cell + Lung Cancer ML4H + + +
+ Understanding how deep learning models predict oncology patient risk can +provide critical insights into disease progression, support clinical +decision-making, and pave the way for trustworthy and data-driven precision +medicine. Building on recent advances in the spatial modeling of the tumor +microenvironment using graph neural networks, we present an explainable cell +graph (xCG) approach for survival prediction. We validate our model on a public +cohort of imaging mass cytometry (IMC) data for 416 cases of lung +adenocarcinoma. We explain survival predictions in terms of known phenotypes on +the cell level by computing risk attributions over cell graphs, for which we +propose an efficient grid-based layer-wise relevance propagation (LRP) method. +Our ablation studies highlight the importance of incorporating the cancer stage +and model ensembling to improve the quality of risk estimates. Our xCG method, +together with the IMC data, is made publicly available to support further +research. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 11 pages +
+
+
+
+
+ + ☆ Breaking the Low-Rank Dilemma of Linear Attention + + +
+ The Softmax attention mechanism in Transformer models is notoriously +computationally expensive, particularly due to its quadratic complexity, posing +significant challenges in vision applications. In contrast, linear attention +provides a far more efficient solution by reducing the complexity to linear +levels. However, compared to Softmax attention, linear attention often +experiences significant performance degradation. Our experiments indicate that +this performance drop is due to the low-rank nature of linear attention's +feature map, which hinders its ability to adequately model complex spatial +information. In this paper, to break the low-rank dilemma of linear attention, +we conduct rank analysis from two perspectives: the KV buffer and the output +features. Consequently, we introduce Rank-Augmented Linear Attention (RALA), +which rivals the performance of Softmax attention while maintaining linear +complexity and high efficiency. Based on RALA, we construct the Rank-Augmented +Vision Linear Transformer (RAVLT). Extensive experiments demonstrate that RAVLT +achieves excellent performance across various vision tasks. Specifically, +without using any additional labels, data, or supervision during training, +RAVLT achieves an 84.4% Top-1 accuracy on ImageNet-1k with only 26M parameters +and 4.6G FLOPs. This result significantly surpasses previous linear attention +mechanisms, fully illustrating the potential of RALA. Code will be available at +https://github.com/qhfan/RALA. + +
+
+
+
+
+ + ☆ Leveraging Previous Steps: A Training-free Fast Solver for Flow + Diffusion + + +
+ Flow diffusion models (FDMs) have recently shown potential in generation +tasks due to the high generation quality. However, the current ordinary +differential equation (ODE) solver for FDMs, e.g., the Euler solver, still +suffers from slow generation since ODE solvers need a large number of function +evaluations (NFE) to keep high-quality generation. In this paper, we propose a +novel training-free flow-solver to reduce NFE while maintaining high-quality +generation. The key insight for the flow-solver is to leverage the previous +steps to reduce the NFE, where a cache is created to reuse these results from +the previous steps. Specifically, the Taylor expansion is first used to +approximate the ODE. To calculate the high-order derivatives of Taylor +expansion, the flow-solver proposes to use the previous steps and a polynomial +interpolation to approximate it, where the number of orders we could +approximate equals the number of previous steps we cached. We also prove that +the flow-solver has a smaller approximation error and faster generation +speed. Experimental results on the CIFAR-10, CelebA-HQ, LSUN-Bedroom, +LSUN-Church, ImageNet, and real text-to-image generation prove the efficiency +of the flow-solver. Specifically, the flow-solver improves the FID-30K from +13.79 to 6.75, from 46.64 to 19.49 with $\text{NFE}=10$ on CIFAR-10 and +LSUN-Church, respectively. + +
+
+
+
+
+ + ☆ Unraveling the Connections between Flow Matching and Diffusion + Probabilistic Models in Training-free Conditional Generation + + +
+ Training-free conditional generation aims to leverage the unconditional +diffusion models to implement the conditional generation, where flow-matching +(FM) and diffusion probabilistic models (DPMs) are two mature unconditional +diffusion models that achieve high-quality generation. Two questions were asked +in this paper: What are the underlying connections between FM and DPMs in +training-free conditional generation? Can we leverage DPMs to improve the +training-free conditional generation for FM? We first show that a probabilistic +diffusion path can be associated with the FM and DPMs. Then, we reformulate the +ordinary differential equation (ODE) of FM based on the score function of DPMs, +and thus, the conditions in FM can be incorporated as those in DPMs. Finally, +we propose two posterior sampling methods to estimate the conditional term and +achieve a training-free conditional generation of FM. Experimental results show +that our proposed method could be implemented for various conditional +generation tasks. Our method can generate higher-quality results than the +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Mix from Failure: Confusion-Pairing Mixup for Long-Tailed Recognition + + +
+ Long-tailed image recognition is a computer vision problem considering a +real-world class distribution rather than an artificial uniform. Existing +methods typically detour the problem by i) adjusting a loss function, ii) +decoupling classifier learning, or iii) proposing a new multi-head architecture +called experts. In this paper, we tackle the problem from a different +perspective to augment a training dataset to enhance the sample diversity of +minority classes. Specifically, our method, namely Confusion-Pairing Mixup +(CP-Mix), estimates the confusion distribution of the model and handles the +data deficiency problem by augmenting samples from confusion pairs in +real-time. In this way, CP-Mix trains the model to mitigate its weakness and +distinguish a pair of classes it frequently misclassifies. In addition, CP-Mix +utilizes a novel mixup formulation to handle the bias in decision boundaries +that originated from the imbalanced dataset. Extensive experiments demonstrate +that CP-Mix outperforms existing methods for long-tailed image recognition and +successfully relieves the confusion of the classifier. + +
+
+
+
+
+ + ☆ Artificial Intelligence for Biomedical Video Generation + + +
+ As a prominent subfield of Artificial Intelligence Generated Content (AIGC), +video generation has achieved notable advancements in recent years. The +introduction of Sora-alike models represents a pivotal breakthrough in video +generation technologies, significantly enhancing the quality of synthesized +videos. Particularly in the realm of biomedicine, video generation technology +has shown immense potential such as medical concept explanation, disease +simulation, and biomedical data augmentation. In this article, we thoroughly +examine the latest developments in video generation models and explore their +applications, challenges, and future opportunities in the biomedical sector. We +have conducted an extensive review and compiled a comprehensive list of +datasets from various sources to facilitate the development and evaluation of +video generative models in biomedicine. Given the rapid progress in this field, +we have also created a github repository to regularly update the advances of +biomedical video generation at: +https://github.com/Lee728243228/Biomedical-Video-Generation + +
+
+
+
+
+ + ☆ Quantum Information-Empowered Graph Neural Network for Hyperspectral + Change Detection + + +
+ Change detection (CD) is a critical remote sensing technique for identifying +changes in the Earth's surface over time. The outstanding substance +identifiability of hyperspectral images (HSIs) has significantly enhanced the +detection accuracy, making hyperspectral change detection (HCD) an essential +technology. The detection accuracy can be further upgraded by leveraging the +graph structure of HSIs, motivating us to adopt the graph neural networks +(GNNs) in solving HCD. For the first time, this work introduces quantum deep +network (QUEEN) into HCD. Unlike GNN and CNN, both extracting the +affine-computing features, QUEEN provides fundamentally different +unitary-computing features. We demonstrate that through the unitary feature +extraction procedure, QUEEN provides radically new information for deciding +whether there is a change or not. Hierarchically, a graph feature learning +(GFL) module exploits the graph structure of the bitemporal HSIs at the +superpixel level, while a quantum feature learning (QFL) module learns the +quantum features at the pixel level, as a complementary to GFL by preserving +pixel-level detailed spatial information not retained in the superpixels. In +the final classification stage, a quantum classifier is designed to cooperate +with a traditional fully connected classifier. The superior HCD performance of +the proposed QUEEN-empowered GNN (i.e., QUEEN-G) will be experimentally +demonstrated on real hyperspectral datasets. + +
+
+ comment: This work has been accepted by IEEE Transactions on Geoscience and + Remote Sensing (TGRS) +
+
+
+
+
+ + ☆ SegQC: a segmentation network-based framework for multi-metric + segmentation quality control and segmentation error detection in volumetric + medical images + + +
+ Quality control of structures segmentation in volumetric medical images is +important for identifying segmentation errors in clinical practice and for +facilitating model development. This paper introduces SegQC, a novel framework +for segmentation quality estimation and segmentation error detection. SegQC +computes an estimate measure of the quality of a segmentation in volumetric +scans and in their individual slices and identifies possible segmentation error +regions within a slice. The key components include: 1. SegQC-Net, a deep +network that inputs a scan and its segmentation mask and outputs segmentation +error probabilities for each voxel in the scan; 2. three new segmentation +quality metrics, two overlap metrics and a structure size metric, computed from +the segmentation error probabilities; 3. a new method for detecting possible +segmentation errors in scan slices computed from the segmentation error +probabilities. We introduce a new evaluation scheme to measure segmentation +error discrepancies based on an expert radiologist corrections of automatically +produced segmentations that yields smaller observer variability and is closer +to actual segmentation errors. We demonstrate SegQC on three fetal structures +in 198 fetal MRI scans: fetal brain, fetal body and the placenta. To assess the +benefits of SegQC, we compare it to the unsupervised Test Time Augmentation +(TTA)-based quality estimation. Our studies indicate that SegQC outperforms +TTA-based quality estimation in terms of Pearson correlation and MAE for fetal +body and fetal brain structures segmentation. Our segmentation error detection +method achieved recall and precision rates of 0.77 and 0.48 for fetal body, and +0.74 and 0.55 for fetal brain segmentation error detection respectively. SegQC +enhances segmentation metrics estimation for whole scans and individual slices, +as well as provides error regions detection. + +
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ☆ Grounded Video Caption Generation + + +
+ We propose a new task, dataset and model for grounded video caption +generation. This task unifies captioning and object grounding in video, where +the objects in the caption are grounded in the video via temporally consistent +bounding boxes. We introduce the following contributions. First, we present a +task definition and a manually annotated test dataset for this task, referred +to as GROunded Video Caption Generation (GROC). Second, we introduce a +large-scale automatic annotation method leveraging an existing model for +grounded still image captioning together with an LLM for summarising +frame-level captions into temporally consistent captions in video. Furthermore, +we prompt the LLM to track by language -- classifying noun phrases from the +frame-level captions into noun phrases of the video-level generated caption. We +apply this approach to videos from the HowTo100M dataset, which results in a +new large-scale training dataset, called HowToGround, with automatically +annotated captions and spatio-temporally consistent bounding boxes with +coherent natural language labels. Third, we introduce a new grounded video +caption generation model, called VideoGround, and train the model on the new +automatically annotated HowToGround dataset. Finally, results of our +VideoGround model set the state of the art for the new task of grounded video +caption generation. We perform extensive ablations and demonstrate the +importance of key technical contributions of our model. + +
+
+
+
+
+ + ☆ Semantic segmentation on multi-resolution optical and microwave data + using deep learning + + +
+ Presently, deep learning and convolutional neural networks (CNNs) are widely +used in the fields of image processing, image classification, object +identification and many more. In this work, we implemented convolutional neural +network based modified U-Net model and VGG-UNet model to automatically identify +objects from satellite imagery captured using high resolution Indian remote +sensing satellites and then to pixel wise classify satellite data into various +classes. In this paper, Cartosat 2S (~1m spatial resolution) datasets were used +and deep learning models were implemented to detect building shapes and ships +from the test datasets with an accuracy of more than 95%. In another +experiment, microwave data (varied resolution) from RISAT-1 was taken as an +input and ships and trees were detected with an accuracy of >96% from these +datasets. For the classification of images into multiple-classes, deep learning +model was trained on multispectral Cartosat images. Model generated results +were then tested using ground truth. Multi-label classification results were +obtained with an accuracy (IoU) of better than 95%. Total six different +problems were attempted using deep learning models and IoU accuracies in the +range of 85% to 98% were achieved depending on the degree of complexity. + +
+
+
+
+
+ + ☆ Projecting Gaussian Ellipsoids While Avoiding Affine Projection + Approximation + + +
+ Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its +real-time rendering speed and state-of-the-art rendering quality. However, +during the rendering process, the use of the Jacobian of the affine +approximation of the projection transformation leads to inevitable errors, +resulting in blurriness, artifacts and a lack of scene consistency in the final +rendered images. To address this issue, we introduce an ellipsoid-based +projection method to calculate the projection of Gaussian ellipsoid on the +image plane, which is the primitive of 3D Gaussian Splatting. As our proposed +ellipsoid-based projection method cannot handle Gaussian ellipsoids with camera +origins inside them or parts lying below $z=0$ plane in the camera space, we +designed a pre-filtering strategy. Experiments over multiple widely adopted +benchmark datasets show that using our ellipsoid-based projection method can +enhance the rendering quality of 3D Gaussian Splatting and its extensions. + +
+
+
+
+
+ + ☆ Atmospheric turbulence restoration by diffeomorphic image registration + and blind deconvolution + + +
+ A novel approach is presented in this paper to improve images which are +altered by atmospheric turbulence. Two new algorithms are presented based on +two combinations of a blind deconvolution block, an elastic registration block +and a temporal filter block. The algorithms are tested on real images acquired +in the desert in New Mexico by the NATO RTG40 group. + +
+
+
+
+
+ + ☆ IR image databases generation under target intrinsic thermal variability + constraints + + +
+ This paper deals with the problem of infrared image database generation for +ATR assessment purposes. Huge databases are required to have quantitative and +objective performance evaluations. We propose a method which superimposes +targets and occultants on background under image quality metrics constraints to +generate realistic images. We also propose a method to generate target +signatures with intrinsic thermal variability based on 3D models plated with +real infrared textures. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2411.06695 +
+
+
+
+
+ + ☆ Génération de bases de données images IR sous contraintes avec + variabilité thermique intrinsèque des cibles + + +
+ In this communication, we propose a method which makes it possible to simulate images +of targets in infrared imagery by superimposition of vehicle signatures in +background, possibly with occultants. We develop a principle which allows +us to generate different thermal configurations of target signatures. This +method enables us to easily generate huge datasets for ATR algorithms +performance evaluation. + +
+
+ comment: in French language, GRETSI Symposium on Signal and Image Processing, + Dijon, France, September 2009 +
+
+
+
+
+ + ☆ Uncertainty-Aware Test-Time Adaptation for Inverse Consistent + Diffeomorphic Lung Image Registration + + +
+ Diffeomorphic deformable image registration ensures smooth invertible +transformations across inspiratory and expiratory chest CT scans. Yet, in +practice, deep learning-based diffeomorphic methods struggle to capture large +deformations between inspiratory and expiratory volumes, and therefore lack +inverse consistency. Existing methods also fail to account for model +uncertainty, which can be useful for improving performance. We propose an +uncertainty-aware test-time adaptation framework for inverse consistent +diffeomorphic lung registration. Our method uses Monte Carlo (MC) dropout to +estimate spatial uncertainty that is used to improve model performance. We +train and evaluate our method for inspiratory-to-expiratory CT registration on +a large cohort of 675 subjects from the COPDGene study, achieving a higher Dice +similarity coefficient (DSC) between the lung boundaries (0.966) compared to +both VoxelMorph (0.953) and TransMorph (0.953). Our method demonstrates +consistent improvements in the inverse registration direction as well with an +overall DSC of 0.966, higher than VoxelMorph (0.958) and TransMorph (0.956). +Paired t-tests indicate statistically significant improvements. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Multi-task Feature Enhancement Network for No-Reference Image Quality + Assessment + + +
+ Due to the scarcity of labeled samples in Image Quality Assessment (IQA) +datasets, numerous recent studies have proposed multi-task based strategies, +which explore feature information from other tasks or domains to boost the IQA +task. Nevertheless, multi-task strategies based No-Reference Image Quality +Assessment (NR-IQA) methods encounter several challenges. First, existing +methods have not explicitly exploited texture details, which significantly +influence the image quality. Second, multi-task methods conventionally +integrate features through simple operations such as addition or concatenation, +thereby diminishing the network's capacity to accurately represent distorted +features. To tackle these challenges, we introduce a novel multi-task NR-IQA +framework. Our framework consists of three key components: a high-frequency +extraction network, a quality estimation network, and a distortion-aware +network. The high-frequency extraction network is designed to guide the model's +focus towards high-frequency information, which is highly related to the +texture details. Meanwhile, the distortion-aware network extracts +distortion-related features to distinguish different distortion types. To +effectively integrate features from different tasks, a feature fusion module is +developed based on an attention mechanism. Empirical results from five standard +IQA databases confirm that our method not only achieves high performance but +also exhibits robust generalization ability. + +
+
+
+
+
+ + ☆ GaussianCut: Interactive segmentation via graph cut for 3D Gaussian + Splatting + + +
+ We introduce GaussianCut, a new method for interactive multiview segmentation +of scenes represented as 3D Gaussians. Our approach allows for selecting the +objects to be segmented by interacting with a single view. It accepts intuitive +user input, such as point clicks, coarse scribbles, or text. Using 3D Gaussian +Splatting (3DGS) as the underlying scene representation simplifies the +extraction of objects of interest which are considered to be a subset of the +scene's Gaussians. Our key idea is to represent the scene as a graph and use +the graph-cut algorithm to minimize an energy function to effectively partition +the Gaussians into foreground and background. To achieve this, we construct a +graph based on scene Gaussians and devise a segmentation-aligned energy +function on the graph to combine user inputs with scene properties. To obtain +an initial coarse segmentation, we leverage 2D image/video segmentation models +and further refine these coarse estimates using our graph construction. Our +empirical evaluations show the adaptability of GaussianCut across a diverse set +of scenes. GaussianCut achieves competitive performance with state-of-the-art +approaches for 3D segmentation without requiring any additional +segmentation-aware training. + +
+
+
+
+
+ + ☆ Contrastive Language Prompting to Ease False Positives in Medical + Anomaly Detection + + +
+ A pre-trained visual-language model, contrastive language-image pre-training +(CLIP), successfully accomplishes various downstream tasks with text prompts, +such as finding images or localizing regions within the image. Despite CLIP's +strong multi-modal data capabilities, it remains limited in specialized +environments, such as medical applications. For this purpose, many CLIP +variants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives +related to normal regions persist. Thus, we aim to present a simple yet +important goal of reducing false positives in medical anomaly detection. We +introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both +positive and negative text prompts. This straightforward approach identifies +potential lesion regions by visual attention to the positive prompts in the +given image. To reduce false positives, we attenuate attention on normal +regions using negative prompts. Extensive experiments with the BMAD dataset, +including six biomedical benchmarks, demonstrate that CLAP method enhances +anomaly detection performance. Our future plans include developing an automated +fine prompting method for more practical usage. + +
+
+ comment: 4 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Depthwise Separable Convolutions with Deep Residual Convolutions + + +
+ The recent advancement of edge computing enables researchers to optimize +various deep learning architectures to employ them in edge devices. In this +study, we aim to optimize Xception architecture which is one of the most +popular deep learning algorithms for computer vision applications. The Xception +architecture is highly effective for object detection tasks. However, it comes +with a significant computational cost. The computational complexity of Xception +sometimes hinders its deployment on resource-constrained edge devices. To +address this, we propose an optimized Xception architecture tailored for edge +devices, aiming for lightweight and efficient deployment. We incorporate the +depthwise separable convolutions with deep residual convolutions of the +Xception architecture to develop a small and efficient model for edge devices. +The resultant architecture reduces parameters, memory usage, and computational +load. The proposed architecture is evaluated on the CIFAR 10 object detection +dataset. The evaluation result of our experiment also shows the proposed +architecture is smaller in parameter size and requires less training time while +outperforming Xception architecture performance. + +
+
+ comment: Course Project Report +
+
+
+
+
+ + ☆ HiCoM: Hierarchical Coherent Motion for Streamable Dynamic Scene with 3D + Gaussian Splatting NeurIPS 2024 + + +
+ The online reconstruction of dynamic scenes from multi-view streaming videos +faces significant challenges in training, rendering and storage efficiency. +Harnessing superior learning speed and real-time rendering capabilities, 3D +Gaussian Splatting (3DGS) has recently demonstrated considerable potential in +this field. However, 3DGS can be inefficient in terms of storage and prone to +overfitting by excessively growing Gaussians, particularly with limited views. +This paper proposes an efficient framework, dubbed HiCoM, with three key +components. First, we construct a compact and robust initial 3DGS +representation using a perturbation smoothing strategy. Next, we introduce a +Hierarchical Coherent Motion mechanism that leverages the inherent non-uniform +distribution and local consistency of 3D Gaussians to swiftly and accurately +learn motions across frames. Finally, we continually refine the 3DGS with +additional Gaussians, which are later merged into the initial 3DGS to maintain +consistency with the evolving scene. To preserve a compact representation, an +equivalent number of low-opacity Gaussians that minimally impact the +representation are removed before processing subsequent frames. Extensive +experiments conducted on two widely used datasets show that our framework +improves learning efficiency of the state-of-the-art methods by about $20\%$ +and reduces the data storage by $85\%$, achieving competitive free-viewpoint +video synthesis quality but with higher robustness and stability. Moreover, by +parallel learning multiple frames simultaneously, our HiCoM decreases the +average training wall time to $<2$ seconds per frame with negligible +performance degradation, substantially boosting real-world applicability and +responsiveness. + +
+
+ comment: Accepted to NeurIPS 2024; Code is available at + https://github.com/gqk/HiCoM +
+
+
+
+
+ + ☆ SparrowVQE: Visual Question Explanation for Course Content Understanding + + +
+ Visual Question Answering (VQA) research seeks to create AI systems to answer +natural language questions in images, yet VQA methods often yield overly +simplistic and short answers. This paper aims to advance the field by +introducing Visual Question Explanation (VQE), which enhances the ability of +VQA to provide detailed explanations rather than brief responses and address +the need for more complex interaction with visual content. We first created an +MLVQE dataset from a 14-week streamed video machine learning course, including +885 slide images, 110,407 words of transcripts, and 9,416 designed +question-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3 +billion parameters multimodal model. We trained our model with a three-stage +training mechanism consisting of multimodal pre-training (slide images and +transcripts feature alignment), instruction tuning (tuning the pre-trained +model with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide +image and QA pairs). Eventually, our SparrowVQE can understand and connect +visual information using the SigLIP model with transcripts using the Phi-2 +language model with an MLP adapter. Experimental results demonstrate that our +SparrowVQE achieves better performance in our developed MLVQE dataset and +outperforms state-of-the-art methods in the other five benchmark VQA datasets. +The source code is available at +https://github.com/YoushanZhang/SparrowVQE. + +
+
+
+
+
+ + ☆ A Novel Automatic Real-time Motion Tracking Method for Magnetic + Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced + Tracking-Learning-Detection Framework with Automatic Segmentation + + +
+ Objective: Ensuring the precision in motion tracking for MRI-guided +Radiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This +study refined the motion tracking accuracy in MRIgRT through the innovation of +an automatic real-time tracking method, leveraging an enhanced +Tracking-Learning-Detection (ETLD) framework coupled with automatic +segmentation. Methods: We developed a novel MRIgRT motion tracking method by +integrating two primary methods: the ETLD framework and an improved Chan-Vese +model (ICV), named ETLD+ICV. The TLD framework was upgraded to suit real-time +cine MRI, including advanced image preprocessing, no-reference image quality +assessment, an enhanced median-flow tracker, and a refined detector with +dynamic search region adjustments. Additionally, ICV was combined for precise +coverage of the target volume, which refined the segmented region frame by +frame using tracking results, with key parameters optimized. Tested on 3.5D MRI +scans from 10 patients with liver metastases, our method ensures precise +tracking and accurate segmentation vital for MRIgRT. Results: An evaluation of +106,000 frames across 77 treatment fractions revealed sub-millimeter tracking +errors of less than 0.8mm, with over 99% precision and 98% recall for all +subjects, underscoring the robustness and efficacy of the ETLD. Moreover, the +ETLD+ICV yielded a dice global score of more than 82% for all subjects, +demonstrating the proposed method's extensibility and precise target volume +coverage. Conclusions: This study successfully developed an automatic real-time +motion tracking method for MRIgRT that markedly surpasses current methods. The +novel method not only delivers exceptional precision in tracking and +segmentation but also demonstrates enhanced adaptability to clinical demands, +positioning it as an indispensable asset in the quest to augment the efficacy +of radiotherapy treatments. + +
+
+
+
+
+ + ☆ LAUREL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce Learned Augmented Residual Layer (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ☆ Quantifying Knowledge Distillation Using Partial Information + Decomposition NeurIPS 2024 + + +
+ Knowledge distillation provides an effective method for deploying complex +machine learning models in resource-constrained environments. It typically +involves training a smaller student model to emulate either the probabilistic +outputs or the internal feature representations of a larger teacher model. By +doing so, the student model often achieves substantially better performance on +a downstream task compared to when it is trained independently. Nevertheless, +the teacher's internal representations can also encode noise or additional +information that may not be relevant to the downstream task. This observation +motivates our primary question: What are the information-theoretic limits of +knowledge transfer? To this end, we leverage a body of work in information +theory called Partial Information Decomposition (PID) to quantify the +distillable and distilled knowledge of a teacher's representation corresponding +to a given student and a downstream task. Moreover, we demonstrate that this +metric can be practically used in distillation to address challenges caused by +the complexity gap between the teacher and the student representations. + +
+
+ comment: Accepted at NeurIPS 2024 Machine Learning and Compression Workshop +
+
+
+
+
+ + ☆ GUS-IR: Gaussian Splatting with Unified Shading for Inverse Rendering + + +
+ Recovering the intrinsic physical attributes of a scene from images, +generally termed as the inverse rendering problem, has been a central and +challenging task in computer vision and computer graphics. In this paper, we +present GUS-IR, a novel framework designed to address the inverse rendering +problem for complicated scenes featuring rough and glossy surfaces. This paper +starts by analyzing and comparing two prominent shading techniques popularly +used for inverse rendering, forward shading and deferred shading, in terms of their effectiveness +in handling complex materials. More importantly, we propose a unified shading +solution that combines the advantages of both techniques for better +decomposition. In addition, we analyze the normal modeling in 3D Gaussian +Splatting (3DGS) and utilize the shortest axis as normal for each particle in +GUS-IR, along with a depth-related regularization, resulting in improved +geometric representation and better shape reconstruction. Furthermore, we +enhance the probe-based baking scheme proposed by GS-IR to achieve more +accurate ambient occlusion modeling to better handle indirect illumination. +Extensive experiments have demonstrated the superior performance of GUS-IR in +achieving precise intrinsic decomposition and geometric representation, +supporting many downstream tasks (such as relighting, retouching) in computer +vision, graphics, and extended reality. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ☆ Semi-Truths: A Large-Scale Dataset of AI-Augmented Images for Evaluating + Robustness of AI-Generated Image detectors NeurIPS 2024 + + +
+ Text-to-image diffusion models have impactful applications in art, design, +and entertainment, yet these technologies also pose significant risks by +enabling the creation and dissemination of misinformation. Although recent +advancements have produced AI-generated image detectors that claim robustness +against various augmentations, their true effectiveness remains uncertain. Do +these detectors reliably identify images with different levels of augmentation? +Are they biased toward specific scenes or data distributions? To investigate, +we introduce SEMI-TRUTHS, featuring 27,600 real images, 223,400 masks, and +1,472,700 AI-augmented images that feature targeted and localized perturbations +produced using diverse augmentation techniques, diffusion models, and data +distributions. Each augmented image is accompanied by metadata for standardized +and targeted evaluation of detector robustness. Our findings suggest that +state-of-the-art detectors exhibit varying sensitivities to the types and +degrees of perturbations, data distributions, and augmentation methods used, +offering new insights into their performance and limitations. The code for the +augmentation and evaluation pipeline is available at +https://github.com/J-Kruk/SemiTruths. + +
+
+ comment: Accepted at NeurIPS 2024 Datasets & Benchmarks Track +
+
+
+
+
+ + ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ☆ MureObjectStitch: Multi-reference Image Composition + + +
+ Generative image composition aims to regenerate the given foreground object +in the background image to produce a realistic composite image. In this work, +we propose an effective finetuning strategy for generative image composition +model, in which we finetune a pretrained model using one or more images +containing the same foreground object. Moreover, we propose a multi-reference +strategy, which allows the model to take in multiple reference images of the +foreground object. The experiments on MureCOM dataset verify the effectiveness +of our method. + +
+
+
+
+
+ + ☆ BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions + + +
+ We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that +bridges the gap between descriptive synthetic captions and factual web-scale +alt-text. KALE augments synthetic dense image captions with web-scale alt-text +to generate factually grounded image captions. Our two-stage approach leverages +large vision-language models and language models to create knowledge-augmented +captions, which are then used to train a specialized VLM for scaling up the +dataset. We train vision-language models on KALE and demonstrate improvements +on vision-language tasks. Our experiments show the utility of KALE for training +more capable and knowledgeable multimodal models. We release the KALE dataset +at https://huggingface.co/datasets/Salesforce/blip3-kale + +
+
+
+
+
+ + ☆ Tracing the Roots: Leveraging Temporal Dynamics in Diffusion + Trajectories for Origin Attribution + + +
+ Diffusion models have revolutionized image synthesis, garnering significant +research interest in recent years. Diffusion is an iterative algorithm in which +samples are generated step-by-step, starting from pure noise. This process +introduces the notion of diffusion trajectories, i.e., paths from the standard +Gaussian distribution to the target image distribution. In this context, we +study discriminative algorithms operating on these trajectories. Specifically, +given a pre-trained diffusion model, we consider the problem of classifying +images as part of the training dataset, generated by the model or originating +from an external source. Our approach demonstrates the presence of patterns +across steps that can be leveraged for classification. We also conduct ablation +studies, which reveal that using higher-order gradient features to characterize +the trajectories leads to significant performance gains and more robust +algorithms. + +
+
+
+
+
+ + ☆ All-in-one Weather-degraded Image Restoration via Adaptive + Degradation-aware Self-prompting Model + + +
+ Existing approaches for all-in-one weather-degraded image restoration suffer +from inefficiencies in leveraging degradation-aware priors, resulting in +sub-optimal performance in adapting to different weather conditions. To this +end, we develop an adaptive degradation-aware self-prompting model (ADSM) for +all-in-one weather-degraded image restoration. Specifically, our model employs +the contrastive language-image pre-training model (CLIP) to facilitate the +training of our proposed latent prompt generators (LPGs), which represent three +types of latent prompts to characterize the degradation type, degradation +property and image caption. Moreover, we integrate the acquired +degradation-aware prompts into the time embedding of diffusion model to improve +degradation perception. Meanwhile, we employ the latent caption prompt to guide +the reverse sampling process using the cross-attention mechanism, thereby +guiding the accurate image reconstruction. Furthermore, to accelerate the +reverse sampling procedure of diffusion model and address the limitations of +frequency perception, we introduce a wavelet-oriented noise estimating network +(WNE-Net). Extensive experiments conducted on eight publicly available datasets +demonstrate the effectiveness of our proposed approach in both task-specific +and all-in-one applications. + +
+
+
+
+
+ + ☆ DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution + Detection + + +
+ Out-of-distribution (OOD) detection is essential for ensuring the robustness +of machine learning models by identifying samples that deviate from the +training distribution. While traditional OOD detection has primarily focused on +single-modality inputs, such as images, recent advances in multimodal models +have demonstrated the potential of leveraging multiple modalities (e.g., video, +optical flow, audio) to enhance detection performance. However, existing +methods often overlook intra-class variability within in-distribution (ID) +data, assuming that samples of the same class are perfectly cohesive and +consistent. This assumption can lead to performance degradation, especially +when prediction discrepancies are uniformly amplified across all samples. To +address this issue, we propose Dynamic Prototype Updating (DPU), a novel +plug-and-play framework for multimodal OOD detection that accounts for +intra-class variations. Our method dynamically updates class center +representations for each class by measuring the variance of similar samples +within each batch, enabling adaptive adjustments. This approach allows us to +amplify prediction discrepancies based on the updated class centers, thereby +improving the model's robustness and generalization across different +modalities. Extensive experiments on two tasks, five datasets, and nine base +OOD algorithms demonstrate that DPU significantly improves OOD detection +performance, setting a new state-of-the-art in multimodal OOD detection, with +improvements of up to 80 percent in Far-OOD detection. To facilitate +accessibility and reproducibility, our code is publicly available on GitHub. + +
+
+
+
+
+ + ☆ GTA: Global Tracklet Association for Multi-Object Tracking in Sports ACCV 2024 + + +
+ Multi-object tracking in sports scenarios has become one of the focal points +in computer vision, experiencing significant advancements through the +integration of deep learning techniques. Despite these breakthroughs, +challenges remain, such as accurately re-identifying players upon re-entry into +the scene and minimizing ID switches. In this paper, we propose an +appearance-based global tracklet association algorithm designed to enhance +tracking performance by splitting tracklets containing multiple identities and +connecting tracklets seemingly from the same identity. This method can serve as +a plug-and-play refinement tool for any multi-object tracker to further boost +their performance. The proposed method achieved a new state-of-the-art +performance on the SportsMOT dataset with HOTA score of 81.04%. Similarly, on +the SoccerNet dataset, our method enhanced multiple trackers' performance, +consistently increasing the HOTA score from 79.41% to 83.11%. These significant +and consistent improvements across different trackers and datasets underscore +our proposed method's potential impact on the application of sports player +tracking. We open-source our project codebase at +https://github.com/sjc042/gta-link.git. + +
+
+ comment: Accepted by ACCV 2024 MLCSA Workshop +
+
+
+
+
+ + ☆ Latent Space Disentanglement in Diffusion Transformers Enables Precise + Zero-shot Semantic Editing + + +
+ Diffusion Transformers (DiTs) have recently achieved remarkable success in +text-guided image generation. In image editing, DiTs project text and image +inputs to a joint latent space, from which they decode and synthesize new +images. However, it remains largely unexplored how multimodal information +collectively forms this joint space and how they guide the semantics of the +synthesized images. In this paper, we investigate the latent space of DiT +models and uncover two key properties: First, DiT's latent space is inherently +semantically disentangled, where different semantic attributes can be +controlled by specific editing directions. Second, consistent semantic editing +requires utilizing the entire joint latent space, as neither encoded image nor +text alone contains enough semantic information. We show that these editing +directions can be obtained directly from text prompts, enabling precise +semantic control without additional training or mask annotations. Based on +these insights, we propose a simple yet effective Encode-Identify-Manipulate +(EIM) framework for zero-shot fine-grained image editing. Specifically, we +first encode both the given source image and the text prompt that describes the +image, to obtain the joint latent embedding. Then, using our proposed Hessian +Score Distillation Sampling (HSDS) method, we identify editing directions that +control specific target attributes while preserving other image features. These +directions are guided by text prompts and used to manipulate the latent +embeddings. Moreover, we propose a new metric to quantify the disentanglement +degree of the latent space of diffusion models. Extensive experiment results on +our new curated benchmark dataset and analysis demonstrate DiT's +disentanglement properties and effectiveness of the EIM framework. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2408.13335 +
+
+
+
+
+ + ☆ An Explainable Machine Learning Approach for Age and Gender Estimation + in Living Individuals Using Dental Biometrics + + +
+ Objectives: Age and gender estimation is crucial for various applications, +including forensic investigations and anthropological studies. This research +aims to develop a predictive system for age and gender estimation in living +individuals, leveraging dental measurements such as Coronal Height (CH), +Coronal Pulp Cavity Height (CPCH), and Tooth Coronal Index (TCI). Methods: +Machine learning models were employed in our study, including Cat Boost +Classifier (Catboost), Gradient Boosting Machine (GBM), Ada Boost Classifier +(AdaBoost), Random Forest (RF), eXtreme Gradient Boosting (XGB), Light Gradient +Boosting Machine (LGB), and Extra Trees Classifier (ETC), to analyze dental +data from 862 living individuals (459 males and 403 females). Specifically, +periapical radiographs from six teeth per individual were utilized, including +premolars and molars from both the maxillary and mandibular arches. A novel ensemble +learning technique was developed, which uses multiple models each tailored to +distinct dental metrics, to estimate age and gender accurately. Furthermore, an +explainable AI model has been created utilizing SHAP, enabling dental experts +to make judicious decisions based on comprehensible insight. Results: The RF +and XGB models were particularly effective, yielding the highest F1 score for +age and gender estimation. Notably, the XGB model showed a slightly better +performance in age estimation, achieving an F1 score of 73.26%. A similar trend +for the RF model was also observed in gender estimation, achieving an F1 score +of 77.53%. Conclusions: This study marks a significant advancement in dental +forensic methods, showcasing the potential of machine learning to automate age +and gender estimation processes with improved accuracy. + +
+
+
+
+
+ + ☆ TractoEmbed: Modular Multi-level Embedding framework for white matter + tract segmentation ICPR + + +
+ White matter tract segmentation is crucial for studying brain structural +connectivity and neurosurgical planning. However, segmentation remains +challenging due to issues like class imbalance between major and minor tracts, +structural similarity, subject variability, symmetric streamlines between +hemispheres etc. To address these challenges, we propose TractoEmbed, a modular +multi-level embedding framework, that encodes localized representations through +learning tasks in respective encoders. In this paper, TractoEmbed introduces a +novel hierarchical streamline data representation that captures maximum spatial +information at each level i.e. individual streamlines, clusters, and patches. +Experiments show that TractoEmbed outperforms state-of-the-art methods in white +matter tract segmentation across different datasets, and spanning various age +groups. The modular framework directly allows the integration of additional +embeddings in future works. + +
+
+ comment: Accepted at 27th International Conference on Pattern Recognition + (ICPR), 2024 15 pages, 2 figures +
+
+
+
+
+ + ☆ Comprehensive and Comparative Analysis between Transfer Learning and + Custom Built VGG and CNN-SVM Models for Wildfire Detection + + +
+ Contemporary Artificial Intelligence (AI) and Machine Learning (ML) research +places a significant emphasis on transfer learning, showcasing its +transformative potential in enhancing model performance across diverse domains. +This paper examines the efficiency and effectiveness of transfer learning in +the context of wildfire detection. Three purpose-built models -- Visual +Geometry Group (VGG)-7, VGG-10, and Convolutional Neural Network (CNN)-Support +Vector Machine (SVM) CNN-SVM -- are rigorously compared with three pretrained +models -- VGG-16, VGG-19, and Residual Neural Network (ResNet) ResNet101. We +trained and evaluated these models using a dataset that captures the +complexities of wildfires, incorporating variables such as varying lighting +conditions, time of day, and diverse terrains. The objective is to discern how +transfer learning performs against models trained from scratch in addressing +the intricacies of the wildfire detection problem. By assessing the performance +metrics, including accuracy, precision, recall, and F1 score, a comprehensive +understanding of the advantages and disadvantages of transfer learning in this +specific domain is obtained. This study contributes valuable insights to the +ongoing discourse, guiding future directions in AI and ML research. Keywords: +Wildfire prediction, deep learning, machine learning, fire detection + +
+
+ comment: In Proc. of the 2024 IEEE International Conference On Intelligent + Computing in Data Sciences +
+
+
+
+
+ + ☆ EAPCR: A Universal Feature Extractor for Scientific Data without + Explicit Feature Relation Patterns + + +
+ Conventional methods, including Decision Tree (DT)-based methods, have been +effective in scientific tasks, such as non-image medical diagnostics, system +anomaly detection, and inorganic catalysis efficiency prediction. However, most +deep-learning techniques have struggled to surpass or even match the level of +success of traditional machine-learning methods. The primary reason is that +these applications involve multi-source, heterogeneous data where features lack +explicit relationships. This contrasts with image data, where pixels exhibit +spatial relationships; textual data, where words have sequential dependencies; +and graph data, where nodes are connected through established associations. The +absence of explicit Feature Relation Patterns (FRPs) presents a significant +challenge for deep learning techniques in scientific applications that are not +image-, text-, or graph-based. In this paper, we introduce EAPCR, a universal +feature extractor designed for data without explicit FRPs. Tested across +various scientific tasks, EAPCR consistently outperforms traditional methods +and bridges the gap where deep learning models fall short. To further +demonstrate its robustness, we synthesize a dataset without explicit FRPs. +While Kolmogorov-Arnold Network (KAN) and feature extractors like Convolutional +Neural Networks (CNNs), Graph Convolutional Networks (GCNs), and Transformers +struggle, EAPCR excels, demonstrating its robustness and superior performance +in scientific tasks without FRPs. + +
+
+
+
+
+ + ☆ TomoGRAF: A Robust and Generalizable Reconstruction Network for + Single-View Computed Tomography + + +
+ Computed tomography (CT) provides high spatial resolution visualization of 3D +structures for scientific and clinical applications. Traditional +analytical/iterative CT reconstruction algorithms require hundreds of angular +data samplings, a condition that may not be met in practice due to physical and +mechanical limitations. Sparse view CT reconstruction has been proposed using +constrained optimization and machine learning methods with varying success, +less so for ultra-sparse view CT reconstruction with one to two views. Neural +radiance field (NeRF) is a powerful tool for reconstructing and rendering 3D +natural scenes from sparse views, but its direct application to 3D medical +image reconstruction has been minimally successful due to the differences +between optical and X-ray photon transportation. Here, we develop a novel +TomoGRAF framework incorporating the unique X-ray transportation physics to +reconstruct high-quality 3D volumes using ultra-sparse projections without +prior. TomoGRAF captures the CT imaging geometry, simulates the X-ray casting +and tracing process, and penalizes the difference between simulated and ground +truth CT sub-volume during training. We evaluated the performance of TomoGRAF +on an unseen dataset of distinct imaging characteristics from the training data +and demonstrated a vast leap in performance compared with state-of-the-art deep +learning and NeRF methods. TomoGRAF provides the first generalizable solution +for image-guided radiotherapy and interventional radiology applications, where +only one or a few X-ray views are available, but 3D volumetric information is +desired. + +
+
+
+
+
+ + ☆ CameraHMR: Aligning People with Perspective 3DV 2025 + + +
+ We address the challenge of accurate 3D human pose and shape estimation from +monocular images. The key to accuracy and robustness lies in high-quality +training data. Existing training datasets containing real images with pseudo +ground truth (pGT) use SMPLify to fit SMPL to sparse 2D joint locations, +assuming a simplified camera with default intrinsics. We make two contributions +that improve pGT accuracy. First, to estimate camera intrinsics, we develop a +field-of-view prediction model (HumanFoV) trained on a dataset of images +containing people. We use the estimated intrinsics to enhance the 4D-Humans +dataset by incorporating a full perspective camera model during SMPLify +fitting. Second, 2D joints provide limited constraints on 3D body shape, +resulting in average-looking bodies. To address this, we use the BEDLAM dataset +to train a dense surface keypoint detector. We apply this detector to the +4D-Humans dataset and modify SMPLify to fit the detected keypoints, resulting +in significantly more realistic body shapes. Finally, we upgrade the HMR2.0 +architecture to include the estimated camera parameters. We iterate model +training and SMPLify fitting initialized with the previously trained model. +This leads to more accurate pGT and a new model, CameraHMR, with +state-of-the-art accuracy. Code and pGT are available for research purposes. + +
+
+ comment: 3DV 2025 +
+
+
+
+
+ + ☆ TIPO: Text to Image with Text Presampling for Prompt Optimization + + +
+ TIPO (Text to Image with text pre-sampling for Prompt Optimization) is an +innovative framework designed to enhance text-to-image (T2I) generation by +language model (LM) for automatic prompt engineering. By refining and extending +user-provided prompts, TIPO bridges the gap between simple inputs and the +detailed prompts required for high-quality image generation. Unlike previous +approaches that rely on Large Language Models (LLMs) or reinforcement learning +(RL), TIPO adjusts user input prompts with the distribution of a trained prompt +dataset, eliminating the need for complex runtime cost via lightweight model. +This pre-sampling approach enables efficient and scalable prompt optimization, +grounded in the model's training distribution. Experimental results demonstrate +TIPO's effectiveness in improving aesthetic scores, reducing image corruption, +and better aligning generated images with dataset distributions. These findings +highlight the critical role of prompt engineering in T2I systems and open +avenues for broader applications of automatic prompt refinement. + +
+
+ comment: 21 pages, 13 figures +
+
+
+
+
+ + ☆ Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation, + Embrace Orthogonality CVPR 2025 + + +
+ We introduce a yat-product-powered neural network, the Neural Matter Network +(NMN), a breakthrough in deep learning that achieves non-linear pattern +recognition without activation functions. Our key innovation relies on the +yat-product and yat-product, which naturally induces non-linearity by +projecting inputs into a pseudo-metric space, eliminating the need for +traditional activation functions while maintaining only a softmax layer for +final class probability distribution. This approach simplifies network +architecture and provides unprecedented transparency into the network's +decision-making process. Our comprehensive empirical evaluation across +different datasets demonstrates that NMN consistently outperforms traditional +MLPs. The results challenge the assumption that separate activation functions +are necessary for effective deep-learning models. The implications of this work +extend beyond immediate architectural benefits, by eliminating intermediate +activation functions while preserving non-linear capabilities, yat-MLP +establishes a new paradigm for neural network design that combines simplicity +with effectiveness. Most importantly, our approach provides unprecedented +insights into the traditionally opaque "black-box" nature of neural networks, +offering a clearer understanding of how these models process and classify +information. + +
+
+ comment: Submitted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS NeurIPS 2024 + + +
+ Recent advances in real-time neural rendering using point-based techniques +have enabled broader adoption of 3D representations. However, foundational +approaches like 3D Gaussian Splatting impose substantial storage overhead, as +Structure-from-Motion (SfM) points can grow to millions, often requiring +gigabyte-level disk space for a single unbounded scene. This growth presents +scalability challenges and hinders splatting efficiency. To address this, we +introduce LightGaussian, a method for transforming 3D Gaussians into a more +compact format. Inspired by Network Pruning, LightGaussian identifies Gaussians +with minimal global significance on scene reconstruction, and applies a pruning +and recovery process to reduce redundancy while preserving visual quality. +Knowledge distillation and pseudo-view augmentation then transfer spherical +harmonic coefficients to a lower degree, yielding compact representations. +Gaussian Vector Quantization, based on each Gaussian's global significance, +further lowers bitwidth with minimal accuracy loss. LightGaussian achieves an +average 15x compression rate while boosting FPS from 144 to 237 within the +3D-GS framework, enabling efficient complex scene representation on the +Mip-NeRF 360 and Tank & Temple datasets. The proposed Gaussian pruning approach +is also adaptable to other 3D representations (e.g., Scaffold-GS), +demonstrating strong generalization capabilities. + +
+
+ comment: NeurIPS 2024, Project page: https://lightgaussian.github.io/ +
+
+
+
+
+ + ♻ ☆ Odd-One-Out: Anomaly Detection by Comparing with Neighbors + + +
+ This paper introduces a novel anomaly detection (AD) problem that focuses on +identifying `odd-looking' objects relative to the other instances in a given +scene. In contrast to the traditional AD benchmarks, anomalies in our task are +scene-specific, defined by the regular instances that make up the majority. +Since object instances may be only partly visible from a single viewpoint, our +setting employs multiple views of each scene as input. To provide a testbed for +future research in this task, we introduce two benchmarks, ToysAD-8K and +PartsAD-15K. We propose a novel method that constructs 3D object-centric +representations from multiple 2D views for each instance and detects the +anomalous ones through a cross-instance comparison. We rigorously analyze our +method quantitatively and qualitatively on the presented benchmarks. + +
+
+ comment: Codes & Dataset at https://github.com/VICO-UoE/OddOneOutAD +
+
+
+
+
+ + ♻ ☆ WavShadow: Wavelet Based Shadow Segmentation and Removal + + +
+ Shadow removal and segmentation remain challenging tasks in computer vision, +particularly in complex real world scenarios. This study presents a novel +approach that enhances the ShadowFormer model by incorporating Masked +Autoencoder (MAE) priors and Fast Fourier Convolution (FFC) blocks, leading to +significantly faster convergence and improved performance. We introduce key +innovations: (1) integration of MAE priors trained on Places2 dataset for +better context understanding, (2) adoption of Haar wavelet features for +enhanced edge detection and multiscale analysis, and (3) implementation of a +modified SAM Adapter for robust shadow segmentation. Extensive experiments on +the challenging DESOBA dataset demonstrate that our approach achieves state of +the art results, with notable improvements in both convergence speed and shadow +removal quality. + +
+
+
+
+
+ + ♻ ☆ Meta-Learned Modality-Weighted Knowledge Distillation for Robust + Multi-Modal Learning with Missing Data + + +
+ In multi-modal learning, some modalities are more influential than others, +and their absence can have a significant impact on classification/segmentation +accuracy. Addressing this challenge, we propose a novel approach called +Meta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables +multi-modal models to maintain high accuracy even when key modalities are +missing. MetaKD adaptively estimates the importance weight of each modality +through a meta-learning process. These learned importance weights guide a +pairwise modality-weighted knowledge distillation process, allowing +high-importance modalities to transfer knowledge to lower-importance ones, +resulting in robust performance despite missing inputs. Unlike previous methods +in the field, which are often task-specific and require significant +modifications, our approach is designed to work in multiple tasks (e.g., +segmentation and classification) with minimal adaptation. Experimental results +on five prevalent datasets, including three Brain Tumor Segmentation datasets +(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging +Initiative (ADNI) classification dataset and the Audiovision-MNIST +classification dataset, demonstrate the proposed model is able to outperform +the compared models by a large margin. + +
+
+
+
+
+ + ♻ ☆ Interpret Your Decision: Logical Reasoning Regularization for + Generalization in Visual Classification NeurIPS2024 + + +
+ Vision models excel in image classification but struggle to generalize to +unseen data, such as classifying images from unseen domains or discovering +novel categories. In this paper, we explore the relationship between logical +reasoning and deep learning generalization in visual classification. A logical +regularization termed L-Reg is derived which bridges a logical analysis +framework to image classification. Our work reveals that L-Reg reduces the +complexity of the model in terms of the feature distribution and classifier +weights. Specifically, we unveil the interpretability brought by L-Reg, as it +enables the model to extract the salient features, such as faces to persons, +for classification. Theoretical analysis and experiments demonstrate that L-Reg +enhances generalization across various scenarios, including multi-domain +generalization and generalized category discovery. In complex real-world +scenarios where images span unknown classes and unseen domains, L-Reg +consistently improves generalization, highlighting its practical efficacy. + +
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ♻ ☆ Pseudo-triplet Guided Few-shot Composed Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging task that aims to retrieve +the target image with a multimodal query, i.e., a reference image, and its +complementary modification text. As previous supervised or zero-shot learning +paradigms all fail to strike a good trade-off between the model's +generalization ability and retrieval performance, recent researchers have +introduced the task of few-shot CIR (FS-CIR) and proposed a textual +inversion-based network based on pretrained CLIP model to realize it. Despite +its promising performance, the approach encounters two key limitations: simply +relying on the few annotated samples for CIR model training and +indiscriminately selecting training triplets for CIR model fine-tuning. To +address these two limitations, we propose a novel two-stage pseudo triplet +guided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an +attentive masking and captioning-based pseudo triplet generation method, to +construct pseudo triplets from pure image data and use them to fulfill the +CIR-task specific pretraining. In the second stage, we propose a challenging +triplet-based CIR fine-tuning method, where we design a pseudo modification +text-based sample challenging score estimation strategy and a robust top +range-based random sampling strategy for sampling robust challenging triplets +to promote the model fine-tuning. Notably, our scheme is plug-and-play and +compatible with any existing supervised CIR models. We test our scheme across +two backbones on three public datasets (i.e., FashionIQ, CIRR, and +Birds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4% +respectively, demonstrating our scheme's efficacy. + +
+
+ comment: 10 pages
+
+
+
+
+ + ♻ Bootstrapping Reinforcement Learning with Imitation for Vision-Based + Agile Flight CoRL + + +
+ Learning visuomotor policies for agile quadrotor flight presents significant +difficulties, primarily from inefficient policy exploration caused by +high-dimensional visual inputs and the need for precise and low-latency +control. To address these challenges, we propose a novel approach that combines +the performance of Reinforcement Learning (RL) and the sample efficiency of +Imitation Learning (IL) in the task of vision-based autonomous drone racing. +While RL provides a framework for learning high-performance controllers through +trial and error, it faces challenges with sample efficiency and computational +demands due to the high dimensionality of visual inputs. Conversely, IL +efficiently learns from visual expert demonstrations, but it remains limited by +the expert's performance and state distribution. To overcome these limitations, +our policy learning framework integrates the strengths of both approaches. Our +framework contains three phases: training a teacher policy using RL with +privileged state information, distilling it into a student policy via IL, and +adaptive fine-tuning via RL. Testing in both simulated and real-world scenarios +shows our approach can not only learn in scenarios where RL from scratch fails +but also outperforms existing IL methods in both robustness and performance, +successfully navigating a quadrotor through a race course using only visual +information. Videos of the experiments are available at +https://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html. + +
+
+ comment: 8th Annual Conference on Robot Learning (CoRL) +
+
+
+
+
+ + ♻ ☆ REVEX: A Unified Framework for Removal-Based Explainable Artificial + Intelligence in Video + + +
+ We developed REVEX, a removal-based video explanations framework. This work +extends fine-grained explanation frameworks for computer vision data and adapts +six existing techniques to video by adding temporal information and local +explanations. The adapted methods were evaluated across networks, datasets, +image classes, and evaluation metrics. By decomposing explanation into steps, +strengths and weaknesses were revealed in the studied methods, for example, on +pixel clustering and perturbations in the input. Video LIME outperformed other +methods with deletion values up to 31% lower and insertion up to 30% higher, +depending on method and network. Video RISE achieved superior performance in +the average drop metric, with values 10% lower. In contrast, +localization-based metrics revealed low performance across all methods, with +significant variation depending on network. Pointing game accuracy reached +53%, and IoU-based metrics remained below 20%. Drawing on the findings across +XAI methods, we further examine the limitations of the employed XAI evaluation +metrics and highlight their suitability in different applications. + +
+
+
+
+
+ + ♻ ☆ Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against + DenseNet, ResNet, and VGGNet on a Custom Dataset + + +
+ This study evaluates the performance of various deep learning models, +specifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species +classification on a custom dataset. The dataset comprises 575 images of 23 +endangered species sourced from reputable online repositories. The study +utilizes transfer learning to fine-tune pre-trained models on the dataset, +focusing on reducing training time and enhancing classification accuracy. The +results demonstrate that YOLOv8 outperforms other models, achieving a training +accuracy of 97.39% and a validation F1-score of 96.50%. These findings suggest +that YOLOv8, with its advanced architecture and efficient feature extraction +capabilities, holds great promise for automating wildlife monitoring and +conservation efforts. + +
+
+ comment: This is published in Journal of Artificial Intelligence and Capsule + Networks, December 2024, Volume 6, Issue 4, Pages 415-435 +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection + + +
+ LiDAR-based vision systems are integral for 3D object detection, which is +crucial for autonomous navigation. However, they suffer from performance +degradation in adverse weather conditions due to the quality deterioration of +LiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is +expected to solve this problem. However, the fusion of LiDAR and 4D radar is +challenging because they differ significantly in terms of data quality and the +degree of degradation in adverse weather. To address these issues, we introduce +L4DR, a weather-robust 3D object detection method that effectively achieves +LiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and +Foreground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is +the first exploration of the complementarity of early fusion between LiDAR and +4D radar. Additionally, we design an Inter-Modal and Intra-Modal (IM²) +parallel feature extraction backbone coupled with a Multi-Scale Gated Fusion +(MSGF) module to counteract the varying degrees of sensor degradation under +adverse weather conditions. Experimental evaluation on a VoD dataset with +simulated fog proves that L4DR is more adaptable to changing weather +conditions. It delivers a significant performance increase under different fog +levels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only +approach. Moreover, the results on the K-Radar dataset validate the consistent +performance improvement of L4DR in real-world adverse weather conditions. + +
+
+
+
+
+ + ♻ ☆ Adapting Segment Anything Model to Multi-modal Salient Object Detection + with Semantic Feature Fusion Guidance + + +
+ Although most existing multi-modal salient object detection (SOD) methods +demonstrate effectiveness through training models from scratch, the limited +multi-modal data hinders these methods from reaching optimality. In this paper, +we propose a novel framework to explore and exploit the powerful feature +representation and zero-shot generalization ability of the pre-trained Segment +Anything Model (SAM) for multi-modal SOD. Despite serving as a recent vision +fundamental model, driving the class-agnostic SAM to comprehend and detect +salient objects accurately is non-trivial, especially in challenging scenes. To +this end, we develop SAM with semantic +feature fusion guidance (Sammese), which +incorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to +multi-modal SOD tasks. However, it is difficult for SAM trained on single-modal +data to directly mine the complementary benefits of multi-modal inputs and +comprehensively utilize them to achieve accurate saliency prediction. To +address these issues, we first design a multi-modal complementary fusion module +to extract robust multi-modal semantic features by integrating information from +visible and thermal or depth image pairs. Then, we feed the extracted +multi-modal semantic features into both the SAM image encoder and mask decoder +for fine-tuning and prompting, respectively. Specifically, in the image +encoder, a multi-modal adapter is proposed to adapt the single-modal SAM to +multi-modal information. In the mask decoder, a semantic-geometric prompt +generation strategy is proposed to produce corresponding embeddings with +various saliency cues. Extensive experiments on both RGB-D and RGB-T SOD +benchmarks show the effectiveness of the proposed framework. The code will be +available at https://github.com/Angknpng/Sammese. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for + Adversarial Defense NeurIPS 2024 + + +
+ Despite ongoing efforts to defend neural classifiers from adversarial +attacks, they remain vulnerable, especially to unseen attacks. In contrast, +humans are difficult to deceive with subtle manipulations, since we make +judgments only based on essential factors. Inspired by this observation, we +attempt to model label generation with essential label-causative factors and +incorporate label-non-causative factors to assist data generation. For an +adversarial example, we aim to discriminate the perturbations as non-causative +factors and make predictions only based on the label-causative factors. +Concretely, we propose a causal diffusion model (CausalDiff) that adapts +diffusion models for conditional data generation and disentangles the two types +of causal factors by learning towards a novel causal information bottleneck +objective. Empirically, CausalDiff has significantly outperformed +state-of-the-art defense methods on various unseen attacks, achieving an +average robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on +CIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition +Benchmark). + +
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Act in Collusion: A Persistent Distributed Multi-Target Backdoor in + Federated Learning + + +
+ Federated learning, a novel paradigm designed to protect data privacy, is +vulnerable to backdoor attacks due to its distributed nature. Current research +often designs attacks based on a single attacker with a single backdoor, +overlooking more realistic and complex threats in federated learning. We +propose a more practical threat model for federated learning: the distributed +multi-target backdoor. In this model, multiple attackers control different +clients, embedding various triggers and targeting different classes, +collaboratively implanting backdoors into the global model via central +aggregation. Empirical validation shows that existing methods struggle to +maintain the effectiveness of multiple backdoors in the global model. Our key +insight is that similar backdoor triggers cause parameter conflicts and +injecting new backdoors disrupts gradient directions, significantly weakening +some backdoors performance. To solve this, we propose a Distributed +Multi-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of +backdoors from different malicious clients. To avoid parameter conflicts, we +design a multi-channel dispersed frequency trigger strategy to maximize trigger +differences. To mitigate gradient interference, we introduce backdoor replay in +local training to neutralize conflicting gradients. Extensive validation shows +that 30 rounds after the attack, Attack Success Rates of three different +backdoors from various clients remain above 93%. The code will be made publicly +available after the review period. + +
+
+
+
+
+ + ♻ ☆ PhyTracker: An Online Tracker for Phytoplankton + + +
+ Phytoplankton, a crucial component of aquatic ecosystems, requires efficient +monitoring to understand marine ecological processes and environmental +conditions. Traditional phytoplankton monitoring methods, relying on non-in +situ observations, are time-consuming and resource-intensive, limiting timely +analysis. To address these limitations, we introduce PhyTracker, an intelligent +in situ tracking framework designed for automatic tracking of phytoplankton. +PhyTracker overcomes significant challenges unique to phytoplankton monitoring, +such as constrained mobility within water flow, inconspicuous appearance, and +the presence of impurities. Our method incorporates three innovative modules: a +Texture-enhanced Feature Extraction (TFE) module, an Attention-enhanced +Temporal Association (ATA) module, and a Flow-agnostic Movement Refinement +(FMR) module. These modules enhance feature capture, differentiate between +phytoplankton and impurities, and refine movement characteristics, +respectively. Extensive experiments on the PMOT dataset validate the +superiority of PhyTracker in phytoplankton tracking, and additional tests on +the MOT dataset demonstrate its general applicability, outperforming +conventional tracking methods. This work highlights key differences between +phytoplankton and traditional objects, offering an effective solution for +phytoplankton monitoring. + +
+
+ comment: 13 pages, eleven figures
+
+
+
+
+ + ♻ ☆ TUNeS: A Temporal U-Net with Self-Attention for Video-based Surgical + Phase Recognition + + +
+ To enable context-aware computer assistance in the operating room of the +future, cognitive systems need to understand automatically which surgical phase +is being performed by the medical team. The primary source of information for +surgical phase recognition is typically video, which presents two challenges: +extracting meaningful features from the video stream and effectively modeling +temporal information in the sequence of visual features. For temporal modeling, +attention mechanisms have gained popularity due to their ability to capture +long-range dependencies. In this paper, we explore design choices for attention +in existing temporal models for surgical phase recognition and propose a novel +approach that uses attention more effectively and does not require hand-crafted +constraints: TUNeS, an efficient and simple temporal model that incorporates +self-attention at the core of a convolutional U-Net structure. In addition, we +propose to train the feature extractor, a standard CNN, together with an LSTM +on preferably long video segments, i.e., with long temporal context. In our +experiments, almost all temporal models performed better on top of feature +extractors that were trained with longer temporal context. On these +contextualized features, TUNeS achieves state-of-the-art results on the +Cholec80 dataset. This study offers new insights on how to use attention +mechanisms to build accurate and efficient temporal models for surgical phase +recognition. Implementing automatic surgical phase recognition is essential to +automate the analysis and optimization of surgical workflows and to enable +context-aware computer assistance during surgery, thus ultimately improving +patient care. + +
+
+
+
+
+ + ♻ ☆ Extreme Rotation Estimation in the Wild + + +
+ We present a technique and benchmark dataset for estimating the relative 3D +orientation between a pair of Internet images captured in an extreme setting, +where the images have limited or non-overlapping fields of view. Prior work +targeting extreme rotation estimation assumes constrained 3D environments and +emulates perspective images by cropping regions from panoramic views. However, +real images captured in the wild are highly diverse, exhibiting variation in +both appearance and camera intrinsics. In this work, we propose a +Transformer-based method for estimating relative rotations in extreme +real-world settings, and contribute the ExtremeLandmarkPairs dataset, assembled +from scene-level Internet photo collections. Our evaluation demonstrates that +our approach succeeds in estimating the relative rotations in a wide variety of +extreme-view Internet image pairs, outperforming various baselines, including +dedicated rotation estimation techniques and contemporary 3D reconstruction +methods. + +
+
+ comment: Project webpage: + https://tau-vailab.github.io/ExtremeRotationsInTheWild/ +
+
+
+
+
+ + ♻ ☆ Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI + + +
+ Segmentation of cardiac magnetic resonance images (MRI) is crucial for the +analysis and assessment of cardiac function, helping to diagnose and treat +various cardiovascular diseases. Most recent techniques rely on deep learning +and usually require an extensive amount of labeled data. To overcome this +problem, few-shot learning has the capability of reducing data dependency on +labeled data. In this work, we introduce a new method that merges few-shot +learning with a U-Net architecture and Gaussian Process Emulators (GPEs), +enhancing data integration from a support set for improved performance. GPEs +are trained to learn the relation between the support images and the +corresponding masks in latent space, facilitating the segmentation of unseen +query images given only a small labeled support set at inference. We test our +model with the M&Ms-2 public dataset to assess its ability to segment the heart +in cardiac magnetic resonance imaging from different orientations, and compare +it with state-of-the-art unsupervised and few-shot methods. Our architecture +shows higher DICE coefficients compared to these methods, especially in the +more challenging setups where the size of the support set is considerably +small. + +
+
+ comment: Accepted at Statistical Atlases and Computational Modeling of the + Heart (STACOM) Workshop 2024 +
+
+
+
+
+ + ♻ ☆ MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks + + +
+ We present MEGA-Bench, an evaluation suite that scales multimodal evaluation +to over 500 real-world tasks, to address the highly heterogeneous daily use +cases of end users. Our objective is to optimize for a set of high-quality data +samples that cover a highly diverse and rich set of multimodal tasks, while +enabling cost-effective and accurate model evaluation. In particular, we +collected 505 realistic tasks encompassing over 8,000 samples from 16 expert +annotators to extensively cover the multimodal task space. Instead of unifying +these problems into standard multi-choice questions (like MMMU, MMBench, and +MMT-Bench), we embrace a wide range of output formats like numbers, phrases, +code, LaTeX, coordinates, JSON, free-form, etc. To accommodate these formats, +we developed over 40 metrics to evaluate these tasks. Unlike existing +benchmarks, MEGA-Bench offers a fine-grained capability report across multiple +dimensions (e.g., application, input type, output format, skill), allowing +users to interact with and visualize model capabilities in depth. We evaluate a +wide variety of frontier vision-language models on MEGA-Bench to understand +their capabilities across these dimensions. + +
+
+ comment: Technical report. Project page: + https://tiger-ai-lab.github.io/MEGA-Bench/. v2 includes more evaluated models + and a single-image setting +
+
+
+
+
+ + ♻ ☆ Scalar Function Topology Divergence: Comparing Topology of 3D Objects + + +
+ We propose a new topological tool for computer vision - Scalar Function +Topology Divergence (SFTD), which measures the dissimilarity of multi-scale +topology between sublevel sets of two functions having a common domain. +Functions can be defined on an undirected graph or Euclidean space of any +dimensionality. Most of the existing methods for comparing topology are based +on Wasserstein distance between persistence barcodes and they don't take into +account the localization of topological features. The minimization of SFTD +ensures that the corresponding topological features of scalar functions are +located in the same places. The proposed tool provides useful visualizations +depicting areas where functions have topological dissimilarities. We provide +applications of the proposed method to 3D computer vision. In particular, +experiments demonstrate that SFTD as an additional loss improves the +reconstruction of cellular 3D shapes from 2D fluorescence microscopy images, +and helps to identify topological errors in 3D segmentation. Additionally, we +show that SFTD outperforms Betti matching loss in 2D segmentation problems. + +
+
+
+
+
+ + ♻ ☆ Revisiting the Adversarial Robustness of Vision Language Models: a + Multimodal Perspective + + +
+ Pretrained vision-language models (VLMs) like CLIP exhibit exceptional +generalization across diverse downstream tasks. While recent studies reveal +their vulnerability to adversarial attacks, research to date has primarily +focused on enhancing the robustness of image encoders against image-based +attacks, with defenses against text-based and multimodal attacks remaining +largely unexplored. To this end, this work presents the first comprehensive +study on improving the adversarial robustness of VLMs against attacks targeting +image, text, and multimodal inputs. This is achieved by proposing multimodal +contrastive adversarial training (MMCoA). Such an approach strengthens the +robustness of both image and text encoders by aligning the clean text +embeddings with adversarial image embeddings, and adversarial text embeddings +with clean image embeddings. The robustness of the proposed MMCoA is examined +against existing defense methods over image, text, and multimodal attacks on +the CLIP model. Extensive experiments on 15 datasets across two tasks reveal +the characteristics of different adversarial defense methods under distinct +distribution shifts and dataset complexities across the three attack types. +This paves the way for a unified framework of adversarial robustness against +different modality attacks, opening up new possibilities for securing VLMs +against multimodal attacks. The code is available at +https://github.com/ElleZWQ/MMCoA.git. + +
+
+ comment: 17 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Exploring Advanced Large Language Models with LLMsuite + + +
+ This tutorial explores the advancements and challenges in the development of +Large Language Models (LLMs) such as ChatGPT and Gemini. It addresses inherent +limitations like temporal knowledge cutoffs, mathematical inaccuracies, and the +generation of incorrect information, proposing solutions like Retrieval +Augmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks +such as ReAct and LangChain. The integration of these techniques enhances LLM +performance and reliability, especially in multi-step reasoning and complex +task execution. The paper also covers fine-tuning strategies, including +instruction fine-tuning, parameter-efficient methods like LoRA, and +Reinforcement Learning from Human Feedback (RLHF) as well as Reinforced +Self-Training (ReST). Additionally, it provides a comprehensive survey of +transformer architectures and training techniques for LLMs. The source code can +be accessed by contacting the author via email for a request. + +
+
+ comment: Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison, + LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset + Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing + Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite, + Comprehensive LLM Evaluation Toolkit +
+
+
+
+
+ + ♻ ☆ OmAgent: A Multi-modal Agent Framework for Complex Video Understanding + with Task Divide-and-Conquer + + +
+ Recent advancements in Large Language Models (LLMs) have expanded their +capabilities to multimodal contexts, including comprehensive video +understanding. However, processing extensive videos such as 24-hour CCTV +footage or full-length films presents significant challenges due to the vast +data and processing demands. Traditional methods, like extracting key frames or +converting frames to text, often result in substantial information loss. To +address these shortcomings, we develop OmAgent, which efficiently stores and +retrieves relevant video frames for specific queries, preserving the detailed +content of videos. Additionally, it features a Divide-and-Conquer Loop capable +of autonomous reasoning, dynamically invoking APIs and tools to enhance query +processing and accuracy. This approach ensures robust video understanding, +significantly reducing information loss. Experimental results affirm OmAgent's +efficacy in handling various types of videos and complex tasks. Moreover, we +have endowed it with greater autonomy and a robust tool-calling system, +enabling it to accomplish even more intricate tasks. + +
+
+
+
+
+ + ♻ ☆ Memory-Efficient Pseudo-Labeling for Online Source-Free Universal Domain + Adaptation using a Gaussian Mixture Model WACV + + +
+ In practice, domain shifts are likely to occur between training and test +data, necessitating domain adaptation (DA) to adjust the pre-trained source +model to the target domain. Recently, universal domain adaptation (UniDA) has +gained attention for addressing the possibility of an additional category +(label) shift between the source and target domain. This means new classes can +appear in the target data, some source classes may no longer be present, or +both at the same time. For practical applicability, UniDA methods must handle +both source-free and online scenarios, enabling adaptation without access to +the source data and performing batch-wise updates in parallel with prediction. +In an online setting, preserving knowledge across batches is crucial. However, +existing methods often require substantial memory, which is impractical because +memory is limited and valuable, in particular on embedded systems. Therefore, +we consider memory-efficiency as an additional constraint. To achieve +memory-efficient online source-free universal domain adaptation (SF-UniDA), we +propose a novel method that continuously captures the distribution of known +classes in the feature space using a Gaussian mixture model (GMM). This +approach, combined with entropy-based out-of-distribution detection, allows for +the generation of reliable pseudo-labels. Finally, we combine a contrastive +loss with a KL divergence loss to perform the adaptation. Our approach not only +achieves state-of-the-art results in all experiments on the DomainNet and +Office-Home datasets but also significantly outperforms the existing methods on +the challenging VisDA-C dataset, setting a new benchmark for online SF-UniDA. +Our code is available at https://github.com/pascalschlachter/GMM. + +
+
+ comment: Accepted at IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ HYPNOS : Highly Precise Foreground-focused Diffusion Finetuning for + Inanimate Objects ACCV + + +
+ In recent years, personalized diffusion-based text-to-image generative tasks +have been a hot topic in computer vision studies. A robust diffusion model is +determined by its ability to perform near-perfect reconstruction of certain +product outcomes given few related input samples. Unfortunately, the current +prominent diffusion-based finetuning technique falls short in maintaining the +foreground object consistency while being constrained to produce diverse +backgrounds in the image outcome. In the worst scenario, the overfitting issue +may occur, meaning that the foreground object is less controllable due to the +condition above, for example, the input prompt information is transferred +ambiguously to both foreground and background regions, instead of the supposed +background region only. To tackle the issues above, we proposed Hypnos, a +highly precise foreground-focused diffusion finetuning technique. On the image +level, this strategy works best for inanimate object generation tasks, and to +do so, Hypnos implements two main approaches, namely: (i) a content-centric +prompting strategy and (ii) the utilization of our additional +foreground-focused discriminative module. The utilized module is connected with +the diffusion model and finetuned with our proposed set of supervision +mechanisms. Combining the strategies above yielded the foreground-background +disentanglement capability of the diffusion model. Our experimental results +showed that the proposed strategy gave a more robust performance and visually +pleasing results compared to the former technique. For better elaborations, we +also provided extensive studies to assess the fruitful outcomes above, which +reveal how personalization behaves in regard to several training conditions. + +
+
+ comment: 26 pages, 12 figures, to appear on the Rich Media with Generative AI + workshop in conjunction with Asian Conference on Computer Vision (ACCV) 2024 +
+
+
+
+
+ + ♻ ☆ LEO: Generative Latent Image Animator for Human Video Synthesis + + +
+ Spatio-temporal coherency is a major challenge in synthesizing high quality +videos, particularly in synthesizing human videos that contain rich global and +local deformations. To resolve this challenge, previous approaches have +resorted to different features in the generation process aimed at representing +appearance and motion. However, in the absence of strict mechanisms to +guarantee such disentanglement, a separation of motion from appearance has +remained challenging, resulting in spatial distortions and temporal jittering +that break the spatio-temporal coherency. Motivated by this, we here propose +LEO, a novel framework for human video synthesis, placing emphasis on +spatio-temporal coherency. Our key idea is to represent motion as a sequence of +flow maps in the generation process, which inherently isolate motion from +appearance. We implement this idea via a flow-based image animator and a Latent +Motion Diffusion Model (LMDM). The former bridges a space of motion codes with +the space of flow maps, and synthesizes video frames in a warp-and-inpaint +manner. LMDM learns to capture motion prior in the training data by +synthesizing sequences of motion codes. Extensive quantitative and qualitative +analysis suggests that LEO significantly improves coherent synthesis of human +videos over previous methods on the datasets TaichiHD, FaceForensics and +CelebV-HQ. In addition, the effective disentanglement of appearance and motion +in LEO allows for two additional tasks, namely infinite-length human video +synthesis, as well as content-preserving video editing. + +
+
+ comment: IJCV 2024, Project webpage: https://wyhsirius.github.io/LEO-project/ +
+
+
+
+
+ + ♻ ☆ Decoupling Fine Detail and Global Geometry for Compressed Depth Map + Super-Resolution ECCV 2024 + + +
+ Recovering high-quality depth maps from compressed sources has gained +significant attention due to the limitations of consumer-grade depth cameras +and the bandwidth restrictions during data transmission. However, current +methods still suffer from two challenges. First, bit-depth compression produces +a uniform depth representation in regions with subtle variations, hindering the +recovery of detailed information. Second, densely distributed random noise +reduces the accuracy of estimating the global geometric structure of the scene. +To address these challenges, we propose a novel framework, termed +geometry-decoupled network (GDNet), for compressed depth map super-resolution +that decouples the high-quality depth map reconstruction process by handling +global and detailed geometric features separately. To be specific, we propose +the fine geometry detail encoder (FGDE), which is designed to aggregate fine +geometry details in high-resolution low-level image features while +simultaneously enriching them with complementary information from +low-resolution context-level image features. In addition, we develop the global +geometry encoder (GGE) that aims at suppressing noise and extracting global +geometric information effectively via constructing compact feature +representation in a low-rank space. We conduct experiments on multiple +benchmark datasets, demonstrating that our GDNet significantly outperforms +current methods in terms of geometric consistency and detail recovery. In the +ECCV 2024 AIM Compressed Depth Upsampling Challenge, our solution won the 1st +place award. Our codes will be available. + +
+
+ comment: The 1st place award for the ECCV 2024 AIM Compressed Depth Upsampling + Challenge +
+
+
+
+
+ + ♻ ☆ High-throughput 3D shape completion of potato tubers on a harvester + + +
+ Potato yield is an important metric for farmers to further optimize their +cultivation practices. Potato yield can be estimated on a harvester using an +RGB-D camera that can estimate the three-dimensional (3D) volume of individual +potato tubers. A challenge, however, is that the 3D shape derived from RGB-D +images is only partially completed, underestimating the actual volume. To +address this issue, we developed a 3D shape completion network, called CoRe++, +which can complete the 3D shape from RGB-D images. CoRe++ is a deep learning +network that consists of a convolutional encoder and a decoder. The encoder +compresses RGB-D images into latent vectors that are used by the decoder to +complete the 3D shape using the deep signed distance field network (DeepSDF). +To evaluate our CoRe++ network, we collected partial and complete 3D point +clouds of 339 potato tubers on an operational harvester in Japan. On the 1425 +RGB-D images in the test set (representing 51 unique potato tubers), our +network achieved a completion accuracy of 2.8 mm on average. For volumetric +estimation, the root mean squared error (RMSE) was 22.6 ml, and this was better +than the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml). +We found that the RMSE can be further reduced to 18.2 ml when performing the 3D +shape completion in the center of the RGB-D image. With an average 3D shape +completion time of 10 milliseconds per tuber, we can conclude that CoRe++ is +both fast and accurate enough to be implemented on an operational harvester for +high-throughput potato yield estimation. CoRe++'s high-throughput and accurate +processing allows it to be applied to other tuber, fruit and vegetable crops, +thereby enabling versatile, accurate and real-time yield monitoring in +precision agriculture. Our code, network weights and dataset are publicly +available at https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git. + +
+
+ comment: 20 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ TransAgent: Transfer Vision-Language Foundation Models with + Heterogeneous Agent Collaboration NeurIPS 2024 + + +
+ Vision-language foundation models (such as CLIP) have recently shown their +power in transfer learning, owing to large-scale image-text pre-training. +However, target domain data in the downstream tasks can be highly different +from the pre-training phase, which makes it hard for such a single model to +generalize well. Alternatively, there exists a wide range of expert models that +contain diversified vision and/or language knowledge pre-trained on different +modalities, tasks, networks, and datasets. Unfortunately, these models are +"isolated agents" with heterogeneous structures, and how to integrate their +knowledge for generalizing CLIP-like models has not been fully explored. To +bridge this gap, we propose a general and concise TransAgent framework, which +transports the knowledge of the isolated agents in a unified manner, and +effectively guides CLIP to generalize with multi-source knowledge distillation. +With such a distinct framework, we flexibly collaborate with 11 heterogeneous +agents to empower vision-language foundation models, without further cost in +the inference phase. Finally, our TransAgent achieves state-of-the-art +performance on 11 visual recognition datasets. Under the same low-shot setting, +it outperforms the popular CoOp with around 10% on average, and 20% on EuroSAT +which contains large domain shifts. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? EMNLP 2024 + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Zero-Shot NAS via the Suppression of Local Entropy Decrease + + +
+ Architecture performance evaluation is the most time-consuming part of neural +architecture search (NAS). Zero-Shot NAS accelerates the evaluation by +utilizing zero-cost proxies instead of training. Though effective, existing +zero-cost proxies require invoking backpropagations or running networks on +input data, making it difficult to further accelerate the computation of +proxies. To alleviate this issue, architecture topologies are used to evaluate +the performance of networks in this study. We prove that particular +architectural topologies decrease the local entropy of feature maps, which +degrades specific features to a bias, thereby reducing network performance. +Based on this proof, architectural topologies are utilized to quantify the +suppression of local entropy decrease (SED) as a data-free and running-free +proxy. Experimental results show that SED outperforms most state-of-the-art +proxies in terms of architecture selection on five benchmarks, with computation +time reduced by three orders of magnitude. We further compare the SED-based NAS +with state-of-the-art proxies. SED-based NAS selects the architecture with +higher accuracy and fewer parameters in only one second. The theoretical +analyses of local entropy and experimental results demonstrate that the +suppression of local entropy decrease facilitates selecting optimal +architectures in Zero-Shot NAS. + +
+
+ comment: 8 pages, 2 figures. Corrected typos and latex template +
+
+
+
+
+ + ♻ ☆ CALoR: Towards Comprehensive Model Inversion Defense + + +
+ Model Inversion Attacks (MIAs) aim at recovering privacy-sensitive training +data from the knowledge encoded in the released machine learning models. Recent +advances in the MIA field have significantly enhanced the attack performance +under multiple scenarios, posing serious privacy risks of Deep Neural Networks +(DNNs). However, the development of defense strategies against MIAs is +relatively backward to resist the latest MIAs and existing defenses fail to +achieve further trade-off between model utility and model robustness. In this +paper, we provide an in-depth analysis from the perspective of intrinsic +vulnerabilities of MIAs, comprehensively uncovering the weaknesses inherent in +the basic pipeline, which are partially investigated in the previous defenses. +Building upon these new insights, we propose a robust defense mechanism, +integrating Confidence Adaptation and Low-Rank compression (CALoR). Our method +includes a novel robustness-enhanced classification loss specially-designed for +model inversion defenses and reveals the extraordinary effectiveness of +compressing the classification header. With CALoR, we can mislead the +optimization objective, reduce the leaked information and impede the +backpropagation of MIAs, thus mitigating the risk of privacy leakage. Extensive +experimental results demonstrate that our method achieves state-of-the-art +(SOTA) defense performance against MIAs and exhibits superior generalization to +existing defenses across various scenarios. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Transformer-Based Tooth Alignment Prediction With Occlusion And + Collision Constraints + + +
+ The planning of digital orthodontic treatment requires providing tooth +alignment, which not only consumes a lot of time and labor to determine +manually but also relies heavily on clinical experience. In this work, we +proposed a lightweight tooth alignment neural network based on +Swin-transformer. We first re-organized 3D point clouds based on virtual arch +lines and converted them into order-sorted multi-channel textures, which +improves the accuracy and efficiency simultaneously. We then designed two new +occlusal loss functions that quantitatively evaluate the occlusal relationship +between the upper and lower jaws. They are important clinical constraints, +first introduced to the best of our knowledge, and lead to cutting-edge +prediction accuracy. To train our network, we collected a large digital +orthodontic dataset that has 591 clinical cases, including various complex +clinical cases. This dataset will benefit the community after its release since +there is no open dataset so far. Furthermore, we also proposed two new +orthodontic dataset augmentation methods considering tooth spatial distribution +and occlusion. We evaluated our method with this dataset and extensive +experiments, including comparisons with STAT methods and ablation studies, and +demonstrate the high prediction accuracy of our method. + +
+
+ comment: Modify formatting errors, optimize content layout +
+
+
+
+
+ + ♻ ☆ Boosting Open-Domain Continual Learning via Leveraging Intra-domain + Category-aware Prototype + + +
+ Despite recent progress in enhancing the efficacy of Open-Domain Continual +Learning (ODCL) in Vision-Language Models (VLM), failing to (1) correctly +identify the Task-ID of a test image and (2) use only the category set +corresponding to the Task-ID, while preserving the knowledge related to each +domain, cannot address the two primary challenges of ODCL: forgetting old +knowledge and maintaining zero-shot capabilities, as well as the confusions +caused by category-relatedness between domains. In this paper, we propose a +simple yet effective solution: leveraging intra-domain category-aware +prototypes for ODCL in CLIP (DPeCLIP), where the prototype is the key to +bridging the above two processes. Concretely, we propose a training-free +Task-ID discriminator method, by utilizing prototypes as classifiers for +identifying Task-IDs. Furthermore, to maintain the knowledge corresponding to +each domain, we incorporate intra-domain category-aware prototypes as domain +prior prompts into the training process. Extensive experiments conducted on 11 +different datasets demonstrate the effectiveness of our approach, achieving +2.37% and 1.14% average improvement in class-incremental and task-incremental +settings, respectively. + +
+
+
+
+
+ + ♻ ☆ SCSA: Exploring the Synergistic Effects Between Spatial and Channel + Attention + + +
+ Channel and spatial attentions have respectively brought significant +improvements in extracting feature dependencies and spatial structure relations +for various downstream vision tasks. While their combination is more beneficial +for leveraging their individual strengths, the synergy between channel and +spatial attentions has not been fully explored, failing to fully harness the +synergistic potential of multi-semantic information for feature guidance and +mitigation of semantic disparities. Our study attempts to reveal the +synergistic relationship between spatial and channel attention at multiple +semantic levels, proposing a novel Spatial and Channel Synergistic Attention +module (SCSA). Our SCSA consists of two parts: the Shareable Multi-Semantic +Spatial Attention (SMSA) and the Progressive Channel-wise Self-Attention +(PCSA). SMSA integrates multi-semantic information and utilizes a progressive +compression strategy to inject discriminative spatial priors into PCSA's +channel self-attention, effectively guiding channel recalibration. +Additionally, the robust feature interactions based on the self-attention +mechanism in PCSA further mitigate the disparities in multi-semantic +information among different sub-features within SMSA. We conduct extensive +experiments on seven benchmark datasets, including classification on +ImageNet-1K, object detection on MSCOCO 2017, segmentation on ADE20K, and four +other complex scene detection datasets. Our results demonstrate that our +proposed SCSA not only surpasses the current state-of-the-art attention but +also exhibits enhanced generalization capabilities across various task +scenarios. The code and models are available at: +https://github.com/HZAI-ZJNU/SCSA. + +
+
+ comment: We added experiments for the classification task and updated the + corresponding sections accordingly. The paper formatting has also been + revised +
+
+
+
+
+ + ♻ ☆ Improving Training-free Conditional Diffusion Model via Fisher + Information + + +
+ Training-free conditional diffusion models have received great attention in +conditional image generation tasks. However, they require a computationally +expensive conditional score estimator to steer the intermediate results of each +step in the reverse process toward the condition, which causes slow conditional +generation. In this paper, we propose a novel Fisher information-based +conditional diffusion (FICD) model to generate high-quality samples according +to the condition. In particular, we further explore the conditional term from +the perspective of Fisher information, where we show Fisher information can act +as a weight to measure the informativeness of the condition in each generation +step. According to this new perspective, we can control and gain more +information along the conditional direction in the generation space. Thus, we +propose the upper bound of the Fisher information to reformulate the +conditional term, which increases the information gain and decreases the time +cost. Experimental results also demonstrate that the proposed FICD can offer up +to 2x speed-ups under the same sampling steps as most baselines. Meanwhile, +FICD can improve the generation quality in various tasks compared to the +baselines with a low computation cost. + +
+
+
+
+
+ + ♻ ☆ CIMIL-CRC: a clinically-informed multiple instance learning framework + for patient-level colorectal cancer molecular subtypes classification from + H\&E stained images + + +
+ Treatment approaches for colorectal cancer (CRC) are highly dependent on the +molecular subtype, as immunotherapy has shown efficacy in cases with +microsatellite instability (MSI) but is ineffective for the microsatellite +stable (MSS) subtype. There is promising potential in utilizing deep neural +networks (DNNs) to automate the differentiation of CRC subtypes by analyzing +Hematoxylin and Eosin (H\&E) stained whole-slide images (WSIs). Due to the +extensive size of WSIs, Multiple Instance Learning (MIL) techniques are +typically explored. However, existing MIL methods focus on identifying the most +representative image patches for classification, which may result in the loss +of critical information. Additionally, these methods often overlook clinically +relevant information, like the tendency for MSI class tumors to predominantly +occur on the proximal (right side) colon. We introduce `CIMIL-CRC', a DNN +framework that: 1) solves the MSI/MSS MIL problem by efficiently combining a +pre-trained feature extraction model with principal component analysis (PCA) to +aggregate information from all patches, and 2) integrates clinical priors, +particularly the tumor location within the colon, into the model to enhance +patient-level classification accuracy. We assessed our CIMIL-CRC method using +the average area under the curve (AUC) from a 5-fold cross-validation +experimental setup for model development on the TCGA-CRC-DX cohort, contrasting +it with a baseline patch-level classification, MIL-only approach, and +Clinically-informed patch-level classification approach. Our CIMIL-CRC +outperformed all methods (AUROC: $0.92\pm0.002$ (95\% CI 0.91-0.92), vs. +$0.79\pm0.02$ (95\% CI 0.76-0.82), $0.86\pm0.01$ (95\% CI 0.85-0.88), and +$0.87\pm0.01$ (95\% CI 0.86-0.88), respectively). The improvement was +statistically significant. + +
+
+ comment: Accepted to the journal 'Computer Methods and Programs in + Biomedicine' +
+
+
+
+
+ + ♻ ☆ Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image + Translation + + +
+ The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology +are Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides +offer high quality histopathological images but require a labor-intensive +acquisition process. In contrast, FF slides can be prepared quickly, but the +image quality is relatively poor. Our task is to translate FF images into FFPE +style, thereby improving the image quality for diagnostic purposes. In this +paper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological +image translation using a pre-trained diffusion model. Specifically, we employ +a one-step diffusion model as the generator and fine-tune it with LoRA adapters +using adversarial learning objectives. To ensure that the model effectively +captures both global structural information and local details, we propose a +multi-scale feature fusion (MFF) module. This module utilizes two VAE encoders +to extract features of varying image sizes and performs feature fusion before +feeding them into the UNet. Furthermore, we utilize a pre-trained +vision-language model for histopathology as the backbone for the discriminator +to further improve performance. We conducted FF-to-FFPE translation experiments +on the TCGA-NSCLC datasets, and our method achieved better performance compared +to other methods. The code and models are released at +https://github.com/QilaiZhang/Diffusion-FFPE. + +
+
+
+
+
+ + ♻ ☆ Add-it: Training-Free Object Insertion in Images With Pretrained + Diffusion Models + + +
+ Adding objects into images based on text instructions is a challenging task in +semantic image editing, requiring a balance between preserving the original +scene and seamlessly integrating the new object in a fitting location. Despite +extensive efforts, existing models often struggle with this balance, +particularly with finding a natural location for adding an object in complex +scenes. We introduce Add-it, a training-free approach that extends diffusion +models' attention mechanisms to incorporate information from three key sources: +the scene image, the text prompt, and the generated image itself. Our weighted +extended-attention mechanism maintains structural consistency and fine details +while ensuring natural object placement. Without task-specific fine-tuning, +Add-it achieves state-of-the-art results on both real and generated image +insertion benchmarks, including our newly constructed "Additing Affordance +Benchmark" for evaluating object placement plausibility, outperforming +supervised methods. Human evaluations show that Add-it is preferred in over 80% +of cases, and it also demonstrates improvements in various automated metrics. + +
+
+ comment: Project page is at https://research.nvidia.com/labs/par/addit/ +
+
+
+
+
+ + ♻ ☆ Style Transfer: From Stitching to Neural Networks + + +
+ This article compares two style transfer methods in image processing: the +traditional method, which synthesizes new images by stitching together small +patches from existing images, and a modern machine learning-based approach that +uses a segmentation network to isolate foreground objects and apply style +transfer solely to the background. The traditional method excels in creating +artistic abstractions but can struggle with seamlessness, whereas the machine +learning method preserves the integrity of foreground elements while enhancing +the background, offering improved aesthetic quality and computational +efficiency. Our study indicates that machine learning-based methods are more +suited for real-world applications where detail preservation in foreground +elements is essential. + +
+
+
+
+
+ + ♻ ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +comparable results with GAN due to better ability on complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ SplatFormer: Point Transformer for Robust 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) has recently transformed photorealistic +reconstruction, achieving high visual fidelity and real-time performance. +However, rendering quality significantly deteriorates when test views deviate +from the camera angles used during training, posing a major challenge for +applications in immersive free-viewpoint rendering and navigation. In this +work, we conduct a comprehensive evaluation of 3DGS and related novel view +synthesis methods under out-of-distribution (OOD) test camera scenarios. By +creating diverse test cases with synthetic and real-world datasets, we +demonstrate that most existing methods, including those incorporating various +regularization techniques and data-driven priors, struggle to generalize +effectively to OOD views. To address this limitation, we introduce SplatFormer, +the first point transformer model specifically designed to operate on Gaussian +splats. SplatFormer takes as input an initial 3DGS set optimized under limited +training views and refines it in a single forward pass, effectively removing +potential artifacts in OOD test views. To our knowledge, this is the first +successful application of point transformers directly on 3DGS sets, surpassing +the limitations of previous multi-scene training methods, which could handle +only a restricted number of input views during inference. Our model +significantly improves rendering quality under extreme novel views, achieving +state-of-the-art performance in these challenging scenarios and outperforming +various 3DGS regularization techniques, multi-scene models tailored for sparse +view synthesis, and diffusion-based frameworks. + +
+
+ comment: Code and dataset: https://github.com/ChenYutongTHU/SplatFormer + Project page: https://sergeyprokudin.github.io/splatformer/ +
+
+
+
+
+ + ♻ ☆ Enhance Image-to-Image Generation with LLaVA-generated Prompts + + +
+ This paper presents a novel approach to enhance image-to-image generation by +leveraging the multimodal capabilities of the Large Language and Vision +Assistant (LLaVA). We propose a framework where LLaVA analyzes input images and +generates textual descriptions, hereinafter LLaVA-generated prompts. These +prompts, along with the original image, are fed into the image-to-image +generation pipeline. This enriched representation guides the generation process +towards outputs that exhibit a stronger resemblance to the input image. +Extensive experiments demonstrate the effectiveness of LLaVA-generated prompts +in promoting image similarity. We observe a significant improvement in the +visual coherence between the generated and input images compared to traditional +methods. Future work will explore fine-tuning LLaVA prompts for increased +control over the creative process. By providing more specific details within +the prompts, we aim to achieve a delicate balance between faithfulness to the +original image and artistic expression in the generated outputs. + +
+
+ comment: Accepted by 2024 5th International Conference on Information Science, + Parallel and Distributed Systems +
+
+
+
+
+ + ♻ ☆ Temporal-Mapping Photography for Event Cameras + + +
+ Event cameras, or Dynamic Vision Sensors (DVS), are novel neuromorphic sensors +that capture brightness changes as a continuous stream of "events" rather than +traditional intensity frames. Converting sparse events to dense intensity +frames faithfully has long been an ill-posed problem. Previous methods have +primarily focused on converting events to video in dynamic scenes or with a +moving camera. In this paper, for the first time, we realize events to dense +intensity image conversion using a stationary event camera in static scenes +with a transmittance adjustment device for brightness modulation. Different +from traditional methods that mainly rely on event integration, the proposed +Event-Based Temporal Mapping Photography (EvTemMap) measures the time of event +emitting for each pixel. Then, the resulting Temporal Matrix is converted to an +intensity frame with a temporal mapping neural network. At the hardware level, +the proposed EvTemMap is implemented by combining a transmittance adjustment +device with a DVS, named Adjustable Transmittance Dynamic Vision Sensor +(AT-DVS). Additionally, we collected the TemMat dataset under various conditions +including low-light and high dynamic range scenes. The experimental results +showcase the high dynamic range, fine-grained details, and high-grayscale +resolution of the proposed EvTemMap. The code and dataset are available at +https://github.com/YuHanBaozju/EvTemMap + +
+
+ comment: 18 pages, 10 figures, 1 Supplementary materials +
+
+
+
+
+ + ♻ ☆ GenRec: Unifying Video Generation and Recognition with Diffusion Models + + +
+ Video diffusion models are able to generate high-quality videos by learning +strong spatial-temporal priors on large-scale datasets. In this paper, we aim +to investigate whether such priors derived from a generative process are +suitable for video recognition, and eventually joint optimization of generation +and recognition. Building upon Stable Video Diffusion, we introduce GenRec, the +first unified framework trained with a random-frame conditioning process so as +to learn generalized spatial-temporal representations. The resulting framework +can naturally support generation and recognition, and more importantly is +robust even when visual inputs contain limited information. Extensive +experiments demonstrate the efficacy of GenRec for both recognition and +generation. In particular, GenRec achieves competitive recognition performance, +offering 75.8% and 87.2% accuracy on SSV2 and K400, respectively. GenRec also +performs the best on class-conditioned image-to-video generation, achieving +46.5 and 49.3 FVD scores on SSV2 and EK-100 datasets. Furthermore, GenRec +demonstrates extraordinary robustness in scenarios where only limited frames can +be observed. Code will be available at https://github.com/wengzejia1/GenRec. + +
+
+ comment: 19 pages, 6 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Reminding Multimodal Large Language Models of Object-aware Knowledge + with Retrieved Tags EMNLP 2024 + + +
+ Despite recent advances in the general visual instruction-following ability +of Multimodal Large Language Models (MLLMs), they still struggle with critical +problems when required to provide a precise and detailed response to a visual +instruction: (1) failure to identify novel objects or entities, (2) mention of +non-existent objects, and (3) neglect of object's attributed details. Intuitive +solutions include improving the size and quality of data or using larger +foundation models. They show effectiveness in mitigating these issues, but at +an expensive cost of collecting a vast amount of new data and introducing a +significantly larger model. Standing at the intersection of these approaches, +we examine the three object-oriented problems from the perspective of the +image-to-text mapping process by the multimodal connector. In this paper, we +first identify the limitations of multimodal connectors stemming from +insufficient training data. Driven by this, we propose to enhance the mapping +with retrieval-augmented tag tokens, which contain rich object-aware +information such as object names and attributes. With our Tag-grounded visual +instruction tuning with retrieval Augmentation (TUNA), we outperform baselines +that share the same language model and training data on 12 benchmarks. +Furthermore, we show the zero-shot capability of TUNA when provided with +specific datastores. + +
+
+ comment: Main Conference at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery CVPR2024 + + +
+ Marine surveys by robotic underwater and surface vehicles result in +substantial quantities of coral reef imagery; however, labeling these images is +expensive and time-consuming for domain experts. Point label propagation is a +technique that uses existing images labeled with sparse points to create +augmented ground truth data, which can be used to train a semantic segmentation +model. In this work, we show that recent advances in large foundation models +facilitate the creation of augmented ground truth masks using only features +extracted by the denoised version of the DINOv2 foundation model and K-Nearest +Neighbors (KNN), without any pre-training. For images with extremely sparse +labels, we present a labeling method based on human-in-the-loop principles, +which greatly enhances annotation efficiency: in the case that there are 5 +point labels per image, our human-in-the-loop method outperforms the prior +state-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9% +and 18.3% if there are 10 point labels. When human-in-the-loop labeling is not +available, using the denoised DINOv2 features with a KNN still improves on the +prior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid +points). On the semantic segmentation task, we outperform the prior +state-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5 +point labels are used for point label propagation. Additionally, we perform a +comprehensive study into the impacts of the point label placement style and the +number of points on the point label propagation quality, and make several +recommendations for improving the efficiency of labeling images with points. + +
+
+ comment: Journal article preprint of extended paper, 30 pages, 11 figures. + Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on + Learning with Limited Labelled Data for Image and Video Understanding + (L3D-IVU) +
+
+
+
+
+ + ♻ ☆ MMLongBench-Doc: Benchmarking Long-context Document Understanding with + Visualizations NeurIPS 2024 + + +
+ Understanding documents with rich layouts and multi-modal components is a +long-standing and practical task. Recent Large Vision-Language Models (LVLMs) +have made remarkable strides in various tasks, particularly in single-page +document understanding (DU). However, their abilities on long-context DU remain +an open problem. This work presents MMLongBench-Doc, a long-context, +multi-modal benchmark comprising 1,062 expert-annotated questions. Distinct +from previous datasets, it is constructed upon 130 lengthy PDF-formatted +documents with an average of 49.4 pages and 20,971 textual tokens. Towards +comprehensive evaluation, answers to these questions rely on pieces of evidence +from (1) different sources (text, image, chart, table, and layout structure) +and (2) various locations (i.e. page number). Moreover, 33.2% of the questions +are cross-page questions requiring evidence across multiple pages. 22.8% of the +questions are designed to be unanswerable for detecting potential +hallucinations. Experiments on 14 LVLMs demonstrate that long-context DU +greatly challenges current models. Notably, the best-performing model, GPT-4o, +achieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores +31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse +performance than their LLM counterparts which are fed with lossy-parsed OCR +documents. These results validate the necessity of future research toward more +capable long-context LVLMs. Project Page: +https://mayubo2333.github.io/MMLongBench-Doc + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight) +
+
+
+
+
+ + ♻ ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for + Robotic Manipulation + + +
+ Representing robotic manipulation tasks as constraints that associate the +robot and the environment is a promising way to encode desired robot behaviors. +However, it remains unclear how to formulate the constraints such that they are +1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable +by off-the-shelf solvers to produce robot actions in real-time. In this work, +we introduce Relational Keypoint Constraints (ReKep), a visually-grounded +representation for constraints in robotic manipulation. Specifically, ReKep is +expressed as Python functions mapping a set of 3D keypoints in the environment +to a numerical cost. We demonstrate that by representing a manipulation task as +a sequence of Relational Keypoint Constraints, we can employ a hierarchical +optimization procedure to solve for robot actions (represented by a sequence of +end-effector poses in SE(3)) with a perception-action loop at a real-time +frequency. Furthermore, in order to circumvent the need for manual +specification of ReKep for each new task, we devise an automated procedure that +leverages large vision models and vision-language models to produce ReKep from +free-form language instructions and RGB-D observations. We present system +implementations on a wheeled single-arm platform and a stationary dual-arm +platform that can perform a large variety of manipulation tasks, featuring +multi-stage, in-the-wild, bimanual, and reactive behaviors, all without +task-specific data or environment models. Website at +https://rekep-robot.github.io/. + +
+
+
+
+
+ + ♻ ☆ MIRAGE: Multimodal Identification and Recognition of Annotations in + Indian General Prescriptions + + +
+ Hospitals in India still rely on handwritten medical records despite the +availability of Electronic Medical Records (EMR), complicating statistical +analysis and record retrieval. Handwritten records pose a unique challenge, +requiring specialized data for training models to recognize medications and +their recommendation patterns. While traditional handwriting recognition +approaches employ 2-D LSTMs, recent studies have explored using Multimodal +Large Language Models (MLLMs) for OCR tasks. Building on this approach, we +focus on extracting medication names and dosages from simulated medical +records. Our methodology MIRAGE (Multimodal Identification and Recognition of +Annotations in indian GEneral prescriptions) involves fine-tuning the QWEN VL, +LLaVA 1.6 and Idefics2 models on 743,118 high resolution simulated medical +record images-fully annotated from 1,133 doctors across India. Our approach +achieves 82% accuracy in extracting medication names and dosages. + +
+
+ comment: 5 pages, 9 figures, 3 tables, submitted to ISBI 2025 +
+
+
+
+
+ + ♻ ☆ DreamScape: 3D Scene Creation via Gaussian Splatting joint Correlation + Modeling + + +
+ Recent progress in text-to-3D creation has been propelled by integrating the +potent prior of Diffusion Models from text-to-image generation into the 3D +domain. Nevertheless, generating 3D scenes characterized by multiple instances +and intricate arrangements remains challenging. In this study, we present +DreamScape, a method for creating highly consistent 3D scenes solely from +textual descriptions, leveraging the strong 3D representation capabilities of +Gaussian Splatting and the complex arrangement abilities of large language +models (LLMs). Our approach involves a 3D Gaussian Guide ($3{DG^2}$) for scene +representation, consisting of semantic primitives (objects) and their spatial +transformations and relationships derived directly from text prompts using +LLMs. This compositional representation allows for local-to-global optimization +of the entire scene. A progressive scale control is tailored during local +object generation, ensuring that objects of different sizes and densities adapt +to the scene, which addresses training instability issue arising from simple +blending in the subsequent global optimization stage. To mitigate potential +biases of LLM priors, we model collision relationships between objects at the +global level, enhancing physical correctness and overall realism. Additionally, +to generate pervasive objects like rain and snow distributed extensively across +the scene, we introduce a sparse initialization and densification strategy. +Experiments demonstrate that DreamScape offers high usability and +controllability, enabling the generation of high-fidelity 3D scenes from only +text prompts and achieving state-of-the-art performance compared to other +methods. + +
+
+
+
+
+ + ♻ ☆ LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated + Image Detection CVPR 2024 + + +
+ The evolution of Diffusion Models has dramatically improved image generation +quality, making it increasingly difficult to differentiate between real and +generated images. This development, while impressive, also raises significant +privacy and security concerns. In response to this, we propose a novel Latent +REconstruction error guided feature REfinement method (LaRE^2) for detecting +the diffusion-generated images. We come up with the Latent Reconstruction Error +(LaRE), the first reconstruction-error based feature in the latent space for +generated image detection. LaRE surpasses existing methods in terms of feature +extraction efficiency while preserving crucial cues required to differentiate +between the real and the fake. To exploit LaRE, we propose an Error-Guided +feature REfinement module (EGRE), which can refine the image feature guided by +LaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an +align-then-refine mechanism, which effectively refines the image feature for +generated-image detection from both spatial and channel perspectives. Extensive +experiments on the large-scale GenImage benchmark demonstrate the superiority +of our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1% +average ACC/AP across 8 different image generators. LaRE also surpasses +existing methods in terms of feature extraction cost, delivering an impressive +speed enhancement of 8 times. Code is available. + +
+
+ comment: CVPR 2024. Code is available at https://github.com/luo3300612/LaRE +
+
+
+
+
+ + ♻ ☆ WildScenes: A Benchmark for 2D and 3D Semantic Segmentation in + Large-scale Natural Environments + + +
+ Recent progress in semantic scene understanding has primarily been enabled by +the availability of semantically annotated bi-modal (camera and LiDAR) datasets +in urban environments. However, such annotated datasets are also needed for +natural, unstructured environments to enable semantic perception for +applications, including conservation, search and rescue, environment +monitoring, and agricultural automation. Therefore, we introduce $WildScenes$, +a bi-modal benchmark dataset consisting of multiple large-scale, sequential +traversals in natural environments, including semantic annotations in +high-resolution 2D images and dense 3D LiDAR point clouds, and accurate 6-DoF +pose information. The data is (1) trajectory-centric with accurate localization +and globally aligned point clouds, (2) calibrated and synchronized to support +bi-modal training and inference, and (3) containing different natural +environments over 6 months to support research on domain adaptation. Our 3D +semantic labels are obtained via an efficient, automated process that transfers +the human-annotated 2D labels from multiple views into 3D point cloud +sequences, thus circumventing the need for expensive and time-consuming human +annotation in 3D. We introduce benchmarks on 2D and 3D semantic segmentation +and evaluate a variety of recent deep-learning techniques to demonstrate the +challenges in semantic segmentation in natural environments. We propose +train-val-test splits for standard benchmarks as well as domain adaptation +benchmarks and utilize an automated split generation technique to ensure the +balance of class label distributions. The $WildScenes$ benchmark webpage is +https://csiro-robotics.github.io/WildScenes, and the data is publicly available +at https://data.csiro.au/collection/csiro:61541 . + +
+
+ comment: Accepted in The International Journal of Robotics Research (IJRR) +
+
+
+
+
+ + ♻ ☆ Utilizing Graph Generation for Enhanced Domain Adaptive Object Detection + + +
+ The problem of Domain Adaptive in the field of Object Detection involves the +transfer of object detection models from labeled source domains to unannotated +target domains. Recent advancements in this field aim to address domain +discrepancies by aligning pixel-pairs across domains within a non-Euclidean +graphical space, thereby minimizing semantic distribution variance. Despite +their remarkable achievements, these methods often use coarse semantic +representations to model graphs, mainly due to ignoring non-informative +elements and failing to focus on precise semantic alignment. Additionally, the +generation of coarse graphs inherently introduces abnormal nodes, posing +challenges and potentially biasing domain adaptation outcomes. Consequently, we +propose a framework, which utilizes the Graph Generation to enhance the quality +of DAOD (\method{}). Specifically, we introduce a Node Refinement module that +utilizes a memory bank to reconstruct noisy sampled nodes while applying +contrastive regularization to noisy features. To enhance semantic alignment, we +propose separating domain-specific styles from category invariance encoded +within graph covariances, which allows us to selectively remove domain-specific +styles while preserving category-invariant information, thus facilitating more +accurate semantic alignment across different domains. Furthermore, we propose a +Graph Optimization adaptor, leveraging variational inference to mitigate the +impact of abnormal nodes. Extensive experimentation across three adaptation +benchmarks validates that \method{} achieves state-of-the-art performance in +the task of unsupervised domain adaptation. + +
+
+
+
+
+ + ♻ ☆ Neural Gaffer: Relighting Any Object via Diffusion + + +
+ Single-image relighting is a challenging task that involves reasoning about +the complex interplay between geometry, materials, and lighting. Many prior +methods either support only specific categories of images, such as portraits, +or require special capture conditions, like using a flashlight. Alternatively, +some methods explicitly decompose a scene into intrinsic components, such as +normals and BRDFs, which can be inaccurate or under-expressive. In this work, +we propose a novel end-to-end 2D relighting diffusion model, called Neural +Gaffer, that takes a single image of any object and can synthesize an accurate, +high-quality relit image under any novel environmental lighting condition, +simply by conditioning an image generator on a target environment map, without +an explicit scene decomposition. Our method builds on a pre-trained diffusion +model, and fine-tunes it on a synthetic relighting dataset, revealing and +harnessing the inherent understanding of lighting present in the diffusion +model. We evaluate our model on both synthetic and in-the-wild Internet imagery +and demonstrate its advantages in terms of generalization and accuracy. +Moreover, by combining with other generative methods, our model enables many +downstream 2D tasks, such as text-based relighting and object insertion. Our +model can also operate as a strong relighting prior for 3D tasks, such as +relighting a radiance field. + +
+
+ comment: Project Website: https://neural-gaffer.github.io +
+
+
+
+
+ + ♻ ☆ Video Diffusion Models are Training-free Motion Interpreter and + Controller NeurIPS 2024 + + +
+ Video generation primarily aims to model authentic and customized motion +across frames, making understanding and controlling the motion a crucial topic. +Most diffusion-based studies on video motion focus on motion customization with +training-based paradigms, which, however, demands substantial training +resources and necessitates retraining for diverse models. Crucially, these +approaches do not explore how video diffusion models encode cross-frame motion +information in their features, lacking interpretability and transparency in +their effectiveness. To answer this question, this paper introduces a novel +perspective to understand, localize, and manipulate motion-aware features in +video diffusion models. Through analysis using Principal Component Analysis +(PCA), our work discloses that robust motion-aware feature already exists in +video diffusion models. We present a new MOtion FeaTure (MOFT) by eliminating +content correlation information and filtering motion channels. MOFT provides a +distinct set of benefits, including the ability to encode comprehensive motion +information with clear interpretability, extraction without the need for +training, and generalizability across diverse architectures. Leveraging MOFT, +we propose a novel training-free video motion control framework. Our method +demonstrates competitive performance in generating natural and faithful motion, +providing architecture-agnostic insights and applicability in a variety of +downstream tasks. + +
+
+ comment: Accepted by NeurIPS 2024. Project Page: + https://xizaoqu.github.io/moft/ +
+
+
+
+
+ + ♻ ☆ Diffusion Models Meet Remote Sensing: Principles, Methods, and + Perspectives + + +
+ As a newly emerging advance in deep generative models, diffusion models have +achieved state-of-the-art results in many fields, including computer vision, +natural language processing, and molecule design. The remote sensing (RS) +community has also noticed the powerful ability of diffusion models and quickly +applied them to a variety of tasks for image processing. Given the rapid +increase in research on diffusion models in the field of RS, it is necessary to +conduct a comprehensive review of existing diffusion model-based RS papers, to +help researchers recognize the potential of diffusion models and provide some +directions for further exploration. Specifically, this article first introduces +the theoretical background of diffusion models, and then systematically reviews +the applications of diffusion models in RS, including image generation, +enhancement, and interpretation. Finally, the limitations of existing RS +diffusion models and worthy research directions for further exploration are +discussed and summarized. + +
+
+
+
+
+ + ♻ ☆ Tuning Timestep-Distilled Diffusion Model Using Pairwise Sample + Optimization + + +
+ Recent advancements in timestep-distilled diffusion models have enabled +high-quality image generation that rivals non-distilled multi-step models, but +with significantly fewer inference steps. While such models are attractive for +applications due to the low inference cost and latency, fine-tuning them with a +naive diffusion objective would result in degraded and blurry outputs. An +intuitive alternative is to repeat the diffusion distillation process with a +fine-tuned teacher model, which produces good results but is cumbersome and +computationally intensive; the distillation training usually requires orders of magnitude +more training compute compared to fine-tuning for specific image styles. +In this paper, we present an algorithm named pairwise sample optimization +(PSO), which enables the direct fine-tuning of an arbitrary timestep-distilled +diffusion model. PSO introduces additional reference images sampled from the +current time-step distilled model, and increases the relative likelihood margin +between the training images and reference images. This enables the model to +retain its few-step generation ability, while allowing for fine-tuning of its +output distribution. We also demonstrate that PSO is a generalized formulation +which can be flexibly extended to both offline-sampled and online-sampled +pairwise data, covering various popular objectives for diffusion model +preference optimization. We evaluate PSO in both preference optimization and +other fine-tuning tasks, including style transfer and concept customization. We +show that PSO can directly adapt distilled models to human-preferred generation +with both offline and online-generated pairwise preference image data. PSO also +demonstrates effectiveness in style transfer and concept customization by +directly tuning timestep-distilled diffusion models. + +
+
+
+
+
+ + ♻ ☆ Functional Imaging Constrained Diffusion for Brain PET Synthesis from + Structural MRI + + +
+ Magnetic resonance imaging (MRI) and positron emission tomography (PET) are +increasingly used in multimodal analysis of neurodegenerative disorders. While +MRI is broadly utilized in clinical settings, PET is less accessible. Many +studies have attempted to use deep generative models to synthesize PET from MRI +scans. However, they often suffer from unstable training and inadequately +preserve brain functional information conveyed by PET. To this end, we propose +a functional imaging constrained diffusion (FICD) framework for 3D brain PET +image synthesis with paired structural MRI as input condition, through a new +constrained diffusion model (CDM). The FICD introduces noise to PET and then +progressively removes it with CDM, ensuring high output fidelity throughout a +stable training phase. The CDM learns to predict denoised PET with a functional +imaging constraint introduced to ensure voxel-wise alignment between each +denoised PET and its ground truth. Quantitative and qualitative analyses +conducted on 293 subjects with paired T1-weighted MRI and +18F-fluorodeoxyglucose (FDG)-PET scans suggest that FICD achieves superior +performance in generating FDG-PET data compared to state-of-the-art methods. We +further validate the effectiveness of the proposed FICD on data from a total of +1,262 subjects through three downstream tasks, with experimental results +suggesting its utility and generalizability. + +
+
+
+
+
+ + ♻ ☆ TraceFL: Interpretability-Driven Debugging in Federated Learning via + Neuron Provenance + + +
+ In Federated Learning, clients train models on local data and send updates to +a central server, which aggregates them into a global model using a fusion +algorithm. This collaborative yet privacy-preserving training comes at a +cost--FL developers face significant challenges in attributing global model +predictions to specific clients. Localizing responsible clients is a crucial +step towards (a) excluding clients primarily responsible for incorrect +predictions and (b) encouraging clients who contributed high-quality models to +continue participating in the future. Existing ML explainability approaches are +inherently inapplicable as they are designed for single-model, centralized +training. + We introduce TraceFL, a fine-grained neuron provenance capturing mechanism +that identifies clients responsible for the global model's prediction by +tracking the flow of information from individual clients to the global model. +Since inference on different inputs activates a different set of neurons of the +global model, TraceFL dynamically quantifies the significance of the global +model's neurons in a given prediction. It then selectively picks a slice of the +most crucial neurons in the global model and maps them to the corresponding +neurons in every participating client to determine each client's contribution, +ultimately localizing the responsible client. We evaluate TraceFL on six +datasets, including two real-world medical imaging datasets and four neural +networks, including advanced models such as GPT. TraceFL achieves 99% accuracy +in localizing the responsible client in FL tasks spanning both image and text +classification tasks. At a time when state-of-the-art ML debugging approaches +are mostly domain-specific (e.g., image classification only), TraceFL is the +first technique to enable highly accurate automated reasoning across a wide +range of FL applications. + +
+
+ comment: Accepted at 2025 IEEE/ACM 47th International Conference on Software + Engineering (ICSE) +
+
+
+
+
+ + ♻ ☆ Strike the Balance: On-the-Fly Uncertainty based User Interactions for + Long-Term Video Object Segmentation ACCV 2024 + + +
+ In this paper, we introduce a variant of video object segmentation (VOS) that +bridges interactive and semi-automatic approaches, termed Lazy Video Object +Segmentation (ziVOS). In contrast to both tasks, which handle video object +segmentation in an off-line manner (i.e., pre-recorded sequences), we propose +through ziVOS to target online recorded sequences. Here, we strive to strike a +balance between performance and robustness for long-term scenarios by +soliciting user feedback on-the-fly during the segmentation process. Hence, +we aim to maximize the tracking duration of an object of interest, while +requiring minimal user corrections to maintain tracking over an extended +period. We propose a competitive baseline, i.e., Lazy-XMem, as a reference for +future works in ziVOS. Our proposed approach uses an uncertainty estimation of +the tracking state to determine whether a user interaction is necessary to +refine the model's prediction. To quantitatively assess the performance of our +method and the user's workload, we introduce complementary metrics alongside +those already established in the field. We evaluate our approach using the +recently introduced LVOS dataset, which offers numerous long-term videos. Our +code is publicly available at https://github.com/Vujas-Eteph/LazyXMem. + +
+
+ comment: Accepted at ACCV 2024 +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (e.g., CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with five image classification datasets. + +
+
+
+
+
+ + ♻ ☆ LoTLIP: Improving Language-Image Pre-training for Long Text + Understanding + + +
+ Understanding long text is in great demand in practice but beyond the reach +of most language-image pre-training (LIP) models. In this work, we empirically +confirm that the key reason causing such an issue is that the training images +are usually paired with short captions, leaving certain tokens easily +overshadowed by salient tokens. Towards this problem, our initial attempt is to +relabel the data with long captions; however, directly learning with them may +lead to performance degradation in understanding short text (e.g., in the image +classification task). Then, after incorporating corner tokens to aggregate +diverse textual information, we manage to help the model catch up to its +original level of short text understanding yet greatly enhance its capability +of long text understanding. We further look into whether the model can +continuously benefit from longer captions and notice a clear trade-off between +the performance and the efficiency. Finally, we validate the effectiveness of +our approach using a self-constructed large-scale dataset, which consists of +100M long caption oriented text-image pairs. Our method demonstrates superior +performance in long-text-image retrieval tasks. The project page is available +at https://wuw2019.github.io/lot-lip. + +
+
+
+
+
+
+
+
+ + Systems and Control 42 + +
+
+
+ + ☆ Degradation mode estimation using reconstructed open circuit voltage + curves from multi-year home storage field data + + +
+ A battery's open circuit voltage (OCV) curve can be seen as its +electrochemical signature. Its shape and age-related shift provide information +on aging processes and material composition on both electrodes. However, most +OCV analyses have to be conducted in laboratories or specified field tests to +ensure suitable data quality. Here, we present a method that reconstructs the +OCV curve continuously over the lifetime of a battery using the operational +data of home storage field measurements over eight years. We show that +low-dynamic operational phases, such as the overnight household supply with +electricity, are suitable for recreating quasi OCV curves. We apply incremental +capacity analysis and differential voltage analysis and show that known +features of interest from laboratory measurements can be tracked to determine +degradation modes in field operation. The dominant degradation mode observed +for the home storage systems under evaluation is the loss of lithium inventory, +while the loss of active material might be present in some cases. We apply the +method to lithium nickel manganese cobalt oxide (NMC), a blend of lithium +manganese oxide (LMO) and NMC, and lithium iron phosphate (LFP) batteries. +Field capacity tests validate the method. + +
+
+ comment: 17 pages, 10 Figures, 1 Table +
+
+
+
+
+ + ☆ A Symmetry-Preserving Reduced-Order Observer + + +
+ A symmetry-preserving, reduced-order state observer is presented for the +unmeasured part of a system's state, where the nonlinear system dynamics +exhibit symmetry under the action of a Lie group. The proposed observer takes +advantage of this symmetry through the use of a moving frame that constructs +invariant mappings of the measurements. Sufficient conditions for the observer +to be asymptotically stable are developed by studying the stability of an +invariant error system. As an illustrative example, the observer is applied to +the problem of rigid-body velocity estimation, which demonstrates how exploiting +the symmetry of the system can simplify the stabilization of the estimation +error dynamics. + +
+
+ comment: 6 pages, 6 figures, Submission to the 2025 American Control + Conference +
+
+
+
+
+ + ☆ Optimal Control of Mechanical Ventilators with Learned Respiratory + Dynamics + + +
+ Deciding on appropriate mechanical ventilator management strategies +significantly impacts the health outcomes for patients with respiratory +diseases. Acute Respiratory Distress Syndrome (ARDS) is one such disease that +requires careful ventilator operation to be effectively treated. In this work, +we frame the management of ventilators for patients with ARDS as a sequential +decision making problem using the Markov decision process framework. We +implement and compare controllers based on clinical guidelines contained in the +ARDSnet protocol, optimal control theory, and learned latent dynamics +represented as neural networks. The Pulse Physiology Engine's respiratory +dynamics simulator is used to establish a repeatable benchmark, gather +simulated data, and quantitatively compare these controllers. We score +performance in terms of measured improvement in established ARDS health markers +(pertaining to improved respiratory rate, oxygenation, and vital signs). Our +results demonstrate that techniques leveraging neural networks and optimal +control can automatically discover effective ventilation management strategies +without access to explicit ventilator management procedures or guidelines (such +as those defined in the ARDSnet protocol). + +
+
+ comment: 2024 IEEE 37th International Symposium on Computer-Based Medical + Systems (CBMS), 7 pages, 3 figures +
+
+
+
+
+ + ☆ Control-Oriented Models Inform Synthetic Biology Strategies in CAR T + Cell Immunotherapy + + +
+ Chimeric antigen receptor (CAR) T cell therapy is revolutionizing the +treatment of blood cancers. Mathematical models that can predict the +effectiveness of immunotherapies such as CAR T are of increasing interest due +to their ability to reduce the number of experiments performed and to guide the +theoretical development of new therapeutic strategies. Following this +rationale, we propose the use of control-oriented models to guide the +augmentation of CAR T therapy with synthetic gene circuitry. Here we present an +initial investigation where we adapt a previously developed CAR T model for +control-oriented purposes. We then explore the impact of realistic alternative +activation methods as control inputs to ensure effective tumor clearance. + +
+
+
+
+
+ + ☆ Stochastic MPC for Finite Gaussian Mixture Disturbances with Guarantees + + +
+ This paper presents a stochastic model predictive control (SMPC) algorithm +for linear systems subject to additive Gaussian mixture disturbances, with the +goal of satisfying chance constraints. To synthesize a control strategy, the +stochastic control problem is reformulated into an MPC problem. The +reformulation begins by decoupling the mixture distribution and decomposing the +system dynamics. Using stochastic simulation relations, we then redefine the +stochastic control problem onto the resultant abstract system. Next, constraint +tightening forms an MPC problem subject to finite disturbances. A branching +control is introduced to solve the MPC problem. Finally, a controller +refinement procedure determines a valid control strategy. Our contribution is +an extension of the SMPC literature to accommodate Gaussian mixture +disturbances while retaining recursive feasibility and closed-loop guarantees. +We illustrate the retention of guarantees with a case study of vehicle control +on an ill-maintained road. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Efficient Creation of Behavior Models with Variable Modeling Depths Used + in Digital Twins + + +
+ Behavior models form an integral component of Digital Twins. The specific +characteristics of these models may vary depending on the use case. One of +these key characteristics is the modeling depth. Behavior models with a lower +modeling depth depict the behavior of the asset in an abstract way, while those +with a higher modeling depth depict the behavior in detail. Even if very +detailed behavior models are flexible and realistic, they also require a lot of +resources such as computing power, simulation time and memory requirements. In +some applications, however, only limited resources are available. The automated +creation of Digital Twins is of crucial importance for their widespread use. +Although there are methods for the automated creation of behavior models for +Digital Twins with a specific modeling depth, there is currently no method for +the automated creation of behavior models with varying modeling depths. This +article presents such an approach and demonstrates its advantages using two +industrial use cases. It is demonstrated that the automatically created +behavior models of lower modeling depth yield results that are almost identical +to those of models with a higher modeling depth, but with significantly reduced +computing time and required memory. This enables the efficient use of behavior +models in a variety of use cases, regardless of the availability of resources. + +
+
+
+
+
+ + ☆ Iterative Learning Control with Mismatch Compensation for Residual + Vibration Suppression in Delta Robots + + +
+ Unwanted vibrations stemming from the energy-optimized design of Delta robots +pose a challenge in their operation, especially with respect to precise +reference tracking. To improve tracking accuracy, this paper proposes an +adaptive mismatch-compensated iterative learning controller based on input +shaping techniques. We establish a dynamic model considering the +electromechanical rigid-flexible coupling of the Delta robot, which integrates +the permanent magnet synchronous motor. Using this model, we design an +optimization-based input shaper, considering the natural frequency of the +robot, which varies with the configuration. We propose an iterative learning +controller for the Delta robot to improve tracking accuracy. Our iterative +learning controller incorporates model mismatch, where the mismatch is approximated +by a fuzzy logic structure. The convergence property of the proposed controller +is proved using a Barrier Composite Energy Function, providing a guarantee that +the tracking errors along the iteration axis converge to zero. Moreover, +adaptive parameter update laws are designed to ensure convergence. Finally, we +perform a series of high-fidelity simulations of the Delta robot using Simscape +to demonstrate the effectiveness of the proposed control strategy. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Robust Adaptive Safe Robotic Grasping with Tactile Sensing + + +
+ Robotic grasping requires safe force interaction to prevent a grasped object +from being damaged or slipping out of the hand. In this vein, this paper +proposes an integrated framework for grasping with formal safety guarantees +based on Control Barrier Functions. We first design contact force and force +closure constraints, which are enforced by a safety filter to accomplish safe +grasping with finger force control. For sensory feedback, we develop a +technique to estimate contact point, force, and torque from tactile sensors at +each finger. We verify the framework with various safety filters in a numerical +simulation under a two-finger grasping scenario. We then experimentally +validate the framework by grasping multiple objects, including fragile lab +glassware, in a real robotic setup, showing that safe grasping can be +successfully achieved in the real world. We evaluate the performance of each +safety filter in the context of safety violation and conservatism, and find +that disturbance observer-based control barrier functions provide superior +performance for safety guarantees with minimum conservatism. The demonstration +video is available at https://youtu.be/Cuj47mkXRdg. + +
+
+
+
+
+ + ☆ Singularity-Avoidance Control of Robotic Systems with Model Mismatch and + Actuator Constraints + + +
+ Singularities, manifesting as special configuration states, deteriorate robot +performance and may even lead to a loss of control over the system. This paper +addresses the kinematic singularity concerns in robotic systems with model +mismatch and actuator constraints through control barrier functions (CBFs). We +propose a learning-based control strategy to prevent robots from entering +singularity regions. More precisely, we leverage Gaussian process (GP) +regression to learn the unknown model mismatch, where the prediction error is +restricted by a deterministic bound. Moreover, we offer the criteria for +parameter selection to ensure the feasibility of CBFs subject to actuator +constraints. The proposed approach is validated by high-fidelity simulations on +a 2 degrees-of-freedom (DoFs) planar robot. + +
+
+ comment: This work has been submitted to ECC 2025 for possible publication +
+
+
+
+
+ + ☆ Optimizing Traffic Signal Control using High-Dimensional State + Representation and Efficient Deep Reinforcement Learning + + +
+ In reinforcement learning-based (RL-based) traffic signal control (TSC), +decisions on the signal timing are made based on the available information on +vehicles at a road intersection. This forms the state representation for the RL +environment which can either be high-dimensional containing several variables +or a low-dimensional vector. Current studies suggest that using high-dimensional +state representations does not lead to improved performance on TSC. +However, we argue, with experimental results, that the use of high-dimensional +state representations can, in fact, lead to improved TSC performance with +improvements up to 17.9% of the average waiting time. This high-dimensional +representation is obtainable using the cost-effective vehicle-to-infrastructure +(V2I) communication, encouraging its adoption for TSC. Additionally, given the +large size of the state, we identified the need to have computationally efficient +models and explored model compression via pruning. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Data-Driven Graph Switching for Cyber-Resilient Control in Microgrids + + +
+ Distributed microgrids are conventionally dependent on communication networks +to achieve secondary control objectives. This dependence makes them vulnerable +to stealth data integrity attacks (DIAs) where adversaries may perform +manipulations via infected transmitters and repeaters to jeopardize stability. +This paper presents a physics-guided, supervised Artificial Neural Network +(ANN)-based framework that identifies communication-level cyberattacks in +microgrids by analyzing whether incoming measurements will cause abnormal +behavior of the secondary control layer. If abnormalities are detected, an +iteration through possible spanning tree graph topologies that can be used to +fulfill secondary control objectives is done. Then, a communication network +topology that would not create secondary control abnormalities is identified +and enforced for maximum stability. By altering the communication graph +topology, the framework eliminates the dependence of the secondary control +layer on inputs from compromised cyber devices helping it achieve resilience +without instability. Several case studies are provided showcasing the +robustness of the framework against False Data Injections and repeater-level +Man-in-the-Middle attacks. To understand practical feasibility, robustness is +also verified against larger microgrid sizes and in the presence of varying +noise levels. Our findings indicate that performance can be affected when +attempting scalability in the presence of noise. However, the framework +operates robustly in low-noise settings. + +
+
+ comment: Accepted in IEEE Design Methodologies Conference (DMC) 2024 +
+
+
+
+
+ + ☆ Spike Talk in Power Electronic Grids -- Leveraging Post Moore's + Computing Laws + + +
+ Emerging distributed generation demands highly reliable and resilient +coordinating control in microgrids. To improve on these aspects, spiking neural +network is leveraged, as a grid-edge intelligence tool to establish a talkative +infrastructure, Spike Talk, expediting coordination in next-generation +microgrids without the need of communication at all. This paper unravels the +physics behind Spike Talk from the perspective of its distributed +infrastructure, which aims to address the Von Neumann Bottleneck. Relying on +inferring information via power flows in tie lines, Spike Talk allows adaptive +and flexible control and coordination itself, and features in synaptic +plasticity facilitating online and local training functionality. Preliminary +case studies are demonstrated with results, while more extensive validations +are to be included as future scopes of work. + +
+
+ comment: The manuscript has been accepted for publication in the Proceedings + of 2024 IEEE Design Methodologies for Power Electronics Conference (DMC2024) +
+
+
+
+
+ + ☆ Safety Filter Design for Articulated Frame Steering Vehicles In the + Presence of Actuator Dynamics Using High-Order Control Barrier Functions + + +
+ Articulated Frame Steering (AFS) vehicles are widely used in heavy-duty +industries, where they often operate near operators and laborers. Therefore, +designing safe controllers for AFS vehicles is essential. In this paper, we +develop a Quadratic Program (QP)-based safety filter that ensures feasibility +for AFS vehicles with affine actuator dynamics. To achieve this, we first +derive the general equations of motion for AFS vehicles, incorporating affine +actuator dynamics. We then introduce a novel High-Order Control Barrier +Function (HOCBF) candidate with equal relative degrees for both system +controls. Finally, we design a Parametric Adaptive HOCBF (PACBF) and an +always-feasible, QP-based safety filter. Numerical simulations of AFS vehicle +kinematics demonstrate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ Reducing Conservativeness of Controlled-Invariant Safe Sets by + Introducing a Novel Synthesis of Control Barrier Certificates + + +
+ Finding a controlled-invariant safe set for a given system with state and +control constraints plays an important role in safety-critical systems. Current +methods typically produce conservative solutions. In this paper, we introduce a +method to generate controlled-invariant safe sets for nonlinear polynomial +control-affine dynamical systems by using the notion of Control Barrier +Certificates (CBCs). To this end, we relax CBC conditions into Sum of Squares +(SOS) constraints, to be solved by an SOS program. We first assume a +controlled-invariant safe set (although small) exists for the system. We then +propose a method to iteratively enlarge the safe set. We theoretically prove +that our method enlarges the safe set in each iteration. We also demonstrate +the efficacy of our method through simulated numerical examples in 2D and 3D +for single and multi-input dynamical systems and empirically show that our +method produces a larger controlled-invariant safe set in these examples, +compared to a state-of-the-art technique using Control Barrier Function (CBF). + +
+
+
+
+
+ + ☆ Node Reliability: Approximation, Upper Bounds, and Applications to + Network Robustness + + +
+ This paper discusses the reliability of a graph in which the links are +perfectly reliable but the nodes may fail with certain probability p. +Calculating graph node reliability is an NP-Hard problem. We introduce an +efficient and accurate Monte Carlo method and a stochastic approximation for +the node reliability polynomial based solely on the degree distribution. We +provide the formulas for the node reliability polynomial of both Erdos-Renyi +graphs and Random Geometric graphs. The phase transition in the node +reliability of Erdos-Renyi graphs is also discussed. Additionally, we propose +two increasingly accurate upper bounds for the node reliability polynomial +solely based on the graph's degree distributions. The advantages and +disadvantages of these two upper bounds are thoroughly compared. Beyond the +computation of node reliability polynomials, we also estimate the number of cut +sets and present a solution to the reliability-based network enhancement +problem. + +
+
+
+
+
+ + ☆ $\mathscr{H}_2$ Model Reduction for Linear Quantum Systems + + +
+ In this paper, an $\mathscr{H}_2$ norm-based model reduction method for +linear quantum systems is presented, which can obtain a physically realizable +model with a reduced order for closely approximating the original system. The +model reduction problem is described as an optimization problem, whose +objective is taken as an $\mathscr{H}_2$ norm of the difference between the +transfer function of the original system and that of the reduced one. Different +from classical model reduction problems, physical realizability conditions for +guaranteeing that the reduced-order system is also a quantum system should be +taken as nonlinear constraints in the optimization. To solve the optimization +problem with such nonlinear constraints, we employ a matrix inequality approach +to transform nonlinear inequality constraints into readily solvable linear +matrix inequalities (LMIs) and nonlinear equality constraints, so that the +optimization problem can be solved by a lifting variables approach. We +emphasize that different from existing work, which only introduces a criterion +to evaluate the performance after model reduction, we guide our method to +obtain an optimal reduced model with respect to the $\mathscr{H}_2$ norm. In +addition, the above approach for model reduction is extended to passive linear +quantum systems. Finally, examples of active and passive linear quantum systems +validate the efficacy of the proposed method. + +
+
+ comment: 13 pages,3 figures +
+
+
+
+
+ + ☆ Modelling and Control of Subsonic Missile for Air-to-Air Interception + + +
+ Subsonic missiles play an important role in modern air-to-air combat +scenarios - utilized by the F-35 Lightning II - but require complex Guidance, +Navigation and Control systems to manoeuvre with 30G's of acceleration to +intercept successfully. Challenges with mathematically modelling and +controlling such a dynamic system must be addressed, high frequency noise +rejected, and actuator delay compensated for. This paper aims to investigate +the control systems necessary for interception. It also proposes a subsonic +design utilizing literature and prior research, suggests aerodynamic +derivatives, and analyses a designed 2D reduced pitch autopilot control system +response against performances. The pitch autopilot model contains an optimized +PID controller, 2nd order actuator, lead compensator and Kalman Filter, that +rejects time varying disturbances and high frequency noise expected during +flight. Simulation results confirm the effectiveness of the proposed method +through reduction in rise time (21%), settle time (10%), and highlighted its +high frequency deficiency with respect to the compensator integration. The +actuator delay of 100ms has been negated by the augmented compensator autopilot +controller so that it exceeds system performance requirements (1) & (3). +However, (2) is not satisfied as 370% overshoot exists. This research confirms +the importance of a lead compensator in missile GNC systems and furthers +control design application through a specific configuration. Future research +should build upon methods and models presented to construct and test an +interception scenario. + +
+
+
+
+
+ + ☆ Robust control for uncertain air-to-air missile systems + + +
+ Air-to-air missiles are used on many modern military combat aircraft for +self-defence. It is imperative for the pilots using the weapons that the +missiles hit their target the first time. The important goals for a missile control +system to achieve are minimising the time constant, overshoot, and settling +time of the missile dynamics. The combination of high angles of attack, +time-varying mass, thrust, and centre of gravity, actuator delay, and signal +noise create a highly non-linear dynamic system with many uncertainties that is +extremely challenging to control. A robust control system based on saturated +sliding mode control is proposed to overcome the time-varying parameters and +non-linearities. A lag compensator is designed to overcome actuator delay. A +second-order filter is selected to reduce high-frequency measurement noise. +When combined, the proposed solutions can make the system stable despite the +existence of changing mass, centre of gravity, thrust, and sensor noise. The +system was evaluated for desired pitch angles of 0° to 90°. The time +constant for the system stayed below 0.27s for all conditions, with +satisfactory performance for both settling time and overshoot. + +
+
+
+
+
+ + ☆ Longitudinal dynamic modelling and control for a quad-tilt rotor UAV + + +
+ Tilt rotor aircraft combine the benefits of both helicopters and fixed wing +aircraft, which makes them popular for a variety of applications, including +Search and Rescue and VVIP transport. However, due to the multiple flight +modes, significant challenges with regards to the control system design are +experienced. The main challenge with VTOL aircraft comes during the dynamic +phase (mode transition), where the aircraft transitions from a hover state to +full forwards flight. In this transition phase the aerodynamic lift and torque +generated by the wing/control surfaces increases and as such, the rotor thrust, +and the tilt rate must be carefully considered, such that the height and +attitude remain invariant during the mode transition. In this paper, a digital +PID controller with the applicable digital filter and data hold functions is +designed so that a successful mode transition between hover and forwards flight +can be ascertained. Finally, the presented control system for the tilt-rotor +UAV is demonstrated through simulations by using the MATLAB software suite. The +performance obtained from the simulations confirm the success of the +implemented methods, with full stability in all three degrees of freedom being +demonstrated. + +
+
+
+
+
+ + ☆ Robotic Control Optimization Through Kernel Selection in Safe Bayesian + Optimization + + +
+ Control system optimization has long been a fundamental challenge in +robotics. While recent advancements have led to the development of control +algorithms that leverage learning-based approaches, such as SafeOpt, to +optimize single feedback controllers, scaling these methods to high-dimensional +complex systems with multiple controllers remains an open problem. In this +paper, we propose a novel learning-based control optimization method, which +enhances the additive Gaussian process-based Safe Bayesian Optimization +algorithm to efficiently tackle high-dimensional problems through kernel +selection. We use PID controller optimization in drones as a representative +example and test the method on Safe Control Gym, a benchmark designed for +evaluating safe control techniques. We show that the proposed method provides a +more efficient and optimal solution for high-dimensional control optimization +problems, demonstrating significant improvements over existing techniques. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Robotics and + Biomimetics (ROBIO) +
+
+
+
+
+ + ☆ Constructive RNNs: An Error-Recurrence Perspective on Time-Variant Zero + Finding Problem Solving Under Uncertainty + + +
+ When facing time-variant problems in analog computing, the desirable RNN +design requires finite-time convergence and robustness with respect to various +types of uncertainties, due to the time-variant nature and difficulties in +implementation. It is very worthwhile to explore terminal zeroing neural +networks, through examining and applying available attracting laws. In this +paper, from a control-theoretic point of view, an error recurrence system +approach is presented by equipping with uncertainty compensation in the +pre-specified error dynamics, capable of enhancing robustness properly. Novel +rectifying actions are designed to make finite-time settling so that the +convergence speed and the computing accuracy of time-variant computing can be +improved. Double-power and power-exponential rectifying actions are +respectively formed to construct specific models, while the particular +expressions of settling time function for the former are presented, and for the +latter the proximate settling-time estimations are given, with which the +fixed-time convergence of the corresponding models is in turn established. +Moreover, the uncertainty compensation by the signum/smoothing-signum +techniques is adopted for finite-duration stabilization. Theoretical results +are presented to demonstrate effectiveness (involving fixed-time convergence +and robustness) of the proposed computing schemes for the time-variant QP +problem solving. + +
+
+
+
+
+ + ☆ Lateral String Stability in Autonomous & Connected Vehicle Platoons + + +
+ This paper addresses the lateral control of Autonomous and Connected Vehicles +(ACVs) in a platoon executing an Emergency Lane Change (ELC) maneuver. These +maneuvers are typically triggered by emergency signals from the front or rear +of the platoon in response to the need to avoid obstacles or allow other +vehicles to pass. The study assumes that ACVs maintain reliable connectivity, +enabling each following vehicle to access GPS position traces of both the lead +and immediately preceding vehicles in the platoon. We demonstrate that lateral +string stability in the ACV platoon can be achieved using communicated +information solely from the lead and preceding vehicles. Additionally, we +present a lateral control framework for ACVs, which helps track a discretized +preview of the trajectory constructed from the communicated data. This +framework involves constructing two distinct trajectories based on the preview +data from the lead and preceding vehicles, calculating the associated errors +and lateral control actions for each, and then integrating these to generate a +steering command. Numerical results validate the effectiveness of the proposed +lateral control scheme. + +
+
+ comment: 18th IEEE International Conference on Vehicular Electronics and + Safety 2024 (ICVES) +
+
+
+
+
+ + ☆ Convergence Guarantees for Differentiable Optimization-based Control + Policy + + +
+ Effective control of real-world systems necessitates the development of +controllers that are not only performant but also interpretable. To this end, +the field has seen a surge in model-based control policies, which first +leverage historical data to learn system cost and dynamics, and then utilize +the learned models for control. However, due to this decoupling, model-based +control policies fall short when deployed in optimal control settings and lack +convergence guarantees for achieving optimality. In this paper, we present +DiffOP, a Differentiable Optimization-based Policy for optimal control. In the +proposed framework, control actions are derived by solving an optimization, +where the control cost and system's dynamics can be parameterized as neural +networks. The key idea of DiffOP, inspired by differentiable optimization +techniques, is to jointly learn the control policy using both policy gradients +and optimization gradients, while utilizing actual cost feedback during system +interaction. Further, this study presents the first theoretical analysis of the +convergence rates and sample complexity for learning the optimization control +policy with a policy gradient approach. + +
+
+
+
+
+ + ☆ Two-Layer Attention Optimization for Bimanual Coordination + + +
+ Bimanual tasks performed by human agents present unique optimal control +considerations compared to cyberphysical agents. These considerations include +minimizing attention, distributing attention across two isolated hands, and +coordinating the two hands to reach a broader goal. In this work, we propose a +two-layer controller that captures these considerations. The upper layer solves +an attention distribution problem, while the two lower layer controllers (one +per hand) track a trajectory using the solution given by the upper layer. We +introduce a formulation of the attention controller where attention is a vector +that is bound within a hyperbolic feasible region, which is determined by +specifications of the task the lower layer controllers. This two-layer +controller is used to optimize a single-player game of pong, where the agent +must rally the ball between two paddles for as long as possible. We find that +adding an attention layer on top of the lower controllers allows the agent to +coordinate the left and right hands, which minimizes attention and control +effort over the course of the rallying task. + +
+
+ comment: American Controls Conference (under review) +
+
+
+
+
+ + ☆ Research on fault diagnosis of nuclear power first-second circuit based + on hierarchical multi-granularity classification network + + +
+ The safe and reliable operation of complex electromechanical systems in +nuclear power plants is crucial for the safe production of nuclear power plants +and their nuclear power units. Therefore, accurate and timely fault diagnosis of +nuclear power systems is of great significance for ensuring the safe and +reliable operation of nuclear power plants. The existing fault diagnosis +methods mainly target a single device or subsystem, making it difficult to +analyze the inherent connections and mutual effects between different types of +faults at the entire unit level. This article uses the AP1000 full-scale +simulator to simulate the important mechanical component failures of some key +systems in the primary and secondary circuits of nuclear power units, and +constructs a fault dataset. Meanwhile, a hierarchical multi-granularity +classification fault diagnosis model based on the EfficientNet large model is +proposed, aiming to achieve hierarchical classification of nuclear power +faults. The results indicate that the proposed fault diagnosis model can +effectively classify faults in different circuits and system components of +nuclear power units into hierarchical categories. However, the fault dataset in +this study was obtained from a simulator, which may introduce additional +information due to parameter redundancy, thereby affecting the diagnostic +performance of the model. + +
+
+
+
+
+ + ☆ Input-Based Ensemble-Learning Method for Dynamic Memory Configuration of + Serverless Computing Functions + + +
+ In today's Function-as-a-Service offerings, a programmer is usually +responsible for configuring function memory for its successful execution, which +allocates proportional function resources such as CPU and network. However, +right-sizing the function memory forces developers to speculate performance and +make ad-hoc configuration decisions. Recent research has highlighted that a +function's input characteristics, such as input size, type and number of +inputs, significantly impact its resource demand, run-time performance and +costs with fluctuating workloads. This correlation further makes memory +configuration a non-trivial task. On that account, an input-aware function +memory allocator not only improves developer productivity by completely hiding +resource-related decisions but also drives an opportunity to reduce resource +wastage and offer a finer-grained cost-optimised pricing scheme. Therefore, we +present MemFigLess, a serverless solution that estimates the memory requirement +of a serverless function with input-awareness. The framework executes function +profiling in an offline stage and trains a multi-output Random Forest +Regression model on the collected metrics to invoke input-aware optimal +configurations. We evaluate our work with the state-of-the-art approaches on +AWS Lambda service to find that MemFigLess is able to capture the input-aware +resource relationships and allocate up to 82% less resources and save up to 87% +run-time costs. + +
+
+ comment: 10 pages, 2 tables, 28 figures, accepted conference paper - UCC'24 +
+
+
+
+
+ + ☆ System-Level Analysis for mm-Wave Full-Duplex Transceivers + + +
+ This paper conducts a comprehensive system-level analysis of mm-Wave +full-duplex transceivers, focusing on a receiver employing a four-stage +self-interference cancellation (SIC) process. The analysis aims to optimize the +noise and linearity performance requirements of each transceiver block, +ensuring that the self-interference (SI) signal does not compromise the +receiver's error vector magnitude (EVM) for an OFDM 64-QAM signal. +Additionally, the necessary SIC for each stage is calculated to establish +feasible noise and linearity specifications for a CMOS-based implementation. +The resulting specifications are subsequently validated within a MATLAB +Simulink environment, confirming the accuracy of the computed requirements for +each block. + +
+
+
+
+
+ + ☆ Collision-Free Multi-Agent Coverage Control for Non-Cooperating Swarms: + Preliminary Results + + +
+ The main contribution of this paper is a methodology for multiple +non-cooperating swarms of unmanned aerial vehicles to independently cover a +common area. In contrast to previous research on coverage control involving +more than one swarm, this paper does not assume cooperation between distinct +groups but considers them as entirely independent units following their own +objectives. Using Voronoi tessellation, collision-free motion of agents within +the same swarm has been proved before. However, as is shown in Example 1 of +this paper, in the case of multiple swarms with intra-swarm but without +inter-swarm collaboration, these guarantees do not hold. We address this issue +by proposing an algorithm to achieve maximum coverage with multiple swarms +while avoiding collisions between agents. Thus, the Optimal Reciprocal +Collision Avoidance method used for safe navigation in multi-agent scenarios is +adapted to suit the needs of Voronoi-based coverage control with more than one +swarm. The functionality of the proposed technique is validated through Monte +Carlo simulations. + +
+
+
+
+
+ + ☆ Shaping Frequency Dynamics in Modern Power Systems with Grid-forming + Converters + + +
+ In this paper, frequency dynamics in modern power systems with a high +penetration of converter-based generation is analysed. A fundamental analysis +of the frequency dynamics is performed to identify the limitations and +challenges when the converter penetration is increased. The voltage-source +behaviour is found as an essential characteristic of converters to improve the +initial frequency derivative of Synchronous Generators (SGs). A detailed +small-signal analysis, based on the system's eigenvalues, participation factors +and mode shapes, is then performed in a reduced system for different converter +penetrations, showing that the flexibility of grid-forming (GFOR) converters as +well as the system's inertia reduction may lead to have a more controllable +system frequency. First-order frequency responses can be programmed for high +converter penetrations, when GFOR operation can impose their dominance over +SGs. These results have been validated in the IEEE 118-bus system simulated in +PSCAD. + +
+
+ comment: 11 pages, 17 figures +
+
+
+
+
+ + ☆ Optimal Constant Climb Airspeed with Variable Cost Index for + All-electric Aircraft + + +
+ This paper presents for the first time an approach to minimize direct +operational costs (DOC) for all-electric aircraft during the climb phase, +introducing a time-varying cost index (CI). The CI is modeled as a dynamic +parameter commanded by Air Traffic Control (ATC), allowing the aircraft to +maintain a constant airspeed throughout the climb, while respecting the air +traffic regulations. This paper also explores the implications of a +time-varying CI on the determination of optimal airspeed and climbing time for +all-electric aircraft. Additionally, it provides the necessary equations to +calculate both the optimal climb airspeed and climb duration. The proposed +methodology has been validated through a simulated scenario that reflects +actual operational procedures. As a result, optimal values for climb airspeed, +climbing time, and energy consumption have been established, paving the way for +future applications of this methodology to advanced air mobility all-electric +vehicles. + +
+
+ comment: 6 pages, 4 figures. arXiv admin note: text overlap with + arXiv:2410.01045 +
+
+
+
+
+ + ☆ Visual Tracking with Intermittent Visibility: Switched Control Design + and Implementation + + +
+ This paper addresses the problem of visual target tracking in scenarios where +a pursuer may experience intermittent loss of visibility of the target. The +design of a Switched Visual Tracker (SVT) is presented which aims to meet the +competing requirements of maintaining both proximity and visibility. SVT +alternates between a visual tracking mode for following the target, and a +recovery mode for regaining visual contact when the target falls out of sight. +We establish the stability of SVT by extending the average dwell time theorem +from switched systems theory, which may be of independent interest. Our +implementation of SVT on an Agilicious drone [1] illustrates its effectiveness +on tracking various target trajectories: it reduces the average tracking error +by up to 45% and significantly improves visibility duration compared to a +baseline algorithm. The results show that our approach effectively handles +intermittent vision loss, offering enhanced robustness and adaptability for +real-world autonomous missions. Additionally, we demonstrate how the stability +analysis provides valuable guidance for selecting parameters, such as tracking +speed and recovery distance, to optimize the SVT's performance. + +
+
+
+
+
+ + ☆ Space-Air-Ground Integrated MEC-Assisted Industrial Cyber-Physical + Systems: An Online Decentralized Optimization Approach + + +
+ Cloud computing and edge/fog computing are playing a pivotal role in driving +the transformation of industrial cyber-physical systems (ICPS) towards greater +intelligence and automation by providing high-quality computation offloading +services to Internet of Things devices (IoTDs). Recently, space-air-ground +integrated multi-access edge computing (SAGIMEC) is emerging as a promising +architecture combining edge computing and cloud computing, which has the +potential to be integrated with ICPS to accelerate the realization of the above +vision. In this work, we first present an SAGIMEC-assisted ICPS architecture +that incorporates edge computing and cloud computing through seamless +connectivity supported by satellite networks to achieve determinism in +connectivity, networked computing, and intelligent networked control. Then, we +formulate a joint satellite selection, computation offloading, communication +resource allocation, computation resource allocation, and UAV trajectory +control optimization problem (JSC4OP) to maximize the quality of service (QoS) +of IoTDs. This problem considers both the dynamics and uncertainties of the +system environment, as well as the limited resources and energy of UAVs. Given +the complexity of JSC4OP, we propose an online decentralized optimization +approach (ODOA) to solve the problem. Specifically, JSC4OP is first transformed +into a real-time decision-making optimization problem (RDOP) by leveraging +Lyapunov optimization. Then, to solve the RDOP, we introduce an online +learning-based latency prediction method to predict the uncertain system +environment and a game theoretic decision-making method to make real-time +decisions. Finally, theoretical analysis confirms the effectiveness of the +ODOA, while the simulation results demonstrate that the proposed ODOA +outperforms other alternative approaches in terms of overall system +performance. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.11918 +
+
+
+
+
+ + ♻ ☆ Foundation Models for the Electric Power Grid + + +
+ Foundation models (FMs) currently dominate news headlines. They employ +advanced deep learning architectures to extract structural information +autonomously from vast datasets through self-supervision. The resulting rich +representations of complex systems and dynamics can be applied to many +downstream applications. Therefore, FMs can find uses in electric power grids, +challenged by the energy transition and climate change. In this paper, we call +for the development of, and state why we believe in, the potential of FMs for +electric grids. We highlight their strengths and weaknesses amidst the +challenges of a changing grid. We argue that an FM learning from diverse grid +data and topologies could unlock transformative capabilities, pioneering a new +approach in leveraging AI to redefine how we manage complexity and uncertainty +in the electric grid. Finally, we discuss a power grid FM concept, namely +GridFM, based on graph neural networks and show how different downstream tasks +benefit. + +
+
+ comment: Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V., + J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J., + K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal + contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H +
+
+
+
+
+ + ♻ ☆ Goal-oriented Semantic Communications for Robotic Waypoint Transmission: + The Value and Age of Information Approach + + +
+ The ultra-reliable and low-latency communication (URLLC) service of the +fifth-generation (5G) mobile communication network struggles to support safe +robot operation. Nowadays, the sixth-generation (6G) mobile communication +network is proposed to provide hyper-reliable and low-latency communication to +enable safer control for robots. However, current 5G/6G research mainly +focused on improving communication performance, while the robotics community +mostly assumed communication to be ideal. To jointly consider communication and +robotic control with a focus on the specific robotic task, we propose +goal-oriented semantic communication in robotic control (GSRC) to exploit the +context of data and its importance in achieving the task at both transmitter +and receiver. At the transmitter, we propose a deep reinforcement learning +algorithm to generate optimal control and command (C&C) data and a proactive +repetition scheme (DeepPro) to increase the successful transmission +probability. At the receiver, we design the value of information (VoI) and age +of information (AoI) based queue ordering mechanism (VA-QOM) to rank the queue +based on the semantic information extracted from AoI and VoI. The simulation +results validate that our proposed GSRC framework achieves a 91.5% improvement +in the mean square error compared to the traditional unmanned aerial vehicle +control framework. + +
+
+ comment: The paper has been accepted in IEEE TWC +
+
+
+
+
+ + ♻ ☆ Small Noise Analysis of Non-Parametric Closed-Loop Identification + + +
+ We revisit the problem of non-parametric closed-loop identification in +frequency domain; we give a brief survey of the literature and provide a small +noise analysis of the direct, indirect, and joint input-output methods when two +independent experiments with identical excitation are used. The analysis is +asymptotic in the noise variance (i.e., as the standard deviation of the +innovations $\sigma \to 0$), for a finite data record of length $N$. We +highlight the relationship between the estimators' accuracy and the loop shape +via asymptotic variance expressions given in terms of the sensitivity function. +The results are illustrated using a numerical simulation example. + +
+
+
+
+
+ + ♻ ☆ ElectricityEmissions.jl: A Framework for the Comparison of Carbon + Intensity Signals + + +
+ An increasing number of individuals, companies and organizations are +interested in computing and minimizing the carbon emissions associated with +their real-time electricity consumption. To achieve this, they require a carbon +signal, i.e. a metric that defines the real-time carbon intensity of their +electricity supply. Unfortunately, in a grid with multiple generation sources +and multiple consumers, there is no unambiguous way to trace electricity from +source to sink. This makes it hard to define an appropriate signal, leading to +a raging discussion about how to best quantify the carbon footprint of +electricity. + This paper seeks to inform the discussion about which carbon signal is better +or more suitable for two important use cases, namely carbon-informed load +shifting and carbon accounting. We do this by developing a new software package +ElectricityEmissions$.$jl, that computes several established and newly proposed +carbon emission metrics for standard electric grid test cases. We also +demonstrate how the package can be used to investigate the effects of using +these metrics to guide load shifting. Our results affirm previous research, +which showed that the choice of carbon emission metric has significant impact +on shifting results and associated carbon emission reductions. In addition, we +demonstrate the impact of load shifting on both the consumers that perform the +shifting and consumers that do not. Disconcertingly, we observe that shifting +according to common metrics such as average carbon emissions can reduce the +amount of emissions allocated to data center, but cause an increase in the +total emissions of the system. + +
+
+
+
+
+ + ♻ ☆ DistRL: An Asynchronous Distributed Reinforcement Learning Framework for + On-Device Control Agents + + +
+ On-device control agents, especially on mobile devices, are responsible for +operating mobile devices to fulfill users' requests, enabling seamless and +intuitive interactions. Integrating Multimodal Large Language Models (MLLMs) +into these agents enhances their ability to understand and execute complex +commands, thereby improving user experience. However, fine-tuning MLLMs for +on-device control presents significant challenges due to limited data +availability and inefficient online training processes. This paper introduces +DistRL, a novel framework designed to enhance the efficiency of online RL +fine-tuning for mobile device control agents. DistRL employs centralized +training and decentralized data acquisition to ensure efficient fine-tuning in +the context of dynamic online interactions. Additionally, the framework is +backed by our tailor-made RL algorithm, which effectively balances exploration +with the prioritized utilization of collected data to ensure stable and robust +training. Our experiments show that, on average, DistRL delivers a 3X +improvement in training efficiency and enables training data collection 2.4X +faster than the leading synchronous multi-machine methods. Notably, after +training, DistRL achieves a 20% relative improvement in success rate compared +to state-of-the-art methods on general Android tasks from an open benchmark, +significantly outperforming existing approaches while maintaining the same +training time. These results validate DistRL as a scalable and efficient +solution, offering substantial improvements in both training efficiency and +agent performance for real-world, in-the-wild device control tasks. + +
+
+ comment: Paper and Appendix, 25 pages +
+
+
+
+
+ + ♻ ☆ Distributionally Robust Model Predictive Control: Closed-loop Guarantees + and Scalable Algorithms + + +
+ We establish a collection of closed-loop guarantees and propose a scalable +optimization algorithm for distributionally robust model predictive control +(DRMPC) applied to linear systems, convex constraints, and quadratic costs. Via +standard assumptions for the terminal cost and constraint, we establish +distributionally robust long-term and stage-wise performance guarantees for the +closed-loop system. We further demonstrate that a common choice of the terminal +cost, i.e., via the discrete-algebraic Riccati equation, renders the origin +input-to-state stable for the closed-loop system. This choice also ensures that +the exact long-term performance of the closed-loop system is independent of the +choice of ambiguity set for the DRMPC formulation. Thus, we establish +conditions under which DRMPC does not provide a long-term performance benefit +relative to stochastic MPC. To solve the DRMPC optimization problem, we propose +a Newton-type algorithm that empirically achieves superlinear convergence and +guarantees the feasibility of each iterate. We demonstrate the implications of +the closed-loop guarantees and the scalability of the proposed algorithm via +two examples. To facilitate the reproducibility of the results, we also provide +open-source code to implement the proposed algorithm and generate the figures. + +
+
+ comment: 36 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Timer-Based Coverage Control for Mobile Sensors + + +
+ This work investigates the coverage control problem over a static, compact, +and convex workspace and develops a hybrid extension of the continuous-time +Lloyd algorithm. Each agent in a multi-agent system (MAS) is equipped with a +timer mechanism that generates intermittent measurement and control update +events, which may occur asynchronously between agents. Between consecutive +event times, as determined by the corresponding timer mechanism, the controller +of each agent is held constant. These controllers are shown to drive the +configuration of the MAS into a neighborhood of the set of centroidal Voronoi +configurations, i.e., the minimizers of the standard locational cost. The +combination of continuous-time dynamics with intermittently updated control +inputs is modeled as a hybrid system. The coverage objective is posed as a set +attractivity problem for hybrid systems, where an invariance-based convergence +analysis yields sufficient conditions that ensure maximal solutions of the +hybrid system asymptotically converge to a desired set. A brief simulation +example is included to showcase the result. + +
+
+
+
+
+ + ♻ ☆ OCMDP: Observation-Constrained Markov Decision Process + + +
+ In many practical applications, decision-making processes must balance the +costs of acquiring information with the benefits it provides. Traditional +control systems often assume full observability, an unrealistic assumption when +observations are expensive. We tackle the challenge of simultaneously learning +observation and control strategies in such cost-sensitive environments by +introducing the Observation-Constrained Markov Decision Process (OCMDP), where +the policy influences the observability of the true state. To manage the +complexity arising from the combined observation and control actions, we +develop an iterative, model-free deep reinforcement learning algorithm that +separates the sensing and control components of the policy. This decomposition +enables efficient learning in the expanded action space by focusing on when and +what to observe, as well as determining optimal control actions, without +requiring knowledge of the environment's dynamics. We validate our approach on +a simulated diagnostic task and a realistic healthcare environment using +HeartPole. Given both scenarios, the experimental results demonstrate that our +model achieves a substantial reduction in observation costs on average, +significantly outperforming baseline methods by a notable margin in efficiency. + +
+
+ comment: Full paper, 14 Pages +
+
+
+
+
+ + ♻ ☆ Skipped Adjacency Pulse Width Modulation: Zero Voltage Switching over + Full Duty Cycle Range for Hybrid Flying Capacitor Multi-Level Converters + without Dynamic Level Changing + + +
+ This paper proposes a method to achieve zero voltage switching (ZVS) across +the full duty cycle range in hybrid flying capacitor multilevel (FCML) +converters, eliminating the need for dynamic level changing and active +re-balancing. Utilizing skipped adjacency pulse width modulation (SAPWM), this +approach avoids the nearest pole voltage level, thereby increasing volt-seconds +within specific duty cycle range. The method uses a modified PWM scheme, which +preserves effective pole voltage by changing duty reference and employing +digital logic processing. Simulation results verify the proposed method +achieving full-range ZVS. This SAPWM technique is compatible with hybrid FCML +converters with various levels, offering enhanced efficiency and reduced +switching losses. + +
+
+ comment: 7 pages, 13 figures, pre-print +
+
+
+
+
+ + ♻ ☆ Local Synchronization of Power System Devices + + +
+ This paper introduces a novel concept of local synchronization of power +systems devices based on the difference between the complex frequency of the +voltage and current injected at terminals. Formal definitions are provided to +account for bounded and asymptotic local synchronization. The definitions are +suitable for modern power systems as they remove classical assumptions limiting +the application of the concept of synchronization to synchronous machines and +omitting voltage dynamics. The paper also provides a systematic analytical +description of the synchronization mechanisms of common power system devices. +Finally, a variety of examples is included to illustrate the theoretical value +and practical application of the proposed definitions to power systems modeling +and stability analysis. + +
+
+
+
+
+
+
+
+ + Machine Learning 157 + +
+
+
+ + ☆ LLMPhy: Complex Physical Reasoning Using Large Language Models and World + Models + + +
+ Physical reasoning is an important skill needed for robotic agents when +operating in the real world. However, solving such reasoning problems often +involves hypothesizing and reflecting over complex multi-body interactions +under the effect of a multitude of physical forces and thus learning all such +interactions poses a significant hurdle for state-of-the-art machine learning +frameworks, including large language models (LLMs). To study this problem, we +propose a new physical reasoning task and a dataset, dubbed TraySim. Our task +involves predicting the dynamics of several objects on a tray that is given an +external impact -- the domino effect of the ensued object interactions and +their dynamics thus offering a challenging yet controlled setup, with the goal +of reasoning being to infer the stability of the objects after the impact. To +solve this complex physical reasoning task, we present LLMPhy, a zero-shot +black-box optimization framework that leverages the physics knowledge and +program synthesis abilities of LLMs, and synergizes these abilities with the +world models built into modern physics engines. Specifically, LLMPhy uses an +LLM to generate code to iteratively estimate the physical hyperparameters of +the system (friction, damping, layout, etc.) via an implicit +analysis-by-synthesis approach using a (non-differentiable) simulator in the +loop and uses the inferred parameters to imagine the dynamics of the scene +towards solving the reasoning task. To show the effectiveness of LLMPhy, we +present experiments on our TraySim dataset to predict the steady-state poses of +the objects. Our results show that the combination of the LLM and the physics +engine leads to state-of-the-art zero-shot physical reasoning performance, +while demonstrating superior convergence against standard black-box +optimization methods and better estimation of the physical parameters. + +
+
+
+
+
+ + ☆ Leonardo vindicated: Pythagorean trees for minimal reconstruction of the + natural branching structures + + +
+ Trees continue to fascinate with their natural beauty and as engineering +masterpieces optimal with respect to several independent criteria. The Pythagorean +tree is a well-known fractal design that realistically mimics the natural tree +branching structures. We study various types of Pythagorean-like fractal trees +with different shapes of the base, branching angles and relaxed scales in an +attempt to identify and explain which variants are the closest match to the +branching structures commonly observed in the natural world. Pursuing +simultaneously the realism and minimalism of the fractal tree model, we have +developed a flexibly parameterised and fast algorithm to grow and visually +examine deep Pythagorean-inspired fractal trees with the capability to orderly +over- or underestimate Leonardo da Vinci's tree branching rule as well as +control various imbalances and branching angles. We tested the realism of the +generated fractal tree images by means of the classification accuracy of +detecting natural trees with the transfer-trained deep Convolutional Neural +Networks (CNNs). Having empirically established the parameters of the fractal +trees that maximize the CNN's natural tree class classification accuracy we +have translated them back to the scales and angles of branches and came to the +interesting conclusions that support the da Vinci branching rule and golden +ratio based scaling for both the shape of the branch and imbalance between the +child branches, and claim the flexibly parameterized fractal trees can be used +to generate artificial examples to train robust detectors of different species +of trees. + +
+
+ comment: 22 pages, lots of hi res figures I had to reduce quality of, + submitting as a requirement to the Theory of Computing Journal +
+
+
+
+
+ + ☆ Language Models as Causal Effect Generators + + +
+ We present a framework for large language model (LLM) based data generation +with controllable causal structure. In particular, we define a procedure for +turning any language model and any directed acyclic graph (DAG) into a +sequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM +is a causal model with user-defined structure and LLM-defined structural +equations. We characterize how an SD-SCM allows sampling from observational, +interventional, and counterfactual distributions according to the desired +causal structure. We then leverage this procedure to propose a new type of +benchmark for causal inference methods, generating individual-level +counterfactual data without needing to manually specify functional +relationships between variables. We create an example benchmark consisting of +thousands of datasets, and test a suite of popular estimation methods on these +datasets for average, conditional average, and individual treatment effect +estimation, both with and without hidden confounding. Apart from generating +data, the same procedure also allows us to test for the presence of a causal +effect that might be encoded in an LLM. This procedure can underpin auditing +LLMs for misinformation, discrimination, or otherwise undesirable behavior. We +believe SD-SCMs can serve as a useful tool in any application that would +benefit from sequential data with controllable causal structure. + +
+
+
+
+
+ + ☆ Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model + with Compact Wavelet Encodings + + +
+ Large-scale 3D generative models require substantial computational resources +yet often fall short in capturing fine details and complex geometries at high +resolutions. We attribute this limitation to the inefficiency of current +representations, which lack the compactness required to model the generative +models effectively. To address this, we introduce a novel approach called +Wavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based, +compact latent encodings. Specifically, we compress a $256^3$ signed distance +field into a $12^3 \times 4$ latent grid, achieving an impressive 2427x +compression ratio with minimal loss of detail. This high level of compression +allows our method to efficiently train large-scale generative networks without +increasing the inference time. Our models, both conditional and unconditional, +contain approximately one billion parameters and successfully generate +high-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid +inference, producing shapes within two to four seconds depending on the +condition, despite the model's scale. We demonstrate state-of-the-art +performance across multiple datasets, with significant improvements in +generation quality, diversity, and computational efficiency. We open-source our +code and, to the best of our knowledge, release the largest pretrained 3D +generative models across different modalities. + +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Explainability Methods in Parkinson's + Detection from Speech + + +
+ Speech impairments in Parkinson's disease (PD) provide significant early +indicators for diagnosis. While models for speech-based PD detection have shown +strong performance, their interpretability remains underexplored. This study +systematically evaluates several explainability methods to identify PD-specific +speech features, aiming to support the development of accurate, interpretable +models for clinical decision-making in PD diagnosis and monitoring. Our +methodology involves (i) obtaining attributions and saliency maps using +mainstream interpretability techniques, (ii) quantitatively evaluating the +faithfulness of these maps and their combinations obtained via union and +intersection through a range of established metrics, and (iii) assessing the +information conveyed by the saliency maps for PD detection from an auxiliary +classifier. Our results reveal that, while explanations are aligned with the +classifier, they often fail to provide valuable information for domain experts. + +
+
+ comment: The first two authors contributed equally to this research: author + order is alphabetical +
+
+
+
+
+ + ☆ Derivational Morphology Reveals Analogical Generalization in Large + Language Models + + +
+ What mechanisms underlie linguistic generalization in large language models +(LLMs)? This question has attracted considerable attention, with most studies +analyzing the extent to which the language skills of LLMs resemble rules. As of +yet, it is not known whether linguistic generalization in LLMs could equally +well be explained as the result of analogical processes, which can be +formalized as similarity operations on stored exemplars. A key shortcoming of +prior research is its focus on linguistic phenomena with a high degree of +regularity, for which rule-based and analogical approaches make the same +predictions. Here, we instead examine derivational morphology, specifically +English adjective nominalization, which displays notable variability. We +introduce a new method for investigating linguistic generalization in LLMs: +focusing on GPT-J, we fit cognitive models that instantiate rule-based and +analogical learning to the LLM training data and compare their predictions on a +set of nonce adjectives with those of the LLM, allowing us to draw direct +conclusions regarding underlying mechanisms. As expected, rule-based and +analogical models explain the predictions of GPT-J equally well for adjectives +with regular nominalization patterns. However, for adjectives with variable +nominalization patterns, the analogical model provides a much better match. +Furthermore, GPT-J's behavior is sensitive to the individual word frequencies, +even for regular forms, a behavior that is consistent with an analogical +account of regular forms but not a rule-based one. These findings refute the +hypothesis that GPT-J's linguistic generalization on adjective nominalization +involves rules, suggesting similarity operations on stored exemplars as the +underlying mechanism. Overall, our study suggests that analogical processes +play a bigger role in the linguistic generalization of LLMs than previously +thought. + +
+
+
+
+
+ + ☆ Exact, Tractable Gauss-Newton Optimization in Deep Reversible + Architectures Reveal Poor Generalization NeurIPS 2024 + + +
+ Second-order optimization has been shown to accelerate the training of deep +neural networks in many applications, often yielding faster progress per +iteration on the training loss compared to first-order optimizers. However, the +generalization properties of second-order methods are still being debated. +Theoretical investigations have proved difficult to carry out outside the +tractable settings of heavily simplified model classes -- thus, the relevance +of existing theories to practical deep learning applications remains unclear. +Similarly, empirical studies in large-scale models and real datasets are +significantly confounded by the necessity to approximate second-order updates +in practice. It is often unclear whether the observed generalization behaviour +arises specifically from the second-order nature of the parameter updates, or +instead reflects the specific structured (e.g.\ Kronecker) approximations used +or any damping-based interpolation towards first-order updates. Here, we show +for the first time that exact Gauss-Newton (GN) updates take on a tractable +form in a class of deep reversible architectures that are sufficiently +expressive to be meaningfully applied to common benchmark datasets. We exploit +this novel setting to study the training and generalization properties of the +GN optimizer. We find that exact GN generalizes poorly. In the mini-batch +training setting, this manifests as rapidly saturating progress even on the +\emph{training} loss, with parameter updates found to overfit each +mini-batch without producing the features that would support generalization +to other mini-batches. We show that our experiments run in the ``lazy'' regime, +in which the neural tangent kernel (NTK) changes very little during the course +of training. This behaviour is associated with having no significant changes in +neural representations, explaining the lack of generalization. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Doubly Robust Regression Discontinuity Designs + + +
+ This study introduces a doubly robust (DR) estimator for regression +discontinuity (RD) designs. In RD designs, treatment effects are estimated in a +quasi-experimental setting where treatment assignment depends on whether a +running variable surpasses a predefined cutoff. A common approach in RD +estimation is to apply nonparametric regression methods, such as local linear +regression. In such an approach, the validity relies heavily on the consistency +of nonparametric estimators and is limited by the nonparametric convergence +rate, thereby preventing $\sqrt{n}$-consistency. To address these issues, we +propose the DR-RD estimator, which combines two distinct estimators for the +conditional expected outcomes. If either of these estimators is consistent, the +treatment effect estimator remains consistent. Furthermore, due to the +debiasing effect, our proposed estimator achieves $\sqrt{n}$-consistency if +both regression estimators satisfy certain mild conditions, which also +simplifies statistical inference. + +
+
+
+
+
+ + ☆ Optimal Control of Mechanical Ventilators with Learned Respiratory + Dynamics + + +
+ Deciding on appropriate mechanical ventilator management strategies +significantly impacts the health outcomes for patients with respiratory +diseases. Acute Respiratory Distress Syndrome (ARDS) is one such disease that +requires careful ventilator operation to be effectively treated. In this work, +we frame the management of ventilators for patients with ARDS as a sequential +decision making problem using the Markov decision process framework. We +implement and compare controllers based on clinical guidelines contained in the +ARDSnet protocol, optimal control theory, and learned latent dynamics +represented as neural networks. The Pulse Physiology Engine's respiratory +dynamics simulator is used to establish a repeatable benchmark, gather +simulated data, and quantitatively compare these controllers. We score +performance in terms of measured improvement in established ARDS health markers +(pertaining to improved respiratory rate, oxygenation, and vital signs). Our +results demonstrate that techniques leveraging neural networks and optimal +control can automatically discover effective ventilation management strategies +without access to explicit ventilator management procedures or guidelines (such +as those defined in the ARDSnet protocol). + +
+
+ comment: 2024 IEEE 37th International Symposium on Computer-Based Medical + Systems (CBMS), 7 pages, 3 figures +
+
+
+
+
+ + ☆ Sleep Staging from Airflow Signals Using Fourier Approximations of + Persistence Curves + + +
+ Sleep staging is a challenging task, typically manually performed by sleep +technologists based on electroencephalogram and other biosignals of patients +taken during overnight sleep studies. Recent work aims to leverage automated +algorithms to perform sleep staging not based on electroencephalogram signals, +but rather based on the airflow signals of subjects. Prior work uses ideas from +topological data analysis (TDA), specifically Hermite function expansions of +persistence curves (HEPC) to featurize airflow signals. However, finite order +HEPC captures only partial information. In this work, we propose Fourier +approximations of persistence curves (FAPC), and use this technique to perform +sleep staging based on airflow signals. We analyze performance using an XGBoost +model on 1155 pediatric sleep studies taken from the Nationwide Children's +Hospital Sleep DataBank (NCHSDB), and find that FAPC methods provide +complementary information to HEPC methods alone, leading to a 4.9% increase in +performance over baseline methods. + +
+
+
+
+
+ + ☆ On the Convergence of Continual Federated Learning Using Incrementally + Aggregated Gradients + + +
+ The holy grail of machine learning is to enable Continual Federated Learning +(CFL) to enhance the efficiency, privacy, and scalability of AI systems while +learning from streaming data. The primary challenge of a CFL system is to +overcome global catastrophic forgetting, wherein the accuracy of the global +model trained on new tasks declines on the old tasks. In this work, we propose +Continual Federated Learning with Aggregated Gradients (C-FLAG), a novel +replay-memory based federated strategy consisting of edge-based gradient +updates on memory and aggregated gradients on the current data. We provide +convergence analysis of the C-FLAG approach which addresses forgetting and bias +while converging at a rate of $O(1/\sqrt{T})$ over $T$ communication rounds. We +formulate an optimization sub-problem that minimizes catastrophic forgetting, +translating CFL into an iterative algorithm with adaptive learning rates that +ensure seamless learning across tasks. We empirically show that C-FLAG +outperforms several state-of-the-art baselines on both task and +class-incremental settings with respect to metrics such as accuracy and +forgetting. + +
+
+
+
+
+ + ☆ Tukey g-and-h neural network regression for non-Gaussian data + + +
+ This paper addresses non-Gaussian regression with neural networks via the use +of the Tukey g-and-h distribution. The Tukey g-and-h transform is a flexible +parametric transform with two parameters $g$ and $h$ which, when applied to a +standard normal random variable, introduces both skewness and kurtosis, +resulting in a distribution commonly called the Tukey g-and-h distribution. +Specific values of $g$ and $h$ produce good approximations to other families of +distributions, such as the Cauchy and student-t distributions. The flexibility +of the Tukey g-and-h distribution has driven its popularity in the statistical +community, in applied sciences and finance. In this work we consider the +training of a neural network to predict the parameters of a Tukey g-and-h +distribution in a regression framework via the minimization of the +corresponding negative log-likelihood, despite the latter having no closed-form +expression. We demonstrate the efficiency of our procedure in simulated +examples and apply our method to a real-world dataset of global crop yield for +several types of crops. Finally, we show how we can carry out a goodness-of-fit +analysis between the predicted distributions and the test data. A Pytorch +implementation is made available on Github and as a Pypi package. + +
+
+
+
+
+ + ☆ Learning Memory Mechanisms for Decision Making through Demonstrations + + +
+ In Partially Observable Markov Decision Processes, integrating an agent's +history into memory poses a significant challenge for decision-making. +Traditional imitation learning, relying on observation-action pairs for expert +demonstrations, fails to capture the expert's memory mechanisms used in +decision-making. To capture memory processes as demonstrations, we introduce +the concept of \textbf{memory dependency pairs} $(p, q)$ indicating that events +at time $p$ are recalled for decision-making at time $q$. We introduce +\textbf{AttentionTuner} to leverage memory dependency pairs in Transformers and +find significant improvements across several tasks compared to standard +Transformers when evaluated on Memory Gym and the Long-term Memory Benchmark. +Code is available at https://github.com/WilliamYue37/AttentionTuner . + +
+
+
+
+
+ + ☆ Towards Low-bit Communication for Tensor Parallel LLM Inference + + +
+ Tensor parallelism provides an effective way to increase server large +language model (LLM) inference efficiency despite adding an additional +communication cost. However, as server LLMs continue to scale in size, they +will need to be distributed across more devices, magnifying the communication +cost. One way to approach this problem is with quantization, but current +methods for LLMs tend to avoid quantizing the features that tensor parallelism +needs to communicate. Taking advantage of consistent outliers in communicated +features, we introduce a quantization method that reduces communicated values +on average from 16 bits to 4.2 bits while preserving nearly all of the original +performance. For instance, our method maintains around 98.0% and 99.5% of Gemma +2 27B's and Llama 2 13B's original performance, respectively, averaged across +all tasks we evaluated on. + +
+
+
+
+
+ + ☆ Doubly Mild Generalization for Offline Reinforcement Learning NeurIPS 2024 + + +
+ Offline Reinforcement Learning (RL) suffers from the extrapolation error and +value overestimation. From a generalization perspective, this issue can be +attributed to the over-generalization of value functions or policies towards +out-of-distribution (OOD) actions. Significant efforts have been devoted to +mitigating such generalization, and recent in-sample learning approaches have +further succeeded in entirely eschewing it. Nevertheless, we show that mild +generalization beyond the dataset can be trusted and leveraged to improve +performance under certain conditions. To appropriately exploit generalization +in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild +action generalization and (ii) mild generalization propagation. The former +refers to selecting actions in a close neighborhood of the dataset to maximize +the Q values. Even so, the potential erroneous generalization can still be +propagated, accumulated, and exacerbated by bootstrapping. In light of this, +the latter concept is introduced to mitigate the generalization propagation +without impeding the propagation of RL learning signals. Theoretically, DMG +guarantees better performance than the in-sample optimal policy in the oracle +generalization scenario. Even under worst-case generalization, DMG can still +control value overestimation at a certain level and lower bound the +performance. Empirically, DMG achieves state-of-the-art performance across +Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting +from its flexibility in both generalization aspects, DMG enjoys a seamless +transition from offline to online learning and attains strong online +fine-tuning performance. + +
+
+ comment: Accepted to NeurIPS 2024. arXiv admin note: substantial text overlap + with arXiv:2410.19400 +
+
+
+
+
+ + Prediction of Acoustic Communication Performance for AUVs using Gaussian + Process Classification + + +
+ Cooperating autonomous underwater vehicles (AUVs) often rely on acoustic +communication to coordinate their actions effectively. However, the reliability +of underwater acoustic communication decreases as the communication range +between vehicles increases. Consequently, teams of cooperating AUVs typically +make conservative assumptions about the maximum range at which they can +communicate reliably. To address this limitation, we propose a novel approach +that involves learning a map representing the probability of successful +communication based on the locations of the transmitting and receiving +vehicles. This probabilistic communication map accounts for factors such as the +range between vehicles, environmental noise, and multi-path effects at a given +location. In pursuit of this goal, we investigate the application of Gaussian +process binary classification to generate the desired communication map. We +specialize existing results to this specific binary classification problem and +explore methods to incorporate uncertainty in vehicle location into the mapping +process. Furthermore, we compare the prediction performance of the probability +communication map generated using binary classification with that of a +signal-to-noise ratio (SNR) communication map generated using Gaussian process +regression. Our approach is experimentally validated using communication and +navigation data collected during trials with a pair of Virginia Tech 690 AUVs. + +
+
+
+
+
+ + ☆ A Stochastic Optimization Framework for Private and Fair Learning From + Decentralized Data + + +
+ Machine learning models are often trained on sensitive data (e.g., medical +records and race/gender) that is distributed across different "silos" (e.g., +hospitals). These federated learning models may then be used to make +consequential decisions, such as allocating healthcare resources. Two key +challenges emerge in this setting: (i) maintaining the privacy of each person's +data, even if other silos or an adversary with access to the central server +tries to infer this data; (ii) ensuring that decisions are fair to different +demographic groups (e.g., race/gender). In this paper, we develop a novel +algorithm for private and fair federated learning (FL). Our algorithm satisfies +inter-silo record-level differential privacy (ISRL-DP), a strong notion of +private FL requiring that silo i's sent messages satisfy record-level +differential privacy for all i. Our framework can be used to promote different +fairness notions, including demographic parity and equalized odds. We prove +that our algorithm converges under mild smoothness assumptions on the loss +function, whereas prior work required strong convexity for convergence. As a +byproduct of our analysis, we obtain the first convergence guarantee for +ISRL-DP nonconvex-strongly concave min-max FL. Experiments demonstrate the +state-of-the-art fairness-accuracy tradeoffs of our algorithm across different +privacy levels. + +
+
+
+
+
+ + ☆ INTRABENCH: Interactive Radiological Benchmark + + +
+ Current interactive segmentation approaches, inspired by the success of +META's Segment Anything model, have achieved notable advancements, however, +they come with substantial limitations that hinder their practical application +in real clinical scenarios. These include unrealistic human interaction +requirements, such as slice-by-slice operations for 2D models on 3D data, a +lack of iterative refinement, and insufficient evaluation experiments. These +shortcomings prevent accurate assessment of model performance and lead to +inconsistent outcomes across studies. IntRaBench overcomes these challenges by +offering a comprehensive and reproducible framework for evaluating interactive +segmentation methods in realistic, clinically relevant scenarios. It includes +diverse datasets, target structures, and segmentation models, and provides a +flexible codebase that allows seamless integration of new models and prompting +strategies. Additionally, we introduce advanced techniques to minimize +clinician interaction, ensuring fair comparisons between 2D and 3D models. By +open-sourcing IntRaBench, we invite the research community to integrate their +models and prompting techniques, ensuring continuous and transparent evaluation +of interactive segmentation models in 3D medical imaging. + +
+
+ comment: Undergoing Peer-Review +
+
+
+
+
+ + ☆ Diverse capability and scaling of diffusion and auto-regressive models + when learning abstract rules NeurIPS2024 + + +
+ Humans excel at discovering regular structures from limited samples and +applying inferred rules to novel settings. We investigate whether modern +generative models can similarly learn underlying rules from finite samples and +perform reasoning through conditional sampling. Inspired by Raven's Progressive +Matrices task, we designed GenRAVEN dataset, where each sample consists of +three rows, and one of 40 relational rules governing the object position, +number, or attributes applies to all rows. We trained generative models to +learn the data distribution, where samples are encoded as integer arrays to +focus on rule learning. We compared two generative model families: diffusion +(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their +ability to generate structurally consistent samples and perform panel +completion via unconditional and conditional sampling. We found diffusion +models excel at unconditional generation, producing more novel and consistent +samples from scratch and memorizing less, but performing less well in panel +completion, even with advanced conditional sampling methods. Conversely, +autoregressive models excel at completing missing panels in a rule-consistent +manner but generate less consistent samples unconditionally. We observe diverse +data scaling behaviors: for both model families, rule learning emerges at a +certain dataset size - around 1000s examples per rule. With more training data, +diffusion models improve both their unconditional and conditional generation +capabilities. However, for autoregressive models, while panel completion +improves with more training data, unconditional generation consistency +declines. Our findings highlight complementary capabilities and limitations of +diffusion and autoregressive models in rule learning and reasoning tasks, +suggesting avenues for further research into their mechanisms and potential for +human-like reasoning. + +
+
+ comment: 12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2 + Reasoning At Scale as long paper +
+
+
+
+
+ + ☆ CDXFormer: Boosting Remote Sensing Change Detection with Extended Long + Short-Term Memory + + +
+ In complex scenes and varied conditions, effectively integrating +spatial-temporal context is crucial for accurately identifying changes. +However, current RS-CD methods lack a balanced consideration of performance and +efficiency. CNNs lack global context, Transformers have quadratic computational +complexity, and Mambas are restricted by CUDA acceleration. In this paper, we +propose CDXFormer, with a core component that is a powerful XLSTM-based feature +enhancement layer, integrating the advantages of linear computational +complexity, global context perception, and strong interpret-ability. +Specifically, we introduce a scale-specific Feature Enhancer layer, +incorporating a Cross-Temporal Global Perceptron customized for +semantic-accurate deep features, and a Cross-Temporal Spatial Refiner +customized for detail-rich shallow features. Additionally, we propose a +Cross-Scale Interactive Fusion module to progressively interact global change +representations with spatial responses. Extensive experimental results +demonstrate that CDXFormer achieves state-of-the-art performance across three +benchmark datasets, offering a compelling balance between efficiency and +accuracy. Code is available at https://github.com/xwmaxwma/rschange. + +
+
+
+
+
+ + ☆ Tucano: Advancing Neural Text Generation for Portuguese + + +
+ Significant advances have been made in natural language processing in recent +years. However, our current deep learning approach to language modeling +requires substantial resources in terms of data and computation. One of the +side effects of this data-hungry paradigm is the current schism between +languages, separating those considered high-resource, where most of the +development happens and resources are available, and the low-resource ones, +which struggle to attain the same level of performance and autonomy. This study +aims to introduce a new set of resources to stimulate the future development of +neural text generation in Portuguese. In this work, we document the development +of GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting +to 200 billion tokens. Via this corpus, we trained a series of +decoder-transformers named Tucano. Our models perform equal or superior to +other Portuguese and multilingual language models of similar size in several +Portuguese benchmarks. The evaluation of our models also reveals that model +performance on many currently available benchmarks used by the Portuguese NLP +community has little to no correlation with the scaling of token ingestion +during training, highlighting the limitations of such evaluations when it comes +to the assessment of Portuguese generative language models. All derivatives of +our study are openly released on GitHub and Hugging Face. See +https://nkluge-correa.github.io/Tucano/ + +
+
+
+
+
+ + ☆ Evidential time-to-event prediction model with well-calibrated + uncertainty estimation + + +
+ Time-to-event analysis, or Survival analysis, provides valuable insights into +clinical prognosis and treatment recommendations. However, this task is +typically more challenging than other regression tasks due to the censored +observations. Moreover, concerns regarding the reliability of predictions +persist among clinicians, mainly attributed to the absence of confidence +assessment, robustness, and calibration of prediction. To address those +challenges, we introduce an evidential regression model designed especially for +time-to-event prediction tasks, with which the most plausible event time is +directly quantified by aggregated Gaussian random fuzzy numbers (GRFNs). The +GRFNs are a newly introduced family of random fuzzy subsets of the real line +that generalizes both Gaussian random variables and Gaussian possibility +distributions. Different from conventional methods that construct models based +on strict data distribution, e.g., proportional hazard function, our model only +assumes the event time is encoded in a real line GRFN without any strict +distribution assumption, therefore offering more flexibility in complex data +scenarios. Furthermore, the epistemic and aleatory uncertainty regarding the +event time is quantified within the aggregated GRFN as well. Our model can, +therefore, provide more detailed clinical decision-making guidance with two +more degrees of information. The model is fit by minimizing a generalized +negative log-likelihood function that accounts for data censoring based on +uncertainty evidence reasoning. Experimental results on simulated datasets with +varying data distributions and censoring scenarios, as well as on real-world +datasets across diverse clinical settings and tasks, demonstrate that our model +achieves both accurate and reliable performance, outperforming state-of-the-art +methods. + +
+
+
+
+
+ + ☆ FRUGAL: Memory-Efficient Optimization by Reducing State Overhead for + Scalable Training + + +
+ With the increase in the number of parameters in large language models, the +process of pre-training and fine-tuning increasingly demands larger volumes of +GPU memory. A significant portion of this memory is typically consumed by the +optimizer state. To overcome this challenge, recent approaches such as low-rank +adaptation (LoRA (Hu et al., 2021)), low-rank gradient projection (GaLore (Zhao +et al., 2024)), and blockwise optimization (BAdam (Luo et al., 2024)) have been +proposed. However, in all these algorithms, the $\textit{effective rank of the +weight updates remains low-rank}$, which can lead to a substantial loss of +information from the gradient. This loss can be critically important, +especially during the pre-training stage. In this paper, we introduce +$\texttt{FRUGAL}$ ($\textbf{F}$ull-$\textbf{R}$ank $\textbf{U}$pdates with +$\textbf{G}$r$\textbf{A}$dient sp$\textbf{L}$itting), a new memory-efficient +optimization framework. $\texttt{FRUGAL}$ leverages gradient splitting to +perform low-dimensional updates using advanced algorithms (such as Adam), while +updates along the remaining directions are executed via state-free methods like +SGD or signSGD (Bernstein et al., 2018). Our framework can be integrated with +various low-rank update selection techniques, including GaLore and BAdam. We +provide theoretical convergence guarantees for our framework when using SGDM +for low-dimensional updates and SGD for state-free updates. Additionally, our +method consistently outperforms concurrent approaches across various fixed +memory budgets, achieving state-of-the-art results in pre-training and +fine-tuning tasks while balancing memory efficiency and performance metrics. + +
+
+
+
+
+ + ☆ Dynamical-VAE-based Hindsight to Learn the Causal Dynamics of + Factored-POMDPs + + +
+ Learning representations of underlying environmental dynamics from partial +observations is a critical challenge in machine learning. In the context of +Partially Observable Markov Decision Processes (POMDPs), state representations +are often inferred from the history of past observations and actions. We +demonstrate that incorporating future information is essential to accurately +capture causal dynamics and enhance state representations. To address this, we +introduce a Dynamical Variational Auto-Encoder (DVAE) designed to learn causal +Markovian dynamics from offline trajectories in a POMDP. Our method employs an +extended hindsight framework that integrates past, current, and multi-step +future information within a factored-POMDP setting. Empirical results reveal +that this approach uncovers the causal graph governing hidden state transitions +more effectively than history-based and typical hindsight-based models. + +
+
+
+
+
+ + ☆ Suite-IN: Aggregating Motion Features from Apple Suite for Robust + Inertial Navigation + + +
+ With the rapid development of wearable technology, devices like smartphones, +smartwatches, and headphones equipped with IMUs have become essential for +applications such as pedestrian positioning. However, traditional pedestrian +dead reckoning (PDR) methods struggle with diverse motion patterns, while +recent data-driven approaches, though improving accuracy, often lack robustness +due to reliance on a single device. In our work, we attempt to enhance the +positioning performance using the low-cost commodity IMUs embedded in the +wearable devices. We propose a multi-device deep learning framework named +Suite-IN, aggregating motion data from Apple Suite for inertial navigation. +Motion data captured by sensors on different body parts contains both local and +global motion information, making it essential to reduce the negative effects +of localized movements and extract global motion representations from multiple +devices. + +
+
+
+
+
+ + ☆ Efficient Federated Finetuning of Tiny Transformers with + Resource-Constrained Devices + + +
+ In recent years, Large Language Models (LLMs) through Transformer structures +have dominated many machine learning tasks, especially text processing. +However, these models require massive amounts of data for training and induce +high resource requirements, particularly in terms of the large number of +Floating Point Operations (FLOPs) and the high amounts of memory needed. To +fine-tune such a model in a parameter-efficient way, techniques like Adapter or +LoRA have been developed. However, we observe that the application of LoRA, +when used in federated learning (FL), while still being parameter-efficient, is +memory and FLOP inefficient. Based on that observation, we develop a novel +layer finetuning scheme that allows devices in cross-device FL to make use of +pretrained neural networks (NNs) while adhering to given resource constraints. +We show that our presented scheme outperforms the current state of the art when +dealing with homogeneous or heterogeneous computation and memory constraints +and is on par with LoRA regarding limited communication, thereby achieving +significantly higher accuracies in FL training. + +
+
+
+
+
+ + ☆ Dual-Criterion Model Aggregation in Federated Learning: Balancing Data + Quantity and Quality + + +
+ Federated learning (FL) has become one of the key methods for +privacy-preserving collaborative learning, as it enables the transfer of models +without requiring local data exchange. Within the FL framework, an aggregation +algorithm is recognized as one of the most crucial components for ensuring the +efficacy and security of the system. Existing average aggregation algorithms +typically assume that all client-trained data holds equal value or that weights +are based solely on the quantity of data contributed by each client. In +contrast, alternative approaches involve training the model locally after +aggregation to enhance adaptability. However, these approaches fundamentally +ignore the inherent heterogeneity between different clients' data and the +complexity of variations in data at the aggregation stage, which may lead to a +suboptimal global model. + To address these issues, this study proposes a novel dual-criterion weighted +aggregation algorithm involving the quantity and quality of data from the +client node. Specifically, we quantify the data used for training and perform +multiple rounds of local model inference accuracy evaluation on a specialized +dataset to assess the data quality of each client. These two factors are +utilized as weights within the aggregation process, applied through a +dynamically weighted summation of these two factors. This approach allows the +algorithm to adaptively adjust the weights, ensuring that every client can +contribute to the global model, regardless of their data's size or initial +quality. Our experiments show that the proposed algorithm outperforms several +existing state-of-the-art aggregation approaches on both a general-purpose +open-source dataset, CIFAR-10, and a dataset specific to visual obstacle +avoidance. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Federated Low-Rank Adaptation with Differential Privacy over Wireless + Networks + + +
+ Fine-tuning large pre-trained foundation models (FMs) on distributed edge +devices presents considerable computational and privacy challenges. Federated +fine-tuning (FedFT) mitigates some privacy issues by facilitating collaborative +model training without the need to share raw data. To lessen the computational +burden on resource-limited devices, combining low-rank adaptation (LoRA) with +federated learning enables parameter-efficient fine-tuning. Additionally, the +split FedFT architecture partitions an FM between edge devices and a central +server, reducing the necessity for complete model deployment on individual +devices. However, the risk of privacy eavesdropping attacks in FedFT remains a +concern, particularly in sensitive areas such as healthcare and finance. In +this paper, we propose a split FedFT framework with differential privacy (DP) +over wireless networks, where the inherent wireless channel noise in the uplink +transmission is utilized to achieve DP guarantees without adding an extra +artificial noise. We shall investigate the impact of the wireless noise on +convergence performance of the proposed framework. We will also show that by +updating only one of the low-rank matrices in the split FedFT with DP, the +proposed method can mitigate the noise amplification effect. Simulation results +will demonstrate that the proposed framework achieves higher accuracy under +strict privacy budgets compared to baseline methods. + +
+
+ comment: 6 pages, 3 figures, submitted to IEEE ICC 2025 +
+
+
+
+
+ + ☆ Kernel-based retrieval models for hyperspectral image data optimized + with Kernel Flows + + +
+ Kernel-based statistical methods are efficient, but their performance depends +heavily on the selection of kernel parameters. In the literature, optimization +studies on kernel-based chemometric methods are limited and often reduced to +grid searching. Previously, the authors introduced Kernel Flows (KF) to learn +kernel parameters for Kernel Partial Least-Squares (K-PLS) regression. KF is +easy to implement and helps minimize overfitting. In cases of high collinearity +between spectra and biogeophysical quantities in spectroscopy, simpler methods +like Principal Component Regression (PCR) may be more suitable. In this study, +we propose a new KF-type approach to optimize Kernel Principal Component +Regression (K-PCR) and test it alongside KF-PLS. Both methods are benchmarked +against non-linear regression techniques using two hyperspectral remote sensing +datasets. + +
+
+
+
+
+ + ☆ PatchCTG: Patch Cardiotocography Transformer for Antepartum Fetal Health + Monitoring + + +
+ Antepartum Cardiotocography (CTG) is vital for fetal health monitoring, but +traditional methods like the Dawes-Redman system are often limited by high +inter-observer variability, leading to inconsistent interpretations and +potential misdiagnoses. This paper introduces PatchCTG, a transformer-based +model specifically designed for CTG analysis, employing patch-based +tokenisation, instance normalisation and channel-independent processing to +capture essential local and global temporal dependencies within CTG signals. +PatchCTG was evaluated on the Oxford Maternity (OXMAT) dataset, comprising over +20,000 CTG traces across diverse clinical outcomes after applying the inclusion +and exclusion criteria. With extensive hyperparameter optimisation, PatchCTG +achieved an AUC of 77%, with specificity of 88% and sensitivity of 57% at +Youden's index threshold, demonstrating adaptability to various clinical needs. +Testing across varying temporal thresholds showed robust predictive +performance, particularly with finetuning on data closer to delivery, achieving +a sensitivity of 52% and specificity of 88% for near-delivery cases. These +findings suggest the potential of PatchCTG to enhance clinical decision-making +in antepartum care by providing a reliable, objective tool for fetal health +assessment. The source code is available at +https://github.com/jaleedkhan/PatchCTG. + +
+
+
+
+
+ + ☆ Interaction Asymmetry: A General Principle for Learning Composable + Abstractions + + +
+ Learning disentangled representations of concepts and re-composing them in +unseen ways is crucial for generalizing to out-of-domain situations. However, +the underlying properties of concepts that enable such disentanglement and +compositional generalization remain poorly understood. In this work, we propose +the principle of interaction asymmetry which states: "Parts of the same concept +have more complex interactions than parts of different concepts". We formalize +this via block diagonality conditions on the $(n+1)$th order derivatives of the +generator mapping concepts to observed data, where different orders of +"complexity" correspond to different $n$. Using this formalism, we prove that +interaction asymmetry enables both disentanglement and compositional +generalization. Our results unify recent theoretical results for learning +concepts of objects, which we show are recovered as special cases with +$n\!=\!0$ or $1$. We provide results for up to $n\!=\!2$, thus extending these +prior works to more flexible generator functions, and conjecture that the same +proof strategies generalize to larger $n$. Practically, our theory suggests +that, to disentangle concepts, an autoencoder should penalize its latent +capacity and the interactions between concepts during decoding. We propose an +implementation of these criteria using a flexible Transformer-based VAE, with a +novel regularizer on the attention weights of the decoder. On synthetic image +datasets consisting of objects, we provide evidence that this model can achieve +comparable object disentanglement to existing models that use more explicit +object-centric priors. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ☆ Likelihood as a Performance Gauge for Retrieval-Augmented Generation NAACL 2025 + + +
+ Recent work finds that retrieval-augmented generation with large language +models is prone to be influenced by the order of retrieved documents in the +context. However, the lack of in-depth analysis limits the use of this +phenomenon for prompt engineering in practice. In this study, we posit that +likelihoods serve as an effective gauge for language model performance. Through +experiments on two question-answering datasets with a variety of +state-of-the-art language models, we reveal correlations between answer +accuracy and the likelihood of the question at both the corpus level and the +instance level. In addition, we find that question likelihood can also indicate +the position of the task-relevant information in the context. Based on these +findings, we propose two methods that use question likelihood as a gauge for +selecting and constructing prompts that lead to better performance. We +demonstrate their effectiveness with experiments. In addition, our +likelihood-based methods are efficient, as they only need to compute the +likelihood of the input, requiring much fewer language model passes than +heuristic prompt engineering methods that require generating responses. Our +analysis deepens our understanding of how input prompts affect model +performance and provides a promising direction for efficient prompt +optimization. + +
+
+ comment: Under review at NAACL 2025. Code is available at + https://github.com/lyutyuh/poptimizer +
+
+
+
+
+ + ☆ Automatic Album Sequencing + + +
+ Album sequencing is a critical part of the album production process. +Recently, a data-driven approach was proposed that sequences general +collections of independent media by extracting the narrative essence of the +items in the collections. While this approach implies an album sequencing +technique, it is not widely accessible to a less technical audience, requiring +advanced knowledge of machine learning techniques to use. To address this, we +introduce a new user-friendly web-based tool that allows a less technical +audience to upload music tracks, execute this technique in one click, and +subsequently presents the result in a clean visualization to the user. To both +increase the number of templates available to the user and address shortcomings +of previous work, we also introduce a new direct transformer-based album +sequencing method. We find that our more direct method outperforms a random +baseline but does not reach the same performance as the narrative essence +approach. Both methods are included in our web-based user interface, and this +-- alongside a full copy of our implementation -- is publicly available at +https://github.com/dylanashley/automatic-album-sequencing + +
+
+ comment: presented as a late breaking demo in the 25th International Society + for Music Information Retrieval Conference; 3 pages in main text, 3 figures + in main text; source code available at + https://github.com/dylanashley/automatic-album-sequencing +
+
+
+
+
+ + ☆ ASER: Activation Smoothing and Error Reconstruction for Large Language + Model Quantization + + +
+ Quantization stands as a pivotal technique for large language model (LLM) +serving, yet it poses significant challenges particularly in achieving +effective low-bit quantization. The limited numerical mapping makes the +quantized model produce a non-trivial error, bringing out intolerable +performance degradation. This paper is anchored in the basic idea of model +compression objectives, and delves into the layer-wise error distribution of +LLMs during post-training quantization. Subsequently, we introduce ASER, an +algorithm consisting of (1) Error Reconstruction: low-rank compensation for +quantization error with LoRA-style matrices constructed by whitening SVD; (2) +Activation Smoothing: outlier extraction to gain smooth activation and better +error compensation. ASER is capable of quantizing typical LLMs to low-bit ones, +particularly preserving accuracy even in W4A8 per-channel setup. Experimental +results show that ASER is competitive among the state-of-the-art quantization +algorithms, showing potential to activation quantization, with minor overhead. + +
+
+
+
+
+ + ☆ Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit + Q-Learning + + +
+ Offline Reinforcement Learning (RL) has emerged as a powerful alternative to +imitation learning for behavior modeling in various domains, particularly in +complex navigation tasks. An existing challenge with Offline RL is the +signal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to +errors in value estimates. Towards this, multiple works have demonstrated the +advantage of hierarchical offline RL methods, which decouple high-level path +planning from low-level path following. In this work, we present a novel +hierarchical transformer-based approach leveraging a learned quantizer of the +space. This quantization enables the training of a simpler zone-conditioned +low-level policy and simplifies planning, which is reduced to discrete +autoregressive prediction. Among other benefits, zone-level reasoning in +planning enables explicit trajectory stitching rather than implicit stitching +based on noisy value function estimates. By combining this transformer-based +planner with recent advancements in offline RL, our proposed approach achieves +state-of-the-art results in complex long-distance navigation environments. + +
+
+ comment: Under review. Code will be released upon acceptance +
+
+
+
+
+ + ☆ Spatially Regularized Graph Attention Autoencoder Framework for + Detecting Rainfall Extremes + + +
+ We introduce a novel Graph Attention Autoencoder (GAE) with spatial +regularization to address the challenge of scalable anomaly detection in +spatiotemporal rainfall data across India from 1990 to 2015. Our model +leverages a Graph Attention Network (GAT) to capture spatial dependencies and +temporal dynamics in the data, further enhanced by a spatial regularization +term ensuring geographic coherence. We construct two graph datasets employing +rainfall, pressure, and temperature attributes from the Indian Meteorological +Department and ERA5 Reanalysis on Single Levels, respectively. Our network +operates on graph representations of the data, where nodes represent geographic +locations, and edges, inferred through event synchronization, denote +significant co-occurrences of rainfall events. Through extensive experiments, +we demonstrate that our GAE effectively identifies anomalous rainfall patterns +across the Indian landscape. Our work paves the way for sophisticated +spatiotemporal anomaly detection methodologies in climate science, contributing +to better climate change preparedness and response strategies. + +
+
+
+
+
+ + ☆ Exploring the loss landscape of regularized neural networks via convex + duality + + +
+ We discuss several aspects of the loss landscape of regularized neural +networks: the structure of stationary points, connectivity of optimal +solutions, path with nonincreasing loss to arbitrary global optimum, and the +nonuniqueness of optimal solutions, by casting the problem into an equivalent +convex problem and considering its dual. Starting from two-layer neural +networks with scalar output, we first characterize the solution set of the +convex problem using its dual and further characterize all stationary points. +With the characterization, we show that the topology of the global optima goes +through a phase transition as the width of the network changes, and construct +counterexamples where the problem may have a continuum of optimal solutions. +Finally, we show that the solution set characterization and connectivity +results can be extended to different architectures, including two-layer +vector-valued neural networks and parallel three-layer neural networks. + +
+
+
+
+
+ + ☆ Convergence Rate Analysis of LION + + +
+ The LION (evoLved sIgn mOmeNtum) optimizer for deep neural network training +was found by Google via program search, with the simple sign update yet showing +impressive performance in training large scale networks. Although previous +studies have investigated its convergence properties, a comprehensive analysis, +especially the convergence rate, is still desirable. Recognizing that LION can +be regarded as solving a specific constrained problem, this paper focuses on +demonstrating its convergence to the Karush-Kuhn-Tucker (KKT) point at the rate +of $\cal O(\sqrt{d}K^{-1/4})$ measured by gradient $\ell_1$ norm, where $d$ is +the problem dimension and $K$ is the number of iteration steps. Going a step further, +we remove the constraint and establish that LION converges to the critical +point of the general unconstrained problem at the same rate. This rate not only +delivers the currently optimal dependence on the problem dimension $d$ but also +tightly matches the theoretical lower bound for nonconvex stochastic +optimization algorithms, which is typically measured using the gradient +$\ell_2$ norm, with respect to the number of iterations $K$. Through extensive +experiments, we not only demonstrate that LION achieves lower loss and higher +performance compared to standard SGD, but also empirically confirm that the +gradient $\ell_1/\ell_2$ norm ratio aligns with $\Theta(\sqrt{d})$, thus +proving that our convergence rate matches the theoretical lower bound with +respect to $d$ in the empirical sense. + +
+
+
+
+
+ + ☆ EMPERROR: A Flexible Generative Perception Error Model for Probing + Self-Driving Planners + + +
+ To handle the complexities of real-world traffic, learning planners for +self-driving from data is a promising direction. While recent approaches have +shown great progress, they typically assume a setting in which the ground-truth +world state is available as input. However, when deployed, planning needs to be +robust to the long-tail of errors incurred by a noisy perception system, which +is often neglected in evaluation. To address this, previous work has proposed +drawing adversarial samples from a perception error model (PEM) mimicking the +noise characteristics of a target object detector. However, these methods use +simple PEMs that fail to accurately capture all failure modes of detection. In +this paper, we present EMPERROR, a novel transformer-based generative PEM, +apply it to stress-test an imitation learning (IL)-based planner and show that +it imitates modern detectors more faithfully than previous work. Furthermore, +it is able to produce realistic noisy inputs that increase the planner's +collision rate by up to 85%, demonstrating its utility as a valuable tool for a +more complete evaluation of self-driving planners. + +
+
+ comment: Project page: https://lasnik.github.io/emperror/ +
+
+
+
+
+ + ☆ OWLed: Outlier-weighed Layerwise Pruning for Efficient Autonomous + Driving Framework + + +
+ The integration of Large Language Models (LLMs) into autonomous driving +systems offers promising enhancements in environmental understanding and +decision-making. However, the substantial computational demands of deploying +LLMs locally on vehicles render this approach unfeasible for real-world +automotive applications. To address this challenge, we introduce OWLed, the +Outlier-Weighed Layerwise Pruning for Efficient Autonomous Driving Framework +that leverages outlier-weighted layerwise sparsity for model compression. Our +method assigns non-uniform sparsity ratios to different layers based on the +distribution of outlier features, significantly reducing the model size without +the need for fine-tuning. To ensure the compressed model adapts well to +autonomous driving tasks, we incorporate driving environment data into both the +calibration and pruning processes. Our empirical studies reveal that the +encoder component is more sensitive to pruning than the LLM, highlighting its +critical role in the system. Experimental results demonstrate that OWLed +outperforms existing methods in perception, action prediction, and language +understanding while substantially lowering computational requirements. These +findings underscore the potential of combining advanced pruning techniques with +LLMs to develop efficient and robust autonomous driving systems capable of +handling complex scenarios. Code will be made publicly available. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Test Where Decisions Matter: Importance-driven Testing for Deep + Reinforcement Learning + + +
+ In many Deep Reinforcement Learning (RL) problems, decisions in a trained +policy vary in significance for the expected safety and performance of the +policy. Since RL policies are very complex, testing efforts should concentrate +on states in which the agent's decisions have the highest impact on the +expected outcome. In this paper, we propose a novel model-based method to +rigorously compute a ranking of state importance across the entire state space. +We then focus our testing efforts on the highest-ranked states. In this paper, +we focus on testing for safety. However, the proposed methods can be easily +adapted to test for performance. In each iteration, our testing framework +computes optimistic and pessimistic safety estimates. These estimates provide +lower and upper bounds on the expected outcomes of the policy execution across +all modeled states in the state space. Our approach divides the state space +into safe and unsafe regions upon convergence, providing clear insights into +the policy's weaknesses. Two important properties characterize our approach. +(1) Optimal Test-Case Selection: At any time in the testing process, our +approach evaluates the policy in the states that are most critical for safety. +(2) Guaranteed Safety: Our approach can provide formal verification guarantees +over the entire state space by sampling only a fraction of the policy. Any +safety properties assured by the pessimistic estimate are formally proven to +hold for the policy. We provide a detailed evaluation of our framework on +several examples, showing that our method discovers unsafe policy behavior with +low testing effort. + +
+
+
+
+
+ + ☆ What Do Learning Dynamics Reveal About Generalization in LLM Reasoning? + + +
+ Despite the remarkable capabilities of modern large language models (LLMs), +the mechanisms behind their problem-solving abilities remain elusive. In this +work, we aim to better understand how the learning dynamics of LLM finetuning +shapes downstream generalization. Our analysis focuses on reasoning tasks, +whose problem structure allows us to distinguish between memorization (the +exact replication of reasoning steps from the training data) and performance +(the correctness of the final solution). We find that a model's generalization +behavior can be effectively characterized by a training metric we call +pre-memorization train accuracy: the accuracy of model samples on training +queries before they begin to copy the exact reasoning steps from the training +set. On the dataset level, this metric is able to reliably predict test +accuracy, achieving $R^2$ of around or exceeding 0.9 across various models +(Llama3 8B, Gemma2 9B), datasets (GSM8k, MATH), and training configurations. On +a per-example level, this metric is also indicative of whether individual model +predictions are robust to perturbations in the training query. By connecting a +model's learning behavior to its generalization, pre-memorization train +accuracy can guide targeted improvements to training strategies. We focus on +data curation as an example, and show that prioritizing examples with low +pre-memorization accuracy leads to 1.5-2x improvements in data efficiency +compared to i.i.d. data scaling, and outperforms other standard data curation +techniques. + +
+
+
+
+
+ + ☆ Safe Exploitative Play with Untrusted Type Beliefs NeurIPS 2024 + + +
+ The combination of the Bayesian game and learning has a rich history, with +the idea of controlling a single agent in a system composed of multiple agents +with unknown behaviors given a set of types, each specifying a possible +behavior for the other agents. The idea is to plan an agent's own actions with +respect to those types which it believes are most likely to maximize the +payoff. However, the type beliefs are often learned from past actions and +likely to be incorrect. With this perspective in mind, we consider an agent in +a game with type predictions of other components, and investigate the impact of +incorrect beliefs on the agent's payoff. In particular, we formally define a +tradeoff between risk and opportunity by comparing the payoff obtained against +the optimal payoff, which is represented by a gap caused by trusting or +distrusting the learned beliefs. Our main results characterize the tradeoff by +establishing upper and lower bounds on the Pareto front for both normal-form +and stochastic Bayesian games, with numerical results provided. + +
+
+ comment: 26 pages, NeurIPS 2024 +
+
+
+
+
+ + ☆ Rethinking Structure Learning For Graph Neural Networks + + +
+ To improve the performance of Graph Neural Networks (GNNs), Graph Structure +Learning (GSL) has been extensively applied to reconstruct or refine original +graph structures, effectively addressing issues like heterophily, +over-squashing, and noisy structures. While GSL is generally thought to improve +GNN performance, it often leads to longer training times and more +hyperparameter tuning. Besides, the distinctions among current GSL methods +remain ambiguous from the perspective of GNN training, and there is a lack of +theoretical analysis to quantify their effectiveness. Recent studies further +suggest that, under fair comparisons with the same hyperparameter tuning, GSL +does not consistently outperform baseline GNNs. This motivates us to ask a +critical question: is GSL really useful for GNNs? To address this question, +this paper makes two key contributions. First, we propose a new GSL framework, +which includes three steps: GSL base (the representation used for GSL) +construction, new structure construction, and view fusion, to better understand +the effectiveness of GSL in GNNs. Second, after graph convolution, we analyze +the differences in mutual information (MI) between node representations derived +from the original topology and those from the newly constructed topology. +Surprisingly, our empirical observations and theoretical analysis show that no +matter which type of graph structure construction methods are used, after +feeding the same GSL bases to the newly constructed graph, there is no MI gain +compared to the original GSL bases. To fairly reassess the effectiveness of +GSL, we conduct ablation experiments and find that it is the pretrained GSL +bases that enhance GNN performance, and in most cases, GSL cannot improve GNN +performance. This finding encourages us to rethink the essential components in +GNNs, such as self-training and structural encoding, in GNN design rather than +GSL. + +
+
+
+
+
+ + ☆ Is Graph Convolution Always Beneficial For Every Feature? + + +
+ Graph Neural Networks (GNNs) have demonstrated strong capabilities in +processing structured data. While traditional GNNs typically treat each feature +dimension equally during graph convolution, we raise an important question: Is +the graph convolution operation equally beneficial for each feature? If not, +the convolution operation on certain feature dimensions can possibly lead to +harmful effects, even worse than the convolution-free models. In prior studies, +to assess the impacts of graph convolution on features, people proposed metrics +based on feature homophily to measure feature consistency with the graph +topology. However, these metrics have shown unsatisfactory alignment with GNN +performance and have not been effectively employed to guide feature selection +in GNNs. To address these limitations, we introduce a novel metric, Topological +Feature Informativeness (TFI), to distinguish between GNN-favored and +GNN-disfavored features, where its effectiveness is validated through both +theoretical analysis and empirical observations. Based on TFI, we propose a +simple yet effective Graph Feature Selection (GFS) method, which processes +GNN-favored and GNN-disfavored features separately, using GNNs and non-GNN +models. Compared to original GNNs, GFS significantly improves the extraction of +useful topological information from each feature with comparable computational +costs. Extensive experiments show that after applying GFS to 8 baseline and +state-of-the-art (SOTA) GNN architectures across 10 datasets, 83.75% of the +GFS-augmented cases show significant performance boosts. Furthermore, our +proposed TFI metric outperforms other feature selection methods. These results +validate the effectiveness of both GFS and TFI. Additionally, we demonstrate +that GFS's improvements are robust to hyperparameter tuning, highlighting its +potential as a universal method for enhancing various GNN architectures. + +
+
+
+
+
+ + ☆ Understanding Audiovisual Deepfake Detection: Techniques, Challenges, + Human Factors and Perceptual Insights + + +
+ Deep Learning has been successfully applied in diverse fields, and its impact +on deepfake detection is no exception. Deepfakes are fake yet realistic +synthetic content that can be used deceitfully for political impersonation, +phishing, slandering, or spreading misinformation. Despite extensive research +on unimodal deepfake detection, identifying complex deepfakes through joint +analysis of audio and visual streams remains relatively unexplored. To fill +this gap, this survey first provides an overview of audiovisual deepfake +generation techniques, applications, and their consequences, and then provides +a comprehensive review of state-of-the-art methods that combine audio and +visual modalities to enhance detection accuracy, summarizing and critically +analyzing their strengths and limitations. Furthermore, we discuss existing +open source datasets for a deeper understanding, which can contribute to the +research community and provide necessary information to beginners who want to +analyze deep learning-based audiovisual methods for video forensics. By +bridging the gap between unimodal and multimodal approaches, this paper aims to +improve the effectiveness of deepfake detection strategies and guide future +research in cybersecurity and media integrity. + +
+
+
+
+
+ + ☆ xCG: Explainable Cell Graphs for Survival Prediction in Non-Small Cell + Lung Cancer ML4H + + +
+ Understanding how deep learning models predict oncology patient risk can +provide critical insights into disease progression, support clinical +decision-making, and pave the way for trustworthy and data-driven precision +medicine. Building on recent advances in the spatial modeling of the tumor +microenvironment using graph neural networks, we present an explainable cell +graph (xCG) approach for survival prediction. We validate our model on a public +cohort of imaging mass cytometry (IMC) data for 416 cases of lung +adenocarcinoma. We explain survival predictions in terms of known phenotypes on +the cell level by computing risk attributions over cell graphs, for which we +propose an efficient grid-based layer-wise relevance propagation (LRP) method. +Our ablation studies highlight the importance of incorporating the cancer stage +and model ensembling to improve the quality of risk estimates. Our xCG method, +together with the IMC data, is made publicly available to support further +research. + +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 11 pages +
+
+
+
+
+ + ☆ Top-$nσ$: Not All Logits Are You Need + + +
+ Large language models (LLMs) typically employ greedy decoding or +low-temperature sampling for reasoning tasks, reflecting a perceived trade-off +between diversity and accuracy. We challenge this convention by introducing +top-$n\sigma$, a novel sampling method that operates directly on pre-softmax +logits by leveraging a statistical threshold. Our key insight is that logits +naturally separate into a Gaussian-distributed noisy region and a distinct +informative region, enabling efficient token filtering without complex +probability manipulations. Unlike existing methods (e.g., top-$p$, min-$p$) +that inadvertently include more noise tokens at higher temperatures, +top-$n\sigma$ maintains a stable sampling space regardless of temperature +scaling. We also provide a theoretical analysis of top-$n\sigma$ to better +understand its behavior. The extensive experimental results across four +reasoning-focused datasets demonstrate that our method not only outperforms +existing sampling approaches but also surpasses greedy decoding, while +maintaining consistent performance even at high temperatures. + +
+
+
+
+
+ + ☆ Exploring Multi-Agent Reinforcement Learning for Unrelated Parallel + Machine Scheduling + + +
+ Scheduling problems pose significant challenges in resource, industry, and +operational management. This paper addresses the Unrelated Parallel Machine +Scheduling Problem (UPMS) with setup times and resources using a Multi-Agent +Reinforcement Learning (MARL) approach. The study introduces the Reinforcement +Learning environment and conducts empirical analyses, comparing MARL with +Single-Agent algorithms. The experiments employ various deep neural network +policies for single- and Multi-Agent approaches. Results demonstrate the +efficacy of the Maskable extension of the Proximal Policy Optimization (PPO) +algorithm in Single-Agent scenarios and the Multi-Agent PPO algorithm in +Multi-Agent setups. While Single-Agent algorithms perform adequately in reduced +scenarios, Multi-Agent approaches reveal challenges in cooperative learning but +a scalable capacity. This research contributes insights into applying MARL +techniques to scheduling optimization, emphasizing the need for algorithmic +sophistication balanced with scalability for intelligent scheduling solutions. + +
+
+ comment: 11 pages, 5 figures, 4 tables, article submitted to a journal +
+
+
+
+
+ + ☆ CJST: CTC Compressor based Joint Speech and Text Training for + Decoder-Only ASR ICASSP2025 + + +
+ CTC compressor can be an effective approach to integrate audio encoders to +decoder-only models, which has gained growing interest for different speech +applications. In this work, we propose a novel CTC compressor based joint +speech and text training (CJST) framework for decoder-only ASR. CJST matches +speech and text modalities from both directions by exploring a simple modality +adaptor and several features of the CTC compressor, including sequence +compression, on-the-fly forced peaky alignment and CTC class embeddings. +Experimental results on the Librispeech and TED-LIUM2 corpora show that the +proposed CJST achieves an effective text injection without the need of duration +handling, leading to the best performance for both in-domain and cross-domain +scenarios. We also provide a comprehensive study on CTC compressor, covering +various compression modes, edge case handling and behavior under both clean and +noisy data conditions, which reveals the most robust setting to use CTC +compressor for decoder-only models. + +
+
+ comment: submitted to ICASSP2025 +
+
+
+
+
+ + ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the expressive power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds to Transformer-like architecture. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +tighter circuit complexity bound for Transformers with $\mathsf{RoPE}$ +attention. Our key contribution is that we show that unless $\mathsf{TC}^0 = +\mathsf{NC}^1$, a $\mathsf{RoPE}$-based Transformer with +$\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \leq O(n)$ +cannot solve the arithmetic problem or the Boolean formula value problem. This +result significantly demonstrates the fundamental limitation of the +expressivity of the $\mathsf{RoPE}$-based Transformer architecture, although it +achieves giant empirical success. Our theoretical framework not only +establishes tighter complexity bounds but also may instruct further work on the +$\mathsf{RoPE}$-based Transformer. + +
+
+
+
+
+ + ☆ SegQC: a segmentation network-based framework for multi-metric + segmentation quality control and segmentation error detection in volumetric + medical images + + +
+ Quality control of structures segmentation in volumetric medical images is +important for identifying segmentation errors in clinical practice and for +facilitating model development. This paper introduces SegQC, a novel framework +for segmentation quality estimation and segmentation error detection. SegQC +computes an estimate measure of the quality of a segmentation in volumetric +scans and in their individual slices and identifies possible segmentation error +regions within a slice. The key components include: 1. SegQC-Net, a deep +network that inputs a scan and its segmentation mask and outputs segmentation +error probabilities for each voxel in the scan; 2. three new segmentation +quality metrics, two overlap metrics and a structure size metric, computed from +the segmentation error probabilities; 3. a new method for detecting possible +segmentation errors in scan slices computed from the segmentation error +probabilities. We introduce a new evaluation scheme to measure segmentation +error discrepancies based on an expert radiologist's corrections of automatically +produced segmentations that yields smaller observer variability and is closer +to actual segmentation errors. We demonstrate SegQC on three fetal structures +in 198 fetal MRI scans: fetal brain, fetal body and the placenta. To assess the +benefits of SegQC, we compare it to the unsupervised Test Time Augmentation +(TTA)-based quality estimation. Our studies indicate that SegQC outperforms +TTA-based quality estimation in terms of Pearson correlation and MAE for fetal +body and fetal brain structures segmentation. Our segmentation error detection +method achieved recall and precision rates of 0.77 and 0.48 for fetal body, and +0.74 and 0.55 for fetal brain segmentation error detection respectively. SegQC +enhances segmentation metrics estimation for whole scans and individual slices, +as well as provides error regions detection. + +
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ☆ Decision Feedback In-Context Symbol Detection over Block-Fading Channels + + +
+ Pre-trained Transformers, through in-context learning (ICL), have +demonstrated exceptional capabilities to adapt to new tasks using example +prompts \textit{without model update}. Transformer-based wireless receivers, +where prompts consist of the pilot data in the form of transmitted and received +signal pairs, have shown high estimation accuracy when pilot data are abundant. +However, pilot information is often costly and limited in practice. In this +work, we propose the \underline{DE}cision \underline{F}eedback +\underline{IN}-Cont\underline{E}xt \underline{D}etection (DEFINED) solution as +a new wireless receiver design, which bypasses channel estimation and directly +performs symbol detection using the (sometimes extremely) limited pilot data. +The key innovation in DEFINED is the proposed decision feedback mechanism in +ICL, where we sequentially incorporate the detected symbols into the prompts to +improve the detections for subsequent symbols. Extensive experiments across a +broad range of wireless communication settings demonstrate that DEFINED +achieves significant performance improvements, in some cases only needing a +single pilot pair. + +
+
+
+
+
+ + ☆ Entropy Controllable Direct Preference Optimization + + +
+ In the post-training of large language models (LLMs), Reinforcement Learning +from Human Feedback (RLHF) is an effective approach to achieve generation +aligned with human preferences. Direct Preference Optimization (DPO) allows for +policy training with a simple binary cross-entropy loss without a reward model. +The objective of DPO is regularized by reverse KL divergence that encourages +mode-seeking fitting to the reference policy. Nonetheless, we indicate that +minimizing reverse KL divergence could fail to capture a mode of the reference +distribution, which may hurt the policy's performance. Based on this +observation, we propose a simple modification to DPO, H-DPO, which allows for +control over the entropy of the resulting policy, enhancing the distribution's +sharpness and thereby enabling mode-seeking fitting more effectively. In our +experiments, we show that H-DPO outperformed DPO across various tasks, +demonstrating superior results in pass@$k$ evaluations for mathematical tasks. +Moreover, H-DPO is simple to implement, requiring only minor modifications to +the loss calculation of DPO, which makes it highly practical and promising for +wide-ranging applications in the training of LLMs. + +
+
+
+
+
+ + ☆ Overcoming the Curse of Dimensionality in Reinforcement Learning Through + Approximate Factorization + + +
+ Reinforcement Learning (RL) algorithms are known to suffer from the curse of +dimensionality, which refers to the fact that large-scale problems often lead +to exponentially high sample complexity. A common solution is to use deep +neural networks for function approximation; however, such approaches typically +lack theoretical guarantees. To provably address the curse of dimensionality, +we observe that many real-world problems exhibit task-specific model structures +that, when properly leveraged, can improve the sample efficiency of RL. +Building on this insight, we propose overcoming the curse of dimensionality by +approximately factorizing the original Markov decision processes (MDPs) into +smaller, independently evolving MDPs. This factorization enables the +development of sample-efficient RL algorithms in both model-based and +model-free settings, with the latter involving a variant of variance-reduced +Q-learning. We provide improved sample complexity guarantees for both proposed +algorithms. Notably, by leveraging model structure through the approximate +factorization of the MDP, the dependence of sample complexity on the size of +the state-action space can be exponentially reduced. Numerically, we +demonstrate the practicality of our proposed methods through experiments on +both synthetic MDP tasks and a wind farm-equipped storage control problem. + +
+
+ comment: 61 pages, 10 figures +
+
+
+
+
+ + ☆ Disentangling Tabular Data towards Better One-Class Anomaly Detection + + +
+ Tabular anomaly detection under the one-class classification setting poses a +significant challenge, as it involves accurately conceptualizing "normal" +derived exclusively from a single category to discern anomalies from normal +data variations. Capturing the intrinsic correlation among attributes within +normal samples presents one promising method for learning the concept. To do +so, the most recent effort relies on a learnable mask strategy with a +reconstruction task. However, this wisdom may suffer from the risk of producing +uniform masks, i.e., essentially nothing is masked, leading to less effective +correlation learning. To address this issue, we presume that attributes related +to others in normal samples can be divided into two non-overlapping and +correlated subsets, defined as CorrSets, to capture the intrinsic correlation +effectively. Accordingly, we introduce an innovative method that disentangles +CorrSets from normal tabular data. To our knowledge, this is a pioneering +effort to apply the concept of disentanglement for one-class anomaly detection +on tabular data. Extensive experiments on 20 tabular datasets show that our +method substantially outperforms the state-of-the-art methods and leads to an +average performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC. + +
+
+
+
+
+ + ☆ Uncertainty-Aware Test-Time Adaptation for Inverse Consistent + Diffeomorphic Lung Image Registration + + +
+ Diffeomorphic deformable image registration ensures smooth invertible +transformations across inspiratory and expiratory chest CT scans. Yet, in +practice, deep learning-based diffeomorphic methods struggle to capture large +deformations between inspiratory and expiratory volumes, and therefore lack +inverse consistency. Existing methods also fail to account for model +uncertainty, which can be useful for improving performance. We propose an +uncertainty-aware test-time adaptation framework for inverse consistent +diffeomorphic lung registration. Our method uses Monte Carlo (MC) dropout to +estimate spatial uncertainty that is used to improve model performance. We +train and evaluate our method for inspiratory-to-expiratory CT registration on +a large cohort of 675 subjects from the COPDGene study, achieving a higher Dice +similarity coefficient (DSC) between the lung boundaries (0.966) compared to +both VoxelMorph (0.953) and TransMorph (0.953). Our method demonstrates +consistent improvements in the inverse registration direction as well with an +overall DSC of 0.966, higher than VoxelMorph (0.958) and TransMorph (0.956). +Paired t-tests indicate statistically significant improvements. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Zer0-Jack: A Memory-efficient Gradient-based Jailbreaking Method for + Black-box Multi-modal Large Language Models + + +
+ Jailbreaking methods, which induce Multi-modal Large Language Models (MLLMs) +to output harmful responses, raise significant safety concerns. Among these +methods, gradient-based approaches, which use gradients to generate malicious +prompts, have been widely studied due to their high success rates in white-box +settings, where full access to the model is available. However, these methods +have notable limitations: they require white-box access, which is not always +feasible, and involve high memory usage. To address scenarios where white-box +access is unavailable, attackers often resort to transfer attacks. In transfer +attacks, malicious inputs generated using white-box models are applied to +black-box models, but this typically results in reduced attack performance. To +overcome these challenges, we propose Zer0-Jack, a method that bypasses the +need for white-box access by leveraging zeroth-order optimization. We propose +patch coordinate descent to efficiently generate malicious image inputs to +directly attack black-box MLLMs, which significantly reduces memory usage +further. Through extensive experiments, Zer0-Jack achieves a high attack +success rate across various models, surpassing previous transfer-based methods +and performing comparably with existing white-box jailbreak techniques. +Notably, Zer0-Jack achieves a 95\% attack success rate on MiniGPT-4 with the +Harmful Behaviors Multi-modal Dataset on a black-box setting, demonstrating its +effectiveness. Additionally, we show that Zer0-Jack can directly attack +commercial MLLMs such as GPT-4o. Codes are provided in the supplement. + +
+
+ comment: Accepted to Neurips SafeGenAi Workshop 2024 +
+
+
+
+
+ + ☆ Exogenous Randomness Empowering Random Forests + + +
+ We offer theoretical and empirical insights into the impact of exogenous +randomness on the effectiveness of random forests with tree-building rules +independent of training data. We formally introduce the concept of exogenous +randomness and identify two types of commonly existing randomness: Type I from +feature subsampling, and Type II from tie-breaking in tree-building processes. +We develop non-asymptotic expansions for the mean squared error (MSE) for both +individual trees and forests and establish sufficient and necessary conditions +for their consistency. In the special example of the linear regression model +with independent features, our MSE expansions are more explicit, providing more +understanding of the random forests' mechanisms. It also allows us to derive an +upper bound on the MSE with explicit consistency rates for trees and forests. +Guided by our theoretical findings, we conduct simulations to further explore +how exogenous randomness enhances random forest performance. Our findings +unveil that feature subsampling reduces both the bias and variance of random +forests compared to individual trees, serving as an adaptive mechanism to +balance bias and variance. Furthermore, our results reveal an intriguing +phenomenon: the presence of noise features can act as a "blessing" in enhancing +the performance of random forests thanks to feature subsampling. + +
+
+ comment: 103 pages, 10 figures +
+
+
+
+
+ + ☆ Unraveling the Gradient Descent Dynamics of Transformers + + +
+ While the Transformer architecture has achieved remarkable success across +various domains, a thorough theoretical foundation explaining its optimization +dynamics is yet to be fully developed. In this study, we aim to bridge this +understanding gap by answering the following two core questions: (1) Which +types of Transformer architectures allow Gradient Descent (GD) to achieve +guaranteed convergence? and (2) Under what initial conditions and architectural +specifics does the Transformer achieve rapid convergence during training? By +analyzing the loss landscape of a single Transformer layer using Softmax and +Gaussian attention kernels, our work provides concrete answers to these +questions. Our findings demonstrate that, with appropriate weight +initialization, GD can train a Transformer model (with either kernel type) to +achieve a global optimal solution, especially when the input embedding +dimension is large. Nonetheless, certain scenarios highlight potential +pitfalls: training a Transformer using the Softmax attention kernel may +sometimes lead to suboptimal local solutions. In contrast, the Gaussian +attention kernel exhibits a much more favorable behavior. Our empirical study +further validates the theoretical findings. + +
+
+
+
+
+ + ☆ Accident Impact Prediction based on a deep convolutional and recurrent + neural network model + + +
+ Traffic accidents pose a significant threat to public safety, resulting in +numerous fatalities, injuries, and a substantial economic burden each year. The +development of predictive models capable of real-time forecasting of +post-accident impact using readily available data can play a crucial role in +preventing adverse outcomes and enhancing overall safety. However, existing +accident predictive models encounter two main challenges: first, reliance on +either costly or non-real-time data, and second, the absence of a comprehensive +metric to measure post-accident impact accurately. To address these +limitations, this study proposes a deep neural network model known as the +cascade model. It leverages readily available real-world data from Los Angeles +County to predict post-accident impacts. The model consists of two components: +Long Short-Term Memory (LSTM) and Convolutional Neural Network (CNN). The LSTM +model captures temporal patterns, while the CNN extracts patterns from the +sparse accident dataset. Furthermore, an external traffic congestion dataset is +incorporated to derive a new feature called the "accident impact" factor, which +quantifies the influence of an accident on surrounding traffic flow. Extensive +experiments were conducted to demonstrate the effectiveness of the proposed +hybrid machine learning method in predicting the post-accident impact compared +to state-of-the-art baselines. The results reveal a higher precision in +predicting minimal impacts (i.e., cases with no reported accidents) and a +higher recall in predicting more significant impacts (i.e., cases with reported +accidents). + +
+
+ comment: 28 pages, 18 figures +
+
+
+
+
+ + ☆ Model Stealing for Any Low-Rank Language Model + + +
+ Model stealing, where a learner tries to recover an unknown model via +carefully chosen queries, is a critical problem in machine learning, as it +threatens the security of proprietary models and the privacy of data they are +trained on. In recent years, there has been particular interest in stealing +large language models (LLMs). In this paper, we aim to build a theoretical +understanding of stealing language models by studying a simple and +mathematically tractable setting. We study model stealing for Hidden Markov +Models (HMMs), and more generally low-rank language models. + We assume that the learner works in the conditional query model, introduced +by Kakade, Krishnamurthy, Mahajan and Zhang. Our main result is an efficient +algorithm in the conditional query model, for learning any low-rank +distribution. In other words, our algorithm succeeds at stealing any language +model whose output distribution is low-rank. This improves upon the previous +result by Kakade, Krishnamurthy, Mahajan and Zhang, which also requires the +unknown distribution to have high "fidelity", a property that holds only in +restricted cases. There are two key insights behind our algorithm: First, we +represent the conditional distributions at each timestep by constructing +barycentric spanners among a collection of vectors of exponentially large +dimension. Second, for sampling from our representation, we iteratively solve a +sequence of convex optimization problems that involve projection in relative +entropy to prevent compounding of errors over the length of the sequence. This +is an interesting example where, at least theoretically, allowing a machine +learning model to solve more complex problems at inference time can lead to +drastic improvements in its performance. + +
+
+
+
+
+ + ☆ Effective Virtual Reality Teleoperation of an Upper-body Humanoid with + Modified Task Jacobians and Relaxed Barrier Functions for Self-Collision + Avoidance IROS 2022 + + +
+ We present an approach for retargeting off-the-shelf Virtual Reality (VR) +trackers to effectively teleoperate an upper-body humanoid while ensuring +self-collision-free motions. Key to the effectiveness was the proper assignment +of trackers to joint sets via modified task Jacobians and relaxed barrier +functions for self-collision avoidance. The approach was validated on +Apptronik's Astro hardware by demonstrating manipulation capabilities on a +table-top environment with pick-and-place box packing and a two-handed box pick +up and handover task. + +
+
+ comment: XR & Robotics Workshop, IROS 2022 +
+
+
+
+
+ + ☆ SecEncoder: Logs are All You Need in Security + + +
+ Large and Small Language Models (LMs) are typically pretrained using +extensive volumes of text, which are sourced from publicly accessible platforms +such as Wikipedia, Book Corpus, or through web scraping. These models, due to +their exposure to a wide range of language data, exhibit impressive +generalization capabilities and can perform a multitude of tasks +simultaneously. However, they often fall short when it comes to domain-specific +tasks due to their broad training data. This paper introduces SecEncoder, a +specialized small language model that is pretrained using security logs. +SecEncoder is designed to address the domain-specific limitations of general +LMs by focusing on the unique language and patterns found in security logs. +Experimental results indicate that SecEncoder outperforms other LMs, such as +BERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002) +models, which are pretrained mainly on natural language, across various tasks. +Furthermore, although SecEncoder is primarily pretrained on log data, it +outperforms models pretrained on natural language for a range of tasks beyond +log analysis, such as incident prioritization and threat intelligence document +retrieval. This suggests that domain specific pretraining with logs can +significantly enhance the performance of LMs in security. These findings pave +the way for future research into security-specific LMs and their potential +applications. + +
+
+
+
+
+ + ☆ Collaborative and Federated Black-box Optimization: A Bayesian + Optimization Perspective + + +
+ We focus on collaborative and federated black-box optimization (BBOpt), where +agents optimize their heterogeneous black-box functions through collaborative +sequential experimentation. From a Bayesian optimization perspective, we +address the fundamental challenges of distributed experimentation, +heterogeneity, and privacy within BBOpt, and propose three unifying frameworks +to tackle these issues: (i) a global framework where experiments are centrally +coordinated, (ii) a local framework that allows agents to make decisions based +on minimal shared information, and (iii) a predictive framework that enhances +local surrogates through collaboration to improve decision-making. We +categorize existing methods within these frameworks and highlight key open +questions to unlock the full potential of federated BBOpt. Our overarching goal +is to shift federated learning from its predominantly descriptive/predictive +paradigm to a prescriptive one, particularly in the context of BBOpt - an +inherently sequential decision-making problem. + +
+
+
+
+
+ + ☆ Bayesian Deep Learning Approach for Real-time Lane-based Arrival Curve + Reconstruction at Intersection using License Plate Recognition Data + + +
+ The acquisition of real-time and accurate traffic arrival information is of +vital importance for proactive traffic control systems, especially in partially +connected vehicle environments. License plate recognition (LPR) data that +record both vehicle departures and identities are proven to be desirable in +reconstructing lane-based arrival curves in previous works. Existing LPR +data-based methods are predominantly designed for reconstructing historical +arrival curves. For real-time reconstruction of multi-lane urban roads, it is +pivotal to determine the lane choice of real-time link-based arrivals, which +has not been exploited in previous studies. In this study, we propose a +Bayesian deep learning approach for real-time lane-based arrival curve +reconstruction, in which the lane choice patterns and uncertainties of +link-based arrivals are both characterized. Specifically, the learning process +is designed to effectively capture the relationship between partially observed +link-based arrivals and lane-based arrivals, which can be physically +interpreted as lane choice proportion. Moreover, the lane choice uncertainties +are characterized using Bayesian parameter inference techniques, minimizing +arrival curve reconstruction uncertainties, especially in low LPR data matching +rate conditions. Real-world experiment results conducted in multiple matching +rate scenarios demonstrate the superiority and necessity of lane choice +modeling in reconstructing arrival curves. + +
+
+ comment: accepted by T-ITS +
+
+
+
+
+ + ☆ Robust Offline Reinforcement Learning for Non-Markovian Decision + Processes + + +
+ Distributionally robust offline reinforcement learning (RL) aims to find a +policy that performs the best under the worst environment within an uncertainty +set using an offline dataset collected from a nominal model. While recent +advances in robust RL focus on Markov decision processes (MDPs), robust +non-Markovian RL is limited to planning problems where the transitions in the +uncertainty set are known. In this paper, we study the learning problem of +robust offline non-Markovian RL. Specifically, when the nominal model admits a +low-rank structure, we propose a new algorithm, featuring a novel dataset +distillation and a lower confidence bound (LCB) design for robust values under +different types of the uncertainty set. We also derive new dual forms for these +robust values in non-Markovian RL, making our algorithm more amenable to +practical implementation. By further introducing a novel type-I concentrability +coefficient tailored for offline low-rank non-Markovian decision processes, we +prove that our algorithm can find an $\epsilon$-optimal robust policy using +$O(1/\epsilon^2)$ offline samples. Moreover, we extend our algorithm to the +case when the nominal model does not have specific structure. With a new +type-II concentrability coefficient, the extended algorithm also enjoys +polynomial sample efficiency under all different types of the uncertainty set. + +
+
+
+
+
+ + ☆ FM-TS: Flow Matching for Time Series Generation + + +
+ Time series generation has emerged as an essential tool for analyzing +temporal data across numerous fields. While diffusion models have recently +gained significant attention in generating high-quality time series, they tend +to be computationally demanding and reliant on complex stochastic processes. To +address these limitations, we introduce FM-TS, a rectified Flow Matching-based +framework for Time Series generation, which simplifies the time series +generation process by directly optimizing continuous trajectories. This +approach avoids the need for iterative sampling or complex noise schedules +typically required in diffusion-based models. FM-TS is more efficient in terms +of training and inference. Moreover, FM-TS is highly adaptive, supporting both +conditional and unconditional time series generation. Notably, through our +novel inference design, the model trained in an unconditional setting can +seamlessly generalize to conditional tasks without the need for retraining. +Extensive benchmarking across both settings demonstrates that FM-TS +consistently delivers superior performance compared to existing approaches +while being more efficient in terms of training and inference. For instance, in +terms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005, +0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI +unconditional time series datasets, respectively, significantly outperforming +the second-best method which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and +0.167 on the same datasets. We have achieved superior performance in solar +forecasting and MuJoCo imputation tasks, significantly enhanced by our +innovative $t$ power sampling method. The code is available at +https://github.com/UNITES-Lab/FMTS. + +
+
+
+
+
+ + ☆ AdaS&S: a One-Shot Supernet Approach for Automatic Embedding Size Search + in Deep Recommender System + + +
+ Deep Learning Recommendation Models (DLRMs) utilize the embedding layer to +represent various categorical features. Traditional DLRMs adopt unified +embedding size for all features, leading to suboptimal performance and +redundant parameters. Thus, lots of Automatic Embedding size Search (AES) works +focus on obtaining mixed embedding sizes with strong model performance. +However, previous AES works can hardly address several challenges together: (1) +The search results of embedding sizes are unstable; (2) Recommendation effect +with AES results is unsatisfactory; (3) Memory cost of embeddings is +uncontrollable. To address these challenges, we propose a novel one-shot AES +framework called AdaS&S, in which a supernet encompassing various candidate +embeddings is built and AES is performed as searching network architectures +within it. Our framework contains two main stages: In the first stage, we +decouple training parameters from searching embedding sizes, and propose the +Adaptive Sampling method to yield a well-trained supernet, which further helps +to produce stable AES results. In the second stage, to obtain embedding sizes +that benefit the model effect, we design a reinforcement learning search +process which utilizes the supernet trained previously. Meanwhile, to adapt +searching to specific resource constraint, we introduce the resource +competition penalty to balance the model effectiveness and memory cost of +embeddings. We conduct extensive experiments on public datasets to show the +superiority of AdaS&S. Our method could improve AUC by about 0.3% while saving +about 20% of model parameters. Empirical analysis also shows that the stability +of searching results in AdaS&S significantly exceeds other methods. + +
+
+
+
+
+ + ☆ A Novel Automatic Real-time Motion Tracking Method for Magnetic + Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced + Tracking-Learning-Detection Framework with Automatic Segmentation + + +
+ Objective: Ensuring the precision in motion tracking for MRI-guided +Radiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This +study refined the motion tracking accuracy in MRIgRT through the innovation of +an automatic real-time tracking method, leveraging an enhanced +Tracking-Learning-Detection (ETLD) framework coupled with automatic +segmentation. Methods: We developed a novel MRIgRT motion tracking method by +integrating two primary methods: the ETLD framework and an improved Chan-Vese +model (ICV), named ETLD+ICV. The TLD framework was upgraded to suit real-time +cine MRI, including advanced image preprocessing, no-reference image quality +assessment, an enhanced median-flow tracker, and a refined detector with +dynamic search region adjustments. Additionally, ICV was combined for precise +coverage of the target volume, which refined the segmented region frame by +frame using tracking results, with key parameters optimized. Tested on 3.5D MRI +scans from 10 patients with liver metastases, our method ensures precise +tracking and accurate segmentation vital for MRIgRT. Results: An evaluation of +106,000 frames across 77 treatment fractions revealed sub-millimeter tracking +errors of less than 0.8mm, with over 99% precision and 98% recall for all +subjects, underscoring the robustness and efficacy of the ETLD. Moreover, the +ETLD+ICV yielded a dice global score of more than 82% for all subjects, +demonstrating the proposed method's extensibility and precise target volume +coverage. Conclusions: This study successfully developed an automatic real-time +motion tracking method for MRIgRT that markedly surpasses current methods. The +novel method not only delivers exceptional precision in tracking and +segmentation but also demonstrates enhanced adaptability to clinical demands, +positioning it as an indispensable asset in the quest to augment the efficacy +of radiotherapy treatments. + +
+
+
+
+
+ + ☆ LAUREL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce \emph{Learned Augmented Residual Layer} (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ☆ ADMM for Structured Fractional Minimization + + +
+ We consider a class of structured fractional minimization problems, where the +numerator includes a differentiable function, a simple nonconvex nonsmooth +function, a concave nonsmooth function, and a convex nonsmooth function +composed with a linear operator, while the denominator is a continuous function +that is either weakly convex or has a weakly convex square root. These problems +are widespread and span numerous essential applications in machine learning and +data science. Existing methods are mainly based on subgradient methods and +smoothing proximal gradient methods, which may suffer from slow convergence and +numerical stability issues. In this paper, we introduce {\sf FADMM}, the first +Alternating Direction Method of Multipliers tailored for this class of +problems. {\sf FADMM} decouples the original problem into linearized proximal +subproblems, featuring two variants: one using Dinkelbach's parametric method +({\sf FADMM-D}) and the other using the quadratic transform method ({\sf +FADMM-Q}). By introducing a novel Lyapunov function, we establish that {\sf +FADMM} converges to $\epsilon$-approximate critical points of the problem +within an oracle complexity of $\mathcal{O}(1/\epsilon^{3})$. Our experiments +on synthetic and real-world data for sparse Fisher discriminant analysis, +robust Sharpe ratio minimization, and robust sparse recovery demonstrate the +effectiveness of our approach. + Keywords: Fractional Minimization, Nonconvex Optimization, Proximal +Linearized ADMM, Nonsmooth Optimization, Convergence Analysis + +
+
+
+
+
+ + ☆ Quantifying Knowledge Distillation Using Partial Information + Decomposition NeurIPS 2024 + + +
+ Knowledge distillation provides an effective method for deploying complex +machine learning models in resource-constrained environments. It typically +involves training a smaller student model to emulate either the probabilistic +outputs or the internal feature representations of a larger teacher model. By +doing so, the student model often achieves substantially better performance on +a downstream task compared to when it is trained independently. Nevertheless, +the teacher's internal representations can also encode noise or additional +information that may not be relevant to the downstream task. This observation +motivates our primary question: What are the information-theoretic limits of +knowledge transfer? To this end, we leverage a body of work in information +theory called Partial Information Decomposition (PID) to quantify the +distillable and distilled knowledge of a teacher's representation corresponding +to a given student and a downstream task. Moreover, we demonstrate that this +metric can be practically used in distillation to address challenges caused by +the complexity gap between the teacher and the student representations. + +
+
+ comment: Accepted at NeurIPS 2024 Machine Learning and Compression Workshop +
+
+
+
+
+ + ☆ Enhancing Link Prediction with Fuzzy Graph Attention Networks and + Dynamic Negative Sampling + + +
+ Link prediction is crucial for understanding complex networks but traditional +Graph Neural Networks (GNNs) often rely on random negative sampling, leading to +suboptimal performance. This paper introduces Fuzzy Graph Attention Networks +(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative +sampling and enhanced node feature aggregation. Fuzzy Negative Sampling (FNS) +systematically selects high-quality negative edges based on fuzzy similarities, +improving training efficiency. FGAT layer incorporates fuzzy rough set +principles, enabling robust and discriminative node representations. +Experiments on two research collaboration networks demonstrate FGAT's superior +link prediction accuracy, outperforming state-of-the-art baselines by +leveraging the power of fuzzy rough sets for effective negative sampling and +node feature learning. + +
+
+
+
+
+ + ☆ Retrieval Augmented Time Series Forecasting + + +
+ Retrieval-augmented generation (RAG) is a central component of modern LLM +systems, particularly in scenarios where up-to-date information is crucial for +accurately responding to user queries or when queries exceed the scope of the +training data. The advent of time-series foundation models (TSFM), such as +Chronos, and the need for effective zero-shot forecasting performance across +various time-series domains motivates the question: Do benefits of RAG +similarly carry over to time series forecasting? In this paper, we advocate +that the dynamic and event-driven nature of time-series data makes RAG a +crucial component of TSFMs and introduce a principled RAG framework for +time-series forecasting, called Retrieval Augmented Forecasting (RAF). Within +RAF, we develop efficient strategies for retrieving related time-series +examples and incorporating them into forecast. Through experiments and +mechanistic studies, we demonstrate that RAF indeed improves the forecasting +accuracy across diverse time series domains and the improvement is more +significant for larger TSFM sizes. + +
+
+
+
+
+ + ☆ Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial + Approach + + +
+ Deep learning underpins most of the currently advanced natural language +processing (NLP) tasks such as textual classification, neural machine +translation (NMT), abstractive summarization and question-answering (QA). +However, the robustness of the models, particularly QA models, against +adversarial attacks is a critical concern that remains insufficiently explored. +This paper introduces QA-Attack (Question Answering Attack), a novel word-level +adversarial strategy that fools QA models. Our attention-based attack exploits +the customized attention mechanism and deletion ranking strategy to identify +and target specific words within contextual passages. It creates deceptive +inputs by carefully choosing and substituting synonyms, preserving grammatical +integrity while misleading the model to produce incorrect responses. Our +approach demonstrates versatility across various question types, particularly +when dealing with extensive long textual inputs. Extensive experiments on +multiple benchmark datasets demonstrate that QA-Attack successfully deceives +baseline QA models and surpasses existing adversarial techniques regarding +success rate, semantics changes, BLEU score, fluency and grammar error rate. + +
+
+
+
+
+ + ☆ NVCiM-PT: An NVCiM-assisted Prompt Tuning Framework for Edge LLMs + + +
+ Large Language Models (LLMs) deployed on edge devices, known as edge LLMs, +need to continuously fine-tune their model parameters from user-generated data +under limited resource constraints. However, most existing learning methods are +not applicable for edge LLMs because of their reliance on high resources and +low learning capacity. Prompt tuning (PT) has recently emerged as an effective +fine-tuning method for edge LLMs by only modifying a small portion of LLM +parameters, but it suffers from user domain shifts, resulting in repetitive +training and losing resource efficiency. Conventional techniques to address +domain shift issues often involve complex neural networks and sophisticated +training, which are incompatible for PT for edge LLMs. Therefore, an open +research question is how to address domain shift issues for edge LLMs with +limited resources. In this paper, we propose a prompt tuning framework for edge +LLMs, exploiting the benefits offered by non-volatile computing-in-memory +(NVCiM) architectures. We introduce a novel NVCiM-assisted PT framework, where +we narrow down the core operations to matrix-matrix multiplication, which can +then be accelerated by performing in-situ computation on NVCiM. To the best of +our knowledge, this is the first work employing NVCiM to improve the edge LLM +PT performance. + +
+
+ comment: Accepted by DATE 2025 +
+
+
+
+
+ + ☆ A Social Outcomes and Priorities centered (SOP) Framework for AI policy + + +
+ Rapid developments in AI and its adoption across various domains have +necessitated a need to build robust guardrails and risk containment plans while +ensuring equitable benefits for the betterment of society. The current +technology-centered approach has resulted in a fragmented, reactive, and +ineffective policy apparatus. This paper highlights the immediate and urgent +need to pivot to a society-centered approach to develop comprehensive, +coherent, forward-looking AI policy. To this end, we present a Social Outcomes +and Priorities centered (SOP) framework for AI policy along with proposals on +implementation of its various components. While the SOP framework is presented +from a US-centric view, the takeaways are general and applicable globally. + +
+
+
+
+
+ + ☆ Imitation Learning from Observations: An Autoregressive Mixture of + Experts Approach + + +
+ This paper presents a novel approach to imitation learning from observations, +where an autoregressive mixture of experts model is deployed to fit the +underlying policy. The parameters of the model are learned via a two-stage +framework. By leveraging the existing dynamics knowledge, the first stage of +the framework estimates the control input sequences and hence reduces the +problem complexity. At the second stage, the policy is learned by solving a +regularized maximum-likelihood estimation problem using the estimated control +input sequences. We further extend the learning procedure by incorporating a +Lyapunov stability constraint to ensure asymptotic stability of the identified +model, for accurate multi-step predictions. The effectiveness of the proposed +framework is validated using two autonomous driving datasets collected from +human demonstrations, demonstrating its practical applicability in modelling +complex nonlinear dynamics. + +
+
+
+
+
+ + ♻ ☆ VQC-Based Reinforcement Learning with Data Re-uploading: Performance and + Trainability + + +
+ Reinforcement Learning (RL) consists of designing agents that make +intelligent decisions without human supervision. When used alongside function +approximators such as Neural Networks (NNs), RL is capable of solving extremely +complex problems. Deep Q-Learning, a RL algorithm that uses Deep NNs, achieved +super-human performance in some specific tasks. Nonetheless, it is also +possible to use Variational Quantum Circuits (VQCs) as function approximators +in RL algorithms. This work empirically studies the performance and +trainability of such VQC-based Deep Q-Learning models in classic control +benchmark environments. More specifically, we research how data re-uploading +affects both these metrics. We show that the magnitude and the variance of the +gradients of these models remain substantial throughout training due to the +moving targets of Deep Q-Learning. Moreover, we empirically show that +increasing the number of qubits does not lead to an exponential vanishing +behavior of the magnitude and variance of the gradients for a PQC approximating +a 2-design, unlike what was expected due to the Barren Plateau Phenomenon. This +hints at the possibility of VQCs being specially adequate for being used as +function approximators in such a context. + +
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ On the Utilization of Unique Node Identifiers in Graph Neural Networks + + +
+ Graph Neural Networks have inherent representational limitations due to their +message-passing structure. Recent work has suggested that these limitations can +be overcome by using unique node identifiers (UIDs). Here we argue that despite +the advantages of UIDs, one of their disadvantages is that they lose the +desirable property of permutation-equivariance. We thus propose to focus on UID +models that are permutation-equivariant, and present theoretical arguments for +their advantages. Motivated by this, we propose a method to regularize UID +models towards permutation equivariance, via a contrastive loss. We empirically +demonstrate that our approach improves generalization and extrapolation +abilities while providing faster training convergence. On the recent BREC +expressiveness benchmark, our proposed method achieves state-of-the-art +performance compared to other random-based approaches. + +
+
+
+
+
+ + ♻ ☆ Foundation Models for the Electric Power Grid + + +
+ Foundation models (FMs) currently dominate news headlines. They employ +advanced deep learning architectures to extract structural information +autonomously from vast datasets through self-supervision. The resulting rich +representations of complex systems and dynamics can be applied to many +downstream applications. Therefore, FMs can find uses in electric power grids, +challenged by the energy transition and climate change. In this paper, we call +for the development of, and state why we believe in, the potential of FMs for +electric grids. We highlight their strengths and weaknesses amidst the +challenges of a changing grid. We argue that an FM learning from diverse grid +data and topologies could unlock transformative capabilities, pioneering a new +approach in leveraging AI to redefine how we manage complexity and uncertainty +in the electric grid. Finally, we discuss a power grid FM concept, namely +GridFM, based on graph neural networks and show how different downstream tasks +benefit. + +
+
+ comment: Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V., + J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J., + K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal + contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H +
+
+
+
+
+ + ♻ ☆ LE-PDE++: Mamba for accelerating PDEs Simulations + + +
+ Partial Differential Equations are foundational in modeling science and +natural systems such as fluid dynamics and weather forecasting. The Latent +Evolution of PDEs method is designed to address the computational intensity of +classical and deep learning-based PDE solvers by proposing a scalable and +efficient alternative. To enhance the efficiency and accuracy of LE-PDE, we +incorporate the Mamba model, an advanced machine learning model known for its +predictive efficiency and robustness in handling complex dynamic systems with a +progressive learning strategy. The LE-PDE was tested on several benchmark +problems. The method demonstrated a marked reduction in computational time +compared to traditional solvers and standalone deep learning models while +maintaining high accuracy in predicting system behavior over time. Our method +doubles the inference speed compared to the LE-PDE while retaining the same +level of parameter efficiency, making it well-suited for scenarios requiring +long-term predictions. + +
+
+
+
+
+ + ♻ ☆ Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and + Tabnet with SMOTEENN + + +
+ Bank credit risk is a significant challenge in modern financial transactions, +and the ability to identify qualified credit card holders among a large number +of applicants is crucial for the profitability of a bank's credit card +business. In the past, screening applicants' conditions often +required a significant amount of manual labor, which was time-consuming and +labor-intensive. Although the accuracy and reliability of previously used ML +models have been continuously improving, the pursuit of more reliable and +powerful AI intelligent models is undoubtedly the unremitting pursuit by major +banks in the financial industry. In this study, we used a dataset of over +40,000 records provided by a commercial bank as the research object. We +compared various dimensionality reduction techniques such as PCA and T-SNE for +preprocessing high-dimensional datasets and performed in-depth adaptation and +tuning of distributed models such as LightGBM and XGBoost, as well as deep +models like Tabnet. After a series of research and processing, we obtained +excellent research results by combining SMOTEENN with these techniques. The +experiments demonstrated that LightGBM combined with PCA and SMOTEENN +techniques can assist banks in accurately predicting potential high-quality +customers, showing relatively outstanding performance compared to other models. + +
+
+ comment: 8 pages on IEEE ICPICS +
+
+
+
+
+ + ♻ ☆ Advanced Payment Security System:XGBoost, LightGBM and SMOTE Integrated + + +
+ With the rise of various online and mobile payment systems, transaction fraud +has become a significant threat to financial security. This study explores the +application of advanced machine learning models, specifically based on XGBoost +and LightGBM, for developing a more accurate and robust Payment Security +Protection Model. To enhance data reliability, we meticulously processed the +data sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to +address class imbalance and improve data representation. By selecting highly +correlated features, we aimed to strengthen the training process and boost +model performance. We conducted thorough performance evaluations of our +proposed models, comparing them against traditional methods including Random +Forest, Neural Network, and Logistic Regression. Using metrics such as +Precision, Recall, and F1 Score, we rigorously assessed their effectiveness. +Our detailed analyses and comparisons reveal that the combination of SMOTE with +XGBoost and LightGBM offers a highly efficient and powerful mechanism for +payment security protection. Moreover, the integration of XGBoost and LightGBM +in a Local Ensemble model further demonstrated outstanding performance. After +incorporating SMOTE, the new combined model achieved a significant improvement +of nearly 6\% over traditional models and around 5\% over its sub-models, +showcasing remarkable results. + +
+
+ comment: This paper is received by https://ieee-metacom.org +
+
+
+
+
+ + ♻ ☆ Credit Card Fraud Detection Using Advanced Transformer Model + + +
+ With the proliferation of various online and mobile payment systems, credit +card fraud has emerged as a significant threat to financial security. This +study focuses on innovative applications of the latest Transformer models for +more robust and precise fraud detection. To ensure the reliability of the data, +we meticulously processed the data sources, balancing the dataset to address +the issue of data sparsity significantly. We also selected highly correlated +vectors to strengthen the training process. To guarantee the reliability and +practicality of the new Transformer model, we conducted performance comparisons +with several widely adopted models, including Support Vector Machine (SVM), +Random Forest, Neural Network, and Logistic Regression. We rigorously compared +these models using metrics such as Precision, Recall, and F1 Score. Through +these detailed analyses and comparisons, we present to the readers a highly +efficient and powerful anti-fraud mechanism with promising prospects. The +results demonstrate that the Transformer model not only excels in traditional +applications but also shows great potential in niche areas like fraud +detection, offering a substantial advancement in the field. + +
+
+ comment: This paper has been received by https://ieee-metacom.org/ +
+
+
+
+
+ + ♻ ☆ Enhanced Credit Score Prediction Using Ensemble Deep Learning Model + + +
+ In contemporary economic society, credit scores are crucial for every +participant. A robust credit evaluation system is essential for the +profitability of core businesses such as credit cards, loans, and investments +for commercial banks and the financial sector. This paper combines +high-performance models like XGBoost and LightGBM, already widely used in +modern banking systems, with the powerful TabNet model. We have developed a +potent model capable of accurately determining credit score levels by +integrating Random Forest, XGBoost, and TabNet, and through the stacking +technique in ensemble modeling. This approach surpasses the limitations of +single models and significantly advances precise credit score prediction. +In the following sections, we will explain the techniques we used and +thoroughly validate our approach by comprehensively comparing a series of +metrics such as Precision, Recall, F1, and AUC. By integrating Random Forest +and XGBoost with the TabNet deep learning architecture, these models +complement each other, demonstrating exceptionally strong overall performance. + +
+
+ comment: This paper has been accepted by sci of AI Journal +
+
+
+
+
+ + ♻ ☆ Levin Tree Search with Context Models + + +
+ Levin Tree Search (LTS) is a search algorithm that makes use of a policy (a +probability distribution over actions) and comes with a theoretical guarantee +on the number of expansions before reaching a goal node, depending on the +quality of the policy. This guarantee can be used as a loss function, which we +call the LTS loss, to optimize neural networks representing the policy +(LTS+NN). In this work we show that the neural network can be substituted with +parameterized context models originating from the online compression literature +(LTS+CM). We show that the LTS loss is convex under this new model, which +allows for using standard convex optimization tools, and obtain convergence +guarantees to the optimal parameters in an online setting for a given set of +solution trajectories -- guarantees that cannot be provided for neural +networks. The new LTS+CM algorithm compares favorably against LTS+NN on several +benchmarks: Sokoban (Boxoban), The Witness, and the 24-Sliding Tile puzzle +(STP). The difference is particularly large on STP, where LTS+NN fails to solve +most of the test instances while LTS+CM solves each test instance in a fraction +of a second. Furthermore, we show that LTS+CM is able to learn a policy that +solves the Rubik's cube in only a few hundred expansions, which considerably +improves upon previous machine learning techniques. + +
+
+
+
+
+ + ♻ ☆ Piecewise Linearity of Min-Norm Solution Map of a Nonconvexly + Regularized Convex Sparse Model + + +
+ It is well known that the minimum $\ell_2$-norm solution of the convex LASSO +model, say $\mathbf{x}_{\star}$, is a continuous piecewise linear function of +the regularization parameter $\lambda$, and its signed sparsity pattern is +constant within each linear piece. The current study is an extension of this +classic result, proving that the aforementioned properties extend to the +min-norm solution map $\mathbf{x}_{\star}(\mathbf{y},\lambda)$, where +$\mathbf{y}$ is the observed signal, for a generalization of LASSO termed the +scaled generalized minimax concave (sGMC) model. The sGMC model adopts a +nonconvex debiased variant of the $\ell_1$-norm as sparse regularizer, but its +objective function is overall-convex. Based on the geometric properties of +$\mathbf{x}_{\star}(\mathbf{y},\lambda)$, we propose an extension of the least +angle regression (LARS) algorithm, which iteratively computes the closed-form +expression of $\mathbf{x}_{\star}(\mathbf{y},\lambda)$ in each linear zone. +Under suitable conditions, the proposed algorithm provably obtains the whole +solution map $\mathbf{x}_{\star}(\mathbf{y},\lambda)$ within finite iterations. +Notably, our proof techniques for establishing continuity and piecewise +linearity of $\mathbf{x}_{\star}(\mathbf{y},\lambda)$ are novel, and they lead +to two side contributions: (a) our proofs establish continuity of the sGMC +solution set as a set-valued mapping of $(\mathbf{y},\lambda)$; (b) to prove +piecewise linearity and piecewise constant sparsity pattern of +$\mathbf{x}_{\star}(\mathbf{y},\lambda)$, we do not require any assumption that +previous work relies on (whereas to prove some additional properties of +$\mathbf{x}_{\star}(\mathbf{y},\lambda)$, we use a different set of assumptions +from previous work). + +
+
+ comment: 40 pages. Submitted to journal +
+
+
+
+
+ + ♻ ☆ RiNALMo: General-Purpose RNA Language Models Can Generalize Well on + Structure Prediction Tasks + + +
+ While RNA has recently been recognized as an interesting small-molecule drug +target, many challenges remain to be addressed before we take full advantage of +it. This emphasizes the necessity to improve our understanding of its +structures and functions. Over the years, sequencing technologies have produced +an enormous amount of unlabeled RNA data, which hides a huge potential. +Motivated by the successes of protein language models, we introduce RiboNucleic +Acid Language Model (RiNALMo) to unveil the hidden code of RNA. RiNALMo is the +largest RNA language model to date, with 650M parameters pre-trained on 36M +non-coding RNA sequences from several databases. It can extract hidden +knowledge and capture the underlying structure information implicitly embedded +within the RNA sequences. RiNALMo achieves state-of-the-art results on several +downstream tasks. Notably, we show that its generalization capabilities +overcome the inability of other deep learning methods for secondary structure +prediction to generalize on unseen RNA families. + +
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Provable Compositional Generalization for Object-Centric Learning ICLR 2024 + + +
+ Learning representations that generalize to novel compositions of known +concepts is crucial for bridging the gap between human and machine perception. +One prominent effort is learning object-centric representations, which are +widely conjectured to enable compositional generalization. Yet, it remains +unclear when this conjecture will be true, as a principled theoretical or +empirical understanding of compositional generalization is lacking. In this +work, we investigate when compositional generalization is guaranteed for +object-centric representations through the lens of identifiability theory. We +show that autoencoders that satisfy structural assumptions on the decoder and +enforce encoder-decoder consistency will learn object-centric representations +that provably generalize compositionally. We validate our theoretical result +and highlight the practical relevance of our assumptions through experiments on +synthetic image data. + +
+
+ comment: Oral at ICLR 2024. The first four authors contributed equally +
+
+
+
+
+ + ♻ ☆ Convolutional and Deep Learning based techniques for Time Series Ordinal + Classification + + +
+ Time Series Classification (TSC) covers the supervised learning problem where +input data is provided in the form of series of values observed through +repeated measurements over time, and whose objective is to predict the category +to which they belong. When the class values are ordinal, classifiers that take +this into account can perform better than nominal classifiers. Time Series +Ordinal Classification (TSOC) is the field covering this gap, yet unexplored in +the literature. There are a wide range of time series problems showing an +ordered label structure, and TSC techniques that ignore the order relationship +discard useful information. Hence, this paper presents a first benchmarking of +TSOC methodologies, exploiting the ordering of the target labels to boost the +performance of current TSC state-of-the-art. Both convolutional- and deep +learning-based methodologies (among the best performing alternatives for +nominal TSC) are adapted for TSOC. For the experiments, a selection of 29 +ordinal problems from two well-known archives has been made. In this way, this +paper contributes to the establishment of the state-of-the-art in TSOC. The +results obtained by ordinal versions are found to be significantly better than +current nominal TSC techniques in terms of ordinal performance metrics, +outlining the importance of considering the ordering of the labels when dealing +with this kind of problems. + +
+
+ comment: 13 pages, 9 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Basis-to-Basis Operator Learning Using Function Encoders + + +
+ We present Basis-to-Basis (B2B) operator learning, a novel approach for +learning operators on Hilbert spaces of functions based on the foundational +ideas of function encoders. We decompose the task of learning operators into +two parts: learning sets of basis functions for both the input and output +spaces and learning a potentially nonlinear mapping between the coefficients of +the basis functions. B2B operator learning circumvents many challenges of prior +works, such as requiring data to be at fixed locations, by leveraging classic +techniques such as least squares to compute the coefficients. It is especially +potent for linear operators, where we compute a mapping between bases as a +single matrix transformation with a closed-form solution. Furthermore, with +minimal modifications and using the deep theoretical connections between +function encoders and functional analysis, we derive operator learning +algorithms that are directly analogous to eigen-decomposition and singular +value decomposition. We empirically validate B2B operator learning on seven +benchmark operator learning tasks and show that it demonstrates a +two-orders-of-magnitude improvement in accuracy over existing approaches on +several benchmark tasks. + +
+
+
+
+
+ + ♻ ☆ Interpret Your Decision: Logical Reasoning Regularization for + Generalization in Visual Classification NeurIPS2024 + + +
+ Vision models excel in image classification but struggle to generalize to +unseen data, such as classifying images from unseen domains or discovering +novel categories. In this paper, we explore the relationship between logical +reasoning and deep learning generalization in visual classification. A logical +regularization termed L-Reg is derived which bridges a logical analysis +framework to image classification. Our work reveals that L-Reg reduces the +complexity of the model in terms of the feature distribution and classifier +weights. Specifically, we unveil the interpretability brought by L-Reg, as it +enables the model to extract the salient features, such as faces to persons, +for classification. Theoretical analysis and experiments demonstrate that L-Reg +enhances generalization across various scenarios, including multi-domain +generalization and generalized category discovery. In complex real-world +scenarios where images span unknown classes and unseen domains, L-Reg +consistently improves generalization, highlighting its practical efficacy. + +
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ♻ ☆ A Manifold Perspective on the Statistical Generalization of Graph Neural + Networks + + +
+ Graph Neural Networks (GNNs) extend convolutional neural networks to operate +on graphs. Despite their impressive performances in various graph learning +tasks, the theoretical understanding of their generalization capability is +still lacking. Previous GNN generalization bounds ignore the underlying graph +structures, often leading to bounds that increase with the number of nodes -- a +behavior contrary to the one experienced in practice. In this paper, we take a +manifold perspective to establish the statistical generalization theory of GNNs +on graphs sampled from a manifold in the spectral domain. As demonstrated +empirically, we prove that the generalization bounds of GNNs decrease linearly +with the size of the graphs in the logarithmic scale, and increase linearly +with the spectral continuity constants of the filter functions. Notably, our +theory explains both node-level and graph-level tasks. Our result has two +implications: i) guaranteeing the generalization of GNNs to unseen data over +manifolds; ii) providing insights into the practical design of GNNs, i.e., +restrictions on the discriminability of GNNs are necessary to obtain a better +generalization performance. We demonstrate our generalization bounds of GNNs +using synthetic and multiple real-world datasets. + +
+
+ comment: 37 pages, 25 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Dynamic planning in hierarchical active inference + + +
+ By dynamic planning, we refer to the ability of the human brain to infer and +impose motor trajectories related to cognitive decisions. A recent paradigm, +active inference, brings fundamental insights into the adaptation of biological +organisms, constantly striving to minimize prediction errors to restrict +themselves to life-compatible states. Over the past years, many studies have +shown how human and animal behaviors could be explained in terms of active +inference - either as discrete decision-making or continuous motor control - +inspiring innovative solutions in robotics and artificial intelligence. Still, +the literature lacks a comprehensive outlook on effectively planning realistic +actions in changing environments. Setting ourselves the goal of modeling +complex tasks such as tool use, we delve into the topic of dynamic planning in +active inference, keeping in mind two crucial aspects of biological behavior: +the capacity to understand and exploit affordances for object manipulation, and +to learn the hierarchical interactions between the self and the environment, +including other agents. We start from a simple unit and gradually describe more +advanced structures, comparing recently proposed design choices and providing +basic examples. This study distances itself from traditional views centered on +neural networks and reinforcement learning, and points toward a yet unexplored +direction in active inference: hybrid representations in hierarchical models. + +
+
+
+
+
+ + ♻ Bootstrapping Reinforcement Learning with Imitation for Vision-Based + Agile Flight CoRL + + +
+ Learning visuomotor policies for agile quadrotor flight presents significant +difficulties, primarily from inefficient policy exploration caused by +high-dimensional visual inputs and the need for precise and low-latency +control. To address these challenges, we propose a novel approach that combines +the performance of Reinforcement Learning (RL) and the sample efficiency of +Imitation Learning (IL) in the task of vision-based autonomous drone racing. +While RL provides a framework for learning high-performance controllers through +trial and error, it faces challenges with sample efficiency and computational +demands due to the high dimensionality of visual inputs. Conversely, IL +efficiently learns from visual expert demonstrations, but it remains limited by +the expert's performance and state distribution. To overcome these limitations, +our policy learning framework integrates the strengths of both approaches. Our +framework contains three phases: training a teacher policy using RL with +privileged state information, distilling it into a student policy via IL, and +adaptive fine-tuning via RL. Testing in both simulated and real-world scenarios +shows our approach can not only learn in scenarios where RL from scratch fails +but also outperforms existing IL methods in both robustness and performance, +successfully navigating a quadrotor through a race course using only visual +information. Videos of the experiments are available at +https://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html. + +
+
+ comment: 8th Annual Conference on Robot Learning (CoRL) +
+
+
+
+
+ + ♻ ☆ Bandits with Abstention under Expert Advice + + +
+ We study the classic problem of prediction with expert advice under bandit +feedback. Our model assumes that one action, corresponding to the learner's +abstention from play, has no reward or loss on every trial. We propose the CBA +algorithm, which exploits this assumption to obtain reward bounds that can +significantly improve those of the classical Exp4 algorithm. We can view our +problem as the aggregation of confidence-rated predictors when the learner has +the option of abstention from play. Importantly, we are the first to achieve +bounds on the expected cumulative reward for general confidence-rated +predictors. In the special case of specialists we achieve a novel reward bound, +significantly improving previous bounds of SpecialistExp (treating abstention +as another action). As an example application, we discuss learning unions of +balls in a finite metric space. In this contextual setting, we devise an +efficient implementation of CBA, reducing the runtime from quadratic to almost +linear in the number of contexts. Preliminary experiments show that CBA +improves over existing bandit algorithms. + +
+
+
+
+
+ + ♻ ☆ DistRL: An Asynchronous Distributed Reinforcement Learning Framework for + On-Device Control Agents + + +
+ On-device control agents, especially on mobile devices, are responsible for +operating mobile devices to fulfill users' requests, enabling seamless and +intuitive interactions. Integrating Multimodal Large Language Models (MLLMs) +into these agents enhances their ability to understand and execute complex +commands, thereby improving user experience. However, fine-tuning MLLMs for +on-device control presents significant challenges due to limited data +availability and inefficient online training processes. This paper introduces +DistRL, a novel framework designed to enhance the efficiency of online RL +fine-tuning for mobile device control agents. DistRL employs centralized +training and decentralized data acquisition to ensure efficient fine-tuning in +the context of dynamic online interactions. Additionally, the framework is +backed by our tailor-made RL algorithm, which effectively balances exploration +with the prioritized utilization of collected data to ensure stable and robust +training. Our experiments show that, on average, DistRL delivers a 3X +improvement in training efficiency and enables training data collection 2.4X +faster than the leading synchronous multi-machine methods. Notably, after +training, DistRL achieves a 20% relative improvement in success rate compared +to state-of-the-art methods on general Android tasks from an open benchmark, +significantly outperforming existing approaches while maintaining the same +training time. These results validate DistRL as a scalable and efficient +solution, offering substantial improvements in both training efficiency and +agent performance for real-world, in-the-wild device control tasks. + +
+
+ comment: Paper and Appendix, 25 pages +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ UniTE: A Survey and Unified Pipeline for Pre-training Spatiotemporal + Trajectory Embeddings + + +
+ Spatiotemporal trajectories are sequences of timestamped locations, which +enable a variety of analyses that in turn enable important real-world +applications. It is common to map trajectories to vectors, called embeddings, +before subsequent analyses. Thus, the qualities of embeddings are very +important. Methods for pre-training embeddings, which leverage unlabeled +trajectories for training universal embeddings, have shown promising +applicability across different tasks, thus attracting considerable interest. +However, research progress on this topic faces two key challenges: a lack of a +comprehensive overview of existing methods, resulting in several related +methods not being well-recognized, and the absence of a unified pipeline, +complicating the development of new methods and the analysis of methods. + We present UniTE, a survey and a unified pipeline for this domain. In doing +so, we present a comprehensive list of existing methods for pre-training +trajectory embeddings, which includes methods that either explicitly or +implicitly employ pre-training techniques. Further, we present a unified and +modular pipeline with publicly available underlying code, simplifying the +process of constructing and evaluating methods for pre-training trajectory +embeddings. Additionally, we contribute a selection of experimental results +using the proposed pipeline on real-world datasets. Implementation of the +pipeline is publicly available at https://github.com/Logan-Lin/UniTE. + +
+
+
+
+
+ + ♻ ☆ Efficient Hamiltonian, structure and trace distance learning of Gaussian + states + + +
+ In this work, we initiate the study of Hamiltonian learning for positive +temperature bosonic Gaussian states, the quantum generalization of the widely +studied problem of learning Gaussian graphical models. We obtain efficient +protocols, both in sample and computational complexity, for the task of +inferring the parameters of their underlying quadratic Hamiltonian under the +assumption of bounded temperature, squeezing, displacement and maximal degree +of the interaction graph. Our protocol only requires heterodyne measurements, +which are often experimentally feasible, and has a sample complexity that +scales logarithmically with the number of modes. Furthermore, we show that it +is possible to learn the underlying interaction graph in a similar setting and +sample complexity. Taken together, our results put the status of the quantum +Hamiltonian learning problem for continuous variable systems in a much more +advanced state when compared to spins, where state-of-the-art results are +either unavailable or quantitatively inferior to ours. In addition, we use our +techniques to obtain the first results on learning Gaussian states in trace +distance with a quadratic scaling in precision and polynomial in the number of +modes, albeit imposing certain restrictions on the Gaussian states. Our main +technical innovations are several continuity bounds for the covariance and +Hamiltonian matrix of a Gaussian state, which are of independent interest, +combined with what we call the local inversion technique. In essence, the local +inversion technique allows us to reliably infer the Hamiltonian of a Gaussian +state by only estimating in parallel submatrices of the covariance matrix whose +size scales with the desired precision, but not the number of modes. This way +we bypass the need to obtain precise global estimates of the covariance matrix, +controlling the sample complexity. + +
+
+ comment: 43 pages, 1 figure. Corrections to Lemma 4.1. Main results are + unchanged +
+
+
+
+
+ + ♻ ☆ SUMO: Search-Based Uncertainty Estimation for Model-Based Offline + Reinforcement Learning AAAI2025 + + +
+ The performance of offline reinforcement learning (RL) suffers from the +limited size and quality of static datasets. Model-based offline RL addresses +this issue by generating synthetic samples through a dynamics model to enhance +overall performance. To evaluate the reliability of the generated samples, +uncertainty estimation methods are often employed. However, model ensemble, the +most commonly used uncertainty estimation method, is not always the best +choice. In this paper, we propose a Search-based Uncertainty +estimation method for Model-based Offline RL (SUMO) as an +alternative. SUMO characterizes the uncertainty of synthetic samples by +measuring their cross entropy against the in-distribution dataset samples, and +uses an efficient search-based method for implementation. In this way, SUMO can +achieve trustworthy uncertainty estimation. We integrate SUMO into several +model-based offline RL algorithms including MOPO and Adapted MOReL (AMOReL), +and provide theoretical analysis for them. Extensive experimental results on +D4RL datasets demonstrate that SUMO can provide more accurate uncertainty +estimation and boost the performance of base algorithms. These indicate that +SUMO could be a better uncertainty estimator for model-based offline RL when +used in either reward penalty or trajectory truncation. Our code is available +and will be open-source for further research and development. + +
+
+ comment: Submitted to AAAI2025 +
+
+
+
+
+ + ♻ ☆ Pessimistic Iterative Planning for Robust POMDPs + + +
+ Robust POMDPs extend classical POMDPs to handle model uncertainty. +Specifically, robust POMDPs exhibit so-called uncertainty sets on the +transition and observation models, effectively defining ranges of +probabilities. Policies for robust POMDPs must be (1) memory-based to account +for partial observability and (2) robust against model uncertainty to account +for the worst-case instances from the uncertainty sets. To compute such robust +memory-based policies, we propose the pessimistic iterative planning (PIP) +framework, which alternates between two main steps: (1) selecting a pessimistic +(non-robust) POMDP via worst-case probability instances from the uncertainty +sets; and (2) computing a finite-state controller (FSC) for this pessimistic +POMDP. We evaluate the performance of this FSC on the original robust POMDP and +use this evaluation in step (1) to select the next pessimistic POMDP. Within +PIP, we propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC +through a recurrent neural network by using supervision policies optimized for +the pessimistic POMDP. The empirical evaluation in four benchmark environments +showcases improved robustness against several baseline methods and competitive +performance compared to a state-of-the-art robust POMDP solver. + +
+
+
+
+
+ + ♻ ☆ A Cross-Domain Benchmark for Active Learning NeurIPS 24 + + +
+ Active Learning (AL) deals with identifying the most informative samples for +labeling to reduce data annotation costs for supervised learning tasks. AL +research suffers from the fact that lifts from literature generalize poorly and +that only a small number of repetitions of experiments are conducted. To +overcome these obstacles, we propose CDALBench, the first active learning +benchmark which includes tasks in computer vision, natural language processing +and tabular learning. Furthermore, by providing an efficient, greedy oracle, +CDALBench can be evaluated with 50 runs for each experiment. We show that both +the cross-domain character and a large number of repetitions are crucial for +sophisticated evaluation of AL research. Concretely, we show that the +superiority of specific methods varies over the different domains, making it +important to evaluate Active Learning with a cross-domain benchmark. +Additionally, we show that having a large number of runs is crucial. When +conducting only three runs, as is often done in the literature, the superiority of +specific methods can strongly vary with the specific runs. This effect is so +strong that, depending on the seed, even a well-established method's +performance can be significantly better and significantly worse than random for +the same dataset. + +
+
+ comment: Accepted at NeurIPS 24 in the Benchmarks and Datasets Track. Updated + version of paper "Toward Comparable Active Learning" (arXiv:2311.18356). + "Toward Comparable Active Learning" is deprecated, please use this version. + arXiv admin note: text overlap with arXiv:2311.18356; text overlap with + arXiv:2301.10625 by other authors +
+
+
+
+
+ + ♻ ☆ Exploiting Activation Sparsity with Dense to Dynamic-k + Mixture-of-Experts Conversion + + +
+ Transformer models can face practical limitations due to their high +computational requirements. At the same time, such models exhibit significant +activation sparsity, which can be leveraged to reduce the inference cost by +converting parts of the network into equivalent Mixture-of-Experts (MoE) +layers. Despite the crucial role played by activation sparsity, its impact on +this process remains unexplored. We demonstrate that the efficiency of the +conversion can be significantly enhanced by a proper regularization of the +activation sparsity of the base model. Moreover, motivated by the high variance +of the number of activated neurons for different inputs, we introduce a more +effective dynamic-$k$ expert selection rule that adjusts the number of executed +experts on a per-token basis. To achieve further savings, we extend this +approach to multi-head attention projections. Finally, we develop an efficient +implementation that translates these computational savings into actual +wall-clock speedup. The proposed method, Dense to Dynamic-$k$ +Mixture-of-Experts (D2DMoE), outperforms existing approaches on common NLP and +vision tasks, reducing inference cost by up to 60% without significantly +impacting performance. + +
+
+
+
+
+ + ♻ ☆ A Survey on Integrated Sensing, Communication, and Computation + + +
+ The forthcoming generation of wireless technology, 6G, aims to usher in an +era of ubiquitous intelligent services, where everything is interconnected and +intelligent. This vision requires the seamless integration of three fundamental +modules: Sensing for information acquisition, communication for information +sharing, and computation for information processing and decision-making. These +modules are intricately linked, especially in complex tasks such as edge +learning and inference. However, the performance of these modules is +interdependent, creating a resource competition for time, energy, and +bandwidth. Existing techniques like integrated communication and computation +(ICC), integrated sensing and computation (ISC), and integrated sensing and +communication (ISAC) have made partial strides in addressing this challenge, +but they fall short of meeting the extreme performance requirements. To +overcome these limitations, it is essential to develop new techniques that +comprehensively integrate sensing, communication, and computation. This +integrated approach, known as Integrated Sensing, Communication, and +Computation (ISCC), offers a systematic perspective for enhancing task +performance. This paper begins with a comprehensive survey of historic and +related techniques such as ICC, ISC, and ISAC, highlighting their strengths and +limitations. It then discusses the benefits, functions, and challenges of ISCC. +Subsequently, the state-of-the-art signal designs for ISCC, along with network +resource management strategies specifically tailored for ISCC are explored. +Furthermore, this paper discusses the exciting research opportunities that lie +ahead for implementing ISCC in future advanced networks, and the unresolved +issues requiring further investigation. ISCC is expected to unlock the full +potential of intelligent connectivity, paving the way for groundbreaking +applications and services. + +
+
+ comment: In this version, a series of discussions have been added. The + benefits, functions, and challenges of ISCC are investigated using a new + section. Moreover, the unresolved issues of ISCC have been discussed +
+
+
+
+
+ + ♻ ☆ Provably Transformers Harness Multi-Concept Word Semantics for Efficient + In-Context Learning NeurIPS 2024 + + +
+ Transformer-based large language models (LLMs) have displayed remarkable +creative prowess and emergence capabilities. Existing empirical studies have +revealed a strong connection between these LLMs' impressive emergence abilities +and their in-context learning (ICL) capacity, allowing them to solve new tasks +using only task-specific prompts without further fine-tuning. On the other +hand, existing empirical and theoretical studies also show that there is a +linear regularity of the multi-concept encoded semantic representation behind +transformer-based LLMs. However, existing theoretical work fails to build up an +understanding of the connection between this regularity and the innovative +power of ICL. Additionally, prior work often focuses on simplified, unrealistic +scenarios involving linear transformers or unrealistic loss functions, and they +achieve only linear or sub-linear convergence rates. In contrast, this work +provides a fine-grained mathematical analysis to show how transformers leverage +the multi-concept semantics of words to enable powerful ICL and excellent +out-of-distribution ICL abilities, offering insights into how transformers +innovate solutions for certain unseen tasks encoded with multiple cross-concept +semantics. Inspired by empirical studies on the linear latent geometry of LLMs, +the analysis is based on a concept-based low-noise sparse coding prompt model. +Leveraging advanced techniques, this work showcases the exponential 0-1 loss +convergence over the highly non-convex training dynamics, which pioneeringly +incorporates the challenges of softmax self-attention, ReLU-activated MLPs, and +cross-entropy loss. Empirical simulations corroborate the theoretical findings. + +
+
+ comment: Accepted by the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Towards Generalist Robot Learning from Internet Video: A Survey + + +
+ Scaling deep learning to massive, diverse internet data has yielded +remarkably general capabilities in visual and natural language understanding +and generation. However, data has remained scarce and challenging to collect in +robotics, seeing robot learning struggle to obtain similarly general +capabilities. Promising Learning from Videos (LfV) methods aim to address the +robotics data bottleneck by augmenting traditional robot data with large-scale +internet video data. This video data offers broad foundational information +regarding physical behaviour and the underlying physics of the world, and thus +can be highly informative for a generalist robot. + In this survey, we present a thorough overview of the emerging field of LfV. +We outline fundamental concepts, including the benefits and challenges of LfV. +We provide a comprehensive review of current methods for extracting knowledge +from large-scale internet video, addressing key challenges in LfV, and boosting +downstream robot and reinforcement learning via the use of video data. The +survey concludes with a critical discussion of challenges and opportunities in +LfV. Here, we advocate for scalable foundation model approaches that can +leverage the full range of available internet video to improve the learning of +robot policies and dynamics models. We hope this survey can inform and catalyse +further LfV research, driving progress towards the development of +general-purpose robots. + +
+
+
+
+
+ + ♻ ☆ Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI + + +
+ Segmentation of cardiac magnetic resonance images (MRI) is crucial for the +analysis and assessment of cardiac function, helping to diagnose and treat +various cardiovascular diseases. Most recent techniques rely on deep learning +and usually require an extensive amount of labeled data. To overcome this +problem, few-shot learning has the capability of reducing data dependency on +labeled data. In this work, we introduce a new method that merges few-shot +learning with a U-Net architecture and Gaussian Process Emulators (GPEs), +enhancing data integration from a support set for improved performance. GPEs +are trained to learn the relation between the support images and the +corresponding masks in latent space, facilitating the segmentation of unseen +query images given only a small labeled support set at inference. We test our +model with the M&Ms-2 public dataset to assess its ability to segment the heart +in cardiac magnetic resonance imaging from different orientations, and compare +it with state-of-the-art unsupervised and few-shot methods. Our architecture +shows higher DICE coefficients compared to these methods, especially in the +more challenging setups where the size of the support set is considerably +small. + +
+
+ comment: Accepted at Statistical Atlases and Computational Modeling of the + Heart (STACOM) Workshop 2024 +
+
+
+
+
+ + ♻ ☆ OCMDP: Observation-Constrained Markov Decision Process + + +
+ In many practical applications, decision-making processes must balance the +costs of acquiring information with the benefits it provides. Traditional +control systems often assume full observability, an unrealistic assumption when +observations are expensive. We tackle the challenge of simultaneously learning +observation and control strategies in such cost-sensitive environments by +introducing the Observation-Constrained Markov Decision Process (OCMDP), where +the policy influences the observability of the true state. To manage the +complexity arising from the combined observation and control actions, we +develop an iterative, model-free deep reinforcement learning algorithm that +separates the sensing and control components of the policy. This decomposition +enables efficient learning in the expanded action space by focusing on when and +what to observe, as well as determining optimal control actions, without +requiring knowledge of the environment's dynamics. We validate our approach on +a simulated diagnostic task and a realistic healthcare environment using +HeartPole. Given both scenarios, the experimental results demonstrate that our +model achieves a substantial reduction in observation costs on average, +significantly outperforming baseline methods by a notable margin in efficiency. + +
+
+ comment: Full paper, 14 Pages +
+
+
+
+
+ + ♻ ☆ Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on + Supervised Regression (Preprint) + + +
+ In this tutorial, we present a compact and holistic discussion of Deep +Learning with a focus on Convolutional Neural Networks (CNNs) and supervised +regression. While there are numerous books and articles on the individual +topics we cover, comprehensive and detailed tutorials that address Deep +Learning from a foundational yet rigorous and accessible perspective are rare. +Most resources on CNNs are either too advanced, focusing on cutting-edge +architectures, or too narrow, addressing only specific applications like image +classification. This tutorial not only summarizes the most relevant concepts but +also provides an in-depth exploration of each, offering a complete yet agile +set of ideas. Moreover, we highlight the powerful synergy between learning +theory, statistics, and machine learning, which together underpin the Deep +Learning and CNN frameworks. We aim for this tutorial to serve as an optimal +resource for students, professors, and anyone interested in understanding the +foundations of Deep Learning. Upon acceptance we will provide an accompanying +repository under +https://github.com/neoglez/deep-learning-tutorial + Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine +Learning. + +
+
+ comment: Submitted to the journal Machine Learning and Knowledge Extraction +
+
+
+
+
+ + ♻ ☆ RLHF Workflow: From Reward Modeling to Online RLHF + + +
+ We present the workflow of Online Iterative Reinforcement Learning from Human +Feedback (RLHF) in this technical report, which is widely reported to +outperform its offline counterpart by a large margin in the recent large +language model (LLM) literature. However, existing open-source RLHF projects +are still largely confined to the offline learning setting. In this technical +report, we aim to fill in this gap and provide a detailed recipe that is easy +to reproduce for online iterative RLHF. In particular, since online human +feedback is usually infeasible for open-source communities with limited +resources, we start by constructing preference models using a diverse set of +open-source datasets and use the constructed proxy preference model to +approximate human feedback. Then, we discuss the theoretical insights and +algorithmic principles behind online iterative RLHF, followed by a detailed +practical implementation. Our trained LLM achieves impressive performance on +LLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as +well as other academic benchmarks such as HumanEval and TruthfulQA. We have +shown that supervised fine-tuning (SFT) and iterative RLHF can obtain +state-of-the-art performance with fully open-source datasets. Further, we have +made our models, curated datasets, and comprehensive step-by-step code +guidebooks publicly available. Please refer to +https://github.com/RLHFlow/RLHF-Reward-Modeling and +https://github.com/RLHFlow/Online-RLHF for more detailed information. + +
+
+ comment: Published in Transactions on Machine Learning Research (09/2024) +
+
+
+
+
+ + ♻ ☆ Overview frequency principle/spectral bias in deep learning + + +
+ Understanding deep learning is increasingly emergent as it penetrates more +and more into industry and science. In recent years, a research line from +Fourier analysis sheds light on this magical "black box" by showing a +Frequency Principle (F-Principle or spectral bias) of the training behavior of +deep neural networks (DNNs) -- DNNs often fit functions from low to high +frequency during the training. The F-Principle is first demonstrated by +one-dimensional synthetic data followed by the verification in high-dimensional +real datasets. A series of works subsequently enhance the validity of the +F-Principle. This low-frequency implicit bias reveals the strength of neural +network in learning low-frequency functions as well as its deficiency in +learning high-frequency functions. Such understanding inspires the design of +DNN-based algorithms in practical problems, explains experimental phenomena +emerging in various scenarios, and further advances the study of deep learning +from the frequency perspective. Although incomplete, we provide an overview of +F-Principle and propose some open problems for future research. + +
+
+
+
+
+ + ♻ ☆ Mr.Steve: Instruction-Following Agents in Minecraft with What-Where-When + Memory + + +
+ Significant advances have been made in developing general-purpose embodied AI +in environments like Minecraft through the adoption of LLM-augmented +hierarchical approaches. While these approaches, which combine high-level +planners with low-level controllers, show promise, low-level controllers +frequently become performance bottlenecks due to repeated failures. In this +paper, we argue that the primary cause of failure in many low-level controllers +is the absence of an episodic memory system. To address this, we introduce Mr. +Steve (Memory Recall Steve-1), a novel low-level controller equipped with Place +Event Memory (PEM), a form of episodic memory that captures what, where, and +when information from episodes. This directly addresses the main limitation of +the popular low-level controller, Steve-1. Unlike previous models that rely on +short-term memory, PEM organizes spatial and event-based data, enabling +efficient recall and navigation in long-horizon tasks. Additionally, we propose +an Exploration Strategy and a Memory-Augmented Task Solving Framework, allowing +agents to alternate between exploration and task-solving based on recalled +events. Our approach significantly improves task-solving and exploration +efficiency compared to existing methods. We will release our code and demos on +the project page: https://sites.google.com/view/mr-steve. + +
+
+
+
+
+ + ♻ ☆ Scalar Function Topology Divergence: Comparing Topology of 3D Objects + + +
+ We propose a new topological tool for computer vision - Scalar Function +Topology Divergence (SFTD), which measures the dissimilarity of multi-scale +topology between sublevel sets of two functions having a common domain. +Functions can be defined on an undirected graph or Euclidean space of any +dimensionality. Most of the existing methods for comparing topology are based +on Wasserstein distance between persistence barcodes and they don't take into +account the localization of topological features. The minimization of SFTD +ensures that the corresponding topological features of scalar functions are +located in the same places. The proposed tool provides useful visualizations +depicting areas where functions have topological dissimilarities. We provide +applications of the proposed method to 3D computer vision. In particular, +experiments demonstrate that SFTD as an additional loss improves the +reconstruction of cellular 3D shapes from 2D fluorescence microscopy images, +and helps to identify topological errors in 3D segmentation. Additionally, we +show that SFTD outperforms Betti matching loss in 2D segmentation problems. + +
+
+
+
+
+ + ♻ ☆ LiCoEval: Evaluating LLMs on License Compliance in Code Generation + + +
+ Recent advances in Large Language Models (LLMs) have revolutionized code +generation, leading to widespread adoption of AI coding tools by developers. +However, LLMs can generate license-protected code without providing the +necessary license information, leading to potential intellectual property +violations during software production. This paper addresses the critical, yet +underexplored, issue of license compliance in LLM-generated code by +establishing a benchmark to evaluate the ability of LLMs to provide accurate +license information for their generated code. To establish this benchmark, we +conduct an empirical study to identify a reasonable standard for "striking +similarity" that excludes the possibility of independent creation, indicating a +copy relationship between the LLM output and certain open-source code. Based on +this standard, we propose LiCoEval, to evaluate the license compliance +capabilities of LLMs, i.e., the ability to provide accurate license or +copyright information when they generate code with striking similarity to +already existing copyrighted code. Using LiCoEval, we evaluate 14 popular LLMs, +finding that even top-performing LLMs produce a non-negligible proportion +(0.88% to 2.01%) of code strikingly similar to existing open-source +implementations. Notably, most LLMs fail to provide accurate license +information, particularly for code under copyleft licenses. These findings +underscore the urgent need to enhance LLM compliance capabilities in code +generation tasks. Our study provides a foundation for future research and +development to improve license compliance in AI-assisted software development, +contributing to both the protection of open-source software copyrights and the +mitigation of legal risks for LLM users. + +
+
+ comment: The 47th International Conference on Software Engineering (ICSE 2025) +
+
+
+
+
+ + ♻ ☆ Towards Human-AI Complementarity with Prediction Sets NeurIPS 2024 + + +
+ Decision support systems based on prediction sets have proven to be effective +at helping human experts solve classification tasks. Rather than providing +single-label predictions, these systems provide sets of label predictions +constructed using conformal prediction, namely prediction sets, and ask human +experts to predict label values from these sets. In this paper, we first show +that the prediction sets constructed using conformal prediction are, in +general, suboptimal in terms of average accuracy. Then, we show that the +problem of finding the optimal prediction sets under which the human experts +achieve the highest average accuracy is NP-hard. More strongly, unless P = NP, +we show that the problem is hard to approximate to any factor less than the +size of the label set. However, we introduce a simple and efficient greedy +algorithm that, for a large class of expert models and non-conformity scores, +is guaranteed to find prediction sets that provably offer equal or greater +performance than those constructed using conformal prediction. Further, using a +simulation study with both synthetic and real expert predictions, we +demonstrate that, in practice, our greedy algorithm finds near-optimal +prediction sets offering greater performance than conformal prediction. + +
+
+ comment: Published in NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ OpenUAS: Embeddings of Cities in Japan with Anchor Data for Cross-city + Analysis of Area Usage Patterns + + +
+ We publicly release OpenUAS, a dataset of area embeddings based on urban +usage patterns, including embeddings for over 1.3 million 50-meter square +meshes covering a total area of 3,300 square kilometers. This dataset is +valuable for analyzing area functions in fields such as market analysis, urban +planning, transportation infrastructure, and infection prediction. It captures +the characteristics of each area in the city, such as office districts and +residential areas, by employing an area embedding technique that utilizes +location information typically obtained by GPS. Numerous area embedding +techniques have been proposed, and while the public release of such embedding +datasets is technically feasible, it has not been realized. One reason for this +is that previous methods could not embed areas from different cities and +periods into the same embedding space without sharing raw location data. We +address this issue by developing an anchoring method that establishes anchors +within a shared embedding space. We publicly release this anchor dataset along +with area embedding datasets from several periods in eight major Japanese +cities. + +
+
+
+
+
+ + ♻ ☆ Fast and Functional Structured Data Generators Rooted in + Out-of-Equilibrium Physics + + +
+ In this study, we address the challenge of using energy-based models to +produce high-quality, label-specific data in complex structured datasets, such +as population genetics, RNA or protein sequences data. Traditional training +methods encounter difficulties due to inefficient Markov chain Monte Carlo +mixing, which affects the diversity of synthetic data and increases generation +times. To address these issues, we use a novel training algorithm that exploits +non-equilibrium effects. This approach, applied on the Restricted Boltzmann +Machine, improves the model's ability to correctly classify samples and +generate high-quality synthetic data in only a few sampling steps. The +effectiveness of this method is demonstrated by its successful application to +four different types of data: handwritten digits, mutations of human genomes +classified by continental origin, functionally characterized sequences of an +enzyme protein family, and homologous RNA sequences from specific taxonomies. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Robust Clustering on High-Dimensional Data with Stochastic Quantization + + +
+ This paper addresses the limitations of conventional vector quantization +algorithms, particularly K-Means and its variant K-Means++, and investigates +the Stochastic Quantization (SQ) algorithm as a scalable alternative for +high-dimensional unsupervised and semi-supervised learning tasks. Traditional +clustering algorithms often suffer from inefficient memory utilization during +computation, necessitating the loading of all data samples into memory, which +becomes impractical for large-scale datasets. While variants such as Mini-Batch +K-Means partially mitigate this issue by reducing memory usage, they lack +robust theoretical convergence guarantees due to the non-convex nature of +clustering problems. In contrast, the Stochastic Quantization algorithm +provides strong theoretical convergence guarantees, making it a robust +alternative for clustering tasks. We demonstrate the computational efficiency +and rapid convergence of the algorithm on an image classification problem with +partially labeled data, comparing model accuracy across various ratios of +labeled to unlabeled data. To address the challenge of high dimensionality, we +employ a Triplet Network to encode images into low-dimensional representations +in a latent space, which serve as a basis for comparing the efficiency of both +the Stochastic Quantization algorithm and traditional quantization algorithms. +Furthermore, we enhance the algorithm's convergence speed by introducing +modifications with an adaptive learning rate. + +
+
+ comment: 22 pages, 5 figures, to be published in the International Scientific + Technical Journal "Problems of Control and Informatics" +
+
+
+
+
+ + ♻ ☆ Learn from Heterophily: Heterophilous Information-enhanced Graph Neural + Network + + +
+ Under circumstances of heterophily, where nodes with different labels tend to +be connected based on semantic meanings, Graph Neural Networks (GNNs) often +exhibit suboptimal performance. Current studies on graph heterophily mainly +focus on aggregation calibration or neighbor extension and address the +heterophily issue by utilizing node features or structural information to +improve GNN representations. In this paper, we propose and demonstrate that the +valuable semantic information inherent in heterophily can be utilized +effectively in graph learning by investigating the distribution of neighbors +for each individual node within the graph. The theoretical analysis is carried +out to demonstrate the efficacy of the idea in enhancing graph learning. Based +on this analysis, we propose HiGNN, an innovative approach that constructs an +additional new graph structure, that integrates heterophilous information by +leveraging node distribution to enhance connectivity between nodes that share +similar semantic characteristics. We conduct empirical assessments on node +classification tasks using both homophilous and heterophilous benchmark +datasets and compare HiGNN to popular GNN baselines and SoTA methods, +confirming the effectiveness in improving graph representations. In addition, +by incorporating heterophilous information, we demonstrate a notable +enhancement in existing GNN-based approaches, and the homophily degree across +real-world datasets, thus affirming the efficacy of our approach. + +
+
+
+
+
+ + ♻ ☆ Model-independent cosmological inference post DESI DR1 BAO measurements + + +
+ In this work, we implement Gaussian process regression to reconstruct the +expansion history of the universe in a model-agnostic manner, using the +Pantheon-Plus SN-Ia compilation in combination with two different BAO +measurements (SDSS-IV and DESI DR1). In both the reconstructions, the +$\Lambda$CDM model is always included in the 95\% confidence intervals. We find +evidence that the DESI LRG data at $z_{\text{eff}} = 0.51$ is not an outlier +within our model-independent framework. We study the $\mathcal{O}m$-diagnostics +and the evolution of the total equation of state (EoS) of our universe, which +hint towards the possibility of a quintessence-like dark energy scenario with a +very slowly varying EoS, and a phantom-crossing in higher $z$. The entire +exercise is later complemented by considering two more SN-Ia compilations - +DES-5YR and Union3 - in combination with DESI BAO. Reconstruction with the DESI +BAO + DES-5YR SN data sets predicts that the $\Lambda$CDM model lies outside +the 3$\sigma$ confidence levels, whereas with DESI BAO + Union3 data, the +$\Lambda$CDM model is always included within 1$\sigma$. We also report +constraints on $H_0 r_d$ from our model-agnostic analysis, independent of the +pre-recombination physics. Our results point towards an $\approx$ 2$\sigma$ +discrepancy between the DESI + Pantheon-Plus and DESI + DES-5YR data sets, +which calls for further investigation. + +
+
+ comment: 10 pages, 6 sets of figures. Accepted for publication in PRD +
+
+
+
+
+ + ♻ ☆ Smooth Sensitivity for Learning Differentially-Private yet Accurate Rule + Lists + + +
+ Differentially-private (DP) mechanisms can be embedded into the design of a +machine learning algorithm to protect the resulting model against privacy +leakage. However, this often comes with a significant loss of accuracy due to +the noise added to enforce DP. In this paper, we aim at improving this +trade-off for a popular class of machine learning algorithms leveraging the +Gini impurity as an information gain criterion to greedily build interpretable +models such as decision trees or rule lists. To this end, we establish the +smooth sensitivity of the Gini impurity, which can be used to obtain thorough +DP guarantees while adding noise scaled with tighter magnitude. We illustrate +the applicability of this mechanism by integrating it within a greedy algorithm +producing rule list models, motivated by the fact that such models remain +understudied in the DP literature. Our theoretical analysis and experimental +results confirm that the DP rule lists models integrating smooth sensitivity +have higher accuracy than those using other DP frameworks based on global +sensitivity, for identical privacy budgets. + +
+
+
+
+
+ + ♻ ☆ SPARTAN: A Sparse Transformer Learning Local Causation + + +
+ Causal structures play a central role in world models that flexibly adapt to +changes in the environment. While recent works motivate the benefits of +discovering local causal graphs for dynamics modelling, in this work we +demonstrate that accurately capturing these relationships in complex settings +remains challenging for the current state-of-the-art. To remedy this +shortcoming, we postulate that sparsity is a critical ingredient for the +discovery of such local causal structures. To this end we present the SPARse +TrANsformer World model (SPARTAN), a Transformer-based world model that learns +local causal structures between entities in a scene. By applying sparsity +regularisation on the attention pattern between object-factored tokens, SPARTAN +identifies sparse local causal models that accurately predict future object +states. Furthermore, we extend our model to capture sparse interventions with +unknown targets on the dynamics of the environment. This results in a highly +interpretable world model that can efficiently adapt to changes. Empirically, +we evaluate SPARTAN against the current state-of-the-art in object-centric +world models on observation-based environments and demonstrate that our model +can learn accurate local causal graphs and achieve significantly improved +few-shot adaptation to changes in the dynamics of the environment as well as +robustness against removing irrelevant distractors. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot NAS via the Suppression of Local Entropy Decrease + + +
+ Architecture performance evaluation is the most time-consuming part of neural +architecture search (NAS). Zero-Shot NAS accelerates the evaluation by +utilizing zero-cost proxies instead of training. Though effective, existing +zero-cost proxies require invoking backpropagations or running networks on +input data, making it difficult to further accelerate the computation of +proxies. To alleviate this issue, architecture topologies are used to evaluate +the performance of networks in this study. We prove that particular +architectural topologies decrease the local entropy of feature maps, which +degrades specific features to a bias, thereby reducing network performance. +Based on this proof, architectural topologies are utilized to quantify the +suppression of local entropy decrease (SED) as a data-free and running-free +proxy. Experimental results show that SED outperforms most state-of-the-art +proxies in terms of architecture selection on five benchmarks, with computation +time reduced by three orders of magnitude. We further compare the SED-based NAS +with state-of-the-art proxies. SED-based NAS selects the architecture with +higher accuracy and fewer parameters in only one second. The theoretical +analyses of local entropy and experimental results demonstrate that the +suppression of local entropy decrease facilitates selecting optimal +architectures in Zero-Shot NAS. + +
+
+ comment: 8 pages, 2 figures. Corrected typos and latex template +
+
+
+
+
+ + ♻ ☆ CALoR: Towards Comprehensive Model Inversion Defense + + +
+ Model Inversion Attacks (MIAs) aim at recovering privacy-sensitive training +data from the knowledge encoded in the released machine learning models. Recent +advances in the MIA field have significantly enhanced the attack performance +under multiple scenarios, posing serious privacy risks of Deep Neural Networks +(DNNs). However, the development of defense strategies against MIAs is +relatively backward to resist the latest MIAs and existing defenses fail to +achieve further trade-off between model utility and model robustness. In this +paper, we provide an in-depth analysis from the perspective of intrinsic +vulnerabilities of MIAs, comprehensively uncovering the weaknesses inherent in +the basic pipeline, which are partially investigated in the previous defenses. +Building upon these new insights, we propose a robust defense mechanism, +integrating Confidence Adaptation and Low-Rank compression (CALoR). Our method +includes a novel robustness-enhanced classification loss specially-designed for +model inversion defenses and reveals the extraordinary effectiveness of +compressing the classification header. With CALoR, we can mislead the +optimization objective, reduce the leaked information and impede the +backpropagation of MIAs, thus mitigating the risk of privacy leakage. Extensive +experimental results demonstrate that our method achieves state-of-the-art +(SOTA) defense performance against MIAs and exhibits superior generalization to +existing defenses across various scenarios. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Deception Detection from Linguistic and Physiological Data Streams Using + Bimodal Convolutional Neural Networks + + +
+ Deception detection is gaining increasing interest due to ethical and +security concerns. This paper explores the application of convolutional neural +networks for the purpose of multimodal deception detection. We use a dataset +built by interviewing 104 subjects about two topics, with one truthful and one +falsified response from each subject about each topic. In particular, we make +three main contributions. First, we extract linguistic and physiological +features from this data to train and construct the neural network models. +Second, we propose a fused convolutional neural network model using both +modalities in order to achieve an improved overall performance. Third, we +compare our new approach with earlier methods designed for multimodal deception +detection. We find that our system outperforms regular classification methods; +our results indicate the feasibility of using neural networks for deception +detection even in the presence of limited amounts of data. + +
+
+ comment: Accepted by 2024 5th International Conference on Information Science, + Parallel and Distributed Systems +
+
+
+
+
+ + ♻ ☆ Stochastic Super-resolution of Cosmological Simulations with Denoising + Diffusion Models + + +
+ In recent years, deep learning models have been successfully employed for +augmenting low-resolution cosmological simulations with small-scale +information, a task known as "super-resolution". So far, these cosmological +super-resolution models have relied on generative adversarial networks (GANs), +which can achieve highly realistic results, but suffer from various +shortcomings (e.g. low sample diversity). We introduce denoising diffusion +models as a powerful generative model for super-resolving cosmic large-scale +structure predictions (as a first proof-of-concept in two dimensions). To +obtain accurate results down to small scales, we develop a new "filter-boosted" +training approach that redistributes the importance of different scales in the +pixel-wise training objective. We demonstrate that our model not only produces +convincing super-resolution images and power spectra consistent at the percent +level, but is also able to reproduce the diversity of small-scale features +consistent with a given low-resolution simulation. This enables uncertainty +quantification for the generated small-scale features, which is critical for +the usefulness of such super-resolution models as a viable surrogate model for +cosmic structure formation. + +
+
+ comment: 9 pages, 8 figures, to be submitted to OJA, comments welcome +
+
+
+
+
+ + ♻ ☆ Online Iterative Reinforcement Learning from Human Feedback with General + Preference Model + + +
+ We investigate Reinforcement Learning from Human Feedback (RLHF) in the +context of a general preference oracle. In particular, we do not assume the +existence of a reward function and an oracle preference signal drawn from the +Bradley-Terry model as most of the prior works do. We consider a standard +mathematical formulation, the reverse-KL regularized minimax game between two +LLMs for RLHF under general preference oracle. The learning objective of this +formulation is to find a policy so that it is consistently preferred by the +KL-regularized preference oracle over any competing LLMs. We show that this +framework is strictly more general than the reward-based one, and propose +sample-efficient algorithms for both the offline learning from a pre-collected +preference dataset and online learning where we can query the preference oracle +along the way of training. Empirical studies verify the effectiveness of the +proposed framework. + +
+
+ comment: RLHF, Preference Learning, Alignment for LLMs +
+
+
+
+
+ + ♻ ☆ SKVQ: Sliding-window Key and Value Cache Quantization for Large Language + Models + + +
+ Large language models (LLMs) can now handle longer sequences of tokens, +enabling complex tasks like book understanding and generating lengthy novels. +However, the key-value (KV) cache required for LLMs consumes substantial memory +as context length increases, becoming the bottleneck for deployment. In this +paper, we present a strategy called SKVQ, which stands for sliding-window KV +cache quantization, to address the issue of extremely low bitwidth KV cache +quantization. To achieve this, SKVQ rearranges the channels of the KV cache in +order to improve the similarity of channels in quantization groups, and applies +clipped dynamic quantization at the group level. Additionally, SKVQ ensures +that the most recent window tokens in the KV cache are preserved with high +precision. This helps maintain the accuracy of a small but important portion of +the KV cache. SKVQ achieves high compression ratios while maintaining accuracy. +Our evaluation on LLMs demonstrates that SKVQ surpasses previous quantization +approaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit +values with minimal loss of accuracy. With SKVQ, it is possible to process +context lengths of up to 1M on an 80GB memory GPU for a 7b model and up to 7 +times faster decoding. + +
+
+
+
+
+ + ♻ ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ Add-it: Training-Free Object Insertion in Images With Pretrained + Diffusion Models + + +
+ Adding objects into images based on text instructions is a challenging task in +semantic image editing, requiring a balance between preserving the original +scene and seamlessly integrating the new object in a fitting location. Despite +extensive efforts, existing models often struggle with this balance, +particularly with finding a natural location for adding an object in complex +scenes. We introduce Add-it, a training-free approach that extends diffusion +models' attention mechanisms to incorporate information from three key sources: +the scene image, the text prompt, and the generated image itself. Our weighted +extended-attention mechanism maintains structural consistency and fine details +while ensuring natural object placement. Without task-specific fine-tuning, +Add-it achieves state-of-the-art results on both real and generated image +insertion benchmarks, including our newly constructed "Additing Affordance +Benchmark" for evaluating object placement plausibility, outperforming +supervised methods. Human evaluations show that Add-it is preferred in over 80% +of cases, and it also demonstrates improvements in various automated metrics. + +
+
+ comment: Project page is at https://research.nvidia.com/labs/par/addit/ +
+
+
+
+
+ + ♻ ☆ Enhancing Exchange Rate Forecasting with Explainable Deep Learning + Models + + +
+ Accurate exchange rate prediction is fundamental to financial stability and +international trade, positioning it as a critical focus in economic and +financial research. Traditional forecasting models often falter when addressing +the inherent complexities and non-linearities of exchange rate data. This study +explores the application of advanced deep learning models, including LSTM, CNN, +and transformer-based architectures, to enhance the predictive accuracy of the +RMB/USD exchange rate. Utilizing 40 features across 6 categories, the analysis +identifies TSMixer as the most effective model for this task. A rigorous +feature selection process emphasizes the inclusion of key economic indicators, +such as China-U.S. trade volumes and exchange rates of other major currencies +like the euro-RMB and yen-dollar pairs. The integration of grad-CAM +visualization techniques further enhances model interpretability, allowing for +clearer identification of the most influential features and bolstering the +credibility of the predictions. These findings underscore the pivotal role of +fundamental economic data in exchange rate forecasting and highlight the +substantial potential of machine learning models to deliver more accurate and +reliable predictions, thereby serving as a valuable tool for financial analysis +and decision-making. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ A Comparative Study on Enhancing Prediction in Social Network + Advertisement through Data Augmentation + + +
+ In the ever-evolving landscape of social network advertising, the volume and +accuracy of data play a critical role in the performance of predictive models. +However, the development of robust predictive algorithms is often hampered by +the limited size and potential bias present in real-world datasets. This study +presents and explores a generative augmentation framework of social network +advertising data. Our framework explores three generative models for data +augmentation - Generative Adversarial Networks (GANs), Variational Autoencoders +(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and +diversity in the context of social network advertising analytics effectiveness. +By performing synthetic extensions of the feature space, we find that through +data augmentation, the performance of various classifiers has been +quantitatively improved. Furthermore, we compare the relative performance gains +brought by each data augmentation technique, providing insights for +practitioners to select appropriate techniques to enhance model performance. +This paper contributes to the literature by showing that synthetic data +augmentation alleviates the limitations imposed by small or imbalanced datasets +in the field of social network advertising. At the same time, this article also +provides a comparative perspective on the practicality of different data +augmentation methods, thereby guiding practitioners to choose appropriate +techniques to enhance model performance. + +
+
+ comment: Accepted by 2024 4th International Conference on Machine Learning and + Intelligent Systems Engineering (MLISE) +
+
+
+
+
+ + ♻ ☆ Time Series Modeling for Heart Rate Prediction: From ARIMA to + Transformers + + +
+ Cardiovascular disease (CVD) is a leading cause of death globally, +necessitating precise forecasting models for monitoring vital signs like heart +rate, blood pressure, and ECG. Traditional models, such as ARIMA and Prophet, +are limited by their need for manual parameter tuning and challenges in +handling noisy, sparse, and highly variable medical data. This study +investigates advanced deep learning models, including LSTM and +transformer-based architectures, for predicting heart rate time series from the +MIT-BIH Database. Results demonstrate that deep learning models, particularly +PatchTST, significantly outperform traditional models across multiple metrics, +capturing complex patterns and dependencies more effectively. This research +underscores the potential of deep learning to enhance patient monitoring and +CVD management, suggesting substantial clinical benefits. Future work should +extend these findings to larger, more diverse datasets and real-world clinical +applications to further validate and optimize model performance. + +
+
+ comment: Accepted by 2024 6th International Conference on Electronic + Engineering and Informatics +
+
+
+
+
+ + ♻ ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +results comparable to the GAN-based approaches owing to better handling of complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Graph Agent Network: Empowering Nodes with Inference Capabilities for + Adversarial Resilience + + +
+ End-to-end training with global optimization has popularized graph neural +networks (GNNs) for node classification, yet inadvertently introduced +vulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit +the inherently open interfaces of GNNs' input and output, perturbing critical +edges and thus manipulating the classification results. Current defenses, due +to their persistent utilization of global-optimization-based end-to-end +training schemes, inherently encapsulate the vulnerabilities of GNNs. This is +specifically evidenced in their inability to defend against targeted secondary +attacks. In this paper, we propose the Graph Agent Network (GAgN) to address +the aforementioned vulnerabilities of GNNs. GAgN is a graph-structured agent +network in which each node is designed as a 1-hop-view agent. Through the +decentralized interactions between agents, they can learn to infer global +perceptions to perform tasks including inferring embeddings, degrees and +neighbor relationships for given nodes. This empowers nodes to filter +adversarial edges while carrying out classification tasks. Furthermore, agents' +limited view prevents malicious messages from propagating globally in GAgN, +thereby resisting global-optimization-based secondary attacks. We prove that +single-hidden-layer multilayer perceptrons (MLPs) are theoretically sufficient +to achieve these functionalities. Experimental results show that GAgN +effectively implements all its intended capabilities and, compared to +state-of-the-art defenses, achieves optimal classification accuracy on the +perturbed datasets. + +
+
+
+
+
+ + ♻ ☆ Identifying Backdoored Graphs in Graph Neural Network Training: An + Explanation-Based Approach with Novel Metrics + + +
+ Graph Neural Networks (GNNs) have gained popularity in numerous domains, yet +they are vulnerable to backdoor attacks that can compromise their performance +and ethical application. The detection of these attacks is crucial for +maintaining the reliability and security of GNN classification tasks, but +effective detection techniques are lacking. Recognizing the challenge in +detecting such intrusions, we devised a novel detection method that creatively +leverages graph-level explanations. By extracting and transforming secondary +outputs from GNN explanation mechanisms, we developed seven innovative metrics +for effective detection of backdoor attacks on GNNs. Additionally, we develop +an adaptive attack to rigorously evaluate our approach. We test our method on +multiple benchmark datasets and examine its efficacy against various attack +models. Our results show that our method can achieve high detection +performance, marking a significant advancement in safeguarding GNNs against +backdoor attacks. + +
+
+
+
+
+ + ♻ ☆ Feature Selection Based on Wasserstein Distance + + +
+ This paper presents a novel feature selection method leveraging the +Wasserstein distance to improve feature selection in machine learning. Unlike +traditional methods based on correlation or Kullback-Leibler (KL) divergence, +our approach uses the Wasserstein distance to assess feature similarity, +inherently capturing class relationships and making it robust to noisy labels. +We introduce a Markov blanket-based feature selection algorithm and demonstrate +its effectiveness. Our analysis shows that the Wasserstein distance-based +feature selection method effectively reduces the impact of noisy labels without +relying on specific noise models. We provide a lower bound on its +effectiveness, which remains meaningful even in the presence of noise. +Experimental results across multiple datasets demonstrate that our approach +consistently outperforms traditional methods, particularly in noisy settings. + +
+
+
+
+
+ + ♻ ☆ Game-theoretic LLM: Agent Workflow for Negotiation Games + + +
+ This paper investigates the rationality of large language models (LLMs) in +strategic decision-making contexts, specifically within the framework of game +theory. We evaluate several state-of-the-art LLMs across a spectrum of +complete-information and incomplete-information games. Our findings reveal that +LLMs frequently deviate from rational strategies, particularly as the +complexity of the game increases with larger payoff matrices or deeper +sequential trees. + To address these limitations, we design multiple game-theoretic workflows +that guide the reasoning and decision-making processes of LLMs. These workflows +aim to enhance the models' ability to compute Nash Equilibria and make rational +choices, even under conditions of uncertainty and incomplete information. +Experimental results demonstrate that the adoption of these workflows +significantly improves the rationality and robustness of LLMs in game-theoretic +tasks. Specifically, with the workflow, LLMs exhibit marked improvements in +identifying optimal strategies, achieving near-optimal allocations in +negotiation scenarios, and reducing susceptibility to exploitation during +negotiations. Furthermore, we explore the meta-strategic considerations of +whether it is rational for agents to adopt such workflows, recognizing that the +decision to use or forgo the workflow constitutes a game-theoretic issue in +itself. + Our research contributes to a deeper understanding of LLMs' decision-making +capabilities in strategic contexts and provides insights into enhancing their +rationality through structured workflows. The findings have implications for +the development of more robust and strategically sound AI agents capable of +navigating complex interactive environments. Code and data supporting this +study are available at \url{https://github.com/Wenyueh/game_theory}. + +
+
+ comment: 45 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Medication Recommendation via Dual Molecular Modalities and Multi-Step + Enhancement + + +
+ Existing works based on molecular knowledge neglect the 3D geometric +structure of molecules and fail to learn the high-dimensional information of +medications, leading to structural confusion. Additionally, they do not extract +key substructures from a single patient visit, resulting in the failure to +identify medication molecules suitable for the current patient visit. To +address the above limitations, we propose a bimodal molecular recommendation +framework named BiMoRec, which introduces 3D molecular structures to obtain +atomic 3D coordinates and edge indices, overcoming the inherent lack of +high-dimensional molecular information in 2D molecular structures. To retain +the fast training and prediction efficiency of the recommendation system, we +use bimodal graph contrastive pretraining to maximize the mutual information +between the two molecular modalities, achieving the fusion of 2D and 3D +molecular graphs. Additionally, we designed a molecular multi-step enhancement +mechanism to re-calibrate the molecular weights. Specifically, we employ a +pre-training method that captures both 2D and 3D molecular structure +representations, along with substructure representations, and leverages +contrastive learning to extract mutual information. We then use the pre-trained +encoder to generate molecular representations, enhancing them through a +three-step process: intra-visit, molecular per-visit, and latest-visit. +Finally, we apply temporal information aggregation to generate the final +medication combinations. Our implementation on the MIMIC-III and MIMIC-IV +datasets demonstrates that our method achieves state-of-the-art performance. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ MicroScopiQ: Accelerating Foundational Models through Outlier-Aware + Microscaling Quantization + + +
+ Quantization of foundational models (FMs) is significantly more challenging +than traditional DNNs due to the emergence of large magnitude features called +outliers. Existing outlier-aware algorithm/architecture co-design techniques +either use mixed-precision, retaining outliers at high precision but compromise +hardware efficiency, or quantize inliers and outliers at the same precision, +improving hardware efficiency at the cost of accuracy. To address this mutual +exclusivity, in this paper, we propose MicroScopiQ, a novel co-design technique +that leverages pruning to complement outlier-aware quantization. MicroScopiQ +retains outliers at higher precision while pruning a certain fraction of least +important weights to distribute the additional outlier bits; ensuring high +accuracy, aligned memory and hardware efficiency. We design a high-throughput, +low overhead accelerator architecture composed of simple multi-precision INT +processing elements and a novel network-on-chip called ReCoN that efficiently +abstracts the complexity of supporting high-precision outliers. Additionally, +unlike existing alternatives, MicroScopiQ does not assume any locality of +outlier weights, enabling applicability to a broad range of FMs. Extensive +experiments across various quantization settings show that MicroScopiQ achieves +SoTA quantization performance while simultaneously improving inference +performance by 3x and reducing energy by 2x over existing alternatives. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Human-in-the-Loop Segmentation of Multi-species Coral Imagery CVPR2024 + + +
+ Marine surveys by robotic underwater and surface vehicles result in +substantial quantities of coral reef imagery; however, labeling these images is +expensive and time-consuming for domain experts. Point label propagation is a +technique that uses existing images labeled with sparse points to create +augmented ground truth data, which can be used to train a semantic segmentation +model. In this work, we show that recent advances in large foundation models +facilitate the creation of augmented ground truth masks using only features +extracted by the denoised version of the DINOv2 foundation model and K-Nearest +Neighbors (KNN), without any pre-training. For images with extremely sparse +labels, we present a labeling method based on human-in-the-loop principles, +which greatly enhances annotation efficiency: in the case that there are 5 +point labels per image, our human-in-the-loop method outperforms the prior +state-of-the-art by 14.2% for pixel accuracy and 19.7% for mIoU; and by 8.9% +and 18.3% if there are 10 point labels. When human-in-the-loop labeling is not +available, using the denoised DINOv2 features with a KNN still improves on the +prior state-of-the-art by 2.7% for pixel accuracy and 5.8% for mIoU (5 grid +points). On the semantic segmentation task, we outperform the prior +state-of-the-art by 8.8% for pixel accuracy and by 13.5% for mIoU when only 5 +point labels are used for point label propagation. Additionally, we perform a +comprehensive study into the impacts of the point label placement style and the +number of points on the point label propagation quality, and make several +recommendations for improving the efficiency of labeling images with points. + +
+
+ comment: Journal article preprint of extended paper, 30 pages, 11 figures. + Original conference paper (v2) accepted at the CVPR2024 3rd Workshop on + Learning with Limited Labelled Data for Image and Video Understanding + (L3D-IVU) +
+
+
+
+
+ + ♻ ☆ Self-Data Distillation for Recovering Quality in Pruned Large Language + Models + + +
+ Large language models have driven significant progress in natural language +processing, but their deployment requires substantial compute and memory +resources. As models scale, compression techniques become essential for +balancing model quality with computational efficiency. Structured pruning, +which removes less critical components of the model, is a promising strategy +for reducing complexity. However, one-shot pruning often results in significant +quality degradation, particularly in tasks requiring multi-step reasoning. To +recover lost quality, supervised fine-tuning (SFT) is commonly applied, but it +can lead to catastrophic forgetting by shifting the model's learned data +distribution. Therefore, addressing the degradation from both pruning and SFT +is essential to preserve the original model's quality. In this work, we utilize +self-data distilled fine-tuning to address these challenges. Our approach +leverages the original, unpruned model to generate a distilled dataset that +preserves semantic richness and mitigates catastrophic forgetting by +maintaining alignment with the base model's knowledge. Empirically, we +demonstrate that self-data distillation consistently outperforms standard SFT, +improving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard +v1. Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct +(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B +parameters), our method retains 91.2% of the original model's accuracy compared +to 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore, +combining self-data distilled models through model merging yields enhanced +quality retention. Additionally, leveraging these pruned models in speculative +decoding increases token acceptance rates, thereby improving inference +efficiency in applied settings. + +
+
+ comment: 13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary + Material) +
+
+
+
+
+ + ♻ ☆ An Efficient Privacy-aware Split Learning Framework for Satellite + Communications + + +
+ In the rapidly evolving domain of satellite communications, integrating +advanced machine learning techniques, particularly split learning, is crucial +for enhancing data processing and model training efficiency across satellites, +space stations, and ground stations. Traditional ML approaches often face +significant challenges within satellite networks due to constraints such as +limited bandwidth and computational resources. To address this gap, we propose +a novel framework for more efficient SL in satellite communications. Our +approach, Dynamic Topology Informed Pruning, namely DTIP, combines differential +privacy with graph and model pruning to optimize graph neural networks for +distributed learning. DTIP strategically applies differential privacy to raw +graph data and prunes GNNs, thereby optimizing both model size and +communication load across network tiers. Extensive experiments across diverse +datasets demonstrate DTIP's efficacy in enhancing privacy, accuracy, and +computational efficiency. Specifically, on Amazon2M dataset, DTIP maintains an +accuracy of 0.82 while achieving a 50% reduction in floating-point operations +per second. Similarly, on ArXiv dataset, DTIP achieves an accuracy of 0.85 +under comparable conditions. Our framework not only significantly improves the +operational efficiency of satellite communications but also establishes a new +benchmark in privacy-aware distributed learning, potentially revolutionizing +data handling in space-based networks. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ A Bayesian Framework for Causal Analysis of Recurrent Events with Timing + Misalignment + + +
+ Observational studies of recurrent event rates are common in biomedical +statistics. Broadly, the goal is to estimate differences in event rates under +two treatments within a defined target population over a specified followup +window. Estimation with observational data is challenging because, while +membership in the target population is defined in terms of eligibility +criteria, treatment is rarely observed exactly at the time of eligibility. +Ad-hoc solutions to this timing misalignment can induce bias by incorrectly +attributing prior event counts and person-time to treatment. Even if +eligibility and treatment are aligned, a terminal event process (e.g. death) +often stops the recurrent event process of interest. In practice, both +processes can be censored so that events are not observed over the entire +followup window. Our approach addresses misalignment by casting it as a +time-varying treatment problem: some patients are on treatment at eligibility +while others are off treatment but may switch to treatment at a specified time +- if they survive long enough. We define and identify an average causal effect +estimand under right-censoring. Estimation is done using a g-computation +procedure with a joint semiparametric Bayesian model for the death and +recurrent event processes. We apply the method to contrast hospitalization +rates among patients with different opioid treatments using Medicare insurance +claims data. + +
+
+
+
+
+ + ♻ ☆ vTune: Verifiable Fine-Tuning for LLMs Through Backdooring + + +
+ As fine-tuning large language models (LLMs) becomes increasingly prevalent, +users often rely on third-party services with limited visibility into their +fine-tuning processes. This lack of transparency raises the question: how do +consumers verify that fine-tuning services are performed correctly? For +instance, a service provider could claim to fine-tune a model for each user, +yet simply send all users back the same base model. To address this issue, we +propose vTune, a simple method that uses a small number of backdoor data points +added to the training data to provide a statistical test for verifying that a +provider fine-tuned a custom model on a particular user's dataset. Unlike +existing works, vTune is able to scale to verification of fine-tuning on +state-of-the-art LLMs, and can be used both with open-source and closed-source +models. We test our approach across several model families and sizes as well as +across multiple instruction-tuning datasets, and find that the statistical test +is satisfied with p-values on the order of $\sim 10^{-40}$, with no negative +impact on downstream task performance. Further, we explore several attacks that +attempt to subvert vTune and demonstrate the method's robustness to these +attacks. + +
+
+
+
+
+ + ♻ ☆ FoldMark: Protecting Protein Generative Models with Watermarking + + +
+ Protein structure is key to understanding protein function and is essential +for progress in bioengineering, drug discovery, and molecular biology. +Recently, with the incorporation of generative AI, the power and accuracy of +computational protein structure prediction/design have been improved +significantly. However, ethical concerns such as copyright protection and +harmful content generation (biosecurity) pose challenges to the wide +implementation of protein generative models. Here, we investigate whether it is +possible to embed watermarks into protein generative models and their outputs +for copyright authentication and the tracking of generated structures. As a +proof of concept, we propose a two-stage method FoldMark as a generalized +watermarking strategy for protein generative models. FoldMark first pretrains a +watermark encoder and decoder, which can slightly adjust protein structures to +embed user-specific information and faithfully recover the information from the +encoded structure. In the second step, protein generative models are fine-tuned +with watermark-conditioned Low-Rank Adaptation (LoRA) modules to preserve +generation quality while learning to generate watermarked structures with high +recovery rates. Extensive experiments are conducted on open-source protein +structure prediction models (e.g., ESMFold and MultiFlow) and de novo structure +design models (e.g., FrameDiff and FoldFlow) and we demonstrate that our method +is effective across all these generative models. Meanwhile, our watermarking +framework only exerts a negligible impact on the original protein structure +quality and is robust under potential post-processing and adaptive attacks. + +
+
+
+
+
+ + ♻ ☆ The Inadequacy of Similarity-based Privacy Metrics: Privacy Attacks + against "Truly Anonymous" Synthetic Datasets + + +
+ Generative models producing synthetic data are meant to provide a +privacy-friendly approach to releasing data. However, their privacy guarantees +are only considered robust when models satisfy Differential Privacy (DP). Alas, +this is not a ubiquitous standard, as many leading companies (and, in fact, +research papers) use ad-hoc privacy metrics based on testing the statistical +similarity between synthetic and real data. In this paper, we examine the +privacy metrics used in real-world synthetic data deployments and demonstrate +their unreliability in several ways. First, we provide counter-examples where +severe privacy violations occur even if the privacy tests pass and instantiate +accurate membership and attribute inference attacks with minimal cost. We then +introduce ReconSyn, a reconstruction attack that generates multiple synthetic +datasets that are considered private by the metrics but actually leak +information unique to individual records. We show that ReconSyn recovers +78-100% of the outliers in the train data with only black-box access to a +single fitted generative model and the privacy metrics. In the process, we show +that applying DP only to the model does not mitigate this attack, as using +privacy metrics breaks the end-to-end DP pipeline. + +
+
+
+
+
+ + ♻ ☆ A Block Coordinate Descent Method for Nonsmooth Composite Optimization + under Orthogonality Constraints + + +
+ Nonsmooth composite optimization with orthogonality constraints is crucial in +statistical learning and data science, but it presents challenges due to its +nonsmooth objective and computationally expensive, non-convex constraints. In +this paper, we propose a new approach called \textbf{OBCD}, which leverages +Block Coordinate Descent (BCD) to address these challenges. \textbf{OBCD} is a +feasible method with a small computational footprint. In each iteration, it +updates $k$ rows of the solution matrix, where $k \geq 2$, while globally +solving a small nonsmooth optimization problem under orthogonality constraints. +We prove that \textbf{OBCD} converges to block-$k$ stationary points, which +offer stronger optimality than standard critical points. Notably, \textbf{OBCD} +is the first greedy descent method with monotonicity for this problem class. +Under the Kurdyka-Lojasiewicz (KL) inequality, we establish strong limit-point +convergence. We also extend \textbf{OBCD} with breakpoint searching methods for +subproblem solving and greedy strategies for working set selection. +Comprehensive experiments demonstrate the superior performance of our approach +across various tasks. + +
+
+
+
+
+ + ♻ ☆ Sketched Adaptive Federated Deep Learning: A Sharp Convergence Analysis + + +
+ Combining gradient compression methods (e.g., CountSketch, quantization) and +adaptive optimizers (e.g., Adam, AMSGrad) is a desirable goal in federated +learning (FL), with potential benefits on both fewer communication rounds and +less per-round communication. In spite of the preliminary empirical success of +sketched adaptive methods, existing convergence analyses show the communication +cost to have a linear dependence on the ambient dimension, i.e., number of +parameters, which is prohibitively high for modern deep learning models. In +this work, we introduce specific sketched adaptive federated learning (SAFL) +algorithms and, as our main contribution, provide theoretical convergence +analyses in different FL settings with guarantees on communication cost +depending only logarithmically (instead of linearly) on the ambient dimension. +Unlike existing analyses, we show that the entry-wise sketching noise existent +in the preconditioners and the first moments of SAFL can be implicitly +addressed by leveraging the recently-popularized anisotropic curvatures in deep +learning losses, e.g., fast decaying loss Hessian eigen-values. In the i.i.d. +client setting of FL, we show that SAFL achieves asymptotic $O(1/\sqrt{T})$ +convergence, and converges faster in the initial epochs. In the non-i.i.d. +client setting, where non-adaptive methods lack convergence guarantees, we show +that SACFL (SAFL with clipping) algorithms can provably converge in spite of +the additional heavy-tailed noise. Our theoretical claims are supported by +empirical studies on vision and language tasks, and in both fine-tuning and +training-from-scratch regimes. Surprisingly, as a by-product of our analysis, +the proposed SAFL methods are competitive with the state-of-the-art +communication-efficient federated learning algorithms based on error feedback. + +
+
+
+
+
+ + ♻ ☆ Wonderful Matrices: More Efficient and Effective Architecture for + Language Modeling Tasks + + +
+ We prove the availability of inner product form position encoding in the +state space dual algorithm and study the effectiveness of different position +embeddings in the hybrid quadratic causal self-attention and state space dual +algorithms. We propose inner function attention with dynamic mask, which can +improve the expressiveness of the attention algorithm and avoid the sequence +noise significantly affecting the accuracy of the attention score. We also +design cross domain mixture of experts, which can improve the granularity of +the sparse activation feedforward network while maintaining the efficiency of +parameter utilization and retrieval. The combination of these methods +constitutes our foundation model architecture: Wonderful Matrices. We conduct +experiments on the language modeling task and find that Wonderful Matrices are +more efficient and effective in handling complex language tasks. + +
+
+ comment: 28 pages, 8 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in + Variational AutoEncoder + + +
+ Symmetries of input and latent vectors have provided valuable insights for +disentanglement learning in VAEs. However, only a few works were proposed as an +unsupervised method, and even these works require known factor information in +the training data. We propose a novel method, Composite Factor-Aligned Symmetry +Learning (CFASL), which is integrated into VAEs for learning symmetry-based +disentanglement in unsupervised learning without any knowledge of the dataset +factor information. CFASL incorporates three novel features for learning +symmetry-based disentanglement: 1) Injecting inductive bias to align latent +vector dimensions to factor-aligned symmetries within an explicit learnable +symmetry codebook; 2) Learning a composite symmetry to express unknown factor +changes between two random samples by learning factor-aligned symmetries within +the codebook; 3) Inducing a group equivariant encoder and decoder in training +VAEs with the two conditions. In addition, we propose an extended evaluation +metric for multi-factor changes in comparison to disentanglement evaluation in +VAEs. In quantitative and in-depth qualitative analysis, CFASL demonstrates a +significant improvement of disentanglement in single-factor change, and +multi-factor change conditions compared to state-of-the-art methods. + +
+
+ comment: Accepted in TMLR, 25 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Non-overlapping, Schwarz-type Domain Decomposition Method for Physics + and Equality Constrained Artificial Neural Networks + + +
+ We present a non-overlapping, Schwarz-type domain decomposition method with a +generalized interface condition, designed for physics-informed machine learning +of partial differential equations (PDEs) in both forward and inverse contexts. +Our approach employs physics and equality-constrained artificial neural +networks (PECANN) within each subdomain. Unlike the original PECANN method, +which relies solely on initial and boundary conditions to constrain PDEs, our +method uses both boundary conditions and the governing PDE to constrain a +unique interface loss function for each subdomain. This modification improves +the learning of subdomain-specific interface parameters while reducing +communication overhead by delaying information exchange between neighboring +subdomains. To address the constrained optimization in each subdomain, we apply +an augmented Lagrangian method with a conditionally adaptive update strategy, +transforming the problem into an unconstrained dual optimization. A distinct +advantage of our domain decomposition method is its ability to learn solutions +to both Poisson's and Helmholtz equations, even in cases with high-wavenumber +and complex-valued solutions. Through numerical experiments with up to 64 +subdomains, we demonstrate that our method consistently generalizes well as the +number of subdomains increases. + +
+
+ comment: 49 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ PEaRL: Personalized Privacy of Human-Centric Systems using Early-Exit + Reinforcement Learning + + +
+ In the evolving landscape of human-centric systems, personalized privacy +solutions are becoming increasingly crucial due to the dynamic nature of human +interactions. Traditional static privacy models often fail to meet the diverse +and changing privacy needs of users. This paper introduces PEaRL, a system +designed to enhance privacy preservation by tailoring its approach to +individual behavioral patterns and preferences. While incorporating +reinforcement learning (RL) for its adaptability, PEaRL primarily focuses on +employing an early-exit strategy that dynamically balances privacy protection +and system utility. This approach addresses the challenges posed by the +variability and evolution of human behavior, which static privacy models +struggle to handle effectively. We evaluate PEaRL in two distinct contexts: +Smart Home environments and Virtual Reality (VR) Smart Classrooms. The +empirical results demonstrate PEaRL's capability to provide a personalized +tradeoff between user privacy and application utility, adapting effectively to +individual user preferences. On average, across both systems, PEaRL enhances +privacy protection by 31%, with a corresponding utility reduction of 24%. + +
+
+ comment: 15 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ A Unified Analysis on the Subgradient Upper Bounds for the Subgradient + Methods Minimizing Composite Nonconvex, Nonsmooth and Non-Lipschitz Functions + + +
+ This paper presents a unified analysis for the proximal subgradient method +(Prox-SubGrad) type approach to minimize an overall objective of $f(x)+r(x)$, +subject to convex constraints, where both $f$ and $r$ are weakly convex, +nonsmooth, and non-Lipschitz. Leveraging on the properties of the Moreau +envelope of weakly convex functions, we are able to relate error-bound +conditions, the growth conditions of the subgradients of the objective, and the +behavior of the proximal subgradient iterates on some remarkably broad classes +of objective functions. Various existing as well as new bounding conditions are +studied, leading to novel iteration complexity results. The terrain of our +exploration expands to stochastic proximal subgradient algorithms. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 156 + +
+
+
+ + ☆ Scaling Properties of Diffusion Models for Perceptual Tasks + + +
+ In this paper, we argue that iterative computation with diffusion models +offers a powerful paradigm for not only generation but also visual perception +tasks. We unify tasks such as depth estimation, optical flow, and segmentation +under image-to-image translation, and show how diffusion models benefit from +scaling training and test-time compute for these perception tasks. Through a +careful analysis of these scaling behaviors, we present various techniques to +efficiently train diffusion models for visual perception tasks. Our models +achieve improved or comparable performance to state-of-the-art methods using +significantly less data and compute. To use our code and models, see +https://scaling-diffusion-perception.github.io . + +
+
+
+
+
+ + ☆ GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D + Generation + + +
+ While 3D content generation has advanced significantly, existing methods +still face challenges with input formats, latent space design, and output +representations. This paper introduces a novel 3D generation framework that +addresses these challenges, offering scalable, high-quality 3D generation with +an interactive Point Cloud-structured Latent space. Our framework employs a +Variational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal) +renderings as input, using a unique latent space design that preserves 3D shape +information, and incorporates a cascaded latent diffusion model for improved +shape-texture disentanglement. The proposed method, GaussianAnything, supports +multi-modal conditional 3D generation, allowing for point cloud, caption, and +single/multi-view image inputs. Notably, the newly proposed latent space +naturally enables geometry-texture disentanglement, thus allowing 3D-aware +editing. Experimental results demonstrate the effectiveness of our approach on +multiple datasets, outperforming existing methods in both text- and +image-conditioned 3D generation. + +
+
+ comment: project page: https://nirvanalan.github.io/projects/GA/ +
+
+
+
+
+ + ☆ Learning with Less: Knowledge Distillation from Large Language Models + via Unlabeled Data + + +
+ In real-world NLP applications, Large Language Models (LLMs) offer promising +solutions due to their extensive training on vast datasets. However, the large +size and high computation demands of LLMs limit their practicality in many +applications, especially when further fine-tuning is required. To address these +limitations, smaller models are typically preferred for deployment. However, +their training is hindered by the scarcity of labeled data. In contrast, +unlabeled data is often readily available, which can be leveraged by using LLMs to +generate pseudo-labels for training smaller models. This enables the smaller +models (student) to acquire knowledge from LLMs (teacher) while reducing +computational costs. This process introduces challenges, such as potential +noisy pseudo-labels. Selecting high-quality and informative data is therefore +critical to enhance model performance while improving the efficiency of data +utilization. To address this, we propose LLKD that enables Learning with Less +computational resources and less data for Knowledge Distillation from LLMs. +LLKD is an adaptive sample selection method that incorporates signals from both +the teacher and student. Specifically, it prioritizes samples where the teacher +demonstrates high confidence in its labeling, indicating reliable labels, and +where the student exhibits a high information need, identifying challenging +samples that require further learning. Our comprehensive experiments show that +LLKD achieves superior performance across various datasets with higher data +efficiency. + 
+
+
+
+
+ + ☆ LLMPhy: Complex Physical Reasoning Using Large Language Models and World + Models + + +
+ Physical reasoning is an important skill needed for robotic agents when +operating in the real world. However, solving such reasoning problems often +involves hypothesizing and reflecting over complex multi-body interactions +under the effect of a multitude of physical forces and thus learning all such +interactions poses a significant hurdle for state-of-the-art machine learning +frameworks, including large language models (LLMs). To study this problem, we +propose a new physical reasoning task and a dataset, dubbed TraySim. Our task +involves predicting the dynamics of several objects on a tray that is given an +external impact -- the domino effect of the ensued object interactions and +their dynamics thus offering a challenging yet controlled setup, with the goal +of reasoning being to infer the stability of the objects after the impact. To +solve this complex physical reasoning task, we present LLMPhy, a zero-shot +black-box optimization framework that leverages the physics knowledge and +program synthesis abilities of LLMs, and synergizes these abilities with the +world models built into modern physics engines. Specifically, LLMPhy uses an +LLM to generate code to iteratively estimate the physical hyperparameters of +the system (friction, damping, layout, etc.) via an implicit +analysis-by-synthesis approach using a (non-differentiable) simulator in the +loop and uses the inferred parameters to imagine the dynamics of the scene +towards solving the reasoning task. To show the effectiveness of LLMPhy, we +present experiments on our TraySim dataset to predict the steady-state poses of +the objects. Our results show that the combination of the LLM and the physics +engine leads to state-of-the-art zero-shot physical reasoning performance, +while demonstrating superior convergence against standard black-box +optimization methods and better estimation of the physical parameters. + +
+
+
+
+
+ + ☆ Leonardo vindicated: Pythagorean trees for minimal reconstruction of the + natural branching structures + + +
+ Trees continue to fascinate with their natural beauty and as engineering +masterpieces optimal with respect to several independent criteria. Pythagorean +tree is a well-known fractal design that realistically mimics the natural tree +branching structures. We study various types of Pythagorean-like fractal trees +with different shapes of the base, branching angles and relaxed scales in an +attempt to identify and explain which variants are the closest match to the +branching structures commonly observed in the natural world. Pursuing +simultaneously the realism and minimalism of the fractal tree model, we have +developed a flexibly parameterised and fast algorithm to grow and visually +examine deep Pythagorean-inspired fractal trees with the capability to orderly +over- or underestimate the Leonardo da Vinci's tree branching rule as well as +control various imbalances and branching angles. We tested the realism of the +generated fractal tree images by means of the classification accuracy of +detecting natural tree with the transfer-trained deep Convolutional Neural +Networks (CNNs). Having empirically established the parameters of the fractal +trees that maximize the CNN's natural tree class classification accuracy we +have translated them back to the scales and angles of branches and came to the +interesting conclusions that support the da Vinci branching rule and golden +ratio based scaling for both the shape of the branch and imbalance between the +child branches, and claim the flexibly parameterized fractal trees can be used +to generate artificial examples to train robust detectors of different species +of trees. + +
+
+ comment: 22 pages, lots of hi res figures I had to reduce quality of, + submitting as a requirement to the Theory of Computing Journal +
+
+
+
+
+ + ☆ Language Models as Causal Effect Generators + + +
+ We present a framework for large language model (LLM) based data generation +with controllable causal structure. In particular, we define a procedure for +turning any language model and any directed acyclic graph (DAG) into a +sequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM +is a causal model with user-defined structure and LLM-defined structural +equations. We characterize how an SD-SCM allows sampling from observational, +interventional, and counterfactual distributions according to the desired +causal structure. We then leverage this procedure to propose a new type of +benchmark for causal inference methods, generating individual-level +counterfactual data without needing to manually specify functional +relationships between variables. We create an example benchmark consisting of +thousands of datasets, and test a suite of popular estimation methods on these +datasets for average, conditional average, and individual treatment effect +estimation, both with and without hidden confounding. Apart from generating +data, the same procedure also allows us to test for the presence of a causal +effect that might be encoded in an LLM. This procedure can underpin auditing +LLMs for misinformation, discrimination, or otherwise undesirable behavior. We +believe SD-SCMs can serve as a useful tool in any application that would +benefit from sequential data with controllable causal structure. + +
+
+
+
+
+ + ☆ Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model + with Compact Wavelet Encodings + + +
+ Large-scale 3D generative models require substantial computational resources +yet often fall short in capturing fine details and complex geometries at high +resolutions. We attribute this limitation to the inefficiency of current +representations, which lack the compactness required to model the generative +models effectively. To address this, we introduce a novel approach called +Wavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based, +compact latent encodings. Specifically, we compress a $256^3$ signed distance +field into a $12^3 \times 4$ latent grid, achieving an impressive 2427x +compression ratio with minimal loss of detail. This high level of compression +allows our method to efficiently train large-scale generative networks without +increasing the inference time. Our models, both conditional and unconditional, +contain approximately one billion parameters and successfully generate +high-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid +inference, producing shapes within two to four seconds depending on the +condition, despite the model's scale. We demonstrate state-of-the-art +performance across multiple datasets, with significant improvements in +generation quality, diversity, and computational efficiency. We open-source our +code and, to the best of our knowledge, release the largest pretrained 3D +generative models across different modalities. + +
+
+
+
+
+ + ☆ Investigating the Effectiveness of Explainability Methods in Parkinson's + Detection from Speech + + +
+ Speech impairments in Parkinson's disease (PD) provide significant early +indicators for diagnosis. While models for speech-based PD detection have shown +strong performance, their interpretability remains underexplored. This study +systematically evaluates several explainability methods to identify PD-specific +speech features, aiming to support the development of accurate, interpretable +models for clinical decision-making in PD diagnosis and monitoring. Our +methodology involves (i) obtaining attributions and saliency maps using +mainstream interpretability techniques, (ii) quantitatively evaluating the +faithfulness of these maps and their combinations obtained via union and +intersection through a range of established metrics, and (iii) assessing the +information conveyed by the saliency maps for PD detection from an auxiliary +classifier. Our results reveal that, while explanations are aligned with the +classifier, they often fail to provide valuable information for domain experts. + +
+
+ comment: The first two authors contributed equally to this research: author + order is alphabetical +
+
+
+
+
+ + ☆ ExpressivityArena: Can LLMs Express Information Implicitly? + + +
+ While Large Language Models (LLMs) have demonstrated remarkable performance +in certain dimensions, their ability to express implicit language cues that +humans use for effective communication remains unclear. This paper presents +ExpressivityArena, a Python library for measuring the implicit communication +abilities of LLMs. We provide a comprehensive framework to evaluate +expressivity of arbitrary LLMs and explore its practical implications. To this +end, we refine the definition and measurements of ``expressivity,'' and use our +framework in a set of small experiments. These experiments test LLMs in +creative and logical tasks such as poetry, coding, and emotion-based responses. +They are then evaluated by an automated grader, through ExpressivityArena, +which we verify to be the most pragmatic for testing expressivity. Building on +these experiments, we deepen our understanding of the expressivity of LLMs by +assessing their ability to remain expressive in conversations. Our findings +indicate that LLMs are capable of generating and understanding expressive +content, however, with some limitations. These insights will inform the future +development and deployment of expressive LLMs. We provide the code for +ExpressivityArena alongside our paper. + 
+
+ comment: 8 pages, 22 figures +
+
+
+
+
+ + ☆ Can adversarial attacks by large language models be attributed? + + +
+ Attributing outputs from Large Language Models (LLMs) in adversarial +settings-such as cyberattacks and disinformation-presents significant +challenges that are likely to grow in importance. We investigate this +attribution problem using formal language theory, specifically language +identification in the limit as introduced by Gold and extended by Angluin. By +modeling LLM outputs as formal languages, we analyze whether finite text +samples can uniquely pinpoint the originating model. Our results show that due +to the non-identifiability of certain language classes, under some mild +assumptions about overlapping outputs from fine-tuned models it is +theoretically impossible to attribute outputs to specific LLMs with certainty. +This holds also when accounting for expressivity limitations of Transformer +architectures. Even with direct model access or comprehensive monitoring, +significant computational hurdles impede attribution efforts. These findings +highlight an urgent need for proactive measures to mitigate risks posed by +adversarial LLM use as their influence continues to expand. + +
+
+ comment: 7 pages, 1 figure +
+
+
+
+
+ + ☆ Derivational Morphology Reveals Analogical Generalization in Large + Language Models + + +
+ What mechanisms underlie linguistic generalization in large language models +(LLMs)? This question has attracted considerable attention, with most studies +analyzing the extent to which the language skills of LLMs resemble rules. As of +yet, it is not known whether linguistic generalization in LLMs could equally +well be explained as the result of analogical processes, which can be +formalized as similarity operations on stored exemplars. A key shortcoming of +prior research is its focus on linguistic phenomena with a high degree of +regularity, for which rule-based and analogical approaches make the same +predictions. Here, we instead examine derivational morphology, specifically +English adjective nominalization, which displays notable variability. We +introduce a new method for investigating linguistic generalization in LLMs: +focusing on GPT-J, we fit cognitive models that instantiate rule-based and +analogical learning to the LLM training data and compare their predictions on a +set of nonce adjectives with those of the LLM, allowing us to draw direct +conclusions regarding underlying mechanisms. As expected, rule-based and +analogical models explain the predictions of GPT-J equally well for adjectives +with regular nominalization patterns. However, for adjectives with variable +nominalization patterns, the analogical model provides a much better match. +Furthermore, GPT-J's behavior is sensitive to the individual word frequencies, +even for regular forms, a behavior that is consistent with an analogical +account of regular forms but not a rule-based one. These findings refute the +hypothesis that GPT-J's linguistic generalization on adjective nominalization +involves rules, suggesting similarity operations on stored exemplars as the +underlying mechanism. Overall, our study suggests that analogical processes +play a bigger role in the linguistic generalization of LLMs than previously +thought. + +
+
+
+
+
+ + ☆ Gini Coefficient as a Unified Metric for Evaluating Many-versus-Many + Similarity in Vector Spaces + + +
+ We demonstrate that Gini coefficients can be used as unified metrics to +evaluate many-versus-many (all-to-all) similarity in vector spaces. Our +analysis of various image datasets shows that images with the highest Gini +coefficients tend to be the most similar to one another, while images with the +lowest Gini coefficients are the least similar. We also show that this +relationship holds true for vectorized text embeddings from various corpuses, +highlighting the consistency of our method and its broad applicability across +different types of data. Additionally, we demonstrate that selecting machine +learning training samples that closely match the distribution of the testing +dataset is far more important than ensuring data diversity. Selection of +exemplary and iconic training samples with higher Gini coefficients leads to +significantly better model performance compared to simply having a diverse +training set with lower Gini coefficients. Thus, Gini coefficients can serve as +effective criteria for selecting machine learning training samples, with our +selection method outperforming random sampling methods in very sparse +information settings. + +
+
+
+
+
+ + ☆ Exact, Tractable Gauss-Newton Optimization in Deep Reversible + Architectures Reveal Poor Generalization NeurIPS 2024 + + +
+ Second-order optimization has been shown to accelerate the training of deep +neural networks in many applications, often yielding faster progress per +iteration on the training loss compared to first-order optimizers. However, the +generalization properties of second-order methods are still being debated. +Theoretical investigations have proved difficult to carry out outside the +tractable settings of heavily simplified model classes -- thus, the relevance +of existing theories to practical deep learning applications remains unclear. +Similarly, empirical studies in large-scale models and real datasets are +significantly confounded by the necessity to approximate second-order updates +in practice. It is often unclear whether the observed generalization behaviour +arises specifically from the second-order nature of the parameter updates, or +instead reflects the specific structured (e.g.\ Kronecker) approximations used +or any damping-based interpolation towards first-order updates. Here, we show +for the first time that exact Gauss-Newton (GN) updates take on a tractable +form in a class of deep reversible architectures that are sufficiently +expressive to be meaningfully applied to common benchmark datasets. We exploit +this novel setting to study the training and generalization properties of the +GN optimizer. We find that exact GN generalizes poorly. In the mini-batch +training setting, this manifests as rapidly saturating progress even on the +\emph{training} loss, with parameter updates found to overfit each +mini-batch without producing the features that would support generalization +to other mini-batches. We show that our experiments run in the ``lazy'' regime, +in which the neural tangent kernel (NTK) changes very little during the course +of training. This behaviour is associated with having no significant changes in +neural representations, explaining the lack of generalization. + 
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring + + +
+ Coronary artery disease (CAD) is one of the most common causes of mortality in +the world. Coronary artery calcium (CAC) scoring using computed tomography (CT) +is key for risk assessment to prevent coronary disease. Previous studies on +risk assessment and calcification detection in CT scans primarily use +approaches based on UNET architecture, frequently implemented on pre-built +models. However, these models are limited by the availability of annotated CT +scans containing CAC and suffer from imbalanced datasets, decreasing +performance of CAC segmentation and scoring. In this study, we extend this +approach by incorporating the self-supervised learning (SSL) technique of DINO +(self-distillation with no labels) to eliminate limitations of scarce annotated +data in CT scans. The DINO model's ability to train without requiring CAC area +annotations enhances its robustness in generating distinct features. The DINO +model is trained to focus specifically on calcified areas by using labels, +aiming to generate features that effectively capture and highlight key +characteristics. The label-guided DINO (DINO-LG) enhances classification by +distinguishing CT slices that contain calcification from those that do not, +performing 57% better than the standard DINO model in this task. CAC scoring +and segmentation tasks are performed by a basic U-NET architecture, fed +specifically with CT slices containing calcified areas as identified by the +DINO-LG model. This targeted identification performed by DINO-LG model improves +CAC segmentation performance by approximately 10% and significantly increases +CAC scoring accuracy. + 
+
+ comment: Developed by Center for Applied Artificial Intelligence (CAAI), + University of Kentucky +
+
+
+
+
+ + ☆ JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified + Multimodal Understanding and Generation + + +
+ We present JanusFlow, a powerful framework that unifies image understanding +and generation in a single model. JanusFlow introduces a minimalist +architecture that integrates autoregressive language models with rectified +flow, a state-of-the-art method in generative modeling. Our key finding +demonstrates that rectified flow can be straightforwardly trained within the +large language model framework, eliminating the need for complex architectural +modifications. To further improve the performance of our unified model, we +adopt two key strategies: (i) decoupling the understanding and generation +encoders, and (ii) aligning their representations during unified training. +Extensive experiments show that JanusFlow achieves comparable or superior +performance to specialized models in their respective domains, while +significantly outperforming existing unified approaches across standard +benchmarks. This work represents a step toward more efficient and versatile +vision-language models. + +
+
+
+
+
+ + ☆ How To Discover Short, Shorter, and the Shortest Proofs of + Unsatisfiability: A Branch-and-Bound Approach for Resolution Proof Length + Minimization + + +
+ Modern software for propositional satisfiability problems gives a powerful +automated reasoning toolkit, capable of outputting not only a +satisfiable/unsatisfiable signal but also a justification of unsatisfiability +in the form of resolution proof (or a more expressive proof), which is commonly +used for verification purposes. Empirically, modern SAT solvers produce +relatively short proofs, however, there are no inherent guarantees that these +proofs cannot be significantly reduced. This paper proposes a novel +branch-and-bound algorithm for finding the shortest resolution proofs; to this +end, we introduce a layer list representation of proofs that groups clauses by +their level of indirection. As we show, this representation breaks all +permutational symmetries, thereby improving upon the state-of-the-art +symmetry-breaking and informing the design of a novel workflow for proof +minimization. In addition to that, we design pruning procedures that reason on +proof length lower bound, clause subsumption, and dominance. Our experiments +suggest that the proofs from state-of-the-art solvers could be shortened by +30-60% on the instances from SAT Competition 2002 and by 25-50% on small +synthetic formulas. When treated as an algorithm for finding the shortest +proof, our approach solves twice as many instances as the previous work based +on SAT solving and reduces the time to optimality by orders of magnitude for +the instances solved by both approaches. + +
+
+ comment: 42 pages, 16 figures, 8 tables, submitted to Journal of Artificial + Intelligence Research +
+
+
+
+
+ + ☆ Towards Low-bit Communication for Tensor Parallel LLM Inference + + +
+ Tensor parallelism provides an effective way to increase server large +language model (LLM) inference efficiency despite adding an additional +communication cost. However, as server LLMs continue to scale in size, they +will need to be distributed across more devices, magnifying the communication +cost. One way to approach this problem is with quantization, but current +methods for LLMs tend to avoid quantizing the features that tensor parallelism +needs to communicate. Taking advantage of consistent outliers in communicated +features, we introduce a quantization method that reduces communicated values +on average from 16 bits to 4.2 bits while preserving nearly all of the original +performance. For instance, our method maintains around 98.0% and 99.5% of Gemma +2 27B's and Llama 2 13B's original performance, respectively, averaged across +all tasks we evaluated on. + +
+
+
+
+
+ + ☆ DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with + Generative Adversarial Networks + + +
+ Computed tomography (CT) provides highly detailed three-dimensional (3D) +medical images but is costly, time-consuming, and often inaccessible in +intraoperative settings (Organization et al. 2011). Recent advancements have +explored reconstructing 3D chest volumes from sparse 2D X-rays, such as +single-view or orthogonal double-view images. However, current models tend to +process 2D images in a planar manner, prioritizing visual realism over +structural accuracy. In this work, we introduce DuoLift Generative Adversarial +Networks (DuoLift-GAN), a novel architecture with dual branches that +independently elevate 2D images and their features into 3D representations. +These 3D outputs are merged into a unified 3D feature map and decoded into a +complete 3D chest volume, enabling richer 3D information capture. We also +present a masked loss function that directs reconstruction towards critical +anatomical regions, improving structural accuracy and visual quality. This +paper demonstrates that DuoLift-GAN significantly enhances reconstruction +accuracy while achieving superior visual realism compared to existing methods. + +
+
+
+
+
+ + ☆ Automatic dataset shift identification to support root cause analysis of + AI performance drift + + +
+ Shifts in data distribution can substantially harm the performance of +clinical AI models. Hence, various methods have been developed to detect the +presence of such shifts at deployment time. However, root causes of dataset +shifts are varied, and the choice of shift mitigation strategies is highly +dependent on the precise type of shift encountered at test time. As such, +detecting test-time dataset shift is not sufficient: precisely identifying +which type of shift has occurred is critical. In this work, we propose the +first unsupervised dataset shift identification framework, effectively +distinguishing between prevalence shift (caused by a change in the label +distribution), covariate shift (caused by a change in input characteristics) +and mixed shifts (simultaneous prevalence and covariate shifts). We discuss the +importance of self-supervised encoders for detecting subtle covariate shifts +and propose a novel shift detector leveraging both self-supervised encoders and +task model outputs for improved shift detection. We report promising results +for the proposed shift identification framework across three different imaging +modalities (chest radiography, digital mammography, and retinal fundus images) +on five types of real-world dataset shifts, using four large publicly available +datasets. + +
+
+ comment: Code available at + https://github.com/biomedia-mira/shift_identification +
+
+
+
+
+ + ☆ Doubly Mild Generalization for Offline Reinforcement Learning NeurIPS 2024 + + +
+ Offline Reinforcement Learning (RL) suffers from the extrapolation error and +value overestimation. From a generalization perspective, this issue can be +attributed to the over-generalization of value functions or policies towards +out-of-distribution (OOD) actions. Significant efforts have been devoted to +mitigating such generalization, and recent in-sample learning approaches have +further succeeded in entirely eschewing it. Nevertheless, we show that mild +generalization beyond the dataset can be trusted and leveraged to improve +performance under certain conditions. To appropriately exploit generalization +in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild +action generalization and (ii) mild generalization propagation. The former +refers to selecting actions in a close neighborhood of the dataset to maximize +the Q values. Even so, the potential erroneous generalization can still be +propagated, accumulated, and exacerbated by bootstrapping. In light of this, +the latter concept is introduced to mitigate the generalization propagation +without impeding the propagation of RL learning signals. Theoretically, DMG +guarantees better performance than the in-sample optimal policy in the oracle +generalization scenario. Even under worst-case generalization, DMG can still +control value overestimation at a certain level and lower bound the +performance. Empirically, DMG achieves state-of-the-art performance across +Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting +from its flexibility in both generalization aspects, DMG enjoys a seamless +transition from offline to online learning and attains strong online +fine-tuning performance. + +
+
+ comment: Accepted to NeurIPS 2024. arXiv admin note: substantial text overlap + with arXiv:2410.19400 +
+
+
+
+
+ + ☆ INTRABENCH: Interactive Radiological Benchmark + + +
+ Current interactive segmentation approaches, inspired by the success of +META's Segment Anything model, have achieved notable advancements, however, +they come with substantial limitations that hinder their practical application +in real clinical scenarios. These include unrealistic human interaction +requirements, such as slice-by-slice operations for 2D models on 3D data, a +lack of iterative refinement, and insufficient evaluation experiments. These +shortcomings prevent accurate assessment of model performance and lead to +inconsistent outcomes across studies. IntRaBench overcomes these challenges by +offering a comprehensive and reproducible framework for evaluating interactive +segmentation methods in realistic, clinically relevant scenarios. It includes +diverse datasets, target structures, and segmentation models, and provides a +flexible codebase that allows seamless integration of new models and prompting +strategies. Additionally, we introduce advanced techniques to minimize +clinician interaction, ensuring fair comparisons between 2D and 3D models. By +open-sourcing IntRaBench, we invite the research community to integrate their +models and prompting techniques, ensuring continuous and transparent evaluation +of interactive segmentation models in 3D medical imaging. + +
+
+ comment: Undergoing Peer-Review +
+
+
+
+
+ + ☆ Diverse capability and scaling of diffusion and auto-regressive models + when learning abstract rules NeurIPS2024 + + +
+ Humans excel at discovering regular structures from limited samples and +applying inferred rules to novel settings. We investigate whether modern +generative models can similarly learn underlying rules from finite samples and +perform reasoning through conditional sampling. Inspired by Raven's Progressive +Matrices task, we designed the GenRAVEN dataset, where each sample consists of +three rows, and one of 40 relational rules governing the object position, +number, or attributes applies to all rows. We trained generative models to +learn the data distribution, where samples are encoded as integer arrays to +focus on rule learning. We compared two generative model families: diffusion +(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their +ability to generate structurally consistent samples and perform panel +completion via unconditional and conditional sampling. We found diffusion +models excel at unconditional generation, producing more novel and consistent +samples from scratch and memorizing less, but performing less well in panel +completion, even with advanced conditional sampling methods. Conversely, +autoregressive models excel at completing missing panels in a rule-consistent +manner but generate less consistent samples unconditionally. We observe diverse +data scaling behaviors: for both model families, rule learning emerges at a +certain dataset size - around 1000s of examples per rule. With more training data, +diffusion models improve both their unconditional and conditional generation +capabilities. However, for autoregressive models, while panel completion +improves with more training data, unconditional generation consistency +declines. Our findings highlight complementary capabilities and limitations of +diffusion and autoregressive models in rule learning and reasoning tasks, +suggesting avenues for further research into their mechanisms and potential for +human-like reasoning. + +
+
+ comment: 12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2 + Reasoning At Scale as long paper +
+
+
+
+
+ + ☆ Leveraging Multimodal Models for Enhanced Neuroimaging Diagnostics in + Alzheimer's Disease + + +
+ The rapid advancements in Large Language Models (LLMs) and Vision-Language +Models (VLMs) have shown great potential in medical diagnostics, particularly +in radiology, where datasets such as X-rays are paired with human-generated +diagnostic reports. However, a significant research gap exists in the +neuroimaging field, especially for conditions such as Alzheimer's disease, due +to the lack of comprehensive diagnostic reports that can be utilized for model +fine-tuning. This paper addresses this gap by generating synthetic diagnostic +reports using GPT-4o-mini on structured data from the OASIS-4 dataset, which +comprises 663 patients. Using the synthetic reports as ground truth for +training and validation, we then generated neurological reports directly from +the images in the dataset leveraging the pre-trained BiomedCLIP and T5 models. +Our proposed method achieved a BLEU-4 score of 0.1827, ROUGE-L score of 0.3719, +and METEOR score of 0.4163, revealing its potential in generating clinically +relevant and accurate diagnostic reports. + +
+
+ comment: The paper has been accepted by the conference: "2024 International + Conference on Big Data (IEEE Big Data 2024)" +
+
+
+
+
+ + ☆ Trustful LLMs: Customizing and Grounding Text Generation with Knowledge + Bases and Dual Decoders + + +
+ Although people are impressed by the content generation skills of large +language models, the use of LLMs, such as ChatGPT, is limited by the domain +grounding of the content. The correctness and groundedness of the generated +content need to be based on a verified context, such as results from +Retrieval-Augmented Generation (RAG). One important issue when adapting LLMs to +a customized domain is that the generated responses are often incomplete, or +the additions are not verified and may even be hallucinated. Prior studies on +hallucination detection have focused on evaluation metrics, which are not +easily adaptable to dynamic domains and can be vulnerable to attacks like +jail-breaking. In this work, we propose 1) a post-processing algorithm that +leverages knowledge triplets in RAG context to correct hallucinations and 2) a +dual-decoder model that fuses RAG context to guide the generation process. + +
+
+
+
+
+ + ☆ Tucano: Advancing Neural Text Generation for Portuguese + + +
+ Significant advances have been made in natural language processing in recent +years. However, our current deep learning approach to language modeling +requires substantial resources in terms of data and computation. One of the +side effects of this data-hungry paradigm is the current schism between +languages, separating those considered high-resource, where most of the +development happens and resources are available, and the low-resource ones, +which struggle to attain the same level of performance and autonomy. This study +aims to introduce a new set of resources to stimulate the future development of +neural text generation in Portuguese. In this work, we document the development +of GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting +to 200 billion tokens. Via this corpus, we trained a series of +decoder-transformers named Tucano. Our models perform equal or superior to +other Portuguese and multilingual language models of similar size in several +Portuguese benchmarks. The evaluation of our models also reveals that model +performance on many currently available benchmarks used by the Portuguese NLP +community has little to no correlation with the scaling of token ingestion +during training, highlighting the limitations of such evaluations when it comes +to the assessment of Portuguese generative language models. All derivatives of +our study are openly released on GitHub and Hugging Face. See +https://nkluge-correa.github.io/Tucano/ + +
+
+
+
+
+ + ☆ IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems + + +
+ Adversarial examples, which are inputs deliberately perturbed with +imperceptible changes to induce model errors, have raised serious concerns for +the reliability and security of deep neural networks (DNNs). While adversarial +attacks have been extensively studied in continuous data domains such as +images, the discrete nature of text presents unique challenges. In this paper, +we propose Irony-based Adversarial Examples (IAE), a method that transforms +straightforward sentences into ironic ones to create adversarial text. This +approach exploits the rhetorical device of irony, where the intended meaning is +opposite to the literal interpretation, requiring a deeper understanding of +context to detect. The IAE method is particularly challenging due to the need +to accurately locate evaluation words, substitute them with appropriate +collocations, and expand the text with suitable ironic elements while +maintaining semantic coherence. Our research makes the following key +contributions: (1) We introduce IAE, a strategy for generating textual +adversarial examples using irony. This method does not rely on pre-existing +irony corpora, making it a versatile tool for creating adversarial text in +various NLP tasks. (2) We demonstrate that the performance of several +state-of-the-art deep learning models on sentiment analysis tasks significantly +deteriorates when subjected to IAE attacks. This finding underscores the +susceptibility of current NLP systems to adversarial manipulation through +irony. (3) We compare the impact of IAE on human judgment versus NLP systems, +revealing that humans are less susceptible to the effects of irony in text. + +
+
+
+
+
+ + ☆ Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics + Statements + + +
+ What ethical concerns, if any, do LLM researchers have? We introduce EthiCon, +a corpus of 1,580 ethical concern statements extracted from scientific papers +published in the ACL Anthology. We extract ethical concern keywords from the +statements and show promising results in automating the concern identification +process. Through a survey, we compare the ethical concerns of the corpus to the +concerns listed by the general public and professionals in the field. Finally, +we compare our retrieved ethical concerns with existing taxonomies pointing to +gaps and future research directions. + +
+
+
+
+
+ + ☆ Chain Association-based Attacking and Shielding Natural Language + Processing Systems + + +
+ Association, as a gift, enables people to avoid mentioning something in +completely straightforward words and allows others to understand what they +intend to refer to. In this paper, we propose a chain association-based +adversarial attack against natural language processing systems, utilizing the +comprehension gap between humans and machines. We first generate a chain +association graph for Chinese characters based on the association paradigm for +building search space of potential adversarial examples. Then, we introduce a +discrete particle swarm optimization algorithm to search for the optimal +adversarial examples. We conduct comprehensive experiments and show that +advanced natural language processing models and applications, including large +language models, are vulnerable to our attack, while humans appear good at +understanding the perturbed text. We also explore two methods, including +adversarial training and associative graph-based recovery, to shield systems +from chain association-based attack. Since a few examples use some +derogatory terms, this paper contains materials that may be offensive or +upsetting to some people. + +
+
+
+
+
+ + ☆ Federated Learning for Discrete Optimal Transport with Large Population + under Incomplete Information + + +
+ Optimal transport is a powerful framework for the efficient allocation of +resources between sources and targets. However, traditional models often +struggle to scale effectively in the presence of large and heterogeneous +populations. In this work, we introduce a discrete optimal transport framework +designed to handle large-scale, heterogeneous target populations, characterized +by type distributions. We address two scenarios: one where the type +distribution of targets is known, and one where it is unknown. For the known +distribution, we propose a fully distributed algorithm to achieve optimal +resource allocation. In the case of unknown distribution, we develop a +federated learning-based approach that enables efficient computation of the +optimal transport scheme while preserving privacy. Case studies are provided to +evaluate the performance of our learning algorithm. + +
+
+
+
+
+ + ☆ Efficient Federated Finetuning of Tiny Transformers with + Resource-Constrained Devices + + +
+ In recent years, Large Language Models (LLMs) through Transformer structures +have dominated many machine learning tasks, especially text processing. +However, these models require massive amounts of data for training and induce +high resource requirements, particularly in terms of the large number of +Floating Point Operations (FLOPs) and the high amounts of memory needed. To +fine-tune such a model in a parameter-efficient way, techniques like Adapter or +LoRA have been developed. However, we observe that the application of LoRA, +when used in federated learning (FL), while still being parameter-efficient, is +memory and FLOP inefficient. Based on that observation, we develop a novel +layer finetuning scheme that allows devices in cross-device FL to make use of +pretrained neural networks (NNs) while adhering to given resource constraints. +We show that our presented scheme outperforms the current state of the art when +dealing with homogeneous or heterogeneous computation and memory constraints +and is on par with LoRA regarding limited communication, thereby achieving +significantly higher accuracies in FL training. + +
+
+
+
+
+ + ☆ PatchCTG: Patch Cardiotocography Transformer for Antepartum Fetal Health + Monitoring + + +
+ Antepartum Cardiotocography (CTG) is vital for fetal health monitoring, but +traditional methods like the Dawes-Redman system are often limited by high +inter-observer variability, leading to inconsistent interpretations and +potential misdiagnoses. This paper introduces PatchCTG, a transformer-based +model specifically designed for CTG analysis, employing patch-based +tokenisation, instance normalisation and channel-independent processing to +capture essential local and global temporal dependencies within CTG signals. +PatchCTG was evaluated on the Oxford Maternity (OXMAT) dataset, comprising over +20,000 CTG traces across diverse clinical outcomes after applying the inclusion +and exclusion criteria. With extensive hyperparameter optimisation, PatchCTG +achieved an AUC of 77%, with specificity of 88% and sensitivity of 57% at +Youden's index threshold, demonstrating adaptability to various clinical needs. +Testing across varying temporal thresholds showed robust predictive +performance, particularly with finetuning on data closer to delivery, achieving +a sensitivity of 52% and specificity of 88% for near-delivery cases. These +findings suggest the potential of PatchCTG to enhance clinical decision-making +in antepartum care by providing a reliable, objective tool for fetal health +assessment. The source code is available at +https://github.com/jaleedkhan/PatchCTG. + +
+
+
+
+
+ + ☆ RedCode: Risky Code Execution and Generation Benchmark for Code Agents NeurIPS 2024 + + +
+ With the rapidly increasing capabilities and adoption of code agents for +AI-assisted coding, safety concerns, such as generating or executing risky +code, have become significant barriers to the real-world deployment of these +agents. To provide comprehensive and practical evaluations on the safety of +code agents, we propose RedCode, a benchmark for risky code execution and +generation: (1) RedCode-Exec provides challenging prompts that could lead to +risky code execution, aiming to evaluate code agents' ability to recognize and +handle unsafe code. We provide a total of 4,050 risky test cases in Python and +Bash tasks with diverse input formats including code snippets and natural text. +They cover 25 types of critical vulnerabilities spanning 8 domains (e.g., +websites, file systems). We provide Docker environments and design +corresponding evaluation metrics to assess their execution results. (2) +RedCode-Gen provides 160 prompts with function signatures and docstrings as +input to assess whether code agents will follow instructions to generate +harmful code or software. Our empirical findings, derived from evaluating three +agent frameworks based on 19 LLMs, provide insights into code agents' +vulnerabilities. For instance, evaluations on RedCode-Exec show that agents are +more likely to reject executing risky operations on the operating system, but +are less likely to reject executing technically buggy code, indicating high +risks. Risky operations described in natural text lead to a lower rejection +rate than those in code format. Additionally, evaluations on RedCode-Gen show +that more capable base models and agents with stronger overall coding +abilities, such as GPT4, tend to produce more sophisticated and effective +harmful software. Our findings highlight the need for stringent safety +evaluations for diverse code agents. Our dataset and code are available at +https://github.com/AI-secure/RedCode. + +
+
+ comment: Accepted by NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Likelihood as a Performance Gauge for Retrieval-Augmented Generation NAACL 2025 + + +
+ Recent work finds that retrieval-augmented generation with large language +models is prone to be influenced by the order of retrieved documents in the +context. However, the lack of in-depth analysis limits the use of this +phenomenon for prompt engineering in practice. In this study, we posit that +likelihoods serve as an effective gauge for language model performance. Through +experiments on two question-answering datasets with a variety of +state-of-the-art language models, we reveal correlations between answer +accuracy and the likelihood of the question at both the corpus level and the +instance level. In addition, we find that question likelihood can also indicate +the position of the task-relevant information in the context. Based on these +findings, we propose two methods that use question likelihood as a gauge for +selecting and constructing prompts that lead to better performance. We +demonstrate their effectiveness with experiments. In addition, our +likelihood-based methods are efficient, as they only need to compute the +likelihood of the input, requiring much fewer language model passes than +heuristic prompt engineering methods that require generating responses. Our +analysis deepens our understanding of how input prompts affect model +performance and provides a promising direction for efficient prompt +optimization. + +
+
+ comment: Under review at NAACL 2025. Code is available at + https://github.com/lyutyuh/poptimizer +
+
+
+
+
+ + ☆ Automatic Album Sequencing + + +
+ Album sequencing is a critical part of the album production process. +Recently, a data-driven approach was proposed that sequences general +collections of independent media by extracting the narrative essence of the +items in the collections. While this approach implies an album sequencing +technique, it is not widely accessible to a less technical audience, requiring +advanced knowledge of machine learning techniques to use. To address this, we +introduce a new user-friendly web-based tool that allows a less technical +audience to upload music tracks, execute this technique in one click, and +subsequently presents the result in a clean visualization to the user. To both +increase the number of templates available to the user and address shortcomings +of previous work, we also introduce a new direct transformer-based album +sequencing method. We find that our more direct method outperforms a random +baseline but does not reach the same performance as the narrative essence +approach. Both methods are included in our web-based user interface, and this +-- alongside a full copy of our implementation -- is publicly available at +https://github.com/dylanashley/automatic-album-sequencing + +
+
+ comment: presented as a late breaking demo in the 25th International Society + for Music Information Retrieval Conference; 3 pages in main text, 3 figures + in main text; source code available at + https://github.com/dylanashley/automatic-album-sequencing +
+
+
+
+
+ + ☆ Spider 2.0: Evaluating Language Models on Real-World Enterprise + Text-to-SQL Workflows + + +
+ Real-world enterprise text-to-SQL workflows often involve complex cloud or +local data across various database systems, multiple SQL queries in various +dialects, and diverse operations from data transformation to analytics. We +introduce Spider 2.0, an evaluation framework comprising 632 real-world +text-to-SQL workflow problems derived from enterprise-level database use cases. +The databases in Spider 2.0 are sourced from real data applications, often +containing over 1,000 columns and stored in local or cloud database systems +such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 +frequently requires understanding and searching through database metadata, +dialect documentation, and even project-level codebases. This challenge calls +for models to interact with complex SQL workflow environments, process +extremely long contexts, perform intricate reasoning, and generate multiple SQL +queries with diverse operations, often exceeding 100 lines, which goes far +beyond traditional text-to-SQL challenges. Our evaluations indicate that based +on o1-preview, our code agent framework successfully solves only 17.0% of the +tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on +Spider 2.0 show that while language models have demonstrated remarkable +performance in code generation -- especially in prior text-to-SQL benchmarks -- +they require significant improvement in order to achieve adequate performance +for real-world enterprise usage. Progress on Spider 2.0 represents crucial +steps towards developing intelligent, autonomous, code agents for real-world +enterprise settings. Our code, baseline models, and data are available at +https://spider2-sql.github.io. + +
+
+
+
+
+ + ☆ ASER: Activation Smoothing and Error Reconstruction for Large Language + Model Quantization + + +
+ Quantization stands as a pivotal technique for large language model (LLM) +serving, yet it poses significant challenges particularly in achieving +effective low-bit quantization. The limited numerical mapping makes the +quantized model produce a non-trivial error, bringing out intolerable +performance degradation. This paper is anchored in the basic idea of model +compression objectives, and delves into the layer-wise error distribution of +LLMs during post-training quantization. Subsequently, we introduce ASER, an +algorithm consisting of (1) Error Reconstruction: low-rank compensation for +quantization error with LoRA-style matrices constructed by whitening SVD; (2) +Activation Smoothing: outlier extraction to gain smooth activation and better +error compensation. ASER is capable of quantizing typical LLMs to low-bit ones, +particularly preserving accuracy even in W4A8 per-channel setup. Experimental +results show that ASER is competitive among the state-of-the-art quantization +algorithms, showing potential to activation quantization, with minor overhead. + +
+
+
+
+
+ + ☆ Navigation with QPHIL: Quantizing Planner for Hierarchical Implicit + Q-Learning + + +
+ Offline Reinforcement Learning (RL) has emerged as a powerful alternative to +imitation learning for behavior modeling in various domains, particularly in +complex navigation tasks. An existing challenge with Offline RL is the +signal-to-noise ratio, i.e. how to mitigate incorrect policy updates due to +errors in value estimates. Towards this, multiple works have demonstrated the +advantage of hierarchical offline RL methods, which decouples high-level path +planning from low-level path following. In this work, we present a novel +hierarchical transformer-based approach leveraging a learned quantizer of the +space. This quantization enables the training of a simpler zone-conditioned +low-level policy and simplifies planning, which is reduced to discrete +autoregressive prediction. Among other benefits, zone-level reasoning in +planning enables explicit trajectory stitching rather than implicit stitching +based on noisy value function estimates. By combining this transformer-based +planner with recent advancements in offline RL, our proposed approach achieves +state-of-the-art results in complex long-distance navigation environments. + +
+
+ comment: Under review. Code will be released upon acceptance +
+
+
+
+
+ + ☆ Optimizing Traffic Signal Control using High-Dimensional State + Representation and Efficient Deep Reinforcement Learning + + +
+ In reinforcement learning-based (RL-based) traffic signal control (TSC), +decisions on the signal timing are made based on the available information on +vehicles at a road intersection. This forms the state representation for the RL +environment which can either be high-dimensional containing several variables +or a low-dimensional vector. Current studies suggest that using high +dimensional state representations does not lead to improved performance on TSC. +However, we argue, with experimental results, that the use of high dimensional +state representations can, in fact, lead to improved TSC performance with +improvements up to 17.9% of the average waiting time. This high-dimensional +representation is obtainable using the cost-effective vehicle-to-infrastructure +(V2I) communication, encouraging its adoption for TSC. Additionally, given the +large size of the state, we identified the need to have computationally efficient +models and explored model compression via pruning. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State + Space Model + + +
+ Speech enhancement plays an essential role in various applications, and the +integration of visual information has been demonstrated to bring substantial +advantages. However, the majority of current research concentrates on the +examination of facial and lip movements, which can be compromised or entirely +inaccessible in scenarios where occlusions occur or when the camera view is +distant. Whereas contextual visual cues from the surrounding environment have +been overlooked: for example, when we see a dog bark, our brain has the innate +ability to discern and filter out the barking noise. To this end, in this +paper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is +the first proposal to use rich contextual information from synchronized video +as auxiliary cues to indicate the type of noise, which eventually improves the +speech enhancement performance. Specifically, we propose the VC-S$^2$E method, +which incorporates the Conformer and Mamba modules for their complementary +strengths. Extensive experiments are conducted on public MUSIC, AVSpeech and +AudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E +over other competitive methods. We will make the source code publicly +available. Project demo page: https://AVSEPage.github.io/ + +
+
+
+
+
+ + ☆ Unlocking Legal Knowledge with Multi-Layered Embedding-Based Retrieval + + +
+ This work addresses the challenge of capturing the complexities of legal +knowledge by proposing a multi-layered embedding-based retrieval method for +legal and legislative texts. Creating embeddings not only for individual +articles but also for their components (paragraphs, clauses) and structural +groupings (books, titles, chapters, etc), we seek to capture the subtleties of +legal information through the use of dense vectors of embeddings, representing +it at varying levels of granularity. Our method meets various information needs +by allowing the Retrieval Augmented Generation system to provide accurate +responses, whether for specific segments or entire sections, tailored to the +user's query. We explore the concepts of aboutness, semantic chunking, and +inherent hierarchy within legal texts, arguing that this method enhances the +legal information retrieval. Despite the focus being on Brazil's legislative +methods and the Brazilian Constitution, which follow a civil law tradition, our +findings should in principle be applicable across different legal systems, +including those adhering to common law traditions. Furthermore, the principles +of the proposed method extend beyond the legal domain, offering valuable +insights for organizing and retrieving information in any field characterized +by information encoded in hierarchical text. + +
+
+ comment: 27 pages, 10 figures +
+
+
+
+
+ + ☆ No-Reference Point Cloud Quality Assessment via Graph Convolutional + Network + + +
+ Three-dimensional (3D) point cloud, as an emerging visual media format, is +increasingly favored by consumers as it can provide more realistic visual +information than two-dimensional (2D) data. Similar to 2D plane images and +videos, point clouds inevitably suffer from quality degradation and information +loss through multimedia communication systems. Therefore, automatic point cloud +quality assessment (PCQA) is of critical importance. In this work, we propose a +novel no-reference PCQA method by using a graph convolutional network (GCN) to +characterize the mutual dependencies of multi-view 2D projected image contents. +The proposed GCN-based PCQA (GC-PCQA) method contains three modules, i.e., +multi-view projection, graph construction, and GCN-based quality prediction. +First, multi-view projection is performed on the test point cloud to obtain a +set of horizontally and vertically projected images. Then, a +perception-consistent graph is constructed based on the spatial relations among +different projected images. Finally, reasoning on the constructed graph is +performed by GCN to characterize the mutual dependencies and interactions +between different projected images, and aggregate feature information of +multi-view projected images for final quality prediction. Experimental results +on two publicly available benchmark databases show that our proposed GC-PCQA +can achieve superior performance than state-of-the-art quality assessment +metrics. The code will be available at: https://github.com/chenwuwq/GC-PCQA. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Is Cognition consistent with Perception? Assessing and Mitigating + Multimodal Knowledge Conflicts in Document Understanding + + +
+ Multimodal large language models (MLLMs) have shown impressive capabilities +in document understanding, a rapidly growing research area with significant +industrial demand in recent years. As a multimodal task, document understanding +requires models to possess both perceptual and cognitive abilities. However, +current MLLMs often face conflicts between perception and cognition. Taking a +document VQA task (cognition) as an example, an MLLM might generate answers +that do not match the corresponding visual content identified by its OCR +(perception). This conflict suggests that the MLLM might struggle to establish +an intrinsic connection between the information it "sees" and what it +"understands." Such conflicts challenge the intuitive notion that cognition is +consistent with perception, hindering the performance and explainability of +MLLMs. In this paper, we define the conflicts between cognition and perception +as Cognition and Perception (C&P) knowledge conflicts, a form of multimodal +knowledge conflicts, and systematically assess them with a focus on document +understanding. Our analysis reveals that even GPT-4o, a leading MLLM, achieves +only 68.6% C&P consistency. To mitigate the C&P knowledge conflicts, we propose +a novel method called Multimodal Knowledge Consistency Fine-tuning. This method +first ensures task-specific consistency and then connects the cognitive and +perceptual knowledge. Our method significantly reduces C&P knowledge conflicts +across all tested MLLMs and enhances their performance in both cognitive and +perceptual tasks in most scenarios. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Training Data for Large Language Model + + +
+ In 2022, with the release of ChatGPT, large-scale language models gained +widespread attention. ChatGPT not only surpassed previous models in terms of +parameters and the scale of its pretraining corpus but also achieved +revolutionary performance improvements through fine-tuning on a vast amount of +high-quality, human-annotated data. This progress has led enterprises and +research institutions to recognize that building smarter and more powerful +models relies on rich and high-quality datasets. Consequently, the construction +and optimization of datasets have become a critical focus in the field of +artificial intelligence. This paper summarizes the current state of pretraining +and fine-tuning data for training large-scale language models, covering aspects +such as data scale, collection methods, data types and characteristics, +processing workflows, and provides an overview of available open-source +datasets. + +
+
+ comment: in Chinese language +
+
+
+
+
+ + ☆ New Emerged Security and Privacy of Pre-trained Model: a Survey and + Outlook + + +
+ Thanks to the explosive growth of data and the development of computational +resources, it is possible to build pre-trained models that can achieve +outstanding performance on various tasks, such as neural language processing, +computer vision, and more. Despite their powerful capabilities, pre-trained +models have also sparked attention to the emerging security challenges +associated with their real-world applications. Security and privacy issues, +such as leaking privacy information and generating harmful responses, have +seriously undermined users' confidence in these powerful models. Concerns are +growing as model performance improves dramatically. Researchers are eager to +explore the unique security and privacy issues that have emerged, their +distinguishing factors, and how to defend against them. However, the current +literature lacks a clear taxonomy of emerging attacks and defenses for +pre-trained models, which hinders a high-level and comprehensive understanding +of these questions. To fill the gap, we conduct a systematic survey on the +security risks of pre-trained models, proposing a taxonomy of attack and +defense methods based on the accessibility of pre-trained models' input and +weights in various security test scenarios. This taxonomy categorizes attacks +and defenses into No-Change, Input-Change, and Model-Change approaches. With +the taxonomy analysis, we capture the unique security and privacy issues of +pre-trained models, categorizing and summarizing existing security issues based +on their characteristics. In addition, we offer a timely and comprehensive +review of each category's strengths and limitations. Our survey concludes by +highlighting potential new research opportunities in the security and privacy +of pre-trained models. + +
+
+
+
+
+ + ☆ World Models: The Safety Perspective + + +
+ With the proliferation of the Large Language Model (LLM), the concept of +World Models (WM) has recently attracted a great deal of attention in the AI +research community, especially in the context of AI agents. It is arguably +evolving into an essential foundation for building AI agent systems. A WM is +intended to help the agent predict the future evolution of environmental states +or help the agent fill in missing information so that it can plan its actions +and behave safely. The safety property of WM plays a key role in their +effective use in critical applications. In this work, we review and analyze the +impacts of the current state-of-the-art in WM technology from the point of view +of trustworthiness and safety based on a comprehensive survey and the fields of +application envisaged. We provide an in-depth analysis of state-of-the-art WMs +and derive technical research challenges and their impact in order to call on +the research community to collaborate on improving the safety and +trustworthiness of WM. + +
+
+ comment: 8 pages, 3 figures, accepted at the International Workshop on + Dependability Modeling and Design (WDMD) during the IEEE International + Symposium on Software Reliability Engineering (ISSRE) +
+
+
+
+
+ + ☆ Enhancing Ultra High Resolution Remote Sensing Imagery Analysis with + ImageRAG + + +
+ Ultra High Resolution (UHR) remote sensing imagery (RSI) (e.g. 100,000 +$\times$ 100,000 pixels or more) poses a significant challenge for current +Remote Sensing Multimodal Large Language Models (RSMLLMs). If we choose to resize +the UHR image to standard input image size, the extensive spatial and +contextual information that UHR images contain will be neglected. Otherwise, +the original size of these images often exceeds the token limits of standard +RSMLLMs, making it difficult to process the entire image and capture long-range +dependencies to answer the query based on the abundant visual context. In this +paper, we introduce ImageRAG for RS, a training-free framework to address the +complexities of analyzing UHR remote sensing imagery. By transforming the UHR +remote sensing image analysis task into an image long-context selection task, we +design an innovative image contextual retrieval mechanism based on the +Retrieval-Augmented Generation (RAG) technique, denoted as ImageRAG. ImageRAG's +core innovation lies in its ability to selectively retrieve and focus on the +most relevant portions of the UHR image as visual contexts that pertain to a +given query. Fast path and slow path are proposed in this framework to handle +this task efficiently and effectively. ImageRAG allows RSMLLMs to manage +extensive context and spatial information from UHR RSI, ensuring the analysis +is both accurate and efficient. + +
+
+
+
+
+ + ☆ Data-Driven Graph Switching for Cyber-Resilient Control in Microgrids + + +
+ Distributed microgrids are conventionally dependent on communication networks +to achieve secondary control objectives. This dependence makes them vulnerable +to stealth data integrity attacks (DIAs) where adversaries may perform +manipulations via infected transmitters and repeaters to jeopardize stability. +This paper presents a physics-guided, supervised Artificial Neural Network +(ANN)-based framework that identifies communication-level cyberattacks in +microgrids by analyzing whether incoming measurements will cause abnormal +behavior of the secondary control layer. If abnormalities are detected, an +iteration through possible spanning tree graph topologies that can be used to +fulfill secondary control objectives is done. Then, a communication network +topology that would not create secondary control abnormalities is identified +and enforced for maximum stability. By altering the communication graph +topology, the framework eliminates the dependence of the secondary control +layer on inputs from compromised cyber devices helping it achieve resilience +without instability. Several case studies are provided showcasing the +robustness of the framework against False Data Injections and repeater-level +Man-in-the-Middle attacks. To understand practical feasibility, robustness is +also verified against larger microgrid sizes and in the presence of varying +noise levels. Our findings indicate that performance can be affected when +attempting scalability in the presence of noise. However, the framework +operates robustly in low-noise settings. + +
+
+ comment: Accepted in IEEE Design Methodologies Conference (DMC) 2024 +
+
+
+
+
+ + ☆ Fast Disentangled Slim Tensor Learning for Multi-view Clustering + + +
+ Tensor-based multi-view clustering has recently received significant +attention due to its exceptional ability to explore cross-view high-order +correlations. However, most existing methods still encounter some limitations. +(1) Most of them explore the correlations among different affinity matrices, +making them unscalable to large-scale data. (2) Although some methods address +it by introducing bipartite graphs, they may result in sub-optimal solutions +caused by an unstable anchor selection process. (3) They generally ignore the +negative impact of latent semantic-unrelated information in each view. To +tackle these issues, we propose a new approach termed fast Disentangled Slim +Tensor Learning (DSTL) for multi-view clustering. Instead of focusing on the +multi-view graph structures, DSTL directly explores the high-order correlations +among multi-view latent semantic representations based on matrix factorization. +To alleviate the negative influence of feature redundancy, inspired by robust +PCA, DSTL disentangles the latent low-dimensional representation into a +semantic-unrelated part and a semantic-related part for each view. +Subsequently, two slim tensors are constructed with tensor-based +regularization. To further enhance the quality of feature disentanglement, the +semantic-related representations are aligned across views through a consensus +alignment indicator. Our proposed model is computationally efficient and can be +solved effectively. Extensive experiments demonstrate the superiority and +efficiency of DSTL over state-of-the-art approaches. The code of DSTL is +available at https://github.com/dengxu-nju/DSTL. + +
+
+ comment: 13 pages, 6 figures, will be published to IEEE TMM +
+
+
+
+
+ + ☆ AI enhanced diagnosis of Peyronies disease a novel approach using + Computer Vision + + +
+ This study presents an innovative AI-driven tool for diagnosing Peyronie's +Disease (PD), a condition that affects between 0.3% and 13.1% of men worldwide. +Our method uses key point detection on both images and videos to measure penile +curvature angles, utilizing advanced computer vision techniques. This tool has +demonstrated high accuracy in identifying anatomical landmarks, validated +against conventional goniometer measurements. Traditional PD diagnosis often +involves subjective and invasive methods, which can lead to patient discomfort +and inaccuracies. Our approach offers a precise, reliable, and non-invasive +diagnostic tool to address these drawbacks. The model distinguishes between PD +and normal anatomical changes with a sensitivity of 96.7% and a specificity of +100%. This advancement represents a significant improvement in urological +diagnostics, greatly enhancing the efficacy and convenience of PD assessment +for healthcare providers and patients. + +
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Spike Talk in Power Electronic Grids -- Leveraging Post Moore's + Computing Laws + + +
+ Emerging distributed generation demands highly reliable and resilient +coordinating control in microgrids. To improve on these aspects, spiking neural +network is leveraged, as a grid-edge intelligence tool to establish a talkative +infrastructure, Spike Talk, expediting coordination in next-generation +microgrids without the need of communication at all. This paper unravels the +physics behind Spike Talk from the perspective of its distributed +infrastructure, which aims to address the Von Neumann Bottleneck. Relying on +inferring information via power flows in tie lines, Spike Talk allows adaptive +and flexible control and coordination itself, and features in synaptic +plasticity facilitating online and local training functionality. Preliminary +case studies are demonstrated with results, while more extensive validations +are to be included as future scopes of work. + +
+
+ comment: The manuscript has been accepted for publication in the Proceedings + of 2024 IEEE Design Methodologies for Power Electronics Conference (DMC2024) +
+
+
+
+
+ + ☆ Understanding Audiovisual Deepfake Detection: Techniques, Challenges, + Human Factors and Perceptual Insights + + +
+ Deep Learning has been successfully applied in diverse fields, and its impact +on deepfake detection is no exception. Deepfakes are fake yet realistic +synthetic content that can be used deceitfully for political impersonation, +phishing, slandering, or spreading misinformation. Despite extensive research +on unimodal deepfake detection, identifying complex deepfakes through joint +analysis of audio and visual streams remains relatively unexplored. To fill +this gap, this survey first provides an overview of audiovisual deepfake +generation techniques, applications, and their consequences, and then provides +a comprehensive review of state-of-the-art methods that combine audio and +visual modalities to enhance detection accuracy, summarizing and critically +analyzing their strengths and limitations. Furthermore, we discuss existing +open source datasets for a deeper understanding, which can contribute to the +research community and provide necessary information to beginners who want to +analyze deep learning-based audiovisual methods for video forensics. By +bridging the gap between unimodal and multimodal approaches, this paper aims to +improve the effectiveness of deepfake detection strategies and guide future +research in cybersecurity and media integrity. + +
+
+
+
+
+ + ☆ Exploring Multi-Agent Reinforcement Learning for Unrelated Parallel + Machine Scheduling + + +
+ Scheduling problems pose significant challenges in resource, industry, and +operational management. This paper addresses the Unrelated Parallel Machine +Scheduling Problem (UPMS) with setup times and resources using a Multi-Agent +Reinforcement Learning (MARL) approach. The study introduces the Reinforcement +Learning environment and conducts empirical analyses, comparing MARL with +Single-Agent algorithms. The experiments employ various deep neural network +policies for single- and Multi-Agent approaches. Results demonstrate the +efficacy of the Maskable extension of the Proximal Policy Optimization (PPO) +algorithm in Single-Agent scenarios and the Multi-Agent PPO algorithm in +Multi-Agent setups. While Single-Agent algorithms perform adequately in reduced +scenarios, Multi-Agent approaches reveal challenges in cooperative learning but +a scalable capacity. This research contributes insights into applying MARL +techniques to scheduling optimization, emphasizing the need for algorithmic +sophistication balanced with scalability for intelligent scheduling solutions. + +
+
+ comment: 11 pages, 5 figures, 4 tables, article submitted to a journal +
+
+
+
+
+ + ☆ Direct Preference Optimization Using Sparse Feature-Level Constraints + + +
+ The alignment of large language models (LLMs) with human preferences remains +a key challenge. While post-training techniques like Reinforcement Learning +from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have +achieved notable success, they often introduce computational inefficiencies and +training instability. In this paper, we propose Feature-level constrained +Preference Optimization (FPO), a novel method designed to simplify the +alignment process while ensuring stability. FPO leverages pre-trained Sparse +Autoencoders (SAEs) and introduces feature-level constraints, allowing for +efficient, sparsity-enforced alignment. Our approach enjoys efficiency by using +sparse features activated in a well-trained sparse autoencoder and the quality +of sequential KL divergence by using the feature-level offline reference. +Experimental results on benchmark datasets demonstrate that FPO achieves a +5.08% absolute improvement in win rate with much lower computational cost +compared to state-of-the-art baselines, making it a promising solution for +efficient and controllable LLM alignments. + +
+
+
+
+
+ + ☆ Multimodal Clinical Reasoning through Knowledge-augmented Rationale + Generation + + +
+ Clinical rationales play a pivotal role in accurate disease diagnosis; +however, many models predominantly use discriminative methods and overlook the +importance of generating supportive rationales. Rationale distillation is a +process that transfers knowledge from large language models (LLMs) to smaller +language models (SLMs), thereby enhancing the latter's ability to break down +complex tasks. Despite its benefits, rationale distillation alone is inadequate +for addressing domain knowledge limitations in tasks requiring specialized +expertise, such as disease diagnosis. Effectively embedding domain knowledge in +SLMs poses a significant challenge. While current LLMs are primarily geared +toward processing textual data, multimodal LLMs that incorporate time series +data, especially electronic health records (EHRs), are still evolving. To +tackle these limitations, we introduce ClinRaGen, an SLM optimized for +multimodal rationale generation in disease diagnosis. ClinRaGen incorporates a +unique knowledge-augmented attention mechanism to merge domain knowledge with +time series EHR data, utilizing a stepwise rationale distillation strategy to +produce both textual and time series-based clinical rationales. Our evaluations +show that ClinRaGen markedly improves the SLM's capability to interpret +multimodal EHR data and generate accurate clinical rationales, supporting more +reliable disease diagnosis, advancing LLM applications in healthcare, and +narrowing the performance divide between LLMs and SLMs. + +
+
+ comment: 11 pages. 4 figures +
+
+
+
+
+ + ☆ Optimizing Service Function Chain Mapping in Network Function + Virtualization through Simultaneous NF Decomposition and VNF Placement + + +
+ Network function virtualization enables network operators to implement new +services through a process called service function chain mapping. The concept +of Service Function Chain (SFC) is introduced to provide complex services, +which is an ordered set of Network Functions (NF). The network functions of an +SFC can be decomposed in several ways into some Virtual Network Functions +(VNF). Additionally, the decomposed NFs can be placed (mapped) as VNFs on +different machines on the underlying physical infrastructure. Selecting good +decompositions and good placements among the possible options greatly affects +both costs and service quality metrics. Previous research has addressed NF +decomposition and VNF placement as separate problems. However, in this paper, +we address both NF decomposition and VNF placement simultaneously as a single +problem. Since finding an optimal solution is NP-hard, we have employed +heuristic algorithms to solve the problem. Specifically, we have introduced a +multiobjective decomposition and mapping VNFs (MODMVNF) method based on the +non-dominated sorting genetic multi-objective algorithm (NSGAII) to solve the +problem. The goal is to find near-optimal decomposition and mapping on the +physical network at the same time to minimize the mapping cost and +communication latency of SFC. The comparison of the results of the proposed +method with the results obtained by solving ILP formulation of the problem as +well as the results obtained from the multi-objective particle swarm algorithm +shows the efficiency and effectiveness of the proposed method in terms of cost +and communication latency. + +
+
+
+
+
+ + ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the expressive power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds to Transformer-like architecture. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +tighter circuit complexity bound for Transformers with $\mathsf{RoPE}$ +attention. Our key contribution is that we show that unless $\mathsf{TC}^0 = +\mathsf{NC}^1$, a $\mathsf{RoPE}$-based Transformer with +$\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \leq O(n)$ +cannot solve the arithmetic problem or the Boolean formula value problem. This +result significantly demonstrates the fundamental limitation of the +expressivity of the $\mathsf{RoPE}$-based Transformer architecture, although it +achieves giant empirical success. Our theoretical framework not only +establishes tighter complexity bounds but also may instruct further work on the +$\mathsf{RoPE}$-based Transformer. + +
+
+
+
+
+ + ☆ Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring + Conversations EMNLP 2024 + + +
+ Many open-ended conversations (e.g., tutoring lessons or business meetings) +revolve around pre-defined reference materials, like worksheets or meeting +bullets. To provide a framework for studying such conversation structure, we +introduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly +breaking down conversations into segments and linking each segment to the +relevant reference item. As a case study, we apply POSR to education where +effectively structuring lessons around problems is critical yet difficult. We +present LessonLink, the first dataset of real-world tutoring lessons, featuring +3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT +math problems. We define and evaluate several joint and independent approaches +for POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT), +and large language models (LLMs) methods. Our results highlight that modeling +POSR as one joint task is essential: POSR methods outperform independent +segmentation and retrieval pipelines by up to +76% on joint metrics and surpass +traditional segmentation methods by up to +78% on segmentation metrics. We +demonstrate POSR's practical impact on downstream education applications, +deriving new insights on the language and time use in real-world lesson +structures. + +
+
+ comment: EMNLP 2024 Findings. Our code and dataset are open-sourced at + https://github.com/rosewang2008/posr +
+
+
+
+
+ + ☆ Entropy Controllable Direct Preference Optimization + + +
+ In the post-training of large language models (LLMs), Reinforcement Learning +from Human Feedback (RLHF) is an effective approach to achieve generation +aligned with human preferences. Direct Preference Optimization (DPO) allows for +policy training with a simple binary cross-entropy loss without a reward model. +The objective of DPO is regularized by reverse KL divergence that encourages +mode-seeking fitting to the reference policy. Nonetheless, we indicate that +minimizing reverse KL divergence could fail to capture a mode of the reference +distribution, which may hurt the policy's performance. Based on this +observation, we propose a simple modification to DPO, H-DPO, which allows for +control over the entropy of the resulting policy, enhancing the distribution's +sharpness and thereby enabling mode-seeking fitting more effectively. In our +experiments, we show that H-DPO outperformed DPO across various tasks, +demonstrating superior results in pass@$k$ evaluations for mathematical tasks. +Moreover, H-DPO is simple to implement, requiring only minor modifications to +the loss calculation of DPO, which makes it highly practical and promising for +wide-ranging applications in the training of LLMs. + +
+
+
+
+
+ + ☆ Overhead-free User-side Recommender Systems + + +
+ Traditionally, recommendation algorithms have been designed for service +developers. But recently, a new paradigm called user-side recommender systems +has been proposed. User-side recommender systems are built and used by end +users, in sharp contrast to traditional provider-side recommender systems. Even +if the official recommender system offered by the provider is not fair, end +users can create and enjoy their own user-side recommender systems by +themselves. Although the concept of user-side recommender systems is +attractive, the problem is they require tremendous communication costs between +the user and the official system. Even the most efficient user-side recommender +systems require about 5 times more costs than provider-side recommender +systems. Such high costs hinder the adoption of user-side recommender systems. +In this paper, we propose overhead-free user-side recommender systems, +RecCycle, which realizes user-side recommender systems without any +communication overhead. The main idea of RecCycle is to recycle past +recommendation results offered by the provider's recommender systems. The +ingredients of RecCycle can be retrieved ``for free,'' and it greatly reduces +the cost of user-side recommendations. In the experiments, we confirm that +RecCycle performs as well as state-of-the-art user-side recommendation +algorithms while RecCycle reduces costs significantly. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.09864, + arXiv:2403.15757 +
+
+
+
+
+ + ☆ A Comprehensive Survey of AI-Driven Advancements and Techniques in + Automated Program Repair and Code Generation + + +
+ Bug fixing and code generation have been core research topics in software +development for many years. The recent explosive growth in Large Language +Models has completely transformed these spaces, putting in reach incredibly +powerful tools for both. In this survey, 27 recent papers have been reviewed +and split into two groups: one dedicated to Automated Program Repair (APR) and +LLM integration and the other to code generation using LLMs. The first group +consists of new methods for bug detection and repair, which include locating +semantic errors, security vulnerabilities, and runtime failure bugs. The place +of LLMs in reducing manual debugging efforts is emphasized in this work by APR +toward context-aware fixes, with innovations that boost accuracy and efficiency +in automatic debugging. The second group dwells on code generation, providing +an overview of both general-purpose LLMs fine-tuned for programming and +task-specific models. It also presents methods to improve code generation, such +as identifier-aware training, fine-tuning at the instruction level, and +incorporating semantic code structures. This survey work contrasts the +methodologies in APR and code generation to identify trends such as using LLMs, +feedback loops to enable iterative code improvement and open-source models. It +also discusses the challenges of achieving functional correctness and security +and outlines future directions for research in LLM-based software development. + +
+
+ comment: A survey of recent developments in AI-assisted automated program + repair +
+
+
+
+
+ + ☆ Reinforcement Learning Framework for Quantitative Trading + + +
+ The inherent volatility and dynamic fluctuations within the financial stock +market underscore the necessity for investors to employ a comprehensive and +reliable approach that integrates risk management strategies, market trends, +and the movement trends of individual securities. By evaluating specific data, +investors can make more informed decisions. However, the current body of +literature lacks substantial evidence supporting the practical efficacy of +reinforcement learning (RL) agents, as many models have only demonstrated +success in back testing using historical data. This highlights the urgent need +for a more advanced methodology capable of addressing these challenges. There +is a significant disconnect in the effective utilization of financial +indicators to better understand the potential market trends of individual +securities. The disclosure of successful trading strategies is often restricted +within financial markets, resulting in a scarcity of widely documented and +published strategies leveraging RL. Furthermore, current research frequently +overlooks the identification of financial indicators correlated with various +market trends and their potential advantages. + This research endeavors to address these complexities by enhancing the +ability of RL agents to effectively differentiate between positive and negative +buy/sell actions using financial indicators. While we do not address all +concerns, this paper provides deeper insights and commentary on the utilization +of technical indicators and their benefits within reinforcement learning. This +work establishes a foundational framework for further exploration and +investigation of more complex scenarios. + +
+
+ comment: 8 pages, 9 figures, 3 tables, accepted at ICAIF 2024 FM4TS Workshop +
+
+
+
+
+ + ☆ Disentangling Tabular Data towards Better One-Class Anomaly Detection + + +
+ Tabular anomaly detection under the one-class classification setting poses a +significant challenge, as it involves accurately conceptualizing "normal" +derived exclusively from a single category to discern anomalies from normal +data variations. Capturing the intrinsic correlation among attributes within +normal samples presents one promising method for learning the concept. To do +so, the most recent effort relies on a learnable mask strategy with a +reconstruction task. However, this wisdom may suffer from the risk of producing +uniform masks, i.e., essentially nothing is masked, leading to less effective +correlation learning. To address this issue, we presume that attributes related +to others in normal samples can be divided into two non-overlapping and +correlated subsets, defined as CorrSets, to capture the intrinsic correlation +effectively. Accordingly, we introduce an innovative method that disentangles +CorrSets from normal tabular data. To our knowledge, this is a pioneering +effort to apply the concept of disentanglement for one-class anomaly detection +on tabular data. Extensive experiments on 20 tabular datasets show that our +method substantially outperforms the state-of-the-art methods and leads to an +average performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC. + +
+
+
+
+
+ + ☆ Improving Grapheme-to-Phoneme Conversion through In-Context Knowledge + Retrieval with Large Language Models + + +
+ Grapheme-to-phoneme (G2P) conversion is a crucial step in Text-to-Speech +(TTS) systems, responsible for mapping graphemes to their corresponding phonetic +representations. However, it faces ambiguity problems where the same grapheme +can represent multiple phonemes depending on contexts, posing a challenge for +G2P conversion. Inspired by the remarkable success of Large Language Models +(LLMs) in handling context-aware scenarios, contextual G2P conversion systems +with LLMs' in-context knowledge retrieval (ICKR) capabilities are proposed to +promote disambiguation capability. The efficacy of incorporating ICKR into G2P +conversion systems is demonstrated thoroughly on the Librig2p dataset. In +particular, the best contextual G2P conversion system using ICKR outperforms +the baseline with weighted average phoneme error rate (PER) reductions of 2.0% +absolute (28.9% relative). Using GPT-4 in the ICKR system can yield an increase of 3.5% +absolute (3.8% relative) on the Librig2p dataset. + +
+
+ comment: accepted by ISCSLP 2024 +
+
+
+
+
+ + ☆ EUR/USD Exchange Rate Forecasting incorporating Text Mining Based on + Pre-trained Language Models and Deep Learning Methods + + +
+ This study introduces a novel approach for EUR/USD exchange rate forecasting +that integrates deep learning, textual analysis, and particle swarm +optimization (PSO). By incorporating online news and analysis texts as +qualitative data, the proposed PSO-LSTM model demonstrates superior performance +compared to traditional econometric and machine learning models. The research +employs advanced text mining techniques, including sentiment analysis using the +RoBERTa-Large model and topic modeling with LDA. Empirical findings underscore +the significant advantage of incorporating textual data, with the PSO-LSTM +model outperforming benchmark models such as SVM, SVR, ARIMA, and GARCH. +Ablation experiments reveal the contribution of each textual data category to +the overall forecasting performance. The study highlights the transformative +potential of artificial intelligence in finance and paves the way for future +research in real-time forecasting and the integration of alternative data +sources. + +
+
+
+
+
+ + ☆ Zer0-Jack: A Memory-efficient Gradient-based Jailbreaking Method for + Black-box Multi-modal Large Language Models + + +
+ Jailbreaking methods, which induce Multi-modal Large Language Models (MLLMs) +to output harmful responses, raise significant safety concerns. Among these +methods, gradient-based approaches, which use gradients to generate malicious +prompts, have been widely studied due to their high success rates in white-box +settings, where full access to the model is available. However, these methods +have notable limitations: they require white-box access, which is not always +feasible, and involve high memory usage. To address scenarios where white-box +access is unavailable, attackers often resort to transfer attacks. In transfer +attacks, malicious inputs generated using white-box models are applied to +black-box models, but this typically results in reduced attack performance. To +overcome these challenges, we propose Zer0-Jack, a method that bypasses the +need for white-box access by leveraging zeroth-order optimization. We propose +patch coordinate descent to efficiently generate malicious image inputs to +directly attack black-box MLLMs, which significantly reduces memory usage +further. Through extensive experiments, Zer0-Jack achieves a high attack +success rate across various models, surpassing previous transfer-based methods +and performing comparably with existing white-box jailbreak techniques. +Notably, Zer0-Jack achieves a 95% attack success rate on MiniGPT-4 with the +Harmful Behaviors Multi-modal Dataset in a black-box setting, demonstrating its +effectiveness. Additionally, we show that Zer0-Jack can directly attack +commercial MLLMs such as GPT-4o. Codes are provided in the supplement. + +
+
+ comment: Accepted to Neurips SafeGenAi Workshop 2024 +
+
+
+
+
+ + ☆ Contrastive Language Prompting to Ease False Positives in Medical + Anomaly Detection + + +
+ A pre-trained visual-language model, contrastive language-image pre-training +(CLIP), successfully accomplishes various downstream tasks with text prompts, +such as finding images or localizing regions within the image. Despite CLIP's +strong multi-modal data capabilities, it remains limited in specialized +environments, such as medical applications. For this purpose, many CLIP +variants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives +related to normal regions persist. Thus, we aim to present a simple yet +important goal of reducing false positives in medical anomaly detection. We +introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both +positive and negative text prompts. This straightforward approach identifies +potential lesion regions by visual attention to the positive prompts in the +given image. To reduce false positives, we attenuate attention on normal +regions using negative prompts. Extensive experiments with the BMAD dataset, +including six biomedical benchmarks, demonstrate that CLAP method enhances +anomaly detection performance. Our future plans include developing an automated +fine prompting method for more practical usage. + +
+
+ comment: 4 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Model Stealing for Any Low-Rank Language Model + + +
+ Model stealing, where a learner tries to recover an unknown model via +carefully chosen queries, is a critical problem in machine learning, as it +threatens the security of proprietary models and the privacy of data they are +trained on. In recent years, there has been particular interest in stealing +large language models (LLMs). In this paper, we aim to build a theoretical +understanding of stealing language models by studying a simple and +mathematically tractable setting. We study model stealing for Hidden Markov +Models (HMMs), and more generally low-rank language models. + We assume that the learner works in the conditional query model, introduced +by Kakade, Krishnamurthy, Mahajan and Zhang. Our main result is an efficient +algorithm in the conditional query model, for learning any low-rank +distribution. In other words, our algorithm succeeds at stealing any language +model whose output distribution is low-rank. This improves upon the previous +result by Kakade, Krishnamurthy, Mahajan and Zhang, which also requires the +unknown distribution to have high "fidelity", a property that holds only in +restricted cases. There are two key insights behind our algorithm: First, we +represent the conditional distributions at each timestep by constructing +barycentric spanners among a collection of vectors of exponentially large +dimension. Second, for sampling from our representation, we iteratively solve a +sequence of convex optimization problems that involve projection in relative +entropy to prevent compounding of errors over the length of the sequence. This +is an interesting example where, at least theoretically, allowing a machine +learning model to solve more complex problems at inference time can lead to +drastic improvements in its performance. + +
+
+
+
+
+ + ☆ Evaluating ChatGPT-3.5 Efficiency in Solving Coding Problems of + Different Complexity Levels: An Empirical Analysis + + +
+ ChatGPT and other large language models (LLMs) promise to revolutionize +software development by automatically generating code from program +specifications. We assess the performance of ChatGPT's GPT-3.5-turbo model on +LeetCode, a popular platform with algorithmic coding challenges for technical +interview practice, across three difficulty levels: easy, medium, and hard. We +test three main hypotheses. First, ChatGPT solves fewer problems as difficulty +rises (Hypothesis 1). Second, prompt engineering improves ChatGPT's +performance, with greater gains on easier problems and diminishing returns on +harder ones (Hypothesis 2). Third, ChatGPT performs better in popular languages +like Python, Java, and C++ than in less common ones like Elixir, Erlang, and +Racket (Hypothesis 3). To investigate these hypotheses, we conduct automated +experiments using Python scripts to generate prompts that instruct ChatGPT to +create Python solutions. These solutions are stored and manually submitted on +LeetCode to check their correctness. For Hypothesis 1, results show the +GPT-3.5-turbo model successfully solves 92% of easy, 79% of medium, and 51% of +hard problems. For Hypothesis 2, prompt engineering yields improvements: 14-29% +for Chain of Thought Prompting, 38-60% by providing failed test cases in a +second feedback prompt, and 33-58% by switching to GPT-4. From a random subset +of problems ChatGPT solved in Python, it also solved 78% in Java, 50% in C++, +and none in Elixir, Erlang, or Racket. These findings generally validate all +three hypotheses. + +
+
+
+
+
+ + ☆ SecEncoder: Logs are All You Need in Security + + +
+ Large and Small Language Models (LMs) are typically pretrained using +extensive volumes of text, which are sourced from publicly accessible platforms +such as Wikipedia, Book Corpus, or through web scraping. These models, due to +their exposure to a wide range of language data, exhibit impressive +generalization capabilities and can perform a multitude of tasks +simultaneously. However, they often fall short when it comes to domain-specific +tasks due to their broad training data. This paper introduces SecEncoder, a +specialized small language model that is pretrained using security logs. +SecEncoder is designed to address the domain-specific limitations of general +LMs by focusing on the unique language and patterns found in security logs. +Experimental results indicate that SecEncoder outperforms other LMs, such as +BERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002) +models, which are pretrained mainly on natural language, across various tasks. +Furthermore, although SecEncoder is primarily pretrained on log data, it +outperforms models pretrained on natural language for a range of tasks beyond +log analysis, such as incident prioritization and threat intelligence document +retrieval. This suggests that domain specific pretraining with logs can +significantly enhance the performance of LMs in security. These findings pave +the way for future research into security-specific LMs and their potential +applications. + +
+
+
+
+
+ + ☆ Fair Summarization: Bridging Quality and Diversity in Extractive + Summaries NeurIPS 2024 + + +
+ Fairness in multi-document summarization of user-generated content remains a +critical challenge in natural language processing (NLP). Existing summarization +methods often fail to ensure equitable representation across different social +groups, leading to biased outputs. In this paper, we introduce two novel +methods for fair extractive summarization: FairExtract, a clustering-based +approach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints. +We evaluate these methods using Divsumm summarization dataset of White-aligned, +Hispanic, and African-American dialect tweets and compare them against relevant +baselines. The results obtained using a comprehensive set of summarization +quality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well +as a fairness metric F, demonstrate that FairExtract and FairGPT achieve +superior fairness while maintaining competitive summarization quality. +Additionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that +integrate quality and fairness into a single evaluation framework, offering a +more nuanced understanding of the trade-offs between these objectives. This +work highlights the importance of fairness in summarization and sets a +benchmark for future research in fairness-aware NLP models. + +
+
+ comment: Accepted at Algorithmic Fairness through the Lens of Metrics and + Evaluation Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ☆ TIPS: Threat Actor Informed Prioritization of Applications using + SecEncoder + + +
+ This paper introduces TIPS: Threat Actor Informed Prioritization using +SecEncoder, a specialized language model for security. TIPS combines the +strengths of both encoder and decoder language models to detect and prioritize +compromised applications. By integrating threat actor intelligence, TIPS +enhances the accuracy and relevance of its detections. Extensive experiments +with a real-world benchmark dataset of applications demonstrate TIPS's high +efficacy, achieving an F-1 score of 0.90 in identifying malicious applications. +Additionally, in real-world scenarios, TIPS significantly reduces the backlog +of investigations for security analysts by 87%, thereby streamlining the threat +response process and improving overall security posture. + +
+
+
+
+
+ + ☆ LLM App Squatting and Cloning + + +
+ Impersonation tactics, such as app squatting and app cloning, have posed +longstanding challenges in mobile app stores, where malicious actors exploit +the names and reputations of popular apps to deceive users. With the rapid +growth of Large Language Model (LLM) stores like GPT Store and FlowGPT, these +issues have similarly surfaced, threatening the integrity of the LLM app +ecosystem. In this study, we present the first large-scale analysis of LLM app +squatting and cloning using our custom-built tool, LLMappCrazy. LLMappCrazy +covers 14 squatting generation techniques and integrates Levenshtein distance +and BERT-based semantic analysis to detect cloning by analyzing app functional +similarities. Using this tool, we generated variations of the top 1000 app +names and found over 5,000 squatting apps in the dataset. Additionally, we +observed 3,509 squatting apps and 9,575 cloning cases across six major +platforms. After sampling, we find that 18.7% of the squatting apps and 4.9% of +the cloning apps exhibited malicious behavior, including phishing, malware +distribution, fake content dissemination, and aggressive ad injection. + +
+
+
+
+
+ + ☆ An Attack Traffic Identification Method Based on Temporal Spectrum + + +
+ To address the issues of insufficient robustness, unstable features, and data +noise interference in existing network attack detection and identification +models, this paper proposes an attack traffic detection and identification +method based on temporal spectrum. First, traffic data is segmented by a +sliding window to construct a feature sequence and a corresponding label +sequence for network traffic. Next, the proposed spectral label generation +methods, SSPE and COAP, are applied to transform the label sequence into +spectral labels and the feature sequence into temporal features. Spectral +labels and temporal features are used to capture and represent behavioral +patterns of attacks. Finally, the constructed temporal features and spectral +labels are used to train models, which subsequently detect and identify +network attack behaviors. Experimental results demonstrate that compared to +traditional methods, models trained with the SSPE or COAP method improve +identification accuracy by 10%, and exhibit strong robustness, particularly in +noisy environments. + +
+
+ comment: 20 pages, 7 figures, 7 tables, 8 formulas +
+
+
+
+
+ + ☆ FM-TS: Flow Matching for Time Series Generation + + +
+ Time series generation has emerged as an essential tool for analyzing +temporal data across numerous fields. While diffusion models have recently +gained significant attention in generating high-quality time series, they tend +to be computationally demanding and reliant on complex stochastic processes. To +address these limitations, we introduce FM-TS, a rectified Flow Matching-based +framework for Time Series generation, which simplifies the time series +generation process by directly optimizing continuous trajectories. This +approach avoids the need for iterative sampling or complex noise schedules +typically required in diffusion-based models. FM-TS is more efficient in terms +of training and inference. Moreover, FM-TS is highly adaptive, supporting both +conditional and unconditional time series generation. Notably, through our +novel inference design, the model trained in an unconditional setting can +seamlessly generalize to conditional tasks without the need for retraining. +Extensive benchmarking across both settings demonstrates that FM-TS +consistently delivers superior performance compared to existing approaches +while being more efficient in terms of training and inference. For instance, in +terms of discriminative score, FM-TS achieves 0.005, 0.019, 0.011, 0.005, +0.053, and 0.106 on the Sines, Stocks, ETTh, MuJoCo, Energy, and fMRI +unconditional time series datasets, respectively, significantly outperforming +the second-best method which achieves 0.006, 0.067, 0.061, 0.008, 0.122, and +0.167 on the same datasets. We have achieved superior performance in solar +forecasting and MuJoCo imputation tasks, significantly enhanced by our +innovative $t$ power sampling method. The code is available at +https://github.com/UNITES-Lab/FMTS. + +
+
+
+
+
+ + ☆ LAUREL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce Learned Augmented Residual Layer (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ☆ Enhancing Link Prediction with Fuzzy Graph Attention Networks and + Dynamic Negative Sampling + + +
+ Link prediction is crucial for understanding complex networks but traditional +Graph Neural Networks (GNNs) often rely on random negative sampling, leading to +suboptimal performance. This paper introduces Fuzzy Graph Attention Networks +(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative +sampling and enhanced node feature aggregation. Fuzzy Negative Sampling (FNS) +systematically selects high-quality negative edges based on fuzzy similarities, +improving training efficiency. FGAT layer incorporates fuzzy rough set +principles, enabling robust and discriminative node representations. +Experiments on two research collaboration networks demonstrate FGAT's superior +link prediction accuracy, outperforming state-of-the-art baselines by +leveraging the power of fuzzy rough sets for effective negative sampling and +node feature learning. + +
+
+
+
+
+ + ☆ IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark + + +
+ Recent evaluations of LLMs on coreference resolution have revealed that +traditional output formats and evaluation metrics do not fully capture the +models' referential understanding. To address this, we introduce IdentifyMe, a +new benchmark for mention resolution presented in a multiple-choice question +(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long +narratives and employs heuristics to exclude easily identifiable mentions, +creating a more challenging task. The benchmark also consists of a curated +mixture of different mention types and corresponding entities, allowing for a +fine-grained analysis of model performance. We evaluate both closed- and +open-source LLMs on IdentifyMe and observe a significant performance gap (20-30%) +between the state-of-the-art sub-10B open models and closed ones. We observe +that pronominal mentions, which have limited surface information, are typically +much harder for models to resolve than nominal mentions. Additionally, we find +that LLMs often confuse entities when their mentions overlap in nested +structures. The highest-scoring model, GPT-4o, achieves 81.9% accuracy, +highlighting the strong referential capabilities of state-of-the-art LLMs while +also indicating room for further improvement. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating + Machine Learning Tasks + + +
+ Large Language Models (LLMs) excel in diverse applications including +generation of code snippets, but often struggle with generating code for +complex Machine Learning (ML) tasks. Although existing LLM single-agent based +systems give varying performance depending on the task complexity, they purely +rely on larger and expensive models such as GPT-4. Our investigation reveals +that no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama +perform far worse than GPT-4 in a single-agent setting. With the motivation of +developing a cost-efficient LLM based solution for solving ML tasks, we propose +an LLM Multi-Agent based system which leverages combination of experts using +profiling, efficient retrieval of past observations, LLM cascades, and +ask-the-expert calls. Through empirical analysis on ML engineering tasks in the +MLAgentBench benchmark, we demonstrate the effectiveness of our system, using +no-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and +expert to serve occasional ask-the-expert calls for planning. With 94.2\% +reduction in the cost (from \$0.931 per run cost averaged over all tasks for +GPT-4 single agent system to \$0.054), our system is able to yield better +average success rate of 32.95\% as compared to GPT-4 single-agent system +yielding 22.72\% success rate averaged over all the tasks of MLAgentBench. + +
+
+ comment: Presented at AIMLSystems '24 +
+
+
+
+
+ + ☆ BLIP3-KALE: Knowledge Augmented Large-Scale Dense Captions + + +
+ We introduce BLIP3-KALE, a dataset of 218 million image-text pairs that +bridges the gap between descriptive synthetic captions and factual web-scale +alt-text. KALE augments synthetic dense image captions with web-scale alt-text +to generate factually grounded image captions. Our two-stage approach leverages +large vision-language models and language models to create knowledge-augmented +captions, which are then used to train a specialized VLM for scaling up the +dataset. We train vision-language models on KALE and demonstrate improvements +on vision-language tasks. Our experiments show the utility of KALE for training +more capable and knowledgeable multimodal models. We release the KALE dataset +at https://huggingface.co/datasets/Salesforce/blip3-kale + +
+
+
+
+
+ + ☆ Research on fault diagnosis of nuclear power first-second circuit based + on hierarchical multi-granularity classification network + + +
+ The safe and reliable operation of complex electromechanical systems in +nuclear power plants is crucial for the safe production of nuclear power plants +and their nuclear power units. Therefore, accurate and timely fault diagnosis of +nuclear power systems is of great significance for ensuring the safe and +reliable operation of nuclear power plants. The existing fault diagnosis +methods mainly target a single device or subsystem, making it difficult to +analyze the inherent connections and mutual effects between different types of +faults at the entire unit level. This article uses the AP1000 full-scale +simulator to simulate the important mechanical component failures of some key +systems in the primary and secondary circuits of nuclear power units, and +constructs a fault dataset. Meanwhile, a hierarchical multi-granularity +classification fault diagnosis model based on the EfficientNet large model is +proposed, aiming to achieve hierarchical classification of nuclear power +faults. The results indicate that the proposed fault diagnosis model can +effectively classify faults in different circuits and system components of +nuclear power units into hierarchical categories. However, the fault dataset in +this study was obtained from a simulator, which may introduce additional +information due to parameter redundancy, thereby affecting the diagnostic +performance of the model. + +
+
+
+
+
+ + ☆ Optimizing Data Delivery: Insights from User Preferences on Visuals, + Tables, and Text + + +
+ In this work, we research user preferences to see a chart, table, or text +given a question asked by the user. This enables us to understand when it is +best to show a chart, table, or text to the user for the specific question. For +this, we conduct a user study where users are shown a question and asked what +they would prefer to see, and use the data to establish that a user's personal +traits do influence the data outputs that they prefer. Understanding how user +characteristics impact a user's preferences is critical to creating data tools +with a better user experience. Additionally, we investigate to what degree an +LLM can be used to replicate a user's preference with and without user +preference data. Overall, these findings have significant implications +pertaining to the development of data tools and the replication of human +preferences using LLMs. Furthermore, this work demonstrates the potential use +of LLMs to replicate user preference data which has major implications for +future user modeling and personalization research. + +
+
+
+
+
+ + ☆ The Effect of Scheduling and Preemption on the Efficiency of LLM + Inference Serving + + +
+ The growing usage of Large Language Models (LLMs) highlights the demands and +challenges in scalable LLM inference systems, affecting deployment and +development processes. On the deployment side, there is a lack of comprehensive +analysis on the conditions under which a particular scheduler performs better +or worse, with performance varying substantially across different schedulers, +hardware, models, and workloads. Manually testing each configuration on GPUs +can be prohibitively expensive. On the development side, unpredictable +performance and unknown upper limits can lead to inconclusive trial-and-error +processes, consuming resources on ideas that end up ineffective. To address +these challenges, we introduce INFERMAX, an analytical framework that uses +inference cost models to compare various schedulers, including an optimal +scheduler formulated as a constraint satisfaction problem (CSP) to establish an +upper bound on performance. Our framework offers in-depth analysis and raises +essential questions, challenging assumptions and exploring opportunities for +more efficient scheduling. Notably, our findings indicate that preempting +requests can reduce GPU costs by 30% compared to avoiding preemptions at all. +We believe our methods and insights will facilitate the cost-effective +deployment and development of scalable, efficient inference systems and pave +the way for cost-based scheduling. + +
+
+
+
+
+ + ☆ Input-Based Ensemble-Learning Method for Dynamic Memory Configuration of + Serverless Computing Functions + + +
+ In today's Function-as-a-Service offerings, a programmer is usually +responsible for configuring function memory for its successful execution, which +allocates proportional function resources such as CPU and network. However, +right-sizing the function memory forces developers to speculate about performance and +make ad-hoc configuration decisions. Recent research has highlighted that a +function's input characteristics, such as input size, type and number of +inputs, significantly impact its resource demand, run-time performance and +costs with fluctuating workloads. This correlation further makes memory +configuration a non-trivial task. On that account, an input-aware function +memory allocator not only improves developer productivity by completely hiding +resource-related decisions but also drives an opportunity to reduce resource +wastage and offer a finer-grained cost-optimised pricing scheme. Therefore, we +present MemFigLess, a serverless solution that estimates the memory requirement +of a serverless function with input-awareness. The framework executes function +profiling in an offline stage and trains a multi-output Random Forest +Regression model on the collected metrics to invoke input-aware optimal +configurations. We evaluate our work with the state-of-the-art approaches on +AWS Lambda service to find that MemFigLess is able to capture the input-aware +resource relationships and allocate up to 82% fewer resources and save up to 87% +run-time costs. + +
+
+ comment: 10 pages, 2 tables, 28 figures, accepted conference paper - UCC'24 +
+
+
+
+
+ + ☆ Retrieval Augmented Time Series Forecasting + + +
+ Retrieval-augmented generation (RAG) is a central component of modern LLM +systems, particularly in scenarios where up-to-date information is crucial for +accurately responding to user queries or when queries exceed the scope of the +training data. The advent of time-series foundation models (TSFM), such as +Chronos, and the need for effective zero-shot forecasting performance across +various time-series domains motivates the question: Do benefits of RAG +similarly carry over to time series forecasting? In this paper, we advocate +that the dynamic and event-driven nature of time-series data makes RAG a +crucial component of TSFMs and introduce a principled RAG framework for +time-series forecasting, called Retrieval Augmented Forecasting (RAF). Within +RAF, we develop efficient strategies for retrieving related time-series +examples and incorporating them into forecast. Through experiments and +mechanistic studies, we demonstrate that RAF indeed improves the forecasting +accuracy across diverse time series domains and the improvement is more +significant for larger TSFM sizes. + +
+
+
+
+
+ + ☆ Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial + Approach + + +
+ Deep learning underpins most of the currently advanced natural language +processing (NLP) tasks such as textual classification, neural machine +translation (NMT), abstractive summarization and question-answering (QA). +However, the robustness of the models, particularly QA models, against +adversarial attacks is a critical concern that remains insufficiently explored. +This paper introduces QA-Attack (Question Answering Attack), a novel word-level +adversarial strategy that fools QA models. Our attention-based attack exploits +the customized attention mechanism and deletion ranking strategy to identify +and target specific words within contextual passages. It creates deceptive +inputs by carefully choosing and substituting synonyms, preserving grammatical +integrity while misleading the model to produce incorrect responses. Our +approach demonstrates versatility across various question types, particularly +when dealing with extensive long textual inputs. Extensive experiments on +multiple benchmark datasets demonstrate that QA-Attack successfully deceives +baseline QA models and surpasses existing adversarial techniques regarding +success rate, semantics changes, BLEU score, fluency and grammar error rate. + +
+
+
+
+
+ + ☆ A Social Outcomes and Priorities centered (SOP) Framework for AI policy + + +
+ Rapid developments in AI and its adoption across various domains have +necessitated a need to build robust guardrails and risk containment plans while +ensuring equitable benefits for the betterment of society. The current +technology-centered approach has resulted in a fragmented, reactive, and +ineffective policy apparatus. This paper highlights the immediate and urgent +need to pivot to a society-centered approach to develop comprehensive, +coherent, forward-looking AI policy. To this end, we present a Social Outcomes +and Priorities centered (SOP) framework for AI policy along with proposals on +implementation of its various components. While the SOP framework is presented +from a US-centric view, the takeaways are general and applicable globally. + +
+
+
+
+
+ + ☆ DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution + Detection + + +
+ Out-of-distribution (OOD) detection is essential for ensuring the robustness +of machine learning models by identifying samples that deviate from the +training distribution. While traditional OOD detection has primarily focused on +single-modality inputs, such as images, recent advances in multimodal models +have demonstrated the potential of leveraging multiple modalities (e.g., video, +optical flow, audio) to enhance detection performance. However, existing +methods often overlook intra-class variability within in-distribution (ID) +data, assuming that samples of the same class are perfectly cohesive and +consistent. This assumption can lead to performance degradation, especially +when prediction discrepancies are uniformly amplified across all samples. To +address this issue, we propose Dynamic Prototype Updating (DPU), a novel +plug-and-play framework for multimodal OOD detection that accounts for +intra-class variations. Our method dynamically updates class center +representations for each class by measuring the variance of similar samples +within each batch, enabling adaptive adjustments. This approach allows us to +amplify prediction discrepancies based on the updated class centers, thereby +improving the model's robustness and generalization across different +modalities. Extensive experiments on two tasks, five datasets, and nine base +OOD algorithms demonstrate that DPU significantly improves OOD detection +performance, setting a new state-of-the-art in multimodal OOD detection, with +improvements of up to 80 percent in Far-OOD detection. To facilitate +accessibility and reproducibility, our code is publicly available on GitHub. + +
+
+
+
+
+ + ☆ PERFT: Parameter-Efficient Routed Fine-Tuning for Mixture-of-Expert + Model + + +
+ The Mixture-of-Experts (MoE) paradigm has emerged as a powerful approach for +scaling transformers with improved resource utilization. However, efficiently +fine-tuning MoE models remains largely underexplored. Inspired by recent works +on Parameter-Efficient Fine-Tuning (PEFT), we present a unified framework for +integrating PEFT modules directly into the MoE mechanism. Aligning with the +core principles and architecture of MoE, our framework encompasses a set of +design dimensions including various functional and composition strategies. By +combining design choices within our framework, we introduce Parameter-Efficient +Routed Fine-Tuning (PERFT) as a flexible and scalable family of PEFT strategies +tailored for MoE models. Extensive experiments on adapting OLMoE-1B-7B and +Mixtral-8$\times$7B for commonsense and arithmetic reasoning tasks demonstrate +the effectiveness, scalability, and intriguing dynamics of PERFT. Additionally, +we provide empirical findings for each specific design choice to facilitate +better application of MoE and PEFT. + +
+
+ comment: Code available via https://anonymous.4open.science/r/PERFT-MoE/ +
+
+
+
+
+ + ☆ What Representational Similarity Measures Imply about Decodable + Information + + +
+ Neural responses encode information that is useful for a variety of +downstream tasks. A common approach to understand these systems is to build +regression models or ``decoders'' that reconstruct features of the stimulus +from neural responses. Popular neural network similarity measures like centered +kernel alignment (CKA), canonical correlation analysis (CCA), and Procrustes +shape distance, do not explicitly leverage this perspective and instead +highlight geometric invariances to orthogonal or affine transformations when +comparing representations. Here, we show that many of these measures can, in +fact, be equivalently motivated from a decoding perspective. Specifically, +measures like CKA and CCA quantify the average alignment between optimal linear +readouts across a distribution of decoding tasks. We also show that the +Procrustes shape distance upper bounds the distance between optimal linear +readouts and that the converse holds for representations with low participation +ratio. Overall, our work demonstrates a tight link between the geometry of +neural representations and the ability to linearly decode information. This +perspective suggests new ways of measuring similarity between neural systems +and also provides novel, unifying interpretations of existing measures. + +
+
+
+
+
+ + ♻ ☆ On the Utilization of Unique Node Identifiers in Graph Neural Networks + + +
+ Graph Neural Networks have inherent representational limitations due to their +message-passing structure. Recent work has suggested that these limitations can +be overcome by using unique node identifiers (UIDs). Here we argue that despite +the advantages of UIDs, one of their disadvantages is that they lose the +desirable property of permutation-equivariance. We thus propose to focus on UID +models that are permutation-equivariant, and present theoretical arguments for +their advantages. Motivated by this, we propose a method to regularize UID +models towards permutation equivariance, via a contrastive loss. We empirically +demonstrate that our approach improves generalization and extrapolation +abilities while providing faster training convergence. On the recent BREC +expressiveness benchmark, our proposed method achieves state-of-the-art +performance compared to other random-based approaches. + +
+
+
+
+
+ + ♻ ☆ Foundation Models for the Electric Power Grid + + +
+ Foundation models (FMs) currently dominate news headlines. They employ +advanced deep learning architectures to extract structural information +autonomously from vast datasets through self-supervision. The resulting rich +representations of complex systems and dynamics can be applied to many +downstream applications. Therefore, FMs can find uses in electric power grids, +challenged by the energy transition and climate change. In this paper, we call +for the development of, and state why we believe in, the potential of FMs for +electric grids. We highlight their strengths and weaknesses amidst the +challenges of a changing grid. We argue that an FM learning from diverse grid +data and topologies could unlock transformative capabilities, pioneering a new +approach in leveraging AI to redefine how we manage complexity and uncertainty +in the electric grid. Finally, we discuss a power grid FM concept, namely +GridFM, based on graph neural networks and show how different downstream tasks +benefit. + +
+
+ comment: Major equal contributors: H.F.H., T.B., B.G., L.S.A.M., A.P., A.V., + J.W.; Significant equal contributors: J.B., A.B.M., S.C., I.F., B.H., R.J., + K.K., V.M., F.M., M.D.M., O.R., H.S., L.X., E.S.Y., A.Z.; Other equal + contributors: A.J.B., R.J.B., B.P.B., J.S., S.S; Lead contact: H.F.H +
+
+
+
+
+ + ♻ ☆ LE-PDE++: Mamba for accelerating PDEs Simulations + + +
+ Partial Differential Equations are foundational in modeling science and +natural systems such as fluid dynamics and weather forecasting. The Latent +Evolution of PDEs method is designed to address the computational intensity of +classical and deep learning-based PDE solvers by proposing a scalable and +efficient alternative. To enhance the efficiency and accuracy of LE-PDE, we +incorporate the Mamba model, an advanced machine learning model known for its +predictive efficiency and robustness in handling complex dynamic systems with a +progressive learning strategy. The LE-PDE was tested on several benchmark +problems. The method demonstrated a marked reduction in computational time +compared to traditional solvers and standalone deep learning models while +maintaining high accuracy in predicting system behavior over time. Our method +doubles the inference speed compared to the LE-PDE while retaining the same +level of parameter efficiency, making it well-suited for scenarios requiring +long-term predictions. + +
+
+
+
+
+ + ♻ ☆ Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and + Tabnet with SMOTEENN + + +
+ Bank credit risk is a significant challenge in modern financial transactions, +and the ability to identify qualified credit card holders among a large number +of applicants is crucial for the profitability of a bank's credit card +business. In the past, screening applicants' conditions often +required a significant amount of manual labor, which was time-consuming and +labor-intensive. Although the accuracy and reliability of previously used ML +models have been continuously improving, the pursuit of more reliable and +powerful AI intelligent models is undoubtedly the unremitting pursuit by major +banks in the financial industry. In this study, we used a dataset of over +40,000 records provided by a commercial bank as the research object. We +compared various dimensionality reduction techniques such as PCA and T-SNE for +preprocessing high-dimensional datasets and performed in-depth adaptation and +tuning of distributed models such as LightGBM and XGBoost, as well as deep +models like Tabnet. After a series of research and processing, we obtained +excellent research results by combining SMOTEENN with these techniques. The +experiments demonstrated that LightGBM combined with PCA and SMOTEENN +techniques can assist banks in accurately predicting potential high-quality +customers, showing relatively outstanding performance compared to other models. + +
+
+ comment: 8 pages on IEEE ICPICS +
+
+
+
+
+ + ♻ ☆ Advanced Payment Security System:XGBoost, LightGBM and SMOTE Integrated + + +
+ With the rise of various online and mobile payment systems, transaction fraud +has become a significant threat to financial security. This study explores the +application of advanced machine learning models, specifically based on XGBoost +and LightGBM, for developing a more accurate and robust Payment Security +Protection Model. To enhance data reliability, we meticulously processed the +data sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to +address class imbalance and improve data representation. By selecting highly +correlated features, we aimed to strengthen the training process and boost +model performance. We conducted thorough performance evaluations of our +proposed models, comparing them against traditional methods including Random +Forest, Neural Network, and Logistic Regression. Using metrics such as +Precision, Recall, and F1 Score, we rigorously assessed their effectiveness. +Our detailed analyses and comparisons reveal that the combination of SMOTE with +XGBoost and LightGBM offers a highly efficient and powerful mechanism for +payment security protection. Moreover, the integration of XGBoost and LightGBM +in a Local Ensemble model further demonstrated outstanding performance. After +incorporating SMOTE, the new combined model achieved a significant improvement +of nearly 6\% over traditional models and around 5\% over its sub-models, +showcasing remarkable results. + +
+
+ comment: This paper is received by https://ieee-metacom.org +
+
+
+
+
+ + ♻ ☆ Credit Card Fraud Detection Using Advanced Transformer Model + + +
+ With the proliferation of various online and mobile payment systems, credit +card fraud has emerged as a significant threat to financial security. This +study focuses on innovative applications of the latest Transformer models for +more robust and precise fraud detection. To ensure the reliability of the data, +we meticulously processed the data sources, balancing the dataset to address +the issue of data sparsity significantly. We also selected highly correlated +vectors to strengthen the training process. To guarantee the reliability and +practicality of the new Transformer model, we conducted performance comparisons +with several widely adopted models, including Support Vector Machine (SVM), +Random Forest, Neural Network, and Logistic Regression. We rigorously compared +these models using metrics such as Precision, Recall, and F1 Score. Through +these detailed analyses and comparisons, we present to the readers a highly +efficient and powerful anti-fraud mechanism with promising prospects. The +results demonstrate that the Transformer model not only excels in traditional +applications but also shows great potential in niche areas like fraud +detection, offering a substantial advancement in the field. + +
+
+ comment: This paper has been received by https://ieee-metacom.org/ +
+
+
+
+
+ + ♻ ☆ Levin Tree Search with Context Models + + +
+ Levin Tree Search (LTS) is a search algorithm that makes use of a policy (a +probability distribution over actions) and comes with a theoretical guarantee +on the number of expansions before reaching a goal node, depending on the +quality of the policy. This guarantee can be used as a loss function, which we +call the LTS loss, to optimize neural networks representing the policy +(LTS+NN). In this work we show that the neural network can be substituted with +parameterized context models originating from the online compression literature +(LTS+CM). We show that the LTS loss is convex under this new model, which +allows for using standard convex optimization tools, and obtain convergence +guarantees to the optimal parameters in an online setting for a given set of +solution trajectories -- guarantees that cannot be provided for neural +networks. The new LTS+CM algorithm compares favorably against LTS+NN on several +benchmarks: Sokoban (Boxoban), The Witness, and the 24-Sliding Tile puzzle +(STP). The difference is particularly large on STP, where LTS+NN fails to solve +most of the test instances while LTS+CM solves each test instance in a fraction +of a second. Furthermore, we show that LTS+CM is able to learn a policy that +solves the Rubik's cube in only a few hundred expansions, which considerably +improves upon previous machine learning techniques. + +
+
+
+
+
+ + ♻ ☆ Explicit and Implicit Semantic Ranking Framework + + +
+ The core challenge in numerous real-world applications is to match an inquiry +to the best document from a mutable and finite set of candidates. Existing +industry solutions, especially latency-constrained services, often rely on +similarity algorithms that sacrifice quality for speed. In this paper we +introduce a generic semantic learning-to-rank framework, Self-training Semantic +Cross-attention Ranking (sRank). This transformer-based framework uses linear +pairwise loss with mutable training batch sizes and achieves quality gains and +high efficiency, and has been applied effectively to show gains on two industry +tasks at Microsoft over real-world large-scale data sets: Smart Reply (SR) and +Ambient Clinical Intelligence (ACI). In Smart Reply, sRank assists live +customers with technical support by selecting the best reply from predefined +solutions based on consumer and support agent messages. It achieves 11.7% gain +in offline top-one accuracy on the SR task over the previous system, and has +enabled 38.7% time reduction in composing messages in telemetry recorded since +its general release in January 2021. In the ACI task, sRank selects relevant +historical physician templates that serve as guidance for a text summarization +model to generate higher quality medical notes. It achieves 35.5% top-one +accuracy gain, along with 46% relative ROUGE-L gain in generated medical notes. + +
+
+
+
+
+ + ♻ ☆ Software Model Evolution with Large Language Models: Experiments on + Simulated, Public, and Industrial Datasets + + +
+ Modeling structure and behavior of software systems plays a crucial role in +the industrial practice of software engineering. As with other software +engineering artifacts, software models are subject to evolution. Supporting +modelers in evolving software models with recommendations for model completions +is still an open problem, though. In this paper, we explore the potential of +large language models for this task. In particular, we propose an approach, +RAMC, leveraging large language models, model histories, and +retrieval-augmented generation for model completion. Through experiments on +three datasets, including an industrial application, one public open-source +community dataset, and one controlled collection of simulated model +repositories, we evaluate the potential of large language models for model +completion with RAMC. We found that large language models are indeed a +promising technology for supporting software model evolution (62.30% +semantically correct completions on real-world industrial data and up to 86.19% +type-correct completions). The general inference capabilities of large language +models are particularly useful when dealing with concepts for which there are +few, noisy, or no examples at all. + +
+
+
+
+
+ + ♻ ☆ Interpret Your Decision: Logical Reasoning Regularization for + Generalization in Visual Classification NeurIPS2024 + + +
+ Vision models excel in image classification but struggle to generalize to +unseen data, such as classifying images from unseen domains or discovering +novel categories. In this paper, we explore the relationship between logical +reasoning and deep learning generalization in visual classification. A logical +regularization termed L-Reg is derived which bridges a logical analysis +framework to image classification. Our work reveals that L-Reg reduces the +complexity of the model in terms of the feature distribution and classifier +weights. Specifically, we unveil the interpretability brought by L-Reg, as it +enables the model to extract the salient features, such as faces to persons, +for classification. Theoretical analysis and experiments demonstrate that L-Reg +enhances generalization across various scenarios, including multi-domain +generalization and generalized category discovery. In complex real-world +scenarios where images span unknown classes and unseen domains, L-Reg +consistently improves generalization, highlighting its practical efficacy. + +
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ♻ ☆ Dynamic planning in hierarchical active inference + + +
+ By dynamic planning, we refer to the ability of the human brain to infer and +impose motor trajectories related to cognitive decisions. A recent paradigm, +active inference, brings fundamental insights into the adaptation of biological +organisms, constantly striving to minimize prediction errors to restrict +themselves to life-compatible states. Over the past years, many studies have +shown how human and animal behaviors could be explained in terms of active +inference - either as discrete decision-making or continuous motor control - +inspiring innovative solutions in robotics and artificial intelligence. Still, +the literature lacks a comprehensive outlook on effectively planning realistic +actions in changing environments. Setting ourselves the goal of modeling +complex tasks such as tool use, we delve into the topic of dynamic planning in +active inference, keeping in mind two crucial aspects of biological behavior: +the capacity to understand and exploit affordances for object manipulation, and +to learn the hierarchical interactions between the self and the environment, +including other agents. We start from a simple unit and gradually describe more +advanced structures, comparing recently proposed design choices and providing +basic examples. This study distances itself from traditional views centered on +neural networks and reinforcement learning, and points toward a yet unexplored +direction in active inference: hybrid representations in hierarchical models. + +
+
+
+
+
+ + ♻ ☆ DistRL: An Asynchronous Distributed Reinforcement Learning Framework for + On-Device Control Agents + + +
+ On-device control agents, especially on mobile devices, are responsible for +operating mobile devices to fulfill users' requests, enabling seamless and +intuitive interactions. Integrating Multimodal Large Language Models (MLLMs) +into these agents enhances their ability to understand and execute complex +commands, thereby improving user experience. However, fine-tuning MLLMs for +on-device control presents significant challenges due to limited data +availability and inefficient online training processes. This paper introduces +DistRL, a novel framework designed to enhance the efficiency of online RL +fine-tuning for mobile device control agents. DistRL employs centralized +training and decentralized data acquisition to ensure efficient fine-tuning in +the context of dynamic online interactions. Additionally, the framework is +backed by our tailor-made RL algorithm, which effectively balances exploration +with the prioritized utilization of collected data to ensure stable and robust +training. Our experiments show that, on average, DistRL delivers a 3X +improvement in training efficiency and enables training data collection 2.4X +faster than the leading synchronous multi-machine methods. Notably, after +training, DistRL achieves a 20% relative improvement in success rate compared +to state-of-the-art methods on general Android tasks from an open benchmark, +significantly outperforming existing approaches while maintaining the same +training time. These results validate DistRL as a scalable and efficient +solution, offering substantial improvements in both training efficiency and +agent performance for real-world, in-the-wild device control tasks. + +
+
+ comment: Paper and Appendix, 25 pages +
+
+
+
+
+ + ♻ ☆ Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against + DenseNet, ResNet, and VGGNet on a Custom Dataset + + +
+ This study evaluates the performance of various deep learning models, +specifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species +classification on a custom dataset. The dataset comprises 575 images of 23 +endangered species sourced from reputable online repositories. The study +utilizes transfer learning to fine-tune pre-trained models on the dataset, +focusing on reducing training time and enhancing classification accuracy. The +results demonstrate that YOLOv8 outperforms other models, achieving a training +accuracy of 97.39% and a validation F1-score of 96.50%. These findings suggest +that YOLOv8, with its advanced architecture and efficient feature extraction +capabilities, holds great promise for automating wildlife monitoring and +conservation efforts. + +
+
+ comment: This is published in Journal of Artificial Intelligence and Capsule + Networks, December 2024, Volume 6, Issue 4, Pages 415-435 +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ The Dark Patterns of Personalized Persuasion in Large Language Models: + Exposing Persuasive Linguistic Features for Big Five Personality Traits in + LLMs Responses + + +
+ This study explores how the Large Language Models (LLMs) adjust linguistic +features to create personalized persuasive outputs. While research showed that +LLMs personalize outputs, a gap remains in understanding the linguistic +features of their persuasive capabilities. We identified 13 linguistic features +crucial for influencing personalities across different levels of the Big Five +model of personality. We analyzed how prompts with personality trait +information influenced the output of 19 LLMs across five model families. The +findings show that models use more anxiety-related words for neuroticism, +increase achievement-related words for conscientiousness, and employ fewer +cognitive processes words for openness to experience. Some model families excel +at adapting language for openness to experience, others for conscientiousness, +while only one model adapts language for neuroticism. Our findings show how +LLMs tailor responses based on personality cues in prompts, indicating their +potential to create persuasive content affecting the mind and well-being of the +recipients. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Effective ML Model Versioning in Edge Networks + + +
+ Machine learning (ML) models, data and software need to be regularly updated +whenever essential version updates are released and feasible for integration. +This is a basic but most challenging requirement to satisfy in the edge, due to +the various system constraints and the major impact that an update can have on +robustness and stability. In this paper, we formulate for the first time the ML +model versioning optimization problem, and propose effective solutions, +including the update automation with reinforcement learning (RL) based +algorithm. We study the edge network environment due to the known constraints +in performance, response time, security, and reliability, which make updates +especially challenging. The performance study shows that model version updates +can be fully and effectively automated with reinforcement learning method. We +show that for every range of server load values, the proper versioning can be +found that improves security, reliability and/or ML model accuracy, while +assuring a comparably lower response time. + +
+
+ comment: This paper is uploaded here for research community, thus it is for + non-commercial purposes +
+
+
+
+
+ + ♻ ☆ When Geoscience Meets Foundation Models: Towards General Geoscience + Artificial Intelligence System + + +
+ Artificial intelligence (AI) has significantly advanced Earth sciences, yet +its full potential in comprehensively modeling Earth's complex dynamics +remains unrealized. Geoscience foundation models (GFMs) emerge as a +paradigm-shifting solution, integrating extensive cross-disciplinary data to +enhance the simulation and understanding of Earth system dynamics. These +data-centric AI models extract insights from petabytes of structured and +unstructured data, effectively addressing the complexities of Earth systems +that traditional models struggle to capture. The unique strengths of GFMs +include flexible task specification, diverse input-output capabilities, and +multi-modal knowledge representation, enabling analyses that surpass those of +individual data sources or traditional AI methods. This review not only +highlights the key advantages of GFMs, but also presents essential techniques +for their construction, with a focus on transformers, pre-training, and +adaptation strategies. Subsequently, we examine recent advancements in GFMs, +including large language models, vision models, and vision-language models, +particularly emphasizing the potential applications in remote sensing. +Additionally, the review concludes with a comprehensive analysis of the +challenges and future trends in GFMs, addressing five critical aspects: data +integration, model complexity, uncertainty quantification, interdisciplinary +collaboration, and concerns related to privacy, trust, and security. This +review offers a comprehensive overview of emerging geoscientific research +paradigms, emphasizing the untapped opportunities at the intersection of +advanced AI techniques and geoscience. It examines major methodologies, +showcases advances in large-scale models, and discusses the challenges and +prospects that will shape the future landscape of GFMs. + +
+
+ comment: accepted by IEEE Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ♻ ☆ Pessimistic Iterative Planning for Robust POMDPs + + +
+ Robust POMDPs extend classical POMDPs to handle model uncertainty. +Specifically, robust POMDPs exhibit so-called uncertainty sets on the +transition and observation models, effectively defining ranges of +probabilities. Policies for robust POMDPs must be (1) memory-based to account +for partial observability and (2) robust against model uncertainty to account +for the worst-case instances from the uncertainty sets. To compute such robust +memory-based policies, we propose the pessimistic iterative planning (PIP) +framework, which alternates between two main steps: (1) selecting a pessimistic +(non-robust) POMDP via worst-case probability instances from the uncertainty +sets; and (2) computing a finite-state controller (FSC) for this pessimistic +POMDP. We evaluate the performance of this FSC on the original robust POMDP and +use this evaluation in step (1) to select the next pessimistic POMDP. Within +PIP, we propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC +through a recurrent neural network by using supervision policies optimized for +the pessimistic POMDP. The empirical evaluation in four benchmark environments +showcases improved robustness against several baseline methods and competitive +performance compared to a state-of-the-art robust POMDP solver. + +
+
+
+
+
+ + ♻ ☆ Design of a Quality Management System based on the EU Artificial + Intelligence Act + + +
+ The EU AI Act mandates that providers and deployers of high-risk AI systems +establish a quality management system (QMS). Among other criteria, a QMS shall +help verify and document the AI system design and quality and monitor the +proper implementation of all high-risk AI system requirements. Current research +rarely explores practical solutions for implementing the EU AI Act. Instead, it +tends to focus on theoretical concepts. As a result, more attention must be +paid to tools that help humans actively check and document AI systems and +orchestrate the implementation of all high-risk AI system requirements. +Therefore, this paper introduces a new design concept and prototype for a QMS +as a microservice Software as a Service web application. It connects directly +to the AI system for verification and documentation and enables the +orchestration and integration of various sub-services, which can be +individually designed, each tailored to specific high-risk AI system +requirements. The first version of the prototype connects to the +Phi-3-mini-128k-instruct LLM as an example of an AI system and integrates a +risk management system and a data management system. The prototype is evaluated +through a qualitative assessment of the implemented requirements, a GPU memory +and performance analysis, and an evaluation with IT, AI, and legal experts. + +
+
+
+
+
+ + ♻ ☆ A Survey on Integrated Sensing, Communication, and Computation + + +
+ The forthcoming generation of wireless technology, 6G, aims to usher in an +era of ubiquitous intelligent services, where everything is interconnected and +intelligent. This vision requires the seamless integration of three fundamental +modules: Sensing for information acquisition, communication for information +sharing, and computation for information processing and decision-making. These +modules are intricately linked, especially in complex tasks such as edge +learning and inference. However, the performance of these modules is +interdependent, creating a resource competition for time, energy, and +bandwidth. Existing techniques like integrated communication and computation +(ICC), integrated sensing and computation (ISC), and integrated sensing and +communication (ISAC) have made partial strides in addressing this challenge, +but they fall short of meeting the extreme performance requirements. To +overcome these limitations, it is essential to develop new techniques that +comprehensively integrate sensing, communication, and computation. This +integrated approach, known as Integrated Sensing, Communication, and +Computation (ISCC), offers a systematic perspective for enhancing task +performance. This paper begins with a comprehensive survey of historic and +related techniques such as ICC, ISC, and ISAC, highlighting their strengths and +limitations. It then discusses the benefits, functions, and challenges of ISCC. +Subsequently, the state-of-the-art signal designs for ISCC, along with network +resource management strategies specifically tailored for ISCC are explored. +Furthermore, this paper discusses the exciting research opportunities that lie +ahead for implementing ISCC in future advanced networks, and the unresolved +issues requiring further investigation. ISCC is expected to unlock the full +potential of intelligent connectivity, paving the way for groundbreaking +applications and services. + +
+
+ comment: In this version, a series of discussions have been added. The + benefits, functions, and challenges of ISCC are investigated using a new + section. Moreover, the unresolved issues of ISCC have been discussed +
+
+
+
+
+ + ♻ ☆ PhyTracker: An Online Tracker for Phytoplankton + + +
+ Phytoplankton, a crucial component of aquatic ecosystems, requires efficient +monitoring to understand marine ecological processes and environmental +conditions. Traditional phytoplankton monitoring methods, relying on non-in +situ observations, are time-consuming and resource-intensive, limiting timely +analysis. To address these limitations, we introduce PhyTracker, an intelligent +in situ tracking framework designed for automatic tracking of phytoplankton. +PhyTracker overcomes significant challenges unique to phytoplankton monitoring, +such as constrained mobility within water flow, inconspicuous appearance, and +the presence of impurities. Our method incorporates three innovative modules: a +Texture-enhanced Feature Extraction (TFE) module, an Attention-enhanced +Temporal Association (ATA) module, and a Flow-agnostic Movement Refinement +(FMR) module. These modules enhance feature capture, differentiate between +phytoplankton and impurities, and refine movement characteristics, +respectively. Extensive experiments on the PMOT dataset validate the +superiority of PhyTracker in phytoplankton tracking, and additional tests on +the MOT dataset demonstrate its general applicability, outperforming +conventional tracking methods. This work highlights key differences between +phytoplankton and traditional objects, offering an effective solution for +phytoplankton monitoring. + +
+
+ comment: 13 pages, eleven figures +
+
+
+
+
+ + ♻ ☆ Kwai-STaR: Transform LLMs into State-Transition Reasoners + + +
+ Mathematical reasoning presents a significant challenge to the cognitive +capabilities of LLMs. Various methods have been proposed to enhance the +mathematical ability of LLMs. However, few recognize the value of state +transition for LLM reasoning. In this work, we define mathematical +problem-solving as a process of transiting from an initial unsolved state to +the final resolved state, and propose Kwai-STaR framework, which transforms +LLMs into State-Transition Reasoners to improve their intuitive reasoning +capabilities. Our approach comprises three main steps: (1) Define the state +space tailored to the mathematical reasoning. (2) Generate state-transition +data based on the state space. (3) Convert original LLMs into State-Transition +Reasoners via a curricular training strategy. Our experiments validate the +effectiveness of Kwai-STaR in enhancing mathematical reasoning: After training +on the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and +LLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard +dataset. Additionally, the state transition-based design endows Kwai-STaR with +remarkable training and inference efficiency. Further experiments are underway +to establish the generality of Kwai-STaR. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ LibEER: A Comprehensive Benchmark and Algorithm Library for EEG-based + Emotion Recognition + + +
+ EEG-based emotion recognition (EER) has gained significant attention due to +its potential for understanding and analyzing human emotions. While recent +advancements in deep learning techniques have substantially improved EER, the +field lacks a convincing benchmark and comprehensive open-source libraries. +This absence complicates fair comparisons between models and creates +reproducibility challenges for practitioners, which collectively hinder +progress. To address these issues, we introduce LibEER, a comprehensive +benchmark and algorithm library designed to facilitate fair comparisons in EER. +LibEER carefully selects popular and powerful baselines, harmonizes key +implementation details across methods, and provides a standardized codebase in +PyTorch. By offering a consistent evaluation framework with standardized +experimental settings, LibEER enables unbiased assessments of over ten +representative deep learning models for EER across the four most widely used +datasets. Additionally, we conduct a thorough, reproducible comparison of model +performance and efficiency, providing valuable insights to guide researchers in +the selection and design of EER models. Moreover, we make observations and +in-depth analysis on the experiment results and identify current challenges in +this community. We hope that our work will not only lower entry barriers for +newcomers to EEG-based emotion recognition but also contribute to the +standardization of research in this domain, fostering steady development. The +library and source code are publicly available at +https://github.com/XJTU-EEG/LibEER. + +
+
+
+
+
+ + ♻ ☆ Gaussian Process Emulators for Few-Shot Segmentation in Cardiac MRI + + +
+ Segmentation of cardiac magnetic resonance images (MRI) is crucial for the +analysis and assessment of cardiac function, helping to diagnose and treat +various cardiovascular diseases. Most recent techniques rely on deep learning +and usually require an extensive amount of labeled data. To overcome this +problem, few-shot learning has the capability of reducing data dependency on +labeled data. In this work, we introduce a new method that merges few-shot +learning with a U-Net architecture and Gaussian Process Emulators (GPEs), +enhancing data integration from a support set for improved performance. GPEs +are trained to learn the relation between the support images and the +corresponding masks in latent space, facilitating the segmentation of unseen +query images given only a small labeled support set at inference. We test our +model with the M&Ms-2 public dataset to assess its ability to segment the heart +in cardiac magnetic resonance imaging from different orientations, and compare +it with state-of-the-art unsupervised and few-shot methods. Our architecture +shows higher DICE coefficients compared to these methods, especially in the +more challenging setups where the size of the support set is considerably +small. + +
+
+ comment: Accepted at Statistical Atlases and Computational Modeling of the + Heart (STACOM) Workshop 2024 +
+
+
+
+
+ + ♻ ☆ OCMDP: Observation-Constrained Markov Decision Process + + +
+ In many practical applications, decision-making processes must balance the +costs of acquiring information with the benefits it provides. Traditional +control systems often assume full observability, an unrealistic assumption when +observations are expensive. We tackle the challenge of simultaneously learning +observation and control strategies in such cost-sensitive environments by +introducing the Observation-Constrained Markov Decision Process (OCMDP), where +the policy influences the observability of the true state. To manage the +complexity arising from the combined observation and control actions, we +develop an iterative, model-free deep reinforcement learning algorithm that +separates the sensing and control components of the policy. This decomposition +enables efficient learning in the expanded action space by focusing on when and +what to observe, as well as determining optimal control actions, without +requiring knowledge of the environment's dynamics. We validate our approach on +a simulated diagnostic task and a realistic healthcare environment using +HeartPole. Given both scenarios, the experimental results demonstrate that our +model achieves a substantial reduction in observation costs on average, +significantly outperforming baseline methods by a notable margin in efficiency. + +
+
+ comment: Full paper, 14 Pages +
+
+
+
+
+ + ♻ ☆ LLMs for Generating and Evaluating Counterfactuals: A Comprehensive + Study EMNLP + + +
+ As NLP models become more complex, understanding their decisions becomes more +crucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's +prediction, offer a way to explain these models. While Large Language Models +(LLMs) have shown remarkable performance in NLP tasks, their efficacy in +generating high-quality CFs remains uncertain. This work fills this gap by +investigating how well LLMs generate CFs for two NLU tasks. We conduct a +comprehensive comparison of several common LLMs, and evaluate their CFs, +assessing both intrinsic metrics, and the impact of these CFs on data +augmentation. Moreover, we analyze differences between human and LLM-generated +CFs, providing insights for future research directions. Our results show that +LLMs generate fluent CFs, but struggle to keep the induced changes minimal. +Generating CFs for Sentiment Analysis (SA) is less challenging than NLI where +LLMs show weaknesses in generating CFs that flip the original label. This also +reflects on the data augmentation performance, where we observe a large gap +between augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs' +ability to assess CFs in a mislabelled data setting, and show that they have a +strong bias towards agreeing with the provided labels. GPT4 is more robust +against this bias and its scores correlate well with automatic metrics. Our +findings reveal several limitations and point to potential future work +directions. + +
+
+ comment: Accepted to EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on + Supervised Regression (Preprint) + + +
+ In this tutorial, we present a compact and holistic discussion of Deep +Learning with a focus on Convolutional Neural Networks (CNNs) and supervised +regression. While there are numerous books and articles on the individual +topics we cover, comprehensive and detailed tutorials that address Deep +Learning from a foundational yet rigorous and accessible perspective are rare. +Most resources on CNNs are either too advanced, focusing on cutting-edge +architectures, or too narrow, addressing only specific applications like image +classification. This tutorial not only summarizes the most relevant concepts but +also provides an in-depth exploration of each, offering a complete yet agile +set of ideas. Moreover, we highlight the powerful synergy between learning +theory, statistics, and machine learning, which together underpin the Deep +Learning and CNN frameworks. We aim for this tutorial to serve as an optimal +resource for students, professors, and anyone interested in understanding the +foundations of Deep Learning. Upon acceptance we will provide an accompanying +repository under +https://github.com/neoglez/deep-learning-tutorial. + Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine +Learning. + +
+
+ comment: Submitted to the journal Machine Learning and Knowledge Extraction +
+
+
+
+
+ + ♻ ☆ RLHF Workflow: From Reward Modeling to Online RLHF + + +
+ We present the workflow of Online Iterative Reinforcement Learning from Human +Feedback (RLHF) in this technical report, which is widely reported to +outperform its offline counterpart by a large margin in the recent large +language model (LLM) literature. However, existing open-source RLHF projects +are still largely confined to the offline learning setting. In this technical +report, we aim to fill in this gap and provide a detailed recipe that is easy +to reproduce for online iterative RLHF. In particular, since online human +feedback is usually infeasible for open-source communities with limited +resources, we start by constructing preference models using a diverse set of +open-source datasets and use the constructed proxy preference model to +approximate human feedback. Then, we discuss the theoretical insights and +algorithmic principles behind online iterative RLHF, followed by a detailed +practical implementation. Our trained LLM achieves impressive performance on +LLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as +well as other academic benchmarks such as HumanEval and TruthfulQA. We have +shown that supervised fine-tuning (SFT) and iterative RLHF can obtain +state-of-the-art performance with fully open-source datasets. Further, we have +made our models, curated datasets, and comprehensive step-by-step code +guidebooks publicly available. Please refer to +https://github.com/RLHFlow/RLHF-Reward-Modeling and +https://github.com/RLHFlow/Online-RLHF for more detailed information. + +
+
+ comment: Published in Transactions on Machine Learning Research (09/2024) +
+
+
+
+
+ + ♻ ☆ LeKUBE: A Legal Knowledge Update BEnchmark + + +
+ Recent advances in Large Language Models (LLMs) have significantly shaped the +applications of AI in multiple fields, including the studies of legal +intelligence. Trained on extensive legal texts, including statutes and legal +documents, the legal LLMs can capture important legal knowledge/concepts +effectively and provide important support for downstream legal applications +such as legal consultancy. Yet, the dynamic nature of legal statutes and +interpretations also poses new challenges to the use of LLMs in legal +applications. Particularly, how to update the legal knowledge of LLMs +effectively and efficiently has become an important research problem in +practice. Existing benchmarks for evaluating knowledge update methods are +mostly designed for the open domain and cannot address the specific challenges +of the legal domain, such as the nuanced application of new legal knowledge, +the complexity and lengthiness of legal regulations, and the intricate nature +of legal reasoning. To address this gap, we introduce the Legal Knowledge +Update BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for +legal LLMs across five dimensions. Specifically, we categorize the needs of +knowledge updates in the legal domain with the help of legal professionals, and +then hire annotators from law schools to create synthetic updates to the +Chinese Criminal and Civil Code as well as sets of questions of which the +answers would change after the updates. Through a comprehensive evaluation of +state-of-the-art knowledge update methods, we reveal a notable gap between +existing knowledge update methods and the unique needs of the legal domain, +emphasizing the need for further research and development of knowledge update +mechanisms tailored for legal LLMs. + +
+
+
+
+
+ + ♻ ☆ LiCoEval: Evaluating LLMs on License Compliance in Code Generation + + +
+ Recent advances in Large Language Models (LLMs) have revolutionized code +generation, leading to widespread adoption of AI coding tools by developers. +However, LLMs can generate license-protected code without providing the +necessary license information, leading to potential intellectual property +violations during software production. This paper addresses the critical, yet +underexplored, issue of license compliance in LLM-generated code by +establishing a benchmark to evaluate the ability of LLMs to provide accurate +license information for their generated code. To establish this benchmark, we +conduct an empirical study to identify a reasonable standard for "striking +similarity" that excludes the possibility of independent creation, indicating a +copy relationship between the LLM output and certain open-source code. Based on +this standard, we propose LiCoEval, to evaluate the license compliance +capabilities of LLMs, i.e., the ability to provide accurate license or +copyright information when they generate code with striking similarity to +already existing copyrighted code. Using LiCoEval, we evaluate 14 popular LLMs, +finding that even top-performing LLMs produce a non-negligible proportion +(0.88% to 2.01%) of code strikingly similar to existing open-source +implementations. Notably, most LLMs fail to provide accurate license +information, particularly for code under copyleft licenses. These findings +underscore the urgent need to enhance LLM compliance capabilities in code +generation tasks. Our study provides a foundation for future research and +development to improve license compliance in AI-assisted software development, +contributing to both the protection of open-source software copyrights and the +mitigation of legal risks for LLM users. + +
+
+ comment: The 47th International Conference on Software Engineering(ICSE 2025) +
+
+
+
+
+ + ♻ ☆ Smooth Sensitivity for Learning Differentially-Private yet Accurate Rule + Lists + + +
+ Differentially-private (DP) mechanisms can be embedded into the design of a +machine learning algorithm to protect the resulting model against privacy +leakage. However, this often comes with a significant loss of accuracy due to +the noise added to enforce DP. In this paper, we aim at improving this +trade-off for a popular class of machine learning algorithms leveraging the +Gini impurity as an information gain criterion to greedily build interpretable +models such as decision trees or rule lists. To this end, we establish the +smooth sensitivity of the Gini impurity, which can be used to obtain thorough +DP guarantees while adding noise scaled with tighter magnitude. We illustrate +the applicability of this mechanism by integrating it within a greedy algorithm +producing rule list models, motivated by the fact that such models remain +understudied in the DP literature. Our theoretical analysis and experimental +results confirm that the DP rule list models integrating smooth sensitivity +have higher accuracy than those using other DP frameworks based on global +sensitivity, for identical privacy budgets. + +
+
+
+
+
+ + ♻ ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science NeurIPS + 2024 + + +
+ Recently, there has been a significant upsurge of interest in leveraging +large language models (LLMs) to assist scientific discovery. However, most LLMs +only focus on general science, while they lack domain-specific knowledge, such +as chemical molecules and amino acid sequences. To bridge these gaps, we +introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and +is able to conduct college-level scientific reasoning and understand molecules +and amino acid sequences. We collect a large-scale training corpus containing +numerous scientific papers and books from different disciplines as well as data +from domain-specific databases. We further fine-tune the pre-trained model on +lots of instruction data to improve performances on downstream benchmarks. From +experiment results, we show that SciDFM achieves strong performance on general +scientific benchmarks such as SciEval and SciQ, and it reaches a SOTA +performance on domain-specific benchmarks among models of similar size. We +further analyze the expert layers and show that the results of expert selection +vary with data from different disciplines. To benefit the broader research +community, we open-source SciDFM at +https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0. + +
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS + 2024 Workshop FM4Science +
+
+
+
+
+ + ♻ ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? EMNLP 2024 + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Deception Detection from Linguistic and Physiological Data Streams Using + Bimodal Convolutional Neural Networks + + +
+ Deception detection is gaining increasing interest due to ethical and +security concerns. This paper explores the application of convolutional neural +networks for the purpose of multimodal deception detection. We use a dataset +built by interviewing 104 subjects about two topics, with one truthful and one +falsified response from each subject about each topic. In particular, we make +three main contributions. First, we extract linguistic and physiological +features from this data to train and construct the neural network models. +Second, we propose a fused convolutional neural network model using both +modalities in order to achieve an improved overall performance. Third, we +compare our new approach with earlier methods designed for multimodal deception +detection. We find that our system outperforms regular classification methods; +our results indicate the feasibility of using neural networks for deception +detection even in the presence of limited amounts of data. + +
+
+ comment: Accepted by 2024 5th International Conference on Information Science, + Parallel and Distributed Systems +
+
+
+
+
+ + ♻ ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ Add-it: Training-Free Object Insertion in Images With Pretrained + Diffusion Models + + +
+ Adding objects into images based on text instructions is a challenging task in +semantic image editing, requiring a balance between preserving the original +scene and seamlessly integrating the new object in a fitting location. Despite +extensive efforts, existing models often struggle with this balance, +particularly with finding a natural location for adding an object in complex +scenes. We introduce Add-it, a training-free approach that extends diffusion +models' attention mechanisms to incorporate information from three key sources: +the scene image, the text prompt, and the generated image itself. Our weighted +extended-attention mechanism maintains structural consistency and fine details +while ensuring natural object placement. Without task-specific fine-tuning, +Add-it achieves state-of-the-art results on both real and generated image +insertion benchmarks, including our newly constructed "Additing Affordance +Benchmark" for evaluating object placement plausibility, outperforming +supervised methods. Human evaluations show that Add-it is preferred in over 80% +of cases, and it also demonstrates improvements in various automated metrics. + +
+
+ comment: Project page is at https://research.nvidia.com/labs/par/addit/ +
+
+
+
+
+ + ♻ ☆ A Comparative Study on Enhancing Prediction in Social Network + Advertisement through Data Augmentation + + +
+ In the ever-evolving landscape of social network advertising, the volume and +accuracy of data play a critical role in the performance of predictive models. +However, the development of robust predictive algorithms is often hampered by +the limited size and potential bias present in real-world datasets. This study +presents and explores a generative augmentation framework of social network +advertising data. Our framework explores three generative models for data +augmentation - Generative Adversarial Networks (GANs), Variational Autoencoders +(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and +diversity in the context of social network advertising analytics effectiveness. +By performing synthetic extensions of the feature space, we find that through +data augmentation, the performance of various classifiers has been +quantitatively improved. Furthermore, we compare the relative performance gains +brought by each data augmentation technique, providing insights for +practitioners to select appropriate techniques to enhance model performance. +This paper contributes to the literature by showing that synthetic data +augmentation alleviates the limitations imposed by small or imbalanced datasets +in the field of social network advertising. At the same time, this article also +provides a comparative perspective on the practicality of different data +augmentation methods, thereby guiding practitioners to choose appropriate +techniques to enhance model performance. + +
+
+ comment: Accepted by 2024 4th International Conference on Machine Learning and + Intelligent Systems Engineering (MLISE) +
+
+
+
+
+ + ♻ ☆ Time Series Modeling for Heart Rate Prediction: From ARIMA to + Transformers + + +
+ Cardiovascular disease (CVD) is a leading cause of death globally, +necessitating precise forecasting models for monitoring vital signs like heart +rate, blood pressure, and ECG. Traditional models, such as ARIMA and Prophet, +are limited by their need for manual parameter tuning and challenges in +handling noisy, sparse, and highly variable medical data. This study +investigates advanced deep learning models, including LSTM, and +transformer-based architectures, for predicting heart rate time series from the +MIT-BIH Database. Results demonstrate that deep learning models, particularly +PatchTST, significantly outperform traditional models across multiple metrics, +capturing complex patterns and dependencies more effectively. This research +underscores the potential of deep learning to enhance patient monitoring and +CVD management, suggesting substantial clinical benefits. Future work should +extend these findings to larger, more diverse datasets and real-world clinical +applications to further validate and optimize model performance. + +
+
+ comment: Accepted by 2024 6th International Conference on Electronic + Engineering and Informatics +
+
+
+
+
+ + ♻ ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +comparable results with GAN due to better ability on complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Graph Agent Network: Empowering Nodes with Inference Capabilities for + Adversarial Resilience + + +
+ End-to-end training with global optimization has popularized graph neural +networks (GNNs) for node classification, yet inadvertently introduced +vulnerabilities to adversarial edge-perturbing attacks. Adversaries can exploit +the inherent opened interfaces of GNNs' input and output, perturbing critical +edges and thus manipulating the classification results. Current defenses, due +to their persistent utilization of global-optimization-based end-to-end +training schemes, inherently encapsulate the vulnerabilities of GNNs. This is +specifically evidenced in their inability to defend against targeted secondary +attacks. In this paper, we propose the Graph Agent Network (GAgN) to address +the aforementioned vulnerabilities of GNNs. GAgN is a graph-structured agent +network in which each node is designed as a 1-hop-view agent. Through the +decentralized interactions between agents, they can learn to infer global +perceptions to perform tasks including inferring embeddings, degrees and +neighbor relationships for given nodes. This empowers nodes to filter +adversarial edges while carrying out classification tasks. Furthermore, agents' +limited view prevents malicious messages from propagating globally in GAgN, +thereby resisting global-optimization-based secondary attacks. We prove that +single-hidden-layer multilayer perceptrons (MLPs) are theoretically sufficient +to achieve these functionalities. Experimental results show that GAgN +effectively implements all its intended capabilities and, compared to +state-of-the-art defenses, achieves optimal classification accuracy on the +perturbed datasets. + +
+
+
+
+
+ + ♻ ☆ Asynchronous Voice Anonymization Using Adversarial Perturbation On + Speaker Embedding + + +
+ Voice anonymization has been developed as a technique for preserving privacy +by replacing the speaker's voice in a speech signal with that of a +pseudo-speaker, thereby obscuring the original voice attributes from machine +recognition and human perception. In this paper, we focus on altering the voice +attributes against machine recognition while retaining human perception. We +refer to this as asynchronous voice anonymization. To this end, a speech +generation framework incorporating a speaker disentanglement mechanism is +employed to generate the anonymized speech. The speaker attributes are altered +through adversarial perturbation applied on the speaker embedding, while human +perception is preserved by controlling the intensity of perturbation. +Experiments conducted on the LibriSpeech dataset showed that the speaker +attributes were obscured with their human perception preserved for 60.71% of +the processed utterances. + +
+
+ comment: accepted by Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ City Foundation Models for Learning General Purpose Representations from + OpenStreetMap + + +
+ Pre-trained Foundation Models (PFMs) have ushered in a paradigm-shift in +Artificial Intelligence, due to their ability to learn general-purpose +representations that can be readily employed in a wide range of downstream +tasks. While PFMs have been successfully adopted in various fields such as +Natural Language Processing and Computer Vision, their capacity in handling +geospatial data and answering urban questions remains limited. This can be +attributed to the intrinsic heterogeneity of geospatial data, which encompasses +different data types, including points, segments and regions, as well as +multiple information modalities, such as a spatial position, visual +characteristics and textual annotations. The proliferation of Volunteered +Geographic Information initiatives, and the ever-increasing availability of +open geospatial data sources, like OpenStreetMap, which is freely accessible +globally, unveil a promising opportunity to bridge this gap. In this paper, we +present CityFM, a self-supervised framework to train a foundation model within +a selected geographical area of interest, such as a city. CityFM relies solely +on open data from OSM, and produces multimodal representations of entities of +different types, incorporating spatial, visual, and textual information. We +analyse the entity representations generated using our foundation models from a +qualitative perspective, and conduct quantitative experiments on road, +building, and region-level downstream tasks. We compare its results to +algorithms tailored specifically for the respective applications. In all the +experiments, CityFM achieves performance superior to, or on par with, the +baselines. + +
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Identifying Backdoored Graphs in Graph Neural Network Training: An + Explanation-Based Approach with Novel Metrics + + +
+ Graph Neural Networks (GNNs) have gained popularity in numerous domains, yet +they are vulnerable to backdoor attacks that can compromise their performance +and ethical application. The detection of these attacks is crucial for +maintaining the reliability and security of GNN classification tasks, but +effective detection techniques are lacking. Recognizing the challenge in +detecting such intrusions, we devised a novel detection method that creatively +leverages graph-level explanations. By extracting and transforming secondary +outputs from GNN explanation mechanisms, we developed seven innovative metrics +for effective detection of backdoor attacks on GNNs. Additionally, we develop +an adaptive attack to rigorously evaluate our approach. We test our method on +multiple benchmark datasets and examine its efficacy against various attack +models. Our results show that our method can achieve high detection +performance, marking a significant advancement in safeguarding GNNs against +backdoor attacks. + +
+
+
+
+
+ + ♻ ☆ Game-theoretic LLM: Agent Workflow for Negotiation Games + + +
+ This paper investigates the rationality of large language models (LLMs) in +strategic decision-making contexts, specifically within the framework of game +theory. We evaluate several state-of-the-art LLMs across a spectrum of +complete-information and incomplete-information games. Our findings reveal that +LLMs frequently deviate from rational strategies, particularly as the +complexity of the game increases with larger payoff matrices or deeper +sequential trees. + To address these limitations, we design multiple game-theoretic workflows +that guide the reasoning and decision-making processes of LLMs. These workflows +aim to enhance the models' ability to compute Nash Equilibria and make rational +choices, even under conditions of uncertainty and incomplete information. +Experimental results demonstrate that the adoption of these workflows +significantly improves the rationality and robustness of LLMs in game-theoretic +tasks. Specifically, with the workflow, LLMs exhibit marked improvements in +identifying optimal strategies, achieving near-optimal allocations in +negotiation scenarios, and reducing susceptibility to exploitation during +negotiations. Furthermore, we explore the meta-strategic considerations of +whether it is rational for agents to adopt such workflows, recognizing that the +decision to use or forgo the workflow constitutes a game-theoretic issue in +itself. + Our research contributes to a deeper understanding of LLMs' decision-making +capabilities in strategic contexts and provides insights into enhancing their +rationality through structured workflows. The findings have implications for +the development of more robust and strategically sound AI agents capable of +navigating complex interactive environments. Code and data supporting this +study are available at https://github.com/Wenyueh/game_theory. + +
+
+ comment: 45 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Cross-Domain Transfer Learning using Attention Latent Features for + Multi-Agent Trajectory Prediction + + +
+ With the advancements of sensor hardware, traffic infrastructure and deep +learning architectures, trajectory prediction of vehicles has established a +solid foundation in intelligent transportation systems. However, existing +solutions are often tailored to specific traffic networks at particular time +periods. Consequently, deep learning models trained on one network may struggle +to generalize effectively to unseen networks. To address this, we proposed a +novel spatial-temporal trajectory prediction framework that performs +cross-domain adaption on the attention representation of a Transformer-based +model. A graph convolutional network is also integrated to construct dynamic +graph feature embeddings that accurately model the complex spatial-temporal +interactions between the multi-agent vehicles across multiple traffic domains. +The proposed framework is validated on two case studies involving the +cross-city and cross-period settings. Experimental results show that our +proposed framework achieves superior trajectory prediction and domain +adaptation performances over the state-of-the-art models. + +
+
+ comment: Accepted at the IEEE International Conference on Systems, Man, and + Cybernetics 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Adaptive Optimization for Effective Sentiment Analysis + Fine-Tuning on Large Language Models + + +
+ Sentiment analysis plays a crucial role in various domains, such as business +intelligence and financial forecasting. Large language models (LLMs) have +become a popular paradigm for sentiment analysis, leveraging multi-task +learning to address specific tasks concurrently. However, LLMs with fine-tuning +for sentiment analysis often underperform due to the inherent challenges in +managing diverse task complexities. Moreover, constant-weight approaches in +multi-task learning struggle to adapt to variations in data characteristics, +further complicating model effectiveness. To address these issues, we propose a +novel multi-task learning framework with a dynamic adaptive optimization (DAO) +module. This module is designed as a plug-and-play component that can be +seamlessly integrated into existing models, providing an effective and flexible +solution for multi-task learning. The key component of the DAO module is +dynamic adaptive loss, which dynamically adjusts the weights assigned to +different tasks based on their relative importance and data characteristics +during training. Sentiment analyses on a standard and customized financial text +dataset demonstrate that the proposed framework achieves superior performance. +Specifically, this work improves the Mean Squared Error (MSE) and Accuracy +(ACC) by 15.58% and 1.24%, respectively, compared with previous work. + +
+
+
+
+
+ + ♻ ☆ MicroScopiQ: Accelerating Foundational Models through Outlier-Aware + Microscaling Quantization + + +
+ Quantization of foundational models (FMs) is significantly more challenging +than traditional DNNs due to the emergence of large magnitude features called +outliers. Existing outlier-aware algorithm/architecture co-design techniques +either use mixed-precision, retaining outliers at high precision but compromise +hardware efficiency, or quantize inliers and outliers at the same precision, +improving hardware efficiency at the cost of accuracy. To address this mutual +exclusivity, in this paper, we propose MicroScopiQ, a novel co-design technique +that leverages pruning to complement outlier-aware quantization. MicroScopiQ +retains outliers at higher precision while pruning a certain fraction of least +important weights to distribute the additional outlier bits; ensuring high +accuracy, aligned memory and hardware efficiency. We design a high-throughput, +low overhead accelerator architecture composed of simple multi-precision INT +processing elements and a novel network-on-chip called ReCoN that efficiently +abstracts the complexity of supporting high-precision outliers. Additionally, +unlike existing alternatives, MicroScopiQ does not assume any locality of +outlier weights, enabling applicability to a broad range of FMs. Extensive +experiments across various quantization settings show that MicroScopiQ achieves +SoTA quantization performance while simultaneously improving inference +performance by 3x and reducing energy by 2x over existing alternatives. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for + Robotic Manipulation + + +
+ Representing robotic manipulation tasks as constraints that associate the +robot and the environment is a promising way to encode desired robot behaviors. +However, it remains unclear how to formulate the constraints such that they are +1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable +by off-the-shelf solvers to produce robot actions in real-time. In this work, +we introduce Relational Keypoint Constraints (ReKep), a visually-grounded +representation for constraints in robotic manipulation. Specifically, ReKep is +expressed as Python functions mapping a set of 3D keypoints in the environment +to a numerical cost. We demonstrate that by representing a manipulation task as +a sequence of Relational Keypoint Constraints, we can employ a hierarchical +optimization procedure to solve for robot actions (represented by a sequence of +end-effector poses in SE(3)) with a perception-action loop at a real-time +frequency. Furthermore, in order to circumvent the need for manual +specification of ReKep for each new task, we devise an automated procedure that +leverages large vision models and vision-language models to produce ReKep from +free-form language instructions and RGB-D observations. We present system +implementations on a wheeled single-arm platform and a stationary dual-arm +platform that can perform a large variety of manipulation tasks, featuring +multi-stage, in-the-wild, bimanual, and reactive behaviors, all without +task-specific data or environment models. Website at +https://rekep-robot.github.io/. + +
+
+
+
+
+ + ♻ ☆ MIRAGE: Multimodal Identification and Recognition of Annotations in + Indian General Prescriptions + + +
+ Hospitals in India still rely on handwritten medical records despite the +availability of Electronic Medical Records (EMR), complicating statistical +analysis and record retrieval. Handwritten records pose a unique challenge, +requiring specialized data for training models to recognize medications and +their recommendation patterns. While traditional handwriting recognition +approaches employ 2-D LSTMs, recent studies have explored using Multimodal +Large Language Models (MLLMs) for OCR tasks. Building on this approach, we +focus on extracting medication names and dosages from simulated medical +records. Our methodology MIRAGE (Multimodal Identification and Recognition of +Annotations in indian GEneral prescriptions) involves fine-tuning the QWEN VL, +LLaVA 1.6 and Idefics2 models on 743,118 high resolution simulated medical +record images-fully annotated from 1,133 doctors across India. Our approach +achieves 82% accuracy in extracting medication names and dosages. + +
+
+ comment: 5 pages, 9 figures, 3 tables, submitted to ISBI 2025 +
+
+
+
+
+ + ♻ ☆ On Active Privacy Auditing in Supervised Fine-tuning for White-Box + Language Models + + +
+ The pretraining and fine-tuning approach has become the leading technique for +various NLP applications. However, recent studies reveal that fine-tuning data, +due to their sensitive nature, domain-specific characteristics, and +identifiability, pose significant privacy concerns. To help develop more +privacy-resilient fine-tuning models, we introduce a novel active privacy +auditing framework, dubbed Parsing, designed to identify and quantify privacy +leakage risks during the supervised fine-tuning (SFT) of language models (LMs). +The framework leverages improved white-box membership inference attacks (MIAs) +as the core technology, utilizing novel learning objectives and a two-stage +pipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the +exposure of privacy risks. Additionally, we have improved the effectiveness of +MIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our +research aims to provide the SFT community of LMs with a reliable, ready-to-use +privacy auditing tool, and to offer valuable insights into safeguarding privacy +during the fine-tuning process. Experimental results confirm the framework's +efficiency across various models and tasks, emphasizing notable privacy +concerns in the fine-tuning process. Project code is available at +https://anonymous.4open.science/r/PARSING-4817/. + +
+
+
+
+
+ + ♻ ☆ Stronger Models are NOT Stronger Teachers for Instruction Tuning + + +
+ Instruction tuning has been widely adopted to ensure large language models +(LLMs) follow user instructions effectively. The resulting +instruction-following capabilities of LLMs heavily rely on the instruction +datasets used for tuning. Recently, synthetic instruction datasets have emerged +as an economically viable solution to provide LLMs diverse and high-quality +instructions. However, existing approaches typically assume that larger or +stronger models are stronger teachers for instruction tuning, and hence simply +adopt these models as response generators to the synthetic instructions. In +this paper, we challenge this commonly-adopted assumption. Our extensive +experiments across five base models and twenty response generators reveal that +larger and stronger models are not necessarily stronger teachers of smaller +models. We refer to this phenomenon as the Larger Models' Paradox. We observe +that existing metrics cannot precisely predict the effectiveness of response +generators since they ignore the compatibility between teachers and base models +being fine-tuned. We thus develop a novel metric, named as +Compatibility-Adjusted Reward (CAR) to measure the effectiveness of response +generators. Our experiments across five base models demonstrate that CAR +outperforms almost all baselines. + +
+
+
+
+
+ + ♻ ☆ Into the Unknown: Self-Learning Large Language Models + + +
+ We address the main problem of self-learning LLM: the question of what to +learn. We propose a self-learning LLM framework that enables an LLM to +independently learn previously unknown knowledge through self-assessment of +their own hallucinations. We introduce a concept called Point in the Unknown +(PiU) to identify atomic knowledge unknown to a model, along with four methods +for automatic PiUs identification, facilitating the creation of a self-learning +loop that focuses exclusively on the absorption of currently unknown knowledge +into the model. Additionally, we developed evaluation metrics to gauge an LLM's +self-learning capability. Our experiments revealed that LLMs with at least 3B +parameters that have undergone some instruction training would be able to +perform self-learning well. We further proved the effectiveness of +self-learning by comparing the performance of a model that has undergone +self-learning to a model that has not. Our self-learning concept allows more +efficient LLM updates and opens new perspectives for LLM knowledge exchange. + +
+
+ comment: Accepted to SENTIRE 2024 (ICDM Workshops): + https://sentic.net/sentire2024ferdinan.pdf +
+
+
+
+
+ + ♻ ☆ LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated + Image Detection CVPR 2024 + + +
+ The evolution of Diffusion Models has dramatically improved image generation +quality, making it increasingly difficult to differentiate between real and +generated images. This development, while impressive, also raises significant +privacy and security concerns. In response to this, we propose a novel Latent +REconstruction error guided feature REfinement method (LaRE^2) for detecting +the diffusion-generated images. We come up with the Latent Reconstruction Error +(LaRE), the first reconstruction-error based feature in the latent space for +generated image detection. LaRE surpasses existing methods in terms of feature +extraction efficiency while preserving crucial cues required to differentiate +between the real and the fake. To exploit LaRE, we propose an Error-Guided +feature REfinement module (EGRE), which can refine the image feature guided by +LaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an +align-then-refine mechanism, which effectively refines the image feature for +generated-image detection from both spatial and channel perspectives. Extensive +experiments on the large-scale GenImage benchmark demonstrate the superiority +of our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1% +average ACC/AP across 8 different image generators. LaRE also surpasses +existing methods in terms of feature extraction cost, delivering an impressive +speed enhancement of 8 times. Code is available. + +
+
+ comment: CVPR 2024. Code is available at https://github.com/luo3300612/LaRE +
+
+
+
+
+ + ♻ ☆ vTune: Verifiable Fine-Tuning for LLMs Through Backdooring + + +
+ As fine-tuning large language models (LLMs) becomes increasingly prevalent, +users often rely on third-party services with limited visibility into their +fine-tuning processes. This lack of transparency raises the question: how do +consumers verify that fine-tuning services are performed correctly? For +instance, a service provider could claim to fine-tune a model for each user, +yet simply send all users back the same base model. To address this issue, we +propose vTune, a simple method that uses a small number of backdoor data points +added to the training data to provide a statistical test for verifying that a +provider fine-tuned a custom model on a particular user's dataset. Unlike +existing works, vTune is able to scale to verification of fine-tuning on +state-of-the-art LLMs, and can be used both with open-source and closed-source +models. We test our approach across several model families and sizes as well as +across multiple instruction-tuning datasets, and find that the statistical test +is satisfied with p-values on the order of $\sim 10^{-40}$, with no negative +impact on downstream task performance. Further, we explore several attacks that +attempt to subvert vTune and demonstrate the method's robustness to these +attacks. + +
+
+
+
+
+ + ♻ ☆ The Inadequacy of Similarity-based Privacy Metrics: Privacy Attacks + against "Truly Anonymous" Synthetic Datasets + + +
+ Generative models producing synthetic data are meant to provide a +privacy-friendly approach to releasing data. However, their privacy guarantees +are only considered robust when models satisfy Differential Privacy (DP). Alas, +this is not a ubiquitous standard, as many leading companies (and, in fact, +research papers) use ad-hoc privacy metrics based on testing the statistical +similarity between synthetic and real data. In this paper, we examine the +privacy metrics used in real-world synthetic data deployments and demonstrate +their unreliability in several ways. First, we provide counter-examples where +severe privacy violations occur even if the privacy tests pass and instantiate +accurate membership and attribute inference attacks with minimal cost. We then +introduce ReconSyn, a reconstruction attack that generates multiple synthetic +datasets that are considered private by the metrics but actually leak +information unique to individual records. We show that ReconSyn recovers +78-100% of the outliers in the train data with only black-box access to a +single fitted generative model and the privacy metrics. In the process, we show +that applying DP only to the model does not mitigate this attack, as using +privacy metrics breaks the end-to-end DP pipeline. + +
+
+
+
+
+ + ♻ ☆ Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model + with Frozen LLM + + +
+ Rapidly developing large language models (LLMs) have brought tremendous +intelligent applications. GPT-4o's excellent duplex speech interaction ability +has recently brought impressive experience to users. Researchers have recently +proposed several multi-modal LLMs in this direction that can achieve +speech-to-speech dialogue. This paper proposes a novel speech-text multimodal +LLM architecture called Freeze-Omni. Our main contribution is that the speech +input and output modalities can be easily connected to a textual LLM while +keeping the LLM's parameters frozen throughout the training process. We +designed 3-stage training strategies both for the modeling of speech input and +output, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using +text-speech paired data (such as ASR and TTS data) and only 60,000 multi-round +text Q&A data on 8 GPUs. Moreover, we can effectively ensure that the +intelligence of the Freeze-Omni in the speech modality is at the same level +compared with that in the text modality of its backbone LLM, while the +end-to-end latency of the spoken response achieves a low level. In addition, we +also designed a method to achieve duplex dialogue ability through multi-task +training, making Freeze-Omni have a more natural style of dialogue ability +between the users. Freeze-Omni mainly provides a possibility for researchers to +conduct multimodal LLM under the condition of a frozen LLM, avoiding various +impacts caused by the catastrophic forgetting of LLM caused by fewer data and +training resources. + +
+
+ comment: Project Page: https://freeze-omni.github.io/ +
+
+
+
+
+ + ♻ ☆ Utilizing Graph Generation for Enhanced Domain Adaptive Object Detection + + +
+ The problem of Domain Adaptive Object Detection (DAOD) involves the +transfer of object detection models from labeled source domains to unannotated +target domains. Recent advancements in this field aim to address domain +discrepancies by aligning pixel-pairs across domains within a non-Euclidean +graphical space, thereby minimizing semantic distribution variance. Despite +their remarkable achievements, these methods often use coarse semantic +representations to model graphs, mainly due to ignoring non-informative +elements and failing to focus on precise semantic alignment. Additionally, the +generation of coarse graphs inherently introduces abnormal nodes, posing +challenges and potentially biasing domain adaptation outcomes. Consequently, we +propose a framework that utilizes graph generation to enhance the quality +of DAOD. Specifically, we introduce a Node Refinement module that +utilizes a memory bank to reconstruct noisy sampled nodes while applying +contrastive regularization to noisy features. To enhance semantic alignment, we +propose separating domain-specific styles from category invariance encoded +within graph covariances, which allows us to selectively remove domain-specific +styles while preserving category-invariant information, thus facilitating more +accurate semantic alignment across different domains. Furthermore, we propose a +Graph Optimization adaptor, leveraging variational inference to mitigate the +impact of abnormal nodes. Extensive experimentation across three adaptation +benchmarks validates that our framework achieves state-of-the-art performance in +the task of unsupervised domain adaptation. + +
+
+
+
+
+ + ♻ ☆ Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation + Extraction in Long Sentences + + +
+ Relation extraction, an important natural language processing (NLP) task, aims +to identify relations between named entities in text. Recently, graph +convolutional networks over dependency trees have been widely used to capture +syntactic features and achieved attractive performance. However, most existing +dependency-based approaches ignore the positive influence of the words outside +the dependency trees, sometimes conveying rich and useful information on +relation extraction. In this paper, we propose a novel model, Entity-aware +Self-attention Contextualized GCN (ESC-GCN), which efficiently incorporates +syntactic structure of input sentences and semantic context of sequences. To be +specific, relative position self-attention obtains the overall semantic +pairwise correlation related to word position, and contextualized graph +convolutional networks capture rich intra-sentence dependencies between words +through adequate pruning operations. Furthermore, an entity-aware attention layer +dynamically selects which token is more decisive to make final relation +prediction. In this way, our proposed model not only reduces the noisy impact +from dependency trees, but also obtains easily-ignored entity-related semantic +representation. Extensive experiments on various tasks demonstrate that our +model achieves encouraging performance as compared to existing dependency-based +and sequence-based models. In particular, our model excels in extracting relations +between entities of long sentences. + +
+
+
+
+
+ + ♻ ☆ Entity-Aware Biaffine Attention Model for Improved Constituent Parsing + with Reduced Entity Violations + + +
+ Constituency parsing involves analyzing a sentence by breaking it into +sub-phrases, or constituents. While many deep neural models have achieved +state-of-the-art performance in this task, they often overlook the +entity-violating issue, where an entity fails to form a complete sub-tree in +the resultant parsing tree. To address this, we propose an entity-aware +biaffine attention model for constituent parsing. This model incorporates +entity information into the biaffine attention mechanism by using additional +entity role vectors for potential phrases, which enhances the parsing accuracy. +We introduce a new metric, the Entity Violating Rate (EVR), to quantify the +extent of entity violations in parsing results. Experiments on three popular +datasets—ONTONOTES, PTB, and CTB—demonstrate that our model achieves the lowest +EVR while maintaining high precision, recall, and F1-scores comparable to +existing models. Further evaluation in downstream tasks, such as sentence +sentiment analysis, highlights the effectiveness of our model and the validity +of the proposed EVR metric. + +
+
+
+
+
+ + ♻ ☆ Neural Gaffer: Relighting Any Object via Diffusion + + +
+ Single-image relighting is a challenging task that involves reasoning about +the complex interplay between geometry, materials, and lighting. Many prior +methods either support only specific categories of images, such as portraits, +or require special capture conditions, like using a flashlight. Alternatively, +some methods explicitly decompose a scene into intrinsic components, such as +normals and BRDFs, which can be inaccurate or under-expressive. In this work, +we propose a novel end-to-end 2D relighting diffusion model, called Neural +Gaffer, that takes a single image of any object and can synthesize an accurate, +high-quality relit image under any novel environmental lighting condition, +simply by conditioning an image generator on a target environment map, without +an explicit scene decomposition. Our method builds on a pre-trained diffusion +model, and fine-tunes it on a synthetic relighting dataset, revealing and +harnessing the inherent understanding of lighting present in the diffusion +model. We evaluate our model on both synthetic and in-the-wild Internet imagery +and demonstrate its advantages in terms of generalization and accuracy. +Moreover, by combining with other generative methods, our model enables many +downstream 2D tasks, such as text-based relighting and object insertion. Our +model can also operate as a strong relighting prior for 3D tasks, such as +relighting a radiance field. + +
+
+ comment: Project Website: https://neural-gaffer.github.io +
+
+
+
+
+ + ♻ ☆ Wonderful Matrices: More Efficient and Effective Architecture for + Language Modeling Tasks + + +
+ We prove the availability of inner product form position encoding in the +state space dual algorithm and study the effectiveness of different position +embeddings in the hybrid quadratic causal self-attention and state space dual +algorithms. We propose inner function attention with dynamic mask, which can +improve the expressiveness of the attention algorithm and avoid the sequence +noise significantly affecting the accuracy of the attention score. We also +design cross domain mixture of experts, which can improve the granularity of +the sparse activation feedforward network while maintaining the efficiency of +parameter utilization and retrieval. The combination of these methods +constitutes our foundation model architecture: Wonderful Matrices. We conduct +experiments on the language modeling task and find that Wonderful Matrices are +more efficient and effective in handling complex language tasks. + +
+
+ comment: 28 pages, 8 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in + Variational AutoEncoder + + +
+ Symmetries of input and latent vectors have provided valuable insights for +disentanglement learning in VAEs. However, only a few works were proposed as an +unsupervised method, and even these works require known factor information in +the training data. We propose a novel method, Composite Factor-Aligned Symmetry +Learning (CFASL), which is integrated into VAEs for learning symmetry-based +disentanglement in unsupervised learning without any knowledge of the dataset +factor information. CFASL incorporates three novel features for learning +symmetry-based disentanglement: 1) Injecting inductive bias to align latent +vector dimensions to factor-aligned symmetries within an explicit learnable +symmetry code-book 2) Learning a composite symmetry to express unknown factors +change between two random samples by learning factor-aligned symmetries within +the codebook 3) Inducing a group equivariant encoder and decoder in training +VAEs with the two conditions. In addition, we propose an extended evaluation +metric for multi-factor changes in comparison to disentanglement evaluation in +VAEs. In quantitative and in-depth qualitative analysis, CFASL demonstrates a +significant improvement of disentanglement in single-factor change, and +multi-factor change conditions compared to state-of-the-art methods. + +
+
+ comment: Accepted in TMLR; 25 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Explaining Large Language Models Decisions Using Shapley Values + + +
+ The emergence of large language models (LLMs) has opened up exciting +possibilities for simulating human behavior and cognitive processes, with +potential applications in various domains, including marketing research and +consumer behavior analysis. However, the validity of utilizing LLMs as +stand-ins for human subjects remains uncertain due to glaring divergences that +suggest fundamentally different underlying processes at play and the +sensitivity of LLM responses to prompt variations. This paper presents a novel +approach based on Shapley values from cooperative game theory to interpret LLM +behavior and quantify the relative contribution of each prompt component to the +model's output. Through two applications - a discrete choice experiment and an +investigation of cognitive biases - we demonstrate how the Shapley value method +can uncover what we term "token noise" effects, a phenomenon where LLM +decisions are disproportionately influenced by tokens providing minimal +informative content. This phenomenon raises concerns about the robustness and +generalizability of insights obtained from LLMs in the context of human +behavior simulation. Our model-agnostic approach extends its utility to +proprietary LLMs, providing a valuable tool for practitioners and researchers +to strategically optimize prompts and mitigate apparent cognitive biases. Our +findings underscore the need for a more nuanced understanding of the factors +driving LLM responses before relying on them as substitutes for human subjects +in survey settings. We emphasize the importance of researchers reporting +results conditioned on specific prompt templates and exercising caution when +drawing parallels between human behavior and LLMs. + +
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4pi-1.23/) +
+
+
+
+
+ + ♻ ☆ TraceFL: Interpretability-Driven Debugging in Federated Learning via + Neuron Provenance + + +
+ In Federated Learning, clients train models on local data and send updates to +a central server, which aggregates them into a global model using a fusion +algorithm. This collaborative yet privacy-preserving training comes at a +cost--FL developers face significant challenges in attributing global model +predictions to specific clients. Localizing responsible clients is a crucial +step towards (a) excluding clients primarily responsible for incorrect +predictions and (b) encouraging clients who contributed high-quality models to +continue participating in the future. Existing ML explainability approaches are +inherently inapplicable as they are designed for single-model, centralized +training. + We introduce TraceFL, a fine-grained neuron provenance capturing mechanism +that identifies clients responsible for the global model's prediction by +tracking the flow of information from individual clients to the global model. +Since inference on different inputs activates a different set of neurons of the +global model, TraceFL dynamically quantifies the significance of the global +model's neurons in a given prediction. It then selectively picks a slice of the +most crucial neurons in the global model and maps them to the corresponding +neurons in every participating client to determine each client's contribution, +ultimately localizing the responsible client. We evaluate TraceFL on six +datasets, including two real-world medical imaging datasets and four neural +networks, including advanced models such as GPT. TraceFL achieves 99% accuracy +in localizing the responsible client in FL tasks spanning both image and text +classification tasks. At a time when state-of-the-art ML debugging approaches +are mostly domain-specific (e.g., image classification only), TraceFL is the +first technique to enable highly accurate automated reasoning across a wide +range of FL applications. + +
+
+ comment: Accepted at 2025 IEEE/ACM 47th International Conference on Software + Engineering (ICSE) +
+
+
+
+
+ + ♻ ☆ Algorithm Configuration for Structured Pfaffian Settings + + +
+ Data-driven algorithm design automatically adapts algorithms to specific +application domains, achieving better performance. In the context of +parameterized algorithms, this approach involves tuning the algorithm's +hyperparameters using problem instances drawn from the problem distribution of +the target application domain. This can be achieved by maximizing empirical +utilities that measure the algorithms' performance as a function of their +hyperparameters, using problem instances. While empirical evidence supports the +effectiveness of data-driven algorithm design, providing theoretical guarantees +for several parameterized families remains challenging. This is due to the +intricate behaviors of their corresponding utility functions, which typically +admit piecewise discontinuous structures. In this work, we present refined +frameworks for providing learning guarantees for parameterized data-driven +algorithm design problems in both distributional and online learning settings. +For the distributional learning setting, we introduce the Pfaffian GJ +framework, an extension of the classical GJ framework, that is +capable of providing learning guarantees for function classes for which the +computation involves Pfaffian functions. Unlike the GJ framework, which is +limited to function classes with computation characterized by rational +functions, our proposed framework can deal with function classes involving +Pfaffian functions, which are much more general and widely applicable. We then +show that for many parameterized algorithms of interest, their utility function +possesses a refined piecewise structure, which automatically +translates to learning guarantees using our proposed framework. + +
+
+
+
+
+ + ♻ ☆ Sing-On-Your-Beat: Simple Text-Controllable Accompaniment Generations + + +
+ Singing is one of the most cherished forms of human entertainment. However, +creating a beautiful song requires an accompaniment that complements the vocals +and aligns well with the song instruments and genre. With advancements in deep +learning, previous research has focused on generating suitable accompaniments +but often lacks precise alignment with the desired instrumentation and genre. +To address this, we propose a straightforward method that enables control over +the accompaniment through text prompts, allowing the generation of music that +complements the vocals and aligns with the song instrumental and genre +requirements. Through extensive experiments, we successfully generate 10-second +accompaniments using vocal input and text control. + +
+
+
+
+
+
+
+
+ + Computation and Language 86 + +
+
+
+ + ☆ Language Models as Causal Effect Generators + + +
+ We present a framework for large language model (LLM) based data generation +with controllable causal structure. In particular, we define a procedure for +turning any language model and any directed acyclic graph (DAG) into a +sequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM +is a causal model with user-defined structure and LLM-defined structural +equations. We characterize how an SD-SCM allows sampling from observational, +interventional, and counterfactual distributions according to the desired +causal structure. We then leverage this procedure to propose a new type of +benchmark for causal inference methods, generating individual-level +counterfactual data without needing to manually specify functional +relationships between variables. We create an example benchmark consisting of +thousands of datasets, and test a suite of popular estimation methods on these +datasets for average, conditional average, and individual treatment effect +estimation, both with and without hidden confounding. Apart from generating +data, the same procedure also allows us to test for the presence of a causal +effect that might be encoded in an LLM. This procedure can underpin auditing +LLMs for misinformation, discrimination, or otherwise undesirable behavior. We +believe SD-SCMs can serve as a useful tool in any application that would +benefit from sequential data with controllable causal structure. + +
+
+
+
+
+ + ☆ ExpressivityArena: Can LLMs Express Information Implicitly? + + +
+ While Large Language Models (LLMs) have demonstrated remarkable performance +in certain dimensions, their ability to express implicit language cues that +humans use for effective communication remains unclear. This paper presents +ExpressivityArena, a Python library for measuring the implicit communication +abilities of LLMs. We provide a comprehensive framework to evaluate +expressivity of arbitrary LLMs and explore its practical implications. To this +end, we refine the definition and measurements of ``expressivity,'' and use our +framework in a set of small experiments. These experiments test LLMs in +creative and logical tasks such as poetry, coding, and emotion-based responses. +They are then evaluated by an automated grader, through ExpressivityArena, +which we verify to be the most pragmatic for testing expressivity. Building on +these experiments, we deepen our understanding of the expressivity of LLMs by +assessing their ability to remain expressive in conversations. Our findings +indicate that LLMs are capable of generating and understanding expressive +content, however, with some limitations. These insights will inform the future +development and deployment of expressive LLMs. We provide the code for +ExpressivityArena alongside our paper. + +
+
+ comment: 8 pages, 22 figures +
+
+
+
+
+ + ☆ Can adversarial attacks by large language models be attributed? + + +
+ Attributing outputs from Large Language Models (LLMs) in adversarial +settings-such as cyberattacks and disinformation-presents significant +challenges that are likely to grow in importance. We investigate this +attribution problem using formal language theory, specifically language +identification in the limit as introduced by Gold and extended by Angluin. By +modeling LLM outputs as formal languages, we analyze whether finite text +samples can uniquely pinpoint the originating model. Our results show that due +to the non-identifiability of certain language classes, under some mild +assumptions about overlapping outputs from fine-tuned models it is +theoretically impossible to attribute outputs to specific LLMs with certainty. +This holds also when accounting for expressivity limitations of Transformer +architectures. Even with direct model access or comprehensive monitoring, +significant computational hurdles impede attribution efforts. These findings +highlight an urgent need for proactive measures to mitigate risks posed by +adversarial LLM use as their influence continues to expand. + +
+
+ comment: 7 pages, 1 figure +
+
+
+
+
+ + ☆ Derivational Morphology Reveals Analogical Generalization in Large + Language Models + + +
+ What mechanisms underlie linguistic generalization in large language models +(LLMs)? This question has attracted considerable attention, with most studies +analyzing the extent to which the language skills of LLMs resemble rules. As of +yet, it is not known whether linguistic generalization in LLMs could equally +well be explained as the result of analogical processes, which can be +formalized as similarity operations on stored exemplars. A key shortcoming of +prior research is its focus on linguistic phenomena with a high degree of +regularity, for which rule-based and analogical approaches make the same +predictions. Here, we instead examine derivational morphology, specifically +English adjective nominalization, which displays notable variability. We +introduce a new method for investigating linguistic generalization in LLMs: +focusing on GPT-J, we fit cognitive models that instantiate rule-based and +analogical learning to the LLM training data and compare their predictions on a +set of nonce adjectives with those of the LLM, allowing us to draw direct +conclusions regarding underlying mechanisms. As expected, rule-based and +analogical models explain the predictions of GPT-J equally well for adjectives +with regular nominalization patterns. However, for adjectives with variable +nominalization patterns, the analogical model provides a much better match. +Furthermore, GPT-J's behavior is sensitive to the individual word frequencies, +even for regular forms, a behavior that is consistent with an analogical +account of regular forms but not a rule-based one. These findings refute the +hypothesis that GPT-J's linguistic generalization on adjective nominalization +involves rules, suggesting similarity operations on stored exemplars as the +underlying mechanism. Overall, our study suggests that analogical processes +play a bigger role in the linguistic generalization of LLMs than previously +thought. + +
+
+
+
+
+ + ☆ JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified + Multimodal Understanding and Generation + + +
+ We present JanusFlow, a powerful framework that unifies image understanding +and generation in a single model. JanusFlow introduces a minimalist +architecture that integrates autoregressive language models with rectified +flow, a state-of-the-art method in generative modeling. Our key finding +demonstrates that rectified flow can be straightforwardly trained within the +large language model framework, eliminating the need for complex architectural +modifications. To further improve the performance of our unified model, we +adopt two key strategies: (i) decoupling the understanding and generation +encoders, and (ii) aligning their representations during unified training. +Extensive experiments show that JanusFlow achieves comparable or superior +performance to specialized models in their respective domains, while +significantly outperforming existing unified approaches across standard +benchmarks. This work represents a step toward more efficient and versatile +vision-language models. + +
+
+
+
+
+ + ☆ From General to Specific: Utilizing General Hallucation to Automatically + Measure the Role Relationship Fidelity for Specific Role-Play Agents + + +
+ The advanced role-playing capabilities of Large Language Models (LLMs) have +paved the way for developing Role-Playing Agents (RPAs). However, existing +benchmarks, such as HPD, which incorporates manually scored character +relationships into the context for LLMs to sort coherence, and SocialBench, +which uses specific profiles generated by LLMs in the context of +multiple-choice tasks to assess character preferences, face limitations like +poor generalizability, implicit and inaccurate judgments, and excessive context +length. To address the above issues, we propose an automatic, scalable, and +generalizable paradigm. Specifically, we construct a benchmark by extracting +relations from a general knowledge graph and leverage RPA's inherent +hallucination properties to prompt it to interact across roles, employing +ChatGPT for stance detection and defining relationship hallucination along with +three related metrics. Extensive experiments validate the effectiveness and +stability of our metrics. Our findings further explore factors influencing +these metrics and discuss the trade-off between relationship hallucination and +factuality. + +
+
+
+
+
+ + ☆ CryptoLLM: Unleashing the Power of Prompted LLMs for SmartQnA and + Classification of Crypto Posts + + +
+ The rapid growth of social media has resulted in a large volume of +user-generated content, particularly in niche domains such as cryptocurrency. +This task focuses on developing robust classification models to accurately +categorize cryptocurrency-related social media posts into predefined classes, +including but not limited to objective, positive, negative, etc. Additionally, +the task requires participants to identify the most relevant answers from a set +of posts in response to specific questions. By leveraging advanced LLMs, this +research aims to enhance the understanding and filtering of cryptocurrency +discourse, thereby facilitating more informed decision-making in this volatile +sector. We have used a prompt-based technique to solve the classification task +for Reddit posts and Twitter posts. Also, we have used a 64-shot technique along +with prompts on the GPT-4-Turbo model to determine whether an answer is relevant to +a question or not. + +
+
+ comment: Accepted at FIRE 2024 (Track: Opinion Extraction and Question + Answering from CryptoCurrency-Related Tweets and Reddit posts (CryptOQA)) +
+
+
+
+
+ + ☆ Mapping the Podcast Ecosystem with the Structured Podcast Research + Corpus + + +
+ Podcasts provide highly diverse content to a massive listener base through a +unique on-demand modality. However, limited data has prevented large-scale +computational analysis of the podcast ecosystem. To fill this gap, we introduce +a massive dataset of over 1.1M podcast transcripts that is largely +comprehensive of all English language podcasts available through public RSS +feeds from May and June of 2020. This data is not limited to text, but rather +includes audio features and speaker turns for a subset of 370K episodes, and +speaker role inferences and other metadata for all 1.1M episodes. Using this +data, we also conduct a foundational investigation into the content, structure, +and responsiveness of this ecosystem. Together, our data and analyses open the +door to continued computational research of this popular and impactful medium. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Trustful LLMs: Customizing and Grounding Text Generation with Knowledge + Bases and Dual Decoders + + +
+ Although people are impressed by the content generation skills of large +language models, the use of LLMs, such as ChatGPT, is limited by the domain +grounding of the content. The correctness and groundedness of the generated +content need to be based on a verified context, such as results from +Retrieval-Augmented Generation (RAG). One important issue when adapting LLMs to +a customized domain is that the generated responses are often incomplete, or +the additions are not verified and may even be hallucinated. Prior studies on +hallucination detection have focused on evaluation metrics, which are not +easily adaptable to dynamic domains and can be vulnerable to attacks like +jail-breaking. In this work, we propose 1) a post-processing algorithm that +leverages knowledge triplets in RAG context to correct hallucinations and 2) a +dual-decoder model that fuses RAG context to guide the generation process. + +
+
+
+
+
+ + ☆ Verbosity $\neq$ Veracity: Demystify Verbosity Compensation Behavior of + Large Language Models + + +
+ When unsure about an answer, humans often respond with more words than +necessary, hoping that part of the response will be correct. We observe a +similar behavior in large language models (LLMs), which we term "Verbosity +Compensation" (VC). VC is harmful because it confuses the user's understanding, +leading to low efficiency, and influences the LLM services by increasing the +latency and cost of generating useless tokens. In this paper, we present the +first work that defines and analyzes Verbosity Compensation, explores its +causes, and proposes a simple mitigating approach. We define Verbosity +Compensation as the behavior of generating responses that can be compressed +without information loss when prompted to write concisely. Our experiments, +conducted on five datasets of knowledge and reasoning-based QA tasks with 14 +newly developed LLMs, reveal three conclusions. 1) We reveal a pervasive +presence of verbosity compensation across all models and all datasets. Notably, +GPT-4 exhibits a VC frequency of 50.40%. 2) We reveal the large performance gap +between verbose and concise responses, with a notable difference of 27.61% on +the Qasper dataset. We also demonstrate that this difference does not naturally +diminish as LLM capability increases. Both 1) and 2) highlight the urgent need +to mitigate the frequency of VC behavior and disentangle verbosity from +veracity. We propose a simple yet effective cascade algorithm that replaces the +verbose responses with the other model-generated responses. The results show +that our approach effectively alleviates the VC of the Mistral model from +63.81% to 16.16% on the Qasper dataset. 3) We also find that verbose responses +exhibit higher uncertainty across all five datasets, suggesting a strong +connection between verbosity and model uncertainty. Our dataset and code are +available at https://github.com/psunlpgroup/VerbosityLLM. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Tucano: Advancing Neural Text Generation for Portuguese + + +
+ Significant advances have been made in natural language processing in recent +years. However, our current deep learning approach to language modeling +requires substantial resources in terms of data and computation. One of the +side effects of this data-hungry paradigm is the current schism between +languages, separating those considered high-resource, where most of the +development happens and resources are available, and the low-resource ones, +which struggle to attain the same level of performance and autonomy. This study +aims to introduce a new set of resources to stimulate the future development of +neural text generation in Portuguese. In this work, we document the development +of GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting +to 200 billion tokens. Via this corpus, we trained a series of +decoder-transformers named Tucano. Our models perform equal or superior to +other Portuguese and multilingual language models of similar size in several +Portuguese benchmarks. The evaluation of our models also reveals that model +performance on many currently available benchmarks used by the Portuguese NLP +community has little to no correlation with the scaling of token ingestion +during training, highlighting the limitations of such evaluations when it comes +to the assessment of Portuguese generative language models. All derivatives of +our study are openly released on GitHub and Hugging Face. See +https://nkluge-correa.github.io/Tucano/ + +
+
+
+
+
+ + ☆ IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems + + +
+ Adversarial examples, which are inputs deliberately perturbed with +imperceptible changes to induce model errors, have raised serious concerns for +the reliability and security of deep neural networks (DNNs). While adversarial +attacks have been extensively studied in continuous data domains such as +images, the discrete nature of text presents unique challenges. In this paper, +we propose Irony-based Adversarial Examples (IAE), a method that transforms +straightforward sentences into ironic ones to create adversarial text. This +approach exploits the rhetorical device of irony, where the intended meaning is +opposite to the literal interpretation, requiring a deeper understanding of +context to detect. The IAE method is particularly challenging due to the need +to accurately locate evaluation words, substitute them with appropriate +collocations, and expand the text with suitable ironic elements while +maintaining semantic coherence. Our research makes the following key +contributions: (1) We introduce IAE, a strategy for generating textual +adversarial examples using irony. This method does not rely on pre-existing +irony corpora, making it a versatile tool for creating adversarial text in +various NLP tasks. (2) We demonstrate that the performance of several +state-of-the-art deep learning models on sentiment analysis tasks significantly +deteriorates when subjected to IAE attacks. This finding underscores the +susceptibility of current NLP systems to adversarial manipulation through +irony. (3) We compare the impact of IAE on human judgment versus NLP systems, +revealing that humans are less susceptible to the effects of irony in text. + +
+
+
+
+
+ + ☆ Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics + Statements + + +
+ What ethical concerns, if any, do LLM researchers have? We introduce EthiCon, +a corpus of 1,580 ethical concern statements extracted from scientific papers +published in the ACL Anthology. We extract ethical concern keywords from the +statements and show promising results in automating the concern identification +process. Through a survey, we compare the ethical concerns of the corpus to the +concerns listed by the general public and professionals in the field. Finally, +we compare our retrieved ethical concerns with existing taxonomies pointing to +gaps and future research directions. + +
+
+
+
+
+ + ☆ Chain Association-based Attacking and Shielding Natural Language + Processing Systems + + +
+ Association is a gift that frees people from having to mention something in +completely straightforward words and allows others to understand what they +intend to refer to. In this paper, we propose a chain association-based +adversarial attack against natural language processing systems, utilizing the +comprehension gap between humans and machines. We first generate a chain +association graph for Chinese characters based on the association paradigm for +building search space of potential adversarial examples. Then, we introduce a +discrete particle swarm optimization algorithm to search for the optimal +adversarial examples. We conduct comprehensive experiments and show that +advanced natural language processing models and applications, including large +language models, are vulnerable to our attack, while humans appear good at +understanding the perturbed text. We also explore two methods, including +adversarial training and associative graph-based recovery, to shield systems +from chain association-based attack. Since a few examples use some +derogatory terms, this paper contains materials that may be offensive or +upsetting to some people. + +
+
+
+
+
+ + ☆ Query Optimization for Parametric Knowledge Refinement in + Retrieval-Augmented Large Language Models + + +
+ We introduce the \textit{Extract-Refine-Retrieve-Read} (ERRR) framework, a +novel approach designed to bridge the pre-retrieval information gap in +Retrieval-Augmented Generation (RAG) systems through query optimization +tailored to meet the specific knowledge requirements of Large Language Models +(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR +framework begins by extracting parametric knowledge from LLMs, followed by +using a specialized query optimizer for refining these queries. This process +ensures the retrieval of only the most pertinent information essential for +generating accurate responses. Moreover, to enhance flexibility and reduce +computational costs, we propose a trainable scheme for our pipeline that +utilizes a smaller, tunable model as the query optimizer, which is refined +through knowledge distillation from a larger teacher model. Our evaluations on +various question-answering (QA) datasets and with different retrieval systems +show that ERRR consistently outperforms existing baselines, proving to be a +versatile and cost-effective module for improving the utility and accuracy of +RAG systems. + +
+
+
+
+
+ + ☆ Likelihood as a Performance Gauge for Retrieval-Augmented Generation NAACL 2025 + + +
+ Recent work finds that retrieval-augmented generation with large language +models is prone to be influenced by the order of retrieved documents in the +context. However, the lack of in-depth analysis limits the use of this +phenomenon for prompt engineering in practice. In this study, we posit that +likelihoods serve as an effective gauge for language model performance. Through +experiments on two question-answering datasets with a variety of +state-of-the-art language models, we reveal correlations between answer +accuracy and the likelihood of the question at both the corpus level and the +instance level. In addition, we find that question likelihood can also indicate +the position of the task-relevant information in the context. Based on these +findings, we propose two methods that use question likelihood as a gauge for +selecting and constructing prompts that lead to better performance. We +demonstrate their effectiveness with experiments. In addition, our +likelihood-based methods are efficient, as they only need to compute the +likelihood of the input, requiring much fewer language model passes than +heuristic prompt engineering methods that require generating responses. Our +analysis deepens our understanding of how input prompts affect model +performance and provides a promising direction for efficient prompt +optimization. + +
+
+ comment: Under review at NAACL 2025. Code is available at + https://github.com/lyutyuh/poptimizer +
+
+
+
+
+ + ☆ Automatic Album Sequencing + + +
+ Album sequencing is a critical part of the album production process. +Recently, a data-driven approach was proposed that sequences general +collections of independent media by extracting the narrative essence of the +items in the collections. While this approach implies an album sequencing +technique, it is not widely accessible to a less technical audience, requiring +advanced knowledge of machine learning techniques to use. To address this, we +introduce a new user-friendly web-based tool that allows a less technical +audience to upload music tracks, execute this technique in one click, and +subsequently presents the result in a clean visualization to the user. To both +increase the number of templates available to the user and address shortcomings +of previous work, we also introduce a new direct transformer-based album +sequencing method. We find that our more direct method outperforms a random +baseline but does not reach the same performance as the narrative essence +approach. Both methods are included in our web-based user interface, and this +-- alongside a full copy of our implementation -- is publicly available at +https://github.com/dylanashley/automatic-album-sequencing + +
+
+ comment: presented as a late breaking demo in the 25th International Society + for Music Information Retrieval Conference; 3 pages in main text, 3 figures + in main text; source code available at + https://github.com/dylanashley/automatic-album-sequencing +
+
+
+
+
+ + ☆ Spider 2.0: Evaluating Language Models on Real-World Enterprise + Text-to-SQL Workflows + + +
+ Real-world enterprise text-to-SQL workflows often involve complex cloud or +local data across various database systems, multiple SQL queries in various +dialects, and diverse operations from data transformation to analytics. We +introduce Spider 2.0, an evaluation framework comprising 632 real-world +text-to-SQL workflow problems derived from enterprise-level database use cases. +The databases in Spider 2.0 are sourced from real data applications, often +containing over 1,000 columns and stored in local or cloud database systems +such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 +frequently requires understanding and searching through database metadata, +dialect documentation, and even project-level codebases. This challenge calls +for models to interact with complex SQL workflow environments, process +extremely long contexts, perform intricate reasoning, and generate multiple SQL +queries with diverse operations, often exceeding 100 lines, which goes far +beyond traditional text-to-SQL challenges. Our evaluations indicate that based +on o1-preview, our code agent framework successfully solves only 17.0% of the +tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on +Spider 2.0 show that while language models have demonstrated remarkable +performance in code generation -- especially in prior text-to-SQL benchmarks -- +they require significant improvement in order to achieve adequate performance +for real-world enterprise usage. Progress on Spider 2.0 represents crucial +steps towards developing intelligent, autonomous, code agents for real-world +enterprise settings. Our code, baseline models, and data are available at +https://spider2-sql.github.io. + +
+
+
+
+
+ + ☆ Mitigating Bias in Queer Representation within Large Language Models: A + Collaborative Agent Approach NeurIPS 2024 + + +
+ Large Language Models (LLMs) often perpetuate biases in pronoun usage, +leading to misrepresentation or exclusion of queer individuals. This paper +addresses the specific problem of biased pronoun usage in LLM outputs, +particularly the inappropriate use of traditionally gendered pronouns ("he," +"she") when inclusive language is needed to accurately represent all +identities. We introduce a collaborative agent pipeline designed to mitigate +these biases by analyzing and optimizing pronoun usage for inclusivity. Our +multi-agent framework includes specialized agents for both bias detection and +correction. Experimental evaluations using the Tango dataset-a benchmark +focused on gender pronoun usage-demonstrate that our approach significantly +improves inclusive pronoun classification, achieving a 32.6 percentage point +increase over GPT-4o in correctly disagreeing with inappropriate traditionally +gendered pronouns $(\chi^2 = 38.57, p < 0.0001)$. These results accentuate the +potential of agent-driven frameworks in enhancing fairness and inclusivity in +AI-generated content, demonstrating their efficacy in reducing biases and +promoting socially responsible AI. + +
+
+ comment: NeurIPS 2024 Queer in AI Workshop +
+
+
+
+
+ + ☆ Annotating Constructions with UD: the experience of the Italian + Constructicon + + +
+ The paper describes a first attempt at linking the Italian constructicon to +UD resources + +
+
+
+
+
+ + ☆ Direct Preference Optimization Using Sparse Feature-Level Constraints + + +
+ The alignment of large language models (LLMs) with human preferences remains +a key challenge. While post-training techniques like Reinforcement Learning +from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have +achieved notable success, they often introduce computational inefficiencies and +training instability. In this paper, we propose Feature-level constrained +Preference Optimization (FPO), a novel method designed to simplify the +alignment process while ensuring stability. FPO leverages pre-trained Sparse +Autoencoders (SAEs) and introduces feature-level constraints, allowing for +efficient, sparsity-enforced alignment. Our approach enjoys efficiency by using +sparse features activated in a well-trained sparse autoencoder and the quality +of sequential KL divergence by using the feature-level offline reference. +Experimental results on benchmark datasets demonstrate that FPO achieves a +5.08% absolute improvement in win rate with much lower computational cost +compared to state-of-the-art baselines, making it a promising solution for +efficient and controllable LLM alignments. + +
+
+
+
+
+ + ☆ Multimodal Clinical Reasoning through Knowledge-augmented Rationale + Generation + + +
+ Clinical rationales play a pivotal role in accurate disease diagnosis; +however, many models predominantly use discriminative methods and overlook the +importance of generating supportive rationales. Rationale distillation is a +process that transfers knowledge from large language models (LLMs) to smaller +language models (SLMs), thereby enhancing the latter's ability to break down +complex tasks. Despite its benefits, rationale distillation alone is inadequate +for addressing domain knowledge limitations in tasks requiring specialized +expertise, such as disease diagnosis. Effectively embedding domain knowledge in +SLMs poses a significant challenge. While current LLMs are primarily geared +toward processing textual data, multimodal LLMs that incorporate time series +data, especially electronic health records (EHRs), are still evolving. To +tackle these limitations, we introduce ClinRaGen, an SLM optimized for +multimodal rationale generation in disease diagnosis. ClinRaGen incorporates a +unique knowledge-augmented attention mechanism to merge domain knowledge with +time series EHR data, utilizing a stepwise rationale distillation strategy to +produce both textual and time series-based clinical rationales. Our evaluations +show that ClinRaGen markedly improves the SLM's capability to interpret +multimodal EHR data and generate accurate clinical rationales, supporting more +reliable disease diagnosis, advancing LLM applications in healthcare, and +narrowing the performance divide between LLMs and SLMs. + +
+
+ comment: 11 pages. 4 figures +
+
+
+
+
+ + ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the expressive power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds for Transformer-like architectures. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +tighter circuit complexity bound for Transformers with $\mathsf{RoPE}$ +attention. Our key contribution is that we show that unless $\mathsf{TC}^0 = +\mathsf{NC}^1$, a $\mathsf{RoPE}$-based Transformer with +$\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \leq O(n)$ +cannot solve the arithmetic problem or the Boolean formula value problem. This +result significantly demonstrates the fundamental limitation of the +expressivity of the $\mathsf{RoPE}$-based Transformer architecture, although it +achieves giant empirical success. Our theoretical framework not only +establishes tighter complexity bounds but also may instruct further work on the +$\mathsf{RoPE}$-based Transformer. + +
+
+
+
+
+ + ☆ Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring + Conversations EMNLP 2024 + + +
+ Many open-ended conversations (e.g., tutoring lessons or business meetings) +revolve around pre-defined reference materials, like worksheets or meeting +bullets. To provide a framework for studying such conversation structure, we +introduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly +breaking down conversations into segments and linking each segment to the +relevant reference item. As a case study, we apply POSR to education where +effectively structuring lessons around problems is critical yet difficult. We +present LessonLink, the first dataset of real-world tutoring lessons, featuring +3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT +math problems. We define and evaluate several joint and independent approaches +for POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT), +and large language models (LLMs) methods. Our results highlight that modeling +POSR as one joint task is essential: POSR methods outperform independent +segmentation and retrieval pipelines by up to +76% on joint metrics and surpass +traditional segmentation methods by up to +78% on segmentation metrics. We +demonstrate POSR's practical impact on downstream education applications, +deriving new insights on the language and time use in real-world lesson +structures. + +
+
+ comment: EMNLP 2024 Findings. Our code and dataset are open-sourced at + https://github.com/rosewang2008/posr +
+
+
+
+
+ + ☆ Entropy Controllable Direct Preference Optimization + + +
+ In the post-training of large language models (LLMs), Reinforcement Learning +from Human Feedback (RLHF) is an effective approach to achieve generation +aligned with human preferences. Direct Preference Optimization (DPO) allows for +policy training with a simple binary cross-entropy loss without a reward model. +The objective of DPO is regularized by reverse KL divergence that encourages +mode-seeking fitting to the reference policy. Nonetheless, we indicate that +minimizing reverse KL divergence could fail to capture a mode of the reference +distribution, which may hurt the policy's performance. Based on this +observation, we propose a simple modification to DPO, H-DPO, which allows for +control over the entropy of the resulting policy, enhancing the distribution's +sharpness and thereby enabling mode-seeking fitting more effectively. In our +experiments, we show that H-DPO outperformed DPO across various tasks, +demonstrating superior results in pass@$k$ evaluations for mathematical tasks. +Moreover, H-DPO is simple to implement, requiring only minor modifications to +the loss calculation of DPO, which makes it highly practical and promising for +wide-ranging applications in the training of LLMs. + +
+
+
+
+
+ + ☆ Contrastive Language Prompting to Ease False Positives in Medical + Anomaly Detection + + +
+ A pre-trained visual-language model, contrastive language-image pre-training +(CLIP), successfully accomplishes various downstream tasks with text prompts, +such as finding images or localizing regions within the image. Despite CLIP's +strong multi-modal data capabilities, it remains limited in specialized +environments, such as medical applications. For this purpose, many CLIP +variants -- i.e., BioMedCLIP and MedCLIP-SAMv2 -- have emerged, but false positives +related to normal regions persist. Thus, we aim to present a simple yet +important goal of reducing false positives in medical anomaly detection. We +introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both +positive and negative text prompts. This straightforward approach identifies +potential lesion regions by visual attention to the positive prompts in the +given image. To reduce false positives, we attenuate attention on normal +regions using negative prompts. Extensive experiments with the BMAD dataset, +including six biomedical benchmarks, demonstrate that the CLAP method enhances +anomaly detection performance. Our future plans include developing an automated +fine prompting method for more practical usage. + +
+
+ comment: 4 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Large Language Models as Neurolinguistic Subjects: Identifying Internal + Representations for Form and Meaning + + +
+ This study investigates the linguistic understanding of Large Language Models +(LLMs) regarding signifier (form) and signified (meaning) by distinguishing two +LLM evaluation paradigms: psycholinguistic and neurolinguistic. Traditional +psycholinguistic evaluations often reflect statistical biases that may +misrepresent LLMs' true linguistic capabilities. We introduce a neurolinguistic +approach, utilizing a novel method that combines minimal pair and diagnostic +probing to analyze activation patterns across model layers. This method allows +for a detailed examination of how LLMs represent form and meaning, and whether +these representations are consistent across languages. Our contributions are +three-fold: (1) We compare neurolinguistic and psycholinguistic methods, +revealing distinct patterns in LLM assessment; (2) We demonstrate that LLMs +exhibit higher competence in form compared to meaning, with the latter largely +correlated to the former; (3) We present new conceptual minimal pair datasets +for Chinese (COMPS-ZH) and German (COMPS-DE), complementing existing English +datasets. + +
+
+
+
+
+ + ☆ SecEncoder: Logs are All You Need in Security + + +
+ Large and Small Language Models (LMs) are typically pretrained using +extensive volumes of text, which are sourced from publicly accessible platforms +such as Wikipedia, Book Corpus, or through web scraping. These models, due to +their exposure to a wide range of language data, exhibit impressive +generalization capabilities and can perform a multitude of tasks +simultaneously. However, they often fall short when it comes to domain-specific +tasks due to their broad training data. This paper introduces SecEncoder, a +specialized small language model that is pretrained using security logs. +SecEncoder is designed to address the domain-specific limitations of general +LMs by focusing on the unique language and patterns found in security logs. +Experimental results indicate that SecEncoder outperforms other LMs, such as +BERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002) +models, which are pretrained mainly on natural language, across various tasks. +Furthermore, although SecEncoder is primarily pretrained on log data, it +outperforms models pretrained on natural language for a range of tasks beyond +log analysis, such as incident prioritization and threat intelligence document +retrieval. This suggests that domain specific pretraining with logs can +significantly enhance the performance of LMs in security. These findings pave +the way for future research into security-specific LMs and their potential +applications. + +
+
+
+
+
+ + ☆ Prompt-enhanced Network for Hateful Meme Classification + + +
+ The dynamic expansion of social media has led to an inundation of hateful +memes on media platforms, accentuating the growing need for efficient +identification and removal. Acknowledging the constraints of conventional +multimodal hateful meme classification, which heavily depends on external +knowledge and poses the risk of including irrelevant or redundant content, we +developed Pen -- a prompt-enhanced network framework based on the prompt +learning approach. Specifically, after constructing the sequence through the +prompt method and encoding it with a language model, we performed region +information global extraction on the encoded sequence for multi-view +perception. By capturing global information about inference instances and +demonstrations, Pen facilitates category selection by fully leveraging sequence +information. This approach significantly improves model classification +accuracy. Additionally, to bolster the model's reasoning capabilities in the +feature space, we introduced prompt-aware contrastive learning into the +framework to improve the quality of sample feature distributions. Through +extensive ablation experiments on two public datasets, we evaluate the +effectiveness of the Pen framework, concurrently comparing it with +state-of-the-art model baselines. Our research findings highlight that Pen +surpasses manual prompt methods, showcasing superior generalization and +classification accuracy in hateful meme classification tasks. Our code is +available at https://github.com/juszzi/Pen. + +
+
+ comment: Published in Proceedings of the Thirty-Third International Joint + Conference on Artificial Intelligence Main Track. Pages 6397-6405 +
+
+
+
+
+ + ☆ Fair Summarization: Bridging Quality and Diversity in Extractive + Summaries NeurIPS 2024 + + +
+ Fairness in multi-document summarization of user-generated content remains a +critical challenge in natural language processing (NLP). Existing summarization +methods often fail to ensure equitable representation across different social +groups, leading to biased outputs. In this paper, we introduce two novel +methods for fair extractive summarization: FairExtract, a clustering-based +approach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints. +We evaluate these methods using Divsumm summarization dataset of White-aligned, +Hispanic, and African-American dialect tweets and compare them against relevant +baselines. The results obtained using a comprehensive set of summarization +quality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well +as a fairness metric F, demonstrate that FairExtract and FairGPT achieve +superior fairness while maintaining competitive summarization quality. +Additionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that +integrate quality and fairness into a single evaluation framework, offering a +more nuanced understanding of the trade-offs between these objectives. This +work highlights the importance of fairness in summarization and sets a +benchmark for future research in fairness-aware NLP models. + +
+
+ comment: Accepted at Algorithmic Fairness through the Lens of Metrics and + Evaluation Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ☆ SparrowVQE: Visual Question Explanation for Course Content Understanding + + +
+ Visual Question Answering (VQA) research seeks to create AI systems to answer +natural language questions in images, yet VQA methods often yield overly +simplistic and short answers. This paper aims to advance the field by +introducing Visual Question Explanation (VQE), which enhances the ability of +VQA to provide detailed explanations rather than brief responses and address +the need for more complex interaction with visual content. We first created an +MLVQE dataset from a 14-week streamed video machine learning course, including +885 slide images, 110,407 words of transcripts, and 9,416 designed +question-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3 +billion parameters multimodal model. We trained our model with a three-stage +training mechanism consisting of multimodal pre-training (slide images and +transcripts feature alignment), instruction tuning (tuning the pre-trained +model with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide +image and QA pairs). Eventually, our SparrowVQE can understand and connect +visual information using the SigLIP model with transcripts using the Phi-2 +language model with an MLP adapter. Experimental results demonstrate that our +SparrowVQE achieves better performance in our developed MLVQE dataset and +outperforms state-of-the-art methods in the other five benchmark VQA datasets. +The source code is available at +https://github.com/YoushanZhang/SparrowVQE. + +
+
+
+
+
+ + ☆ Rapid Response: Mitigating LLM Jailbreaks with a Few Examples + + +
+ As large language models (LLMs) grow more powerful, ensuring their safety +against misuse becomes crucial. While researchers have focused on developing +robust defenses, no method has yet achieved complete invulnerability to +attacks. We propose an alternative approach: instead of seeking perfect +adversarial robustness, we develop rapid response techniques to look to block +whole classes of jailbreaks after observing only a handful of attacks. To study +this setting, we develop RapidResponseBench, a benchmark that measures a +defense's robustness against various jailbreak strategies after adapting to a +few observed examples. We evaluate five rapid response methods, all of which +use jailbreak proliferation, where we automatically generate additional +jailbreaks similar to the examples observed. Our strongest method, which +fine-tunes an input classifier to block proliferated jailbreaks, reduces attack +success rate by a factor greater than 240 on an in-distribution set of +jailbreaks and a factor greater than 15 on an out-of-distribution set, having +observed just one example of each jailbreaking strategy. Moreover, further +studies suggest that the quality of proliferation model and number of +proliferated examples play a key role in the effectiveness of this defense. +Overall, our results highlight the potential of responding rapidly to novel +jailbreaks to limit LLM misuse. + +
+
+
+
+
+ + ☆ Controlled Evaluation of Syntactic Knowledge in Multilingual Language + Models + + +
+ Language models (LMs) are capable of acquiring elements of human-like +syntactic knowledge. Targeted syntactic evaluation tests have been employed to +measure how well they form generalizations about syntactic phenomena in +high-resource languages such as English. However, we still lack a thorough +understanding of LMs' capacity for syntactic generalizations in low-resource +languages, which are responsible for much of the diversity of syntactic +patterns worldwide. In this study, we develop targeted syntactic evaluation +tests for three low-resource languages (Basque, Hindi, and Swahili) and use +them to evaluate five families of open-access multilingual Transformer LMs. We +find that some syntactic tasks prove relatively easy for LMs while others +(agreement in sentences containing indirect objects in Basque, agreement across +a prepositional phrase in Swahili) are challenging. We additionally uncover +issues with publicly available Transformers, including a bias toward the +habitual aspect in Hindi in multilingual BERT and underperformance compared to +similar-sized models in XGLM-4.5B. + +
+
+
+
+
+ + ☆ IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark + + +
+ Recent evaluations of LLMs on coreference resolution have revealed that +traditional output formats and evaluation metrics do not fully capture the +models' referential understanding. To address this, we introduce IdentifyMe, a +new benchmark for mention resolution presented in a multiple-choice question +(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long +narratives and employs heuristics to exclude easily identifiable mentions, +creating a more challenging task. The benchmark also consists of a curated +mixture of different mention types and corresponding entities, allowing for a +fine-grained analysis of model performance. We evaluate both closed- and open +source LLMs on IdentifyMe and observe a significant performance gap (20-30%) +between the state-of-the-art sub-10B open models vs. closed ones. We observe +that pronominal mentions, which have limited surface information, are typically +much harder for models to resolve than nominal mentions. Additionally, we find +that LLMs often confuse entities when their mentions overlap in nested +structures. The highest-scoring model, GPT-4o, achieves 81.9% accuracy, +highlighting the strong referential capabilities of state-of-the-art LLMs while +also indicating room for further improvement. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating + Machine Learning Tasks + + +
+ Large Language Models (LLMs) excel in diverse applications including +generation of code snippets, but often struggle with generating code for +complex Machine Learning (ML) tasks. Although existing LLM single-agent based +systems give varying performance depending on the task complexity, they purely +rely on larger and expensive models such as GPT-4. Our investigation reveals +that no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama +perform far worse than GPT-4 in a single-agent setting. With the motivation of +developing a cost-efficient LLM based solution for solving ML tasks, we propose +an LLM Multi-Agent based system which leverages combination of experts using +profiling, efficient retrieval of past observations, LLM cascades, and +ask-the-expert calls. Through empirical analysis on ML engineering tasks in the +MLAgentBench benchmark, we demonstrate the effectiveness of our system, using +no-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and +expert to serve occasional ask-the-expert calls for planning. With 94.2\% +reduction in the cost (from \$0.931 per run cost averaged over all tasks for +GPT-4 single agent system to \$0.054), our system is able to yield better +average success rate of 32.95\% as compared to GPT-4 single-agent system +yielding 22.72\% success rate averaged over all the tasks of MLAgentBench. + +
+
+ comment: Presented at AIMLSystems '24 +
+
+
+
+
+ + ☆ DecoPrompt : Decoding Prompts Reduces Hallucinations when Large Language + Models Meet False Premises + + +
+ While large language models (LLMs) have demonstrated increasing power, they +have also called upon studies on their hallucinated outputs that deviate from +factually correct statements. In this paper, we focus on one important scenario +of false premises, where LLMs are distracted by misaligned claims although the +model possesses the required factual knowledge to answer original questions +accurately. Inspired by the observation that entropy of the false-premise +prompt is closely related to its likelihood to elicit hallucination generation, +we propose a new prompting algorithm, named DecoPrompt, to mitigate +hallucination. DecoPrompt leverages LLMs to "decode" the false-premise prompts +without really eliciting hallucination output from LLMs. We perform experiments +on two datasets, demonstrating that DecoPrompt can reduce hallucinations +effectively on outputs from different LLMs. Moreover, DecoPrompt exhibits +cross-model transferability, which facilitates its applications to scenarios +such as LLMs of large sizes or unavailable model logits. + +
+
+
+
+
+ + ☆ Efficient and Accurate Prompt Optimization: the Benefit of Memory in + Exemplar-Guided Reflection + + +
+ Automatic prompt engineering aims to enhance the generation quality of large +language models (LLMs). Recent works utilize feedbacks generated from erroneous +cases to guide the prompt optimization. During inference, they may further +retrieve several semantically-related exemplars and concatenate them to the +optimized prompts to improve the performance. However, those works only utilize +the feedback at the current step, ignoring historical and unselected feedbacks +which are potentially beneficial. Moreover, the selection of exemplars only +considers the general semantic relationship and may not be optimal in terms of +task performance and matching with the optimized prompt. In this work, we +propose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize +more efficient and accurate prompt optimization. Specifically, we design an +exemplar-guided reflection mechanism where the feedback generation is +additionally guided by the generated exemplars. We further build two kinds of +memory to fully utilize the historical feedback information and support more +effective exemplar retrieval. Empirical evaluations show our method surpasses +previous state-of-the-arts with fewer optimization steps, i.e., improving F1 +score by 10.1 on LIAR dataset, and reducing half of the optimization steps on +ProTeGi. + +
+
+
+
+
+ + ☆ Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial + Approach + + +
+ Deep learning underpins most of the currently advanced natural language +processing (NLP) tasks such as textual classification, neural machine +translation (NMT), abstractive summarization and question-answering (QA). +However, the robustness of the models, particularly QA models, against +adversarial attacks is a critical concern that remains insufficiently explored. +This paper introduces QA-Attack (Question Answering Attack), a novel word-level +adversarial strategy that fools QA models. Our attention-based attack exploits +the customized attention mechanism and deletion ranking strategy to identify +and target specific words within contextual passages. It creates deceptive +inputs by carefully choosing and substituting synonyms, preserving grammatical +integrity while misleading the model to produce incorrect responses. Our +approach demonstrates versatility across various question types, particularly +when dealing with extensive long textual inputs. Extensive experiments on +multiple benchmark datasets demonstrate that QA-Attack successfully deceives +baseline QA models and surpasses existing adversarial techniques regarding +success rate, semantics changes, BLEU score, fluency and grammar error rate. + +
+
+
+
+
+ + ☆ Beyond the Safety Bundle: Auditing the Helpful and Harmless Dataset + + +
+ In an effort to mitigate the harms of large language models (LLMs), learning +from human feedback (LHF) has been used to steer LLMs towards outputs that are +intended to be both less harmful and more helpful. Despite the widespread +adoption of LHF in practice, the quality of this feedback and its effectiveness +as a safety mitigation technique remain unclear. This study addresses these +issues by auditing the widely-used Helpful and Harmless (HH) dataset by +Anthropic. Our work includes: (1) a thorough investigation of the dataset's +content through both manual and automated evaluation; (2) experiments +demonstrating the dataset's impact on models' safety; and (3) an analysis of +the 100 most influential papers citing this dataset. Through our audit, we +showcase how conceptualization failures and quality issues identified in the HH +dataset can create additional harms by leading to disparate safety behaviors +across demographic groups. Our findings highlight the need for more nuanced, +context-sensitive approaches to safety mitigation in LLMs. + +
+
+ comment: Prepared for conference submission +
+
+
+
+
+ + ☆ Retrieval, Reasoning, Re-ranking: A Context-Enriched Framework for + Knowledge Graph Completion + + +
+ The Knowledge Graph Completion (KGC) task aims to infer the missing entity +from an incomplete triple. Existing embedding-based methods rely solely on +triples in the KG, which is vulnerable to specious relation patterns and +long-tail entities. On the other hand, text-based methods struggle with the +semantic gap between KG triples and natural language. Apart from triples, +entity contexts (e.g., labels, descriptions, aliases) also play a significant +role in augmenting KGs. To address these limitations, we propose KGR3, a +context-enriched framework for KGC. KGR3 is composed of three modules. Firstly, +the Retrieval module gathers supporting triples from the KG, collects plausible +candidate answers from a base embedding model, and retrieves context for each +related entity. Then, the Reasoning module employs a large language model to +generate potential answers for each query triple. Finally, the Re-ranking +module combines candidate answers from the two modules mentioned above, and +fine-tunes an LLM to provide the best answer. Extensive experiments on widely +used datasets demonstrate that KGR3 consistently improves various KGC methods. +Specifically, the best variant of KGR3 achieves absolute Hits@1 improvements of +12.3% and 5.6% on the FB15k237 and WN18RR datasets. + +
+
+
+
+
+ + ☆ Large Language Models Can Self-Improve in Long-context Reasoning + + +
+ Large language models (LLMs) have achieved substantial progress in processing +long contexts but still struggle with long-context reasoning. Existing +approaches typically involve fine-tuning LLMs with synthetic data, which +depends on annotations from human experts or advanced models like GPT-4, thus +restricting further advancements. To address this issue, we investigate the +potential for LLMs to self-improve in long-context reasoning and propose SeaLong, +an approach specifically designed for this purpose. This approach is +straightforward: we sample multiple outputs for each question, score them with +Minimum Bayes Risk, and then apply supervised fine-tuning or preference +optimization based on these outputs. Extensive experiments on several leading +LLMs demonstrate the effectiveness of SeaLong, with an absolute improvement of +$4.2$ points for Llama-3.1-8B-Instruct. Furthermore, SeaLong achieves superior +performance compared to prior approaches that depend on data produced by human +experts or advanced models. We anticipate that this work will open new avenues +for self-improvement techniques in long-context scenarios, which are essential +for the continual advancement of LLMs. + +
+
+ comment: Project Page: https://github.com/SihengLi99/SEALONG +
+
+
+
+
+ + ☆ On the Role of Speech Data in Reducing Toxicity Detection Bias + + +
+ Text toxicity detection systems exhibit significant biases, producing +disproportionate rates of false positives on samples mentioning demographic +groups. But what about toxicity detection in speech? To investigate the extent +to which text-based biases are mitigated by speech-based systems, we produce a +set of high-quality group annotations for the multilingual MuTox dataset, and +then leverage these annotations to systematically compare speech- and +text-based toxicity classifiers. Our findings indicate that access to speech +data during inference supports reduced bias against group mentions, +particularly for ambiguous and disagreement-inducing samples. Our results also +suggest that improving classifiers, rather than transcription pipelines, is +more helpful for reducing group bias. We publicly release our annotations and +provide recommendations for future toxicity dataset construction. + +
+
+
+
+
+ + ♻ ☆ Plausible Extractive Rationalization through Semi-Supervised Entailment + Signal ACL + + +
+ The increasing use of complex and opaque black box models requires the +adoption of interpretable measures, one such option is extractive rationalizing +models, which serve as a more interpretable alternative. These models, also +known as Explain-Then-Predict models, employ an explainer model to extract +rationales and subsequently condition the predictor with the extracted +information. Their primary objective is to provide precise and faithful +explanations, represented by the extracted rationales. In this paper, we take a +semi-supervised approach to optimize for the plausibility of extracted +rationales. We adopt a pre-trained natural language inference (NLI) model and +further fine-tune it on a small set of supervised rationales ($10\%$). The NLI +predictor is leveraged as a source of supervisory signals to the explainer via +entailment alignment. We show that, by enforcing the alignment agreement +between the explanation and answer in a question-answering task, the +performance can be improved without access to ground truth labels. We evaluate +our approach on the ERASER dataset and show that our approach achieves +comparable results with supervised extractive models and outperforms +unsupervised approaches by $> 100\%$. + +
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Self-training Large Language Models through Knowledge Detection EMNLP + + +
+ Large language models (LLMs) often necessitate extensive labeled datasets and +training compute to achieve impressive performance across downstream tasks. +This paper explores a self-training paradigm, where the LLM autonomously +curates its own labels and selectively trains on unknown data samples +identified through a reference-free consistency method. Empirical evaluations +demonstrate significant improvements in reducing hallucination in generation +across multiple subjects. Furthermore, the selective training framework +mitigates catastrophic forgetting in out-of-distribution benchmarks, addressing +a critical limitation in training LLMs. Our findings suggest that such an +approach can substantially reduce the dependency on large labeled datasets, +paving the way for more scalable and cost-effective language model training. + +
+
+ comment: EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Language Model Factuality via Activation-Based Confidence + Calibration and Guided Decoding EMNLP 2024 + + +
+ Calibrating language models (LMs) aligns their generation confidence with the +actual likelihood of answer correctness, which can inform users about LMs' +reliability and mitigate hallucinated content. However, prior calibration +methods, such as self-consistency-based and logit-based approaches, are either +limited in inference-time efficiency or fall short of providing informative +signals. Moreover, simply filtering out low-confidence responses reduces the +LM's helpfulness when the answers are correct. Therefore, effectively using +calibration techniques to enhance an LM's factuality remains an unsolved +challenge. In this paper, we first propose an activation-based calibration +method, ActCab, which trains a linear layer on top of the LM's last-layer +activations that can better capture the representations of knowledge. Built on +top of ActCab, we further propose CoDec, a confidence-guided decoding strategy +to elicit truthful answers with high confidence from LMs. By evaluating on five +popular QA benchmarks, ActCab achieves superior calibration performance than +all competitive baselines, e.g., by reducing the average expected calibration +error (ECE) score by up to 39%. Further experiments on CoDec show consistent +improvements in several LMs' factuality on challenging QA datasets, such as +TruthfulQA, highlighting the value of confidence signals in enhancing +factuality. + +
+
+ comment: EMNLP 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Exploiting User Comments for Early Detection of Fake News Prior to + Users' Commenting + + +
+ Both accuracy and timeliness are key factors in detecting fake news on social +media. However, most existing methods encounter an accuracy-timeliness dilemma: +Content-only methods guarantee timeliness but perform moderately because of +limited available information, while social context-based ones generally +perform better but inevitably lead to latency because of social context +accumulation needs. To break such a dilemma, a feasible but not well-studied +solution is to leverage social contexts (e.g., comments) from historical news +for training a detection model and apply it to newly emerging news without +social contexts. This requires the model to (1) sufficiently learn helpful +knowledge from social contexts, and (2) be well compatible with situations that +social contexts are available or not. To achieve this goal, we propose to +absorb and parameterize useful knowledge from comments in historical news and +then inject it into a content-only detection model. Specifically, we design the +Comments ASsisted FakE News Detection method (CAS-FEND), which transfers useful +knowledge from a comment-aware teacher model to a content-only student model +and detects newly emerging news with the student model. Experiments show that +the CAS-FEND student model outperforms all content-only methods and even +comment-aware ones with 1/4 comments as inputs, demonstrating its superiority +for early detection. + +
+
+ comment: 19 pages, 6 figures, 7 tables. The article has been accepted by + Frontiers of Computer Science (FCS), with the DOI: + {10.1007/s11704-024-40674-6} +
+
+
+
+
+ + ♻ ☆ How Do Large Language Models Acquire Factual Knowledge During + Pretraining? NeurIPS 2024 + + +
+ Despite the recent observation that large language models (LLMs) can store +substantial factual knowledge, there is a limited understanding of the +mechanisms of how they acquire factual knowledge through pretraining. This work +addresses this gap by studying how LLMs acquire factual knowledge during +pretraining. The findings reveal several important insights into the dynamics +of factual knowledge acquisition during pretraining. First, counterintuitively, +we observe that pretraining on more data shows no significant improvement in +the model's capability to acquire and maintain factual knowledge. Next, there +is a power-law relationship between training steps and forgetting of +memorization and generalization of factual knowledge, and LLMs trained with +duplicated training data exhibit faster forgetting. Third, training LLMs with +larger batch sizes can enhance the models' robustness to forgetting. Overall, +our observations suggest that factual knowledge acquisition in LLM pretraining +occurs by progressively increasing the probability of factual knowledge +presented in the pretraining data at each step. However, this increase is +diluted by subsequent forgetting. Based on this interpretation, we demonstrate +that we can provide plausible explanations for recently observed behaviors of +LLMs, such as the poor performance of LLMs on long-tail knowledge and the +benefits of deduplicating the pretraining corpus. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Unified Multi-Task Learning Architecture for Hate Detection Leveraging + User-Based Information + + +
+ Hate speech, offensive language, aggression, racism, sexism, and other +abusive language are common phenomena in social media. There is a need for +Artificial Intelligence (AI)-based intervention which can filter hate content at +scale. Most existing hate speech detection solutions have utilized the features +by treating each post as an isolated input instance for the classification. +This paper addresses this issue by introducing a unique model that improves +hate speech identification for the English language by utilising intra-user and +inter-user-based information. The experiment is conducted over single-task +learning (STL) and multi-task learning (MTL) paradigms that use deep neural +networks, such as convolutional neural networks (CNN), gated recurrent unit +(GRU), bidirectional encoder representations from the transformer (BERT), and A +Lite BERT (ALBERT). We use three benchmark datasets and conclude that combining +certain user features with textual features gives significant improvements in +macro-F1 and weighted-F1. + +
+
+ comment: 7 pages, 1 figure, and two tables. Accepted at the 20th International + Conference on Natural Language Processing (ICON) 2023. + https://aclanthology.org/2023.icon-1.53 +
+
+
+
+
+ + ♻ ☆ An Early FIRST Reproduction and Improvements to Single-Token Decoding + for Fast Listwise Reranking + + +
+ Recent advances have demonstrated that large language models (LLMs) excel as +listwise rerankers, but their high computational demands remain a barrier to +widespread adoption. Further, the traditional language modeling (LM) objective +is not ideally suited for reranking tasks. FIRST is a novel approach that +addresses these challenges by integrating a learning-to-rank objective and +leveraging the logits of only the first generated token, thereby significantly +reducing inference latency compared to traditional LLM rerankers. In this +study, we extend the evaluation of FIRST to the TREC Deep Learning datasets +(DL19-22), validating its robustness across diverse domains. We investigate the +influence of different first-stage retrievers on FIRST rerankers, observing +diminishing returns and patterns consistent with traditional LLM rerankers. +Through applying the FIRST objective to a broader range of backbone models, we +achieve effectiveness surpassing the original implementation. Our experiments +confirm that fast reranking with single-token logits does not compromise +out-of-domain reranking quality. To better quantify the computational savings +in the original study, we measure and compare latency to find a 21%-42% gain +across various models and benchmarks. Moreover, while LM training implicitly +improves zero-shot single-token reranking, our experiments also raise questions +about whether LM pre-training may hinder subsequent fine-tuning with the FIRST +objective. These findings pave the way for more efficient and effective +listwise reranking in future applications. + +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ The Dark Patterns of Personalized Persuasion in Large Language Models: + Exposing Persuasive Linguistic Features for Big Five Personality Traits in + LLMs Responses + + +
+ This study explores how the Large Language Models (LLMs) adjust linguistic +features to create personalized persuasive outputs. While research showed that +LLMs personalize outputs, a gap remains in understanding the linguistic +features of their persuasive capabilities. We identified 13 linguistic features +crucial for influencing personalities across different levels of the Big Five +model of personality. We analyzed how prompts with personality trait +information influenced the output of 19 LLMs across five model families. The +findings show that models use more anxiety-related words for neuroticism, +increase achievement-related words for conscientiousness, and employ fewer +cognitive processes words for openness to experience. Some model families excel +at adapting language for openness to experience, others for conscientiousness, +while only one model adapts language for neuroticism. Our findings show how +LLMs tailor responses based on personality cues in prompts, indicating their +potential to create persuasive content affecting the mind and well-being of the +recipients. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Efficient LLM Comparative Assessment: a Product of Experts Framework for + Pairwise Comparisons + + +
+ LLM-as-a-judge approaches are a practical and effective way of assessing a +range of text tasks. However, when using pairwise comparisons to rank a set of +candidates, the computational cost scales quadratically with the number of +candidates, which has practical limitations. This paper introduces a Product of +Expert (PoE) framework for efficient LLM Comparative Assessment. Here +individual comparisons are considered experts that provide information on a +pair's score difference. The PoE framework combines the information from these +experts to yield an expression that can be maximized with respect to the +underlying set of candidates, and is highly flexible where any form of expert +can be assumed. When Gaussian experts are used one can derive simple +closed-form solutions for the optimal candidate ranking, and expressions for +selecting which comparisons should be made to maximize the probability of this +ranking. Our approach enables efficient comparative assessment, where by using +only a small subset of the possible comparisons, one can generate score +predictions that correlate well with human judgements. We evaluate the approach +on multiple NLG tasks and demonstrate that our framework can yield considerable +computational savings when performing pairwise comparative assessment. With +many candidate texts, using as few as 2% of comparisons the PoE solution can +achieve similar performance to when all comparisons are used. + +
+
+
+
+
+ + ♻ ☆ Qwen2.5-Coder Technical Report + + +
+ In this report, we introduce the Qwen2.5-Coder series, a significant upgrade +from its predecessor, CodeQwen1.5. This series includes six models: +Qwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model, +Qwen2.5-Coder is built upon the Qwen2.5 architecture and is further pretrained +on a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning, +scalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder +demonstrates impressive code generation capabilities while retaining general +and math skills. These models have been evaluated on a wide range of +code-related tasks, achieving state-of-the-art (SOTA) performance across more +than 10 benchmarks, including code generation, completion, reasoning, and +repair, consistently outperforming larger models of the same model size. We +believe that the release of the Qwen2.5-Coder series will advance research in +code intelligence and, with its permissive licensing, support wider adoption by +developers in real-world applications. + +
+
+
+
+
+ + ♻ ☆ Kwai-STaR: Transform LLMs into State-Transition Reasoners + + +
+ Mathematical reasoning presents a significant challenge to the cognitive +capabilities of LLMs. Various methods have been proposed to enhance the +mathematical ability of LLMs. However, few recognize the value of state +transition for LLM reasoning. In this work, we define mathematical +problem-solving as a process of transiting from an initial unsolved state to +the final resolved state, and propose Kwai-STaR framework, which transforms +LLMs into State-Transition Reasoners to improve their intuitive reasoning +capabilities. Our approach comprises three main steps: (1) Define the state +space tailored to the mathematical reasoning. (2) Generate state-transition +data based on the state space. (3) Convert original LLMs into State-Transition +Reasoners via a curricular training strategy. Our experiments validate the +effectiveness of Kwai-STaR in enhancing mathematical reasoning: After training +on the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and +LLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard +dataset. Additionally, the state transition-based design endows Kwai-STaR with +remarkable training and inference efficiency. Further experiments are underway +to establish the generality of Kwai-STaR. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ LLMs for Generating and Evaluating Counterfactuals: A Comprehensive + Study EMNLP + + +
+ As NLP models become more complex, understanding their decisions becomes more +crucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's +prediction, offer a way to explain these models. While Large Language Models +(LLMs) have shown remarkable performance in NLP tasks, their efficacy in +generating high-quality CFs remains uncertain. This work fills this gap by +investigating how well LLMs generate CFs for two NLU tasks. We conduct a +comprehensive comparison of several common LLMs, and evaluate their CFs, +assessing both intrinsic metrics, and the impact of these CFs on data +augmentation. Moreover, we analyze differences between human and LLM-generated +CFs, providing insights for future research directions. Our results show that +LLMs generate fluent CFs, but struggle to keep the induced changes minimal. +Generating CFs for Sentiment Analysis (SA) is less challenging than NLI where +LLMs show weaknesses in generating CFs that flip the original label. This also +reflects on the data augmentation performance, where we observe a large gap +between augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs' +ability to assess CFs in a mislabelled data setting, and show that they have a +strong bias towards agreeing with the provided labels. GPT4 is more robust +against this bias and its scores correlate well with automatic metrics. Our +findings reveal several limitations and point to potential future work +directions. + +
+
+ comment: Accepted to EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ RLHF Workflow: From Reward Modeling to Online RLHF + + +
+ We present the workflow of Online Iterative Reinforcement Learning from Human +Feedback (RLHF) in this technical report, which is widely reported to +outperform its offline counterpart by a large margin in the recent large +language model (LLM) literature. However, existing open-source RLHF projects +are still largely confined to the offline learning setting. In this technical +report, we aim to fill in this gap and provide a detailed recipe that is easy +to reproduce for online iterative RLHF. In particular, since online human +feedback is usually infeasible for open-source communities with limited +resources, we start by constructing preference models using a diverse set of +open-source datasets and use the constructed proxy preference model to +approximate human feedback. Then, we discuss the theoretical insights and +algorithmic principles behind online iterative RLHF, followed by a detailed +practical implementation. Our trained LLM achieves impressive performance on +LLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as +well as other academic benchmarks such as HumanEval and TruthfulQA. We have +shown that supervised fine-tuning (SFT) and iterative RLHF can obtain +state-of-the-art performance with fully open-source datasets. Further, we have +made our models, curated datasets, and comprehensive step-by-step code +guidebooks publicly available. Please refer to +https://github.com/RLHFlow/RLHF-Reward-Modeling and +https://github.com/RLHFlow/Online-RLHF for more detailed information. + +
+
+ comment: Published in Transactions on Machine Learning Research (09/2024) +
+
+
+
+
+ + ♻ ☆ LeKUBE: A Legal Knowledge Update BEnchmark + + +
+ Recent advances in Large Language Models (LLMs) have significantly shaped the +applications of AI in multiple fields, including the studies of legal +intelligence. Trained on extensive legal texts, including statutes and legal +documents, the legal LLMs can capture important legal knowledge/concepts +effectively and provide important support for downstream legal applications +such as legal consultancy. Yet, the dynamic nature of legal statutes and +interpretations also poses new challenges to the use of LLMs in legal +applications. Particularly, how to update the legal knowledge of LLMs +effectively and efficiently has become an important research problem in +practice. Existing benchmarks for evaluating knowledge update methods are +mostly designed for the open domain and cannot address the specific challenges +of the legal domain, such as the nuanced application of new legal knowledge, +the complexity and lengthiness of legal regulations, and the intricate nature +of legal reasoning. To address this gap, we introduce the Legal Knowledge +Update BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for +legal LLMs across five dimensions. Specifically, we categorize the needs of +knowledge updates in the legal domain with the help of legal professionals, and +then hire annotators from law schools to create synthetic updates to the +Chinese Criminal and Civil Code as well as sets of questions of which the +answers would change after the updates. Through a comprehensive evaluation of +state-of-the-art knowledge update methods, we reveal a notable gap between +existing knowledge update methods and the unique needs of the legal domain, +emphasizing the need for further research and development of knowledge update +mechanisms tailored for legal LLMs. + +
+
+
+
+
+ + ♻ ☆ Exploring Advanced Large Language Models with LLMsuite + + +
+ This tutorial explores the advancements and challenges in the development of +Large Language Models (LLMs) such as ChatGPT and Gemini. It addresses inherent +limitations like temporal knowledge cutoffs, mathematical inaccuracies, and the +generation of incorrect information, proposing solutions like Retrieval +Augmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks +such as ReAct and LangChain. The integration of these techniques enhances LLM +performance and reliability, especially in multi-step reasoning and complex +task execution. The paper also covers fine-tuning strategies, including +instruction fine-tuning, parameter-efficient methods like LoRA, and +Reinforcement Learning from Human Feedback (RLHF) as well as Reinforced +Self-Training (ReST). Additionally, it provides a comprehensive survey of +transformer architectures and training techniques for LLMs. The source code can +be accessed by contacting the author via email for a request. + +
+
+ comment: Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison, + LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset + Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing + Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite, + Comprehensive LLM Evaluation Toolkit +
+
+
+
+
+ + ♻ ☆ OmAgent: A Multi-modal Agent Framework for Complex Video Understanding + with Task Divide-and-Conquer + + +
+ Recent advancements in Large Language Models (LLMs) have expanded their +capabilities to multimodal contexts, including comprehensive video +understanding. However, processing extensive videos such as 24-hour CCTV +footage or full-length films presents significant challenges due to the vast +data and processing demands. Traditional methods, like extracting key frames or +converting frames to text, often result in substantial information loss. To +address these shortcomings, we develop OmAgent, which efficiently stores and +retrieves relevant video frames for specific queries, preserving the detailed +content of videos. Additionally, it features a Divide-and-Conquer Loop capable +of autonomous reasoning, dynamically invoking APIs and tools to enhance query +processing and accuracy. This approach ensures robust video understanding, +significantly reducing information loss. Experimental results affirm OmAgent's +efficacy in handling various types of videos and complex tasks. Moreover, we +have endowed it with greater autonomy and a robust tool-calling system, +enabling it to accomplish even more intricate tasks. + +
+
+
+
+
+ + ♻ ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science NeurIPS + 2024 + + +
+ Recently, there has been a significant upsurge of interest in leveraging +large language models (LLMs) to assist scientific discovery. However, most LLMs +only focus on general science, while they lack domain-specific knowledge, such +as chemical molecules and amino acid sequences. To bridge these gaps, we +introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and +is able to conduct college-level scientific reasoning and understand molecules +and amino acid sequences. We collect a large-scale training corpus containing +numerous scientific papers and books from different disciplines as well as data +from domain-specific databases. We further fine-tune the pre-trained model on +lots of instruction data to improve performances on downstream benchmarks. From +experiment results, we show that SciDFM achieves strong performance on general +scientific benchmarks such as SciEval and SciQ, and it reaches a SOTA +performance on domain-specific benchmarks among models of similar size. We +further analyze the expert layers and show that the results of expert selection +vary with data from different disciplines. To benefit the broader research +community, we open-source SciDFM at +https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0. + +
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS + 2024 Workshop FM4Science +
+
+
+
+
+ + ♻ ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? EMNLP 2024 + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Deception Detection from Linguistic and Physiological Data Streams Using + Bimodal Convolutional Neural Networks + + +
+ Deception detection is gaining increasing interest due to ethical and +security concerns. This paper explores the application of convolutional neural +networks for the purpose of multimodal deception detection. We use a dataset +built by interviewing 104 subjects about two topics, with one truthful and one +falsified response from each subject about each topic. In particular, we make +three main contributions. First, we extract linguistic and physiological +features from this data to train and construct the neural network models. +Second, we propose a fused convolutional neural network model using both +modalities in order to achieve an improved overall performance. Third, we +compare our new approach with earlier methods designed for multimodal deception +detection. We find that our system outperforms regular classification methods; +our results indicate the feasibility of using neural networks for deception +detection even in the presence of limited amounts of data. + +
+
+ comment: Accepted by 2024 5th International Conference on Information Science, + Parallel and Distributed Systems +
+
+
+
+
+ + ♻ ☆ SKVQ: Sliding-window Key and Value Cache Quantization for Large Language + Models + + +
+ Large language models (LLMs) can now handle longer sequences of tokens, +enabling complex tasks like book understanding and generating lengthy novels. +However, the key-value (KV) cache required for LLMs consumes substantial memory +as context length increases, becoming the bottleneck for deployment. In this +paper, we present a strategy called SKVQ, which stands for sliding-window KV +cache quantization, to address the issue of extremely low bitwidth KV cache +quantization. To achieve this, SKVQ rearranges the channels of the KV cache in +order to improve the similarity of channels in quantization groups, and applies +clipped dynamic quantization at the group level. Additionally, SKVQ ensures +that the most recent window tokens in the KV cache are preserved with high +precision. This helps maintain the accuracy of a small but important portion of +the KV cache. SKVQ achieves high compression ratios while maintaining accuracy. +Our evaluation on LLMs demonstrates that SKVQ surpasses previous quantization +approaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit +values with minimal loss of accuracy. With SKVQ, it is possible to process +context lengths of up to 1M on an 80GB memory GPU for a 7b model and up to 7 +times faster decoding. + +
+
+
+
+
+ + ♻ ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ MASIVE: Open-Ended Affective State Identification in English and Spanish EMNLP 2024 + + +
+ In the field of emotion analysis, much NLP research focuses on identifying a +limited number of discrete emotion categories, often applied across languages. +These basic sets, however, are rarely designed with textual data in mind, and +culture, language, and dialect can influence how particular emotions are +interpreted. In this work, we broaden our scope to a practically unbounded set +of \textit{affective states}, which includes any terms that humans use to +describe their experiences of feeling. We collect and publish MASIVE, a dataset +of Reddit posts in English and Spanish containing over 1,000 unique affective +states each. We then define the new problem of \textit{affective state +identification} for language generation models framed as a masked span +prediction task. On this task, we find that smaller finetuned multilingual +models outperform much larger LLMs, even on region-specific Spanish affective +states. Additionally, we show that pretraining on MASIVE improves model +performance on existing emotion benchmarks. Finally, through machine +translation experiments, we find that native speaker-written data is vital to +good performance on this task. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +comparable results with GAN due to better ability on complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ LAMP: A Language Model on the Map + + +
+ Large Language Models (LLMs) are poised to play an increasingly important +role in our lives, providing assistance across a wide array of tasks. In the +geospatial domain, LLMs have demonstrated the ability to answer generic +questions, such as identifying a country's capital; nonetheless, their utility +is hindered when it comes to answering fine-grained questions about specific +places, such as grocery stores or restaurants, which constitute essential +aspects of people's everyday lives. This is mainly because the places in our +cities haven't been systematically fed into LLMs, so as to understand and +memorize them. This study introduces a novel framework for fine-tuning a +pre-trained model on city-specific data, to enable it to provide accurate +recommendations, while minimizing hallucinations. We share our model, LAMP, and +the data used to train it. We conduct experiments to analyze its ability to +correctly retrieve spatial objects, and compare it to well-known open- and +closed-source language models, such as GPT-4. Finally, we explore its emerging +capabilities through a case study on day planning. + +
+
+
+
+
+ + ♻ ☆ Game-theoretic LLM: Agent Workflow for Negotiation Games + + +
+ This paper investigates the rationality of large language models (LLMs) in +strategic decision-making contexts, specifically within the framework of game +theory. We evaluate several state-of-the-art LLMs across a spectrum of +complete-information and incomplete-information games. Our findings reveal that +LLMs frequently deviate from rational strategies, particularly as the +complexity of the game increases with larger payoff matrices or deeper +sequential trees. + To address these limitations, we design multiple game-theoretic workflows +that guide the reasoning and decision-making processes of LLMs. These workflows +aim to enhance the models' ability to compute Nash Equilibria and make rational +choices, even under conditions of uncertainty and incomplete information. +Experimental results demonstrate that the adoption of these workflows +significantly improves the rationality and robustness of LLMs in game-theoretic +tasks. Specifically, with the workflow, LLMs exhibit marked improvements in +identifying optimal strategies, achieving near-optimal allocations in +negotiation scenarios, and reducing susceptibility to exploitation during +negotiations. Furthermore, we explore the meta-strategic considerations of +whether it is rational for agents to adopt such workflows, recognizing that the +decision to use or forgo the workflow constitutes a game-theoretic issue in +itself. + Our research contributes to a deeper understanding of LLMs' decision-making +capabilities in strategic contexts and provides insights into enhancing their +rationality through structured workflows. The findings have implications for +the development of more robust and strategically sound AI agents capable of +navigating complex interactive environments. Code and data supporting this +study are available at \url{https://github.com/Wenyueh/game_theory}. + +
+
+ comment: 45 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Adaptive Optimization for Effective Sentiment Analysis + Fine-Tuning on Large Language Models + + +
+ Sentiment analysis plays a crucial role in various domains, such as business +intelligence and financial forecasting. Large language models (LLMs) have +become a popular paradigm for sentiment analysis, leveraging multi-task +learning to address specific tasks concurrently. However, LLMs with fine-tuning +for sentiment analysis often underperform due to the inherent challenges in +managing diverse task complexities. Moreover, constant-weight approaches in +multi-task learning struggle to adapt to variations in data characteristics, +further complicating model effectiveness. To address these issues, we propose a +novel multi-task learning framework with a dynamic adaptive optimization (DAO) +module. This module is designed as a plug-and-play component that can be +seamlessly integrated into existing models, providing an effective and flexible +solution for multi-task learning. The key component of the DAO module is +dynamic adaptive loss, which dynamically adjusts the weights assigned to +different tasks based on their relative importance and data characteristics +during training. Sentiment analyses on a standard and customized financial text +dataset demonstrate that the proposed framework achieves superior performance. +Specifically, this work improves the Mean Squared Error (MSE) and Accuracy +(ACC) by 15.58% and 1.24% respectively, compared with previous work. + +
+
+
+
+
+ + ♻ ☆ Reminding Multimodal Large Language Models of Object-aware Knowledge + with Retrieved Tags EMNLP 2024 + + +
+ Despite recent advances in the general visual instruction-following ability +of Multimodal Large Language Models (MLLMs), they still struggle with critical +problems when required to provide a precise and detailed response to a visual +instruction: (1) failure to identify novel objects or entities, (2) mention of +non-existent objects, and (3) neglect of object's attributed details. Intuitive +solutions include improving the size and quality of data or using larger +foundation models. They show effectiveness in mitigating these issues, but at +an expensive cost of collecting a vast amount of new data and introducing a +significantly larger model. Standing at the intersection of these approaches, +we examine the three object-oriented problems from the perspective of the +image-to-text mapping process by the multimodal connector. In this paper, we +first identify the limitations of multimodal connectors stemming from +insufficient training data. Driven by this, we propose to enhance the mapping +with retrieval-augmented tag tokens, which contain rich object-aware +information such as object names and attributes. With our Tag-grounded visual +instruction tuning with retrieval Augmentation (TUNA), we outperform baselines +that share the same language model and training data on 12 benchmarks. +Furthermore, we show the zero-shot capability of TUNA when provided with +specific datastores. + +
+
+ comment: Main Conference at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ SLANG: New Concept Comprehension of Large Language Models EMNLP 2024 + + +
+ The dynamic nature of language, particularly evident in the realm of slang +and memes on the Internet, poses serious challenges to the adaptability of +large language models (LLMs). Traditionally anchored to static datasets, these +models often struggle to keep up with the rapid linguistic evolution +characteristic of online communities. This research aims to bridge this gap by +enhancing LLMs' comprehension of the evolving new concepts on the Internet, +without the high cost of continual retraining. In pursuit of this goal, we +introduce $\textbf{SLANG}$, a benchmark designed to autonomously integrate +novel data and assess LLMs' ability to comprehend emerging concepts, alongside +$\textbf{FOCUS}$, an approach that uses causal inference to help LLMs +understand new phrases and their colloquial context. Our benchmark and approach +involve understanding real-world instances of linguistic shifts, serving as +contextual beacons, to form more precise and contextually relevant connections +between newly emerging expressions and their meanings. The empirical analysis +shows that our causal inference-based approach outperforms the baseline methods +in terms of precision and relevance in the comprehension of Internet slang and +memes. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ MMLongBench-Doc: Benchmarking Long-context Document Understanding with + Visualizations NeurIPS 2024 + + +
+ Understanding documents with rich layouts and multi-modal components is a +long-standing and practical task. Recent Large Vision-Language Models (LVLMs) +have made remarkable strides in various tasks, particularly in single-page +document understanding (DU). However, their abilities on long-context DU remain +an open problem. This work presents MMLongBench-Doc, a long-context, +multi-modal benchmark comprising 1,062 expert-annotated questions. Distinct +from previous datasets, it is constructed upon 130 lengthy PDF-formatted +documents with an average of 49.4 pages and 20,971 textual tokens. Towards +comprehensive evaluation, answers to these questions rely on pieces of evidence +from (1) different sources (text, image, chart, table, and layout structure) +and (2) various locations (i.e. page number). Moreover, 33.2% of the questions +are cross-page questions requiring evidence across multiple pages. 22.8% of the +questions are designed to be unanswerable for detecting potential +hallucinations. Experiments on 14 LVLMs demonstrate that long-context DU +greatly challenges current models. Notably, the best-performing model, GPT-4o, +achieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores +31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse +performance than their LLM counterparts which are fed with lossy-parsed OCR +documents. These results validate the necessity of future research toward more +capable long-context LVLMs. Project Page: +https://mayubo2333.github.io/MMLongBench-Doc + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight) +
+
+
+
+
+ + ♻ ☆ Self-Data Distillation for Recovering Quality in Pruned Large Language + Models + + +
+ Large language models have driven significant progress in natural language +processing, but their deployment requires substantial compute and memory +resources. As models scale, compression techniques become essential for +balancing model quality with computational efficiency. Structured pruning, +which removes less critical components of the model, is a promising strategy +for reducing complexity. However, one-shot pruning often results in significant +quality degradation, particularly in tasks requiring multi-step reasoning. To +recover lost quality, supervised fine-tuning (SFT) is commonly applied, but it +can lead to catastrophic forgetting by shifting the model's learned data +distribution. Therefore, addressing the degradation from both pruning and SFT +is essential to preserve the original model's quality. In this work, we utilize +self-data distilled fine-tuning to address these challenges. Our approach +leverages the original, unpruned model to generate a distilled dataset that +preserves semantic richness and mitigates catastrophic forgetting by +maintaining alignment with the base model's knowledge. Empirically, we +demonstrate that self-data distillation consistently outperforms standard SFT, +improving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard +v1. Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct +(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B +parameters), our method retains 91.2% of the original model's accuracy compared +to 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore, +combining self-data distilled models through model merging yields enhanced +quality retention. Additionally, leveraging these pruned models in speculative +decoding increases token acceptance rates, thereby improving inference +efficiency in applied settings. + +
+
+ comment: 13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary + Material) +
+
+
+
+
+ + ♻ ☆ On Active Privacy Auditing in Supervised Fine-tuning for White-Box + Language Models + + +
+ The pretraining and fine-tuning approach has become the leading technique for +various NLP applications. However, recent studies reveal that fine-tuning data, +due to their sensitive nature, domain-specific characteristics, and +identifiability, pose significant privacy concerns. To help develop more +privacy-resilient fine-tuning models, we introduce a novel active privacy +auditing framework, dubbed Parsing, designed to identify and quantify privacy +leakage risks during the supervised fine-tuning (SFT) of language models (LMs). +The framework leverages improved white-box membership inference attacks (MIAs) +as the core technology, utilizing novel learning objectives and a two-stage +pipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the +exposure of privacy risks. Additionally, we have improved the effectiveness of +MIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our +research aims to provide the SFT community of LMs with a reliable, ready-to-use +privacy auditing tool, and to offer valuable insights into safeguarding privacy +during the fine-tuning process. Experimental results confirm the framework's +efficiency across various models and tasks, emphasizing notable privacy +concerns in the fine-tuning process. Project code is available at +https://anonymous.4open.science/r/PARSING-4817/. + +
+
+
+
+
+ + ♻ ☆ Stronger Models are NOT Stronger Teachers for Instruction Tuning + + +
+ Instruction tuning has been widely adopted to ensure large language models +(LLMs) follow user instructions effectively. The resulting +instruction-following capabilities of LLMs heavily rely on the instruction +datasets used for tuning. Recently, synthetic instruction datasets have emerged +as an economically viable solution to provide LLMs diverse and high-quality +instructions. However, existing approaches typically assume that larger or +stronger models are stronger teachers for instruction tuning, and hence simply +adopt these models as response generators to the synthetic instructions. In +this paper, we challenge this commonly-adopted assumption. Our extensive +experiments across five base models and twenty response generators reveal that +larger and stronger models are not necessarily stronger teachers of smaller +models. We refer to this phenomenon as the Larger Models' Paradox. We observe +that existing metrics cannot precisely predict the effectiveness of response +generators since they ignore the compatibility between teachers and base models +being fine-tuned. We thus develop a novel metric, named as +Compatibility-Adjusted Reward (CAR) to measure the effectiveness of response +generators. Our experiments across five base models demonstrate that CAR +outperforms almost all baselines. + +
+
+
+
+
+ + ♻ ☆ Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model + with Frozen LLM + + +
+ Rapidly developing large language models (LLMs) have brought tremendous +intelligent applications. GPT-4o's excellent duplex speech interaction ability +has recently brought impressive experience to users. Researchers have recently +proposed several multi-modal LLMs in this direction that can achieve +speech-to-speech dialogue. This paper proposes a novel speech-text multimodal +LLM architecture called Freeze-Omni. Our main contribution is that the speech +input and output modalities can be easily connected to a textual LLM while +keeping the LLM's parameters frozen throughout the training process. We +designed 3-stage training strategies both for the modeling of speech input and +output, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using +text-speech paired data (such as ASR and TTS data) and only 60,000 multi-round +text Q&A data on 8 GPUs. Moreover, we can effectively ensure that the +intelligence of the Freeze-Omni in the speech modality is at the same level +compared with that in the text modality of its backbone LLM, while the +end-to-end latency of the spoken response achieves a low level. In addition, we +also designed a method to achieve duplex dialogue ability through multi-task +training, making Freeze-Omni have a more natural style of dialogue ability +between the users. Freeze-Omni mainly provides a possibility for researchers to +conduct multimodal LLM under the condition of a frozen LLM, avoiding various +impacts caused by the catastrophic forgetting of LLM caused by fewer data and +training resources. + +
+
+ comment: Project Page: https://freeze-omni.github.io/ +
+
+
+
+
+ + ♻ ☆ Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation + Extraction in Long Sentences + + +
+ Relation extraction as an important natural language processing (NLP) task is +to identify relations between named entities in text. Recently, graph +convolutional networks over dependency trees have been widely used to capture +syntactic features and achieved attractive performance. However, most existing +dependency-based approaches ignore the positive influence of the words outside +the dependency trees, sometimes conveying rich and useful information on +relation extraction. In this paper, we propose a novel model, Entity-aware +Self-attention Contextualized GCN (ESC-GCN), which efficiently incorporates +syntactic structure of input sentences and semantic context of sequences. To be +specific, relative position self-attention obtains the overall semantic +pairwise correlation related to word position, and contextualized graph +convolutional networks capture rich intra-sentence dependencies between words +by adequately pruning operations. Furthermore, entity-aware attention layer +dynamically selects which token is more decisive to make final relation +prediction. In this way, our proposed model not only reduces the noisy impact +from dependency trees, but also obtains easily-ignored entity-related semantic +representation. Extensive experiments on various tasks demonstrate that our +model achieves encouraging performance as compared to existing dependency-based +and sequence-based models. In particular, our model excels in extracting relations +between entities of long sentences. + +
+
+
+
+
+ + ♻ ☆ Entity-Aware Biaffine Attention Model for Improved Constituent Parsing + with Reduced Entity Violations + + +
+ Constituency parsing involves analyzing a sentence by breaking it into +sub-phrases, or constituents. While many deep neural models have achieved +state-of-the-art performance in this task, they often overlook the +entity-violating issue, where an entity fails to form a complete sub-tree in +the resultant parsing tree. To address this, we propose an entity-aware +biaffine attention model for constituent parsing. This model incorporates +entity information into the biaffine attention mechanism by using additional +entity role vectors for potential phrases, which enhances the parsing accuracy. +We introduce a new metric, the Entity Violating Rate (EVR), to quantify the +extent of entity violations in parsing results. Experiments on three popular +datasets-ONTONOTES, PTB, and CTB-demonstrate that our model achieves the lowest +EVR while maintaining high precision, recall, and F1-scores comparable to +existing models. Further evaluation in downstream tasks, such as sentence +sentiment analysis, highlights the effectiveness of our model and the validity +of the proposed EVR metric. + +
+
+
+
+
+ + ♻ ☆ Explaining Large Language Models Decisions Using Shapley Values + + +
+ The emergence of large language models (LLMs) has opened up exciting +possibilities for simulating human behavior and cognitive processes, with +potential applications in various domains, including marketing research and +consumer behavior analysis. However, the validity of utilizing LLMs as +stand-ins for human subjects remains uncertain due to glaring divergences that +suggest fundamentally different underlying processes at play and the +sensitivity of LLM responses to prompt variations. This paper presents a novel +approach based on Shapley values from cooperative game theory to interpret LLM +behavior and quantify the relative contribution of each prompt component to the +model's output. Through two applications - a discrete choice experiment and an +investigation of cognitive biases - we demonstrate how the Shapley value method +can uncover what we term "token noise" effects, a phenomenon where LLM +decisions are disproportionately influenced by tokens providing minimal +informative content. This phenomenon raises concerns about the robustness and +generalizability of insights obtained from LLMs in the context of human +behavior simulation. Our model-agnostic approach extends its utility to +proprietary LLMs, providing a valuable tool for practitioners and researchers +to strategically optimize prompts and mitigate apparent cognitive biases. Our +findings underscore the need for a more nuanced understanding of the factors +driving LLM responses before relying on them as substitutes for human subjects +in survey settings. We emphasize the importance of researchers reporting +results conditioned on specific prompt templates and exercising caution when +drawing parallels between human behavior and LLMs. + +
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4pi-1.23/) +
+
+
+
+
+ + ♻ ☆ CogErgLLM: Exploring Large Language Model Systems Design Perspective + Using Cognitive Ergonomics ICML'24 + + +
+ Integrating cognitive ergonomics with LLMs is crucial for improving safety, +reliability, and user satisfaction in human-AI interactions. Current LLM +designs often lack this integration, resulting in systems that may not fully +align with human cognitive capabilities and limitations. This oversight +exacerbates biases in LLM outputs and leads to suboptimal user experiences due +to inconsistent application of user-centered design principles. Researchers are +increasingly leveraging NLP, particularly LLMs, to model and understand human +behavior across social sciences, psychology, psychiatry, health, and +neuroscience. Our position paper explores the need to integrate cognitive +ergonomics into LLM design, providing a comprehensive framework and practical +guidelines for ethical development. By addressing these challenges, we aim to +advance safer, more reliable, and ethically sound human-AI interactions. + +
+
+ comment: 10 Page, 3 Figures. Accepted in: (i) ICML'24: LLMs & Cognition + Workshop (Non-archival; OpenReview: + https://openreview.net/forum?id=63C9YSc77p) (ii) EMNLP'24 : NLP for Science + Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4science-1.22/) +
+
+
+
+
+ + ♻ ☆ GlossLM: A Massively Multilingual Corpus and Pretrained Model for + Interlinear Glossed Text EMNLP 2024 + + +
+ Language documentation projects often involve the creation of annotated text +in a format such as interlinear glossed text (IGT), which captures fine-grained +morphosyntactic analyses in a morpheme-by-morpheme format. However, there are +few existing resources providing large amounts of standardized, easily +accessible IGT data, limiting their applicability to linguistic research, and +making it difficult to use such data in NLP modeling. + We compile the largest existing corpus of IGT data from a variety of sources, +covering over 450k examples across 1.8k languages, to enable research on +crosslingual transfer and IGT generation. We normalize much of our data to +follow a standard set of labels across languages. + Furthermore, we explore the task of automatically generating IGT in order to +aid documentation projects. As many languages lack sufficient monolingual data, +we pretrain a large multilingual model on our corpus. We demonstrate the +utility of this model by finetuning it on monolingual corpora, outperforming +SOTA models by up to 6.6\%. Our pretrained model and dataset are available on +Hugging Face. + +
+
+ comment: EMNLP 2024. First two authors are equal contribution +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (e.g., CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. + +
+
+
+
+
+ + ♻ ☆ One fish, two fish, but not the whole sea: Alignment reduces language + models' conceptual diversity + + +
+ Researchers in social science and psychology have recently proposed using +large language models (LLMs) as replacements for humans in behavioral research. +In addition to arguments about whether LLMs accurately capture population-level +patterns, this has raised questions about whether LLMs capture human-like +conceptual diversity. Separately, it is debated whether post-training alignment +(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies, +we use a new way of measuring the conceptual diversity of +synthetically-generated LLM "populations" by relating the internal variability +of simulated individuals to the population-level variability. We use this +approach to evaluate non-aligned and aligned LLMs on two domains with rich +human behavioral data. While no model reaches human-like diversity, aligned +models generally display less diversity than their instruction fine-tuned +counterparts. Our findings highlight potential trade-offs between increasing +models' value alignment and decreasing the diversity of their conceptual +representations. + +
+
+ comment: 17 pages, 10 figures; corrected figure version +
+
+
+
+
+ + ♻ ☆ CodeTree: Agent-guided Tree Search for Code Generation with Large + Language Models + + +
+ Pre-trained on massive amounts of code and text data, large language models +(LLMs) have demonstrated remarkable achievements in performing code generation +tasks. With additional execution-based feedback, these models can act as agents +with capabilities to self-refine and improve generated code autonomously. +However, on challenging coding tasks with extremely large search space, current +agentic approaches still struggle with multi-stage planning, generating, and +debugging. To address this problem, we propose CodeTree, a framework for LLM +agents to efficiently explore the search space in different stages of the code +generation process. Specifically, we adopted a unified tree structure to +explicitly explore different coding strategies, generate corresponding coding +solutions, and subsequently refine the solutions. In each stage, critical +decision-making (ranking, termination, expanding) of the exploration process is +guided by both the environmental execution-based feedback and +LLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code +generation benchmarks and demonstrated the significant performance gains of +CodeTree against strong baselines. Using GPT-4o as the base model, we +consistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0 +on CodeContests. On the challenging SWEBench benchmark, our approach led to +significant performance gains. + +
+
+
+
+
+ + ♻ ☆ Context-aware Inductive Knowledge Graph Completion with Latent Type + Constraints and Subgraph Reasoning + + +
+ Inductive knowledge graph completion (KGC) aims to predict missing triples +with unseen entities. Recent works focus on modeling reasoning paths between +the head and tail entity as direct supporting evidence. However, these methods +depend heavily on the existence and quality of reasoning paths, which limits +their general applicability in different scenarios. In addition, we observe +that latent type constraints and neighboring facts inherent in KGs are also +vital in inferring missing triples. To effectively utilize all useful +information in KGs, we introduce CATS, a novel context-aware inductive KGC +solution. With sufficient guidance from proper prompts and supervised +fine-tuning, CATS activates the strong semantic understanding and reasoning +capabilities of large language models to assess the existence of query triples, +which consist of two modules. First, the type-aware reasoning module evaluates +whether the candidate entity matches the latent entity type as required by the +query relation. Then, the subgraph reasoning module selects relevant reasoning +paths and neighboring facts, and evaluates their correlation to the query +triple. Experiment results on three widely used datasets demonstrate that CATS +significantly outperforms state-of-the-art methods in 16 out of 18 +transductive, inductive, and few-shot settings with an average absolute MRR +improvement of 7.2%. + +
+
+
+
+
+
+
+ + + +
+
+ +
+
/*
 * index.js — client-side behaviour for the generated digest page.
 * Patch header preserved from the deploy commit:
 *   diff --git a/index.js b/index.js (new file mode 100644, index 0000000..69f5da7)
 */

/* Expand/Collapse with TAB key */
let expanded = false;
document.onkeydown = function (e) {
  // `keyCode` is deprecated; test `key` first and keep the numeric code only
  // as a fallback for engines that do not populate `key`.
  if (e.key === 'Tab' || e.keyCode === 9) {
    expanded = !expanded;
    document.querySelectorAll("details").forEach((detail) => {
      detail.open = expanded;
    });
    // Returning false from an `on*` handler cancels the default action, so
    // Tab toggles the <details> elements instead of moving focus.
    return false;
  }
};

/* Switch Theme */
const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');

/**
 * Apply a theme to the page: set `data-theme` on the root element and swap
 * the icon class so the icon always matches the active theme.
 *
 * @param {string} theme - 'light' selects the sun icon; any other value
 *   selects the moon icon.
 */
function applyTheme(theme) {
  document.documentElement.setAttribute('data-theme', theme);
  const icon = document.getElementById("theme-icon");
  if (icon) {
    icon.className = theme === 'light' ? "ri-sun-line" : "ri-moon-line";
  }
}

/**
 * Read the persisted theme.
 *
 * @returns {string|null} The stored value, or null when nothing (or an empty
 *   string) is stored or when storage cannot be accessed.
 */
function readStoredTheme() {
  try {
    return localStorage.getItem('theme') || null;
  } catch (err) {
    // Storage access can throw when it is blocked by the browser; persisting
    // the theme is best-effort, so fall back to the page default.
    console.warn('Unable to read the saved theme:', err);
    return null;
  }
}

/**
 * Persist the chosen theme so it survives a reload (best-effort).
 *
 * @param {string} theme - Value stored under the 'theme' key.
 */
function storeTheme(theme) {
  try {
    localStorage.setItem('theme', theme);
  } catch (err) {
    console.warn('Unable to save the theme:', err);
  }
}

/**
 * `change` handler for the theme checkbox: checked means light, unchecked
 * means dark. Applies the theme and remembers it.
 *
 * @param {Event} e - Change event whose target is the checkbox.
 */
function switchTheme(e) {
  const theme = e.target.checked ? 'light' : 'dark';
  applyTheme(theme);
  storeTheme(theme);
}

// A page without the switch should still get the rest of the script.
if (toggleSwitch) {
  toggleSwitch.addEventListener('change', switchTheme, false);
}

// Restore the saved theme on load, including the icon, which previously
// stayed on the markup default even when 'light' was restored.
const currentTheme = readStoredTheme();
if (currentTheme) {
  applyTheme(currentTheme);
  if (toggleSwitch && currentTheme === 'light') {
    toggleSwitch.checked = true;
  }
}

// Build timestamp rendered in the viewer's locale; null when the element is
// absent from the page.
const timestamp = document.getElementById("build-timestamp");
const timestamp_local = timestamp
  ? new Date(timestamp.getAttribute("datetime")).toLocaleString()
  : null;

// The badge update below is disabled in the original script; it is kept
// verbatim for reference.
const badge = document.getElementById("build-timestamp-badge");
// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`